├── self-check
│   ├── .gitignore
│   ├── checks.cuh
│   ├── Dockerfile
│   ├── README.md
│   ├── main.cpp
│   ├── Makefile
│   └── checks.cu
├── README.md
└── ionet-setup.sh
/self-check/.gitignore:
--------------------------------------------------------------------------------
build*
--------------------------------------------------------------------------------
/self-check/checks.cuh:
--------------------------------------------------------------------------------
#pragma once

int get_devices_count();
int get_device_name(int device, char** result);
int device_malloc(int device, void** result);
int device_free(int device, void* ptr);
--------------------------------------------------------------------------------
/self-check/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:18.04

RUN apt-get update && apt-get install -y wget && \
    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb -O /tmp/cuda-keyring_1.0-1_all.deb && \
    dpkg -i /tmp/cuda-keyring_1.0-1_all.deb && \
    apt-get update && \
    apt-get -y install cuda-toolkit-11-8 && \
    apt-get install -y build-essential && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
--------------------------------------------------------------------------------
/self-check/README.md:
--------------------------------------------------------------------------------
# IO Net CUDA Self-Check Binary

This is intentionally released in source form for transparency.

To run the check, get the binary from the Releases and run it on the Linux **host** (if your worker is Linux-based) or in the WSL2 environment used to run our Launcher (if Windows-based).

It should perform simple CUDA checks and report the results.

## Example of good output

```
Reported 1 CUDA devices
Device #0: name=NVIDIA GeForce RTX 3080: memory alloc test pass
all cards look ok
```

## Example of output when some issues are found
```
Cannot get device count: cuda error=35 - CUDA driver version is insufficient for CUDA runtime version
```
--------------------------------------------------------------------------------
/self-check/main.cpp:
--------------------------------------------------------------------------------
// Entry point for the self-check binary: enumerates CUDA devices and runs a
// trivial allocate/free round-trip on each one. Exit codes: 0 = all devices
// passed, 1 = at least one device failed a check, 2 = no devices detected.
#include <cstdio>  // printf; the header name was lost in the dump ("#include" with no argument)
#include "checks.cuh"

int main() {
    int devices = get_devices_count();
    if (devices < 1) {
        printf("Cannot detect any CUDA devices\n");
        return 2;
    }
    printf("Reported %d CUDA devices\n", devices);

    bool okay = true;
    for (int device = 0; device < devices; device++) {
        char* name;
        if (get_device_name(device, &name) < 0) {
            printf("Cannot get device name for #%d\n", device);
            okay = false;
            continue;
        }
        printf("Device #%d: name=%s: ", device, name);
        void* ptr;
        if (device_malloc(device, &ptr) < 0) {
            printf("cannot allocate memory on device #%d\n", device);
            okay = false;
            continue;
        }
        if (device_free(device, ptr) < 0) {
            printf("cannot free memory on device #%d\n", device);
            okay = false;
            continue;
        }
        printf("memory alloc test pass\n");
    }
    printf(okay ? "all cards look ok\n" : "some cards failed check\n");
    return okay ? 0 : 1;
}
--------------------------------------------------------------------------------
/self-check/Makefile:
--------------------------------------------------------------------------------
CUDA_ROOT := /usr/local/cuda-11
export PATH := $(CUDA_ROOT)/bin:$(PATH)
BUILD_DIR := build
MARKER := $(BUILD_DIR)/.marker

all: $(BUILD_DIR)/self-check

$(MARKER):
	mkdir $(BUILD_DIR) -p
	touch $@


$(BUILD_DIR)/checks.cu.o: checks.cu checks.cuh $(MARKER)
	nvcc -arch=sm_61 --device-c -O3 $< -c -o $@

$(BUILD_DIR)/checks.o: $(BUILD_DIR)/checks.cu.o
	nvcc -arch=sm_61 --device-link -o $@ $^

$(BUILD_DIR)/main.o: main.cpp checks.cuh $(MARKER)
	g++ -O3 -march=corei7-avx -mtune=corei7-avx -mno-avx -mno-aes $< -c -o $@

$(BUILD_DIR)/self-check: $(BUILD_DIR)/main.o $(BUILD_DIR)/checks.o $(BUILD_DIR)/checks.cu.o
	g++ $^ -o $@ -L$(CUDA_ROOT)/lib64 -lcudart_static -ldl -lrt -pthread
	strip $@

clean:
	rm -rf $(BUILD_DIR)

run: $(BUILD_DIR)/self-check
	$<

docker: Dockerfile
	docker build . -t self-check-build
	docker run --volume $(CURDIR):/checker --user $(shell id -u) self-check-build make BUILD_DIR=build-docker -C /checker clean all
	docker run -it --volume $(CURDIR):/checker --gpus all --entrypoint /checker/build-docker/self-check brunneis/python:3.9.0-ubuntu

.PHONY: all clean docker run

#.SILENT:
--------------------------------------------------------------------------------
/self-check/checks.cu:
--------------------------------------------------------------------------------
#include "checks.cuh"
#include <cstdio>   // fprintf; the header name was lost in the dump ("#include" with no argument)
#include <cstring>  // memcpy, used by the get_device_name fix below

// Print a readable CUDA error to stderr and return -1 from the enclosing
// function. (nvcc makes cudaGetErrorString / the runtime API visible without
// an explicit #include of cuda_runtime.h.)
#define CUDA_ERROR_CHECK(err, msg) { \
    if ((err) != cudaSuccess) { \
        fprintf(stderr, "%s: cuda error=%d - %s\n", (msg), (int)(err), cudaGetErrorString(err)); \
        return -1; \
    } \
}

// Returns the number of visible CUDA devices, or -1 on a CUDA error.
int get_devices_count() {
    int result;
    auto err = cudaGetDeviceCount(&result);
    CUDA_ERROR_CHECK(err, "Cannot get device count");
    return result;
}

// Stores a pointer to the device's NUL-terminated name in *result.
// Returns 0 on success, -1 on CUDA error, -2 if result is null.
int get_device_name(int device, char** result) {
    if (result == nullptr) return -2;
    cudaDeviceProp prop;
    auto err = cudaGetDeviceProperties(&prop, device);
    CUDA_ERROR_CHECK(err, "Cannot get device properties");
    // BUG FIX: the original did `*result = prop.name;`, handing the caller a
    // pointer into the stack-local `prop` — a dangling pointer once this
    // function returns. Copy the name into static storage instead. The
    // checker is single-threaded and prints each name before requesting the
    // next, so one shared buffer is sufficient.
    static char name_buf[sizeof(prop.name)];
    memcpy(name_buf, prop.name, sizeof(prop.name));
    name_buf[sizeof(name_buf) - 1] = '\0';  // defensive: guarantee termination
    *result = name_buf;
    return 0;
}

// Makes `device` current and allocates a small (1 KiB) test buffer on it,
// storing the device pointer in *result.
// Returns 0 on success, -1 on CUDA error, -2 if result is null.
int device_malloc(int device, void** result) {
    if (result == nullptr) return -2;
    auto err = cudaSetDevice(device);
    CUDA_ERROR_CHECK(err, "Cannot set active device");
    void* mem = nullptr;
    err = cudaMalloc(&mem, 1024);
    CUDA_ERROR_CHECK(err, "Cannot allocate memory");
    *result = mem;
    return 0;
}

// Frees a device pointer previously returned by device_malloc on `device`.
// Returns 0 on success, -1 on CUDA error, -2 if ptr is null.
int device_free(int device, void* ptr) {
    if (ptr == nullptr) return -2;
    auto err = cudaSetDevice(device);
    CUDA_ERROR_CHECK(err, "Cannot set active device");
    err = cudaFree(ptr);
    CUDA_ERROR_CHECK(err, "Cannot free memory");
    return 0;
}
--------------------------------------------------------------------------------
/README.md:
-------------------------------------------------------------------------------- 1 | # IO Net Official Setup Script 2 | 3 | This repository contains a Bash script for setting up prerequisites for the IO Net platform. The script is designed for various Linux distributions and handles necessary configurations, especially for systems with Nvidia GPUs. 4 | 5 | ## Getting Started 6 | 7 | These instructions will guide you on how to use the setup script from this repository. 8 | 9 | ### Prerequisites 10 | 11 | Ensure you have `curl` installed on your system to download the script. 12 | - Run the following command to install if not installed already.
```
sudo apt install curl
```

### Installation

1. **Download the setup script**:

   ```bash
   curl -L https://raw.githubusercontent.com/ionet-official/io-net-official-setup-script/main/ionet-setup.sh -o ionet-setup.sh
   ```

2. Run the script:
   ```bash
   chmod +x ionet-setup.sh && ./ionet-setup.sh
   ```

## Script Overview

The `ionet-setup.sh` script performs a series of operations:

- Sets global variables and detects the operating system and its version.
- Checks for Nvidia GPU presence and installs necessary drivers and CUDA toolkit based on the detected Linux distribution and version.
- Installs Docker and Docker Compose, along with setting up Nvidia Docker if an Nvidia GPU is detected.
- Adds the user to the Docker group.

## Supported Distributions

- Ubuntu (20.04, 22.04, 18.04)
- Debian (10, 11)

## Contributions

Contributions to this script are welcome. Please ensure that any pull requests or issues are relevant to the script's functionality and compatibility.

## Support

For support, please open an issue or contact our support team on [discord](https://discord.gg/kqFzFK7fg2)
--------------------------------------------------------------------------------
/ionet-setup.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -euxo pipefail

export DEBIAN_FRONTEND=noninteractive
sudo dpkg --set-selections <<< "cloud-init install" || true

# Set Global Variables
# Detect OS
OS="$(uname)"
case $OS in
    "Linux")
        # Detect Linux Distro
        if [ -f /etc/os-release ]; then
            . /etc/os-release
            DISTRO=$ID
            VERSION=$VERSION_ID
        else
            echo "Your Linux distribution is not supported."
20 | exit 1 21 | fi 22 | ;; 23 | esac 24 | 25 | # Detect if an Nvidia GPU is present 26 | NVIDIA_PRESENT=$(lspci | grep -i nvidia || true) 27 | 28 | # Only proceed with Nvidia-specific steps if an Nvidia device is detected 29 | if [[ -z "$NVIDIA_PRESENT" ]]; then 30 | echo "No NVIDIA device detected on this system." 31 | else 32 | # Check if nvidia-smi is available and working 33 | if command -v nvidia-smi && nvidia-smi | grep CUDA | grep -vi 'n/a' &>/dev/null; then 34 | # Extract the CUDA version from the output of `nvidia-smi`. 35 | cuda_version=$(nvidia-smi | grep "CUDA Version" | sed 's/.*CUDA Version: \([0-9]*\.[0-9]*\).*/\1/') 36 | 37 | # Define the minimum required CUDA version. 38 | min_version="11.8" 39 | 40 | # Compare the CUDA version extracted with the minimum required version. 41 | # Here, we sort the two versions and use `head` to get the lowest. 42 | # If the lowest version is not the minimum version, it means the installed version is lower. 43 | if [ "$(printf '%s\n%s' "$cuda_version" "$min_version" | sort -V | head -n1)" = "$min_version" ]; then 44 | echo "CUDA version $cuda_version is installed and meets the minimum requirement of $min_version." 45 | else 46 | echo "CUDA version $cuda_version is installed but does not meet the minimum requirement of $min_version. Please upgrade CUDA." 47 | exit 1 48 | fi 49 | 50 | # Check if ECC is enabled, and if so, disable it. 51 | # Query the number of GPUs in the system 52 | num_gpus=$(nvidia-smi --list-gpus | wc -l) 53 | 54 | echo "Found $num_gpus GPUs in the system." 55 | 56 | # Loop through each GPU and check/disable ECC 57 | for (( gpu_index=0; gpu_index/dev/null; then 240 | echo "Docker is already installed." 241 | else 242 | echo "Docker is not installed. Proceeding with installations..." 
243 | # Install Docker-ce keyring 244 | sudo apt update -y 245 | sudo apt install -y ca-certificates curl gnupg 246 | sudo install -m 0755 -d /etc/apt/keyrings 247 | FILE=/etc/apt/keyrings/docker.gpg 248 | if [ -f "$FILE" ]; then 249 | sudo rm "$FILE" 250 | fi 251 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o "$FILE" 252 | sudo chmod a+r /etc/apt/keyrings/docker.gpg 253 | 254 | # Add Docker-ce repository to Apt sources and install 255 | echo \ 256 | "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ 257 | $(. /etc/os-release; echo "$VERSION_CODENAME") stable" | \ 258 | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null 259 | sudo apt update -y 260 | sudo apt -y install docker-ce 261 | fi 262 | 263 | # Check if docker-compose v1 or docker compose v2 are installed 264 | if command -v docker-compose &>/dev/null || command -v docker compose &>/dev/null; then 265 | echo "Docker-compose is already installed." 266 | else 267 | echo "Docker-compose is not installed. Proceeding with installations..." 268 | 269 | # Install docker-compose subcommand 270 | sudo apt -y install docker-compose-plugin 271 | sudo ln -sv /usr/libexec/docker/cli-plugins/docker-compose /usr/bin/docker-compose 272 | docker-compose --version 273 | fi 274 | 275 | # Test / Install nvidia-docker 276 | if [[ ! -z "$NVIDIA_PRESENT" ]]; then 277 | if sudo docker run --gpus all nvidia/cuda:11.0.3-base-ubuntu18.04 nvidia-smi &>/dev/null; then 278 | echo "nvidia-docker is enabled and working. Exiting script." 279 | else 280 | echo "nvidia-docker does not seem to be enabled. Proceeding with installations..." 281 | distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) 282 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add 283 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 284 | sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit 285 | sudo systemctl restart docker 286 | sudo docker run --gpus all nvidia/cuda:11.0.3-base-ubuntu18.04 nvidia-smi 287 | fi 288 | fi 289 | sudo apt-mark hold nvidia* libnvidia* 290 | # Add docker group and user to group docker 291 | sudo groupadd docker || true 292 | sudo usermod -aG docker $USER || true 293 | # Workaround for NVIDIA Docker Issue 294 | echo "Applying workaround for NVIDIA Docker issue as per https://github.com/NVIDIA/nvidia-docker/issues/1730" 295 | # Summary of issue and workaround: 296 | # The issue arises when the host performs daemon-reload, which may cause containers using systemd to lose access to NVIDIA GPUs. 297 | # To check if affected, run `sudo systemctl daemon-reload` on the host, then check GPU access in the container with `nvidia-smi`. 298 | # If affected, proceed with the workaround below. 299 | 300 | # Workaround Steps: 301 | # Disable cgroups for Docker containers to prevent the issue. 302 | # Edit the Docker daemon configuration. 303 | sudo python3 <