├── self-check
│   ├── .gitignore
│   ├── checks.cuh
│   ├── Dockerfile
│   ├── README.md
│   ├── main.cpp
│   ├── Makefile
│   └── checks.cu
├── README.md
└── ionet-setup.sh
/self-check/.gitignore:
--------------------------------------------------------------------------------
build*
--------------------------------------------------------------------------------
/self-check/checks.cuh:
--------------------------------------------------------------------------------
#pragma once

int get_devices_count();
int get_device_name(int device, char** result);
int device_malloc(int device, void** result);
int device_free(int device, void* ptr);
--------------------------------------------------------------------------------
/self-check/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:18.04

RUN apt-get update && apt-get install -y wget && \
    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb -O /tmp/cuda-keyring_1.0-1_all.deb && \
    dpkg -i /tmp/cuda-keyring_1.0-1_all.deb && \
    apt-get update && \
    apt-get -y install cuda-toolkit-11-8 && \
    apt-get install -y build-essential && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
--------------------------------------------------------------------------------
/self-check/README.md:
--------------------------------------------------------------------------------
# IO Net CUDA Self-Check Binary

This is intentionally released in source form for transparency.

To run the check, get the binary from the Releases and run it on the Linux **host** (if your worker is Linux-based) or in the WSL2 environment used to run our Launcher (if Windows-based).

It should perform simple CUDA checks and report the results.

## Example of good output

```
Reported 1 CUDA devices
Device #0: name=NVIDIA GeForce RTX 3080: memory alloc test pass
all cards look ok
```

## Example of output when some issues are found
```
Cannot get device count: cuda error=35 - CUDA driver version is insufficient for CUDA runtime version
```
--------------------------------------------------------------------------------
/self-check/main.cpp:
--------------------------------------------------------------------------------
// Entry point for the self-check binary: enumerates CUDA devices and runs a
// trivial allocate/free round-trip on each one. Exit codes: 0 = all devices
// passed, 1 = at least one device failed a check, 2 = no devices detected.
#include <cstdio>  // printf; the header name was lost in the dump ("#include" with no argument)
#include "checks.cuh"

int main() {
    int devices = get_devices_count();
    if (devices < 1) {
        printf("Cannot detect any CUDA devices\n");
        return 2;
    }
    printf("Reported %d CUDA devices\n", devices);

    bool okay = true;
    for (int device = 0; device < devices; device++) {
        char* name;
        if (get_device_name(device, &name) < 0) {
            printf("Cannot get device name for #%d\n", device);
            okay = false;
            continue;
        }
        printf("Device #%d: name=%s: ", device, name);
        void* ptr;
        if (device_malloc(device, &ptr) < 0) {
            printf("cannot allocate memory on device #%d\n", device);
            okay = false;
            continue;
        }
        if (device_free(device, ptr) < 0) {
            printf("cannot free memory on device #%d\n", device);
            okay = false;
            continue;
        }
        printf("memory alloc test pass\n");
    }
    printf(okay ? "all cards look ok\n" : "some cards failed check\n");
    return okay ? 0 : 1;
}
--------------------------------------------------------------------------------
/self-check/Makefile:
--------------------------------------------------------------------------------
CUDA_ROOT := /usr/local/cuda-11
export PATH := $(CUDA_ROOT)/bin:$(PATH)
BUILD_DIR := build
MARKER := $(BUILD_DIR)/.marker

all: $(BUILD_DIR)/self-check

$(MARKER):
	mkdir $(BUILD_DIR) -p
	touch $@


$(BUILD_DIR)/checks.cu.o: checks.cu checks.cuh $(MARKER)
	nvcc -arch=sm_61 --device-c -O3 $< -c -o $@

$(BUILD_DIR)/checks.o: $(BUILD_DIR)/checks.cu.o
	nvcc -arch=sm_61 --device-link -o $@ $^

$(BUILD_DIR)/main.o: main.cpp checks.cuh $(MARKER)
	g++ -O3 -march=corei7-avx -mtune=corei7-avx -mno-avx -mno-aes $< -c -o $@

$(BUILD_DIR)/self-check: $(BUILD_DIR)/main.o $(BUILD_DIR)/checks.o $(BUILD_DIR)/checks.cu.o
	g++ $^ -o $@ -L$(CUDA_ROOT)/lib64 -lcudart_static -ldl -lrt -pthread
	strip $@

clean:
	rm -rf $(BUILD_DIR)

run: $(BUILD_DIR)/self-check
	$<

docker: Dockerfile
	docker build . -t self-check-build
	docker run --volume $(CURDIR):/checker --user $(shell id -u) self-check-build make BUILD_DIR=build-docker -C /checker clean all
	docker run -it --volume $(CURDIR):/checker --gpus all --entrypoint /checker/build-docker/self-check brunneis/python:3.9.0-ubuntu

.PHONY: all clean docker run

#.SILENT:
--------------------------------------------------------------------------------
/self-check/checks.cu:
--------------------------------------------------------------------------------
#include "checks.cuh"
#include <cstdio>   // fprintf; the header name was lost in the dump ("#include" with no argument)
#include <cstring>  // memcpy, used by the get_device_name fix below

// Print a readable CUDA error to stderr and return -1 from the enclosing
// function. (nvcc makes cudaGetErrorString / the runtime API visible without
// an explicit #include of cuda_runtime.h.)
#define CUDA_ERROR_CHECK(err, msg) { \
    if ((err) != cudaSuccess) { \
        fprintf(stderr, "%s: cuda error=%d - %s\n", (msg), (int)(err), cudaGetErrorString(err)); \
        return -1; \
    } \
}

// Returns the number of visible CUDA devices, or -1 on a CUDA error.
int get_devices_count() {
    int result;
    auto err = cudaGetDeviceCount(&result);
    CUDA_ERROR_CHECK(err, "Cannot get device count");
    return result;
}

// Stores a pointer to the device's NUL-terminated name in *result.
// Returns 0 on success, -1 on CUDA error, -2 if result is null.
int get_device_name(int device, char** result) {
    if (result == nullptr) return -2;
    cudaDeviceProp prop;
    auto err = cudaGetDeviceProperties(&prop, device);
    CUDA_ERROR_CHECK(err, "Cannot get device properties");
    // BUG FIX: the original did `*result = prop.name;`, handing the caller a
    // pointer into the stack-local `prop` — a dangling pointer once this
    // function returns. Copy the name into static storage instead. The
    // checker is single-threaded and prints each name before requesting the
    // next, so one shared buffer is sufficient.
    static char name_buf[sizeof(prop.name)];
    memcpy(name_buf, prop.name, sizeof(prop.name));
    name_buf[sizeof(name_buf) - 1] = '\0';  // defensive: guarantee termination
    *result = name_buf;
    return 0;
}

// Makes `device` current and allocates a small (1 KiB) test buffer on it,
// storing the device pointer in *result.
// Returns 0 on success, -1 on CUDA error, -2 if result is null.
int device_malloc(int device, void** result) {
    if (result == nullptr) return -2;
    auto err = cudaSetDevice(device);
    CUDA_ERROR_CHECK(err, "Cannot set active device");
    void* mem = nullptr;
    err = cudaMalloc(&mem, 1024);
    CUDA_ERROR_CHECK(err, "Cannot allocate memory");
    *result = mem;
    return 0;
}

// Frees a device pointer previously returned by device_malloc on `device`.
// Returns 0 on success, -1 on CUDA error, -2 if ptr is null.
int device_free(int device, void* ptr) {
    if (ptr == nullptr) return -2;
    auto err = cudaSetDevice(device);
    CUDA_ERROR_CHECK(err, "Cannot set active device");
    err = cudaFree(ptr);
    CUDA_ERROR_CHECK(err, "Cannot free memory");
    return 0;
}
--------------------------------------------------------------------------------
/README.md:
-------------------------------------------------------------------------------- 1 | # IO Net Official Setup Script 2 | 3 | This repository contains a Bash script for setting up prerequisites for the IO Net platform. The script is designed for various Linux distributions and handles necessary configurations, especially for systems with Nvidia GPUs. 4 | 5 | ## Getting Started 6 | 7 | These instructions will guide you on how to use the setup script from this repository. 8 | 9 | ### Prerequisites 10 | 11 | Ensure you have `curl` installed on your system to download the script. 12 | - Run the following command to install if not installed already.
```
sudo apt install curl
```

### Installation

1. **Download the setup script**:

   ```bash
   curl -L https://raw.githubusercontent.com/ionet-official/io-net-official-setup-script/main/ionet-setup.sh -o ionet-setup.sh
   ```

2. Run the script:
   ```bash
   chmod +x ionet-setup.sh && ./ionet-setup.sh
   ```

## Script Overview

The `ionet-setup.sh` script performs a series of operations:

- Sets global variables and detects the operating system and its version.
- Checks for Nvidia GPU presence and installs necessary drivers and CUDA toolkit based on the detected Linux distribution and version.
- Installs Docker and Docker Compose, along with setting up Nvidia Docker if an Nvidia GPU is detected.
- Adds the user to the Docker group.

## Supported Distributions

- Ubuntu (20.04, 22.04, 18.04)
- Debian (10, 11)

## Contributions

Contributions to this script are welcome. Please ensure that any pull requests or issues are relevant to the script's functionality and compatibility.

## Support

For support, please open an issue or contact our support team on [discord](https://discord.gg/kqFzFK7fg2)
--------------------------------------------------------------------------------
/ionet-setup.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -euxo pipefail

export DEBIAN_FRONTEND=noninteractive
sudo dpkg --set-selections <<< "cloud-init install" || true

# Set Global Variables
# Detect OS
OS="$(uname)"
case $OS in
    "Linux")
        # Detect Linux Distro
        if [ -f /etc/os-release ]; then
            . /etc/os-release
            DISTRO=$ID
            VERSION=$VERSION_ID
        else
            echo "Your Linux distribution is not supported."
20 | exit 1 21 | fi 22 | ;; 23 | esac 24 | 25 | # Detect if an Nvidia GPU is present 26 | NVIDIA_PRESENT=$(lspci | grep -i nvidia || true) 27 | 28 | # Only proceed with Nvidia-specific steps if an Nvidia device is detected 29 | if [[ -z "$NVIDIA_PRESENT" ]]; then 30 | echo "No NVIDIA device detected on this system." 31 | else 32 | # Check if nvidia-smi is available and working 33 | if command -v nvidia-smi && nvidia-smi | grep CUDA | grep -vi 'n/a' &>/dev/null; then 34 | # Extract the CUDA version from the output of `nvidia-smi`. 35 | cuda_version=$(nvidia-smi | grep "CUDA Version" | sed 's/.*CUDA Version: \([0-9]*\.[0-9]*\).*/\1/') 36 | 37 | # Define the minimum required CUDA version. 38 | min_version="11.8" 39 | 40 | # Compare the CUDA version extracted with the minimum required version. 41 | # Here, we sort the two versions and use `head` to get the lowest. 42 | # If the lowest version is not the minimum version, it means the installed version is lower. 43 | if [ "$(printf '%s\n%s' "$cuda_version" "$min_version" | sort -V | head -n1)" = "$min_version" ]; then 44 | echo "CUDA version $cuda_version is installed and meets the minimum requirement of $min_version." 45 | else 46 | echo "CUDA version $cuda_version is installed but does not meet the minimum requirement of $min_version. Please upgrade CUDA." 47 | exit 1 48 | fi 49 | 50 | # Check if ECC is enabled, and if so, disable it. 51 | # Query the number of GPUs in the system 52 | num_gpus=$(nvidia-smi --list-gpus | wc -l) 53 | 54 | echo "Found $num_gpus GPUs in the system." 55 | 56 | # Loop through each GPU and check/disable ECC 57 | for (( gpu_index=0; gpu_index/dev/null; then 240 | echo "Docker is already installed." 241 | else 242 | echo "Docker is not installed. Proceeding with installations..." 
243 | # Install Docker-ce keyring 244 | sudo apt update -y 245 | sudo apt install -y ca-certificates curl gnupg 246 | sudo install -m 0755 -d /etc/apt/keyrings 247 | FILE=/etc/apt/keyrings/docker.gpg 248 | if [ -f "$FILE" ]; then 249 | sudo rm "$FILE" 250 | fi 251 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o "$FILE" 252 | sudo chmod a+r /etc/apt/keyrings/docker.gpg 253 | 254 | # Add Docker-ce repository to Apt sources and install 255 | echo \ 256 | "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ 257 | $(. /etc/os-release; echo "$VERSION_CODENAME") stable" | \ 258 | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null 259 | sudo apt update -y 260 | sudo apt -y install docker-ce 261 | fi 262 | 263 | # Check if docker-compose v1 or docker compose v2 are installed 264 | if command -v docker-compose &>/dev/null || command -v docker compose &>/dev/null; then 265 | echo "Docker-compose is already installed." 266 | else 267 | echo "Docker-compose is not installed. Proceeding with installations..." 268 | 269 | # Install docker-compose subcommand 270 | sudo apt -y install docker-compose-plugin 271 | sudo ln -sv /usr/libexec/docker/cli-plugins/docker-compose /usr/bin/docker-compose 272 | docker-compose --version 273 | fi 274 | 275 | # Test / Install nvidia-docker 276 | if [[ ! -z "$NVIDIA_PRESENT" ]]; then 277 | if sudo docker run --gpus all nvidia/cuda:11.0.3-base-ubuntu18.04 nvidia-smi &>/dev/null; then 278 | echo "nvidia-docker is enabled and working. Exiting script." 279 | else 280 | echo "nvidia-docker does not seem to be enabled. Proceeding with installations..." 281 | distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) 282 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add 283 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 284 | sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit 285 | sudo systemctl restart docker 286 | sudo docker run --gpus all nvidia/cuda:11.0.3-base-ubuntu18.04 nvidia-smi 287 | fi 288 | fi 289 | sudo apt-mark hold nvidia* libnvidia* 290 | # Add docker group and user to group docker 291 | sudo groupadd docker || true 292 | sudo usermod -aG docker $USER || true 293 | # Workaround for NVIDIA Docker Issue 294 | echo "Applying workaround for NVIDIA Docker issue as per https://github.com/NVIDIA/nvidia-docker/issues/1730" 295 | # Summary of issue and workaround: 296 | # The issue arises when the host performs daemon-reload, which may cause containers using systemd to lose access to NVIDIA GPUs. 297 | # To check if affected, run `sudo systemctl daemon-reload` on the host, then check GPU access in the container with `nvidia-smi`. 298 | # If affected, proceed with the workaround below. 299 | 300 | # Workaround Steps: 301 | # Disable cgroups for Docker containers to prevent the issue. 302 | # Edit the Docker daemon configuration. 303 | sudo python3 <