├── UPDATE.md
├── BACKUP.md
├── gensyn.sh
├── README.md
├── backup.sh
└── cuda.sh
/UPDATE.md:
--------------------------------------------------------------------------------
1 | >[!Note]
2 | >Whenever Gensyn updates their node, run these commands one by one; but wait for my update before proceeding. Join my Telegram channel, [ZunXBT](https://t.me/zunxbt), to get notified when an update is needed.
3 |
4 | ### 1. First, kill all existing gensyn screen sessions
5 | ```
6 | pkill -f "SCREEN.*gensyn"
7 | ```
8 | ### 2. Create a screen session named `gensyn`
9 | ```
10 | screen -S gensyn
11 | ```
12 | ### 3. Delete existing temp-data
13 | ```
14 | rm -f "$HOME/rl-swarm/modal-login/temp-data/"*.json 2>/dev/null || true
15 | ```
16 | ### 4. Now use this command to run the `rl-swarm`
17 | ```
18 | cd $HOME && rm -rf gensyn-testnet && git clone https://github.com/zunxbt/gensyn-testnet.git && chmod +x gensyn-testnet/gensyn.sh && ./gensyn-testnet/gensyn.sh
19 | ```
20 | - After running the above command, you will see something like this:
21 |
22 | 
23 |
24 | - Choose `1` to use the existing `swarm.pem` file
25 | >[!Note]
26 | > It will ask this question: ```Would you like to push models you train in the RL swarm to the Hugging Face Hub? [y/N]``` Write `N`. At the end you will see ```Your training session is about to begin```, and then you can detach from this gensyn screen session
27 |
28 | ### 5. Detach from this screen session
29 | - Use `Ctrl + A` and then press `D` to detach from this screen session.
30 |
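- If you want to check on the node later, you can reattach to the same session at any time (and detach again the same way when done):
```
screen -r gensyn
```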
--------------------------------------------------------------------------------
/BACKUP.md:
--------------------------------------------------------------------------------
1 | ### 1. Back up `swarm.pem` from GPU server to local PC
2 | - For this, you must first connect to the GPU server using [SSH](https://github.com/zunxbt/gensyn-testnet?tab=readme-ov-file#-connect-via-ssh) (recommended to do this in Command Prompt or PowerShell)
3 | - Now exit the GPU server using this command
4 | ```
5 | exit
6 | ```
7 | - Now replace `SSH-COMMAND` in the below command with the SSH command you received from your provider, replace `YOUR-PC-PATH` with the location where you want to save the swarm.pem file, and then execute it in Command Prompt or PowerShell
8 | ```
9 | SSH-COMMAND "cat ~/rl-swarm/swarm.pem" > "YOUR-PC-PATH\swarm.pem"
10 | ```
11 | - In my case, this command looks like this:
12 | ```
13 | ssh -p 69 root@69.69.69.69 "cat ~/rl-swarm/swarm.pem" > "C:\Users\USER\Downloads\swarm.pem"
14 | ```
15 | - Done, your `swarm.pem` file is now saved on your local system
16 |
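- To confirm the copy is intact, you can compare checksums on both ends (a quick sketch using the same example host; `certutil` is the stock Windows hashing tool):
```
ssh -p 69 root@69.69.69.69 "sha256sum ~/rl-swarm/swarm.pem"
certutil -hashfile "C:\Users\USER\Downloads\swarm.pem" SHA256
```
- The two hashes should match
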
17 | ### 2. Back up `swarm.pem` from VPS server to local PC
18 | - For this, I recommend using `Command Prompt` or `PowerShell`
19 | - If you are using **Command Prompt**, use the below command; make sure to replace `VPS-USERNAME`, `VPS-IP` and `YOUR-PC-PATH` (where you want to save the swarm.pem file) with actual values
20 | ```
21 | scp VPS-USERNAME@VPS-IP:~/rl-swarm/swarm.pem "YOUR-PC-PATH"
22 | ```
23 | - In my case, this command looks like this:
24 | ```
25 | scp root@69.69.69.69:~/rl-swarm/swarm.pem "C:\Users\USER\Downloads"
26 | ```
27 | - If you are using **PowerShell**, use the below command; make sure to replace `VPS-USERNAME`, `VPS-IP` and `YOUR-PC-PATH` (where you want to save the swarm.pem file) with actual values
28 | ```
29 | scp VPS-USERNAME@VPS-IP:~/rl-swarm/swarm.pem 'YOUR-PC-PATH'
30 | ```
31 | - In my case, this command looks like this:
32 | ```
33 | scp root@69.69.69.69:~/rl-swarm/swarm.pem 'C:\Users\USER\Downloads'
34 | ```
35 |
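- To restore the backup later (e.g., onto a freshly set-up server), the same `scp` command works in reverse; a sketch using the example values above:
```
scp "C:\Users\USER\Downloads\swarm.pem" root@69.69.69.69:~/rl-swarm/swarm.pem
```
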
36 | ### 3. Back up `swarm.pem` from WSL to local PC
37 | - First, open `Command Prompt` or `Windows Powershell`
38 | - Then use the below command; make sure to replace `YOUR-WSL-USERNAME` and `YOUR-PC-PATH` (where you want to save the swarm.pem file) with actual values
39 | ```
40 | copy "\\wsl$\Ubuntu\home\YOUR-WSL-USERNAME\rl-swarm\swarm.pem" "YOUR-PC-PATH"
41 | ```
42 | - In my case, it looks like this:
43 | ```
44 | copy "\\wsl$\Ubuntu\home\zun24\rl-swarm\swarm.pem" "C:\Users\USER\Downloads"
45 | ```
46 |
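- To restore the backup into WSL, `copy` works in the other direction too; a sketch using the example paths above:
```
copy "C:\Users\USER\Downloads\swarm.pem" "\\wsl$\Ubuntu\home\YOUR-WSL-USERNAME\rl-swarm\"
```
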
--------------------------------------------------------------------------------
/gensyn.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | BOLD="\e[1m"
4 | RED="\e[31m"
5 | GREEN="\e[32m"
6 | YELLOW="\e[33m"
7 | NC="\e[0m"
8 |
9 | SWARM_DIR="$HOME/rl-swarm"
10 | TEMP_DATA_PATH="$SWARM_DIR/modal-login/temp-data"
11 | HOME_DIR="$HOME"
12 |
13 | cd $HOME
14 |
15 | if [ -f "$SWARM_DIR/swarm.pem" ]; then
16 | echo -e "${BOLD}${YELLOW}You already have an existing ${GREEN}swarm.pem${YELLOW} file.${NC}\n"
17 | echo -e "${BOLD}${YELLOW}Do you want to:${NC}"
18 | echo -e "${BOLD}1) Use the existing swarm.pem${NC}"
19 | echo -e "${BOLD}${RED}2) Delete existing swarm.pem and start fresh${NC}"
20 |
21 | while true; do
22 | read -p $'\e[1mEnter your choice (1 or 2): \e[0m' choice
23 | if [ "$choice" == "1" ]; then
24 | echo -e "\n${BOLD}${YELLOW}[✓] Using existing swarm.pem...${NC}"
25 | mv "$SWARM_DIR/swarm.pem" "$HOME_DIR/"
26 | mv "$TEMP_DATA_PATH/userData.json" "$HOME_DIR/" 2>/dev/null
27 | mv "$TEMP_DATA_PATH/userApiKey.json" "$HOME_DIR/" 2>/dev/null
28 |
29 | rm -rf "$SWARM_DIR"
30 |
31 | echo -e "${BOLD}${YELLOW}[✓] Cloning fresh repository...${NC}"
32 | cd $HOME && git clone https://github.com/gensyn-ai/rl-swarm.git > /dev/null 2>&1
33 |
34 | mv "$HOME_DIR/swarm.pem" rl-swarm/
35 | mv "$HOME_DIR/userData.json" rl-swarm/modal-login/temp-data/ 2>/dev/null
36 | mv "$HOME_DIR/userApiKey.json" rl-swarm/modal-login/temp-data/ 2>/dev/null
37 | break
38 | elif [ "$choice" == "2" ]; then
39 | echo -e "${BOLD}${YELLOW}[✓] Removing existing folder and starting fresh...${NC}"
40 | rm -rf "$SWARM_DIR"
41 | sleep 2
42 | cd $HOME && git clone https://github.com/gensyn-ai/rl-swarm.git > /dev/null 2>&1
43 | break
44 | else
45 | echo -e "\n${BOLD}${RED}[✗] Invalid choice. Please enter 1 or 2.${NC}"
46 | fi
47 | done
48 | else
49 | echo -e "${BOLD}${YELLOW}[✓] No existing swarm.pem found. Cloning repository...${NC}"
50 | cd $HOME && [ -d rl-swarm ] && rm -rf rl-swarm; git clone https://github.com/gensyn-ai/rl-swarm.git > /dev/null 2>&1
51 | fi
52 |
53 | cd rl-swarm || { echo -e "${BOLD}${RED}[✗] Failed to enter rl-swarm directory. Exiting.${NC}"; exit 1; }
54 |
55 | echo -e "${BOLD}${YELLOW}[✓] Running rl-swarm...${NC}"
56 | python3 -m venv .venv && . .venv/bin/activate
57 | sleep 2
58 | ./run_rl_swarm.sh
59 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Gensyn Testnet Node Guide
2 |
3 | ## 💻 System Requirements
4 |
5 | | Requirement | Details |
6 | |-------------------------------------|-------------------------------------------------------------|
7 | | **CPU Architecture** | `arm64` or `amd64` |
8 | | **Recommended RAM** | 24 GB |
9 | | **CUDA Devices (Recommended)** | `RTX 3090`, `RTX 4070`, `RTX 4090`, `A100`, `H100` |
10 | | **Python Version** | Python >= 3.10 (For Mac, you may need to upgrade) |
11 |
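You can quickly verify these requirements with standard commands; a minimal check (assuming a Debian/Ubuntu shell; `nvidia-smi` is only available once NVIDIA drivers are installed):
```
uname -m            # CPU architecture (x86_64 = amd64, aarch64 = arm64)
free -h             # total and available RAM
python3 --version   # should be >= 3.10
nvidia-smi          # driver and GPU status, if drivers are present
```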
12 |
13 | ## 🌐 Rent GPU
14 | > [!Note]
15 | > **Renting a GPU is not strictly required; you can still run this node on a VPS or on WSL, but you may face OOM errors and have a lower win rate. That is why my recommendation is to rent a GPU if you can; otherwise, you can proceed with a VPS or WSL.**
16 | - Visit : [Quick Pod Website](https://console.quickpod.io?affiliate=64e0d2b2-59ee-4989-a05f-f4c3b6dbb2e4)
17 | - Sign Up using email address
18 | - Go to your email and verify your Quick Pod account
19 | - Click on the `Add` button in the corner to deposit funds
20 | - You can deposit using cryptocurrency (from MetaMask) or a credit card
21 | - Now go to the `Template` section and select `Ubuntu 22.04 jammy`
22 | - Now click on `Select GPU`, search for `RTX 4090` and choose it
23 | - Now choose a GPU and click on the `Create POD` button
24 | - Your GPU server will be deployed soon
25 | - You can simply click on the `Connect` button and then choose `Connect to Web Terminal`
26 | - But if you are using a different GPU provider, you should use the `Connect via SSH` method mentioned below
27 |
28 | ## 🛜 Connect via SSH (Only for GPU)
29 | > [!Note]
30 | > **This step is only required if you are running this node on a GPU server. If you want to use a VPS or WSL, skip this step, log in to your VPS using the username and password you received from the VPS provider, and then move to the [installation](https://github.com/zunxbt/gensyn-testnet?tab=readme-ov-file#-installation) section**
31 | - First open a terminal (this could be either WSL / Codespace / Command Prompt)
32 | - Use the below command to generate an SSH key
33 | ```
34 | ssh-keygen
35 | ```
36 | - It will ask 3 questions like this:
37 | ```
38 | Enter file in which to save the key (/home/codespace/.ssh/id_rsa):
39 | Enter passphrase (empty for no passphrase):
40 | Enter same passphrase again:
41 | ```
42 | - You need to press `Enter` 3 times
43 | - After that you will get a message like this on your terminal
44 | ```
45 | Your public key has been saved in /home/codespace/.ssh/id_rsa.pub
46 | ```
47 | - `/home/codespace/.ssh/id_rsa.pub` is the path of the public key in my case; in your case it might be different
48 |
49 | 
50 |
51 | - Use one of these commands to view the public key:
52 | - If you are using Linux/macOS (WSL): `cat path/of/that/publickey`; in my case: `cat /home/codespace/.ssh/id_rsa.pub`
53 | - If you are using Command Prompt: `type path\of\that\publickey`; for a key generated on Windows this would be, e.g., `type %USERPROFILE%\.ssh\id_rsa.pub`
54 | - If you are using PowerShell: `Get-Content path\of\that\publickey`; e.g., `Get-Content $env:USERPROFILE\.ssh\id_rsa.pub`
55 | - Now copy this public key and go to the hosting provider from which you rented the GPU
56 | - On the provider's website, navigate to settings, then paste and save your SSH key
57 | - Now, copy the command you received after renting the GPU instance and paste it into the terminal where you generated the public key.
58 | - In my case, the command looks like this:
59 | ```
60 | ssh -p 69 root@69.69.69.69
61 | ```
62 | - Running this command in that terminal connects you to your GPU server
63 |
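- If your provider's command does not pick up your key automatically, you can point `ssh` at it explicitly with `-i`; a sketch using the example values above:
```
ssh -p 69 -i ~/.ssh/id_rsa root@69.69.69.69
```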
64 | ## 📥 Installation
65 |
66 | 1. **Install `sudo`**
67 | ```bash
68 | apt update && apt install -y sudo
69 | ```
70 | 2. **Install other dependencies**
71 | ```bash
72 | sudo apt update && sudo apt install -y python3 python3-venv python3-pip curl wget screen git lsof nano unzip iproute2 build-essential gcc g++
73 | ```
74 | 3. **Install CUDA**
75 | ```
76 | [ -f cuda.sh ] && rm cuda.sh; curl -o cuda.sh https://raw.githubusercontent.com/zunxbt/gensyn-testnet/main/cuda.sh && chmod +x cuda.sh && . ./cuda.sh
77 | ```
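- Once the script finishes, you can optionally confirm the toolkit is visible (a quick check; on CPU-only machines these commands will simply not be found):
```
nvcc --version   # CUDA compiler version
nvidia-smi       # driver and GPU status
```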
78 | 4. **Create a `screen` session**
79 | ```bash
80 | screen -S gensyn
81 | ```
82 | 5. **Clone official `rl-swarm` repo**
83 | ```
84 | git clone https://github.com/gensyn-ai/rl-swarm.git && cd rl-swarm
85 | ```
86 | 6. **Run the swarm**
87 | ```
88 | python3 -m venv .venv
89 | . .venv/bin/activate
90 | ./run_rl_swarm.sh
91 | ```
92 |
93 | 
94 |
95 | - After some time, you will see something like this if you are running it on a Linux system; then follow the next step
96 |
97 | 7. **Tunnel the `localhost` link**
98 | - There are multiple ways to do this (localtunnel / ngrok / cloudflared); in this guide I will use localtunnel, and a cloudflared alternative is sketched at the end of this step
99 | - Keep rl-swarm running in one tab, and open the GPU/VPS/WSL again in another tab
100 | - Use the below command in the new terminal (this requires Node.js/npm to be installed)
101 | ```
102 | npm install -g localtunnel
103 | ```
104 | - Now use this command to get the password of this website
105 | ```
106 | curl https://loca.lt/mytunnelpassword
107 | ```
108 | - Alternatively, the password is simply your server's public IP
109 | - Then use this command to get the website link
110 | ```
111 | lt --port 3000
112 | ```
113 | - You will get a link like this `https://true-things-cry.loca.lt`
114 |
115 | 
116 |
117 |
118 | - Visit the website and enter the password to access it
119 | - Sign in using your email address and then enter the OTP
120 | - Now come back to your main terminal and you will see the logs start to progress
121 | ---
122 | - It will ask some questions; respond to them properly
123 | - ```Would you like to push models you train in the RL swarm to the Hugging Face Hub? [y/N]``` : Write `N`
124 | - When you see an interface like this, you can detach from this screen session
125 |
126 | 
127 |
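- If localtunnel gives you trouble, a `cloudflared` quick tunnel can expose the same port; a sketch (install cloudflared first, e.g. the way `backup.sh` in this repo does; the printed `trycloudflare.com` URL replaces the loca.lt link and needs no password):
```
cloudflared tunnel --url http://localhost:3000
```
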
128 | 8. **Detach from the `screen` session**
129 | - Use `Ctrl + A` and then press `D` to detach from this screen session.
130 |
131 | ## 🔄️ Back up `swarm.pem`
132 | After running the Gensyn node, it is essential to back up the `swarm.pem` file from your remote server (GPU or VPS) to your local PC. If you lose this file, your contribution will also be lost. I will mention 2 methods: one is simpler but not as secure, and the other is secure but a little complex for beginners.
133 |
134 | ### Method 1 (Very Simple)
135 | - First, make sure that you are in the `rl-swarm` folder and then run this command
136 | ```
137 | [ -f backup.sh ] && rm backup.sh; curl -sSL -O https://raw.githubusercontent.com/zunxbt/gensyn-testnet/main/backup.sh && chmod +x backup.sh && ./backup.sh
138 | ```
139 | - It will show something like this in your terminal
140 |
141 | 
142 |
143 | 1️⃣ **VPS/GPU/WSL to PC**
144 | - If you want to backup `swarm.pem`(Must), `userData.json` (Optional), `userApiKey.json` (Optional) from VPS/GPU/WSL to your PC then simply **visit the URL** (don't use the commands mentioned below) and press `Ctrl + S` to save these files.
145 |
146 | 2️⃣ **One VPS/GPU/WSL to another VPS/GPU/WSL**
147 | - If you want to backup `swarm.pem`(Must), `userData.json` (Optional), `userApiKey.json` (Optional) from One VPS/GPU/WSL to another VPS/GPU/WSL then simply use the **commands** on another VPS/GPU/WSL where you want to use these files.
148 |
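- On the destination machine, run the printed `wget` commands from inside the `rl-swarm` folder so the files land where the node expects them; a sketch (replace `TUNNEL-URL` with the trycloudflare link printed by the script):
```
cd ~/rl-swarm
wget -O swarm.pem TUNNEL-URL/swarm.pem
wget -O modal-login/temp-data/userData.json TUNNEL-URL/modal-login/temp-data/userData.json
wget -O modal-login/temp-data/userApiKey.json TUNNEL-URL/modal-login/temp-data/userApiKey.json
```
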
149 | ### Method 2 (Manual)
150 | If you face any issues with this automated backup process, it is recommended to use the manual guide: [Click Here](https://github.com/zunxbt/gensyn-testnet/blob/main/BACKUP.md)
151 |
152 | ## 🟢 Node Status
153 |
154 | ### 1. Check Logs
155 | - To check whether your node is running, check the logs
156 | - To check the logs you need to re-attach to the screen session, so use the below command
157 | ```
158 | screen -r gensyn
159 | ```
160 | - If you see everything running, then it's fine
161 | - Now detach from this screen session: use `Ctrl + A` and then press `D`
162 | - Every time you reattach, remember to detach again afterwards (or use the quick check below instead of reattaching)
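- To confirm the session exists without attaching to it, you can list screen sessions instead:
```
screen -ls
```
- A line like `12345.gensyn (Detached)` means the node's session is alive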
163 |
164 | ### 2. Check Wins
165 | - Visit : https://gensyn-node.vercel.app/
166 | - Enter the Peer ID that you often see in your logs
167 | - The more wins, the better
168 |
169 | > [!Note]
170 | > If you see `0x0000000000000000000000000000000000000000` in the `Connected EOA Address` section, it means your contribution is not being recorded, so you should run the node from the beginning with a fresh new email (which means you need to delete the existing `swarm.pem` file)
171 |
172 | ## ⚠️ Troubleshooting
173 |
174 | ### 🔴 Daemon failed to start in 15.0 seconds
175 | - If you are facing this issue, follow this step-by-step guide
176 | - First, use this command
177 | ```
178 | nano $(python3 -c "import hivemind.p2p.p2p_daemon as m; print(m.__file__)")
179 | ```
180 | - Then scroll down and look for this line: `startup_timeout: float = 15,`. Change the 15 to 120 so it looks like this: `startup_timeout: float = 120,` (or use the sed one-liner below)
181 | - Save the changes: first press `Ctrl` + `X`, then press `Y`, then press `Enter`
182 | - Now use this command again to run `rl-swarm`
183 | ```bash
184 | python3 -m venv .venv && . .venv/bin/activate && ./run_rl_swarm.sh
185 | ```
186 |
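- Alternatively, instead of editing in nano, a single `sed` command can apply the same change (a sketch; it rewrites the same file that the `nano` command above opens):
```
sed -i 's/startup_timeout: float = 15,/startup_timeout: float = 120,/' "$(python3 -c "import hivemind.p2p.p2p_daemon as m; print(m.__file__)")"
```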
--------------------------------------------------------------------------------
/backup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | RED='\033[1;31m'
4 | GREEN='\033[1;32m'
5 | YELLOW='\033[1;33m'
6 | BLUE='\033[1;34m'
7 | BOLD='\033[1m'
8 | NC='\033[0m'
9 |
10 | print_message() {
11 | echo -e "${BLUE}[INFO]${NC} $1"
12 | }
13 |
14 | print_success() {
15 | echo -e "${GREEN}[SUCCESS]${NC} $1"
16 | }
17 |
18 | print_warning() {
19 | echo -e "${YELLOW}[WARNING]${NC} $1"
20 | }
21 |
22 | print_error() {
23 | echo -e "${RED}[ERROR]${NC} $1"
24 | }
25 |
26 | print_message "Checking and installing dependencies (nc and lsof)..."
27 |
28 | if [[ "$OSTYPE" == "linux-gnu"* ]]; then
29 | if ! command -v nc &> /dev/null || ! command -v lsof &> /dev/null; then
30 | print_message "Installing netcat and lsof..."
31 | sudo apt-get update
32 | sudo apt-get install -y netcat-openbsd lsof || sudo apt-get install -y netcat lsof # newer Ubuntu ships netcat-openbsd
33 | if ! command -v nc &> /dev/null || ! command -v lsof &> /dev/null; then
34 | print_error "Failed to install netcat or lsof. Please install them manually."
35 | exit 1
36 | fi
37 | print_success "Dependencies installed successfully."
38 | else
39 | print_success "Dependencies already installed."
40 | fi
41 | elif [[ "$OSTYPE" == "darwin"* ]]; then
42 | if ! command -v nc &> /dev/null || ! command -v lsof &> /dev/null; then
43 | if command -v brew &> /dev/null; then
44 | print_message "Installing netcat and lsof via Homebrew..."
45 | brew install netcat lsof
46 | else
47 | print_error "Homebrew not found. Please install netcat and lsof manually."
48 | exit 1
49 | fi
50 | if ! command -v nc &> /dev/null || ! command -v lsof &> /dev/null; then
51 | print_error "Failed to install netcat or lsof. Please install them manually."
52 | exit 1
53 | fi
54 | print_success "Dependencies installed successfully."
55 | else
56 | print_success "Dependencies already installed."
57 | fi
58 | else
59 | print_warning "Unsupported OS for automatic dependency installation. Ensure nc and lsof are installed."
60 | fi
61 |
62 | print_message "Checking rl-swarm directory..."
63 |
64 | if [[ $(basename "$PWD") == "rl-swarm" ]]; then
65 | print_success "Currently in rl-swarm directory."
66 | RL_SWARM_DIR="$PWD"
67 | else
68 | print_warning "Not in rl-swarm directory. Checking HOME directory..."
69 |
70 | if [[ -d "$HOME/rl-swarm" ]]; then
71 | print_success "Found rl-swarm directory in HOME."
72 | RL_SWARM_DIR="$HOME/rl-swarm"
73 | else
74 | print_error "rl-swarm directory not found in current directory or HOME."
75 | exit 1
76 | fi
77 | fi
78 |
79 | cd "$RL_SWARM_DIR" &> /dev/null
80 |
81 | print_message "Checking cloudflared..."
82 |
83 | ARCH=$(uname -m)
84 | case $ARCH in
85 | x86_64)
86 | CLOUDFLARED_ARCH="amd64"
87 | ;;
88 | aarch64|arm64)
89 | CLOUDFLARED_ARCH="arm64"
90 | ;;
91 | *)
92 | print_error "Unsupported architecture: $ARCH"
93 | exit 1
94 | ;;
95 | esac
96 |
97 | if command -v cloudflared &> /dev/null; then
98 | print_success "cloudflared is already installed."
99 | else
100 | print_message "Installing cloudflared for $ARCH architecture..."
101 |
102 | mkdir -p /tmp/cloudflared-install
103 | cd /tmp/cloudflared-install
104 |
105 | if [[ "$OSTYPE" == "linux-gnu"* ]]; then
106 | curl -L "https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-${CLOUDFLARED_ARCH}.deb" -o cloudflared.deb
107 | sudo dpkg -i cloudflared.deb || sudo apt-get install -f -y
108 |
109 | if ! command -v cloudflared &> /dev/null; then
110 | curl -L "https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-${CLOUDFLARED_ARCH}" -o cloudflared
111 | chmod +x cloudflared
112 | sudo mv cloudflared /usr/local/bin/
113 | fi
114 | elif [[ "$OSTYPE" == "darwin"* ]]; then
115 | if command -v brew &> /dev/null; then
116 | brew install cloudflared
117 | else
118 | curl -L "https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-darwin-${CLOUDFLARED_ARCH}.tgz" -o cloudflared.tgz
119 | tar -xzf cloudflared.tgz
120 | chmod +x cloudflared
121 | sudo mv cloudflared /usr/local/bin/
122 | fi
123 | else
124 | print_error "Unsupported operating system: $OSTYPE"
125 | exit 1
126 | fi
127 |
128 | cd "$RL_SWARM_DIR" &> /dev/null
129 |
130 | if command -v cloudflared &> /dev/null; then
131 | print_success "cloudflared installation completed successfully."
132 | else
133 | print_error "Failed to install cloudflared. Please install it manually."
134 | exit 1
135 | fi
136 | fi
137 |
138 | print_message "Checking python3..."
139 |
140 | if command -v python3 &> /dev/null; then
141 | print_success "python3 is already installed."
142 | else
143 | print_message "Installing python3..."
144 |
145 | if [[ "$OSTYPE" == "linux-gnu"* ]]; then
146 | sudo apt-get update
147 | sudo apt-get install -y python3 python3-pip
148 | elif [[ "$OSTYPE" == "darwin"* ]]; then
149 | if command -v brew &> /dev/null; then
150 | brew install python
151 | else
152 | print_error "Homebrew not found. Please install python3 manually."
153 | exit 1
154 | fi
155 | else
156 | print_error "Unsupported operating system: $OSTYPE"
157 | exit 1
158 | fi
159 |
160 | if command -v python3 &> /dev/null; then
161 | print_success "python3 installation completed successfully."
162 | else
163 | print_error "Failed to install python3. Please install it manually."
164 | exit 1
165 | fi
166 | fi
167 |
168 | print_message "Starting HTTP server..."
169 |
170 | PORT=8000
171 | MAX_RETRIES=10
172 | RETRY_COUNT=0
173 | SERVER_STARTED=false
174 |
175 | is_port_in_use() {
176 | if command -v nc &> /dev/null; then
177 | nc -z localhost "$1" &> /dev/null
178 | return $?
179 | elif command -v lsof &> /dev/null; then
180 | lsof -i:"$1" &> /dev/null
181 | return $?
182 | else
183 | (echo > /dev/tcp/127.0.0.1/"$1") &> /dev/null
184 | return $?
185 | fi
186 | }
187 |
188 | start_http_server() {
189 | local port="$1"
190 | local temp_log="/tmp/http_server_$$.log"
191 |
192 | python3 -m http.server "$port" > "$temp_log" 2>&1 &
193 | local pid=$!
194 |
195 | sleep 3
196 |
197 | if ps -p $pid > /dev/null; then
198 | print_success "HTTP server started successfully on port $port."
199 | echo "$pid"
200 | else
201 | if grep -q "Address already in use" "$temp_log"; then
202 | print_warning "Port $port is already in use."
203 | return 1
204 | else
205 | print_error "Failed to start HTTP server on port $port. Error log:"
206 | cat "$temp_log"
207 | return 1
208 | fi
209 | fi
210 | }
211 |
212 | while [[ $RETRY_COUNT -lt $MAX_RETRIES && $SERVER_STARTED == false ]]; do
213 | print_message "Attempting to start HTTP server on port $PORT..."
214 |
215 | if is_port_in_use "$PORT"; then
216 | print_warning "Port $PORT is already in use. Trying next port."
217 | PORT=$((PORT + 1))
218 | RETRY_COUNT=$((RETRY_COUNT + 1))
219 | continue
220 | fi
221 |
222 | HTTP_SERVER_PID=$(start_http_server "$PORT")
223 |
224 | if [[ -n "$HTTP_SERVER_PID" ]]; then
225 | print_message "Starting cloudflared tunnel to http://localhost:$PORT..."
226 |
227 | cloudflared tunnel --url "http://localhost:$PORT" > /tmp/cloudflared_$$.log 2>&1 &
228 | CLOUDFLARED_PID=$!
229 |
230 | sleep 10
231 |
232 | TUNNEL_URL=$(grep -o 'https://[^ ]*\.trycloudflare\.com' /tmp/cloudflared_$$.log | head -n 1)
233 |
234 | if [[ -n "$TUNNEL_URL" ]]; then
235 | print_success "Cloudflare tunnel established at: $TUNNEL_URL"
236 | SERVER_STARTED=true
237 | else
238 | print_warning "Cloudflared tunnel not established yet. Waiting longer..."
239 |
240 | sleep 10
241 | TUNNEL_URL=$(grep -o 'https://[^ ]*\.trycloudflare\.com' /tmp/cloudflared_$$.log | head -n 1)
242 |
243 | if [[ -n "$TUNNEL_URL" ]]; then
244 | print_success "Cloudflare tunnel established at: $TUNNEL_URL"
245 | SERVER_STARTED=true
246 | else
247 | print_error "Failed to establish cloudflared tunnel. Stopping services and trying another port."
248 |
249 | kill $HTTP_SERVER_PID 2>/dev/null
250 | kill $CLOUDFLARED_PID 2>/dev/null
251 |
252 | PORT=$((PORT + 1))
253 | RETRY_COUNT=$((RETRY_COUNT + 1))
254 | fi
255 | fi
256 | else
257 | PORT=$((PORT + 1))
258 | RETRY_COUNT=$((RETRY_COUNT + 1))
259 | fi
260 | done
261 |
262 | if [[ $SERVER_STARTED == false ]]; then
263 | print_error "Failed to start HTTP server after $MAX_RETRIES attempts."
264 | exit 1
265 | fi
266 |
267 | echo
268 | echo -e "${GREEN}${BOLD}========== VPS/GPU/WSL to PC ===========${NC}"
269 | echo -e "${BOLD}If you want to backup these files from VPS/GPU/WSL to your PC, visit the URLs and download.${NC}"
270 | echo
271 | echo -e "${BOLD}1. swarm.pem${NC}"
272 | echo -e " ${BLUE}${TUNNEL_URL}/swarm.pem${NC}"
273 | echo
274 | echo -e "${BOLD}2. userData.json${NC}"
275 | echo -e " ${BLUE}${TUNNEL_URL}/modal-login/temp-data/userData.json${NC}"
276 | echo
277 | echo -e "${BOLD}3. userApiKey.json${NC}"
278 | echo -e " ${BLUE}${TUNNEL_URL}/modal-login/temp-data/userApiKey.json${NC}"
279 | echo
280 | echo -e "${GREEN}${BOLD}======= ONE VPS/GPU/WSL to ANOTHER VPS/GPU/WSL ========${NC}"
281 | echo -e "${BOLD}To send these files to another VPS/GPU/WSL, use the wget commands instead of the URLs.${NC}"
282 | echo
283 | echo -e "${YELLOW}wget -O swarm.pem ${TUNNEL_URL}/swarm.pem${NC}"
284 | echo -e "${YELLOW}wget -O userData.json ${TUNNEL_URL}/modal-login/temp-data/userData.json${NC}"
285 | echo -e "${YELLOW}wget -O userApiKey.json ${TUNNEL_URL}/modal-login/temp-data/userApiKey.json${NC}"
286 | echo
287 | echo -e "${BLUE}${BOLD}Press Ctrl+C to stop the server when you're done.${NC}"
288 |
289 | # Wait for Ctrl+C
290 | trap 'echo -e "${YELLOW}Stopping servers...${NC}"; if [[ -n "$HTTP_SERVER_PID" ]]; then kill $HTTP_SERVER_PID 2>/dev/null; fi; if [[ -n "$CLOUDFLARED_PID" ]]; then kill $CLOUDFLARED_PID 2>/dev/null; fi; echo -e "${GREEN}Servers stopped.${NC}"' INT
291 | wait
292 |
--------------------------------------------------------------------------------
/cuda.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | GREEN="\033[1;32m"
4 | YELLOW="\033[1;33m"
5 | RED="\033[1;31m"
6 | BLUE="\033[1;34m"
7 | CYAN="\033[1;36m"
8 | BOLD="\033[1m"
9 | NC="\033[0m"
10 |
11 | CPU_ONLY="false"
12 | CUDA_INSTALLED=false
13 | NVCC_PATH=""
14 | CUDA_PATH=""
15 | CUDA_VERSION=""
16 | DRIVER_VERSION=""
17 |
18 | detect_environment() {
19 | IS_WSL=false
20 | IS_RENTED_SERVER=false
21 |
22 | if grep -q Microsoft /proc/version 2>/dev/null; then
23 | echo -e "${YELLOW}${BOLD}[!] WSL environment detected${NC}"
24 | IS_WSL=true
25 | fi
26 |
27 | if [ -d "/opt/deeplearning" ] || [ -d "/opt/aws" ] || [ -d "/opt/cloud" ] || [ -f "/.dockerenv" ]; then
28 | echo -e "${YELLOW}${BOLD}[!] Rented/Cloud server environment detected${NC}"
29 | IS_RENTED_SERVER=true
30 | fi
31 |
32 | UBUNTU_VERSION=""
33 | if [ -f /etc/lsb-release ]; then
34 | source /etc/lsb-release
35 | UBUNTU_VERSION=$DISTRIB_RELEASE
36 | elif [ -f /etc/os-release ]; then
37 | source /etc/os-release
38 | UBUNTU_VERSION=$(echo $VERSION_ID | tr -d '"')
39 | elif [ -f /etc/issue ]; then
40 | UBUNTU_VERSION=$(cat /etc/issue | grep -oP 'Ubuntu \K[0-9]+\.[0-9]+' | head -1)
41 | fi
42 |
43 | if [ -z "$UBUNTU_VERSION" ]; then
44 | if command -v lsb_release >/dev/null 2>&1; then
45 | UBUNTU_VERSION=$(lsb_release -rs)
46 | else
47 | apt-get update >/dev/null 2>&1
48 | apt-get install -y lsb-release >/dev/null 2>&1
49 | if command -v lsb_release >/dev/null 2>&1; then
50 | UBUNTU_VERSION=$(lsb_release -rs)
51 | else
52 | UBUNTU_VERSION="22.04"
53 | fi
54 | fi
55 | fi
56 |
57 | echo -e "${CYAN}${BOLD}[✓] System: Ubuntu ${UBUNTU_VERSION}, Architecture: $(uname -m)${NC}"
58 | }
59 |
60 | detect_gpu() {
61 | echo -e "\n${CYAN}${BOLD}[✓] Detecting NVIDIA GPU...${NC}"
62 |
63 | GPU_AVAILABLE=false
64 |
65 | if command -v nvidia-smi &> /dev/null && nvidia-smi &> /dev/null; then
66 | echo -e "${GREEN}${BOLD}[✓] NVIDIA GPU detected (via nvidia-smi)${NC}"
67 | DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1)
68 | echo -e "${GREEN}${BOLD}[✓] NVIDIA driver version: ${DRIVER_VERSION}${NC}"
69 |
70 | # Get CUDA version directly from nvidia-smi
71 | DRIVER_CUDA_VERSION=$(nvidia-smi | grep -oP "CUDA Version: \K[0-9.]+" 2>/dev/null)
72 | if [ -n "$DRIVER_CUDA_VERSION" ]; then
73 | echo -e "${GREEN}${BOLD}[✓] NVIDIA driver supports CUDA ${DRIVER_CUDA_VERSION}${NC}"
74 | fi
75 |
76 | GPU_AVAILABLE=true
77 | return 0
78 | fi
79 |
80 | if command -v lspci &> /dev/null && lspci | grep -i nvidia &> /dev/null; then
81 | echo -e "${GREEN}${BOLD}[✓] NVIDIA GPU detected (via lspci)${NC}"
82 | GPU_AVAILABLE=true
83 | return 0
84 | fi
85 |
86 | if [ -d "/proc/driver/nvidia" ] || [ -d "/dev/nvidia0" ]; then
87 | echo -e "${GREEN}${BOLD}[✓] NVIDIA GPU detected (via system directories)${NC}"
88 | GPU_AVAILABLE=true
89 | return 0
90 | fi
91 |
92 | if [ "$IS_RENTED_SERVER" = true ]; then
93 | echo -e "${YELLOW}${BOLD}[!] Running on a cloud/rented server, assuming GPU is available${NC}"
94 | GPU_AVAILABLE=true
95 | return 0
96 | fi
97 |
98 | if [ "$IS_WSL" = true ] && grep -q "nvidia" /mnt/c/Windows/System32/drivers/etc/hosts 2>/dev/null; then
99 | echo -e "${YELLOW}${BOLD}[!] WSL environment with potential NVIDIA drivers on Windows host${NC}"
100 | GPU_AVAILABLE=true
101 | return 0
102 | fi
103 |
104 | echo -e "${YELLOW}${BOLD}[!] No NVIDIA GPU detected - using CPU-only mode${NC}"
105 | CPU_ONLY="true"
106 | return 1
107 | }
108 |
109 | detect_cuda() {
110 | echo -e "\n${CYAN}${BOLD}[✓] Checking for CUDA installation...${NC}"
111 |
112 | CUDA_AVAILABLE=false
113 | NVCC_AVAILABLE=false
114 | CUDA_INSTALLED=false
115 |
116 | # First check for CUDA in common locations
117 | for cuda_dir in /usr/local/cuda* /usr/local/cuda; do
118 | if [ -d "$cuda_dir" ] && [ -d "$cuda_dir/bin" ] && [ -f "$cuda_dir/bin/nvcc" ]; then
119 | CUDA_PATH=$cuda_dir
120 | NVCC_PATH="$cuda_dir/bin/nvcc"
121 |
122 | if [ -x "$NVCC_PATH" ]; then
123 | CUDA_VERSION=$($NVCC_PATH --version 2>/dev/null | grep -oP 'release \K[0-9.]+' | head -1)
124 | [ -z "$CUDA_VERSION" ] && CUDA_VERSION=$(echo $cuda_dir | grep -oP 'cuda-\K[0-9.]+' || echo $(echo $cuda_dir | grep -oP 'cuda\K[0-9.]+'))
125 | echo -e "${GREEN}${BOLD}[✓] CUDA detected at ${CUDA_PATH} (version ${CUDA_VERSION})${NC}"
126 | CUDA_AVAILABLE=true
127 | CUDA_INSTALLED=true
128 | break
129 | fi
130 | fi
131 | done
132 |
133 | # If CUDA wasn't found in standard locations but nvcc is in PATH
134 | if [ "$CUDA_INSTALLED" = false ] && command -v nvcc &> /dev/null; then
135 | NVCC_PATH=$(which nvcc)
136 | CUDA_PATH=$(dirname $(dirname $NVCC_PATH))
137 | CUDA_VERSION=$(nvcc --version | grep -oP 'release \K[0-9.]+' | head -1)
138 | echo -e "${GREEN}${BOLD}[✓] NVCC detected: ${NVCC_PATH} (version ${CUDA_VERSION})${NC}"
139 | NVCC_AVAILABLE=true
140 | CUDA_AVAILABLE=true
141 | CUDA_INSTALLED=true
142 | fi
143 |
144 | # Use CUDA version from nvidia-smi if available
145 | if command -v nvidia-smi &> /dev/null; then
146 | DRIVER_CUDA_VERSION=$(nvidia-smi | grep -oP "CUDA Version: \K[0-9.]+" 2>/dev/null)
147 | if [ -n "$DRIVER_CUDA_VERSION" ]; then
148 | # Use driver's CUDA version if we couldn't detect it through nvcc
149 | if [ -z "$CUDA_VERSION" ]; then
150 | CUDA_VERSION=$DRIVER_CUDA_VERSION
151 | fi
152 | CUDA_AVAILABLE=true
153 | fi
154 | fi
155 |
156 | # Check if environment paths are set up correctly
157 | if [ "$CUDA_INSTALLED" = true ]; then
158 | check_cuda_path
159 | fi
160 |
161 | return 0
162 | }
163 |
164 | check_cuda_path() {
165 | PATH_SET=false
166 | LD_LIBRARY_PATH_SET=false
167 |
168 | if [ -n "$CUDA_PATH" ]; then
169 | if [[ ":$PATH:" == *":$CUDA_PATH/bin:"* ]]; then
170 | PATH_SET=true
171 | fi
172 |
173 | if [[ ":$LD_LIBRARY_PATH:" == *":$CUDA_PATH/lib64:"* ]]; then
174 | LD_LIBRARY_PATH_SET=true
175 | fi
176 | fi
177 |
178 | if [ "$PATH_SET" = false ] || [ "$LD_LIBRARY_PATH_SET" = false ]; then
179 | echo -e "${YELLOW}${BOLD}[!] CUDA environment paths not properly set - auto-configuring now${NC}"
180 | setup_cuda_env
181 | return 1
182 | fi
183 |
184 | echo -e "${GREEN}${BOLD}[✓] CUDA environment paths are properly configured${NC}"
185 | return 0
186 | }
187 |
188 | setup_cuda_env() {
189 | echo -e "\n${CYAN}${BOLD}[✓] Setting up CUDA environment variables...${NC}"
190 |
191 | if [ -z "$CUDA_PATH" ]; then
192 | for cuda_dir in /usr/local/cuda* /usr/local/cuda; do
193 | if [ -d "$cuda_dir" ] && [ -d "$cuda_dir/bin" ]; then
194 | CUDA_PATH=$cuda_dir
195 | break
196 | fi
197 | done
198 | fi
199 |
200 | if [ -z "$CUDA_PATH" ] || [ ! -d "$CUDA_PATH" ]; then
201 | echo -e "${RED}${BOLD}[✗] Cannot find CUDA directory${NC}"
202 | return 1
203 | fi
204 |
205 | echo -e "${GREEN}${BOLD}[✓] Using CUDA path: ${CUDA_PATH}${NC}"
206 |
207 | # Create systemwide path setup
208 | cat > /etc/profile.d/cuda.sh << EOF
209 | export CUDA_HOME=${CUDA_PATH}
210 | export PATH=\$CUDA_HOME/bin\${PATH:+:\${PATH}}
211 | export LD_LIBRARY_PATH=\$CUDA_HOME/lib64\${LD_LIBRARY_PATH:+:\${LD_LIBRARY_PATH}}
212 | EOF
213 | chmod +x /etc/profile.d/cuda.sh
214 |
215 | # Add the same variables to the current user's ~/.bashrc if not already present
216 | if ! grep -q "CUDA_HOME=${CUDA_PATH}" ~/.bashrc 2>/dev/null; then
221 | echo -e "\n# CUDA Path" >> ~/.bashrc
222 | echo "export CUDA_HOME=${CUDA_PATH}" >> ~/.bashrc
223 | echo "export PATH=\$CUDA_HOME/bin\${PATH:+:\${PATH}}" >> ~/.bashrc
224 | echo "export LD_LIBRARY_PATH=\$CUDA_HOME/lib64\${LD_LIBRARY_PATH:+:\${LD_LIBRARY_PATH}}" >> ~/.bashrc
225 | fi
226 |
227 | # Source bashrc to apply changes in current session
228 | source ~/.bashrc 2>/dev/null || true
229 |
230 | echo -e "${GREEN}${BOLD}[✓] CUDA environment variables configured and applied${NC}"
231 | return 0
232 | }
233 |
234 | determine_compatible_cuda() {
235 | echo -e "\n${CYAN}${BOLD}[✓] Determining compatible CUDA version...${NC}"
236 |
237 | local compatible_version=""
238 |
239 | if command -v nvidia-smi &> /dev/null; then
240 | local driver_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1)
241 |
242 | # First try to get CUDA version directly from nvidia-smi
243 | DRIVER_CUDA_VERSION=$(nvidia-smi | grep -oP "CUDA Version: \K[0-9.]+" 2>/dev/null)
244 | if [ -n "$DRIVER_CUDA_VERSION" ]; then
245 | compatible_version=$DRIVER_CUDA_VERSION
246 | echo -e "${GREEN}${BOLD}[✓] Compatible CUDA version detected: ${compatible_version}${NC}"
247 | return 0
248 | fi
249 |
250 | # If direct detection failed, estimate based on driver version
251 | if [ -n "$driver_version" ]; then
252 | local major_version=$(echo $driver_version | cut -d '.' -f 1)
253 |
254 | if [ "$major_version" -ge 545 ]; then
255 | compatible_version="12.6"
256 | elif [ "$major_version" -ge 535 ]; then
257 | compatible_version="12.2"
258 | elif [ "$major_version" -ge 525 ]; then
259 | compatible_version="12.1"
260 | elif [ "$major_version" -ge 520 ]; then
261 | compatible_version="12.0"
262 | elif [ "$major_version" -ge 510 ]; then
263 | compatible_version="11.6"
264 | elif [ "$major_version" -ge 470 ]; then
265 | compatible_version="11.4"
266 | elif [ "$major_version" -ge 450 ]; then
267 | compatible_version="11.0"
268 | elif [ "$major_version" -ge 440 ]; then
269 | compatible_version="10.2"
270 | elif [ "$major_version" -ge 418 ]; then
271 | compatible_version="10.1"
272 | elif [ "$major_version" -ge 410 ]; then
273 | compatible_version="10.0"
274 | else
275 | compatible_version="11.4" # Default fallback for older drivers
276 | fi
277 |
278 | echo -e "${GREEN}${BOLD}[✓] Driver version ${driver_version} is compatible with CUDA ${compatible_version}${NC}"
279 | return 0
280 | fi
281 | fi
282 |
283 | # Fallback to a safe version if detection fails
284 | compatible_version="12.6"
285 | echo -e "${YELLOW}${BOLD}[!] Could not determine driver version, defaulting to CUDA ${compatible_version}${NC}"
286 | return 0
287 | }
288 |
289 | install_cuda_toolkit() {
290 | echo -e "\n${CYAN}${BOLD}[✓] Installing CUDA Toolkit...${NC}"
291 |
292 | # Informational: logs the CUDA version the driver appears to support
293 | determine_compatible_cuda
294 |
295 | local install_success=false
296 |
297 | # Try method 1: Using apt repository
298 | install_cuda_apt_repo
299 | if [ $? -eq 0 ]; then
300 | install_success=true
301 | else
302 | echo -e "${YELLOW}${BOLD}[!] Repository installation failed, trying local package method...${NC}"
303 | install_cuda_local_package
304 | if [ $? -eq 0 ]; then
305 | install_success=true
306 | fi
307 | fi
308 |
309 | if [ "$install_success" = false ]; then
310 | echo -e "${RED}${BOLD}[✗] All CUDA installation methods failed${NC}"
311 | echo -e "${YELLOW}${BOLD}[!] Proceeding with CPU-only mode${NC}"
312 | CPU_ONLY="true"
313 | return 1
314 | fi
315 |
316 | setup_cuda_env
317 | detect_cuda
318 | verify_cuda_installation
319 |
320 | return 0
321 | }
322 |
323 | install_cuda_apt_repo() {
324 | local repo_url=""
325 | local keyring_url=""
326 |
327 | case $UBUNTU_VERSION in
328 | 24.04|"24.04")
329 | repo_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/"
330 | keyring_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb"
331 | ;;
332 | 22.04|"22.04")
333 | repo_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/"
334 | keyring_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb"
335 | ;;
336 | 20.04|"20.04")
337 | repo_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/"
338 | keyring_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb"
339 | ;;
340 | 18.04|"18.04")
341 | repo_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/"
342 | keyring_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.1-1_all.deb"
343 | ;;
344 | *)
345 | repo_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/"
346 | keyring_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb"
347 | ;;
348 | esac
349 |
350 | if [ "$IS_WSL" = true ]; then
351 | repo_url="https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/"
352 | keyring_url="https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-keyring_1.1-1_all.deb"
353 | fi
354 |
355 | local keyring_file="cuda-keyring.deb"
356 | echo -e "${CYAN}${BOLD}[✓] Downloading CUDA keyring from ${keyring_url}${NC}"
357 |
358 | wget --quiet "$keyring_url" -O "$keyring_file" || {
359 | if ! command -v curl &> /dev/null; then
360 | apt-get install -y curl >/dev/null 2>&1
361 | fi
362 | curl -L "$keyring_url" -o "$keyring_file" --progress-bar
363 | }
364 |
365 | if [ ! -f "$keyring_file" ] || [ ! -s "$keyring_file" ]; then
366 | echo -e "${RED}${BOLD}[✗] Failed to download keyring file${NC}"
367 | return 1
368 | fi
369 |
370 | dpkg -i "$keyring_file" || {
371 | echo -e "${RED}${BOLD}[✗] Failed to install CUDA keyring${NC}"
372 | rm -f "$keyring_file"
373 | return 1
374 | }
375 |
376 | echo -e "${CYAN}${BOLD}[✓] Updating package lists...${NC}"
377 | apt-get update -qq
378 |
379 | echo -e "${CYAN}${BOLD}[✓] Installing CUDA packages...${NC}"
380 |
381 | # Try to install the specific CUDA version based on driver support
382 | if [ -n "$DRIVER_CUDA_VERSION" ]; then
383 | local major_version=$(echo $DRIVER_CUDA_VERSION | cut -d '.' -f 1)
384 | local minor_version=$(echo $DRIVER_CUDA_VERSION | cut -d '.' -f 2)
385 |
386 | # Try specific version first, then fall back to more generic versions
387 | apt-get install -y cuda-toolkit-${major_version}-${minor_version} ||
388 | apt-get install -y cuda-toolkit-${major_version} ||
389 | apt-get install -y cuda || {
390 | echo -e "${RED}${BOLD}[✗] Failed to install CUDA packages${NC}"
391 | rm -f "$keyring_file"
392 | return 1
393 | }
394 | else
395 | # Try generic installation
396 | apt-get install -y cuda || {
397 | echo -e "${RED}${BOLD}[✗] Failed to install CUDA packages${NC}"
398 | rm -f "$keyring_file"
399 | return 1
400 | }
401 | fi
402 |
403 | rm -f "$keyring_file"
404 | echo -e "${GREEN}${BOLD}[✓] CUDA installed via repository method!${NC}"
405 | return 0
406 | }
407 |
408 | install_cuda_local_package() {
409 | echo -e "\n${CYAN}${BOLD}[✓] Installing CUDA using local package method...${NC}"
410 |
411 | local pin_file=""
412 | local pin_url=""
413 | local deb_file=""
414 | local deb_url=""
415 | local cuda_version="12.6"
416 |
417 |
418 | if [ -n "$DRIVER_CUDA_VERSION" ]; then
419 | cuda_version=$DRIVER_CUDA_VERSION
420 | fi
421 |
422 | local major_version=$(echo $cuda_version | cut -d '.' -f 1)
423 | local minor_version=$(echo $cuda_version | cut -d '.' -f 2)
424 |
425 | if [ "$IS_WSL" = true ]; then
426 | pin_file="cuda-wsl-ubuntu.pin"
427 | pin_url="https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-wsl-ubuntu.pin"
428 | deb_file="cuda-repo-wsl-ubuntu-${major_version}-${minor_version}-local_${major_version}.${minor_version}.0-1_amd64.deb"
429 | deb_url="https://developer.download.nvidia.com/compute/cuda/${major_version}.${minor_version}.0/local_installers/${deb_file}"
430 | else
431 | local ubuntu_ver_suffix=""
432 | case $UBUNTU_VERSION in
433 | 24.04|"24.04") ubuntu_ver_suffix="2404" ;;
434 | 22.04|"22.04") ubuntu_ver_suffix="2204" ;;
435 | 20.04|"20.04") ubuntu_ver_suffix="2004" ;;
436 | 18.04|"18.04") ubuntu_ver_suffix="1804" ;;
437 | *) ubuntu_ver_suffix="2204" ;;
438 | esac
439 |
440 | pin_file="cuda-ubuntu${ubuntu_ver_suffix}.pin"
441 | pin_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_ver_suffix}/x86_64/cuda-ubuntu${ubuntu_ver_suffix}.pin"
442 | deb_file="cuda-repo-ubuntu${ubuntu_ver_suffix}-${major_version}-${minor_version}-local_${major_version}.${minor_version}.0-1_amd64.deb"
443 | deb_url="https://developer.download.nvidia.com/compute/cuda/${major_version}.${minor_version}.0/local_installers/${deb_file}"
444 | fi
445 |
446 | wget --quiet "$pin_url" -O "$pin_file" 2>/dev/null || {
447 | if ! command -v curl &> /dev/null; then
448 | apt-get install -y curl >/dev/null 2>&1
449 | fi
450 | curl -sL "$pin_url" -o "$pin_file" 2>/dev/null
451 | }
452 |
453 | if [ -f "$pin_file" ] && [ -s "$pin_file" ]; then
454 | cp "$pin_file" /etc/apt/preferences.d/cuda-repository-pin-600
455 | else
456 | echo -e "${YELLOW}${BOLD}[!] Failed to download pin file, continuing without it${NC}"
457 | fi
458 |
459 | echo -e "${CYAN}${BOLD}[✓] Downloading CUDA repository package...${NC}"
460 | wget --progress=bar:force "$deb_url" -O "$deb_file" || {
461 | if ! command -v curl &> /dev/null; then
462 | apt-get install -y curl >/dev/null 2>&1
463 | fi
464 | curl -L "$deb_url" -o "$deb_file" --progress-bar
465 | }
466 |
467 | if [ ! -f "$deb_file" ] || [ ! -s "$deb_file" ]; then
468 | echo -e "${RED}${BOLD}[✗] Failed to download repository package${NC}"
469 | rm -f "$pin_file" "$deb_file"
470 | return 1
471 | fi
472 |
473 | echo -e "${CYAN}${BOLD}[✓] Installing CUDA repository package...${NC}"
474 | if ! dpkg -i "$deb_file"; then
475 | echo -e "${RED}${BOLD}[✗] Failed to install repository package${NC}"
476 | rm -f "$pin_file" "$deb_file"
477 | return 1
478 | fi
479 |
480 | if [ -f /var/cuda-repo-*/cuda-*-keyring.gpg ]; then
481 | cp /var/cuda-repo-*/cuda-*-keyring.gpg /usr/share/keyrings/
482 | fi
483 |
484 | echo -e "${CYAN}${BOLD}[✓] Updating package lists...${NC}"
485 | apt-get update -qq
486 |
487 | echo -e "${CYAN}${BOLD}[✓] Installing CUDA Toolkit...${NC}"
488 | apt-get install -y cuda || {
489 | echo -e "${RED}${BOLD}[✗] Failed to install CUDA${NC}"
490 | rm -f "$pin_file" "$deb_file"
491 | return 1
492 | }
493 |
494 | rm -f "$pin_file" "$deb_file"
495 | echo -e "${GREEN}${BOLD}[✓] CUDA installed via local package method!${NC}"
496 | return 0
497 | }
498 |
499 | verify_cuda_installation() {
500 | echo -e "\n${CYAN}${BOLD}[✓] Verifying CUDA installation...${NC}"
501 |
502 | if command -v nvcc &> /dev/null; then
503 | NVCC_VERSION=$(nvcc --version | grep -oP 'release \K[0-9.]+' | head -1)
504 | echo -e "${GREEN}${BOLD}[✓] NVCC compiler detected (version $NVCC_VERSION)${NC}"
505 |
506 | if [ "$IS_RENTED_SERVER" = true ] || [ "$IS_WSL" = true ]; then
507 | echo -e "${YELLOW}${BOLD}[!] Skipping CUDA test on rented/WSL environment${NC}"
508 | return 0
509 | fi
510 |
511 | if [ "$GPU_AVAILABLE" = true ] && command -v nvidia-smi &> /dev/null; then
512 | TEMP_DIR=$(mktemp -d)
513 | cd "$TEMP_DIR"
514 |
515 | echo -e "${CYAN}${BOLD}[✓] Running a simple CUDA test...${NC}"
516 | cat > cuda_test.cu << 'EOL'
517 | #include <cstdio>
518 |
519 | __global__ void testKernel() {
520 | printf("GPU kernel executed successfully!\n");
521 | }
522 |
523 | int main() {
524 | printf("Testing CUDA setup...\n");
525 | testKernel<<<1, 1>>>();
526 | cudaDeviceSynchronize();
527 | printf("CUDA test complete!\n");
528 | return 0;
529 | }
530 | EOL
531 |
532 | if nvcc cuda_test.cu -o cuda_test &>/dev/null; then
533 | echo -e "${GREEN}${BOLD}[✓] CUDA test compiled successfully${NC}"
534 | if ./cuda_test 2>/dev/null; then
535 | echo -e "${GREEN}${BOLD}[✓] CUDA test executed successfully${NC}"
536 | else
537 | echo -e "${YELLOW}${BOLD}[!] CUDA test execution failed, but compilation was successful${NC}"
538 | fi
539 | else
540 | echo -e "${YELLOW}${BOLD}[!] CUDA test compilation failed${NC}"
541 | fi
542 |
543 | rm -rf "$TEMP_DIR"
544 | fi
545 | else
546 | echo -e "${YELLOW}${BOLD}[!] NVCC compiler not detected in PATH${NC}"
547 | return 1
548 | fi
549 |
550 | return 0
551 | }
552 |
553 | check_cuda_installation() {
554 | echo -e "\n${CYAN}${BOLD}[✓] Checking CUDA installation status...${NC}"
555 |
556 | detect_environment
557 | detect_gpu
558 | detect_cuda
559 |
560 | if [ "$CUDA_INSTALLED" = true ] && command -v nvcc &> /dev/null; then
561 | echo -e "${GREEN}${BOLD}[✓] CUDA is properly installed and available${NC}"
562 | if ! check_cuda_path; then
563 | :
564 | fi
565 | CPU_ONLY="false"
566 | elif [ "$GPU_AVAILABLE" = true ]; then
567 | echo -e "${YELLOW}${BOLD}[!] NVIDIA GPU detected but CUDA environment not fully configured${NC}"
568 | echo -e "${CYAN}${BOLD}[✓] Installing and configuring CUDA automatically...${NC}"
569 | install_cuda_toolkit
570 | else
571 | echo -e "${YELLOW}${BOLD}[!] No NVIDIA GPU detected - using CPU-only mode${NC}"
572 | CPU_ONLY="true"
573 | fi
574 |
575 | if [ "$CPU_ONLY" = "true" ]; then
576 | echo -e "\n${YELLOW}${BOLD}[✓] Running in CPU-only mode${NC}"
577 | else
578 | echo -e "\n${GREEN}${BOLD}[✓] Running with GPU acceleration${NC}"
579 |
580 | if command -v nvidia-smi &> /dev/null; then
581 | echo -e "${CYAN}${BOLD}[✓] GPU information:${NC}"
582 | nvidia-smi --query-gpu=name,driver_version,temperature.gpu,utilization.gpu --format=csv,noheader
583 | fi
584 | fi
585 |
586 | export CPU_ONLY
587 | return 0
588 | }
589 |
590 | check_cuda_installation
591 |
--------------------------------------------------------------------------------