├── README.md ├── Scripts └── Dell-Server-Fan-Control.py └── Wiki ├── 2x-P40-Benchmarks.md ├── Budget-AI-Workstation-Build.md ├── R730-Build-Sound-Warnnings.md └── TPS-Chart.md /README.md: -------------------------------------------------------------------------------- 1 | # Magic-AI-Wiki -------------------------------------------------------------------------------- /Scripts/Dell-Server-Fan-Control.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | import re 4 | import os 5 | 6 | # Set the IPMI command interface credentials and IP 7 | IPMI_IP = "10.26.26.176" 8 | IPMI_USER = "username" 9 | IPMI_PASS = "password" 10 | 11 | # Function to set fan speed via IPMI 12 | def set_fan_speed(speed): 13 | hex_speed = format(min(max(int(speed), 10), 100), 'x') 14 | command = f"ipmitool -I lanplus -H {IPMI_IP} -U {IPMI_USER} -P {IPMI_PASS} raw 0x30 0x30 0x02 0xff 0x{hex_speed}" 15 | subprocess.run(command, shell=True) 16 | 17 | # Function to get GPU temperatures using nvidia-smi 18 | def get_gpu_temps(): 19 | nvidia_smi_cmd = 'nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader' 20 | try: 21 | output = subprocess.check_output(nvidia_smi_cmd, shell=True) 22 | temps = [int(temp) for temp in output.decode().strip().split('\n')] 23 | return temps 24 | except subprocess.CalledProcessError as e: 25 | print("Failed to get GPU temperatures", e.output) 26 | return [] 27 | 28 | # Function to get CPU temperatures using the updated ipmitool command 29 | def get_cpu_temps(): 30 | command = f"ipmitool -I lanplus -H {IPMI_IP} -U {IPMI_USER} -P {IPMI_PASS} sdr type temperature" 31 | try: 32 | output = subprocess.check_output(command, shell=True) 33 | # Extracting only the highest temperature value 34 | temp_line = subprocess.check_output("awk -F'|' '{print $1, $5}' | sed 's/ degrees C//g' | sort -nk2 | tail -1", input=output, shell=True) 35 | temp_value = int(re.search(r'\d+', temp_line.decode().strip()).group()) 36 | return [temp_value] # Return a list with a single value for consistency with the GPU temps function 37 | except subprocess.CalledProcessError as e: 38 | print("Failed to get CPU temperatures", e.output) 39 | return [] 40 | 41 | # Function to get fan speed via IPMI 42 | def get_fan_speed(): 43 | # Adjust these names based on your system after checking with 'ipmitool sensor' 44 | fan_sensors = ['Fan1', 'Fan2', 'Fan3'] # Example sensor names 45 | fan_speeds = [] 46 | for sensor in fan_sensors: 47 | command = f"ipmitool -I lanplus -H {IPMI_IP} -U {IPMI_USER} -P {IPMI_PASS} sensor get '{sensor}'" 48 | try: 49 | output = subprocess.check_output(command, shell=True) 50 | fan_speed = re.search(r'Sensor Reading\s+:\s+(\d+(\.\d+)?)(\s+RPM)?', output.decode()) 51 | if fan_speed: 52 | fan_speeds.append(fan_speed.group(1) + " RPM") 53 | except subprocess.CalledProcessError as e: 54 | print(f"Failed to get {sensor} speed", e.output) 55 | fan_speeds.append("Unknown") 56 | return fan_speeds 57 | 58 | # Function to determine fan speed based on temperatures 59 | def determine_fan_speed(gpu_temps, cpu_temps): 60 | temps = gpu_temps + cpu_temps 61 | fan_speed = 65 # Default fan speed 62 | if any(temp > 75 for temp in temps): 63 | fan_speed = 100 64 | elif any(temp > 73 for temp in temps): 65 | fan_speed = 90 66 | elif any(temp > 70 for temp in temps): 67 | fan_speed = 80 68 | elif any(temp > 67 for temp in temps): 69 | fan_speed = 75 70 | elif any(temp > 65 for temp in temps): 71 | fan_speed = 70 72 | elif any(temp > 63 for temp in temps): 
73 | fan_speed = 65 74 | elif any(temp > 60 for temp in temps): 75 | fan_speed = 60 76 | elif any(temp > 57 for temp in temps): 77 | fan_speed = 55 78 | elif any(temp > 55 for temp in temps): 79 | fan_speed = 50 80 | elif any(temp > 53 for temp in temps): 81 | fan_speed = 45 82 | elif any(temp > 50 for temp in temps): 83 | fan_speed = 40 84 | elif any(temp > 48 for temp in temps): 85 | fan_speed = 35 86 | elif any(temp > 45 for temp in temps): 87 | fan_speed = 30 88 | elif any(temp > 43 for temp in temps): 89 | fan_speed = 25 90 | elif any(temp > 40 for temp in temps): 91 | fan_speed = 20 92 | elif any(temp > 38 for temp in temps): 93 | fan_speed = 18 94 | elif any(temp > 35 for temp in temps): 95 | fan_speed = 15 96 | elif any(temp > 33 for temp in temps): 97 | fan_speed = 13 98 | elif any(temp > 5 for temp in temps): 99 | fan_speed = 10 100 | return fan_speed 101 | 102 | def control_fan(): 103 | try: 104 | gpu_temps = get_gpu_temps() 105 | cpu_temps = get_cpu_temps() 106 | fan_speed = determine_fan_speed(gpu_temps, cpu_temps) 107 | set_fan_speed(fan_speed) 108 | current_fan_speeds = get_fan_speed() 109 | print(f"GPU Temps: {gpu_temps}, Highest CPU Temps: {cpu_temps}, Fan Speed Set To: {fan_speed}%, Current Fan Speeds: {current_fan_speeds}") 110 | except Exception as e: 111 | print(f"An error occurred: {e}") 112 | 113 | 114 | # Call the control function every 3 seconds 115 | # Main loop with try-except 116 | while True: 117 | try: 118 | control_fan() 119 | time.sleep(3) 120 | except Exception as e: 121 | print(f"Fatal error in main loop: {e}") 122 | time.sleep(10) # Wait a bit before trying again, to prevent rapid failure loop 123 | -------------------------------------------------------------------------------- /Wiki/2x-P40-Benchmarks.md: -------------------------------------------------------------------------------- 1 | # Llama 2 - Nous Hermes GGUF 2 | This is using Llama 2 Nous Hermes GGUF with the llama.cpp loader. **For the 70b test scenarios, I used the nous-hermes-llama2-70b.Q4_K_M.gguf version!** If I just auto downloaded the 70b version from [here](https://huggingface.co/TheBloke/Nous-Hermes-Llama2-70B-GGUF), it would try to auto load the Q5_0 version, which would be too close to the VRAM limit and cause errors and crashes. For the 13b and 7b models, I just let llama.cpp auto load whichever version it chose since there was plenty of VRAM for those. Additionally, I maxed the GPU layers setting and split the tensors 24,24 between the 2X P40's. 3 | 4 | ### Scenario Types 5 | **First** - This means I ran the prompt for the first time. This will also incur a significantly higher evaluation time, as the LLM will need to evaluate the entire prompt from scratch. 6 | 7 | **Cache** - This means I ran the prompt with no changes again in instruct mode. This will cache the prompt evaluation that was done previously, which takes the prompt evaluation out of the calculations. This will result in a faster TPS overall when the prompt is cached. 8 | 9 | ### Context Meaning 10 | The contexts of 250, 500, 1k, 2k, 3k, and 4k are not the exact numbers, but just the ballpark of what I was aiming for. Here are the corresponding numbers if you wish to see the details in complete accuracy. Also, when I give the context to actual token count, I'm putting that context into OpenAI's tokenizer to get the actual token count from it. 11 | 12 | **Empty** - Means I had nothing in the context at all. This'll cause the fastest generation.
13 | 14 | **250** - is exactly 250 in context @ 219 tokens 15 | 16 | **500** - is 496 context @ 440 tokens 17 | 18 | **1k** - is 1009 context @ 886 tokens 19 | 20 | **2k** - is 2031 context @ 1734 tokens 21 | 22 | **3k** - is 3068 context @ 2620 tokens 23 | 24 | **4k** - is 4094 context @ 3498 tokens 25 | 26 | ## Benchmarks 27 | 28 | | Scenario | Model | Context | Tokens/s | Sample TPS | Prompt Eval TPS | Eval TPS | 29 | |----------|-------|---------|----------|------------|-----------------|----------| 30 | | First | 7b | Empty | 32.68 | 1328.72 |-----------------| 46.01 | 31 | | Cache | 7b | Empty | 33.01 | 1353.31 |-----------------| 45.86 | 32 | | First | 7b | 250 | 21.65 | 1105.18 | 704.79 | 42.95 | 33 | | Cache | 7b | 250 | 25.67 | 1460.93 |-----------------| 43.14 | 34 | | First | 7b | 500 | 22.51 | 1042.15 | 685.93 | 40.73 | 35 | | Cache | 7b | 500 | 28.51 | 1116.50 |-----------------| 40.93 | 36 | | First | 7b | 1k | 24.73 | 1601.33 | 690.03 | 36.82 | 37 | | Cache | 7b | 1k | 25.52 | 1295.59 |-----------------| 35.32 | 38 | | First | 7b | 2k | 20.33 | 1370.96 | 653.66 | 30.75 | 39 | | Cache | 7b | 2k | 23.37 | 1114.11 |-----------------| 30.34 | 40 | | First | 7b | 3k | 14.69 | 1774.00 | 528.50 | 27.01 | 41 | | Cache | 7b | 3k | 20.70 | 1173.04 |-----------------| 26.13 | 42 | | First | 7b | 4k | 10.25 | 1389.56 | 650.51 | 23.65 | 43 | | Cache | 7b | 4k | 18.83 | 1465.68 |-----------------| 23.55 | 44 | | First | 13b | Empty | 18.28 | 978.74 |-----------------| 30.67 | 45 | | Cache | 13b | Empty | 21.23 | 1086.48 |-----------------| 30.57 | 46 | | First | 13b | 250 | 16.58 | 941.63 | 437.06 | 29.17 | 47 | | Cache | 13b | 250 | 17.73 | 840.84 |-----------------| 28.47 | 48 | | First | 13b | 500 | 15.51 | 1257.55 | 431.48 | 27.53 | 49 | | Cache | 13b | 500 | 18.97 | 934.64 |-----------------| 27.04 | 50 | | First | 13b | 1k | 14.22 | 1710.57 | 428.55 | 25.07 | 51 | | Cache | 13b | 1k | 17.84 | 1194.65 |-----------------| 24.37 | 52 | | First | 13b | 2k | 9.99 | 1718.57 | 398.84 | 20.65 | 53 | | Cache | 13b | 2k | 15.19 | 945.12 |-----------------| 20.38 | 54 | | First | 13b | 3k | 8.72 | 1777.84 | 332.48 | 17.94 | 55 | | Cache | 13b | 3k | 13.15 | 1003.43 |-----------------| 17.4 | 56 | | First | 13b | 4k | 4.49 | 1248.8 | 393.6 | 15.71 | 57 | | Cache | 13b | 4k | 11.06 | 1195.7 |-----------------| 15.61 | 58 | | First | 70b | Empty | 7.08 | 1267.90 |-----------------| 8.08 | 59 | | Cache | 70b | Empty | 7.41 | 1456.45 |-----------------| 8.14 | 60 | | First | 70b | 250 | 5.85 | 1070.11 | 111.67 | 7.83 | 61 | | Cache | 70b | 250 | 6.66 | 1133.77 |-----------------| 7.83 | 62 | | First | 70b | 500 | 5.04 | 1576.71 | 112.47 | 7.64 | 63 | | Cache | 70b | 500 | 6.68 | 1564.42 |-----------------| 7.63 | 64 | | First | 70b | 1k | 4.42 | 1213.99 | 105.24 | 7.05 | 65 | | Cache | 70b | 1k | 6.24 | 1629.41 |-----------------| 7.13 | 66 | | First | 70b | 2k | 4.00 | 1614.98 | 117.06 | 6.28 | 67 | | Cache | 70b | 2k | 5.63 | 1428.57 |-----------------| 6.31 | 68 | | First | 70b | 3k | 4.03 | 1498.1 | 107.83 | 5.60 | 69 | | Cache | 70b | 3k | 5.04 | 1331.67 |-----------------| 5.61 | 70 | | First | 70b | 4k | 2.85 | 1596.34 | 108.05 | 5.71 | 71 | | Cache | 70b | 4k | 5.40 | 1764.06 |-----------------| 5.65 | 72 | -------------------------------------------------------------------------------- /Wiki/Budget-AI-Workstation-Build.md: -------------------------------------------------------------------------------- 1 | **PLEASE READ THIS** 2 | 3 | After much feedback (which is crazy appreciated), I'm 
re-doing the benchmarks. I'm now reconsidering my opinion on the Tesla P40 GPU's. I may update the guide to suggest the same build, as I really like this build, but using 3090's, which would bump the cost up to roughly $2,142 versus the $1,092 cost. But I have multiple tests to perform before I give a final verdict. I'll release all my benchmarks and findings as well so you can make your own educated decisions too! And maybe the P40 performs as well as you need it to, but the goal of this build was to create a powerful mid range AI server on the cheap. I'll be keeping everyone updated as I move forward. I'll also be adding 3090's to this server as I have multiple. This way I can have direct comparisons for benchmarks. 4 | 5 | **Also please be aware I'm running into issues with the NVME speed, so you may want to consider a cheaper NVME!** The NVME I chose should in theory be utilized to its near-full performance capabilities within this server. But I'm starting to run into some oddities, and I think it's due to the fact I'm testing all this in a VM hosted in Proxmox. So, I need to make sure it's being utilized fully without that overhead, but it may honestly be just as good to get the 3,500 MB/s NVME drives. That would reduce the price, and you can get the multi slot adapters too if you so chose. I believe this is just an overhead issue with the hypervisor, but I wanted to be clear that my testing is still heavily in progress. 6 | 7 | # My AI Server build guide 8 | This is not a guide to build the world's fastest AI machine. Nor is this a guide to build a mid level AI machine. This is a guide to build a budget AI workstation/server with enough VRAM to play ball with the big boys and achieve speeds that are at least acceptable for most applicational use cases. I personally built the server in this guide and compared it to my main AI workstation with ridiculously faster parts and 2X 3090's versus 2X Tesla P40's. So, I'm accustomed to much faster hardware, and the goal of this build was not to build some crazy high level machine, but instead to achieve very similar results to my much nicer machine at a much lower cost. The goal was to build an AI workstation on a budget, while still meeting what I consider the minimum requirements for the AI speeds to be at least usable. 9 | 10 | ## Redundancy Server Version With Cache Pool (Multi-purpose AI machine) 11 | For those who want to follow my identical build: I went down the route of having 6X SAS drives in ZFS RAID-Z2, which is effectively RAID 6, meaning 2 drives can fail before data loss or system failure. I simply wanted this redundancy on my end to do nightly backups of my AI VM, since the AI VM will be located on a single NVME SSD with no redundancy. But you do not have to follow my exact configuration, especially if you do not care about redundancy or data loss. Also note that the 2X SATA SSD's that I bought are only necessary for those using the SAS drives for other VM's outside of AI work, as I'm using the SSD's as the cache pool to speed up VM's; if you're building a machine specifically for AI work, this is unnecessary. I also chose 2X SATA SSD's to run the cache pool effectively as a RAID 0. Which I know is bad when I'm talking all high and mighty about redundancy lol. But this is my personal server build dedicated majorly to AI while also being used for various other purposes, plus I wanted the redundancy for the VM backup.
Also note that I've got a ton of cores on my machine and this isn't necessary for most people. Again, this build specifically was for AI, but I'm using this for various other tasks as well. If you need to use this server for AI but you can also use the server for other use cases, then you can follow this build. Otherwise, you can skip below to more cost effective versions you can buy if your machine truly is dedicated for only AI work. **This truly is a more fancy build only needed if you have multiple use cases for the server, want to utilize the resources effectively with Proxmox, but also requires a lot more technical know how to setup the PCIE passthrough and utilize all the resources**. 12 | 13 | | Item | Cost/Unit | Quantity | Total | Link | 14 | |------|----------|---------|------|------| 15 | | Nvidia Tesla P40 GPU | $175 | 2 | $350 | [Link](https://www.ebay.com/itm/204488727042) | 16 | | P40 power adapters | $15 | 2 | $30 | [Link](https://www.amazon.com/dp/B08N4BJL2J?psc=1&ref=ppx_yo2ov_dt_b_product_details) | 17 | | Dell PowerEdge R730 (128GB RAM, 2x E5-2690v4 2.6GHz =28 Cores, 16 bay) | $408 | 1 | $408 | [Link](https://www.ebay.com/itm/115819389907) | 18 | | 1100W Dell PSU | $21 | 2 | $42 | [Link](https://www.ebay.com/itm/134094913593?hash=item1f38ae0039:g:Kq0AAOSw2pBkPFHA&amdata=enc%3AAQAIAAAA8PrhTu%2BzhLKX0jsnp5Dqlh3DZjYc2cSClmmPJwTp5SzKy644q3A4w3%2BlLE0Gt%2Fyg9ybmNWXliWCiSnbNUIf%2FLjP%2F3eEuKZeDZJ%2BuRvwvosOgx4UkQldeWBLiDUaHlYxEMxm246bkNIr79UKBzvlMikro77OyAJRRQjWHSuI8NR%2FIP0JUl%2BzRxpqhXqdZwM6ZHGgbNV%2Bf8vTRTxbeTzhTYFnBcs1FKxuRHnD%2FhEWbjWDzjZm5wiNYH7qv5OPpxpn0lW0h50UTPKPzuSrgrhHVmBOHi%2F4SBptGQDTgG3g0Uyal9eiP8ftJsTyBGiQbVw026g%3D%3D%7Ctkp%3ABFBMxKX82_Ri) | 19 | | Samsung 870 Evo 500 GB SSD | $48 | 2 | $96 | [Link](https://www.amazon.com/dp/B08QBMD6P4?psc=1&ref=ppx_yo2ov_dt_b_product_details) | 20 | | Dell 1.2TB 10k RPM HDD | $62 | 6 | $372 | [Link](https://www.amazon.com/dp/B01LXGGRWA?psc=1&ref=ppx_yo2ov_dt_b_product_details) | 21 | | R730 Riser 3 GPU Addition | $15 | 1 | $15 | [Link](https://www.ebay.com/itm/185915362826) | 22 | | Drive Caddies for SSD's | $30 | 1 | $30 | [Link](https://www.amazon.com/dp/B0837SWX8Y?psc=1&ref=ppx_yo2ov_dt_b_product_details) | 23 | | NVME SSD 4TB 7.3k MB/s | $189 | 1 | $189 | [Link](https://www.amazon.com/gp/product/B0C91RNCDV/ref=ppx_yo_dt_b_asin_title_o00_s00?ie=UTF8&psc=1) | 24 | | NVME PCIE Addition Card | $18 | 1 | $18 | [Link](https://www.amazon.com/dp/B084GDY2PW?psc=1&ref=ppx_yo2ov_dt_b_product_details) | 25 | | **Total Cost** | $1,550 | ------ | ------ | ------ | 26 | 27 | Update 6/13/2024 28 | - Thank you to SourceWebMD on Reddit for helping bring to my attention that the 1600W PSU previously on the guide was not compatible with the R730 server. Even though the manufacturer site claims to be, it does seem to be lying. Please use the 1100W PSU for the build. 29 | 30 | ## AI Dedicated Build 31 | **This is the build I'd suggest to the majority of users who want to balance budget, performance, and have a server fully focused to be an AI workstation.** Also being honest as well, **the following build is also ridiculously easier to setup and cheaper and more performant**. 
32 | 33 | | Item | Cost/Unit | Quantity | Total | Link | 34 | |------|----------|---------|------|------| 35 | | Nvidia Tesla P40 GPU | $175 | 2 | $350 | [Link](https://www.ebay.com/itm/204488727042) | 36 | | P40 power adapters | $15 | 2 | $30 | [Link](https://www.amazon.com/dp/B08N4BJL2J?psc=1&ref=ppx_yo2ov_dt_b_product_details) | 37 | | Dell PowerEdge R730 (64GB RAM, 2x E5-2667v4 3.2GHz = 16 Cores, 8 bay) | $364 | 1 | $364 | [Link](https://www.ebay.com/itm/115819389907) | 38 | | 1100W Dell PSU | $21 | 2 | $42 | [Link](https://www.ebay.com/itm/134094913593?hash=item1f38ae0039:g:Kq0AAOSw2pBkPFHA&amdata=enc%3AAQAIAAAA8PrhTu%2BzhLKX0jsnp5Dqlh3DZjYc2cSClmmPJwTp5SzKy644q3A4w3%2BlLE0Gt%2Fyg9ybmNWXliWCiSnbNUIf%2FLjP%2F3eEuKZeDZJ%2BuRvwvosOgx4UkQldeWBLiDUaHlYxEMxm246bkNIr79UKBzvlMikro77OyAJRRQjWHSuI8NR%2FIP0JUl%2BzRxpqhXqdZwM6ZHGgbNV%2Bf8vTRTxbeTzhTYFnBcs1FKxuRHnD%2FhEWbjWDzjZm5wiNYH7qv5OPpxpn0lW0h50UTPKPzuSrgrhHVmBOHi%2F4SBptGQDTgG3g0Uyal9eiP8ftJsTyBGiQbVw026g%3D%3D%7Ctkp%3ABFBMxKX82_Ri) | 39 | | Any Cheap SSD's | $27 | 2 | $54 | [Link](https://www.amazon.com/TEAMGROUP-T-Force-Vulcan-Internal-T253TZ001T0C101/dp/B0B6ZC5MS3/ref=sr_1_18?crid=99V4I5GLMCKE&keywords=1TB%2BSSD&qid=1699291793&sprefix=1tb%2Bssd%2Caps%2C67&sr=8-18&th=1) | 40 | | R730 Riser 3 GPU Addition | $15 | 1 | $15 | [Link](https://www.ebay.com/itm/185915362826) | 41 | | Drive Caddies for SSD's | $30 | 1 | $30 | [Link](https://www.amazon.com/dp/B0837SWX8Y?psc=1&ref=ppx_yo2ov_dt_b_product_details) | 42 | | NVME SSD 4TB 7.3k MB/s | $189 | 1 | $189 | [Link](https://www.amazon.com/gp/product/B0C91RNCDV/ref=ppx_yo_dt_b_asin_title_o00_s00?ie=UTF8&psc=1) | 43 | | NVME PCIE Addition Card | $18 | 1 | $18 | [Link](https://www.amazon.com/dp/B084GDY2PW?psc=1&ref=ppx_yo2ov_dt_b_product_details) | 44 | | **Total Cost** | $1,092 | ------ | ------ | ------ | 45 | 46 | **The 2X SSD's are not necessary, but I suggest you consider RAID 1 redundancy with 2X drives for your OS**. I'm a man of redundancy and I'd suggest just getting 2X SSD's and then in the R730 BIOS set them up in a RAID 1 setup so that if one of the drives dies, then you can order a new SSD as a replacement without losing your data on the OS. You can say the same thing for the NVME SSD', but I didn't care too much as worst case I'd just have to redownload the models. In my opinion, it's more annoying resintalling the OS and setting up everything again. Also, as you can see, I've removed the SAS drives as in this build, if we're focused on AI, we don't need the crazy redundancy like what I built previously nor do we need a ton of space for the OS. I found 512GB SSD's, but you don't even need it that big. 47 | 48 | I suggest you get SSD's simply for the OS in a RAID 1 hardware setup. Now why do I also suggest an NVME SSD? Because there's a lot of reports, forumns, and articles talking about the struggles of getting an NVME SSD to work properly on boot for your OS on an R730 or related servers. I personally think it's worth avoiding the headache as I did and just drop the $54 or less on drives to run the OS to avoid any and all issues around this subject as I don't think it's worth the headache. And then when we get the OS setup, simply target your installations to the NVME SSD when it comes to the AI models. Because as long as your local AI work is on the NVME SSD, it doesn't matter what your OS is on. I just suggest the SSD's for the OS because it's cheap any why not? 
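To make "target the NVME" concrete, here's a minimal sketch of pulling a GGUF model straight onto the NVMe instead of the OS drives. The `/mnt/nvme/models` mount point is an assumption for illustration; swap in wherever you actually mounted the NVMe and whichever repo/file you want (the one shown is the 70b file from the benchmarks page).

```python
# Hypothetical example: keep the big model files on the NVMe, not the OS SSD's.
# Assumes the NVMe is already formatted and mounted at /mnt/nvme (adjust to your setup).
import os
from huggingface_hub import hf_hub_download  # pip install huggingface_hub

NVME_MODELS_DIR = "/mnt/nvme/models"  # assumed NVMe mount point + folder for models
os.makedirs(NVME_MODELS_DIR, exist_ok=True)

# Download one specific GGUF file directly onto the NVMe
model_path = hf_hub_download(
    repo_id="TheBloke/Nous-Hermes-Llama2-70B-GGUF",
    filename="nous-hermes-llama2-70b.Q4_K_M.gguf",
    local_dir=NVME_MODELS_DIR,
)
print(f"Model stored at: {model_path}")
```

Point whatever loader you use (text generation web UI, llama.cpp, etc.) at that same folder and the OS drives never have to hold the large model files.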
49 | 50 | Please note **this build uses a different server, with a different price and link, than the one I provided above!** This server has 16 cores, fewer 2.5 inch bays, and less RAM. The server I built is meant for various other tasks outside of AI as well, but if we're building only for AI, then I've linked a server with 2X E5-2667v4 3.2GHz CPU's, which have a faster single thread speed. That isn't necessary, but it is a really nice luxury. Also, as a general rule, you won't really need more than a 50% RAM to VRAM ratio for your AI work. That rule of thumb is super generalized and not hardcore, but I follow it roughly. For example, the goal of both builds is to be able to load at least the LLama 2 70b model in 4 bit between the 2X GPU's. 64GB is above what's needed for that task, as that model configuration uses roughly 35 GB of VRAM. Even if we maxed the 48 GB of VRAM available, we'd still have 33% more RAM than VRAM, which is good enough for me. Plus the server I linked has 8 bays, as we don't need the 16 bay or higher version for what we're trying to accomplish, so this brings the cost down for the server while also getting better CPU speed in exchange! Having tons of cores isn't really necessary for AI builds, nor will the AI work really utilize them. As long as you have "enough", and 16 cores is overkill/plenty. 51 | 52 | The **PSU's are different as well** because with my suggested server here, accounting for fewer bays and less RAM, you can get away with a 1,100W PSU instead of the originally suggested 1,600W PSU. I was suggesting the higher watt PSU if you have multiple purposes and needed room for really filling out the server. I'm also accounting for at least 20% headroom on the PSU capacity, as it's not generally suggested to run a PSU near its limit, since that's bad for the PSU's health. Also remember you need 2 PSU's for this server, as it's a redundant setup where you put 2 PSU's in it. Anyways, the 1,100W PSU definitely helps bring the cost down. Also note that the NewServerLife servers I bought said they'd send a 750W PSU, which isn't really sufficient for what we'll be doing, but when I told them about my goals, they sent me the 1,100W PSU. I don't know if it was purposeful because they were helping me out or an accident. Do not bank on that occurring for you, but the NewServerLife sellers were super helpful with my questions and I think they were just helping me out, which was really cool of them. 53 | 54 | Lastly on this build: if you're wanting to play with models like the Falcon 40B or other similar models, you may want to opt for 128 GB of RAM total. But it's up to you on what you'll need, and you can always upgrade to more RAM later. 55 | 56 | # OS & Setup 57 | For the multi purpose and more expensive AI server build that I personally built, I would very much suggest you use Proxmox. Proxmox gave me really powerful capabilities to do PCIE passthrough, host containers, host VM's, and overall gave me the flexibility I needed for my use cases. But on the VM itself that I am doing the AI work on, I am using Ubuntu Desktop OS. I tried Windows 10, as I'm a Windows guy personally, but I had tons of issues. Windows really struggled within the VM, even with the right settings, drivers, everything. The biggest struggles were getting the OS to perform at any optimal speed and getting it to recognize a great deal of drivers or hardware within the system. When I tried Ubuntu, all my issues honestly went away.
I'll try to write up a totally different guide on setting up a Proxmox VM for AI use, but **this method is the more advanced method and I do not suggest anyone attempt it if they're not truly tech savvy and willing to put in a lot of elbow grease.** Though if you do set it up this way, you can do some cool things. For example, the LLAMA 2 70B at 4bit uses roughly only 35 GB of VRAM, leaving you with 13 GB of VRAM left over. Though I had a ton of issues getting vGPU working and eventually gave up. If you wanted to push through that process, you could create one VM with 35-36 GB of VRAM dedicated for the LLM and then have a spare 13 GB of VRAM for a Stable Diffusion instance or an additional small LLM running on a separate container/VM. Which is pretty cool. But if you wanted to know, for the Ubuntu VM doing the AI workstation work, I dedicated 4 cores and 2 sockets (effectively 8 cores) total. This so far has been more than sufficient for my needs. Which also means you don't have to follow my guide exactly, not even for the more budget version, if you want to save like $70 by getting a smaller and slower CPU. I personally just opted for the faster single thread speed because I really didn't want to find out the hard way whether it'd matter to me. So, if you wanna find out for me, please be my guest and let me know! 58 | 59 | As for the **main suggested OS setup I'd recommend**, and what I would suggest for the AI specific cheaper build I provided, it would be to just install Ubuntu as the base OS. You could use Windows if you wanted. My main AI workstation, which is much more expensive, is Windows based, but it's totally different parts. I can't express how much easier it was to set everything up on Ubuntu versus using Windows on a machine like this. But either way, whether it's Ubuntu or Windows, you'll also get vastly higher performance and quality since you're not hosting a VM with overhead or anything. The CPU will be faster, you won't have to set up PCIE passthrough, and much more. Honestly, this is the cheaper, faster, and better build in my opinion. Then, once your OS is installed, it's so much freaking easier to just get rolling, but don't forget when you install your AI programs and LLM models to target your NVME drive. 60 | 61 | # Why buy what I buy? 62 | 63 | ## Why is the NVME Drive so important? 64 | On my main workstation with 2X 3090's and tons of other much faster parts than what I've suggested here, I have an NVME SSD that runs at roughly 3.5k MB/s, which is 1/2 the speed of the NVME I suggested above. And at that speed, when loading the LLAMA 2 70B at 4 bit, it takes roughly 7 minutes for me to load the model from my disk to memory. Which is acceptable, and if that's acceptable to you, get a slower NVME for cheaper. You also don't need a 4TB drive, though I would suggest at least a 2TB, and you'd be surprised how fast you'll fill these drives up as you begin downloading the larger models. Anyways, I disrespected how important the NVME SSD was for loading the LLM into the VRAM when I built my server originally. Originally I thought the 10k RPM SAS drives with 2X Samsung 840 SATA SSD's in effectively RAID 0 as the cache pool would be more than sufficient to load the larger models from disk to VRAM. I was horribly wrong. **The SAS drive setup I built took 4.6 hours to load the model from disk to VRAM!** And I tested this 3 separate times, and each time it took roughly 4.6 hours to load. This is obviously unacceptable in terms of how long it takes to load the model.
This test was using Oobabooga's text generation web UI. I've gotten much faster results using OpenLLM to load the model, but it's still an unacceptable time to load a model. Now, I'm unsure how much faster it'd be if you only used SATA SSD's, but honestly, NVME SSD's are so cheap now that you really should just get the NVME SSD. I also suggested the 7k+ MB/s NVME SSD because with the 2X Tesla P40's, we'll be utilizing both the available PCIE 3.0 X16 slots, leaving us with only PCIE 3.0 X4 and X8 slots. You'll want to put the NVME SSD that has more than 3.5k MB/s speed in the X8 slots, which we should have plenty of, as that's right under the theoretical max speed the PCIE 3.0 X8 slot can provide. You don't have to get the 7k+ speed NVME, but as I said previously, I have the 3.5k MB/s NVME in my main machine and it takes roughly 7 minutes to load the model, so I personally wanted to cut that time in 1/2, and I thought the money was worth it. But even at 3.5k speed, it's a dramatic difference versus the max SATA SSD speed of 540 MB/s, so save yourself a ton of time and get the NVME's. Do not disrespect the NVME speeds like I did, as it caused me to waste time and money thinking SAS drives would be sufficient. Maybe I also did something wrong, but screw it, I'm just going to stick with the NVME's from now on for AI work. 65 | 66 | ## Why Buy a Specific NVME PCIE Addition Card? 67 | On many older servers, and servers in general, but especially these Dell PowerEdge servers, **you cannot just add any PCIE NVME adapter!** These servers are very limited in what PCIE NVME addition cards you can use. I've seen some people online use some older DELL specific NVME cards, but they weren't very fast from what I saw, plus they were kind of expensive. If you want an **NVME adapter card with 2X slots, I saw success with this card** ([Click Here](https://cloudninjas.com/products/supermicro-nvme-m-2-pcie-adapter-card)), but I didn't try that personally. But after you use the 2X P40's or 2X of any GPU's, you'll only have access to PCIE 3.0 X8 slots, so if I remember correctly you'll only have a max theoretical speed of roughly 7,800 MB/s that can be transmitted on a 3.0 X8 slot. So, if you want to get the adapter with 2X slots, then aim for the NVME's with 3,500 MB/s speed so that when they are both working, they don't have to fight for bandwidth. I personally opted for a single slot NVME PCIE adapter card because it was only $16 versus the $75 of a 2X slot. Plus, I only wanted a single NVME because I wanted to buy an NVME with 7,300 MB/s speed, so adding another NVME would mean I'd likely be fighting for bandwidth. And you'll have a couple more X8 slots available to you, so you can just add multiple adapters. But for AI work, I just wanted that much faster speed personally more than anything. 68 | 69 | But please be mindful that not all PCIE NVME adapter cards will work in the server! Also note that I've heard extremely mixed, mostly negative results from people using an NVME on these PCIE adapters and trying to get the server to boot up and recognize the NVME with an OS on it. So, just to reiterate, I've seen very poor success, if any, from individuals using an NVME on these servers as their bootup operating system drive. This is why I suggest filling the bay slots with SATA SSD's or SAS drives and then using the NVME once you are booted into the OS by simply targeting the NVME directory. 70 | 71 | Lastly, I'm sure there's plenty of other NVME PCIE adapters that'll work.
I personally plugged in 2 of my old NVME adapters I had laying around with no success. I found this $16 adapter via some forum I ran into online where people were sharing what adapters worked or didn't work. 72 | 73 | ## Why the Nvidia Tesla P40 GPU's? 74 | This entire build revolved around the Nvidia Tesla P40 GPU's. Personally, I think the best budget GPU's that exist for AI hands down goes to the Nvidia 3090's. The 3090's have 10k+ cuda cores and 24 GB of VRAM. The 3090's are monsters, but they're also not cheap. As of when I'm writing this, you can go on ebay and snatch a used one for probably around $700. To get 2X of them, that's $1,400 total for just the GPU's alone. So, they're fast with plenty of VRAM, but they're also not cheap. I've not tried to stick the 3090's into my R730 server yet, though I'm sure you can without issue. If you want to do things like fine tuning or certain training or you just want way faster TPS (Tokens per second), then consider the 3090's if they fit in the server. I'll try to remember to update this later and let others know if I could fit the 3090 in the R730 server or not. I'm sure it'd fit though. Anyways, the 24 GB of VRAM is one of the biggest components of the 3090 that makes it so there's enough VRAM to stick in the much larger models like the LLAMA 2 70b. 75 | 76 | But I went on a hunt for GPU's with 24 GB of VRAM on a budget. And there's a couple options you'll run into like the Tesla M40's. The Nvidia Tesla M40 is a cool card and all with 24GB of VRAM and you can find them on ebay for as cheap as $110! But, **I DO NOT SUGGEST M40 GPU's** as you'll very likely run into issues. The M40 GPU for AI LLM work is funny. I'm not fully aware of all the details, but there's various videos and articles out there talking about individuals buying the M40 for AI uses and regretting it. Not that it was terrible or anything, but many said they'd want to get the P40 instead of they could go back in time. There's honestly a lot of reasons as to why, but save yourself the headache and get the P40's. The Nvidia Tesla P40, though a bit pricer, is rocking a sexy 3840 cuda cores and 24 GB of VRAM. Though not as many cuda cores as the 3090, the P40 matches the VRAM which is necessary to load these larger models and the P40 costs only 25% of what the 3090 will run you. It's a huge cost savings! Also note that there are down sides to the P40 though. The Tesla P40 is not recognized by the R730 server. So the fans are going to blow like jet engines to keep things cool since it can't recognize the sensors. Using Ipmitools I created a script that'll properly sensor the server and I'll share that script as well. But do note that if you don't fix up the fan speed, then you'll want to put this server in a room nowhere near where you sleep or work, like probably your garage. But it's not that much effort to fix up the fan speed, especially if you use my script. Then it'll only ramp up the fan speed (which makes it loud) when being used. Additionally note that the Tesla P40's are not recognized on many different motherboards or machines. An R730/R720 will recognize the GPU's, but plugging this into your normal consumer grade motherboards will often times result in failure. Also note that these are datacenter GPU's, so there's no HDMI ports, which isn't necessary in an R730, but this is one of the reasons I chose to build out a Dell server for this work. 77 | 78 | But this build really does revolve around the Nvidia Tesla P40's. 
When building an AI rig, the GPU's are honestly the most important aspect. Making sure your rig can hold the models you wish to run in VRAM, while also being fast enough in tokens per second, is critical, and I personally believe the P40's are an amazing budget, yet extremely performant, card for the task! 79 | 80 | # Benchmarks (STILL REWORKING THE RESULTS) 81 | Please note I am benchmarking this on the multi-purpose server build with Proxmox, so I have overhead from the hypervisor hosting VM's, but it should be relatively similar to what you'd get if you built the more AI specific machine. 82 | 83 | ### Used OpenLLM & Oobabooga text generation web UI: 84 | **It has come to my attention that my benchmarks may be extremely low for reference.** The point stands with what was written, but I'm not using optimized settings at all. I've been informed of others using the same models as myself achieving 5.3X more TPS on their 3090's than what I've achieved. But I've been just using the standard default settings with zero optimization. More importantly, after some research, I simply learned I was using some really non-optimized Nous Hermes LLama 2 70b models that were behind the times. Apparently you can achieve vastly better results with other models. I can report back on faster speeds in the future when I get some new models downloaded and tested. But for now, it's still a good comparison since I provide direct comparisons of my benchmarks against my Nvidia 3090's. And after very early re-testing with different configurations, I've already doubled my speeds, so I'm still working on the benchmarks. 85 | 86 | **Nous Hermes LLAMA 2 70b 4bit** - 1.38 - 1.45 TPS 87 | 88 | **Nous Hermes LLAMA 2 13b** - 5.1 - 5.6 TPS 89 | 90 | **Nous Hermes LLAMA 2 7b** - 5.7 - 6.4 TPS 91 | 92 | Note I just wanted to mention that for the 70b model, the ~1.4 TPS I got doesn't seem like much, and it's not, but at the same time it's acceptable. At least in my opinion it is. When going with older GPU's like the P40, you can only expect but so much. Plus, in my opinion, ~1.4 TPS is acceptable. Not necessarily fast, but it's usable for many applicational use cases. We all want even faster TPS obviously, as the faster it is, the more we can do in a short time, and the cooler tasks we can accomplish. But when I ran the 70b 4bit model on my 2X 3090's, I was getting roughly 3.03-3.44 TPS. So the P40's obviously are not at the same speed the 3090's can achieve, but it's not like getting 3090's will 10X your speed. At the upward limits, the 3090 is still achieving almost 2.4X faster speeds than the P40, but let's be real here, neither the 3090's nor the P40's are actually that fast in the grand scheme of things. You're getting 2.4X the speed for spending 4X more money. The P40 really shines in this example for the performance you're getting per dollar you spend. And again, if you put the 3090's text gen side by side with the P40's TPS speed, there's absolutely a noticeable difference, but not enough in my eyes to say "3090's are the hardcore minimum for AI work", because I really don't believe that's true. The ~1.4 TPS on the P40's is totally acceptable for a multitude of applicational use cases. Especially when you start taking into account the smaller model speeds. The 13b scoring in the 5 TPS range is more than useful! And check out the 7b speed, it's even better. 93 | 94 | Honestly, if these speeds feel slow to you, then that's fine!
If it's not acceptable to you, then that simply means you have more hardcore requirements than I've needed personally. No shame in that, it just means you're extra cool, but it also means that if these are not acceptable speeds for you, then you'll definitely want to grab yourself at least the 3090. Possibly even consider the 4090 if you need to squeeze out all the speed you can. 95 | 96 | Also note, I didn't run the AI a crazy amount of times to get these benchmark results, but I just wanted to provide a decently good idea of the numbers you'll be able to achieve. 97 | 98 | ### Stable Diffusion 99 | I just used the default settings on the Automatic1111 stable diffusion web UI on the 512x512 setup with 20 steps. I then typed "Cat" in the prompt and made a batch of 10. On average I scored **~2.5 it/s on stable diffusion** with the P40 server setup. Which is honestly more than enough for me. I personally think it was more than acceptable for this use case. Using Tom's Hardware stable diffusion IPS (iterations per second) benchmarks, the Tesla P40 is pretty comparable to the RX 6700 10 GB. Once you get into 3090 GPU territory, it's obviously not even a competition, as the 3090 scores nearer to 14.3 IPS. But it does truly depend on your use cases. If you're generating tons of images all the time, then maybe it's not fast enough for you. But with the extra VRAM, you have headroom to experiment by running more stable diffusion instances at once, though I've never done that personally. 2.5 it/s isn't fast, but it's at a level where it's usable. Which in the end is the goal of this entire setup. 100 | 101 | # Fan Script 102 | As mentioned previously, the Nvidia Tesla P40 GPU's are not recognized by default on the R730 server. Due to this issue, the server will ramp the fan speeds up to an ungodly level. Unless you want to hear an F16 fighter jet going off in your home at all times, then listen up, because this server will not let you sleep or work if you do not resolve this issue. I've noticed that at idle a 20% fan speed is totally acceptable, though I set my idle at 30% personally. At 20%, the server is quiet enough to not blow your ears out. At 30% you'll hear it if you're in the same room for sure, but it's fine. Once you get to 40% to 80%, which I personally needed to stretch into based on my workloads, then the fans will need to be louder. You're more than welcome to just leave it at the blasting sound it'll default to with the P40's if you're sticking it in a garage and don't mind. Otherwise follow these general steps, as this is the resolution I've come to within Ubuntu, though you can modify my script to work in other environments like Windows as well. Maybe I'll write a Windows version in the future. 103 | 104 | 1.) Setup and connect to your R730 IDRAC IP address and setup your username and password. 105 | 106 | 2.) Within the IDRAC GUI on the web browser, I set the custom fan control minimum to 20%. 107 | 108 | 3.) Within the IDRAC GUI, you need to enable IPMI over LAN (which is what lets ipmitool talk to the iDRAC) and then reboot the server. 109 | 110 | 4.) SSH using something like PUTTY into IDRAC and enable 3rd party GPU settings and manual fan control. 111 | - I don't honestly remember the exact commands I used. Using Google and Chat GPT, I was able to easily get this setup. I'll try to go back and find out what I did to make a more comprehensive guide in the future. 112 | 113 | 5.)
Within your ubuntu OS, I created a python script where you can just copy what I did here by clicking [this link](https://github.com/magiccodingman/Magic-AI-Wiki/blob/main/Scripts/Dell-Server-Fan-Control.py). 114 | - Simply save that script and have it run on startup. This script may not be the best as I made it really quick and dirty, plus I'm not a python developer at all. So, I'm sure others could do significantly better than me. And maybe there's a better way than what I'm doing, but this was sufficent for my use cases as it'll control the fans properly and keep the GPU's and CPU's nice and cool under load while making it quiet with lower RPM fan speed when idle. 115 | 116 | 6.) Then I simply ran the script and have it running automatically at startup, but I initiated it with, "sudo python3 fan_control.py" because that's simply what I named it, but you may name it different. 117 | 118 | # Fan Sound 119 | As brought up in the "Fan Script" section, the script I provided works really well for my use case as the server is quiet enough for me when idle. And I don't mind personally the sound ramping up under load. As I personally have a room dedicated to my networking equipment. If the server is running full blast, even in it's own room, it's really annoying since bedrooms are nearby, but it's rarely running at night, so I don't care. I'll be working to get sound readings for everyone in the near future as well so you can decide for yourself if it's appropriate based on the location you'll be storing the server. But I personally think it's reasonable at idle now, but load is more where sound will begin to become an issue if it is an issue for you. There's some cool things I'm excited to try in the near future. Majorly I found this link ([Click Here](https://www.brentozar.com/archive/2010/01/how-to-make-a-dell-poweredge-quieter/)) where it goes over replacing the PowerEdge server fans with much quieter fans. I personally want to try this out and share the results and costs that went along with this. 120 | 121 | But do not get me wrong. This isn't exactly a quiet build that I've presented. These fans are seriously loud if you let it run at max 24/7. If you plan to have it at load constantly, then this may be an issue for you as well. If you have a network closet where you won't hear it, or if it'll be in your garage, then it's not really a problem. But, **if sound is a concern for you, please be aware that this is 100% not a quiet server!** You may seriously want to consider doing fan modifications like what is shown in the link above so that you can replace the fans with significantly quieter fans. And I believe it's at least 6X hot swappable fans in this case which are the major contributors to the sound. The Fans I'm personally eyeing and considering buying is the, "**noctua nf-a9 pwm**" fans, which you can check out by [clicking here](https://www.amazon.com/Noctua-NF-A9-PWM-Premium-Cooling/dp/B00RUZ059O/ref=sr_1_2?crid=1YLALIKDO57G0&keywords=noctua+nf-a9+pwm&qid=1699376151&sprefix=noctua+nf-a9+pwm%2Caps%2C98&sr=8-2). Noctua has a reputation for being reliable and very quiet, but these are also $19 each. So, we're talking roughly $114 for the fans here, which isn't exactly cheap, but it's not too bad I guess. 
But I'm also staring down these "**Arctic F9 PWM PST**" fans, which are much cheaper at $8 a fan, and which you can see by [clicking here](https://www.amazon.com/ARCTIC-PWM-PST-Technology-Regulates/dp/B002QVLBPO/ref=sr_1_1?crid=1FM25TX91XCZ9&keywords=Arctic+F9+PWM+PST&qid=1699376184&sprefix=arctic+f9+pwm+pst%2Caps%2C64&sr=8-1). These Arctic fans would only end up costing you roughly $48, which is a serious decrease from the Noctua fans. I may also try a side by side test of these fans out of curiosity to see if the Noctua is worth the extra pennies. Because I think many of us who've been in a room with these Dell servers can agree that $114 to make this server quiet is not a bad deal when you realize just how loud these servers really are. 122 | 123 | But please do your own research into this topic and make your decisions based on the situation you're in. I'm not personally in need of the server being any quieter right now, and between this server, recent business events, buying a ring for my lady, vacations, and Christmas coming up, I'm majorly tapped out on fun money I can spend on this server for the moment. So, I may not come back around to this until early 2024 at the earliest. So, please be aware that I've not done any mods other than the fan script to fix the sound issues. And also remember I haven't even gone fully in the direction of tackling sound yet. The CPU's don't need much cooling overall unless you're using the server for CPU intensive tasks. The part that causes the fans to ramp up is that you need the server fans to cool the GPU's. If you add active cooling to your P40's, which is totally possible, the fan script may already work perfectly and not ramp up, or you can make minor adjustments to my script so that it only ramps the fans based on the CPU and doesn't worry about the GPU temperatures. 124 | 125 | # Conclusion 126 | Sorry for not having more detailed and concise instructions above or throughout the entire guide. I will be adding to this guide over time and adding additional details. I'd love to have a one stop shop that'll walk you through everything, but for now, most if not all of what I've shown will require you to use this as a general guide pointing you in the right direction, and Chat GPT and Google will be required for you to learn various aspects of setting this server up properly, since I didn't provide everything like each command or setting required. But I really hope this could be useful to some people. I honestly wish I had this guide when I first started figuring all this out. I wasted a good bit of money buying the wrong parts and I burnt a ton of time banging my head against the wall figuring out how to set all of this up. I'm a pretty technical guy, but I'm a C# developer with minimal network experience. I mean I could barely even get a basic CCNA if I tried, so my knowledge of Python and a lot of this server work is very minimal. But I worked really hard over the course of a bit over a month working on this constantly. I knew nothing of Proxmox, nothing of ipmitool, and I'd never even heard of ZFS before I went down this rabbit hole. It was a tremendously cool experience and I've grown a lot because of it. Hopefully my effort can help others get on the AI bandwagon by building an affordable (not necessarily cheap) AI machine. My personal goal was that, as a C# developer, I have so many use cases for AI. Of course I use Chat GPT, but there's a serious cost to utilizing GPT at any serious level.
I've been utilizing AI to the degree that it's vastly too expensive for me to use GPT as my sole AI. 127 | 128 | So, this brought me down the path of utilizing local LLM's to build my own server for my clients and company. After the upfront costs, this will significantly reduce the costs of my AI usage over time. This has by no means cut out my Chat GPT use, but I've been able to swap out many different tasks I once used GPT4 for, replacing them with fine tuned versions of LLAMA 2 70b. I'm still in the early stages of getting all the numbers, but I've already reduced my AI calls to Chat GPT 4 by 35%. This will already have a very quick ROI for me. And I wanted to create a setup that could scale over time on a budget, so that I could build clusters of these machines to power my AI addiction. Like many of you who're probably reading this, you see the use cases of AI, and you are dying to use it more and faster. Heck, if I could use GPT4 and get crazy large responses for hundreds of requests at once to satisfy a single agent job I'm working on, then maybe, just maybe, it'll finally be fast enough for me. But we're not there yet, the age of AI is just starting, and finding applicational use cases is extremely difficult right now due to the cost and due to the lack of capabilities AI has for being easily implemented into applications. Hopefully you learned something, hopefully my ranting wasn't too bad, and hopefully someone can make this easier for all of us in the future. But until then, build that baller R730 and venture forth into the unknown! 129 | 130 | # Future Notes I may Add 131 | - [ ] Recovery of all my Proxmox settings so others can follow it step by step 132 | 133 | - [ ] Full guide on setting BIOS settings on the R730 134 | 135 | - [ ] General commands/setup/settings of everything would be really nice to have to walk everyone through the entire setup process. 136 | -------------------------------------------------------------------------------- /Wiki/R730-Build-Sound-Warnnings.md: -------------------------------------------------------------------------------- 1 | # Sound Warning (Solutions at the bottom) 2 | I'm very proud of what this server has accomplished, but that also doesn't mean it's perfect. The suggested server is a datacenter level server, which means it wasn't meant to be in your house! Please be mindful of the following warnings and precautions about the server before you make the decision to build/buy anything similar. 3 | 4 | ## Sound 5 | The following table will help you relate the decibel levels I'm about to describe to sounds you'd be familiar with. This way you know what you're getting into when it comes to how loud this server is: 6 | 7 | | Decibel Level (dB) | Comparable Sound | 8 | |--------------------|---------------------------------| 9 | | 0 | Threshold of hearing | 10 | | 10 | Breathing | 11 | | 20 | Whisper, rustling leaves | 12 | | 30 | Quiet rural area | 13 | | 40 | Library, bird calls | 14 | | 50 | Moderate rainfall | 15 | | 60 | Normal conversation | 16 | | 70 | Vacuum cleaner | 17 | | 80 | Heavy city traffic | 18 | | 90 | Lawnmower | 19 | | 100 | Motorcycle | 20 | | 110 | Rock concert, chainsaw | 21 | | 120 | Thunderclap, sirens | 22 | | 130 | Jet takeoff (100 meters away) | 23 | | 140 | Fireworks, gunshot | 24 | | 150 | Balloon pop | 25 | | 160 | Shotgun firing | 26 | 27 | ## This is not a quiet server by default! 28 | I provide a quickly made python script within the guide that'll help control the sound levels of the R730 server.
Because by default, the server does not recognize the Tesla P40 GPU's and will max out the fans to keep everything very cool. This is ridiculously loud and not really acceptable for most homes, unless you're sticking this server in a garage or shed where the sound can't bother anyone. 29 | 30 | ### My Personal Decibel Level Readings 31 | 32 | The following are my personal tests with a decibel reader. I walked quite close to the server and would go as far as a couple feet away from the server to get a broad range of the sound. 33 | 34 | | Decibel Level (dB) | Fan % | Scenario | Notes | 35 | |--------------------|-----------|---------------------|-------| 36 | | 55 - 65 | 30% - 40% | Idle / low load | ----- | 37 | | 70 - 75 | 70% - 80% | Running 70b LLM | Would spike to 80% rarely and for a short period of time | 38 | | 73 - 78 | 80% | Generating 40 cat images on Stable Diffusion | Would hover 70% to 80% at first, but after 28 cat images, it stayed at 80% until all cats were generated | 39 | 40 | 41 | # Solutions 42 | I've got a whole room dedicated to my servers and equipment. And anything that's louder than what I'm okay with goes to my garage. But not everyone has that luxury, and I understand that. There are some solutions though. Please note I have not worked on these solutions myself, nor will I in the immediate future. But here are some solutions I would personally implement that would be effective, cheap, and easy enough. 43 | 44 | 1.) **Add active cooling to the Tesla P40 GPU's!** The major cause of the fans needing to ramp up is the Tesla P40 GPU's, as they've got no cooling of their own and rely on the server fans to cool them down. There are many guides and mods you can find online that'll tell you how to easily add fans to the Tesla P40 GPU's. This is most likely the easiest solution in my opinion, and you'll definitely see success with any amount of determination. If you do this, just remember to adjust the python script I provide by deleting the part of the script where it ramps the fans based on the GPU's, so that it only ramps the fans based on the CPU's temperatures (a minimal sketch of this change is at the bottom of this page). This'll reduce the sound low enough for the majority of individuals. You likely could get away with 20% or even 10% fan speeds at that level (my script defaults to 30% as the lowest) as long as you gauge the temps yourself properly. 45 | 46 | 2.) **Replace the R730 Fans!** There's some really cool guides online like [this link here if you click on this text](https://www.brentozar.com/archive/2010/01/how-to-make-a-dell-poweredge-quieter/). These guides will walk you through replacing the fans inside the server with much quieter fans. This direction is a really cool option to me and I'm personally considering it myself. Maybe not the easiest or cheapest solution, but it'd be really cool to perform. But please be mindful that this direction is not plug and play! You can't just swap the fans on these servers, and you'll be required to do some soldering and wire splicing. Just be wary of this fact. 47 | 48 | If anyone does either of the above methods to quiet down their server, I'd love to have someone contact me on this repository or through any other means to give me their results and decibel readings! If y'all learn something and tell me, I'd be happy to pass the knowledge along.
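For solution 1, the script change is small. Below is a minimal sketch of the idea, assuming you keep the rest of Dell-Server-Fan-Control.py exactly as it is: skip the GPU readings and feed only the CPU temperature into the existing fan curve. The 20% floor is an assumption; tune it against your own temps.

```python
# Hypothetical CPU-only variant of control_fan() for P40's that have their own active cooling.
# Reuses get_cpu_temps(), determine_fan_speed() and set_fan_speed() from the original script.
MIN_FAN_SPEED = 20  # assumed floor once the GPU's no longer rely on chassis airflow

def control_fan_cpu_only():
    try:
        cpu_temps = get_cpu_temps()
        # Pass an empty list for the GPU temps so only the CPU's drive the fan curve
        fan_speed = max(determine_fan_speed([], cpu_temps), MIN_FAN_SPEED)
        set_fan_speed(fan_speed)
        print(f"Highest CPU Temp: {cpu_temps}, Fan Speed Set To: {fan_speed}%")
    except Exception as e:
        print(f"An error occurred: {e}")
```

Swap this in for control_fan() in the script's main loop and the chassis fans ignore the GPU's entirely.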
49 | -------------------------------------------------------------------------------- /Wiki/TPS-Chart.md: -------------------------------------------------------------------------------- 1 | | Tokens per Second (TPS) | Human Perspective Comparison | Use-Case Examples | 2 | |--------------------------|------------------------------|-------------------| 3 | | 0.1 TPS | Far slower than average human reading speed | Learning scenarios for AI, debugging | 4 | | 1 TPS | Comparable to slow reading aloud | Simple conversational AI | 5 | | 5 TPS | Average human reading speed (200 words per minute) | Interactive chatbots, basic Q&A | 6 | | 10 TPS | Speed of a fast reader | More fluid conversational AI, live Q&A | 7 | | 20 TPS | Speed of someone skimming text | High-efficiency workloads, fast response systems | 8 | | 40 TPS | Faster than most humans can comprehend while reading | Real-time translation, advanced analytics | 9 | --------------------------------------------------------------------------------
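As a rough sanity check on the chart above, you can convert a human reading speed into tokens per second yourself. The ~0.75 words-per-token figure below is a common rule of thumb for English text, not an exact constant.

```python
# Back-of-the-envelope check: convert a words-per-minute reading speed into tokens/second.
WORDS_PER_TOKEN = 0.75  # assumed average for English text (rule of thumb, not exact)

def wpm_to_tps(words_per_minute: float) -> float:
    tokens_per_minute = words_per_minute / WORDS_PER_TOKEN
    return tokens_per_minute / 60.0

print(f"{wpm_to_tps(200):.1f} tokens/s")  # ~4.4 TPS, in line with the chart's 5 TPS row
```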