├── README.md
└── docker
    ├── llamacpp
    │   ├── Dockerfile
    │   ├── README.md
    │   └── start-server.sh
    └── vllm-librosa
        └── Dockerfile

/README.md:
--------------------------------------------------------------------------------
# one-click-llms

> [!TIP]
> Qwen 3, Gemma 3 and Llama 4 Scout are now supported

> [!TIP]
> Post a new issue if you would like other templates.

Quickly boot up an API endpoint for a given language, vision or speech/transcription model.

Built by Trelis Research [YouTube](https://youtube.com/@TrelisResearch), [Newsletter](https://blog.trelis.com), [Inferencing Scripts](https://trelis.com/enterprise-server-api-and-inference-guide/)

## Runpod One-Click Templates

> [!TIP]
> To support the Trelis Research YouTube channel, you can sign up for an account with [this link](https://runpod.io?ref=jmfkcdio). Trelis is supported by a commission when you use one-click templates.

GPU Choices/Recommendations (last updated Oct 15 2024):
1. VALUE and best UI: A40 on [Runpod](https://runpod.io?ref=jmfkcdio) (48 GB VRAM) ~$0.39/hr.
2. Higher Speed: H100 PCIe or SXM (80 GB VRAM) - best for fp8 models, but expensive.

### Fine-tuning Notebook Setup
- CUDA 12.1 one-click template, w/ Jupyter Lab Web Terminal [here](https://runpod.io/gsc?template=ifyqsvjlzj&ref=jmfkcdio) - runs Jupyter Lab inside Runpod by default.
- CUDA 12.1 one-click template, w/ VSCode Web Terminal [here](https://runpod.io/console/deploy?template=epgodh4ed3&ref=jmfkcdio) - runs VSCode in a web terminal (supports Jupyter Lab; useful if you have issues with widgets in the template above).

Note that you can access pods from either template via SSH; there is no difference there.

### Inference Engines
- [Transcription] Faster Whisper Server (transcription only).
- [LLMs] SGLang is the fastest across all batch sizes.
- [LLMs and Multi-modal LLMs] vLLM and TGI are close on speed for small batches.
- [Multi-modal LLM] Moondream API (tiny vision + text language model).
- [LLMs] Nvidia NIM (paid service from Nvidia): a bit slower than SGLang, and inconvenient to use as it requires a login.

#### Faster Whisper
- [Whisper, incl. Turbo](https://runpod.io/console/deploy?template=v7xyt1e57i&ref=jmfkcdio)

#### SGLang (from lmsys)
- Qwen 3: [235B-A22B FP8 - currently stalling on startup](https://runpod.io/console/deploy?template=9lhiejtvka&ref=jmfkcdio), [32B dense FP8](https://runpod.io/console/deploy?template=8oc6sh1sth&ref=jmfkcdio), [30B-A3B FP8](https://runpod.io/console/deploy?template=i9yhn2ap2v&ref=jmfkcdio). Tested on 8xH100 SXM (currently stalling, [issue here](https://github.com/sgl-project/sglang/issues/5950)), 1xH100 SXM and 1xH100 SXM, respectively. Use chat/completions. Reasoning tokens are returned separately. Note that the 30B-A3B is slightly weaker than the 32B dense model, but roughly 2-3x faster at inference.
- Gemma 3 IT [27B FP8](https://runpod.io/console/deploy?template=e2b4tetdya&ref=jmfkcdio)
- Llama 4 [Scout, 4xA100 or 4xH100/200 - takes about 15 mins to start](https://runpod.io/console/deploy?template=2d6atsxhzw&ref=jmfkcdio), [Maverick, 8xH100 or 8xH200 - not working yet with docker image](https://runpod.io/console/deploy?template=ubbydr232k&ref=jmfkcdio)
- DeepSeek R1 Distill Qwen: [32B](https://runpod.io/console/deploy?template=4bbrlx8ue4&ref=jmfkcdio), [1.5B](https://runpod.io/console/deploy?template=wxmkqr598l&ref=jmfkcdio)
- [DeepSeek V2 Lite Chat](https://runpod.io/console/deploy?template=omqvfozjn7&ref=jmfkcdio)
- DeepSeek R1 FP8 - Nvidia [(single node 8xH200)](https://runpod.io/console/deploy?template=9u3ytxvqdj&ref=jmfkcdio), [(dual node 2x8xH100)](https://runpod.io/console/deploy?template=sjy4afxvqx&ref=jmfkcdio)
- [DeepSeek R1 FP8 - ROCm/AMD](https://runpod.io/console/deploy?template=xxarh523m5&ref=jmfkcdio) - takes ~1-1.5 hours to download and load shards onto 8x MI300X.
- [Llama 3.1 Instruct 8B FP8 - ROCm/AMD](https://runpod.io/console/deploy?template=fngusf3aep&ref=jmfkcdio)
- [Qwen 2.5 Coder 32B](https://runpod.io/console/deploy?template=bxj0eugbk5&ref=jmfkcdio)
- [Llama 3.1 Instruct 8B FP8](https://runpod.io/console/deploy?template=egumitbrf3&ref=jmfkcdio), [Llama 3.1 Instruct 70B FP8](https://runpod.io/console/deploy?template=s51eiwrmif&ref=jmfkcdio), [Llama 3.1 Instruct 70B INT4](https://runpod.io/console/deploy?template=y7meeekjja&ref=jmfkcdio), [Llama 3.1 Instruct 405B FP8](https://runpod.io/console/deploy?template=psy8redq4i&ref=jmfkcdio), [Llama 3.1 Instruct 405B INT4](https://runpod.io/console/deploy?template=sn9qk811g5&ref=jmfkcdio)

#### vLLM (requires an A100, H100 or A6000, i.e. Ampere architecture or newer):
- Gemma 3: [27B IT (bf16)](https://runpod.io/console/deploy?template=iiiurkp5pw&ref=jmfkcdio)
- Qwen 3: [235B-A22B FP8](https://runpod.io/console/deploy?template=sb7nue68ax&ref=jmfkcdio), [32B dense FP8](https://runpod.io/console/deploy?template=sybrm5hsk7&ref=jmfkcdio), [30B-A3B FP8](https://runpod.io/console/deploy?template=y3syp133lq&ref=jmfkcdio). Tested on 8xH100 SXM, 1xH100 SXM and 1xH100 SXM, respectively. Use chat/completions. Reasoning tokens are returned separately. Note that the 30B-A3B is slightly weaker than the 32B dense model, but roughly 2-3x faster at inference.
- [Qwen2.5 VL - PENDING SUPPORT](https://runpod.io/console/deploy?template=b3i7t84mco&ref=jmfkcdio)
- [Llama 3.1 8B multi-lora server](https://runpod.io/console/deploy?template=p4l5qvim7s&ref=jmfkcdio)
- [Llama 3.3 70B in fp8](https://runpod.io/console/deploy?template=wlh8cgobkq&ref=jmfkcdio)
- Phi 4: [fp8 - runs at ~32 tok/s on an A40](https://runpod.io/console/deploy?template=rzgcdh9rqe&ref=jmfkcdio), [bf16 - runs at ~17 tok/s on an A40](https://runpod.io/console/deploy?template=wlh8cgobkq&ref=jmfkcdio)
- [Qwen 2 Audio 7B](https://runpod.io/console/deploy?template=7nfkxpzhcn&ref=jmfkcdio)
- [Qwen 2 VL 2B](https://runpod.io/console/deploy?template=gf96yn5wjb&ref=jmfkcdio), [Qwen 2 VL 7B](https://runpod.io/console/deploy?template=3dwi64mha8&ref=jmfkcdio), [Qwen 2 VL 70B](https://runpod.io/console/deploy?template=fnpg1ra9ln&ref=jmfkcdio)
- [Llama 3.2 Vision](https://runpod.io/console/deploy?template=ejpb432goj&ref=jmfkcdio)
- [Pixtral](https://runpod.io/console/deploy?template=78dbz10el3&ref=jmfkcdio)
- [Llama 3.1 Instruct 8B](https://runpod.io/console/deploy?template=vfu808ard7&ref=jmfkcdio), [Llama 3.1 Instruct 70B](https://runpod.io/console/deploy?template=rb69yhtpep&ref=jmfkcdio), [Llama 3.1 Instruct FP8 405B](https://runpod.io/console/deploy?template=h3a6ufgwjv&ref=jmfkcdio), [Llama 3.1 Instruct INT4 405B](https://runpod.io/console/deploy?template=psy8redq4i&ref=jmfkcdio)
- [Phi 3 Mini](https://runpod.io/console/deploy?template=l0rcxaaqlj&ref=jmfkcdio), [Phi 3 Small](https://runpod.io/console/deploy?template=l0rcxaaqlj&ref=jmfkcdio), [Phi 3 Medium](https://runpod.io/console/deploy?template=c5937jor68&ref=jmfkcdio)
- [Mistral Nemo Instruct (fp8)](https://runpod.io/console/deploy?template=hf0z5wads4&ref=jmfkcdio)
- [Llama 3 8B Instruct](https://runpod.io/console/deploy?template=4qmr18da9x&ref=jmfkcdio)
- [Llama 3 70B Instruct](https://runpod.io/console/deploy?template=6e9yxszwne&ref=jmfkcdio)
- [Mistral Instruct 7B AWQ](https://runpod.io/gsc?template=z5n6lh4zux&ref=jmfkcdio)
- [Mixtral Instruct 8x7B AWQ](https://runpod.io/gsc?template=dmybzldpis&ref=jmfkcdio)
- [Qwen1.5 Chat 72B AWQ](https://runpod.io/console/gpu-cloud?template=ju7oo9mf5w&ref=jmfkcdio). Needs to be run on an A100 or H100; the 48 GB of VRAM on an A6000 is insufficient.
- [CodeLlama 70B Instruct - 4bit AWQ](https://runpod.io/gsc?template=tpqmplhj2a&ref=jmfkcdio). Requires an A6000, A100 or H100.

> [!IMPORTANT]
> Note: vLLM sometimes runs into issues if the pod template does not have the correct CUDA drivers, and unfortunately there is no way to tell in advance when picking a GPU. An issue has been raised [here](https://github.com/vllm-project/vllm/issues/2393). As an alternative, you can run TGI (and even query it in OpenAI style, guide [here](https://github.com/huggingface/text-generation-inference/commit/0eabc83541225979209ff7183b4b4442e47adf92#diff-5b4ec6c8005e6b142ac974571bb8dc557a9bb7ab7c3d0b09554dfc9cc50dcf0b)). TGI is faster than vLLM and recommended in general. Note, however, that TGI does not automatically apply the chat template to the prompt when using the OpenAI-style endpoint.
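
Most of the SGLang and vLLM templates above expose an OpenAI-compatible server, so the chat/completions endpoint mentioned in the entries can be queried with a standard request. The request below is a sketch only: the port (8000 here), the proxy URL format and the model name are assumptions that depend on how the specific template is configured, so check the pod logs for the actual values.

```
curl https://{pod-id}-8000.proxy.runpod.net/v1/chat/completions \
    --header "Content-Type: application/json" \
    --data '{
        "model": "meta-llama/Llama-3.1-8B-Instruct",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 64
    }'
```
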

#### Text Generation Inference:
- [Llama 3.1 8B](https://runpod.io/console/deploy?template=mwb88n0pu0&ref=jmfkcdio)
- [IDEFICS 2 8B multi-modal](https://runpod.io/console/deploy?template=d2tjii83ba&ref=jmfkcdio)
- [Llama 3 - 8B Instruct](https://runpod.io/console/deploy?template=qlkldvr7ys&ref=jmfkcdio)
- [Llama 3 - 70B Instruct](https://runpod.io/console/deploy?template=qlkldvr7ys&ref=jmfkcdio)
- [OpenChat 3.5 7B AWQ API - RECOMMENDED](https://runpod.io/gsc?template=8me7ywyjrb&ref=jmfkcdio), [OpenChat 3.5 7B bf16 - TGI API - lowest perplexity](https://runpod.io/gsc?template=xiwn7cb3ro&ref=jmfkcdio)
- [Mixtral Instruct API 4bit AWQ - RECOMMENDED](https://runpod.io/gsc?template=546m57v73a&ref=jmfkcdio), [Mixtral Instruct API 8bit eetq - pod needs to be restarted multiple times to download all weights](https://runpod.io/gsc?template=1ydpo4766w&ref=jmfkcdio). Requires an A6000, A100 or H100.
- [Zephyr 141B - a Mixtral 8x22B fine-tune](https://runpod.io/console/deploy?template=0896yqcr0f&ref=jmfkcdio)
- [DBRX Instruct](https://runpod.io/console/gpu-cloud?template=tlt1i1welu&ref=jmfkcdio)
- [Smaug 34B Chat (a Yi fine-tune) - fits in bf16 on an A100. BEWARE that guardrails are weaker on this model than on Yi, so it may be best suited for structured generation](https://runpod.io/console/gpu-cloud?template=4urlqq7olr&ref=jmfkcdio)
- [TowerInstruct 13B (multi-lingual Llama 2 fine-tune)](https://runpod.io/console/gpu-cloud?template=xwduaad0fr&ref=jmfkcdio) - needs ~30 GB to run in bf16 (fits on an A6000). Add `--quantize eetq` to run with under 15 GB of VRAM (e.g. A6000).
- [Yi 34B Chat - fits in 16-bit on an A100](https://runpod.io/console/gpu-cloud?template=hd35vhie4f&ref=jmfkcdio)
- [Gemma Chat 9B](https://runpod.io/console/gpu-cloud?template=ivkssv2y93&ref=jmfkcdio)
- [Notux 8x7B AWQ](https://runpod.io/gsc?template=qyhee1k9wx&ref=jmfkcdio). Requires an A6000, A100 or H100.
- [CodeLlama 70B Instruct - 4bit AWQ](https://runpod.io/gsc?template=ze563fijpz&ref=jmfkcdio), [CodeLlama 70B Instruct - 4bit bitsandbytes](https://runpod.io/gsc?template=vrjiai47o0&ref=jmfkcdio). Requires an A6000, A100 or H100.
- [Mamba Instruct OpenHermes](https://runpod.io/gsc?template=58bjvimzec&ref=jmfkcdio)
- Llama 70B API by TrelisResearch - DEPRECATED - USE LLAMA 3.1 TEMPLATES.
- [DeepSeek Coder 33B Template](https://runpod.io/gsc?template=51tpe9tqk2&ref=jmfkcdio)
- [Medusa Vicuna](https://runpod.io/gsc?template=2xpg09eenv&ref=jmfkcdio) (high-speed speculative decoding - mostly a glamour template because OpenChat with AWQ is better quality and faster)
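
Because the TGI OpenAI-style route does not apply the chat template for you (see the note above), the native `/generate` endpoint is often the simpler option. The request below is a sketch only: the port (8080 here) is an assumption that depends on the template, and you must apply the deployed model's chat template to the `inputs` field yourself (a Mistral/Mixtral-style prompt is shown).

```
curl --request POST \
    --url https://{pod-id}-8080.proxy.runpod.net/generate \
    --header "Content-Type: application/json" \
    --data '{"inputs": "[INST] Write one sentence about GPUs. [/INST]", "parameters": {"max_new_tokens": 128, "temperature": 0.7}}'
```
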

#### llama.cpp One-click templates:
- [Llama 3.1 8B - 4_K_M](https://runpod.io/console/deploy?template=9aqe40bsts&ref=jmfkcdio)
- [Mistral Nemo Instruct - pending llama.cpp support](https://runpod.io/console/deploy?template=dmefdk27fl&ref=jmfkcdio)
- [Mistral 7B Instruct v0.2 8-bit](https://runpod.io/gsc?template=4g0fj4rh32&ref=jmfkcdio)

#### Nvidia NIM
- [Llama 3.1 8B](https://runpod.io/console/deploy?template=iyvm48jw77&ref=jmfkcdio)
- [Llama 3.1 70B](https://runpod.io/console/deploy?template=xi0o9zze83&ref=jmfkcdio)

#### MoonDream Multi-modal API (openai-ish)
- [Moondream2 - a small but accurate model for querying images](https://runpod.io/console/deploy?template=0m232edqmj&ref=jmfkcdio)

#### HuggingFace Speech-to-Speech
- [Combined voice-detection, speech-to-text, text-to-text, text-to-speech model](https://runpod.io/console/deploy?template=xen5lu2cuf&ref=jmfkcdio)

> [!TIP]
> As of July 23rd 2024, function-calling fine-tuned models are being deprecated in favour of a one-shot approach with stronger models. Find the "Tool Use" video on the [Trelis YouTube Channel](https://youtube.com/@trelisresearch/) for more info.

## Changelog
15 Oct 2024:
- Add Whisper Turbo endpoint.
- Deprecate Vast.AI templates.

20 Jul 2024:
- Update the ./llama-server.sh command in line with breaking changes to llama.cpp.

Feb 16 2024:
- Added a Mamba one-click template.

Jan 21 2024:
- Swapped Runpod to before Vast.AI as the user experience is much better with Runpod.

Jan 9 2024:
- Added Mixtral Instruct AWQ TGI.

Dec 30 2023:
- Support gated models by adding the HUGGING_FACE_HUB_TOKEN env variable.
- Speed up downloading using the HuggingFace API.

Dec 29 2023:
- Add a one-click llama.cpp server template.

## Vast AI One-Click Templates (DEPRECATED AS OF OCTOBER 15TH 2024)
> [!TIP]
> To support the Trelis Research YouTube channel, you can sign up for an account with [this affiliate link](https://cloud.vast.ai/?ref_id=98762). Trelis is supported by a commission when you use one-click templates.

### Fine-tuning Notebook Setup
- CUDA 12.1 one-click template [here](https://cloud.vast.ai/?ref_id=98762&creator_id=98762&name=Fine-tuning%20Notebook%20by%20Trelis%20-%20Cuda%2012.1).

### Text Generation Inference (fastest):
- [Mistral 7B api](https://cloud.vast.ai/?ref_id=98762&creator_id=98762&name=Mistral-7B%20v0.2%20vLLM%20API)

### vLLM (requires an A100, H100 or A6000, i.e. Ampere architecture or newer):
- [Mistral 7B v0.2 AWQ](https://cloud.vast.ai/?ref_id=98762&creator_id=98762&name=Mistral%207B%20v0.2%20vLLM%20API)
- Post a new issue if you would like other templates.

### llama.cpp One-click templates:
- [Mistral 7B Instruct v0.2 8-bit](https://cloud.vast.ai/?ref_id=98762&template_id=bc642dfd6e4c80a1e0807725047588b8)

### Function-calling One-Click Templates
One-click templates for function-calling are located on the HuggingFace model cards. Check out the collection [here](https://huggingface.co/collections/Trelis/function-calling-v3-657199ecbe378693925c7915).

### HuggingFace Speech-to-Speech
- [Combined voice-detection, speech-to-text, text-to-text, text-to-speech model](https://cloud.vast.ai/?ref_id=98762&creator_id=98762&name=HuggingFace%20Speech-to-Speech%20Server%20by%20Trelis)

--------------------------------------------------------------------------------
/docker/llamacpp/Dockerfile:
--------------------------------------------------------------------------------
# Use the specified base image
FROM runpod/pytorch:2.1.0-py3.10-cuda11.8.0-devel-ubuntu22.04

# Add the Kitware APT repository to get a recent CMake (needed to build llama.cpp)
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
RUN echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null
RUN apt update
RUN apt -y install cmake

# Copy the script into the image
COPY start-server.sh /start-server.sh

# Make the script executable
RUN chmod +x /start-server.sh

# Set the entrypoint to the script
ENTRYPOINT ["/start-server.sh"]

--------------------------------------------------------------------------------
/docker/llamacpp/README.md:
--------------------------------------------------------------------------------
# Llama.cpp Dockerfile
- `start-server.sh` installs llama.cpp and starts a server inside a PyTorch-CUDA container.
- `Dockerfile` is the full install.

## Build command
```
docker build -t trelis/llamacpp:v3.0 -t trelis/llamacpp:latest .
```

> Replace 'trelis' with your org on Docker Hub

## Sample start-server command
```
./start-server.sh --gguf-file-name mistral-7b-instruct-v0.2.Q8_0.gguf --repo-slug TheBloke/Mistral-7B-Instruct-v0.2-GGUF --context-length 2048 --np 8
```

## Test curl request to Runpod
```
curl --request POST \
    --url https://{pod-id}-8080.proxy.runpod.net/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
```

# Next steps
- [ ] Allow for private repos and use of branches/revisions.
- [ ] Use the HuggingFace API for the download, to speed up the download.
- [ ] Deploy a simple version of a llama.cpp server and add that to the ADVANCED fine-tuning setup. Then try to do some fine-tuning.
- [x] Try to run the start script on a pytorch template. Try it in background and foreground.
- [x] Adjust the flags: https://github.com/ggerganov/llama.cpp/issues/4666
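
For reference, here is a sketch of running the built image directly on a GPU machine. The image tag, volume mount and token are assumptions (the token is only needed for gated repos); arguments after the image name are passed through to start-server.sh via the ENTRYPOINT, mirroring the sample start-server command above.

```
docker run --gpus all -p 8080:8080 \
    -v /workspace:/workspace \
    -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \
    trelis/llamacpp:latest \
    --gguf-file-name mistral-7b-instruct-v0.2.Q8_0.gguf \
    --repo-slug TheBloke/Mistral-7B-Instruct-v0.2-GGUF \
    --context-length 2048 --np 8
```
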

--------------------------------------------------------------------------------
/docker/llamacpp/start-server.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Initialize variables
gguf_file_name=""
repo_slug=""
context_length=""
revision="main" # Default to the main branch
num_parallel_threads=16 # Default to 16 parallel slots (llama.cpp -np)
model_dir="${MODEL_DIR:-/workspace}" # Use the environment variable or default to /workspace

# Parse named command-line arguments
while [ "$#" -gt 0 ]; do
    case "$1" in
        --gguf-file-name) gguf_file_name="$2"; shift 2;;
        --repo-slug) repo_slug="$2"; shift 2;;
        --context-length) context_length="$2"; shift 2;;
        --np) num_parallel_threads="$2"; shift 2;;
        --revision) revision="$2"; shift 2;;
        *) echo "Unknown parameter passed: $1"; exit 1;;
    esac
done

# Check that the required arguments are set
if [ -z "$gguf_file_name" ] || [ -z "$repo_slug" ] || [ -z "$context_length" ]; then
    echo "Error: Missing required arguments"
    echo "Usage: $0 --gguf-file-name <file.gguf> --repo-slug <org/repo> --context-length <tokens> [--np <n>] [--revision <branch>]"
    exit 1
fi

# Install the Hugging Face Hub library and enable fast transfers
pip install huggingface_hub hf_transfer
export HF_HUB_ENABLE_HF_TRANSFER=1

# Function to download the model using the Hugging Face Hub API
download_model() {
    local repo_id="${1}"
    local file_name="${2}"
    local token="${3:-None}" # Set default value to None for Python
    local revision="${4:-main}"

    # Use the Hugging Face Hub API to download (or locate in cache) the file
    python -c "
from huggingface_hub import hf_hub_download

# Determine if a token is provided
token = None if '${token}' == 'None' else '${token}'

# Download or find the file in the cache
file_path = hf_hub_download(repo_id='${repo_id}', filename='${file_name}', revision='${revision}', token=token)
print(file_path)  # Print the path for use in the Bash script
"
}

cd ${model_dir}

# Clone the llama.cpp repository
git clone https://github.com/ggerganov/llama.cpp

# Change directory to llama.cpp and build the project with CMake (CUDA enabled)
cd llama.cpp
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -- -j 16

cd ../

# Check if the model file exists in the model directory and download it if it doesn't
if [ ! -f "${model_dir}/${gguf_file_name}" ]; then
    echo "Model file not found in ${model_dir}. Downloading..."
    cached_file_path=$(download_model "${repo_slug}" "${gguf_file_name}" "${HUGGING_FACE_HUB_TOKEN}" "${revision}")
    echo "Download completed."
75 | echo "Cached file path: $cached_file_path" 76 | else 77 | echo "Model file ${gguf_file_name} already exists in ${model_dir}" 78 | cached_file_path="${model_dir}/${gguf_file_name}" 79 | fi 80 | 81 | # Calculate the actual context length by multiplying it with the number of parallel threads 82 | actual_context_length=$(($context_length * $num_parallel_threads)) 83 | 84 | # Build the command with the -ngl flag 85 | command="./llama.cpp/build/bin/llama-server -m ${cached_file_path} -np ${num_parallel_threads} -cb -c ${actual_context_length} --port 8080 --host 0.0.0.0 -ngl 100" 86 | echo "Server command: $command" 87 | 88 | # Execute the command and output directly to console 89 | echo "Starting server..." 90 | $command 91 | 92 | echo "Server process has exited or failed to start." 93 | -------------------------------------------------------------------------------- /docker/vllm-librosa/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the base image 2 | FROM vllm/vllm-openai:v0.6.4.post1 3 | 4 | # Install librosa 5 | RUN pip install librosa 6 | 7 | # Install curl if not already present 8 | RUN apt-get update && apt-get install -y curl 9 | 10 | # Create directories and download the file using Python to find the correct path 11 | RUN python3 -c "import vllm, os; base_path = os.path.dirname(vllm.__file__); target_path = os.path.join(base_path, 'model_executor/models/qwen2_audio.py'); os.makedirs(os.path.dirname(target_path), exist_ok=True); os.system(f'curl -o {target_path} https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/model_executor/models/qwen2_audio.py')" --------------------------------------------------------------------------------