├── models
│   └── place_models_here.txt
├── docker-compose.yml
├── LICENSE
└── README.md

/models/place_models_here.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: "3"
services:
  vllm-openai:
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    volumes:
      - ~/.cache/huggingface:/root/.cache/huggingface
      - ./models:/models
    environment:
      - HUGGING_FACE_HUB_TOKEN=<token>
    ports:
      - 8000:8000
    ipc: host
    image: vllm/vllm-openai:latest
    command: --model /models/mistral-7b
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 aneeshjoy

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Welcome to vLLM Windows Home!
This repository contains a Docker Compose setup for running vLLM on Windows. With this setup, you can easily run and experiment with vLLM on Windows Home.

Enjoy state-of-the-art LLM serving throughput on your Windows Home PC with efficient paged attention, continuous batching, and fast inference, plus quantization.

![vllm-windows-home](https://github.com/aneeshjoy/vllm-windows/assets/5285961/5b9caae0-1cfd-4fb4-b86f-a8330e8428f0)

Once the following is set up, you can start vLLM with the click of a button in Docker Desktop, or configure it to start at Windows startup.

## Getting Started
### Prerequisites
Docker Desktop:
Install Docker Desktop from https://www.docker.com/products/docker-desktop/

Note: On Windows Home, Docker Desktop uses WSL as its backend engine.

### Steps
1. Clone the Repository

```bash
git clone https://github.com/aneeshjoy/vllm-windows.git
cd vllm-windows
```

2. Update Hugging Face Token

Open `docker-compose.yml` and replace `<token>` with your own Hugging Face token. The format should be like this:

```yaml
environment:
  - HUGGING_FACE_HUB_TOKEN=<token>
```

3. Copy Model Weights

Download or copy the desired LLM model weights into the `models` directory within the cloned repository (one way to fetch them is sketched after step 6), and update the model name in `docker-compose.yml`:

```yaml
command: --model /models/mistral-7b
```

4. Simply execute the following command at the root level of the project:

```bash
docker-compose up
```

5. Test by accessing the /models endpoint (a sample completion request that exercises the API is sketched after step 6):

http://127.0.0.1:8000/v1/models

6. Check throughput (I am running on an RTX 3090):

http://127.0.0.1:8000/metrics

```
# HELP exceptions_total_counter Total number of requested which generated an exception
# TYPE exceptions_total_counter counter
# HELP requests_total_counter Total number of requests received
# TYPE requests_total_counter counter
requests_total_counter{method="POST",path="/v1/completions"} 24
# HELP responses_total_counter Total number of responses sent
# TYPE responses_total_counter counter
responses_total_counter{method="POST",path="/v1/completions"} 24
# HELP status_codes_counter Total number of response status codes
# TYPE status_codes_counter counter
status_codes_counter{method="POST",path="/v1/completions",status_code="200"} 24
# HELP vllm:avg_generation_throughput_toks_per_s Average generation throughput in tokens/s.
# TYPE vllm:avg_generation_throughput_toks_per_s gauge
vllm:avg_generation_throughput_toks_per_s{model_name="/models/mistral-7b"} 842.7750196184555
# HELP vllm:avg_prompt_throughput_toks_per_s Average prefill throughput in tokens/s.
# TYPE vllm:avg_prompt_throughput_toks_per_s gauge
vllm:avg_prompt_throughput_toks_per_s{model_name="/models/mistral-7b"} 1211.5997677115236
# HELP vllm:cpu_cache_usage_perc CPU KV-cache usage. 1 means 100 percent usage.
# TYPE vllm:cpu_cache_usage_perc gauge
vllm:cpu_cache_usage_perc{model_name="/models/mistral-7b"} 0.0
# HELP vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
# TYPE vllm:gpu_cache_usage_perc gauge
vllm:gpu_cache_usage_perc{model_name="/models/mistral-7b"} 0.38849487785658
# HELP vllm:num_requests_running Number of requests that is currently running for inference.
```
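
For step 3, if you do not already have weights on disk, one option is to pull them from the Hugging Face Hub. This is a minimal sketch, assuming a recent `huggingface_hub` release that ships the `huggingface-cli download` command, and using `mistralai/Mistral-7B-v0.1` purely as an example repository; any model supported by vLLM works the same way:

```bash
# Install the Hugging Face CLI (part of the huggingface_hub package).
pip install -U huggingface_hub

# Download the example model into ./models/mistral-7b so that it matches the
# --model /models/mistral-7b path used in docker-compose.yml.
huggingface-cli download mistralai/Mistral-7B-v0.1 --local-dir ./models/mistral-7b
```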
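
The `requests_total_counter` and throughput gauges above were produced by completion calls against the server's OpenAI-compatible API. A quick way to generate a few of your own, assuming the model was loaded as `/models/mistral-7b` exactly as in the compose file (the prompt text and `max_tokens` value are arbitrary):

```bash
# Send a completion request to the OpenAI-compatible endpoint exposed on port 8000.
curl http://127.0.0.1:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "/models/mistral-7b",
        "prompt": "Paged attention in one sentence:",
        "max_tokens": 64
      }'
```

Each call should increment the POST /v1/completions counters and refresh the throughput gauges on the /metrics page.
--------------------------------------------------------------------------------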