├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── chatbot-ui
│   ├── application
│   │   ├── Dockerfile
│   │   ├── app.py
│   │   └── requirements.txt
│   └── manifests
│       ├── deployment.yaml
│       └── ingress-class.yaml
├── helm.tf
├── main.tf
├── nodepool_automode.tf
├── static
│   └── images
│       ├── chatbot.jpg
│       └── cloudshell.jpg
└── vllm-chart
    ├── .helmignore
    ├── Chart.yaml
    ├── templates
    │   ├── NOTES.txt
    │   ├── _helpers.tpl
    │   ├── deployment.yaml
    │   └── service.yaml
    └── values.yaml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Created by https://www.toptal.com/developers/gitignore/api/terraform
# Edit at https://www.toptal.com/developers/gitignore?templates=terraform

### Terraform ###
# Local .terraform directories
**/.terraform/*

# .tfstate files
*.tfstate
*.tfstate.*

# Crash log files
crash.log
crash.*.log

# Exclude all .tfvars files, which are likely to contain sensitive data, such as
# passwords, private keys, and other secrets. These should not be part of version
# control as they are data points which are potentially sensitive and subject
# to change depending on the environment.
*.tfvars
*.tfvars.json

# Ignore override files as they are usually used to override resources locally and so
# are not checked in
override.tf
override.tf.json
*_override.tf
*_override.tf.json

# Include override files you do wish to add to version control using negated pattern
# !example_override.tf

# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
# example: *tfplan*

# Ignore CLI configuration files
.terraformrc
terraform.rc

# End of https://www.toptal.com/developers/gitignore/api/terraform
.terraform.lock.hcl

--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
## Code of Conduct
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
opensource-codeofconduct@amazon.com with any additional questions or comments.

--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Contributing Guidelines

Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
documentation, we greatly value feedback and contributions from our community.

Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
information to effectively respond to your bug report or contribution.


## Reporting Bugs/Feature Requests

We welcome you to use the GitHub issue tracker to report bugs or suggest features.

When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
reported the issue. Please try to include as much information as you can.
Details like these are incredibly useful:

* A reproducible test case or series of steps
* The version of our code being used
* Any modifications you've made relevant to the bug
* Anything unusual about your environment or deployment


## Contributing via Pull Requests
Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:

1. You are working against the latest source on the *main* branch.
2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
3. You open an issue to discuss any significant work - we would hate for your time to be wasted.

To send us a pull request, please:

1. Fork the repository.
2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
3. Ensure local tests pass.
4. Commit to your fork using clear commit messages.
5. Send us a pull request, answering any default questions in the pull request interface.
6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.

GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
[creating a pull request](https://help.github.com/articles/creating-a-pull-request/).


## Finding contributions to work on
Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.


## Code of Conduct
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
opensource-codeofconduct@amazon.com with any additional questions or comments.


## Security issue notifications
If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.


## Licensing

See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT No Attribution

Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Hosting DeepSeek-R1 on Amazon EKS

In this tutorial, we'll walk you through how to host the [**DeepSeek-R1**](https://github.com/deepseek-ai/DeepSeek-R1) model on AWS using **Amazon EKS**. We are using [**Amazon EKS Auto Mode**](https://aws.amazon.com/eks/auto-mode/) for the flexibility and scalability it provides, while eliminating the need for you to manage the Kubernetes control plane, compute, storage, and networking components.

## Deploying DeepSeek-R1 on Amazon EKS Auto Mode

For this tutorial, we'll use the [***DeepSeek-R1-Distill-Llama-8B***](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) distilled model.
It requires far fewer resources (such as GPUs) than the full 671B-parameter [***DeepSeek-R1***](https://huggingface.co/deepseek-ai/DeepSeek-R1) model, making it a lighter, though less powerful, option.

If you'd prefer to deploy the full DeepSeek-R1 model, simply replace the distilled model in the vLLM configuration.

### Prerequisites

- [Check AWS Instance Quota](https://docs.aws.amazon.com/ec2/latest/instancetypes/ec2-instance-quotas.html)
- [Install kubectl](https://kubernetes.io/docs/tasks/tools/)
- [Install terraform](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli)
- [Install finch](https://runfinch.com/docs/getting-started/installation/) or [docker](https://docs.docker.com/get-started/get-docker/)

### Create an Amazon EKS Cluster with Auto Mode using Terraform
We'll use Terraform to easily provision the infrastructure, including a VPC, ECR repositories, and an EKS cluster with Auto Mode enabled.

``` bash
# Clone the GitHub repo with the manifests
git clone https://github.com/aws-samples/deepseek-using-vllm-on-eks
cd deepseek-using-vllm-on-eks

# Apply the Terraform configuration
terraform init
terraform apply -auto-approve

# Configure kubectl using the command from the Terraform output
$(terraform output configure_kubectl | jq -r)
```

### Deploy the DeepSeek Model

In this step, we will deploy the **DeepSeek-R1-Distill-Llama-8B** model using vLLM on Amazon EKS.
We will walk through deploying the model on GPU-based instances, on Neuron-based instances (Inferentia and Trainium),
or on both, by configuring the parameters accordingly.

#### Configuring Node Pools
The `enable_auto_mode_node_pool` parameter can be set to `true` to automatically create node pools when using EKS Auto Mode.
This configuration is defined in the [nodepool_automode.tf](./nodepool_automode.tf) file. If you're using EKS Auto Mode, this ensures that the appropriate node pools are provisioned.

#### Customizing Helm Chart Values
To customize the values used to host your model using vLLM, check the [helm.tf](./helm.tf) file.
This file defines the model to be deployed (**deepseek-ai/DeepSeek-R1-Distill-Llama-8B**) and allows you to pass additional parameters to vLLM.
You can modify this file to change resource configurations, node selectors, or tolerations as needed.
``` bash
# Let's start by just enabling the GPU-based option:
terraform apply -auto-approve -var="enable_deep_seek_gpu=true" -var="enable_auto_mode_node_pool=true"

# Check the pods in the 'deepseek' namespace
kubectl get po -n deepseek
```
58 | Click to deploy with Neuron based Instances 59 | 60 | ``` bash 61 | # Before Adding Neuron support we need to build the image for the vllm deepseek neuron based deployment. 62 | 63 | # Let's start by getting the ECR repo name where we'll be pushing the image 64 | export ECR_REPO_NEURON=$(terraform output ecr_repository_uri_neuron | jq -r) 65 | 66 | # Now, let's clone the official vLLM repo and use its official container image with the neuron drivers installed 67 | git clone https://github.com/vllm-project/vllm 68 | cd vllm 69 | 70 | # Building image 71 | finch build --platform linux/amd64 -f Dockerfile.neuron -t $ECR_REPO_NEURON:0.1 . 72 | 73 | # Login on ECR repository 74 | aws ecr get-login-password | finch login --username AWS --password-stdin $ECR_REPO_NEURON 75 | 76 | # Pushing the image 77 | finch push $ECR_REPO_NEURON:0.1 78 | 79 | # Remove vllm repo and container image from local machine 80 | cd .. 81 | rm -rf vllm 82 | finch rmi $ECR_REPO_NEURON:0.1 83 | 84 | # Enable additional nodepool and deploy vLLM DeepSeek model 85 | terraform apply -auto-approve -var="enable_deep_seek_gpu=true" -var="enable_deep_seek_neuron=true" -var="enable_auto_mode_node_pool=true" 86 | ``` 87 |
</details>

Initially, the pod might be in a **Pending** state while EKS Auto Mode provisions the underlying EC2 instances with the required drivers.
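If you prefer to inspect this programmatically rather than with `kubectl`, here is a small sketch using the official `kubernetes` Python client (an extra dependency, not part of this repo; it assumes your kubeconfig already points at the cluster from the `configure_kubectl` step). Scheduler events usually say exactly what a Pending pod is waiting for:

``` python
from kubernetes import client, config  # pip install kubernetes

config.load_kube_config()  # uses the kubeconfig written by the configure_kubectl step
v1 = client.CoreV1Api()

for pod in v1.list_namespaced_pod(namespace="deepseek").items:
    print(f"{pod.metadata.name}: {pod.status.phase}")
    if pod.status.phase == "Pending":
        # The scheduler records events explaining why the pod cannot be placed yet
        events = v1.list_namespaced_event(
            namespace="deepseek",
            field_selector=f"involvedObject.name={pod.metadata.name}",
        )
        for event in events.items:
            print(f"  {event.reason}: {event.message}")
```

If the events point at instance capacity or quota problems, expand the section below.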
<details>
<summary>Click if your pod is stuck in a "Pending" state for several minutes</summary>

``` bash
# Check if the node was provisioned
kubectl get nodes -l owner=data-engineer
```
If no nodes are displayed, verify that your AWS account has sufficient service quota to launch the required instances.
Check the quota limits for G, P, or Inf instances (i.e., GPU- or Neuron-based instances).

For more information, refer to the [AWS EC2 Instance Quotas documentation](https://docs.aws.amazon.com/ec2/latest/instancetypes/ec2-instance-quotas.html).

**Note:** These quotas are based on vCPUs, not the number of instances, so be sure to request accordingly.
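To check those vCPU quotas without leaving the terminal, a hedged `boto3` sketch follows. The quota codes below are assumptions based on the current EC2 quota names ("Running On-Demand G and VT instances" and "Running On-Demand Inf instances"); verify them in the Service Quotas console before relying on this:

``` python
import boto3  # pip install boto3

# Assumed quota codes -- double-check them in the Service Quotas console
QUOTAS = {
    "L-DB2E81BA": "Running On-Demand G and VT instances (vCPUs)",
    "L-1945791B": "Running On-Demand Inf instances (vCPUs)",
}

sq = boto3.client("service-quotas")
for code, label in QUOTAS.items():
    try:
        quota = sq.get_service_quota(ServiceCode="ec2", QuotaCode=code)
    except sq.exceptions.NoSuchResourceException:
        # Fall back to the account default if no applied quota exists yet
        quota = sq.get_aws_default_service_quota(ServiceCode="ec2", QuotaCode=code)
    print(f"{label}: {quota['Quota']['Value']:.0f}")
```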
</details>

``` bash
# Wait for the pod to reach the 'Running' state
kubectl get po -n deepseek --watch

# Verify that a new node has been created
kubectl get nodes -l owner=data-engineer -o wide

# Check the logs to confirm that vLLM has started
# Select the command based on the accelerator you chose to deploy.
kubectl logs deployment.apps/deepseek-gpu-vllm-chart -n deepseek
kubectl logs deployment.apps/deepseek-neuron-vllm-chart -n deepseek
```

You should see the log entry **Application startup complete** once the deployment is ready.
### Interact with the LLM

Next, we can create a local proxy to interact with the model using a curl request.

``` bash
# Set up a proxy to forward the service port to your local terminal
# The Neuron-based service is exposed on port 8080 and the GPU-based service on port 8081
kubectl port-forward svc/deepseek-neuron-vllm-chart -n deepseek 8080:80 > port-forward-neuron.log 2>&1 &
kubectl port-forward svc/deepseek-gpu-vllm-chart -n deepseek 8081:80 > port-forward-gpu.log 2>&1 &

# Send a curl request to the model (change the port according to the accelerator you are using)
curl -X POST "http://localhost:8080/v1/chat/completions" -H "Content-Type: application/json" --data '{
    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "messages": [
      {
        "role": "user",
        "content": "What is Kubernetes?"
      }
    ]
  }'
```
The response may take a few seconds to generate, depending on the complexity of the model's output.
You can monitor the progress via the `deepseek-gpu-vllm-chart` or `deepseek-neuron-vllm-chart` deployment logs.
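If you'd rather script the interaction than hand-craft curl calls, the same OpenAI-compatible endpoint can be called from Python. A minimal sketch with `requests` (assumes the port-forward above is still running; switch to port 8080 for the Neuron-based service):

``` python
import re

import requests  # pip install requests

BASE_URL = "http://localhost:8081"  # GPU-based service; use 8080 for Neuron

payload = {
    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "messages": [{"role": "user", "content": "What is Kubernetes?"}],
}

response = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=300)
response.raise_for_status()
content = response.json()["choices"][0]["message"]["content"]

# DeepSeek-R1 models wrap their chain of thought in <think> tags; drop that
# block if you only want the final answer (the chatbot UI below strips the
# tags as well).
answer = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip()
print(answer)
```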
gr.Textbox(label="Password", type="password") 57 | login_button = gr.Button("Login") 58 | login_message = gr.Markdown(visible=False) 59 | 60 | with gr.Tab("Chatbot", visible=False) as chatbot_tab: 61 | model_base_url_dropdown = gr.Dropdown( 62 | choices=MODEL_URLS, 63 | label="Select Model Base URL", 64 | value=MODEL_URLS[0] # Default to the first URL 65 | ) 66 | chatbot = gr.Chatbot(height=300) 67 | msg = gr.Textbox(placeholder="Type your message here...", label="User Input") 68 | clear = gr.Button("Clear") 69 | 70 | def user(user_message, history, model_base_url): 71 | return "", history + [[user_message, None]] 72 | 73 | def bot(history, model_base_url): 74 | if not history: # Check if history is empty 75 | return history 76 | user_message = history[-1][0] 77 | bot_message = query_model(user_message, model_base_url) 78 | history[-1][1] = bot_message 79 | return history 80 | 81 | msg.submit(user, [msg, chatbot, model_base_url_dropdown], [msg, chatbot], queue=False).then( 82 | bot, [chatbot, model_base_url_dropdown], chatbot 83 | ) 84 | clear.click(lambda: None, None, chatbot, queue=False) 85 | 86 | def login(username, password): 87 | if authenticate(username, password): 88 | return ( 89 | gr.update(value="Login successful!", visible=True), # login_message 90 | gr.update(visible=True), # chatbot_tab 91 | "", # username 92 | "", # password 93 | ) 94 | else: 95 | return ( 96 | gr.update(value="Invalid credentials. Please try again.", visible=True), # login_message 97 | gr.update(visible=False), # chatbot_tab 98 | gr.update(), # username (no change) 99 | gr.update(), # password (no change) 100 | ) 101 | 102 | login_button.click(login, inputs=[username, password], outputs=[login_message, chatbot_tab, username, password]) 103 | 104 | return demo 105 | 106 | if __name__ == "__main__": 107 | demo = create_demo() 108 | demo.launch(server_name="0.0.0.0", server_port=7860) 109 | -------------------------------------------------------------------------------- /chatbot-ui/application/requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles==23.2.1 2 | annotated-types==0.7.0 3 | anyio==4.8.0 4 | certifi==2024.12.14 5 | charset-normalizer==3.4.1 6 | click==8.1.8 7 | fastapi==0.115.7 8 | ffmpy==0.5.0 9 | filelock==3.17.0 10 | fsspec==2024.12.0 11 | gradio==5.13.1 12 | gradio_client==1.6.0 13 | h11==0.14.0 14 | httpcore==1.0.7 15 | httpx==0.28.1 16 | huggingface-hub==0.28.0 17 | idna==3.10 18 | Jinja2==3.1.5 19 | markdown-it-py==3.0.0 20 | MarkupSafe==2.1.5 21 | mdurl==0.1.2 22 | numpy==2.2.2 23 | orjson==3.10.15 24 | packaging==24.2 25 | pandas==2.2.3 26 | pillow==11.1.0 27 | pydantic==2.10.6 28 | pydantic_core==2.27.2 29 | pydub==0.25.1 30 | Pygments==2.19.1 31 | python-dateutil==2.9.0.post0 32 | python-multipart==0.0.20 33 | pytz==2024.2 34 | PyYAML==6.0.2 35 | requests==2.32.3 36 | rich==13.9.4 37 | ruff==0.9.3 38 | safehttpx==0.1.6 39 | semantic-version==2.10.0 40 | shellingham==1.5.4 41 | six==1.17.0 42 | sniffio==1.3.1 43 | starlette==0.45.3 44 | tomlkit==0.13.2 45 | tqdm==4.67.1 46 | typer==0.15.1 47 | typing_extensions==4.12.2 48 | tzdata==2025.1 49 | urllib3==2.3.0 50 | uvicorn==0.34.0 51 | websockets==14.2 52 | -------------------------------------------------------------------------------- /chatbot-ui/manifests/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: deepseek-chatbot 5 | namespace: deepseek 6 | labels: 7 | app: 
--------------------------------------------------------------------------------
/chatbot-ui/application/Dockerfile:
--------------------------------------------------------------------------------
# Use an official Python runtime as a parent image
FROM python:3.12-slim

# Set the working directory in the container
WORKDIR /app

# Copy the current directory contents into the container at /app
COPY . /app

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Make port 7860 available to the world outside this container
EXPOSE 7860

# Run app.py when the container launches
CMD ["python", "app.py"]

--------------------------------------------------------------------------------
/chatbot-ui/application/app.py:
--------------------------------------------------------------------------------
import gradio as gr
import requests
import os
import re
import json

# Get credentials from environment variables
ADMIN_USERNAME = os.getenv("ADMIN_USERNAME", "admin")
ADMIN_PASSWORD = os.getenv("ADMIN_PASSWORD", "password")

# List of possible model base URLs; MODEL_URLS must be a JSON array of URLs
MODEL_URLS = json.loads(os.getenv("MODEL_URLS", '["http://localhost"]'))


def authenticate(username, password):
    return username == ADMIN_USERNAME and password == ADMIN_PASSWORD


def query_model(question, model_base_url):
    url = f"{model_base_url}/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    data = {
        "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
        "messages": [
            {
                "role": "user",
                "content": question
            }
        ]
    }

    response = requests.post(url, headers=headers, json=data)

    if response.status_code == 200:
        result = response.json()
        content = result['choices'][0]['message']['content']
        # Remove <think> and </think> tags
        content = re.sub(r'<think>|</think>', '', content).strip()
        return content
    else:
        return f"Error: {response.status_code} - {response.text}"


def chatbot(message, history, model_base_url):
    response = query_model(message, model_base_url)
    history.append((message, response))
    return history


def create_demo():
    with gr.Blocks() as demo:
        gr.Markdown("# DeepSeek AI Chatbot")

        with gr.Tab("Login"):
            username = gr.Textbox(label="Username")
            password = gr.Textbox(label="Password", type="password")
            login_button = gr.Button("Login")
            login_message = gr.Markdown(visible=False)

        with gr.Tab("Chatbot", visible=False) as chatbot_tab:
            model_base_url_dropdown = gr.Dropdown(
                choices=MODEL_URLS,
                label="Select Model Base URL",
                value=MODEL_URLS[0]  # Default to the first URL
            )
            chatbot = gr.Chatbot(height=300)
            msg = gr.Textbox(placeholder="Type your message here...", label="User Input")
            clear = gr.Button("Clear")

            def user(user_message, history, model_base_url):
                return "", history + [[user_message, None]]

            def bot(history, model_base_url):
                if not history:  # Check if history is empty
                    return history
                user_message = history[-1][0]
                bot_message = query_model(user_message, model_base_url)
                history[-1][1] = bot_message
                return history

            msg.submit(user, [msg, chatbot, model_base_url_dropdown], [msg, chatbot], queue=False).then(
                bot, [chatbot, model_base_url_dropdown], chatbot
            )
            clear.click(lambda: None, None, chatbot, queue=False)

        def login(username, password):
            if authenticate(username, password):
                return (
                    gr.update(value="Login successful!", visible=True),  # login_message
                    gr.update(visible=True),  # chatbot_tab
                    "",  # username
                    "",  # password
                )
            else:
                return (
                    gr.update(value="Invalid credentials. Please try again.", visible=True),  # login_message
                    gr.update(visible=False),  # chatbot_tab
                    gr.update(),  # username (no change)
                    gr.update(),  # password (no change)
                )

        login_button.click(login, inputs=[username, password], outputs=[login_message, chatbot_tab, username, password])

    return demo

if __name__ == "__main__":
    demo = create_demo()
    demo.launch(server_name="0.0.0.0", server_port=7860)
--------------------------------------------------------------------------------
/chatbot-ui/application/requirements.txt:
--------------------------------------------------------------------------------
aiofiles==23.2.1
annotated-types==0.7.0
anyio==4.8.0
certifi==2024.12.14
charset-normalizer==3.4.1
click==8.1.8
fastapi==0.115.7
ffmpy==0.5.0
filelock==3.17.0
fsspec==2024.12.0
gradio==5.13.1
gradio_client==1.6.0
h11==0.14.0
httpcore==1.0.7
httpx==0.28.1
huggingface-hub==0.28.0
idna==3.10
Jinja2==3.1.5
markdown-it-py==3.0.0
MarkupSafe==2.1.5
mdurl==0.1.2
numpy==2.2.2
orjson==3.10.15
packaging==24.2
pandas==2.2.3
pillow==11.1.0
pydantic==2.10.6
pydantic_core==2.27.2
pydub==0.25.1
Pygments==2.19.1
python-dateutil==2.9.0.post0
python-multipart==0.0.20
pytz==2024.2
PyYAML==6.0.2
requests==2.32.3
rich==13.9.4
ruff==0.9.3
safehttpx==0.1.6
semantic-version==2.10.0
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
starlette==0.45.3
tomlkit==0.13.2
tqdm==4.67.1
typer==0.15.1
typing_extensions==4.12.2
tzdata==2025.1
urllib3==2.3.0
uvicorn==0.34.0
websockets==14.2

--------------------------------------------------------------------------------
/chatbot-ui/manifests/deployment.yaml:
--------------------------------------------------------------------------------
apiVersion: apps/v1
kind: Deployment
metadata:
  name: deepseek-chatbot
  namespace: deepseek
  labels:
    app: deepseek-chatbot
spec:
  replicas: 1
  selector:
    matchLabels:
      app: deepseek-chatbot
  template:
    metadata:
      labels:
        app: deepseek-chatbot
    spec:
      containers:
      - name: deepseek-chatbot
        image: __IMAGE_DEEPSEEK_CHATBOT__
        ports:
        - containerPort: 7860
        env:
        - name: ADMIN_USERNAME
          valueFrom:
            secretKeyRef:
              name: deepseek-chatbot-secrets
              key: admin-username
        - name: ADMIN_PASSWORD
          valueFrom:
            secretKeyRef:
              name: deepseek-chatbot-secrets
              key: admin-password
        - name: MODEL_URLS
          value: '["http://deepseek-gpu-vllm-chart", "http://deepseek-neuron-vllm-chart"]'
        resources:
          requests:
            cpu: "250m"
            memory: "512Mi"
          limits:
            cpu: "500m"
            memory: "1Gi"
---
apiVersion: v1
kind: Service
metadata:
  name: deepseek-chatbot-service
  namespace: deepseek
spec:
  selector:
    app: deepseek-chatbot
  ports:
    - protocol: TCP
      port: 80
      targetPort: 7860
  type: ClusterIP
---
apiVersion: v1
kind: Secret
metadata:
  name: deepseek-chatbot-secrets
  namespace: deepseek
type: Opaque
stringData:
  admin-username: admin
  admin-password: __PASSWORD__
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: deepseek-chatbot-ingress
  namespace: deepseek
  annotations:
    alb.ingress.kubernetes.io/scheme: internet-facing
    alb.ingress.kubernetes.io/target-type: ip
spec:
  ingressClassName: alb
  rules:
    - http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: deepseek-chatbot-service
                port:
                  number: 80
--------------------------------------------------------------------------------
/chatbot-ui/manifests/ingress-class.yaml:
--------------------------------------------------------------------------------
apiVersion: eks.amazonaws.com/v1
kind: IngressClassParams
metadata:
  name: alb
spec:
  scheme: internet-facing
---
apiVersion: networking.k8s.io/v1
kind: IngressClass
metadata:
  name: alb
  annotations:
    ingressclass.kubernetes.io/is-default-class: "true"
spec:
  controller: eks.amazonaws.com/alb
  parameters:
    apiGroup: eks.amazonaws.com
    kind: IngressClassParams
    name: alb

--------------------------------------------------------------------------------
/helm.tf:
--------------------------------------------------------------------------------
resource "helm_release" "deepseek_gpu" {
  count            = var.enable_deep_seek_gpu ? 1 : 0
  name             = "deepseek-gpu"
  chart            = "./vllm-chart"
  create_namespace = true
  wait             = false
  replace          = true
  namespace        = "deepseek"

  values = [
    <<-EOT
    nodeSelector:
      owner: "data-engineer"
      instanceType: "gpu"
    tolerations:
      - key: "nvidia.com/gpu"
        operator: "Exists"
        effect: "NoSchedule"
    resources:
      limits:
        cpu: "32"
        memory: 100G
        nvidia.com/gpu: "1"
      requests:
        cpu: "16"
        memory: 30G
        nvidia.com/gpu: "1"
    command: "vllm serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --max-model-len 2048"

    livenessProbe:
      httpGet:
        path: /health
        port: 8000
      initialDelaySeconds: 1800
      periodSeconds: 10

    readinessProbe:
      httpGet:
        path: /health
        port: 8000
      initialDelaySeconds: 1800
      periodSeconds: 5

    EOT
  ]
  depends_on = [module.eks, kubernetes_manifest.gpu_nodepool]
}

resource "helm_release" "deepseek_neuron" {
  count            = var.enable_deep_seek_neuron ? 1 : 0
  name             = "deepseek-neuron"
  chart            = "./vllm-chart"
  create_namespace = true
  wait             = false
  replace          = true
  namespace        = "deepseek"

  values = [
    <<-EOT
    image:
      repository: ${aws_ecr_repository.neuron-ecr.repository_url}
      tag: 0.1
      pullPolicy: IfNotPresent

    nodeSelector:
      owner: "data-engineer"
      instanceType: "neuron"
    tolerations:
      - key: "aws.amazon.com/neuron"
        operator: "Exists"
        effect: "NoSchedule"

    command: "vllm serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --device neuron --tensor-parallel-size 2 --max-num-seqs 4 --block-size 8 --use-v2-block-manager --max-model-len 2048"

    env:
      - name: NEURON_RT_NUM_CORES
        value: "2"
      - name: NEURON_RT_VISIBLE_CORES
        value: "0,1"
      - name: VLLM_LOGGING_LEVEL
        value: "INFO"

    resources:
      limits:
        cpu: "30"
        memory: 64G
        aws.amazon.com/neuron: "1"
      requests:
        cpu: "30"
        memory: 64G
        aws.amazon.com/neuron: "1"

    livenessProbe:
      httpGet:
        path: /health
        port: 8000
      initialDelaySeconds: 1800
      periodSeconds: 10

    readinessProbe:
      httpGet:
        path: /health
        port: 8000
      initialDelaySeconds: 1800
      periodSeconds: 5
    EOT
  ]
  depends_on = [module.eks, kubernetes_manifest.neuron_nodepool]
}
--------------------------------------------------------------------------------
/main.tf:
--------------------------------------------------------------------------------
variable "enable_deep_seek_gpu" {
  description = "Enable DeepSeek using GPUs"
  type        = bool
  default     = false
}

variable "enable_deep_seek_neuron" {
  description = "Enable DeepSeek using Neuron"
  type        = bool
  default     = false
}

variable "enable_auto_mode_node_pool" {
  description = "Enable EKS AutoMode NodePool"
  type        = bool
  default     = false
}

locals {
  region   = "us-east-1"
  vpc_cidr = "10.0.0.0/16"
  name     = "eks-automode"
  azs      = slice(data.aws_availability_zones.available.names, 0, 3)

  tags = {
    Blueprint = local.name
  }
}


# Define the required providers
provider "aws" {
  region = local.region # Change to your desired region
}

provider "kubernetes" {
  host                   = module.eks.cluster_endpoint
  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)

  exec {
    api_version = "client.authentication.k8s.io/v1beta1"
    command     = "aws"
    # This requires the awscli to be installed locally where Terraform is executed
    args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
  }
}

provider "helm" {
  kubernetes {
    host                   = module.eks.cluster_endpoint
    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)

    exec {
      api_version = "client.authentication.k8s.io/v1beta1"
      command     = "aws"
      # This requires the awscli to be installed locally where Terraform is executed
      args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
    }
  }
}

data "aws_availability_zones" "available" {
  # Do not include local zones
  filter {
    name   = "opt-in-status"
    values = ["opt-in-not-required"]
  }
}

# Use the Terraform VPC module to create a VPC
module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "5.17.0" # Use the latest version available

  name = "${local.name}-vpc"
  cidr = local.vpc_cidr

  azs             = local.azs
  private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 4, k)]
  public_subnets  = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 48)]

  enable_nat_gateway = true
  single_nat_gateway = true

  public_subnet_tags = {
    "kubernetes.io/role/elb" = 1
  }

  private_subnet_tags = {
    "kubernetes.io/role/internal-elb" = 1
  }

  tags = local.tags
}

# Use the Terraform EKS module to create an EKS cluster
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "20.33.1" # Use the latest version available

  cluster_name    = local.name
  cluster_version = "1.31" # Specify the EKS version you want to use

  cluster_endpoint_public_access           = true
  enable_irsa                              = true
  enable_cluster_creator_admin_permissions = true

  cluster_compute_config = {
    enabled    = true
    node_pools = ["general-purpose"]
  }


  vpc_id     = module.vpc.vpc_id
  subnet_ids = module.vpc.private_subnets

  tags = local.tags
}


resource "aws_ecr_repository" "chatbot-ecr" {
  name                 = "${local.name}-chatbot"
  image_tag_mutability = "MUTABLE"
}

resource "aws_ecr_repository" "neuron-ecr" {
  name                 = "${local.name}-neuron-base"
  image_tag_mutability = "MUTABLE"
}

# Outputs
output "configure_kubectl" {
  description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
  value       = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}"
}

output "ecr_repository_uri" {
  value = aws_ecr_repository.chatbot-ecr.repository_url
}

output "ecr_repository_uri_neuron" {
  value = aws_ecr_repository.neuron-ecr.repository_url
}

--------------------------------------------------------------------------------
/nodepool_automode.tf:
--------------------------------------------------------------------------------
resource "kubernetes_manifest" "gpu_nodepool" {
  count = var.enable_auto_mode_node_pool && var.enable_deep_seek_gpu ? 1 : 0
  manifest = {
    apiVersion = "karpenter.sh/v1"
    kind       = "NodePool"
    metadata = {
      name = "gpu-nodepool"
    }
    spec = {
      template = {
        metadata = {
          labels = {
            owner        = "data-engineer"
            instanceType = "gpu"
          }
        }
        spec = {
          nodeClassRef = {
            group = "eks.amazonaws.com"
            kind  = "NodeClass"
            name  = "default"
          }
          taints = [
            {
              key    = "nvidia.com/gpu"
              value  = "Exists"
              effect = "NoSchedule"
            }
          ]
          requirements = [
            {
              key      = "eks.amazonaws.com/instance-family"
              operator = "In"
              values   = ["g5", "g6", "g6e", "p5", "p4"]
            },
            {
              key      = "kubernetes.io/arch"
              operator = "In"
              values   = ["amd64"]
            },
            {
              key      = "karpenter.sh/capacity-type"
              operator = "In"
              values   = ["spot", "on-demand"]
            }
          ]
        }
      }
      limits = {
        cpu    = "1000"
        memory = "1000Gi"
      }
    }
  }

  depends_on = [module.eks]
}

resource "kubernetes_manifest" "neuron_nodepool" {
  count = var.enable_auto_mode_node_pool && var.enable_deep_seek_neuron ? 1 : 0
  manifest = {
    apiVersion = "karpenter.sh/v1"
    kind       = "NodePool"
    metadata = {
      name = "neuron-nodepool"
    }
    spec = {
      template = {
        metadata = {
          labels = {
            owner        = "data-engineer"
            instanceType = "neuron"
          }
        }
        spec = {
          nodeClassRef = {
            group = "eks.amazonaws.com"
            kind  = "NodeClass"
            name  = "default"
          }
          taints = [
            {
              key    = "aws.amazon.com/neuron"
              value  = "Exists"
              effect = "NoSchedule"
            }
          ]
          requirements = [
            {
              key      = "eks.amazonaws.com/instance-family"
              operator = "In"
              values   = ["inf2"]
            },
            {
              key      = "karpenter.sh/capacity-type"
              operator = "In"
              values   = ["spot", "on-demand"]
            }
          ]
        }
      }
      limits = {
        cpu    = "1000"
        memory = "1000Gi"
      }
    }
  }

  depends_on = [module.eks]
}
--------------------------------------------------------------------------------
/static/images/chatbot.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/deepseek-using-vllm-on-eks/d05003f1744b921081fabed9a3615bf71eea68aa/static/images/chatbot.jpg

--------------------------------------------------------------------------------
/static/images/cloudshell.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/deepseek-using-vllm-on-eks/d05003f1744b921081fabed9a3615bf71eea68aa/static/images/cloudshell.jpg

--------------------------------------------------------------------------------
/vllm-chart/.helmignore:
--------------------------------------------------------------------------------
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/

--------------------------------------------------------------------------------
/vllm-chart/Chart.yaml:
--------------------------------------------------------------------------------
apiVersion: v2
name: vllm-chart
description: A Helm chart for Kubernetes

# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.16.0"
--------------------------------------------------------------------------------
/vllm-chart/templates/NOTES.txt:
--------------------------------------------------------------------------------
{{- if contains "NodePort" .Values.service.type }}
export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "vllm-chart.fullname" . }})
export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
echo http://$NODE_IP:$NODE_PORT
{{- else if contains "LoadBalancer" .Values.service.type }}
NOTE: It may take a few minutes for the LoadBalancer IP to be available.
You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "vllm-chart.fullname" . }}'
export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "vllm-chart.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
echo http://$SERVICE_IP:{{ .Values.service.port }}
{{- else if contains "ClusterIP" .Values.service.type }}
export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "vllm-chart.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
echo "Visit http://127.0.0.1:8080 to use your application"
kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT
{{- end }}

--------------------------------------------------------------------------------
/vllm-chart/templates/_helpers.tpl:
--------------------------------------------------------------------------------
{{/*
Expand the name of the chart.
*/}}
{{- define "vllm-chart.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "vllm-chart.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "vllm-chart.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Common labels
*/}}
{{- define "vllm-chart.labels" -}}
helm.sh/chart: {{ include "vllm-chart.chart" . }}
{{ include "vllm-chart.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}

{{/*
Selector labels
*/}}
{{- define "vllm-chart.selectorLabels" -}}
app.kubernetes.io/name: {{ include "vllm-chart.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Create the name of the service account to use
*/}}
{{- define "vllm-chart.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "vllm-chart.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}
--------------------------------------------------------------------------------
/vllm-chart/templates/deployment.yaml:
--------------------------------------------------------------------------------
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "vllm-chart.fullname" . }}
  namespace: {{ .Values.namespace }}
  labels:
    {{- include "vllm-chart.labels" . | nindent 4 }}
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      {{- include "vllm-chart.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      labels:
        {{- include "vllm-chart.labels" . | nindent 8 }}
    spec:
      nodeSelector:
        {{- toYaml .Values.nodeSelector | nindent 8 }}
      tolerations:
        {{- toYaml .Values.tolerations | nindent 8 }}
      volumes:
        - name: cache-volume
          hostPath:
            path: {{ .Values.cacheVolume.path }}
            type: DirectoryOrCreate
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: {{ .Values.shmVolume.sizeLimit }}
      containers:
        - name: {{ .Chart.Name }}
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          command: ["/bin/sh", "-c"]
          args:
            - {{ .Values.command }}
          {{- if .Values.env }}
          env:
            {{- toYaml .Values.env | nindent 12 }}
          {{- end }}
          ports:
            - containerPort: {{ .Values.containerPort }}
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
          volumeMounts:
            - mountPath: /root/.cache/huggingface
              name: cache-volume
            - name: shm
              mountPath: /dev/shm
          livenessProbe:
            {{- toYaml .Values.livenessProbe | nindent 12 }}
          readinessProbe:
            {{- toYaml .Values.readinessProbe | nindent 12 }}

--------------------------------------------------------------------------------
/vllm-chart/templates/service.yaml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: Service
metadata:
  name: {{ include "vllm-chart.fullname" . }}
  namespace: {{ .Values.namespace }}
  labels:
    {{- include "vllm-chart.labels" . | nindent 4 }}
spec:
  type: {{ .Values.service.type }}
  ports:
    - name: http
      port: {{ .Values.service.port }}
      protocol: TCP
      targetPort: {{ .Values.service.targetPort }}
  selector:
    {{- include "vllm-chart.selectorLabels" . | nindent 4 }}

--------------------------------------------------------------------------------
/vllm-chart/values.yaml:
--------------------------------------------------------------------------------
namespace: deepseek
replicaCount: 1

containerPort: 8000

image:
  repository: vllm/vllm-openai
  tag: latest
  pullPolicy: IfNotPresent

nodeSelector: {}

tolerations: []

cacheVolume:
  path: /tmp/deepseek

shmVolume:
  sizeLimit: 2Gi

command: "vllm serve __MODEL_NAME_AND_PARAMETERS__"

resources:
  limits:
    cpu: "32"
    memory: 100G
  requests:
    cpu: "16"
    memory: 30G

service:
  type: ClusterIP
  port: 80
  targetPort: 8000

livenessProbe:
  httpGet:
    path: /health
    port: 8000
  initialDelaySeconds: 60
  periodSeconds: 10

readinessProbe:
  httpGet:
    path: /health
    port: 8000
  initialDelaySeconds: 60
  periodSeconds: 5

env: []

--------------------------------------------------------------------------------