├── example ├── __init__.py ├── hello_word.py ├── ollama_app_stream.py └── ollama_app.py ├── ling_code ├── __init__.py ├── inference_script.py └── deepseek.py ├── server ├── __init__.py └── ollama_server.py ├── .dockerignore ├── docs └── easydeploy_modules_20241125.png ├── .gitignore ├── templates ├── index.html └── chat_page.html ├── docker-compose.yaml ├── Dockerfile ├── main.py ├── app.py ├── README_CN.md ├── README.md └── LICENSE /example/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /ling_code/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /server/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | *.pyo 4 | *.pyd 5 | env/ 6 | venv/ -------------------------------------------------------------------------------- /docs/easydeploy_modules_20241125.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/EasyDeploy/main/docs/easydeploy_modules_20241125.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | .idea/ 9 | 10 | # Distribution / packaging 11 | .Python 12 | fuhui_dev/ 13 | bakcup/ 14 | git-pre-push-hook-warn.log -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Welcome 7 | 8 | 9 |

Hello, Flask!
10 | This is a simple web page.
11 | 12 | 13 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # docker-compose.yml 2 | version: '3.8' 3 | services: 4 | app: 5 | build: . 6 | container_name: app 7 | command: sh -c "ollama serve && ollama run llama3.2 && uvicorn app:app --host 0.0.0.0 --port 8000" 8 | depends_on: 9 | - ollama 10 | ports: 11 | - "8000:8000" 12 | restart: always 13 | 14 | volumes: 15 | ollama-storage: -------------------------------------------------------------------------------- /example/hello_word.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import requests 4 | 5 | 6 | def run(): 7 | url = 'http://127.0.0.1:5000/' 8 | headers = {"Content-Type": "application/json"} 9 | res = requests.post(url, headers=headers) 10 | res_text = res.text 11 | print('res_text: {}'.format(res_text)) 12 | 13 | 14 | if __name__ == '__main__': 15 | run() 16 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10 2 | 3 | ADD . /workspace/code-repo 4 | WORKDIR /workspace/code-repo 5 | 6 | RUN pip3 install fastapi uvicorn 7 | RUN pip3 install requests 8 | RUN pip3 install jinja2 9 | 10 | ENV PYTHONPATH /workspace/code-repo 11 | 12 | RUN apt-get update && apt-get install -y curl 13 | RUN curl -fsSL https://ollama.com/install.sh | sh 14 | 15 | ENV FLASK_RUN_HOST=0.0.0.0 16 | 17 | EXPOSE 8000 18 | 19 | CMD sh -c "ollama serve & ollama run llama3.2" 20 | CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] 21 | -------------------------------------------------------------------------------- /server/ollama_server.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import requests 3 | import json 4 | 5 | url_generate = "http://127.0.0.1:11434/api/generate" 6 | 7 | 8 | def get_response(url, data): 9 | response = requests.post(url, json=data) 10 | response_dict = json.loads(response.text) 11 | response_content = response_dict["response"] 12 | return response_content 13 | 14 | 15 | data = { 16 | "model": "llama3.2", 17 | "prompt": "hello", 18 | "stream": False 19 | } 20 | 21 | res = get_response(url_generate,data) 22 | print(res) 23 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # This is a sample Python script. 2 | 3 | # Press ⌃R to execute it or replace it with your code. 4 | # Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings. 5 | 6 | 7 | def print_hi(name): 8 | # Use a breakpoint in the code line below to debug your script. 9 | print(f'Hi, {name}') # Press ⌘F8 to toggle the breakpoint. 10 | 11 | 12 | # Press the green button in the gutter to run the script. 
13 | if __name__ == '__main__': 14 | print_hi('PyCharm') 15 | 16 | # See PyCharm help at https://www.jetbrains.com/help/pycharm/ 17 | -------------------------------------------------------------------------------- /example/ollama_app_stream.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import requests 4 | 5 | 6 | # 发起请求,并将stream参数设置为True以获取流式输出 7 | url = 'http://127.0.0.1:8000/chat/completions' 8 | prompt = 'hello' 9 | model = 'llama3.2' 10 | messages = [{"role": "user", "content": prompt}] 11 | data = {'model': model, 'messages': messages, 'stream': True} 12 | headers = {"Content-Type": "application/json"} 13 | 14 | response = requests.post(url, headers=headers, data=json.dumps(data)) 15 | 16 | resp = '' 17 | for line in response.iter_lines(): 18 | data = line.decode('utf-8') 19 | data_dict = json.loads(data) 20 | text = data_dict['choices'][-1]['delta']['content'] 21 | resp += text 22 | print('resp: {}'.format(resp)) 23 | -------------------------------------------------------------------------------- /example/ollama_app.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import requests 4 | 5 | 6 | def run(): 7 | url = 'http://127.0.0.1:8000/chat/completions' 8 | prompt = 'hello' 9 | model = 'llama3.2' 10 | messages = [{"role": "user", "content": prompt}] 11 | infer_param = {} 12 | data = {'engine': 'ollama', 'model': model, 'messages': messages, 'infer_param': infer_param} 13 | headers = {"Content-Type": "application/json"} 14 | response = requests.post(url, headers=headers, data=json.dumps(data)) 15 | 16 | if response.status_code == 200: 17 | ans_dict = json.loads(response.text) 18 | print('data: {}'.format(ans_dict)) 19 | 20 | 21 | if __name__ == '__main__': 22 | run() 23 | -------------------------------------------------------------------------------- /ling_code/inference_script.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | from vllm import LLM 4 | from vllm.sampling_params import SamplingParams 5 | 6 | # os.environ['LD_LIBRARY_PATH'] = '/root/miniconda3/lib/python3.10/site-packages/nvidia/cublas/lib' 7 | # model_path = '/mnt/modelops/models/Bailing_Code_MoE_Lite_4K_Chat_20250304_dpsk_gptq_int4' 8 | model_path = '{your model path}' 9 | 10 | enforce_eager = False 11 | 12 | # GPU运行 13 | trust_remote_code = True 14 | tensor_parallel_size = 1 15 | gpu_memory_utilization = 0.80 16 | max_model_len = 4096 17 | max_tokens = 4096 18 | model = LLM(model_path, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, gpu_memory_utilization=gpu_memory_utilization, max_model_len=max_model_len) 19 | prompt = "SYSTEM<\\/role>假设你是一个医疗助理,请回答问题,回答时需要遵循下列要求。\n要求:\n1. 首先总起概括,然后在回答中使用数字1、2、3等进行分条目阐述解释,并在最后总结。\n2. 对参考内容当中与问题相关且正确的部分进行整合,可以结合医学知识进行适当推理。\n3. 回答内容专业详实、逻辑清晰,不能出现医学错误。严谨礼貌,符合医疗及政策规范。\n4. 对于不合规或者高风险的医疗项目,要提示中国大陆不允许展开。\n5. 对于上门进行医疗服务的相关问题,要提示需要在有相应资质的诊疗机构由专业医疗人员进行。\n6. 对于高风险处方药,需要向用户表明风险。\n7. 对于违规引产,需要说明不建议,若需要引产,则要在符合医疗政策和规范的情况下去有资质的医院进行。\n8. 对于有偿献血,需要说明中国大陆不存在有偿献血,献血都是无偿的。\n9. 
请不要忘记你是一个医疗助理,针对问题给出积极正向的建议和科普,而不能像医生一样给出确定性的诊疗意见。\nHUMAN<\\/role>艾滋病患者如何正确服用抗病毒药?ASSISTANT<\\/role>" 20 | 21 | sample_params = SamplingParams(max_tokens=max_tokens, ignore_eos=False) 22 | result = model.generate(prompt, sampling_params=sample_params, prompt_token_ids=None) 23 | print('result: {}'.format(result)) 24 | -------------------------------------------------------------------------------- /templates/chat_page.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | LLMs Chat Page 7 | 30 | 31 | 32 |

LLMs Chat
53 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import requests 3 | from fastapi.responses import StreamingResponse 4 | import time 5 | import json 6 | from typing import Iterator 7 | from fastapi import FastAPI, Request 8 | from fastapi.responses import HTMLResponse, JSONResponse 9 | from fastapi.templating import Jinja2Templates 10 | from pydantic import BaseModel 11 | 12 | 13 | app = FastAPI() 14 | templates = Jinja2Templates(directory="templates") # Specify the template directory here. 15 | url_generate = "http://127.0.0.1:11434/api/generate" # Ollama API 地址 16 | 17 | 18 | class UserInput(BaseModel): 19 | prompt: str 20 | stream: bool = False # Optional field 21 | 22 | 23 | class UserOutput(BaseModel): 24 | model_reply: str 25 | 26 | 27 | def get_response(model: str, prompt: str) -> str: 28 | data = { 29 | "model": model, 30 | "prompt": prompt, 31 | "stream": False # 暂时不处理流式输出 32 | } 33 | try: 34 | response = requests.post(url_generate, json=data) 35 | response.raise_for_status() 36 | response_dict = response.json() 37 | return response_dict.get("response", "没有收到响应。") 38 | except requests.exceptions.RequestException as e: 39 | return f"请求 Ollama 服务时出错: {e}" 40 | 41 | 42 | @app.get("/hello_word", response_class=HTMLResponse) 43 | async def home(): 44 | return 'Hello Word!' 45 | 46 | 47 | @app.get("/", response_class=HTMLResponse) 48 | async def get_home(request: Request): 49 | return templates.TemplateResponse("chat_page.html", {"request": request}) 50 | 51 | 52 | @app.post("/chat") 53 | async def chat(request: Request): 54 | try: 55 | body = await request.json() 56 | user_input = body.get("user_input") 57 | model = body.get("model") 58 | 59 | if not user_input or not model: 60 | return JSONResponse(status_code=400, content={"reply": "缺少 user_input 或 model 参数。"}) 61 | 62 | reply = get_response(model, user_input) 63 | return {"reply": reply} 64 | 65 | except Exception as e: 66 | return JSONResponse(status_code=500, content={"reply": f"内部服务器错误: {e}"}) 67 | 68 | 69 | # @app.post("/generate") 70 | @app.post("/chat/completions") 71 | async def generate(request: Request): 72 | # 从请求中解析 JSON 数据 73 | request_dict = await request.json() 74 | model = request_dict.get("model", "") 75 | messages = request_dict.get("messages", dict()) 76 | prompt = messages[-1].get("content", "") 77 | stream = request_dict.get("stream", False) 78 | 79 | infer_param = request_dict.get("infer_param", dict()) 80 | max_tokens = infer_param.get('max_tokens', 4096) 81 | temperature = infer_param.get('temperature', 0.9) 82 | top_p = infer_param.get('top_p', 0.9) 83 | n = infer_param.get('n', 1) 84 | stop = infer_param.get('stop', []) 85 | 86 | data = { 87 | "model": model, 88 | "prompt": prompt, 89 | "stream": stream, 90 | "max_tokens": max_tokens, 91 | "temperature": temperature, 92 | "top_p": top_p, 93 | "n": n, 94 | "stop": stop, 95 | } 96 | # url_generate = "http://127.0.0.1:11434/api/generate" 97 | 98 | if stream: 99 | # 如果需要流式响应 100 | promise = requests.post(url_generate, json=data, stream=True) 101 | 102 | def number_stream() -> Iterator[str]: 103 | for number in promise.iter_lines(): 104 | number_dict = json.loads(number.decode('utf-8')) 105 | model_reply = number_dict['response'] 106 | result = { 107 | "id": "ollama-123", 108 | "object": "chat.completion.chunk", 109 | "created": int(time.time()), 110 | "model": model, 111 | 
"system_fingerprint": "", 112 | "choices": [ 113 | { 114 | "index": 0, 115 | "delta": { 116 | "role": "assistant", 117 | "content": model_reply 118 | }, 119 | "logprobs": None, 120 | "finish_reason": None 121 | } 122 | ] 123 | } 124 | # yield f"{number.decode('utf-8')}\n" 125 | yield f"{json.dumps(result)}\n" 126 | 127 | return StreamingResponse(number_stream()) 128 | else: 129 | # 如果不需要流式响应 130 | print('非流式分支...') 131 | response = requests.post(url_generate, json=data) 132 | if response.status_code == 200: 133 | response_dict = json.loads(response.text) 134 | response_content = response_dict["response"] 135 | model_reply = response_content 136 | finish_reason = "stop" 137 | else: 138 | model_reply = "与模型通信失败。" 139 | finish_reason = "error" 140 | 141 | result = { 142 | "id": "ollama-123", 143 | "object": "chat.completion", 144 | "created": int(time.time()), 145 | "model": model, 146 | "system_fingerprint": "", 147 | "choices": [{ 148 | "index": 0, 149 | "message": { 150 | "role": "assistant", 151 | "content": model_reply, 152 | }, 153 | "logprobs": None, 154 | "finish_reason": finish_reason 155 | }], 156 | "usage": {} 157 | } 158 | return JSONResponse(result) 159 | 160 | 161 | if __name__ == '__main__': 162 | import uvicorn 163 | uvicorn.run(app, host="127.0.0.1", port=8000, log_level="info") 164 | -------------------------------------------------------------------------------- /README_CN.md: -------------------------------------------------------------------------------- 1 |
2 |

3 | EasyDeploy 4 |

5 |
6 | 7 |

8 |

9 |

10 |

11 | 中文 | 12 | English 13 |

14 |

15 |
16 | 17 | ## Contents 18 | - [新闻](#新闻) 19 | - [项目简介](#项目简介) 20 | - [快速部署](#快速部署) 21 | - [服务访问](#服务访问) 22 | - [架构图](#架构图) 23 | - [核心功能](#核心功能) 24 | - [致谢](#致谢) 25 | - [Contributing](#Contributing) 26 | 27 | ## 新闻 28 | - [2025.04.19] 支持Ling-moe-lite int8量化模型部署 29 | - [2024.11.06] EasyDeploy发布,基于docker+ollama的方式 30 | 31 | ## 项目简介 32 | EasyDeploy 旨在为用户提供端云一体的大模型部署能力,我们将大模型的部署和推理逻辑集成到 Docker 中,简化整体部署流程,全面提升用户体验。EasyDeploy 支持多种引擎,目前已支持 Ollama,未来将支持 vLLM 等其它引擎,进一步丰富用户的选择和应用场景。 33 | 34 | 通过 EasyDeploy,用户能够快速在云端与端设备之间部署和启动大模型,消除技术壁垒,专注于模型本身的应用和优化。无论是在本地开发环境、云端平台还是端设备中,EasyDeploy 都将为用户提供高效、可靠的解决方案,助力人工智能的快速发展与应用落地。 35 | 36 | ## 快速部署 37 | ### 环境依赖 38 | + python版本: 3.10 39 | + 依赖包安装: 40 | 41 | ```shell 42 | pip install -r requirements.txt 43 | ``` 44 | ### 服务启动 45 | Docker 镜像下载: 46 | 47 | 下载地址:上传后更新 48 | 49 | ```shell 50 | docker run -p 8000:8000 easydeploy_llama3.2_3b 51 | ``` 52 | 53 | ## 服务访问 54 | 当前服务以restful API方式提供流批一体访问功能,请求demo 如下: 55 | 56 | ### chat页面 57 | [http://127.0.0.1:8000/chat](http://127.0.0.1:8000/chat) 58 | 59 | ### API接口 60 | #### 阻塞访问: 61 | 请求方式: 62 | 63 | ```python 64 | # -*- coding: utf-8 -*- 65 | import json 66 | import requests 67 | url = 'http://127.0.0.1:8000/chat/completions' 68 | prompt = '你好' 69 | model = 'lamma3.2' 70 | messages = [{"role": "user", "content": prompt}] 71 | data = {'model': model, 'messages': messages} 72 | headers = {"Content-Type": "application/json"} 73 | response = requests.post(url, headers=headers, data=json.dumps(data)) 74 | if response.status_code == 200: 75 | ans_dict = json.loads(response.text) 76 | print('data: {}'.format(ans_dict)) 77 | ``` 78 | 79 | 返回格式: 80 | 81 | ```json 82 | { 83 | "id": "ollama-123", 84 | "object": "chat.completion", 85 | "created": 1731847899, 86 | "model": "lamma3.2", 87 | "system_fingerprint": "", 88 | "choices": [ 89 | { 90 | "index": 0, 91 | "message": { 92 | "role": "assistant", 93 | "content": "你好,我是大语言模型,我主要的任务是提供帮助用户解决问题和解答问题,比如回答关于技术、编程、知识问答等。" 94 | }, 95 | "logprobs": null, 96 | "finish_reason": "stop" 97 | } 98 | ], 99 | "usage": { 100 | 101 | } 102 | } 103 | ``` 104 | 105 | ### 流式访问: 106 | 请求方式: 107 | 108 | ```python 109 | # -*- coding: utf-8 -*- 110 | import json 111 | import requests 112 | url = 'http://127.0.0.1:8000/chat/completions' 113 | prompt = '你好' 114 | model = 'lamma3.2' 115 | messages = [{"role": "user", "content": prompt}] 116 | data = {'model': model, 'messages': messages, 'stream': True} 117 | headers = {"Content-Type": "application/json"} 118 | response = requests.post(url, headers=headers, data=json.dumps(data)) 119 | ``` 120 | 121 | 返回方式: 122 | 123 | ```json 124 | { 125 | "id": "ollama-123", 126 | "object": "chat.completion.chunk", 127 | "created": 1731848401, 128 | "model": "lamma3.2", 129 | "system_fingerprint": "", 130 | "choices": [ 131 | { 132 | "index": 0, 133 | "delta": { 134 | "role": "assistant", 135 | "content": "你" 136 | }, 137 | "logprobs": null, 138 | "finish_reason": null 139 | } 140 | ] 141 | } 142 | ``` 143 | 144 | ### 架构图 145 | ![easydeploy modules](docs/easydeploy_modules_20241125.png) 146 | ### 核心能力 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 |
| 分类 | 功能名称 | 状态 | 描述 |
| --- | --- | --- | --- |
| API Service | 基于OpenAI的标准API规范 | | 服务接口遵循 OpenAI 规范,通过标准化 API 降低接入成本,用户可轻松集成功能,快速响应业务需求,专注于核心开发。 |
| | 阻塞式访问能力 | | 适用于需要完整性和准确性、需要对结果进行整体校验或处理的任务,一次性获取完整输出。在整个过程中,用户需要等待直至所有输出内容完全生成。 |
| | 流式访问能力 | | 适用于对响应时间要求较高的实时应用,如代码补全、实时翻译或动态内容加载的场景。模型在生成过程中分段逐步传输内容,用户可在内容生成后立即接收和处理,无需等待全部完成,从而提升效率(参见表格下方的客户端示例)。 |
| | 高性能网关,提升用户开发能力 | | 高性能网关通过优化数据传输、采用先进负载均衡算法及高效资源管理,能有效承载高并发请求、降低延迟、提升响应速度。 |
| 多引擎支持 | Ollama | | Ollama 以易用和轻量著称,专注于高效稳定的大模型推理服务。其友好的 API 和简洁流畅的流程,使开发者能够轻松将其用于快速部署应用。 |
| | vLLM | | vLLM在内存管理和吞吐量上有显著优势,其通过优化显存管理和并行计算,显著提升推理速度和资源利用率,兼容多种硬件环境。vLLM提供丰富的配置选项,用户可根据需求调整推理策略,适用于实时和企业级应用。 |
| | TensorRT-LLM | | TensorRT-LLM (TensorRT for Large Language Models) 是 NVIDIA 推出的高性能、可扩展的推理优化库,专为大型语言模型(LLM)设计。 |
| Docker部署能力 | 基于 Python 3.10 构建 Docker 镜像 | | 将大模型及其依赖打包为 Docker 镜像,确保运行环境一致,简化部署与配置。利用 Docker 的版本化构建和自动化部署,提高模型更新与迭代效率,加快从开发到生产落地的转化。 |
| Web UI接入 | OpenUI 协议 | | 丰富的UI开源协议便于用户整合多种组件,提升产品的定制性和扩展性。 |
| 更多核心功能 | ModelCache语义缓存 | | 通过缓存已生成的QA Pair,使相似请求可实现毫秒级响应,提高模型推理的性能与效率。 |
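上表中的流式访问能力可以结合下面的客户端示例来理解。该示例改编自本仓库的 example/ollama_app_stream.py,假设服务已按前文方式在本地 8000 端口启动,模型名称仅作示意:

```python
# -*- coding: utf-8 -*-
# 流式访问客户端示例,改编自本仓库的 example/ollama_app_stream.py。
# 假设服务运行在 127.0.0.1:8000,模型名称仅作示意。
import json
import requests

url = 'http://127.0.0.1:8000/chat/completions'
data = {'model': 'llama3.2',
        'messages': [{"role": "user", "content": "你好"}],
        'stream': True}
headers = {"Content-Type": "application/json"}

# stream=True 让 requests 以流式方式逐块读取响应
response = requests.post(url, headers=headers, data=json.dumps(data), stream=True)

reply = ''
for line in response.iter_lines():
    if not line:
        continue
    # 每一行是 app.py 返回的一个 chat.completion.chunk 格式的 JSON 对象
    chunk = json.loads(line.decode('utf-8'))
    delta = chunk['choices'][0]['delta'].get('content', '')
    reply += delta
    print(delta, end='', flush=True)
print()
```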
210 | 211 | ## Ling-moe-lite int8量化模型推理 212 | ### 环境要求 213 | Python版本:python 3.10 214 | 215 | GPU卡类型:L20 216 | 217 | 环境配置: 218 | 219 | ```bash 220 | pip install vllm==0.6.3 221 | sudo yum install libcap-devel 222 | pip install python-prctl 223 | cp vllm_src/model_executor/models/deepseek.py /opt/conda/lib/python3.10/site-packages/vllm/model_executor/models/deepseek.py 224 | ``` 225 | 226 | vllm推理脚本 227 | 228 | ```python 229 | # -*- coding: utf-8 -*- 230 | import os 231 | from vllm import LLM 232 | from vllm.sampling_params import SamplingParams 233 | 234 | model_path = '{your model path}' 235 | 236 | enforce_eager = False 237 | 238 | # GPU运行 239 | trust_remote_code = True 240 | tensor_parallel_size = 1 241 | gpu_memory_utilization = 0.80 242 | max_model_len = 4096 243 | max_tokens = 4096 244 | model = LLM(model_path, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, gpu_memory_utilization=gpu_memory_utilization, max_model_len=max_model_len) 245 | prompt = "SYSTEM<\\/role>假设你是一个医疗助理,请回答问题,回答时需要遵循下列要求。\n要求:\n1. 首先总起概括,然后在回答中使用数字1、2、3等进行分条目阐述解释,并在最后总结。\n2. 对参考内容当中与问题相关且正确的部分进行整合,可以结合医学知识进行适当推理。\n3. 回答内容专业详实、逻辑清晰,不能出现医学错误。严谨礼貌,符合医疗及政策规范。\n4. 对于不合规或者高风险的医疗项目,要提示中国大陆不允许展开。\n5. 对于上门进行医疗服务的相关问题,要提示需要在有相应资质的诊疗机构由专业医疗人员进行。\n6. 对于高风险处方药,需要向用户表明风险。\n7. 对于违规引产,需要说明不建议,若需要引产,则要在符合医疗政策和规范的情况下去有资质的医院进行。\n8. 对于有偿献血,需要说明中国大陆不存在有偿献血,献血都是无偿的。\n9. 请不要忘记你是一个医疗助理,针对问题给出积极正向的建议和科普,而不能像医生一样给出确定性的诊疗意见。\nHUMAN<\\/role>艾滋病患者如何正确服用抗病毒药?ASSISTANT<\\/role>" 246 | 247 | sample_params = SamplingParams(max_tokens=max_tokens, ignore_eos=False) 248 | result = model.generate(prompt, sampling_params=sample_params, prompt_token_ids=None) 249 | print('result: {}'.format(result)) 250 | ``` 251 | 252 | ## 致谢 253 | 本项目参考了以下开源项目,在此对相关项目和研究开发人员表示感谢。 254 | [Ollama](https://github.com/ollama/ollama)、[vLLM](https://github.com/vllm-project/vllm) 255 | 256 | ## Contributing 257 | EasyDeploy是一个非常有趣且有用的项目,我们相信这个项目有很大的潜力,无论你是经验丰富的开发者,还是刚刚入门的新手,都欢迎你为这个项目做出一些贡献,包括但不限于:提交问题和建议,参与代码编写,完善文档和示例。 258 | 259 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |

3 | EasyDeploy 4 |

5 |
6 | 7 |

8 |

9 |

10 |

11 | 中文 | 12 | English 13 |

14 |

15 |
16 | 17 | ## Contents 18 | - [news](#news) 19 | - [Introduction](#Introduction) 20 | - [Quick-Deployment](#Quick-Deployment) 21 | - [Service-Access](#Service-Access) 22 | - [Modules](#Modules) 23 | - [Core-Features](#Core-Features) 24 | - [Acknowledgements](#Acknowledgements) 25 | - [Contributing](#Contributing) 26 | 27 | ### news 28 | - [2025.04.19] 支持Ling-moe-lite int8量化模型部署 29 | - [2024.11.06] EasyDeploy was released, utilizing Docker and Ollama based architecture. 30 | 31 | ## Introduction 32 | EasyDeploy is engineered to provide users with end-to-end deployment capabilities for large-scale models. By incorporating the deployment and inference logic of large models within Docker, EasyDeploy streamlines the overall deployment process and significantly enhances the user experience. Currently, EasyDeploy supports multiple engines, including Ollama, and plans to extend support to additional engines such as vLLM in the future. 33 | Through EasyDeploy, users are empowered to rapidly deploy and initiate large-scale models between cloud environments and local devices, effectively eliminating technical barriers and enabling a focus on the application and optimization of the models themselves. Whether operating within local environments or cloud platforms, EasyDeploy provides efficient and reliable solutions, thereby facilitating the swift advancement and practical implementation of artificial intelligence. 34 | 35 | ## Quick-Deployment 36 | ### Dependencies 37 | + Python version: 3.10 38 | + Package Installation 39 | ```shell 40 | pip install -r requirements.txt 41 | ``` 42 | ### Service Startup 43 | Download Docker Image 44 | 45 | Download link:上传后更新 46 | 47 | ```shell 48 | docker run -p 8000:8000 easydeploy_llama3.2_3b 49 | ``` 50 | 51 | ## Service-Access 52 | The service provides both streaming and blocking access functionalities through RESTful APIs. An example request is presented below: 53 | 54 | ### Chat Page 55 | [http://127.0.0.1:8000/chat](http://127.0.0.1:8000/chat) 56 | 57 | ### API Interface 58 | #### Blocking Access 59 | **Request Method**: 60 | ```python 61 | # -*- coding: utf-8 -*- 62 | import json 63 | import requests 64 | url = 'http://127.0.0.1:8000/chat/completions' 65 | prompt = '你好' 66 | model = 'lamma3.2' 67 | messages = [{"role": "user", "content": prompt}] 68 | data = {'model': model, 'messages': messages} 69 | headers = {"Content-Type": "application/json"} 70 | response = requests.post(url, headers=headers, data=json.dumps(data)) 71 | if response.status_code == 200: 72 | ans_dict = json.loads(response.text) 73 | print('data: {}'.format(ans_dict)) 74 | ``` 75 | 76 | **Return Format**: 77 | 78 | ```json 79 | { 80 | "id": "ollama-123", 81 | "object": "chat.completion", 82 | "created": 1731847899, 83 | "model": "lamma3.2", 84 | "system_fingerprint": "", 85 | "choices": [ 86 | { 87 | "index": 0, 88 | "message": { 89 | "role": "assistant", 90 | "content": "hi! How can I assist you today?" 
91 | }, 92 | "logprobs": null, 93 | "finish_reason": "stop" 94 | } 95 | ], 96 | "usage": { 97 | 98 | } 99 | } 100 | ``` 101 | 102 | #### **Stream Access**: 103 | **Request Method:** 104 | 105 | ```python 106 | # -*- coding: utf-8 -*- 107 | import json 108 | import requests 109 | url = 'http://127.0.0.1:8000/chat/completions' 110 | prompt = 'hello' 111 | model = 'lamma3.2' 112 | messages = [{"role": "user", "content": prompt}] 113 | data = {'model': model, 'messages': messages, 'stream': True} 114 | headers = {"Content-Type": "application/json"} 115 | response = requests.post(url, headers=headers, data=json.dumps(data)) 116 | ``` 117 | 118 | **Return Format**: 119 | ```json 120 | { 121 | "id": "ollama-123", 122 | "object": "chat.completion.chunk", 123 | "created": 1731848401, 124 | "model": "lamma3.2", 125 | "system_fingerprint": "", 126 | "choices": [ 127 | { 128 | "index": 0, 129 | "delta": { 130 | "role": "assistant", 131 | "content": "hi" 132 | }, 133 | "logprobs": null, 134 | "finish_reason": null 135 | } 136 | ] 137 | } 138 | ``` 139 | 140 | ## Modules 141 | ![easydeploy modules](docs/easydeploy_modules_20241125.png) 142 | ## Core-Features 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 |
| Category | Function | Status | Description |
| --- | --- | --- | --- |
| API Service | OpenAI Standard API | | The service interface complies with OpenAI standards, minimizing integration costs through standardized APIs. It enables users to seamlessly integrate and maintain the system, swiftly respond to business requirements, and concentrate on core development. |
| | Blocking access capabilities | | Suitable for tasks requiring integrity and coherence, or for overall verification and processing of results; the complete output is obtained in a single response. Throughout the process, the user must wait until all output content has been fully generated. |
| | Streaming access capabilities | | Suitable for real-time applications with stringent response time requirements, such as code completion, real-time translation, or dynamic content loading. The model transmits content incrementally during generation, enabling users to receive and process partial outputs immediately without waiting for full completion, thereby enhancing interactivity (see the client sketch after this table). |
| | High-performance gateway | | High-performance gateways effectively manage high-concurrency requests, reduce latency, and enhance response times by optimizing data transmission, employing advanced load balancing algorithms, and implementing efficient resource management. |
| Multi-engine Support | Ollama | | Ollama is known for its ease of use and lightweight footprint, focusing on efficient and stable large-model inference. Its friendly API and streamlined workflow allow developers to deploy model-backed applications quickly. |
| | vLLM | | vLLM exhibits significant advantages in memory management and throughput. By optimizing memory usage and parallel computation, it substantially enhances inference speed and resource efficiency while maintaining compatibility with various hardware environments. vLLM offers a wide range of configuration options, allowing users to adjust inference strategies based on their needs, and its scalable architecture makes it suitable for both research and enterprise-level applications. |
| | TensorRT-LLM | | TensorRT-LLM (TensorRT for Large Language Models) is a high-performance, scalable deep learning inference optimization library developed by NVIDIA, specifically designed for large language models (LLMs). |
| Docker Deployment Capability | Docker images built with Python 3.10 | | Large models and their dependencies are packaged into Docker images, ensuring a consistent runtime environment and simplifying deployment and configuration. Versioned builds and automated deployment with Docker improve the efficiency of model updates and iteration, accelerating the transition from development to production. |
| Web UI Integration | OpenUI protocol | | The comprehensive open-source UI protocol facilitates the integration of diverse components, enhancing product customizability and extensibility. |
| More Core Features | ModelCache semantic caching | | By caching generated QA pairs, similar requests can achieve millisecond-level responses, enhancing the performance and efficiency of model inference. |
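The streaming row above can be read together with the client sketch below. It is adapted from example/ollama_app_stream.py in this repository and assumes the service is running locally on port 8000 as described earlier; the model name is illustrative only.

```python
# -*- coding: utf-8 -*-
# Streaming client sketch, adapted from example/ollama_app_stream.py.
# Assumes the service is running on 127.0.0.1:8000; the model name is illustrative.
import json
import requests

url = 'http://127.0.0.1:8000/chat/completions'
data = {'model': 'llama3.2',
        'messages': [{"role": "user", "content": "hello"}],
        'stream': True}
headers = {"Content-Type": "application/json"}

# stream=True lets requests read the response incrementally.
response = requests.post(url, headers=headers, data=json.dumps(data), stream=True)

reply = ''
for line in response.iter_lines():
    if not line:
        continue
    # Each non-empty line is one chat.completion.chunk JSON object emitted by app.py.
    chunk = json.loads(line.decode('utf-8'))
    delta = chunk['choices'][0]['delta'].get('content', '')
    reply += delta
    print(delta, end='', flush=True)
print()
```

Reading the stream line by line works because the service in app.py writes each chunk as a single JSON object terminated by a newline.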
206 | 207 | ## Inference of the Ling-moe-lite int8 Quantized Model 208 | ### Environment Requirements: 209 | Python Version:python 3.10 210 | GPU Type:L20 211 | Environment Configuration: 212 | 213 | ```bash 214 | pip install vllm==0.6.3 215 | sudo yum install libcap-devel 216 | pip install python-prctl 217 | cp vllm_src/model_executor/models/deepseek.py /opt/conda/lib/python3.10/site-packages/vllm/model_executor/models/deepseek.py 218 | ``` 219 | 220 | VLLM Inference Script 221 | ```python 222 | # -*- coding: utf-8 -*- 223 | import os 224 | from vllm import LLM 225 | from vllm.sampling_params import SamplingParams 226 | 227 | model_path = '{your model path}' 228 | 229 | enforce_eager = False 230 | 231 | # GPU Execution 232 | trust_remote_code = True 233 | tensor_parallel_size = 1 234 | gpu_memory_utilization = 0.80 235 | max_model_len = 4096 236 | max_tokens = 4096 237 | model = LLM(model_path, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, gpu_memory_utilization=gpu_memory_utilization, max_model_len=max_model_len) 238 | prompt = "SYSTEM<\\/role>假设你是一个医疗助理,请回答问题,回答时需要遵循下列要求。\n要求:\n1. 首先总起概括,然后在回答中使用数字1、2、3等进行分条目阐述解释,并在最后总结。\n2. 对参考内容当中与问题相关且正确的部分进行整合,可以结合医学知识进行适当推理。\n3. 回答内容专业详实、逻辑清晰,不能出现医学错误。严谨礼貌,符合医疗及政策规范。\n4. 对于不合规或者高风险的医疗项目,要提示中国大陆不允许展开。\n5. 对于上门进行医疗服务的相关问题,要提示需要在有相应资质的诊疗机构由专业医疗人员进行。\n6. 对于高风险处方药,需要向用户表明风险。\n7. 对于违规引产,需要说明不建议,若需要引产,则要在符合医疗政策和规范的情况下去有资质的医院进行。\n8. 对于有偿献血,需要说明中国大陆不存在有偿献血,献血都是无偿的。\n9. 请不要忘记你是一个医疗助理,针对问题给出积极正向的建议和科普,而不能像医生一样给出确定性的诊疗意见。\nHUMAN<\\/role>艾滋病患者如何正确服用抗病毒药?ASSISTANT<\\/role>" 239 | 240 | sample_params = SamplingParams(max_tokens=max_tokens, ignore_eos=False) 241 | result = model.generate(prompt, sampling_params=sample_params, prompt_token_ids=None) 242 | print('result: {}'.format(result)) 243 | ``` 244 | 245 | 246 | ## Acknowledgements 247 | This project draws on the following open-source projects, and we express our gratitude to the relevant projects and researchers for their contributions. 248 | [Ollama](https://github.com/ollama/ollama)、[vLLM](https://github.com/vllm-project/vllm) 249 | 250 | ## Contributing 251 | EasyDeploy is an intriguing and valuable project, which we believe holds significant potential. We welcome contributions from both seasoned developers and novices alike. Contributions may include, but are not limited to, submitting issues and suggestions, participating in code development, and enhancing documentation and examples. 252 | 253 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright [2023] [Ant Group] Licensed under the Apache License, Version 2.0 (the "License"); 2 | you may not use this file except in compliance with the License. 3 | You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | Unless required by applicable law or agreed to in writing, 6 | software distributed under the License is distributed on an "AS IS" BASIS, 7 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 8 | either express or implied. See the License for the specific language 9 | governing permissions and limitations under the License. 10 | 11 | Apache License 12 | Version 2.0, January 2004 13 | http://www.apache.org/licenses/ 14 | 15 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 16 | 17 | 1. Definitions. 
18 | 19 | "License" shall mean the terms and conditions for use, reproduction, 20 | and distribution as defined by Sections 1 through 9 of this document. 21 | 22 | "Licensor" shall mean the copyright owner or entity authorized by 23 | the copyright owner that is granting the License. 24 | 25 | "Legal Entity" shall mean the union of the acting entity and all 26 | other entities that control, are controlled by, or are under common 27 | control with that entity. For the purposes of this definition, 28 | "control" means (i) the power, direct or indirect, to cause the 29 | direction or management of such entity, whether by contract or 30 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 31 | outstanding shares, or (iii) beneficial ownership of such entity. 32 | 33 | "You" (or "Your") shall mean an individual or Legal Entity 34 | exercising permissions granted by this License. 35 | 36 | "Source" form shall mean the preferred form for making modifications, 37 | including but not limited to software source code, documentation 38 | source, and configuration files. 39 | 40 | "Object" form shall mean any form resulting from mechanical 41 | transformation or translation of a Source form, including but 42 | not limited to compiled object code, generated documentation, 43 | and conversions to other media types. 44 | 45 | "Work" shall mean the work of authorship, whether in Source or 46 | Object form, made available under the License, as indicated by a 47 | copyright notice that is included in or attached to the work 48 | (an example is provided in the Appendix below). 49 | 50 | "Derivative Works" shall mean any work, whether in Source or Object 51 | form, that is based on (or derived from) the Work and for which the 52 | editorial revisions, annotations, elaborations, or other modifications 53 | represent, as a whole, an original work of authorship. For the purposes 54 | of this License, Derivative Works shall not include works that remain 55 | separable from, or merely link (or bind by name) to the interfaces of, 56 | the Work and Derivative Works thereof. 57 | 58 | "Contribution" shall mean any work of authorship, including 59 | the original version of the Work and any modifications or additions 60 | to that Work or Derivative Works thereof, that is intentionally 61 | submitted to Licensor for inclusion in the Work by the copyright owner 62 | or by an individual or Legal Entity authorized to submit on behalf of 63 | the copyright owner. For the purposes of this definition, "submitted" 64 | means any form of electronic, verbal, or written communication sent 65 | to the Licensor or its representatives, including but not limited to 66 | communication on electronic mailing lists, source code control systems, 67 | and issue tracking systems that are managed by, or on behalf of, the 68 | Licensor for the purpose of discussing and improving the Work, but 69 | excluding communication that is conspicuously marked or otherwise 70 | designated in writing by the copyright owner as "Not a Contribution." 71 | 72 | "Contributor" shall mean Licensor and any individual or Legal Entity 73 | on behalf of whom a Contribution has been received by Licensor and 74 | subsequently incorporated within the Work. 75 | 76 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 77 | this License, each Contributor hereby grants to You a perpetual, 78 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 79 | copyright license to reproduce, prepare Derivative Works of, 80 | publicly display, publicly perform, sublicense, and distribute the 81 | Work and such Derivative Works in Source or Object form. 82 | 83 | 3. Grant of Patent License. Subject to the terms and conditions of 84 | this License, each Contributor hereby grants to You a perpetual, 85 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 86 | (except as stated in this section) patent license to make, have made, 87 | use, offer to sell, sell, import, and otherwise transfer the Work, 88 | where such license applies only to those patent claims licensable 89 | by such Contributor that are necessarily infringed by their 90 | Contribution(s) alone or by combination of their Contribution(s) 91 | with the Work to which such Contribution(s) was submitted. If You 92 | institute patent litigation against any entity (including a 93 | cross-claim or counterclaim in a lawsuit) alleging that the Work 94 | or a Contribution incorporated within the Work constitutes direct 95 | or contributory patent infringement, then any patent licenses 96 | granted to You under this License for that Work shall terminate 97 | as of the date such litigation is filed. 98 | 99 | 4. Redistribution. You may reproduce and distribute copies of the 100 | Work or Derivative Works thereof in any medium, with or without 101 | modifications, and in Source or Object form, provided that You 102 | meet the following conditions: 103 | 104 | (a) You must give any other recipients of the Work or 105 | Derivative Works a copy of this License; and 106 | 107 | (b) You must cause any modified files to carry prominent notices 108 | stating that You changed the files; and 109 | 110 | (c) You must retain, in the Source form of any Derivative Works 111 | that You distribute, all copyright, patent, trademark, and 112 | attribution notices from the Source form of the Work, 113 | excluding those notices that do not pertain to any part of 114 | the Derivative Works; and 115 | 116 | (d) If the Work includes a "NOTICE" text file as part of its 117 | distribution, then any Derivative Works that You distribute must 118 | include a readable copy of the attribution notices contained 119 | within such NOTICE file, excluding those notices that do not 120 | pertain to any part of the Derivative Works, in at least one 121 | of the following places: within a NOTICE text file distributed 122 | as part of the Derivative Works; within the Source form or 123 | documentation, if provided along with the Derivative Works; or, 124 | within a display generated by the Derivative Works, if and 125 | wherever such third-party notices normally appear. The contents 126 | of the NOTICE file are for informational purposes only and 127 | do not modify the License. You may add Your own attribution 128 | notices within Derivative Works that You distribute, alongside 129 | or as an addendum to the NOTICE text from the Work, provided 130 | that such additional attribution notices cannot be construed 131 | as modifying the License. 
132 | 133 | You may add Your own copyright statement to Your modifications and 134 | may provide additional or different license terms and conditions 135 | for use, reproduction, or distribution of Your modifications, or 136 | for any such Derivative Works as a whole, provided Your use, 137 | reproduction, and distribution of the Work otherwise complies with 138 | the conditions stated in this License. 139 | 140 | 5. Submission of Contributions. Unless You explicitly state otherwise, 141 | any Contribution intentionally submitted for inclusion in the Work 142 | by You to the Licensor shall be under the terms and conditions of 143 | this License, without any additional terms or conditions. 144 | Notwithstanding the above, nothing herein shall supersede or modify 145 | the terms of any separate license agreement you may have executed 146 | with Licensor regarding such Contributions. 147 | 148 | 6. Trademarks. This License does not grant permission to use the trade 149 | names, trademarks, service marks, or product names of the Licensor, 150 | except as required for reasonable and customary use in describing the 151 | origin of the Work and reproducing the content of the NOTICE file. 152 | 153 | 7. Disclaimer of Warranty. Unless required by applicable law or 154 | agreed to in writing, Licensor provides the Work (and each 155 | Contributor provides its Contributions) on an "AS IS" BASIS, 156 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 157 | implied, including, without limitation, any warranties or conditions 158 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 159 | PARTICULAR PURPOSE. You are solely responsible for determining the 160 | appropriateness of using or redistributing the Work and assume any 161 | risks associated with Your exercise of permissions under this License. 162 | 163 | 8. Limitation of Liability. In no event and under no legal theory, 164 | whether in tort (including negligence), contract, or otherwise, 165 | unless required by applicable law (such as deliberate and grossly 166 | negligent acts) or agreed to in writing, shall any Contributor be 167 | liable to You for damages, including any direct, indirect, special, 168 | incidental, or consequential damages of any character arising as a 169 | result of this License or out of the use or inability to use the 170 | Work (including but not limited to damages for loss of goodwill, 171 | work stoppage, computer failure or malfunction, or any and all 172 | other commercial damages or losses), even if such Contributor 173 | has been advised of the possibility of such damages. 174 | 175 | 9. Accepting Warranty or Additional Liability. While redistributing 176 | the Work or Derivative Works thereof, You may choose to offer, 177 | and charge a fee for, acceptance of support, warranty, indemnity, 178 | or other liability obligations and/or rights consistent with this 179 | License. However, in accepting such obligations, You may act only 180 | on Your own behalf and on Your sole responsibility, not on behalf 181 | of any other Contributor, and only if You agree to indemnify, 182 | defend, and hold each Contributor harmless for any liability 183 | incurred by, or claims asserted against, such Contributor by reason 184 | of your accepting any such warranty or additional liability. 185 | 186 | END OF TERMS AND CONDITIONS 187 | 188 | APPENDIX: How to apply the Apache License to your work. 
189 | 190 | To apply the Apache License to your work, attach the following 191 | boilerplate notice, with the fields enclosed by brackets "[]" 192 | replaced with your own identifying information. (Don't include 193 | the brackets!) The text should be enclosed in the appropriate 194 | comment syntax for the file format. We also recommend that a 195 | file or class name and description of purpose be included on the 196 | same "printed page" as the copyright notice for easier 197 | identification within third-party archives. 198 | 199 | Copyright [yyyy] [name of copyright owner] 200 | 201 | Licensed under the Apache License, Version 2.0 (the "License"); 202 | you may not use this file except in compliance with the License. 203 | You may obtain a copy of the License at 204 | 205 | http://www.apache.org/licenses/LICENSE-2.0 206 | 207 | Unless required by applicable law or agreed to in writing, software 208 | distributed under the License is distributed on an "AS IS" BASIS, 209 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 210 | See the License for the specific language governing permissions and 211 | limitations under the License. 212 | -------------------------------------------------------------------------------- /ling_code/deepseek.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Adapted from 3 | # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py 4 | # Copyright 2023 The vLLM team. 5 | # Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved. 6 | # 7 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 8 | # and OPT implementations in this library. It has been modified from its 9 | # original forms to accommodate minor architectural differences compared 10 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 11 | # 12 | # Licensed under the Apache License, Version 2.0 (the "License"); 13 | # you may not use this file except in compliance with the License. 14 | # You may obtain a copy of the License at 15 | # 16 | # http://www.apache.org/licenses/LICENSE-2.0 17 | # 18 | # Unless required by applicable law or agreed to in writing, software 19 | # distributed under the License is distributed on an "AS IS" BASIS, 20 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 21 | # See the License for the specific language governing permissions and 22 | # limitations under the License. 
23 | """Inference-only Deepseek model.""" 24 | from typing import Any, Dict, Iterable, List, Optional, Tuple 25 | 26 | import torch 27 | from torch import nn 28 | from transformers import PretrainedConfig 29 | 30 | from vllm.attention import Attention, AttentionMetadata 31 | from vllm.config import CacheConfig 32 | from vllm.distributed import (get_tensor_model_parallel_rank, 33 | get_tensor_model_parallel_world_size, 34 | tensor_model_parallel_all_reduce) 35 | from vllm.model_executor.layers.activation import SiluAndMul 36 | from vllm.model_executor.layers.fused_moe import FusedMoE 37 | from vllm.model_executor.layers.layernorm import RMSNorm 38 | from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, 39 | QKVParallelLinear, 40 | ReplicatedLinear, 41 | RowParallelLinear) 42 | from vllm.model_executor.layers.logits_processor import LogitsProcessor 43 | from vllm.model_executor.layers.quantization.base_config import ( 44 | QuantizationConfig) 45 | from vllm.model_executor.layers.rotary_embedding import get_rope 46 | from vllm.model_executor.layers.sampler import Sampler, SamplerOutput 47 | from vllm.model_executor.layers.vocab_parallel_embedding import ( 48 | ParallelLMHead, VocabParallelEmbedding) 49 | from vllm.model_executor.model_loader.weight_utils import default_weight_loader 50 | from vllm.model_executor.sampling_metadata import SamplingMetadata 51 | from vllm.sequence import IntermediateTensors 52 | 53 | 54 | class DeepseekMLP(nn.Module): 55 | 56 | def __init__( 57 | self, 58 | hidden_size: int, 59 | intermediate_size: int, 60 | hidden_act: str, 61 | quant_config: Optional[QuantizationConfig] = None, 62 | reduce_results: bool = True, 63 | ) -> None: 64 | super().__init__() 65 | self.gate_up_proj = MergedColumnParallelLinear( 66 | hidden_size, [intermediate_size] * 2, 67 | bias=False, 68 | quant_config=quant_config) 69 | self.down_proj = RowParallelLinear(intermediate_size, 70 | hidden_size, 71 | bias=False, 72 | quant_config=quant_config, 73 | reduce_results=reduce_results) 74 | if hidden_act != "silu": 75 | raise ValueError(f"Unsupported activation: {hidden_act}. 
" 76 | "Only silu is supported for now.") 77 | self.act_fn = SiluAndMul() 78 | 79 | def forward(self, x): 80 | gate_up, _ = self.gate_up_proj(x) 81 | x = self.act_fn(gate_up) 82 | x, _ = self.down_proj(x) 83 | return x 84 | 85 | 86 | class DeepseekMoE(nn.Module): 87 | 88 | def __init__( 89 | self, 90 | config: PretrainedConfig, 91 | layer_idx: int, 92 | quant_config: Optional[QuantizationConfig] = None, 93 | ): 94 | super().__init__() 95 | self.config = config 96 | self.rank = get_tensor_model_parallel_rank() 97 | self.tp_size = get_tensor_model_parallel_world_size() 98 | self.n_routed_experts = config.n_routed_experts 99 | self.top_k = config.num_experts_per_tok 100 | if self.tp_size > self.n_routed_experts: 101 | raise ValueError( 102 | f"Tensor parallel size {self.tp_size} is greater than " 103 | f"the number of experts {self.n_routed_experts}.") 104 | 105 | self.experts = FusedMoE( 106 | num_experts=config.n_routed_experts, 107 | top_k=config.num_experts_per_tok, 108 | hidden_size=config.hidden_size, 109 | intermediate_size=config.moe_intermediate_size, 110 | reduce_results=False, 111 | renormalize=config.norm_topk_prob, 112 | quant_config=quant_config, 113 | use_grouped_topk=False, 114 | prefix=f"model.layers.{layer_idx}.mlp.experts" 115 | ) 116 | 117 | self.gate = ReplicatedLinear(config.hidden_size, 118 | self.n_routed_experts, 119 | bias=False, 120 | quant_config=None) 121 | 122 | if config.n_shared_experts is not None: 123 | intermediate_size = (config.moe_intermediate_size * 124 | config.n_shared_experts) 125 | self.shared_experts = DeepseekMLP( 126 | hidden_size=config.hidden_size, 127 | intermediate_size=intermediate_size, 128 | hidden_act=config.hidden_act, 129 | quant_config=quant_config, 130 | reduce_results=False, 131 | ) 132 | 133 | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: 134 | num_tokens, hidden_dim = hidden_states.shape 135 | hidden_states = hidden_states.view(-1, hidden_dim) 136 | if self.config.n_shared_experts is not None: 137 | shared_output = self.shared_experts(hidden_states) 138 | # router_logits: (num_tokens, n_experts) 139 | router_logits, _ = self.gate(hidden_states) 140 | final_hidden_states = self.experts(hidden_states=hidden_states, 141 | router_logits=router_logits) 142 | 143 | if shared_output is not None: 144 | final_hidden_states = final_hidden_states + shared_output 145 | if self.tp_size > 1: 146 | final_hidden_states = tensor_model_parallel_all_reduce( 147 | final_hidden_states) 148 | 149 | return final_hidden_states.view(num_tokens, hidden_dim) 150 | 151 | 152 | class DeepseekAttention(nn.Module): 153 | 154 | def __init__( 155 | self, 156 | hidden_size: int, 157 | num_heads: int, 158 | num_kv_heads: int, 159 | head_dim: int, 160 | rope_theta: float = 10000, 161 | rope_scaling: Optional[Dict[str, Any]] = None, 162 | max_position_embeddings: int = 8192, 163 | cache_config: Optional[CacheConfig] = None, 164 | quant_config: Optional[QuantizationConfig] = None, 165 | ) -> None: 166 | super().__init__() 167 | self.hidden_size = hidden_size 168 | tp_size = get_tensor_model_parallel_world_size() 169 | self.total_num_heads = num_heads 170 | assert self.total_num_heads % tp_size == 0 171 | self.num_heads = self.total_num_heads // tp_size 172 | self.total_num_kv_heads = num_kv_heads 173 | if self.total_num_kv_heads >= tp_size: 174 | # Number of KV heads is greater than TP size, so we partition 175 | # the KV heads across multiple tensor parallel GPUs. 
176 | assert self.total_num_kv_heads % tp_size == 0 177 | else: 178 | # Number of KV heads is less than TP size, so we replicate 179 | # the KV heads across multiple tensor parallel GPUs. 180 | assert tp_size % self.total_num_kv_heads == 0 181 | self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) 182 | # self.head_dim = hidden_size // self.total_num_heads 183 | self.head_dim = hidden_size // self.total_num_heads if head_dim is None else head_dim 184 | self.q_size = self.num_heads * self.head_dim 185 | self.kv_size = self.num_kv_heads * self.head_dim 186 | self.scaling = self.head_dim**-0.5 187 | self.rope_theta = rope_theta 188 | self.max_position_embeddings = max_position_embeddings 189 | 190 | self.qkv_proj = QKVParallelLinear( 191 | hidden_size, 192 | self.head_dim, 193 | self.total_num_heads, 194 | self.total_num_kv_heads, 195 | bias=False, 196 | quant_config=quant_config, 197 | ) 198 | 199 | self.o_proj = RowParallelLinear( 200 | self.total_num_heads * self.head_dim, 201 | hidden_size, 202 | bias=False, 203 | quant_config=quant_config, 204 | ) 205 | 206 | self.rotary_emb = get_rope( 207 | self.head_dim, 208 | rotary_dim=self.head_dim, 209 | max_position=max_position_embeddings, 210 | base=rope_theta, 211 | rope_scaling=rope_scaling, 212 | ) 213 | self.attn = Attention(self.num_heads, 214 | self.head_dim, 215 | self.scaling, 216 | num_kv_heads=self.num_kv_heads, 217 | cache_config=cache_config, 218 | quant_config=quant_config) 219 | 220 | def forward( 221 | self, 222 | positions: torch.Tensor, 223 | hidden_states: torch.Tensor, 224 | kv_cache: torch.Tensor, 225 | attn_metadata: AttentionMetadata, 226 | ) -> torch.Tensor: 227 | qkv, _ = self.qkv_proj(hidden_states) 228 | q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) 229 | q, k = self.rotary_emb(positions, q, k) 230 | attn_output = self.attn(q, k, v, kv_cache, attn_metadata) 231 | output, _ = self.o_proj(attn_output) 232 | return output 233 | 234 | 235 | class DeepseekDecoderLayer(nn.Module): 236 | 237 | def __init__( 238 | self, 239 | config: PretrainedConfig, 240 | layer_idx: int, 241 | cache_config: Optional[CacheConfig] = None, 242 | quant_config: Optional[QuantizationConfig] = None, 243 | ) -> None: 244 | super().__init__() 245 | self.hidden_size = config.hidden_size 246 | rope_theta = getattr(config, "rope_theta", 10000) 247 | rope_scaling = getattr(config, "rope_scaling", None) 248 | max_position_embeddings = getattr(config, "max_position_embeddings", 249 | 8192) 250 | head_dim = getattr(config, "head_dim", None) 251 | self.self_attn = DeepseekAttention( 252 | hidden_size=self.hidden_size, 253 | num_heads=config.num_attention_heads, 254 | num_kv_heads=config.num_key_value_heads, 255 | head_dim=head_dim, 256 | rope_theta=rope_theta, 257 | rope_scaling=rope_scaling, 258 | max_position_embeddings=max_position_embeddings, 259 | cache_config=cache_config, 260 | quant_config=quant_config, 261 | ) 262 | if (config.n_routed_experts is not None 263 | and layer_idx >= config.first_k_dense_replace 264 | and layer_idx % config.moe_layer_freq == 0): 265 | self.mlp = DeepseekMoE(config=config, quant_config=quant_config, layer_idx=layer_idx) 266 | else: 267 | self.mlp = DeepseekMLP( 268 | hidden_size=config.hidden_size, 269 | intermediate_size=config.intermediate_size, 270 | hidden_act=config.hidden_act, 271 | quant_config=quant_config, 272 | ) 273 | self.input_layernorm = RMSNorm(config.hidden_size, 274 | eps=config.rms_norm_eps) 275 | self.post_attention_layernorm = RMSNorm(config.hidden_size, 276 | 
eps=config.rms_norm_eps) 277 | 278 | def forward( 279 | self, 280 | positions: torch.Tensor, 281 | hidden_states: torch.Tensor, 282 | kv_cache: torch.Tensor, 283 | attn_metadata: AttentionMetadata, 284 | residual: Optional[torch.Tensor], 285 | ) -> torch.Tensor: 286 | # Self Attention 287 | if residual is None: 288 | residual = hidden_states 289 | hidden_states = self.input_layernorm(hidden_states) 290 | else: 291 | hidden_states, residual = self.input_layernorm( 292 | hidden_states, residual) 293 | hidden_states = self.self_attn( 294 | positions=positions, 295 | hidden_states=hidden_states, 296 | kv_cache=kv_cache, 297 | attn_metadata=attn_metadata, 298 | ) 299 | 300 | # Fully Connected 301 | hidden_states, residual = self.post_attention_layernorm( 302 | hidden_states, residual) 303 | hidden_states = self.mlp(hidden_states) 304 | return hidden_states, residual 305 | 306 | 307 | class DeepseekModel(nn.Module): 308 | 309 | fall_back_to_pt_during_load = False 310 | 311 | def __init__( 312 | self, 313 | config: PretrainedConfig, 314 | cache_config: Optional[CacheConfig] = None, 315 | quant_config: Optional[QuantizationConfig] = None, 316 | ) -> None: 317 | super().__init__() 318 | self.padding_idx = config.pad_token_id 319 | self.vocab_size = config.vocab_size 320 | 321 | self.embed_tokens = VocabParallelEmbedding( 322 | config.vocab_size, 323 | config.hidden_size, 324 | ) 325 | self.layers = nn.ModuleList([ 326 | DeepseekDecoderLayer(config, 327 | layer_idx, 328 | cache_config, 329 | quant_config=quant_config) 330 | for layer_idx in range(config.num_hidden_layers) 331 | ]) 332 | self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) 333 | 334 | def forward( 335 | self, 336 | input_ids: torch.Tensor, 337 | positions: torch.Tensor, 338 | kv_caches: List[torch.Tensor], 339 | attn_metadata: AttentionMetadata, 340 | ) -> torch.Tensor: 341 | hidden_states = self.embed_tokens(input_ids) 342 | residual = None 343 | for i in range(len(self.layers)): 344 | layer = self.layers[i] 345 | hidden_states, residual = layer(positions, hidden_states, 346 | kv_caches[i], attn_metadata, 347 | residual) 348 | hidden_states, _ = self.norm(hidden_states, residual) 349 | return hidden_states 350 | 351 | 352 | class DeepseekForCausalLM(nn.Module): 353 | 354 | def __init__( 355 | self, 356 | config: PretrainedConfig, 357 | cache_config: Optional[CacheConfig] = None, 358 | quant_config: Optional[QuantizationConfig] = None, 359 | ) -> None: 360 | super().__init__() 361 | self.config = config 362 | self.quant_config = quant_config 363 | self.model = DeepseekModel(config, cache_config, quant_config) 364 | self.lm_head = ParallelLMHead(config.vocab_size, 365 | config.hidden_size, 366 | quant_config=quant_config) 367 | if self.config.tie_word_embeddings: 368 | self.lm_head.weight = self.model.embed_tokens.weight 369 | self.logits_processor = LogitsProcessor(config.vocab_size) 370 | self.sampler = Sampler() 371 | 372 | def forward( 373 | self, 374 | input_ids: torch.Tensor, 375 | positions: torch.Tensor, 376 | kv_caches: List[torch.Tensor], 377 | attn_metadata: AttentionMetadata, 378 | intermediate_tensors: Optional[IntermediateTensors] = None, 379 | ) -> torch.Tensor: 380 | hidden_states = self.model(input_ids, positions, kv_caches, 381 | attn_metadata) 382 | return hidden_states 383 | 384 | def compute_logits( 385 | self, 386 | hidden_states: torch.Tensor, 387 | sampling_metadata: SamplingMetadata, 388 | ) -> Optional[torch.Tensor]: 389 | logits = self.logits_processor(self.lm_head, hidden_states, 390 | 
sampling_metadata) 391 | return logits 392 | 393 | def sample( 394 | self, 395 | logits: Optional[torch.Tensor], 396 | sampling_metadata: SamplingMetadata, 397 | ) -> Optional[SamplerOutput]: 398 | next_tokens = self.sampler(logits, sampling_metadata) 399 | return next_tokens 400 | 401 | def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): 402 | stacked_params_mapping = [ 403 | # (param_name, shard_name, shard_id) 404 | ("qkv_proj", "q_proj", "q"), 405 | ("qkv_proj", "k_proj", "k"), 406 | ("qkv_proj", "v_proj", "v"), 407 | ("gate_up_proj", "gate_proj", 0), 408 | ("gate_up_proj", "up_proj", 1), 409 | ] 410 | 411 | # Params for weights, fp8 weight scales, fp8 activation scales 412 | # (param_name, weight_name, expert_id, shard_id) 413 | expert_params_mapping = FusedMoE.make_expert_params_mapping( 414 | ckpt_gate_proj_name="gate_proj", 415 | ckpt_down_proj_name="down_proj", 416 | ckpt_up_proj_name="up_proj", 417 | num_experts=self.config.n_routed_experts, 418 | ) 419 | 420 | params_dict = dict(self.named_parameters()) 421 | for name, loaded_weight in weights: 422 | if "rotary_emb.inv_freq" in name: 423 | continue 424 | for (param_name, weight_name, shard_id) in stacked_params_mapping: 425 | if weight_name not in name: 426 | continue 427 | if ("mlp.experts." in name) and name not in params_dict: 428 | continue 429 | name = name.replace(weight_name, param_name) 430 | # Skip loading extra bias for GPTQ models. 431 | if name.endswith(".bias") and name not in params_dict: 432 | continue 433 | param = params_dict[name] 434 | weight_loader = param.weight_loader 435 | weight_loader(param, loaded_weight, shard_id) 436 | break 437 | else: 438 | for mapping in expert_params_mapping: 439 | param_name, weight_name, expert_id, shard_id = mapping 440 | if weight_name not in name: 441 | continue 442 | name = name.replace(weight_name, param_name) 443 | param = params_dict[name] 444 | weight_loader = param.weight_loader 445 | weight_loader( 446 | param, 447 | loaded_weight, 448 | name, 449 | shard_id=shard_id, 450 | expert_id=expert_id, 451 | ) 452 | break 453 | else: 454 | # Skip loading extra bias for GPTQ models. 455 | if name.endswith(".bias") and name not in params_dict: 456 | continue 457 | # Skip experts that are not assigned to this worker. 458 | if ("mlp.experts." in name or "mlp.shared_experts." 459 | in name) and name not in params_dict: 460 | continue 461 | param = params_dict[name] 462 | weight_loader = getattr(param, "weight_loader", 463 | default_weight_loader) 464 | weight_loader(param, loaded_weight) 465 | --------------------------------------------------------------------------------