├── static ├── .nojekyll └── img │ ├── logo.png │ └── favicon.png ├── docs ├── GLM │ ├── _category_.json │ ├── GLM-4.5.md │ ├── GLM-4.5V.md │ ├── Glyph.md │ └── GLM-4.6V.md ├── Ernie │ ├── _category_.json │ ├── Ernie4.5-VL.md │ └── Ernie4.5.md ├── Llama │ ├── _category_.json │ ├── Llama3.1.md │ ├── Llama3.3-70B.md │ └── Llama4-Scout.md ├── Qwen │ ├── _category_.json │ ├── Qwen2.5-VL.md │ └── Qwen3-Coder-480B-A35B.md ├── DeepSeek │ ├── _category_.json │ ├── DeepSeek-OCR.md │ ├── DeepSeek-V3_1.md │ ├── DeepSeek-V3.md │ └── DeepSeek-V3_2.md ├── Jina │ ├── _category_.json │ └── Jina-reranker-m0.md ├── MiniMax │ ├── _category_.json │ └── MiniMax-M2.md ├── NVIDIA │ ├── _category_.json │ └── Nemotron3-Nano.md ├── OpenAI │ └── _category_.json ├── InternLM │ ├── _category_.json │ └── Intern-S1.md ├── InternVL │ ├── _category_.json │ └── InternVL3_5.md ├── Mistral │ ├── _category_.json │ ├── Mistral-3.md │ └── Devstral-2.md ├── Moonshotai │ ├── _category_.json │ ├── Kimi-Linear.md │ └── Kimi-K2.md └── intro.md ├── src ├── pages │ └── index.js ├── components │ ├── HomepageFeatures │ │ ├── styles.module.css │ │ └── index.js │ ├── Llama31ConfigGenerator │ │ └── index.js │ ├── Devstral2ConfigGenerator │ │ └── index.js │ ├── ConfigGenerator │ │ ├── styles.module.css │ │ ├── index.js │ │ ├── QUICKSTART.md │ │ └── README.md │ ├── Llama4ScoutConfigGenerator │ │ └── index.js │ ├── KimiK2ConfigGenerator │ │ └── index.js │ ├── InternS1ConfigGenerator │ │ └── index.js │ ├── NemotronConfigGenerator │ │ └── index.js │ ├── Qwen3NextConfigGenerator │ │ └── index.js │ ├── GLM46VConfigGenerator │ │ └── index.js │ ├── DeepSeekR1ConfigGenerator │ │ └── index.js │ ├── GLM46ConfigGenerator │ │ └── index.js │ ├── DeepSeekConfigGenerator │ │ └── index.js │ ├── GPTOSSConfigGenerator │ │ └── index.js │ ├── Qwen3VLConfigGenerator │ │ └── index.js │ └── Qwen3ConfigGenerator │ │ └── index.js └── css │ └── custom.css ├── .gitignore ├── sidebars.js ├── .github └── workflows │ └── deploy.yml ├── package.json ├── README.md ├── docusaurus.config.js └── LICENSE /static/.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/GLM/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "GLM", 3 | "position": 5 4 | } 5 | -------------------------------------------------------------------------------- /docs/Ernie/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Ernie", 3 | "position": 9 4 | } 5 | -------------------------------------------------------------------------------- /docs/Llama/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Llama", 3 | "position": 4 4 | } 5 | -------------------------------------------------------------------------------- /docs/Qwen/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Qwen", 3 | "position": 2 4 | } 5 | -------------------------------------------------------------------------------- /docs/DeepSeek/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "DeepSeek", 3 | "position": 3 4 | } 5 | -------------------------------------------------------------------------------- /docs/Jina/_category_.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "label": "Jina AI", 3 | "position": 12 4 | } 5 | -------------------------------------------------------------------------------- /docs/MiniMax/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "MiniMax", 3 | "position": 8 4 | } 5 | -------------------------------------------------------------------------------- /docs/NVIDIA/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "NVIDIA", 3 | "position": 9 4 | } 5 | -------------------------------------------------------------------------------- /docs/OpenAI/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "OpenAI", 3 | "position": 6 4 | } 5 | -------------------------------------------------------------------------------- /docs/InternLM/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "InternLM", 3 | "position": 11 4 | } 5 | -------------------------------------------------------------------------------- /docs/InternVL/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "InternVL", 3 | "position": 10 4 | } 5 | -------------------------------------------------------------------------------- /docs/Mistral/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Mistral", 3 | "position": 13 4 | } 5 | 6 | -------------------------------------------------------------------------------- /docs/Moonshotai/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Moonshotai", 3 | "position": 7 4 | } 5 | -------------------------------------------------------------------------------- /static/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgl-project/sgl-cookbook/HEAD/static/img/logo.png -------------------------------------------------------------------------------- /static/img/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgl-project/sgl-cookbook/HEAD/static/img/favicon.png -------------------------------------------------------------------------------- /src/pages/index.js: -------------------------------------------------------------------------------- 1 | import {Redirect} from '@docusaurus/router'; 2 | 3 | export default function Home() { 4 | return ; 5 | } 6 | -------------------------------------------------------------------------------- /src/components/HomepageFeatures/styles.module.css: -------------------------------------------------------------------------------- 1 | .features { 2 | display: flex; 3 | align-items: center; 4 | padding: 2rem 0; 5 | width: 100%; 6 | } 7 | 8 | .featureSvg { 9 | height: 200px; 10 | width: 200px; 11 | } 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | /node_modules 3 | 4 | # Production 5 | /build 6 | 7 | # Generated files 8 | .docusaurus 9 | .cache-loader 10 | 11 | # Misc 12 | .DS_Store 13 | .env.local 14 | .env.development.local 15 | .env.test.local 16 | 
.env.production.local 17 | 18 | npm-debug.log* 19 | yarn-debug.log* 20 | yarn-error.log* 21 | -------------------------------------------------------------------------------- /docs/GLM/GLM-4.5.md: -------------------------------------------------------------------------------- 1 | # GLM-4.5 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **GLM-4.5** with SGLang, please help us complete this documentation. 8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-glm-4-5-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [GLM-4.6](/docs/GLM/GLM-4.6.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 | -------------------------------------------------------------------------------- /docs/GLM/GLM-4.5V.md: -------------------------------------------------------------------------------- 1 | # GLM-4.5V 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **GLM-4.5V** with SGLang, please help us complete this documentation. 8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-glm-4-5v-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [GLM-4.6V](/docs/GLM/GLM-4.6V.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 | -------------------------------------------------------------------------------- /docs/GLM/Glyph.md: -------------------------------------------------------------------------------- 1 | # GLM Glyph 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **GLM Glyph** with SGLang, please help us complete this documentation. 8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-glm-glyph-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [GLM-4.6](/docs/GLM/GLM-4.6.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 | -------------------------------------------------------------------------------- /docs/InternLM/Intern-S1.md: -------------------------------------------------------------------------------- 1 | # Intern-S1 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **Intern-S1** with SGLang, please help us complete this documentation. 
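In the meantime, a minimal launch sketch derived from the Intern-S1 config generator elsewhere in this cookbook is shown below; the single-node, 8-GPU tensor-parallel setup is an assumption and a starting point, not a tuned recommendation.

```shell
# Minimal sketch (assumed single 8-GPU node); mirrors the cookbook's Intern-S1 generator defaults
python -m sglang.launch_server \
  --model internlm/Intern-S1 \
  --tp 8 \
  --trust-remote-code
```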
8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-intern-s1-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [GLM-4.6V](/docs/GLM/GLM-4.6V.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 |
-------------------------------------------------------------------------------- /docs/Qwen/Qwen2.5-VL.md: -------------------------------------------------------------------------------- 1 | # Qwen2.5-VL 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **Qwen2.5-VL** with SGLang, please help us complete this documentation. 8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-qwen2-5-vl-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [Qwen3-VL](/docs/Qwen/Qwen3-VL.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 |
-------------------------------------------------------------------------------- /docs/Ernie/Ernie4.5-VL.md: -------------------------------------------------------------------------------- 1 | # Ernie4.5-VL 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **Ernie4.5-VL** with SGLang, please help us complete this documentation. 8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-ernie4-5-vl-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [GLM-4.6V](/docs/GLM/GLM-4.6V.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 |
-------------------------------------------------------------------------------- /docs/Llama/Llama3.1.md: -------------------------------------------------------------------------------- 1 | # Llama3.1 Usage Guide 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **Llama3.1** with SGLang, please help us complete this documentation. 8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-llama3-1-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [Qwen3](/docs/Qwen/Qwen3.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 |
-------------------------------------------------------------------------------- /docs/Llama/Llama3.3-70B.md: -------------------------------------------------------------------------------- 1 | # Llama3.3-70B 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **Llama3.3-70B** with SGLang, please help us complete this documentation.
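In the meantime, a minimal launch sketch is shown below. It is patterned after the Llama 3.1 70B generator elsewhere in this cookbook; the model path, GPU count, and tensor-parallel size are assumptions for a 4x 80GB-GPU node, not a validated configuration.

```shell
# Minimal sketch (assumed 4x 80GB GPUs); patterned after the cookbook's Llama 3.1 70B generator
python3 -m sglang.launch_server \
  --model meta-llama/Llama-3.3-70B-Instruct \
  --tp 4 \
  --host 0.0.0.0 \
  --port 8000
```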
8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-llama3-3-70b-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [Qwen3](/docs/Qwen/Qwen3.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 | -------------------------------------------------------------------------------- /docs/Ernie/Ernie4.5.md: -------------------------------------------------------------------------------- 1 | # Ernie4.5 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **Ernie4.5** with SGLang, please help us complete this documentation. 8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-ernie4-5-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [DeepSeek-V3.2](/docs/DeepSeek/DeepSeek-V3_2.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 | -------------------------------------------------------------------------------- /docs/InternVL/InternVL3_5.md: -------------------------------------------------------------------------------- 1 | # InternVL3.5 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **InternVL3.5** with SGLang, please help us complete this documentation. 8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-internvl3-5-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [GLM-4.6V](/docs/GLM/GLM-4.6V.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 | -------------------------------------------------------------------------------- /docs/MiniMax/MiniMax-M2.md: -------------------------------------------------------------------------------- 1 | # MiniMax-M2 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **MiniMax-M2** with SGLang, please help us complete this documentation. 8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-minimax-m2-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [Kimi-K2](/docs/Moonshotai/Kimi-K2.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 | -------------------------------------------------------------------------------- /docs/Llama/Llama4-Scout.md: -------------------------------------------------------------------------------- 1 | # Llama4-Scout Usage Guide 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **Llama4-Scout** with SGLang, please help us complete this documentation. 
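In the meantime, the Llama4-Scout config generator included in this cookbook emits a basic launch command along the lines of the sketch below; the 8-GPU tensor-parallel setup, host, and port are assumptions to adapt to your environment.

```shell
# Minimal sketch: basic 8-GPU launch as produced by the cookbook's Llama4-Scout generator (BF16, parsers disabled)
python -m sglang.launch_server \
  --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct \
  --tp 8 \
  --enable-multimodal \
  --context-length 65536 \
  --dtype bfloat16 \
  --trust-remote-code \
  --host 0.0.0.0 \
  --port 8000
```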
8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-llama4-scout-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [Qwen3](/docs/Qwen/Qwen3.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 | -------------------------------------------------------------------------------- /docs/Moonshotai/Kimi-Linear.md: -------------------------------------------------------------------------------- 1 | # Kimi-Linear 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **Kimi-Linear** with SGLang, please help us complete this documentation. 8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-kimi-linear-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [Kimi-K2](/docs/Moonshotai/Kimi-K2.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 | -------------------------------------------------------------------------------- /docs/DeepSeek/DeepSeek-OCR.md: -------------------------------------------------------------------------------- 1 | # DeepSeek-OCR 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **DeepSeek-OCR** with SGLang, please help us complete this documentation. 8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-deepseek-ocr-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [DeepSeek-V3.2](/docs/DeepSeek/DeepSeek-V3_2.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 | -------------------------------------------------------------------------------- /docs/DeepSeek/DeepSeek-V3_1.md: -------------------------------------------------------------------------------- 1 | # DeepSeek-V3.1 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **DeepSeek-V3.1** with SGLang, please help us complete this documentation. 8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-deepseek-v3-1-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [DeepSeek-V3.2](/docs/DeepSeek/DeepSeek-V3_2.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 | -------------------------------------------------------------------------------- /docs/Jina/Jina-reranker-m0.md: -------------------------------------------------------------------------------- 1 | # Jina-reranker-m0 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **Jina-reranker-m0** with SGLang, please help us complete this documentation. 
8 | 9 | ## 🚀 How to Contribute 10 | 11 | ```shell 12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 13 | cd sglang-cookbook 14 | git checkout -b add-jina-reranker-m0-guide 15 | # Edit this file and submit a PR 16 | ``` 17 | 18 | ## 📚 Reference 19 | 20 | - [DeepSeek-V3.2](/docs/DeepSeek/DeepSeek-V3_2.md) 21 | 22 | --- 23 | 24 | **Let's build this together!** 🌟 25 | -------------------------------------------------------------------------------- /docs/DeepSeek/DeepSeek-V3.md: -------------------------------------------------------------------------------- 1 | # DeepSeek-V3 2 | 3 | ## 📝 Community Contribution Welcome 4 | 5 | This guide is currently under development. We welcome community contributions! 6 | 7 | If you have experience deploying **DeepSeek-V3** with SGLang, please help us complete this documentation by: 8 | 9 | - Sharing your deployment configurations and optimization tips 10 | - Adding code examples and troubleshooting guides 11 | - Documenting best practices 12 | 13 | ## 🚀 How to Contribute 14 | 15 | ```shell 16 | # Fork the repository 17 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 18 | cd sglang-cookbook 19 | git checkout -b add-deepseek-v3-guide 20 | 21 | # Edit this file and follow the format of existing guides 22 | # Submit a Pull Request 23 | ``` 24 | 25 | ## 📚 Reference 26 | 27 | - [DeepSeek-V3.2](/docs/DeepSeek/DeepSeek-V3_2.md) 28 | 29 | --- 30 | 31 | **Let's build this together!** 🌟 32 | -------------------------------------------------------------------------------- /sidebars.js: -------------------------------------------------------------------------------- 1 | // @ts-check 2 | 3 | // This runs in Node.js - Don't use client-side code here (browser APIs, JSX...) 4 | 5 | /** 6 | * Creating a sidebar enables you to: 7 | - create an ordered group of docs 8 | - render a sidebar for each doc of that group 9 | - provide next/previous navigation 10 | 11 | The sidebars can be generated from the filesystem, or explicitly defined here. 12 | 13 | Create as many sidebars as you want. 
14 | 15 | @type {import('@docusaurus/plugin-content-docs').SidebarsConfig} 16 | */ 17 | const sidebars = { 18 | // By default, Docusaurus generates a sidebar from the docs folder structure 19 | tutorialSidebar: [{type: 'autogenerated', dirName: '.'}], 20 | 21 | // But you can create a sidebar manually 22 | /* 23 | tutorialSidebar: [ 24 | 'intro', 25 | 'hello', 26 | { 27 | type: 'category', 28 | label: 'Tutorial', 29 | items: ['tutorial-basics/create-a-document'], 30 | }, 31 | ], 32 | */ 33 | }; 34 | 35 | export default sidebars; 36 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy Docusaurus 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | build-deploy: 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - name: Checkout repository 12 | uses: actions/checkout@v4 13 | 14 | - name: Setup Node.js 15 | uses: actions/setup-node@v4 16 | with: 17 | node-version: 20 18 | cache: npm 19 | 20 | - name: Install dependencies 21 | run: npm ci 22 | 23 | - name: Build Docusaurus 24 | run: npm run build 25 | 26 | - name: Install Vercel CLI 27 | run: npm install -g vercel 28 | 29 | - name: Deploy to Vercel 30 | run: | 31 | vercel deploy build --prod \ 32 | --yes \ 33 | --force \ 34 | --scope ${{ secrets.VERCEL_ORG_ID }} \ 35 | --token ${{ secrets.VERCEL_TOKEN }} 36 | env: 37 | VERCEL_ORG_ID: ${{ secrets.VERCEL_ORG_ID }} 38 | VERCEL_PROJECT_ID: ${{ secrets.VERCEL_PROJECT_ID }} 39 | -------------------------------------------------------------------------------- /src/css/custom.css: -------------------------------------------------------------------------------- 1 | /** 2 | * Any CSS included here will be global. The classic template 3 | * bundles Infima by default. Infima is a CSS framework designed to 4 | * work well for content-centric websites. 5 | */ 6 | 7 | /* You can override the default Infima variables here. */ 8 | :root { 9 | --ifm-color-primary: #2e8555; 10 | --ifm-color-primary-dark: #29784c; 11 | --ifm-color-primary-darker: #277148; 12 | --ifm-color-primary-darkest: #205d3b; 13 | --ifm-color-primary-light: #33925d; 14 | --ifm-color-primary-lighter: #359962; 15 | --ifm-color-primary-lightest: #3cad6e; 16 | --ifm-code-font-size: 95%; 17 | --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1); 18 | } 19 | 20 | /* For readability concerns, you should choose a lighter palette in dark mode. */ 21 | [data-theme='dark'] { 22 | --ifm-color-primary: #25c2a0; 23 | --ifm-color-primary-dark: #21af90; 24 | --ifm-color-primary-darker: #1fa588; 25 | --ifm-color-primary-darkest: #1a8870; 26 | --ifm-color-primary-light: #29d5b0; 27 | --ifm-color-primary-lighter: #32d8b4; 28 | --ifm-color-primary-lightest: #4fddbf; 29 | --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.3); 30 | } 31 | -------------------------------------------------------------------------------- /docs/Mistral/Mistral-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 2 3 | --- 4 | 5 | # Mistral 3 6 | 7 | :::info Community contribution welcome 8 | This guide is currently under development. If you have experience deploying **Mistral 3** with SGLang, please help us complete this documentation. 9 | 10 | To contribute, fork the repo, edit this page, and open a PR. 11 | ::: 12 | 13 | ## 1. Model Introduction 14 | 15 | This page will cover practical deployment configs and usage patterns for **Mistral 3** with SGLang. 
16 | 17 | ## 2. SGLang Installation 18 | 19 | Please refer to the [official SGLang installation guide](https://docs.sglang.ai/get_started/install.html). 20 | 21 | ## 3. Model Deployment 22 | 23 | Coming soon: recommended launch configs (TP/PP, quantization, context length) and tuning tips. 24 | 25 | ## 4. Model Invocation 26 | 27 | Coming soon: OpenAI-compatible API examples and tool-calling notes. 28 | 29 | ## Contributing 30 | 31 | ```shell 32 | git clone https://github.com/YOUR_USERNAME/sgl-cookbook.git 33 | cd sgl-cookbook 34 | git checkout -b add-mistral-3-guide 35 | # Edit this file and submit a PR 36 | ``` 37 | 38 | ## Reference 39 | 40 | - [Devstral 2](/docs/Mistral/Devstral-2.md) 41 | -------------------------------------------------------------------------------- /docs/Qwen/Qwen3-Coder-480B-A35B.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 4 3 | --- 4 | 5 | # Qwen3-Coder-480B-A35B 6 | 7 | :::info Community contribution welcome 8 | This guide is currently under development. If you have experience deploying **Qwen3-Coder-480B-A35B** with SGLang, please help us complete this documentation. 9 | 10 | To contribute, fork the repo, edit this page, and open a PR. 11 | ::: 12 | 13 | ## 1. Model Introduction 14 | 15 | This page will cover practical deployment configs and usage patterns for **Qwen3-Coder-480B-A35B** with SGLang. 16 | 17 | ## 2. SGLang Installation 18 | 19 | Please refer to the [official SGLang installation guide](https://docs.sglang.ai/get_started/install.html). 20 | 21 | ## 3. Model Deployment 22 | 23 | Coming soon: recommended launch configs (TP/PP, quantization, context length) and tuning tips. 24 | 25 | ## 4. Model Invocation 26 | 27 | Coming soon: OpenAI-compatible API examples and tool-calling notes. 
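As a placeholder, here is a minimal sketch of calling a running SGLang server through its OpenAI-compatible endpoint; the port (SGLang's default 30000) and the served model name are assumptions that must match your launch command.

```shell
# Minimal sketch: OpenAI-compatible chat completion against a local SGLang server
# (30000 is SGLang's default port; the model name must match what the server was launched with)
curl http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
    "messages": [{"role": "user", "content": "Write a function that checks whether a string is a palindrome."}],
    "max_tokens": 256
  }'
```

The same endpoint also works with any OpenAI-compatible client library by pointing its base URL at the server.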
28 | 29 | ## Contributing 30 | 31 | ```shell 32 | git clone https://github.com/YOUR_USERNAME/sgl-cookbook.git 33 | cd sgl-cookbook 34 | git checkout -b add-qwen3-coder-480b-guide 35 | # Edit this file and submit a PR 36 | ``` 37 | 38 | ## Reference 39 | 40 | - [Qwen3](./Qwen3) 41 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "sglang-cookbook", 3 | "version": "0.0.0", 4 | "private": true, 5 | "scripts": { 6 | "docusaurus": "docusaurus", 7 | "start": "docusaurus start", 8 | "build": "docusaurus build", 9 | "swizzle": "docusaurus swizzle", 10 | "deploy": "docusaurus deploy", 11 | "clear": "docusaurus clear", 12 | "serve": "docusaurus serve", 13 | "write-translations": "docusaurus write-translations", 14 | "write-heading-ids": "docusaurus write-heading-ids" 15 | }, 16 | "dependencies": { 17 | "@docusaurus/core": "3.9.2", 18 | "@docusaurus/preset-classic": "3.9.2", 19 | "@mdx-js/react": "^3.0.0", 20 | "clsx": "^2.0.0", 21 | "prism-react-renderer": "^2.3.0", 22 | "react": "^19.0.0", 23 | "react-dom": "^19.0.0" 24 | }, 25 | "devDependencies": { 26 | "@docusaurus/module-type-aliases": "3.9.2", 27 | "@docusaurus/types": "3.9.2" 28 | }, 29 | "browserslist": { 30 | "production": [ 31 | ">0.5%", 32 | "not dead", 33 | "not op_mini all" 34 | ], 35 | "development": [ 36 | "last 3 chrome version", 37 | "last 3 firefox version", 38 | "last 5 safari version" 39 | ] 40 | }, 41 | "engines": { 42 | "node": ">=20.0" 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/components/HomepageFeatures/index.js: -------------------------------------------------------------------------------- 1 | import clsx from 'clsx'; 2 | import Heading from '@theme/Heading'; 3 | import styles from './styles.module.css'; 4 | 5 | const FeatureList = [ 6 | { 7 | title: 'Easy to Use', 8 | Svg: require('@site/static/img/undraw_docusaurus_mountain.svg').default, 9 | description: ( 10 | <> 11 | Docusaurus was designed from the ground up to be easily installed and 12 | used to get your website up and running quickly. 13 | 14 | ), 15 | }, 16 | { 17 | title: 'Focus on What Matters', 18 | Svg: require('@site/static/img/undraw_docusaurus_tree.svg').default, 19 | description: ( 20 | <> 21 | Docusaurus lets you focus on your docs, and we'll do the chores. Go 22 | ahead and move your docs into the docs directory. 23 | 24 | ), 25 | }, 26 | { 27 | title: 'Powered by React', 28 | Svg: require('@site/static/img/undraw_docusaurus_react.svg').default, 29 | description: ( 30 | <> 31 | Extend or customize your website layout by reusing React. Docusaurus can 32 | be extended while reusing the same header and footer. 33 | 34 | ), 35 | }, 36 | ]; 37 | 38 | function Feature({Svg, title, description}) { 39 | return ( 40 |
41 |     <div className={clsx('col col--4')}>
42 |       <div className="text--center">
43 |         <Svg className={styles.featureSvg} role="img" />
44 |       </div>
45 |       <div className="text--center padding-horiz--md">
46 |         <Heading as="h3">{title}</Heading>
47 |         <p>{description}</p>
48 |       </div>
49 |     </div>
50 |   );
51 | }
52 | export default function HomepageFeatures() {
53 |   return (
54 |     <section className={styles.features}>
55 |       <div className="container">
56 |         <div className="row">
57 |           {FeatureList.map((props, idx) => (
58 |             <Feature key={idx} {...props} />
59 |           ))}
60 |         </div>
61 |       </div>
62 |     </section>
63 | ); 64 | } 65 | -------------------------------------------------------------------------------- /src/components/Llama31ConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ConfigGenerator from '../ConfigGenerator'; 3 | 4 | /** 5 | * Llama 3.1 70B Configuration Generator 6 | */ 7 | const Llama31ConfigGenerator = () => { 8 | const config = { 9 | modelFamily: 'meta-llama', 10 | 11 | options: { 12 | hardware: { 13 | name: 'hardware', 14 | title: 'Hardware Platform', 15 | items: [ 16 | { id: 'h100', label: 'H100 (4x)', default: true }, 17 | { id: 'h200', label: 'H200 (4x)', default: false } 18 | ] 19 | }, 20 | optimization: { 21 | name: 'optimization', 22 | title: 'Optimization Mode', 23 | items: [ 24 | { id: 'basic', label: 'Basic', default: true }, 25 | { id: 'throughput', label: 'Throughput Optimized', default: false }, 26 | { id: 'latency', label: 'Latency Optimized', default: false } 27 | ] 28 | } 29 | }, 30 | 31 | generateCommand: function(values) { 32 | const { hardware, optimization } = values; 33 | 34 | let cmd = 'python3 -m sglang.launch_server \\\n'; 35 | cmd += ` --model meta-llama/Llama-3.1-70B-Instruct \\\n`; 36 | cmd += ` --tp 4`; 37 | 38 | if (optimization === 'throughput') { 39 | cmd += ` \\\n --enable-dp-attention \\\n`; 40 | cmd += ` --mem-fraction-static 0.85`; 41 | } else if (optimization === 'latency') { 42 | cmd += ` \\\n --speculative-algorithm EAGLE \\\n`; 43 | cmd += ` --speculative-num-steps 3 \\\n`; 44 | cmd += ` --speculative-eagle-topk 1 \\\n`; 45 | cmd += ` --speculative-num-draft-tokens 4 \\\n`; 46 | cmd += ` --disable-shared-experts-fusion \\\n`; 47 | cmd += ` --max-running-requests 64 \\\n`; 48 | cmd += ` --mem-fraction-static 0.85 \\\n`; 49 | cmd += ` --kv-cache-dtype fp8_e4m3 \\\n`; 50 | cmd += ` --context-length 32768 \\\n`; 51 | cmd += ` --quantization fp8`; 52 | } 53 | 54 | cmd += ` \\\n --host 0.0.0.0 \\\n`; 55 | cmd += ` --port 8000`; 56 | 57 | return cmd; 58 | } 59 | }; 60 | 61 | return ; 62 | }; 63 | 64 | export default Llama31ConfigGenerator; 65 | 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SGLang Cookbook 2 | 3 | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 4 | [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/sgl-project/sgl-cookbook/pulls) 5 | 6 | A community-maintained repository of practical guides and recipes for deploying and using SGLang in production environments. Our mission is simple: answer the question **"How do I use SGLang (and related models) on hardware Y for task Z?"** with clear, actionable solutions. 7 | 8 | ## 🎯 What You'll Find Here 9 | 10 | This cookbook aggregates battle-tested SGLang recipes covering: 11 | 12 | - **Models**: Mainstream LLMs and Vision-Language Models (VLMs) 13 | - **Use Cases**: Inference serving, deployment strategies, multimodal applications 14 | - **Hardware**: GPU and CPU configurations, optimization for different accelerators 15 | - **Best Practices**: Configuration templates, performance tuning, troubleshooting guides 16 | 17 | Each recipe provides step-by-step instructions to help you quickly implement SGLang solutions for your specific requirements. 18 | 19 | ## 🚀 Quick Start 20 | 21 | 1. Browse the recipe index above to find your model 22 | 2. 
Follow the step-by-step instructions in each guide 23 | 3. Adapt configurations to your specific hardware and requirements 24 | 4. Join our community to share feedback and improvements 25 | 26 | ## 🤝 Contributing 27 | 28 | We believe the best documentation comes from practitioners. Whether you've optimized SGLang for a specific model, solved a tricky deployment challenge, or discovered performance improvements, we encourage you to contribute your recipes! 29 | 30 | **Ways to contribute:** 31 | 32 | - Add a new recipe for a model not yet covered 33 | - Improve existing recipes with additional tips or configurations 34 | - Report issues or suggest enhancements 35 | - Share your production deployment experiences 36 | 37 | **To contribute:** 38 | 39 | ```shell 40 | # Fork the repo and clone locally 41 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 42 | cd sglang-cookbook 43 | 44 | # Create a new branch 45 | git checkout -b add-my-recipe 46 | 47 | # Add your recipe following the template in DeepSeek-V3.2 48 | # Submit a PR! 49 | ``` 50 | 51 | ## 🛠️ Local Development 52 | 53 | ### Prerequisites 54 | 55 | - Node.js >= 20.0 56 | - npm or yarn 57 | 58 | ### Setup and Run 59 | 60 | Install dependencies and start the development server: 61 | 62 | ```shell 63 | # Install dependencies 64 | npm install 65 | 66 | # Start development server (hot reload enabled) 67 | npm start 68 | ``` 69 | 70 | The site will automatically open in your browser at `http://localhost:3000`. 71 | 72 | ## 📖 Resources 73 | 74 | - [SGLang GitHub](https://github.com/sgl-project/sglang) 75 | - [SGLang Documentation](https://sgl-project.github.io) 76 | - [Community Slack/Discord](https://discord.gg/MpEEuAeb) 77 | 78 | ## 📄 License 79 | 80 | This project is licensed under the Apache License 2.0 - see the [LICENSE](https://github.com/sgl-project/sgl-cookbook/blob/main/LICENSE) file for details. 81 | 82 | --- 83 | 84 | **Let's build this resource together!** 🚀 Star the repo and contribute your recipes to help the SGLang community grow. 85 | -------------------------------------------------------------------------------- /src/components/Devstral2ConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ConfigGenerator from '../ConfigGenerator'; 3 | 4 | /** 5 | * Devstral 2 Configuration Generator 6 | * Covers: 7 | * - mistralai/Devstral-Small-2-24B-Instruct-2512 8 | * - mistralai/Devstral-2-123B-Instruct-2512 (FP8 weights) 9 | */ 10 | const Devstral2ConfigGenerator = () => { 11 | const config = { 12 | modelFamily: 'Mistral', 13 | 14 | options: { 15 | hardware: { 16 | name: 'hardware', 17 | title: 'Hardware Platform', 18 | items: [ 19 | { id: 'b200', label: 'B200', default: true }, 20 | { id: 'h200', label: 'H200', default: false }, 21 | { id: 'h100', label: 'H100', default: false } 22 | ] 23 | }, 24 | model: { 25 | name: 'model', 26 | title: 'Model', 27 | items: [ 28 | { id: 'small', label: 'Devstral Small 2 (24B)', default: true }, 29 | { id: 'large', label: 'Devstral 2 (123B)', default: false } 30 | ] 31 | }, 32 | weights: { 33 | name: 'weights', 34 | title: 'Weights / Precision', 35 | items: [ 36 | { id: 'fp8', label: 'FP8', default: true } 37 | ] 38 | }, 39 | toolcall: { 40 | name: 'toolcall', 41 | title: 'Tool Call Parser', 42 | items: [ 43 | { id: 'disabled', label: 'Disabled', default: true }, 44 | { id: 'enabled', label: 'Enabled', default: false } 45 | ], 46 | commandRule: (value) => value === 'enabled' ? 
'--tool-call-parser mistral' : null 47 | } 48 | }, 49 | 50 | modelConfigs: { 51 | small: { 52 | modelId: 'mistralai/Devstral-Small-2-24B-Instruct-2512', 53 | tpByHardware: { h100: 1, h200: 1, b200: 1 }, 54 | allowedWeights: ['fp8'] 55 | }, 56 | large: { 57 | modelId: 'mistralai/Devstral-2-123B-Instruct-2512', 58 | tpByHardware: { h100: 4, h200: 2, b200: 2 }, 59 | allowedWeights: ['fp8'] 60 | } 61 | }, 62 | 63 | generateCommand: function (values) { 64 | const { hardware, model, weights } = values; 65 | 66 | const modelCfg = this.modelConfigs[model]; 67 | if (!modelCfg) return `# Error: Unknown model selection: ${model}`; 68 | 69 | if (!modelCfg.allowedWeights.includes(weights)) { 70 | const allowed = modelCfg.allowedWeights.map(w => w.toUpperCase()).join(', '); 71 | return `# Error: ${modelCfg.modelId} only supports: ${allowed}\n# Please change "Weights / Precision" to a supported value.`; 72 | } 73 | 74 | const tp = modelCfg.tpByHardware[hardware]; 75 | if (!tp) return `# Error: Unknown hardware platform: ${hardware}`; 76 | 77 | let cmd = 'python -m sglang.launch_server \\\n'; 78 | cmd += ` --model ${modelCfg.modelId}`; 79 | 80 | if (tp > 1) { 81 | cmd += ` \\\n --tp ${tp}`; 82 | } 83 | 84 | // Append optional flags (e.g. tool calling) 85 | for (const [key, option] of Object.entries(this.options)) { 86 | if (option.commandRule) { 87 | const rule = option.commandRule(values[key]); 88 | if (rule) cmd += ` \\\n ${rule}`; 89 | } 90 | } 91 | 92 | return cmd; 93 | } 94 | }; 95 | 96 | return ; 97 | }; 98 | 99 | export default Devstral2ConfigGenerator; 100 | 101 | -------------------------------------------------------------------------------- /src/components/ConfigGenerator/styles.module.css: -------------------------------------------------------------------------------- 1 | .configContainer { 2 | max-width: 1000px; 3 | margin: 0 auto; 4 | display: flex; 5 | flex-wrap: wrap; 6 | gap: 12px; 7 | } 8 | 9 | .optionCard { 10 | background: var(--ifm-background-surface-color); 11 | padding: 16px; 12 | border-radius: 10px; 13 | box-shadow: 0 3px 10px rgba(0, 0, 0, 0.08); 14 | border: 1px solid var(--ifm-color-emphasis-300); 15 | flex: 1 1 calc(50% - 6px); 16 | min-width: 400px; 17 | } 18 | 19 | .optionTitle { 20 | font-size: 14px; 21 | font-weight: 600; 22 | color: var(--ifm-font-color-base); 23 | margin-bottom: 10px; 24 | display: flex; 25 | align-items: center; 26 | } 27 | 28 | .optionNumber { 29 | background: #667eea; 30 | color: white; 31 | width: 22px; 32 | height: 22px; 33 | border-radius: 50%; 34 | display: inline-flex; 35 | align-items: center; 36 | justify-content: center; 37 | margin-right: 8px; 38 | font-size: 12px; 39 | } 40 | 41 | .optionItems { 42 | display: flex; 43 | gap: 8px; 44 | flex-wrap: wrap; 45 | align-items: center; 46 | } 47 | 48 | .hiddenInput { 49 | display: none; 50 | } 51 | 52 | .textInput { 53 | flex: 1; 54 | min-width: 200px; 55 | padding: 10px 14px; 56 | border: 2px solid var(--ifm-color-emphasis-300); 57 | border-radius: 6px; 58 | font-size: 14px; 59 | transition: all 0.3s; 60 | background: var(--ifm-background-surface-color); 61 | color: var(--ifm-font-color-base); 62 | } 63 | 64 | .textInput:focus { 65 | outline: none; 66 | border-color: #667eea; 67 | box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1); 68 | } 69 | 70 | .optionLabel { 71 | padding: 8px 18px; 72 | border: 2px solid var(--ifm-color-emphasis-300); 73 | border-radius: 6px; 74 | cursor: pointer; 75 | display: inline-block; 76 | font-weight: 500; 77 | font-size: 13px; 78 | transition: all 0.3s; 79 | background: 
var(--ifm-background-surface-color); 80 | color: var(--ifm-font-color-base); 81 | user-select: none; 82 | flex-shrink: 0; 83 | } 84 | 85 | .optionLabel:hover { 86 | border-color: #667eea; 87 | } 88 | 89 | .optionLabel.checked { 90 | background: #dc3545; 91 | color: white; 92 | border-color: #d55816; 93 | } 94 | 95 | .optionLabel.disabled { 96 | cursor: not-allowed; 97 | opacity: 0.7; 98 | } 99 | 100 | .optionLabel.disabled:hover { 101 | border-color: var(--ifm-color-emphasis-300); 102 | } 103 | 104 | .subtitle { 105 | display: block; 106 | color: var(--ifm-color-emphasis-600); 107 | font-size: 10px; 108 | margin-top: 2px; 109 | } 110 | 111 | .optionLabel.checked .subtitle { 112 | color: rgba(255, 255, 255, 0.85); 113 | } 114 | 115 | .commandCard { 116 | background: var(--ifm-background-surface-color); 117 | padding: 16px; 118 | border-radius: 10px; 119 | box-shadow: 0 3px 10px rgba(0, 0, 0, 0.08); 120 | border: 1px solid var(--ifm-color-emphasis-300); 121 | flex: 1 1 100%; 122 | width: 100%; 123 | } 124 | 125 | .commandTitle { 126 | font-size: 15px; 127 | font-weight: 600; 128 | color: var(--ifm-font-color-base); 129 | margin-bottom: 10px; 130 | } 131 | 132 | .commandDisplay { 133 | padding: 16px; 134 | background: #2d3748; 135 | border-radius: 6px; 136 | font-family: 'Menlo', 'Monaco', 'Courier New', monospace; 137 | font-size: 13px; 138 | line-height: 1.7; 139 | color: #e2e8f0; 140 | white-space: pre-wrap; 141 | overflow-x: auto; 142 | border: none; 143 | margin: 0; 144 | } 145 | 146 | -------------------------------------------------------------------------------- /src/components/Llama4ScoutConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ConfigGenerator from '../ConfigGenerator'; 3 | 4 | /** 5 | * Llama 4-Scout Configuration Generator 6 | */ 7 | const Llama4ScoutConfigGenerator = () => { 8 | const config = { 9 | modelFamily: 'meta-llama', 10 | 11 | options: { 12 | hardware: { 13 | name: 'hardware', 14 | title: 'Hardware Platform', 15 | items: [ 16 | { id: 'b200', label: 'B200', default: false }, 17 | { id: 'h100', label: 'H100', default: true }, 18 | { id: 'h200', label: 'H200', default: false } 19 | ] 20 | }, 21 | quantization: { 22 | name: 'quantization', 23 | title: 'Quantization', 24 | items: [ 25 | { id: 'bf16', label: 'BF16', default: true }, 26 | { id: 'fp8', label: 'FP8', default: false } 27 | ] 28 | }, 29 | toolcall: { 30 | name: 'toolcall', 31 | title: 'Tool Call Parser', 32 | items: [ 33 | { id: 'disabled', label: 'Disabled', default: true }, 34 | { id: 'enabled', label: 'Enabled', default: false } 35 | ] 36 | }, 37 | speculative: { 38 | name: 'speculative', 39 | title: 'Speculative Decoding (EAGLE3)', 40 | items: [ 41 | { id: 'disabled', label: 'Disabled', default: true }, 42 | { id: 'enabled', label: 'Enable EAGLE3', default: false } 43 | ] 44 | }, 45 | host: { 46 | name: 'host', 47 | title: 'Host', 48 | type: 'text', 49 | default: '0.0.0.0', 50 | placeholder: '0.0.0.0' 51 | }, 52 | port: { 53 | name: 'port', 54 | title: 'Port', 55 | type: 'text', 56 | default: '8000', 57 | placeholder: '8000' 58 | } 59 | }, 60 | 61 | generateCommand: function(values) { 62 | const { hardware, quantization, toolcall, speculative, host, port } = values; 63 | 64 | let cmd = 'python -m sglang.launch_server \\\n'; 65 | cmd += ` --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct`; 66 | 67 | if (hardware === 'h100' || hardware === 'h200') { 68 | cmd += ` \\\n --tp 8`; 69 | } else if (hardware 
=== 'b200') { 70 | cmd += ` \\\n --tp 8`; 71 | } 72 | 73 | if (quantization === 'fp8') { 74 | cmd += ` \\\n --quantization fp8`; 75 | } 76 | 77 | if (toolcall === 'enabled') { 78 | cmd += ` \\\n --tool-call-parser pythonic`; 79 | } 80 | 81 | if (speculative === 'enabled') { 82 | cmd += ` \\\n --speculative-algorithm EAGLE3 \\\n`; 83 | cmd += ` --speculative-draft-model-path lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1 \\\n`; 84 | cmd += ` --speculative-num-steps 3 \\\n`; 85 | cmd += ` --speculative-eagle-topk 1 \\\n`; 86 | cmd += ` --speculative-num-draft-tokens 4 \\\n`; 87 | cmd += ` --mem-fraction-static 0.75 \\\n`; 88 | cmd += ` --cuda-graph-max-bs 2`; 89 | } 90 | 91 | cmd += ` \\\n --enable-multimodal`; 92 | cmd += ` \\\n --context-length 65536`; 93 | cmd += ` \\\n --dtype bfloat16`; 94 | cmd += ` \\\n --trust-remote-code`; 95 | cmd += ` \\\n --host ${host || '0.0.0.0'}`; 96 | cmd += ` \\\n --port ${port || '8000'}`; 97 | 98 | return cmd; 99 | } 100 | }; 101 | 102 | return ; 103 | }; 104 | 105 | export default Llama4ScoutConfigGenerator; 106 | 107 | -------------------------------------------------------------------------------- /src/components/KimiK2ConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ConfigGenerator from '../ConfigGenerator'; 3 | 4 | /** 5 | * Kimi-K2 Configuration Generator 6 | * Supports Kimi-K2-Instruct and Kimi-K2-Thinking models 7 | */ 8 | const KimiK2ConfigGenerator = () => { 9 | const config = { 10 | modelFamily: 'moonshotai', 11 | 12 | options: { 13 | hardware: { 14 | name: 'hardware', 15 | title: 'Hardware Platform', 16 | items: [ 17 | { id: 'h200', label: 'H200', default: true }, 18 | { id: 'b200', label: 'B200', default: false } 19 | ] 20 | }, 21 | modelname: { 22 | name: 'modelname', 23 | title: 'Model Name', 24 | items: [ 25 | { id: 'instruct', label: 'Kimi-K2-Instruct', default: true }, 26 | { id: 'thinking', label: 'Kimi-K2-Thinking', default: false } 27 | ] 28 | }, 29 | strategy: { 30 | name: 'strategy', 31 | title: 'Deployment Strategy', 32 | type: 'checkbox', 33 | items: [ 34 | { id: 'tp', label: 'TP', default: true, required: true }, 35 | { id: 'dp', label: 'DP attention', default: false }, 36 | { id: 'ep', label: 'EP', default: false } 37 | ] 38 | }, 39 | reasoning: { 40 | name: 'reasoning', 41 | title: 'Reasoning Parser', 42 | items: [ 43 | { id: 'disabled', label: 'Disabled', default: true }, 44 | { id: 'enabled', label: 'Enabled', default: false } 45 | ] 46 | }, 47 | toolcall: { 48 | name: 'toolcall', 49 | title: 'Tool Call Parser', 50 | items: [ 51 | { id: 'disabled', label: 'Disabled', default: true }, 52 | { id: 'enabled', label: 'Enabled', default: false } 53 | ] 54 | } 55 | }, 56 | 57 | generateCommand: function (values) { 58 | const { hardware, modelname, strategy, reasoning, toolcall } = values; 59 | 60 | // Validation: Kimi-K2-Instruct doesn't support reasoning parser 61 | if (modelname === 'instruct' && reasoning === 'enabled') { 62 | return `# Error: Kimi-K2-Instruct doesn't support reasoning parser\n# Please select "Disabled" for Reasoning Parser or choose Kimi-K2-Thinking model`; 63 | } 64 | 65 | // Model name mapping 66 | const modelMap = { 67 | 'instruct': 'Kimi-K2-Instruct', 68 | 'thinking': 'Kimi-K2-Thinking' 69 | }; 70 | 71 | const modelName = `${this.modelFamily}/${modelMap[modelname]}`; 72 | 73 | let cmd = 'python3 -m sglang.launch_server \\\n'; 74 | cmd += ` --model-path ${modelName}`; 75 | 76 | // Strategy configurations 
77 | const strategyArray = Array.isArray(strategy) ? strategy : []; 78 | // TP is mandatory 79 | cmd += ` \\\n --tp 8`; 80 | if (strategyArray.includes('dp')) { 81 | cmd += ` \\\n --dp 4 \\\n --enable-dp-attention`; 82 | } 83 | if (strategyArray.includes('ep')) { 84 | cmd += ` \\\n --ep 4`; 85 | } 86 | 87 | // Add trust-remote-code (required for Kimi-K2) 88 | cmd += ` \\\n --trust-remote-code`; 89 | 90 | // Add tool-call-parser if enabled 91 | if (toolcall === 'enabled') { 92 | cmd += ` \\\n --tool-call-parser kimi_k2`; 93 | } 94 | 95 | // Add reasoning-parser if enabled 96 | if (reasoning === 'enabled') { 97 | cmd += ` \\\n --reasoning-parser kimi_k2`; 98 | } 99 | 100 | return cmd; 101 | } 102 | }; 103 | 104 | return ; 105 | }; 106 | 107 | export default KimiK2ConfigGenerator; 108 | 109 | -------------------------------------------------------------------------------- /src/components/InternS1ConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ConfigGenerator from '../ConfigGenerator'; 3 | 4 | /** 5 | * Intern-S1 Configuration Generator 6 | * Supports Intern-S1 (235B MOE) and Intern-S1-mini (8B Dense) models 7 | */ 8 | const InternS1ConfigGenerator = () => { 9 | const config = { 10 | modelFamily: 'Intern', 11 | 12 | options: { 13 | hardware: { 14 | name: 'hardware', 15 | title: 'Hardware Platform', 16 | items: [ 17 | { id: 'b200', label: 'B200', default: true }, 18 | { id: 'h100', label: 'H100', default: false }, 19 | { id: 'h200', label: 'H200', default: false } 20 | ] 21 | }, 22 | modelsize: { 23 | name: 'modelsize', 24 | title: 'Model Size', 25 | items: [ 26 | { id: 'S1', label: '235b', subtitle: 'MOE', default: true }, 27 | { id: 'S1-mini', label: '8b', subtitle: 'Dense', default: false } 28 | ] 29 | }, 30 | quantization: { 31 | name: 'quantization', 32 | title: 'Quantization', 33 | items: [ 34 | { id: 'bf16', label: 'BF16', default: true }, 35 | { id: 'fp8', label: 'FP8', default: false } 36 | ] 37 | }, 38 | reasoning_parser: { 39 | name: 'reasoning_parser', 40 | title: 'Reasoning Parser', 41 | items: [ 42 | { id: 'disabled', label: 'Disabled', default: true }, 43 | { id: 'enabled', label: 'Enabled', default: false } 44 | ] 45 | }, 46 | toolcall: { 47 | name: 'toolcall', 48 | title: 'Tool Call Parser', 49 | items: [ 50 | { id: 'disabled', label: 'Disabled', default: true }, 51 | { id: 'enabled', label: 'Enabled', default: false } 52 | ] 53 | } 54 | }, 55 | 56 | modelConfigs: { 57 | 'S1': { 58 | baseName: 'S1', 59 | isMOE: true, 60 | h100: { tp: 8, ep: 0, bf16: true, fp8: true }, 61 | h200: { tp: 8, ep: 0, bf16: true, fp8: true }, 62 | b200: { tp: 8, ep: 0, bf16: true, fp8: true } 63 | }, 64 | 'S1-mini': { 65 | baseName: 'S1-mini', 66 | isMOE: true, 67 | h100: { tp: 1, ep: 0, bf16: true, fp8: true }, 68 | h200: { tp: 1, ep: 0, bf16: true, fp8: true }, 69 | b200: { tp: 1, ep: 0, bf16: true, fp8: true } 70 | } 71 | }, 72 | 73 | generateCommand: function(values) { 74 | const { hardware, modelsize, quantization, reasoning_parser, toolcall } = values; 75 | 76 | const modelConfig = this.modelConfigs[modelsize]; 77 | if (!modelConfig) { 78 | return `# Error: Unknown model size: ${modelsize}`; 79 | } 80 | 81 | const hwConfig = modelConfig[hardware]; 82 | if (!hwConfig) { 83 | return `# Error: Unknown hardware platform: ${hardware}`; 84 | } 85 | 86 | const quantSuffix = quantization === 'fp8' ? 
'-FP8' : ''; 87 | const modelName = `internlm/Intern-${modelConfig.baseName}${quantSuffix}`; 88 | 89 | let cmd = 'python -m sglang.launch_server \\\n'; 90 | cmd += ` --model ${modelName}`; 91 | 92 | if (hwConfig.tp > 1) { 93 | cmd += ` \\\n --tp ${hwConfig.tp}`; 94 | } 95 | 96 | let ep = hwConfig.ep; 97 | if (quantization === 'fp8' && hwConfig.tp === 8) { 98 | ep = 2; 99 | } 100 | 101 | if (ep > 0) { 102 | cmd += ` \\\n --ep ${ep}`; 103 | } 104 | 105 | if (reasoning_parser === 'enabled') { 106 | cmd += ` \\\n --reasoning-parser interns1`; 107 | } 108 | 109 | if (toolcall === 'enabled') { 110 | cmd += ` \\\n --tool-call-parser interns1`; 111 | } 112 | 113 | cmd += ` \\\n --trust-remote-code`; 114 | 115 | return cmd; 116 | } 117 | }; 118 | 119 | return ; 120 | }; 121 | 122 | export default InternS1ConfigGenerator; 123 | 124 | -------------------------------------------------------------------------------- /src/components/NemotronConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ConfigGenerator from '../ConfigGenerator'; 3 | 4 | /** 5 | * NVIDIA Nemotron-Nano-3-30B-A3B Configuration Generator 6 | */ 7 | const NemotronNano3ConfigGenerator = () => { 8 | const config = { 9 | modelFamily: 'nvidia', 10 | 11 | options: { 12 | hardware: { 13 | name: 'hardware', 14 | title: 'Hardware Platform', 15 | items: [ 16 | { id: 'h200', label: 'H200', default: false }, 17 | { id: 'b200', label: 'B200', default: true } 18 | ] 19 | }, 20 | modelVariant: { 21 | name: 'modelVariant', 22 | title: 'Model Variant', 23 | items: [ 24 | { id: 'bf16', label: 'BF16', default: true }, 25 | { id: 'fp8', label: 'FP8', default: false } 26 | ] 27 | }, 28 | tp: { 29 | name: 'tp', 30 | title: 'Tensor Parallel (TP)', 31 | items: [ 32 | { id: '1', label: 'TP=1', default: true }, 33 | { id: '2', label: 'TP=2', default: false }, 34 | { id: '4', label: 'TP=4', default: false }, 35 | { id: '8', label: 'TP=8', default: false } 36 | ] 37 | }, 38 | kvcache: { 39 | name: 'kvcache', 40 | title: 'KV Cache DType', 41 | items: [ 42 | { id: 'fp8_e4m3', label: 'fp8_e4m3', default: true }, 43 | { id: 'bf16', label: 'bf16', default: false } 44 | ] 45 | }, 46 | thinking: { 47 | name: 'thinking', 48 | title: 'Reasoning Parser', 49 | items: [ 50 | { id: 'disabled', label: 'Disabled', default: true }, 51 | { id: 'enabled', label: 'Enabled', default: false } 52 | ], 53 | commandRule: (value) => value === 'enabled' ? '--reasoning-parser nano_v3' : null 54 | }, 55 | toolcall: { 56 | name: 'toolcall', 57 | title: 'Tool Call Parser', 58 | items: [ 59 | { id: 'disabled', label: 'Disabled', default: true }, 60 | { id: 'enabled', label: 'Enabled', default: false } 61 | ], 62 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser qwen3_coder' : null 63 | }, 64 | host: { 65 | name: 'host', 66 | title: 'Host', 67 | type: 'text', 68 | default: '0.0.0.0', 69 | placeholder: '0.0.0.0' 70 | }, 71 | port: { 72 | name: 'port', 73 | title: 'Port', 74 | type: 'text', 75 | default: '30000', 76 | placeholder: '30000' 77 | } 78 | }, 79 | 80 | generateCommand: function(values) { 81 | const { hardware, modelVariant, tp, kvcache, thinking, toolcall, host, port } = values; 82 | 83 | // Default to FP8 if not selected 84 | const variant = modelVariant || 'fp8'; 85 | const baseName = 'NVIDIA-Nemotron-3-Nano-30B-A3B'; 86 | 87 | const modelName = 88 | variant === 'bf16' 89 | ? 
`${this.modelFamily}/${baseName}-BF16` 90 | : `${this.modelFamily}/${baseName}-FP8`; 91 | 92 | let cmd = 'python3 -m sglang.launch_server \\\n'; 93 | cmd += ` --model-path ${modelName} \\\n`; 94 | cmd += ` --trust-remote-code \\\n`; 95 | cmd += ` --tp ${tp} \\\n`; 96 | cmd += ` --kv-cache-dtype ${kvcache} \\\n`; 97 | 98 | // Add thinking parser and tool call parser if enabled 99 | for (const [key, option] of Object.entries(this.options)) { 100 | if (option.commandRule) { 101 | const rule = option.commandRule(values[key]); 102 | if (rule) { 103 | cmd += ` ${rule} \\\n`; 104 | } 105 | } 106 | } 107 | 108 | 109 | cmd += ` --host ${host || '0.0.0.0'} \\\n`; 110 | cmd += ` --port ${port || '30000'}`; 111 | 112 | return cmd; 113 | } 114 | }; 115 | 116 | return ; 117 | }; 118 | 119 | export default NemotronNano3ConfigGenerator; 120 | -------------------------------------------------------------------------------- /src/components/Qwen3NextConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ConfigGenerator from '../ConfigGenerator'; 3 | 4 | /** 5 | * Qwen3-Next Configuration Generator 6 | * Supports Qwen3-Next 80B model with speculative decoding option 7 | */ 8 | const Qwen3NextConfigGenerator = () => { 9 | const config = { 10 | modelFamily: 'Qwen', 11 | 12 | options: { 13 | hardware: { 14 | name: 'hardware', 15 | title: 'Hardware Platform', 16 | items: [ 17 | { id: 'b200', label: 'B200', default: true }, 18 | { id: 'h200', label: 'H200', default: false }, 19 | { id: 'h100', label: 'H100', default: false } 20 | ] 21 | }, 22 | modelsize: { 23 | name: 'modelsize', 24 | title: 'Model Size', 25 | items: [ 26 | { id: '80b', label: '80B', subtitle: 'MOE', default: true }, 27 | ] 28 | }, 29 | quantization: { 30 | name: 'quantization', 31 | title: 'Quantization', 32 | items: [ 33 | { id: 'bf16', label: 'BF16', subtitle: 'Full Weights', default: true }, 34 | { id: 'fp8', label: 'FP8', subtitle: 'High Throughput', default: false } 35 | ] 36 | }, 37 | thinking: { 38 | name: 'thinking', 39 | title: 'Thinking Capabilities', 40 | items: [ 41 | { id: 'instruct', label: 'Instruct', subtitle: 'General Purpose', default: true }, 42 | { id: 'thinking', label: 'Thinking', subtitle: 'Reasoning / CoT', default: false } 43 | ], 44 | commandRule: (value) => value === 'thinking' ? '--reasoning-parser qwen3' : null 45 | }, 46 | toolcall: { 47 | name: 'toolcall', 48 | title: 'Tool Call Parser', 49 | items: [ 50 | { id: 'disabled', label: 'Disabled', default: true }, 51 | { id: 'enabled', label: 'Enabled', default: false } 52 | ], 53 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser qwen' : null 54 | }, 55 | speculative: { 56 | name: 'speculative', 57 | title: 'Speculative Decoding', 58 | items: [ 59 | { id: 'disabled', label: 'Disabled', default: true }, 60 | { id: 'enabled', label: 'Enabled', default: false } 61 | ], 62 | commandRule: (value) => value === 'enabled' ? 
'--speculative-algorithm EAGLE \\\n --speculative-num-steps 3 \\\n --speculative-eagle-topk 1 \\\n --speculative-num-draft-tokens 4' : null 63 | } 64 | }, 65 | 66 | modelConfigs: { 67 | '80b': { 68 | baseName: '80B-A3B', 69 | isMOE: true, 70 | h100: { tp: 4, ep: 0, bf16: true, fp8: true }, 71 | h200: { tp: 2, ep: 0, bf16: true, fp8: true }, 72 | b200: { tp: 2, ep: 0, bf16: true, fp8: true } 73 | } 74 | }, 75 | 76 | generateCommand: function (values) { 77 | const { hardware, modelsize: modelSize, quantization, thinking } = values; 78 | const commandKey = `${hardware}-${modelSize}-${quantization}-${thinking}`; 79 | 80 | const config = this.modelConfigs[modelSize]; 81 | if (!config) { 82 | return `# Error: Unknown model size: ${modelSize}`; 83 | } 84 | 85 | const hwConfig = config[hardware]; 86 | if (!hwConfig) { 87 | return `# Error: Unknown hardware platform: ${hardware}`; 88 | } 89 | 90 | const quantSuffix = quantization === 'fp8' ? '-FP8' : ''; 91 | const thinkingSuffix = thinking === 'thinking' ? '-Thinking' : '-Instruct'; 92 | const modelName = `Qwen/Qwen3-Next-${config.baseName}${thinkingSuffix}${quantSuffix}`; 93 | 94 | let cmd = 'python -m sglang.launch_server \\\n'; 95 | cmd += ` --model ${modelName}`; 96 | 97 | if (hwConfig.tp > 1) { 98 | cmd += ` \\\n --tp ${hwConfig.tp}`; 99 | } 100 | 101 | let ep = hwConfig.ep; 102 | if (quantization === 'fp8' && hwConfig.tp === 8) { 103 | ep = 2; 104 | } 105 | 106 | if (ep > 0) { 107 | cmd += ` \\\n --ep ${ep}`; 108 | } 109 | 110 | for (const [key, option] of Object.entries(this.options)) { 111 | 112 | if (option.commandRule) { 113 | const rule = option.commandRule(values[key]); 114 | if (rule) { 115 | cmd += ` \\\n ${rule}`; 116 | } 117 | } 118 | } 119 | 120 | return cmd; 121 | } 122 | }; 123 | 124 | return ; 125 | }; 126 | 127 | export default Qwen3NextConfigGenerator; 128 | 129 | -------------------------------------------------------------------------------- /src/components/GLM46VConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ConfigGenerator from '../ConfigGenerator'; 3 | 4 | /** 5 | * GLM-4.6V Configuration Generator 6 | * Supports GLM-4.6V (106B) and GLM-4.6V-Flash (9B) models 7 | */ 8 | const GLM46VConfigGenerator = () => { 9 | const config = { 10 | modelFamily: 'GLM', 11 | 12 | options: { 13 | hardware: { 14 | name: 'hardware', 15 | title: 'Hardware Platform', 16 | items: [ 17 | { id: 'b200', label: 'B200', default: true }, 18 | { id: 'h100', label: 'H100', default: false }, 19 | { id: 'h200', label: 'H200', default: false } 20 | ] 21 | }, 22 | modelsize: { 23 | name: 'modelsize', 24 | title: 'Model Size', 25 | items: [ 26 | { id: '106b', label: '106B', subtitle: 'GLM-4.6V', default: true }, 27 | { id: '9b', label: '9B', subtitle: 'GLM-4.6V-Flash', default: false } 28 | ] 29 | }, 30 | quantization: { 31 | name: 'quantization', 32 | title: 'Quantization', 33 | items: [ 34 | { id: 'bf16', label: 'BF16', default: true }, 35 | { id: 'fp8', label: 'FP8', default: false } 36 | ] 37 | }, 38 | reasoning: { 39 | name: 'reasoning', 40 | title: 'Reasoning Parser', 41 | items: [ 42 | { id: 'enabled', label: 'Enabled', default: true }, 43 | { id: 'disabled', label: 'Disabled', default: false } 44 | ], 45 | commandRule: (value) => value === 'enabled' ? 
'--reasoning-parser glm45' : null 46 | }, 47 | toolcall: { 48 | name: 'toolcall', 49 | title: 'Tool Call Parser', 50 | items: [ 51 | { id: 'enabled', label: 'Enabled', default: true }, 52 | { id: 'disabled', label: 'Disabled', default: false } 53 | ], 54 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser glm45' : null 55 | }, 56 | host: { 57 | name: 'host', 58 | title: 'Host', 59 | type: 'text', 60 | default: '0.0.0.0', 61 | placeholder: '0.0.0.0' 62 | }, 63 | port: { 64 | name: 'port', 65 | title: 'Port', 66 | type: 'text', 67 | default: '30000', 68 | placeholder: '30000' 69 | } 70 | }, 71 | 72 | modelConfigs: { 73 | '106b': { 74 | baseName: 'GLM-4.6V', 75 | h100: { tp: 8, bf16: true, fp8: true }, 76 | h200: { tp: 8, bf16: true, fp8: true }, 77 | b200: { tp: 8, bf16: true, fp8: true } 78 | }, 79 | '9b': { 80 | baseName: 'GLM-4.6V-Flash', 81 | h100: { tp: 1, bf16: true, fp8: true }, 82 | h200: { tp: 1, bf16: true, fp8: true }, 83 | b200: { tp: 1, bf16: true, fp8: true } 84 | } 85 | }, 86 | 87 | specialCommands: {}, 88 | 89 | generateCommand: function (values) { 90 | const { hardware, modelsize: modelSize, quantization, reasoning, toolcall } = values; 91 | const commandKey = `${hardware}-${modelSize}-${quantization}`; 92 | 93 | if (this.specialCommands[commandKey]) { 94 | return this.specialCommands[commandKey]; 95 | } 96 | 97 | const config = this.modelConfigs[modelSize]; 98 | if (!config) { 99 | return `# Error: Unknown model size: ${modelSize}`; 100 | } 101 | 102 | const hwConfig = config[hardware]; 103 | if (!hwConfig) { 104 | return `# Error: Unknown hardware platform: ${hardware}`; 105 | } 106 | 107 | const quantSuffix = quantization === 'fp8' ? '-FP8' : ''; 108 | const modelName = `zai-org/${config.baseName}${quantSuffix}`; 109 | 110 | let cmd = 'python -m sglang.launch_server \\\n'; 111 | cmd += ` --model ${modelName}`; 112 | 113 | if (hwConfig.tp > 1) { 114 | cmd += ` \\\n --tp ${hwConfig.tp}`; 115 | } 116 | 117 | for (const [key, option] of Object.entries(this.options)) { 118 | if (key === 'host' || key === 'port') continue; 119 | 120 | if (option.commandRule) { 121 | const rule = option.commandRule(values[key]); 122 | if (rule) { 123 | cmd += ` \\\n ${rule}`; 124 | } 125 | } 126 | } 127 | 128 | const host = values.host || CONFIG.options.host.default; 129 | const port = values.port || CONFIG.options.port.default; 130 | cmd += ` \\\n --host ${host} \\\n --port ${port}`; 131 | 132 | return cmd; 133 | } 134 | }; 135 | 136 | return ; 137 | }; 138 | 139 | export default GLM46VConfigGenerator; 140 | 141 | -------------------------------------------------------------------------------- /src/components/DeepSeekR1ConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ConfigGenerator from '../ConfigGenerator'; 3 | 4 | const DeepSeekR1ConfigGenerator = () => { 5 | const config = { 6 | modelFamily: 'deepseek-ai', 7 | 8 | options: { 9 | hardware: { 10 | name: 'hardware', 11 | title: 'Hardware Platform', 12 | items: [ 13 | { id: 'h100', label: 'H100', default: false }, 14 | { id: 'h200', label: 'H200', default: false }, 15 | { id: 'b200', label: 'B200', default: true } 16 | ] 17 | }, 18 | quantization: { 19 | name: 'quantization', 20 | title: 'Quantization', 21 | items: [ 22 | { id: 'fp8', label: 'FP8', default: true }, 23 | { id: 'fp4', label: 'FP4', default: false } 24 | ] 25 | }, 26 | strategy: { 27 | name: 'strategy', 28 | title: 'Deployment Strategy', 29 | type: 'checkbox', 30 | 
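// Multi-select group: TP is marked required in the items below, so it is effectively always part of the generated command; DP/EP/MTP stack on top of it.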
items: [ 31 | { id: 'tp', label: 'TP', subtitle: 'Tensor Parallel', default: true, required: true }, 32 | { id: 'dp', label: 'DP', subtitle: 'Data Parallel', default: false }, 33 | { id: 'ep', label: 'EP', subtitle: 'Expert Parallel', default: false }, 34 | { id: 'mtp', label: 'MTP', subtitle: 'Multi-token Prediction', default: false } 35 | ] 36 | }, 37 | thinking: { 38 | name: 'thinking', 39 | title: 'Reasoning Parser', 40 | items: [ 41 | { id: 'disabled', label: 'Disabled', default: true }, 42 | { id: 'enabled', label: 'Enabled', default: false } 43 | ], 44 | commandRule: (value) => value === 'enabled' ? '--reasoning-parser deepseek-r1' : null 45 | }, 46 | toolcall: { 47 | name: 'toolcall', 48 | title: 'Tool Call Parser', 49 | items: [ 50 | { id: 'disabled', label: 'Disabled', default: true }, 51 | { id: 'enabled', label: 'Enabled', default: false } 52 | ], 53 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser deepseekv3 \\\n --chat-template examples/chat_template/tool_chat_template_deepseekr1.jinja' : null 54 | } 55 | }, 56 | 57 | generateCommand: function (values) { 58 | const { hardware, quantization, strategy } = values; 59 | 60 | const strategyArray = Array.isArray(strategy) ? strategy : []; 61 | 62 | // Validation checks 63 | // Check H100 compatibility - H100 only supports FP8 64 | if (hardware === 'h100' && quantization === 'fp4') { 65 | return '# Error: H100 only supports FP8 quantization\n# Please select FP8 quantization or use B200 hardware'; 66 | } 67 | 68 | // Model path based on quantization 69 | let modelPath = ''; 70 | if (quantization === 'fp8') { 71 | modelPath = 'deepseek-ai/DeepSeek-R1-0528'; 72 | } else if (quantization === 'fp4') { 73 | modelPath = 'nvidia/DeepSeek-R1-0528-FP4-v2'; 74 | } 75 | 76 | let cmd = 'python3 -m sglang.launch_server \\\n'; 77 | cmd += ` --model-path ${modelPath}`; 78 | 79 | // TP strategy 80 | if (strategyArray.includes('tp')) { 81 | cmd += ` \\\n --tp 8`; 82 | } 83 | 84 | // DP strategy 85 | if (strategyArray.includes('dp')) { 86 | cmd += ` \\\n --dp 8 \\\n --enable-dp-attention`; 87 | } 88 | 89 | // EP strategy 90 | if (strategyArray.includes('ep')) { 91 | cmd += ` \\\n --ep 8`; 92 | } 93 | 94 | // MTP strategy 95 | if (strategyArray.includes('mtp')) { 96 | cmd = 'SGLANG_ENABLE_SPEC_V2=1 ' + cmd; 97 | cmd += ` \\\n --speculative-algorithm EAGLE \\\n --speculative-num-steps 3 \\\n --speculative-eagle-topk 1 \\\n --speculative-num-draft-tokens 4`; 98 | } 99 | 100 | cmd += ` \\\n --enable-symm-mem # Optional: improves performance, but may be unstable`; 101 | 102 | if (hardware === 'b200') { 103 | cmd += ` \\\n --kv-cache-dtype fp8_e4m3 # Optional: enables fp8 kv cache and fp8 attention kernels to improve performance`; 104 | } 105 | 106 | // Add thinking parser and tool call parser if enabled 107 | for (const [key, option] of Object.entries(this.options)) { 108 | if (option.commandRule) { 109 | const rule = option.commandRule(values[key]); 110 | if (rule) { 111 | cmd += ` \\\n ${rule}`; 112 | } 113 | } 114 | } 115 | 116 | return cmd; 117 | } 118 | }; 119 | 120 | return ; 121 | }; 122 | 123 | export default DeepSeekR1ConfigGenerator; 124 | 125 | -------------------------------------------------------------------------------- /src/components/GLM46ConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ConfigGenerator from '../ConfigGenerator'; 3 | 4 | /** 5 | * GLM-4.6 Configuration Generator 6 | * Supports GLM-4.6 model deployment 
configuration 7 | */ 8 | const GLM46ConfigGenerator = () => { 9 | const config = { 10 | modelFamily: 'zai-org', 11 | 12 | options: { 13 | hardware: { 14 | name: 'hardware', 15 | title: 'Hardware Platform', 16 | items: [ 17 | { id: 'h100', label: 'H100', default: true }, 18 | { id: 'h200', label: 'H200', default: false }, 19 | { id: 'b200', label: 'B200', default: false } 20 | ] 21 | }, 22 | quantization: { 23 | name: 'quantization', 24 | title: 'Quantization', 25 | items: [ 26 | { id: 'bf16', label: 'BF16', default: true }, 27 | { id: 'fp8', label: 'FP8', default: false } 28 | ] 29 | }, 30 | strategy: { 31 | name: 'strategy', 32 | title: 'Deployment Strategy', 33 | type: 'checkbox', 34 | items: [ 35 | { id: 'tp', label: 'TP', subtitle: 'Tensor Parallel', default: true, required: true }, 36 | { id: 'dp', label: 'DP', subtitle: 'Data Parallel', default: false }, 37 | { id: 'ep', label: 'EP', subtitle: 'Expert Parallel', default: false }, 38 | { id: 'mtp', label: 'MTP', subtitle: 'Multi-token Prediction', default: false } 39 | ] 40 | }, 41 | thinking: { 42 | name: 'thinking', 43 | title: 'Thinking Capabilities', 44 | items: [ 45 | { id: 'disabled', label: 'Disabled', default: true }, 46 | { id: 'enabled', label: 'Enabled', default: false } 47 | ], 48 | commandRule: (value) => value === 'enabled' ? '--reasoning-parser glm45' : null 49 | }, 50 | toolcall: { 51 | name: 'toolcall', 52 | title: 'Tool Call Parser', 53 | items: [ 54 | { id: 'disabled', label: 'Disabled', default: true }, 55 | { id: 'enabled', label: 'Enabled', default: false } 56 | ], 57 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser glm45' : null 58 | } 59 | }, 60 | 61 | specialCommands: { 62 | 'h100-bf16-tp': '# Error: GLM-4.6 in BF16 precision requires more VRAM than 8*H100\n# Please use H200/B200 or select FP8 quantization', 63 | 'h100-bf16-dp': '# Error: GLM-4.6 in BF16 precision requires more VRAM than 8*H100\n# Please use H200/B200 or select FP8 quantization', 64 | 'h100-bf16-ep': '# Error: GLM-4.6 in BF16 precision requires more VRAM than 8*H100\n# Please use H200/B200 or select FP8 quantization', 65 | 'h100-bf16-mtp': '# Error: GLM-4.6 in BF16 precision requires more VRAM than 8*H100\n# Please use H200/B200 or select FP8 quantization' 66 | }, 67 | 68 | generateCommand: function (values) { 69 | const { hardware, quantization, strategy, thinking, toolcall } = values; 70 | 71 | // Check for H100 + BF16 error 72 | const strategyArray = Array.isArray(strategy) ? strategy : []; 73 | if (hardware === 'h100' && quantization === 'bf16') { 74 | return '# Error: GLM-4.6 in BF16 precision requires more VRAM than 8*H100\n# Please use H200/B200 or select FP8 quantization'; 75 | } 76 | 77 | const modelSuffix = quantization === 'fp8' ? 
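// FP8 checkpoints are published under a "-FP8" repo suffix; BF16 uses the bare model name.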
'-FP8' : ''; 78 | const modelName = `${this.modelFamily}/GLM-4.6${modelSuffix}`; 79 | 80 | let cmd = 'python -m sglang.launch_server \\\n'; 81 | cmd += ` --model ${modelName}`; 82 | 83 | // TP is mandatory 84 | cmd += ` \\\n --tp 8`; 85 | 86 | // Strategy-specific parameters 87 | if (strategyArray.includes('dp')) { 88 | cmd += ` \\\n --dp 8 \\\n --enable-dp-attention`; 89 | } 90 | if (strategyArray.includes('ep')) { 91 | cmd += ` \\\n --ep 8`; 92 | } 93 | if (strategyArray.includes('mtp')) { 94 | cmd = 'SGLANG_ENABLE_SPEC_V2=1 ' + cmd; 95 | cmd += ` \\\n --speculative-algorithm EAGLE \\\n --speculative-num-steps 3 \\\n --speculative-eagle-topk 1 \\\n --speculative-num-draft-tokens 4`; 96 | } 97 | 98 | // Add tool call parser if enabled 99 | if (toolcall === 'enabled') { 100 | cmd += ` \\\n --tool-call-parser glm45`; 101 | } 102 | 103 | // Add thinking parser if enabled 104 | if (thinking === 'enabled') { 105 | cmd += ` \\\n --reasoning-parser glm45`; 106 | } 107 | 108 | return cmd; 109 | } 110 | }; 111 | 112 | return ; 113 | }; 114 | 115 | export default GLM46ConfigGenerator; 116 | 117 | -------------------------------------------------------------------------------- /docs/intro.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 1 3 | --- 4 | 5 | # SGLang Cookbook 6 | 7 | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 8 | [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/sgl-project/sgl-cookbook/pulls) 9 | 10 | A community-maintained repository of practical guides and recipes for deploying and using SGLang in production environments. Our mission is simple: answer the question **"How do I use SGLang (and related models) on hardware Y for task Z?"** with clear, actionable solutions. 11 | 12 | ## 🎯 What You'll Find Here 13 | 14 | This cookbook aggregates battle-tested SGLang recipes covering: 15 | 16 | - **Models**: Mainstream LLMs and Vision-Language Models (VLMs) 17 | - **Use Cases**: Inference serving, deployment strategies, multimodal applications 18 | - **Hardware**: GPU and CPU configurations, optimization for different accelerators 19 | - **Best Practices**: Configuration templates, performance tuning, troubleshooting guides 20 | 21 | Each recipe provides step-by-step instructions to help you quickly implement SGLang solutions for your specific requirements. 
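In practice, most recipes boil down to a single `sglang.launch_server` invocation tuned for a specific model, hardware platform, and parallelism setup. As a rough illustration of the shape of such a command (the exact model path and flags come from the individual guides below):

```shell
python3 -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3.2 \
  --tp 8 \
  --reasoning-parser deepseek-v3
```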
22 | 23 | ## Guides 24 | 25 | ### DeepSeek 26 | 27 | - [x] [DeepSeek-V3.2](/docs/DeepSeek/DeepSeek-V3_2) 28 | - [ ] [DeepSeek-V3.1](./DeepSeek/DeepSeek-V3_1) 29 | - [ ] [DeepSeek-V3](./DeepSeek/DeepSeek-V3) 30 | - [x] [DeepSeek-R1](/docs/DeepSeek/DeepSeek-R1) 31 | 32 | ### Ernie 33 | 34 | - [ ] [Ernie4.5](./Ernie/Ernie4.5) 35 | - [ ] [Ernie4.5-VL](./Ernie/Ernie4.5-VL) 36 | 37 | ### GLM 38 | 39 | - [ ] [Glyph](./GLM/Glyph) 40 | - [ ] [GLM-4.5V](./GLM/GLM-4.5V) 41 | - [x] [GLM-4.6](./GLM/GLM-4.6) 42 | - [x] [GLM-4.6V](./GLM/GLM-4.6V) 43 | 44 | ### InternVL 45 | 46 | - [ ] [InternVL3.5](./InternVL/InternVL3_5) 47 | 48 | ### InternLM 49 | 50 | - [ ] [Intern-S1](./InternLM/Intern-S1) 51 | 52 | ### Jina AI 53 | 54 | - [ ] [Jina-reranker-m0](./Jina/Jina-reranker-m0) 55 | 56 | ### Llama 57 | 58 | - [ ] [Llama4-Scout](./Llama/Llama4-Scout) 59 | - [ ] [Llama3.3-70B](./Llama/Llama3.3-70B) 60 | - [ ] [Llama3.1](./Llama/Llama3.1) 61 | 62 | ### MiniMax 63 | 64 | - [ ] [MiniMax-M2](./MiniMax/MiniMax-M2) 65 | 66 | ### Mistral 67 | 68 | - [x] [Devstral 2](./Mistral/Devstral-2) 69 | - [ ] [Mistral-3](./Mistral/Mistral-3) 70 | 71 | ### OpenAI 72 | 73 | - [ ] [gpt-oss](./OpenAI/GPT-OSS) 74 | 75 | ### Qwen 76 | 77 | - [x] [Qwen3](./Qwen/Qwen3) 78 | - [x] [Qwen3-VL](./Qwen/Qwen3-VL) 79 | - [x] [Qwen3-Next](./Qwen/Qwen3-Next) 80 | - [ ] [Qwen3-Coder-480B-A35B](./Qwen/Qwen3-Coder-480B-A35B) 81 | - [ ] [Qwen2.5-VL](./Qwen/Qwen2.5-VL) 82 | 83 | ### Moonshotai 84 | 85 | - [x] [Kimi-K2](./Moonshotai/Kimi-K2) 86 | - [ ] [Kimi-Linear](./Moonshotai/Kimi-Linear) 87 | 88 | ### NVIDIA 89 | 90 | - [x] [Nemotron-Nano-3-30B-A3B](./NVIDIA/Nemotron3-Nano) 91 | 92 | ## 🚀 Quick Start 93 | 94 | 1. Browse the recipe index above to find your model 95 | 2. Follow the step-by-step instructions in each guide 96 | 3. Adapt configurations to your specific hardware and requirements 97 | 4. Join our community to share feedback and improvements 98 | 99 | ## 🤝 Contributing 100 | 101 | We believe the best documentation comes from practitioners. Whether you've optimized SGLang for a specific model, solved a tricky deployment challenge, or discovered performance improvements, we encourage you to contribute your recipes! 102 | 103 | **Ways to contribute:** 104 | 105 | - Add a new recipe for a model not yet covered 106 | - Improve existing recipes with additional tips or configurations 107 | - Report issues or suggest enhancements 108 | - Share your production deployment experiences 109 | 110 | **To contribute:** 111 | 112 | ```shell 113 | # Fork the repo and clone locally 114 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git 115 | cd sglang-cookbook 116 | 117 | # Create a new branch 118 | git checkout -b add-my-recipe 119 | 120 | # Add your recipe following the template in DeepSeek-V3.2 121 | # Submit a PR! 122 | ``` 123 | 124 | ## 🛠️ Local Development 125 | 126 | ### Prerequisites 127 | 128 | - Node.js >= 20.0 129 | - npm or yarn 130 | 131 | ### Setup and Run 132 | 133 | Install dependencies and start the development server: 134 | 135 | ```shell 136 | # Install dependencies 137 | npm install 138 | 139 | # Start development server (hot reload enabled) 140 | npm start 141 | ``` 142 | 143 | The site will automatically open in your browser at `http://localhost:3000`. 
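To sanity-check a production build locally, the standard Docusaurus commands apply:

```shell
# Build the static site into build/
npm run build

# Preview the production build
npm run serve
```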
144 | 145 | ## 📖 Resources 146 | 147 | - [SGLang GitHub](https://github.com/sgl-project/sglang) 148 | - [SGLang Documentation](https://sgl-project.github.io) 149 | - [Community Slack/Discord](https://discord.gg/MpEEuAeb) 150 | 151 | ## 📄 License 152 | 153 | This project is licensed under the Apache License 2.0 - see the [LICENSE](https://github.com/sgl-project/sgl-cookbook/blob/main/LICENSE) file for details. 154 | 155 | --- 156 | 157 | **Let's build this resource together!** 🚀 Star the repo and contribute your recipes to help the SGLang community grow. 158 | -------------------------------------------------------------------------------- /src/components/DeepSeekConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ConfigGenerator from '../ConfigGenerator'; 3 | 4 | /** 5 | * DeepSeek V3.2 Configuration Generator 6 | * Supports DeepSeek-V3.2, V3.2-Speciale, and V3.2-Exp models 7 | */ 8 | const DeepSeekConfigGenerator = () => { 9 | const config = { 10 | modelFamily: 'deepseek-ai', 11 | 12 | options: { 13 | hardware: { 14 | name: 'hardware', 15 | title: 'Hardware Platform', 16 | items: [ 17 | { id: 'h200', label: 'H200', default: true }, 18 | { id: 'b200', label: 'B200', default: false } 19 | ] 20 | }, 21 | modelname: { 22 | name: 'modelname', 23 | title: 'Model Name', 24 | items: [ 25 | { id: 'v32', label: 'DeepSeek-V3.2', default: true }, 26 | { id: 'v32speciale', label: 'DeepSeek-V3.2-Speciale', default: false }, 27 | { id: 'v32exp', label: 'DeepSeek-V3.2-Exp', default: false } 28 | ] 29 | }, 30 | strategy: { 31 | name: 'strategy', 32 | title: 'Deployment Strategy', 33 | type: 'checkbox', 34 | items: [ 35 | { id: 'tp', label: 'TP', default: true, required: true }, 36 | { id: 'dp', label: 'DP attention', default: false }, 37 | { id: 'ep', label: 'EP', default: false }, 38 | { id: 'mtp', label: 'Multi-token Prediction', default: false } 39 | ] 40 | }, 41 | reasoningParser: { 42 | name: 'reasoningParser', 43 | title: 'Reasoning Parser', 44 | items: [ 45 | { id: 'disabled', label: 'Disabled', default: true }, 46 | { id: 'enabled', label: 'Enabled', default: false } 47 | ], 48 | commandRule: (value) => value === 'enabled' ? '--reasoning-parser deepseek-v3' : null 49 | }, 50 | toolcall: { 51 | name: 'toolcall', 52 | title: 'Tool Call Parser', 53 | items: [ 54 | { id: 'disabled', label: 'Disabled', default: true }, 55 | { id: 'enabled', label: 'Enabled', default: false } 56 | ] 57 | } 58 | }, 59 | 60 | generateCommand: function(values) { 61 | const { hardware, modelname, strategy, reasoningParser, toolcall } = values; 62 | 63 | // Validation: DeepSeek-V3.2-Speciale doesn't support tool calling 64 | if (modelname === 'v32speciale' && toolcall === 'enabled') { 65 | return `# Error: DeepSeek-V3.2-Speciale doesn't support tool calling\n# Please select "Disabled" for Tool Call Parser or choose a different model`; 66 | } 67 | 68 | // Model name mapping 69 | const modelMap = { 70 | 'v32': 'DeepSeek-V3.2', 71 | 'v32exp': 'DeepSeek-V3.2-Exp', 72 | 'v32speciale': 'DeepSeek-V3.2-Speciale' 73 | }; 74 | 75 | const modelName = `${this.modelFamily}/${modelMap[modelname]}`; 76 | 77 | let cmd = 'python3 -m sglang.launch_server \\\n'; 78 | cmd += ` --model-path ${modelName}`; 79 | 80 | // Strategy configurations 81 | const strategyArray = Array.isArray(strategy) ? 
strategy : []; 82 | // TP is mandatory 83 | cmd += ` \\\n --tp 8`; 84 | if (strategyArray.includes('dp')) { 85 | cmd += ` \\\n --dp 8 \\\n --enable-dp-attention`; 86 | } 87 | if (strategyArray.includes('ep')) { 88 | cmd += ` \\\n --ep 8`; 89 | } 90 | // Multi-token prediction (MTP) configuration 91 | if (strategyArray.includes('mtp')) { 92 | cmd += ` \\\n --speculative-algorithm EAGLE \\\n --speculative-num-steps 3 \\\n --speculative-eagle-topk 1 \\\n --speculative-num-draft-tokens 4`; 93 | } 94 | 95 | // Add tool-call-parser if enabled (not supported for Speciale) 96 | if (toolcall === 'enabled' && modelname !== 'v32speciale') { 97 | if (modelname === 'v32exp') { 98 | cmd += ` \\\n --tool-call-parser deepseekv31`; 99 | } else if (modelname === 'v32') { 100 | cmd += ` \\\n --tool-call-parser deepseekv32`; 101 | } 102 | } 103 | 104 | // Add reasoning-parser when enabled 105 | if (reasoningParser === 'enabled') { 106 | cmd += ` \\\n --reasoning-parser deepseek-v3`; 107 | } 108 | 109 | // Add chat-template if tool calling is enabled (only for v32exp) 110 | if (toolcall === 'enabled' && modelname === 'v32exp') { 111 | cmd += ` \\\n --chat-template ./examples/chat_template/tool_chat_template_deepseekv32.jinja`; 112 | } 113 | 114 | return cmd; 115 | } 116 | }; 117 | 118 | return ; 119 | }; 120 | 121 | export default DeepSeekConfigGenerator; 122 | 123 | -------------------------------------------------------------------------------- /src/components/GPTOSSConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ConfigGenerator from '../ConfigGenerator'; 3 | 4 | /** 5 | * GPT-OSS Configuration Generator 6 | * Supports GPT-OSS 120B and 20B models with speculative decoding 7 | */ 8 | const GPTOSSConfigGenerator = () => { 9 | const config = { 10 | modelFamily: 'GPT-OSS', 11 | 12 | options: { 13 | hardware: { 14 | name: 'hardware', 15 | title: 'Hardware Platform', 16 | items: [ 17 | { id: 'b200', label: 'B200', default: true }, 18 | { id: 'h200', label: 'H200', default: false }, 19 | { id: 'h100', label: 'H100', default: false } 20 | ] 21 | }, 22 | modelsize: { 23 | name: 'modelsize', 24 | title: 'Model Size', 25 | items: [ 26 | { id: '120b', label: '120B', subtitle: 'MOE', default: true }, 27 | { id: '20b', label: '20B', subtitle: 'MOE', default: false }, 28 | ] 29 | }, 30 | quantization: { 31 | name: 'quantization', 32 | title: 'Quantization', 33 | items: [ 34 | { id: 'mxfp4', label: 'MXFP4', default: true }, 35 | { id: 'bf16', label: 'BF16', default: false } 36 | ] 37 | }, 38 | reasoningParser: { 39 | name: 'reasoningParser', 40 | title: 'Reasoning Parser', 41 | items: [ 42 | { id: 'disabled', label: 'Disabled', default: true }, 43 | { id: 'enabled', label: 'Enabled', default: false } 44 | ], 45 | commandRule: (value) => value === 'enabled' ? '--reasoning-parser gpt-oss' : null 46 | }, 47 | toolcall: { 48 | name: 'toolcall', 49 | title: 'Tool Call Parser', 50 | items: [ 51 | { id: 'disabled', label: 'Disabled', default: true }, 52 | { id: 'enabled', label: 'Enabled', default: false } 53 | ], 54 | commandRule: (value) => value === 'enabled' ? 
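// gpt-oss uses its own tool-call parser id, mirroring the dedicated reasoning parser above.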
'--tool-call-parser gpt-oss' : null 55 | }, 56 | speculative: { 57 | name: 'speculative', 58 | title: 'Speculative Decoding', 59 | items: [ 60 | { id: 'disabled', label: 'Disabled', default: true }, 61 | { id: 'enabled', label: 'Enabled', default: false } 62 | ], 63 | commandRule: (value, allValues) => { 64 | if (value !== 'enabled') return null; 65 | 66 | let cmd = '--speculative-algorithm EAGLE3 \\\n --speculative-num-steps 3 \\\n --speculative-eagle-topk 1 \\\n --speculative-num-draft-tokens 4'; 67 | 68 | if (allValues.modelsize === '120b') { 69 | cmd += ' \\\n --speculative-draft-model-path nvidia/gpt-oss-120b-Eagle3'; 70 | } else if (allValues.modelsize === '20b') { 71 | cmd += ' \\\n --speculative-draft-model-path zhuyksir/EAGLE3-gpt-oss-20b-bf16'; 72 | } 73 | 74 | return cmd; 75 | } 76 | } 77 | }, 78 | 79 | modelConfigs: { 80 | '120b': { 81 | baseName: '120b', 82 | isMOE: true, 83 | h100: { tp: 8, ep: 0, mxfp4: true, bf16: false }, 84 | h200: { tp: 8, ep: 0, mxfp4: true, bf16: false }, 85 | b200: { tp: 8, ep: 0, mxfp4: true, bf16: false } 86 | }, 87 | '20b': { 88 | baseName: '20b', 89 | isMOE: true, 90 | h100: { tp: 1, ep: 0, mxfp4: true, bf16: false }, 91 | h200: { tp: 1, ep: 0, mxfp4: true, bf16: false }, 92 | b200: { tp: 1, ep: 0, mxfp4: true, bf16: false } 93 | } 94 | }, 95 | 96 | generateCommand: function (values) { 97 | const { hardware, modelsize: modelSize, quantization, reasoningParser } = values; 98 | const commandKey = `${hardware}-${modelSize}-${quantization}-${reasoningParser}`; 99 | 100 | const config = this.modelConfigs[modelSize]; 101 | if (!config) { 102 | return `# Error: Unknown model size: ${modelSize}`; 103 | } 104 | 105 | const hwConfig = config[hardware]; 106 | if (!hwConfig) { 107 | return `# Error: Unknown hardware platform: ${hardware}`; 108 | } 109 | 110 | const quantSuffix = quantization === 'bf16' ? '-bf16' : ''; 111 | const orgPrefix = quantization === 'bf16' ? 'lmsys' : 'openai'; 112 | const modelName = `${orgPrefix}/gpt-oss-${config.baseName}${quantSuffix}`; 113 | 114 | let cmd = ''; 115 | 116 | if (values.speculative === 'enabled') { 117 | cmd += 'SGLANG_ENABLE_SPEC_V2=1 SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 '; 118 | } 119 | 120 | cmd += 'python -m sglang.launch_server \\\n'; 121 | 122 | cmd += ` --model ${modelName}`; 123 | 124 | if (hwConfig.tp > 1) { 125 | cmd += ` \\\n --tp ${hwConfig.tp}`; 126 | } 127 | 128 | let ep = hwConfig.ep; 129 | 130 | if (ep > 0) { 131 | cmd += ` \\\n --ep ${ep}`; 132 | } 133 | 134 | for (const [key, option] of Object.entries(this.options)) { 135 | 136 | if (option.commandRule) { 137 | const rule = option.commandRule(values[key], values); 138 | 139 | if (rule) { 140 | cmd += ` \\\n ${rule}`; 141 | } 142 | } 143 | } 144 | 145 | return cmd; 146 | } 147 | }; 148 | 149 | return ; 150 | }; 151 | 152 | export default GPTOSSConfigGenerator; 153 | -------------------------------------------------------------------------------- /src/components/ConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React, { useState } from 'react'; 2 | import styles from './styles.module.css'; 3 | 4 | /** 5 | * Generic Configuration Generator Component 6 | * 7 | * @param {Object} config - Configuration object with the following structure: 8 | * - modelFamily: string (optional) 9 | * - options: object with option groups 10 | * - generateCommand: function(values) => string 11 | */ 12 | const ConfigGenerator = ({ config }) => { 13 | if (!config || !config.options) { 14 | return
Error: Invalid configuration provided
; 15 | } 16 | 17 | // Initialize state with default values 18 | const getInitialState = () => { 19 | const initialState = {}; 20 | Object.entries(config.options).forEach(([key, option]) => { 21 | if (option.type === 'checkbox') { 22 | initialState[key] = option.items 23 | .filter(item => item.default) 24 | .map(item => item.id); 25 | } else if (option.type === 'text') { 26 | initialState[key] = option.default || ''; 27 | } else { 28 | const defaultItem = option.items.find(item => item.default); 29 | initialState[key] = defaultItem ? defaultItem.id : option.items[0].id; 30 | } 31 | }); 32 | return initialState; 33 | }; 34 | 35 | const [values, setValues] = useState(getInitialState()); 36 | 37 | const handleRadioChange = (optionName, value) => { 38 | setValues(prev => ({ 39 | ...prev, 40 | [optionName]: value 41 | })); 42 | }; 43 | 44 | const handleCheckboxChange = (optionName, itemId, isChecked) => { 45 | setValues(prev => { 46 | const currentValues = prev[optionName] || []; 47 | if (isChecked) { 48 | return { 49 | ...prev, 50 | [optionName]: [...currentValues, itemId] 51 | }; 52 | } else { 53 | return { 54 | ...prev, 55 | [optionName]: currentValues.filter(id => id !== itemId) 56 | }; 57 | } 58 | }); 59 | }; 60 | 61 | const handleTextChange = (optionName, value) => { 62 | setValues(prev => ({ 63 | ...prev, 64 | [optionName]: value 65 | })); 66 | }; 67 | 68 | const command = config.generateCommand ? config.generateCommand(values) : ''; 69 | 70 | return ( 71 |
72 | {Object.entries(config.options).map(([key, option], index) => ( 73 |
74 |
75 | {index + 1} 76 | {option.title} 77 |
78 |
79 | {option.type === 'text' ? ( 80 | // Text input 81 | handleTextChange(option.name, e.target.value)} 86 | className={styles.textInput} 87 | /> 88 | ) : option.type === 'checkbox' ? ( 89 | // Checkbox items 90 | option.items.map(item => { 91 | const isChecked = (values[option.name] || []).includes(item.id); 92 | const isDisabled = item.required; 93 | return ( 94 | 110 | ); 111 | }) 112 | ) : ( 113 | // Radio items 114 | option.items.map(item => { 115 | const isChecked = values[option.name] === item.id; 116 | return ( 117 | 134 | ); 135 | }) 136 | )} 137 |
138 |
139 | ))} 140 | 141 |
142 |
Generated Command
143 |
{command}
144 |
145 |
146 | ); 147 | }; 148 | 149 | export default ConfigGenerator; 150 | 151 | -------------------------------------------------------------------------------- /docusaurus.config.js: -------------------------------------------------------------------------------- 1 | // @ts-check 2 | // `@type` JSDoc annotations allow editor autocompletion and type checking 3 | // (when paired with `@ts-check`). 4 | // There are various equivalent ways to declare your Docusaurus config. 5 | // See: https://docusaurus.io/docs/api/docusaurus-config 6 | 7 | import {themes as prismThemes} from 'prism-react-renderer'; 8 | 9 | // This runs in Node.js - Don't use client-side code here (browser APIs, JSX...) 10 | 11 | /** @type {import('@docusaurus/types').Config} */ 12 | const config = { 13 | title: 'SGLang Cookbook', 14 | favicon: 'img/favicon.png', 15 | 16 | // Future flags, see https://docusaurus.io/docs/api/docusaurus-config#future 17 | future: { 18 | v4: true, // Improve compatibility with the upcoming Docusaurus v4 19 | }, 20 | 21 | // Set the production url of your site here 22 | url: 'https://cookbook-sg-lang.vercel.app', 23 | // Set the // pathname under which your site is served 24 | // For GitHub pages deployment, it is often '//' 25 | baseUrl: '/', 26 | 27 | // GitHub pages deployment config. 28 | // If you aren't using GitHub pages, you don't need these. 29 | organizationName: 'sgl-project', // Usually your GitHub org/user name. 30 | projectName: 'sgl-cookbook', // Usually your repo name. 31 | 32 | onBrokenLinks: 'throw', 33 | 34 | // Even if you don't use internationalization, you can use this field to set 35 | // useful metadata like html lang. For example, if your site is Chinese, you 36 | // may want to replace "en" with "zh-Hans". 37 | i18n: { 38 | defaultLocale: 'en', 39 | locales: ['en'], 40 | }, 41 | 42 | presets: [ 43 | [ 44 | 'classic', 45 | /** @type {import('@docusaurus/preset-classic').Options} */ 46 | ({ 47 | docs: { 48 | sidebarPath: './sidebars.js', 49 | editUrl: 50 | 'https://github.com/sgl-project/sgl-cookbook/tree/main', 51 | }, 52 | // blog: { 53 | // showReadingTime: true, 54 | // feedOptions: { 55 | // type: ['rss', 'atom'], 56 | // xslt: true, 57 | // }, 58 | // // Please change this to your repo. 59 | // // Remove this to remove the "edit this page" links. 
60 | // editUrl: 61 | // 'https://github.com/facebook/docusaurus/tree/main/packages/create-docusaurus/templates/shared/', 62 | // // Useful options to enforce blogging best practices 63 | // onInlineTags: 'warn', 64 | // onInlineAuthors: 'warn', 65 | // onUntruncatedBlogPosts: 'warn', 66 | // }, 67 | theme: { 68 | customCss: './src/css/custom.css', 69 | }, 70 | }), 71 | ], 72 | ], 73 | headTags: [ 74 | { 75 | tagName: 'meta', 76 | attributes: { 77 | name: 'algolia-site-verification', 78 | content: 'B137E28CCDDFD715', 79 | }, 80 | }, 81 | ], 82 | themeConfig: 83 | /** @type {import('@docusaurus/preset-classic').ThemeConfig} */ 84 | ({ 85 | // Replace with your project's social card 86 | // image: 'img/docusaurus-social-card.jpg', 87 | // colorMode: { 88 | // respectPrefersColorScheme: true, 89 | // }, 90 | navbar: { 91 | title: 'SGLang Cookbook', 92 | logo: { 93 | alt: 'SGLang Cookbook Logo', 94 | src: 'img/logo.png', 95 | }, 96 | items: [ 97 | // { 98 | // type: 'docSidebar', 99 | // sidebarId: 'tutorialSidebar', 100 | // position: 'left', 101 | // label: 'Tutorial', 102 | // }, 103 | // {to: '/blog', label: 'Blog', position: 'left'}, 104 | { 105 | href: 'https://github.com/sgl-project/sgl-cookbook', 106 | label: 'GitHub', 107 | position: 'right', 108 | }, 109 | ], 110 | }, 111 | footer: { 112 | style: 'dark', 113 | // links: [ 114 | // { 115 | // title: 'Docs', 116 | // items: [ 117 | // { 118 | // label: 'Tutorial', 119 | // to: '/docs/intro', 120 | // }, 121 | // ], 122 | // }, 123 | // { 124 | // title: 'Community', 125 | // items: [ 126 | // { 127 | // label: 'Stack Overflow', 128 | // href: 'https://stackoverflow.com/questions/tagged/docusaurus', 129 | // }, 130 | // { 131 | // label: 'Discord', 132 | // href: 'https://discordapp.com/invite/docusaurus', 133 | // }, 134 | // { 135 | // label: 'X', 136 | // href: 'https://x.com/docusaurus', 137 | // }, 138 | // ], 139 | // }, 140 | // { 141 | // title: 'More', 142 | // items: [ 143 | // { 144 | // label: 'Blog', 145 | // to: '/blog', 146 | // }, 147 | // { 148 | // label: 'GitHub', 149 | // href: 'https://github.com/facebook/docusaurus', 150 | // }, 151 | // ], 152 | // }, 153 | // ], 154 | copyright: `Copyright © ${new Date().getFullYear()} SGLang Team.`, 155 | }, 156 | prism: { 157 | theme: prismThemes.github, 158 | darkTheme: prismThemes.dracula, 159 | }, 160 | algolia: { 161 | appId: '5PDGY21FSS', 162 | apiKey: '58c29a0ac6c2759e581d630b54e57564', 163 | indexName: 'sgl-cookbook', 164 | }, 165 | }), 166 | }; 167 | 168 | export default config; 169 | -------------------------------------------------------------------------------- /src/components/Qwen3VLConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ConfigGenerator from '../ConfigGenerator'; 3 | 4 | /** 5 | * Qwen3-VL Configuration Generator 6 | * Supports multiple Qwen3-VL model sizes (235B, 30B, 32B, 8B, 4B, 2B) 7 | */ 8 | const Qwen3VLConfigGenerator = () => { 9 | const config = { 10 | modelFamily: 'Qwen', 11 | 12 | options: { 13 | hardware: { 14 | name: 'hardware', 15 | title: 'Hardware Platform', 16 | items: [ 17 | { id: 'b200', label: 'B200', default: true }, 18 | { id: 'h100', label: 'H100', default: false }, 19 | { id: 'h200', label: 'H200', default: false } 20 | ] 21 | }, 22 | modelsize: { 23 | name: 'modelsize', 24 | title: 'Model Size', 25 | items: [ 26 | { id: '235b', label: '235B', subtitle: 'MOE', default: true }, 27 | { id: '30b', label: '30B', subtitle: 'MOE', default: 
false }, 28 | { id: '32b', label: '32B', subtitle: 'Dense', default: false }, 29 | { id: '8b', label: '8B', subtitle: 'Dense', default: false }, 30 | { id: '4b', label: '4B', subtitle: 'Dense', default: false }, 31 | { id: '2b', label: '2B', subtitle: 'Dense', default: false } 32 | ] 33 | }, 34 | quantization: { 35 | name: 'quantization', 36 | title: 'Quantization', 37 | items: [ 38 | { id: 'bf16', label: 'BF16', default: true }, 39 | { id: 'fp8', label: 'FP8', default: false } 40 | ] 41 | }, 42 | thinking: { 43 | name: 'thinking', 44 | title: 'Thinking Capabilities', 45 | items: [ 46 | { id: 'instruct', label: 'Instruct', default: true }, 47 | { id: 'thinking', label: 'Thinking', default: false } 48 | ], 49 | commandRule: (value) => value === 'thinking' ? '--reasoning-parser qwen3' : null 50 | }, 51 | toolcall: { 52 | name: 'toolcall', 53 | title: 'Tool Call Parser', 54 | items: [ 55 | { id: 'disabled', label: 'Disabled', default: true }, 56 | { id: 'enabled', label: 'Enabled', default: false } 57 | ], 58 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser qwen' : null 59 | } 60 | }, 61 | 62 | modelConfigs: { 63 | '235b': { 64 | baseName: '235B-A22B', 65 | isMOE: true, 66 | h100: { tp: 8, ep: 0, bf16: true, fp8: true }, 67 | h200: { tp: 8, ep: 0, bf16: true, fp8: true }, 68 | b200: { tp: 8, ep: 0, bf16: true, fp8: true } 69 | }, 70 | '30b': { 71 | baseName: '30B-A3B', 72 | isMOE: true, 73 | h100: { tp: 1, ep: 0, bf16: true, fp8: true }, 74 | h200: { tp: 1, ep: 0, bf16: true, fp8: true }, 75 | b200: { tp: 1, ep: 0, bf16: true, fp8: true } 76 | }, 77 | '32b': { 78 | baseName: '32B', 79 | isMOE: false, 80 | h100: { tp: 1, ep: 0, bf16: true, fp8: true }, 81 | h200: { tp: 1, ep: 0, bf16: true, fp8: true }, 82 | b200: { tp: 1, ep: 0, bf16: true, fp8: true } 83 | }, 84 | '8b': { 85 | baseName: '8B', 86 | isMOE: false, 87 | h100: { tp: 1, ep: 0, bf16: true, fp8: true }, 88 | h200: { tp: 1, ep: 0, bf16: true, fp8: true }, 89 | b200: { tp: 1, ep: 0, bf16: true, fp8: true } 90 | }, 91 | '4b': { 92 | baseName: '4B', 93 | isMOE: false, 94 | h100: { tp: 1, ep: 0, bf16: true, fp8: true }, 95 | h200: { tp: 1, ep: 0, bf16: true, fp8: true }, 96 | b200: { tp: 1, ep: 0, bf16: true, fp8: true } 97 | }, 98 | '2b': { 99 | baseName: '2B', 100 | isMOE: false, 101 | h100: { tp: 1, ep: 0, bf16: true, fp8: true }, 102 | h200: { tp: 1, ep: 0, bf16: true, fp8: true }, 103 | b200: { tp: 1, ep: 0, bf16: true, fp8: true } 104 | } 105 | }, 106 | 107 | specialCommands: { 108 | 'h100-235b-bf16-instruct': '# Error: Model is too large, cannot fit into 8*H100\n# Please use H200 (141GB) or select FP8 quantization', 109 | 'h100-235b-bf16-thinking': '# Error: Model is too large, cannot fit into 8*H100\n# Please use H200 (141GB) or select FP8 quantization' 110 | }, 111 | 112 | generateCommand: function (values) { 113 | const { hardware, modelsize: modelSize, quantization, thinking } = values; 114 | const commandKey = `${hardware}-${modelSize}-${quantization}-${thinking}`; 115 | 116 | if (this.specialCommands[commandKey]) { 117 | return this.specialCommands[commandKey]; 118 | } 119 | 120 | const config = this.modelConfigs[modelSize]; 121 | if (!config) { 122 | return `# Error: Unknown model size: ${modelSize}`; 123 | } 124 | 125 | const hwConfig = config[hardware]; 126 | if (!hwConfig) { 127 | return `# Error: Unknown hardware platform: ${hardware}`; 128 | } 129 | 130 | const quantSuffix = quantization === 'fp8' ? '-FP8' : ''; 131 | const thinkingSuffix = thinking === 'thinking' ? 
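// Qwen3-VL ships separate -Instruct and -Thinking checkpoints; this suffix selects which repo is pulled.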
'-Thinking' : '-Instruct'; 132 | const modelName = `Qwen/Qwen3-VL-${config.baseName}${thinkingSuffix}${quantSuffix}`; 133 | 134 | let cmd = 'python -m sglang.launch_server \\\n'; 135 | cmd += ` --model ${modelName}`; 136 | 137 | if (hwConfig.tp > 1) { 138 | cmd += ` \\\n --tp ${hwConfig.tp}`; 139 | } 140 | 141 | let ep = hwConfig.ep; 142 | if (quantization === 'fp8' && hwConfig.tp === 8) { 143 | ep = 2; 144 | } 145 | 146 | if (ep > 0) { 147 | cmd += ` \\\n --ep ${ep}`; 148 | } 149 | 150 | for (const [key, option] of Object.entries(this.options)) { 151 | if (key === 'host' || key === 'port') continue; 152 | 153 | if (option.commandRule) { 154 | const rule = option.commandRule(values[key]); 155 | if (rule) { 156 | cmd += ` \\\n ${rule}`; 157 | } 158 | } 159 | } 160 | 161 | return cmd; 162 | } 163 | }; 164 | 165 | return ; 166 | }; 167 | 168 | export default Qwen3VLConfigGenerator; 169 | 170 | -------------------------------------------------------------------------------- /src/components/ConfigGenerator/QUICKSTART.md: -------------------------------------------------------------------------------- 1 | # Quick Start Guide: Creating a New Config Generator 2 | 3 | This guide shows you how to quickly create a new configuration generator for any model. 4 | 5 | ## Step 1: Create Your Component File 6 | 7 | Create a new file: `src/components/YourModelConfigGenerator/index.js` 8 | 9 | ```jsx 10 | import React from 'react'; 11 | import ConfigGenerator from '../ConfigGenerator'; 12 | 13 | const YourModelConfigGenerator = () => { 14 | const config = { 15 | options: { 16 | // Add your options here 17 | }, 18 | generateCommand: function(values) { 19 | // Add your command generation logic here 20 | return 'your-command'; 21 | } 22 | }; 23 | 24 | return ; 25 | }; 26 | 27 | export default YourModelConfigGenerator; 28 | ``` 29 | 30 | ## Step 2: Define Your Options 31 | 32 | Add configuration options based on your needs: 33 | 34 | ### Radio Button Options (Single Choice) 35 | 36 | ```javascript 37 | hardware: { 38 | name: 'hardware', // Internal identifier 39 | title: 'Hardware Platform', // Display title 40 | items: [ 41 | { id: 'gpu_a', label: 'GPU A', default: true }, // Default selected 42 | { id: 'gpu_b', label: 'GPU B', default: false } 43 | ] 44 | } 45 | ``` 46 | 47 | ### Checkbox Options (Multiple Choice) 48 | 49 | ```javascript 50 | features: { 51 | name: 'features', 52 | title: 'Features', 53 | type: 'checkbox', // Important: specify type 54 | items: [ 55 | { id: 'feature1', label: 'Feature 1', default: true }, 56 | { id: 'feature2', label: 'Feature 2', default: false }, 57 | { id: 'feature3', label: 'Feature 3', default: false, required: true } // Can't be unchecked 58 | ] 59 | } 60 | ``` 61 | 62 | ### Text Input Options 63 | 64 | ```javascript 65 | modelPath: { 66 | name: 'modelPath', 67 | title: 'Model Path', 68 | type: 'text', // Important: specify type 69 | default: 'path/to/model', 70 | placeholder: 'Enter model path...' 
71 | } 72 | ``` 73 | 74 | ## Step 3: Implement Command Generation 75 | 76 | Write the logic to generate commands based on user selections: 77 | 78 | ```javascript 79 | generateCommand: function(values) { 80 | // Extract values 81 | const { hardware, features, modelPath } = values; 82 | 83 | // Start building command 84 | let cmd = 'python3 -m sglang.launch_server'; 85 | cmd += ` --model ${modelPath}`; 86 | 87 | // Handle radio button (single value) 88 | if (hardware === 'gpu_a') { 89 | cmd += ' --device-type gpu_a'; 90 | } else if (hardware === 'gpu_b') { 91 | cmd += ' --device-type gpu_b'; 92 | } 93 | 94 | // Handle checkboxes (array of values) 95 | const featureArray = Array.isArray(features) ? features : []; 96 | if (featureArray.includes('feature1')) { 97 | cmd += ' --enable-feature1'; 98 | } 99 | if (featureArray.includes('feature2')) { 100 | cmd += ' --enable-feature2'; 101 | } 102 | 103 | return cmd; 104 | } 105 | ``` 106 | 107 | ## Step 4: Use in Markdown 108 | 109 | In your `.md` or `.mdx` file: 110 | 111 | ```mdx 112 | --- 113 | title: Your Model Documentation 114 | --- 115 | 116 | import YourModelConfigGenerator from '@site/src/components/YourModelConfigGenerator'; 117 | 118 | # Your Model 119 | 120 | ## Deployment Configuration 121 | 122 | 123 | ``` 124 | 125 | ## Complete Example 126 | 127 | Here's a complete, working example: 128 | 129 | ```jsx 130 | import React from 'react'; 131 | import ConfigGenerator from '../ConfigGenerator'; 132 | 133 | const ExampleConfigGenerator = () => { 134 | const config = { 135 | options: { 136 | hardware: { 137 | name: 'hardware', 138 | title: 'Hardware Platform', 139 | items: [ 140 | { id: 'h100', label: 'H100', default: true }, 141 | { id: 'a100', label: 'A100', default: false } 142 | ] 143 | }, 144 | quantization: { 145 | name: 'quantization', 146 | title: 'Quantization', 147 | items: [ 148 | { id: 'fp16', label: 'FP16', default: true }, 149 | { id: 'int8', label: 'INT8', default: false }, 150 | { id: 'int4', label: 'INT4', default: false } 151 | ] 152 | }, 153 | parallelism: { 154 | name: 'parallelism', 155 | title: 'Parallelism Strategy', 156 | type: 'checkbox', 157 | items: [ 158 | { id: 'tp', label: 'Tensor Parallel', subtitle: 'TP', default: true, required: true }, 159 | { id: 'dp', label: 'Data Parallel', subtitle: 'DP', default: false }, 160 | { id: 'pp', label: 'Pipeline Parallel', subtitle: 'PP', default: false } 161 | ] 162 | }, 163 | modelPath: { 164 | name: 'modelPath', 165 | title: 'Model Path', 166 | type: 'text', 167 | default: 'org/model-name', 168 | placeholder: 'Enter model path from Hugging Face...' 169 | } 170 | }, 171 | 172 | generateCommand: function(values) { 173 | const { hardware, quantization, parallelism, modelPath } = values; 174 | const parallelismArray = Array.isArray(parallelism) ? 
parallelism : []; 175 | 176 | // Validation example 177 | if (hardware === 'a100' && quantization === 'int4') { 178 | return '# Error: A100 does not support INT4 quantization\n' + 179 | '# Please choose FP16 or INT8, or use H100 hardware'; 180 | } 181 | 182 | // Build command 183 | let cmd = 'python3 -m sglang.launch_server \\\n'; 184 | cmd += ` --model-path ${modelPath}`; 185 | 186 | // Add quantization 187 | if (quantization !== 'fp16') { 188 | cmd += ` \\\n --quantization ${quantization}`; 189 | } 190 | 191 | // Add parallelism strategies 192 | if (parallelismArray.includes('tp')) { 193 | cmd += ' \\\n --tp 8'; 194 | } 195 | if (parallelismArray.includes('dp')) { 196 | cmd += ' \\\n --dp 4'; 197 | } 198 | if (parallelismArray.includes('pp')) { 199 | cmd += ' \\\n --pp 2'; 200 | } 201 | 202 | // Hardware-specific options 203 | if (hardware === 'h100') { 204 | cmd += ' \\\n --enable-h100-optimizations'; 205 | } 206 | 207 | return cmd; 208 | } 209 | }; 210 | 211 | return ; 212 | }; 213 | 214 | export default ExampleConfigGenerator; 215 | ``` 216 | 217 | ## Tips 218 | 219 | 1. **Keep it simple**: Start with basic radio buttons, add complexity as needed 220 | 2. **Test thoroughly**: Try all combinations to ensure correct commands 221 | 3. **Add validation**: Check for incompatible option combinations 222 | 4. **Use subtitles**: Add helpful context with the `subtitle` property 223 | 5. **Multi-line commands**: Use `\\\n` for readable multi-line output 224 | 6. **Error messages**: Return clear error messages with solutions 225 | 226 | ## Next Steps 227 | 228 | - See [README.md](./README.md) for detailed API documentation 229 | - Check [DeepSeekR1ConfigGenerator](../DeepSeekR1ConfigGenerator/index.js) for a real-world example 230 | - Customize styles in `styles.module.css` if needed 231 | 232 | ## Common Patterns 233 | 234 | ### Conditional Options Based on Previous Selection 235 | 236 | ```javascript 237 | generateCommand: function(values) { 238 | const { hardware, quantization } = values; 239 | 240 | // Only add EP for specific hardware 241 | if (hardware === 'b200' && values.parallelism.includes('ep')) { 242 | cmd += ' --ep 8'; 243 | } 244 | } 245 | ``` 246 | 247 | ### Model Path Mapping 248 | 249 | ```javascript 250 | const modelMap = { 251 | 'small': 'org/model-7b', 252 | 'medium': 'org/model-13b', 253 | 'large': 'org/model-70b' 254 | }; 255 | const modelPath = modelMap[values.modelSize]; 256 | ``` 257 | 258 | ### Complex Validation 259 | 260 | ```javascript 261 | generateCommand: function(values) { 262 | // Multiple validation checks 263 | const errors = []; 264 | 265 | if (values.hardware === 'a100' && values.quantization === 'int4') { 266 | errors.push('A100 does not support INT4'); 267 | } 268 | 269 | if (values.batchSize > 128 && !values.features.includes('optimization')) { 270 | errors.push('Large batch sizes require optimization enabled'); 271 | } 272 | 273 | if (errors.length > 0) { 274 | return '# Errors:\n' + errors.map(e => `# - ${e}`).join('\n'); 275 | } 276 | 277 | // ... normal command generation 278 | } 279 | ``` 280 | 281 | -------------------------------------------------------------------------------- /docs/Mistral/Devstral-2.md: -------------------------------------------------------------------------------- 1 | # Devstral 2 (Mistral) 2 | 3 | ## 1. Model Introduction 4 | 5 | **Devstral 2** is an agentic LLM family for software engineering tasks. 
It is designed for agentic workflows such as tool use, codebase exploration, and multi-file edits, and achieves strong performance on **SWE-bench**. 6 | 7 | The **Devstral 2 Instruct** checkpoints are instruction-tuned **FP8** models, making them a good fit for chat, tool-using agents, and instruction-following SWE workloads. 8 | 9 | **Key Features:** 10 | 11 | - **Agentic coding**: Optimized for tool-driven coding and software engineering agents 12 | - **Improved performance**: A step up compared to earlier Devstral models 13 | - **Better generalization**: More robust across diverse prompts and coding environments 14 | - **Long context**: Up to a **256K** context window 15 | 16 | **Use Cases:** 17 | AI code assistants, agentic coding, and software engineering tasks that require deep codebase understanding and tool integration. 18 | 19 | For enterprises requiring specialized capabilities (increased context, domain-specific knowledge, etc.), please reach out to Mistral. 20 | 21 | **Models:** 22 | 23 | - **Collection**: [mistralai/devstral-2 (Hugging Face)](https://huggingface.co/collections/mistralai/devstral-2) 24 | - **FP8 Instruct**: 25 | - **[mistralai/Devstral-2-123B-Instruct-2512](https://huggingface.co/mistralai/Devstral-2-123B-Instruct-2512)** 26 | - **[mistralai/Devstral-Small-2-24B-Instruct-2512](https://huggingface.co/mistralai/Devstral-Small-2-24B-Instruct-2512)** 27 | 28 | --- 29 | 30 | ## 2. SGLang Installation 31 | 32 | SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements. 33 | 34 | Please refer to the [official SGLang installation guide](https://docs.sglang.ai/get_started/install.html) for installation instructions. 35 | 36 | :::caution Transformers version requirement 37 | Devstral 2 requires a recent `transformers`. Please verify `transformers >= 5.0.0.rc`: 38 | 39 | ```shell 40 | python -c "import transformers; print(transformers.__version__)" 41 | ``` 42 | 43 | If your version is lower, upgrade: 44 | 45 | ```shell 46 | pip install -U --pre "transformers>=5.0.0rc0" 47 | ``` 48 | ::: 49 | 50 | --- 51 | 52 | ## 3. Model Deployment 53 | 54 | ### 3.1 Basic configuration 55 | 56 | **Interactive Command Generator**: Use the configuration selector below to generate a launch command for Devstral Small 2 (24B) or Devstral 2 (123B). 57 | 58 | :::note 59 | The TP size is set to the minimum required for the selected model size. 60 | ::: 61 | 62 | import Devstral2ConfigGenerator from '@site/src/components/Devstral2ConfigGenerator'; 63 | 64 | 65 | 66 | ### 3.2 Configuration tips 67 | 68 | - **Context length vs memory**: Devstral 2 advertises a long context window; if you are memory-constrained, start by lowering `--context-length` (for example `32768`) and increase once things are stable. 69 | - **FP8 checkpoints**: Both Devstral Small 2 and Devstral 2 are published as **FP8** weights. If you hit kernel / dtype issues, try a newer SGLang build and recent CUDA drivers. 70 | 71 | --- 72 | 73 | ## 4. Model Invocation 74 | 75 | ### 4.1 Basic Usage (OpenAI-Compatible API) 76 | 77 | SGLang exposes an OpenAI-compatible endpoint. 
Example: 78 | 79 | ```python 80 | from openai import OpenAI 81 | 82 | client = OpenAI( 83 | base_url="http://localhost:30000/v1", 84 | api_key="EMPTY", 85 | ) 86 | 87 | resp = client.chat.completions.create( 88 | model="mistralai/Devstral-Small-2-24B-Instruct-2512", 89 | messages=[ 90 | {"role": "system", "content": "You are a helpful coding assistant."}, 91 | {"role": "user", "content": "Write a Python function that retries a request with exponential backoff."}, 92 | ], 93 | temperature=0.2, 94 | max_tokens=512, 95 | ) 96 | 97 | print(resp.choices[0].message.content) 98 | ``` 99 | 100 | **Output Example:** 101 | 102 | ``` 103 | Here's a Python function that implements exponential backoff for retrying a request. This function uses the `requests` library to make HTTP requests and includes error handling for common HTTP and connection errors. 104 | 105 | ```python 106 | import time 107 | import requests 108 | from requests.exceptions import RequestException 109 | 110 | def retry_with_exponential_backoff( 111 | url, 112 | max_retries=3, 113 | initial_delay=1, 114 | backoff_factor=2, 115 | method="GET", 116 | **kwargs 117 | ): 118 | """ 119 | Retry a request with exponential backoff. 120 | 121 | Parameters: 122 | - url: The URL to request. 123 | - max_retries: Maximum number of retry attempts (default: 3). 124 | - initial_delay: Initial delay in seconds (default: 1). 125 | - backoff_factor: Multiplier for the delay between retries (default: 2). 126 | - method: HTTP method to use (default: "GET"). 127 | - **kwargs: Additional arguments to pass to the request function (e.g., headers, data, etc.). 128 | 129 | Returns: 130 | - Response object if the request succeeds. 131 | - Raises an exception if all retries fail. 132 | """ 133 | retry_count = 0 134 | delay = initial_delay 135 | 136 | while retry_count < max_retries: 137 | try: 138 | response = requests.request(method, url, **kwargs) 139 | # Check if the response status code indicates success 140 | if response.status_code < 400: 141 | return response 142 | else: 143 | raise RequestException(f"HTTP {response.status_code}: {response.text}") 144 | 145 | except RequestException as e: 146 | if retry_count == max_retries - 1: 147 | raise Exception(f"All retries failed. Last error: {e}") 148 | 149 | print(f"Attempt {retry_count + 1} failed. Retrying in {delay} seconds...") 150 | time.sleep(delay) 151 | ... 152 | ``` 153 | 154 | ### 4.2 Tool calling (optional) 155 | 156 | Devstral 2 supports tool calling capabilities. 
Enable the tool call parser: 157 | 158 | ```shell 159 | python -m sglang.launch_server \ 160 | --model mistralai/Devstral-2-123B-Instruct-2512 \ 161 | --tp 2 \ 162 | --tool-call-parser mistral 163 | ``` 164 | 165 | **Python Example (with Thinking Process):** 166 | 167 | ```python 168 | from openai import OpenAI 169 | 170 | client = OpenAI( 171 | base_url="http://localhost:30000/v1", 172 | api_key="EMPTY" 173 | ) 174 | 175 | # Define available tools 176 | tools = [ 177 | { 178 | "type": "function", 179 | "function": { 180 | "name": "get_weather", 181 | "description": "Get the current weather for a location", 182 | "parameters": { 183 | "type": "object", 184 | "properties": { 185 | "location": { 186 | "type": "string", 187 | "description": "The city name" 188 | }, 189 | "unit": { 190 | "type": "string", 191 | "enum": ["celsius", "fahrenheit"], 192 | "description": "Temperature unit" 193 | } 194 | }, 195 | "required": ["location"] 196 | } 197 | } 198 | } 199 | ] 200 | 201 | # Make request with streaming to see thinking process 202 | response = client.chat.completions.create( 203 | model="mistralai/Devstral-2-123B-Instruct-2512", 204 | messages=[ 205 | {"role": "user", "content": "What's the weather in Beijing?"} 206 | ], 207 | tools=tools, 208 | temperature=0.7, 209 | stream=True 210 | ) 211 | 212 | # Process streaming response 213 | thinking_started = False 214 | has_thinking = False 215 | tool_calls_accumulator = {} 216 | 217 | for chunk in response: 218 | if chunk.choices and len(chunk.choices) > 0: 219 | delta = chunk.choices[0].delta 220 | 221 | # Accumulate tool calls 222 | if hasattr(delta, 'tool_calls') and delta.tool_calls: 223 | # Close thinking section if needed 224 | if has_thinking and thinking_started: 225 | print("\n=============== Content =================\n", flush=True) 226 | thinking_started = False 227 | 228 | for tool_call in delta.tool_calls: 229 | index = tool_call.index 230 | if index not in tool_calls_accumulator: 231 | tool_calls_accumulator[index] = { 232 | 'name': None, 233 | 'arguments': '' 234 | } 235 | 236 | if tool_call.function: 237 | if tool_call.function.name: 238 | tool_calls_accumulator[index]['name'] = tool_call.function.name 239 | if tool_call.function.arguments: 240 | tool_calls_accumulator[index]['arguments'] += tool_call.function.arguments 241 | 242 | # Print content 243 | if delta.content: 244 | print(delta.content, end="", flush=True) 245 | 246 | # Print accumulated tool calls 247 | for index, tool_call in sorted(tool_calls_accumulator.items()): 248 | print(f"🔧 Tool Call: {tool_call['name']}") 249 | print(f" Arguments: {tool_call['arguments']}") 250 | 251 | print() 252 | ``` 253 | 254 | **Output Example:** 255 | 256 | ``` 257 | 🔧 Tool Call: get_weather 258 | Arguments: {"location": "Beijing"} 259 | ``` 260 | -------------------------------------------------------------------------------- /src/components/ConfigGenerator/README.md: -------------------------------------------------------------------------------- 1 | # ConfigGenerator Component 2 | 3 | A reusable, generic configuration generator component for creating interactive command builders in documentation. 
4 | 5 | ## Features 6 | 7 | - **Flexible Configuration**: Support for radio buttons, checkboxes, and text inputs 8 | - **Real-time Command Generation**: Automatically updates command output based on selections 9 | - **Custom Validation**: Add custom validation rules and error messages 10 | - **Theme Support**: Works with light and dark modes using Docusaurus CSS variables 11 | - **Responsive Design**: Mobile-friendly layout 12 | 13 | ## Usage 14 | 15 | ### 1. Create a Wrapper Component 16 | 17 | Create a new component that imports `ConfigGenerator` and provides your custom configuration: 18 | 19 | ```jsx 20 | import React from 'react'; 21 | import ConfigGenerator from '../ConfigGenerator'; 22 | 23 | const MyModelConfigGenerator = () => { 24 | const config = { 25 | // Optional: Model family identifier 26 | modelFamily: 'my-model-family', 27 | 28 | // Define your configuration options 29 | options: { 30 | hardware: { 31 | name: 'hardware', 32 | title: 'Hardware Platform', 33 | items: [ 34 | { id: 'h100', label: 'H100', default: true }, 35 | { id: 'h200', label: 'H200', default: false } 36 | ] 37 | }, 38 | // ... more options 39 | }, 40 | 41 | // Command generation function 42 | generateCommand: function(values) { 43 | const { hardware } = values; 44 | let cmd = 'python3 -m sglang.launch_server'; 45 | 46 | if (hardware === 'h100') { 47 | cmd += ' --gpu-type h100'; 48 | } 49 | 50 | return cmd; 51 | } 52 | }; 53 | 54 | return ; 55 | }; 56 | 57 | export default MyModelConfigGenerator; 58 | ``` 59 | 60 | ### 2. Use in Markdown/MDX Files 61 | 62 | Import and use your wrapper component in any `.md` or `.mdx` file: 63 | 64 | ```mdx 65 | --- 66 | title: My Model 67 | --- 68 | 69 | import MyModelConfigGenerator from '@site/src/components/MyModelConfigGenerator'; 70 | 71 | # Model Deployment 72 | 73 | Use the interactive configuration generator below: 74 | 75 | 76 | ``` 77 | 78 | ## Configuration Object Structure 79 | 80 | ### Basic Structure 81 | 82 | ```javascript 83 | const config = { 84 | modelFamily: 'optional-model-family', // Optional 85 | options: { 86 | // Option definitions (see below) 87 | }, 88 | generateCommand: function(values) { 89 | // Command generation logic 90 | return 'generated-command-string'; 91 | } 92 | }; 93 | ``` 94 | 95 | ### Option Types 96 | 97 | #### 1. Radio Button (Single Selection) 98 | 99 | Default behavior when `type` is not specified: 100 | 101 | ```javascript 102 | optionName: { 103 | name: 'optionName', 104 | title: 'Display Title', 105 | items: [ 106 | { id: 'choice1', label: 'Choice 1', default: true }, 107 | { id: 'choice2', label: 'Choice 2', default: false }, 108 | { id: 'choice3', label: 'Choice 3', subtitle: 'Additional info', default: false } 109 | ] 110 | } 111 | ``` 112 | 113 | **Properties:** 114 | - `name`: Internal identifier (string) 115 | - `title`: Display title (string) 116 | - `items`: Array of choices 117 | - `id`: Unique identifier (string) 118 | - `label`: Display label (string) 119 | - `subtitle`: Optional subtitle text (string) 120 | - `default`: Whether this is the default selection (boolean) 121 | 122 | #### 2. 
Checkbox (Multiple Selection) 123 | 124 | Set `type: 'checkbox'`: 125 | 126 | ```javascript 127 | optionName: { 128 | name: 'optionName', 129 | title: 'Display Title', 130 | type: 'checkbox', 131 | items: [ 132 | { id: 'option1', label: 'Option 1', default: true }, 133 | { id: 'option2', label: 'Option 2', default: false, required: true }, 134 | { id: 'option3', label: 'Option 3', subtitle: 'Additional info', default: false } 135 | ] 136 | } 137 | ``` 138 | 139 | **Additional Properties:** 140 | - `required`: If `true`, prevents the user from unchecking this option (boolean) 141 | 142 | **Note:** In `generateCommand`, checkbox values are returned as an array: 143 | ```javascript 144 | generateCommand: function(values) { 145 | const strategies = values.optionName; // e.g., ['option1', 'option2'] 146 | if (strategies.includes('option1')) { 147 | // ... 148 | } 149 | } 150 | ``` 151 | 152 | #### 3. Text Input 153 | 154 | Set `type: 'text'`: 155 | 156 | ```javascript 157 | optionName: { 158 | name: 'optionName', 159 | title: 'Display Title', 160 | type: 'text', 161 | default: 'default value', 162 | placeholder: 'Enter value...' 163 | } 164 | ``` 165 | 166 | **Properties:** 167 | - `default`: Default text value (string) 168 | - `placeholder`: Placeholder text (string) 169 | 170 | ### Command Generation Function 171 | 172 | The `generateCommand` function receives a `values` object containing all user selections: 173 | 174 | ```javascript 175 | generateCommand: function(values) { 176 | const { hardware, quantization, strategy } = values; 177 | 178 | // For radio buttons: string value 179 | if (hardware === 'h100') { 180 | // ... 181 | } 182 | 183 | // For checkboxes: array of strings 184 | const strategyArray = Array.isArray(strategy) ? strategy : []; 185 | if (strategyArray.includes('tp')) { 186 | // ... 
187 | } 188 | 189 | // For text inputs: string value 190 | const modelPath = values.modelName || ''; 191 | 192 | // Build and return command string 193 | let cmd = 'python3 -m sglang.launch_server'; 194 | cmd += ` --model ${modelPath}`; 195 | 196 | return cmd; 197 | } 198 | ``` 199 | 200 | **Tips:** 201 | - Use multi-line strings with `\\n` for readable output 202 | - Add validation checks and return error messages when needed 203 | - Use template literals for cleaner string building 204 | 205 | ## Examples 206 | 207 | ### Example 1: Simple Configuration 208 | 209 | ```javascript 210 | const config = { 211 | options: { 212 | model: { 213 | name: 'model', 214 | title: 'Model Selection', 215 | items: [ 216 | { id: 'small', label: 'Small (7B)', default: true }, 217 | { id: 'medium', label: 'Medium (13B)', default: false }, 218 | { id: 'large', label: 'Large (70B)', default: false } 219 | ] 220 | } 221 | }, 222 | generateCommand: function(values) { 223 | const modelSizes = { small: '7B', medium: '13B', large: '70B' }; 224 | return `python3 -m sglang.launch_server --model my-model-${modelSizes[values.model]}`; 225 | } 226 | }; 227 | ``` 228 | 229 | ### Example 2: With Validation 230 | 231 | ```javascript 232 | const config = { 233 | options: { 234 | hardware: { 235 | name: 'hardware', 236 | title: 'Hardware', 237 | items: [ 238 | { id: 'cpu', label: 'CPU', default: true }, 239 | { id: 'gpu', label: 'GPU', default: false } 240 | ] 241 | }, 242 | precision: { 243 | name: 'precision', 244 | title: 'Precision', 245 | items: [ 246 | { id: 'fp32', label: 'FP32', default: true }, 247 | { id: 'fp16', label: 'FP16', default: false } 248 | ] 249 | } 250 | }, 251 | generateCommand: function(values) { 252 | // Validation 253 | if (values.hardware === 'cpu' && values.precision === 'fp16') { 254 | return '# Error: FP16 is not supported on CPU\n# Please select FP32 or use GPU'; 255 | } 256 | 257 | let cmd = 'python3 -m sglang.launch_server'; 258 | cmd += ` --device ${values.hardware}`; 259 | cmd += ` --precision ${values.precision}`; 260 | return cmd; 261 | } 262 | }; 263 | ``` 264 | 265 | ### Example 3: Complex Configuration with Checkboxes 266 | 267 | ```javascript 268 | const config = { 269 | options: { 270 | model: { 271 | name: 'model', 272 | title: 'Model', 273 | items: [ 274 | { id: 'model-a', label: 'Model A', default: true }, 275 | { id: 'model-b', label: 'Model B', default: false } 276 | ] 277 | }, 278 | features: { 279 | name: 'features', 280 | title: 'Features', 281 | type: 'checkbox', 282 | items: [ 283 | { id: 'cache', label: 'Enable Cache', default: true, required: true }, 284 | { id: 'logging', label: 'Enable Logging', default: false }, 285 | { id: 'profiling', label: 'Enable Profiling', default: false } 286 | ] 287 | }, 288 | batchSize: { 289 | name: 'batchSize', 290 | title: 'Batch Size', 291 | type: 'text', 292 | default: '32', 293 | placeholder: 'Enter batch size' 294 | } 295 | }, 296 | generateCommand: function(values) { 297 | const { model, features, batchSize } = values; 298 | const featureArray = Array.isArray(features) ? 
features : []; 299 | 300 | let cmd = `python3 -m sglang.launch_server --model ${model}`; 301 | cmd += ` --batch-size ${batchSize}`; 302 | 303 | if (featureArray.includes('cache')) { 304 | cmd += ' --enable-cache'; 305 | } 306 | if (featureArray.includes('logging')) { 307 | cmd += ' --enable-logging'; 308 | } 309 | if (featureArray.includes('profiling')) { 310 | cmd += ' --enable-profiling'; 311 | } 312 | 313 | return cmd; 314 | } 315 | }; 316 | ``` 317 | 318 | ## Styling 319 | 320 | The component uses CSS modules with Docusaurus CSS variables for theme compatibility. The styles automatically adapt to light and dark modes. 321 | 322 | To customize the appearance, you can: 323 | 324 | 1. Modify `/src/components/ConfigGenerator/styles.module.css` 325 | 2. Override CSS variables in your custom CSS 326 | 3. Use inline styles in your wrapper component (not recommended) 327 | 328 | ## Real-World Example 329 | 330 | See `/src/components/DeepSeekR1ConfigGenerator/index.js` for a complete, production-ready example with: 331 | - Multiple option types (radio, checkbox) 332 | - Complex validation logic 333 | - Conditional command generation 334 | - Hardware-specific optimizations 335 | 336 | ## Best Practices 337 | 338 | 1. **Clear Labels**: Use descriptive labels and subtitles 339 | 2. **Sensible Defaults**: Set appropriate default values 340 | 3. **Validation**: Add validation for incompatible options 341 | 4. **Error Messages**: Provide clear error messages with solutions 342 | 5. **Documentation**: Add comments explaining complex logic 343 | 6. **Testing**: Test all combinations to ensure correct output 344 | 345 | ## Support 346 | 347 | For issues or questions, please open an issue in the repository. 348 | 349 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /docs/NVIDIA/Nemotron3-Nano.md: -------------------------------------------------------------------------------- 1 | # NVIDIA Nemotron3-Nano 2 | 3 | ## 1. Model Introduction 4 | 5 | `NVIDIA Nemotron3-Nano` is a 30B-parameter hybrid LLM that mixes Mixture-of-Experts (MoE) feed-forward layers, Mamba2 sequence-modeling layers, and standard self-attention layers in a single stack rather than classic “attention + MLP” transformer blocks. 
6 | 7 | The BF16 variant (`nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16`) is designed as a high-fidelity reference model, while the FP8 variant (`nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8`) targets optimized inference performance on modern NVIDIA GPUs. 8 | 9 | At a high level: 10 | 11 | - **Hybrid layer stack (Mamba2 + MoE + attention):** The network is composed of interleaved layers that are *either* Mamba2, *or* MoE feed-forward, *or* attention-only. 12 | - **Non-uniform layer ordering:** The order and mix of these specialized layers is not a simple, rigid pattern, enabling the model to trade off sequence modeling, routing capacity, and expressivity across depth. 13 | - **Deployment-friendly precision:** Use BF16 for accuracy-sensitive and evaluation workloads; use FP8 for latency- and throughput-critical serving on recent NVIDIA GPUs. 14 | 15 | --- 16 | 17 | ## 2. SGLang Installation 18 | 19 | SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements. 20 | 21 | For a quick start, please install the nightly wheel for SGLang: 22 | ```bash 23 | pip install sglang==0.5.6.post2.dev7852+g8102e36b5 --extra-index-url https://sgl-project.github.io/whl/nightly/ 24 | ``` 25 | --- 26 | 27 | ## 3. Model Deployment 28 | 29 | This section provides a progressive guide from quick deployment to performance tuning. 30 | 31 | ### 3.1 Basic Configuration 32 | 33 | **Interactive Command Generator**: select hardware, model variant, and common knobs to generate a launch command. 34 | 35 | import NemotronNano3ConfigGenerator from '@site/src/components/NemotronConfigGenerator'; 36 | 37 | 38 | 39 | ### 3.2 Configuration Tips 40 | 41 | 42 | - **Attention backend**: 43 | 44 | **H200/B200**: use flashinfer attention backend by default. 45 | 46 | - **TP support**: 47 | 48 | To set tp size, use `--tp <1|2|4|8>`. 49 | 50 | - **FP8 KV cache**: 51 | 52 | To enable fp8 kv cache, please append `--kv-cache-dtype fp8_e4m3`. 53 | 54 | --- 55 | 56 | ## 4. Model Invocation 57 | 58 | ### 4.1 Basic Usage (OpenAI-Compatible API) 59 | 60 | SGLang provides an OpenAI-compatible endpoint. 
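Before wiring up the Python client, you can quickly confirm that the server is reachable. A minimal check — assuming the server uses SGLang's default port 30000, as in the examples below:

```bash
# List the models served by the OpenAI-compatible endpoint;
# the returned model ID is the value passed as `model` in the requests below.
curl http://localhost:30000/v1/models
```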
Example with the OpenAI Python client: 61 | 62 | ```python 63 | from openai import OpenAI 64 | 65 | client = OpenAI( 66 | base_url="http://localhost:30000/v1", 67 | api_key="EMPTY", 68 | ) 69 | 70 | resp = client.chat.completions.create( 71 | model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8", 72 | messages=[ 73 | {"role": "system", "content": "You are a helpful assistant."}, 74 | {"role": "user", "content": "Summarize what MoE models are in 5 bullets."}, 75 | ], 76 | temperature=0.7, 77 | max_tokens=256, 78 | ) 79 | 80 | print(resp.choices[0].message.content) 81 | 82 | ``` 83 | 84 | Streaming chat completion 85 | ```python 86 | from openai import OpenAI 87 | 88 | client = OpenAI( 89 | base_url="http://localhost:30000/v1", 90 | api_key="EMPTY", 91 | ) 92 | 93 | stream = client.chat.completions.create( 94 | model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8", 95 | messages=[ 96 | {"role": "system", "content": "You are a helpful AI assistant."}, 97 | {"role": "user", "content": "What are the first 5 prime numbers?"} 98 | ], 99 | temperature=0.7, 100 | max_tokens=1024, 101 | stream=True, 102 | ) 103 | for chunk in stream: 104 | delta = chunk.choices[0].delta 105 | if delta and delta.content: 106 | print(delta.content, end="", flush=True) 107 | ``` 108 | 109 | ### 4.2 Reasoning 110 | To enable reasoning, `--reasoning-parser nano_v3` should be appended to the launching command. The model supports two modes - Reasoning ON (default) vs OFF. This can be toggled by setting enable_thinking to False, as shown below. 111 | 112 | ```python 113 | from openai import OpenAI 114 | 115 | client = OpenAI( 116 | base_url="http://localhost:30000/v1", 117 | api_key="EMPTY", 118 | ) 119 | 120 | # Reasoning on (default) 121 | print("Reasoning on") 122 | resp = client.chat.completions.create( 123 | model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8", 124 | messages=[ 125 | {"role": "system", "content": "You are a helpful assistant."}, 126 | {"role": "user", "content": "Write a haiku about GPUs."} 127 | ], 128 | temperature=0.7, 129 | max_tokens=512, 130 | ) 131 | print(resp.choices[0].message.reasoning_content) 132 | 133 | # Reasoning off 134 | print("Reasoning off") 135 | resp = client.chat.completions.create( 136 | model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8", 137 | messages=[ 138 | {"role": "system", "content": "You are a helpful assistant."}, 139 | {"role": "user", "content": "Write a haiku about GPUs."} 140 | ], 141 | temperature=0.6, 142 | max_tokens=256, 143 | extra_body={"chat_template_kwargs": {"enable_thinking": False}} 144 | ) 145 | print(resp.choices[0].message.reasoning_content) 146 | 147 | ``` 148 | 149 | ### 4.3 Tool calling 150 | To enable reasoning, `--tool-call-parser qwen3_coder` should be appended to the launching command. Call functions using the OpenAI Tools schema and inspect returned tool_calls. 
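Putting the flags together, a server launched with both the reasoning parser and the tool-call parser enabled might look like the following — a sketch composed from the flags shown above; adjust the model path, parallelism, and port to match your deployment:

```shell
python3 -m sglang.launch_server \
    --model-path nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 \
    --trust-remote-code \
    --reasoning-parser nano_v3 \
    --tool-call-parser qwen3_coder \
    --host 0.0.0.0 \
    --port 30000
```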
151 | 152 | ```python 153 | from openai import OpenAI 154 | 155 | client = OpenAI( 156 | base_url="http://localhost:30000/v1", 157 | api_key="EMPTY", 158 | ) 159 | 160 | # Tool calling via OpenAI tools schema 161 | TOOLS = [ 162 | { 163 | "type": "function", 164 | "function": { 165 | "name": "calculate_tip", 166 | "parameters": { 167 | "type": "object", 168 | "properties": { 169 | "bill_total": { 170 | "type": "integer", 171 | "description": "The total amount of the bill" 172 | }, 173 | "tip_percentage": { 174 | "type": "integer", 175 | "description": "The percentage of tip to be applied" 176 | } 177 | }, 178 | "required": ["bill_total", "tip_percentage"] 179 | } 180 | } 181 | } 182 | ] 183 | 184 | completion = client.chat.completions.create( 185 | model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8", 186 | messages=[ 187 | {"role": "system", "content": ""}, 188 | {"role": "user", "content": "My bill is $50. What will be the amount for 15% tip?"} 189 | ], 190 | tools=TOOLS, 191 | temperature=0.6, 192 | top_p=0.95, 193 | max_tokens=512, 194 | stream=False 195 | ) 196 | 197 | print(completion.choices[0].message.reasoning_content) 198 | print(completion.choices[0].message.tool_calls) 199 | ``` 200 | 201 | --- 202 | 203 | ## 5. Benchmark 204 | 205 | ### 5.1 Speed Benchmark 206 | 207 | **Test Environment:** 208 | 209 | - Hardware: NVIDIA B200 GPU 210 | 211 | **FP8 variant** 212 | 213 | - Model Deployment Command: 214 | 215 | ```shell 216 | python3 -m sglang.launch_server \ 217 | --model-path nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 \ 218 | --trust-remote-code \ 219 | --max-running-requests 1024 \ 220 | --host 0.0.0.0 \ 221 | --port 30000 222 | ``` 223 | 224 | - Benchmark Command: 225 | 226 | ```shell 227 | python3 -m sglang.bench_serving \ 228 | --backend sglang \ 229 | --host 127.0.0.1 \ 230 | --port 30000 \ 231 | --model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 \ 232 | --dataset-name random \ 233 | --random-input-len 1024 \ 234 | --random-output-len 1024 \ 235 | --num-prompts 4096 \ 236 | --max-concurrency 256 237 | ``` 238 | 239 | - **Test Results:** 240 | 241 | ``` 242 | ============ Serving Benchmark Result ============ 243 | Backend: sglang 244 | Traffic request rate: inf 245 | Max request concurrency: 256 246 | Successful requests: 4096 247 | Benchmark duration (s): 183.18 248 | Total input tokens: 2081726 249 | Total input text tokens: 2081726 250 | Total input vision tokens: 0 251 | Total generated tokens: 2116125 252 | Total generated tokens (retokenized): 1076256 253 | Request throughput (req/s): 22.36 254 | Input token throughput (tok/s): 11364.25 255 | Output token throughput (tok/s): 11552.04 256 | Peak output token throughput (tok/s): 24692.00 257 | Peak concurrent requests: 294 258 | Total token throughput (tok/s): 22916.30 259 | Concurrency: 251.19 260 | ----------------End-to-End Latency---------------- 261 | Mean E2E Latency (ms): 11233.74 262 | Median E2E Latency (ms): 11142.97 263 | ---------------Time to First Token---------------- 264 | Mean TTFT (ms): 172.99 265 | Median TTFT (ms): 116.57 266 | P99 TTFT (ms): 1193.68 267 | -----Time per Output Token (excl. 
1st token)------ 268 | Mean TPOT (ms): 21.74 269 | Median TPOT (ms): 21.14 270 | P99 TPOT (ms): 41.12 271 | ---------------Inter-Token Latency---------------- 272 | Mean ITL (ms): 21.45 273 | Median ITL (ms): 9.06 274 | P95 ITL (ms): 62.59 275 | P99 ITL (ms): 110.83 276 | Max ITL (ms): 5368.19 277 | ================================================== 278 | ``` 279 | 280 | **BF16 variant** 281 | 282 | - Model Deployment Command: 283 | 284 | ```shell 285 | python3 -m sglang.launch_server \ 286 | --model-path nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 \ 287 | --trust-remote-code \ 288 | --max-running-requests 1024 \ 289 | --host 0.0.0.0 \ 290 | --port 30000 291 | ``` 292 | 293 | - Benchmark Command: 294 | 295 | ```shell 296 | python3 -m sglang.bench_serving \ 297 | --backend sglang \ 298 | --host 127.0.0.1 \ 299 | --port 30000 \ 300 | --model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 \ 301 | --dataset-name random \ 302 | --random-input-len 1024 \ 303 | --random-output-len 1024 \ 304 | --num-prompts 4096 \ 305 | --max-concurrency 256 306 | ``` 307 | 308 | - **Test Results:** 309 | 310 | ``` 311 | ============ Serving Benchmark Result ============ 312 | Backend: sglang 313 | Traffic request rate: inf 314 | Max request concurrency: 256 315 | Successful requests: 4096 316 | Benchmark duration (s): 360.22 317 | Total input tokens: 2081726 318 | Total input text tokens: 2081726 319 | Total input vision tokens: 0 320 | Total generated tokens: 2087288 321 | Total generated tokens (retokenized): 1940652 322 | Request throughput (req/s): 11.37 323 | Input token throughput (tok/s): 5779.10 324 | Output token throughput (tok/s): 5794.55 325 | Peak output token throughput (tok/s): 9169.00 326 | Peak concurrent requests: 276 327 | Total token throughput (tok/s): 11573.65 328 | Concurrency: 249.76 329 | ----------------End-to-End Latency---------------- 330 | Mean E2E Latency (ms): 21965.10 331 | Median E2E Latency (ms): 21706.35 332 | ---------------Time to First Token---------------- 333 | Mean TTFT (ms): 211.54 334 | Median TTFT (ms): 93.06 335 | P99 TTFT (ms): 2637.66 336 | -----Time per Output Token (excl. 
1st token)------ 337 | Mean TPOT (ms): 43.27 338 | Median TPOT (ms): 43.04 339 | P99 TPOT (ms): 61.15 340 | ---------------Inter-Token Latency---------------- 341 | Mean ITL (ms): 42.77 342 | Median ITL (ms): 28.46 343 | P95 ITL (ms): 71.85 344 | P99 ITL (ms): 113.20 345 | Max ITL (ms): 5237.28 346 | ================================================== 347 | 348 | ``` 349 | ### 5.2 Accuracy Benchmark 350 | 351 | 352 | #### 5.2.1 GSM8K Benchmark 353 | 354 | **Environment** 355 | - Hardware: NVIDIA B200 GPU 356 | - Model: BF16 checkpoint 357 | 358 | **Launch Model** 359 | ```bash 360 | python3 -m sglang.launch_server \ 361 | --model-path nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 \ 362 | --trust-remote-code \ 363 | --reasoning-parser nano_v3 364 | ``` 365 | 366 | **Run Benchmark with lm-eval** 367 | ```bash 368 | pip install lm-eval[api]==0.4.9.2 369 | 370 | lm_eval --model local-completions --tasks gsm8k --model_args "model=nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16,base_url=http://127.0.0.1:30000/v1/completions,num_concurrent=4,max_retries=3,tokenized_requests=False,max_lengths=16384" --gen_kwargs '{"chat_template_kwargs":{"thinking":true}}' --batch_size 256 371 | ``` 372 | 373 | **Test Results:** 374 | ``` 375 | |Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| 376 | |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| 377 | |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.5603|± |0.0137| 378 | | | |strict-match | 5|exact_match|↑ |0.8453|± |0.0100| 379 | ``` 380 | 381 | 382 | 383 | 384 | -------------------------------------------------------------------------------- /src/components/Qwen3ConfigGenerator/index.js: -------------------------------------------------------------------------------- 1 | import React, { useState, useMemo } from 'react'; 2 | import styles from '../ConfigGenerator/styles.module.css'; 3 | 4 | /** 5 | * Qwen3 Configuration Generator 6 | * Supports multiple Qwen3 model sizes (235B, 30B, 32B, 14B, 8B, 4B, 1.7B, 0.6B) 7 | * Custom implementation to handle model-specific logic without modifying ConfigGenerator 8 | */ 9 | const Qwen3ConfigGenerator = () => { 10 | const baseConfig = { 11 | modelFamily: 'Qwen', 12 | 13 | options: { 14 | hardware: { 15 | name: 'hardware', 16 | title: 'Hardware Platform', 17 | items: [ 18 | { id: 'b200', label: 'B200', default: true }, 19 | { id: 'h100', label: 'H100', default: false }, 20 | { id: 'h200', label: 'H200', default: false } 21 | ] 22 | }, 23 | modelsize: { 24 | name: 'modelsize', 25 | title: 'Model Size', 26 | items: [ 27 | { id: '235b', label: '235B', subtitle: 'MOE', default: true }, 28 | { id: '30b', label: '30B', subtitle: 'MOE', default: false }, 29 | { id: '32b', label: '32B', subtitle: 'Dense', default: false }, 30 | { id: '14b', label: '14B', subtitle: 'Dense', default: false }, 31 | { id: '8b', label: '8B', subtitle: 'Dense', default: false }, 32 | { id: '4b', label: '4B', subtitle: 'Dense', default: false }, 33 | { id: '1.7b', label: '1.7B', subtitle: 'Dense', default: false }, 34 | { id: '0.6b', label: '0.6B', subtitle: 'Dense', default: false } 35 | ] 36 | }, 37 | quantization: { 38 | name: 'quantization', 39 | title: 'Quantization', 40 | items: [ 41 | { id: 'bf16', label: 'BF16', default: true }, 42 | { id: 'fp8', label: 'FP8', default: false } 43 | ] 44 | }, 45 | category: { 46 | name: 'category', 47 | title: 'Categories', 48 | items: [ 49 | { id: 'base', label: 'Base', default: true }, 50 | { id: 'instruct', label: 'Instruct', default: false }, 51 | { id: 'thinking', label: 'Thinking', 
default: false } 52 | ] 53 | }, 54 | reasoningParser: { 55 | name: 'reasoningParser', 56 | title: 'Reasoning Parser', 57 | items: [ 58 | { id: 'disabled', label: 'Disabled', default: true }, 59 | { id: 'enabled', label: 'Enabled', default: false } 60 | ], 61 | // Only visible when category is not 'instruct' 62 | visibleWhen: (values) => values.category !== 'instruct', 63 | // Only add command when category is not 'instruct' and enabled 64 | commandRule: (value, values) => { 65 | if (value === 'enabled' && values.category !== 'instruct') { 66 | return '--reasoning-parser qwen3'; 67 | } 68 | return null; 69 | } 70 | }, 71 | toolcall: { 72 | name: 'toolcall', 73 | title: 'Tool Call Parser', 74 | items: [ 75 | { id: 'disabled', label: 'Disabled', default: true }, 76 | { id: 'enabled', label: 'Enabled', default: false } 77 | ], 78 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser qwen' : null 79 | } 80 | }, 81 | 82 | modelConfigs: { 83 | '235b': { 84 | baseName: '235B-A22B', 85 | hasThinkingVariants: true, 86 | h100: { tp: 8, ep: 0, bf16: true, fp8: true }, 87 | h200: { tp: 8, ep: 0, bf16: true, fp8: true }, 88 | b200: { tp: 8, ep: 0, bf16: true, fp8: true } 89 | }, 90 | '30b': { 91 | baseName: '30B-A3B', 92 | hasThinkingVariants: true, 93 | h100: { tp: 1, ep: 0, bf16: true, fp8: true }, 94 | h200: { tp: 1, ep: 0, bf16: true, fp8: true }, 95 | b200: { tp: 1, ep: 0, bf16: true, fp8: true } 96 | }, 97 | '32b': { 98 | baseName: '32B', 99 | hasThinkingVariants: false, 100 | h100: { tp: 1, ep: 0, bf16: true, fp8: true }, 101 | h200: { tp: 1, ep: 0, bf16: true, fp8: true }, 102 | b200: { tp: 1, ep: 0, bf16: true, fp8: true } 103 | }, 104 | '14b': { 105 | baseName: '14B', 106 | hasThinkingVariants: false, 107 | h100: { tp: 1, ep: 0, bf16: true, fp8: true }, 108 | h200: { tp: 1, ep: 0, bf16: true, fp8: true }, 109 | b200: { tp: 1, ep: 0, bf16: true, fp8: true } 110 | }, 111 | '8b': { 112 | baseName: '8B', 113 | hasThinkingVariants: false, 114 | h100: { tp: 1, ep: 0, bf16: true, fp8: true }, 115 | h200: { tp: 1, ep: 0, bf16: true, fp8: true }, 116 | b200: { tp: 1, ep: 0, bf16: true, fp8: true } 117 | }, 118 | '4b': { 119 | baseName: '4B', 120 | hasThinkingVariants: true, 121 | h100: { tp: 1, ep: 0, bf16: true, fp8: true }, 122 | h200: { tp: 1, ep: 0, bf16: true, fp8: true }, 123 | b200: { tp: 1, ep: 0, bf16: true, fp8: true } 124 | }, 125 | '1.7b': { 126 | baseName: '1.7B', 127 | hasThinkingVariants: false, 128 | h100: { tp: 1, ep: 0, bf16: true, fp8: true }, 129 | h200: { tp: 1, ep: 0, bf16: true, fp8: true }, 130 | b200: { tp: 1, ep: 0, bf16: true, fp8: true } 131 | }, 132 | '0.6b': { 133 | baseName: '0.6B', 134 | hasThinkingVariants: false, 135 | h100: { tp: 1, ep: 0, bf16: true, fp8: true }, 136 | h200: { tp: 1, ep: 0, bf16: true, fp8: true }, 137 | b200: { tp: 1, ep: 0, bf16: true, fp8: true } 138 | } 139 | }, 140 | 141 | specialCommands: { 142 | 'h100-235b-bf16-instruct': '# Error: Model is too large, cannot fit into 8*H100\n# Please use H200 (141GB) or select FP8 quantization', 143 | 'h100-235b-bf16-thinking': '# Error: Model is too large, cannot fit into 8*H100\n# Please use H200 (141GB) or select FP8 quantization' 144 | }, 145 | 146 | generateCommand: function (values) { 147 | const { hardware, modelsize: modelSize, quantization, category } = values; 148 | const commandKey = `${hardware}-${modelSize}-${quantization}-${category}`; 149 | 150 | if (this.specialCommands[commandKey]) { 151 | return this.specialCommands[commandKey]; 152 | } 153 | 154 | const config = 
this.modelConfigs[modelSize]; 155 | if (!config) { 156 | return `# Error: Unknown model size: ${modelSize}`; 157 | } 158 | 159 | const hwConfig = config[hardware]; 160 | if (!hwConfig) { 161 | return `# Error: Unknown hardware platform: ${hardware}`; 162 | } 163 | 164 | const quantSuffix = quantization === 'fp8' ? '-FP8' : ''; 165 | 166 | // Build model name based on model category 167 | let modelName; 168 | if (config.hasThinkingVariants) { 169 | // Models with Instruct/Thinking variants (235B, 30B, 4B) 170 | // 4B is Dense but treated as having variants here 171 | if (category === 'base') { 172 | // Explicitly handle base selection for variant-capable models if needed, 173 | // though the next block handles 'base only' models. 174 | // If 'base' is selected on a variant model, we usually want just the base name 175 | // or we need to ensure the thinking logic handles it. 176 | // Based on the code structure: 177 | // If category is 'base', we probably want just Qwen/Qwen3-XB[-FP8] 178 | // BUT the existing logic adds suffixes based on hasThinkingVariants. 179 | // Let's refine logic: if user selected 'base', don't add suffixes. 180 | modelName = `Qwen/Qwen3-${config.baseName}${quantSuffix}`; 181 | } else { 182 | const thinkingSuffix = category === 'thinking' ? '-Thinking' : '-Instruct'; 183 | const dateSuffix = config.hasThinkingVariants ? '-2507' : ''; 184 | modelName = `Qwen/Qwen3-${config.baseName}${thinkingSuffix}${dateSuffix}${quantSuffix}`; 185 | } 186 | } else { 187 | // Models without variants (32B, 14B, 8B, 1.7B, 0.6B) - base model only 188 | modelName = `Qwen/Qwen3-${config.baseName}${quantSuffix}`; 189 | } 190 | 191 | let cmd = 'python -m sglang.launch_server \\\n'; 192 | cmd += ` --model ${modelName}`; 193 | 194 | if (hwConfig.tp > 1) { 195 | cmd += ` \\\n --tp ${hwConfig.tp}`; 196 | } 197 | 198 | let ep = hwConfig.ep; 199 | if (quantization === 'fp8' && hwConfig.tp === 8) { 200 | ep = 2; 201 | } 202 | 203 | if (ep > 0) { 204 | cmd += ` \\\n --ep ${ep}`; 205 | } 206 | 207 | // Apply commandRule from all options 208 | Object.entries(this.options).forEach(([key, option]) => { 209 | if (option.commandRule && values[key]) { 210 | // Pass the full values object so commandRule can access other option values 211 | const additionalCmd = option.commandRule(values[key], values); 212 | if (additionalCmd) { 213 | cmd += ` \\\n ${additionalCmd}`; 214 | } 215 | } 216 | }); 217 | 218 | return cmd; 219 | } 220 | }; 221 | 222 | // Initialize state with default values 223 | const getInitialState = () => { 224 | const initialState = {}; 225 | Object.entries(baseConfig.options).forEach(([key, option]) => { 226 | const defaultItem = option.items.find(item => item.default); 227 | initialState[key] = defaultItem ? 
defaultItem.id : option.items[0].id; 228 | }); 229 | return initialState; 230 | }; 231 | 232 | const [values, setValues] = useState(getInitialState()); 233 | 234 | // Get current model config 235 | const currentModelConfig = baseConfig.modelConfigs[values.modelsize]; 236 | 237 | // Dynamically adjust options based on model selection and filter by visibleWhen 238 | const displayOptions = useMemo(() => { 239 | const options = { ...baseConfig.options }; 240 | 241 | // If model doesn't have thinking variants, modify category options 242 | if (currentModelConfig && !currentModelConfig.hasThinkingVariants) { 243 | options.category = { 244 | ...baseConfig.options.category, 245 | items: baseConfig.options.category.items.map(item => ({ 246 | ...item, 247 | // Disable any option that is not 'base' 248 | disabled: item.id !== 'base' 249 | })) 250 | }; 251 | } 252 | 253 | // Filter options based on visibleWhen condition 254 | const filteredOptions = {}; 255 | Object.entries(options).forEach(([key, option]) => { 256 | // Check if option has visibleWhen condition 257 | if (option.visibleWhen) { 258 | // Only include if visibleWhen returns true 259 | if (option.visibleWhen(values)) { 260 | filteredOptions[key] = option; 261 | } 262 | } else { 263 | // No visibleWhen condition, always include 264 | filteredOptions[key] = option; 265 | } 266 | }); 267 | 268 | return filteredOptions; 269 | }, [values, currentModelConfig]); 270 | 271 | // Handle radio change with auto-switching for non-variant models 272 | const handleRadioChange = (optionName, itemId) => { 273 | setValues(prev => { 274 | const newValues = { ...prev, [optionName]: itemId }; 275 | 276 | // Auto-switch to 'base' category for models without thinking variants 277 | if (optionName === 'modelsize') { 278 | const modelConfig = baseConfig.modelConfigs[itemId]; 279 | if (modelConfig && !modelConfig.hasThinkingVariants) { 280 | // If current category is not base, switch to base 281 | if (newValues.category !== 'base') { 282 | newValues.category = 'base'; 283 | } 284 | } 285 | } 286 | 287 | // Reset reasoningParser when switching to 'instruct' category 288 | if (optionName === 'category' && itemId === 'instruct') { 289 | newValues.reasoningParser = 'disabled'; 290 | } 291 | 292 | return newValues; 293 | }); 294 | }; 295 | 296 | // Generate command 297 | const command = baseConfig.generateCommand(values); 298 | 299 | return ( 300 |
    {/* NOTE: element structure and styles.* class names here are illustrative
        placeholders for the component's markup (see ../ConfigGenerator/styles.module.css). */}
    <div className={styles.configGenerator}>
      {Object.entries(displayOptions).map(([key, option], index) => (
        <div key={key} className={styles.optionGroup}>
          <div className={styles.optionHeader}>
            <span className={styles.optionIndex}>{index + 1}</span>
            <span className={styles.optionTitle}>{option.title}</span>
          </div>
          <div className={styles.optionItems}>
            {option.items.map(item => {
              const isChecked = values[option.name] === item.id;
              const isDisabled = item.disabled;
              return (
                <label
                  key={item.id}
                  className={`${styles.optionItem} ${isChecked ? styles.optionItemChecked : ''} ${isDisabled ? styles.optionItemDisabled : ''}`}
                >
                  <input
                    type="radio"
                    name={option.name}
                    value={item.id}
                    checked={isChecked}
                    disabled={isDisabled}
                    onChange={() => handleRadioChange(option.name, item.id)}
                  />
                  <span>{item.label}</span>
                  {item.subtitle && <span className={styles.optionSubtitle}>{item.subtitle}</span>}
                </label>
              );
            })}
          </div>
        </div>
      ))}

      <div className={styles.commandSection}>
        <div className={styles.commandTitle}>Generated Command</div>
        <pre className={styles.commandOutput}>{command}</pre>
      </div>
    </div>
341 | ); 342 | }; 343 | 344 | export default Qwen3ConfigGenerator; 345 | 346 | -------------------------------------------------------------------------------- /docs/GLM/GLM-4.6V.md: -------------------------------------------------------------------------------- 1 | # GLM-4.6V 2 | 3 | ## 1. Model Introduction 4 | 5 | GLM-4.6V series model includes two versions: GLM-4.6V (106B), a foundation model designed for cloud and high-performance cluster scenarios, and GLM-4.6V-Flash (9B), a lightweight model optimized for local deployment and low-latency applications. GLM-4.6V scales its context window to 128k tokens in training, and achieves SoTA performance in visual understanding among models of similar parameter scales. Crucially, GLM team integrated native Function Calling capabilities for the first time. This effectively bridges the gap between "visual perception" and "executable action" providing a unified technical foundation for multimodal agents in real-world business scenarios. 6 | 7 | Beyond achieves SoTA performance across major multimodal benchmarks at comparable model scales. GLM-4.6V introduces several key features: 8 | 9 | - **Native Multimodal Function Calling** Enables native vision-driven tool use. Images, screenshots, and document pages can be passed directly as tool inputs without text conversion, while visual outputs (charts, search images, rendered pages) are interpreted and integrated into the reasoning chain. This closes the loop from perception to understanding to execution. Please refer to this [example](#tool-call-example). 10 | - **Interleaved Image-Text Content Generation** Supports high-quality mixed media creation from complex multimodal inputs. GLM-4.6V takes a multimodal context—spanning documents, user inputs, and tool-retrieved images—and synthesizes coherent, interleaved image-text content tailored to the task. During generation it can actively call search and retrieval tools to gather and curate additional text and visuals, producing rich, visually grounded content. 11 | - **Multimodal Document Understanding** GLM-4.6V can process up to 128K tokens of multi-document or long-document input, directly interpreting richly formatted pages as images. It understands text, layout, charts, tables, and figures jointly, enabling accurate comprehension of complex, image-heavy documents without requiring prior conversion to plain text. 12 | - **Frontend Replication & Visual Editing** Reconstructs pixel-accurate HTML/CSS from UI screenshots and supports natural-language-driven edits. It detects layout, components, and styles visually, generates clean code, and applies iterative visual modifications through simple user instructions. 13 | 14 | ## 2. SGLang Installation 15 | 16 | SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements. 
17 | 18 | ### 2.1 Docker Installation (Recommended) 19 | 20 | ```shell 21 | docker pull lmsysorg/sglang:latest 22 | ``` 23 | 24 | **Advantages:** 25 | 26 | - Ready to use out of the box, no manual environment configuration needed 27 | - Avoids dependency conflict issues 28 | - Easy to migrate between different environments 29 | 30 | ### 2.2 Build from Source 31 | 32 | If you need to use the latest development version or require custom modifications, you can build from source: 33 | 34 | ```bash 35 | # Install SGLang using UV (recommended) 36 | git clone https://github.com/sgl-project/sglang.git 37 | cd sglang 38 | uv venv 39 | source .venv/bin/activate 40 | uv pip install -e "python[all]" --index-url=https://pypi.org/simple 41 | pip install nvidia-cudnn-cu12==9.16.0.29 42 | # Install ffmpeg to support video input 43 | sudo apt update 44 | sudo apt install ffmpeg 45 | ``` 46 | 47 | **Use Cases:** 48 | 49 | - Need to customize and modify SGLang source code 50 | - Want to use the latest development features 51 | - Participate in SGLang project development 52 | 53 | For general installation instructions, you can also refer to the [official SGLang installation guide](https://docs.sglang.ai/get_started/install.html). 54 | 55 | ## 3. Model Deployment 56 | 57 | ### 3.1 Basic Configuration 58 | 59 | **Interactive Command Generator**: Use the interactive configuration generator below to customize your deployment settings. Select your hardware platform, model size, quantization method, and other options to generate the appropriate launch command. 60 | 61 | import GLM46VConfigGenerator from '@site/src/components/GLM46VConfigGenerator'; 62 | 63 | 64 | 65 | ### 3.2 Configuration Tips 66 | For more detailed configuration tips, please refer to [GLM-4.5V/GLM-4.6V Usage](https://docs.sglang.io/basic_usage/glmv.html). 67 | 68 | ## 4. Example APIs 69 | 70 | ### Image Input Example 71 | 72 | #### API Payload 73 | ```python 74 | curl_command = f""" 75 | curl -s http://localhost:{30000}/v1/chat/completions \\ 76 | -H "Content-Type: application/json" \\ 77 | -d '{{ 78 | "model": "default", 79 | "messages": [ 80 | {{ 81 | "role": "user", 82 | "content": [ 83 | {{ 84 | "type": "image_url", 85 | "image_url": {{ 86 | "url": "/home/jobuser/sgl_logo.png" 87 | }} 88 | }}, 89 | {{ 90 | "type": "text", 91 | "text": "What is the image" 92 | }} 93 | ] 94 | }} 95 | ], 96 | "temperature": "0", 97 | "max_completion_tokens": "1000", 98 | "max_tokens": "1000" 99 | }}' 100 | """ 101 | 102 | response = subprocess.check_output(curl_command, shell=True).decode() 103 | print(response) 104 | ``` 105 | 106 | #### API Response 107 | ```shell 108 | {"id":"b61596ca71394dd699fd8abd4f650c44","object":"chat.completion","created":1765259019,"model":"default","choices":[{"index":0,"message":{"role":"assistant","content":"The image is a logo featuring the text \"SGL\" (in a bold, orange-brown font) alongside a stylized icon. The icon includes a network-like structure with circular nodes (suggesting connectivity or a tree/graph structure) and a tag with \"\" (a common symbol for coding, web development, or software). The color scheme uses warm orange-brown tones with a black background, giving it a tech-focused, modern aesthetic (likely representing a company, project, or tool related to software, web development, or digital technology).<|begin_of_box|>SGL logo (stylized text + network/coding icon)<|end_of_box|>","reasoning_content":"Okay, let's see. The image has a logo with the text \"SGL\" and a little icon on the left. 
The icon looks like a network or a tree structure with circles, and there's a tag with \"\" which is a common symbol for coding or web development. The colors are orange and brown tones, with a black background. So probably a logo for a company or project named SGL, maybe related to software, web development, or a tech company.","tool_calls":null},"logprobs":null,"finish_reason":"stop","matched_stop":151336}],"usage":{"prompt_tokens":2222,"total_tokens":2448,"completion_tokens":226,"prompt_tokens_details":null,"reasoning_tokens":0},"metadata":{"weight_version":"default"}} 109 | ``` 110 | 111 | ### Video Input Example 112 | 113 | #### API Payload 114 | ```python 115 | curl_command = f""" 116 | curl -s http://localhost:{30000}/v1/chat/completions \\ 117 | -H "Content-Type: application/json" \\ 118 | -d '{{ 119 | "model": "default", 120 | "messages": [ 121 | {{ 122 | "role": "user", 123 | "content": [ 124 | {{ 125 | "type": "video_url", 126 | "video_url": {{ 127 | "url": "/home/jobuser/jobs_presenting_ipod.mp4" 128 | }} 129 | }}, 130 | {{ 131 | "type": "text", 132 | "text": "What is the image" 133 | }} 134 | ] 135 | }} 136 | ], 137 | "temperature": "0", 138 | "max_completion_tokens": "1000", 139 | "max_tokens": "1000" 140 | }}' 141 | """ 142 | 143 | response = subprocess.check_output(curl_command, shell=True).decode() 144 | print(response) 145 | ``` 146 | 147 | #### API Response 148 | ```shell 149 | {"id":"520e0a079e5d4b17b82a6af619315a97","object":"chat.completion","created":1765259029,"model":"default","choices":[{"index":0,"message":{"role":"assistant","content":"The image is a still from a presentation by a man on a stage. He is pointing to a small pocket on his jeans and asking the audience what the pocket is for. The video is being shared by Evan Carmichael. The man then reveals that the pocket is for an iPod Nano.","reasoning_content":"Based on the visual evidence in the video, here is a breakdown of what is being shown:\n\n* **Subject:** The video features a man on a stage, giving a presentation. He is wearing a black t-shirt and dark jeans.\n* **Action:** The man is pointing to a pocket on his jeans. He is asking the audience a question about the purpose of this pocket.\n* **Context:** The presentation is being filmed, and the video is being shared by \"Evan Carmichael,\" a well-known motivational speaker and content creator. The source of the clip is credited to \"JoshuaG.\"\n* **Reveal:** The man then reveals the answer to his question. He pulls a small, white, rectangular device out of the pocket. 
He identifies this device as an \"iPod Nano.\"\n\nIn summary, the image is a still from a presentation where a speaker is explaining the purpose of the small pocket found on many pairs of jeans.","tool_calls":null},"logprobs":null,"finish_reason":"stop","matched_stop":151336}],"usage":{"prompt_tokens":30276,"total_tokens":30532,"completion_tokens":256,"prompt_tokens_details":null,"reasoning_tokens":0},"metadata":{"weight_version":"default"}} 150 | ``` 151 | 152 | ### Tool Call Example 153 | 154 | ### Payload 155 | ```python 156 | from openai import OpenAI 157 | import argparse 158 | import sys 159 | import base64 160 | 161 | def image_to_base64(image_path): 162 | """Convert image file to base64 data URL format for OpenAI API""" 163 | with open(image_path, 'rb') as image_file: 164 | image_data = image_file.read() 165 | base64_string = base64.b64encode(image_data).decode('utf-8') 166 | return f"data:image/png;base64,{base64_string}" 167 | 168 | openai_api_key = "EMPTY" 169 | openai_api_base = "http://127.0.0.1:30000/v1" 170 | client = OpenAI(api_key=openai_api_key, base_url=openai_api_base) 171 | 172 | 173 | 174 | tools = [ 175 | { 176 | "type": "function", 177 | "function": { 178 | "name": "get_weather", 179 | "description": "Get current temperature for a given location.", 180 | "parameters": { 181 | "type": "object", 182 | "properties": { 183 | "location": { 184 | "type": "string", 185 | "description": "City and country e.g. Beijing, China", 186 | } 187 | }, 188 | "required": ["location"], 189 | "additionalProperties": False, 190 | }, 191 | }, 192 | } 193 | ] 194 | 195 | 196 | messages = [ 197 | { 198 | "role": "user", 199 | "content": "Please help me check today’s weather in Beijing, and tell me whether the tool returned an image." 200 | }, 201 | { 202 | "role": "assistant", 203 | "tool_calls": [ 204 | { 205 | "id": "call_bk32t88BGpSdbtDgzT044Rh4", 206 | "type": "function", 207 | "function": { 208 | "name": 'get_weather', 209 | "arguments": '{"location":"Beijing, China"}' 210 | } 211 | } 212 | ] 213 | }, 214 | { 215 | "role": "tool", 216 | "tool_call_id": "call_bk32t88BGpSdbtDgzT044Rh4", 217 | "content": [ 218 | { 219 | "type": "text", 220 | "text": "Weather report generated: Beijing, November 7, 2025, sunny, temperature 2°C." 221 | }, 222 | { 223 | "type": "image_url", 224 | "image_url": { 225 | "url": "/home/jobuser/sgl_logo.png" 226 | } 227 | } 228 | ] 229 | }, 230 | ] 231 | 232 | response = client.chat.completions.create( 233 | model="zai-org/GLM-4.6V", 234 | messages=messages, 235 | timeout=900, 236 | tools=tools 237 | ) 238 | print(response.choices[0].message.content.strip()) 239 | ``` 240 | 241 | ### Output 242 | 243 | ```shell 244 | The weather in Beijing today (November 7, 2025) is sunny with a temperature of 2°C. 245 | 246 | Yes, the tool returned an image (the SGL logo). 247 | ``` 248 | 249 | ## 5. Benchmark 250 | 251 | ### 5.1. Text Benchmark: Latency, Throughput and Accuracy 252 | 253 | ```shell 254 | python3 ./benchmark/gsm8k/bench_sglang.py 255 | ``` 256 | 257 | ### 5.2. 
Multimodal Benchmark - Latency and Throughput 258 | 259 | #### Command 260 | ```shell 261 | python3 -m sglang.bench_serving \ 262 | --backend sglang \ 263 | --port 30000 \ 264 | --model zai-org/GLM-4.6V \ 265 | --dataset-name image \ 266 | --image-count 2 \ 267 | --image-resolution 720p \ 268 | --random-input-len 128 \ 269 | --random-output-len 1024 \ 270 | --num-prompts 128 \ 271 | --max-concurrency 4 272 | ``` 273 | 274 | #### Response 275 | ```shell 276 | ============ Serving Benchmark Result ============ 277 | Backend: sglang 278 | Traffic request rate: inf 279 | Max request concurrency: 64 280 | Successful requests: 128 281 | Benchmark duration (s): 30.60 282 | Total input tokens: 315362 283 | Total input text tokens: 8674 284 | Total input vision tokens: 306688 285 | Total generated tokens: 63692 286 | Total generated tokens (retokenized): 63662 287 | Request throughput (req/s): 4.18 288 | Input token throughput (tok/s): 10305.12 289 | Output token throughput (tok/s): 2081.27 290 | Peak output token throughput (tok/s): 3007.00 291 | Peak concurrent requests: 71 292 | Total token throughput (tok/s): 12386.39 293 | Concurrency: 48.29 294 | ----------------End-to-End Latency---------------- 295 | Mean E2E Latency (ms): 11546.09 296 | Median E2E Latency (ms): 11856.43 297 | ---------------Time to First Token---------------- 298 | Mean TTFT (ms): 286.91 299 | Median TTFT (ms): 259.37 300 | P99 TTFT (ms): 575.39 301 | -----Time per Output Token (excl. 1st token)------ 302 | Mean TPOT (ms): 22.87 303 | Median TPOT (ms): 23.48 304 | P99 TPOT (ms): 25.89 305 | ---------------Inter-Token Latency---------------- 306 | Mean ITL (ms): 22.67 307 | Median ITL (ms): 20.01 308 | P95 ITL (ms): 68.51 309 | P99 ITL (ms): 74.81 310 | Max ITL (ms): 189.34 311 | ================================================== 312 | ``` 313 | 314 | 315 | ### 5.3. Multimodal Accuracy Benchmark - MMMU 316 | 317 | #### Command 318 | ```shell 319 | python3 benchmark/mmmu/bench_sglang.py --response-answer-regex "<\|begin_of_box\|>(.*)<\|end_of_box\|>" --port 30000 --concurrency 64 --extra-request-body '{"max_tokens": 4096}' 320 | ``` 321 | 322 | #### Response 323 | ```shell 324 | Benchmark time: 487.2229107860476 325 | answers saved to: ./answer_sglang.json 326 | Evaluating... 
327 | answers saved to: ./answer_sglang.json 328 | {'Accounting': {'acc': 0.962, 'num': 26}, 329 | 'Agriculture': {'acc': 0.5, 'num': 30}, 330 | 'Architecture_and_Engineering': {'acc': 0.733, 'num': 15}, 331 | 'Art': {'acc': 0.833, 'num': 30}, 332 | 'Art_Theory': {'acc': 0.9, 'num': 30}, 333 | 'Basic_Medical_Science': {'acc': 0.733, 'num': 30}, 334 | 'Biology': {'acc': 0.586, 'num': 29}, 335 | 'Chemistry': {'acc': 0.654, 'num': 26}, 336 | 'Clinical_Medicine': {'acc': 0.633, 'num': 30}, 337 | 'Computer_Science': {'acc': 0.76, 'num': 25}, 338 | 'Design': {'acc': 0.867, 'num': 30}, 339 | 'Diagnostics_and_Laboratory_Medicine': {'acc': 0.633, 'num': 30}, 340 | 'Economics': {'acc': 0.862, 'num': 29}, 341 | 'Electronics': {'acc': 0.5, 'num': 18}, 342 | 'Energy_and_Power': {'acc': 0.875, 'num': 16}, 343 | 'Finance': {'acc': 0.857, 'num': 28}, 344 | 'Geography': {'acc': 0.714, 'num': 28}, 345 | 'History': {'acc': 0.767, 'num': 30}, 346 | 'Literature': {'acc': 0.897, 'num': 29}, 347 | 'Manage': {'acc': 0.759, 'num': 29}, 348 | 'Marketing': {'acc': 1.0, 'num': 26}, 349 | 'Materials': {'acc': 0.833, 'num': 18}, 350 | 'Math': {'acc': 0.76, 'num': 25}, 351 | 'Mechanical_Engineering': {'acc': 0.619, 'num': 21}, 352 | 'Music': {'acc': 0.286, 'num': 28}, 353 | 'Overall': {'acc': 0.761, 'num': 803}, 354 | 'Overall-Art and Design': {'acc': 0.729, 'num': 118}, 355 | 'Overall-Business': {'acc': 0.884, 'num': 138}, 356 | 'Overall-Health and Medicine': {'acc': 0.773, 'num': 150}, 357 | 'Overall-Humanities and Social Science': {'acc': 0.78, 'num': 118}, 358 | 'Overall-Science': {'acc': 0.728, 'num': 136}, 359 | 'Overall-Tech and Engineering': {'acc': 0.671, 'num': 143}, 360 | 'Pharmacy': {'acc': 0.933, 'num': 30}, 361 | 'Physics': {'acc': 0.929, 'num': 28}, 362 | 'Psychology': {'acc': 0.733, 'num': 30}, 363 | 'Public_Health': {'acc': 0.933, 'num': 30}, 364 | 'Sociology': {'acc': 0.724, 'num': 29}} 365 | eval out saved to ./val_sglang.json 366 | Overall accuracy: 0.761 367 | ``` -------------------------------------------------------------------------------- /docs/Moonshotai/Kimi-K2.md: -------------------------------------------------------------------------------- 1 | # Kimi-K2 2 | 3 | ## 1. Model Introduction 4 | 5 | [Kimi-K2](https://moonshotai.github.io/Kimi-K2/) is a state-of-the-art MoE language model by Moonshot AI with 32B activated parameters and 1T total parameters. 6 | 7 | **Model Variants:** 8 | 9 | - **[Kimi-K2-Instruct](https://huggingface.co/moonshotai/Kimi-K2-Instruct)**: Post-trained model optimized for general-purpose chat and agentic tasks. Compatible with vLLM, SGLang, KTransformers, and TensorRT-LLM. 10 | - **[Kimi-K2-Thinking](https://huggingface.co/moonshotai/Kimi-K2-Thinking)**: Advanced thinking model with step-by-step reasoning and tool calling. Native INT4 quantization with 256k context window. Ideal for complex reasoning and multi-step tool use. 11 | 12 | For details, see [official documentation](https://github.com/MoonshotAI/Kimi-K2) and [technical report](https://www.arxiv.org/abs/2507.20534). 13 | 14 | ## 2. SGLang Installation 15 | 16 | Refer to the [official SGLang installation guide](https://docs.sglang.ai/get_started/install.html). 17 | 18 | ## 3. Model Deployment 19 | 20 | This section provides a progressive guide from quick deployment to performance optimization, suitable for users at different levels. 
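For quick orientation, the sketch below shows a minimal single-node launch for Kimi-K2-Instruct using only flags that appear in the deployment commands later in this guide; treat it as a starting point and use the interactive generator in the next subsection for hardware-specific variants.

```shell
# Minimal single-node launch (sketch): 8-way tensor parallelism, OpenAI-compatible server on port 8000.
# Add --reasoning-parser kimi_k2 for Kimi-K2-Thinking and --tool-call-parser kimi_k2 for structured tool calls.
python3 -m sglang.launch_server \
  --model-path moonshotai/Kimi-K2-Instruct \
  --tp 8 \
  --trust-remote-code \
  --host 0.0.0.0 \
  --port 8000
```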
21 | 22 | ### 3.1 Basic Configuration 23 | 24 | **Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, model variant, deployment strategy, and capabilities. 25 | 26 | import KimiK2ConfigGenerator from '@site/src/components/KimiK2ConfigGenerator'; 27 | 28 | 29 | 30 | ### 3.2 Configuration Tips 31 | 32 | - **Memory**: Requires 8 GPUs with ≥140GB each (H200/B200). Use `--context-length 128000` to conserve memory. 33 | - **Expert Parallelism (EP)**: Use `--ep` for better MoE throughput. See [EP docs](https://github.com/sgl-project/sglang/blob/main/docs/advanced_features/expert_parallelism.md). 34 | - **Data Parallel (DP)**: Enable with `--dp 4 --enable-dp-attention` for production throughput. 35 | - **KV Cache**: Use `--kv-cache-dtype fp8_e4m3` to reduce memory by 50% (CUDA 11.8+). 36 | - **Reasoning Parser**: Add `--reasoning-parser kimi_k2` for Kimi-K2-Thinking to separate thinking and content. 37 | - **Tool Call Parser**: Add `--tool-call-parser kimi_k2` for structured tool calls. 38 | 39 | ## 4. Model Invocation 40 | 41 | ### 4.1 Basic Usage 42 | 43 | See [Basic API Usage](https://docs.sglang.ai/get_started/quick_start.html). 44 | 45 | ### 4.2 Advanced Usage 46 | 47 | #### 4.2.1 Reasoning Parser 48 | 49 | Enable reasoning parser for Kimi-K2-Thinking: 50 | 51 | ```shell 52 | python -m sglang.launch_server \ 53 | --model moonshotai/Kimi-K2-Thinking \ 54 | --reasoning-parser kimi_k2 \ 55 | --tp 8 \ 56 | --host 0.0.0.0 \ 57 | --port 8000 58 | ``` 59 | 60 | **Example:** 61 | 62 | ```python 63 | from openai import OpenAI 64 | 65 | client = OpenAI( 66 | base_url="http://localhost:8000/v1", 67 | api_key="EMPTY" 68 | ) 69 | 70 | # Enable streaming to see the thinking process in real-time 71 | response = client.chat.completions.create( 72 | model="moonshotai/Kimi-K2-Thinking", 73 | messages=[ 74 | {"role": "user", "content": "Solve this problem step by step: What is 15% of 240?"} 75 | ], 76 | temperature=0.6, 77 | max_tokens=2048, 78 | stream=True 79 | ) 80 | 81 | # Process the stream 82 | has_thinking = False 83 | has_answer = False 84 | thinking_started = False 85 | 86 | for chunk in response: 87 | if chunk.choices and len(chunk.choices) > 0: 88 | delta = chunk.choices[0].delta 89 | 90 | # Print thinking process 91 | if hasattr(delta, 'reasoning_content') and delta.reasoning_content: 92 | if not thinking_started: 93 | print("=============== Thinking =================", flush=True) 94 | thinking_started = True 95 | has_thinking = True 96 | print(delta.reasoning_content, end="", flush=True) 97 | 98 | # Print answer content 99 | if delta.content: 100 | # Close thinking section and add content header 101 | if has_thinking and not has_answer: 102 | print("\n=============== Content =================", flush=True) 103 | has_answer = True 104 | print(delta.content, end="", flush=True) 105 | 106 | print() 107 | ``` 108 | 109 | **Output Example:** 110 | 111 | ```text 112 | =============== Thinking ================= 113 | The user asks: "What is 15% of 240?" This is a straightforward percentage calculation problem. I need to solve it step by step. 114 | 115 | Step 1: Understand what "percent" means. 116 | - "Percent" means "per hundred". So 15% means 15 per 100, or 15/100, or 0.15. 117 | 118 | Step 2: Convert the percentage to a decimal. 119 | - 15% = 15 / 100 = 0.15 120 | 121 | Step 3: Multiply the decimal by the number. 122 | - 0.15 * 240 123 | 124 | Step 4: Perform the multiplication. 
125 | - 0.15 * 240 = (15/100) * 240 126 | - = 15 * 240 / 100 127 | - = 3600 / 100 128 | - = 36 129 | 130 | Alternatively, I can calculate it directly: 131 | - 0.15 * 240 132 | - 15 * 240 = 3600 133 | - 3600 / 100 = 36 134 | 135 | Or, break it down: 136 | - 10% of 240 = 24 137 | - 5% of 240 = half of 10% = 12 138 | - 15% of 240 = 10% + 5% = 24 + 12 = 36 139 | 140 | I should present the solution clearly with steps. The most standard method is converting to decimal and multiplying. 141 | 142 | Let me structure the answer: 143 | 1. Convert the percentage to a decimal. 144 | 2. Multiply the decimal by the number. 145 | 3. Show the calculation. 146 | 4. State the final answer. 147 | 148 | This is simple and easy to follow. 149 | =============== Content ================= 150 | Here is the step-by-step solution: 151 | 152 | **Step 1: Convert the percentage to a decimal** 153 | 15% means 15 per 100, which is 15 ÷ 100 = **0.15** 154 | 155 | **Step 2: Multiply the decimal by the number** 156 | 0.15 × 240 157 | 158 | **Step 3: Calculate the result** 159 | 0.15 × 240 = **36** 160 | 161 | **Answer:** 15% of 240 is **36**. 162 | ``` 163 | 164 | **Note:** The reasoning parser captures the model's step-by-step thinking process, allowing you to see how the model arrives at its conclusions. 165 | 166 | #### 4.2.2 Tool Calling 167 | 168 | Kimi-K2-Instruct and Kimi-K2-Thinking support tool calling capabilities. Enable the tool call parser during deployment: 169 | 170 | **Deployment Command:** 171 | 172 | ```shell 173 | python -m sglang.launch_server \ 174 | --model moonshotai/Kimi-K2-Instruct \ 175 | --tool-call-parser kimi_k2 \ 176 | --tp 8 \ 177 | --trust-remote-code \ 178 | --host 0.0.0.0 \ 179 | --port 8000 180 | ``` 181 | 182 | **Python Example (with Thinking Process):** 183 | 184 | ```python 185 | from openai import OpenAI 186 | 187 | client = OpenAI( 188 | base_url="http://localhost:8000/v1", 189 | api_key="EMPTY" 190 | ) 191 | 192 | # Define available tools 193 | tools = [ 194 | { 195 | "type": "function", 196 | "function": { 197 | "name": "get_weather", 198 | "description": "Get the current weather for a location", 199 | "parameters": { 200 | "type": "object", 201 | "properties": { 202 | "location": { 203 | "type": "string", 204 | "description": "The city name" 205 | }, 206 | "unit": { 207 | "type": "string", 208 | "enum": ["celsius", "fahrenheit"], 209 | "description": "Temperature unit" 210 | } 211 | }, 212 | "required": ["location"] 213 | } 214 | } 215 | } 216 | ] 217 | 218 | # Make request with streaming to see thinking process 219 | response = client.chat.completions.create( 220 | model="moonshotai/Kimi-K2-Thinking", 221 | messages=[ 222 | {"role": "user", "content": "What's the weather in Beijing?"} 223 | ], 224 | tools=tools, 225 | temperature=0.7, 226 | stream=True 227 | ) 228 | 229 | # Process streaming response 230 | thinking_started = False 231 | has_thinking = False 232 | tool_calls_accumulator = {} 233 | 234 | for chunk in response: 235 | if chunk.choices and len(chunk.choices) > 0: 236 | delta = chunk.choices[0].delta 237 | 238 | # Print thinking process 239 | if hasattr(delta, 'reasoning_content') and delta.reasoning_content: 240 | if not thinking_started: 241 | print("=============== Thinking =================", flush=True) 242 | thinking_started = True 243 | has_thinking = True 244 | print(delta.reasoning_content, end="", flush=True) 245 | 246 | # Accumulate tool calls 247 | if hasattr(delta, 'tool_calls') and delta.tool_calls: 248 | # Close thinking section if needed 249 | if 
has_thinking and thinking_started: 250 | print("\n=============== Content =================\n", flush=True) 251 | thinking_started = False 252 | 253 | for tool_call in delta.tool_calls: 254 | index = tool_call.index 255 | if index not in tool_calls_accumulator: 256 | tool_calls_accumulator[index] = { 257 | 'name': None, 258 | 'arguments': '' 259 | } 260 | 261 | if tool_call.function: 262 | if tool_call.function.name: 263 | tool_calls_accumulator[index]['name'] = tool_call.function.name 264 | if tool_call.function.arguments: 265 | tool_calls_accumulator[index]['arguments'] += tool_call.function.arguments 266 | 267 | # Print content 268 | if delta.content: 269 | print(delta.content, end="", flush=True) 270 | 271 | # Print accumulated tool calls 272 | for index, tool_call in sorted(tool_calls_accumulator.items()): 273 | print(f"🔧 Tool Call: {tool_call['name']}") 274 | print(f" Arguments: {tool_call['arguments']}") 275 | 276 | print() 277 | ``` 278 | 279 | **Output Example:** 280 | 281 | ``` 282 | =============== Thinking ================= 283 | The user is asking about the weather in Beijing. I need to use the get_weather function to retrieve this information. Beijing is a major city in China, so I should be able to get weather data for it. The location parameter is required, but the unit parameter is optional. Since the user didn't specify a temperature unit, I can just provide the location and let the function use its default. I'll check the weather in Beijing for you. 284 | =============== Content ================= 285 | 286 | 🔧 Tool Call: get_weather 287 | Arguments: {"location":"Beijing"} 288 | ``` 289 | 290 | **Note:** 291 | 292 | - The reasoning parser shows how the model decides to use a tool 293 | - Tool calls are clearly marked with the function name and arguments 294 | - You can then execute the function and send the result back to continue the conversation 295 | 296 | **Handling Tool Call Results:** 297 | 298 | ```python 299 | # After getting the tool call, execute the function 300 | def get_weather(location, unit="celsius"): 301 | # Your actual weather API call here 302 | return f"The weather in {location} is 22°{unit[0].upper()} and sunny." 303 | 304 | # Send tool result back to the model 305 | messages = [ 306 | {"role": "user", "content": "What's the weather in Beijing?"}, 307 | { 308 | "role": "assistant", 309 | "content": None, 310 | "tool_calls": [{ 311 | "id": "call_123", 312 | "type": "function", 313 | "function": { 314 | "name": "get_weather", 315 | "arguments": '{"location": "Beijing", "unit": "celsius"}' 316 | } 317 | }] 318 | }, 319 | { 320 | "role": "tool", 321 | "tool_call_id": "call_123", 322 | "content": get_weather("Beijing", "celsius") 323 | } 324 | ] 325 | 326 | final_response = client.chat.completions.create( 327 | model="moonshotai/Kimi-K2-Thinking", 328 | messages=messages, 329 | temperature=0.7 330 | ) 331 | 332 | print(final_response.choices[0].message.content) 333 | # Output: "The weather in Beijing is currently 22°C and sunny." 334 | ``` 335 | 336 | ## 5. Benchmark 337 | 338 | ### 5.1 Speed Benchmark 339 | 340 | **Test Environment:** 341 | 342 | - Hardware: NVIDIA B200 GPU (8x) 343 | - Model: Kimi-K2-Instruct 344 | - sglang version: 0.5.6.post1 345 | 346 | We use SGLang's built-in benchmarking tool to conduct performance evaluation on the [ShareGPT_Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) dataset. This dataset contains real conversation data and can better reflect performance in actual use scenarios. 
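If the ShareGPT file is not already cached locally, it can be fetched once and passed to the benchmark explicitly. The snippet below is a sketch: the JSON file name and the `--dataset-name` / `--dataset-path` flags are assumed from SGLang's standard benchmarking workflow, so confirm them with `python3 -m sglang.bench_serving --help` on your version.

```shell
# Download the ShareGPT benchmark file once (file name assumed; verify against the dataset repo).
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# Point bench_serving at the local copy instead of relying on an automatic download.
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 127.0.0.1 \
  --port 8000 \
  --model moonshotai/Kimi-K2-Instruct \
  --dataset-name sharegpt \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 10 \
  --max-concurrency 1
```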
347 | 348 | #### 5.1.1 Latency-Sensitive Benchmark 349 | 350 | - Model Deployment Command: 351 | 352 | ```shell 353 | python3 -m sglang.launch_server \ 354 | --model-path moonshotai/Kimi-K2-Instruct \ 355 | --tp 8 \ 356 | --dp 4 \ 357 | --enable-dp-attention \ 358 | --trust-remote-code \ 359 | --host 0.0.0.0 \ 360 | --port 8000 361 | ``` 362 | 363 | - Benchmark Command: 364 | 365 | ```shell 366 | python3 -m sglang.bench_serving \ 367 | --backend sglang \ 368 | --host 127.0.0.1 \ 369 | --port 8000 \ 370 | --model moonshotai/Kimi-K2-Instruct\ 371 | --num-prompts 10 \ 372 | --max-concurrency 1 373 | ``` 374 | 375 | - **Test Results**: 376 | 377 | ``` 378 | ============ Serving Benchmark Result ============ 379 | Backend: sglang 380 | Traffic request rate: inf 381 | Max request concurrency: 1 382 | Successful requests: 10 383 | Benchmark duration (s): 44.93 384 | Total input tokens: 1951 385 | Total input text tokens: 1951 386 | Total input vision tokens: 0 387 | Total generated tokens: 2755 388 | Total generated tokens (retokenized): 2748 389 | Request throughput (req/s): 0.22 390 | Input token throughput (tok/s): 43.42 391 | Output token throughput (tok/s): 61.32 392 | Peak output token throughput (tok/s): 64.00 393 | Peak concurrent requests: 3 394 | Total token throughput (tok/s): 104.74 395 | Concurrency: 1.00 396 | ----------------End-to-End Latency---------------- 397 | Mean E2E Latency (ms): 4489.56 398 | Median E2E Latency (ms): 4994.53 399 | ---------------Time to First Token---------------- 400 | Mean TTFT (ms): 141.22 401 | Median TTFT (ms): 158.28 402 | P99 TTFT (ms): 166.90 403 | -----Time per Output Token (excl. 1st token)------ 404 | Mean TPOT (ms): 18.40 405 | Median TPOT (ms): 15.63 406 | P99 TPOT (ms): 39.88 407 | ---------------Inter-Token Latency---------------- 408 | Mean ITL (ms): 15.78 409 | Median ITL (ms): 15.76 410 | P95 ITL (ms): 16.36 411 | P99 ITL (ms): 16.59 412 | Max ITL (ms): 19.94 413 | ================================================== 414 | ``` 415 | 416 | #### 5.1.2 Throughput-Sensitive Benchmark 417 | 418 | - Model Deployment Command: 419 | 420 | ```shell 421 | python3 -m sglang.launch_server \ 422 | --model-path moonshotai/Kimi-K2-Instruct \ 423 | --tp 8 \ 424 | --dp 4 \ 425 | --ep 4 \ 426 | --enable-dp-attention \ 427 | --trust-remote-code \ 428 | --host 0.0.0.0 \ 429 | --port 8000 430 | ``` 431 | 432 | - Benchmark Command: 433 | 434 | ```shell 435 | python3 -m sglang.bench_serving \ 436 | --backend sglang \ 437 | --host 127.0.0.1 \ 438 | --port 8000 \ 439 | --model moonshotai/Kimi-K2-Instruct\ 440 | --num-prompts 1000 \ 441 | --max-concurrency 100 442 | ``` 443 | 444 | - **Test Results**: 445 | 446 | ``` 447 | ============ Serving Benchmark Result ============ 448 | Backend: sglang 449 | Traffic request rate: inf 450 | Max request concurrency: 100 451 | Successful requests: 1000 452 | Benchmark duration (s): 174.11 453 | Total input tokens: 296642 454 | Total input text tokens: 296642 455 | Total input vision tokens: 0 456 | Total generated tokens: 193831 457 | Total generated tokens (retokenized): 168687 458 | Request throughput (req/s): 5.74 459 | Input token throughput (tok/s): 1703.73 460 | Output token throughput (tok/s): 1113.25 461 | Peak output token throughput (tok/s): 2383.00 462 | Peak concurrent requests: 112 463 | Total token throughput (tok/s): 2816.97 464 | Concurrency: 89.60 465 | ----------------End-to-End Latency---------------- 466 | Mean E2E Latency (ms): 15601.09 467 | Median E2E Latency (ms): 10780.52 468 | ---------------Time to 
First Token---------------- 469 | Mean TTFT (ms): 457.42 470 | Median TTFT (ms): 221.62 471 | P99 TTFT (ms): 2475.32 472 | -----Time per Output Token (excl. 1st token)------ 473 | Mean TPOT (ms): 97.23 474 | Median TPOT (ms): 85.61 475 | P99 TPOT (ms): 435.95 476 | ---------------Inter-Token Latency---------------- 477 | Mean ITL (ms): 78.61 478 | Median ITL (ms): 43.66 479 | P95 ITL (ms): 169.53 480 | P99 ITL (ms): 260.91 481 | Max ITL (ms): 1703.21 482 | ================================================== 483 | ``` 484 | 485 | ### 5.2 Accuracy Benchmark 486 | 487 | #### 5.2.1 GSM8K Benchmark 488 | 489 | - Server Command 490 | 491 | ```shell 492 | python3 -m sglang.launch_server \ 493 | --model-path moonshotai/Kimi-K2-Instruct \ 494 | --tp 8 \ 495 | --dp 4 \ 496 | --trust-remote-code \ 497 | --host 0.0.0.0 \ 498 | --port 8000 499 | ``` 500 | 501 | - Benchmark Command 502 | 503 | ```shell 504 | python3 -m sglang.test.few_shot_gsm8k --num-questions 200 --port 8000 505 | ``` 506 | 507 | - **Result**: 508 | 509 | ``` 510 | Accuracy: 0.960 511 | Invalid: 0.000 512 | Latency: 15.956 s 513 | Output throughput: 1231.699 token/s 514 | ``` 515 | -------------------------------------------------------------------------------- /docs/DeepSeek/DeepSeek-V3_2.md: -------------------------------------------------------------------------------- 1 | # DeepSeek-V3.2 2 | 3 | ## 1. Model Introduction 4 | 5 | The DeepSeek-V3.2 series includes three model variants, each optimized for different use cases: 6 | 7 | **[DeepSeek-V3.2-Exp](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp)** is an upgraded version of DeepSeek-V3.1-Terminus, introducing the DeepSeek Sparse Attention (DSA) mechanism through continued training. DSA is a fine-grained sparse attention mechanism powered by a lightning indexer, enabling DeepSeek-V3.2-Exp to achieve significant efficiency improvements in long-context scenarios. As an intermediate step toward the next-generation architecture, V3.2-Exp builds upon V3.1-Terminus by introducing DeepSeek Sparse Attention—a sparse attention mechanism designed to explore and validate optimizations for training and inference efficiency in long-context scenarios. Recommended for general conversations, long-context processing, and efficient inference. 8 | 9 | **[DeepSeek-V3.2](https://huggingface.co/deepseek-ai/DeepSeek-V3.2)** is the standard version suitable for general tasks and conversational scenarios. For local deployment, we recommend setting the sampling parameters to temperature = 1.0, top_p = 0.95. Recommended for standard conversations and general tasks. 10 | 11 | **[DeepSeek-V3.2-Speciale](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Speciale)** is a special variant designed exclusively for deep reasoning tasks. This model is specifically optimized for scenarios requiring complex logical reasoning and deep thinking. For local deployment, we recommend setting the sampling parameters to temperature = 1.0, top_p = 0.95. Recommended for deep reasoning tasks, complex logical problems, and mathematical reasoning. 12 | 13 | ## 2. SGLang Installation 14 | 15 | SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements. 16 | 17 | Please refer to the [official SGLang installation guide](https://docs.sglang.ai/get_started/install.html) for installation instructions. 18 | 19 | ## 3. 
Model Deployment 20 | 21 | This section provides a progressive guide from quick deployment to performance optimization, suitable for users at different levels. 22 | 23 | ### 3.1 Basic Configuration 24 | 25 | **Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, model variant, deployment strategy, and thinking capabilities. 26 | 27 | import DeepSeekConfigGenerator from '@site/src/components/DeepSeekConfigGenerator'; 28 | 29 | 30 | 31 | ### 3.2 Configuration Tips 32 | For more detailed configuration tips, please refer to [DeepSeek-V3.2 Usage](https://docs.sglang.io/basic_usage/deepseek_v32.html). 33 | 34 | ## 4. Model Invocation 35 | 36 | ### 4.1 Basic Usage 37 | 38 | For basic API usage and request examples, please refer to: 39 | 40 | - [Basic API Usage](https://docs.sglang.ai/get_started/quick_start.html) 41 | 42 | ### 4.2 Advanced Usage 43 | 44 | #### 4.2.1 Reasoning Parser 45 | 46 | DeepSeek-V3.2 supports reasoning mode. Enable the reasoning parser during deployment to separate the thinking and content sections: 47 | 48 | ```shell 49 | python -m sglang.launch_server \ 50 | --model deepseek-ai/DeepSeek-V3.2-Exp \ 51 | --reasoning-parser deepseek-v3 \ 52 | --tp 8 \ 53 | --host 0.0.0.0 \ 54 | --port 8000 55 | ``` 56 | 57 | **Streaming with Thinking Process:** 58 | 59 | ```python 60 | from openai import OpenAI 61 | 62 | client = OpenAI( 63 | base_url="http://localhost:8000/v1", 64 | api_key="EMPTY" 65 | ) 66 | 67 | # Enable streaming to see the thinking process in real-time 68 | response = client.chat.completions.create( 69 | model="deepseek-ai/DeepSeek-V3.2-Exp", 70 | messages=[ 71 | {"role": "user", "content": "Solve this problem step by step: What is 15% of 240?"} 72 | ], 73 | temperature=0.7, 74 | max_tokens=2048, 75 | extra_body = {"chat_template_kwargs": {"thinking": True}}, 76 | stream=True 77 | ) 78 | 79 | # Process the stream 80 | has_thinking = False 81 | has_answer = False 82 | thinking_started = False 83 | 84 | for chunk in response: 85 | print(chunk) 86 | if chunk.choices and len(chunk.choices) > 0: 87 | delta = chunk.choices[0].delta 88 | 89 | # Print thinking process 90 | if hasattr(delta, 'reasoning_content') and delta.reasoning_content: 91 | if not thinking_started: 92 | print("=============== Thinking =================", flush=True) 93 | thinking_started = True 94 | has_thinking = True 95 | print(delta.reasoning_content, end="", flush=True) 96 | 97 | # Print answer content 98 | if delta.content: 99 | # Close thinking section and add content header 100 | if has_thinking and not has_answer: 101 | print("\n=============== Content =================", flush=True) 102 | has_answer = True 103 | print(delta.content, end="", flush=True) 104 | 105 | print() 106 | ``` 107 | 108 | **Output Example:** 109 | 110 | ``` 111 | =============== Thinking ================= 112 | To solve this problem, I need to calculate 15% of 240. 113 | Step 1: Convert 15% to decimal: 15% = 0.15 114 | Step 2: Multiply 240 by 0.15 115 | Step 3: 240 × 0.15 = 36 116 | =============== Content ================= 117 | 118 | The answer is 36. To find 15% of 240, we multiply 240 by 0.15, which equals 36. 119 | ``` 120 | 121 | **Note:** The reasoning parser captures the model's step-by-step thinking process, allowing you to see how the model arrives at its conclusions. 122 | 123 | #### 4.2.2 Tool Calling 124 | 125 | DeepSeek-V3.2 and DeepSeek-V3.2-Exp support tool calling capabilities. 
Enable the tool call parser: 126 | 127 | **Note:** DeepSeek-V3.2-Speciale does **NOT** support tool calling. It is designed exclusively for deep reasoning tasks. 128 | 129 | **Deployment Command:** 130 | 131 | ```shell 132 | python -m sglang.launch_server \ 133 | --model deepseek-ai/DeepSeek-V3.2-Exp \ 134 | --tool-call-parser deepseekv31 \ 135 | --reasoning-parser deepseek-v3 \ 136 | --chat-template ./examples/chat_template/tool_chat_template_deepseekv32.jinja \ 137 | --tp 8 \ 138 | --host 0.0.0.0 \ 139 | --port 8000 140 | ``` 141 | 142 | For DeepSeek-V3.2, use `--tool-call-parser deepseekv32` instead. 143 | 144 | **Python Example (with Thinking Process):** 145 | 146 | ```python 147 | from openai import OpenAI 148 | 149 | client = OpenAI( 150 | base_url="http://localhost:8000/v1", 151 | api_key="EMPTY" 152 | ) 153 | 154 | # Define available tools 155 | tools = [ 156 | { 157 | "type": "function", 158 | "function": { 159 | "name": "get_weather", 160 | "description": "Get the current weather for a location", 161 | "parameters": { 162 | "type": "object", 163 | "properties": { 164 | "location": { 165 | "type": "string", 166 | "description": "The city name" 167 | }, 168 | "unit": { 169 | "type": "string", 170 | "enum": ["celsius", "fahrenheit"], 171 | "description": "Temperature unit" 172 | } 173 | }, 174 | "required": ["location"] 175 | } 176 | } 177 | } 178 | ] 179 | 180 | # Make request with streaming to see thinking process 181 | response = client.chat.completions.create( 182 | model="deepseek-ai/DeepSeek-V3.2-Exp", 183 | messages=[ 184 | {"role": "user", "content": "What's the weather in Beijing?"} 185 | ], 186 | tools=tools, 187 | extra_body = {"chat_template_kwargs": {"thinking": True}}, 188 | temperature=0.7, 189 | stream=True 190 | ) 191 | 192 | # Process streaming response 193 | thinking_started = False 194 | has_thinking = False 195 | tool_calls_accumulator = {} 196 | 197 | for chunk in response: 198 | if chunk.choices and len(chunk.choices) > 0: 199 | delta = chunk.choices[0].delta 200 | 201 | # Print thinking process 202 | if hasattr(delta, 'reasoning_content') and delta.reasoning_content: 203 | if not thinking_started: 204 | print("=============== Thinking =================", flush=True) 205 | thinking_started = True 206 | has_thinking = True 207 | print(delta.reasoning_content, end="", flush=True) 208 | 209 | # Accumulate tool calls 210 | if hasattr(delta, 'tool_calls') and delta.tool_calls: 211 | # Close thinking section if needed 212 | if has_thinking and thinking_started: 213 | print("\n=============== Content =================\n", flush=True) 214 | thinking_started = False 215 | 216 | for tool_call in delta.tool_calls: 217 | index = tool_call.index 218 | if index not in tool_calls_accumulator: 219 | tool_calls_accumulator[index] = { 220 | 'name': None, 221 | 'arguments': '' 222 | } 223 | 224 | if tool_call.function: 225 | if tool_call.function.name: 226 | tool_calls_accumulator[index]['name'] = tool_call.function.name 227 | if tool_call.function.arguments: 228 | tool_calls_accumulator[index]['arguments'] += tool_call.function.arguments 229 | 230 | # Print content 231 | if delta.content: 232 | print(delta.content, end="", flush=True) 233 | 234 | # Print accumulated tool calls 235 | for index, tool_call in sorted(tool_calls_accumulator.items()): 236 | print(f"🔧 Tool Call: {tool_call['name']}") 237 | print(f" Arguments: {tool_call['arguments']}") 238 | 239 | print() 240 | ``` 241 | 242 | **Output Example:** 243 | 244 | ``` 245 | =============== Thinking ================= 246 | 
The user is asking about the weather in Beijing. I need to use the get_weather function to retrieve this information. 247 | I should call the function with location="Beijing". 248 | =============== Content ================= 249 | 250 | 🔧 Tool Call: get_weather 251 | Arguments: {"location": "Beijing", "unit": "celsius"} 252 | ``` 253 | 254 | **Note:** 255 | 256 | - The reasoning parser shows how the model decides to use a tool 257 | - Tool calls are clearly marked with the function name and arguments 258 | - You can then execute the function and send the result back to continue the conversation 259 | 260 | **Handling Tool Call Results:** 261 | 262 | ```python 263 | # After getting the tool call, execute the function 264 | def get_weather(location, unit="celsius"): 265 | # Your actual weather API call here 266 | return f"The weather in {location} is 22°{unit[0].upper()} and sunny." 267 | 268 | # Send tool result back to the model 269 | messages = [ 270 | {"role": "user", "content": "What's the weather in Beijing?"}, 271 | { 272 | "role": "assistant", 273 | "content": None, 274 | "tool_calls": [{ 275 | "id": "call_123", 276 | "type": "function", 277 | "function": { 278 | "name": "get_weather", 279 | "arguments": '{"location": "Beijing", "unit": "celsius"}' 280 | } 281 | }] 282 | }, 283 | { 284 | "role": "tool", 285 | "tool_call_id": "call_123", 286 | "content": get_weather("Beijing", "celsius") 287 | } 288 | ] 289 | 290 | final_response = client.chat.completions.create( 291 | model="deepseek-ai/DeepSeek-V3.2-Exp", 292 | messages=messages, 293 | temperature=0.7 294 | ) 295 | 296 | print(final_response.choices[0].message.content) 297 | # Output: "The weather in Beijing is currently 22°C and sunny." 298 | ``` 299 | 300 | ## 5. Benchmark 301 | 302 | ### 5.1 Speed Benchmark 303 | 304 | **Test Environment:** 305 | 306 | - Hardware: NVIDIA B200 GPU (8x) 307 | - Model: DeepSeek-V3.2-Exp 308 | - Tensor Parallelism: 8 309 | - sglang version: 0.5.6 310 | 311 | We use SGLang's built-in benchmarking tool to conduct performance evaluation on the [ShareGPT_Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) dataset. This dataset contains real conversation data and can better reflect performance in actual use scenarios. To simulate real-world usage patterns, we configure each request with 1024 input tokens and 1024 output tokens, representing typical medium-length conversations with detailed responses. 
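The 1024-in / 1024-out request shape is set through the `--random-input-len` and `--random-output-len` flags in the commands below. If fully synthetic prompts of a fixed length are preferred over ShareGPT samples, the random dataset can be selected explicitly; the following is a sketch (the `--dataset-name random` option is assumed from SGLang's benchmarking tool, so confirm with `--help` on your version).

```shell
# Synthetic benchmark with a fixed 1024-in / 1024-out request shape (sketch).
# Adjust --num-prompts and --max-concurrency to match the latency- or throughput-oriented settings used below.
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 127.0.0.1 \
  --port 8000 \
  --model deepseek-ai/DeepSeek-V3.2-Exp \
  --dataset-name random \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 100 \
  --max-concurrency 10
```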
312 | 313 | #### 5.1.1 Latency-Sensitive Benchmark 314 | 315 | - Model Deployment Command: 316 | 317 | ```shell 318 | python3 -m sglang.launch_server \ 319 | --model-path deepseek-ai/DeepSeek-V3.2-Exp \ 320 | --tp 8 \ 321 | --dp 8 \ 322 | --enable-dp-attention \ 323 | --speculative-algorithm EAGLE \ 324 | --speculative-num-steps 3 \ 325 | --speculative-eagle-topk 1 \ 326 | --speculative-num-draft-tokens 4 \ 327 | --host 0.0.0.0 \ 328 | --port 8000 329 | ``` 330 | 331 | - Benchmark Command: 332 | 333 | ```shell 334 | python3 -m sglang.bench_serving \ 335 | --backend sglang \ 336 | --host 127.0.0.1 \ 337 | --port 8000 \ 338 | --model deepseek-ai/DeepSeek-V3.2-Exp \ 339 | --random-input-len 1024 \ 340 | --random-output-len 1024 \ 341 | --num-prompts 10 \ 342 | --max-concurrency 1 343 | ``` 344 | 345 | - **Test Results:** 346 | 347 | ``` 348 | ============ Serving Benchmark Result ============ 349 | Backend: sglang 350 | Traffic request rate: inf 351 | Max request concurrency: 1 352 | Successful requests: 10 353 | Benchmark duration (s): 41.23 354 | Total input tokens: 1972 355 | Total input text tokens: 1972 356 | Total input vision tokens: 0 357 | Total generated tokens: 2784 358 | Total generated tokens (retokenized): 2775 359 | Request throughput (req/s): 0.24 360 | Input token throughput (tok/s): 47.83 361 | Output token throughput (tok/s): 67.53 362 | Peak output token throughput (tok/s): 110.00 363 | Peak concurrent requests: 3 364 | Total token throughput (tok/s): 115.36 365 | Concurrency: 1.00 366 | Accept length: 2.52 367 | ----------------End-to-End Latency---------------- 368 | Mean E2E Latency (ms): 4120.71 369 | Median E2E Latency (ms): 4713.16 370 | ---------------Time to First Token---------------- 371 | Mean TTFT (ms): 165.02 372 | Median TTFT (ms): 163.64 373 | P99 TTFT (ms): 199.88 374 | -----Time per Output Token (excl. 
1st token)------ 375 | Mean TPOT (ms): 13.14 376 | Median TPOT (ms): 13.60 377 | P99 TPOT (ms): 17.71 378 | ---------------Inter-Token Latency---------------- 379 | Mean ITL (ms): 14.28 380 | Median ITL (ms): 12.10 381 | P95 ITL (ms): 36.23 382 | P99 ITL (ms): 36.87 383 | Max ITL (ms): 37.53 384 | ================================================== 385 | ``` 386 | 387 | #### 5.1.2 Throughput-Sensitive Benchmark 388 | 389 | - Model Deployment Command: 390 | 391 | ```shell 392 | python3 -m sglang.launch_server \ 393 | --model-path deepseek-ai/DeepSeek-V3.2-Exp \ 394 | --tp 8 \ 395 | --ep 8 \ 396 | --dp 8 \ 397 | --enable-dp-attention \ 398 | --host 0.0.0.0 \ 399 | --port 8000 400 | ``` 401 | 402 | - Benchmark Command: 403 | 404 | ```shell 405 | python3 -m sglang.bench_serving \ 406 | --backend sglang \ 407 | --host 127.0.0.1 \ 408 | --port 8000 \ 409 | --model deepseek-ai/DeepSeek-V3.2-Exp \ 410 | --random-input-len 1024 \ 411 | --random-output-len 1024 \ 412 | --num-prompts 1000 \ 413 | --max-concurrency 100 414 | ``` 415 | 416 | - **Test Results:** 417 | 418 | ``` 419 | ============ Serving Benchmark Result ============ 420 | Backend: sglang 421 | Traffic request rate: inf 422 | Max request concurrency: 100 423 | Successful requests: 1000 424 | Benchmark duration (s): 219.09 425 | Total input tokens: 301701 426 | Total input text tokens: 301701 427 | Total input vision tokens: 0 428 | Total generated tokens: 188375 429 | Total generated tokens (retokenized): 187443 430 | Request throughput (req/s): 4.56 431 | Input token throughput (tok/s): 1377.06 432 | Output token throughput (tok/s): 859.80 433 | Peak output token throughput (tok/s): 2465.00 434 | Peak concurrent requests: 109 435 | Total token throughput (tok/s): 2236.86 436 | Concurrency: 88.05 437 | ----------------End-to-End Latency---------------- 438 | Mean E2E Latency (ms): 19291.23 439 | Median E2E Latency (ms): 11927.39 440 | ---------------Time to First Token---------------- 441 | Mean TTFT (ms): 530.36 442 | Median TTFT (ms): 444.00 443 | P99 TTFT (ms): 1504.78 444 | -----Time per Output Token (excl. 
1st token)------ 445 | Mean TPOT (ms): 106.16 446 | Median TPOT (ms): 106.69 447 | P99 TPOT (ms): 221.12 448 | ---------------Inter-Token Latency---------------- 449 | Mean ITL (ms): 100.46 450 | Median ITL (ms): 41.73 451 | P95 ITL (ms): 225.67 452 | P99 ITL (ms): 392.37 453 | Max ITL (ms): 975.03 454 | ================================================== 455 | ``` 456 | 457 | ### 5.2 Accuracy Benchmark 458 | 459 | #### 5.2.1 GSM8K Benchmark 460 | 461 | - **Benchmark Command:** 462 | 463 | ```shell 464 | python3 -m sglang.test.few_shot_gsm8k --num-questions 200 --port 8000 465 | ``` 466 | 467 | - **Test Results**: 468 | - DeepSeek-V3.2-Exp 469 | ``` 470 | Accuracy: 0.980 471 | Invalid: 0.000 472 | Latency: 19.128 s 473 | Output throughput: 965.919 token/s 474 | ``` 475 | 476 | #### 5.2.2 MMLU Benchmark 477 | 478 | - **Benchmark Command:** 479 | 480 | ```shell 481 | cd sglang 482 | bash benchmark/mmlu/download_data.sh 483 | python3 benchmark/mmlu/bench_sglang.py --nsub 10 --port 8000 484 | ``` 485 | 486 | - **Test Results**: 487 | - DeepSeek-V3.2-Exp 488 | ``` 489 | subject: abstract_algebra, #q:100, acc: 0.780 490 | subject: anatomy, #q:135, acc: 0.874 491 | subject: astronomy, #q:152, acc: 0.961 492 | subject: business_ethics, #q:100, acc: 0.860 493 | subject: clinical_knowledge, #q:265, acc: 0.925 494 | subject: college_biology, #q:144, acc: 0.972 495 | subject: college_chemistry, #q:100, acc: 0.660 496 | subject: college_computer_science, #q:100, acc: 0.880 497 | subject: college_mathematics, #q:100, acc: 0.840 498 | subject: college_medicine, #q:173, acc: 0.879 499 | Total latency: 7.961 500 | Average accuracy: 0.879 501 | ``` 502 | --------------------------------------------------------------------------------