├── static
│   ├── .nojekyll
│   └── img
│       ├── logo.png
│       └── favicon.png
├── docs
│   ├── GLM
│   │   ├── _category_.json
│   │   ├── GLM-4.5.md
│   │   ├── GLM-4.5V.md
│   │   ├── Glyph.md
│   │   └── GLM-4.6V.md
│   ├── Ernie
│   │   ├── _category_.json
│   │   ├── Ernie4.5-VL.md
│   │   └── Ernie4.5.md
│   ├── Llama
│   │   ├── _category_.json
│   │   ├── Llama3.1.md
│   │   ├── Llama3.3-70B.md
│   │   └── Llama4-Scout.md
│   ├── Qwen
│   │   ├── _category_.json
│   │   ├── Qwen2.5-VL.md
│   │   └── Qwen3-Coder-480B-A35B.md
│   ├── DeepSeek
│   │   ├── _category_.json
│   │   ├── DeepSeek-OCR.md
│   │   ├── DeepSeek-V3_1.md
│   │   ├── DeepSeek-V3.md
│   │   └── DeepSeek-V3_2.md
│   ├── Jina
│   │   ├── _category_.json
│   │   └── Jina-reranker-m0.md
│   ├── MiniMax
│   │   ├── _category_.json
│   │   └── MiniMax-M2.md
│   ├── NVIDIA
│   │   ├── _category_.json
│   │   └── Nemotron3-Nano.md
│   ├── OpenAI
│   │   └── _category_.json
│   ├── InternLM
│   │   ├── _category_.json
│   │   └── Intern-S1.md
│   ├── InternVL
│   │   ├── _category_.json
│   │   └── InternVL3_5.md
│   ├── Mistral
│   │   ├── _category_.json
│   │   ├── Mistral-3.md
│   │   └── Devstral-2.md
│   ├── Moonshotai
│   │   ├── _category_.json
│   │   ├── Kimi-Linear.md
│   │   └── Kimi-K2.md
│   └── intro.md
├── src
│   ├── pages
│   │   └── index.js
│   ├── components
│   │   ├── HomepageFeatures
│   │   │   ├── styles.module.css
│   │   │   └── index.js
│   │   ├── Llama31ConfigGenerator
│   │   │   └── index.js
│   │   ├── Devstral2ConfigGenerator
│   │   │   └── index.js
│   │   ├── ConfigGenerator
│   │   │   ├── styles.module.css
│   │   │   ├── index.js
│   │   │   ├── QUICKSTART.md
│   │   │   └── README.md
│   │   ├── Llama4ScoutConfigGenerator
│   │   │   └── index.js
│   │   ├── KimiK2ConfigGenerator
│   │   │   └── index.js
│   │   ├── InternS1ConfigGenerator
│   │   │   └── index.js
│   │   ├── NemotronConfigGenerator
│   │   │   └── index.js
│   │   ├── Qwen3NextConfigGenerator
│   │   │   └── index.js
│   │   ├── GLM46VConfigGenerator
│   │   │   └── index.js
│   │   ├── DeepSeekR1ConfigGenerator
│   │   │   └── index.js
│   │   ├── GLM46ConfigGenerator
│   │   │   └── index.js
│   │   ├── DeepSeekConfigGenerator
│   │   │   └── index.js
│   │   ├── GPTOSSConfigGenerator
│   │   │   └── index.js
│   │   ├── Qwen3VLConfigGenerator
│   │   │   └── index.js
│   │   └── Qwen3ConfigGenerator
│   │       └── index.js
│   └── css
│       └── custom.css
├── .gitignore
├── sidebars.js
├── .github
│   └── workflows
│       └── deploy.yml
├── package.json
├── README.md
├── docusaurus.config.js
└── LICENSE
/static/.nojekyll:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/GLM/_category_.json:
--------------------------------------------------------------------------------
1 | {
2 | "label": "GLM",
3 | "position": 5
4 | }
5 |
--------------------------------------------------------------------------------
/docs/Ernie/_category_.json:
--------------------------------------------------------------------------------
1 | {
2 | "label": "Ernie",
3 | "position": 9
4 | }
5 |
--------------------------------------------------------------------------------
/docs/Llama/_category_.json:
--------------------------------------------------------------------------------
1 | {
2 | "label": "Llama",
3 | "position": 4
4 | }
5 |
--------------------------------------------------------------------------------
/docs/Qwen/_category_.json:
--------------------------------------------------------------------------------
1 | {
2 | "label": "Qwen",
3 | "position": 2
4 | }
5 |
--------------------------------------------------------------------------------
/docs/DeepSeek/_category_.json:
--------------------------------------------------------------------------------
1 | {
2 | "label": "DeepSeek",
3 | "position": 3
4 | }
5 |
--------------------------------------------------------------------------------
/docs/Jina/_category_.json:
--------------------------------------------------------------------------------
1 | {
2 | "label": "Jina AI",
3 | "position": 12
4 | }
5 |
--------------------------------------------------------------------------------
/docs/MiniMax/_category_.json:
--------------------------------------------------------------------------------
1 | {
2 | "label": "MiniMax",
3 | "position": 8
4 | }
5 |
--------------------------------------------------------------------------------
/docs/NVIDIA/_category_.json:
--------------------------------------------------------------------------------
1 | {
2 | "label": "NVIDIA",
3 | "position": 9
4 | }
5 |
--------------------------------------------------------------------------------
/docs/OpenAI/_category_.json:
--------------------------------------------------------------------------------
1 | {
2 | "label": "OpenAI",
3 | "position": 6
4 | }
5 |
--------------------------------------------------------------------------------
/docs/InternLM/_category_.json:
--------------------------------------------------------------------------------
1 | {
2 | "label": "InternLM",
3 | "position": 11
4 | }
5 |
--------------------------------------------------------------------------------
/docs/InternVL/_category_.json:
--------------------------------------------------------------------------------
1 | {
2 | "label": "InternVL",
3 | "position": 10
4 | }
5 |
--------------------------------------------------------------------------------
/docs/Mistral/_category_.json:
--------------------------------------------------------------------------------
1 | {
2 | "label": "Mistral",
3 | "position": 13
4 | }
5 |
6 |
--------------------------------------------------------------------------------
/docs/Moonshotai/_category_.json:
--------------------------------------------------------------------------------
1 | {
2 | "label": "Moonshotai",
3 | "position": 7
4 | }
5 |
--------------------------------------------------------------------------------
/static/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sgl-project/sgl-cookbook/HEAD/static/img/logo.png
--------------------------------------------------------------------------------
/static/img/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sgl-project/sgl-cookbook/HEAD/static/img/favicon.png
--------------------------------------------------------------------------------
/src/pages/index.js:
--------------------------------------------------------------------------------
1 | import {Redirect} from '@docusaurus/router';
2 |
3 | export default function Home() {
4 |   return <Redirect to="/docs/intro" />;
5 | }
6 |
--------------------------------------------------------------------------------
/src/components/HomepageFeatures/styles.module.css:
--------------------------------------------------------------------------------
1 | .features {
2 | display: flex;
3 | align-items: center;
4 | padding: 2rem 0;
5 | width: 100%;
6 | }
7 |
8 | .featureSvg {
9 | height: 200px;
10 | width: 200px;
11 | }
12 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Dependencies
2 | /node_modules
3 |
4 | # Production
5 | /build
6 |
7 | # Generated files
8 | .docusaurus
9 | .cache-loader
10 |
11 | # Misc
12 | .DS_Store
13 | .env.local
14 | .env.development.local
15 | .env.test.local
16 | .env.production.local
17 |
18 | npm-debug.log*
19 | yarn-debug.log*
20 | yarn-error.log*
21 |
--------------------------------------------------------------------------------
/docs/GLM/GLM-4.5.md:
--------------------------------------------------------------------------------
1 | # GLM-4.5
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **GLM-4.5** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-glm-4-5-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [GLM-4.6](/docs/GLM/GLM-4.6.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/GLM/GLM-4.5V.md:
--------------------------------------------------------------------------------
1 | # GLM-4.5V
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **GLM-4.5V** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-glm-4-5v-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [GLM-4.6V](/docs/GLM/GLM-4.6V.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/GLM/Glyph.md:
--------------------------------------------------------------------------------
1 | # GLM Glyph
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **GLM Glyph** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-glm-glyph-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [GLM-4.6](/docs/GLM/GLM-4.6.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/InternLM/Intern-S1.md:
--------------------------------------------------------------------------------
1 | # Intern-S1
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **Intern-S1** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-intern-s1-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [GLM-4.6V](/docs/GLM/GLM-4.6V.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/Qwen/Qwen2.5-VL.md:
--------------------------------------------------------------------------------
1 | # Qwen2.5-VL
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **Qwen2.5-VL** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-qwen2-5-vl-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [Qwen3-VL](/docs/Qwen/Qwen3-VL.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/Ernie/Ernie4.5-VL.md:
--------------------------------------------------------------------------------
1 | # Ernie4.5-VL
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **Ernie4.5-VL** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-ernie4-5-vl-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [GLM-4.6V](/docs/GLM/GLM-4.6V.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/Llama/Llama3.1.md:
--------------------------------------------------------------------------------
1 | # Llama3.1 Usage Guide
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **Llama3.1** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-llama3-1-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [Qwen3](/docs/Qwen/Qwen3.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/Llama/Llama3.3-70B.md:
--------------------------------------------------------------------------------
1 | # Llama3.3-70B
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **Llama3.3-70B** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-llama3-3-70b-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [Qwen3](/docs/Qwen/Qwen3.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/Ernie/Ernie4.5.md:
--------------------------------------------------------------------------------
1 | # Ernie4.5
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **Ernie4.5** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-ernie4-5-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [DeepSeek-V3.2](/docs/DeepSeek/DeepSeek-V3_2.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/InternVL/InternVL3_5.md:
--------------------------------------------------------------------------------
1 | # InternVL3.5
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **InternVL3.5** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-internvl3-5-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [GLM-4.6V](/docs/GLM/GLM-4.6V.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/MiniMax/MiniMax-M2.md:
--------------------------------------------------------------------------------
1 | # MiniMax-M2
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **MiniMax-M2** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-minimax-m2-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [Kimi-K2](/docs/Moonshotai/Kimi-K2.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/Llama/Llama4-Scout.md:
--------------------------------------------------------------------------------
1 | # Llama4-Scout Usage Guide
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **Llama4-Scout** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-llama4-scout-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [Qwen3](/docs/Qwen/Qwen3.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/Moonshotai/Kimi-Linear.md:
--------------------------------------------------------------------------------
1 | # Kimi-Linear
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **Kimi-Linear** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-kimi-linear-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [Kimi-K2](/docs/Moonshotai/Kimi-K2.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/DeepSeek/DeepSeek-OCR.md:
--------------------------------------------------------------------------------
1 | # DeepSeek-OCR
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **DeepSeek-OCR** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-deepseek-ocr-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [DeepSeek-V3.2](/docs/DeepSeek/DeepSeek-V3_2.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/DeepSeek/DeepSeek-V3_1.md:
--------------------------------------------------------------------------------
1 | # DeepSeek-V3.1
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **DeepSeek-V3.1** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-deepseek-v3-1-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [DeepSeek-V3.2](/docs/DeepSeek/DeepSeek-V3_2.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/Jina/Jina-reranker-m0.md:
--------------------------------------------------------------------------------
1 | # Jina-reranker-m0
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **Jina-reranker-m0** with SGLang, please help us complete this documentation.
8 |
9 | ## 🚀 How to Contribute
10 |
11 | ```shell
12 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
13 | cd sglang-cookbook
14 | git checkout -b add-jina-reranker-m0-guide
15 | # Edit this file and submit a PR
16 | ```
17 |
18 | ## 📚 Reference
19 |
20 | - [DeepSeek-V3.2](/docs/DeepSeek/DeepSeek-V3_2.md)
21 |
22 | ---
23 |
24 | **Let's build this together!** 🌟
25 |
--------------------------------------------------------------------------------
/docs/DeepSeek/DeepSeek-V3.md:
--------------------------------------------------------------------------------
1 | # DeepSeek-V3
2 |
3 | ## 📝 Community Contribution Welcome
4 |
5 | This guide is currently under development. We welcome community contributions!
6 |
7 | If you have experience deploying **DeepSeek-V3** with SGLang, please help us complete this documentation by:
8 |
9 | - Sharing your deployment configurations and optimization tips
10 | - Adding code examples and troubleshooting guides
11 | - Documenting best practices
12 |
13 | ## 🚀 How to Contribute
14 |
15 | ```shell
16 | # Fork the repository
17 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
18 | cd sglang-cookbook
19 | git checkout -b add-deepseek-v3-guide
20 |
21 | # Edit this file and follow the format of existing guides
22 | # Submit a Pull Request
23 | ```
24 |
25 | ## 📚 Reference
26 |
27 | - [DeepSeek-V3.2](/docs/DeepSeek/DeepSeek-V3_2.md)
28 |
29 | ---
30 |
31 | **Let's build this together!** 🌟
32 |
--------------------------------------------------------------------------------
/sidebars.js:
--------------------------------------------------------------------------------
1 | // @ts-check
2 |
3 | // This runs in Node.js - Don't use client-side code here (browser APIs, JSX...)
4 |
5 | /**
6 | * Creating a sidebar enables you to:
7 | - create an ordered group of docs
8 | - render a sidebar for each doc of that group
9 | - provide next/previous navigation
10 |
11 | The sidebars can be generated from the filesystem, or explicitly defined here.
12 |
13 | Create as many sidebars as you want.
14 |
15 | @type {import('@docusaurus/plugin-content-docs').SidebarsConfig}
16 | */
17 | const sidebars = {
18 | // By default, Docusaurus generates a sidebar from the docs folder structure
19 | tutorialSidebar: [{type: 'autogenerated', dirName: '.'}],
20 |
21 | // But you can create a sidebar manually
22 | /*
23 | tutorialSidebar: [
24 | 'intro',
25 | 'hello',
26 | {
27 | type: 'category',
28 | label: 'Tutorial',
29 | items: ['tutorial-basics/create-a-document'],
30 | },
31 | ],
32 | */
33 | };
34 |
35 | export default sidebars;
36 |
--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | name: Deploy Docusaurus
2 |
3 | on:
4 | workflow_dispatch:
5 |
6 | jobs:
7 | build-deploy:
8 | runs-on: ubuntu-latest
9 |
10 | steps:
11 | - name: Checkout repository
12 | uses: actions/checkout@v4
13 |
14 | - name: Setup Node.js
15 | uses: actions/setup-node@v4
16 | with:
17 | node-version: 20
18 | cache: npm
19 |
20 | - name: Install dependencies
21 | run: npm ci
22 |
23 | - name: Build Docusaurus
24 | run: npm run build
25 |
26 | - name: Install Vercel CLI
27 | run: npm install -g vercel
28 |
29 | - name: Deploy to Vercel
30 | run: |
31 | vercel deploy build --prod \
32 | --yes \
33 | --force \
34 | --scope ${{ secrets.VERCEL_ORG_ID }} \
35 | --token ${{ secrets.VERCEL_TOKEN }}
36 | env:
37 | VERCEL_ORG_ID: ${{ secrets.VERCEL_ORG_ID }}
38 | VERCEL_PROJECT_ID: ${{ secrets.VERCEL_PROJECT_ID }}
39 |
--------------------------------------------------------------------------------
/src/css/custom.css:
--------------------------------------------------------------------------------
1 | /**
2 | * Any CSS included here will be global. The classic template
3 | * bundles Infima by default. Infima is a CSS framework designed to
4 | * work well for content-centric websites.
5 | */
6 |
7 | /* You can override the default Infima variables here. */
8 | :root {
9 | --ifm-color-primary: #2e8555;
10 | --ifm-color-primary-dark: #29784c;
11 | --ifm-color-primary-darker: #277148;
12 | --ifm-color-primary-darkest: #205d3b;
13 | --ifm-color-primary-light: #33925d;
14 | --ifm-color-primary-lighter: #359962;
15 | --ifm-color-primary-lightest: #3cad6e;
16 | --ifm-code-font-size: 95%;
17 | --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1);
18 | }
19 |
20 | /* For readability concerns, you should choose a lighter palette in dark mode. */
21 | [data-theme='dark'] {
22 | --ifm-color-primary: #25c2a0;
23 | --ifm-color-primary-dark: #21af90;
24 | --ifm-color-primary-darker: #1fa588;
25 | --ifm-color-primary-darkest: #1a8870;
26 | --ifm-color-primary-light: #29d5b0;
27 | --ifm-color-primary-lighter: #32d8b4;
28 | --ifm-color-primary-lightest: #4fddbf;
29 | --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.3);
30 | }
31 |
--------------------------------------------------------------------------------
/docs/Mistral/Mistral-3.md:
--------------------------------------------------------------------------------
1 | ---
2 | sidebar_position: 2
3 | ---
4 |
5 | # Mistral 3
6 |
7 | :::info Community contribution welcome
8 | This guide is currently under development. If you have experience deploying **Mistral 3** with SGLang, please help us complete this documentation.
9 |
10 | To contribute, fork the repo, edit this page, and open a PR.
11 | :::
12 |
13 | ## 1. Model Introduction
14 |
15 | This page will cover practical deployment configs and usage patterns for **Mistral 3** with SGLang.
16 |
17 | ## 2. SGLang Installation
18 |
19 | Please refer to the [official SGLang installation guide](https://docs.sglang.ai/get_started/install.html).
20 |
21 | ## 3. Model Deployment
22 |
23 | Coming soon: recommended launch configs (TP/PP, quantization, context length) and tuning tips.
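
Until those are written up, the general launch pattern used across this cookbook looks like the sketch below (the model ID is a placeholder and `--tp 2` is an assumption; substitute the released Mistral 3 checkpoint and match `--tp` to your GPU count):

```shell
python -m sglang.launch_server \
  --model-path mistralai/<Mistral-3-checkpoint> \
  --tp 2 \
  --host 0.0.0.0 \
  --port 30000
```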
24 |
25 | ## 4. Model Invocation
26 |
27 | Coming soon: OpenAI-compatible API examples and tool-calling notes.
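
As a starting point, the server exposes an OpenAI-compatible API, so a basic chat request can be sent with `curl` (the port and model name must match your launch command):

```shell
curl http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "mistralai/<Mistral-3-checkpoint>",
    "messages": [{"role": "user", "content": "Summarize what SGLang does in one sentence."}],
    "max_tokens": 64
  }'
```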
28 |
29 | ## Contributing
30 |
31 | ```shell
32 | git clone https://github.com/YOUR_USERNAME/sgl-cookbook.git
33 | cd sgl-cookbook
34 | git checkout -b add-mistral-3-guide
35 | # Edit this file and submit a PR
36 | ```
37 |
38 | ## Reference
39 |
40 | - [Devstral 2](/docs/Mistral/Devstral-2.md)
41 |
--------------------------------------------------------------------------------
/docs/Qwen/Qwen3-Coder-480B-A35B.md:
--------------------------------------------------------------------------------
1 | ---
2 | sidebar_position: 4
3 | ---
4 |
5 | # Qwen3-Coder-480B-A35B
6 |
7 | :::info Community contribution welcome
8 | This guide is currently under development. If you have experience deploying **Qwen3-Coder-480B-A35B** with SGLang, please help us complete this documentation.
9 |
10 | To contribute, fork the repo, edit this page, and open a PR.
11 | :::
12 |
13 | ## 1. Model Introduction
14 |
15 | This page will cover practical deployment configs and usage patterns for **Qwen3-Coder-480B-A35B** with SGLang.
16 |
17 | ## 2. SGLang Installation
18 |
19 | Please refer to the [official SGLang installation guide](https://docs.sglang.ai/get_started/install.html).
20 |
21 | ## 3. Model Deployment
22 |
23 | Coming soon: recommended launch configs (TP/PP, quantization, context length) and tuning tips.
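
Until those are written up, here is a minimal sketch based on the launch pattern used elsewhere in this cookbook (the FP8 checkpoint name and `--tp 8` below are assumptions for a single 8-GPU node; verify both against the official model card):

```shell
python -m sglang.launch_server \
  --model-path Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 \
  --tp 8 \
  --tool-call-parser qwen3_coder \
  --host 0.0.0.0 \
  --port 30000
```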
24 |
25 | ## 4. Model Invocation
26 |
27 | Coming soon: OpenAI-compatible API examples and tool-calling notes.
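
As a starting point, the server exposes an OpenAI-compatible API, so a basic coding request can be sent with `curl` (the port and model name must match your launch command):

```shell
curl http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8",
    "messages": [{"role": "user", "content": "Write a Python function that reads a CSV file into a list of dicts."}],
    "max_tokens": 256
  }'
```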
28 |
29 | ## Contributing
30 |
31 | ```shell
32 | git clone https://github.com/YOUR_USERNAME/sgl-cookbook.git
33 | cd sgl-cookbook
34 | git checkout -b add-qwen3-coder-480b-guide
35 | # Edit this file and submit a PR
36 | ```
37 |
38 | ## Reference
39 |
40 | - [Qwen3](./Qwen3)
41 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "sglang-cookbook",
3 | "version": "0.0.0",
4 | "private": true,
5 | "scripts": {
6 | "docusaurus": "docusaurus",
7 | "start": "docusaurus start",
8 | "build": "docusaurus build",
9 | "swizzle": "docusaurus swizzle",
10 | "deploy": "docusaurus deploy",
11 | "clear": "docusaurus clear",
12 | "serve": "docusaurus serve",
13 | "write-translations": "docusaurus write-translations",
14 | "write-heading-ids": "docusaurus write-heading-ids"
15 | },
16 | "dependencies": {
17 | "@docusaurus/core": "3.9.2",
18 | "@docusaurus/preset-classic": "3.9.2",
19 | "@mdx-js/react": "^3.0.0",
20 | "clsx": "^2.0.0",
21 | "prism-react-renderer": "^2.3.0",
22 | "react": "^19.0.0",
23 | "react-dom": "^19.0.0"
24 | },
25 | "devDependencies": {
26 | "@docusaurus/module-type-aliases": "3.9.2",
27 | "@docusaurus/types": "3.9.2"
28 | },
29 | "browserslist": {
30 | "production": [
31 | ">0.5%",
32 | "not dead",
33 | "not op_mini all"
34 | ],
35 | "development": [
36 | "last 3 chrome version",
37 | "last 3 firefox version",
38 | "last 5 safari version"
39 | ]
40 | },
41 | "engines": {
42 | "node": ">=20.0"
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/src/components/HomepageFeatures/index.js:
--------------------------------------------------------------------------------
1 | import clsx from 'clsx';
2 | import Heading from '@theme/Heading';
3 | import styles from './styles.module.css';
4 |
5 | const FeatureList = [
6 | {
7 | title: 'Easy to Use',
8 | Svg: require('@site/static/img/undraw_docusaurus_mountain.svg').default,
9 | description: (
10 | <>
11 | Docusaurus was designed from the ground up to be easily installed and
12 | used to get your website up and running quickly.
13 |       </>
14 | ),
15 | },
16 | {
17 | title: 'Focus on What Matters',
18 | Svg: require('@site/static/img/undraw_docusaurus_tree.svg').default,
19 | description: (
20 | <>
21 | Docusaurus lets you focus on your docs, and we'll do the chores. Go
22 | ahead and move your docs into the docs directory.
23 |       </>
24 | ),
25 | },
26 | {
27 | title: 'Powered by React',
28 | Svg: require('@site/static/img/undraw_docusaurus_react.svg').default,
29 | description: (
30 | <>
31 | Extend or customize your website layout by reusing React. Docusaurus can
32 | be extended while reusing the same header and footer.
33 |       </>
34 | ),
35 | },
36 | ];
37 |
38 | function Feature({Svg, title, description}) {
39 |   return (
40 |     <div className={clsx('col col--4')}>
41 |       <div className="text--center">
42 |         <Svg className={styles.featureSvg} role="img" />
43 |       </div>
44 |       <div className="text--center padding-horiz--md">
45 |         <Heading as="h3">{title}</Heading>
46 |         <p>{description}</p>
47 |       </div>
48 |     </div>
49 |   );
50 | }
51 |
52 | export default function HomepageFeatures() {
53 |   return (
54 |     <section className={styles.features}>
55 |       <div className="container">
56 |         <div className="row">
57 |           {FeatureList.map((props, idx) => (
58 |             <Feature key={idx} {...props} />
59 |           ))}
60 |         </div>
61 |       </div>
62 |     </section>
63 |   );
64 | }
65 |
--------------------------------------------------------------------------------
/src/components/Llama31ConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ConfigGenerator from '../ConfigGenerator';
3 |
4 | /**
5 | * Llama 3.1 70B Configuration Generator
6 | */
7 | const Llama31ConfigGenerator = () => {
8 | const config = {
9 | modelFamily: 'meta-llama',
10 |
11 | options: {
12 | hardware: {
13 | name: 'hardware',
14 | title: 'Hardware Platform',
15 | items: [
16 | { id: 'h100', label: 'H100 (4x)', default: true },
17 | { id: 'h200', label: 'H200 (4x)', default: false }
18 | ]
19 | },
20 | optimization: {
21 | name: 'optimization',
22 | title: 'Optimization Mode',
23 | items: [
24 | { id: 'basic', label: 'Basic', default: true },
25 | { id: 'throughput', label: 'Throughput Optimized', default: false },
26 | { id: 'latency', label: 'Latency Optimized', default: false }
27 | ]
28 | }
29 | },
30 |
31 | generateCommand: function(values) {
32 | const { hardware, optimization } = values;
33 |
34 | let cmd = 'python3 -m sglang.launch_server \\\n';
35 | cmd += ` --model meta-llama/Llama-3.1-70B-Instruct \\\n`;
36 | cmd += ` --tp 4`;
37 |
38 | if (optimization === 'throughput') {
39 | cmd += ` \\\n --enable-dp-attention \\\n`;
40 | cmd += ` --mem-fraction-static 0.85`;
41 | } else if (optimization === 'latency') {
42 | cmd += ` \\\n --speculative-algorithm EAGLE \\\n`;
43 | cmd += ` --speculative-num-steps 3 \\\n`;
44 | cmd += ` --speculative-eagle-topk 1 \\\n`;
45 | cmd += ` --speculative-num-draft-tokens 4 \\\n`;
46 | cmd += ` --disable-shared-experts-fusion \\\n`;
47 | cmd += ` --max-running-requests 64 \\\n`;
48 | cmd += ` --mem-fraction-static 0.85 \\\n`;
49 | cmd += ` --kv-cache-dtype fp8_e4m3 \\\n`;
50 | cmd += ` --context-length 32768 \\\n`;
51 | cmd += ` --quantization fp8`;
52 | }
53 |
54 | cmd += ` \\\n --host 0.0.0.0 \\\n`;
55 | cmd += ` --port 8000`;
56 |
57 | return cmd;
58 | }
59 | };
60 |
61 |   return <ConfigGenerator config={config} />;
62 | };
63 |
64 | export default Llama31ConfigGenerator;
65 |
66 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SGLang Cookbook
2 |
3 | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
4 | [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/sgl-project/sgl-cookbook/pulls)
5 |
6 | A community-maintained repository of practical guides and recipes for deploying and using SGLang in production environments. Our mission is simple: answer the question **"How do I use SGLang (and related models) on hardware Y for task Z?"** with clear, actionable solutions.
7 |
8 | ## 🎯 What You'll Find Here
9 |
10 | This cookbook aggregates battle-tested SGLang recipes covering:
11 |
12 | - **Models**: Mainstream LLMs and Vision-Language Models (VLMs)
13 | - **Use Cases**: Inference serving, deployment strategies, multimodal applications
14 | - **Hardware**: GPU and CPU configurations, optimization for different accelerators
15 | - **Best Practices**: Configuration templates, performance tuning, troubleshooting guides
16 |
17 | Each recipe provides step-by-step instructions to help you quickly implement SGLang solutions for your specific requirements.
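
Most guides center on an `sglang.launch_server` command tuned for a specific model and GPU setup. For example (illustrative only; see the individual model pages for vetted flags):

```shell
python -m sglang.launch_server \
  --model Qwen/Qwen3-Next-80B-A3B-Instruct \
  --tp 2
```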
18 |
19 | ## 🚀 Quick Start
20 |
21 | 1. Browse the recipe index above to find your model
22 | 2. Follow the step-by-step instructions in each guide
23 | 3. Adapt configurations to your specific hardware and requirements
24 | 4. Join our community to share feedback and improvements
25 |
26 | ## 🤝 Contributing
27 |
28 | We believe the best documentation comes from practitioners. Whether you've optimized SGLang for a specific model, solved a tricky deployment challenge, or discovered performance improvements, we encourage you to contribute your recipes!
29 |
30 | **Ways to contribute:**
31 |
32 | - Add a new recipe for a model not yet covered
33 | - Improve existing recipes with additional tips or configurations
34 | - Report issues or suggest enhancements
35 | - Share your production deployment experiences
36 |
37 | **To contribute:**
38 |
39 | ```shell
40 | # Fork the repo and clone locally
41 | git clone https://github.com/YOUR_USERNAME/sglang-cookbook.git
42 | cd sglang-cookbook
43 |
44 | # Create a new branch
45 | git checkout -b add-my-recipe
46 |
47 | # Add your recipe following the template in DeepSeek-V3.2
48 | # Submit a PR!
49 | ```
50 |
51 | ## 🛠️ Local Development
52 |
53 | ### Prerequisites
54 |
55 | - Node.js >= 20.0
56 | - npm or yarn
57 |
58 | ### Setup and Run
59 |
60 | Install dependencies and start the development server:
61 |
62 | ```shell
63 | # Install dependencies
64 | npm install
65 |
66 | # Start development server (hot reload enabled)
67 | npm start
68 | ```
69 |
70 | The site will automatically open in your browser at `http://localhost:3000`.
71 |
72 | ## 📖 Resources
73 |
74 | - [SGLang GitHub](https://github.com/sgl-project/sglang)
75 | - [SGLang Documentation](https://sgl-project.github.io)
76 | - [Community Slack/Discord](https://discord.gg/MpEEuAeb)
77 |
78 | ## 📄 License
79 |
80 | This project is licensed under the Apache License 2.0 - see the [LICENSE](https://github.com/sgl-project/sgl-cookbook/blob/main/LICENSE) file for details.
81 |
82 | ---
83 |
84 | **Let's build this resource together!** 🚀 Star the repo and contribute your recipes to help the SGLang community grow.
85 |
--------------------------------------------------------------------------------
/src/components/Devstral2ConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ConfigGenerator from '../ConfigGenerator';
3 |
4 | /**
5 | * Devstral 2 Configuration Generator
6 | * Covers:
7 | * - mistralai/Devstral-Small-2-24B-Instruct-2512
8 | * - mistralai/Devstral-2-123B-Instruct-2512 (FP8 weights)
9 | */
10 | const Devstral2ConfigGenerator = () => {
11 | const config = {
12 | modelFamily: 'Mistral',
13 |
14 | options: {
15 | hardware: {
16 | name: 'hardware',
17 | title: 'Hardware Platform',
18 | items: [
19 | { id: 'b200', label: 'B200', default: true },
20 | { id: 'h200', label: 'H200', default: false },
21 | { id: 'h100', label: 'H100', default: false }
22 | ]
23 | },
24 | model: {
25 | name: 'model',
26 | title: 'Model',
27 | items: [
28 | { id: 'small', label: 'Devstral Small 2 (24B)', default: true },
29 | { id: 'large', label: 'Devstral 2 (123B)', default: false }
30 | ]
31 | },
32 | weights: {
33 | name: 'weights',
34 | title: 'Weights / Precision',
35 | items: [
36 | { id: 'fp8', label: 'FP8', default: true }
37 | ]
38 | },
39 | toolcall: {
40 | name: 'toolcall',
41 | title: 'Tool Call Parser',
42 | items: [
43 | { id: 'disabled', label: 'Disabled', default: true },
44 | { id: 'enabled', label: 'Enabled', default: false }
45 | ],
46 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser mistral' : null
47 | }
48 | },
49 |
50 | modelConfigs: {
51 | small: {
52 | modelId: 'mistralai/Devstral-Small-2-24B-Instruct-2512',
53 | tpByHardware: { h100: 1, h200: 1, b200: 1 },
54 | allowedWeights: ['fp8']
55 | },
56 | large: {
57 | modelId: 'mistralai/Devstral-2-123B-Instruct-2512',
58 | tpByHardware: { h100: 4, h200: 2, b200: 2 },
59 | allowedWeights: ['fp8']
60 | }
61 | },
62 |
63 | generateCommand: function (values) {
64 | const { hardware, model, weights } = values;
65 |
66 | const modelCfg = this.modelConfigs[model];
67 | if (!modelCfg) return `# Error: Unknown model selection: ${model}`;
68 |
69 | if (!modelCfg.allowedWeights.includes(weights)) {
70 | const allowed = modelCfg.allowedWeights.map(w => w.toUpperCase()).join(', ');
71 | return `# Error: ${modelCfg.modelId} only supports: ${allowed}\n# Please change "Weights / Precision" to a supported value.`;
72 | }
73 |
74 | const tp = modelCfg.tpByHardware[hardware];
75 | if (!tp) return `# Error: Unknown hardware platform: ${hardware}`;
76 |
77 | let cmd = 'python -m sglang.launch_server \\\n';
78 | cmd += ` --model ${modelCfg.modelId}`;
79 |
80 | if (tp > 1) {
81 | cmd += ` \\\n --tp ${tp}`;
82 | }
83 |
84 | // Append optional flags (e.g. tool calling)
85 | for (const [key, option] of Object.entries(this.options)) {
86 | if (option.commandRule) {
87 | const rule = option.commandRule(values[key]);
88 | if (rule) cmd += ` \\\n ${rule}`;
89 | }
90 | }
91 |
92 | return cmd;
93 | }
94 | };
95 |
96 |   return <ConfigGenerator config={config} />;
97 | };
98 |
99 | export default Devstral2ConfigGenerator;
100 |
101 |
--------------------------------------------------------------------------------
/src/components/ConfigGenerator/styles.module.css:
--------------------------------------------------------------------------------
1 | .configContainer {
2 | max-width: 1000px;
3 | margin: 0 auto;
4 | display: flex;
5 | flex-wrap: wrap;
6 | gap: 12px;
7 | }
8 |
9 | .optionCard {
10 | background: var(--ifm-background-surface-color);
11 | padding: 16px;
12 | border-radius: 10px;
13 | box-shadow: 0 3px 10px rgba(0, 0, 0, 0.08);
14 | border: 1px solid var(--ifm-color-emphasis-300);
15 | flex: 1 1 calc(50% - 6px);
16 | min-width: 400px;
17 | }
18 |
19 | .optionTitle {
20 | font-size: 14px;
21 | font-weight: 600;
22 | color: var(--ifm-font-color-base);
23 | margin-bottom: 10px;
24 | display: flex;
25 | align-items: center;
26 | }
27 |
28 | .optionNumber {
29 | background: #667eea;
30 | color: white;
31 | width: 22px;
32 | height: 22px;
33 | border-radius: 50%;
34 | display: inline-flex;
35 | align-items: center;
36 | justify-content: center;
37 | margin-right: 8px;
38 | font-size: 12px;
39 | }
40 |
41 | .optionItems {
42 | display: flex;
43 | gap: 8px;
44 | flex-wrap: wrap;
45 | align-items: center;
46 | }
47 |
48 | .hiddenInput {
49 | display: none;
50 | }
51 |
52 | .textInput {
53 | flex: 1;
54 | min-width: 200px;
55 | padding: 10px 14px;
56 | border: 2px solid var(--ifm-color-emphasis-300);
57 | border-radius: 6px;
58 | font-size: 14px;
59 | transition: all 0.3s;
60 | background: var(--ifm-background-surface-color);
61 | color: var(--ifm-font-color-base);
62 | }
63 |
64 | .textInput:focus {
65 | outline: none;
66 | border-color: #667eea;
67 | box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
68 | }
69 |
70 | .optionLabel {
71 | padding: 8px 18px;
72 | border: 2px solid var(--ifm-color-emphasis-300);
73 | border-radius: 6px;
74 | cursor: pointer;
75 | display: inline-block;
76 | font-weight: 500;
77 | font-size: 13px;
78 | transition: all 0.3s;
79 | background: var(--ifm-background-surface-color);
80 | color: var(--ifm-font-color-base);
81 | user-select: none;
82 | flex-shrink: 0;
83 | }
84 |
85 | .optionLabel:hover {
86 | border-color: #667eea;
87 | }
88 |
89 | .optionLabel.checked {
90 | background: #dc3545;
91 | color: white;
92 | border-color: #d55816;
93 | }
94 |
95 | .optionLabel.disabled {
96 | cursor: not-allowed;
97 | opacity: 0.7;
98 | }
99 |
100 | .optionLabel.disabled:hover {
101 | border-color: var(--ifm-color-emphasis-300);
102 | }
103 |
104 | .subtitle {
105 | display: block;
106 | color: var(--ifm-color-emphasis-600);
107 | font-size: 10px;
108 | margin-top: 2px;
109 | }
110 |
111 | .optionLabel.checked .subtitle {
112 | color: rgba(255, 255, 255, 0.85);
113 | }
114 |
115 | .commandCard {
116 | background: var(--ifm-background-surface-color);
117 | padding: 16px;
118 | border-radius: 10px;
119 | box-shadow: 0 3px 10px rgba(0, 0, 0, 0.08);
120 | border: 1px solid var(--ifm-color-emphasis-300);
121 | flex: 1 1 100%;
122 | width: 100%;
123 | }
124 |
125 | .commandTitle {
126 | font-size: 15px;
127 | font-weight: 600;
128 | color: var(--ifm-font-color-base);
129 | margin-bottom: 10px;
130 | }
131 |
132 | .commandDisplay {
133 | padding: 16px;
134 | background: #2d3748;
135 | border-radius: 6px;
136 | font-family: 'Menlo', 'Monaco', 'Courier New', monospace;
137 | font-size: 13px;
138 | line-height: 1.7;
139 | color: #e2e8f0;
140 | white-space: pre-wrap;
141 | overflow-x: auto;
142 | border: none;
143 | margin: 0;
144 | }
145 |
146 |
--------------------------------------------------------------------------------
/src/components/Llama4ScoutConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ConfigGenerator from '../ConfigGenerator';
3 |
4 | /**
5 | * Llama 4-Scout Configuration Generator
6 | */
7 | const Llama4ScoutConfigGenerator = () => {
8 | const config = {
9 | modelFamily: 'meta-llama',
10 |
11 | options: {
12 | hardware: {
13 | name: 'hardware',
14 | title: 'Hardware Platform',
15 | items: [
16 | { id: 'b200', label: 'B200', default: false },
17 | { id: 'h100', label: 'H100', default: true },
18 | { id: 'h200', label: 'H200', default: false }
19 | ]
20 | },
21 | quantization: {
22 | name: 'quantization',
23 | title: 'Quantization',
24 | items: [
25 | { id: 'bf16', label: 'BF16', default: true },
26 | { id: 'fp8', label: 'FP8', default: false }
27 | ]
28 | },
29 | toolcall: {
30 | name: 'toolcall',
31 | title: 'Tool Call Parser',
32 | items: [
33 | { id: 'disabled', label: 'Disabled', default: true },
34 | { id: 'enabled', label: 'Enabled', default: false }
35 | ]
36 | },
37 | speculative: {
38 | name: 'speculative',
39 | title: 'Speculative Decoding (EAGLE3)',
40 | items: [
41 | { id: 'disabled', label: 'Disabled', default: true },
42 | { id: 'enabled', label: 'Enable EAGLE3', default: false }
43 | ]
44 | },
45 | host: {
46 | name: 'host',
47 | title: 'Host',
48 | type: 'text',
49 | default: '0.0.0.0',
50 | placeholder: '0.0.0.0'
51 | },
52 | port: {
53 | name: 'port',
54 | title: 'Port',
55 | type: 'text',
56 | default: '8000',
57 | placeholder: '8000'
58 | }
59 | },
60 |
61 | generateCommand: function(values) {
62 | const { hardware, quantization, toolcall, speculative, host, port } = values;
63 |
64 | let cmd = 'python -m sglang.launch_server \\\n';
65 | cmd += ` --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct`;
66 |
67 | if (hardware === 'h100' || hardware === 'h200') {
68 | cmd += ` \\\n --tp 8`;
69 | } else if (hardware === 'b200') {
70 | cmd += ` \\\n --tp 8`;
71 | }
72 |
73 | if (quantization === 'fp8') {
74 | cmd += ` \\\n --quantization fp8`;
75 | }
76 |
77 | if (toolcall === 'enabled') {
78 | cmd += ` \\\n --tool-call-parser pythonic`;
79 | }
80 |
81 | if (speculative === 'enabled') {
82 | cmd += ` \\\n --speculative-algorithm EAGLE3 \\\n`;
83 | cmd += ` --speculative-draft-model-path lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1 \\\n`;
84 | cmd += ` --speculative-num-steps 3 \\\n`;
85 | cmd += ` --speculative-eagle-topk 1 \\\n`;
86 | cmd += ` --speculative-num-draft-tokens 4 \\\n`;
87 | cmd += ` --mem-fraction-static 0.75 \\\n`;
88 | cmd += ` --cuda-graph-max-bs 2`;
89 | }
90 |
91 | cmd += ` \\\n --enable-multimodal`;
92 | cmd += ` \\\n --context-length 65536`;
93 | cmd += ` \\\n --dtype bfloat16`;
94 | cmd += ` \\\n --trust-remote-code`;
95 | cmd += ` \\\n --host ${host || '0.0.0.0'}`;
96 | cmd += ` \\\n --port ${port || '8000'}`;
97 |
98 | return cmd;
99 | }
100 | };
101 |
102 |   return <ConfigGenerator config={config} />;
103 | };
104 |
105 | export default Llama4ScoutConfigGenerator;
106 |
107 |
--------------------------------------------------------------------------------
/src/components/KimiK2ConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ConfigGenerator from '../ConfigGenerator';
3 |
4 | /**
5 | * Kimi-K2 Configuration Generator
6 | * Supports Kimi-K2-Instruct and Kimi-K2-Thinking models
7 | */
8 | const KimiK2ConfigGenerator = () => {
9 | const config = {
10 | modelFamily: 'moonshotai',
11 |
12 | options: {
13 | hardware: {
14 | name: 'hardware',
15 | title: 'Hardware Platform',
16 | items: [
17 | { id: 'h200', label: 'H200', default: true },
18 | { id: 'b200', label: 'B200', default: false }
19 | ]
20 | },
21 | modelname: {
22 | name: 'modelname',
23 | title: 'Model Name',
24 | items: [
25 | { id: 'instruct', label: 'Kimi-K2-Instruct', default: true },
26 | { id: 'thinking', label: 'Kimi-K2-Thinking', default: false }
27 | ]
28 | },
29 | strategy: {
30 | name: 'strategy',
31 | title: 'Deployment Strategy',
32 | type: 'checkbox',
33 | items: [
34 | { id: 'tp', label: 'TP', default: true, required: true },
35 | { id: 'dp', label: 'DP attention', default: false },
36 | { id: 'ep', label: 'EP', default: false }
37 | ]
38 | },
39 | reasoning: {
40 | name: 'reasoning',
41 | title: 'Reasoning Parser',
42 | items: [
43 | { id: 'disabled', label: 'Disabled', default: true },
44 | { id: 'enabled', label: 'Enabled', default: false }
45 | ]
46 | },
47 | toolcall: {
48 | name: 'toolcall',
49 | title: 'Tool Call Parser',
50 | items: [
51 | { id: 'disabled', label: 'Disabled', default: true },
52 | { id: 'enabled', label: 'Enabled', default: false }
53 | ]
54 | }
55 | },
56 |
57 | generateCommand: function (values) {
58 | const { hardware, modelname, strategy, reasoning, toolcall } = values;
59 |
60 | // Validation: Kimi-K2-Instruct doesn't support reasoning parser
61 | if (modelname === 'instruct' && reasoning === 'enabled') {
62 | return `# Error: Kimi-K2-Instruct doesn't support reasoning parser\n# Please select "Disabled" for Reasoning Parser or choose Kimi-K2-Thinking model`;
63 | }
64 |
65 | // Model name mapping
66 | const modelMap = {
67 | 'instruct': 'Kimi-K2-Instruct',
68 | 'thinking': 'Kimi-K2-Thinking'
69 | };
70 |
71 | const modelName = `${this.modelFamily}/${modelMap[modelname]}`;
72 |
73 | let cmd = 'python3 -m sglang.launch_server \\\n';
74 | cmd += ` --model-path ${modelName}`;
75 |
76 | // Strategy configurations
77 | const strategyArray = Array.isArray(strategy) ? strategy : [];
78 | // TP is mandatory
79 | cmd += ` \\\n --tp 8`;
80 | if (strategyArray.includes('dp')) {
81 | cmd += ` \\\n --dp 4 \\\n --enable-dp-attention`;
82 | }
83 | if (strategyArray.includes('ep')) {
84 | cmd += ` \\\n --ep 4`;
85 | }
86 |
87 | // Add trust-remote-code (required for Kimi-K2)
88 | cmd += ` \\\n --trust-remote-code`;
89 |
90 | // Add tool-call-parser if enabled
91 | if (toolcall === 'enabled') {
92 | cmd += ` \\\n --tool-call-parser kimi_k2`;
93 | }
94 |
95 | // Add reasoning-parser if enabled
96 | if (reasoning === 'enabled') {
97 | cmd += ` \\\n --reasoning-parser kimi_k2`;
98 | }
99 |
100 | return cmd;
101 | }
102 | };
103 |
104 |   return <ConfigGenerator config={config} />;
105 | };
106 |
107 | export default KimiK2ConfigGenerator;
108 |
109 |
--------------------------------------------------------------------------------
/src/components/InternS1ConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ConfigGenerator from '../ConfigGenerator';
3 |
4 | /**
5 | * Intern-S1 Configuration Generator
6 | * Supports Intern-S1 (235B MOE) and Intern-S1-mini (8B Dense) models
7 | */
8 | const InternS1ConfigGenerator = () => {
9 | const config = {
10 | modelFamily: 'Intern',
11 |
12 | options: {
13 | hardware: {
14 | name: 'hardware',
15 | title: 'Hardware Platform',
16 | items: [
17 | { id: 'b200', label: 'B200', default: true },
18 | { id: 'h100', label: 'H100', default: false },
19 | { id: 'h200', label: 'H200', default: false }
20 | ]
21 | },
22 | modelsize: {
23 | name: 'modelsize',
24 | title: 'Model Size',
25 | items: [
26 | { id: 'S1', label: '235b', subtitle: 'MOE', default: true },
27 | { id: 'S1-mini', label: '8b', subtitle: 'Dense', default: false }
28 | ]
29 | },
30 | quantization: {
31 | name: 'quantization',
32 | title: 'Quantization',
33 | items: [
34 | { id: 'bf16', label: 'BF16', default: true },
35 | { id: 'fp8', label: 'FP8', default: false }
36 | ]
37 | },
38 | reasoning_parser: {
39 | name: 'reasoning_parser',
40 | title: 'Reasoning Parser',
41 | items: [
42 | { id: 'disabled', label: 'Disabled', default: true },
43 | { id: 'enabled', label: 'Enabled', default: false }
44 | ]
45 | },
46 | toolcall: {
47 | name: 'toolcall',
48 | title: 'Tool Call Parser',
49 | items: [
50 | { id: 'disabled', label: 'Disabled', default: true },
51 | { id: 'enabled', label: 'Enabled', default: false }
52 | ]
53 | }
54 | },
55 |
56 | modelConfigs: {
57 | 'S1': {
58 | baseName: 'S1',
59 | isMOE: true,
60 | h100: { tp: 8, ep: 0, bf16: true, fp8: true },
61 | h200: { tp: 8, ep: 0, bf16: true, fp8: true },
62 | b200: { tp: 8, ep: 0, bf16: true, fp8: true }
63 | },
64 | 'S1-mini': {
65 | baseName: 'S1-mini',
66 | isMOE: false,
67 | h100: { tp: 1, ep: 0, bf16: true, fp8: true },
68 | h200: { tp: 1, ep: 0, bf16: true, fp8: true },
69 | b200: { tp: 1, ep: 0, bf16: true, fp8: true }
70 | }
71 | },
72 |
73 | generateCommand: function(values) {
74 | const { hardware, modelsize, quantization, reasoning_parser, toolcall } = values;
75 |
76 | const modelConfig = this.modelConfigs[modelsize];
77 | if (!modelConfig) {
78 | return `# Error: Unknown model size: ${modelsize}`;
79 | }
80 |
81 | const hwConfig = modelConfig[hardware];
82 | if (!hwConfig) {
83 | return `# Error: Unknown hardware platform: ${hardware}`;
84 | }
85 |
86 | const quantSuffix = quantization === 'fp8' ? '-FP8' : '';
87 | const modelName = `internlm/Intern-${modelConfig.baseName}${quantSuffix}`;
88 |
89 | let cmd = 'python -m sglang.launch_server \\\n';
90 | cmd += ` --model ${modelName}`;
91 |
92 | if (hwConfig.tp > 1) {
93 | cmd += ` \\\n --tp ${hwConfig.tp}`;
94 | }
95 |
96 | let ep = hwConfig.ep;
97 | if (quantization === 'fp8' && hwConfig.tp === 8) {
98 | ep = 2;
99 | }
100 |
101 | if (ep > 0) {
102 | cmd += ` \\\n --ep ${ep}`;
103 | }
104 |
105 | if (reasoning_parser === 'enabled') {
106 | cmd += ` \\\n --reasoning-parser interns1`;
107 | }
108 |
109 | if (toolcall === 'enabled') {
110 | cmd += ` \\\n --tool-call-parser interns1`;
111 | }
112 |
113 | cmd += ` \\\n --trust-remote-code`;
114 |
115 | return cmd;
116 | }
117 | };
118 |
119 |   return <ConfigGenerator config={config} />;
120 | };
121 |
122 | export default InternS1ConfigGenerator;
123 |
124 |
--------------------------------------------------------------------------------
/src/components/NemotronConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ConfigGenerator from '../ConfigGenerator';
3 |
4 | /**
5 | * NVIDIA Nemotron-Nano-3-30B-A3B Configuration Generator
6 | */
7 | const NemotronNano3ConfigGenerator = () => {
8 | const config = {
9 | modelFamily: 'nvidia',
10 |
11 | options: {
12 | hardware: {
13 | name: 'hardware',
14 | title: 'Hardware Platform',
15 | items: [
16 | { id: 'h200', label: 'H200', default: false },
17 | { id: 'b200', label: 'B200', default: true }
18 | ]
19 | },
20 | modelVariant: {
21 | name: 'modelVariant',
22 | title: 'Model Variant',
23 | items: [
24 | { id: 'bf16', label: 'BF16', default: true },
25 | { id: 'fp8', label: 'FP8', default: false }
26 | ]
27 | },
28 | tp: {
29 | name: 'tp',
30 | title: 'Tensor Parallel (TP)',
31 | items: [
32 | { id: '1', label: 'TP=1', default: true },
33 | { id: '2', label: 'TP=2', default: false },
34 | { id: '4', label: 'TP=4', default: false },
35 | { id: '8', label: 'TP=8', default: false }
36 | ]
37 | },
38 | kvcache: {
39 | name: 'kvcache',
40 | title: 'KV Cache DType',
41 | items: [
42 | { id: 'fp8_e4m3', label: 'fp8_e4m3', default: true },
43 | { id: 'bf16', label: 'bf16', default: false }
44 | ]
45 | },
46 | thinking: {
47 | name: 'thinking',
48 | title: 'Reasoning Parser',
49 | items: [
50 | { id: 'disabled', label: 'Disabled', default: true },
51 | { id: 'enabled', label: 'Enabled', default: false }
52 | ],
53 | commandRule: (value) => value === 'enabled' ? '--reasoning-parser nano_v3' : null
54 | },
55 | toolcall: {
56 | name: 'toolcall',
57 | title: 'Tool Call Parser',
58 | items: [
59 | { id: 'disabled', label: 'Disabled', default: true },
60 | { id: 'enabled', label: 'Enabled', default: false }
61 | ],
62 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser qwen3_coder' : null
63 | },
64 | host: {
65 | name: 'host',
66 | title: 'Host',
67 | type: 'text',
68 | default: '0.0.0.0',
69 | placeholder: '0.0.0.0'
70 | },
71 | port: {
72 | name: 'port',
73 | title: 'Port',
74 | type: 'text',
75 | default: '30000',
76 | placeholder: '30000'
77 | }
78 | },
79 |
80 | generateCommand: function(values) {
81 | const { hardware, modelVariant, tp, kvcache, thinking, toolcall, host, port } = values;
82 |
83 | // Default to FP8 if not selected
84 | const variant = modelVariant || 'fp8';
85 | const baseName = 'NVIDIA-Nemotron-3-Nano-30B-A3B';
86 |
87 | const modelName =
88 | variant === 'bf16'
89 | ? `${this.modelFamily}/${baseName}-BF16`
90 | : `${this.modelFamily}/${baseName}-FP8`;
91 |
92 | let cmd = 'python3 -m sglang.launch_server \\\n';
93 | cmd += ` --model-path ${modelName} \\\n`;
94 | cmd += ` --trust-remote-code \\\n`;
95 | cmd += ` --tp ${tp} \\\n`;
96 | cmd += ` --kv-cache-dtype ${kvcache} \\\n`;
97 |
98 | // Add thinking parser and tool call parser if enabled
99 | for (const [key, option] of Object.entries(this.options)) {
100 | if (option.commandRule) {
101 | const rule = option.commandRule(values[key]);
102 | if (rule) {
103 | cmd += ` ${rule} \\\n`;
104 | }
105 | }
106 | }
107 |
108 |
109 | cmd += ` --host ${host || '0.0.0.0'} \\\n`;
110 | cmd += ` --port ${port || '30000'}`;
111 |
112 | return cmd;
113 | }
114 | };
115 |
116 |   return <ConfigGenerator config={config} />;
117 | };
118 |
119 | export default NemotronNano3ConfigGenerator;
120 |
--------------------------------------------------------------------------------
/src/components/Qwen3NextConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ConfigGenerator from '../ConfigGenerator';
3 |
4 | /**
5 | * Qwen3-Next Configuration Generator
6 | * Supports Qwen3-Next 80B model with speculative decoding option
7 | */
8 | const Qwen3NextConfigGenerator = () => {
9 | const config = {
10 | modelFamily: 'Qwen',
11 |
12 | options: {
13 | hardware: {
14 | name: 'hardware',
15 | title: 'Hardware Platform',
16 | items: [
17 | { id: 'b200', label: 'B200', default: true },
18 | { id: 'h200', label: 'H200', default: false },
19 | { id: 'h100', label: 'H100', default: false }
20 | ]
21 | },
22 | modelsize: {
23 | name: 'modelsize',
24 | title: 'Model Size',
25 | items: [
26 | { id: '80b', label: '80B', subtitle: 'MOE', default: true },
27 | ]
28 | },
29 | quantization: {
30 | name: 'quantization',
31 | title: 'Quantization',
32 | items: [
33 | { id: 'bf16', label: 'BF16', subtitle: 'Full Weights', default: true },
34 | { id: 'fp8', label: 'FP8', subtitle: 'High Throughput', default: false }
35 | ]
36 | },
37 | thinking: {
38 | name: 'thinking',
39 | title: 'Thinking Capabilities',
40 | items: [
41 | { id: 'instruct', label: 'Instruct', subtitle: 'General Purpose', default: true },
42 | { id: 'thinking', label: 'Thinking', subtitle: 'Reasoning / CoT', default: false }
43 | ],
44 | commandRule: (value) => value === 'thinking' ? '--reasoning-parser qwen3' : null
45 | },
46 | toolcall: {
47 | name: 'toolcall',
48 | title: 'Tool Call Parser',
49 | items: [
50 | { id: 'disabled', label: 'Disabled', default: true },
51 | { id: 'enabled', label: 'Enabled', default: false }
52 | ],
53 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser qwen' : null
54 | },
55 | speculative: {
56 | name: 'speculative',
57 | title: 'Speculative Decoding',
58 | items: [
59 | { id: 'disabled', label: 'Disabled', default: true },
60 | { id: 'enabled', label: 'Enabled', default: false }
61 | ],
62 | commandRule: (value) => value === 'enabled' ? '--speculative-algorithm EAGLE \\\n --speculative-num-steps 3 \\\n --speculative-eagle-topk 1 \\\n --speculative-num-draft-tokens 4' : null
63 | }
64 | },
65 |
66 | modelConfigs: {
67 | '80b': {
68 | baseName: '80B-A3B',
69 | isMOE: true,
70 | h100: { tp: 4, ep: 0, bf16: true, fp8: true },
71 | h200: { tp: 2, ep: 0, bf16: true, fp8: true },
72 | b200: { tp: 2, ep: 0, bf16: true, fp8: true }
73 | }
74 | },
75 |
76 | generateCommand: function (values) {
77 | const { hardware, modelsize: modelSize, quantization, thinking } = values;
78 | const commandKey = `${hardware}-${modelSize}-${quantization}-${thinking}`;
79 |
80 | const config = this.modelConfigs[modelSize];
81 | if (!config) {
82 | return `# Error: Unknown model size: ${modelSize}`;
83 | }
84 |
85 | const hwConfig = config[hardware];
86 | if (!hwConfig) {
87 | return `# Error: Unknown hardware platform: ${hardware}`;
88 | }
89 |
90 | const quantSuffix = quantization === 'fp8' ? '-FP8' : '';
91 | const thinkingSuffix = thinking === 'thinking' ? '-Thinking' : '-Instruct';
92 | const modelName = `Qwen/Qwen3-Next-${config.baseName}${thinkingSuffix}${quantSuffix}`;
93 |
94 | let cmd = 'python -m sglang.launch_server \\\n';
95 | cmd += ` --model ${modelName}`;
96 |
97 | if (hwConfig.tp > 1) {
98 | cmd += ` \\\n --tp ${hwConfig.tp}`;
99 | }
100 |
101 | let ep = hwConfig.ep;
102 | if (quantization === 'fp8' && hwConfig.tp === 8) {
103 | ep = 2;
104 | }
105 |
106 | if (ep > 0) {
107 | cmd += ` \\\n --ep ${ep}`;
108 | }
109 |
110 | for (const [key, option] of Object.entries(this.options)) {
111 |
112 | if (option.commandRule) {
113 | const rule = option.commandRule(values[key]);
114 | if (rule) {
115 | cmd += ` \\\n ${rule}`;
116 | }
117 | }
118 | }
119 |
120 | return cmd;
121 | }
122 | };
123 |
124 |   return <ConfigGenerator config={config} />;
125 | };
126 |
127 | export default Qwen3NextConfigGenerator;
128 |
129 |
--------------------------------------------------------------------------------
/src/components/GLM46VConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ConfigGenerator from '../ConfigGenerator';
3 |
4 | /**
5 | * GLM-4.6V Configuration Generator
6 | * Supports GLM-4.6V (106B) and GLM-4.6V-Flash (9B) models
7 | */
8 | const GLM46VConfigGenerator = () => {
9 | const config = {
10 | modelFamily: 'GLM',
11 |
12 | options: {
13 | hardware: {
14 | name: 'hardware',
15 | title: 'Hardware Platform',
16 | items: [
17 | { id: 'b200', label: 'B200', default: true },
18 | { id: 'h100', label: 'H100', default: false },
19 | { id: 'h200', label: 'H200', default: false }
20 | ]
21 | },
22 | modelsize: {
23 | name: 'modelsize',
24 | title: 'Model Size',
25 | items: [
26 | { id: '106b', label: '106B', subtitle: 'GLM-4.6V', default: true },
27 | { id: '9b', label: '9B', subtitle: 'GLM-4.6V-Flash', default: false }
28 | ]
29 | },
30 | quantization: {
31 | name: 'quantization',
32 | title: 'Quantization',
33 | items: [
34 | { id: 'bf16', label: 'BF16', default: true },
35 | { id: 'fp8', label: 'FP8', default: false }
36 | ]
37 | },
38 | reasoning: {
39 | name: 'reasoning',
40 | title: 'Reasoning Parser',
41 | items: [
42 | { id: 'enabled', label: 'Enabled', default: true },
43 | { id: 'disabled', label: 'Disabled', default: false }
44 | ],
45 | commandRule: (value) => value === 'enabled' ? '--reasoning-parser glm45' : null
46 | },
47 | toolcall: {
48 | name: 'toolcall',
49 | title: 'Tool Call Parser',
50 | items: [
51 | { id: 'enabled', label: 'Enabled', default: true },
52 | { id: 'disabled', label: 'Disabled', default: false }
53 | ],
54 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser glm45' : null
55 | },
56 | host: {
57 | name: 'host',
58 | title: 'Host',
59 | type: 'text',
60 | default: '0.0.0.0',
61 | placeholder: '0.0.0.0'
62 | },
63 | port: {
64 | name: 'port',
65 | title: 'Port',
66 | type: 'text',
67 | default: '30000',
68 | placeholder: '30000'
69 | }
70 | },
71 |
72 | modelConfigs: {
73 | '106b': {
74 | baseName: 'GLM-4.6V',
75 | h100: { tp: 8, bf16: true, fp8: true },
76 | h200: { tp: 8, bf16: true, fp8: true },
77 | b200: { tp: 8, bf16: true, fp8: true }
78 | },
79 | '9b': {
80 | baseName: 'GLM-4.6V-Flash',
81 | h100: { tp: 1, bf16: true, fp8: true },
82 | h200: { tp: 1, bf16: true, fp8: true },
83 | b200: { tp: 1, bf16: true, fp8: true }
84 | }
85 | },
86 |
87 | specialCommands: {},
88 |
89 | generateCommand: function (values) {
90 | const { hardware, modelsize: modelSize, quantization, reasoning, toolcall } = values;
91 | const commandKey = `${hardware}-${modelSize}-${quantization}`;
92 |
93 | if (this.specialCommands[commandKey]) {
94 | return this.specialCommands[commandKey];
95 | }
96 |
97 | const config = this.modelConfigs[modelSize];
98 | if (!config) {
99 | return `# Error: Unknown model size: ${modelSize}`;
100 | }
101 |
102 | const hwConfig = config[hardware];
103 | if (!hwConfig) {
104 | return `# Error: Unknown hardware platform: ${hardware}`;
105 | }
106 |
107 | const quantSuffix = quantization === 'fp8' ? '-FP8' : '';
108 | const modelName = `zai-org/${config.baseName}${quantSuffix}`;
109 |
110 | let cmd = 'python -m sglang.launch_server \\\n';
111 | cmd += ` --model ${modelName}`;
112 |
113 | if (hwConfig.tp > 1) {
114 | cmd += ` \\\n --tp ${hwConfig.tp}`;
115 | }
116 |
117 | for (const [key, option] of Object.entries(this.options)) {
118 | if (key === 'host' || key === 'port') continue;
119 |
120 | if (option.commandRule) {
121 | const rule = option.commandRule(values[key]);
122 | if (rule) {
123 | cmd += ` \\\n ${rule}`;
124 | }
125 | }
126 | }
127 |
128 |       const host = values.host || this.options.host.default;
129 |       const port = values.port || this.options.port.default;
130 | cmd += ` \\\n --host ${host} \\\n --port ${port}`;
131 |
132 | return cmd;
133 | }
134 | };
135 |
136 |   return <ConfigGenerator config={config} />;
137 | };
138 |
139 | export default GLM46VConfigGenerator;
140 |
141 |
--------------------------------------------------------------------------------
/src/components/DeepSeekR1ConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ConfigGenerator from '../ConfigGenerator';
3 |
4 | const DeepSeekR1ConfigGenerator = () => {
5 | const config = {
6 | modelFamily: 'deepseek-ai',
7 |
8 | options: {
9 | hardware: {
10 | name: 'hardware',
11 | title: 'Hardware Platform',
12 | items: [
13 | { id: 'h100', label: 'H100', default: false },
14 | { id: 'h200', label: 'H200', default: false },
15 | { id: 'b200', label: 'B200', default: true }
16 | ]
17 | },
18 | quantization: {
19 | name: 'quantization',
20 | title: 'Quantization',
21 | items: [
22 | { id: 'fp8', label: 'FP8', default: true },
23 | { id: 'fp4', label: 'FP4', default: false }
24 | ]
25 | },
26 | strategy: {
27 | name: 'strategy',
28 | title: 'Deployment Strategy',
29 | type: 'checkbox',
30 | items: [
31 | { id: 'tp', label: 'TP', subtitle: 'Tensor Parallel', default: true, required: true },
32 | { id: 'dp', label: 'DP', subtitle: 'Data Parallel', default: false },
33 | { id: 'ep', label: 'EP', subtitle: 'Expert Parallel', default: false },
34 | { id: 'mtp', label: 'MTP', subtitle: 'Multi-token Prediction', default: false }
35 | ]
36 | },
37 | thinking: {
38 | name: 'thinking',
39 | title: 'Reasoning Parser',
40 | items: [
41 | { id: 'disabled', label: 'Disabled', default: true },
42 | { id: 'enabled', label: 'Enabled', default: false }
43 | ],
44 | commandRule: (value) => value === 'enabled' ? '--reasoning-parser deepseek-r1' : null
45 | },
46 | toolcall: {
47 | name: 'toolcall',
48 | title: 'Tool Call Parser',
49 | items: [
50 | { id: 'disabled', label: 'Disabled', default: true },
51 | { id: 'enabled', label: 'Enabled', default: false }
52 | ],
53 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser deepseekv3 \\\n --chat-template examples/chat_template/tool_chat_template_deepseekr1.jinja' : null
54 | }
55 | },
56 |
57 | generateCommand: function (values) {
58 | const { hardware, quantization, strategy } = values;
59 |
60 | const strategyArray = Array.isArray(strategy) ? strategy : [];
61 |
62 | // Validation checks
63 | // Check H100 compatibility - H100 only supports FP8
64 | if (hardware === 'h100' && quantization === 'fp4') {
65 | return '# Error: H100 only supports FP8 quantization\n# Please select FP8 quantization or use B200 hardware';
66 | }
67 |
68 | // Model path based on quantization
69 | let modelPath = '';
70 | if (quantization === 'fp8') {
71 | modelPath = 'deepseek-ai/DeepSeek-R1-0528';
72 | } else if (quantization === 'fp4') {
73 | modelPath = 'nvidia/DeepSeek-R1-0528-FP4-v2';
74 | }
75 |
76 | let cmd = 'python3 -m sglang.launch_server \\\n';
77 | cmd += ` --model-path ${modelPath}`;
78 |
79 | // TP strategy
80 | if (strategyArray.includes('tp')) {
81 | cmd += ` \\\n --tp 8`;
82 | }
83 |
84 | // DP strategy
85 | if (strategyArray.includes('dp')) {
86 | cmd += ` \\\n --dp 8 \\\n --enable-dp-attention`;
87 | }
88 |
89 | // EP strategy
90 | if (strategyArray.includes('ep')) {
91 | cmd += ` \\\n --ep 8`;
92 | }
93 |
94 | // MTP strategy
95 | if (strategyArray.includes('mtp')) {
96 | cmd = 'SGLANG_ENABLE_SPEC_V2=1 ' + cmd;
97 | cmd += ` \\\n --speculative-algorithm EAGLE \\\n --speculative-num-steps 3 \\\n --speculative-eagle-topk 1 \\\n --speculative-num-draft-tokens 4`;
98 | }
99 |
100 |     // Add thinking parser and tool call parser if enabled
101 |     for (const [key, option] of Object.entries(this.options)) {
102 |       if (option.commandRule) {
103 |         const rule = option.commandRule(values[key]);
104 |         if (rule) {
105 |           cmd += ` \\\n ${rule}`;
106 |         }
107 |       }
108 |     }
109 |
110 |     // Optional flags go last: their inline "#" comments would swallow the trailing backslash and break any shell continuation line placed after them.
111 |     if (hardware === 'b200') {
112 |       // fp8 kv cache enables fp8 attention kernels for better performance on B200
113 |       cmd += ` \\\n --kv-cache-dtype fp8_e4m3`;
114 |     }
115 |     cmd += ` \\\n --enable-symm-mem # Optional: improves performance, but may be unstable`;
116 |     return cmd;
117 | }
118 | };
119 |
120 |   return <ConfigGenerator config={config} />;
121 | };
122 |
123 | export default DeepSeekR1ConfigGenerator;
124 |
125 |
--------------------------------------------------------------------------------
/src/components/GLM46ConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ConfigGenerator from '../ConfigGenerator';
3 |
4 | /**
5 | * GLM-4.6 Configuration Generator
6 | * Supports GLM-4.6 model deployment configuration
7 | */
8 | const GLM46ConfigGenerator = () => {
9 | const config = {
10 | modelFamily: 'zai-org',
11 |
12 | options: {
13 | hardware: {
14 | name: 'hardware',
15 | title: 'Hardware Platform',
16 | items: [
17 | { id: 'h100', label: 'H100', default: true },
18 | { id: 'h200', label: 'H200', default: false },
19 | { id: 'b200', label: 'B200', default: false }
20 | ]
21 | },
22 | quantization: {
23 | name: 'quantization',
24 | title: 'Quantization',
25 | items: [
26 | { id: 'bf16', label: 'BF16', default: true },
27 | { id: 'fp8', label: 'FP8', default: false }
28 | ]
29 | },
30 | strategy: {
31 | name: 'strategy',
32 | title: 'Deployment Strategy',
33 | type: 'checkbox',
34 | items: [
35 | { id: 'tp', label: 'TP', subtitle: 'Tensor Parallel', default: true, required: true },
36 | { id: 'dp', label: 'DP', subtitle: 'Data Parallel', default: false },
37 | { id: 'ep', label: 'EP', subtitle: 'Expert Parallel', default: false },
38 | { id: 'mtp', label: 'MTP', subtitle: 'Multi-token Prediction', default: false }
39 | ]
40 | },
41 | thinking: {
42 | name: 'thinking',
43 | title: 'Thinking Capabilities',
44 | items: [
45 | { id: 'disabled', label: 'Disabled', default: true },
46 | { id: 'enabled', label: 'Enabled', default: false }
47 | ],
48 | commandRule: (value) => value === 'enabled' ? '--reasoning-parser glm45' : null
49 | },
50 | toolcall: {
51 | name: 'toolcall',
52 | title: 'Tool Call Parser',
53 | items: [
54 | { id: 'disabled', label: 'Disabled', default: true },
55 | { id: 'enabled', label: 'Enabled', default: false }
56 | ],
57 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser glm45' : null
58 | }
59 | },
60 |
61 | specialCommands: {
62 | 'h100-bf16-tp': '# Error: GLM-4.6 in BF16 precision requires more VRAM than 8*H100\n# Please use H200/B200 or select FP8 quantization',
63 | 'h100-bf16-dp': '# Error: GLM-4.6 in BF16 precision requires more VRAM than 8*H100\n# Please use H200/B200 or select FP8 quantization',
64 | 'h100-bf16-ep': '# Error: GLM-4.6 in BF16 precision requires more VRAM than 8*H100\n# Please use H200/B200 or select FP8 quantization',
65 | 'h100-bf16-mtp': '# Error: GLM-4.6 in BF16 precision requires more VRAM than 8*H100\n# Please use H200/B200 or select FP8 quantization'
66 | },
67 |
68 | generateCommand: function (values) {
69 | const { hardware, quantization, strategy, thinking, toolcall } = values;
70 |
71 | // Check for H100 + BF16 error
72 | const strategyArray = Array.isArray(strategy) ? strategy : [];
73 | if (hardware === 'h100' && quantization === 'bf16') {
74 | return '# Error: GLM-4.6 in BF16 precision requires more VRAM than 8*H100\n# Please use H200/B200 or select FP8 quantization';
75 | }
76 |
77 | const modelSuffix = quantization === 'fp8' ? '-FP8' : '';
78 | const modelName = `${this.modelFamily}/GLM-4.6${modelSuffix}`;
79 |
80 | let cmd = 'python -m sglang.launch_server \\\n';
81 | cmd += ` --model ${modelName}`;
82 |
83 | // TP is mandatory
84 | cmd += ` \\\n --tp 8`;
85 |
86 | // Strategy-specific parameters
87 | if (strategyArray.includes('dp')) {
88 | cmd += ` \\\n --dp 8 \\\n --enable-dp-attention`;
89 | }
90 | if (strategyArray.includes('ep')) {
91 | cmd += ` \\\n --ep 8`;
92 | }
93 | if (strategyArray.includes('mtp')) {
94 | cmd = 'SGLANG_ENABLE_SPEC_V2=1 ' + cmd;
95 | cmd += ` \\\n --speculative-algorithm EAGLE \\\n --speculative-num-steps 3 \\\n --speculative-eagle-topk 1 \\\n --speculative-num-draft-tokens 4`;
96 | }
97 |
98 | // Add tool call parser if enabled
99 | if (toolcall === 'enabled') {
100 | cmd += ` \\\n --tool-call-parser glm45`;
101 | }
102 |
103 | // Add thinking parser if enabled
104 | if (thinking === 'enabled') {
105 | cmd += ` \\\n --reasoning-parser glm45`;
106 | }
107 |
108 | return cmd;
109 | }
110 | };
111 |
112 |   return <ConfigGenerator config={config} />;
113 | };
114 |
115 | export default GLM46ConfigGenerator;
116 |
117 |
--------------------------------------------------------------------------------
/docs/intro.md:
--------------------------------------------------------------------------------
1 | ---
2 | sidebar_position: 1
3 | ---
4 |
5 | # SGLang Cookbook
6 |
7 | [License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
8 | [PRs Welcome](https://github.com/sgl-project/sgl-cookbook/pulls)
9 |
10 | A community-maintained repository of practical guides and recipes for deploying and using SGLang in production environments. Our mission is simple: answer the question **"How do I use SGLang (and related models) on hardware Y for task Z?"** with clear, actionable solutions.
11 |
12 | ## 🎯 What You'll Find Here
13 |
14 | This cookbook aggregates battle-tested SGLang recipes covering:
15 |
16 | - **Models**: Mainstream LLMs and Vision-Language Models (VLMs)
17 | - **Use Cases**: Inference serving, deployment strategies, multimodal applications
18 | - **Hardware**: GPU and CPU configurations, optimization for different accelerators
19 | - **Best Practices**: Configuration templates, performance tuning, troubleshooting guides
20 |
21 | Each recipe provides step-by-step instructions to help you quickly implement SGLang solutions for your specific requirements.
22 |
23 | ## Guides
24 |
25 | ### DeepSeek
26 |
27 | - [x] [DeepSeek-V3.2](./DeepSeek/DeepSeek-V3_2)
28 | - [ ] [DeepSeek-V3.1](./DeepSeek/DeepSeek-V3_1)
29 | - [ ] [DeepSeek-V3](./DeepSeek/DeepSeek-V3)
30 | - [x] [DeepSeek-R1](./DeepSeek/DeepSeek-R1)
31 |
32 | ### Ernie
33 |
34 | - [ ] [Ernie4.5](./Ernie/Ernie4.5)
35 | - [ ] [Ernie4.5-VL](./Ernie/Ernie4.5-VL)
36 |
37 | ### GLM
38 |
39 | - [ ] [Glyph](./GLM/Glyph)
40 | - [ ] [GLM-4.5V](./GLM/GLM-4.5V)
41 | - [x] [GLM-4.6](./GLM/GLM-4.6)
42 | - [x] [GLM-4.6V](./GLM/GLM-4.6V)
43 |
44 | ### InternVL
45 |
46 | - [ ] [InternVL3.5](./InternVL/InternVL3_5)
47 |
48 | ### InternLM
49 |
50 | - [ ] [Intern-S1](./InternLM/Intern-S1)
51 |
52 | ### Jina AI
53 |
54 | - [ ] [Jina-reranker-m0](./Jina/Jina-reranker-m0)
55 |
56 | ### Llama
57 |
58 | - [ ] [Llama4-Scout](./Llama/Llama4-Scout)
59 | - [ ] [Llama3.3-70B](./Llama/Llama3.3-70B)
60 | - [ ] [Llama3.1](./Llama/Llama3.1)
61 |
62 | ### MiniMax
63 |
64 | - [ ] [MiniMax-M2](./MiniMax/MiniMax-M2)
65 |
66 | ### Mistral
67 |
68 | - [x] [Devstral 2](./Mistral/Devstral-2)
69 | - [ ] [Mistral-3](./Mistral/Mistral-3)
70 |
71 | ### OpenAI
72 |
73 | - [ ] [gpt-oss](./OpenAI/GPT-OSS)
74 |
75 | ### Qwen
76 |
77 | - [x] [Qwen3](./Qwen/Qwen3)
78 | - [x] [Qwen3-VL](./Qwen/Qwen3-VL)
79 | - [x] [Qwen3-Next](./Qwen/Qwen3-Next)
80 | - [ ] [Qwen3-Coder-480B-A35B](./Qwen/Qwen3-Coder-480B-A35B)
81 | - [ ] [Qwen2.5-VL](./Qwen/Qwen2.5-VL)
82 |
83 | ### Moonshotai
84 |
85 | - [x] [Kimi-K2](./Moonshotai/Kimi-K2)
86 | - [ ] [Kimi-Linear](./Moonshotai/Kimi-Linear)
87 |
88 | ### NVIDIA
89 |
90 | - [x] [Nemotron-3-Nano-30B-A3B](./NVIDIA/Nemotron3-Nano)
91 |
92 | ## 🚀 Quick Start
93 |
94 | 1. Browse the recipe index above to find your model
95 | 2. Follow the step-by-step instructions in each guide
96 | 3. Adapt configurations to your specific hardware and requirements
97 | 4. Join our community to share feedback and improvements
98 |
99 | ## 🤝 Contributing
100 |
101 | We believe the best documentation comes from practitioners. Whether you've optimized SGLang for a specific model, solved a tricky deployment challenge, or discovered performance improvements, we encourage you to contribute your recipes!
102 |
103 | **Ways to contribute:**
104 |
105 | - Add a new recipe for a model not yet covered
106 | - Improve existing recipes with additional tips or configurations
107 | - Report issues or suggest enhancements
108 | - Share your production deployment experiences
109 |
110 | **To contribute:**
111 |
112 | ```shell
113 | # Fork the repo and clone locally
114 | git clone https://github.com/YOUR_USERNAME/sgl-cookbook.git
115 | cd sgl-cookbook
116 |
117 | # Create a new branch
118 | git checkout -b add-my-recipe
119 |
120 | # Add your recipe following the template in DeepSeek-V3.2
121 | # Submit a PR!
122 | ```
123 |
124 | ## 🛠️ Local Development
125 |
126 | ### Prerequisites
127 |
128 | - Node.js >= 20.0
129 | - npm or yarn
130 |
131 | ### Setup and Run
132 |
133 | Install dependencies and start the development server:
134 |
135 | ```shell
136 | # Install dependencies
137 | npm install
138 |
139 | # Start development server (hot reload enabled)
140 | npm start
141 | ```
142 |
143 | The site will automatically open in your browser at `http://localhost:3000`.
144 |
145 | ## 📖 Resources
146 |
147 | - [SGLang GitHub](https://github.com/sgl-project/sglang)
148 | - [SGLang Documentation](https://sgl-project.github.io)
149 | - [Community Slack/Discord](https://discord.gg/MpEEuAeb)
150 |
151 | ## 📄 License
152 |
153 | This project is licensed under the Apache License 2.0 - see the [LICENSE](https://github.com/sgl-project/sgl-cookbook/blob/main/LICENSE) file for details.
154 |
155 | ---
156 |
157 | **Let's build this resource together!** 🚀 Star the repo and contribute your recipes to help the SGLang community grow.
158 |
--------------------------------------------------------------------------------
/src/components/DeepSeekConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ConfigGenerator from '../ConfigGenerator';
3 |
4 | /**
5 | * DeepSeek V3.2 Configuration Generator
6 | * Supports DeepSeek-V3.2, V3.2-Speciale, and V3.2-Exp models
7 | */
8 | const DeepSeekConfigGenerator = () => {
9 | const config = {
10 | modelFamily: 'deepseek-ai',
11 |
12 | options: {
13 | hardware: {
14 | name: 'hardware',
15 | title: 'Hardware Platform',
16 | items: [
17 | { id: 'h200', label: 'H200', default: true },
18 | { id: 'b200', label: 'B200', default: false }
19 | ]
20 | },
21 | modelname: {
22 | name: 'modelname',
23 | title: 'Model Name',
24 | items: [
25 | { id: 'v32', label: 'DeepSeek-V3.2', default: true },
26 | { id: 'v32speciale', label: 'DeepSeek-V3.2-Speciale', default: false },
27 | { id: 'v32exp', label: 'DeepSeek-V3.2-Exp', default: false }
28 | ]
29 | },
30 | strategy: {
31 | name: 'strategy',
32 | title: 'Deployment Strategy',
33 | type: 'checkbox',
34 | items: [
35 | { id: 'tp', label: 'TP', default: true, required: true },
36 | { id: 'dp', label: 'DP attention', default: false },
37 | { id: 'ep', label: 'EP', default: false },
38 | { id: 'mtp', label: 'Multi-token Prediction', default: false }
39 | ]
40 | },
41 | reasoningParser: {
42 | name: 'reasoningParser',
43 | title: 'Reasoning Parser',
44 | items: [
45 | { id: 'disabled', label: 'Disabled', default: true },
46 | { id: 'enabled', label: 'Enabled', default: false }
47 | ],
48 | commandRule: (value) => value === 'enabled' ? '--reasoning-parser deepseek-v3' : null
49 | },
50 | toolcall: {
51 | name: 'toolcall',
52 | title: 'Tool Call Parser',
53 | items: [
54 | { id: 'disabled', label: 'Disabled', default: true },
55 | { id: 'enabled', label: 'Enabled', default: false }
56 | ]
57 | }
58 | },
59 |
60 | generateCommand: function(values) {
61 | const { hardware, modelname, strategy, reasoningParser, toolcall } = values;
62 |
63 | // Validation: DeepSeek-V3.2-Speciale doesn't support tool calling
64 | if (modelname === 'v32speciale' && toolcall === 'enabled') {
65 | return `# Error: DeepSeek-V3.2-Speciale doesn't support tool calling\n# Please select "Disabled" for Tool Call Parser or choose a different model`;
66 | }
67 |
68 | // Model name mapping
69 | const modelMap = {
70 | 'v32': 'DeepSeek-V3.2',
71 | 'v32exp': 'DeepSeek-V3.2-Exp',
72 | 'v32speciale': 'DeepSeek-V3.2-Speciale'
73 | };
74 |
75 | const modelName = `${this.modelFamily}/${modelMap[modelname]}`;
76 |
77 | let cmd = 'python3 -m sglang.launch_server \\\n';
78 | cmd += ` --model-path ${modelName}`;
79 |
80 | // Strategy configurations
81 | const strategyArray = Array.isArray(strategy) ? strategy : [];
82 | // TP is mandatory
83 | cmd += ` \\\n --tp 8`;
84 | if (strategyArray.includes('dp')) {
85 | cmd += ` \\\n --dp 8 \\\n --enable-dp-attention`;
86 | }
87 | if (strategyArray.includes('ep')) {
88 | cmd += ` \\\n --ep 8`;
89 | }
90 | // Multi-token prediction (MTP) configuration
91 | if (strategyArray.includes('mtp')) {
92 | cmd += ` \\\n --speculative-algorithm EAGLE \\\n --speculative-num-steps 3 \\\n --speculative-eagle-topk 1 \\\n --speculative-num-draft-tokens 4`;
93 | }
94 |
95 | // Add tool-call-parser if enabled (not supported for Speciale)
96 | if (toolcall === 'enabled' && modelname !== 'v32speciale') {
97 | if (modelname === 'v32exp') {
98 | cmd += ` \\\n --tool-call-parser deepseekv31`;
99 | } else if (modelname === 'v32') {
100 | cmd += ` \\\n --tool-call-parser deepseekv32`;
101 | }
102 | }
103 |
104 | // Add reasoning-parser when enabled
105 | if (reasoningParser === 'enabled') {
106 | cmd += ` \\\n --reasoning-parser deepseek-v3`;
107 | }
108 |
109 | // Add chat-template if tool calling is enabled (only for v32exp)
110 | if (toolcall === 'enabled' && modelname === 'v32exp') {
111 | cmd += ` \\\n --chat-template ./examples/chat_template/tool_chat_template_deepseekv32.jinja`;
112 | }
113 |
114 | return cmd;
115 | }
116 | };
117 |
118 |   return <ConfigGenerator config={config} />;
119 | };
120 |
121 | export default DeepSeekConfigGenerator;
122 |
123 |
--------------------------------------------------------------------------------
/src/components/GPTOSSConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ConfigGenerator from '../ConfigGenerator';
3 |
4 | /**
5 | * GPT-OSS Configuration Generator
6 | * Supports GPT-OSS 120B and 20B models with speculative decoding
7 | */
8 | const GPTOSSConfigGenerator = () => {
9 | const config = {
10 | modelFamily: 'GPT-OSS',
11 |
12 | options: {
13 | hardware: {
14 | name: 'hardware',
15 | title: 'Hardware Platform',
16 | items: [
17 | { id: 'b200', label: 'B200', default: true },
18 | { id: 'h200', label: 'H200', default: false },
19 | { id: 'h100', label: 'H100', default: false }
20 | ]
21 | },
22 | modelsize: {
23 | name: 'modelsize',
24 | title: 'Model Size',
25 | items: [
26 | { id: '120b', label: '120B', subtitle: 'MOE', default: true },
27 | { id: '20b', label: '20B', subtitle: 'MOE', default: false },
28 | ]
29 | },
30 | quantization: {
31 | name: 'quantization',
32 | title: 'Quantization',
33 | items: [
34 | { id: 'mxfp4', label: 'MXFP4', default: true },
35 | { id: 'bf16', label: 'BF16', default: false }
36 | ]
37 | },
38 | reasoningParser: {
39 | name: 'reasoningParser',
40 | title: 'Reasoning Parser',
41 | items: [
42 | { id: 'disabled', label: 'Disabled', default: true },
43 | { id: 'enabled', label: 'Enabled', default: false }
44 | ],
45 | commandRule: (value) => value === 'enabled' ? '--reasoning-parser gpt-oss' : null
46 | },
47 | toolcall: {
48 | name: 'toolcall',
49 | title: 'Tool Call Parser',
50 | items: [
51 | { id: 'disabled', label: 'Disabled', default: true },
52 | { id: 'enabled', label: 'Enabled', default: false }
53 | ],
54 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser gpt-oss' : null
55 | },
56 | speculative: {
57 | name: 'speculative',
58 | title: 'Speculative Decoding',
59 | items: [
60 | { id: 'disabled', label: 'Disabled', default: true },
61 | { id: 'enabled', label: 'Enabled', default: false }
62 | ],
63 | commandRule: (value, allValues) => {
64 | if (value !== 'enabled') return null;
65 |
66 | let cmd = '--speculative-algorithm EAGLE3 \\\n --speculative-num-steps 3 \\\n --speculative-eagle-topk 1 \\\n --speculative-num-draft-tokens 4';
67 |
68 | if (allValues.modelsize === '120b') {
69 | cmd += ' \\\n --speculative-draft-model-path nvidia/gpt-oss-120b-Eagle3';
70 | } else if (allValues.modelsize === '20b') {
71 | cmd += ' \\\n --speculative-draft-model-path zhuyksir/EAGLE3-gpt-oss-20b-bf16';
72 | }
73 |
74 | return cmd;
75 | }
76 | }
77 | },
78 |
79 | modelConfigs: {
80 | '120b': {
81 | baseName: '120b',
82 | isMOE: true,
83 | h100: { tp: 8, ep: 0, mxfp4: true, bf16: false },
84 | h200: { tp: 8, ep: 0, mxfp4: true, bf16: false },
85 | b200: { tp: 8, ep: 0, mxfp4: true, bf16: false }
86 | },
87 | '20b': {
88 | baseName: '20b',
89 | isMOE: true,
90 | h100: { tp: 1, ep: 0, mxfp4: true, bf16: false },
91 | h200: { tp: 1, ep: 0, mxfp4: true, bf16: false },
92 | b200: { tp: 1, ep: 0, mxfp4: true, bf16: false }
93 | }
94 | },
95 |
96 | generateCommand: function (values) {
97 | const { hardware, modelsize: modelSize, quantization, reasoningParser } = values;
98 | const commandKey = `${hardware}-${modelSize}-${quantization}-${reasoningParser}`;
99 |
100 | const config = this.modelConfigs[modelSize];
101 | if (!config) {
102 | return `# Error: Unknown model size: ${modelSize}`;
103 | }
104 |
105 | const hwConfig = config[hardware];
106 | if (!hwConfig) {
107 | return `# Error: Unknown hardware platform: ${hardware}`;
108 | }
109 |
110 | const quantSuffix = quantization === 'bf16' ? '-bf16' : '';
111 | const orgPrefix = quantization === 'bf16' ? 'lmsys' : 'openai';
112 | const modelName = `${orgPrefix}/gpt-oss-${config.baseName}${quantSuffix}`;
113 |
114 | let cmd = '';
115 |
116 | if (values.speculative === 'enabled') {
117 | cmd += 'SGLANG_ENABLE_SPEC_V2=1 SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 ';
118 | }
119 |
120 | cmd += 'python -m sglang.launch_server \\\n';
121 |
122 | cmd += ` --model ${modelName}`;
123 |
124 | if (hwConfig.tp > 1) {
125 | cmd += ` \\\n --tp ${hwConfig.tp}`;
126 | }
127 |
128 | let ep = hwConfig.ep;
129 |
130 | if (ep > 0) {
131 | cmd += ` \\\n --ep ${ep}`;
132 | }
133 |
134 | for (const [key, option] of Object.entries(this.options)) {
135 |
136 | if (option.commandRule) {
137 | const rule = option.commandRule(values[key], values);
138 |
139 | if (rule) {
140 | cmd += ` \\\n ${rule}`;
141 | }
142 | }
143 | }
144 |
145 | return cmd;
146 | }
147 | };
148 |
149 |   return <ConfigGenerator config={config} />;
150 | };
151 |
152 | export default GPTOSSConfigGenerator;
153 |
--------------------------------------------------------------------------------
/src/components/ConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React, { useState } from 'react';
2 | import styles from './styles.module.css';
3 |
4 | /**
5 | * Generic Configuration Generator Component
6 | *
7 | * @param {Object} config - Configuration object with the following structure:
8 | * - modelFamily: string (optional)
9 | * - options: object with option groups
10 | * - generateCommand: function(values) => string
11 | */
12 | const ConfigGenerator = ({ config }) => {
13 | if (!config || !config.options) {
14 |     return <div>Error: Invalid configuration provided</div>;
15 | }
16 |
17 | // Initialize state with default values
18 | const getInitialState = () => {
19 | const initialState = {};
20 | Object.entries(config.options).forEach(([key, option]) => {
21 | if (option.type === 'checkbox') {
22 | initialState[key] = option.items
23 | .filter(item => item.default)
24 | .map(item => item.id);
25 | } else if (option.type === 'text') {
26 | initialState[key] = option.default || '';
27 | } else {
28 | const defaultItem = option.items.find(item => item.default);
29 | initialState[key] = defaultItem ? defaultItem.id : option.items[0].id;
30 | }
31 | });
32 | return initialState;
33 | };
34 |
35 | const [values, setValues] = useState(getInitialState());
36 |
37 | const handleRadioChange = (optionName, value) => {
38 | setValues(prev => ({
39 | ...prev,
40 | [optionName]: value
41 | }));
42 | };
43 |
44 | const handleCheckboxChange = (optionName, itemId, isChecked) => {
45 | setValues(prev => {
46 | const currentValues = prev[optionName] || [];
47 | if (isChecked) {
48 | return {
49 | ...prev,
50 | [optionName]: [...currentValues, itemId]
51 | };
52 | } else {
53 | return {
54 | ...prev,
55 | [optionName]: currentValues.filter(id => id !== itemId)
56 | };
57 | }
58 | });
59 | };
60 |
61 | const handleTextChange = (optionName, value) => {
62 | setValues(prev => ({
63 | ...prev,
64 | [optionName]: value
65 | }));
66 | };
67 |
68 | const command = config.generateCommand ? config.generateCommand(values) : '';
69 |
70 |   return (
71 |     <div>
72 |       {Object.entries(config.options).map(([key, option], index) => (
73 |         null /* original lines 74-138 (the per-option radio, checkbox, and text-input markup) were stripped from this excerpt */
139 |       ))}
140 |
141 |
142 |       {/* original markup stripped from this excerpt; this section displays the result */}
143 |       <div>Generated Command</div>
144 |       <pre>{command}</pre>
145 |     </div>
146 |   );
147 | };
148 |
149 | export default ConfigGenerator;
150 |
151 |
--------------------------------------------------------------------------------
/docusaurus.config.js:
--------------------------------------------------------------------------------
1 | // @ts-check
2 | // `@type` JSDoc annotations allow editor autocompletion and type checking
3 | // (when paired with `@ts-check`).
4 | // There are various equivalent ways to declare your Docusaurus config.
5 | // See: https://docusaurus.io/docs/api/docusaurus-config
6 |
7 | import {themes as prismThemes} from 'prism-react-renderer';
8 |
9 | // This runs in Node.js - Don't use client-side code here (browser APIs, JSX...)
10 |
11 | /** @type {import('@docusaurus/types').Config} */
12 | const config = {
13 | title: 'SGLang Cookbook',
14 | favicon: 'img/favicon.png',
15 |
16 | // Future flags, see https://docusaurus.io/docs/api/docusaurus-config#future
17 | future: {
18 | v4: true, // Improve compatibility with the upcoming Docusaurus v4
19 | },
20 |
21 | // Set the production url of your site here
22 | url: 'https://cookbook-sg-lang.vercel.app',
23 |   // Set the /<baseUrl>/ pathname under which your site is served
24 |   // For GitHub pages deployment, it is often '/<projectName>/'
25 | baseUrl: '/',
26 |
27 | // GitHub pages deployment config.
28 | // If you aren't using GitHub pages, you don't need these.
29 | organizationName: 'sgl-project', // Usually your GitHub org/user name.
30 | projectName: 'sgl-cookbook', // Usually your repo name.
31 |
32 | onBrokenLinks: 'throw',
33 |
34 | // Even if you don't use internationalization, you can use this field to set
35 | // useful metadata like html lang. For example, if your site is Chinese, you
36 | // may want to replace "en" with "zh-Hans".
37 | i18n: {
38 | defaultLocale: 'en',
39 | locales: ['en'],
40 | },
41 |
42 | presets: [
43 | [
44 | 'classic',
45 | /** @type {import('@docusaurus/preset-classic').Options} */
46 | ({
47 | docs: {
48 | sidebarPath: './sidebars.js',
49 | editUrl:
50 | 'https://github.com/sgl-project/sgl-cookbook/tree/main',
51 | },
52 | // blog: {
53 | // showReadingTime: true,
54 | // feedOptions: {
55 | // type: ['rss', 'atom'],
56 | // xslt: true,
57 | // },
58 | // // Please change this to your repo.
59 | // // Remove this to remove the "edit this page" links.
60 | // editUrl:
61 | // 'https://github.com/facebook/docusaurus/tree/main/packages/create-docusaurus/templates/shared/',
62 | // // Useful options to enforce blogging best practices
63 | // onInlineTags: 'warn',
64 | // onInlineAuthors: 'warn',
65 | // onUntruncatedBlogPosts: 'warn',
66 | // },
67 | theme: {
68 | customCss: './src/css/custom.css',
69 | },
70 | }),
71 | ],
72 | ],
73 | headTags: [
74 | {
75 | tagName: 'meta',
76 | attributes: {
77 | name: 'algolia-site-verification',
78 | content: 'B137E28CCDDFD715',
79 | },
80 | },
81 | ],
82 | themeConfig:
83 | /** @type {import('@docusaurus/preset-classic').ThemeConfig} */
84 | ({
85 | // Replace with your project's social card
86 | // image: 'img/docusaurus-social-card.jpg',
87 | // colorMode: {
88 | // respectPrefersColorScheme: true,
89 | // },
90 | navbar: {
91 | title: 'SGLang Cookbook',
92 | logo: {
93 | alt: 'SGLang Cookbook Logo',
94 | src: 'img/logo.png',
95 | },
96 | items: [
97 | // {
98 | // type: 'docSidebar',
99 | // sidebarId: 'tutorialSidebar',
100 | // position: 'left',
101 | // label: 'Tutorial',
102 | // },
103 | // {to: '/blog', label: 'Blog', position: 'left'},
104 | {
105 | href: 'https://github.com/sgl-project/sgl-cookbook',
106 | label: 'GitHub',
107 | position: 'right',
108 | },
109 | ],
110 | },
111 | footer: {
112 | style: 'dark',
113 | // links: [
114 | // {
115 | // title: 'Docs',
116 | // items: [
117 | // {
118 | // label: 'Tutorial',
119 | // to: '/docs/intro',
120 | // },
121 | // ],
122 | // },
123 | // {
124 | // title: 'Community',
125 | // items: [
126 | // {
127 | // label: 'Stack Overflow',
128 | // href: 'https://stackoverflow.com/questions/tagged/docusaurus',
129 | // },
130 | // {
131 | // label: 'Discord',
132 | // href: 'https://discordapp.com/invite/docusaurus',
133 | // },
134 | // {
135 | // label: 'X',
136 | // href: 'https://x.com/docusaurus',
137 | // },
138 | // ],
139 | // },
140 | // {
141 | // title: 'More',
142 | // items: [
143 | // {
144 | // label: 'Blog',
145 | // to: '/blog',
146 | // },
147 | // {
148 | // label: 'GitHub',
149 | // href: 'https://github.com/facebook/docusaurus',
150 | // },
151 | // ],
152 | // },
153 | // ],
154 | copyright: `Copyright © ${new Date().getFullYear()} SGLang Team.`,
155 | },
156 | prism: {
157 | theme: prismThemes.github,
158 | darkTheme: prismThemes.dracula,
159 | },
160 | algolia: {
161 | appId: '5PDGY21FSS',
162 | apiKey: '58c29a0ac6c2759e581d630b54e57564',
163 | indexName: 'sgl-cookbook',
164 | },
165 | }),
166 | };
167 |
168 | export default config;
169 |
--------------------------------------------------------------------------------
/src/components/Qwen3VLConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ConfigGenerator from '../ConfigGenerator';
3 |
4 | /**
5 | * Qwen3-VL Configuration Generator
6 | * Supports multiple Qwen3-VL model sizes (235B, 30B, 32B, 8B, 4B, 2B)
7 | */
8 | const Qwen3VLConfigGenerator = () => {
9 | const config = {
10 | modelFamily: 'Qwen',
11 |
12 | options: {
13 | hardware: {
14 | name: 'hardware',
15 | title: 'Hardware Platform',
16 | items: [
17 | { id: 'b200', label: 'B200', default: true },
18 | { id: 'h100', label: 'H100', default: false },
19 | { id: 'h200', label: 'H200', default: false }
20 | ]
21 | },
22 | modelsize: {
23 | name: 'modelsize',
24 | title: 'Model Size',
25 | items: [
26 | { id: '235b', label: '235B', subtitle: 'MOE', default: true },
27 | { id: '30b', label: '30B', subtitle: 'MOE', default: false },
28 | { id: '32b', label: '32B', subtitle: 'Dense', default: false },
29 | { id: '8b', label: '8B', subtitle: 'Dense', default: false },
30 | { id: '4b', label: '4B', subtitle: 'Dense', default: false },
31 | { id: '2b', label: '2B', subtitle: 'Dense', default: false }
32 | ]
33 | },
34 | quantization: {
35 | name: 'quantization',
36 | title: 'Quantization',
37 | items: [
38 | { id: 'bf16', label: 'BF16', default: true },
39 | { id: 'fp8', label: 'FP8', default: false }
40 | ]
41 | },
42 | thinking: {
43 | name: 'thinking',
44 | title: 'Thinking Capabilities',
45 | items: [
46 | { id: 'instruct', label: 'Instruct', default: true },
47 | { id: 'thinking', label: 'Thinking', default: false }
48 | ],
49 | commandRule: (value) => value === 'thinking' ? '--reasoning-parser qwen3' : null
50 | },
51 | toolcall: {
52 | name: 'toolcall',
53 | title: 'Tool Call Parser',
54 | items: [
55 | { id: 'disabled', label: 'Disabled', default: true },
56 | { id: 'enabled', label: 'Enabled', default: false }
57 | ],
58 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser qwen' : null
59 | }
60 | },
61 |
62 | modelConfigs: {
63 | '235b': {
64 | baseName: '235B-A22B',
65 | isMOE: true,
66 | h100: { tp: 8, ep: 0, bf16: true, fp8: true },
67 | h200: { tp: 8, ep: 0, bf16: true, fp8: true },
68 | b200: { tp: 8, ep: 0, bf16: true, fp8: true }
69 | },
70 | '30b': {
71 | baseName: '30B-A3B',
72 | isMOE: true,
73 | h100: { tp: 1, ep: 0, bf16: true, fp8: true },
74 | h200: { tp: 1, ep: 0, bf16: true, fp8: true },
75 | b200: { tp: 1, ep: 0, bf16: true, fp8: true }
76 | },
77 | '32b': {
78 | baseName: '32B',
79 | isMOE: false,
80 | h100: { tp: 1, ep: 0, bf16: true, fp8: true },
81 | h200: { tp: 1, ep: 0, bf16: true, fp8: true },
82 | b200: { tp: 1, ep: 0, bf16: true, fp8: true }
83 | },
84 | '8b': {
85 | baseName: '8B',
86 | isMOE: false,
87 | h100: { tp: 1, ep: 0, bf16: true, fp8: true },
88 | h200: { tp: 1, ep: 0, bf16: true, fp8: true },
89 | b200: { tp: 1, ep: 0, bf16: true, fp8: true }
90 | },
91 | '4b': {
92 | baseName: '4B',
93 | isMOE: false,
94 | h100: { tp: 1, ep: 0, bf16: true, fp8: true },
95 | h200: { tp: 1, ep: 0, bf16: true, fp8: true },
96 | b200: { tp: 1, ep: 0, bf16: true, fp8: true }
97 | },
98 | '2b': {
99 | baseName: '2B',
100 | isMOE: false,
101 | h100: { tp: 1, ep: 0, bf16: true, fp8: true },
102 | h200: { tp: 1, ep: 0, bf16: true, fp8: true },
103 | b200: { tp: 1, ep: 0, bf16: true, fp8: true }
104 | }
105 | },
106 |
107 | specialCommands: {
108 | 'h100-235b-bf16-instruct': '# Error: Model is too large, cannot fit into 8*H100\n# Please use H200 (141GB) or select FP8 quantization',
109 | 'h100-235b-bf16-thinking': '# Error: Model is too large, cannot fit into 8*H100\n# Please use H200 (141GB) or select FP8 quantization'
110 | },
111 |
112 | generateCommand: function (values) {
113 | const { hardware, modelsize: modelSize, quantization, thinking } = values;
114 | const commandKey = `${hardware}-${modelSize}-${quantization}-${thinking}`;
115 |
116 | if (this.specialCommands[commandKey]) {
117 | return this.specialCommands[commandKey];
118 | }
119 |
120 | const config = this.modelConfigs[modelSize];
121 | if (!config) {
122 | return `# Error: Unknown model size: ${modelSize}`;
123 | }
124 |
125 | const hwConfig = config[hardware];
126 | if (!hwConfig) {
127 | return `# Error: Unknown hardware platform: ${hardware}`;
128 | }
129 |
130 | const quantSuffix = quantization === 'fp8' ? '-FP8' : '';
131 | const thinkingSuffix = thinking === 'thinking' ? '-Thinking' : '-Instruct';
132 | const modelName = `Qwen/Qwen3-VL-${config.baseName}${thinkingSuffix}${quantSuffix}`;
133 |
134 | let cmd = 'python -m sglang.launch_server \\\n';
135 | cmd += ` --model ${modelName}`;
136 |
137 | if (hwConfig.tp > 1) {
138 | cmd += ` \\\n --tp ${hwConfig.tp}`;
139 | }
140 |
141 | let ep = hwConfig.ep;
142 | if (quantization === 'fp8' && hwConfig.tp === 8) {
143 | ep = 2;
144 | }
145 |
146 | if (ep > 0) {
147 | cmd += ` \\\n --ep ${ep}`;
148 | }
149 |
150 | for (const [key, option] of Object.entries(this.options)) {
151 | if (key === 'host' || key === 'port') continue;
152 |
153 | if (option.commandRule) {
154 | const rule = option.commandRule(values[key]);
155 | if (rule) {
156 | cmd += ` \\\n ${rule}`;
157 | }
158 | }
159 | }
160 |
161 | return cmd;
162 | }
163 | };
164 |
165 |   return <ConfigGenerator config={config} />;
166 | };
167 |
168 | export default Qwen3VLConfigGenerator;
169 |
170 |
--------------------------------------------------------------------------------
/src/components/ConfigGenerator/QUICKSTART.md:
--------------------------------------------------------------------------------
1 | # Quick Start Guide: Creating a New Config Generator
2 |
3 | This guide shows you how to quickly create a new configuration generator for any model.
4 |
5 | ## Step 1: Create Your Component File
6 |
7 | Create a new file: `src/components/YourModelConfigGenerator/index.js`
8 |
9 | ```jsx
10 | import React from 'react';
11 | import ConfigGenerator from '../ConfigGenerator';
12 |
13 | const YourModelConfigGenerator = () => {
14 | const config = {
15 | options: {
16 | // Add your options here
17 | },
18 | generateCommand: function(values) {
19 | // Add your command generation logic here
20 | return 'your-command';
21 | }
22 | };
23 |
24 |   return <ConfigGenerator config={config} />;
25 | };
26 |
27 | export default YourModelConfigGenerator;
28 | ```
29 |
30 | ## Step 2: Define Your Options
31 |
32 | Add configuration options based on your needs:
33 |
34 | ### Radio Button Options (Single Choice)
35 |
36 | ```javascript
37 | hardware: {
38 | name: 'hardware', // Internal identifier
39 | title: 'Hardware Platform', // Display title
40 | items: [
41 | { id: 'gpu_a', label: 'GPU A', default: true }, // Default selected
42 | { id: 'gpu_b', label: 'GPU B', default: false }
43 | ]
44 | }
45 | ```
46 |
47 | ### Checkbox Options (Multiple Choice)
48 |
49 | ```javascript
50 | features: {
51 | name: 'features',
52 | title: 'Features',
53 | type: 'checkbox', // Important: specify type
54 | items: [
55 | { id: 'feature1', label: 'Feature 1', default: true },
56 | { id: 'feature2', label: 'Feature 2', default: false },
57 | { id: 'feature3', label: 'Feature 3', default: false, required: true } // Can't be unchecked
58 | ]
59 | }
60 | ```
61 |
62 | ### Text Input Options
63 |
64 | ```javascript
65 | modelPath: {
66 | name: 'modelPath',
67 | title: 'Model Path',
68 | type: 'text', // Important: specify type
69 | default: 'path/to/model',
70 | placeholder: 'Enter model path...'
71 | }
72 | ```
73 |
74 | ## Step 3: Implement Command Generation
75 |
76 | Write the logic to generate commands based on user selections:
77 |
78 | ```javascript
79 | generateCommand: function(values) {
80 | // Extract values
81 | const { hardware, features, modelPath } = values;
82 |
83 | // Start building command
84 | let cmd = 'python3 -m sglang.launch_server';
85 | cmd += ` --model ${modelPath}`;
86 |
87 | // Handle radio button (single value)
88 | if (hardware === 'gpu_a') {
89 | cmd += ' --device-type gpu_a';
90 | } else if (hardware === 'gpu_b') {
91 | cmd += ' --device-type gpu_b';
92 | }
93 |
94 | // Handle checkboxes (array of values)
95 | const featureArray = Array.isArray(features) ? features : [];
96 | if (featureArray.includes('feature1')) {
97 | cmd += ' --enable-feature1';
98 | }
99 | if (featureArray.includes('feature2')) {
100 | cmd += ' --enable-feature2';
101 | }
102 |
103 | return cmd;
104 | }
105 | ```
106 |
107 | ## Step 4: Use in Markdown
108 |
109 | In your `.md` or `.mdx` file:
110 |
111 | ```mdx
112 | ---
113 | title: Your Model Documentation
114 | ---
115 |
116 | import YourModelConfigGenerator from '@site/src/components/YourModelConfigGenerator';
117 |
118 | # Your Model
119 |
120 | ## Deployment Configuration
121 |
122 | <YourModelConfigGenerator />
123 | ```
124 |
125 | ## Complete Example
126 |
127 | Here's a complete, working example:
128 |
129 | ```jsx
130 | import React from 'react';
131 | import ConfigGenerator from '../ConfigGenerator';
132 |
133 | const ExampleConfigGenerator = () => {
134 | const config = {
135 | options: {
136 | hardware: {
137 | name: 'hardware',
138 | title: 'Hardware Platform',
139 | items: [
140 | { id: 'h100', label: 'H100', default: true },
141 | { id: 'a100', label: 'A100', default: false }
142 | ]
143 | },
144 | quantization: {
145 | name: 'quantization',
146 | title: 'Quantization',
147 | items: [
148 | { id: 'fp16', label: 'FP16', default: true },
149 | { id: 'int8', label: 'INT8', default: false },
150 | { id: 'int4', label: 'INT4', default: false }
151 | ]
152 | },
153 | parallelism: {
154 | name: 'parallelism',
155 | title: 'Parallelism Strategy',
156 | type: 'checkbox',
157 | items: [
158 | { id: 'tp', label: 'Tensor Parallel', subtitle: 'TP', default: true, required: true },
159 | { id: 'dp', label: 'Data Parallel', subtitle: 'DP', default: false },
160 | { id: 'pp', label: 'Pipeline Parallel', subtitle: 'PP', default: false }
161 | ]
162 | },
163 | modelPath: {
164 | name: 'modelPath',
165 | title: 'Model Path',
166 | type: 'text',
167 | default: 'org/model-name',
168 | placeholder: 'Enter model path from Hugging Face...'
169 | }
170 | },
171 |
172 | generateCommand: function(values) {
173 | const { hardware, quantization, parallelism, modelPath } = values;
174 | const parallelismArray = Array.isArray(parallelism) ? parallelism : [];
175 |
176 | // Validation example
177 | if (hardware === 'a100' && quantization === 'int4') {
178 | return '# Error: A100 does not support INT4 quantization\n' +
179 | '# Please choose FP16 or INT8, or use H100 hardware';
180 | }
181 |
182 | // Build command
183 | let cmd = 'python3 -m sglang.launch_server \\\n';
184 | cmd += ` --model-path ${modelPath}`;
185 |
186 | // Add quantization
187 | if (quantization !== 'fp16') {
188 | cmd += ` \\\n --quantization ${quantization}`;
189 | }
190 |
191 | // Add parallelism strategies
192 | if (parallelismArray.includes('tp')) {
193 | cmd += ' \\\n --tp 8';
194 | }
195 | if (parallelismArray.includes('dp')) {
196 | cmd += ' \\\n --dp 4';
197 | }
198 | if (parallelismArray.includes('pp')) {
199 | cmd += ' \\\n --pp 2';
200 | }
201 |
202 | // Hardware-specific options
203 | if (hardware === 'h100') {
204 | cmd += ' \\\n --enable-h100-optimizations';
205 | }
206 |
207 | return cmd;
208 | }
209 | };
210 |
211 |   return <ConfigGenerator config={config} />;
212 | };
213 |
214 | export default ExampleConfigGenerator;
215 | ```
216 |
217 | ## Tips
218 |
219 | 1. **Keep it simple**: Start with basic radio buttons, add complexity as needed
220 | 2. **Test thoroughly**: Try all combinations to ensure correct commands
221 | 3. **Add validation**: Check for incompatible option combinations
222 | 4. **Use subtitles**: Add helpful context with the `subtitle` property
223 | 5. **Multi-line commands**: Use `\\\n` for readable multi-line output
224 | 6. **Error messages**: Return clear error messages with solutions
225 |
226 | ## Next Steps
227 |
228 | - See [README.md](./README.md) for detailed API documentation
229 | - Check [DeepSeekR1ConfigGenerator](../DeepSeekR1ConfigGenerator/index.js) for a real-world example
230 | - Customize styles in `styles.module.css` if needed
231 |
232 | ## Common Patterns
233 |
234 | ### Conditional Options Based on Previous Selection
235 |
236 | ```javascript
237 | generateCommand: function(values) {
238 | const { hardware, quantization } = values;
239 |
240 | // Only add EP for specific hardware
241 | if (hardware === 'b200' && values.parallelism.includes('ep')) {
242 | cmd += ' --ep 8';
243 | }
244 | }
245 | ```
246 |
247 | ### Model Path Mapping
248 |
249 | ```javascript
250 | const modelMap = {
251 | 'small': 'org/model-7b',
252 | 'medium': 'org/model-13b',
253 | 'large': 'org/model-70b'
254 | };
255 | const modelPath = modelMap[values.modelSize];
256 | ```
257 |
258 | ### Complex Validation
259 |
260 | ```javascript
261 | generateCommand: function(values) {
262 | // Multiple validation checks
263 | const errors = [];
264 |
265 | if (values.hardware === 'a100' && values.quantization === 'int4') {
266 | errors.push('A100 does not support INT4');
267 | }
268 |
269 | if (values.batchSize > 128 && !values.features.includes('optimization')) {
270 | errors.push('Large batch sizes require optimization enabled');
271 | }
272 |
273 | if (errors.length > 0) {
274 | return '# Errors:\n' + errors.map(e => `# - ${e}`).join('\n');
275 | }
276 |
277 | // ... normal command generation
278 | }
279 | ```
280 |
281 |
--------------------------------------------------------------------------------
/docs/Mistral/Devstral-2.md:
--------------------------------------------------------------------------------
1 | # Devstral 2 (Mistral)
2 |
3 | ## 1. Model Introduction
4 |
5 | **Devstral 2** is an agentic LLM family for software engineering tasks. It is designed for agentic workflows such as tool use, codebase exploration, and multi-file edits, and achieves strong performance on **SWE-bench**.
6 |
7 | The **Devstral 2 Instruct** checkpoints are instruction-tuned **FP8** models, making them a good fit for chat, tool-using agents, and instruction-following SWE workloads.
8 |
9 | **Key Features:**
10 |
11 | - **Agentic coding**: Optimized for tool-driven coding and software engineering agents
12 | - **Improved performance**: A step up compared to earlier Devstral models
13 | - **Better generalization**: More robust across diverse prompts and coding environments
14 | - **Long context**: Up to a **256K** context window
15 |
16 | **Use Cases:**
17 | AI code assistants, agentic coding, and software engineering tasks that require deep codebase understanding and tool integration.
18 |
19 | For enterprises requiring specialized capabilities (increased context, domain-specific knowledge, etc.), please reach out to Mistral.
20 |
21 | **Models:**
22 |
23 | - **Collection**: [mistralai/devstral-2 (Hugging Face)](https://huggingface.co/collections/mistralai/devstral-2)
24 | - **FP8 Instruct**:
25 | - **[mistralai/Devstral-2-123B-Instruct-2512](https://huggingface.co/mistralai/Devstral-2-123B-Instruct-2512)**
26 | - **[mistralai/Devstral-Small-2-24B-Instruct-2512](https://huggingface.co/mistralai/Devstral-Small-2-24B-Instruct-2512)**
27 |
28 | ---
29 |
30 | ## 2. SGLang Installation
31 |
32 | SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.
33 |
34 | Please refer to the [official SGLang installation guide](https://docs.sglang.ai/get_started/install.html) for installation instructions.
35 |
36 | :::caution Transformers version requirement
37 | Devstral 2 requires a recent `transformers` release. Please verify that you have `transformers >= 5.0.0rc0` installed:
38 |
39 | ```shell
40 | python -c "import transformers; print(transformers.__version__)"
41 | ```
42 |
43 | If your version is lower, upgrade:
44 |
45 | ```shell
46 | pip install -U --pre "transformers>=5.0.0rc0"
47 | ```
48 | :::
49 |
50 | ---
51 |
52 | ## 3. Model Deployment
53 |
54 | ### 3.1 Basic configuration
55 |
56 | **Interactive Command Generator**: Use the configuration selector below to generate a launch command for Devstral Small 2 (24B) or Devstral 2 (123B).
57 |
58 | :::note
59 | The TP size is set to the minimum required for the selected model size.
60 | :::
61 |
62 | import Devstral2ConfigGenerator from '@site/src/components/Devstral2ConfigGenerator';
63 |
64 | <Devstral2ConfigGenerator />
65 |
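If you prefer a static starting point, here is a minimal sketch for the smaller checkpoint. Only the model path and `--tool-call-parser mistral` come from this guide; the host and port values are illustrative defaults, so adjust everything to your environment and hardware.

```shell
# Minimal sketch: Devstral Small 2 (24B, FP8) on a single GPU.
# Host/port are illustrative; add --tp or other flags as your setup requires.
python -m sglang.launch_server \
  --model mistralai/Devstral-Small-2-24B-Instruct-2512 \
  --tool-call-parser mistral \
  --host 0.0.0.0 \
  --port 30000
```
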
66 | ### 3.2 Configuration tips
67 |
68 | - **Context length vs memory**: Devstral 2 advertises a long context window; if you are memory-constrained, start by lowering `--context-length` (for example `32768`) and increase it once things are stable (see the sketch after this list).
69 | - **FP8 checkpoints**: Both Devstral Small 2 and Devstral 2 are published as **FP8** weights. If you hit kernel / dtype issues, try a newer SGLang build and recent CUDA drivers.
70 |
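As an illustration of the context-length tip above, the sketch below caps the window for the 123B checkpoint. The `--tp 2` value mirrors the tool-calling example later in this guide, and `32768` is a conservative starting point rather than a tuned recommendation.

```shell
# Sketch: cap the context window to reduce KV-cache memory pressure,
# then raise --context-length again once the deployment is stable.
python -m sglang.launch_server \
  --model mistralai/Devstral-2-123B-Instruct-2512 \
  --tp 2 \
  --context-length 32768
```
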
71 | ---
72 |
73 | ## 4. Model Invocation
74 |
75 | ### 4.1 Basic Usage (OpenAI-Compatible API)
76 |
77 | SGLang exposes an OpenAI-compatible endpoint. Example:
78 |
79 | ```python
80 | from openai import OpenAI
81 |
82 | client = OpenAI(
83 | base_url="http://localhost:30000/v1",
84 | api_key="EMPTY",
85 | )
86 |
87 | resp = client.chat.completions.create(
88 | model="mistralai/Devstral-Small-2-24B-Instruct-2512",
89 | messages=[
90 | {"role": "system", "content": "You are a helpful coding assistant."},
91 | {"role": "user", "content": "Write a Python function that retries a request with exponential backoff."},
92 | ],
93 | temperature=0.2,
94 | max_tokens=512,
95 | )
96 |
97 | print(resp.choices[0].message.content)
98 | ```
99 |
100 | **Output Example:**
101 |
102 | ```
103 | Here's a Python function that implements exponential backoff for retrying a request. This function uses the `requests` library to make HTTP requests and includes error handling for common HTTP and connection errors.
104 |
105 | ```python
106 | import time
107 | import requests
108 | from requests.exceptions import RequestException
109 |
110 | def retry_with_exponential_backoff(
111 | url,
112 | max_retries=3,
113 | initial_delay=1,
114 | backoff_factor=2,
115 | method="GET",
116 | **kwargs
117 | ):
118 | """
119 | Retry a request with exponential backoff.
120 |
121 | Parameters:
122 | - url: The URL to request.
123 | - max_retries: Maximum number of retry attempts (default: 3).
124 | - initial_delay: Initial delay in seconds (default: 1).
125 | - backoff_factor: Multiplier for the delay between retries (default: 2).
126 | - method: HTTP method to use (default: "GET").
127 | - **kwargs: Additional arguments to pass to the request function (e.g., headers, data, etc.).
128 |
129 | Returns:
130 | - Response object if the request succeeds.
131 | - Raises an exception if all retries fail.
132 | """
133 | retry_count = 0
134 | delay = initial_delay
135 |
136 | while retry_count < max_retries:
137 | try:
138 | response = requests.request(method, url, **kwargs)
139 | # Check if the response status code indicates success
140 | if response.status_code < 400:
141 | return response
142 | else:
143 | raise RequestException(f"HTTP {response.status_code}: {response.text}")
144 |
145 | except RequestException as e:
146 | if retry_count == max_retries - 1:
147 | raise Exception(f"All retries failed. Last error: {e}")
148 |
149 | print(f"Attempt {retry_count + 1} failed. Retrying in {delay} seconds...")
150 | time.sleep(delay)
151 | ...
152 | ```
153 |
154 | ### 4.2 Tool calling (optional)
155 |
156 | Devstral 2 supports tool calling capabilities. Enable the tool call parser:
157 |
158 | ```shell
159 | python -m sglang.launch_server \
160 | --model mistralai/Devstral-2-123B-Instruct-2512 \
161 | --tp 2 \
162 | --tool-call-parser mistral
163 | ```
164 |
165 | **Python Example (Streaming Tool Calls):**
166 |
167 | ```python
168 | from openai import OpenAI
169 |
170 | client = OpenAI(
171 | base_url="http://localhost:30000/v1",
172 | api_key="EMPTY"
173 | )
174 |
175 | # Define available tools
176 | tools = [
177 | {
178 | "type": "function",
179 | "function": {
180 | "name": "get_weather",
181 | "description": "Get the current weather for a location",
182 | "parameters": {
183 | "type": "object",
184 | "properties": {
185 | "location": {
186 | "type": "string",
187 | "description": "The city name"
188 | },
189 | "unit": {
190 | "type": "string",
191 | "enum": ["celsius", "fahrenheit"],
192 | "description": "Temperature unit"
193 | }
194 | },
195 | "required": ["location"]
196 | }
197 | }
198 | }
199 | ]
200 |
201 | # Make request with streaming to see thinking process
202 | response = client.chat.completions.create(
203 | model="mistralai/Devstral-2-123B-Instruct-2512",
204 | messages=[
205 | {"role": "user", "content": "What's the weather in Beijing?"}
206 | ],
207 | tools=tools,
208 | temperature=0.7,
209 | stream=True
210 | )
211 |
212 | # Process streaming response
213 | thinking_started = False
214 | has_thinking = False
215 | tool_calls_accumulator = {}
216 |
217 | for chunk in response:
218 | if chunk.choices and len(chunk.choices) > 0:
219 | delta = chunk.choices[0].delta
220 |
221 | # Accumulate tool calls
222 | if hasattr(delta, 'tool_calls') and delta.tool_calls:
223 | # Close thinking section if needed
224 | if has_thinking and thinking_started:
225 | print("\n=============== Content =================\n", flush=True)
226 | thinking_started = False
227 |
228 | for tool_call in delta.tool_calls:
229 | index = tool_call.index
230 | if index not in tool_calls_accumulator:
231 | tool_calls_accumulator[index] = {
232 | 'name': None,
233 | 'arguments': ''
234 | }
235 |
236 | if tool_call.function:
237 | if tool_call.function.name:
238 | tool_calls_accumulator[index]['name'] = tool_call.function.name
239 | if tool_call.function.arguments:
240 | tool_calls_accumulator[index]['arguments'] += tool_call.function.arguments
241 |
242 | # Print content
243 | if delta.content:
244 | print(delta.content, end="", flush=True)
245 |
246 | # Print accumulated tool calls
247 | for index, tool_call in sorted(tool_calls_accumulator.items()):
248 | print(f"🔧 Tool Call: {tool_call['name']}")
249 | print(f" Arguments: {tool_call['arguments']}")
250 |
251 | print()
252 | ```
253 |
254 | **Output Example:**
255 |
256 | ```
257 | 🔧 Tool Call: get_weather
258 | Arguments: {"location": "Beijing"}
259 | ```
260 |
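261 | To complete the tool-calling loop, run the requested function yourself, append its result as a `tool` message, and call the model again for the final answer. Below is a minimal, non-streaming sketch that reuses `client` and `tools` from the example above; the weather value is a hard-coded placeholder rather than a real lookup.
262 | 
263 | ```python
264 | import json
265 | 
266 | messages = [{"role": "user", "content": "What's the weather in Beijing?"}]
267 | 
268 | first = client.chat.completions.create(
269 |     model="mistralai/Devstral-2-123B-Instruct-2512",
270 |     messages=messages,
271 |     tools=tools,
272 | )
273 | tool_call = first.choices[0].message.tool_calls[0]
274 | args = json.loads(tool_call.function.arguments)
275 | 
276 | # Placeholder "execution" of get_weather -- replace with a real weather lookup.
277 | tool_result = {"location": args["location"], "temperature_c": 3, "condition": "sunny"}
278 | 
279 | # Append the assistant turn (with its tool call) and the tool result, then ask again.
280 | messages.append(first.choices[0].message)
281 | messages.append({
282 |     "role": "tool",
283 |     "tool_call_id": tool_call.id,
284 |     "content": json.dumps(tool_result),
285 | })
286 | 
287 | final = client.chat.completions.create(
288 |     model="mistralai/Devstral-2-123B-Instruct-2512",
289 |     messages=messages,
290 |     tools=tools,
291 | )
292 | print(final.choices[0].message.content)
293 | ```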
--------------------------------------------------------------------------------
/src/components/ConfigGenerator/README.md:
--------------------------------------------------------------------------------
1 | # ConfigGenerator Component
2 |
3 | A reusable, generic configuration generator component for creating interactive command builders in documentation.
4 |
5 | ## Features
6 |
7 | - **Flexible Configuration**: Support for radio buttons, checkboxes, and text inputs
8 | - **Real-time Command Generation**: Automatically updates command output based on selections
9 | - **Custom Validation**: Add custom validation rules and error messages
10 | - **Theme Support**: Works with light and dark modes using Docusaurus CSS variables
11 | - **Responsive Design**: Mobile-friendly layout
12 |
13 | ## Usage
14 |
15 | ### 1. Create a Wrapper Component
16 |
17 | Create a new component that imports `ConfigGenerator` and provides your custom configuration:
18 |
19 | ```jsx
20 | import React from 'react';
21 | import ConfigGenerator from '../ConfigGenerator';
22 |
23 | const MyModelConfigGenerator = () => {
24 | const config = {
25 | // Optional: Model family identifier
26 | modelFamily: 'my-model-family',
27 |
28 | // Define your configuration options
29 | options: {
30 | hardware: {
31 | name: 'hardware',
32 | title: 'Hardware Platform',
33 | items: [
34 | { id: 'h100', label: 'H100', default: true },
35 | { id: 'h200', label: 'H200', default: false }
36 | ]
37 | },
38 | // ... more options
39 | },
40 |
41 | // Command generation function
42 | generateCommand: function(values) {
43 | const { hardware } = values;
44 | let cmd = 'python3 -m sglang.launch_server';
45 |
46 | if (hardware === 'h100') {
47 | cmd += ' --gpu-type h100';
48 | }
49 |
50 | return cmd;
51 | }
52 | };
53 |
54 |   return <ConfigGenerator config={config} />;
55 | };
56 |
57 | export default MyModelConfigGenerator;
58 | ```
59 |
60 | ### 2. Use in Markdown/MDX Files
61 |
62 | Import and use your wrapper component in any `.md` or `.mdx` file:
63 |
64 | ```mdx
65 | ---
66 | title: My Model
67 | ---
68 |
69 | import MyModelConfigGenerator from '@site/src/components/MyModelConfigGenerator';
70 |
71 | # Model Deployment
72 |
73 | Use the interactive configuration generator below:
74 |
75 | <MyModelConfigGenerator />
76 | ```
77 |
78 | ## Configuration Object Structure
79 |
80 | ### Basic Structure
81 |
82 | ```javascript
83 | const config = {
84 | modelFamily: 'optional-model-family', // Optional
85 | options: {
86 | // Option definitions (see below)
87 | },
88 | generateCommand: function(values) {
89 | // Command generation logic
90 | return 'generated-command-string';
91 | }
92 | };
93 | ```
94 |
95 | ### Option Types
96 |
97 | #### 1. Radio Button (Single Selection)
98 |
99 | Default behavior when `type` is not specified:
100 |
101 | ```javascript
102 | optionName: {
103 | name: 'optionName',
104 | title: 'Display Title',
105 | items: [
106 | { id: 'choice1', label: 'Choice 1', default: true },
107 | { id: 'choice2', label: 'Choice 2', default: false },
108 | { id: 'choice3', label: 'Choice 3', subtitle: 'Additional info', default: false }
109 | ]
110 | }
111 | ```
112 |
113 | **Properties:**
114 | - `name`: Internal identifier (string)
115 | - `title`: Display title (string)
116 | - `items`: Array of choices
117 | - `id`: Unique identifier (string)
118 | - `label`: Display label (string)
119 | - `subtitle`: Optional subtitle text (string)
120 | - `default`: Whether this is the default selection (boolean)
121 |
122 | #### 2. Checkbox (Multiple Selection)
123 |
124 | Set `type: 'checkbox'`:
125 |
126 | ```javascript
127 | optionName: {
128 | name: 'optionName',
129 | title: 'Display Title',
130 | type: 'checkbox',
131 | items: [
132 | { id: 'option1', label: 'Option 1', default: true },
133 | { id: 'option2', label: 'Option 2', default: false, required: true },
134 | { id: 'option3', label: 'Option 3', subtitle: 'Additional info', default: false }
135 | ]
136 | }
137 | ```
138 |
139 | **Additional Properties:**
140 | - `required`: If `true`, prevents the user from unchecking this option (boolean)
141 |
142 | **Note:** In `generateCommand`, checkbox values are returned as an array:
143 | ```javascript
144 | generateCommand: function(values) {
145 | const strategies = values.optionName; // e.g., ['option1', 'option2']
146 | if (strategies.includes('option1')) {
147 | // ...
148 | }
149 | }
150 | ```
151 |
152 | #### 3. Text Input
153 |
154 | Set `type: 'text'`:
155 |
156 | ```javascript
157 | optionName: {
158 | name: 'optionName',
159 | title: 'Display Title',
160 | type: 'text',
161 | default: 'default value',
162 | placeholder: 'Enter value...'
163 | }
164 | ```
165 |
166 | **Properties:**
167 | - `default`: Default text value (string)
168 | - `placeholder`: Placeholder text (string)
169 |
170 | ### Command Generation Function
171 |
172 | The `generateCommand` function receives a `values` object containing all user selections:
173 |
174 | ```javascript
175 | generateCommand: function(values) {
176 | const { hardware, quantization, strategy } = values;
177 |
178 | // For radio buttons: string value
179 | if (hardware === 'h100') {
180 | // ...
181 | }
182 |
183 | // For checkboxes: array of strings
184 | const strategyArray = Array.isArray(strategy) ? strategy : [];
185 | if (strategyArray.includes('tp')) {
186 | // ...
187 | }
188 |
189 | // For text inputs: string value
190 | const modelPath = values.modelName || '';
191 |
192 | // Build and return command string
193 | let cmd = 'python3 -m sglang.launch_server';
194 | cmd += ` --model ${modelPath}`;
195 |
196 | return cmd;
197 | }
198 | ```
199 |
200 | **Tips:**
201 | - Use multi-line strings with `\\n` for readable output
202 | - Add validation checks and return error messages when needed
203 | - Use template literals for cleaner string building
204 |
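205 | For example, a `generateCommand` that validates input and builds a readable multi-line command could look like this (the flags below are placeholders, not options of a specific model):
206 | 
207 | ```javascript
208 | generateCommand: function(values) {
209 |   // Validate first and return an error message instead of a command.
210 |   if (!values.modelName) {
211 |     return '# Error: please enter a model name';
212 |   }
213 | 
214 |   // Backslash-newline continuations keep the generated command readable.
215 |   let cmd = 'python3 -m sglang.launch_server \\\n';
216 |   cmd += `  --model ${values.modelName} \\\n`;
217 |   cmd += `  --device ${values.hardware}`;
218 |   return cmd;
219 | }
220 | ```
221 | 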
205 | ## Examples
206 |
207 | ### Example 1: Simple Configuration
208 |
209 | ```javascript
210 | const config = {
211 | options: {
212 | model: {
213 | name: 'model',
214 | title: 'Model Selection',
215 | items: [
216 | { id: 'small', label: 'Small (7B)', default: true },
217 | { id: 'medium', label: 'Medium (13B)', default: false },
218 | { id: 'large', label: 'Large (70B)', default: false }
219 | ]
220 | }
221 | },
222 | generateCommand: function(values) {
223 | const modelSizes = { small: '7B', medium: '13B', large: '70B' };
224 | return `python3 -m sglang.launch_server --model my-model-${modelSizes[values.model]}`;
225 | }
226 | };
227 | ```
228 |
229 | ### Example 2: With Validation
230 |
231 | ```javascript
232 | const config = {
233 | options: {
234 | hardware: {
235 | name: 'hardware',
236 | title: 'Hardware',
237 | items: [
238 | { id: 'cpu', label: 'CPU', default: true },
239 | { id: 'gpu', label: 'GPU', default: false }
240 | ]
241 | },
242 | precision: {
243 | name: 'precision',
244 | title: 'Precision',
245 | items: [
246 | { id: 'fp32', label: 'FP32', default: true },
247 | { id: 'fp16', label: 'FP16', default: false }
248 | ]
249 | }
250 | },
251 | generateCommand: function(values) {
252 | // Validation
253 | if (values.hardware === 'cpu' && values.precision === 'fp16') {
254 | return '# Error: FP16 is not supported on CPU\n# Please select FP32 or use GPU';
255 | }
256 |
257 | let cmd = 'python3 -m sglang.launch_server';
258 | cmd += ` --device ${values.hardware}`;
259 | cmd += ` --precision ${values.precision}`;
260 | return cmd;
261 | }
262 | };
263 | ```
264 |
265 | ### Example 3: Complex Configuration with Checkboxes
266 |
267 | ```javascript
268 | const config = {
269 | options: {
270 | model: {
271 | name: 'model',
272 | title: 'Model',
273 | items: [
274 | { id: 'model-a', label: 'Model A', default: true },
275 | { id: 'model-b', label: 'Model B', default: false }
276 | ]
277 | },
278 | features: {
279 | name: 'features',
280 | title: 'Features',
281 | type: 'checkbox',
282 | items: [
283 | { id: 'cache', label: 'Enable Cache', default: true, required: true },
284 | { id: 'logging', label: 'Enable Logging', default: false },
285 | { id: 'profiling', label: 'Enable Profiling', default: false }
286 | ]
287 | },
288 | batchSize: {
289 | name: 'batchSize',
290 | title: 'Batch Size',
291 | type: 'text',
292 | default: '32',
293 | placeholder: 'Enter batch size'
294 | }
295 | },
296 | generateCommand: function(values) {
297 | const { model, features, batchSize } = values;
298 | const featureArray = Array.isArray(features) ? features : [];
299 |
300 | let cmd = `python3 -m sglang.launch_server --model ${model}`;
301 | cmd += ` --batch-size ${batchSize}`;
302 |
303 | if (featureArray.includes('cache')) {
304 | cmd += ' --enable-cache';
305 | }
306 | if (featureArray.includes('logging')) {
307 | cmd += ' --enable-logging';
308 | }
309 | if (featureArray.includes('profiling')) {
310 | cmd += ' --enable-profiling';
311 | }
312 |
313 | return cmd;
314 | }
315 | };
316 | ```
317 |
318 | ## Styling
319 |
320 | The component uses CSS modules with Docusaurus CSS variables for theme compatibility. The styles automatically adapt to light and dark modes.
321 |
322 | To customize the appearance, you can:
323 |
324 | 1. Modify `/src/components/ConfigGenerator/styles.module.css`
325 | 2. Override CSS variables in your custom CSS
326 | 3. Use inline styles in your wrapper component (not recommended)
327 |
328 | ## Real-World Example
329 |
330 | See `/src/components/DeepSeekR1ConfigGenerator/index.js` for a complete, production-ready example with:
331 | - Multiple option types (radio, checkbox)
332 | - Complex validation logic
333 | - Conditional command generation
334 | - Hardware-specific optimizations
335 |
336 | ## Best Practices
337 |
338 | 1. **Clear Labels**: Use descriptive labels and subtitles
339 | 2. **Sensible Defaults**: Set appropriate default values
340 | 3. **Validation**: Add validation for incompatible options
341 | 4. **Error Messages**: Provide clear error messages with solutions
342 | 5. **Documentation**: Add comments explaining complex logic
343 | 6. **Testing**: Test all combinations to ensure correct output
344 |
345 | ## Support
346 |
347 | For issues or questions, please open an issue in the repository.
348 |
349 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/docs/NVIDIA/Nemotron3-Nano.md:
--------------------------------------------------------------------------------
1 | # NVIDIA Nemotron3-Nano
2 |
3 | ## 1. Model Introduction
4 |
5 | `NVIDIA Nemotron3-Nano` is a 30B-parameter hybrid LLM that mixes Mixture-of-Experts (MoE) feed-forward layers, Mamba2 sequence-modeling layers, and standard self-attention layers in a single stack rather than classic “attention + MLP” transformer blocks.
6 |
7 | The BF16 variant (`nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16`) is designed as a high-fidelity reference model, while the FP8 variant (`nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8`) targets optimized inference performance on modern NVIDIA GPUs.
8 |
9 | At a high level:
10 |
11 | - **Hybrid layer stack (Mamba2 + MoE + attention):** The network is composed of interleaved layers that are *either* Mamba2, *or* MoE feed-forward, *or* attention-only.
12 | - **Non-uniform layer ordering:** The order and mix of these specialized layers is not a simple, rigid pattern, enabling the model to trade off sequence modeling, routing capacity, and expressivity across depth.
13 | - **Deployment-friendly precision:** Use BF16 for accuracy-sensitive and evaluation workloads; use FP8 for latency- and throughput-critical serving on recent NVIDIA GPUs.
14 |
15 | ---
16 |
17 | ## 2. SGLang Installation
18 |
19 | SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.
20 |
21 | For a quick start, please install the nightly wheel for SGLang:
22 | ```bash
23 | pip install sglang==0.5.6.post2.dev7852+g8102e36b5 --extra-index-url https://sgl-project.github.io/whl/nightly/
24 | ```
25 | ---
26 |
27 | ## 3. Model Deployment
28 |
29 | This section provides a progressive guide from quick deployment to performance tuning.
30 |
31 | ### 3.1 Basic Configuration
32 |
33 | **Interactive Command Generator**: select hardware, model variant, and common knobs to generate a launch command.
34 |
35 | import NemotronNano3ConfigGenerator from '@site/src/components/NemotronConfigGenerator';
36 |
37 | <NemotronNano3ConfigGenerator />
38 |
39 | ### 3.2 Configuration Tips
40 |
41 |
42 | - **Attention backend**:
43 | 
44 |     On H200/B200, the FlashInfer attention backend is used by default.
45 | 
46 | - **TP support**:
47 | 
48 |     To set the tensor-parallel size, use `--tp <1|2|4|8>`.
49 | 
50 | - **FP8 KV cache**:
51 | 
52 |     To enable the FP8 KV cache, append `--kv-cache-dtype fp8_e4m3`. A combined example is shown below.
53 | 
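54 | Putting the tips above together, a launch command for the FP8 checkpoint with tensor parallelism and an FP8 KV cache might look like the sketch below:
55 | 
56 | ```shell
57 | # --tp 2 is illustrative; set it to the number of GPUs you want to shard across.
58 | python3 -m sglang.launch_server \
59 |   --model-path nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 \
60 |   --trust-remote-code \
61 |   --tp 2 \
62 |   --kv-cache-dtype fp8_e4m3 \
63 |   --host 0.0.0.0 \
64 |   --port 30000
65 | ```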
54 | ---
55 |
56 | ## 4. Model Invocation
57 |
58 | ### 4.1 Basic Usage (OpenAI-Compatible API)
59 |
60 | SGLang provides an OpenAI-compatible endpoint. Example with the OpenAI Python client:
61 |
62 | ```python
63 | from openai import OpenAI
64 |
65 | client = OpenAI(
66 | base_url="http://localhost:30000/v1",
67 | api_key="EMPTY",
68 | )
69 |
70 | resp = client.chat.completions.create(
71 | model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
72 | messages=[
73 | {"role": "system", "content": "You are a helpful assistant."},
74 | {"role": "user", "content": "Summarize what MoE models are in 5 bullets."},
75 | ],
76 | temperature=0.7,
77 | max_tokens=256,
78 | )
79 |
80 | print(resp.choices[0].message.content)
81 |
82 | ```
83 |
84 | **Streaming chat completion:**
85 | ```python
86 | from openai import OpenAI
87 |
88 | client = OpenAI(
89 | base_url="http://localhost:30000/v1",
90 | api_key="EMPTY",
91 | )
92 |
93 | stream = client.chat.completions.create(
94 | model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
95 | messages=[
96 | {"role": "system", "content": "You are a helpful AI assistant."},
97 | {"role": "user", "content": "What are the first 5 prime numbers?"}
98 | ],
99 | temperature=0.7,
100 | max_tokens=1024,
101 | stream=True,
102 | )
103 | for chunk in stream:
104 | delta = chunk.choices[0].delta
105 | if delta and delta.content:
106 | print(delta.content, end="", flush=True)
107 | ```
108 |
109 | ### 4.2 Reasoning
110 | To enable reasoning, append `--reasoning-parser nano_v3` to the launch command. The model supports two modes: Reasoning ON (the default) and Reasoning OFF. Reasoning can be turned off by setting `enable_thinking` to `False`, as shown below.
111 |
112 | ```python
113 | from openai import OpenAI
114 |
115 | client = OpenAI(
116 | base_url="http://localhost:30000/v1",
117 | api_key="EMPTY",
118 | )
119 |
120 | # Reasoning on (default)
121 | print("Reasoning on")
122 | resp = client.chat.completions.create(
123 | model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
124 | messages=[
125 | {"role": "system", "content": "You are a helpful assistant."},
126 | {"role": "user", "content": "Write a haiku about GPUs."}
127 | ],
128 | temperature=0.7,
129 | max_tokens=512,
130 | )
131 | print(resp.choices[0].message.reasoning_content)
132 |
133 | # Reasoning off
134 | print("Reasoning off")
135 | resp = client.chat.completions.create(
136 | model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
137 | messages=[
138 | {"role": "system", "content": "You are a helpful assistant."},
139 | {"role": "user", "content": "Write a haiku about GPUs."}
140 | ],
141 | temperature=0.6,
142 | max_tokens=256,
143 | extra_body={"chat_template_kwargs": {"enable_thinking": False}}
144 | )
145 | print(resp.choices[0].message.content)
146 |
147 | ```
148 |
149 | ### 4.3 Tool calling
150 | To enable tool calling, append `--tool-call-parser qwen3_coder` to the launch command. Call functions using the OpenAI tools schema and inspect the returned `tool_calls`.
151 |
152 | ```python
153 | from openai import OpenAI
154 |
155 | client = OpenAI(
156 | base_url="http://localhost:30000/v1",
157 | api_key="EMPTY",
158 | )
159 |
160 | # Tool calling via OpenAI tools schema
161 | TOOLS = [
162 | {
163 | "type": "function",
164 | "function": {
165 | "name": "calculate_tip",
166 | "parameters": {
167 | "type": "object",
168 | "properties": {
169 | "bill_total": {
170 | "type": "integer",
171 | "description": "The total amount of the bill"
172 | },
173 | "tip_percentage": {
174 | "type": "integer",
175 | "description": "The percentage of tip to be applied"
176 | }
177 | },
178 | "required": ["bill_total", "tip_percentage"]
179 | }
180 | }
181 | }
182 | ]
183 |
184 | completion = client.chat.completions.create(
185 | model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
186 | messages=[
187 | {"role": "system", "content": ""},
188 | {"role": "user", "content": "My bill is $50. What will be the amount for 15% tip?"}
189 | ],
190 | tools=TOOLS,
191 | temperature=0.6,
192 | top_p=0.95,
193 | max_tokens=512,
194 | stream=False
195 | )
196 |
197 | print(completion.choices[0].message.reasoning_content)
198 | print(completion.choices[0].message.tool_calls)
199 | ```
200 |
201 | ---
202 |
203 | ## 5. Benchmark
204 |
205 | ### 5.1 Speed Benchmark
206 |
207 | **Test Environment:**
208 |
209 | - Hardware: NVIDIA B200 GPU
210 |
211 | **FP8 variant**
212 |
213 | - Model Deployment Command:
214 |
215 | ```shell
216 | python3 -m sglang.launch_server \
217 | --model-path nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 \
218 | --trust-remote-code \
219 | --max-running-requests 1024 \
220 | --host 0.0.0.0 \
221 | --port 30000
222 | ```
223 |
224 | - Benchmark Command:
225 |
226 | ```shell
227 | python3 -m sglang.bench_serving \
228 | --backend sglang \
229 | --host 127.0.0.1 \
230 | --port 30000 \
231 | --model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 \
232 | --dataset-name random \
233 | --random-input-len 1024 \
234 | --random-output-len 1024 \
235 | --num-prompts 4096 \
236 | --max-concurrency 256
237 | ```
238 |
239 | - **Test Results:**
240 |
241 | ```
242 | ============ Serving Benchmark Result ============
243 | Backend: sglang
244 | Traffic request rate: inf
245 | Max request concurrency: 256
246 | Successful requests: 4096
247 | Benchmark duration (s): 183.18
248 | Total input tokens: 2081726
249 | Total input text tokens: 2081726
250 | Total input vision tokens: 0
251 | Total generated tokens: 2116125
252 | Total generated tokens (retokenized): 1076256
253 | Request throughput (req/s): 22.36
254 | Input token throughput (tok/s): 11364.25
255 | Output token throughput (tok/s): 11552.04
256 | Peak output token throughput (tok/s): 24692.00
257 | Peak concurrent requests: 294
258 | Total token throughput (tok/s): 22916.30
259 | Concurrency: 251.19
260 | ----------------End-to-End Latency----------------
261 | Mean E2E Latency (ms): 11233.74
262 | Median E2E Latency (ms): 11142.97
263 | ---------------Time to First Token----------------
264 | Mean TTFT (ms): 172.99
265 | Median TTFT (ms): 116.57
266 | P99 TTFT (ms): 1193.68
267 | -----Time per Output Token (excl. 1st token)------
268 | Mean TPOT (ms): 21.74
269 | Median TPOT (ms): 21.14
270 | P99 TPOT (ms): 41.12
271 | ---------------Inter-Token Latency----------------
272 | Mean ITL (ms): 21.45
273 | Median ITL (ms): 9.06
274 | P95 ITL (ms): 62.59
275 | P99 ITL (ms): 110.83
276 | Max ITL (ms): 5368.19
277 | ==================================================
278 | ```
279 |
280 | **BF16 variant**
281 |
282 | - Model Deployment Command:
283 |
284 | ```shell
285 | python3 -m sglang.launch_server \
286 | --model-path nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 \
287 | --trust-remote-code \
288 | --max-running-requests 1024 \
289 | --host 0.0.0.0 \
290 | --port 30000
291 | ```
292 |
293 | - Benchmark Command:
294 |
295 | ```shell
296 | python3 -m sglang.bench_serving \
297 | --backend sglang \
298 | --host 127.0.0.1 \
299 | --port 30000 \
300 | --model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 \
301 | --dataset-name random \
302 | --random-input-len 1024 \
303 | --random-output-len 1024 \
304 | --num-prompts 4096 \
305 | --max-concurrency 256
306 | ```
307 |
308 | - **Test Results:**
309 |
310 | ```
311 | ============ Serving Benchmark Result ============
312 | Backend: sglang
313 | Traffic request rate: inf
314 | Max request concurrency: 256
315 | Successful requests: 4096
316 | Benchmark duration (s): 360.22
317 | Total input tokens: 2081726
318 | Total input text tokens: 2081726
319 | Total input vision tokens: 0
320 | Total generated tokens: 2087288
321 | Total generated tokens (retokenized): 1940652
322 | Request throughput (req/s): 11.37
323 | Input token throughput (tok/s): 5779.10
324 | Output token throughput (tok/s): 5794.55
325 | Peak output token throughput (tok/s): 9169.00
326 | Peak concurrent requests: 276
327 | Total token throughput (tok/s): 11573.65
328 | Concurrency: 249.76
329 | ----------------End-to-End Latency----------------
330 | Mean E2E Latency (ms): 21965.10
331 | Median E2E Latency (ms): 21706.35
332 | ---------------Time to First Token----------------
333 | Mean TTFT (ms): 211.54
334 | Median TTFT (ms): 93.06
335 | P99 TTFT (ms): 2637.66
336 | -----Time per Output Token (excl. 1st token)------
337 | Mean TPOT (ms): 43.27
338 | Median TPOT (ms): 43.04
339 | P99 TPOT (ms): 61.15
340 | ---------------Inter-Token Latency----------------
341 | Mean ITL (ms): 42.77
342 | Median ITL (ms): 28.46
343 | P95 ITL (ms): 71.85
344 | P99 ITL (ms): 113.20
345 | Max ITL (ms): 5237.28
346 | ==================================================
347 |
348 | ```
349 | ### 5.2 Accuracy Benchmark
350 |
351 |
352 | #### 5.2.1 GSM8K Benchmark
353 |
354 | **Environment**
355 | - Hardware: NVIDIA B200 GPU
356 | - Model: BF16 checkpoint
357 |
358 | **Launch Model**
359 | ```bash
360 | python3 -m sglang.launch_server \
361 | --model-path nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 \
362 | --trust-remote-code \
363 | --reasoning-parser nano_v3
364 | ```
365 |
366 | **Run Benchmark with lm-eval**
367 | ```bash
368 | pip install lm-eval[api]==0.4.9.2
369 |
370 | lm_eval --model local-completions --tasks gsm8k --model_args "model=nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16,base_url=http://127.0.0.1:30000/v1/completions,num_concurrent=4,max_retries=3,tokenized_requests=False,max_lengths=16384" --gen_kwargs '{"chat_template_kwargs":{"thinking":true}}' --batch_size 256
371 | ```
372 |
373 | **Test Results:**
374 | ```
375 | |Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
376 | |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
377 | |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.5603|± |0.0137|
378 | | | |strict-match | 5|exact_match|↑ |0.8453|± |0.0100|
379 | ```
380 |
381 |
382 |
383 |
384 |
--------------------------------------------------------------------------------
/src/components/Qwen3ConfigGenerator/index.js:
--------------------------------------------------------------------------------
1 | import React, { useState, useMemo } from 'react';
2 | import styles from '../ConfigGenerator/styles.module.css';
3 |
4 | /**
5 | * Qwen3 Configuration Generator
6 | * Supports multiple Qwen3 model sizes (235B, 30B, 32B, 14B, 8B, 4B, 1.7B, 0.6B)
7 | * Custom implementation to handle model-specific logic without modifying ConfigGenerator
8 | */
9 | const Qwen3ConfigGenerator = () => {
10 | const baseConfig = {
11 | modelFamily: 'Qwen',
12 |
13 | options: {
14 | hardware: {
15 | name: 'hardware',
16 | title: 'Hardware Platform',
17 | items: [
18 | { id: 'b200', label: 'B200', default: true },
19 | { id: 'h100', label: 'H100', default: false },
20 | { id: 'h200', label: 'H200', default: false }
21 | ]
22 | },
23 | modelsize: {
24 | name: 'modelsize',
25 | title: 'Model Size',
26 | items: [
27 | { id: '235b', label: '235B', subtitle: 'MOE', default: true },
28 | { id: '30b', label: '30B', subtitle: 'MOE', default: false },
29 | { id: '32b', label: '32B', subtitle: 'Dense', default: false },
30 | { id: '14b', label: '14B', subtitle: 'Dense', default: false },
31 | { id: '8b', label: '8B', subtitle: 'Dense', default: false },
32 | { id: '4b', label: '4B', subtitle: 'Dense', default: false },
33 | { id: '1.7b', label: '1.7B', subtitle: 'Dense', default: false },
34 | { id: '0.6b', label: '0.6B', subtitle: 'Dense', default: false }
35 | ]
36 | },
37 | quantization: {
38 | name: 'quantization',
39 | title: 'Quantization',
40 | items: [
41 | { id: 'bf16', label: 'BF16', default: true },
42 | { id: 'fp8', label: 'FP8', default: false }
43 | ]
44 | },
45 | category: {
46 | name: 'category',
47 | title: 'Categories',
48 | items: [
49 | { id: 'base', label: 'Base', default: true },
50 | { id: 'instruct', label: 'Instruct', default: false },
51 | { id: 'thinking', label: 'Thinking', default: false }
52 | ]
53 | },
54 | reasoningParser: {
55 | name: 'reasoningParser',
56 | title: 'Reasoning Parser',
57 | items: [
58 | { id: 'disabled', label: 'Disabled', default: true },
59 | { id: 'enabled', label: 'Enabled', default: false }
60 | ],
61 | // Only visible when category is not 'instruct'
62 | visibleWhen: (values) => values.category !== 'instruct',
63 | // Only add command when category is not 'instruct' and enabled
64 | commandRule: (value, values) => {
65 | if (value === 'enabled' && values.category !== 'instruct') {
66 | return '--reasoning-parser qwen3';
67 | }
68 | return null;
69 | }
70 | },
71 | toolcall: {
72 | name: 'toolcall',
73 | title: 'Tool Call Parser',
74 | items: [
75 | { id: 'disabled', label: 'Disabled', default: true },
76 | { id: 'enabled', label: 'Enabled', default: false }
77 | ],
78 | commandRule: (value) => value === 'enabled' ? '--tool-call-parser qwen' : null
79 | }
80 | },
81 |
82 | modelConfigs: {
83 | '235b': {
84 | baseName: '235B-A22B',
85 | hasThinkingVariants: true,
86 | h100: { tp: 8, ep: 0, bf16: true, fp8: true },
87 | h200: { tp: 8, ep: 0, bf16: true, fp8: true },
88 | b200: { tp: 8, ep: 0, bf16: true, fp8: true }
89 | },
90 | '30b': {
91 | baseName: '30B-A3B',
92 | hasThinkingVariants: true,
93 | h100: { tp: 1, ep: 0, bf16: true, fp8: true },
94 | h200: { tp: 1, ep: 0, bf16: true, fp8: true },
95 | b200: { tp: 1, ep: 0, bf16: true, fp8: true }
96 | },
97 | '32b': {
98 | baseName: '32B',
99 | hasThinkingVariants: false,
100 | h100: { tp: 1, ep: 0, bf16: true, fp8: true },
101 | h200: { tp: 1, ep: 0, bf16: true, fp8: true },
102 | b200: { tp: 1, ep: 0, bf16: true, fp8: true }
103 | },
104 | '14b': {
105 | baseName: '14B',
106 | hasThinkingVariants: false,
107 | h100: { tp: 1, ep: 0, bf16: true, fp8: true },
108 | h200: { tp: 1, ep: 0, bf16: true, fp8: true },
109 | b200: { tp: 1, ep: 0, bf16: true, fp8: true }
110 | },
111 | '8b': {
112 | baseName: '8B',
113 | hasThinkingVariants: false,
114 | h100: { tp: 1, ep: 0, bf16: true, fp8: true },
115 | h200: { tp: 1, ep: 0, bf16: true, fp8: true },
116 | b200: { tp: 1, ep: 0, bf16: true, fp8: true }
117 | },
118 | '4b': {
119 | baseName: '4B',
120 | hasThinkingVariants: true,
121 | h100: { tp: 1, ep: 0, bf16: true, fp8: true },
122 | h200: { tp: 1, ep: 0, bf16: true, fp8: true },
123 | b200: { tp: 1, ep: 0, bf16: true, fp8: true }
124 | },
125 | '1.7b': {
126 | baseName: '1.7B',
127 | hasThinkingVariants: false,
128 | h100: { tp: 1, ep: 0, bf16: true, fp8: true },
129 | h200: { tp: 1, ep: 0, bf16: true, fp8: true },
130 | b200: { tp: 1, ep: 0, bf16: true, fp8: true }
131 | },
132 | '0.6b': {
133 | baseName: '0.6B',
134 | hasThinkingVariants: false,
135 | h100: { tp: 1, ep: 0, bf16: true, fp8: true },
136 | h200: { tp: 1, ep: 0, bf16: true, fp8: true },
137 | b200: { tp: 1, ep: 0, bf16: true, fp8: true }
138 | }
139 | },
140 |
141 | specialCommands: {
142 | 'h100-235b-bf16-instruct': '# Error: Model is too large, cannot fit into 8*H100\n# Please use H200 (141GB) or select FP8 quantization',
143 | 'h100-235b-bf16-thinking': '# Error: Model is too large, cannot fit into 8*H100\n# Please use H200 (141GB) or select FP8 quantization'
144 | },
145 |
146 | generateCommand: function (values) {
147 | const { hardware, modelsize: modelSize, quantization, category } = values;
148 | const commandKey = `${hardware}-${modelSize}-${quantization}-${category}`;
149 |
150 | if (this.specialCommands[commandKey]) {
151 | return this.specialCommands[commandKey];
152 | }
153 |
154 | const config = this.modelConfigs[modelSize];
155 | if (!config) {
156 | return `# Error: Unknown model size: ${modelSize}`;
157 | }
158 |
159 | const hwConfig = config[hardware];
160 | if (!hwConfig) {
161 | return `# Error: Unknown hardware platform: ${hardware}`;
162 | }
163 |
164 | const quantSuffix = quantization === 'fp8' ? '-FP8' : '';
165 |
166 | // Build model name based on model category
167 | let modelName;
168 | if (config.hasThinkingVariants) {
169 | // Models with Instruct/Thinking variants (235B, 30B, 4B)
170 | // 4B is Dense but treated as having variants here
171 | if (category === 'base') {
172 |         // 'base' selected on a variant-capable model: use the plain base name
173 |         // without the -Instruct/-Thinking (and date) suffixes.
180 | modelName = `Qwen/Qwen3-${config.baseName}${quantSuffix}`;
181 | } else {
182 | const thinkingSuffix = category === 'thinking' ? '-Thinking' : '-Instruct';
183 | const dateSuffix = config.hasThinkingVariants ? '-2507' : '';
184 | modelName = `Qwen/Qwen3-${config.baseName}${thinkingSuffix}${dateSuffix}${quantSuffix}`;
185 | }
186 | } else {
187 | // Models without variants (32B, 14B, 8B, 1.7B, 0.6B) - base model only
188 | modelName = `Qwen/Qwen3-${config.baseName}${quantSuffix}`;
189 | }
190 |
191 | let cmd = 'python -m sglang.launch_server \\\n';
192 | cmd += ` --model ${modelName}`;
193 |
194 | if (hwConfig.tp > 1) {
195 | cmd += ` \\\n --tp ${hwConfig.tp}`;
196 | }
197 |
198 | let ep = hwConfig.ep;
199 | if (quantization === 'fp8' && hwConfig.tp === 8) {
200 | ep = 2;
201 | }
202 |
203 | if (ep > 0) {
204 | cmd += ` \\\n --ep ${ep}`;
205 | }
206 |
207 | // Apply commandRule from all options
208 | Object.entries(this.options).forEach(([key, option]) => {
209 | if (option.commandRule && values[key]) {
210 | // Pass the full values object so commandRule can access other option values
211 | const additionalCmd = option.commandRule(values[key], values);
212 | if (additionalCmd) {
213 | cmd += ` \\\n ${additionalCmd}`;
214 | }
215 | }
216 | });
217 |
218 | return cmd;
219 | }
220 | };
221 |
222 | // Initialize state with default values
223 | const getInitialState = () => {
224 | const initialState = {};
225 | Object.entries(baseConfig.options).forEach(([key, option]) => {
226 | const defaultItem = option.items.find(item => item.default);
227 | initialState[key] = defaultItem ? defaultItem.id : option.items[0].id;
228 | });
229 | return initialState;
230 | };
231 |
232 | const [values, setValues] = useState(getInitialState());
233 |
234 | // Get current model config
235 | const currentModelConfig = baseConfig.modelConfigs[values.modelsize];
236 |
237 | // Dynamically adjust options based on model selection and filter by visibleWhen
238 | const displayOptions = useMemo(() => {
239 | const options = { ...baseConfig.options };
240 |
241 | // If model doesn't have thinking variants, modify category options
242 | if (currentModelConfig && !currentModelConfig.hasThinkingVariants) {
243 | options.category = {
244 | ...baseConfig.options.category,
245 | items: baseConfig.options.category.items.map(item => ({
246 | ...item,
247 | // Disable any option that is not 'base'
248 | disabled: item.id !== 'base'
249 | }))
250 | };
251 | }
252 |
253 | // Filter options based on visibleWhen condition
254 | const filteredOptions = {};
255 | Object.entries(options).forEach(([key, option]) => {
256 | // Check if option has visibleWhen condition
257 | if (option.visibleWhen) {
258 | // Only include if visibleWhen returns true
259 | if (option.visibleWhen(values)) {
260 | filteredOptions[key] = option;
261 | }
262 | } else {
263 | // No visibleWhen condition, always include
264 | filteredOptions[key] = option;
265 | }
266 | });
267 |
268 | return filteredOptions;
269 | }, [values, currentModelConfig]);
270 |
271 | // Handle radio change with auto-switching for non-variant models
272 | const handleRadioChange = (optionName, itemId) => {
273 | setValues(prev => {
274 | const newValues = { ...prev, [optionName]: itemId };
275 |
276 | // Auto-switch to 'base' category for models without thinking variants
277 | if (optionName === 'modelsize') {
278 | const modelConfig = baseConfig.modelConfigs[itemId];
279 | if (modelConfig && !modelConfig.hasThinkingVariants) {
280 | // If current category is not base, switch to base
281 | if (newValues.category !== 'base') {
282 | newValues.category = 'base';
283 | }
284 | }
285 | }
286 |
287 | // Reset reasoningParser when switching to 'instruct' category
288 | if (optionName === 'category' && itemId === 'instruct') {
289 | newValues.reasoningParser = 'disabled';
290 | }
291 |
292 | return newValues;
293 | });
294 | };
295 |
296 | // Generate command
297 | const command = baseConfig.generateCommand(values);
298 |
299 | return (
300 |
301 | {Object.entries(displayOptions).map(([key, option], index) => (
302 |
303 |
304 | {index + 1}
305 | {option.title}
306 |
307 |
308 | {option.items.map(item => {
309 | const isChecked = values[option.name] === item.id;
310 | const isDisabled = item.disabled;
311 | return (
312 |
330 | );
331 | })}
332 |
333 |
334 | ))}
335 |
336 |
337 |
Generated Command
338 |
{command}
339 |
340 |
341 | );
342 | };
343 |
344 | export default Qwen3ConfigGenerator;
345 |
346 |
--------------------------------------------------------------------------------
/docs/GLM/GLM-4.6V.md:
--------------------------------------------------------------------------------
1 | # GLM-4.6V
2 |
3 | ## 1. Model Introduction
4 |
5 | The GLM-4.6V series includes two versions: GLM-4.6V (106B), a foundation model designed for cloud and high-performance cluster scenarios, and GLM-4.6V-Flash (9B), a lightweight model optimized for local deployment and low-latency applications. GLM-4.6V scales its context window to 128K tokens during training and achieves SoTA performance in visual understanding among models of similar parameter scale. Crucially, the GLM team integrated native Function Calling capabilities for the first time, effectively bridging the gap between "visual perception" and "executable action" and providing a unified technical foundation for multimodal agents in real-world business scenarios.
6 |
7 | Beyond achieving SoTA performance across major multimodal benchmarks at comparable model scales, GLM-4.6V introduces several key features:
8 |
9 | - **Native Multimodal Function Calling** Enables native vision-driven tool use. Images, screenshots, and document pages can be passed directly as tool inputs without text conversion, while visual outputs (charts, search images, rendered pages) are interpreted and integrated into the reasoning chain. This closes the loop from perception to understanding to execution. Please refer to this [example](#tool-call-example).
10 | - **Interleaved Image-Text Content Generation** Supports high-quality mixed media creation from complex multimodal inputs. GLM-4.6V takes a multimodal context—spanning documents, user inputs, and tool-retrieved images—and synthesizes coherent, interleaved image-text content tailored to the task. During generation it can actively call search and retrieval tools to gather and curate additional text and visuals, producing rich, visually grounded content.
11 | - **Multimodal Document Understanding** GLM-4.6V can process up to 128K tokens of multi-document or long-document input, directly interpreting richly formatted pages as images. It understands text, layout, charts, tables, and figures jointly, enabling accurate comprehension of complex, image-heavy documents without requiring prior conversion to plain text.
12 | - **Frontend Replication & Visual Editing** Reconstructs pixel-accurate HTML/CSS from UI screenshots and supports natural-language-driven edits. It detects layout, components, and styles visually, generates clean code, and applies iterative visual modifications through simple user instructions.
13 |
14 | ## 2. SGLang Installation
15 |
16 | SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.
17 |
18 | ### 2.1 Docker Installation (Recommended)
19 |
20 | ```shell
21 | docker pull lmsysorg/sglang:latest
22 | ```
23 |
24 | **Advantages:**
25 |
26 | - Ready to use out of the box, no manual environment configuration needed
27 | - Avoids dependency conflict issues
28 | - Easy to migrate between different environments
29 |
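30 | Once the image is pulled, you can launch the server inside the container. The sketch below is a starting point; the cache mount, port mapping, and `--tp 4` are assumptions that you should adapt to your environment:
31 | 
32 | ```shell
33 | # Adjust the volume mount, published port, and --tp to match your setup.
34 | docker run --gpus all --ipc=host -p 30000:30000 \
35 |   -v ~/.cache/huggingface:/root/.cache/huggingface \
36 |   lmsysorg/sglang:latest \
37 |   python3 -m sglang.launch_server \
38 |     --model-path zai-org/GLM-4.6V \
39 |     --tp 4 \
40 |     --host 0.0.0.0 --port 30000
41 | ```
42 | 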
30 | ### 2.2 Build from Source
31 |
32 | If you need to use the latest development version or require custom modifications, you can build from source:
33 |
34 | ```bash
35 | # Install SGLang using UV (recommended)
36 | git clone https://github.com/sgl-project/sglang.git
37 | cd sglang
38 | uv venv
39 | source .venv/bin/activate
40 | uv pip install -e "python[all]" --index-url=https://pypi.org/simple
41 | pip install nvidia-cudnn-cu12==9.16.0.29
42 | # Install ffmpeg to support video input
43 | sudo apt update
44 | sudo apt install ffmpeg
45 | ```
46 |
47 | **Use Cases:**
48 |
49 | - Need to customize and modify SGLang source code
50 | - Want to use the latest development features
51 | - Participate in SGLang project development
52 |
53 | For general installation instructions, you can also refer to the [official SGLang installation guide](https://docs.sglang.ai/get_started/install.html).
54 |
55 | ## 3. Model Deployment
56 |
57 | ### 3.1 Basic Configuration
58 |
59 | **Interactive Command Generator**: Use the interactive configuration generator below to customize your deployment settings. Select your hardware platform, model size, quantization method, and other options to generate the appropriate launch command.
60 |
61 | import GLM46VConfigGenerator from '@site/src/components/GLM46VConfigGenerator';
62 |
63 | <GLM46VConfigGenerator />
64 |
65 | ### 3.2 Configuration Tips
66 | For more detailed configuration tips, please refer to [GLM-4.5V/GLM-4.6V Usage](https://docs.sglang.io/basic_usage/glmv.html).
67 |
68 | ## 4. Example APIs
69 |
70 | ### Image Input Example
71 |
72 | #### API Payload
73 | ```python
74 | curl_command = f"""
75 | curl -s http://localhost:{30000}/v1/chat/completions \\
76 | -H "Content-Type: application/json" \\
77 | -d '{{
78 | "model": "default",
79 | "messages": [
80 | {{
81 | "role": "user",
82 | "content": [
83 | {{
84 | "type": "image_url",
85 | "image_url": {{
86 | "url": "/home/jobuser/sgl_logo.png"
87 | }}
88 | }},
89 | {{
90 | "type": "text",
91 | "text": "What is the image"
92 | }}
93 | ]
94 | }}
95 | ],
96 | "temperature": "0",
97 | "max_completion_tokens": "1000",
98 | "max_tokens": "1000"
99 | }}'
100 | """
101 |
102 | response = subprocess.check_output(curl_command, shell=True).decode()
103 | print(response)
104 | ```
105 |
106 | #### API Response
107 | ```shell
108 | {"id":"b61596ca71394dd699fd8abd4f650c44","object":"chat.completion","created":1765259019,"model":"default","choices":[{"index":0,"message":{"role":"assistant","content":"The image is a logo featuring the text \"SGL\" (in a bold, orange-brown font) alongside a stylized icon. The icon includes a network-like structure with circular nodes (suggesting connectivity or a tree/graph structure) and a tag with \">\" (a common symbol for coding, web development, or software). The color scheme uses warm orange-brown tones with a black background, giving it a tech-focused, modern aesthetic (likely representing a company, project, or tool related to software, web development, or digital technology).<|begin_of_box|>SGL logo (stylized text + network/coding icon)<|end_of_box|>","reasoning_content":"Okay, let's see. The image has a logo with the text \"SGL\" and a little icon on the left. The icon looks like a network or a tree structure with circles, and there's a tag with \">\" which is a common symbol for coding or web development. The colors are orange and brown tones, with a black background. So probably a logo for a company or project named SGL, maybe related to software, web development, or a tech company.","tool_calls":null},"logprobs":null,"finish_reason":"stop","matched_stop":151336}],"usage":{"prompt_tokens":2222,"total_tokens":2448,"completion_tokens":226,"prompt_tokens_details":null,"reasoning_tokens":0},"metadata":{"weight_version":"default"}}
109 | ```
110 |
111 | ### Video Input Example
112 |
113 | #### API Payload
114 | ```python
115 | import subprocess
116 | 
117 | curl_command = f"""
116 | curl -s http://localhost:{30000}/v1/chat/completions \\
117 | -H "Content-Type: application/json" \\
118 | -d '{{
119 | "model": "default",
120 | "messages": [
121 | {{
122 | "role": "user",
123 | "content": [
124 | {{
125 | "type": "video_url",
126 | "video_url": {{
127 | "url": "/home/jobuser/jobs_presenting_ipod.mp4"
128 | }}
129 | }},
130 | {{
131 | "type": "text",
132 | "text": "What is the image"
133 | }}
134 | ]
135 | }}
136 | ],
137 | "temperature": "0",
138 | "max_completion_tokens": "1000",
139 | "max_tokens": "1000"
140 | }}'
141 | """
142 |
143 | response = subprocess.check_output(curl_command, shell=True).decode()
144 | print(response)
145 | ```
146 |
147 | #### API Response
148 | ```shell
149 | {"id":"520e0a079e5d4b17b82a6af619315a97","object":"chat.completion","created":1765259029,"model":"default","choices":[{"index":0,"message":{"role":"assistant","content":"The image is a still from a presentation by a man on a stage. He is pointing to a small pocket on his jeans and asking the audience what the pocket is for. The video is being shared by Evan Carmichael. The man then reveals that the pocket is for an iPod Nano.","reasoning_content":"Based on the visual evidence in the video, here is a breakdown of what is being shown:\n\n* **Subject:** The video features a man on a stage, giving a presentation. He is wearing a black t-shirt and dark jeans.\n* **Action:** The man is pointing to a pocket on his jeans. He is asking the audience a question about the purpose of this pocket.\n* **Context:** The presentation is being filmed, and the video is being shared by \"Evan Carmichael,\" a well-known motivational speaker and content creator. The source of the clip is credited to \"JoshuaG.\"\n* **Reveal:** The man then reveals the answer to his question. He pulls a small, white, rectangular device out of the pocket. He identifies this device as an \"iPod Nano.\"\n\nIn summary, the image is a still from a presentation where a speaker is explaining the purpose of the small pocket found on many pairs of jeans.","tool_calls":null},"logprobs":null,"finish_reason":"stop","matched_stop":151336}],"usage":{"prompt_tokens":30276,"total_tokens":30532,"completion_tokens":256,"prompt_tokens_details":null,"reasoning_tokens":0},"metadata":{"weight_version":"default"}}
150 | ```
151 |
152 | ### Tool Call Example
153 |
154 | #### Payload
155 | ```python
156 | from openai import OpenAI
157 | import argparse
158 | import sys
159 | import base64
160 |
161 | def image_to_base64(image_path):
162 | """Convert image file to base64 data URL format for OpenAI API"""
163 | with open(image_path, 'rb') as image_file:
164 | image_data = image_file.read()
165 | base64_string = base64.b64encode(image_data).decode('utf-8')
166 | return f"data:image/png;base64,{base64_string}"
167 |
168 | openai_api_key = "EMPTY"
169 | openai_api_base = "http://127.0.0.1:30000/v1"
170 | client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
171 |
172 |
173 |
174 | tools = [
175 | {
176 | "type": "function",
177 | "function": {
178 | "name": "get_weather",
179 | "description": "Get current temperature for a given location.",
180 | "parameters": {
181 | "type": "object",
182 | "properties": {
183 | "location": {
184 | "type": "string",
185 | "description": "City and country e.g. Beijing, China",
186 | }
187 | },
188 | "required": ["location"],
189 | "additionalProperties": False,
190 | },
191 | },
192 | }
193 | ]
194 |
195 |
196 | messages = [
197 | {
198 | "role": "user",
199 | "content": "Please help me check today’s weather in Beijing, and tell me whether the tool returned an image."
200 | },
201 | {
202 | "role": "assistant",
203 | "tool_calls": [
204 | {
205 | "id": "call_bk32t88BGpSdbtDgzT044Rh4",
206 | "type": "function",
207 | "function": {
208 | "name": 'get_weather',
209 | "arguments": '{"location":"Beijing, China"}'
210 | }
211 | }
212 | ]
213 | },
214 | {
215 | "role": "tool",
216 | "tool_call_id": "call_bk32t88BGpSdbtDgzT044Rh4",
217 | "content": [
218 | {
219 | "type": "text",
220 | "text": "Weather report generated: Beijing, November 7, 2025, sunny, temperature 2°C."
221 | },
222 | {
223 | "type": "image_url",
224 | "image_url": {
225 | "url": "/home/jobuser/sgl_logo.png"
226 | }
227 | }
228 | ]
229 | },
230 | ]
231 |
232 | response = client.chat.completions.create(
233 | model="zai-org/GLM-4.6V",
234 | messages=messages,
235 | timeout=900,
236 | tools=tools
237 | )
238 | print(response.choices[0].message.content.strip())
239 | ```
240 |
241 | #### Output
242 |
243 | ```shell
244 | The weather in Beijing today (November 7, 2025) is sunny with a temperature of 2°C.
245 |
246 | Yes, the tool returned an image (the SGL logo).
247 | ```
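
The payload above replays a pre-recorded tool call and tool result. To obtain the tool call in the first place, the question can be sent together with the tool schema and the model left to decide; the sketch below reuses the `client` and `tools` objects defined in the payload script (the tool-call id and arguments will differ between runs).

```python
# First round: send only the user question plus the tool schema and let the
# model decide whether to call get_weather.
first = client.chat.completions.create(
    model="zai-org/GLM-4.6V",
    messages=[{"role": "user", "content": "Please help me check today's weather in Beijing."}],
    tools=tools,
    timeout=900,
)

message = first.choices[0].message
if message.tool_calls:
    for call in message.tool_calls:
        # Each entry carries a generated id, the function name, and a JSON string
        # of arguments, e.g. get_weather with {"location": "Beijing, China"}.
        print(call.id, call.function.name, call.function.arguments)
else:
    # The model may also answer directly without calling the tool.
    print(message.content)
```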
248 |
249 | ## 5. Benchmark
250 |
251 | ### 5.1. Text Benchmark: Latency, Throughput and Accuracy
252 | 
Run the GSM8K benchmark against the running server; it reports accuracy together with latency and output throughput.

#### Command
253 | ```shell
254 | python3 ./benchmark/gsm8k/bench_sglang.py
255 | ```
256 |
257 | ### 5.2. Multimodal Benchmark - Latency and Throughput
258 |
259 | #### Command
260 | ```shell
261 | python3 -m sglang.bench_serving \
262 | --backend sglang \
263 | --port 30000 \
264 | --model zai-org/GLM-4.6V \
265 | --dataset-name image \
266 | --image-count 2 \
267 | --image-resolution 720p \
268 | --random-input-len 128 \
269 | --random-output-len 1024 \
270 | --num-prompts 128 \
271 |     --max-concurrency 64
272 | ```
273 |
274 | #### Response
275 | ```shell
276 | ============ Serving Benchmark Result ============
277 | Backend: sglang
278 | Traffic request rate: inf
279 | Max request concurrency: 64
280 | Successful requests: 128
281 | Benchmark duration (s): 30.60
282 | Total input tokens: 315362
283 | Total input text tokens: 8674
284 | Total input vision tokens: 306688
285 | Total generated tokens: 63692
286 | Total generated tokens (retokenized): 63662
287 | Request throughput (req/s): 4.18
288 | Input token throughput (tok/s): 10305.12
289 | Output token throughput (tok/s): 2081.27
290 | Peak output token throughput (tok/s): 3007.00
291 | Peak concurrent requests: 71
292 | Total token throughput (tok/s): 12386.39
293 | Concurrency: 48.29
294 | ----------------End-to-End Latency----------------
295 | Mean E2E Latency (ms): 11546.09
296 | Median E2E Latency (ms): 11856.43
297 | ---------------Time to First Token----------------
298 | Mean TTFT (ms): 286.91
299 | Median TTFT (ms): 259.37
300 | P99 TTFT (ms): 575.39
301 | -----Time per Output Token (excl. 1st token)------
302 | Mean TPOT (ms): 22.87
303 | Median TPOT (ms): 23.48
304 | P99 TPOT (ms): 25.89
305 | ---------------Inter-Token Latency----------------
306 | Mean ITL (ms): 22.67
307 | Median ITL (ms): 20.01
308 | P95 ITL (ms): 68.51
309 | P99 ITL (ms): 74.81
310 | Max ITL (ms): 189.34
311 | ==================================================
312 | ```
313 |
314 |
315 | ### 5.3. Multimodal Accuracy Benchmark - MMMU
316 |
317 | #### Command
318 | ```shell
319 | python3 benchmark/mmmu/bench_sglang.py --response-answer-regex "<\|begin_of_box\|>(.*)<\|end_of_box\|>" --port 30000 --concurrency 64 --extra-request-body '{"max_tokens": 4096}'
320 | ```
321 |
322 | #### Response
323 | ```shell
324 | Benchmark time: 487.2229107860476
325 | answers saved to: ./answer_sglang.json
326 | Evaluating...
327 | answers saved to: ./answer_sglang.json
328 | {'Accounting': {'acc': 0.962, 'num': 26},
329 | 'Agriculture': {'acc': 0.5, 'num': 30},
330 | 'Architecture_and_Engineering': {'acc': 0.733, 'num': 15},
331 | 'Art': {'acc': 0.833, 'num': 30},
332 | 'Art_Theory': {'acc': 0.9, 'num': 30},
333 | 'Basic_Medical_Science': {'acc': 0.733, 'num': 30},
334 | 'Biology': {'acc': 0.586, 'num': 29},
335 | 'Chemistry': {'acc': 0.654, 'num': 26},
336 | 'Clinical_Medicine': {'acc': 0.633, 'num': 30},
337 | 'Computer_Science': {'acc': 0.76, 'num': 25},
338 | 'Design': {'acc': 0.867, 'num': 30},
339 | 'Diagnostics_and_Laboratory_Medicine': {'acc': 0.633, 'num': 30},
340 | 'Economics': {'acc': 0.862, 'num': 29},
341 | 'Electronics': {'acc': 0.5, 'num': 18},
342 | 'Energy_and_Power': {'acc': 0.875, 'num': 16},
343 | 'Finance': {'acc': 0.857, 'num': 28},
344 | 'Geography': {'acc': 0.714, 'num': 28},
345 | 'History': {'acc': 0.767, 'num': 30},
346 | 'Literature': {'acc': 0.897, 'num': 29},
347 | 'Manage': {'acc': 0.759, 'num': 29},
348 | 'Marketing': {'acc': 1.0, 'num': 26},
349 | 'Materials': {'acc': 0.833, 'num': 18},
350 | 'Math': {'acc': 0.76, 'num': 25},
351 | 'Mechanical_Engineering': {'acc': 0.619, 'num': 21},
352 | 'Music': {'acc': 0.286, 'num': 28},
353 | 'Overall': {'acc': 0.761, 'num': 803},
354 | 'Overall-Art and Design': {'acc': 0.729, 'num': 118},
355 | 'Overall-Business': {'acc': 0.884, 'num': 138},
356 | 'Overall-Health and Medicine': {'acc': 0.773, 'num': 150},
357 | 'Overall-Humanities and Social Science': {'acc': 0.78, 'num': 118},
358 | 'Overall-Science': {'acc': 0.728, 'num': 136},
359 | 'Overall-Tech and Engineering': {'acc': 0.671, 'num': 143},
360 | 'Pharmacy': {'acc': 0.933, 'num': 30},
361 | 'Physics': {'acc': 0.929, 'num': 28},
362 | 'Psychology': {'acc': 0.733, 'num': 30},
363 | 'Public_Health': {'acc': 0.933, 'num': 30},
364 | 'Sociology': {'acc': 0.724, 'num': 29}}
365 | eval out saved to ./val_sglang.json
366 | Overall accuracy: 0.761
367 | ```
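
As shown in the image example earlier, GLM-4.6V wraps its final answer in `<|begin_of_box|>` / `<|end_of_box|>` markers, which is what the `--response-answer-regex` above matches. When post-processing responses yourself, the boxed answer can be extracted with a small helper; this is a sketch using a non-greedy variant of the same pattern.

```python
import re

# Non-greedy variant of the regex passed to --response-answer-regex above.
BOX_PATTERN = re.compile(r"<\|begin_of_box\|>(.*?)<\|end_of_box\|>", re.DOTALL)

def extract_boxed_answer(text):
    """Return the content of the first boxed span, or None if no box is present."""
    match = BOX_PATTERN.search(text)
    return match.group(1).strip() if match else None

# Example taken from the image response earlier on this page.
print(extract_boxed_answer(
    "... modern aesthetic.<|begin_of_box|>SGL logo (stylized text + network/coding icon)<|end_of_box|>"
))
# -> SGL logo (stylized text + network/coding icon)
```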
--------------------------------------------------------------------------------
/docs/Moonshotai/Kimi-K2.md:
--------------------------------------------------------------------------------
1 | # Kimi-K2
2 |
3 | ## 1. Model Introduction
4 |
5 | [Kimi-K2](https://moonshotai.github.io/Kimi-K2/) is a state-of-the-art MoE language model by Moonshot AI with 32B activated parameters and 1T total parameters.
6 |
7 | **Model Variants:**
8 |
9 | - **[Kimi-K2-Instruct](https://huggingface.co/moonshotai/Kimi-K2-Instruct)**: Post-trained model optimized for general-purpose chat and agentic tasks. Compatible with vLLM, SGLang, KTransformers, and TensorRT-LLM.
10 | - **[Kimi-K2-Thinking](https://huggingface.co/moonshotai/Kimi-K2-Thinking)**: Advanced thinking model with step-by-step reasoning and tool calling. Native INT4 quantization with 256k context window. Ideal for complex reasoning and multi-step tool use.
11 |
12 | For details, see [official documentation](https://github.com/MoonshotAI/Kimi-K2) and [technical report](https://www.arxiv.org/abs/2507.20534).
13 |
14 | ## 2. SGLang Installation
15 |
16 | Refer to the [official SGLang installation guide](https://docs.sglang.ai/get_started/install.html).
17 |
18 | ## 3. Model Deployment
19 |
20 | This section provides a progressive guide from quick deployment to performance optimization, suitable for users at different levels.
21 |
22 | ### 3.1 Basic Configuration
23 |
24 | **Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, model variant, deployment strategy, and capabilities.
25 |
26 | import KimiK2ConfigGenerator from '@site/src/components/KimiK2ConfigGenerator';
27 |
28 | <KimiK2ConfigGenerator />
29 |
30 | ### 3.2 Configuration Tips
31 |
32 | - **Memory**: Requires 8 GPUs with ≥140GB each (H200/B200). Use `--context-length 128000` to conserve memory.
33 | - **Expert Parallelism (EP)**: Use `--ep` for better MoE throughput. See [EP docs](https://github.com/sgl-project/sglang/blob/main/docs/advanced_features/expert_parallelism.md).
34 | - **Data Parallel (DP)**: Enable with `--dp 4 --enable-dp-attention` for production throughput.
35 | - **KV Cache**: Use `--kv-cache-dtype fp8_e4m3` to reduce KV cache memory usage by roughly 50% (requires CUDA 11.8+).
36 | - **Reasoning Parser**: Add `--reasoning-parser kimi_k2` for Kimi-K2-Thinking to separate thinking and content.
37 | - **Tool Call Parser**: Add `--tool-call-parser kimi_k2` for structured tool calls.
38 |
39 | ## 4. Model Invocation
40 |
41 | ### 4.1 Basic Usage
42 |
43 | See [Basic API Usage](https://docs.sglang.ai/get_started/quick_start.html).
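
As a quick sanity check, a minimal non-streaming request can be sent with the OpenAI-compatible client. This is a sketch assuming a server launched as in Section 3 and listening on port 8000; adjust the model name to the variant you deployed.

```python
from openai import OpenAI

# Assumes a Kimi-K2 server from Section 3 listening on localhost:8000.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="moonshotai/Kimi-K2-Instruct",
    messages=[{"role": "user", "content": "Summarize in one sentence what SGLang is."}],
    temperature=0.6,
    max_tokens=256,
)
print(response.choices[0].message.content)
```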
44 |
45 | ### 4.2 Advanced Usage
46 |
47 | #### 4.2.1 Reasoning Parser
48 |
49 | Enable reasoning parser for Kimi-K2-Thinking:
50 |
51 | ```shell
52 | python -m sglang.launch_server \
53 | --model moonshotai/Kimi-K2-Thinking \
54 | --reasoning-parser kimi_k2 \
55 | --tp 8 \
56 | --host 0.0.0.0 \
57 | --port 8000
58 | ```
59 |
60 | **Example:**
61 |
62 | ```python
63 | from openai import OpenAI
64 |
65 | client = OpenAI(
66 | base_url="http://localhost:8000/v1",
67 | api_key="EMPTY"
68 | )
69 |
70 | # Enable streaming to see the thinking process in real-time
71 | response = client.chat.completions.create(
72 | model="moonshotai/Kimi-K2-Thinking",
73 | messages=[
74 | {"role": "user", "content": "Solve this problem step by step: What is 15% of 240?"}
75 | ],
76 | temperature=0.6,
77 | max_tokens=2048,
78 | stream=True
79 | )
80 |
81 | # Process the stream
82 | has_thinking = False
83 | has_answer = False
84 | thinking_started = False
85 |
86 | for chunk in response:
87 | if chunk.choices and len(chunk.choices) > 0:
88 | delta = chunk.choices[0].delta
89 |
90 | # Print thinking process
91 | if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
92 | if not thinking_started:
93 | print("=============== Thinking =================", flush=True)
94 | thinking_started = True
95 | has_thinking = True
96 | print(delta.reasoning_content, end="", flush=True)
97 |
98 | # Print answer content
99 | if delta.content:
100 | # Close thinking section and add content header
101 | if has_thinking and not has_answer:
102 | print("\n=============== Content =================", flush=True)
103 | has_answer = True
104 | print(delta.content, end="", flush=True)
105 |
106 | print()
107 | ```
108 |
109 | **Output Example:**
110 |
111 | ```text
112 | =============== Thinking =================
113 | The user asks: "What is 15% of 240?" This is a straightforward percentage calculation problem. I need to solve it step by step.
114 |
115 | Step 1: Understand what "percent" means.
116 | - "Percent" means "per hundred". So 15% means 15 per 100, or 15/100, or 0.15.
117 |
118 | Step 2: Convert the percentage to a decimal.
119 | - 15% = 15 / 100 = 0.15
120 |
121 | Step 3: Multiply the decimal by the number.
122 | - 0.15 * 240
123 |
124 | Step 4: Perform the multiplication.
125 | - 0.15 * 240 = (15/100) * 240
126 | - = 15 * 240 / 100
127 | - = 3600 / 100
128 | - = 36
129 |
130 | Alternatively, I can calculate it directly:
131 | - 0.15 * 240
132 | - 15 * 240 = 3600
133 | - 3600 / 100 = 36
134 |
135 | Or, break it down:
136 | - 10% of 240 = 24
137 | - 5% of 240 = half of 10% = 12
138 | - 15% of 240 = 10% + 5% = 24 + 12 = 36
139 |
140 | I should present the solution clearly with steps. The most standard method is converting to decimal and multiplying.
141 |
142 | Let me structure the answer:
143 | 1. Convert the percentage to a decimal.
144 | 2. Multiply the decimal by the number.
145 | 3. Show the calculation.
146 | 4. State the final answer.
147 |
148 | This is simple and easy to follow.
149 | =============== Content =================
150 | Here is the step-by-step solution:
151 |
152 | **Step 1: Convert the percentage to a decimal**
153 | 15% means 15 per 100, which is 15 ÷ 100 = **0.15**
154 |
155 | **Step 2: Multiply the decimal by the number**
156 | 0.15 × 240
157 |
158 | **Step 3: Calculate the result**
159 | 0.15 × 240 = **36**
160 |
161 | **Answer:** 15% of 240 is **36**.
162 | ```
163 |
164 | **Note:** The reasoning parser captures the model's step-by-step thinking process, allowing you to see how the model arrives at its conclusions.
165 |
166 | #### 4.2.2 Tool Calling
167 |
168 | Kimi-K2-Instruct and Kimi-K2-Thinking support tool calling capabilities. Enable the tool call parser during deployment:
169 |
170 | **Deployment Command:**
171 |
172 | ```shell
173 | python -m sglang.launch_server \
174 | --model moonshotai/Kimi-K2-Instruct \
175 | --tool-call-parser kimi_k2 \
176 | --tp 8 \
177 | --trust-remote-code \
178 | --host 0.0.0.0 \
179 | --port 8000
180 | ```
181 |
182 | **Python Example (with Thinking Process):**
183 |
184 | ```python
185 | from openai import OpenAI
186 |
187 | client = OpenAI(
188 | base_url="http://localhost:8000/v1",
189 | api_key="EMPTY"
190 | )
191 |
192 | # Define available tools
193 | tools = [
194 | {
195 | "type": "function",
196 | "function": {
197 | "name": "get_weather",
198 | "description": "Get the current weather for a location",
199 | "parameters": {
200 | "type": "object",
201 | "properties": {
202 | "location": {
203 | "type": "string",
204 | "description": "The city name"
205 | },
206 | "unit": {
207 | "type": "string",
208 | "enum": ["celsius", "fahrenheit"],
209 | "description": "Temperature unit"
210 | }
211 | },
212 | "required": ["location"]
213 | }
214 | }
215 | }
216 | ]
217 |
218 | # Make request with streaming to see thinking process
219 | response = client.chat.completions.create(
220 | model="moonshotai/Kimi-K2-Thinking",
221 | messages=[
222 | {"role": "user", "content": "What's the weather in Beijing?"}
223 | ],
224 | tools=tools,
225 | temperature=0.7,
226 | stream=True
227 | )
228 |
229 | # Process streaming response
230 | thinking_started = False
231 | has_thinking = False
232 | tool_calls_accumulator = {}
233 |
234 | for chunk in response:
235 | if chunk.choices and len(chunk.choices) > 0:
236 | delta = chunk.choices[0].delta
237 |
238 | # Print thinking process
239 | if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
240 | if not thinking_started:
241 | print("=============== Thinking =================", flush=True)
242 | thinking_started = True
243 | has_thinking = True
244 | print(delta.reasoning_content, end="", flush=True)
245 |
246 | # Accumulate tool calls
247 | if hasattr(delta, 'tool_calls') and delta.tool_calls:
248 | # Close thinking section if needed
249 | if has_thinking and thinking_started:
250 | print("\n=============== Content =================\n", flush=True)
251 | thinking_started = False
252 |
253 | for tool_call in delta.tool_calls:
254 | index = tool_call.index
255 | if index not in tool_calls_accumulator:
256 | tool_calls_accumulator[index] = {
257 | 'name': None,
258 | 'arguments': ''
259 | }
260 |
261 | if tool_call.function:
262 | if tool_call.function.name:
263 | tool_calls_accumulator[index]['name'] = tool_call.function.name
264 | if tool_call.function.arguments:
265 | tool_calls_accumulator[index]['arguments'] += tool_call.function.arguments
266 |
267 | # Print content
268 | if delta.content:
269 | print(delta.content, end="", flush=True)
270 |
271 | # Print accumulated tool calls
272 | for index, tool_call in sorted(tool_calls_accumulator.items()):
273 | print(f"🔧 Tool Call: {tool_call['name']}")
274 | print(f" Arguments: {tool_call['arguments']}")
275 |
276 | print()
277 | ```
278 |
279 | **Output Example:**
280 |
281 | ```
282 | =============== Thinking =================
283 | The user is asking about the weather in Beijing. I need to use the get_weather function to retrieve this information. Beijing is a major city in China, so I should be able to get weather data for it. The location parameter is required, but the unit parameter is optional. Since the user didn't specify a temperature unit, I can just provide the location and let the function use its default. I'll check the weather in Beijing for you.
284 | =============== Content =================
285 |
286 | 🔧 Tool Call: get_weather
287 | Arguments: {"location":"Beijing"}
288 | ```
289 |
290 | **Note:**
291 |
292 | - The reasoning parser shows how the model decides to use a tool
293 | - Tool calls are clearly marked with the function name and arguments
294 | - You can then execute the function and send the result back to continue the conversation
295 |
296 | **Handling Tool Call Results:**
297 |
298 | ```python
299 | # After getting the tool call, execute the function
300 | def get_weather(location, unit="celsius"):
301 | # Your actual weather API call here
302 | return f"The weather in {location} is 22°{unit[0].upper()} and sunny."
303 |
304 | # Send tool result back to the model
305 | messages = [
306 | {"role": "user", "content": "What's the weather in Beijing?"},
307 | {
308 | "role": "assistant",
309 | "content": None,
310 | "tool_calls": [{
311 | "id": "call_123",
312 | "type": "function",
313 | "function": {
314 | "name": "get_weather",
315 | "arguments": '{"location": "Beijing", "unit": "celsius"}'
316 | }
317 | }]
318 | },
319 | {
320 | "role": "tool",
321 | "tool_call_id": "call_123",
322 | "content": get_weather("Beijing", "celsius")
323 | }
324 | ]
325 |
326 | final_response = client.chat.completions.create(
327 | model="moonshotai/Kimi-K2-Thinking",
328 | messages=messages,
329 | temperature=0.7
330 | )
331 |
332 | print(final_response.choices[0].message.content)
333 | # Output: "The weather in Beijing is currently 22°C and sunny."
334 | ```
335 |
336 | ## 5. Benchmark
337 |
338 | ### 5.1 Speed Benchmark
339 |
340 | **Test Environment:**
341 |
342 | - Hardware: NVIDIA B200 GPU (8x)
343 | - Model: Kimi-K2-Instruct
344 | - sglang version: 0.5.6.post1
345 |
346 | We use SGLang's built-in benchmarking tool to conduct performance evaluation on the [ShareGPT_Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) dataset. This dataset contains real conversation data and can better reflect performance in actual use scenarios.
347 |
348 | #### 5.1.1 Latency-Sensitive Benchmark
349 |
350 | - Model Deployment Command:
351 |
352 | ```shell
353 | python3 -m sglang.launch_server \
354 | --model-path moonshotai/Kimi-K2-Instruct \
355 | --tp 8 \
356 | --dp 4 \
357 | --enable-dp-attention \
358 | --trust-remote-code \
359 | --host 0.0.0.0 \
360 | --port 8000
361 | ```
362 |
363 | - Benchmark Command:
364 |
365 | ```shell
366 | python3 -m sglang.bench_serving \
367 | --backend sglang \
368 | --host 127.0.0.1 \
369 | --port 8000 \
370 |     --model moonshotai/Kimi-K2-Instruct \
371 | --num-prompts 10 \
372 | --max-concurrency 1
373 | ```
374 |
375 | - **Test Results**:
376 |
377 | ```
378 | ============ Serving Benchmark Result ============
379 | Backend: sglang
380 | Traffic request rate: inf
381 | Max request concurrency: 1
382 | Successful requests: 10
383 | Benchmark duration (s): 44.93
384 | Total input tokens: 1951
385 | Total input text tokens: 1951
386 | Total input vision tokens: 0
387 | Total generated tokens: 2755
388 | Total generated tokens (retokenized): 2748
389 | Request throughput (req/s): 0.22
390 | Input token throughput (tok/s): 43.42
391 | Output token throughput (tok/s): 61.32
392 | Peak output token throughput (tok/s): 64.00
393 | Peak concurrent requests: 3
394 | Total token throughput (tok/s): 104.74
395 | Concurrency: 1.00
396 | ----------------End-to-End Latency----------------
397 | Mean E2E Latency (ms): 4489.56
398 | Median E2E Latency (ms): 4994.53
399 | ---------------Time to First Token----------------
400 | Mean TTFT (ms): 141.22
401 | Median TTFT (ms): 158.28
402 | P99 TTFT (ms): 166.90
403 | -----Time per Output Token (excl. 1st token)------
404 | Mean TPOT (ms): 18.40
405 | Median TPOT (ms): 15.63
406 | P99 TPOT (ms): 39.88
407 | ---------------Inter-Token Latency----------------
408 | Mean ITL (ms): 15.78
409 | Median ITL (ms): 15.76
410 | P95 ITL (ms): 16.36
411 | P99 ITL (ms): 16.59
412 | Max ITL (ms): 19.94
413 | ==================================================
414 | ```
415 |
416 | #### 5.1.2 Throughput-Sensitive Benchmark
417 |
418 | - Model Deployment Command:
419 |
420 | ```shell
421 | python3 -m sglang.launch_server \
422 | --model-path moonshotai/Kimi-K2-Instruct \
423 | --tp 8 \
424 | --dp 4 \
425 | --ep 4 \
426 | --enable-dp-attention \
427 | --trust-remote-code \
428 | --host 0.0.0.0 \
429 | --port 8000
430 | ```
431 |
432 | - Benchmark Command:
433 |
434 | ```shell
435 | python3 -m sglang.bench_serving \
436 | --backend sglang \
437 | --host 127.0.0.1 \
438 | --port 8000 \
439 |     --model moonshotai/Kimi-K2-Instruct \
440 | --num-prompts 1000 \
441 | --max-concurrency 100
442 | ```
443 |
444 | - **Test Results**:
445 |
446 | ```
447 | ============ Serving Benchmark Result ============
448 | Backend: sglang
449 | Traffic request rate: inf
450 | Max request concurrency: 100
451 | Successful requests: 1000
452 | Benchmark duration (s): 174.11
453 | Total input tokens: 296642
454 | Total input text tokens: 296642
455 | Total input vision tokens: 0
456 | Total generated tokens: 193831
457 | Total generated tokens (retokenized): 168687
458 | Request throughput (req/s): 5.74
459 | Input token throughput (tok/s): 1703.73
460 | Output token throughput (tok/s): 1113.25
461 | Peak output token throughput (tok/s): 2383.00
462 | Peak concurrent requests: 112
463 | Total token throughput (tok/s): 2816.97
464 | Concurrency: 89.60
465 | ----------------End-to-End Latency----------------
466 | Mean E2E Latency (ms): 15601.09
467 | Median E2E Latency (ms): 10780.52
468 | ---------------Time to First Token----------------
469 | Mean TTFT (ms): 457.42
470 | Median TTFT (ms): 221.62
471 | P99 TTFT (ms): 2475.32
472 | -----Time per Output Token (excl. 1st token)------
473 | Mean TPOT (ms): 97.23
474 | Median TPOT (ms): 85.61
475 | P99 TPOT (ms): 435.95
476 | ---------------Inter-Token Latency----------------
477 | Mean ITL (ms): 78.61
478 | Median ITL (ms): 43.66
479 | P95 ITL (ms): 169.53
480 | P99 ITL (ms): 260.91
481 | Max ITL (ms): 1703.21
482 | ==================================================
483 | ```
484 |
485 | ### 5.2 Accuracy Benchmark
486 |
487 | #### 5.2.1 GSM8K Benchmark
488 |
489 | - Server Command
490 |
491 | ```shell
492 | python3 -m sglang.launch_server \
493 | --model-path moonshotai/Kimi-K2-Instruct \
494 | --tp 8 \
495 | --dp 4 \
496 | --trust-remote-code \
497 | --host 0.0.0.0 \
498 | --port 8000
499 | ```
500 |
501 | - Benchmark Command
502 |
503 | ```shell
504 | python3 -m sglang.test.few_shot_gsm8k --num-questions 200 --port 8000
505 | ```
506 |
507 | - **Result**:
508 |
509 | ```
510 | Accuracy: 0.960
511 | Invalid: 0.000
512 | Latency: 15.956 s
513 | Output throughput: 1231.699 token/s
514 | ```
515 |
--------------------------------------------------------------------------------
/docs/DeepSeek/DeepSeek-V3_2.md:
--------------------------------------------------------------------------------
1 | # DeepSeek-V3.2
2 |
3 | ## 1. Model Introduction
4 |
5 | The DeepSeek-V3.2 series includes three model variants, each optimized for different use cases:
6 |
7 | **[DeepSeek-V3.2-Exp](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp)** is an upgraded version of DeepSeek-V3.1-Terminus that introduces the DeepSeek Sparse Attention (DSA) mechanism through continued training. DSA is a fine-grained sparse attention mechanism powered by a lightning indexer, designed to explore and validate training and inference efficiency optimizations in long-context scenarios; as such, V3.2-Exp serves as an intermediate step toward the next-generation architecture. Recommended for general conversations, long-context processing, and efficient inference.
8 |
9 | **[DeepSeek-V3.2](https://huggingface.co/deepseek-ai/DeepSeek-V3.2)** is the standard version suitable for general tasks and conversational scenarios. For local deployment, we recommend setting the sampling parameters to temperature = 1.0, top_p = 0.95. Recommended for standard conversations and general tasks.
10 |
11 | **[DeepSeek-V3.2-Speciale](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Speciale)** is a special variant designed exclusively for deep reasoning tasks. This model is specifically optimized for scenarios requiring complex logical reasoning and deep thinking. For local deployment, we recommend setting the sampling parameters to temperature = 1.0, top_p = 0.95. Recommended for deep reasoning tasks, complex logical problems, and mathematical reasoning.
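
The recommended sampling parameters map directly onto standard OpenAI-compatible request fields. The snippet below is a minimal sketch, assuming a DeepSeek-V3.2 server is already running locally on port 8000 (see the deployment section below).

```python
from openai import OpenAI

# Assumes a local DeepSeek-V3.2 server on port 8000 (see Section 3).
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3.2",
    messages=[{"role": "user", "content": "Explain briefly what sparse attention is."}],
    # Sampling settings recommended above for local deployment.
    temperature=1.0,
    top_p=0.95,
    max_tokens=512,
)
print(response.choices[0].message.content)
```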
12 |
13 | ## 2. SGLang Installation
14 |
15 | SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.
16 |
17 | Please refer to the [official SGLang installation guide](https://docs.sglang.ai/get_started/install.html) for installation instructions.
18 |
19 | ## 3. Model Deployment
20 |
21 | This section provides a progressive guide from quick deployment to performance optimization, suitable for users at different levels.
22 |
23 | ### 3.1 Basic Configuration
24 |
25 | **Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, model variant, deployment strategy, and thinking capabilities.
26 |
27 | import DeepSeekConfigGenerator from '@site/src/components/DeepSeekConfigGenerator';
28 |
29 | <DeepSeekConfigGenerator />
30 |
31 | ### 3.2 Configuration Tips
32 | For more detailed configuration tips, please refer to [DeepSeek-V3.2 Usage](https://docs.sglang.ai/basic_usage/deepseek_v32.html).
33 |
34 | ## 4. Model Invocation
35 |
36 | ### 4.1 Basic Usage
37 |
38 | For basic API usage and request examples, please refer to:
39 |
40 | - [Basic API Usage](https://docs.sglang.ai/get_started/quick_start.html)
41 |
42 | ### 4.2 Advanced Usage
43 |
44 | #### 4.2.1 Reasoning Parser
45 |
46 | DeepSeek-V3.2 supports reasoning mode. Enable the reasoning parser during deployment to separate the thinking and content sections:
47 |
48 | ```shell
49 | python -m sglang.launch_server \
50 | --model deepseek-ai/DeepSeek-V3.2-Exp \
51 | --reasoning-parser deepseek-v3 \
52 | --tp 8 \
53 | --host 0.0.0.0 \
54 | --port 8000
55 | ```
56 |
57 | **Streaming with Thinking Process:**
58 |
59 | ```python
60 | from openai import OpenAI
61 |
62 | client = OpenAI(
63 | base_url="http://localhost:8000/v1",
64 | api_key="EMPTY"
65 | )
66 |
67 | # Enable streaming to see the thinking process in real-time
68 | response = client.chat.completions.create(
69 | model="deepseek-ai/DeepSeek-V3.2-Exp",
70 | messages=[
71 | {"role": "user", "content": "Solve this problem step by step: What is 15% of 240?"}
72 | ],
73 | temperature=0.7,
74 | max_tokens=2048,
75 |     extra_body={"chat_template_kwargs": {"thinking": True}},
76 | stream=True
77 | )
78 |
79 | # Process the stream
80 | has_thinking = False
81 | has_answer = False
82 | thinking_started = False
83 |
84 | for chunk in response:
86 | if chunk.choices and len(chunk.choices) > 0:
87 | delta = chunk.choices[0].delta
88 |
89 | # Print thinking process
90 | if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
91 | if not thinking_started:
92 | print("=============== Thinking =================", flush=True)
93 | thinking_started = True
94 | has_thinking = True
95 | print(delta.reasoning_content, end="", flush=True)
96 |
97 | # Print answer content
98 | if delta.content:
99 | # Close thinking section and add content header
100 | if has_thinking and not has_answer:
101 | print("\n=============== Content =================", flush=True)
102 | has_answer = True
103 | print(delta.content, end="", flush=True)
104 |
105 | print()
106 | ```
107 |
108 | **Output Example:**
109 |
110 | ```
111 | =============== Thinking =================
112 | To solve this problem, I need to calculate 15% of 240.
113 | Step 1: Convert 15% to decimal: 15% = 0.15
114 | Step 2: Multiply 240 by 0.15
115 | Step 3: 240 × 0.15 = 36
116 | =============== Content =================
117 |
118 | The answer is 36. To find 15% of 240, we multiply 240 by 0.15, which equals 36.
119 | ```
120 |
121 | **Note:** The reasoning parser captures the model's step-by-step thinking process, allowing you to see how the model arrives at its conclusions.
122 |
123 | #### 4.2.2 Tool Calling
124 |
125 | DeepSeek-V3.2 and DeepSeek-V3.2-Exp support tool calling capabilities. Enable the tool call parser:
126 |
127 | **Note:** DeepSeek-V3.2-Speciale does **NOT** support tool calling. It is designed exclusively for deep reasoning tasks.
128 |
129 | **Deployment Command:**
130 |
131 | ```shell
132 | python -m sglang.launch_server \
133 | --model deepseek-ai/DeepSeek-V3.2-Exp \
134 | --tool-call-parser deepseekv31 \
135 | --reasoning-parser deepseek-v3 \
136 | --chat-template ./examples/chat_template/tool_chat_template_deepseekv32.jinja \
137 | --tp 8 \
138 | --host 0.0.0.0 \
139 | --port 8000
140 | ```
141 |
142 | For DeepSeek-V3.2, use `--tool-call-parser deepseekv32` instead.
143 |
144 | **Python Example (with Thinking Process):**
145 |
146 | ```python
147 | from openai import OpenAI
148 |
149 | client = OpenAI(
150 | base_url="http://localhost:8000/v1",
151 | api_key="EMPTY"
152 | )
153 |
154 | # Define available tools
155 | tools = [
156 | {
157 | "type": "function",
158 | "function": {
159 | "name": "get_weather",
160 | "description": "Get the current weather for a location",
161 | "parameters": {
162 | "type": "object",
163 | "properties": {
164 | "location": {
165 | "type": "string",
166 | "description": "The city name"
167 | },
168 | "unit": {
169 | "type": "string",
170 | "enum": ["celsius", "fahrenheit"],
171 | "description": "Temperature unit"
172 | }
173 | },
174 | "required": ["location"]
175 | }
176 | }
177 | }
178 | ]
179 |
180 | # Make request with streaming to see thinking process
181 | response = client.chat.completions.create(
182 | model="deepseek-ai/DeepSeek-V3.2-Exp",
183 | messages=[
184 | {"role": "user", "content": "What's the weather in Beijing?"}
185 | ],
186 | tools=tools,
187 |     extra_body={"chat_template_kwargs": {"thinking": True}},
188 | temperature=0.7,
189 | stream=True
190 | )
191 |
192 | # Process streaming response
193 | thinking_started = False
194 | has_thinking = False
195 | tool_calls_accumulator = {}
196 |
197 | for chunk in response:
198 | if chunk.choices and len(chunk.choices) > 0:
199 | delta = chunk.choices[0].delta
200 |
201 | # Print thinking process
202 | if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
203 | if not thinking_started:
204 | print("=============== Thinking =================", flush=True)
205 | thinking_started = True
206 | has_thinking = True
207 | print(delta.reasoning_content, end="", flush=True)
208 |
209 | # Accumulate tool calls
210 | if hasattr(delta, 'tool_calls') and delta.tool_calls:
211 | # Close thinking section if needed
212 | if has_thinking and thinking_started:
213 | print("\n=============== Content =================\n", flush=True)
214 | thinking_started = False
215 |
216 | for tool_call in delta.tool_calls:
217 | index = tool_call.index
218 | if index not in tool_calls_accumulator:
219 | tool_calls_accumulator[index] = {
220 | 'name': None,
221 | 'arguments': ''
222 | }
223 |
224 | if tool_call.function:
225 | if tool_call.function.name:
226 | tool_calls_accumulator[index]['name'] = tool_call.function.name
227 | if tool_call.function.arguments:
228 | tool_calls_accumulator[index]['arguments'] += tool_call.function.arguments
229 |
230 | # Print content
231 | if delta.content:
232 | print(delta.content, end="", flush=True)
233 |
234 | # Print accumulated tool calls
235 | for index, tool_call in sorted(tool_calls_accumulator.items()):
236 | print(f"🔧 Tool Call: {tool_call['name']}")
237 | print(f" Arguments: {tool_call['arguments']}")
238 |
239 | print()
240 | ```
241 |
242 | **Output Example:**
243 |
244 | ```
245 | =============== Thinking =================
246 | The user is asking about the weather in Beijing. I need to use the get_weather function to retrieve this information.
247 | I should call the function with location="Beijing".
248 | =============== Content =================
249 |
250 | 🔧 Tool Call: get_weather
251 | Arguments: {"location": "Beijing", "unit": "celsius"}
252 | ```
253 |
254 | **Note:**
255 |
256 | - The reasoning parser shows how the model decides to use a tool
257 | - Tool calls are clearly marked with the function name and arguments
258 | - You can then execute the function and send the result back to continue the conversation
259 |
260 | **Handling Tool Call Results:**
261 |
262 | ```python
263 | # After getting the tool call, execute the function
264 | def get_weather(location, unit="celsius"):
265 | # Your actual weather API call here
266 | return f"The weather in {location} is 22°{unit[0].upper()} and sunny."
267 |
268 | # Send tool result back to the model
269 | messages = [
270 | {"role": "user", "content": "What's the weather in Beijing?"},
271 | {
272 | "role": "assistant",
273 | "content": None,
274 | "tool_calls": [{
275 | "id": "call_123",
276 | "type": "function",
277 | "function": {
278 | "name": "get_weather",
279 | "arguments": '{"location": "Beijing", "unit": "celsius"}'
280 | }
281 | }]
282 | },
283 | {
284 | "role": "tool",
285 | "tool_call_id": "call_123",
286 | "content": get_weather("Beijing", "celsius")
287 | }
288 | ]
289 |
290 | final_response = client.chat.completions.create(
291 | model="deepseek-ai/DeepSeek-V3.2-Exp",
292 | messages=messages,
293 | temperature=0.7
294 | )
295 |
296 | print(final_response.choices[0].message.content)
297 | # Output: "The weather in Beijing is currently 22°C and sunny."
298 | ```
299 |
300 | ## 5. Benchmark
301 |
302 | ### 5.1 Speed Benchmark
303 |
304 | **Test Environment:**
305 |
306 | - Hardware: NVIDIA B200 GPU (8x)
307 | - Model: DeepSeek-V3.2-Exp
308 | - Tensor Parallelism: 8
309 | - sglang version: 0.5.6
310 |
311 | We use SGLang's built-in benchmarking tool to conduct performance evaluation on the [ShareGPT_Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) dataset. This dataset contains real conversation data and can better reflect performance in actual use scenarios. To simulate real-world usage patterns, we configure each request with 1024 input tokens and 1024 output tokens, representing typical medium-length conversations with detailed responses.
312 |
313 | #### 5.1.1 Latency-Sensitive Benchmark
314 |
315 | - Model Deployment Command:
316 |
317 | ```shell
318 | python3 -m sglang.launch_server \
319 | --model-path deepseek-ai/DeepSeek-V3.2-Exp \
320 | --tp 8 \
321 | --dp 8 \
322 | --enable-dp-attention \
323 | --speculative-algorithm EAGLE \
324 | --speculative-num-steps 3 \
325 | --speculative-eagle-topk 1 \
326 | --speculative-num-draft-tokens 4 \
327 | --host 0.0.0.0 \
328 | --port 8000
329 | ```
330 |
331 | - Benchmark Command:
332 |
333 | ```shell
334 | python3 -m sglang.bench_serving \
335 | --backend sglang \
336 | --host 127.0.0.1 \
337 | --port 8000 \
338 | --model deepseek-ai/DeepSeek-V3.2-Exp \
339 | --random-input-len 1024 \
340 | --random-output-len 1024 \
341 | --num-prompts 10 \
342 | --max-concurrency 1
343 | ```
344 |
345 | - **Test Results:**
346 |
347 | ```
348 | ============ Serving Benchmark Result ============
349 | Backend: sglang
350 | Traffic request rate: inf
351 | Max request concurrency: 1
352 | Successful requests: 10
353 | Benchmark duration (s): 41.23
354 | Total input tokens: 1972
355 | Total input text tokens: 1972
356 | Total input vision tokens: 0
357 | Total generated tokens: 2784
358 | Total generated tokens (retokenized): 2775
359 | Request throughput (req/s): 0.24
360 | Input token throughput (tok/s): 47.83
361 | Output token throughput (tok/s): 67.53
362 | Peak output token throughput (tok/s): 110.00
363 | Peak concurrent requests: 3
364 | Total token throughput (tok/s): 115.36
365 | Concurrency: 1.00
366 | Accept length: 2.52
367 | ----------------End-to-End Latency----------------
368 | Mean E2E Latency (ms): 4120.71
369 | Median E2E Latency (ms): 4713.16
370 | ---------------Time to First Token----------------
371 | Mean TTFT (ms): 165.02
372 | Median TTFT (ms): 163.64
373 | P99 TTFT (ms): 199.88
374 | -----Time per Output Token (excl. 1st token)------
375 | Mean TPOT (ms): 13.14
376 | Median TPOT (ms): 13.60
377 | P99 TPOT (ms): 17.71
378 | ---------------Inter-Token Latency----------------
379 | Mean ITL (ms): 14.28
380 | Median ITL (ms): 12.10
381 | P95 ITL (ms): 36.23
382 | P99 ITL (ms): 36.87
383 | Max ITL (ms): 37.53
384 | ==================================================
385 | ```
386 |
387 | #### 5.1.2 Throughput-Sensitive Benchmark
388 |
389 | - Model Deployment Command:
390 |
391 | ```shell
392 | python3 -m sglang.launch_server \
393 | --model-path deepseek-ai/DeepSeek-V3.2-Exp \
394 | --tp 8 \
395 | --ep 8 \
396 | --dp 8 \
397 | --enable-dp-attention \
398 | --host 0.0.0.0 \
399 | --port 8000
400 | ```
401 |
402 | - Benchmark Command:
403 |
404 | ```shell
405 | python3 -m sglang.bench_serving \
406 | --backend sglang \
407 | --host 127.0.0.1 \
408 | --port 8000 \
409 | --model deepseek-ai/DeepSeek-V3.2-Exp \
410 | --random-input-len 1024 \
411 | --random-output-len 1024 \
412 | --num-prompts 1000 \
413 | --max-concurrency 100
414 | ```
415 |
416 | - **Test Results:**
417 |
418 | ```
419 | ============ Serving Benchmark Result ============
420 | Backend: sglang
421 | Traffic request rate: inf
422 | Max request concurrency: 100
423 | Successful requests: 1000
424 | Benchmark duration (s): 219.09
425 | Total input tokens: 301701
426 | Total input text tokens: 301701
427 | Total input vision tokens: 0
428 | Total generated tokens: 188375
429 | Total generated tokens (retokenized): 187443
430 | Request throughput (req/s): 4.56
431 | Input token throughput (tok/s): 1377.06
432 | Output token throughput (tok/s): 859.80
433 | Peak output token throughput (tok/s): 2465.00
434 | Peak concurrent requests: 109
435 | Total token throughput (tok/s): 2236.86
436 | Concurrency: 88.05
437 | ----------------End-to-End Latency----------------
438 | Mean E2E Latency (ms): 19291.23
439 | Median E2E Latency (ms): 11927.39
440 | ---------------Time to First Token----------------
441 | Mean TTFT (ms): 530.36
442 | Median TTFT (ms): 444.00
443 | P99 TTFT (ms): 1504.78
444 | -----Time per Output Token (excl. 1st token)------
445 | Mean TPOT (ms): 106.16
446 | Median TPOT (ms): 106.69
447 | P99 TPOT (ms): 221.12
448 | ---------------Inter-Token Latency----------------
449 | Mean ITL (ms): 100.46
450 | Median ITL (ms): 41.73
451 | P95 ITL (ms): 225.67
452 | P99 ITL (ms): 392.37
453 | Max ITL (ms): 975.03
454 | ==================================================
455 | ```
456 |
457 | ### 5.2 Accuracy Benchmark
458 |
459 | #### 5.2.1 GSM8K Benchmark
460 |
461 | - **Benchmark Command:**
462 |
463 | ```shell
464 | python3 -m sglang.test.few_shot_gsm8k --num-questions 200 --port 8000
465 | ```
466 |
467 | - **Test Results**:
468 | - DeepSeek-V3.2-Exp
469 | ```
470 | Accuracy: 0.980
471 | Invalid: 0.000
472 | Latency: 19.128 s
473 | Output throughput: 965.919 token/s
474 | ```
475 |
476 | #### 5.2.2 MMLU Benchmark
477 |
478 | - **Benchmark Command:**
479 |
480 | ```shell
481 | cd sglang
482 | bash benchmark/mmlu/download_data.sh
483 | python3 benchmark/mmlu/bench_sglang.py --nsub 10 --port 8000
484 | ```
485 |
486 | - **Test Results**:
487 | - DeepSeek-V3.2-Exp
488 | ```
489 | subject: abstract_algebra, #q:100, acc: 0.780
490 | subject: anatomy, #q:135, acc: 0.874
491 | subject: astronomy, #q:152, acc: 0.961
492 | subject: business_ethics, #q:100, acc: 0.860
493 | subject: clinical_knowledge, #q:265, acc: 0.925
494 | subject: college_biology, #q:144, acc: 0.972
495 | subject: college_chemistry, #q:100, acc: 0.660
496 | subject: college_computer_science, #q:100, acc: 0.880
497 | subject: college_mathematics, #q:100, acc: 0.840
498 | subject: college_medicine, #q:173, acc: 0.879
499 | Total latency: 7.961
500 | Average accuracy: 0.879
501 | ```
502 |
--------------------------------------------------------------------------------