├── .github ├── ISSUE_TEMPLATE │ ├── badcase.yml │ ├── bug_report.yml │ ├── config.yml │ └── feature_request.yml └── workflows │ └── inactive.yml ├── .gitignore ├── .readthedocs.yaml ├── Qwen3_Technical_Report.pdf ├── README.md ├── docker ├── Dockerfile-cu121 ├── docker_cli_demo.sh └── docker_web_demo.sh ├── docs ├── Makefile ├── README.md ├── locales │ └── zh_CN │ │ └── LC_MESSAGES │ │ ├── deployment │ │ ├── openllm.po │ │ ├── sglang.po │ │ ├── skypilot.po │ │ ├── tgi.po │ │ └── vllm.po │ │ ├── framework │ │ ├── Langchain.po │ │ ├── LlamaIndex.po │ │ ├── function_call.po │ │ └── qwen_agent.po │ │ ├── getting_started │ │ ├── concepts.po │ │ ├── quantization_benchmark.po │ │ ├── quickstart.po │ │ └── speed_benchmark.po │ │ ├── index.po │ │ ├── inference │ │ └── transformers.po │ │ ├── quantization │ │ ├── awq.po │ │ ├── gptq.po │ │ └── llama.cpp.po │ │ ├── run_locally │ │ ├── llama.cpp.po │ │ ├── mlx-lm.po │ │ └── ollama.po │ │ └── training │ │ ├── llama_factory.po │ │ ├── ms_swift.po │ │ └── verl.po ├── make.bat ├── requirements-docs.txt └── source │ ├── _static │ ├── css │ │ └── custom.css │ └── design-tabs.js │ ├── assets │ ├── .DS_Store │ ├── qwen-openllm-ui-demo.png │ └── qwen3_nonthinking.jinja │ ├── conf.py │ ├── deployment │ ├── openllm.rst │ ├── sglang.md │ ├── skypilot.rst │ ├── tgi.rst │ └── vllm.md │ ├── framework │ ├── Langchain.rst │ ├── LlamaIndex.rst │ ├── function_call.md │ └── qwen_agent.rst │ ├── getting_started │ ├── concepts.md │ ├── quantization_benchmark.rst │ ├── quickstart.md │ └── speed_benchmark.md │ ├── index.rst │ ├── inference │ └── transformers.md │ ├── quantization │ ├── awq.md │ ├── gptq.md │ └── llama.cpp.md │ ├── run_locally │ ├── llama.cpp.md │ ├── mlx-lm.md │ └── ollama.md │ └── training │ ├── axolotl.rst │ ├── llama_factory.rst │ ├── ms_swift.rst │ └── verl.rst └── examples ├── README.md ├── demo ├── cli_demo.py └── web_demo.py ├── gcu-support ├── README.md └── gcu_demo.py ├── llama-factory ├── finetune-zh.md ├── qwen2-7b-full-sft.yaml ├── qwen2-7b-lora-sft.yaml ├── qwen2-7b-merge-lora.yaml └── qwen2-7b-qlora-sft.yaml └── speed-benchmark ├── README.md ├── README_zh.md ├── requirements-perf-transformers.txt ├── requirements-perf-vllm.txt ├── speed_benchmark_transformers.py └── speed_benchmark_vllm.py /.github/ISSUE_TEMPLATE/badcase.yml: -------------------------------------------------------------------------------- 1 | name: Badcase Report 2 | description: Report a badcase. 3 | title: "[Badcase]: " 4 | body: 5 | - type: dropdown 6 | id: series 7 | attributes: 8 | label: Model Series 9 | description: | 10 | What series of Qwen models were you running? 11 | Please note that there may be no response for previous model series. 12 | options: 13 | - Qwen3 14 | - Qwen2.5 15 | default: 0 16 | validations: 17 | required: true 18 | - type: input 19 | attributes: 20 | label: What are the models used? 21 | description: | 22 | Please list the models used, e.g., Qwen3-8B, Qwen3-8B-GGUF, etc. 23 | Note that we only maintain models released under the official Qwen organizations on Hugging Face and ModelScope. 24 | placeholder: "ex: Qwen3-xxx" 25 | validations: 26 | required: true 27 | - type: input 28 | attributes: 29 | label: What is the scenario where the problem happened? 30 | description: | 31 | Please briefly describe the scenario, including the framework or the platform, 32 | e.g., Qwen Chat, Transformers, Ollama, SGLang, vLLM, Hugging Face Demo, etc. 33 | placeholder: "ex: Qwen3-8B cannot generate long texts with Transformers." 
34 | validations: 35 | required: true 36 | - type: checkboxes 37 | attributes: 38 | label: Is this badcase known and can it be solved using available techniques? 39 | description: | 40 | Please first check if you have followed the usage guide in related documentation and if the badcase is known: 41 | either a workaround is available or the badcase has been already reported. 42 | options: 43 | - label: I have followed [the GitHub README](https://github.com/QwenLM/Qwen3). 44 | required: true 45 | - label: I have checked [the Qwen documentation](https://qwen.readthedocs.io) and cannot find a solution there. 46 | required: true 47 | - label: I have checked the documentation of the related framework and cannot find useful information. 48 | required: true 49 | - label: I have searched [the issues](https://github.com/QwenLM/Qwen3/issues?q=is%3Aissue) and there is no similar one. 50 | required: true 51 | - type: textarea 52 | attributes: 53 | label: Information about environment 54 | description: | 55 | Please provide information about your environment, 56 | e.g., the software versions and the information on the OS, GPUs, CUDA, and NVIDIA Driver if GPUs are used. 57 | 58 | For example: 59 | - OS: Ubuntu 24.04 60 | - Python: Python 3.11 61 | - GPUs: 4 x NVIDIA A20 62 | - NVIDIA driver: 560 (from `nvidia-smi`) 63 | - CUDA compiler: 12.4 (from `nvcc -V`) 64 | - PyTorch: 2.6.0+cu124 (from `python -c "import torch; print(torch.__version__)"`) 65 | 66 | Python packages (from `pip list`) 67 | ``` 68 | Package Version 69 | ---------------------------------------- ----------- 70 | accelerate 0.33.0 71 | ... 72 | ``` 73 | validations: 74 | required: true 75 | 76 | - type: textarea 77 | attributes: 78 | label: Description 79 | description: | 80 | Please describe the badcase you have encountered. 81 | The following template is recommended. 82 | Feel free to modify it as needed. 83 | value: | 84 | #### Steps to reproduce 85 | 86 | This happens to Qwen3-xB-xxx and xxx. 87 | The badcase can be reproduced with the following steps: 88 | 1. ... 89 | 2. ... 90 | 91 | The following example input & output can be used: 92 | ``` 93 | system: ... 94 | user: ... 95 | ... 96 | ``` 97 | 98 | #### Expected results 99 | 100 | The results are expected to be ... 101 | 102 | #### Attempts to fix 103 | 104 | I have tried several ways to fix this, including: 105 | 1. adjusting the sampling parameters, but ... 106 | 2. prompt engineering, but ... 107 | 108 | #### Anything else helpful for investigation 109 | 110 | I find that this problem also happens to ... 111 | validations: 112 | required: true 113 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report. 3 | title: "[Bug]: " 4 | body: 5 | - type: dropdown 6 | id: series 7 | attributes: 8 | label: Model Series 9 | description: | 10 | What series of Qwen models were you running? 11 | Please note that there may be no response for previous model series. 12 | options: 13 | - Qwen3 14 | - Qwen2.5 15 | default: 0 16 | validations: 17 | required: true 18 | - type: input 19 | attributes: 20 | label: What are the models used? 21 | description: | 22 | Please list the models used, e.g., Qwen3-8B, Qwen3-8B-GGUF, etc. 23 | Note that we only maintain models released under the official Qwen organizations on Hugging Face and ModelScope. 
24 | placeholder: "ex: Qwen3-xxx" 25 | validations: 26 | required: true 27 | - type: input 28 | attributes: 29 | label: What is the scenario where the problem happened? 30 | description: | 31 | Please briefly describe the scenario, including the type of use and the framework, 32 | e.g., inference with `transformers`, deployment with `vllm`, SFT with `llama-factory`, tool calling with `ollama`, etc. 33 | placeholder: "ex: [type of usage] with [framework]" 34 | validations: 35 | required: true 36 | - type: checkboxes 37 | attributes: 38 | label: Is this a known issue? 39 | description: | 40 | Please first check if you have followed the usage guide in related documentation and if the issue is known: 41 | either a workaround is available or the issue has been already reported. 42 | options: 43 | - label: I have followed [the GitHub README](https://github.com/QwenLM/Qwen3). 44 | required: true 45 | - label: I have checked [the Qwen documentation](https://qwen.readthedocs.io) and cannot find an answer there. 46 | required: true 47 | - label: I have checked the documentation of the related framework and cannot find useful information. 48 | required: true 49 | - label: I have searched [the issues](https://github.com/QwenLM/Qwen3/issues?q=is%3Aissue) and there is no similar one. 50 | required: true 51 | - type: textarea 52 | attributes: 53 | label: Information about environment 54 | description: | 55 | Please provide information about your environment, 56 | e.g., the software versions and the information on the OS, GPUs, CUDA, and NVIDIA Driver if GPUs are used. 57 | 58 | For example: 59 | - OS: Ubuntu 24.04 60 | - Python: Python 3.11 61 | - GPUs: 4 x NVIDIA A20 62 | - NVIDIA driver: 560 (from `nvidia-smi`) 63 | - CUDA compiler: 12.4 (from `nvcc -V`) 64 | - PyTorch: 2.6.0+cu124 (from `python -c "import torch; print(torch.__version__)"`) 65 | 66 | Python packages (from `pip list`) 67 | ``` 68 | Package Version 69 | ---------------------------------------- ----------- 70 | accelerate 0.33.0 71 | ... 72 | ``` 73 | validations: 74 | required: true 75 | - type: textarea 76 | attributes: 77 | label: Log output 78 | description: | 79 | Please copy and paste any relevant log output. 80 | This will be automatically formatted into code, so no need for backticks. 81 | If the log suggests things like 82 | ``` 83 | RuntimeError: CUDA error: device-side assert triggered 84 | CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 85 | For debugging consider passing CUDA_LAUNCH_BLOCKING=1. 86 | ``` 87 | please follow the instructions and set the corresponding parameters. 88 | render: shell 89 | validations: 90 | required: true 91 | - type: textarea 92 | attributes: 93 | label: Description 94 | description: | 95 | Please describe the problem you have encountered. 96 | The following template is recommended. 97 | Remember to delete the parts that do not apply. 98 | value: | 99 | #### Steps to reproduce 100 | 101 | This happens to Qwen3-xB-xxx and xxx. 102 | The problem can be reproduced with the following steps: 103 | 1. ... 104 | 2. ... 105 | 106 | #### Expected results 107 | 108 | The results are expected to be ... 109 | 110 | #### Attempts to fix 111 | 112 | I have tried several ways to fix this, including: 113 | 1. ... 114 | 2. ... 115 | 116 | #### Anything else helpful for investigation 117 | 118 | I find that this problem also happens to ... 
119 | validations: 120 | required: true 121 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | contact_links: 2 | - name: QwQ 3 | url: https://github.com/QwenLM/QwQ/issues 4 | about: For issues related to QwQ, please report to the QwQ repository. 5 | - name: Question 6 | url: https://github.com/QwenLM/Qwen3/discussions 7 | about: Please ask and answer questions in discussions. Issues are mainly for Bugs and Features. 8 | 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: "Feature Request" 2 | description: "Request a new feature." 3 | title: "[REQUEST]: " 4 | body: 5 | - type: checkboxes 6 | attributes: 7 | label: Has this been supported or requested before? 8 | description: | 9 | Please first check if the feature is supported in related documentation and if it has been already requested. 10 | options: 11 | - label: I have checked [the GitHub README](https://github.com/QwenLM/Qwen3). 12 | required: true 13 | - label: I have checked [the Qwen documentation](https://qwen.readthedocs.io). 14 | required: true 15 | - label: I have checked the documentation of the related framework. 16 | required: true 17 | - label: I have searched [the issues](https://github.com/QwenLM/Qwen3/issues?q=is%3Aissue) and there is no similar one. 18 | required: true 19 | - type: input 20 | attributes: 21 | label: What is this feature about? 22 | description: | 23 | Please briefly describe the feature, including the type of use and the framework, 24 | e.g., support quantized MoE in vLLM, or a model with xxB parameters, etc. 25 | validations: 26 | required: true 27 | - type: textarea 28 | attributes: 29 | label: Proposal 30 | description: | 31 | Please describe the feature you have requested and the rationale behind it. 32 | The following template is recommended. 33 | Feel free to modify it as needed. 34 | value: | 35 | #### Introduction 36 | 37 | I would like that ... 38 | 39 | #### Rationale 40 | 41 | Implementation of this feature will help the following use cases: 42 | - ... 43 | - ... 44 | 45 | #### Anything else 46 | 47 | I find ... has this feature and xxx can serve as a reference for implementation. 48 | validations: 49 | required: true 50 | - type: checkboxes 51 | attributes: 52 | label: Contributions are welcome 53 | description: We would greatly appreciate it if you could help implement this feature. 54 | options: 55 | - label: I am willing to help implement this feature. 56 | 57 | -------------------------------------------------------------------------------- /.github/workflows/inactive.yml: -------------------------------------------------------------------------------- 1 | name: Close and lock inactive threads 2 | on: 3 | schedule: 4 | - cron: "0 8 * * *" 5 | jobs: 6 | manage-inactive: 7 | runs-on: ubuntu-latest 8 | permissions: 9 | actions: write 10 | issues: write 11 | pull-requests: write 12 | steps: 13 | - uses: actions/stale@v9 14 | with: 15 | days-before-issue-stale: 30 16 | days-before-issue-close: 7 17 | stale-issue-label: inactive 18 | stale-issue-message: > 19 | This issue has been automatically marked as inactive 20 | due to lack of recent activity. 21 | Should you believe it remains unresolved and warrants attention, 22 | kindly leave a comment on this thread. 
23 | exempt-issue-labels: enhancement,discussion 24 | days-before-pr-stale: -1 25 | days-before-pr-close: -1 26 | operations-per-run: 128 27 | repo-token: ${{ secrets.GITHUB_TOKEN }} 28 | - uses: dessant/lock-threads@v5 29 | with: 30 | issue-inactive-days: 30 31 | issue-comment: > 32 | This issue has been automatically locked since there 33 | has not been any recent activity after it was closed. 34 | Please open a new issue for related bugs. 35 | pr-inactive-days: 30 36 | pr-comment: > 37 | This pull request has been automatically locked since there 38 | has not been any recent activity after it was closed. 39 | Please open a new issue for related bugs. 40 | repo-token: ${{ secrets.GITHUB_TOKEN }} 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Sphinx documentation 2 | docs/_build/ 3 | docs/build/ 4 | docs/**/*.mo 5 | .vscode 6 | .idea 7 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | 14 | # If using Sphinx, optionally build your docs in additional formats such as PDF 15 | # formats: 16 | # - pdf 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements-docs.txt 22 | -------------------------------------------------------------------------------- /Qwen3_Technical_Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen3/73027650806c879079b2601dfa7446655d5745af/Qwen3_Technical_Report.pdf -------------------------------------------------------------------------------- /docker/Dockerfile-cu121: -------------------------------------------------------------------------------- 1 | ARG CUDA_VERSION=12.1.0 2 | ARG from=nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 3 | 4 | FROM ${from} as base 5 | 6 | RUN <. 4 | 5 | ## Quick Start 6 | 7 | We use `sphinx` to manage the documentation and use the `furo` theme. 8 | To get started, simply run 9 | ```bash 10 | pip install -r requirements-docs.txt 11 | ``` 12 | 13 | Then run `make html` or `sphinx-build -M html source build` and it will compile the docs and put them under the `build/html` directory. 14 | 15 | 16 | ## Translation 17 | 18 | The documentation is available in both English and Simplified Chinese. We use 19 | `sphinx-intl` to work with Sphinx translation flow, following [this article](https://www.sphinx-doc.org/en/master/usage/advanced/intl.html). 20 | 21 | You need to install the Python package `sphinx-intl` before starting. 22 | 23 | 1. After updating the English documentation, run `make gettext`, and the pot files will be placed in the `build/gettext` directory. `make gettext` can be slow if the doc is long. 24 | 25 | 2. Use the generated pot files to update the po files: 26 | ```bash 27 | sphinx-intl update -p build/gettext -l zh_CN -w 0 28 | ``` 29 | 30 | 3. Translate po files at `locales/zh_CN/LC_MESSAGES`. Pay attention to fuzzy matches (messages after `#, fuzzy`). Please be careful not to break reST notation. 31 | 32 | 4. 
Build translated document: `make -e SPHINXOPTS="-D language='zh_CN'" html` or `sphinx-build -M html source build -D language=zh_CN` 33 | 34 | ## Auto Build 35 | 36 | ```bash 37 | pip install sphinx-autobuild 38 | ``` 39 | 40 | To autobuild the default version: 41 | ```bash 42 | sphinx-autobuild source build/html 43 | ``` 44 | 45 | To autobuild the translated version: 46 | ```bash 47 | sphinx-autobuild source build/html -D language=zh_CN --watch locales/zh_CN 48 | ``` 49 | 50 | By default, the doc is at `http://127.0.0.1:8000` -------------------------------------------------------------------------------- /docs/locales/zh_CN/LC_MESSAGES/deployment/openllm.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2024, Qwen Team 3 | # This file is distributed under the same license as the Qwen package. 4 | # FIRST AUTHOR , 2024. 5 | # 6 | #, fuzzy 7 | msgid "" 8 | msgstr "" 9 | "Project-Id-Version: Qwen \n" 10 | "Report-Msgid-Bugs-To: \n" 11 | "POT-Creation-Date: 2025-04-28 19:42+0800\n" 12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 13 | "Last-Translator: FULL NAME \n" 14 | "Language: zh_CN\n" 15 | "Language-Team: zh_CN \n" 16 | "Plural-Forms: nplurals=1; plural=0;\n" 17 | "MIME-Version: 1.0\n" 18 | "Content-Type: text/plain; charset=utf-8\n" 19 | "Content-Transfer-Encoding: 8bit\n" 20 | "Generated-By: Babel 2.17.0\n" 21 | 22 | #: ../../Qwen/source/deployment/openllm.rst:2 986ea00cb5af4a0d82f974ed79a82430 23 | msgid "OpenLLM" 24 | msgstr "OpenLLM" 25 | 26 | #: ../../Qwen/source/deployment/openllm.rst:5 78be03fbdccb429892b03bf84596411b 27 | msgid "To be updated for Qwen3." 28 | msgstr "仍需为Qwen3更新。" 29 | 30 | #: ../../Qwen/source/deployment/openllm.rst:7 a001f11d1c5440188121d20b3baf59db 31 | msgid "OpenLLM allows developers to run Qwen2.5 models of different sizes as OpenAI-compatible APIs with a single command. It features a built-in chat UI, state-of-the-art inference backends, and a simplified workflow for creating enterprise-grade cloud deployment with Qwen2.5. Visit `the OpenLLM repository `_ to learn more." 32 | msgstr "OpenLLM 允许开发者通过一个命令运行不同大小的 Qwen2.5 模型,提供 OpenAI 兼容的 API。它具有内置的聊天 UI,先进的推理后端,以及简化的工作流程来使用 Qwen2.5 创建企业级云部署。访问 `OpenLLM 仓库 `_ 了解更多信息。" 33 | 34 | #: ../../Qwen/source/deployment/openllm.rst:10 229f89c3be65442bbe15905d75a0d13d 35 | msgid "Installation" 36 | msgstr "安装" 37 | 38 | #: ../../Qwen/source/deployment/openllm.rst:12 79421f700fbc426cb6ce9841aff67503 39 | msgid "Install OpenLLM using ``pip``." 40 | msgstr "使用 ``pip`` 安装 OpenLLM。" 41 | 42 | #: ../../Qwen/source/deployment/openllm.rst:18 69cfd6fe2e274173ad4065be91b71472 43 | msgid "Verify the installation and display the help information:" 44 | msgstr "验证安装并显示帮助信息:" 45 | 46 | #: ../../Qwen/source/deployment/openllm.rst:25 503cae99b14c4ef4b322b8ec0bd2d32d 47 | msgid "Quickstart" 48 | msgstr "快速开始" 49 | 50 | #: ../../Qwen/source/deployment/openllm.rst:27 0ea788c801404d8780404611c87644b0 51 | msgid "Before you run any Qwen2.5 model, ensure your model repository is up to date by syncing it with OpenLLM's latest official repository." 
52 | msgstr "在运行任何 Qwen2.5 模型之前,确保您的模型仓库与 OpenLLM 的最新官方仓库同步。" 53 | 54 | #: ../../Qwen/source/deployment/openllm.rst:33 8852ff46ecdb45b2bfc9885bbfaacb02 55 | msgid "List the supported Qwen2.5 models:" 56 | msgstr "列出支持的 Qwen2.5 模型:" 57 | 58 | #: ../../Qwen/source/deployment/openllm.rst:39 3e4f6c11396844adb30d4e5812339484 59 | msgid "The results also display the required GPU resources and supported platforms:" 60 | msgstr "结果还会显示所需的 GPU 资源和支持的平台:" 61 | 62 | #: ../../Qwen/source/deployment/openllm.rst:57 ac4c0db02f5249d5882940820779db9a 63 | msgid "To start a server with one of the models, use ``openllm serve`` like this:" 64 | msgstr "要使用其中一个模型来启动服务器,请使用 ``openllm serve`` 命令,例如:" 65 | 66 | #: ../../Qwen/source/deployment/openllm.rst:63 0a1d3ec35c684e3bb3e971c916aa9be7 67 | msgid "By default, the server starts at ``http://localhost:3000/``." 68 | msgstr "默认情况下,服务器启动在 http://localhost:3000/。" 69 | 70 | #: ../../Qwen/source/deployment/openllm.rst:66 2e787de9a62f4342bdf8f88ee0df5379 71 | msgid "Interact with the model server" 72 | msgstr "与模型服务器交互" 73 | 74 | #: ../../Qwen/source/deployment/openllm.rst:68 b22802ad9027458bb30ea0da665fea36 75 | msgid "With the model server up and running, you can call its APIs in the following ways:" 76 | msgstr "服务器运行后,可以通过以下方式调用其 API:" 77 | 78 | #: ../../Qwen/source/deployment/openllm.rst 76214ea690094930899d6f2eddcc1454 79 | msgid "CURL" 80 | msgstr "CURL" 81 | 82 | #: ../../Qwen/source/deployment/openllm.rst:74 42775a3df58f474782d29f2f82707bd9 83 | msgid "Send an HTTP request to its ``/generate`` endpoint via CURL:" 84 | msgstr "通过 CURL 向其 ``/generate`` 端点发送 HTTP 请求:" 85 | 86 | #: ../../Qwen/source/deployment/openllm.rst 4f0ff3eee2ab49dda5a72bd611a9d45e 87 | msgid "Python client" 88 | msgstr "Python 客户端" 89 | 90 | #: ../../Qwen/source/deployment/openllm.rst:91 ce2e11a46e434798947b1e74ce82a19c 91 | msgid "Call the OpenAI-compatible endpoints with frameworks and tools that support the OpenAI API protocol. Here is an example:" 92 | msgstr "使用支持 OpenAI API 协议的框架和工具来调用。例如:" 93 | 94 | #: ../../Qwen/source/deployment/openllm.rst 107921d1a855430ca70c8c163d37c7f2 95 | msgid "Chat UI" 96 | msgstr "聊天 UI" 97 | 98 | #: ../../Qwen/source/deployment/openllm.rst:118 99 | #: b92df2759cd54c2b8316e2a160ede656 100 | msgid "OpenLLM provides a chat UI at the ``/chat`` endpoint for the LLM server at http://localhost:3000/chat." 101 | msgstr "OpenLLM 为 LLM 服务器提供的聊天 UI 位于 ``/chat`` 端点,地址为 http://localhost:3000/chat。" 102 | 103 | #: ../../Qwen/source/deployment/openllm.rst:123 104 | #: 0d3fa679178f443caf9c87623001be1f 105 | msgid "Model repository" 106 | msgstr "模型仓库" 107 | 108 | #: ../../Qwen/source/deployment/openllm.rst:125 109 | #: 54d6a9bdcc064aeb95a23b60d3d575ab 110 | msgid "A model repository in OpenLLM represents a catalog of available LLMs. You can add your own repository to OpenLLM with custom Qwen2.5 variants for your specific needs. See our `documentation to learn details `_." 111 | msgstr "OpenLLM 中的模型仓库表示可用的 LLM 目录。您可以为 OpenLLM 添加自定义的 Qwen2.5 模型仓库,以满足您的特定需求。请参阅 `我们的文档 `_ 了解详细信息。" 112 | 113 | -------------------------------------------------------------------------------- /docs/locales/zh_CN/LC_MESSAGES/framework/Langchain.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2024, Qwen Team 3 | # This file is distributed under the same license as the Qwen package. 4 | # FIRST AUTHOR , 2024. 
5 | # 6 | msgid "" 7 | msgstr "" 8 | "Project-Id-Version: Qwen \n" 9 | "Report-Msgid-Bugs-To: \n" 10 | "POT-Creation-Date: 2025-04-28 19:42+0800\n" 11 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 12 | "Last-Translator: FULL NAME \n" 13 | "Language: zh_CN\n" 14 | "Language-Team: zh_CN \n" 15 | "Plural-Forms: nplurals=1; plural=0;\n" 16 | "MIME-Version: 1.0\n" 17 | "Content-Type: text/plain; charset=utf-8\n" 18 | "Content-Transfer-Encoding: 8bit\n" 19 | "Generated-By: Babel 2.17.0\n" 20 | 21 | #: ../../Qwen/source/framework/Langchain.rst:2 6f9b66430d9c495592b1e275fdfd7c9e 22 | msgid "Langchain" 23 | msgstr "" 24 | 25 | #: ../../Qwen/source/framework/Langchain.rst:5 1205af46f88e4d6681003403109385c3 26 | msgid "To be updated for Qwen3." 27 | msgstr "仍需为Qwen3更新。" 28 | 29 | #: ../../Qwen/source/framework/Langchain.rst:7 115ee7b1c8404629a8f98175264cc114 30 | msgid "This guide helps you build a question-answering application based on a local knowledge base using ``Qwen2.5-7B-Instruct`` with ``langchain``. The goal is to establish a knowledge base Q&A solution." 31 | msgstr "本教程旨在帮助您利用 ``Qwen2.5-7B-Instruct`` 与 ``langchain`` ,基于本地知识库构建问答应用。目标是建立一个知识库问答解决方案。" 32 | 33 | #: ../../Qwen/source/framework/Langchain.rst:12 34 | #: 7257b95612fb423bb9ca73212fd12a02 35 | msgid "Basic Usage" 36 | msgstr "基础用法" 37 | 38 | #: ../../Qwen/source/framework/Langchain.rst:14 39 | #: fecf7a682dcc4c15a53da1f7cdf145e5 40 | msgid "The implementation process of this project includes loading files -> reading text -> segmenting text -> vectorizing text -> vectorizing questions -> matching the top k most similar text vectors with the question vectors -> incorporating the matched text as context along with the question into the prompt -> submitting to the Qwen2.5-7B-Instruct to generate an answer. Below is an example:" 41 | msgstr "您可以仅使用您的文档配合 ``langchain`` 来构建一个问答应用。该项目的实现流程包括加载文件 -> 阅读文本 -> 文本分段 -> 文本向量化 -> 问题向量化 -> 将最相似的前k个文本向量与问题向量匹配 -> 将匹配的文本作为上下文连同问题一起纳入提示 -> 提交给Qwen2.5-7B-Instruct生成答案。以下是一个示例:" 42 | 43 | #: ../../Qwen/source/framework/Langchain.rst:98 44 | #: 6ad1ebd2ef4a49f9aa66cfdf777e1290 45 | msgid "After loading the Qwen2.5-7B-Instruct model, you should specify the txt file for retrieval." 46 | msgstr "加载Qwen2.5-7B-Instruct模型后,您可以指定需要用于知识库问答的txt文件。" 47 | 48 | #: ../../Qwen/source/framework/Langchain.rst:274 49 | #: 00467b1e4e294a26b9f49886633331e0 50 | msgid "Next Step" 51 | msgstr "下一步" 52 | 53 | #: ../../Qwen/source/framework/Langchain.rst:276 54 | #: 15ed906687054af78545290ba0746380 55 | msgid "Now you can chat with Qwen2.5 use your own document. Continue to read the documentation and try to figure out more advanced usages of model retrieval!" 56 | msgstr "现在,您可以在您自己的文档上与Qwen2.5进行交流。继续阅读文档,尝试探索模型检索的更多高级用法!" 57 | 58 | -------------------------------------------------------------------------------- /docs/locales/zh_CN/LC_MESSAGES/framework/LlamaIndex.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2024, Qwen Team 3 | # This file is distributed under the same license as the Qwen package. 4 | # FIRST AUTHOR , 2024. 
5 | # 6 | msgid "" 7 | msgstr "" 8 | "Project-Id-Version: Qwen \n" 9 | "Report-Msgid-Bugs-To: \n" 10 | "POT-Creation-Date: 2025-04-28 19:42+0800\n" 11 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 12 | "Last-Translator: FULL NAME \n" 13 | "Language: zh_CN\n" 14 | "Language-Team: zh_CN \n" 15 | "Plural-Forms: nplurals=1; plural=0;\n" 16 | "MIME-Version: 1.0\n" 17 | "Content-Type: text/plain; charset=utf-8\n" 18 | "Content-Transfer-Encoding: 8bit\n" 19 | "Generated-By: Babel 2.17.0\n" 20 | 21 | #: ../../Qwen/source/framework/LlamaIndex.rst:2 22 | #: 2e41f8696c20488d8593b670c6361edf 23 | msgid "LlamaIndex" 24 | msgstr "LlamaIndex" 25 | 26 | #: ../../Qwen/source/framework/LlamaIndex.rst:5 27 | #: 20b3836fd391457bb00bf75b61e23e0d 28 | msgid "To be updated for Qwen3." 29 | msgstr "仍需为Qwen3更新。" 30 | 31 | #: ../../Qwen/source/framework/LlamaIndex.rst:7 32 | #: 86d9e6f0684749aab40a9824cd026fa3 33 | msgid "To connect Qwen2.5 with external data, such as documents, web pages, etc., we offer a tutorial on `LlamaIndex `__. This guide helps you quickly implement retrieval-augmented generation (RAG) using LlamaIndex with Qwen2.5." 34 | msgstr "为了实现 Qwen2.5 与外部数据(例如文档、网页等)的连接,我们提供了 `LlamaIndex `__ 的详细教程。本指南旨在帮助用户利用 LlamaIndex 与 Qwen2.5 快速部署检索增强生成(RAG)技术。" 35 | 36 | #: ../../Qwen/source/framework/LlamaIndex.rst:11 37 | #: 71ed222858054687a5b33222bb6ac086 38 | msgid "Preparation" 39 | msgstr "环境准备" 40 | 41 | #: ../../Qwen/source/framework/LlamaIndex.rst:13 42 | #: 161d9153d6484dd5a1f1bdb340847814 43 | msgid "To implement RAG, we advise you to install the LlamaIndex-related packages first." 44 | msgstr "为实现检索增强生成(RAG),我们建议您首先安装与 LlamaIndex 相关的软件包。" 45 | 46 | #: ../../Qwen/source/framework/LlamaIndex.rst:16 47 | #: a8d6acb1001a42c88185b971ae2de3bf 48 | msgid "The following is a simple code snippet showing how to do this:" 49 | msgstr "以下是一个简单的代码示例:" 50 | 51 | #: ../../Qwen/source/framework/LlamaIndex.rst:25 52 | #: e441d3b8fb6d4a13b52e1560ef250b16 53 | msgid "Set Parameters" 54 | msgstr "设置参数" 55 | 56 | #: ../../Qwen/source/framework/LlamaIndex.rst:27 57 | #: c2481804c3f34c7f883eed92ffa3111e 58 | msgid "Now we can set up LLM, embedding model, and the related configurations. Qwen2.5-Instruct supports conversations in multiple languages, including English and Chinese. You can use the ``bge-base-en-v1.5`` model to retrieve from English documents, and you can download the ``bge-base-zh-v1.5`` model to retrieve from Chinese documents. You can also choose ``bge-large`` or ``bge-small`` as the embedding model or modify the context window size or text chunk size depending on your computing resources. Qwen2.5 model families support a maximum of 32K context window size (up to 128K for 7B, 14B, 32B, and 72B, requiring extra configuration)" 59 | msgstr "现在,我们可以设置语言模型和向量模型。Qwen2.5-Instruct支持包括英语和中文在内的多种语言对话。您可以使用 ``bge-base-en-v1.5`` 模型来检索英文文档,下载 ``bge-base-zh-v1.5`` 模型以检索中文文档。根据您的计算资源,您还可以选择 ``bge-large`` 或 ``bge-small`` 作为向量模型,或调整上下文窗口大小或文本块大小。Qwen2.5模型系列支持最大32K上下文窗口大小(7B 、14B 、32B 及 72B可扩展支持 128K 上下文,但需要额外配置)" 60 | 61 | #: ../../Qwen/source/framework/LlamaIndex.rst:85 62 | #: 74c35d5a03734c289d162dfa3813ada6 63 | msgid "Build Index" 64 | msgstr "构建索引" 65 | 66 | #: ../../Qwen/source/framework/LlamaIndex.rst:87 67 | #: c49859d4ea5f49dba1fa2263f3ae284d 68 | msgid "Now we can build index from documents or websites." 
69 | msgstr "现在我们可以从文档或网站构建索引。" 70 | 71 | #: ../../Qwen/source/framework/LlamaIndex.rst:89 72 | #: b460d000037e4266a4d9f43d38f1f9b0 73 | msgid "The following code snippet demonstrates how to build an index for files (regardless of whether they are in PDF or TXT format) in a local folder named 'document'." 74 | msgstr "以下代码片段展示了如何为本地名为'document'的文件夹中的文件(无论是PDF格式还是TXT格式)构建索引。" 75 | 76 | #: ../../Qwen/source/framework/LlamaIndex.rst:102 77 | #: a416d18b227940e29fac1f59851ff8c4 78 | msgid "The following code snippet demonstrates how to build an index for the content in a list of websites." 79 | msgstr "以下代码片段展示了如何为一系列网站的内容构建索引。" 80 | 81 | #: ../../Qwen/source/framework/LlamaIndex.rst:118 82 | #: 487cf928d048424fa1b50438f701137c 83 | msgid "To save and load the index, you can use the following code snippet." 84 | msgstr "要保存和加载已构建的索引,您可以使用以下代码示例。" 85 | 86 | #: ../../Qwen/source/framework/LlamaIndex.rst:132 87 | #: c68419c4318d46e891f5df9191be6d2d 88 | msgid "RAG" 89 | msgstr "检索增强(RAG)" 90 | 91 | #: ../../Qwen/source/framework/LlamaIndex.rst:134 92 | #: 8ad20a8f43fe496084a40f963ba97440 93 | msgid "Now you can perform queries, and Qwen2.5 will answer based on the content of the indexed documents." 94 | msgstr "现在您可以输入查询,Qwen2.5 将基于索引文档的内容提供答案。" 95 | 96 | -------------------------------------------------------------------------------- /docs/locales/zh_CN/LC_MESSAGES/framework/qwen_agent.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2024, Qwen Team 3 | # This file is distributed under the same license as the Qwen package. 4 | # FIRST AUTHOR , 2024. 5 | # 6 | msgid "" 7 | msgstr "" 8 | "Project-Id-Version: Qwen \n" 9 | "Report-Msgid-Bugs-To: \n" 10 | "POT-Creation-Date: 2025-05-16 18:57+0800\n" 11 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 12 | "Last-Translator: FULL NAME \n" 13 | "Language: zh_CN\n" 14 | "Language-Team: zh_CN \n" 15 | "Plural-Forms: nplurals=1; plural=0;\n" 16 | "MIME-Version: 1.0\n" 17 | "Content-Type: text/plain; charset=utf-8\n" 18 | "Content-Transfer-Encoding: 8bit\n" 19 | "Generated-By: Babel 2.17.0\n" 20 | 21 | #: ../../source/framework/qwen_agent.rst:2 74719c4bae294c5ea93e9f8542cef14c 22 | msgid "Qwen-Agent" 23 | msgstr "Qwen-Agent" 24 | 25 | #: ../../source/framework/qwen_agent.rst:4 2a3d08cd70a34436bf5d9e1617a4d392 26 | msgid "`Qwen-Agent `__ is a framework for developing LLM applications based on the instruction following, tool usage, planning, and memory capabilities of Qwen." 27 | msgstr "`Qwen-Agent `__ 是一个基于 Qwen 的指令跟随、工具使用、计划和记忆能力来开发 LLM 应用程序的框架。" 28 | 29 | #: ../../source/framework/qwen_agent.rst:8 ada0e9f26c6748768f66c1c62e4b6d75 30 | msgid "This is a simple tutorial on using Qwen-Agent to quickly experience the agentic capabilities of Qwen3. For more detailed information, please refer to `Qwen-Agent `__ repository." 
31 | msgstr "本教程展示基于 Qwen-Agent 快速体验 Qwen3 智能体能力的流程。更多信息请参考 `Qwen-Agent `__ 仓库。" 32 | 33 | #: ../../source/framework/qwen_agent.rst:14 b0997eeb63844471b1637075add23cb0 34 | msgid "Installation" 35 | msgstr "安装" 36 | 37 | #: ../../source/framework/qwen_agent.rst:16 b6ba4e319cd24dee88d5cdbde60b096b 38 | msgid "Install the stable version from PyPI:" 39 | msgstr "从 PyPI 安装 Qwen-Agent 的稳定版本:" 40 | 41 | #: ../../source/framework/qwen_agent.rst:29 3f5ac104f7a647b99b1146d72fdda96d 42 | msgid "Developing Your Own Agent" 43 | msgstr "开发您自己的智能体" 44 | 45 | #: ../../source/framework/qwen_agent.rst:31 e85c4ea6d3d9406da3823cb4f188ffc4 46 | msgid "Qwen3 excels in tool calling capabilities. Qwen-Agent encapsulates tool-calling templates and tool-calling parsers internally, greatly reducing coding complexity." 47 | msgstr "Qwen3 在工具调用能力方面表现出色。Qwen-Agent 内部封装了工具调用模板和工具调用解析器,大大降低了编码复杂性。" 48 | 49 | #: ../../source/framework/qwen_agent.rst:35 d63624897ee340fb8d31eeea6a02e995 50 | msgid "To define the available tools, you can use the MCP configuration file, use the integrated tool of Qwen-Agent, or integrate other tools by yourself." 51 | msgstr "要定义可用的工具,您可以使用 MCP 配置文件,使用 Qwen-Agent 的集成工具,或者自行集成其他工具。" 52 | 53 | #: ../../source/framework/qwen_agent.rst:112 ada0e9f26c6748768f66c1c62e4b6d75 54 | msgid "For more detailed examples and MCP cookbooks, please refer to `Qwen-Agent `__ repository." 55 | msgstr "有关更详细的示例和 MCP 使用指南,请参阅 `Qwen-Agent `__ 仓库。" 56 | 57 | #~ msgid "To be updated for Qwen3." 58 | #~ msgstr "仍需为Qwen3更新。" 59 | 60 | #~ msgid "Qwen-Agent provides atomic components such as LLMs and prompts, as well as high-level components such as Agents. The example below uses the Assistant component as an illustration, demonstrating how to add custom tools and quickly develop an agent that uses tools." 61 | #~ msgstr "Qwen-Agent 提供包括语言模型和提示词等原子级组件,及智能体等高级组件在内的多种组件。以下示例选取助理组件进行展示,阐述了如何整合自定义工具以及如何迅速开发出一个能够应用这些工具的代理程序。" 62 | 63 | #~ msgid "The framework also provides more atomic components for developers to combine. For additional showcases, please refer to `examples `__." 64 | #~ msgstr "该框架还为开发者提供了更多的原子组件以供组合使用。欲了解更多示例,请参见 `examples `__。" 65 | 66 | #~ msgid "This is the simplest tutorial on using Qwen-Agent to quickly experience the agentic capabilities of Qwen3. For more detailed information, please refer to `Qwen-Agent `__ repository." 67 | #~ msgstr "" 68 | 69 | -------------------------------------------------------------------------------- /docs/locales/zh_CN/LC_MESSAGES/index.po: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024, Qwen Team, Alibaba Group. 2 | # This file is distributed under the same license as the Qwen package. 
3 | # 4 | msgid "" 5 | msgstr "" 6 | "Project-Id-Version: Qwen \n" 7 | "Report-Msgid-Bugs-To: \n" 8 | "POT-Creation-Date: 2025-04-28 19:42+0800\n" 9 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 10 | "Last-Translator: FULL NAME \n" 11 | "Language: zh_CN\n" 12 | "Language-Team: zh_CN \n" 13 | "Plural-Forms: nplurals=1; plural=0;\n" 14 | "MIME-Version: 1.0\n" 15 | "Content-Type: text/plain; charset=utf-8\n" 16 | "Content-Transfer-Encoding: 8bit\n" 17 | "Generated-By: Babel 2.17.0\n" 18 | 19 | #: ../../Qwen/source/index.rst:34 20 | msgid "Getting Started" 21 | msgstr "快速开始" 22 | 23 | #: ../../Qwen/source/index.rst:44 24 | msgid "Inference" 25 | msgstr "推理" 26 | 27 | #: ../../Qwen/source/index.rst:51 28 | msgid "Run Locally" 29 | msgstr "本地运行" 30 | 31 | #: ../../Qwen/source/index.rst:60 32 | msgid "Deployment" 33 | msgstr "部署" 34 | 35 | #: ../../Qwen/source/index.rst:71 36 | msgid "Quantization" 37 | msgstr "量化" 38 | 39 | #: ../../Qwen/source/index.rst:80 40 | msgid "Training" 41 | msgstr "训练" 42 | 43 | #: ../../Qwen/source/index.rst:87 44 | msgid "Framework" 45 | msgstr "框架" 46 | 47 | #: ../../Qwen/source/index.rst:2 6e52d3a497924f828d4c6b9dd59370d5 48 | msgid "Welcome to Qwen!" 49 | msgstr "欢迎来到Qwen" 50 | 51 | #: ../../Qwen/source/index.rst:4 235805a6d4a34184821c0f4f81020ef1 52 | msgid "Qwen3" 53 | msgstr "" 54 | 55 | #: ../../Qwen/source/index.rst:11 b8a3aa3f31594232959a08d89e9dc7db 56 | msgid "Qwen is the large language model and large multimodal model series of the Qwen Team, Alibaba Group. Both language models and multimodal models are pretrained on large-scale multilingual and multimodal data and post-trained on quality data for aligning to human preferences. Qwen is capable of natural language understanding, text generation, vision understanding, audio understanding, tool use, role play, playing as AI agent, etc." 57 | msgstr "Qwen是阿里巴巴集团Qwen团队研发的大语言模型和大型多模态模型系列。无论是语言模型还是多模态模型,均在大规模多语言和多模态数据上进行预训练,并通过高质量数据进行后期微调以贴近人类偏好。Qwen具备自然语言理解、文本生成、视觉理解、音频理解、工具使用、角色扮演、作为AI Agent进行互动等多种能力。" 58 | 59 | #: ../../Qwen/source/index.rst:14 8735c67355064a97b2793b721a701b21 60 | msgid "The latest version, Qwen3, has the following features:" 61 | msgstr "最新版本Qwen3有以下特点:" 62 | 63 | #: ../../Qwen/source/index.rst:16 1956d75084244379aad9503fcc572f00 64 | msgid "**Dense and Mixture-of-Experts (MoE) models**, available in 0.6B, 1.7B, 4B, 8B, 14B, 32B and 30B-A3B, 235B-A22B." 65 | msgstr "**全尺寸稠密与混合专家模型**:0.6B, 1.7B, 4B, 8B, 14B, 32B and 30B-A3B, 235B-A22B" 66 | 67 | #: ../../Qwen/source/index.rst:17 1fdf12161cd14663b67b2c08f9219ddb 68 | msgid "**Seamless switching between thinking mode** (for complex logical reasoning, math, and coding) and **non-thinking mode** (for efficient, general-purpose chat) **within a single model**, ensuring optimal performance across various scenarios." 69 | msgstr "支持在**思考模式**(用于复杂逻辑推理、数学和编码)和 **非思考模式** (用于高效通用对话)之间**无缝切换**,确保在各种场景下的最佳性能。" 70 | 71 | #: ../../Qwen/source/index.rst:18 189ff2a03ad249ef88202c34e9f8aa86 72 | msgid "**Significantly enhancement in reasoning capabilities**, surpassing previous QwQ (in thinking mode) and Qwen2.5 instruct models (in non-thinking mode) on mathematics, code generation, and commonsense logical reasoning." 
73 | msgstr "**显著增强的推理能力**,在数学、代码生成和常识逻辑推理方面超越了之前的 QwQ(在思考模式下)和 Qwen2.5 指令模型(在非思考模式下)。" 74 | 75 | #: ../../Qwen/source/index.rst:19 64ebcda0381148cb8edf8d92b49469ea 76 | msgid "**Superior human preference alignment**, excelling in creative writing, role-playing, multi-turn dialogues, and instruction following, to deliver a more natural, engaging, and immersive conversational experience." 77 | msgstr "**卓越的人类偏好对齐**,在创意写作、角色扮演、多轮对话和指令跟随方面表现出色,提供更自然、更吸引人和更具沉浸感的对话体验。" 78 | 79 | #: ../../Qwen/source/index.rst:20 ec0ebb91f1ed491f8672aefef6307d85 80 | msgid "**Expertise in agent capabilities**, enabling precise integration with external tools in both thinking and unthinking modes and achieving leading performance among open-source models in complex agent-based tasks." 81 | msgstr "**擅长智能体能力**,可以在思考和非思考模式下精确集成外部工具,在复杂的基于代理的任务中在开源模型中表现领先。" 82 | 83 | #: ../../Qwen/source/index.rst:21 526b161edf284e1b913aabc7e7fcc77c 84 | msgid "**Support of 100+ languages and dialects** with strong capabilities for **multilingual instruction following** and **translation**." 85 | msgstr "**支持 100 多种语言和方言**,具有强大的多语言理解、推理、指令跟随和生成能力。" 86 | 87 | #: ../../Qwen/source/index.rst:23 79ed3f0e7da043bb8b53f510ed244814 88 | msgid "For more information, please visit our:" 89 | msgstr "想了解更多信息,欢迎访问:" 90 | 91 | #: ../../Qwen/source/index.rst:25 b2e579ae57de4d2985ab1c350fdf2458 92 | msgid "`Blog `__" 93 | msgstr "`博客 `__" 94 | 95 | #: ../../Qwen/source/index.rst:26 406389fe90064e879bd28665a021ee7e 96 | msgid "`GitHub `__" 97 | msgstr "`GitHub `__" 98 | 99 | #: ../../Qwen/source/index.rst:27 714c64df6aed4e608571de0155199fef 100 | msgid "`Hugging Face `__" 101 | msgstr "`Hugging Face `__" 102 | 103 | #: ../../Qwen/source/index.rst:28 214e12e0b1c04b268582b2c46d22334d 104 | msgid "`ModelScope `__" 105 | msgstr "`ModelScope `__" 106 | 107 | #: ../../Qwen/source/index.rst:29 9c64e461dc3a440ab92d94887fe3d2d8 108 | msgid "`Qwen3 Collection `__" 109 | msgstr "" 110 | 111 | #: ../../Qwen/source/index.rst:31 c6056edc8a3a4a12bd3a75eeb210f7a2 112 | msgid "Join our community by joining our `Discord `__ and `WeChat `__ group. We are looking forward to seeing you there!" 113 | msgstr "加入社区,加入 `Discord `__ 和 `微信群 `__ 。很期待见到你们!" 114 | 115 | #~ msgid "Web UI" 116 | #~ msgstr "Web UI" 117 | 118 | #~ msgid "Benchmark" 119 | #~ msgstr "评测" 120 | 121 | #~ msgid "Qwen2.5" 122 | #~ msgstr "" 123 | 124 | #~ msgid "Dense, easy-to-use, decoder-only language models, available in **0.5B**, **1.5B**, **3B**, **7B**, **14B**, **32B**, and **72B** sizes, and base and instruct variants." 125 | #~ msgstr "易于使用的仅解码器稠密语言模型,提供 **0.5B** 、**1.5B** 、**3B** 、**7B** 、**14B** 、**32B** 和 **72B** 共7种参数规模的模型,并且有基模型和指令微调模型两种变体(其中“ B ”表示“十亿”, 72B 即为 720 亿)" 126 | 127 | #~ msgid "Pretrained on our latest large-scale dataset, encompassing up to **18T** tokens." 128 | #~ msgstr "利用我们最新的数据集进行预训练,包含多达 18T tokens (其中“ T ”表示“万亿”, 18T 即为 18 万亿)" 129 | 130 | #~ msgid "Significant improvements in instruction following, generating long texts (over 8K tokens), understanding structured data (e.g, tables), and generating structured outputs especially JSON." 131 | #~ msgstr "在遵循指令、生成长文本(超过 8K tokens )、理解结构化数据(例如,表格)以及生成结构化输出特别是 JSON 方面有了显著改进" 132 | 133 | #~ msgid "More resilient to the diversity of system prompts, enhancing role-play implementation and condition-setting for chatbots." 134 | #~ msgstr "更加适应多样化的系统提示,增强了角色扮演的实现和聊天机器人的背景设置。" 135 | 136 | #~ msgid "Context length support up to **128K** tokens and can generate up to **8K** tokens." 
137 | #~ msgstr "支持最多达 **128K** tokens 的上下文长度,并能生成多达 **8K** tokens 的文本。" 138 | 139 | #~ msgid "Multilingual support for over **29** languages, including Chinese, English, French, Spanish, Portuguese, German, Italian, Russian, Japanese, Korean, Vietnamese, Thai, Arabic, and more." 140 | #~ msgstr "支持超过 **29** 种语言,包括中文、英文、法文、西班牙文、葡萄牙文、德文、意大利文、俄文、日文、韩文、越南文、泰文、阿拉伯文等。" 141 | 142 | #~ msgid "`Qwen2.5 Collection `__" 143 | #~ msgstr "" 144 | 145 | -------------------------------------------------------------------------------- /docs/locales/zh_CN/LC_MESSAGES/quantization/awq.po: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024, Qwen Team, Alibaba Group. 2 | # This file is distributed under the same license as the Qwen package. 3 | # 4 | msgid "" 5 | msgstr "" 6 | "Project-Id-Version: Qwen \n" 7 | "Report-Msgid-Bugs-To: \n" 8 | "POT-Creation-Date: 2025-04-28 19:42+0800\n" 9 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 10 | "Last-Translator: FULL NAME \n" 11 | "Language: zh_CN\n" 12 | "Language-Team: zh_CN \n" 13 | "Plural-Forms: nplurals=1; plural=0;\n" 14 | "MIME-Version: 1.0\n" 15 | "Content-Type: text/plain; charset=utf-8\n" 16 | "Content-Transfer-Encoding: 8bit\n" 17 | "Generated-By: Babel 2.17.0\n" 18 | 19 | #: ../../Qwen/source/quantization/awq.md:1 363514c3e24c4d2aa54832e85acf34ef 20 | msgid "AWQ" 21 | msgstr "AWQ" 22 | 23 | #: ../../Qwen/source/quantization/awq.md:4 36b5c0de1013499f9f1e41edf8fa28ca 24 | msgid "To be updated for Qwen3." 25 | msgstr "仍需为Qwen3更新。" 26 | 27 | #: ../../Qwen/source/quantization/awq.md:7 9d6a80a82b044628bc9c911785ac9160 28 | msgid "For quantized models, one of our recommendations is the usage of [AWQ](https://arxiv.org/abs/2306.00978) with [AutoAWQ](https://github.com/casper-hansen/AutoAWQ)." 29 | msgstr "对于量化模型,我们推荐使用 [AWQ](https://arxiv.org/abs/2306.00978) 结合 [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) " 30 | 31 | #: ../../Qwen/source/quantization/awq.md:9 139542ed4b414cfb834b3fd81ea88d51 32 | msgid "**AWQ** refers to Activation-aware Weight Quantization, a hardware-friendly approach for LLM low-bit weight-only quantization." 33 | msgstr "**AWQ**即激活值感知的权重量化(Activation-aware Weight Quantization),是一种针对LLM的低比特权重量化的硬件友好方法。" 34 | 35 | #: ../../Qwen/source/quantization/awq.md:11 9a2959bb9f984e36a299bc40abca9402 36 | msgid "**AutoAWQ** is an easy-to-use Python library for 4-bit quantized models. AutoAWQ speeds up models by 3x and reduces memory requirements by 3x compared to FP16. AutoAWQ implements the Activation-aware Weight Quantization (AWQ) algorithm for quantizing LLMs." 37 | msgstr "**AutoAWQ**是一个易于使用的工具包,用于4比特量化模型。相较于FP16,AutoAWQ能够将模型的运行速度提升3倍,并将内存需求降低至原来的三分之一。AutoAWQ实现了AWQ算法,可用于LLM的量化处理。" 38 | 39 | #: ../../Qwen/source/quantization/awq.md:15 4f9fcd93d1f44b48869224c0f4e8b76a 40 | msgid "In this document, we show you how to use the quantized model with Hugging Face `transformers` and also how to quantize your own model." 41 | msgstr "在本文档中,我们将向您展示如何在Hugging Face `transformers`框架下使用量化模型,以及如何对您自己的模型进行量化" 42 | 43 | #: ../../Qwen/source/quantization/awq.md:17 870ebc162f3749b48fe454df85aaaf4b 44 | msgid "Usage of AWQ Models with Hugging Face transformers" 45 | msgstr "在Hugging Face transformers中使用AWQ量化模型" 46 | 47 | #: ../../Qwen/source/quantization/awq.md:19 cc7bd785c7ac45a4980fbda683699e43 48 | msgid "Now, `transformers` has officially supported AutoAWQ, which means that you can directly use the quantized model with `transformers`. 
The following is a very simple code snippet showing how to run `Qwen2.5-7B-Instruct-AWQ` with the quantized model:" 49 | msgstr "现在,`transformers`已经正式支持AutoAWQ,这意味着您可以直接在`transformers`中使用AWQ量化模型。以下是一个非常简单的代码片段,展示如何运行量化模型 `Qwen2.5-7B-Instruct-AWQ` :" 50 | 51 | #: ../../Qwen/source/quantization/awq.md:56 47826d51abf54ad8a89ef9b91127a700 52 | msgid "Usage of AWQ Models with vLLM" 53 | msgstr "在vLLM中使用AWQ量化模型" 54 | 55 | #: ../../Qwen/source/quantization/awq.md:58 b7235ae8f8344dd4a3d2029bbe7a40fc 56 | msgid "vLLM has supported AWQ, which means that you can directly use our provided AWQ models or those quantized with `AutoAWQ` with vLLM. We recommend using the latest version of vLLM (`vllm>=0.6.1`) which brings performance improvements to AWQ models; otherwise, the performance might not be well-optimized." 57 | msgstr "vLLM已支持AWQ,您可以直接使用我们提供的AWQ量化模型或使用`AutoAWQ`量化的模型。我们建议使用最新版的vLLM (`vllm>=0.6.1`),新版为AWQ量化模型提升了效率提;不然推理效率可能并为被良好优化(即效率可能较非量化模型低)。" 58 | 59 | #: ../../Qwen/source/quantization/awq.md:61 940ce8fdb5da442b99af2bc1739911c6 60 | msgid "Actually, the usage is the same with the basic usage of vLLM. We provide a simple example of how to launch OpenAI-API compatible API with vLLM and `Qwen2.5-7B-Instruct-AWQ`:" 61 | msgstr "实际上,使用AWQ模型与vLLM的基本用法相同。我们提供了一个简单的示例,展示了如何通过vLLM启动与OpenAI API兼容的接口,并使用 `Qwen2.5-7B-Instruct-AWQ` 模型:" 62 | 63 | #: ../../Qwen/source/quantization/awq.md:64 2d249915352049a6a8d5a06e1f4682ee 64 | msgid "Run the following in a shell to start an OpenAI-compatible API service:" 65 | msgstr "在终端中运行以下命令以开启OpenAI兼容API:" 66 | 67 | #: ../../Qwen/source/quantization/awq.md:70 be7bfbb81698429cbfcbcd24d062fc08 68 | msgid "Then, you can call the API as" 69 | msgstr "随后,您可以这样调用API:" 70 | 71 | #: ../../Qwen/source/quantization/awq.md:86 0dff7d5c7b044548a82e0ba68a043d80 72 | msgid "or you can use the API client with the `openai` Python package as shown below:" 73 | msgstr "或者你可以按照下面所示的方式,使用 `openai` Python包中的API客户端:" 74 | 75 | #: ../../Qwen/source/quantization/awq.md:115 65f4d60502ee486382e9bda9a5a826bb 76 | msgid "Quantize Your Own Model with AutoAWQ" 77 | msgstr "使用AutoAWQ量化你的模型" 78 | 79 | #: ../../Qwen/source/quantization/awq.md:117 c7c42af91c1a419194d65200bcfa2f26 80 | #, fuzzy 81 | msgid "If you want to quantize your own model to AWQ quantized models, we advise you to use AutoAWQ." 82 | msgstr "如果您希望将自定义模型量化为AWQ量化模型,我们建议您使用AutoAWQ。推荐通过安装源代码来获取并安装该工具包的最新版本:" 83 | 84 | #: ../../Qwen/source/quantization/awq.md:123 232e94883d044030b2193392788b9314 85 | msgid "Suppose you have finetuned a model based on `Qwen2.5-7B`, which is named `Qwen2.5-7B-finetuned`, with your own dataset, e.g., Alpaca. To build your own AWQ quantized model, you need to use the training data for calibration. Below, we provide a simple demonstration for you to run:" 86 | msgstr "假设你已经基于 `Qwen2.5-7B` 模型进行了微调,并将其命名为 `Qwen2.5-7B-finetuned` ,且使用的是你自己的数据集,比如Alpaca。若要构建你自己的AWQ量化模型,你需要使用训练数据进行校准。以下,我们将为你提供一个简单的演示示例以便运行:" 87 | 88 | #: ../../Qwen/source/quantization/awq.md:141 5162195f32ee4ecba229aa137da1aba4 89 | msgid "Then you need to prepare your data for calibration. What you need to do is just put samples into a list, each of which is a text. As we directly use our finetuning data for calibration, we first format it with ChatML template. 
For example," 90 | msgstr "接下来,您需要准备数据以进行校准。您需要做的就是将样本放入一个列表中,其中每个样本都是一段文本。由于我们直接使用微调数据来进行校准,所以我们首先使用ChatML模板对其进行格式化。例如:" 91 | 92 | #: ../../Qwen/source/quantization/awq.md:153 0d4736e90e0242a8be15533de3aab6ff 93 | msgid "where each `msg` is a typical chat message as shown below:" 94 | msgstr "其中每个 `msg` 是一个典型的聊天消息,如下所示:" 95 | 96 | #: ../../Qwen/source/quantization/awq.md:163 79d86630600945ac85dbe13d07987016 97 | msgid "Then just run the calibration process by one line of code:" 98 | msgstr "然后只需通过一行代码运行校准过程:" 99 | 100 | #: ../../Qwen/source/quantization/awq.md:169 1ae219a50508465b98e3b3398e631681 101 | msgid "Finally, save the quantized model:" 102 | msgstr "最后,保存量化模型:" 103 | 104 | #: ../../Qwen/source/quantization/awq.md:176 58316c1a4172418aba9f37925963e17f 105 | msgid "Then you can obtain your own AWQ quantized model for deployment. Enjoy!" 106 | msgstr "然后你就可以得到一个可以用于部署的AWQ量化模型。玩得开心!" 107 | 108 | -------------------------------------------------------------------------------- /docs/locales/zh_CN/LC_MESSAGES/run_locally/mlx-lm.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2024, Qwen Team 3 | # This file is distributed under the same license as the Qwen package. 4 | # FIRST AUTHOR , 2024. 5 | # 6 | msgid "" 7 | msgstr "" 8 | "Project-Id-Version: Qwen \n" 9 | "Report-Msgid-Bugs-To: \n" 10 | "POT-Creation-Date: 2025-04-29 16:34+0800\n" 11 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 12 | "Last-Translator: FULL NAME \n" 13 | "Language: zh_CN\n" 14 | "Language-Team: zh_CN \n" 15 | "Plural-Forms: nplurals=1; plural=0;\n" 16 | "MIME-Version: 1.0\n" 17 | "Content-Type: text/plain; charset=utf-8\n" 18 | "Content-Transfer-Encoding: 8bit\n" 19 | "Generated-By: Babel 2.17.0\n" 20 | 21 | #: ../../source/run_locally/mlx-lm.md:1 47d6da370d364ad6a80f76d5d1f8a80d 22 | msgid "MLX LM" 23 | msgstr "" 24 | 25 | #: ../../source/run_locally/mlx-lm.md:4 f11e8701708e48559587dd3a8404be92 26 | msgid "To be updated for Qwen3." 27 | msgstr "仍需为Qwen3更新。" 28 | 29 | #: ../../source/run_locally/mlx-lm.md:7 9ff094754ecc40b79c3bc6737d1264dc 30 | #, fuzzy 31 | msgid "[mlx-lm](https://github.com/ml-explore/mlx-examples/tree/main/llms) helps you run LLMs locally on Apple Silicon. It is available at macOS. It has already supported Qwen models and this time, we have also provided checkpoints that you can directly use with it." 
32 | msgstr "[mlx-lm](https://github.com/ml-explore/mlx-examples/tree/main/llms)能让你在Apple Silicon上运行大型语言模型,适用于MacOS。mlx-lm已支持Qwen模型,此次我们提供直接可用的模型文件。" 33 | 34 | #: ../../source/run_locally/mlx-lm.md:11 df3a6d381ce44151b810ee2ca012c6d3 35 | msgid "Prerequisites" 36 | msgstr "准备工作" 37 | 38 | #: ../../source/run_locally/mlx-lm.md:13 2f6092faf4904990a208b2cecdc623b4 39 | msgid "The easiest way to get started is to install the `mlx-lm` package:" 40 | msgstr "首先需要安装`mlx-lm`包:" 41 | 42 | #: ../../source/run_locally/mlx-lm.md:15 8ae883c84d7b435293dab8502884cec6 43 | msgid "with `pip`:" 44 | msgstr "使用`pip`:" 45 | 46 | #: ../../source/run_locally/mlx-lm.md:21 0cd594d753fa478ca025271dca9af3b5 47 | msgid "with `conda`:" 48 | msgstr "使用`conda`:" 49 | 50 | #: ../../source/run_locally/mlx-lm.md:27 a82de876ad354ca5ae7eb2cc9fad0f1e 51 | #, fuzzy 52 | msgid "Running with Qwen MLX Files" 53 | msgstr "使用Qwen MLX模型文件" 54 | 55 | #: ../../source/run_locally/mlx-lm.md:29 36d5c803c391420fabc5640f57a6509b 56 | msgid "We provide model checkpoints with `mlx-lm` in our Hugging Face organization, and to search for what you need you can search the repo names with `-MLX`." 57 | msgstr "我们已在Hugging Face提供了适用于`mlx-lm`的模型文件,请搜索带`-MLX`的存储库。" 58 | 59 | #: ../../source/run_locally/mlx-lm.md:31 916de6911595431f9ebf5cc3eec51fe3 60 | msgid "Here provides a code snippet with `apply_chat_template` to show you how to load the tokenizer and model and how to generate contents." 61 | msgstr "这里我们展示了一个代码样例,其中使用了`apply_chat_template`来应用对话模板。" 62 | 63 | #: ../../source/run_locally/mlx-lm.md:52 299ccc7dc4984f5e97c58b58a6216d69 64 | msgid "Make Your MLX files" 65 | msgstr "自行制作MLX格式模型" 66 | 67 | #: ../../source/run_locally/mlx-lm.md:54 433503979d404191b934cf5e1ed7f655 68 | #, fuzzy 69 | msgid "You can make MLX files with just one command:" 70 | msgstr "仅用一条命令即可制作mlx格式模型" 71 | 72 | #: ../../source/run_locally/mlx-lm.md:60 b2302dcae1074aff87fe3a1ec49c15f0 73 | msgid "where" 74 | msgstr "参数含义分别是" 75 | 76 | #: ../../source/run_locally/mlx-lm.md:62 cbb011f4b76346cca7369b0d7538f899 77 | msgid "`--hf-path`: the model name on Hugging Face Hub or the local path" 78 | msgstr "`--hf-path`: Hugging Face Hub上的模型名或本地路径" 79 | 80 | #: ../../source/run_locally/mlx-lm.md:63 16348ddca1e34966a51cb192d1c7d064 81 | msgid "`--mlx-path`: the path for output files" 82 | msgstr "`--mlx-path`: 输出模型文件的存储路径" 83 | 84 | #: ../../source/run_locally/mlx-lm.md:64 34f6d5f01dcb4381ade2da35c75ea566 85 | msgid "`-q`: enable quantization" 86 | msgstr "`-q`: 启用量化" 87 | 88 | #~ msgid "MLX-LM" 89 | #~ msgstr "" 90 | 91 | -------------------------------------------------------------------------------- /docs/locales/zh_CN/LC_MESSAGES/run_locally/ollama.po: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024, Qwen Team, Alibaba Group. 2 | # This file is distributed under the same license as the Qwen package. 
3 | # 4 | msgid "" 5 | msgstr "" 6 | "Project-Id-Version: Qwen \n" 7 | "Report-Msgid-Bugs-To: \n" 8 | "POT-Creation-Date: 2025-04-29 16:34+0800\n" 9 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 10 | "Last-Translator: FULL NAME \n" 11 | "Language: zh_CN\n" 12 | "Language-Team: zh_CN \n" 13 | "Plural-Forms: nplurals=1; plural=0;\n" 14 | "MIME-Version: 1.0\n" 15 | "Content-Type: text/plain; charset=utf-8\n" 16 | "Content-Transfer-Encoding: 8bit\n" 17 | "Generated-By: Babel 2.17.0\n" 18 | 19 | #: ../../source/run_locally/ollama.md:1 66b5f4776d0a45ec9a7ed2c147a6323e 20 | msgid "Ollama" 21 | msgstr "Ollama" 22 | 23 | #: ../../source/run_locally/ollama.md:4 1a60e4f166184588802505205b51ea7a 24 | msgid "To be updated for Qwen3." 25 | msgstr "仍需为Qwen3更新。" 26 | 27 | #: ../../source/run_locally/ollama.md:7 06925624447242879d12bcbc3be61ce6 28 | #, fuzzy 29 | msgid "[Ollama](https://ollama.com/) helps you run LLMs locally with only a few commands. It is available at macOS, Linux, and Windows. Now, Qwen2.5 is officially on Ollama, and you can run it with one command:" 30 | msgstr "[Ollama](https://ollama.com/)帮助您通过少量命令即可在本地运行LLM。它适用于MacOS、Linux和Windows操作系统。现在,Qwen2.5正式上线Ollama,您只需一条命令即可运行它:" 31 | 32 | #: ../../source/run_locally/ollama.md:15 1180461df89c41c586d499daf5c6e0a0 33 | msgid "Next, we introduce more detailed usages of Ollama for running Qwen2.5 models." 34 | msgstr "接着,我们介绍在Ollama使用Qwen2.5模型的更多用法" 35 | 36 | #: ../../source/run_locally/ollama.md:17 816ea502f0be44bcab371fd76e486618 37 | msgid "Quickstart" 38 | msgstr "快速开始" 39 | 40 | #: ../../source/run_locally/ollama.md:19 c6d1c533593b495da62c73c8434636f8 41 | msgid "Visit the official website [Ollama](https://ollama.com/) and click download to install Ollama on your device. You can also search models on the website, where you can find the Qwen2.5 models. Except for the default one, you can choose to run Qwen2.5-Instruct models of different sizes by:" 42 | msgstr "访问官方网站[Ollama](https://ollama.com/),点击`Download`以在您的设备上安装Ollama。您还可以在网站上搜索模型,在这里您可以找到Qwen2.5系列模型。除了默认模型之外,您可以通过以下方式选择运行不同大小的Qwen2.5-Instruct模型:" 43 | 44 | #: ../../source/run_locally/ollama.md:23 9f87565dcf4b44b8adce249c1863adbd 45 | msgid "`ollama run qwen2.5:0.5b`" 46 | msgstr "" 47 | 48 | #: ../../source/run_locally/ollama.md:24 dcd5286c72a2440fb0fc7bfd210dd8d9 49 | msgid "`ollama run qwen2.5:1.5b`" 50 | msgstr "" 51 | 52 | #: ../../source/run_locally/ollama.md:25 d2c13d08cad34893804497095676ddd4 53 | msgid "`ollama run qwen2.5:3b`" 54 | msgstr "" 55 | 56 | #: ../../source/run_locally/ollama.md:26 e4a8e4cbd090469e9094afae3abe30f7 57 | msgid "`ollama run qwen2.5:7b`" 58 | msgstr "" 59 | 60 | #: ../../source/run_locally/ollama.md:27 a97beb23d6074c2db90428b717ab61dd 61 | msgid "`ollama run qwen2.5:14b`" 62 | msgstr "" 63 | 64 | #: ../../source/run_locally/ollama.md:28 049274657be149b4976d722cd608eb09 65 | msgid "`ollama run qwen2.5:32b`" 66 | msgstr "" 67 | 68 | #: ../../source/run_locally/ollama.md:29 baec60a5ed5047e3bd080aa96cfc577b 69 | msgid "`ollama run qwen2.5:72b`" 70 | msgstr "" 71 | 72 | #: ../../source/run_locally/ollama.md:32 2f9ef84694f5446f87db2333336f93c9 73 | msgid "`ollama` does not host base models. Even though the tag may not have the instruct suffix, they are all instruct models." 
74 | msgstr "`ollama`并不托管基模型。即便模型标签不带instruct后缀,实际也是instruct模型。" 75 | 76 | #: ../../source/run_locally/ollama.md:36 9545d728b8aa4163a66accdb54694caf 77 | msgid "Run Ollama with Your GGUF Files" 78 | msgstr "用Ollama运行你自己的GGUF文件" 79 | 80 | #: ../../source/run_locally/ollama.md:38 f5a073220da84c19975adcce41f76d5c 81 | msgid "Sometimes you don't want to pull models and you just want to use Ollama with your own GGUF files. Suppose you have a GGUF file of Qwen2.5, `qwen2.5-7b-instruct-q5_0.gguf`. For the first step, you need to create a file called `Modelfile`. The content of the file is shown below:" 82 | msgstr "有时您可能不想拉取模型,而是希望直接使用自己的GGUF文件来配合Ollama。假设您有一个名为`qwen2.5-7b-instruct-q5_0.gguf`的Qwen2.5的GGUF文件。在第一步中,您需要创建一个名为`Modelfile`的文件。该文件的内容如下所示:" 83 | 84 | #: ../../source/run_locally/ollama.md:101 15e36d92505e4e21a2fe5dd07c5861e7 85 | #, fuzzy 86 | msgid "Then create the Ollama model by running:" 87 | msgstr "然后通过运行下列命令来创建一个ollama模型" 88 | 89 | #: ../../source/run_locally/ollama.md:107 4087d4c583fe44acb1456ca2412f8c85 90 | #, fuzzy 91 | msgid "Once it is finished, you can run your Ollama model by:" 92 | msgstr "完成后,你即可运行你的ollama模型:" 93 | 94 | #: ../../source/run_locally/ollama.md:113 083503ce2e9d48eab757e8a4cf805d8d 95 | msgid "Tool Use" 96 | msgstr "工具调用" 97 | 98 | #: ../../source/run_locally/ollama.md:115 b7635c6f3ccc4294a2ba4a45f17ae163 99 | #, fuzzy 100 | msgid "Tool use is now supported Ollama and you should be able to run Qwen2.5 models with it. For more details, see our [function calling guide](../framework/function_call)." 101 | msgstr "Ollama现已支持工具调用,Qwen2.5也已适配。更多详情,请参阅我们的[函数调用指南](../framework/function_call)" 102 | 103 | -------------------------------------------------------------------------------- /docs/locales/zh_CN/LC_MESSAGES/training/llama_factory.po: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024, Qwen Team, Alibaba Group. 2 | # This file is distributed under the same license as the Qwen package. 3 | # 4 | msgid "" 5 | msgstr "" 6 | "Project-Id-Version: Qwen \n" 7 | "Report-Msgid-Bugs-To: \n" 8 | "POT-Creation-Date: 2025-04-28 19:42+0800\n" 9 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 10 | "Last-Translator: FULL NAME \n" 11 | "Language-Team: LANGUAGE \n" 12 | "MIME-Version: 1.0\n" 13 | "Content-Type: text/plain; charset=utf-8\n" 14 | "Content-Transfer-Encoding: 8bit\n" 15 | "Generated-By: Babel 2.17.0\n" 16 | 17 | #: ../../Qwen/source/training/llama_factory.rst:2 18 | #: 7a9018d9e7ee41858ac5c59723365a63 19 | msgid "LLaMA-Factory" 20 | msgstr "" 21 | 22 | #: ../../Qwen/source/training/llama_factory.rst:5 23 | #: 6e90d8f392914d029783ed85b510063f 24 | msgid "To be updated for Qwen3." 25 | msgstr "仍需为Qwen3更新。" 26 | 27 | #: ../../Qwen/source/training/llama_factory.rst:7 28 | #: e82fbe9827774824a4259372afda3240 29 | msgid "Here we provide a script for supervised finetuning Qwen2.5 with `LLaMA-Factory `__. This script for supervised finetuning (SFT) has the following features:" 30 | msgstr "我们将介绍如何使用 `LLaMA-Factory `__ 微调 Qwen2.5 模型。本脚本包含如下特点:" 31 | 32 | #: ../../Qwen/source/training/llama_factory.rst:11 33 | #: 7d37d7835f514ce68b9e4e3054919d3c 34 | msgid "Support single-GPU and multi-GPU training;" 35 | msgstr "支持单卡和多卡分布式训练" 36 | 37 | #: ../../Qwen/source/training/llama_factory.rst:13 38 | #: 232bdd05e26846989ae770a8da52ccc3 39 | msgid "Support full-parameter tuning, LoRA, Q-LoRA, Dora." 
40 | msgstr "支持全参数微调、LoRA、Q-LoRA 和 DoRA 。" 41 | 42 | #: ../../Qwen/source/training/llama_factory.rst:15 43 | #: 0cf21b5d01024a0999a290c5fa0f4e9e 44 | msgid "In the following, we introduce more details about the usage of the script." 45 | msgstr "下文将介绍更多关于脚本的用法。" 46 | 47 | #: ../../Qwen/source/training/llama_factory.rst:19 48 | #: aa67b2029a4449a8838c80545256d4c0 49 | msgid "Installation" 50 | msgstr "安装" 51 | 52 | #: ../../Qwen/source/training/llama_factory.rst:21 53 | #: 2b4a2d1e20c342948e987ce6abad71d0 54 | msgid "Before you start, make sure you have installed the following packages:" 55 | msgstr "开始之前,确保你已经安装了以下代码库:" 56 | 57 | #: ../../Qwen/source/training/llama_factory.rst:23 58 | #: 488203d62c3143f09325bbe587ef3f7a 59 | msgid "Follow the instructions of `LLaMA-Factory `__, and build the environment." 60 | msgstr "根据 `LLaMA-Factory `__ 官方指引构建好你的环境" 61 | 62 | #: ../../Qwen/source/training/llama_factory.rst:26 63 | #: 98fc755a8555428fbcf01b547bcc270f 64 | msgid "Install these packages (Optional):" 65 | msgstr "安装下列代码库(可选):" 66 | 67 | #: ../../Qwen/source/training/llama_factory.rst:33 68 | #: b4d6aa9134de4f35800126a4b71e7a72 69 | msgid "If you want to use `FlashAttention-2 `__, make sure your CUDA is 11.6 and above." 70 | msgstr "如你使用 `FlashAttention-2 `__ ,请确保你的CUDA版本在11.6以上。" 71 | 72 | #: ../../Qwen/source/training/llama_factory.rst:38 73 | #: 5859e0c6dbd24040b05778b6fdea052e 74 | msgid "Data Preparation" 75 | msgstr "准备数据" 76 | 77 | #: ../../Qwen/source/training/llama_factory.rst:40 78 | #: 6ffed6a5040d48238e5d10b3f984a73a 79 | msgid "LLaMA-Factory provides several training datasets in ``data`` folder, you can use it directly. If you are using a custom dataset, please prepare your dataset as follows." 80 | msgstr "LLaMA-Factory 在 ``data`` 文件夹中提供了多个训练数据集,您可以直接使用它们。如果您打算使用自定义数据集,请按照以下方式准备您的数据集。" 81 | 82 | #: ../../Qwen/source/training/llama_factory.rst:44 83 | #: b6d0fcedc5fc40e291255c528bc988fb 84 | msgid "Organize your data in a **json** file and put your data in ``data`` folder. LLaMA-Factory supports dataset in ``alpaca`` or ``sharegpt`` format." 85 | msgstr "请将您的数据以 ``json`` 格式进行组织,并将数据放入 data 文件夹中。LLaMA-Factory 支持以 ``alpaca`` 或 ``sharegpt`` 格式的数据集。" 86 | 87 | #: ../../Qwen/source/training/llama_factory.rst:48 88 | #: 2f73d4edea9044a8bd42c9e4e25e992c 89 | msgid "The dataset in ``alpaca`` format should follow the below format:" 90 | msgstr "``alpaca`` 格式的数据集应遵循以下格式:" 91 | 92 | #: ../../Qwen/source/training/llama_factory.rst:65 93 | #: 0669bfde81294b459125997c0a6e8257 94 | msgid "The dataset in ``sharegpt`` format should follow the below format:" 95 | msgstr "``sharegpt`` 格式的数据集应遵循以下格式:" 96 | 97 | #: ../../Qwen/source/training/llama_factory.rst:86 98 | #: f1749224279f40bb8b6a3adf517af147 99 | msgid "Provide your dataset definition in ``data/dataset_info.json`` in the following format ." 
100 | msgstr "在 ``data/dataset_info.json`` 文件中提供您的数据集定义,并采用以下格式:" 101 | 102 | #: ../../Qwen/source/training/llama_factory.rst:89 103 | #: a7f285a82bbd495ab10b76fb5a2be6fb 104 | msgid "For ``alpaca`` format dataset, the columns in ``dataset_info.json`` should be:" 105 | msgstr "对于 ``alpaca`` 格式的数据集,其 ``dataset_info.json`` 文件中的列应为:" 106 | 107 | #: ../../Qwen/source/training/llama_factory.rst:105 108 | #: f90f0ba55b93436c9a096afa85489698 109 | msgid "For ``sharegpt`` format dataset, the columns in ``dataset_info.json`` should be:" 110 | msgstr "对于 ``sharegpt`` 格式的数据集,``dataset_info.json`` 文件中的列应该包括:" 111 | 112 | #: ../../Qwen/source/training/llama_factory.rst:127 113 | #: f91215519e61450c9c4c245beb4d26d6 114 | msgid "Training" 115 | msgstr "训练" 116 | 117 | #: ../../Qwen/source/training/llama_factory.rst:129 118 | #: 1624352503e24ceb927d2dba808df7ae 119 | msgid "Execute the following training command:" 120 | msgstr "执行下列命令:" 121 | 122 | #: ../../Qwen/source/training/llama_factory.rst:169 123 | #: 24676444d9cd42069f3bd760d3c5b0cd 124 | msgid "and enjoy the training process. To make changes to your training, you can modify the arguments in the training command to adjust the hyperparameters. One argument to note is ``cutoff_len``, which is the maximum length of the training data. Control this parameter to avoid OOM error." 125 | msgstr "并享受训练过程。若要调整您的训练,您可以通过修改训练命令中的参数来调整超参数。其中一个需要注意的参数是 ``cutoff_len`` ,它代表训练数据的最大长度。通过控制这个参数,可以避免出现OOM(内存溢出)错误。" 126 | 127 | #: ../../Qwen/source/training/llama_factory.rst:176 128 | #: bd1b02c65c5e4216bde43f8b1dd60ca6 129 | msgid "Merge LoRA" 130 | msgstr "合并LoRA" 131 | 132 | #: ../../Qwen/source/training/llama_factory.rst:178 133 | #: b581a862018f412db480ec68be1512fa 134 | msgid "If you train your model with LoRA, you probably need to merge adapter parameters to the main branch. Run the following command to perform the merging of LoRA adapters." 135 | msgstr "如果你使用 LoRA 训练模型,可能需要将adapter参数合并到主分支中。请运行以下命令以执行 LoRA adapter 的合并操作。" 136 | 137 | #: ../../Qwen/source/training/llama_factory.rst:194 138 | #: fee1bd0eca4b41e1bf3efa0d42ee401e 139 | msgid "Conclusion" 140 | msgstr "结语" 141 | 142 | #: ../../Qwen/source/training/llama_factory.rst:196 143 | #: cfb87dc99dbe4e85a304593333c2241d 144 | msgid "The above content is the simplest way to use LLaMA-Factory to train Qwen. Feel free to dive into the details by checking the official repo!" 145 | msgstr "上述内容是使用LLaMA-Factory训练Qwen的最简单方法。 欢迎通过查看官方仓库深入了解详细信息!" 146 | 147 | -------------------------------------------------------------------------------- /docs/locales/zh_CN/LC_MESSAGES/training/verl.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2024, Qwen Team 3 | # This file is distributed under the same license as the Qwen package. 4 | # FIRST AUTHOR , 2025. 
5 | # 6 | #, fuzzy 7 | msgid "" 8 | msgstr "" 9 | "Project-Id-Version: Qwen \n" 10 | "Report-Msgid-Bugs-To: \n" 11 | "POT-Creation-Date: 2025-05-07 12:34+0800\n" 12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 13 | "Last-Translator: FULL NAME \n" 14 | "Language: zh_CN\n" 15 | "Language-Team: zh_CN \n" 16 | "Plural-Forms: nplurals=1; plural=0;\n" 17 | "MIME-Version: 1.0\n" 18 | "Content-Type: text/plain; charset=utf-8\n" 19 | "Content-Transfer-Encoding: 8bit\n" 20 | "Generated-By: Babel 2.17.0\n" 21 | 22 | #: ../../source/training/verl.rst:2 937d32044f9d4ce685b5c0d297d2c48d 23 | msgid "verl" 24 | msgstr "" 25 | 26 | #: ../../source/training/verl.rst:4 11c0fdd831b444f28be5cce4fd8f8b38 27 | msgid "verl is a flexible, efficient and production-ready RL training library for large language models (LLMs)." 28 | msgstr "verl 是一个灵活、高效且被广泛使用的强化学习(RL)训练库,专为大型语言模型(LLM)设计。" 29 | 30 | #: ../../source/training/verl.rst:6 64a80c70bd8a49cbb40d4acd035c6212 31 | msgid "verl is the open-source version of `HybridFlow: A Flexible and Efficient RLHF Framework `__ paper." 32 | msgstr "verl 是论文 `HybridFlow: A Flexible and Efficient RLHF Framework `__ 的开源实现" 33 | 34 | #: ../../source/training/verl.rst:8 81aa9801baa54d72bef51c26925278d2 35 | msgid "GitHub repository: `verl `__" 36 | msgstr "仓库地址:`verl `__" 37 | 38 | #: ../../source/training/verl.rst:10 2080b3c322ac41299917a122b4826994 39 | msgid "verl is flexible and easy to use with:" 40 | msgstr "verl 的灵活性和易用性体现在以下几个方面:" 41 | 42 | #: ../../source/training/verl.rst:12 564fbddc57ae43e090e24fe6426d670a 43 | msgid "**Easy extension of diverse RL algorithms**: The hybrid-controller programming model enables flexible representation and efficient execution of complex Post-Training dataflows. Build RL dataflows such as GRPO, PPO in a few lines of code." 44 | msgstr "**支持多样化的强化学习算法扩展**:verl 采用混合编程模型,结合了单一控制器和多控制器的优势,能够灵活表示和高效执行复杂的后训练数据流。用户只需几行代码即可构建强化学习数据流,例如 PPO、GRPO 等。" 45 | 46 | #: ../../source/training/verl.rst:13 690f3983ad004db081baba2bc5d80d32 47 | msgid "**Seamless integration of existing LLM infra with modular APIs**: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as FSDP, Megatron-LM, vLLM, SGLang, etc" 48 | msgstr "**与现有大语言模型基础设施无缝集成**:verl 通过模块化 API 解耦计算和数据依赖,支持与 PyTorch FSDP、Megatron-LM、vLLM 等现有大语言模型框架无缝集成,且用户可以轻松扩展到其他训练和推理框架。" 49 | 50 | #: ../../source/training/verl.rst:14 323cbf0f7b034e3aadaa3b86e4680058 51 | msgid "**Flexible device mapping**: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes." 52 | msgstr "**灵活的设备映射和并行性**:verl 支持将模型放置到不同 GPU 集合上,以实现高效的资源利用和跨不同集群规模的可扩展性。" 53 | 54 | #: ../../source/training/verl.rst:15 e76a04d1c8aa491e8d6ae1d24d3b71d8 55 | msgid "**Ready integration with popular HuggingFace models**: verl supports popular LLM models, including Qwen, Llama, and more." 56 | msgstr "**与热门 HuggingFace 模型的及时集成**:verl 支持多种流行的 LLM 模型,包括 Qwen、Llama 等。" 57 | 58 | #: ../../source/training/verl.rst:17 acc32ec6e23248d1b3714424bcb868f5 59 | msgid "verl is fast with:" 60 | msgstr "verl 的高效性体现在以下几个方面:" 61 | 62 | #: ../../source/training/verl.rst:19 bdeb7205dbd04ab689ab15907fe8ced1 63 | msgid "**State-of-the-art throughput**: SOTA LLM training and inference engine integrations and SOTA RL throughput." 
64 | msgstr "**最高效的吞吐量**:verl 集成了最先进的 LLM 训练和推理引擎,并实现了最先进的强化学习(RL)吞吐量。" 65 | 66 | #: ../../source/training/verl.rst:21 314eba723e854d1ab020ec72a0251706 67 | msgid "**Efficient actor model resharding with 3D-HybridEngine**: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases." 68 | msgstr "**使用 3D-HybridEngine 实现高效的 Actor 模型分片**:消除内存冗余,并显著减少训练和生成阶段转换期间的通信开销。" 69 | 70 | #: ../../source/training/verl.rst:23 d76b21e9b42c414283609c3846c1e75c 71 | msgid "Next, we will introduce how to use verl for training Qwen3 models." 72 | msgstr "接下来,我们将介绍如何使用 verl 训练 Qwen3 模型。" 73 | 74 | #: ../../source/training/verl.rst:26 6924574168a841e39a4ef0dfb11e6439 75 | msgid "Reinforcement Learning (RL)" 76 | msgstr "强化学习(RL)" 77 | 78 | #: ../../source/training/verl.rst:28 5dae4a3e148042c3b9a2d42e967f7798 79 | msgid "Now, verl supports various combinations of training frameworks and inference frameworks, including FSDP, Megatron-LM, vLLM, SGLang, etc. verl also supports training with multiple algorithms such as PPO, GRPO, DAPO, etc." 80 | msgstr "现在,verl 支持多种训练框架和推理框架的组合,包括 FSDP、Megatron-LM、vLLM、SGLang 等。此外,verl 还支持使用多种算法进行训练,例如 PPO、GRPO、DAPO 等。" 81 | 82 | #: ../../source/training/verl.rst:31 5fe2ddcafab04734a5ce56358f213de2 83 | msgid "Step1: Environment and Training Preparation" 84 | msgstr "第一步:环境和训练准备" 85 | 86 | #: ../../source/training/verl.rst:33 f5bd55e1b12e4cf0a5c77959e30a2add 87 | msgid "You can follow verl's `installation guide `__ to complete the environment configuration." 88 | msgstr "你可以按照 verl 的 `安装指南 `__ 完成环境配置。" 89 | 90 | #: ../../source/training/verl.rst:35 c5a07c1e71a44be699c2628b0af66fa7 91 | msgid "Data preparation can be done by running the following command:" 92 | msgstr "数据准备可以通过运行以下命令完成:" 93 | 94 | #: ../../source/training/verl.rst:43 e626176f3660473ba8a834ba471135eb 95 | msgid "Model download can be done using the following command:" 96 | msgstr "模型下载可以使用以下命令完成:" 97 | 98 | #: ../../source/training/verl.rst:51 fda85ea40df948188f0b765379c5f5b0 99 | msgid "Step2: Start Training" 100 | msgstr "第二步:开始训练" 101 | 102 | #: ../../source/training/verl.rst:53 24994606b01b452abb1f3c487862bf57 103 | msgid "In verl, training frameworks and inference frameworks can be combined freely, as long as the training framework and inference framework themselves support model training and inference tasks, so that verl can support RL-related training." 104 | msgstr "在 verl 中,训练框架和推理框架可以自由组合,只要训练框架和推理框架本身支持模型训练和推理任务,verl 就能够支持与强化学习(RL)相关的训练。" 105 | 106 | #: ../../source/training/verl.rst:55 3f8ac764b49c42a2bd5a673461866371 107 | msgid "Below is an example using FSDP and vLLM to demonstrate how to train Qwen3 models in verl. We chose Qwen3-1.7B as the example, as it only requires a single 80GB GPU and a machine with more than 64GB of memory to start training." 108 | msgstr "以下是一个使用 FSDP 和 vLLM 的示例,展示如何在 verl 中训练 Qwen3 模型。我们选择了Qwen3-1.7B作为例子,因为他仅需使用一张80GB显存的显卡,以及大于64G内存的机器即可开始训练。" 109 | 110 | #: ../../source/training/verl.rst:100 8b8ed7d9463c4a8086d5fe1a452b34e0 111 | msgid "Finally" 112 | msgstr "结束语" 113 | 114 | #: ../../source/training/verl.rst:102 294699cc5278422f85beebb282ad0c2a 115 | msgid "If you encounter any difficulties during use, please join the discussion at `GitHub `__." 
116 | msgstr "如果在使用过程中遇到任何困难,请在 `GitHub `__ 参与讨论。" 117 | 118 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | furo 2 | myst-parser==4.0.0 3 | sphinx<8,>4.5.0 4 | sphinx-copybutton 5 | sphinx-design>=0.6.0 6 | -------------------------------------------------------------------------------- /docs/source/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | html { 2 | font-size: 16px; 3 | } 4 | 5 | h1 { 6 | font-size: 1.75rem; 7 | line-height: 2.5rem; 8 | } 9 | 10 | h2 { 11 | font-size: 1.5rem; 12 | line-height: 2rem; 13 | } 14 | 15 | h3 { 16 | font-size: 1.25rem; 17 | line-height: 1.75rem; 18 | } 19 | 20 | h4 { 21 | font-size: 1.125rem; 22 | line-height: 1.5rem; 23 | } 24 | 25 | h5 { 26 | font-size: 1rem; 27 | } 28 | 29 | h6 { 30 | font-size: 0.75rem; 31 | } 32 | 33 | h1, 34 | h2, 35 | h3, 36 | h4, 37 | h5, 38 | h6 { 39 | margin-top: 1.875rem; 40 | margin-bottom: 1rem; 41 | } 42 | 43 | p strong { 44 | font-weight: 500; 45 | } 46 | 47 | p:target { 48 | background-color: var(--color-highlight-on-target); 49 | } 50 | 51 | details.sd-dropdown summary.sd-summary-title { 52 | flex-direction: row-reverse; 53 | font-weight: 500; 54 | padding-left: 0; 55 | } 56 | 57 | details.sd-dropdown summary.sd-summary-title code.literal { 58 | font-weight: bolder; 59 | filter: brightness(95%); 60 | } 61 | 62 | details.sd-dropdown summary.sd-summary-title span.sd-summary-state-marker { 63 | padding-left: 0.5em; 64 | padding-right: 0.5em 65 | } 66 | 67 | details.sd-dropdown div.sd-summary-content { 68 | padding-left: 2.5em; 69 | } 70 | 71 | pre.terminal { 72 | font-size: 12px !important; 73 | line-height: 16px; 74 | background-color: black; 75 | color: white; 76 | padding: .5em; 77 | text-wrap: wrap; 78 | word-break: break-all; 79 | } 80 | 81 | pre.terminal span.system { 82 | color: greenyellow 83 | } 84 | 85 | pre.terminal span.user { 86 | color: yellowgreen 87 | } -------------------------------------------------------------------------------- /docs/source/_static/design-tabs.js: -------------------------------------------------------------------------------- 1 | // @ts-check 2 | 3 | // Extra JS capability for selected tabs to be synced 4 | // The selection is stored in local storage so that it persists across page loads. 
5 | 6 | /** 7 | * @type {Record} 8 | */ 9 | let sd_id_to_elements = {}; 10 | const storageKeyPrefix = "sphinx-design-tab-id-"; 11 | 12 | /** 13 | * Create a key for a tab element. 14 | * @param {HTMLElement} el - The tab element. 15 | * @returns {[string, string, string] | null} - The key. 16 | * 17 | */ 18 | function create_key(el) { 19 | let syncId = el.getAttribute("data-sync-id"); 20 | let syncGroup = el.getAttribute("data-sync-group"); 21 | if (!syncId || !syncGroup) return null; 22 | return [syncGroup, syncId, syncGroup + "--" + syncId]; 23 | } 24 | 25 | /** 26 | * Initialize the tab selection. 27 | * 28 | */ 29 | function ready() { 30 | // Find all tabs with sync data 31 | 32 | /** @type {string[]} */ 33 | let groups = []; 34 | 35 | document.querySelectorAll(".sd-tab-label").forEach((label) => { 36 | if (label instanceof HTMLElement) { 37 | let data = create_key(label); 38 | if (data) { 39 | let [group, id, key] = data; 40 | 41 | // add click event listener 42 | // @ts-ignore 43 | label.onclick = onSDLabelClick; 44 | 45 | // store map of key to elements 46 | if (!sd_id_to_elements[key]) { 47 | sd_id_to_elements[key] = []; 48 | } 49 | sd_id_to_elements[key].push(label); 50 | 51 | if (groups.indexOf(group) === -1) { 52 | groups.push(group); 53 | // Check if a specific tab has been selected via URL parameter 54 | const tabParam = new URLSearchParams(window.location.search).get( 55 | group 56 | ); 57 | if (tabParam) { 58 | console.log( 59 | "sphinx-design: Selecting tab id for group '" + 60 | group + 61 | "' from URL parameter: " + 62 | tabParam 63 | ); 64 | window.sessionStorage.setItem(storageKeyPrefix + group, tabParam); 65 | } 66 | } 67 | 68 | // Check is a specific tab has been selected previously 69 | let previousId = window.sessionStorage.getItem( 70 | storageKeyPrefix + group 71 | ); 72 | if (previousId === id) { 73 | // console.log( 74 | // "sphinx-design: Selecting tab from session storage: " + id 75 | // ); 76 | // @ts-ignore 77 | label.previousElementSibling.checked = true; 78 | } 79 | } 80 | } 81 | }); 82 | } 83 | 84 | /** 85 | * Activate other tabs with the same sync id. 86 | * 87 | * @this {HTMLElement} - The element that was clicked. 
88 | */ 89 | function onSDLabelClick() { 90 | let data = create_key(this); 91 | if (!data) return; 92 | const top = this.parentElement?.offsetTop || 0; 93 | console.log(top); 94 | let [group, id, key] = data; 95 | for (const label of sd_id_to_elements[key]) { 96 | if (label === this) continue; 97 | // @ts-ignore 98 | label.previousElementSibling.checked = true; 99 | } 100 | const diff = (this.parentElement?.offsetTop || 0) - top; 101 | if (diff !== 0) { 102 | window.scrollBy({ left: 0, top: diff, behavior: "instant" }); 103 | } 104 | window.sessionStorage.setItem(storageKeyPrefix + group, id); 105 | } 106 | 107 | document.addEventListener("DOMContentLoaded", ready, false); 108 | -------------------------------------------------------------------------------- /docs/source/assets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen3/73027650806c879079b2601dfa7446655d5745af/docs/source/assets/.DS_Store -------------------------------------------------------------------------------- /docs/source/assets/qwen-openllm-ui-demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen3/73027650806c879079b2601dfa7446655d5745af/docs/source/assets/qwen-openllm-ui-demo.png -------------------------------------------------------------------------------- /docs/source/assets/qwen3_nonthinking.jinja: -------------------------------------------------------------------------------- 1 | {%- if tools %} 2 | {{- '<|im_start|>system\n' }} 3 | {%- if messages[0].role == 'system' %} 4 | {{- messages[0].content + '\n\n' }} 5 | {%- endif %} 6 | {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} 7 | {%- for tool in tools %} 8 | {{- "\n" }} 9 | {{- tool | tojson }} 10 | {%- endfor %} 11 | {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} 12 | {%- else %} 13 | {%- if messages[0].role == 'system' %} 14 | {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} 15 | {%- endif %} 16 | {%- endif %} 17 | {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} 18 | {%- for message in messages[::-1] %} 19 | {%- set index = (messages|length - 1) - loop.index0 %} 20 | {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} 21 | {%- set ns.multi_step_tool = false %} 22 | {%- set ns.last_query_index = index %} 23 | {%- endif %} 24 | {%- endfor %} 25 | {%- for message in messages %} 26 | {%- if message.content is string %} 27 | {%- set content = message.content %} 28 | {%- else %} 29 | {%- set content = '' %} 30 | {%- endif %} 31 | {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} 32 | {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} 33 | {%- elif message.role == "assistant" %} 34 | {%- set reasoning_content = '' %} 35 | {%- if message.reasoning_content is string %} 36 | {%- set reasoning_content = message.reasoning_content %} 37 | {%- else %} 38 | {%- if '' in content %} 39 | {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} 40 | {%- set content = content.split('')[-1].lstrip('\n') %} 41 | {%- endif %} 42 | {%- endif %} 43 | {%- 
if loop.index0 > ns.last_query_index %} 44 | {%- if loop.last or (not loop.last and reasoning_content) %} 45 | {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} 46 | {%- else %} 47 | {{- '<|im_start|>' + message.role + '\n' + content }} 48 | {%- endif %} 49 | {%- else %} 50 | {{- '<|im_start|>' + message.role + '\n' + content }} 51 | {%- endif %} 52 | {%- if message.tool_calls %} 53 | {%- for tool_call in message.tool_calls %} 54 | {%- if (loop.first and content) or (not loop.first) %} 55 | {{- '\n' }} 56 | {%- endif %} 57 | {%- if tool_call.function %} 58 | {%- set tool_call = tool_call.function %} 59 | {%- endif %} 60 | {{- '\n{"name": "' }} 61 | {{- tool_call.name }} 62 | {{- '", "arguments": ' }} 63 | {%- if tool_call.arguments is string %} 64 | {{- tool_call.arguments }} 65 | {%- else %} 66 | {{- tool_call.arguments | tojson }} 67 | {%- endif %} 68 | {{- '}\n' }} 69 | {%- endfor %} 70 | {%- endif %} 71 | {{- '<|im_end|>\n' }} 72 | {%- elif message.role == "tool" %} 73 | {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} 74 | {{- '<|im_start|>user' }} 75 | {%- endif %} 76 | {{- '\n\n' }} 77 | {{- content }} 78 | {{- '\n' }} 79 | {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} 80 | {{- '<|im_end|>\n' }} 81 | {%- endif %} 82 | {%- endif %} 83 | {%- endfor %} 84 | {%- if add_generation_prompt %} 85 | {{- '<|im_start|>assistant\n\n\n\n\n' }} 86 | {%- endif %} -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | 13 | import sys 14 | from sphinx.ext import autodoc 15 | import logging 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "Qwen" 23 | copyright = "2024, Qwen Team" 24 | author = "Qwen Team" 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | "sphinx.ext.napoleon", 33 | "sphinx.ext.viewcode", 34 | "sphinx.ext.intersphinx", 35 | # "sphinx_copybutton", 36 | "sphinx.ext.autodoc", 37 | "sphinx.ext.autosummary", 38 | "myst_parser", 39 | "sphinx_design", 40 | ] 41 | 42 | myst_enable_extensions = ["colon_fence", "attrs_block", "attrs_inline", "fieldlist"] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ["_templates"] 46 | 47 | # List of patterns, relative to source directory, that match files and 48 | # directories to ignore when looking for source files. 49 | # This pattern also affects html_static_path and html_extra_path. 
50 | exclude_patterns = [] 51 | 52 | # Exclude the prompt "$" when copying code 53 | copybutton_prompt_text = r"\$ " 54 | copybutton_prompt_is_regexp = True 55 | 56 | # -- Options for HTML output ------------------------------------------------- 57 | 58 | # The theme to use for HTML and HTML Help pages. See the documentation for 59 | # a list of builtin themes. 60 | # 61 | html_title = project 62 | html_theme = "furo" 63 | # html_logo = 'assets/logo/qwen.png' 64 | # html_theme_options = { 65 | # 'path_to_docs': 'docs/source', 66 | # 'repository_url': 'https://github.com/QwenLM/Qwen2', 67 | # # 'use_repository_button': True, 68 | # } 69 | html_sidebars = { 70 | "**": [ 71 | "sidebar/scroll-start.html", 72 | "sidebar/brand.html", 73 | "sidebar/navigation.html", 74 | "sidebar/ethical-ads.html", 75 | "sidebar/scroll-end.html", 76 | ] 77 | } 78 | 79 | # multi-language docs 80 | language = "en" 81 | locale_dirs = ["../locales/"] # path is example but recommended. 82 | gettext_compact = False # optional. 83 | gettext_uuid = True # optional. 84 | 85 | # Add any paths that contain custom static files (such as style sheets) here, 86 | # relative to this directory. They are copied after the builtin static files, 87 | # so a file named "default.css" will overwrite the builtin "default.css". 88 | html_static_path = ["_static"] 89 | html_css_files = [ 90 | "css/custom.css", 91 | ] 92 | # FIXME: figure out why this file is not copied 93 | html_js_files = [ 94 | "design-tabs.js", 95 | ] 96 | 97 | # Mock out external dependencies here. 98 | autodoc_mock_imports = ["torch", "transformers"] 99 | 100 | for mock_target in autodoc_mock_imports: 101 | if mock_target in sys.modules: 102 | logger.info( 103 | f"Potentially problematic mock target ({mock_target}) found; " 104 | "autodoc_mock_imports cannot mock modules that have already " 105 | "been loaded into sys.modules when the sphinx build starts." 106 | ) 107 | 108 | 109 | class MockedClassDocumenter(autodoc.ClassDocumenter): 110 | """Remove note about base class when a class is derived from object.""" 111 | 112 | def add_line(self, line: str, source: str, *lineno: int) -> None: 113 | if line == " Bases: :py:class:`object`": 114 | return 115 | super().add_line(line, source, *lineno) 116 | 117 | 118 | autodoc.ClassDocumenter = MockedClassDocumenter 119 | 120 | navigation_with_keys = False 121 | -------------------------------------------------------------------------------- /docs/source/deployment/openllm.rst: -------------------------------------------------------------------------------- 1 | OpenLLM 2 | ======= 3 | 4 | .. attention:: 5 | To be updated for Qwen3. 6 | 7 | OpenLLM allows developers to run Qwen2.5 models of different sizes as OpenAI-compatible APIs with a single command. It features a built-in chat UI, state-of-the-art inference backends, and a simplified workflow for creating enterprise-grade cloud deployment with Qwen2.5. Visit `the OpenLLM repository `_ to learn more. 8 | 9 | Installation 10 | ------------ 11 | 12 | Install OpenLLM using ``pip``. 13 | 14 | .. code:: bash 15 | 16 | pip install openllm 17 | 18 | Verify the installation and display the help information: 19 | 20 | .. code:: bash 21 | 22 | openllm --help 23 | 24 | Quickstart 25 | ---------- 26 | 27 | Before you run any Qwen2.5 model, ensure your model repository is up to date by syncing it with OpenLLM's latest official repository. 28 | 29 | .. code:: bash 30 | 31 | openllm repo update 32 | 33 | List the supported Qwen2.5 models: 34 | 35 | .. 
code:: bash 36 | 37 | openllm model list --tag qwen2.5 38 | 39 | The results also display the required GPU resources and supported platforms: 40 | 41 | .. code:: bash 42 | 43 | model version repo required GPU RAM platforms 44 | ------- --------------------- ------- ------------------ ----------- 45 | qwen2.5 qwen2.5:0.5b default 12G linux 46 | qwen2.5:1.5b default 12G linux 47 | qwen2.5:3b default 12G linux 48 | qwen2.5:7b default 24G linux 49 | qwen2.5:14b default 80G linux 50 | qwen2.5:14b-ggml-q4 default macos 51 | qwen2.5:14b-ggml-q8 default macos 52 | qwen2.5:32b default 80G linux 53 | qwen2.5:32b-ggml-fp16 default macos 54 | qwen2.5:72b default 80Gx2 linux 55 | qwen2.5:72b-ggml-q4 default macos 56 | 57 | To start a server with one of the models, use ``openllm serve`` like this: 58 | 59 | .. code:: bash 60 | 61 | openllm serve qwen2.5:7b 62 | 63 | By default, the server starts at ``http://localhost:3000/``. 64 | 65 | Interact with the model server 66 | ------------------------------ 67 | 68 | With the model server up and running, you can call its APIs in the following ways: 69 | 70 | .. tab-set:: 71 | 72 | .. tab-item:: CURL 73 | 74 | Send an HTTP request to its ``/generate`` endpoint via CURL: 75 | 76 | .. code-block:: bash 77 | 78 | curl -X 'POST' \ 79 | 'http://localhost:3000/api/generate' \ 80 | -H 'accept: text/event-stream' \ 81 | -H 'Content-Type: application/json' \ 82 | -d '{ 83 | "prompt": "Tell me something about large language models.", 84 | "model": "Qwen/Qwen2.5-7B-Instruct", 85 | "max_tokens": 2048, 86 | "stop": null 87 | }' 88 | 89 | .. tab-item:: Python client 90 | 91 | Call the OpenAI-compatible endpoints with frameworks and tools that support the OpenAI API protocol. Here is an example: 92 | 93 | .. code-block:: python 94 | 95 | from openai import OpenAI 96 | 97 | client = OpenAI(base_url='http://localhost:3000/v1', api_key='na') 98 | 99 | # Use the following func to get the available models 100 | # model_list = client.models.list() 101 | # print(model_list) 102 | 103 | chat_completion = client.chat.completions.create( 104 | model="Qwen/Qwen2.5-7B-Instruct", 105 | messages=[ 106 | { 107 | "role": "user", 108 | "content": "Tell me something about large language models." 109 | } 110 | ], 111 | stream=True, 112 | ) 113 | for chunk in chat_completion: 114 | print(chunk.choices[0].delta.content or "", end="") 115 | 116 | .. tab-item:: Chat UI 117 | 118 | OpenLLM provides a chat UI at the ``/chat`` endpoint for the LLM server at http://localhost:3000/chat. 119 | 120 | .. image:: ../../source/assets/qwen-openllm-ui-demo.png 121 | 122 | Model repository 123 | ---------------- 124 | 125 | A model repository in OpenLLM represents a catalog of available LLMs. You can add your own repository to OpenLLM with custom Qwen2.5 variants for your specific needs. See our `documentation to learn details `_. -------------------------------------------------------------------------------- /docs/source/deployment/skypilot.rst: -------------------------------------------------------------------------------- 1 | SkyPilot 2 | ======== 3 | 4 | .. attention:: 5 | To be updated for Qwen3. 6 | 7 | What is SkyPilot 8 | ---------------- 9 | 10 | SkyPilot is a framework for running LLMs, AI, and batch jobs on any 11 | cloud, offering maximum cost savings, the highest GPU availability, and 12 | managed execution. Its features include: 13 | 14 | - Get the best GPU availability by utilizing multiple resources pools 15 | across multiple regions and clouds. 
16 | - Pay absolute minimum — SkyPilot picks the cheapest resources across 17 | regions and clouds. No managed solution markups. 18 | - Scale up to multiple replicas across different locations and 19 | accelerators, all served with a single endpoint 20 | - Everything stays in your cloud account (your VMs & buckets) 21 | - Completely private - no one else sees your chat history 22 | 23 | Install SkyPilot 24 | ---------------- 25 | 26 | We advise you to follow the 27 | `instruction `__ 28 | to install SkyPilot. Here we provide a simple example of using ``pip`` 29 | for the installation as shown below. 30 | 31 | .. code:: bash 32 | 33 | # You can use any of the following clouds that you have access to: 34 | # aws, gcp, azure, oci, lamabda, runpod, fluidstack, paperspace, 35 | # cudo, ibm, scp, vsphere, kubernetes 36 | pip install "skypilot-nightly[aws,gcp]" 37 | 38 | After that, you need to verify cloud access with a command like: 39 | 40 | .. code:: bash 41 | 42 | sky check 43 | 44 | For more information, check the `official document `__ and see if you have 45 | set up your cloud accounts correctly. 46 | 47 | Alternatively, you can also use the official docker image with SkyPilot 48 | master branch automatically cloned by running: 49 | 50 | .. code:: bash 51 | 52 | # NOTE: '--platform linux/amd64' is needed for Apple Silicon Macs 53 | docker run --platform linux/amd64 \ 54 | -td --rm --name sky \ 55 | -v "$HOME/.sky:/root/.sky:rw" \ 56 | -v "$HOME/.aws:/root/.aws:rw" \ 57 | -v "$HOME/.config/gcloud:/root/.config/gcloud:rw" \ 58 | berkeleyskypilot/skypilot-nightly 59 | 60 | docker exec -it sky /bin/bash 61 | 62 | Running Qwen2.5-72B-Instruct with SkyPilot 63 | ------------------------------------------ 64 | 65 | 1. Start serving Qwen2.5-72B-Instruct on a single instance with any 66 | available GPU in the list specified in 67 | `serve-72b.yaml `__ 68 | with a vLLM-powered OpenAI-compatible endpoint: 69 | 70 | .. code:: bash 71 | 72 | sky launch -c qwen serve-72b.yaml 73 | 74 | **Before launching, make sure you have changed Qwen/Qwen2-72B-Instruct to Qwen/Qwen2.5-72B-Instruct in the YAML file.** 75 | 76 | 2. Send a request to the endpoint for completion: 77 | 78 | .. code:: bash 79 | 80 | IP=$(sky status --ip qwen) 81 | 82 | curl -L http://$IP:8000/v1/completions \ 83 | -H "Content-Type: application/json" \ 84 | -d '{ 85 | "model": "Qwen/Qwen2.5-72B-Instruct", 86 | "prompt": "My favorite food is", 87 | "max_tokens": 512 88 | }' | jq -r '.choices[0].text' 89 | 90 | 3. Send a request for chat completion: 91 | 92 | .. code:: bash 93 | 94 | curl -L http://$IP:8000/v1/chat/completions \ 95 | -H "Content-Type: application/json" \ 96 | -d '{ 97 | "model": "Qwen/Qwen2.5-72B-Instruct", 98 | "messages": [ 99 | { 100 | "role": "system", 101 | "content": "You are Qwen, created by Alibaba Cloud. You are a helpful and honest chat expert." 102 | }, 103 | { 104 | "role": "user", 105 | "content": "What is the best food?" 106 | } 107 | ], 108 | "max_tokens": 512 109 | }' | jq -r '.choices[0].message.content' 110 | 111 | Scale up the service with SkyPilot Serve 112 | ---------------------------------------- 113 | 114 | 1. With `SkyPilot 115 | Serve `__, 116 | a serving library built on top of SkyPilot, scaling up the Qwen 117 | service is as simple as running: 118 | 119 | .. 
code:: bash 120 | 121 | sky serve up -n qwen ./serve-72b.yaml 122 | 123 | **Before launching, make sure you have changed Qwen/Qwen2-72B-Instruct to Qwen/Qwen2.5-72B-Instruct in the YAML file.** 124 | 125 | This will start the service with multiple replicas on the cheapest 126 | available locations and accelerators. SkyServe will automatically manage 127 | the replicas, monitor their health, autoscale based on load, and restart 128 | them when needed. 129 | 130 | A single endpoint will be returned and any request sent to the endpoint 131 | will be routed to the ready replicas. 132 | 133 | 2. To check the status of the service, run: 134 | 135 | .. code:: bash 136 | 137 | sky serve status qwen 138 | 139 | After a while, you will see the following output: 140 | 141 | :: 142 | 143 | Services 144 | NAME VERSION UPTIME STATUS REPLICAS ENDPOINT 145 | Qwen 1 - READY 2/2 3.85.107.228:30002 146 | 147 | Service Replicas 148 | SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION 149 | Qwen 1 1 - 2 mins ago 1x Azure({'A100-80GB': 8}) READY eastus 150 | Qwen 2 1 - 2 mins ago 1x GCP({'L4': 8}) READY us-east4-a 151 | 152 | As shown, the service is now backed by 2 replicas, one on Azure and one 153 | on GCP, and the accelerator type is chosen to be **the cheapest 154 | available one** on the clouds. That said, it maximizes the availability 155 | of the service while minimizing the cost. 156 | 157 | 3. To access the model, we use a ``curl -L`` command (``-L`` to follow 158 | redirect) to send the request to the endpoint: 159 | 160 | .. code:: bash 161 | 162 | ENDPOINT=$(sky serve status --endpoint qwen) 163 | 164 | curl -L http://$ENDPOINT/v1/chat/completions \ 165 | -H "Content-Type: application/json" \ 166 | -d '{ 167 | "model": "Qwen/Qwen2.5-72B-Instruct", 168 | "messages": [ 169 | { 170 | "role": "system", 171 | "content": "You are Qwen, created by Alibaba Cloud. You are a helpful and honest code assistant expert in Python." 172 | }, 173 | { 174 | "role": "user", 175 | "content": "Show me the python code for quick sorting a list of integers." 176 | } 177 | ], 178 | "max_tokens": 512 179 | }' | jq -r '.choices[0].message.content' 180 | 181 | Accessing Qwen2.5 with Chat GUI 182 | --------------------------------------------- 183 | 184 | It is also possible to access the Qwen2.5 service with GUI by connecting a 185 | `FastChat GUI server `__ to the endpoint launched 186 | above (see `gui.yaml `__). 187 | 188 | 1. Start the Chat Web UI: 189 | 190 | .. code:: bash 191 | 192 | sky launch -c qwen-gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint qwen) 193 | 194 | **Before launching, make sure you have changed Qwen/Qwen1.5-72B-Chat to Qwen/Qwen2.5-72B-Instruct in the YAML file.** 195 | 196 | 2. Then, we can access the GUI at the returned gradio link: 197 | 198 | :: 199 | 200 | | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live 201 | 202 | Note that you may get better results by using a different temperature and top_p value. 203 | 204 | Summary 205 | ------- 206 | 207 | With SkyPilot, it is easy for you to deploy Qwen2.5 on any cloud. We 208 | advise you to read the official doc for more usages and updates. 209 | Check `this `__ out! 210 | -------------------------------------------------------------------------------- /docs/source/deployment/tgi.rst: -------------------------------------------------------------------------------- 1 | TGI 2 | ===================== 3 | 4 | .. attention:: 5 | To be updated for Qwen3. 
6 | 7 | Hugging Face's Text Generation Inference (TGI) is a production-ready framework specifically designed for deploying and serving large language models (LLMs) for text generation tasks. It offers a seamless deployment experience, powered by a robust set of features: 8 | 9 | * `Speculative Decoding `_: Accelerates generation speeds. 10 | * `Tensor Parallelism`_: Enables efficient deployment across multiple GPUs. 11 | * `Token Streaming`_: Allows for the continuous generation of text. 12 | * Versatile Device Support: Works seamlessly with `AMD`_, `Gaudi`_ and `AWS Inferentia`_. 13 | 14 | .. _AMD: https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/deploy-your-model.html#serving-using-hugging-face-tgi 15 | .. _Gaudi: https://github.com/huggingface/tgi-gaudi 16 | .. _AWS Inferentia: https://aws.amazon.com/blogs/machine-learning/announcing-the-launch-of-new-hugging-face-llm-inference-containers-on-amazon-sagemaker/#:~:text=Get%20started%20with%20TGI%20on%20SageMaker%20Hosting 17 | .. _Tensor Parallelism: https://huggingface.co/docs/text-generation-inference/conceptual/tensor_parallelism 18 | .. _Token Streaming: https://huggingface.co/docs/text-generation-inference/conceptual/streaming 19 | 20 | Installation 21 | ----------------- 22 | 23 | The easiest way to use TGI is via the TGI docker image. In this guide, we show how to use TGI with docker. 24 | 25 | It's possible to run it locally via Conda or build locally. Please refer to `Installation Guide `_ and `CLI tool `_ for detailed instructions. 26 | 27 | Deploy Qwen2.5 with TGI 28 | ----------------------- 29 | 30 | 1. **Find a Qwen2.5 Model:** Choose a model from `the Qwen2.5 collection `_. 31 | 2. **Deployment Command:** Run the following command in your terminal, replacing ``model`` with your chosen Qwen2.5 model ID and ``volume`` with the path to your local data directory: 32 | 33 | .. code:: bash 34 | 35 | model=Qwen/Qwen2.5-7B-Instruct 36 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run 37 | 38 | docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model 39 | 40 | 41 | Using TGI API 42 | ------------- 43 | 44 | Once deployed, the model will be available on the mapped port (8080). 45 | 46 | TGI comes with a handy API for streaming response: 47 | 48 | .. code:: bash 49 | 50 | curl http://localhost:8080/generate_stream -H 'Content-Type: application/json' \ 51 | -d '{"inputs":"Tell me something about large language models.","parameters":{"max_new_tokens":512}}' 52 | 53 | 54 | It's also available on OpenAI style API: 55 | 56 | .. code:: bash 57 | 58 | curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ 59 | "model": "", 60 | "messages": [ 61 | {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."}, 62 | {"role": "user", "content": "Tell me something about large language models."} 63 | ], 64 | "temperature": 0.7, 65 | "top_p": 0.8, 66 | "repetition_penalty": 1.05, 67 | "max_tokens": 512 68 | }' 69 | 70 | 71 | .. note:: 72 | 73 | The model field in the JSON is not used by TGI, you can put anything. 74 | 75 | Refer to the `TGI Swagger UI `_ for a complete API reference. 76 | 77 | You can also use Python API: 78 | 79 | .. 
code:: python 80 | 81 | from openai import OpenAI 82 | 83 | # initialize the client but point it to TGI 84 | client = OpenAI( 85 | base_url="http://localhost:8080/v1/", # replace with your endpoint url 86 | api_key="", # this field is not used when running locally 87 | ) 88 | chat_completion = client.chat.completions.create( 89 | model="", # it is not used by TGI, you can put anything 90 | messages=[ 91 | {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."}, 92 | {"role": "user", "content": "Tell me something about large language models."}, 93 | ], 94 | stream=True, 95 | temperature=0.7, 96 | top_p=0.8, 97 | max_tokens=512, 98 | ) 99 | 100 | # iterate and print stream 101 | for message in chat_completion: 102 | print(message.choices[0].delta.content, end="") 103 | 104 | 105 | Quantization for Performance 106 | ---------------------------- 107 | 108 | 1. Data-dependent quantization (GPTQ and AWQ) 109 | 110 | Both GPTQ and AWQ models are data-dependent. The official quantized models can be found in `the Qwen2.5 collection`_ and you can also quantize models with your own dataset to make them perform better on your use case. 111 | 112 | The following shows the command to start TGI with Qwen2.5-7B-Instruct-GPTQ-Int4: 113 | 114 | .. code:: bash 115 | 116 | model=Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4 117 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run 118 | 119 | docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model --quantize gptq 120 | 121 | 122 | If the model is quantized with AWQ, e.g. Qwen/Qwen2.5-7B-Instruct-AWQ, please use ``--quantize awq``. 123 | 124 | 2. Data-agnostic quantization 125 | 126 | EETQ, on the other hand, is not data-dependent and can be used with any model. Note that we're passing in the original model (instead of a quantized model) with the ``--quantize eetq`` flag. 127 | 128 | .. code:: bash 129 | 130 | model=Qwen/Qwen2.5-7B-Instruct 131 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run 132 | 133 | docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model --quantize eetq 134 | 135 | 136 | 137 | Multi-Accelerators Deployment 138 | ----------------------------- 139 | 140 | Use the ``--num-shard`` flag to specify the number of accelerators. Please also use ``--shm-size 1g`` to enable shared memory for optimal NCCL performance (`reference `__): 141 | 142 | .. code:: bash 143 | 144 | model=Qwen/Qwen2.5-7B-Instruct 145 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run 146 | 147 | docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model --num-shard 2 148 | 149 | 150 | Speculative Decoding 151 | -------------------- 152 | 153 | Speculative decoding can reduce the time per token by speculating on the next token. Use the ``--speculate`` flag, setting the value to the number of tokens to speculate on (default: 0 for no speculation): 154 | 155 | 156 | .. 
code:: bash 157 | 158 | model=Qwen/Qwen2.5-7B-Instruct 159 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run 160 | 161 | docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model --speculate 2 162 | 163 | 164 | The overall performance of speculative decoding highly depends on the type of task. It works best for code or highly repetitive text. 165 | 166 | More context on speculative decoding can be found `here `__. 167 | 168 | 169 | Zero-Code Deployment with HF Inference Endpoints 170 | --------------------------------------------------- 171 | 172 | For effortless deployment, leverage Hugging Face Inference Endpoints: 173 | 174 | - **GUI interface:** ``__ 175 | - **Coding interface:** ``__ 176 | 177 | Once deployed, the endpoint can be used as usual. 178 | 179 | 180 | Common Issues 181 | ---------------- 182 | 183 | Qwen2.5 supports long context lengths, so carefully choose the values for ``--max-batch-prefill-tokens``, ``--max-total-tokens``, and ``--max-input-tokens`` to avoid potential out-of-memory (OOM) issues. If an OOM occurs, you'll receive an error message upon startup. The following shows an example to modify those parameters: 184 | 185 | .. code:: bash 186 | 187 | model=Qwen/Qwen2.5-7B-Instruct 188 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run 189 | 190 | docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model --max-batch-prefill-tokens 4096 --max-total-tokens 4096 --max-input-tokens 2048 -------------------------------------------------------------------------------- /docs/source/framework/LlamaIndex.rst: -------------------------------------------------------------------------------- 1 | LlamaIndex 2 | ========== 3 | 4 | .. attention:: 5 | To be updated for Qwen3. 6 | 7 | To connect Qwen2.5 with external data, such as documents, web pages, etc., we offer a tutorial on `LlamaIndex `__. 8 | This guide helps you quickly implement retrieval-augmented generation (RAG) using LlamaIndex with Qwen2.5. 9 | 10 | Preparation 11 | -------------------------------------- 12 | 13 | To implement RAG, 14 | we advise you to install the LlamaIndex-related packages first. 15 | 16 | The following is a simple code snippet showing how to do this: 17 | 18 | .. code:: bash 19 | 20 | pip install llama-index 21 | pip install llama-index-llms-huggingface 22 | pip install llama-index-readers-web 23 | 24 | Set Parameters 25 | -------------------------------------- 26 | 27 | Now we can set up LLM, embedding model, and the related configurations. 28 | Qwen2.5-Instruct supports conversations in multiple languages, including English and Chinese. 29 | You can use the ``bge-base-en-v1.5`` model to retrieve from English documents, and you can download the ``bge-base-zh-v1.5`` model to retrieve from Chinese documents. 30 | You can also choose ``bge-large`` or ``bge-small`` as the embedding model or modify the context window size or text chunk size depending on your computing resources. 31 | Qwen2.5 model families support a maximum of 32K context window size (up to 128K for 7B, 14B, 32B, and 72B, requiring extra configuration) 32 | 33 | .. 
code:: python 34 | 35 | import torch 36 | from llama_index.core import Settings 37 | from llama_index.core.node_parser import SentenceSplitter 38 | from llama_index.llms.huggingface import HuggingFaceLLM 39 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding 40 | 41 | # Set prompt template for generation (optional) 42 | from llama_index.core import PromptTemplate 43 | 44 | def completion_to_prompt(completion): 45 | return f"<|im_start|>system\n<|im_end|>\n<|im_start|>user\n{completion}<|im_end|>\n<|im_start|>assistant\n" 46 | 47 | def messages_to_prompt(messages): 48 | prompt = "" 49 | for message in messages: 50 | if message.role == "system": 51 | prompt += f"<|im_start|>system\n{message.content}<|im_end|>\n" 52 | elif message.role == "user": 53 | prompt += f"<|im_start|>user\n{message.content}<|im_end|>\n" 54 | elif message.role == "assistant": 55 | prompt += f"<|im_start|>assistant\n{message.content}<|im_end|>\n" 56 | 57 | if not prompt.startswith("<|im_start|>system"): 58 | prompt = "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n" + prompt 59 | 60 | prompt = prompt + "<|im_start|>assistant\n" 61 | 62 | return prompt 63 | 64 | # Set Qwen2.5 as the language model and set generation config 65 | Settings.llm = HuggingFaceLLM( 66 | model_name="Qwen/Qwen2.5-7B-Instruct", 67 | tokenizer_name="Qwen/Qwen2.5-7B-Instruct", 68 | context_window=30000, 69 | max_new_tokens=2000, 70 | generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95}, 71 | messages_to_prompt=messages_to_prompt, 72 | completion_to_prompt=completion_to_prompt, 73 | device_map="auto", 74 | ) 75 | 76 | # Set embedding model 77 | Settings.embed_model = HuggingFaceEmbedding( 78 | model_name = "BAAI/bge-base-en-v1.5" 79 | ) 80 | 81 | # Set the size of the text chunk for retrieval 82 | Settings.transformations = [SentenceSplitter(chunk_size=1024)] 83 | 84 | Build Index 85 | -------------------------------------- 86 | 87 | Now we can build index from documents or websites. 88 | 89 | The following code snippet demonstrates how to build an index for files (regardless of whether they are in PDF or TXT format) in a local folder named 'document'. 90 | 91 | .. code:: python 92 | 93 | from llama_index.core import VectorStoreIndex, SimpleDirectoryReader 94 | 95 | documents = SimpleDirectoryReader("./document").load_data() 96 | index = VectorStoreIndex.from_documents( 97 | documents, 98 | embed_model=Settings.embed_model, 99 | transformations=Settings.transformations 100 | ) 101 | 102 | The following code snippet demonstrates how to build an index for the content in a list of websites. 103 | 104 | .. code:: python 105 | 106 | from llama_index.readers.web import SimpleWebPageReader 107 | from llama_index.core import VectorStoreIndex, SimpleDirectoryReader 108 | 109 | documents = SimpleWebPageReader(html_to_text=True).load_data( 110 | ["web_address_1","web_address_2",...] 111 | ) 112 | index = VectorStoreIndex.from_documents( 113 | documents, 114 | embed_model=Settings.embed_model, 115 | transformations=Settings.transformations 116 | ) 117 | 118 | To save and load the index, you can use the following code snippet. 119 | 120 | .. 
code:: python 121 | 122 | from llama_index.core import StorageContext, load_index_from_storage 123 | 124 | # save index 125 | storage_context = StorageContext.from_defaults(persist_dir="save") 126 | 127 | # load index 128 | index = load_index_from_storage(storage_context) 129 | 130 | 131 | RAG 132 | ------------------- 133 | 134 | Now you can perform queries, and Qwen2.5 will answer based on the content of the indexed documents. 135 | 136 | .. code:: python 137 | 138 | query_engine = index.as_query_engine() 139 | your_query = "" 140 | print(query_engine.query(your_query).response) 141 | 142 | -------------------------------------------------------------------------------- /docs/source/framework/qwen_agent.rst: -------------------------------------------------------------------------------- 1 | Qwen-Agent 2 | ========== 3 | 4 | `Qwen-Agent `__ is a framework for 5 | developing LLM applications based on the instruction following, tool 6 | usage, planning, and memory capabilities of Qwen. 7 | 8 | This is a simple tutorial on using Qwen-Agent to quickly experience the 9 | agentic capabilities of Qwen3. For more detailed information, please 10 | refer to `Qwen-Agent `__ 11 | repository. 12 | 13 | Installation 14 | ------------ 15 | 16 | - Install the stable version from PyPI: 17 | 18 | .. code:: bash 19 | 20 | pip install -U "qwen-agent[gui,rag,code_interpreter,mcp]" 21 | # Or use `pip install -U qwen-agent` for the minimal requirements. 22 | # The optional requirements, specified in double brackets, are: 23 | # [gui] for Gradio-based GUI support; 24 | # [rag] for RAG support; 25 | # [code_interpreter] for Code Interpreter support; 26 | # [mcp] for MCP support. 27 | 28 | Developing Your Own Agent 29 | ------------------------- 30 | 31 | Qwen3 excels in tool calling capabilities. Qwen-Agent encapsulates 32 | tool-calling templates and tool-calling parsers internally, greatly 33 | reducing coding complexity. 34 | 35 | To define the available tools, you can use the MCP configuration file, 36 | use the integrated tool of Qwen-Agent, or integrate other tools by 37 | yourself. 38 | 39 | .. 
code:: python 40 | 41 | import os 42 | from qwen_agent.agents import Assistant 43 | 44 | # Define LLM 45 | llm_cfg = { 46 | # Use a custom endpoint compatible with OpenAI API by vLLM/SGLang: 47 | 'model': 'Qwen/Qwen3-32B', 48 | 'model_server': 'http://localhost:8000/v1', # api_base 49 | 'api_key': 'EMPTY', 50 | 51 | # 'generate_cfg': { 52 | # # When using vLLM/SGLang OAI API, pass the parameter of whether to enable thinking mode in this way 53 | # 'extra_body': { 54 | # 'chat_template_kwargs': {'enable_thinking': False} 55 | # }, 56 | # 57 | # # Add: When the content is `this is the thoughtthis is the answer` 58 | # # Do not add: When the response has been separated by reasoning_content and content 59 | # # This parameter will affect the parsing strategy of tool call 60 | # # 'thought_in_content': True, 61 | # }, 62 | } 63 | # llm_cfg = { 64 | # # Use the model service provided by DashScope: 65 | # 'model': 'qwen3-235b-a22b', 66 | # 'model_type': 'qwen_dashscope', 67 | # 68 | # # 'generate_cfg': { 69 | # # # When using the Dash Scope API, pass the parameter of whether to enable thinking mode in this way 70 | # # 'enable_thinking': False, 71 | # # }, 72 | # } 73 | # llm_cfg = { 74 | # # Use the OpenAI-compatible model service provided by DashScope: 75 | # 'model': 'qwen3-235b-a22b', 76 | # 'model_server': 'https://dashscope.aliyuncs.com/compatible-mode/v1', 77 | # 'api_key': os.getenv('DASHSCOPE_API_KEY'), 78 | # 79 | # # 'generate_cfg': { 80 | # # # When using Dash Scope OAI API, pass the parameter of whether to enable thinking mode in this way 81 | # # 'extra_body': { 82 | # # 'enable_thinking': False 83 | # # }, 84 | # # }, 85 | # } 86 | 87 | # Define Tools 88 | tools = [ 89 | {'mcpServers': { # You can specify the MCP configuration file 90 | 'time': { 91 | 'command': 'uvx', 92 | 'args': ['mcp-server-time', '--local-timezone=Asia/Shanghai'] 93 | }, 94 | "fetch": { 95 | "command": "uvx", 96 | "args": ["mcp-server-fetch"] 97 | } 98 | } 99 | }, 100 | 'code_interpreter', # Built-in tools 101 | ] 102 | 103 | # Define Agent 104 | bot = Assistant(llm=llm_cfg, function_list=tools) 105 | 106 | # Streaming generation 107 | messages = [{'role': 'user', 'content': 'https://qwenlm.github.io/blog/ Introduce the latest developments of Qwen'}] 108 | for responses in bot.run(messages=messages): 109 | pass 110 | print(responses) 111 | 112 | For more detailed examples and MCP cookbooks, please refer to 113 | `Qwen-Agent `__ repository. 114 | -------------------------------------------------------------------------------- /docs/source/getting_started/quantization_benchmark.rst: -------------------------------------------------------------------------------- 1 | Performance of Quantized Models 2 | ================================== 3 | 4 | .. attention:: 5 | To be updated for Qwen3. 6 | 7 | This section reports the generation performance of quantized 8 | models (including GPTQ and AWQ) of the Qwen2 series. Specifically, we 9 | report: 10 | 11 | * MMLU (Accuracy) 12 | * C-Eval (Accuracy) 13 | * IFEval (Strict Prompt-Level Accuracy) 14 | 15 | We use greedy decoding in evaluating all models. 
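As a rough illustration (a hypothetical snippet, not the actual evaluation harness; the model name and prompt are placeholders), greedy decoding in ``transformers`` amounts to disabling sampling so that generation is deterministic:

.. code:: python

   # Hypothetical example of greedy decoding; not the benchmark setup itself.
   from transformers import AutoModelForCausalLM, AutoTokenizer

   model_name = "Qwen/Qwen2-7B-Instruct"
   tokenizer = AutoTokenizer.from_pretrained(model_name)
   model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

   inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
   # do_sample=False selects the argmax token at every step, i.e. greedy decoding.
   outputs = model.generate(**inputs, do_sample=False, max_new_tokens=32)
   print(tokenizer.decode(outputs[0], skip_special_tokens=True))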
16 | 17 | +---------------------+--------------+---------+-------+--------+--------+ 18 | | | Quantization | Average | MMLU | C-Eval | IFEval | 19 | +=====================+==============+=========+=======+========+========+ 20 | | Qwen2-72B-Instruct | BF16 | 81.3 | 82.3 | 83.8 | 77.6 | 21 | + +--------------+---------+-------+--------+--------+ 22 | | | GPTQ-Int8 | 80.7 | 81.3 | 83.4 | 77.5 | 23 | + +--------------+---------+-------+--------+--------+ 24 | | | GPTQ-Int4 | 81.2 | 80.8 | 83.9 | 78.9 | 25 | + +--------------+---------+-------+--------+--------+ 26 | | | AWQ | 80.4 | 80.5 | 83.9 | 76.9 | 27 | +---------------------+--------------+---------+-------+--------+--------+ 28 | | Qwen2-7B-Instruct | BF16 | 66.9 | 70.5 | 77.2 | 53.1 | 29 | + +--------------+---------+-------+--------+--------+ 30 | | | GPTQ-Int8 | 66.2 | 69.1 | 76.7 | 52.9 | 31 | + +--------------+---------+-------+--------+--------+ 32 | | | GPTQ-Int4 | 64.1 | 67.8 | 75.2 | 49.4 | 33 | + +--------------+---------+-------+--------+--------+ 34 | | | AWQ | 64.1 | 67.4 | 73.6 | 51.4 | 35 | +---------------------+--------------+---------+-------+--------+--------+ 36 | | Qwen2-1.5B-Instruct | BF16 | 48.4 | 52.4 | 63.8 | 29.0 | 37 | + +--------------+---------+-------+--------+--------+ 38 | | | GPTQ-Int8 | 48.1 | 53.0 | 62.5 | 28.8 | 39 | + +--------------+---------+-------+--------+--------+ 40 | | | GPTQ-Int4 | 45.0 | 50.7 | 57.4 | 27.0 | 41 | + +--------------+---------+-------+--------+--------+ 42 | | | AWQ | 46.5 | 51.6 | 58.1 | 29.9 | 43 | +---------------------+--------------+---------+-------+--------+--------+ 44 | | Qwen2-0.5B-Instruct | BF16 | 34.4 | 37.9 | 45.2 | 20.0 | 45 | + +--------------+---------+-------+--------+--------+ 46 | | | GPTQ-Int8 | 32.6 | 35.6 | 43.9 | 18.1 | 47 | + +--------------+---------+-------+--------+--------+ 48 | | | GPTQ-Int4 | 29.7 | 33.0 | 39.2 | 16.8 | 49 | + +--------------+---------+-------+--------+--------+ 50 | | | AWQ | 31.1 | 34.4 | 42.1 | 16.7 | 51 | +---------------------+--------------+---------+-------+--------+--------+ 52 | 53 | -------------------------------------------------------------------------------- /docs/source/getting_started/quickstart.md: -------------------------------------------------------------------------------- 1 | # Quickstart 2 | 3 | This guide helps you quickly start using Qwen3. 4 | We provide examples of [Hugging Face Transformers](https://github.com/huggingface/transformers) as well as [ModelScope](https://github.com/modelscope/modelscope), and [vLLM](https://github.com/vllm-project/vllm) for deployment. 5 | 6 | You can find Qwen3 models in [the Qwen3 collection](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) at Hugging Face Hub and [the Qwen3 collection](https://www.modelscope.cn/collections/Qwen3-9743180bdc6b48) at ModelScope. 7 | 8 | ## Transformers 9 | 10 | To get a quick start with Qwen3, you can try the inference with `transformers` first. 11 | Make sure that you have installed `transformers>=4.51.0`. 12 | We advise you to use Python 3.10 or higher, and PyTorch 2.6 or higher. 
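As a minimal environment setup (only the `transformers` version requirement comes from this guide; `accelerate` is an assumption, needed for `device_map="auto"` in the snippet below), something like the following should work:

```bash
pip install "transformers>=4.51.0" accelerate
```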
13 | 14 | The following is a very simple code snippet showing how to run Qwen3-8B: 15 | 16 | ```python 17 | from transformers import AutoModelForCausalLM, AutoTokenizer 18 | 19 | model_name = "Qwen/Qwen3-8B" 20 | 21 | # load the tokenizer and the model 22 | model = AutoModelForCausalLM.from_pretrained( 23 | model_name, 24 | torch_dtype="auto", 25 | device_map="auto" 26 | ) 27 | tokenizer = AutoTokenizer.from_pretrained(model_name) 28 | 29 | # prepare the model input 30 | prompt = "Give me a short introduction to large language models." 31 | messages = [ 32 | {"role": "user", "content": prompt}, 33 | ] 34 | text = tokenizer.apply_chat_template( 35 | messages, 36 | tokenize=False, 37 | add_generation_prompt=True, 38 | enable_thinking=True, # Switches between thinking and non-thinking modes. Default is True. 39 | ) 40 | model_inputs = tokenizer([text], return_tensors="pt").to(model.device) 41 | 42 | # conduct text completion 43 | generated_ids = model.generate( 44 | **model_inputs, 45 | max_new_tokens=32768 46 | ) 47 | output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 48 | 49 | # parse thinking content 50 | try: 51 | # rindex finding 151668 (the `</think>` token) 52 | index = len(output_ids) - output_ids[::-1].index(151668) 53 | except ValueError: 54 | index = 0 55 | 56 | thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n") 57 | content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n") 58 | 59 | print("thinking content:", thinking_content) 60 | print("content:", content) 61 | ``` 62 | 63 | Qwen3 will think before responding, similar to the QwQ models. 64 | This means the model will use its reasoning abilities to enhance the quality of generated responses. 65 | The model will first generate thinking content wrapped in a `<think>...</think>` block, followed by the final response. 66 | 67 | - Hard Switch: 68 | To strictly disable the model's thinking behavior, aligning its functionality with the previous Qwen2.5-Instruct models, you can set `enable_thinking=False` when formatting the text. 69 | ```python 70 | text = tokenizer.apply_chat_template( 71 | messages, 72 | tokenize=False, 73 | add_generation_prompt=True, 74 | enable_thinking=False, # Setting enable_thinking=False disables thinking mode 75 | ) 76 | ``` 77 | This can be particularly useful in scenarios where disabling thinking is essential for efficiency. 78 | 79 | - Soft Switch: 80 | Qwen3 also understands the user's instructions on its thinking behavior, in particular the soft switches `/think` and `/no_think`. 81 | You can add them to user prompts or system messages to switch the model's thinking mode from turn to turn. 82 | The model will follow the most recent instruction in multi-turn conversations. 83 | 84 | :::{note} 85 | For thinking mode, use Temperature=0.6, TopP=0.95, TopK=20, and MinP=0 (the default setting in `generation_config.json`). 86 | DO NOT use greedy decoding, as it can lead to performance degradation and endless repetitions. 87 | For more detailed guidance, please refer to the Best Practices section. 88 | 89 | For non-thinking mode, we suggest using Temperature=0.7, TopP=0.8, TopK=20, and MinP=0. 90 | ::: 91 | 92 | 93 | ## ModelScope 94 | 95 | To tackle downloading issues, we advise you to try [ModelScope](https://github.com/modelscope/modelscope). 96 | Before starting, you need to install `modelscope` with `pip`. 97 | 98 | `modelscope` adopts a programmatic interface similar (but not identical) to `transformers`.
99 | For basic usage, you can simply change the first line of code above to the following: 100 | 101 | ```python 102 | from modelscope import AutoModelForCausalLM, AutoTokenizer 103 | ``` 104 | 105 | For more information, please refer to [the documentation of `modelscope`](https://www.modelscope.cn/docs). 106 | 107 | ## vLLM 108 | 109 | vLLM is a fast and easy-to-use framework for LLM inference and serving. 110 | In the following, we demonstrate how to build a OpenAI-API compatible API service with vLLM. 111 | 112 | First, make sure you have `vllm>=0.8.5` installed. 113 | 114 | Run the following code to build up a vLLM service. 115 | Here we take Qwen3-8B as an example: 116 | 117 | ```bash 118 | vllm serve Qwen/Qwen3-8B --enable-reasoning --reasoning-parser deepseek_r1 119 | ``` 120 | 121 | Then, you can use the [create chat interface](https://platform.openai.com/docs/api-reference/chat/completions/create) to communicate with Qwen: 122 | 123 | ::::{tab-set} 124 | 125 | :::{tab-item} curl 126 | ```shell 127 | curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ 128 | "model": "Qwen/Qwen3-8B", 129 | "messages": [ 130 | {"role": "user", "content": "Give me a short introduction to large language models."} 131 | ], 132 | "temperature": 0.6, 133 | "top_p": 0.95, 134 | "top_k": 20, 135 | "max_tokens": 32768 136 | }' 137 | ``` 138 | ::: 139 | 140 | :::{tab-item} Python 141 | You can use the API client with the `openai` Python SDK as shown below: 142 | 143 | ```python 144 | from openai import OpenAI 145 | # Set OpenAI's API key and API base to use vLLM's API server. 146 | openai_api_key = "EMPTY" 147 | openai_api_base = "http://localhost:8000/v1" 148 | 149 | client = OpenAI( 150 | api_key=openai_api_key, 151 | base_url=openai_api_base, 152 | ) 153 | 154 | chat_response = client.chat.completions.create( 155 | model="Qwen/Qwen3-8B", 156 | messages=[ 157 | {"role": "user", "content": "Give me a short introduction to large language models."}, 158 | ], 159 | max_tokens=32768, 160 | temperature=0.6, 161 | top_p=0.95, 162 | extra_body={ 163 | "top_k": 20, 164 | } 165 | ) 166 | print("Chat response:", chat_response) 167 | ``` 168 | :::: 169 | 170 | While the soft switch is always available, the hard switch is also available in vLLM through the following configuration to the API call. 171 | For more usage, please refer to [our document on vLLM](../deployment/vllm). 172 | 173 | 174 | ## Next Step 175 | 176 | Now, you can have fun with Qwen3 models. 177 | Would love to know more about its usage? 178 | Feel free to check other documents in this documentation. 179 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to Qwen! 2 | ================ 3 | 4 | .. figure:: https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/logo_qwen3.png 5 | :width: 60% 6 | :align: center 7 | :alt: Qwen3 8 | :class: no-scaled-link 9 | 10 | 11 | Qwen is the large language model and large multimodal model series of the Qwen Team, Alibaba Group. Both language models and multimodal models are pretrained on large-scale multilingual and multimodal data and post-trained on quality data for aligning to human preferences. 12 | Qwen is capable of natural language understanding, text generation, vision understanding, audio understanding, tool use, role play, playing as AI agent, etc. 
13 | 14 | The latest version, Qwen3, has the following features: 15 | 16 | - **Dense and Mixture-of-Experts (MoE) models**, available in 0.6B, 1.7B, 4B, 8B, 14B, 32B and 30B-A3B, 235B-A22B. 17 | - **Seamless switching between thinking mode** (for complex logical reasoning, math, and coding) and **non-thinking mode** (for efficient, general-purpose chat) **within a single model**, ensuring optimal performance across various scenarios. 18 | - **Significantly enhancement in reasoning capabilities**, surpassing previous QwQ (in thinking mode) and Qwen2.5 instruct models (in non-thinking mode) on mathematics, code generation, and commonsense logical reasoning. 19 | - **Superior human preference alignment**, excelling in creative writing, role-playing, multi-turn dialogues, and instruction following, to deliver a more natural, engaging, and immersive conversational experience. 20 | - **Expertise in agent capabilities**, enabling precise integration with external tools in both thinking and unthinking modes and achieving leading performance among open-source models in complex agent-based tasks. 21 | - **Support of 100+ languages and dialects** with strong capabilities for **multilingual instruction following** and **translation**. 22 | 23 | For more information, please visit our: 24 | 25 | * `Blog `__ 26 | * `GitHub `__ 27 | * `Hugging Face `__ 28 | * `ModelScope `__ 29 | * `Qwen3 Collection `__ 30 | 31 | Join our community by joining our `Discord `__ and `WeChat `__ group. We are looking forward to seeing you there! 32 | 33 | 34 | .. toctree:: 35 | :maxdepth: 1 36 | :caption: Getting Started 37 | :hidden: 38 | 39 | getting_started/quickstart 40 | getting_started/concepts 41 | getting_started/speed_benchmark 42 | getting_started/quantization_benchmark 43 | 44 | .. toctree:: 45 | :maxdepth: 1 46 | :caption: Inference 47 | :hidden: 48 | 49 | inference/transformers 50 | 51 | .. toctree:: 52 | :maxdepth: 1 53 | :caption: Run Locally 54 | :hidden: 55 | 56 | run_locally/llama.cpp 57 | run_locally/ollama 58 | run_locally/mlx-lm 59 | 60 | .. toctree:: 61 | :maxdepth: 1 62 | :caption: Deployment 63 | :hidden: 64 | 65 | deployment/sglang 66 | deployment/vllm 67 | deployment/tgi 68 | deployment/skypilot 69 | deployment/openllm 70 | 71 | .. toctree:: 72 | :maxdepth: 1 73 | :caption: Quantization 74 | :hidden: 75 | 76 | quantization/awq 77 | quantization/gptq 78 | quantization/llama.cpp 79 | 80 | .. toctree:: 81 | :maxdepth: 1 82 | :caption: Training 83 | :hidden: 84 | 85 | training/llama_factory 86 | training/ms_swift 87 | training/verl 88 | training/axolotl 89 | 90 | .. toctree:: 91 | :maxdepth: 1 92 | :caption: Framework 93 | :hidden: 94 | 95 | framework/function_call 96 | framework/qwen_agent 97 | framework/LlamaIndex 98 | framework/Langchain 99 | -------------------------------------------------------------------------------- /docs/source/quantization/awq.md: -------------------------------------------------------------------------------- 1 | # AWQ 2 | 3 | :::{attention} 4 | To be updated for Qwen3. 5 | ::: 6 | 7 | For quantized models, one of our recommendations is the usage of [AWQ](https://arxiv.org/abs/2306.00978) with [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). 8 | 9 | **AWQ** refers to Activation-aware Weight Quantization, a hardware-friendly approach for LLM low-bit weight-only quantization. 10 | 11 | **AutoAWQ** is an easy-to-use Python library for 4-bit quantized models. 12 | AutoAWQ speeds up models by 3x and reduces memory requirements by 3x compared to FP16. 
13 | AutoAWQ implements the Activation-aware Weight Quantization (AWQ) algorithm for quantizing LLMs. 14 | 15 | In this document, we show you how to use the quantized model with Hugging Face `transformers` and also how to quantize your own model. 16 | 17 | ## Usage of AWQ Models with Hugging Face transformers 18 | 19 | Now, `transformers` has officially supported AutoAWQ, which means that you can directly use the quantized model with `transformers`. 20 | The following is a very simple code snippet showing how to run `Qwen2.5-7B-Instruct-AWQ` with the quantized model: 21 | 22 | ```python 23 | from transformers import AutoModelForCausalLM, AutoTokenizer 24 | 25 | model_name = "Qwen/Qwen2.5-7B-Instruct-AWQ" 26 | 27 | model = AutoModelForCausalLM.from_pretrained( 28 | model_name, 29 | device_map="auto", 30 | ) 31 | tokenizer = AutoTokenizer.from_pretrained(model_name) 32 | 33 | prompt = "Give me a short introduction to large language models." 34 | messages = [ 35 | {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."}, 36 | {"role": "user", "content": prompt}, 37 | ] 38 | text = tokenizer.apply_chat_template( 39 | messages, 40 | tokenize=False, 41 | add_generation_prompt=True, 42 | ) 43 | model_inputs = tokenizer([text], return_tensors="pt").to(model.device) 44 | 45 | generated_ids = model.generate( 46 | **model_inputs, 47 | max_new_tokens=512, 48 | ) 49 | generated_ids = [ 50 | output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) 51 | ] 52 | 53 | response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] 54 | ``` 55 | 56 | ## Usage of AWQ Models with vLLM 57 | 58 | vLLM has supported AWQ, which means that you can directly use our provided AWQ models or those quantized with `AutoAWQ` with vLLM. 59 | We recommend using the latest version of vLLM (`vllm>=0.6.1`) which brings performance improvements to AWQ models; otherwise, the performance might not be well-optimized. 60 | 61 | Actually, the usage is the same with the basic usage of vLLM. 62 | We provide a simple example of how to launch OpenAI-API compatible API with vLLM and `Qwen2.5-7B-Instruct-AWQ`: 63 | 64 | Run the following in a shell to start an OpenAI-compatible API service: 65 | 66 | ```bash 67 | vllm serve Qwen/Qwen2.5-7B-Instruct-AWQ 68 | ``` 69 | 70 | Then, you can call the API as 71 | 72 | ```bash 73 | curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ 74 | "model": "Qwen/Qwen2.5-7B-Instruct-AWQ", 75 | "messages": [ 76 | {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."}, 77 | {"role": "user", "content": "Tell me something about large language models."} 78 | ], 79 | "temperature": 0.7, 80 | "top_p": 0.8, 81 | "repetition_penalty": 1.05, 82 | "max_tokens": 512 83 | }' 84 | ``` 85 | 86 | or you can use the API client with the `openai` Python package as shown below: 87 | 88 | ```python 89 | from openai import OpenAI 90 | 91 | openai_api_key = "EMPTY" 92 | openai_api_base = "http://localhost:8000/v1" 93 | 94 | client = OpenAI( 95 | api_key=openai_api_key, 96 | base_url=openai_api_base, 97 | ) 98 | 99 | chat_response = client.chat.completions.create( 100 | model="Qwen/Qwen2.5-7B-Instruct-AWQ", 101 | messages=[ 102 | {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. 
You are a helpful assistant."}, 103 | {"role": "user", "content": "Tell me something about large language models."}, 104 | ], 105 | temperature=0.7, 106 | top_p=0.8, 107 | max_tokens=512, 108 | extra_body={ 109 | "repetition_penalty": 1.05, 110 | }, 111 | ) 112 | print("Chat response:", chat_response) 113 | ``` 114 | 115 | ## Quantize Your Own Model with AutoAWQ 116 | 117 | If you want to quantize your own model to AWQ quantized models, we advise you to use AutoAWQ. 118 | 119 | ```bash 120 | pip install "autoawq<0.2.7" 121 | ``` 122 | 123 | Suppose you have finetuned a model based on `Qwen2.5-7B`, which is named `Qwen2.5-7B-finetuned`, with your own dataset, e.g., Alpaca. 124 | To build your own AWQ quantized model, you need to use the training data for calibration. 125 | Below, we provide a simple demonstration for you to run: 126 | 127 | ```python 128 | from awq import AutoAWQForCausalLM 129 | from transformers import AutoTokenizer 130 | 131 | # Specify paths and hyperparameters for quantization 132 | model_path = "your_model_path" 133 | quant_path = "your_quantized_model_path" 134 | quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } 135 | 136 | # Load your tokenizer and model with AutoAWQ 137 | tokenizer = AutoTokenizer.from_pretrained(model_path) 138 | model = AutoAWQForCausalLM.from_pretrained(model_path, device_map="auto", safetensors=True) 139 | ``` 140 | 141 | Then you need to prepare your data for calibration. 142 | What you need to do is just put samples into a list, each of which is a text. 143 | As we directly use our finetuning data for calibration, we first format it with ChatML template. 144 | For example, 145 | 146 | ```python 147 | data = [] 148 | for msg in dataset: 149 | text = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False) 150 | data.append(text.strip()) 151 | ``` 152 | 153 | where each `msg` is a typical chat message as shown below: 154 | 155 | ```json 156 | [ 157 | {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."}, 158 | {"role": "user", "content": "Tell me who you are."}, 159 | {"role": "assistant", "content": "I am a large language model named Qwen..."} 160 | ] 161 | ``` 162 | 163 | Then just run the calibration process by one line of code: 164 | 165 | ```python 166 | model.quantize(tokenizer, quant_config=quant_config, calib_data=data) 167 | ``` 168 | 169 | Finally, save the quantized model: 170 | 171 | ```python 172 | model.save_quantized(quant_path, safetensors=True, shard_size="4GB") 173 | tokenizer.save_pretrained(quant_path) 174 | ``` 175 | 176 | Then you can obtain your own AWQ quantized model for deployment. 177 | Enjoy! 178 | -------------------------------------------------------------------------------- /docs/source/quantization/llama.cpp.md: -------------------------------------------------------------------------------- 1 | # llama.cpp 2 | 3 | Quantization is a major topic for local inference of LLMs, as it reduces the memory footprint. 4 | Undoubtably, llama.cpp natively supports LLM quantization and of course, with flexibility as always. 5 | 6 | At high-level, all quantization supported by llama.cpp is weight quantization: 7 | Model parameters are quantized into lower bits, and in inference, they are dequantized and used in computation. 
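As a rough mental model (a simplified sketch, not llama.cpp's actual kernels or storage formats; the function names are illustrative only), weight-only quantization stores each block of weights as low-bit integers plus a scale, and expands them back to floats at compute time:

```python
import numpy as np

# Simplified symmetric 8-bit block quantization, for illustration only.
# Real llama.cpp types (Q8_0, Q4_K_M, ...) use fixed block sizes and more
# elaborate layouts, but the store-low-bits-plus-scale idea is the same.
def quantize_block(weights: np.ndarray):
    scale = float(np.abs(weights).max()) / 127.0
    if scale == 0.0:
        scale = 1.0
    q = np.clip(np.round(weights / scale), -127, 127).astype(np.int8)
    return q, scale

def dequantize_block(q: np.ndarray, scale: float) -> np.ndarray:
    # At inference time, the int8 weights are dequantized before (or during) the matmul.
    return q.astype(np.float32) * scale

block = np.random.randn(32).astype(np.float32)
q, scale = quantize_block(block)
print("max abs quantization error:", np.abs(block - dequantize_block(q, scale)).max())
```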
8 | 9 | In addition, you can mix different quantization data types in a single quantized model, e.g., you can quantize the embedding weights using a quantization data type and other weights using a different one. 10 | With an adequate mixture of quantization types, much lower quantization error can be attained with just a slight increase of bit-per-weight. 11 | The example program `llama-quantize` supports many quantization presets, such as Q4_K_M and Q8_0. 12 | 13 | If you find the quantization errors still more than expected, you can bring your own scales, e.g., as computed by AWQ, or use calibration data to compute an importance matrix using `llama-imatrix`, which can then be used during quantization to enhance the quality of the quantized models. 14 | 15 | In this document, we demonstrate the common way to quantize your model and evaluate the performance of the quantized model. 16 | We will assume you have the example programs from llama.cpp at your hand. 17 | If you don't, check our guide [here](../run_locally/llama.cpp.html#getting-the-program){.external}. 18 | 19 | ## Getting the GGUF 20 | 21 | Now, suppose you would like to quantize `Qwen3-8B`. 22 | You need to first make a GGUF file as shown below: 23 | ```bash 24 | python convert-hf-to-gguf.py Qwen/Qwen3-8B --outfile Qwen3-8B-F16.gguf 25 | ``` 26 | 27 | Since Qwen3 are trained using the bfloat16 precision, the following should keep most information on supported machines: 28 | ```bash 29 | python convert-hf-to-gguf.py Qwen/Qwen3-8B --outtype bf16 --outfile Qwen3-8B-BF16.gguf 30 | ``` 31 | 32 | Sometimes, it may be better to use fp32 as the start point for quantization. 33 | In that case, use 34 | ```bash 35 | python convert-hf-to-gguf.py Qwen/Qwen3-8B --outtype f32 --outfile Qwen3-8B-F32.gguf 36 | ``` 37 | 38 | ## Quantizing the GGUF without Calibration 39 | 40 | For the simplest way, you can directly quantize the model to lower-bits based on your requirements. 41 | An example of quantizing the model to 8 bits is shown below: 42 | ```bash 43 | ./llama-quantize Qwen3-8B-F16.gguf Qwen3-8B-Q8_0.gguf Q8_0 44 | ``` 45 | 46 | `Q8_0` is a code for a quantization preset. 47 | You can find all the presets in [the source code of `llama-quantize`](https://github.com/ggml-org/llama.cpp/blob/master/tools/quantize/quantize.cpp). 48 | Look for the variable `QUANT_OPTIONS`. 49 | Common ones used for 8B models include `Q8_0`, `Q5_K_M`, and `Q4_K_M`. 50 | The letter case doesn't matter, so `q8_0` or `q4_K_m` are perfectly fine. 51 | 52 | Now you can use the GGUF file of the quantized model with applications based on llama.cpp. 53 | Very simple indeed. 54 | 55 | However, the accuracy of the quantized model could be lower than expected occasionally, especially for lower-bit quantization. 56 | The program may even prevent you from doing that. 57 | 58 | There are several ways to improve quality of quantized models. 59 | A common way is to use a calibration dataset in the target domain to identify the weights that really matter and quantize the model in a way that those weights have lower quantization errors, as introduced in the next two methods. 60 | 61 | 62 | ## Quantizing the GGUF with AWQ Scale 63 | 64 | :::{attention} 65 | To be updated for Qwen3. 66 | ::: 67 | 68 | To improve the quality of your quantized models, one possible solution is to apply the AWQ scale, following [this script](https://github.com/casper-hansen/AutoAWQ/blob/main/docs/examples.md#gguf-export). 
69 | First, when you run `model.quantize()` with `autoawq`, remember to add `export_compatible=True` as shown below: 70 | ```python 71 | ... 72 | model.quantize( 73 | tokenizer, 74 | quant_config=quant_config, 75 | export_compatible=True 76 | ) 77 | model.save_pretrained(quant_path) 78 | ... 79 | ``` 80 | 81 | The above code will not actually quantize the weights. 82 | Instead, it adjusts weights based on a dataset so that they are "easier" to quantize.[^AWQ] 83 | 84 | Then, when you run `convert-hf-to-gguf.py`, remember to replace the model path with the path to the new model: 85 | ```bash 86 | python convert-hf-to-gguf.py --outfile qwen2.5-7b-instruct-f16-awq.gguf 87 | ``` 88 | 89 | Finally, you can quantize the model as in the last example: 90 | ```bash 91 | ./llama-quantize qwen2.5-7b-instruct-f16-awq.gguf qwen2.5-7b-instruct-q8_0.gguf Q8_0 92 | ``` 93 | 94 | In this way, it should be possible to achieve similar quality with lower bit-per-weight. 95 | 96 | [^AWQ]: If you are interested in what this means, refer to [the AWQ paper](https://arxiv.org/abs/2306.00978). 97 | Basically, important weights (called salient weights in the paper) are identified based on activations across data examples. 98 | The weights are scaled accordingly such that the salient weights are protected even after quantization. 99 | 100 | ## Quantizing the GGUF with Importance Matrix 101 | 102 | Another possible solution is to use the "important matrix"[^imatrix], following [this](https://github.com/ggml-org/llama.cpp/tree/master/tools/imatrix). 103 | 104 | First, you need to compute the importance matrix data of the weights of a model (`-m`) using a calibration dataset (`-f`): 105 | ```bash 106 | ./llama-imatrix -m Qwen3-8B-F16.gguf -f calibration-text.txt --chunk 512 -o Qwen3-8B-imatrix.dat -ngl 80 107 | ``` 108 | 109 | The text is cut in chunks of length `--chunk` for computation. 110 | Preferably, the text should be representative of the target domain. 111 | The final results will be saved in a file named `Qwen3-8B-imatrix.dat` (`-o`), which can then be used: 112 | ```bash 113 | ./llama-quantize --imatrix Qwen3-8B-imatrix.dat \ 114 | Qwen3-8B-F16.gguf Qwen3-8B-Q4_K_M.gguf Q4_K_M 115 | ``` 116 | 117 | For lower-bit quantization mixtures for 1-bit or 2-bit, if you do not provide `--imatrix`, a helpful warning will be printed by `llama-quantize`. 118 | 119 | [^imatrix]: Here, the importance matrix keeps record of how weights affect the output: the weight should be important is a slight change in its value causes huge difference in the results, akin to the [GPTQ](https://arxiv.org/abs/2210.17323) algorithm. 120 | 121 | ## Perplexity Evaluation 122 | 123 | `llama.cpp` provides an example program for us to calculate the perplexity, which evaluate how unlikely the given text is to the model. 124 | It should be mostly used for comparisons: the lower the perplexity, the better the model remembers the given text. 125 | 126 | To do this, you need to prepare a dataset, say "wiki test"[^wiki]. 127 | You can download the dataset with: 128 | ```bash 129 | wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research -O wikitext-2-raw-v1.zip 130 | unzip wikitext-2-raw-v1.zip 131 | ``` 132 | 133 | Then you can run the test with the following command: 134 | ```bash 135 | ./llama-perplexity -m Qwen3-8B-Q8_0.gguf -f wiki.test.raw -ngl 80 136 | ``` 137 | Wait for some time and you will get the perplexity of the model. 
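For reference, perplexity is the exponentiated average negative log-likelihood of the evaluation tokens,

$$
\mathrm{PPL} = \exp\Big(-\frac{1}{N}\sum_{i=1}^{N}\log p(x_i \mid x_{<i})\Big),
$$

so a lower value means the model assigns higher probability to the given text.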
138 | There are perplexity numbers for different kinds of quantization mixtures [here](https://github.com/ggml-org/llama.cpp/blob/master/tools/perplexity/README.md). 139 | It might be helpful to look at the differences to get a sense of how each kind of quantization performs. 140 | 141 | [^wiki]: It is not an ideal evaluation dataset for instruct models, but it is very common and easily accessible. 142 | You probably want to use a dataset similar to your target domain. 143 | 144 | ## Finally 145 | 146 | In this guide, we demonstrate how to conduct quantization and evaluate perplexity with llama.cpp. 147 | For more information, please visit the [llama.cpp GitHub repo](https://github.com/ggml-org/llama.cpp). 148 | 149 | We usually quantize the fp16 model to 4, 5, 6, and 8-bit models with different quantization mixtures, but sometimes a particular mixture just does not work well, so we don't provide those on our Hugging Face Hub. 150 | However, others in the community may have success, so if you haven't found what you need in our repos, look around. 151 | 152 | Enjoy your freshly quantized models! 153 | -------------------------------------------------------------------------------- /docs/source/run_locally/mlx-lm.md: -------------------------------------------------------------------------------- 1 | # MLX LM 2 | 3 | :::{attention} 4 | To be updated for Qwen3. 5 | ::: 6 | 7 | [mlx-lm](https://github.com/ml-explore/mlx-examples/tree/main/llms) helps you run LLMs locally on Apple Silicon. 8 | It is available on macOS. 9 | It already supports Qwen models, and we also provide checkpoints that you can use with it directly. 10 | 11 | ## Prerequisites 12 | 13 | The easiest way to get started is to install the `mlx-lm` package: 14 | 15 | - with `pip`: 16 | 17 | ```bash 18 | pip install mlx-lm 19 | ``` 20 | 21 | - with `conda`: 22 | 23 | ```bash 24 | conda install -c conda-forge mlx-lm 25 | ``` 26 | 27 | ## Running with Qwen MLX Files 28 | 29 | We provide model checkpoints for `mlx-lm` in our Hugging Face organization; to find what you need, search for repo names containing `-MLX`. 30 | 31 | The following code snippet uses `apply_chat_template` to show how to load the tokenizer and model and how to generate content. 32 | 33 | ```python 34 | from mlx_lm import load, generate 35 | 36 | model, tokenizer = load('Qwen/Qwen2.5-7B-Instruct-MLX', tokenizer_config={"eos_token": "<|im_end|>"}) 37 | 38 | prompt = "Give me a short introduction to large language models." 39 | messages = [ 40 | {"role": "system", "content": "You are Qwen, created by Alibaba Cloud.
You are a helpful assistant."}, 41 | {"role": "user", "content": prompt} 42 | ] 43 | text = tokenizer.apply_chat_template( 44 | messages, 45 | tokenize=False, 46 | add_generation_prompt=True 47 | ) 48 | 49 | response = generate(model, tokenizer, prompt=text, verbose=True, top_p=0.8, temp=0.7, repetition_penalty=1.05, max_tokens=512) 50 | ``` 51 | 52 | ## Make Your MLX files 53 | 54 | You can make MLX files with just one command: 55 | 56 | ```bash 57 | mlx_lm.convert --hf-path Qwen/Qwen2.5-7B-Instruct --mlx-path mlx/Qwen2.5-7B-Instruct/ -q 58 | ``` 59 | 60 | where 61 | 62 | - `--hf-path`: the model name on Hugging Face Hub or the local path 63 | - `--mlx-path`: the path for output files 64 | - `-q`: enable quantization 65 | -------------------------------------------------------------------------------- /docs/source/run_locally/ollama.md: -------------------------------------------------------------------------------- 1 | # Ollama 2 | 3 | :::{attention} 4 | To be updated for Qwen3. 5 | ::: 6 | 7 | [Ollama](https://ollama.com/) helps you run LLMs locally with only a few commands. 8 | It is available at macOS, Linux, and Windows. 9 | Now, Qwen2.5 is officially on Ollama, and you can run it with one command: 10 | 11 | ```bash 12 | ollama run qwen2.5 13 | ``` 14 | 15 | Next, we introduce more detailed usages of Ollama for running Qwen2.5 models. 16 | 17 | ## Quickstart 18 | 19 | Visit the official website [Ollama](https://ollama.com/) and click download to install Ollama on your device. 20 | You can also search models on the website, where you can find the Qwen2.5 models. 21 | Except for the default one, you can choose to run Qwen2.5-Instruct models of different sizes by: 22 | 23 | - `ollama run qwen2.5:0.5b` 24 | - `ollama run qwen2.5:1.5b` 25 | - `ollama run qwen2.5:3b` 26 | - `ollama run qwen2.5:7b` 27 | - `ollama run qwen2.5:14b` 28 | - `ollama run qwen2.5:32b` 29 | - `ollama run qwen2.5:72b` 30 | 31 | :::{note} 32 | `ollama` does not host base models. 33 | Even though the tag may not have the instruct suffix, they are all instruct models. 34 | ::: 35 | 36 | ## Run Ollama with Your GGUF Files 37 | 38 | Sometimes you don't want to pull models and you just want to use Ollama with your own GGUF files. 39 | Suppose you have a GGUF file of Qwen2.5, `qwen2.5-7b-instruct-q5_0.gguf`. 40 | For the first step, you need to create a file called `Modelfile`. 
41 | The content of the file is shown below: 42 | 43 | ```text 44 | FROM qwen2.5-7b-instruct-q5_0.gguf 45 | 46 | # set the temperature to 1 [higher is more creative, lower is more coherent] 47 | PARAMETER temperature 0.7 48 | PARAMETER top_p 0.8 49 | PARAMETER repeat_penalty 1.05 50 | PARAMETER top_k 20 51 | 52 | TEMPLATE """{{ if .Messages }} 53 | {{- if or .System .Tools }}<|im_start|>system 54 | {{ .System }} 55 | {{- if .Tools }} 56 | 57 | # Tools 58 | 59 | You are provided with function signatures within XML tags: 60 | {{- range .Tools }} 61 | {"type": "function", "function": {{ .Function }}}{{- end }} 62 | 63 | 64 | For each function call, return a json object with function name and arguments within XML tags: 65 | 66 | {"name": , "arguments": } 67 | 68 | {{- end }}<|im_end|> 69 | {{ end }} 70 | {{- range $i, $_ := .Messages }} 71 | {{- $last := eq (len (slice $.Messages $i)) 1 -}} 72 | {{- if eq .Role "user" }}<|im_start|>user 73 | {{ .Content }}<|im_end|> 74 | {{ else if eq .Role "assistant" }}<|im_start|>assistant 75 | {{ if .Content }}{{ .Content }} 76 | {{- else if .ToolCalls }} 77 | {{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} 78 | {{ end }} 79 | {{- end }}{{ if not $last }}<|im_end|> 80 | {{ end }} 81 | {{- else if eq .Role "tool" }}<|im_start|>user 82 | 83 | {{ .Content }} 84 | <|im_end|> 85 | {{ end }} 86 | {{- if and (ne .Role "assistant") $last }}<|im_start|>assistant 87 | {{ end }} 88 | {{- end }} 89 | {{- else }} 90 | {{- if .System }}<|im_start|>system 91 | {{ .System }}<|im_end|> 92 | {{ end }}{{ if .Prompt }}<|im_start|>user 93 | {{ .Prompt }}<|im_end|> 94 | {{ end }}<|im_start|>assistant 95 | {{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }}""" 96 | 97 | # set the system message 98 | SYSTEM """You are Qwen, created by Alibaba Cloud. You are a helpful assistant.""" 99 | ``` 100 | 101 | Then create the Ollama model by running: 102 | 103 | ```bash 104 | ollama create qwen2.5_7b -f Modelfile 105 | ``` 106 | 107 | Once it is finished, you can run your Ollama model by: 108 | 109 | ```bash 110 | ollama run qwen2.5_7b 111 | ``` 112 | 113 | ## Tool Use 114 | 115 | Tool use is now supported Ollama and you should be able to run Qwen2.5 models with it. 116 | For more details, see our [function calling guide](../framework/function_call). -------------------------------------------------------------------------------- /docs/source/training/axolotl.rst: -------------------------------------------------------------------------------- 1 | Axolotl Qwen3 Training Guide 2 | ============================= 3 | 4 | This guide will help you get started with post-training (SFT, RLHF, RM, PRM) for Qwen3 / Qwen3_MOE using Axolotl, and covers optimizations to enable for better performance. 5 | 6 | Requirements 7 | ============ 8 | 9 | - **GPU:** NVIDIA Ampere (or newer) for ``bf16`` and ``Flash Attention``, or AMD GPU 10 | - **Python:** ≥3.11 11 | - **CUDA:** ≥12.4 (for NVIDIA GPUs) 12 | 13 | Installation 14 | ============ 15 | 16 | You can install Axolotl using PyPI, Conda, Git, Docker, or launch a cloud environment. 17 | 18 | .. important:: 19 | 20 | Install PyTorch *before* installing Axolotl to ensure CUDA compatibility. 21 | 22 | For the latest instructions, see the official `Axolotl Installation Guide `_. 23 | 24 | Quickstart 25 | ========== 26 | 27 | SFT 28 | --- 29 | 30 | We have provided a sample YAML config for SFT with Qwen/Qwen3-32B: `SFT 32B QLoRA config `_. 31 | 32 | .. 
code:: bash 33 | 34 | # Train the model 35 | axolotl train path/to/32b-qlora.yaml 36 | 37 | # Merge LoRA weights with the base model 38 | # This will create a new `merged` directory under `{output_dir}` 39 | axolotl merge-lora path/to/32b-qlora.yaml 40 | 41 | .. tip:: 42 | 43 | To train a smaller model, edit the ``base_model`` in your config: 44 | 45 | .. code:: yaml 46 | 47 | base_model: Qwen/Qwen3-8B 48 | 49 | Qwen3 works with all Axolotl features including ``Flash Attention``, ``bf16``, ``LoRA``, ``torch_compile``, and ``QLoRA``. 50 | 51 | To run on more than single GPU, please take a look at the `Multi-GPU Training Guide `_ or `Multi-node Training Guide `_. 52 | 53 | RLHF 54 | ---- 55 | 56 | See the `RLHF Guide `_ for required dataset formats and examples for each method. 57 | 58 | RM/PRM 59 | ------ 60 | 61 | Please refer to the `Reward Modelling Guide `_ for required dataset formats and config examples. 62 | 63 | Dataset 64 | ======= 65 | 66 | By default, the example config uses the ``mlabonne/FineTome-100k`` dataset (from HuggingFace Hub). You can substitute any dataset of your own. 67 | 68 | SFT Dataset Format 69 | ------------------ 70 | 71 | Axolotl handles various SFT dataset formats, but the current **recommended** format (for use with ``chat_template``) is the OpenAI Messages format: 72 | 73 | .. code:: json 74 | 75 | [ 76 | { 77 | "messages": [ 78 | { 79 | "role": "user", 80 | "content": "What is Qwen3?" 81 | }, 82 | { 83 | "role": "assistant", 84 | "content": "Qwen3 is a language model..." 85 | } 86 | ] 87 | } 88 | ] 89 | 90 | Use this in your config: 91 | 92 | .. code:: yaml 93 | 94 | datasets: 95 | - path: path/to/your/dataset.json 96 | type: chat_template 97 | 98 | You can also load datasets from multiple sources: HuggingFace Hub, local files, directories, S3, GCS, Azure, etc. 99 | 100 | See the `Dataset Loading Guide `_ for more details. 101 | 102 | To load different dataset formats, refer to the `SFT Dataset Formats Guide `_. 103 | 104 | Optimizations 105 | ============= 106 | 107 | With Qwen3/Qwen3_MOE, you can leverage Axolotl's custom optimizations for improved speed and reduced memory usage: 108 | 109 | - `Cut Cross Entropy `_ 110 | - `Liger Kernels `_ 111 | - (LoRA/QLoRA only): `LoRA Kernels Optimization `_ 112 | 113 | Additional Suggestions 114 | ======================= 115 | 116 | Troubleshooting 117 | --------------- 118 | 119 | - Ensure your CUDA version matches your GPU and PyTorch version. 120 | - If running into out-of-memory issues, try reducing your batch size, enable the optimizations above, or reduce sequence length. 121 | - Qwen3 MoE may have slower training due to the upstream transformer's handling of MoE layers. 122 | - For help, check the help channel on `Axolotl Discord `_ or create a Discussion on `Axolotl GitHub `_. 123 | 124 | Links 125 | ----- 126 | 127 | - `Axolotl Documentation `_ 128 | - `Axolotl Discord `_ 129 | - `Axolotl GitHub `_ 130 | - `Axolotl Website `_ 131 | -------------------------------------------------------------------------------- /docs/source/training/llama_factory.rst: -------------------------------------------------------------------------------- 1 | LLaMA-Factory 2 | =================================== 3 | 4 | .. attention:: 5 | To be updated for Qwen3. 6 | 7 | Here we provide a script for supervised finetuning Qwen2.5 with 8 | `LLaMA-Factory `__. 
This 9 | script for supervised finetuning (SFT) has the following features: 10 | 11 | - Support single-GPU and multi-GPU training; 12 | 13 | - Support full-parameter tuning, LoRA, Q-LoRA, Dora. 14 | 15 | In the following, we introduce more details about the usage of the 16 | script. 17 | 18 | Installation 19 | ------------ 20 | 21 | Before you start, make sure you have installed the following packages: 22 | 23 | 1. Follow the instructions of 24 | `LLaMA-Factory `__, and build 25 | the environment. 26 | 2. Install these packages (Optional): 27 | 28 | :: 29 | 30 | pip install deepspeed 31 | pip install flash-attn --no-build-isolation 32 | 33 | 3. If you want to use 34 | `FlashAttention-2 `__, 35 | make sure your CUDA is 11.6 and above. 36 | 37 | Data Preparation 38 | ---------------- 39 | 40 | LLaMA-Factory provides several training datasets in ``data`` folder, you 41 | can use it directly. If you are using a custom dataset, please prepare 42 | your dataset as follows. 43 | 44 | 1. Organize your data in a **json** file and put your data in ``data`` 45 | folder. LLaMA-Factory supports dataset in ``alpaca`` or ``sharegpt`` 46 | format. 47 | 48 | - The dataset in ``alpaca`` format should follow the below format: 49 | 50 | .. code:: json 51 | 52 | [ 53 | { 54 | "instruction": "user instruction (required)", 55 | "input": "user input (optional)", 56 | "output": "model response (required)", 57 | "system": "system prompt (optional)", 58 | "history": [ 59 | ["user instruction in the first round (optional)", "model response in the first round (optional)"], 60 | ["user instruction in the second round (optional)", "model response in the second round (optional)"] 61 | ] 62 | } 63 | ] 64 | 65 | - The dataset in ``sharegpt`` format should follow the below format: 66 | 67 | .. code:: json 68 | 69 | [ 70 | { 71 | "conversations": [ 72 | { 73 | "from": "human", 74 | "value": "user instruction" 75 | }, 76 | { 77 | "from": "gpt", 78 | "value": "model response" 79 | } 80 | ], 81 | "system": "system prompt (optional)", 82 | "tools": "tool description (optional)" 83 | } 84 | ] 85 | 86 | 2. Provide your dataset definition in ``data/dataset_info.json`` in the 87 | following format . 88 | 89 | - For ``alpaca`` format dataset, the columns in ``dataset_info.json`` 90 | should be: 91 | 92 | .. code:: json 93 | 94 | "dataset_name": { 95 | "file_name": "dataset_name.json", 96 | "columns": { 97 | "prompt": "instruction", 98 | "query": "input", 99 | "response": "output", 100 | "system": "system", 101 | "history": "history" 102 | } 103 | } 104 | 105 | - For ``sharegpt`` format dataset, the columns in ``dataset_info.json`` 106 | should be: 107 | 108 | .. code:: json 109 | 110 | "dataset_name": { 111 | "file_name": "dataset_name.json", 112 | "formatting": "sharegpt", 113 | "columns": { 114 | "messages": "conversations", 115 | "system": "system", 116 | "tools": "tools" 117 | }, 118 | "tags": { 119 | "role_tag": "from", 120 | "content_tag": "value", 121 | "user_tag": "user", 122 | "assistant_tag": "assistant" 123 | } 124 | } 125 | 126 | Training 127 | -------- 128 | 129 | Execute the following training command: 130 | 131 | .. 
code:: bash 132 | 133 | DISTRIBUTED_ARGS=" 134 | --nproc_per_node $NPROC_PER_NODE \ 135 | --nnodes $NNODES \ 136 | --node_rank $NODE_RANK \ 137 | --master_addr $MASTER_ADDR \ 138 | --master_port $MASTER_PORT 139 | " 140 | 141 | torchrun $DISTRIBUTED_ARGS src/train.py \ 142 | --deepspeed $DS_CONFIG_PATH \ 143 | --stage sft \ 144 | --do_train \ 145 | --use_fast_tokenizer \ 146 | --flash_attn \ 147 | --model_name_or_path $MODEL_PATH \ 148 | --dataset your_dataset \ 149 | --template qwen \ 150 | --finetuning_type lora \ 151 | --lora_target q_proj,v_proj\ 152 | --output_dir $OUTPUT_PATH \ 153 | --overwrite_cache \ 154 | --overwrite_output_dir \ 155 | --warmup_steps 100 \ 156 | --weight_decay 0.1 \ 157 | --per_device_train_batch_size 4 \ 158 | --gradient_accumulation_steps 4 \ 159 | --ddp_timeout 9000 \ 160 | --learning_rate 5e-6 \ 161 | --lr_scheduler_type cosine \ 162 | --logging_steps 1 \ 163 | --cutoff_len 4096 \ 164 | --save_steps 1000 \ 165 | --plot_loss \ 166 | --num_train_epochs 3 \ 167 | --bf16 168 | 169 | and enjoy the training process. To make changes to your training, you 170 | can modify the arguments in the training command to adjust the 171 | hyperparameters. One argument to note is ``cutoff_len``, which is the 172 | maximum length of the training data. Control this parameter to avoid OOM 173 | error. 174 | 175 | Merge LoRA 176 | ---------- 177 | 178 | If you train your model with LoRA, you probably need to merge adapter 179 | parameters to the main branch. Run the following command to perform the 180 | merging of LoRA adapters. 181 | 182 | .. code:: bash 183 | 184 | CUDA_VISIBLE_DEVICES=0 llamafactory-cli export \ 185 | --model_name_or_path path_to_base_model \ 186 | --adapter_name_or_path path_to_adapter \ 187 | --template qwen \ 188 | --finetuning_type lora \ 189 | --export_dir path_to_export \ 190 | --export_size 2 \ 191 | --export_legacy_format False 192 | 193 | Conclusion 194 | ---------- 195 | 196 | The above content is the simplest way to use LLaMA-Factory to train 197 | Qwen. Feel free to dive into the details by checking the official repo! 198 | -------------------------------------------------------------------------------- /docs/source/training/verl.rst: -------------------------------------------------------------------------------- 1 | verl 2 | ==== 3 | 4 | verl is a flexible, efficient and production-ready RL training library for large language models (LLMs). 5 | 6 | verl is the open-source version of `HybridFlow: A Flexible and Efficient RLHF Framework `__ paper. 7 | 8 | GitHub repository: `verl `__ 9 | 10 | verl is flexible and easy to use with: 11 | 12 | - **Easy extension of diverse RL algorithms**: The hybrid-controller programming model enables flexible representation and efficient execution of complex Post-Training dataflows. Build RL dataflows such as GRPO, PPO in a few lines of code. 13 | - **Seamless integration of existing LLM infra with modular APIs**: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as FSDP, Megatron-LM, vLLM, SGLang, etc 14 | - **Flexible device mapping**: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes. 15 | - **Ready integration with popular HuggingFace models**: verl supports popular LLM models, including Qwen, Llama, and more. 16 | 17 | verl is fast with: 18 | 19 | - **State-of-the-art throughput**: SOTA LLM training and inference engine integrations and SOTA RL throughput. 
20 | 21 | - **Efficient actor model resharding with 3D-HybridEngine**: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases. 22 | 23 | Next, we will introduce how to use verl for training Qwen3 models. 24 | 25 | Reinforcement Learning (RL) 26 | ----------------------------- 27 | 28 | Now, verl supports various combinations of training frameworks and inference frameworks, including FSDP, Megatron-LM, vLLM, SGLang, etc. verl also supports training with multiple algorithms such as PPO, GRPO, DAPO, etc. 29 | 30 | Step1: Environment and Training Preparation 31 | +++++++++++++++++++++++++++++++++++++++++++ 32 | 33 | You can follow verl's `installation guide `__ to complete the environment configuration. 34 | 35 | Data preparation can be done by running the following command: 36 | 37 | .. code-block:: bash 38 | 39 | git clone https://github.com/volcengine/verl.git 40 | cd verl 41 | python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k 42 | 43 | Model download can be done using the following command: 44 | 45 | .. code-block:: bash 46 | 47 | python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen3-1.7B')" 48 | 49 | 50 | Step2: Start Training 51 | ++++++++++++++++++++++ 52 | 53 | In verl, training frameworks and inference frameworks can be combined freely, as long as the training framework and inference framework themselves support model training and inference tasks, so that verl can support RL-related training. 54 | 55 | Below is an example using FSDP and vLLM to demonstrate how to train Qwen3 models in verl. We chose Qwen3-1.7B as the example, as it only requires a single 80GB GPU and a machine with more than 64GB of memory to start training. 56 | 57 | .. 
code-block:: bash 58 | 59 | python3 -m verl.trainer.main_ppo \ 60 | algorithm.adv_estimator=grpo \ 61 | data.train_files=$HOME/data/gsm8k/train.parquet \ 62 | data.val_files=$HOME/data/gsm8k/test.parquet \ 63 | data.train_batch_size=1024 \ 64 | data.max_prompt_length=512 \ 65 | data.max_response_length=1024 \ 66 | data.filter_overlong_prompts=True \ 67 | data.truncation='error' \ 68 | actor_rollout_ref.model.path=Qwen/Qwen3-1.7B \ 69 | actor_rollout_ref.actor.optim.lr=1e-6 \ 70 | actor_rollout_ref.model.use_remove_padding=True \ 71 | actor_rollout_ref.actor.ppo_mini_batch_size=80 \ 72 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=20 \ 73 | actor_rollout_ref.actor.use_kl_loss=True \ 74 | actor_rollout_ref.actor.kl_loss_coef=0.001 \ 75 | actor_rollout_ref.actor.kl_loss_type=low_var_kl \ 76 | actor_rollout_ref.actor.entropy_coeff=0 \ 77 | actor_rollout_ref.model.enable_gradient_checkpointing=True \ 78 | actor_rollout_ref.actor.fsdp_config.param_offload=False \ 79 | actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ 80 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \ 81 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ 82 | actor_rollout_ref.rollout.name=vllm \ 83 | actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ 84 | actor_rollout_ref.rollout.n=3 \ 85 | actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \ 86 | actor_rollout_ref.ref.fsdp_config.param_offload=True \ 87 | algorithm.use_kl_in_reward=False \ 88 | trainer.critic_warmup=0 \ 89 | trainer.logger=['console'] \ 90 | trainer.project_name='verl_grpo_example_gsm8k' \ 91 | trainer.experiment_name='qwen3_1_7b_function_rm' \ 92 | trainer.n_gpus_per_node=1 \ 93 | trainer.nnodes=1 \ 94 | trainer.save_freq=-1 \ 95 | trainer.test_freq=5 \ 96 | trainer.total_epochs=15 $@ 97 | 98 | 99 | Finally 100 | ----------------------------- 101 | 102 | If you encounter any difficulties during use, please join the discussion at `GitHub `__. 103 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | > [!IMPORTANT] 4 | > The examples in this directory should be considered deprecated at the moment and they are not updated for Qwen3. 5 | > 6 | -------------------------------------------------------------------------------- /examples/demo/web_demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba Cloud. 2 | # 3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | """A simple web interactive chat demo based on gradio.""" 7 | 8 | from argparse import ArgumentParser 9 | from threading import Thread 10 | 11 | import gradio as gr 12 | import torch 13 | from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer 14 | 15 | DEFAULT_CKPT_PATH = "Qwen/Qwen2.5-7B-Instruct" 16 | 17 | 18 | def _get_args(): 19 | parser = ArgumentParser(description="Qwen2.5-Instruct web chat demo.") 20 | parser.add_argument( 21 | "-c", 22 | "--checkpoint-path", 23 | type=str, 24 | default=DEFAULT_CKPT_PATH, 25 | help="Checkpoint name or path, default to %(default)r", 26 | ) 27 | parser.add_argument( 28 | "--cpu-only", action="store_true", help="Run demo with CPU only" 29 | ) 30 | 31 | parser.add_argument( 32 | "--share", 33 | action="store_true", 34 | default=False, 35 | help="Create a publicly shareable link for the interface.", 36 | ) 37 | parser.add_argument( 38 | "--inbrowser", 39 | action="store_true", 40 | default=False, 41 | help="Automatically launch the interface in a new tab on the default browser.", 42 | ) 43 | parser.add_argument( 44 | "--server-port", type=int, default=8000, help="Demo server port." 45 | ) 46 | parser.add_argument( 47 | "--server-name", type=str, default="127.0.0.1", help="Demo server name." 48 | ) 49 | 50 | args = parser.parse_args() 51 | return args 52 | 53 | 54 | def _load_model_tokenizer(args): 55 | tokenizer = AutoTokenizer.from_pretrained( 56 | args.checkpoint_path, 57 | resume_download=True, 58 | ) 59 | 60 | if args.cpu_only: 61 | device_map = "cpu" 62 | else: 63 | device_map = "auto" 64 | 65 | model = AutoModelForCausalLM.from_pretrained( 66 | args.checkpoint_path, 67 | torch_dtype="auto", 68 | device_map=device_map, 69 | resume_download=True, 70 | ).eval() 71 | model.generation_config.max_new_tokens = 2048 # For chat. 
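# Note: max_new_tokens caps each reply at 2048 new tokens; other generation settings (e.g., temperature, top_p) can be overridden on model.generation_config in the same way if needed.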
72 | 73 | return model, tokenizer 74 | 75 | 76 | def _chat_stream(model, tokenizer, query, history): 77 | conversation = [] 78 | for query_h, response_h in history: 79 | conversation.append({"role": "user", "content": query_h}) 80 | conversation.append({"role": "assistant", "content": response_h}) 81 | conversation.append({"role": "user", "content": query}) 82 | input_text = tokenizer.apply_chat_template( 83 | conversation, 84 | add_generation_prompt=True, 85 | tokenize=False, 86 | ) 87 | inputs = tokenizer([input_text], return_tensors="pt").to(model.device) 88 | streamer = TextIteratorStreamer( 89 | tokenizer=tokenizer, skip_prompt=True, timeout=60.0, skip_special_tokens=True 90 | ) 91 | generation_kwargs = { 92 | **inputs, 93 | "streamer": streamer, 94 | } 95 | thread = Thread(target=model.generate, kwargs=generation_kwargs) 96 | thread.start() 97 | 98 | for new_text in streamer: 99 | yield new_text 100 | 101 | 102 | def _gc(): 103 | import gc 104 | 105 | gc.collect() 106 | if torch.cuda.is_available(): 107 | torch.cuda.empty_cache() 108 | 109 | 110 | def _launch_demo(args, model, tokenizer): 111 | def predict(_query, _chatbot, _task_history): 112 | print(f"User: {_query}") 113 | _chatbot.append((_query, "")) 114 | full_response = "" 115 | response = "" 116 | for new_text in _chat_stream(model, tokenizer, _query, history=_task_history): 117 | response += new_text 118 | _chatbot[-1] = (_query, response) 119 | 120 | yield _chatbot 121 | full_response = response 122 | 123 | print(f"History: {_task_history}") 124 | _task_history.append((_query, full_response)) 125 | print(f"Qwen: {full_response}") 126 | 127 | def regenerate(_chatbot, _task_history): 128 | if not _task_history: 129 | yield _chatbot 130 | return 131 | item = _task_history.pop(-1) 132 | _chatbot.pop(-1) 133 | yield from predict(item[0], _chatbot, _task_history) 134 | 135 | def reset_user_input(): 136 | return gr.update(value="") 137 | 138 | def reset_state(_chatbot, _task_history): 139 | _task_history.clear() 140 | _chatbot.clear() 141 | _gc() 142 | return _chatbot 143 | 144 | with gr.Blocks() as demo: 145 | gr.Markdown("""\ 146 |

""") 147 | gr.Markdown( 148 | """\ 149 |

This WebUI is based on Qwen2.5-Instruct, developed by Alibaba Cloud. \ 150 | (本WebUI基于Qwen2.5-Instruct打造,实现聊天机器人功能。)
""" 151 | ) 152 | gr.Markdown("""\ 153 |
154 | Qwen2.5-7B-Instruct 🤖 | 155 | 🤗  | 156 | Qwen2.5-32B-Instruct 🤖 | 157 | 🤗  | 158 | Qwen2.5-72B-Instruct 🤖 | 159 | 🤗  | 160 |  Github
""") 161 | 162 | chatbot = gr.Chatbot(label="Qwen", elem_classes="control-height") 163 | query = gr.Textbox(lines=2, label="Input") 164 | task_history = gr.State([]) 165 | 166 | with gr.Row(): 167 | empty_btn = gr.Button("🧹 Clear History (清除历史)") 168 | submit_btn = gr.Button("🚀 Submit (发送)") 169 | regen_btn = gr.Button("🤔️ Regenerate (重试)") 170 | 171 | submit_btn.click( 172 | predict, [query, chatbot, task_history], [chatbot], show_progress=True 173 | ) 174 | submit_btn.click(reset_user_input, [], [query]) 175 | empty_btn.click( 176 | reset_state, [chatbot, task_history], outputs=[chatbot], show_progress=True 177 | ) 178 | regen_btn.click( 179 | regenerate, [chatbot, task_history], [chatbot], show_progress=True 180 | ) 181 | 182 | gr.Markdown("""\ 183 | Note: This demo is governed by the original license of Qwen2.5. \ 184 | We strongly advise users not to knowingly generate or allow others to knowingly generate harmful content, \ 185 | including hate speech, violence, pornography, deception, etc. \ 186 | (注:本演示受Qwen2.5的许可协议限制。我们强烈建议,用户不应传播及不应允许他人传播以下内容,\ 187 | 包括但不限于仇恨言论、暴力、色情、欺诈相关的有害信息。)""") 188 | 189 | demo.queue().launch( 190 | share=args.share, 191 | inbrowser=args.inbrowser, 192 | server_port=args.server_port, 193 | server_name=args.server_name, 194 | ) 195 | 196 | 197 | def main(): 198 | args = _get_args() 199 | 200 | model, tokenizer = _load_model_tokenizer(args) 201 | 202 | _launch_demo(args, model, tokenizer) 203 | 204 | 205 | if __name__ == "__main__": 206 | main() 207 | -------------------------------------------------------------------------------- /examples/gcu-support/README.md: -------------------------------------------------------------------------------- 1 | # Qwen2.5 推理 2 | 3 | ## 1、配置运行环境 4 | 5 | **安装驱动** 6 | 7 | ``` 8 | # 为软件包具体版本号。 9 | chmod +x TopsRider_i3x__deb_amd64.run 10 | ./TopsRider_i3x__deb_amd64.run -y 11 | ``` 12 | 13 | **创建并启动 docker** 14 | 15 | ``` 16 | # 创建 docker 容器,将在基础镜像 artifact.enflame.cn/enflame_docker_images/ubuntu/qic_ubuntu_2004_gcc9:1.4.4 的基础上创建 docker。 17 | # 当前工程所在路径 18 | # -e ENFLAME_VISIBLE_DEVICES=2 进行 GCU 资源隔离,如需多卡可以改为 0,1,2,3 等 19 | docker run -itd -e ENFLAME_VISIBLE_DEVICES=2 --name qwen-infer -v :/work -v /root/:/root/ --privileged --network host artifact.enflame.cn/enflame_docker_images/ubuntu/qic_ubuntu_2004_gcc9:1.4.4 bash 20 | ``` 21 | 22 | **进入 docker 安装环境** 23 | 24 | ``` 25 | # 进入 docker 容器 26 | docker exec -it qwen-infer bash 27 | 28 | # 安装 SDK 框架,进入软件包所在地址。 29 | # 为软件包具体版本号。 30 | ./TopsRider_i3x__amd64.run -C torch-gcu-2 -y 31 | ./TopsRider_i3x__deb_amd64.run -C tops-sdk -y 32 | 33 | # 安装 python 库 34 | pip3.8 install transformers==4.40.2 35 | pip3.8 install accelerate 36 | ``` 37 | 38 | ## 2、推理 39 | 40 | ``` 41 | # 进入本工程目录,包含运行代码、推理输入等文件。 42 | . 43 | ├── README.md 44 | └── gcu_demo.py 45 | ``` 46 | 47 | **启动推理示例** 48 | 49 | ``` 50 | python3.8 gcu_demo.py 51 | ``` 52 | 执行 gcu_demo.py 推理示例,代码改编自 [仓库 README](https://github.com/QwenLM/Qwen2.5/blob/main/README.md) 中的给的 Huggingface quick start 用例。 53 | 54 | **GCU PyTorch 原生推理支持** 55 | 56 | GCU 支持 pytorch 原生推理,在 pytorch 代码上只需做少许改动就可以在 GCU 上顺利运行: 57 | 58 | 1. 导入 *torch_gcu* 后端库,并载入 transfer_to_gcu 59 | ``` python 60 | try: 61 | import torch_gcu # 导入 torch_gcu 62 | from torch_gcu import transfer_to_gcu # transfer_to_gcu 63 | except Exception as e: 64 | print(e) 65 | ``` 66 | 2. 
device 名改为 *gcu* 67 | ``` python 68 | device = "gcu" 69 | ``` 70 | 71 | **GCU vLLM 推理** 72 | 73 | GCU 也支持 *vLLM* 原生推理,需要安装 GCU 版本的 *vLLM* 后,将设备名改为 gcu 74 | 75 | ``` 76 | python -m vllm.entrypoints.openai.api_server --served-model-name Qwen2.5-7B-Instruct --model Qwen/Qwen2.5-7B-Instruct --device gcu 77 | ``` 78 | -------------------------------------------------------------------------------- /examples/gcu-support/gcu_demo.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch_gcu # 导入 torch_gcu 3 | from torch_gcu import transfer_to_gcu # transfer_to_gcu 4 | except Exception as e: 5 | print(e) 6 | 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | model_name = "Qwen/Qwen2.5-7B-Instruct" 10 | device = "gcu" # the device to load the model onto 11 | 12 | model = AutoModelForCausalLM.from_pretrained( 13 | model_name, 14 | torch_dtype="auto", 15 | device_map="auto" 16 | ) 17 | tokenizer = AutoTokenizer.from_pretrained(model_name) 18 | 19 | prompt = "Give me a short introduction to large language models." 20 | messages = [ 21 | {"role": "system", "content": "You are a helpful assistant."}, 22 | {"role": "user", "content": prompt} 23 | ] 24 | text = tokenizer.apply_chat_template( 25 | messages, 26 | tokenize=False, 27 | add_generation_prompt=True 28 | ) 29 | model_inputs = tokenizer([text], return_tensors="pt").to(device) 30 | 31 | generated_ids = model.generate( 32 | **model_inputs, 33 | max_new_tokens=512 34 | ) 35 | generated_ids = [ 36 | output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) 37 | ] 38 | 39 | response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] 40 | -------------------------------------------------------------------------------- /examples/llama-factory/finetune-zh.md: -------------------------------------------------------------------------------- 1 | # 使用LLaMA-Factory微调Qwen模型 2 | 3 | ## LLAMA-Factory简介 4 | [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)是一个简单易用且高效的大模型训练框架,支持上百种大模型的训练,框架特性主要包括: 5 | - 模型种类:LLaMA、LLaVA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。 6 | - 训练算法:(增量)预训练、(多模态)指令监督微调、奖励模型训练、PPO 训练、DPO 训练、KTO 训练、ORPO 训练等等。 7 | - 运算精度:16比特全参数微调、冻结微调、LoRA微调和基于AQLM/AWQ/GPTQ/LLM.int8/HQQ/EETQ的2/3/4/5/6/8比特QLoRA 微调。 8 | - 优化算法:GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、Mixture-of-Depths、LoRA+、LoftQ和PiSSA。 9 | - 加速算子:FlashAttention-2和Unsloth。 10 | - 推理引擎:Transformers和vLLM。 11 | - 实验面板:LlamaBoard、TensorBoard、Wandb、MLflow等等。 12 | 13 | 本文将介绍如何使用LLAMA-Factory对Qwen2系列大模型进行微调(Qwen1.5系列模型也适用),更多特性请参考[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)。 14 | 15 | ## 安装LLaMA-Factory 16 | 下载并安装LLaMA-Factory: 17 | ```bash 18 | git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git 19 | cd LLaMA-Factory 20 | pip install -e ".[torch,metrics]" 21 | ``` 22 | 23 | 安装完成后,执行`llamafactory-cli version`,若出现以下提示,则表明安装成功: 24 | ``` 25 | ---------------------------------------------------------- 26 | | Welcome to LLaMA Factory, version 0.8.4.dev0 | 27 | | | 28 | | Project page: https://github.com/hiyouga/LLaMA-Factory | 29 | ---------------------------------------------------------- 30 | ``` 31 | 32 | ## 准备训练数据 33 | 自定义的训练数据应保存为jsonl文件,每一行的格式如下: 34 | ```json 35 | { 36 | "messages": [ 37 | { 38 | "role": "system", 39 | "content": "You are a helpful assistant." 40 | }, 41 | { 42 | "role": "user", 43 | "content": "Tell me something about large language models." 
44 | }, 45 | { 46 | "role": "assistant", 47 | "content": "Large language models are a type of language model that is trained on a large corpus of text data. They are capable of generating human-like text and are used in a variety of natural language processing tasks..." 48 | }, 49 | { 50 | "role": "user", 51 | "content": "How about Qwen2?" 52 | }, 53 | { 54 | "role": "assistant", 55 | "content": "Qwen2 is a large language model developed by Alibaba Cloud..." 56 | } 57 | 58 | ] 59 | } 60 | ``` 61 | 62 | 在LLaMA-Factory文件夹下的`data/dataset_info.json`文件中注册自定义的训练数据,在文件尾部添加如下配置信息: 63 | ``` 64 | "qwen_train_data": { 65 | "file_name": "PATH-TO-YOUR-TRAIN-DATA", 66 | "formatting": "sharegpt", 67 | "columns": { 68 | "messages": "messages" 69 | }, 70 | "tags": { 71 | "role_tag": "role", 72 | "content_tag": "content", 73 | "user_tag": "user", 74 | "assistant_tag": "assistant", 75 | "system_tag": "system" 76 | } 77 | } 78 | ``` 79 | 80 | ## 配置训练参数 81 | 设置训练参数的配置文件,我们提供了全量参数、LoRA、QLoRA训练所对应的示例文件,你可以根据自身需求自行修改,配置详情见本目录下对应的文件: 82 | - `qwen2-7b-full-sft.yaml`: 全量参数训练 83 | - `qwen2-7b-lora-sft.yaml`: LoRA训练 84 | - `qwen2-7b-qlora-sft.yaml`: QLoRA训练 85 | 86 | 全量参数训练时的deepspeed配置文件可参考[文件](https://github.com/hiyouga/LLaMA-Factory/tree/main/examples/deepspeed) 87 | 88 | 部分训练参数说明: 89 | 90 | | 参数 | 说明 | 91 | |-----------------------------|----------------------------------------------------------------------------------------------| 92 | | model_name_or_path | 模型名称或路径 | 93 | | stage | 训练阶段,可选: rm(reward modeling), pt(pretrain), sft(Supervised Fine-Tuning), PPO, DPO, KTO, ORPO | 94 | | do_train | true用于训练, false用于评估 | 95 | | finetuning_type | 微调方式。可选: freeze, LoRA, full | 96 | | lora_target | 采取LoRA方法的目标模块,默认值为all。 | 97 | | dataset | 使用的数据集,使用”,”分隔多个数据集 | 98 | | template | 数据集模板,请保证数据集模板与模型相对应。 | 99 | | output_dir | 输出路径 | 100 | | logging_steps | 日志输出步数间隔 | 101 | | save_steps | 模型断点保存间隔 | 102 | | overwrite_output_dir | 是否允许覆盖输出目录 | 103 | | per_device_train_batch_size | 每个设备上训练的批次大小 | 104 | | gradient_accumulation_steps | 梯度积累步数 | 105 | | learning_rate | 学习率 | 106 | | lr_scheduler_type | 学习率曲线,可选 linear, cosine, polynomial, constant 等。 | 107 | | num_train_epochs | 训练周期数 | 108 | | bf16 | 是否使用 bf16 格式 | 109 | 110 | ## 开始训练 111 | 112 | 全量参数训练: 113 | ```bash 114 | FORCE_TORCHRUN=1 llamafactory-cli train qwen2-7b-full-sft.yaml 115 | ``` 116 | 117 | LoRA训练: 118 | ```bash 119 | llamafactory-cli train qwen2-7b-lora-sft.yaml 120 | ``` 121 | 122 | QLoRA训练: 123 | ```bash 124 | llamafactory-cli train qwen2-7b-qlora-sft.yaml 125 | ``` 126 | 127 | 使用上述训练配置,各个方法实测的显存占用如下。训练中的显存占用与训练参数配置息息相关,可根据自身实际需求进行设置。 128 | - 全量参数训练:42.18GB 129 | - LoRA训练:20.17GB 130 | - QLoRA训练: 10.97GB 131 | 132 | ## 合并模型权重 133 | 如果采用LoRA或者QLoRA进行训练,脚本只保存对应的LoRA权重,需要合并权重才能进行推理。**全量参数训练无需执行此步骤** 134 | 135 | 136 | ```bash 137 | llamafactory-cli export qwen2-7b-merge-lora.yaml 138 | ``` 139 | 140 | 权重合并的部分参数说明: 141 | 142 | | 参数 | 说明 | 143 | |----------------------|-------------| 144 | | model_name_or_path | 预训练模型的名称或路径 | 145 | | template | 模型模板 | 146 | | export_dir | 导出路径 | 147 | | export_size | 最大导出模型文件大小 | 148 | | export_device | 导出设备 | 149 | | export_legacy_format | 是否使用旧格式导出 | 150 | 151 | 注意: 152 | - 合并Qwen2模型权重,务必将template设为`qwen`;无论LoRA还是QLoRA训练,合并权重时,`finetuning_type`均为`lora`。 153 | - adapter_name_or_path需要与微调中的适配器输出路径output_dir相对应。 154 | 155 | ## 模型推理 156 | 训练完成,合并模型权重之后,即可加载完整的模型权重进行推理, 推理的示例脚本如下: 157 | ```python 158 | from transformers import AutoModelForCausalLM, AutoTokenizer 159 | device = "cuda" # the device to load the model onto 160 | 
model_name_or_path = YOUR-MODEL-PATH 161 | 162 | model = AutoModelForCausalLM.from_pretrained( 163 | model_name_or_path, 164 | torch_dtype="auto", 165 | device_map="auto" 166 | ) 167 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 168 | 169 | prompt = "Give me a short introduction to large language models." 170 | messages = [ 171 | {"role": "system", "content": "You are a helpful assistant."}, 172 | {"role": "user", "content": prompt} 173 | ] 174 | text = tokenizer.apply_chat_template( 175 | messages, 176 | tokenize=False, 177 | add_generation_prompt=True 178 | ) 179 | model_inputs = tokenizer([text], return_tensors="pt").to(device) 180 | 181 | generated_ids = model.generate( 182 | model_inputs.input_ids, 183 | max_new_tokens=512 184 | ) 185 | generated_ids = [ 186 | output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) 187 | ] 188 | 189 | response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] 190 | ``` 191 | -------------------------------------------------------------------------------- /examples/llama-factory/qwen2-7b-full-sft.yaml: -------------------------------------------------------------------------------- 1 | ### model 2 | model_name_or_path: Qwen/Qwen2-7B-Instruct 3 | 4 | ### method 5 | stage: sft 6 | do_train: true 7 | finetuning_type: full 8 | deepspeed: PATH-TO-DS-CONFIG 9 | 10 | ### dataset 11 | dataset: qwen_train_data 12 | template: qwen 13 | cutoff_len: 1024 14 | overwrite_cache: true 15 | preprocessing_num_workers: 16 16 | 17 | ### output 18 | output_dir: saves/qwen2-7b/full/sft 19 | logging_steps: 10 20 | save_steps: 100 21 | plot_loss: true 22 | overwrite_output_dir: true 23 | 24 | ### train 25 | per_device_train_batch_size: 1 26 | gradient_accumulation_steps: 16 27 | learning_rate: 1.0e-5 28 | num_train_epochs: 1.0 29 | lr_scheduler_type: cosine 30 | warmup_ratio: 0.1 31 | bf16: true 32 | ddp_timeout: 180000000 33 | 34 | ### eval 35 | val_size: 0.1 36 | per_device_eval_batch_size: 1 37 | eval_strategy: steps 38 | eval_steps: 500 39 | -------------------------------------------------------------------------------- /examples/llama-factory/qwen2-7b-lora-sft.yaml: -------------------------------------------------------------------------------- 1 | ### model 2 | model_name_or_path: Qwen/Qwen2-7B-Instruct 3 | 4 | ### method 5 | stage: sft 6 | do_train: true 7 | finetuning_type: lora 8 | lora_target: all 9 | lora_rank: 16 10 | lora_alpha: 16 11 | lora_dropout: 0.05 12 | 13 | ### dataset 14 | dataset: qwen_train_data 15 | template: qwen 16 | cutoff_len: 1024 17 | overwrite_cache: true 18 | preprocessing_num_workers: 16 19 | 20 | ### output 21 | output_dir: saves/qwen2-7b/lora/sft 22 | logging_steps: 100 23 | save_steps: 100 24 | plot_loss: true 25 | overwrite_output_dir: true 26 | 27 | ### train 28 | per_device_train_batch_size: 1 29 | gradient_accumulation_steps: 16 30 | learning_rate: 1.0e-4 31 | num_train_epochs: 1.0 32 | lr_scheduler_type: cosine 33 | warmup_ratio: 0.1 34 | bf16: true 35 | ddp_timeout: 180000000 36 | 37 | ### eval 38 | val_size: 0.1 39 | per_device_eval_batch_size: 1 40 | eval_strategy: steps 41 | eval_steps: 500 42 | -------------------------------------------------------------------------------- /examples/llama-factory/qwen2-7b-merge-lora.yaml: -------------------------------------------------------------------------------- 1 | ### Note: DO NOT use quantized model or quantization_bit when merging lora adapters 2 | 3 | ### model 4 | model_name_or_path: 
Qwen/Qwen2-7B-Instruct 5 | adapter_name_or_path: PATH-TO-LORA 6 | template: qwen 7 | finetuning_type: lora 8 | 9 | ### export 10 | export_dir: models/qwen2-7b-sft-lora-merged 11 | export_size: 2 12 | export_device: cpu 13 | export_legacy_format: false -------------------------------------------------------------------------------- /examples/llama-factory/qwen2-7b-qlora-sft.yaml: -------------------------------------------------------------------------------- 1 | ### model 2 | model_name_or_path: Qwen/Qwen2-7B-Instruct 3 | 4 | ### method 5 | stage: sft 6 | do_train: true 7 | finetuning_type: lora 8 | lora_target: all 9 | quantization_bit: 4 10 | quantization_method: bitsandbytes # choices: [bitsandbytes (4/8), hqq (2/3/4/5/6/8), eetq (8)] 11 | lora_rank: 16 12 | lora_alpha: 16 13 | lora_dropout: 0.05 14 | 15 | ### dataset 16 | dataset: qwen_train_data 17 | template: qwen 18 | cutoff_len: 1024 19 | overwrite_cache: true 20 | preprocessing_num_workers: 16 21 | 22 | ### output 23 | output_dir: saves/qwen2-7b/qlora/sft 24 | logging_steps: 100 25 | save_steps: 100 26 | plot_loss: true 27 | overwrite_output_dir: true 28 | 29 | ### train 30 | per_device_train_batch_size: 1 31 | gradient_accumulation_steps: 16 32 | learning_rate: 1.0e-4 33 | num_train_epochs: 1.0 34 | lr_scheduler_type: cosine 35 | warmup_ratio: 0.1 36 | bf16: true 37 | ddp_timeout: 180000000 38 | 39 | ### eval 40 | val_size: 0.1 41 | per_device_eval_batch_size: 1 42 | eval_strategy: steps 43 | eval_steps: 500 44 | -------------------------------------------------------------------------------- /examples/speed-benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Speed Benchmark 2 | 3 | This document introduces the speed benchmark testing process for the Qwen2.5 series models (original and quantized models). For detailed reports, please refer to the [Qwen2.5 Speed Benchmark](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html). 4 | 5 | ## 1. Model Collections 6 | 7 | For models hosted on HuggingFace, refer to [Qwen2.5 Collections-HuggingFace](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e). 8 | 9 | For models hosted on ModelScope, refer to [Qwen2.5 Collections-ModelScope](https://modelscope.cn/collections/Qwen25-dbc4d30adb768). 10 | 11 | ## 2. Environment Setup 12 | 13 | 14 | For inference using HuggingFace transformers: 15 | 16 | ```shell 17 | conda create -n qwen_perf_transformers python=3.10 18 | conda activate qwen_perf_transformers 19 | 20 | pip install torch==2.3.1 21 | pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@v0.7.1 22 | pip install git+https://github.com/Dao-AILab/flash-attention.git@v2.5.8 23 | pip install -r requirements-perf-transformers.txt 24 | ``` 25 | 26 | > [!Important] 27 | > - For `flash-attention`, you can use the prebuilt wheels from [GitHub Releases](https://github.com/Dao-AILab/flash-attention/releases/tag/v2.5.8) or install from source, which requires a compatible CUDA compiler. 28 | > - You don't actually need to install `flash-attention`. It has been integrated into `torch` as a backend of `sdpa`. 29 | > - For `auto_gptq` to use efficient kernels, you need to install from source, because the prebuilt wheels require incompatible `torch` versions. Installing from source also requires a compatible CUDA compiler. 30 | > - For `autoawq` to use efficient kernels, you need `autoawq-kernels`, which should be automatically installed. If not, run `pip install autoawq-kernels`.
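If you rely on the built-in `sdpa` path instead of installing `flash-attention`, a quick sanity check such as the sketch below can confirm which attention backends your `torch` build exposes; the backend-query helpers are standard PyTorch APIs, but which kernel actually gets dispatched still depends on the GPU, dtype, and tensor shapes.

```python
import torch
import torch.nn.functional as F

# Report which scaled_dot_product_attention backends this torch build has enabled.
print("flash sdp enabled:        ", torch.backends.cuda.flash_sdp_enabled())
print("mem-efficient sdp enabled:", torch.backends.cuda.mem_efficient_sdp_enabled())
print("math sdp enabled:         ", torch.backends.cuda.math_sdp_enabled())

# A tiny half-precision attention call; SDPA dispatches to the fastest eligible kernel.
if torch.cuda.is_available():
    q = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.float16)
    out = F.scaled_dot_product_attention(q, q, q)
    print("sdpa output shape:", tuple(out.shape))
```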
31 | 32 | For inference using vLLM: 33 | 34 | ```shell 35 | conda create -n qwen_perf_vllm python=3.10 36 | conda activate qwen_perf_vllm 37 | 38 | pip install -r requirements-perf-vllm.txt 39 | ``` 40 | 41 | ## 3. Execute Tests 42 | 43 | Below are two methods for executing tests: using a script or the Speed Benchmark tool. 44 | 45 | ### Method 1: Testing with Speed Benchmark Tool 46 | 47 | Use the Speed Benchmark tool developed by [EvalScope](https://github.com/modelscope/evalscope), which supports automatic model downloads from ModelScope and outputs test results. It also allows testing by specifying the model service URL. For details, please refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/speed_benchmark.html). 48 | 49 | **Install Dependencies** 50 | ```shell 51 | pip install 'evalscope[perf]' -U 52 | ``` 53 | 54 | #### HuggingFace Transformers Inference 55 | 56 | Execute the command as follows: 57 | ```shell 58 | CUDA_VISIBLE_DEVICES=0 evalscope perf \ 59 | --parallel 1 \ 60 | --model Qwen/Qwen2.5-0.5B-Instruct \ 61 | --attn-implementation flash_attention_2 \ 62 | --log-every-n-query 5 \ 63 | --connect-timeout 6000 \ 64 | --read-timeout 6000 \ 65 | --max-tokens 2048 \ 66 | --min-tokens 2048 \ 67 | --api local \ 68 | --dataset speed_benchmark 69 | ``` 70 | 71 | #### vLLM Inference 72 | 73 | ```shell 74 | CUDA_VISIBLE_DEVICES=0 evalscope perf \ 75 | --parallel 1 \ 76 | --model Qwen/Qwen2.5-0.5B-Instruct \ 77 | --log-every-n-query 1 \ 78 | --connect-timeout 60000 \ 79 | --read-timeout 60000\ 80 | --max-tokens 2048 \ 81 | --min-tokens 2048 \ 82 | --api local_vllm \ 83 | --dataset speed_benchmark 84 | ``` 85 | 86 | #### Parameter Explanation 87 | - `--parallel` sets the number of worker threads for concurrent requests, should be fixed at 1. 88 | - `--model` specifies the model file path or model ID, supporting automatic downloads from ModelScope, e.g., Qwen/Qwen2.5-0.5B-Instruct. 89 | - `--attn-implementation` sets the attention implementation method, with optional values: flash_attention_2|eager|sdpa. 90 | - `--log-every-n-query`: sets how often to log every n requests. 91 | - `--connect-timeout`: sets the connection timeout in seconds. 92 | - `--read-timeout`: sets the read timeout in seconds. 93 | - `--max-tokens`: sets the maximum output length in tokens. 94 | - `--min-tokens`: sets the minimum output length in tokens; both parameters set to 2048 means the model will output a fixed length of 2048. 95 | - `--api`: sets the inference interface; local inference options are local|local_vllm. 96 | - `--dataset`: sets the test dataset; options are speed_benchmark|speed_benchmark_long. 97 | 98 | #### Test Results 99 | 100 | Test results can be found in the `outputs/{model_name}/{timestamp}/speed_benchmark.json` file, which contains all request results and test parameters. 
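To post-process the raw output programmatically, the results file can be loaded with a few lines of Python. The exact field layout is defined by EvalScope and may change between versions, so the sketch below only locates a result file (following the directory layout described above) and prints its top-level structure rather than assuming specific keys.

```python
import glob
import json

# Collect result files following the outputs/{model_name}/{timestamp}/ layout and take the last one.
result_files = sorted(glob.glob("outputs/*/*/speed_benchmark.json"))
assert result_files, "no speed_benchmark.json found under outputs/"

with open(result_files[-1], encoding="utf-8") as f:
    results = json.load(f)

# Inspect the structure before relying on any particular field.
if isinstance(results, dict):
    print("top-level keys:", list(results.keys()))
else:
    print("number of records:", len(results))
```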
101 | 102 | ### Method 2: Testing with Scripts 103 | 104 | #### HuggingFace Transformers Inference 105 | 106 | - Using HuggingFace Hub 107 | 108 | ```shell 109 | python speed_benchmark_transformers.py --model_id_or_path Qwen/Qwen2.5-0.5B-Instruct --context_length 1 --gpus 0 --outputs_dir outputs/transformers 110 | ``` 111 | 112 | - Using ModelScope Hub 113 | 114 | ```shell 115 | python speed_benchmark_transformers.py --model_id_or_path Qwen/Qwen2.5-0.5B-Instruct --context_length 1 --gpus 0 --use_modelscope --outputs_dir outputs/transformers 116 | ``` 117 | 118 | Parameter Explanation: 119 | 120 | `--model_id_or_path`: Model ID or local path, optional values refer to the `Model Resources` section 121 | `--context_length`: Input length in tokens; optional values are 1, 6144, 14336, 30720, 63488, 129024; refer to the `Qwen2.5 Model Efficiency Evaluation Report` for specifics 122 | `--generate_length`: Number of tokens to generate; default is 2048 123 | `--gpus`: Equivalent to the environment variable CUDA_VISIBLE_DEVICES, e.g., `0,1,2,3`, `4,5` 124 | `--use_modelscope`: If set, uses ModelScope to load the model; otherwise, uses HuggingFace 125 | `--outputs_dir`: Output directory, default is `outputs/transformers` 126 | 127 | #### vLLM Inference 128 | 129 | - Using HuggingFace Hub 130 | 131 | ```shell 132 | python speed_benchmark_vllm.py --model_id_or_path Qwen/Qwen2.5-0.5B-Instruct --context_length 1 --max_model_len 32768 --gpus 0 --gpu_memory_utilization 0.9 --outputs_dir outputs/vllm 133 | ``` 134 | 135 | - Using ModelScope Hub 136 | 137 | ```shell 138 | python speed_benchmark_vllm.py --model_id_or_path Qwen/Qwen2.5-0.5B-Instruct --context_length 1 --max_model_len 32768 --gpus 0 --use_modelscope --gpu_memory_utilization 0.9 --outputs_dir outputs/vllm 139 | ``` 140 | 141 | Parameter Explanation: 142 | 143 | `--model_id_or_path`: Model ID or local path, optional values refer to the `Model Resources` section 144 | `--context_length`: Input length in tokens; optional values are 1, 6144, 14336, 30720, 63488, 129024; refer to the `Qwen2.5 Model Efficiency Evaluation Report` for specifics 145 | `--generate_length`: Number of tokens to generate; default is 2048 146 | `--max_model_len`: Maximum model length in tokens; default is 32768 147 | `--gpus`: Equivalent to the environment variable CUDA_VISIBLE_DEVICES, e.g., `0,1,2,3`, `4,5` 148 | `--use_modelscope`: If set, uses ModelScope to load the model; otherwise, uses HuggingFace 149 | `--gpu_memory_utilization`: GPU memory utilization, range (0, 1]; default is 0.9 150 | `--outputs_dir`: Output directory, default is `outputs/vllm` 151 | `--enforce_eager`: Whether to enforce eager mode; default is False 152 | 153 | #### Test Results 154 | 155 | Test results can be found in the `outputs` directory, which by default includes two folders for `transformers` and `vllm`, storing test results for HuggingFace transformers and vLLM respectively. 156 | 157 | ## Notes 158 | 159 | 1. Conduct multiple tests and take the average, with a typical value of 3 tests. 160 | 2. Ensure the GPU is idle before testing to avoid interference from other tasks. -------------------------------------------------------------------------------- /examples/speed-benchmark/README_zh.md: -------------------------------------------------------------------------------- 1 | # 效率评估 2 | 3 | 本文介绍Qwen2.5系列模型(原始模型和量化模型)的效率测试流程,详细报告可参考 [Qwen2.5模型效率评估报告](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html)。 4 | 5 | ## 1. 
模型资源 6 | 7 | 对于托管在HuggingFace上的模型,可参考 [Qwen2.5模型-HuggingFace](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)。 8 | 9 | 对于托管在ModelScope上的模型,可参考 [Qwen2.5模型-ModelScope](https://modelscope.cn/collections/Qwen25-dbc4d30adb768)。 10 | 11 | 12 | ## 2. 环境安装 13 | 14 | 使用HuggingFace transformers推理,安装环境如下: 15 | 16 | ```shell 17 | conda create -n qwen_perf_transformers python=3.10 18 | conda activate qwen_perf_transformers 19 | 20 | pip install torch==2.3.1 21 | pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@v0.7.1 22 | pip install git+https://github.com/Dao-AILab/flash-attention.git@v2.5.8 23 | pip install -r requirements-perf-transformers.txt 24 | ``` 25 | 26 | > [!Important] 27 | > - 对于 `flash-attention`,您可以从 [GitHub 发布页面](https://github.com/Dao-AILab/flash-attention/releases/tag/v2.5.8) 使用预编译的 wheel 包进行安装,或者从源代码安装,后者需要一个兼容的 CUDA 编译器。 28 | > - 实际上,您并不需要单独安装 `flash-attention`。它已经被集成到了 `torch` 中作为 `sdpa` 的后端实现。 29 | > - 若要使 `auto_gptq` 使用高效的内核,您需要从源代码安装,因为预编译的 wheel 包依赖于与之不兼容的 `torch` 版本。从源代码安装同样需要一个兼容的 CUDA 编译器。 30 | > - 若要使 `autoawq` 使用高效的内核,您需要安装 `autoawq-kernels`,该组件应当会自动安装。如果未自动安装,请运行 `pip install autoawq-kernels` 进行手动安装。 31 | 32 | 33 | 使用vLLM推理,安装环境如下: 34 | 35 | ```shell 36 | conda create -n qwen_perf_vllm python=3.10 37 | conda activate qwen_perf_vllm 38 | 39 | pip install -r requirements-perf-vllm.txt 40 | ``` 41 | 42 | 43 | ## 3. 执行测试 44 | 45 | 下面介绍两种执行测试的方法,分别是使用脚本测试和使用Speed Benchmark工具进行测试。 46 | 47 | ### 方法1:使用Speed Benchmark工具测试 48 | 49 | 使用[EvalScope](https://github.com/modelscope/evalscope)开发的Speed Benchmark工具进行测试,支持自动从modelscope下载模型并输出测试结果,也支持指定模型服务的url进行测试,具体请参考[📖使用指南](https://evalscope.readthedocs.io/zh-cn/latest/user_guides/stress_test/speed_benchmark.html)。 50 | 51 | **安装依赖** 52 | ```shell 53 | pip install 'evalscope[perf]' -U 54 | ``` 55 | 56 | #### HuggingFace transformers推理 57 | 58 | 执行命令如下: 59 | ```shell 60 | CUDA_VISIBLE_DEVICES=0 evalscope perf \ 61 | --parallel 1 \ 62 | --model Qwen/Qwen2.5-0.5B-Instruct \ 63 | --attn-implementation flash_attention_2 \ 64 | --log-every-n-query 5 \ 65 | --connect-timeout 6000 \ 66 | --read-timeout 6000 \ 67 | --max-tokens 2048 \ 68 | --min-tokens 2048 \ 69 | --api local \ 70 | --dataset speed_benchmark 71 | ``` 72 | 73 | #### vLLM推理 74 | 75 | ```shell 76 | CUDA_VISIBLE_DEVICES=0 evalscope perf \ 77 | --parallel 1 \ 78 | --model Qwen/Qwen2.5-0.5B-Instruct \ 79 | --log-every-n-query 1 \ 80 | --connect-timeout 60000 \ 81 | --read-timeout 60000\ 82 | --max-tokens 2048 \ 83 | --min-tokens 2048 \ 84 | --api local_vllm \ 85 | --dataset speed_benchmark 86 | ``` 87 | 88 | #### 参数说明 89 | - `--parallel` 设置并发请求的worker数量,需固定为1。 90 | - `--model` 测试模型文件路径,也可为模型ID,支持自动从modelscope下载模型,例如Qwen/Qwen2.5-0.5B-Instruct。 91 | - `--attn-implementation` 设置attention实现方式,可选值为flash_attention_2|eager|sdpa。 92 | - `--log-every-n-query`: 设置每n个请求打印一次日志。 93 | - `--connect-timeout`: 设置连接超时时间,单位为秒。 94 | - `--read-timeout`: 设置读取超时时间,单位为秒。 95 | - `--max-tokens`: 设置最大输出长度,单位为token。 96 | - `--min-tokens`: 设置最小输出长度,单位为token;两个参数同时设置为2048则模型固定输出长度为2048。 97 | - `--api`: 设置推理接口,本地推理可选值为local|local_vllm。 98 | - `--dataset`: 设置测试数据集,可选值为speed_benchmark|speed_benchmark_long。 99 | 100 | #### 测试结果 101 | 102 | 测试结果详见`outputs/{model_name}/{timestamp}/speed_benchmark.json`文件,其中包含所有请求结果和测试参数。 103 | 104 | ### 方法2:使用脚本测试 105 | 106 | #### HuggingFace transformers推理 107 | 108 | - 使用HuggingFace hub 109 | 110 | ```shell 111 | python speed_benchmark_transformers.py --model_id_or_path Qwen/Qwen2.5-0.5B-Instruct --context_length 1 --gpus 0 --outputs_dir 
outputs/transformers 112 | 113 | # 指定HF_ENDPOINT 114 | HF_ENDPOINT=https://hf-mirror.com python speed_benchmark_transformers.py --model_id_or_path Qwen/Qwen2.5-0.5B-Instruct --context_length 1 --gpus 0 --outputs_dir outputs/transformers 115 | ``` 116 | 117 | - 使用ModelScope hub 118 | 119 | ```shell 120 | python speed_benchmark_transformers.py --model_id_or_path Qwen/Qwen2.5-0.5B-Instruct --context_length 1 --gpus 0 --use_modelscope --outputs_dir outputs/transformers 121 | ``` 122 | 123 | 参数说明: 124 | 125 | `--model_id_or_path`: 模型ID或本地路径, 可选值参考`模型资源`章节 126 | `--context_length`: 输入长度,单位为token数;可选值为1, 6144, 14336, 30720, 63488, 129024;具体可参考`Qwen2.5模型效率评估报告` 127 | `--generate_length`: 生成token数量;默认为2048 128 | `--gpus`: 等价于环境变量CUDA_VISIBLE_DEVICES,例如`0,1,2,3`,`4,5` 129 | `--use_modelscope`: 如果设置该值,则使用ModelScope加载模型,否则使用HuggingFace 130 | `--outputs_dir`: 输出目录, 默认为`outputs/transformers` 131 | 132 | 133 | #### vLLM推理 134 | 135 | - 使用HuggingFace hub 136 | 137 | ```shell 138 | python speed_benchmark_vllm.py --model_id_or_path Qwen/Qwen2.5-0.5B-Instruct --context_length 1 --max_model_len 32768 --gpus 0 --gpu_memory_utilization 0.9 --outputs_dir outputs/vllm 139 | 140 | # 指定HF_ENDPOINT 141 | HF_ENDPOINT=https://hf-mirror.com python speed_benchmark_vllm.py --model_id_or_path Qwen/Qwen2.5-0.5B-Instruct --context_length 1 --max_model_len 32768 --gpus 0 --gpu_memory_utilization 0.9 --outputs_dir outputs/vllm 142 | ``` 143 | 144 | - 使用ModelScope hub 145 | 146 | ```shell 147 | python speed_benchmark_vllm.py --model_id_or_path Qwen/Qwen2.5-0.5B-Instruct --context_length 1 --max_model_len 32768 --gpus 0 --use_modelscope --gpu_memory_utilization 0.9 --outputs_dir outputs/vllm 148 | ``` 149 | 150 | 参数说明: 151 | 152 | `--model_id_or_path`: 模型ID或本地路径, 可选值参考`模型资源`章节 153 | `--context_length`: 输入长度,单位为token数;可选值为1, 6144, 14336, 30720, 63488, 129024;具体可参考`Qwen2.5模型效率评估报告` 154 | `--generate_length`: 生成token数量;默认为2048 155 | `--max_model_len`: 模型最大长度,单位为token数;默认为32768 156 | `--gpus`: 等价于环境变量CUDA_VISIBLE_DEVICES,例如`0,1,2,3`,`4,5` 157 | `--use_modelscope`: 如果设置该值,则使用ModelScope加载模型,否则使用HuggingFace 158 | `--gpu_memory_utilization`: GPU内存利用率,取值范围为(0, 1];默认为0.9 159 | `--outputs_dir`: 输出目录, 默认为`outputs/vllm` 160 | `--enforce_eager`: 是否强制使用eager模式;默认为False 161 | 162 | #### 测试结果 163 | 164 | 测试结果详见`outputs`目录下的文件,默认包括`transformers`和`vllm`两个目录,分别存放HuggingFace transformers和vLLM的测试结果。 165 | 166 | ## 注意事项 167 | 168 | 1. 多次测试,取平均值,典型值为3次 169 | 2. 
测试前请确保GPU处于空闲状态,避免其他任务影响测试结果 170 | 171 | 172 | -------------------------------------------------------------------------------- /examples/speed-benchmark/requirements-perf-transformers.txt: -------------------------------------------------------------------------------- 1 | # Note: install the following requirements separately 2 | # pip install torch==2.3.1 3 | # pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@v0.7.1 4 | # pip install git+https://github.com/Dao-AILab/flash-attention.git@v2.5.8 5 | 6 | transformers==4.46.0 7 | autoawq==0.2.6 8 | modelscope[framework] 9 | accelerate 10 | optimum>=1.20.0 11 | -------------------------------------------------------------------------------- /examples/speed-benchmark/requirements-perf-vllm.txt: -------------------------------------------------------------------------------- 1 | vllm==0.6.3.post1 2 | torch==2.4.0 3 | modelscope[framework] 4 | accelerate 5 | -------------------------------------------------------------------------------- /examples/speed-benchmark/speed_benchmark_transformers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba Cloud. 2 | # 3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ 7 | Qwen2.5 Speed Benchmark for transformers(pt) inference. 8 | """ 9 | 10 | import os 11 | import time 12 | import json 13 | import csv 14 | 15 | import torch 16 | from transformers.trainer_utils import set_seed 17 | 18 | 19 | class SpeedBenchmarkTransformers: 20 | 21 | SEED = 1024 22 | BATCH_SIZE = 1 23 | USE_FLASH_ATTN = True 24 | COMMENT = 'default' 25 | DEVICE_MAP = 'auto' 26 | TORCH_DTYPE = 'auto' 27 | OVERWRITE_RESULT = False 28 | DUMMY_INPUT = '我' 29 | 30 | def __init__(self, model_id_or_path, use_modelscope: bool = True, outputs_dir: str = 'outputs/transformers'): 31 | """ 32 | Speed benchmark for transformers(pt) inference. 33 | 34 | Args: 35 | model_id_or_path: The model id on ModelScope or HuggingFace hub, or local model path. 36 | use_modelscope: Use ModelScope, otherwise HuggingFace. 37 | outputs_dir: The output directory. Default is 'outputs/transformers'.
38 | """ 39 | 40 | set_seed(self.SEED) 41 | self.model_id_or_path = model_id_or_path 42 | self.outputs_dir = outputs_dir 43 | 44 | if use_modelscope: 45 | from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig 46 | else: 47 | from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig 48 | 49 | self.tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=True) 50 | attn_impl = 'flash_attention_2' if self.USE_FLASH_ATTN else 'eager' 51 | self.model = AutoModelForCausalLM.from_pretrained(model_id_or_path, 52 | torch_dtype=self.TORCH_DTYPE, 53 | device_map=self.DEVICE_MAP, 54 | attn_implementation=attn_impl 55 | ).eval() 56 | 57 | self.generation_config = GenerationConfig.from_pretrained(model_id_or_path, trust_remote_code=True) 58 | 59 | def run(self, context_length: int, generate_length: int) -> str: 60 | 61 | # Specify hyperparameters for generation 62 | self.generation_config.min_length = generate_length + context_length 63 | self.generation_config.max_new_tokens = generate_length 64 | print(f'Generation config: {self.generation_config}') 65 | 66 | # Prepare inputs 67 | batch_size = self.BATCH_SIZE 68 | context_str = self.DUMMY_INPUT * context_length 69 | inputs = self.tokenizer([context_str for _ in range(batch_size)], return_tensors='pt') 70 | assert inputs['input_ids'].shape[1] == context_length 71 | assert inputs['input_ids'].shape[0] == batch_size 72 | inputs = inputs.to(self.model.device) 73 | 74 | # Run inference 75 | print(f'Start running inference for model {self.model_id_or_path} with input length {context_length} ...') 76 | start_time = time.time() 77 | torch.cuda.synchronize() 78 | pred = self.model.generate(**inputs, generation_config=self.generation_config) 79 | torch.cuda.synchronize() 80 | time_cost = time.time() - start_time 81 | assert pred.shape[1] == self.generation_config.min_length 82 | m = 0 83 | max_gpu_memory_cost = 0 84 | for i in range(torch.cuda.device_count()): 85 | m += torch.cuda.max_memory_allocated(i) 86 | max_gpu_memory_cost = max(max_gpu_memory_cost, m) 87 | torch.cuda.empty_cache() 88 | 89 | # Prepare results 90 | tokens_per_second: float = generate_length / time_cost 91 | # Compute the maximum GPU memory cost (in GB) 92 | max_gpu_memory_cost_gb = max_gpu_memory_cost / 1024 / 1024 / 1024 93 | 94 | data = { 95 | "model_id_or_path": self.model_id_or_path, 96 | "batch_size": batch_size, 97 | "context_length_per_experiment": context_length, 98 | "generate_length_per_experiment": generate_length, 99 | "use_flash_attn": self.USE_FLASH_ATTN, 100 | "comment": self.COMMENT, 101 | "tokens_per_second": round(tokens_per_second, 4), 102 | "max_gpu_memory_cost_gb": round(max_gpu_memory_cost_gb, 4), 103 | } 104 | data_json = json.dumps(data, indent=4, ensure_ascii=False) 105 | print(f'**Final result **\n{data_json}\n') 106 | 107 | # Dump results to CSV file 108 | from datetime import datetime 109 | now = datetime.now() 110 | timestamp: str = now.strftime("%m%d%H%M%S") 111 | 112 | model_id_or_path_str = self.model_id_or_path.split(os.sep)[-1] \ 113 | if os.path.isdir(self.model_id_or_path) else self.model_id_or_path.replace('/', '__') 114 | 115 | out_file: str = os.path.join(self.outputs_dir, 116 | f"{model_id_or_path_str}" 117 | f"_context_length-{context_length}_{timestamp}.csv") 118 | out_dir = os.path.dirname(out_file) 119 | os.makedirs(out_dir, exist_ok=True) 120 | self.save_result(data, out_file) 121 | 122 | return out_file 123 | 124 | @staticmethod 125 | def save_result(data: dict, out_file: str) -> 
None: 126 | 127 | with open(out_file, mode='w') as file: 128 | writer = csv.DictWriter(file, fieldnames=data.keys()) 129 | writer.writeheader() 130 | writer.writerows([data]) 131 | 132 | print(f"Results saved to {out_file}") 133 | 134 | 135 | def main(): 136 | 137 | import argparse 138 | 139 | # Parse args 140 | parser = argparse.ArgumentParser(description='Speed benchmark for transformers(pt) deployment') 141 | parser.add_argument('--model_id_or_path', type=str, help='The model path or id on ModelScope or HuggingFace hub') 142 | parser.add_argument('--context_length', type=int, help='The input length for each experiment.' 143 | 'e.g. 1, 6144, 14336, 30720, 63488, 129024') 144 | parser.add_argument('--generate_length', type=int, default=2048, help='Output length in tokens; default is 2048.') 145 | parser.add_argument('--gpus', type=str, help='Equivalent to the env CUDA_VISIBLE_DEVICES. e.g. `0,1,2,3`, `4,5`') 146 | parser.add_argument('--use_modelscope', action='store_true', 147 | help='Use ModelScope when set this flag. Otherwise, use HuggingFace.') 148 | parser.add_argument('--outputs_dir', type=str, default='outputs/transformers', help='The output directory') 149 | 150 | args = parser.parse_args() 151 | 152 | model_id_or_path: str = args.model_id_or_path 153 | envs: str = args.gpus 154 | context_length: int = args.context_length 155 | generate_length: int = args.generate_length 156 | use_modelscope: bool = args.use_modelscope 157 | outputs_dir: str = args.outputs_dir 158 | 159 | print(f'Set CUDA_VISIBLE_DEVICES={envs} for model {model_id_or_path} with input_length {context_length}') 160 | os.environ["CUDA_VISIBLE_DEVICES"] = envs 161 | 162 | speed_benchmark = SpeedBenchmarkTransformers(model_id_or_path=model_id_or_path, 163 | use_modelscope=use_modelscope, 164 | outputs_dir=outputs_dir) 165 | speed_benchmark.run(context_length=context_length, generate_length=generate_length) 166 | 167 | 168 | if __name__ == '__main__': 169 | # Usage: python speed_benchmark_transformers.py --model_id_or_path Qwen/Qwen2.5-0.5B-Instruct --context_length 1 --gpus 0 --use_modelscope --outputs_dir outputs/transformers 170 | main() 171 | --------------------------------------------------------------------------------