├── .dockerignore
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.yaml
│   │   ├── config.yaml
│   │   └── feature_request.yaml
│   └── workflows
│       └── stale.yml
├── .gitignore
├── FAQ.md
├── FAQ_ja.md
├── FAQ_zh.md
├── LICENSE
├── NOTICE
├── QWEN_TECHNICAL_REPORT.pdf
├── README.md
├── README_CN.md
├── README_ES.md
├── README_FR.md
├── README_JA.md
├── Tongyi Qianwen LICENSE AGREEMENT
├── Tongyi Qianwen RESEARCH LICENSE AGREEMENT
├── ascend-support
│   ├── README.md
│   └── docker_qwen.sh
├── assets
│   ├── cli_demo.gif
│   ├── code_interpreter_showcase_001.jpg
│   ├── hfagent_chat_1.png
│   ├── hfagent_chat_2.png
│   ├── hfagent_run.png
│   ├── logo.jpg
│   ├── openai_api.gif
│   ├── performance.png
│   ├── qwen_72b_needle_in_a_haystack.png
│   ├── qwen_tokenizer.png
│   ├── radar_14b.jpg
│   ├── radar_72b.jpg
│   ├── react_showcase_001.png
│   ├── react_showcase_002.png
│   ├── react_tutorial_001.png
│   ├── react_tutorial_002.png
│   ├── system_prompt_behavior_setting.png
│   ├── system_prompt_behavior_setting_en.png
│   ├── system_prompt_language_style.png
│   ├── system_prompt_language_style_en.png
│   ├── system_prompt_role_play.png
│   ├── system_prompt_role_play_en.png
│   ├── system_prompt_task_setting.png
│   ├── system_prompt_task_setting_en.png
│   ├── tokenizer.pdf
│   ├── tokenizer.png
│   ├── wanx_colorful_black.png
│   ├── web_demo.gif
│   └── wechat.png
├── cli_demo.py
├── dcu-support
│   ├── README.md
│   ├── cli_demo.py
│   ├── cli_demo_batch.py
│   ├── model.properties
│   ├── package
│   │   ├── fastllm_pytools
│   │   │   ├── __init__.py
│   │   │   ├── hf_model.py
│   │   │   ├── llm.py
│   │   │   └── torch2flm.py
│   │   └── setup.py
│   ├── qwen2flm.py
│   ├── requirements.txt
│   └── web_demo.py
├── docker
│   ├── Dockerfile
│   ├── Dockerfile-cu114
│   ├── Dockerfile-cu121
│   ├── docker_cli_demo.sh
│   ├── docker_openai_api.sh
│   └── docker_web_demo.sh
├── eval
│   ├── EVALUATION.md
│   ├── evaluate_ceval.py
│   ├── evaluate_chat_ceval.py
│   ├── evaluate_chat_gsm8k.py
│   ├── evaluate_chat_humaneval.py
│   ├── evaluate_chat_mmlu.py
│   ├── evaluate_cmmlu.py
│   ├── evaluate_gsm8k.py
│   ├── evaluate_humaneval.py
│   ├── evaluate_mmlu.py
│   ├── evaluate_plugin.py
│   └── gsm8k_prompt.txt
├── examples
│   ├── add_merges.py
│   ├── auto_comments.md
│   ├── auto_comments.py
│   ├── function_call_examples.py
│   ├── function_call_finetune_examples.py
│   ├── langchain_tooluse.ipynb
│   ├── qwen_extra.tiktoken
│   ├── qwen_extra_vocab.txt
│   ├── react_demo.py
│   ├── react_prompt.md
│   ├── system_prompt.md
│   ├── tokenizer_showcase.ipynb
│   ├── transformers_agent.md
│   └── vllm_wrapper.py
├── finetune.py
├── finetune
│   ├── ds_config_zero2.json
│   ├── ds_config_zero3.json
│   ├── finetune_ds.sh
│   ├── finetune_lora_ds.sh
│   ├── finetune_lora_single_gpu.sh
│   ├── finetune_qlora_ds.sh
│   └── finetune_qlora_single_gpu.sh
├── openai_api.py
├── recipes
│   ├── applications
│   │   ├── chatbot
│   │   │   └── qwen_chatbot.ipynb
│   │   ├── domain_finetune
│   │   │   └── qwen_domain_finetune.ipynb
│   │   └── retrieval
│   │       └── retrieval.ipynb
│   ├── finetune
│   │   ├── ascend
│   │   │   └── README.md
│   │   ├── deepspeed
│   │   │   ├── finetune_fullparameter_multi_gpu.ipynb
│   │   │   ├── finetune_fullparameter_single_gpu.ipynb
│   │   │   ├── finetune_lora_multi_gpu.ipynb
│   │   │   ├── finetune_lora_single_gpu.ipynb
│   │   │   ├── finetune_qlora_multi_gpu.ipynb
│   │   │   ├── finetune_qlora_single_gpu.ipynb
│   │   │   ├── readme.md
│   │   │   └── requirements.txt
│   │   └── swift
│   │       ├── README.md
│   │       └── README_CN.md
│   ├── inference
│   │   ├── dashscope
│   │   │   └── README.md
│   │   ├── hf_modelscope
│   │   │   └── README.md
│   │   ├── quantization
│   │   │   └── README.md
│   │   ├── tensorrt
│   │   │   ├── README.md
│   │   │   └── docker
│   │   │       └── Dockerfile
│   │   └── vllm
│   │       ├── README.md
│   │       ├── template_chatml.jinja
│   │       └── vllm_wrapper.py
│   ├── quickstart
│   │   └── qwen.ipynb
│   └── tests
│       ├── README.md
│       ├── __init__.py
│       ├── assets
│       │   └── test_sampled_qwen.json
│       ├── test_finetune
│       │   └── test_finetune_ds.py
│       ├── test_inference
│       │   ├── test_inference_api.py
│       │   └── test_inference_vllm_fschat.py
│       ├── ut_config.py
│       └── utils.py
├── requirements.txt
├── requirements_web_demo.txt
├── run_gptq.py
├── tech_memo.md
├── tokenization_note.md
├── tokenization_note_ja.md
├── tokenization_note_zh.md
├── utils.py
└── web_demo.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.so
3 | build
4 | .coverage_*
5 | *.egg-info
6 | *~
7 | .vscode/
8 | .idea/
9 | .git/
10 | .github/
11 | .DS_Store
12 |
13 | /private/
14 | /README-docker.md
15 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.yaml:
--------------------------------------------------------------------------------
1 | name: 🐞 Bug
2 | description: 提交错误报告 | File a bug/issue
3 | title: "[BUG]
"
4 | labels: []
5 | body:
6 | - type: checkboxes
7 | attributes:
8 | label: 是否已有关于该错误的issue或讨论? | Is there an existing issue / discussion for this?
9 | description: |
10 | 请先搜索您遇到的错误是否在已有的issues或讨论中提到过。
11 | Please search to see if an issue / discussion already exists for the bug you encountered.
12 | [Issues](https://github.com/QwenLM/Qwen-7B/issues)
13 | [Discussions](https://github.com/QwenLM/Qwen-7B/discussions)
14 | options:
15 | - label: 我已经搜索过已有的issues和讨论 | I have searched the existing issues / discussions
16 | required: true
17 | - type: checkboxes
18 | attributes:
19 | label: 该问题是否在FAQ中有解答? | Is there an existing answer for this in FAQ?
20 | description: |
21 | 请先搜索您遇到的错误是否已在FAQ中有相关解答。
22 | Please search to see if an answer already exists in FAQ for the bug you encountered.
23 | [FAQ-en](https://github.com/QwenLM/Qwen-7B/blob/main/FAQ.md)
24 | [FAQ-zh](https://github.com/QwenLM/Qwen-7B/blob/main/FAQ_zh.md)
25 | options:
26 | - label: 我已经搜索过FAQ | I have searched FAQ
27 | required: true
28 | - type: textarea
29 | attributes:
30 | label: 当前行为 | Current Behavior
31 | description: |
32 | 准确描述遇到的行为。
33 | A concise description of what you're experiencing.
34 | validations:
35 | required: false
36 | - type: textarea
37 | attributes:
38 | label: 期望行为 | Expected Behavior
39 | description: |
40 | 准确描述预期的行为。
41 | A concise description of what you expected to happen.
42 | validations:
43 | required: false
44 | - type: textarea
45 | attributes:
46 | label: 复现方法 | Steps To Reproduce
47 | description: |
48 | 复现当前行为的详细步骤。
49 | Steps to reproduce the behavior.
50 | placeholder: |
51 | 1. In this environment...
52 | 2. With this config...
53 | 3. Run '...'
54 | 4. See error...
55 | validations:
56 | required: false
57 | - type: textarea
58 | attributes:
59 | label: 运行环境 | Environment
60 | description: |
61 | examples:
62 | - **OS**: Ubuntu 20.04
63 | - **Python**: 3.8
64 | - **Transformers**: 4.31.0
65 | - **PyTorch**: 2.0.1
66 | - **CUDA**: 11.4
67 | value: |
68 | - OS:
69 | - Python:
70 | - Transformers:
71 | - PyTorch:
72 | - CUDA (`python -c 'import torch; print(torch.version.cuda)'`):
73 | render: Markdown
74 | validations:
75 | required: false
76 | - type: textarea
77 | attributes:
78 | label: 备注 | Anything else?
79 | description: |
80 | 您可以在这里补充其他关于该问题背景信息的描述、链接或引用等。
81 |
82 | 您可以通过点击高亮此区域然后拖动文件的方式上传图片或日志文件。
83 |
84 | Links? References? Anything that will give us more context about the issue you are encountering!
85 |
86 | Tip: You can attach images or log files by clicking this area to highlight it and then dragging files in.
87 | validations:
88 | required: false
89 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yaml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: true
2 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.yaml:
--------------------------------------------------------------------------------
1 | name: "💡 Feature Request"
2 | description: 创建新功能请求 | Create a new ticket for a new feature request
3 | title: "💡 [REQUEST] - "
4 | labels: [
5 | "question"
6 | ]
7 | body:
8 | - type: input
9 | id: start_date
10 | attributes:
11 | label: "起始日期 | Start Date"
12 | description: |
13 | 起始开发日期
14 | Start of development
15 | placeholder: "month/day/year"
16 | validations:
17 | required: false
18 | - type: textarea
19 | id: implementation_pr
20 | attributes:
21 | label: "实现PR | Implementation PR"
22 | description: |
23 | 实现该功能的Pull request
24 | Pull request used
25 | placeholder: "#Pull Request ID"
26 | validations:
27 | required: false
28 | - type: textarea
29 | id: reference_issues
30 | attributes:
31 | label: "相关Issues | Reference Issues"
32 | description: |
33 | 与该功能相关的issues
34 | Common issues
35 | placeholder: "#Issues IDs"
36 | validations:
37 | required: false
38 | - type: textarea
39 | id: summary
40 | attributes:
41 | label: "摘要 | Summary"
42 | description: |
43 | 简要描述新功能的特点
44 | Provide a brief explanation of the feature
45 | placeholder: |
46 | Describe in a few lines your feature request
47 | validations:
48 | required: true
49 | - type: textarea
50 | id: basic_example
51 | attributes:
52 | label: "基本示例 | Basic Example"
53 | description: Indicate here some basic examples of your feature.
54 | placeholder: A few specific words about your feature request.
55 | validations:
56 | required: true
57 | - type: textarea
58 | id: drawbacks
59 | attributes:
60 | label: "缺陷 | Drawbacks"
61 | description: |
62 | 该新功能有哪些缺陷/可能造成哪些影响?
63 | What are the drawbacks/impacts of your feature request ?
64 | placeholder: |
65 | Identify the drawbacks and impacts while being neutral on your feature request
66 | validations:
67 | required: true
68 | - type: textarea
69 | id: unresolved_question
70 | attributes:
71 | label: "未解决问题 | Unresolved questions"
72 | description: |
73 | 有哪些尚未解决的问题?
74 | What questions still remain unresolved ?
75 | placeholder: |
76 | Identify any unresolved issues.
77 | validations:
78 | required: false
--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
1 | name: Close stale issues
2 | on:
3 | schedule:
4 | - cron: "0 8 * * *"
5 |
6 | jobs:
7 | close-issues:
8 | runs-on: ubuntu-latest
9 | permissions:
10 | actions: write
11 | issues: write
12 | steps:
13 | - uses: actions/stale@v9
14 | with:
15 | days-before-issue-stale: 30
16 | days-before-issue-close: 7
17 | stale-issue-label: inactive
18 | stale-issue-message: This issue has been automatically marked as inactive due to lack of recent activity.
19 | Should you believe it remains unresolved and warrants attention, kindly leave a comment on this thread.
20 |
21 | 此问题由于长期未有新进展而被系统自动标记为不活跃。如果您认为它仍有待解决,请在此帖下方留言以补充信息。
22 | days-before-pr-stale: -1
23 | days-before-pr-close: -1
24 | operations-per-run: 128
25 | repo-token: ${{ secrets.GITHUB_TOKEN }}
26 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.so
3 | build
4 | .coverage_*
5 | *.egg-info
6 | *~
7 | .vscode/
8 | .idea/
9 | .DS_Store
10 |
11 | /private/
12 |
--------------------------------------------------------------------------------
/FAQ.md:
--------------------------------------------------------------------------------
1 | # FAQ
2 |
3 | ## Installation & Environment
4 |
5 | #### Failure in installing flash attention
6 |
7 | Flash attention is an optional way to accelerate training and inference. Only NVIDIA GPUs with the Turing, Ampere, Ada, or Hopper architectures (e.g., H100, A100, RTX 3090, T4, RTX 2080) support flash attention. **You can use our models without installing it.**
8 |
9 | #### Which version of transformers should I use?
10 |
11 | 4.32.0 is preferred.
12 |
13 | #### I downloaded the codes and checkpoints but I can't load the model locally. What should I do?
14 |
15 | Please check if you have updated the code to the latest, and correctly downloaded all the sharded checkpoint files.
16 |
17 | #### `qwen.tiktoken` is not found. What is it?
18 |
19 | This is the merge file of the tokenizer. You have to download it. Note that if you just git clone the repo without [git-lfs](https://git-lfs.com), you cannot download this file.
20 |
21 | #### transformers_stream_generator/tiktoken/accelerate not found
22 |
23 | Run the command `pip install -r requirements.txt`. You can find the file at [https://github.com/QwenLM/Qwen/blob/main/requirements.txt](https://github.com/QwenLM/Qwen/blob/main/requirements.txt).
24 |
25 |
26 |
27 |
28 | ## Demo & Inference
29 |
30 | #### Is there any demo? CLI demo and Web UI demo?
31 |
32 | Yes, see `web_demo.py` for the web demo and `cli_demo.py` for the CLI demo. See the README for more information.
33 |
34 |
35 | #### Can I use CPU only?
36 |
37 | Yes, running `python cli_demo.py --cpu-only` will load the model and run inference on CPU only.
38 |
39 | #### Can Qwen support streaming?
40 |
41 | Yes. See the function `chat_stream` in `modeling_qwen.py`.
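
For reference, a minimal streaming sketch (it assumes the `Qwen/Qwen-7B-Chat` checkpoint name and the `chat_stream` interface loaded via `trust_remote_code`; adjust the path to your local checkpoint if needed):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True
).eval()

# chat_stream yields the growing partial response, so print only the new suffix each time.
printed = ""
for partial in model.chat_stream(tokenizer, "Tell me something about large language models.", history=[]):
    print(partial[len(printed):], end="", flush=True)
    printed = partial
print()
```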
42 |
43 | #### Gibberish in result when using chat_stream().
44 |
45 | This is because tokens represent bytes and a single token may be a meaningless string. We have updated the default setting of our tokenizer to avoid such decoding results. Please update the code to the latest version.
46 |
47 | #### It seems that the generation is not related to the instruction...
48 |
49 | Please check if you are loading Qwen-Chat instead of Qwen. Qwen is the base model without alignment, which behaves differently from the SFT/Chat model.
50 |
51 | #### Is quantization supported?
52 |
53 | Yes, quantization is supported via AutoGPTQ.
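
A minimal sketch of loading the released Int4 checkpoint (this assumes the `Qwen/Qwen-7B-Chat-Int4` model name and that `auto-gptq` and `optimum` are installed):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# GPTQ-quantized chat model; auto-gptq and optimum must be installed for it to load.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat-Int4", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B-Chat-Int4", device_map="auto", trust_remote_code=True
).eval()

response, history = model.chat(tokenizer, "Hi, who are you?", history=None)
print(response)
```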
54 |
55 |
56 | #### Slow when processing long sequences
57 |
58 | Updating the code to the latest version can help.
59 |
60 | #### Unsatisfactory performance in processing long sequences
61 |
62 | Please ensure that NTK is applied. `use_dynamic_ntk` and `use_logn_attn` in `config.json` should be set to `true` (`true` by default).
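
A quick way to check the flags (a sketch, assuming the checkpoint's `config.json` exposes these fields as in the released models):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
# Both should print True for good long-sequence behavior.
print(config.use_dynamic_ntk, config.use_logn_attn)
```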
63 |
64 |
65 |
66 |
67 | ## Finetuning
68 |
69 | #### Can Qwen support SFT or even RLHF?
70 |
71 | Yes, we now support SFT, including full-parameter finetuning, LoRA, and Q-LoRA. You can also check other projects like [FastChat](https://github.com/lm-sys/FastChat), [Firefly](https://github.com/yangjianxin1/Firefly), [LLaMA Efficient Tuning](https://github.com/hiyouga/LLaMA-Efficient-Tuning), etc.
72 |
73 | However, RLHF is not supported yet. We will provide the code in the near future.
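
For SFT with the scripts under `finetune/`, the training data is a JSON list of conversations. A minimal sketch of writing such a file (the field names follow the conversation format expected by `finetune.py`; the sample content is purely illustrative):

```python
import json

# One training sample: an id plus alternating user/assistant turns.
data = [
    {
        "id": "identity_0",
        "conversations": [
            {"from": "user", "value": "What is the capital of France?"},
            {"from": "assistant", "value": "The capital of France is Paris."},
        ],
    }
]

with open("example_sft_data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)
```

Pass the resulting file to the finetuning scripts via their data argument (see the scripts in `finetune/` for the exact flags).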
74 |
75 |
76 |
77 |
78 | ## Tokenizer
79 |
80 | #### bos_id/eos_id/pad_id not found
81 |
82 | In our training, we only use `<|endoftext|>` as the separator and padding token. You can set `bos_id`, `eos_id`, and `pad_id` to `tokenizer.eod_id`. Learn more in our documentation on the tokenizer.
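
For example (a sketch; `eod_id` is the id of `<|endoftext|>` in the Qwen tokenizer):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)

# <|endoftext|> doubles as the separator and padding token during training.
tokenizer.pad_token_id = tokenizer.eod_id

# Wherever an eos/bos id is required (e.g., at generation time), pass eod_id directly:
# model.generate(..., eos_token_id=tokenizer.eod_id, pad_token_id=tokenizer.eod_id)
```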
83 |
84 |
85 |
86 | ## Docker
87 |
88 | #### Downloading the official docker image is very slow
89 |
90 | When downloading our official docker image, you may have a slow download speed due to some network issues. You can refer to [Alibaba Cloud Container Image Service](https://help.aliyun.com/zh/acr/user-guide/accelerate-the-pulls-of-docker-official-images) to accelerate the download of official images.
91 |
--------------------------------------------------------------------------------
/FAQ_ja.md:
--------------------------------------------------------------------------------
1 | # FAQ
2 |
3 | ## インストールと環境
4 |
5 | #### Flash attention 導入の失敗例
6 |
7 | Flash attention は、トレーニングと推論を加速するオプションです。H100、A100、RTX 3090、T4、RTX 2080 などの Turing、Ampere、Ada、および Hopper アーキテクチャの NVIDIA GPU だけが、flash attention をサポートできます。それをインストールせずに私たちのモデルを使用することができます。
8 |
9 | #### transformers のバージョンは?
10 |
11 | 4.32.0 が望ましいです。
12 |
13 | #### コードとチェックポイントをダウンロードしましたが、モデルをローカルにロードできません。どうすればよいでしょうか?
14 |
15 | コードを最新のものに更新し、すべてのシャードされたチェックポイントファイルを正しくダウンロードしたかどうか確認してください。
16 |
17 | #### `qwen.tiktoken` が見つかりません。これは何ですか?
18 |
19 | これはトークナイザーのマージファイルです。ダウンロードする必要があります。[git-lfs](https://git-lfs.com) を使わずにリポジトリを git clone しただけでは、このファイルをダウンロードできないことに注意してください。
20 |
21 | #### transformers_stream_generator/tiktoken/accelerate が見つかりません。
22 |
23 | コマンド `pip install -r requirements.txt` を実行してください。このファイルは [https://github.com/QwenLM/Qwen/blob/main/requirements.txt](https://github.com/QwenLM/Qwen/blob/main/requirements.txt) にあります。
24 |
25 |
26 |
27 |
28 | ## デモと推論
29 |
30 | #### デモはありますか?CLI と Web UI のデモはありますか?
31 |
32 | はい、Web デモは `web_demo.py` を、CLI デモは `cli_demo.py` を参照してください。詳しくは README を参照してください。
33 |
34 |
35 |
36 | #### CPU のみを使うことはできますか?
37 |
38 | はい、`python cli_demo.py --cpu-only` を実行すると、CPU のみでモデルと推論をロードします。
39 |
40 | #### Qwen はストリーミングに対応していますか?
41 |
42 | `modeling_qwen.py` の `chat_stream` 関数を参照してください。
43 |
44 | #### chat_stream() を使用すると、結果に文字化けが発生します。
45 |
46 | これは、トークンがバイトを表し、単一のトークンが無意味な文字列である可能性があるためです。このようなデコード結果を避けるため、トークナイザのデフォルト設定を更新しました。コードを最新版に更新してください。
47 |
48 | #### インストラクションとは関係ないようですが...
49 |
50 | Qwen ではなく Qwen-Chat を読み込んでいないか確認してください。Qwen はアライメントなしのベースモデルで、SFT/Chat モデルとは挙動が異なります。
51 |
52 | #### 量子化はサポートされていますか?
53 |
54 | はい、量子化は AutoGPTQ でサポートされています。
55 |
56 |
57 | #### 長いシーケンスの処理に時間がかかる
58 |
59 | コードを最新版に更新することで解決します。
60 |
61 | #### 長いシーケンスの処理で不満足なパフォーマンス
62 |
63 | NTK が適用されていることを確認してください。`config.json` の `use_dynamic_ntk` と `use_logn_attn` を `true` に設定する必要があります(デフォルトでは `true`)。
64 |
65 |
66 |
67 |
68 | ## ファインチューニング
69 |
70 | #### Qwen は SFT、あるいは RLHF に対応できますか?
71 |
72 | SFTのコードは提供します。[FastChat](https://github.com/lm-sys/FastChat)、[Firefly](https://github.com/yangjianxin1/Firefly)、[LLaMA Efficient Tuning](https://github.com/hiyouga/LLaMA-Efficient-Tuning)など、いくつかのプロジェクトではファインチューニングをサポートしています。近日中に関連コードを更新する予定です。
73 |
74 |
75 |
76 |
77 | ## トークナイザー
78 |
79 | #### bos_id/eos_id/pad_id が見つかりません。
80 |
81 | 私たちのトレーニングでは、セパレータとパディングトークンとして `<|endoftext|>` のみを使用しています。bos_id、eos_id、pad_id は tokenizer.eod_id に設定できます。私たちのトークナイザーについて詳しくは、トークナイザーについてのドキュメントをご覧ください。
82 |
83 |
--------------------------------------------------------------------------------
/FAQ_zh.md:
--------------------------------------------------------------------------------
1 | # FAQ
2 |
3 | ## 安装&环境
4 |
5 | #### flash attention 安装失败
6 |
7 | flash attention是一个用于加速模型训练推理的可选项,且仅适用于Turing、Ampere、Ada、Hopper架构的Nvidia GPU显卡(如H100、A100、RTX 3090、T4、RTX 2080),您可以在不安装flash attention的情况下正常使用模型进行推理。
8 |
9 | #### 我应该用哪个transformers版本?
10 |
11 | 建议使用4.32.0。
12 |
13 | #### 我把模型和代码下到本地,按照教程无法使用,该怎么办?
14 |
15 | 答:别着急,先检查你的代码是不是更新到最新版本,然后确认你是否完整地将模型checkpoint下到本地。
16 |
17 | #### `qwen.tiktoken`这个文件找不到,怎么办?
18 |
19 | 这个是我们的tokenizer的merge文件,你必须下载它才能使用我们的tokenizer。注意,如果你使用git clone却没有使用git-lfs,这个文件不会被下载。如果你不了解git-lfs,可点击[官网](https://git-lfs.com/)了解。
20 |
21 | #### transformers_stream_generator/tiktoken/accelerate,这几个库提示找不到,怎么办?
22 |
23 | 运行如下命令:`pip install -r requirements.txt`。相关依赖库在[https://github.com/QwenLM/Qwen/blob/main/requirements.txt](https://github.com/QwenLM/Qwen/blob/main/requirements.txt) 可以找到。
24 |
25 |
26 |
27 | ## Demo & 推理
28 |
29 | #### 是否提供Demo?CLI Demo及Web UI Demo?
30 |
31 | `web_demo.py`和`cli_demo.py`分别提供了Web UI以及CLI的Demo。请查看README相关内容了解更多。
32 |
33 | #### 我没有GPU,只用CPU运行CLI demo可以吗?
34 |
35 | 可以的,运行`python cli_demo.py --cpu-only`命令即可将模型读取到CPU并使用CPU进行推理。
36 |
37 | #### Qwen支持流式推理吗?
38 |
39 | Qwen当前支持流式推理。见位于`modeling_qwen.py`的`chat_stream`函数。
40 |
41 | #### 使用`chat_stream()`生成混乱的内容及乱码,为什么?
42 |
43 | 这是由于模型生成过程中输出的部分token需要与后续token一起解码才能输出正常文本,单个token解码结果是无意义字符串,我们已经更新了tokenizer解码时的默认设置,避免这些字符串在生成结果中出现,如果仍有类似问题请更新模型至最新版本。
44 |
45 | #### 模型的输出看起来与输入无关/没有遵循指令/看起来呆呆的
46 |
47 | 请检查是否加载的是Qwen-Chat模型进行推理,Qwen模型是未经align的预训练基模型,不期望具备响应用户指令的能力。我们在模型最新版本已经对`chat`及`chat_stream`接口内进行了检查,避免您误将预训练模型作为SFT/Chat模型使用。
48 |
49 | #### 是否有量化版本模型
50 |
51 | 目前Qwen支持基于AutoGPTQ的4-bit的量化推理。
52 |
53 | #### 生成序列较长后速度显著变慢
54 |
55 | 请更新到最新代码。
56 |
57 | #### 处理长序列时效果有问题
58 |
59 | 请确认是否开启ntk。若要启用这些技巧,请将`config.json`里的`use_dynamic_ntk`和`use_logn_attn`设置为`true`。最新代码默认为`true`。
60 |
61 |
62 |
63 | ## 微调
64 |
65 | #### 当前是否支持SFT和RLHF?
66 |
67 | 我们目前提供了SFT的代码,支持全参数微调、LoRA和Q-LoRA。此外,当前有多个外部项目也已实现支持,如[FastChat](https://github.com/lm-sys/FastChat)、[Firefly](https://github.com/yangjianxin1/Firefly)、[LLaMA Efficient Tuning](https://github.com/hiyouga/LLaMA-Efficient-Tuning)等。我们会尽快更新这部分代码和说明。
68 |
69 | 我们还没提供对RLHF训练的支持,敬请期待。
70 |
71 |
72 |
73 | ## Tokenizer
74 |
75 | #### bos_id/eos_id/pad_id,这些token id不存在,为什么?
76 |
77 | 在训练过程中,我们仅使用<|endoftext|>这一token作为sample/document之间的分隔符及padding位置占位符,你可以将bos_id, eos_id, pad_id均指向tokenizer.eod_id。请阅读我们关于tokenizer的文档,了解如何设置这些id。
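
示例(仅为示意,假设使用 `Qwen/Qwen-7B` 模型名;`eod_id` 即 `<|endoftext|>` 对应的 token id):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)

# 训练中 <|endoftext|> 同时作为分隔符与 padding 占位符
tokenizer.pad_token_id = tokenizer.eod_id

# 需要 eos/bos id 的地方(如生成时)可直接传入 eod_id:
# model.generate(..., eos_token_id=tokenizer.eod_id, pad_token_id=tokenizer.eod_id)
```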
78 |
79 |
80 | ## Docker
81 |
82 | #### 下载官方Docker镜像速度很慢
83 |
84 | 在下载官方镜像时,您可能由于某些网络原因导致下载速度变慢。可以参考[阿里云容器镜像服务](https://help.aliyun.com/zh/acr/user-guide/accelerate-the-pulls-of-docker-official-images)加速官方镜像的下载。
--------------------------------------------------------------------------------
/QWEN_TECHNICAL_REPORT.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/QWEN_TECHNICAL_REPORT.pdf
--------------------------------------------------------------------------------
/Tongyi Qianwen LICENSE AGREEMENT:
--------------------------------------------------------------------------------
1 | Tongyi Qianwen LICENSE AGREEMENT
2 |
3 | Tongyi Qianwen Release Date: August 3, 2023
4 |
5 | By clicking to agree or by using or distributing any portion or element of the Tongyi Qianwen Materials, you will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
6 |
7 | 1. Definitions
8 | a. This Tongyi Qianwen LICENSE AGREEMENT (this "Agreement") shall mean the terms and conditions for use, reproduction, distribution and modification of the Materials as defined by this Agreement.
9 | b. "We"(or "Us") shall mean Alibaba Cloud.
10 | c. "You" (or "Your") shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Materials for any purpose and in any field of use.
11 | d. "Third Parties" shall mean individuals or legal entities that are not under common control with Us or You.
12 | e. "Tongyi Qianwen" shall mean the large language models (including Qwen model and Qwen-Chat model), and software and algorithms, consisting of trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Us.
13 | f. "Materials" shall mean, collectively, Alibaba Cloud's proprietary Tongyi Qianwen and Documentation (and any portion thereof) made available under this Agreement.
14 | g. "Source" form shall mean the preferred form for making modifications, including but not limited to model source code, documentation source, and configuration files.
15 | h. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation,
16 | and conversions to other media types.
17 |
18 | 2. Grant of Rights
19 | You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Alibaba Cloud's intellectual property or other rights owned by Us embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials.
20 |
21 | 3. Redistribution
22 | You may reproduce and distribute copies of the Materials or derivative works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
23 | a. You shall give any other recipients of the Materials or derivative works a copy of this Agreement;
24 | b. You shall cause any modified files to carry prominent notices stating that You changed the files;
25 | c. You shall retain in all copies of the Materials that You distribute the following attribution notices within a "Notice" text file distributed as a part of such copies: "Tongyi Qianwen is licensed under the Tongyi Qianwen LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved."; and
26 | d. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such derivative works as a whole, provided Your use, reproduction, and distribution of the work otherwise complies with the terms and conditions of this Agreement.
27 |
28 | 4. Restrictions
29 | If you are commercially using the Materials, and your product or service has more than 100 million monthly active users, You shall request a license from Us. You cannot exercise your rights under this Agreement without our express authorization.
30 |
31 | 5. Rules of use
32 | a. The Materials may be subject to export controls or restrictions in China, the United States or other countries or regions. You shall comply with applicable laws and regulations in your use of the Materials.
33 | b. You can not use the Materials or any output therefrom to improve any other large language model (excluding Tongyi Qianwen or derivative works thereof).
34 |
35 | 6. Intellectual Property
36 | a. We retain ownership of all intellectual property rights in and to the Materials and derivatives made by or for Us. Conditioned upon compliance with the terms and conditions of this Agreement, with respect to any derivative works and modifications of the Materials that are made by you, you are and will be the owner of such derivative works and modifications.
37 | b. No trademark license is granted to use the trade names, trademarks, service marks, or product names of Us, except as required to fulfill notice requirements under this Agreement or as required for reasonable and customary use in describing and redistributing the Materials.
38 | c. If you commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any entity alleging that the Materials or any output therefrom, or any part of the foregoing, infringe any intellectual property or other right owned or licensable by you, then all licences granted to you under this Agreement shall terminate as of the date such lawsuit or other proceeding is commenced or brought.
39 |
40 | 7. Disclaimer of Warranty and Limitation of Liability
41 |
42 | a. We are not obligated to support, update, provide training for, or develop any further version of the Tongyi Qianwen Materials or to grant any license thereto.
43 | b. THE MATERIALS ARE PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. WE MAKE NO WARRANTY AND ASSUME NO RESPONSIBILITY FOR THE SAFETY OR STABILITY OF THE MATERIALS AND ANY OUTPUT THEREFROM.
44 | c. IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MATERIALS OR ANY OUTPUT OF IT, NO MATTER HOW IT’S CAUSED.
45 | d. You will defend, indemnify and hold harmless Us from and against any claim by any third party arising out of or related to your use or distribution of the Materials.
46 |
47 | 8. Survival and Termination.
48 | a. The term of this Agreement shall commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
49 | b. We may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you must delete and cease use of the Materials. Sections 7 and 9 shall survive the termination of this Agreement.
50 |
51 | 9. Governing Law and Jurisdiction.
52 | a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
53 | b. The People's Courts in Hangzhou City shall have exclusive jurisdiction over any dispute arising out of this Agreement.
--------------------------------------------------------------------------------
/Tongyi Qianwen RESEARCH LICENSE AGREEMENT:
--------------------------------------------------------------------------------
1 | Tongyi Qianwen RESEARCH LICENSE AGREEMENT
2 |
3 | Tongyi Qianwen Release Date: November 30, 2023
4 |
5 | By clicking to agree or by using or distributing any portion or element of the Tongyi Qianwen Materials, you will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
6 |
7 | 1. Definitions
8 | a. This Tongyi Qianwen RESEARCH LICENSE AGREEMENT (this "Agreement") shall mean the terms and conditions for use, reproduction, distribution and modification of the Materials as defined by this Agreement.
9 | b. "We"(or "Us") shall mean Alibaba Cloud.
10 | c. "You" (or "Your") shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Materials for any purpose and in any field of use.
11 | d. "Third Parties" shall mean individuals or legal entities that are not under common control with Us or You.
12 | e. "Tongyi Qianwen" shall mean the large language models, and software and algorithms, consisting of trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Us.
13 | f. "Materials" shall mean, collectively, Alibaba Cloud's proprietary Tongyi Qianwen and Documentation (and any portion thereof) made available under this Agreement.
14 | g. "Source" form shall mean the preferred form for making modifications, including but not limited to model source code, documentation source, and configuration files.
15 | h. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation,
16 | and conversions to other media types.
17 | i. "Non-Commercial" shall mean for research or evaluation purposes only.
18 |
19 | 2. Grant of Rights
20 | a. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Alibaba Cloud's intellectual property or other rights owned by Us embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials FOR NON-COMMERCIAL PURPOSES ONLY.
21 | b. If you are commercially using the Materials, You shall request a license from Us.
22 |
23 | 3. Redistribution
24 | You may reproduce and distribute copies of the Materials or derivative works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
25 | a. You shall give any other recipients of the Materials or derivative works a copy of this Agreement;
26 | b. You shall cause any modified files to carry prominent notices stating that You changed the files;
27 | c. You shall retain in all copies of the Materials that You distribute the following attribution notices within a "Notice" text file distributed as a part of such copies: "Tongyi Qianwen is licensed under the Tongyi Qianwen RESEARCH LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved."; and
28 | d. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such derivative works as a whole, provided Your use, reproduction, and distribution of the work otherwise complies with the terms and conditions of this Agreement.
29 |
30 | 4. Rules of use
31 | a. The Materials may be subject to export controls or restrictions in China, the United States or other countries or regions. You shall comply with applicable laws and regulations in your use of the Materials.
32 | b. You can not use the Materials or any output therefrom to improve any other large language model (excluding Tongyi Qianwen or derivative works thereof).
33 |
34 | 5. Intellectual Property
35 | a. We retain ownership of all intellectual property rights in and to the Materials and derivatives made by or for Us. Conditioned upon compliance with the terms and conditions of this Agreement, with respect to any derivative works and modifications of the Materials that are made by you, you are and will be the owner of such derivative works and modifications.
36 | b. No trademark license is granted to use the trade names, trademarks, service marks, or product names of Us, except as required to fulfill notice requirements under this Agreement or as required for reasonable and customary use in describing and redistributing the Materials.
37 | c. If you commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any entity alleging that the Materials or any output therefrom, or any part of the foregoing, infringe any intellectual property or other right owned or licensable by you, then all licences granted to you under this Agreement shall terminate as of the date such lawsuit or other proceeding is commenced or brought.
38 |
39 | 6. Disclaimer of Warranty and Limitation of Liability
40 | a. We are not obligated to support, update, provide training for, or develop any further version of the Tongyi Qianwen Materials or to grant any license thereto.
41 | b. THE MATERIALS ARE PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. WE MAKE NO WARRANTY AND ASSUME NO RESPONSIBILITY FOR THE SAFETY OR STABILITY OF THE MATERIALS AND ANY OUTPUT THEREFROM.
42 | c. IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MATERIALS OR ANY OUTPUT OF IT, NO MATTER HOW IT’S CAUSED.
43 | d. You will defend, indemnify and hold harmless Us from and against any claim by any third party arising out of or related to your use or distribution of the Materials.
44 |
45 | 7. Survival and Termination.
46 | a. The term of this Agreement shall commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
47 | b. We may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you must delete and cease use of the Materials. Sections 6 and 8 shall survive the termination of this Agreement.
48 |
49 | 8. Governing Law and Jurisdiction.
50 | a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
51 | b. The People's Courts in Hangzhou City shall have exclusive jurisdiction over any dispute arising out of this Agreement.
52 |
53 | 9. Other Terms and Conditions.
54 | a. Any arrangements, understandings, or agreements regarding the Material not stated herein are separate from and independent of the terms and conditions of this Agreement. You shall request a seperate license from Us, if You use the Materials in ways not expressly agreed to in this Agreement.
55 | b. We shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
56 |
--------------------------------------------------------------------------------
/ascend-support/README.md:
--------------------------------------------------------------------------------
1 | # 昇腾910架构基于mindformers推理Qwen-7B-Chat模型
2 |
3 | ## 环境要求
4 |
5 | - 硬件:Ascend 910A/B
6 |
7 | ## 运行步骤
8 |
9 | 首先参考Qwen README下载官方模型到`/path/to/Qwen-7B-Chat`。
10 |
11 | ### 下载并启动镜像
12 |
13 | ```bash
14 | docker pull qwenllm/qwen-mindspore:latest
15 |
16 | cd /path/to/Qwen
17 |
18 | # 下载模型到此处
19 | CHECKPOINT_PATH=/path/to/Qwen-7B-Chat
20 |
21 | cd ascend-support
22 |
23 | # 启动docker容器
24 | bash docker_qwen.sh -c ${CHECKPOINT_PATH}
25 | ```
26 |
27 | ### 执行权重转换
28 |
29 | 在容器内执行下面的命令,将Qwen模型转换为适配`mindformers`的格式:
30 |
31 | ```bash
32 | python3 /data/qwen/mindformers/research/qwen/convert_weight.py
33 | ```
34 |
35 | 转换后模型的输出位置为`${CHECKPOINT_PATH}/qwen-7b-chat.ckpt`。
36 |
37 | ### 执行推理
38 |
39 | 在容器内执行下面的命令,进行推理:
40 |
41 | ```bash
42 | cd /data/qwen/mindformers/research/qwen
43 | export PYTHONPATH=/data/qwen/mindformers:$PYTHONPATH
44 | python3 infer_qwen.py
45 | ```
46 |
--------------------------------------------------------------------------------
/ascend-support/docker_qwen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | IMAGE_NAME=qwenllm/qwen-mindspore:v23.0.RC3
4 | CONTAINER_NAME=qwen-mindspore
5 | CHECKPOINT_PATH='NOT_SET'
6 |
7 | DOCKER_CHECKPOINT_PATH=/data/qwen/models/Qwen-7B-Chat
8 |
9 | function usage() {
10 | echo '
11 | Usage: bash ascend-support/docker_qwen.sh [-i IMAGE_NAME] -c /path/to/Qwen-7B-Chat [-n CONTAINER_NAME]
12 | '
13 | }
14 |
15 | while [[ "$1" != "" ]]; do
16 | case $1 in
17 | -i | --image )
18 | shift
19 | IMAGE_NAME=$1
20 | ;;
21 | -c | --checkpoint )
22 | shift
23 | CHECKPOINT_PATH=$1
24 | ;;
25 | -n | --name )
26 | shift
27 | CONTAINER_NAME=$1
28 | ;;
29 | -h )
30 | usage
31 | exit
32 | ;;
33 | * )
34 | echo "Unknown argument ${1}"
35 | exit 1
36 | ;;
37 | esac
38 | shift
39 | done
40 |
41 | docker run -it --rm -u root --network=host --ipc=host \
42 | --device=/dev/davinci0 \
43 | --device=/dev/davinci1 \
44 | --device=/dev/davinci2 \
45 | --device=/dev/davinci3 \
46 | --device=/dev/davinci4 \
47 | --device=/dev/davinci5 \
48 | --device=/dev/davinci6 \
49 | --device=/dev/davinci7 \
50 | --name=${CONTAINER_NAME} \
51 | --device=/dev/davinci_manager \
52 | --device=/dev/devmm_svm \
53 | --device=/dev/hisi_hdc \
54 | -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
55 | -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
56 | -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
57 | -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
58 | -v /etc/ascend_install.info:/etc/ascend_install.info \
59 | -v ${CHECKPOINT_PATH}:${DOCKER_CHECKPOINT_PATH} \
60 | -v /var/log/npu/:/usr/slog \
61 | ${IMAGE_NAME} /bin/bash
62 |
--------------------------------------------------------------------------------
/assets/cli_demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/cli_demo.gif
--------------------------------------------------------------------------------
/assets/code_interpreter_showcase_001.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/code_interpreter_showcase_001.jpg
--------------------------------------------------------------------------------
/assets/hfagent_chat_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/hfagent_chat_1.png
--------------------------------------------------------------------------------
/assets/hfagent_chat_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/hfagent_chat_2.png
--------------------------------------------------------------------------------
/assets/hfagent_run.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/hfagent_run.png
--------------------------------------------------------------------------------
/assets/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/logo.jpg
--------------------------------------------------------------------------------
/assets/openai_api.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/openai_api.gif
--------------------------------------------------------------------------------
/assets/performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/performance.png
--------------------------------------------------------------------------------
/assets/qwen_72b_needle_in_a_haystack.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/qwen_72b_needle_in_a_haystack.png
--------------------------------------------------------------------------------
/assets/qwen_tokenizer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/qwen_tokenizer.png
--------------------------------------------------------------------------------
/assets/radar_14b.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/radar_14b.jpg
--------------------------------------------------------------------------------
/assets/radar_72b.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/radar_72b.jpg
--------------------------------------------------------------------------------
/assets/react_showcase_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/react_showcase_001.png
--------------------------------------------------------------------------------
/assets/react_showcase_002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/react_showcase_002.png
--------------------------------------------------------------------------------
/assets/react_tutorial_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/react_tutorial_001.png
--------------------------------------------------------------------------------
/assets/react_tutorial_002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/react_tutorial_002.png
--------------------------------------------------------------------------------
/assets/system_prompt_behavior_setting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_behavior_setting.png
--------------------------------------------------------------------------------
/assets/system_prompt_behavior_setting_en.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_behavior_setting_en.png
--------------------------------------------------------------------------------
/assets/system_prompt_language_style.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_language_style.png
--------------------------------------------------------------------------------
/assets/system_prompt_language_style_en.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_language_style_en.png
--------------------------------------------------------------------------------
/assets/system_prompt_role_play.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_role_play.png
--------------------------------------------------------------------------------
/assets/system_prompt_role_play_en.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_role_play_en.png
--------------------------------------------------------------------------------
/assets/system_prompt_task_setting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_task_setting.png
--------------------------------------------------------------------------------
/assets/system_prompt_task_setting_en.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_task_setting_en.png
--------------------------------------------------------------------------------
/assets/tokenizer.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/tokenizer.pdf
--------------------------------------------------------------------------------
/assets/tokenizer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/tokenizer.png
--------------------------------------------------------------------------------
/assets/wanx_colorful_black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/wanx_colorful_black.png
--------------------------------------------------------------------------------
/assets/web_demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/web_demo.gif
--------------------------------------------------------------------------------
/assets/wechat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/wechat.png
--------------------------------------------------------------------------------
/cli_demo.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba Cloud.
2 | #
3 | # This source code is licensed under the license found in the
4 | # LICENSE file in the root directory of this source tree.
5 |
6 | """A simple command-line interactive chat demo."""
7 |
8 | import argparse
9 | import os
10 | import platform
11 | import shutil
12 | from copy import deepcopy
13 |
14 | import torch
15 | from transformers import AutoModelForCausalLM, AutoTokenizer
16 | from transformers.generation import GenerationConfig
17 | from transformers.trainer_utils import set_seed
18 |
19 | DEFAULT_CKPT_PATH = 'Qwen/Qwen-7B-Chat'
20 |
21 | _WELCOME_MSG = '''\
22 | Welcome to use Qwen-Chat model, type text to start chat, type :h to show command help.
23 | (欢迎使用 Qwen-Chat 模型,输入内容即可进行对话,:h 显示命令帮助。)
24 |
25 | Note: This demo is governed by the original license of Qwen.
26 | We strongly advise users not to knowingly generate or allow others to knowingly generate harmful content, including hate speech, violence, pornography, deception, etc.
27 | (注:本演示受Qwen的许可协议限制。我们强烈建议,用户不应传播及不应允许他人传播以下内容,包括但不限于仇恨言论、暴力、色情、欺诈相关的有害信息。)
28 | '''
29 | _HELP_MSG = '''\
30 | Commands:
31 | :help / :h Show this help message 显示帮助信息
32 | :exit / :quit / :q Exit the demo 退出Demo
33 | :clear / :cl Clear screen 清屏
34 | :clear-his / :clh Clear history 清除对话历史
35 | :history / :his Show history 显示对话历史
36 | :seed Show current random seed 显示当前随机种子
37 | :seed <N> Set random seed to <N> 设置随机种子
38 | :conf Show current generation config 显示生成配置
39 | :conf <key>=<value> Change generation config 修改生成配置
40 | :reset-conf Reset generation config 重置生成配置
41 | '''
42 |
43 |
44 | def _load_model_tokenizer(args):
45 | tokenizer = AutoTokenizer.from_pretrained(
46 | args.checkpoint_path, trust_remote_code=True, resume_download=True,
47 | )
48 |
49 | if args.cpu_only:
50 | device_map = "cpu"
51 | else:
52 | device_map = "auto"
53 |
54 | model = AutoModelForCausalLM.from_pretrained(
55 | args.checkpoint_path,
56 | device_map=device_map,
57 | trust_remote_code=True,
58 | resume_download=True,
59 | ).eval()
60 |
61 | config = GenerationConfig.from_pretrained(
62 | args.checkpoint_path, trust_remote_code=True, resume_download=True,
63 | )
64 |
65 | return model, tokenizer, config
66 |
67 |
68 | def _gc():
69 | import gc
70 | gc.collect()
71 | if torch.cuda.is_available():
72 | torch.cuda.empty_cache()
73 |
74 |
75 | def _clear_screen():
76 | if platform.system() == "Windows":
77 | os.system("cls")
78 | else:
79 | os.system("clear")
80 |
81 |
82 | def _print_history(history):
83 | terminal_width = shutil.get_terminal_size()[0]
84 | print(f'History ({len(history)})'.center(terminal_width, '='))
85 | for index, (query, response) in enumerate(history):
86 | print(f'User[{index}]: {query}')
87 | print(f'QWen[{index}]: {response}')
88 | print('=' * terminal_width)
89 |
90 |
91 | def _get_input() -> str:
92 | while True:
93 | try:
94 | message = input('User> ').strip()
95 | except UnicodeDecodeError:
96 | print('[ERROR] Encoding error in input')
97 | continue
98 | except KeyboardInterrupt:
99 | exit(1)
100 | if message:
101 | return message
102 | print('[ERROR] Query is empty')
103 |
104 |
105 | def main():
106 | parser = argparse.ArgumentParser(
107 | description='QWen-Chat command-line interactive chat demo.')
108 | parser.add_argument("-c", "--checkpoint-path", type=str, default=DEFAULT_CKPT_PATH,
109 | help="Checkpoint name or path, default to %(default)r")
110 | parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed")
111 | parser.add_argument("--cpu-only", action="store_true", help="Run demo with CPU only")
112 | args = parser.parse_args()
113 |
114 | history, response = [], ''
115 |
116 | model, tokenizer, config = _load_model_tokenizer(args)
117 | orig_gen_config = deepcopy(model.generation_config)
118 |
119 | _clear_screen()
120 | print(_WELCOME_MSG)
121 |
122 | seed = args.seed
123 |
124 | while True:
125 | query = _get_input()
126 |
127 | # Process commands.
128 | if query.startswith(':'):
129 | command_words = query[1:].strip().split()
130 | if not command_words:
131 | command = ''
132 | else:
133 | command = command_words[0]
134 |
135 | if command in ['exit', 'quit', 'q']:
136 | break
137 | elif command in ['clear', 'cl']:
138 | _clear_screen()
139 | print(_WELCOME_MSG)
140 | _gc()
141 | continue
142 | elif command in ['clear-his', 'clh']:
143 | print(f'[INFO] All {len(history)} history cleared')
144 | history.clear()
145 | _gc()
146 | continue
147 | elif command in ['help', 'h']:
148 | print(_HELP_MSG)
149 | continue
150 | elif command in ['history', 'his']:
151 | _print_history(history)
152 | continue
153 | elif command in ['seed']:
154 | if len(command_words) == 1:
155 | print(f'[INFO] Current random seed: {seed}')
156 | continue
157 | else:
158 | new_seed_s = command_words[1]
159 | try:
160 | new_seed = int(new_seed_s)
161 | except ValueError:
162 | print(f'[WARNING] Fail to change random seed: {new_seed_s!r} is not a valid number')
163 | else:
164 | print(f'[INFO] Random seed changed to {new_seed}')
165 | seed = new_seed
166 | continue
167 | elif command in ['conf']:
168 | if len(command_words) == 1:
169 | print(model.generation_config)
170 | else:
171 | for key_value_pairs_str in command_words[1:]:
172 | eq_idx = key_value_pairs_str.find('=')
173 | if eq_idx == -1:
174 | print('[WARNING] format: <key>=<value>')
175 | continue
176 | conf_key, conf_value_str = key_value_pairs_str[:eq_idx], key_value_pairs_str[eq_idx + 1:]
177 | try:
178 | conf_value = eval(conf_value_str)
179 | except Exception as e:
180 | print(e)
181 | continue
182 | else:
183 | print(f'[INFO] Change config: model.generation_config.{conf_key} = {conf_value}')
184 | setattr(model.generation_config, conf_key, conf_value)
185 | continue
186 | elif command in ['reset-conf']:
187 | print('[INFO] Reset generation config')
188 | model.generation_config = deepcopy(orig_gen_config)
189 | print(model.generation_config)
190 | continue
191 | else:
192 | # As normal query.
193 | pass
194 |
195 | # Run chat.
196 | set_seed(seed)
197 | try:
198 | for response in model.chat_stream(tokenizer, query, history=history, generation_config=config):
199 | _clear_screen()
200 | print(f"\nUser: {query}")
201 | print(f"\nQwen-Chat: {response}")
202 | except KeyboardInterrupt:
203 | print('[WARNING] Generation interrupted')
204 | continue
205 |
206 | history.append((query, response))
207 |
208 |
209 | if __name__ == "__main__":
210 | main()
211 |
--------------------------------------------------------------------------------
/dcu-support/README.md:
--------------------------------------------------------------------------------
1 | # DCU 架构基于 fastllm 推理 Qwen 模型
2 |
3 |
4 | ## 环境配置
5 |
6 | ### 环境准备
7 |
8 | ```
9 | docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest
10 | ```
11 |
12 | ### 容器启动
13 |
14 | 根据如下命令启动推理容器,其中 `<container_name>` 需自定义一个容器名,`<project_path>` 即为本目录的路径:
15 | ```
16 | # <container_name>: 自定义容器名
17 | # <project_path>: 当前工程所在路径
18 | docker run -it --name=<container_name> -v <project_path>:/work --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=16G --group-add 39 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest /bin/bash
19 | ```
20 |
21 | ### 加载环境
22 |
23 | 进入容器后执行如下命令,加载运行环境变量
24 |
25 | ```
26 | source /opt/dtk-23.04/cuda/env.sh
27 | ```
28 |
29 | ### 安装方法
30 |
31 | ```
32 | #进入本工程目录
33 | cd package
34 | python setup.py install
35 | ```
36 |
37 | ## 推理
38 |
39 | ### 模型转换
40 |
41 | 首先参考Qwen README下载官方模型,并通过如下方式将模型转换为 fastllm 用于推理的形式:
42 |
43 | - 通过`pip install -r requirements.txt`安装模型转换所需依赖
44 |
45 | - 如果使用已经下载完成的模型或者自己finetune的模型需要修改qwen2flm.py文件中创建tokenizer, model时的模型存放路径
46 |
47 | ```
48 | # 在本工程目录下执行:
49 | python3 qwen2flm.py qwen-7b-fp16.bin float16 # 导出fp16模型,参数为导出的模型路径
50 | ```
51 |
52 |
53 | ### 模型推理
54 |
55 | ```
56 | # 命令行聊天程序,使用了模型创建以及流式对话效果
57 | python cli_demo.py -p qwen-7b-fp16.bin
58 |
59 | # batch推理程序
60 | python cli_demo_batch.py -p qwen-7b-fp16.bin
61 |
62 | # 简易webui,需要先安装streamlit-chat
63 | streamlit run web_demo.py qwen-7b-fp16.bin
64 | ```
65 |
--------------------------------------------------------------------------------
/dcu-support/cli_demo.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import argparse
3 | from fastllm_pytools import llm
4 |
5 | def args_parser():
6 | parser = argparse.ArgumentParser(description = 'qwen_chat_demo')
7 | parser.add_argument('-p', '--path', type = str, required = True, default = '', help = '模型文件的路径')
8 | args = parser.parse_args()
9 | return args
10 |
11 | if __name__ == "__main__":
12 | args = args_parser()
13 | model = llm.model(args.path)
14 |
15 | history = []
16 | print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
17 | while True:
18 | query = input("\n用户:")
19 | if query.strip() == "stop":
20 | break
21 | if query.strip() == "clear":
22 | history = []
23 | print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
24 | continue
25 | print("AI:", end = "")
26 | curResponse = ""
27 | for response in model.stream_response(query, history = history, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0):
28 | curResponse += response
29 | print(response, flush = True, end = "")
30 | history.append((query, curResponse))
--------------------------------------------------------------------------------
/dcu-support/cli_demo_batch.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from fastllm_pytools import llm
3 | import time
4 |
5 | def args_parser():
6 | parser = argparse.ArgumentParser(description = 'fastllm_chat_demo')
7 |     parser.add_argument('-p', '--path', type = str, required = True, default = '', help = 'Path to the model file')
8 | args = parser.parse_args()
9 | return args
10 |
11 | if __name__ == "__main__":
12 | args = args_parser()
13 |
14 | model_path = args.path
15 |
16 | prompts = ["深圳有什么好玩的", "上海有什么好玩的", "晚上睡不着怎么办", "南京有什么好吃的"] * 2
17 | print(prompts)
18 |
19 | responses, historys = [], []
20 |
21 | model = llm.model(model_path)
22 |
23 | t0 = time.time()
24 | responses, historys = model.response_batch(prompts)
25 | t1 = time.time()
26 |
27 | token_output_count = 0
28 | word_len = 0
29 | for i, res in enumerate(responses):
30 | tokens = model.tokenizer_encode_string(res)
31 | token_output_count += len(tokens)
32 | word_len += len(res)
33 |
34 | print("batch index: ", i)
35 | print(res)
36 | print("")
37 |
38 | print("\ntoken/s: {:.2f}, character/s: {:.2f}".format(token_output_count/(t1-t0), word_len/(t1-t0)))
39 |
40 |
--------------------------------------------------------------------------------
/dcu-support/model.properties:
--------------------------------------------------------------------------------
1 | # Unique model identifier
2 | modelCode = 411
3 | # Model name
4 | modelName=qwen-7b_fastllm
5 | # Model description
6 | modelDescription=qwen-7b is the 7-billion-parameter model of the Tongyi Qianwen (Qwen) large model series developed by Alibaba Cloud
7 | # Application scenarios
8 | appScenario=inference, dialogue QA, healthcare, research, finance, education
9 | # Framework type
10 | frameType=fastllm
11 |
--------------------------------------------------------------------------------
/dcu-support/package/fastllm_pytools/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["llm"]
--------------------------------------------------------------------------------
/dcu-support/package/fastllm_pytools/hf_model.py:
--------------------------------------------------------------------------------
1 | from fastllm_pytools import llm;
2 | import torch;
3 | import ctypes;
4 | import numpy as np;
5 |
6 | fastllm_data_type_dict = {
7 | "int4": 8,
8 | "int8": 3,
9 | "float16": 7
10 | }
11 | fastllm_weight_type_dict = {
12 | "linear": 1,
13 | "embedding": 2,
14 | "QuantizedLinear": 111
15 | }
16 |
17 | def create(model,
18 | tokenizer = None,
19 | pre_prompt = None,
20 | user_role = None,
21 | bot_role = None,
22 | history_sep = None,
23 | dtype = "float16"):
24 | if (dtype not in fastllm_data_type_dict):
25 | print("dtype should in ", list(fastllm_data_type_dict.keys()));
26 | exit(0);
27 |
28 | # 0.1 model info
29 | if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
30 | model.config.model_type = "chatglm3"
31 | modelInfo = model.config.__dict__
32 | if model.generation_config is not None:
33 | modelInfo.update(model.generation_config.__dict__)
34 | if (pre_prompt):
35 | modelInfo["pre_prompt"] = pre_prompt;
36 | if (user_role):
37 | modelInfo["user_role"] = user_role;
38 | if (bot_role):
39 | modelInfo["bot_role"] = bot_role;
40 | if (history_sep):
41 | modelInfo["history_sep"] = history_sep;
42 | if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
43 |         # Baichuan 2
44 | modelInfo["use_alibi"] = "1";
45 | modelInfo["pre_prompt"] = "";
46 | modelInfo["user_role"] = (" ") if hasattr(model.generation_config, "user_token_id") else "";
47 | modelInfo["bot_role"] = ("") if hasattr(model.generation_config, "assistant_token_id") else "";
48 | modelInfo["history_sep"] = "";
49 | if (modelInfo["model_type"] == "qwen"):
50 | if modelInfo["chat_format"] == "chatml":
51 | modelInfo["im_end_id"] = tokenizer.im_end_id
52 | modelInfo["im_start_id"] = tokenizer.im_start_id
53 |
54 |
55 | weight_type_dict = {};
56 | module_dict = {};
57 | weight_bits = {};
58 | for key, m in model.named_modules():
59 | if (str(type(m)).find("QuantizedLinear") != -1):
60 | weight_type_dict[key + ".weight"] = "QuantizedLinear";
61 | weight_bits[key + ".weight"] = m.weight_bit_width;
62 | if (isinstance(m, torch.nn.Linear)):
63 | weight_type_dict[key + ".weight"] = "linear";
64 | module_dict[key + ".weight"] = m;
65 | if (isinstance(m, torch.nn.Embedding)):
66 | weight_type_dict[key] = "embedding";
67 |
68 | peft_config = {}
69 | active_adapter = ""
70 | if hasattr(model, "peft_config"):
71 | peft_config = model.peft_config
72 | if hasattr(model, "active_adapter") and isinstance(model.active_adapter, str):
73 |         # in transformers >= 4.33.0, active_adapter is a function on the model; ignore it for now
74 | active_adapter = model.active_adapter
75 |
76 | model = model.cpu();
77 | dict = model.state_dict();
78 | model_type = model.config.__dict__["model_type"];
79 | model = llm.fastllm_lib.create_empty_llm_model(model_type.encode());
80 | for it in modelInfo.keys():
81 | llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode());
82 |
83 | for adapter_name in peft_config.keys():
84 | adapter_dict = peft_config[adapter_name].__dict__
85 | for it in adapter_dict.keys():
86 | llm.fastllm_lib.add_adapter_dict_llm_model(model, str(adapter_name).encode(), str(it).encode(), str(adapter_dict[it]).encode())
87 | if len(active_adapter) != 0:
88 | llm.fastllm_lib.set_adapter(model, str(active_adapter).encode())
89 |
90 | # 1. vocab
91 | if (tokenizer):
92 | if (hasattr(tokenizer, "tokenizer")):
93 | if modelInfo["model_type"] == "qwen":
94 | pass
95 | else:
96 | tokenizer = tokenizer.tokenizer;
97 | if (hasattr(tokenizer, "sp_model")):
98 | piece_size = tokenizer.sp_model.piece_size();
99 | for i in range(piece_size):
100 | llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(),
101 | i, ctypes.c_float(tokenizer.sp_model.get_score(i)));
102 | else:
103 | vocab = tokenizer.get_vocab();
104 | for v in vocab.keys():
105 | if (modelInfo["model_type"] == "moss"):
106 | vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v];
107 | llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0));
108 | elif (modelInfo["model_type"] == "qwen"):
109 | llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0));
110 | else:
111 | llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0));
112 | tot = 0;
113 | for key in dict:
114 | ori_data_type = 0;
115 | ori_np_data_type = np.float32;
116 | cur_weight_type = 0;
117 | if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
118 | cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]];
119 | to_data_type = 0;
120 |
121 | if (cur_weight_type == 1):
122 | to_data_type = fastllm_data_type_dict[dtype];
123 | if (to_data_type == 7):
124 | ori_data_type = 7;
125 | ori_np_data_type = np.float16;
126 | elif (cur_weight_type == 2):
127 | # TODO bfloat
128 | to_data_type = 0;
129 |
130 | weight_name = key
131 | if peft_config is not None:
132 | weight_name = weight_name.replace('base_model.model.', '')
133 | if (cur_weight_type == 111):
134 | llm.fastllm_lib.add_qlinear_weight_llm_model(model, weight_name.encode(),
135 | len(dict[key].shape),
136 | (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
137 | weight_bits[key],
138 | dict[key + "_scale"].numpy().astype(np.float32).ctypes.data_as(ctypes.c_void_p),
139 | dict[key].numpy().ctypes.data_as(ctypes.c_void_p));
140 | else:
141 | llm.fastllm_lib.add_weight_llm_model(model, weight_name.encode(),
142 | len(dict[key].shape),
143 | (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
144 | to_data_type, cur_weight_type, ori_data_type,
145 | dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p));
146 | tot += 1;
147 | print("convert (", tot, "/", len(dict), end = " )\r");
148 |
149 | print("");
150 | llm.fastllm_lib.init_params_llm_model(model);
151 | llm.fastllm_lib.warmup_llm_model(model);
152 | ret = llm.model("", id = model);
153 | return ret;
154 |
155 |
--------------------------------------------------------------------------------
/dcu-support/package/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup (
4 | name = "fastllm_pytools",
5 | version = "0.0.1",
6 | description = "Fastllm pytools",
7 | packages = ['fastllm_pytools'],
8 | url = "https://developer.hpccube.com/codes/aicomponent/fastllm",
9 | package_data = {
10 | '': ['*.dll', '*.so']
11 | }
12 | )
13 |
--------------------------------------------------------------------------------
/dcu-support/qwen2flm.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from transformers import AutoModelForCausalLM, AutoTokenizer
3 | from transformers.generation import GenerationConfig
4 | from fastllm_pytools import torch2flm
5 |
6 | if __name__ == "__main__":
7 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
8 | model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="cpu", trust_remote_code=True, fp32=True).eval()
9 |     model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True) # different generation lengths, top_p and other hyperparameters can be specified here
10 |
11 | dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
12 | exportPath = sys.argv[1] if len(sys.argv) >= 2 else "qwen-7b-" + dtype + ".flm"
13 | torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
--------------------------------------------------------------------------------
/dcu-support/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.32.0
2 | tiktoken
3 | streamlit>=1.24.0
4 | sentencepiece
5 | urllib3==1.26.16
6 | transformers_stream_generator==0.0.4
7 | accelerate
8 | einops
9 | #scipy
10 |
--------------------------------------------------------------------------------
/dcu-support/web_demo.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from streamlit_chat import message
3 | from fastllm_pytools import llm
4 | import sys
5 |
6 | st.set_page_config(
7 | page_title="fastllm web demo",
8 | page_icon=":robot:"
9 | )
10 |
11 | @st.cache_resource
12 | def get_model():
13 | model = llm.model(sys.argv[1])
14 | return model
15 |
16 | if "messages" not in st.session_state:
17 | st.session_state.messages = []
18 |
19 | for i, (prompt, response) in enumerate(st.session_state.messages):
20 | with st.chat_message("user"):
21 | st.markdown(prompt)
22 | with st.chat_message("assistant"):
23 | st.markdown(response)
24 |
25 | if prompt := st.chat_input("Start chatting"):
26 | model = get_model()
27 | with st.chat_message("user"):
28 | st.markdown(prompt)
29 |
30 | with st.chat_message("assistant"):
31 | message_placeholder = st.empty()
32 | full_response = ""
33 | for chunk in model.stream_response(prompt, st.session_state.messages, one_by_one = True):
34 | full_response += chunk
35 | message_placeholder.markdown(full_response + "▌")
36 | message_placeholder.markdown(full_response)
37 | st.session_state.messages.append((prompt, full_response))
38 |
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | ARG CUDA_VERSION=11.7.1
2 | ARG from=nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04
3 |
4 | FROM ${from} as base
5 |
6 | ARG from
7 |
8 | RUN <
--------------------------------------------------------------------------------
/eval/evaluate_gsm8k.py:
--------------------------------------------------------------------------------
31 |         sent = sent.split("<|endoftext|>")[0]
32 | sent = sent.split("\n\n\n")[0]
33 | sent = sent.split("\n\n")[0]
34 | sent = sent.split("Question:")[0]
35 | sents.append(sent)
36 | return sents
37 |
38 |
39 | def generate_sample(model, tokenizer, input_txt):
40 | input_ids = tokenizer.tokenizer.encode(input_txt)
41 | raw_text_len = len(input_ids)
42 | context_enc = torch.tensor([input_ids]).to(model.device)
43 | print(f"Input text: {input_txt}\n")
44 | outputs = model.generate(context_enc)
45 | output_text = decode(outputs, tokenizer, raw_text_len)[0]
46 | print(f"\nOutput text: {output_text}\n")
47 | return output_text
48 |
49 |
50 | def extract_answer_hf(completion):
51 | match = ANS_RE.search(completion)
52 | if match:
53 | match_str = match.group(1).strip()
54 | match_str = match_str.replace(",", "")
55 | return eval(match_str)
56 | else:
57 | return INVALID_ANS
58 |
59 |
60 | def extract_answer(completion):
61 | try:
62 | last_number = re.findall(r"\d+", completion)[-1]
63 | return eval(last_number)
64 | except:
65 | return INVALID_ANS
66 |
67 |
68 | def is_correct(completion, answer):
69 | gold = extract_answer_hf(answer)
70 | assert gold != INVALID_ANS, "No ground truth answer found in the document."
71 | return extract_answer(completion) == gold
72 |
73 |
74 | if __name__ == "__main__":
75 | parser = argparse.ArgumentParser(description="Test HF checkpoint.")
76 | parser.add_argument(
77 | "-c",
78 | "--checkpoint-path",
79 | type=str,
80 | help="Checkpoint path",
81 | default="Qwen/Qwen-7B",
82 | )
83 | parser.add_argument("-f", "--sample-input-file", type=str, default=None)
84 | parser.add_argument(
85 | "-o", "--sample-output-file", type=str, default="gsm8k_res.jsonl"
86 | )
87 |
88 | args = parser.parse_args()
89 |
90 | fewshot_prompt = open("gsm8k_prompt.txt").read()
91 | if args.sample_input_file is not None:
92 | dataset = load_from_disk(args.sample_input_file)
93 | else:
94 | config = datasets.DownloadConfig(resume_download=True, max_retries=100)
95 | dataset = load_dataset("gsm8k", "main", download_config=config)
96 |
97 | test = dataset["test"]
98 |
99 | print("Loading tokenizer ...")
100 | tokenizer = AutoTokenizer.from_pretrained(
101 | args.checkpoint_path, trust_remote_code=True
102 | )
103 |
104 | print("Loading model ...")
105 | model = AutoModelForCausalLM.from_pretrained(
106 | args.checkpoint_path, device_map="auto", trust_remote_code=True
107 | ).eval()
108 | model.generation_config = GenerationConfig.from_pretrained(
109 | args.checkpoint_path, trust_remote_code=True
110 | )
111 | model.generation_config.do_sample = False
112 |
113 | f_output = jsonlines.Writer(open(args.sample_output_file, "w", encoding="utf-8"))
114 | tot_length = test.num_rows
115 | acc_res = []
116 | for doc in test:
117 | context = doc_to_text(doc)
118 | completion = generate_sample(model, tokenizer, context)
119 | answer = doc["answer"]
120 | acc = is_correct(completion, answer)
121 | doc["completion"] = completion
122 | doc["acc"] = acc
123 | f_output.write(doc)
124 | acc_res.append(acc)
125 |
126 | f_output.close()
127 | print("Acc: ", np.mean(acc_res))
128 |
--------------------------------------------------------------------------------
/eval/evaluate_humaneval.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import tqdm
3 | import torch
4 | import jsonlines
5 | from transformers import AutoModelForCausalLM, AutoTokenizer
6 | from transformers.generation import GenerationConfig
7 |
8 | """
9 | git clone https://github.com/openai/human-eval
10 | pip install -e human-eval
11 | evaluate_functional_correctness sample-output-file
12 | """
13 |
14 |
15 | def decode(tokens_list, tokenizer, raw_text_len):
16 | sents = []
17 | # print(len(tokens_list))
18 | for tokens in tokens_list:
19 | tokens = tokens.cpu().numpy().tolist()
20 | sent = tokenizer.tokenizer.decode(tokens[raw_text_len:])
21 | sent = sent.split("<|endoftext|>")[0]
22 | sent = sent.split("\n\n\n")[0]
23 | sent = sent.split("\n\n")[0]
24 | sent = sent.split("def ")[0]
25 | sents.append(sent)
26 | return sents
27 |
28 |
29 | def generate_sample(model, tokenizer, input_txt):
30 | input_ids = tokenizer.tokenizer.encode(input_txt)
31 | raw_text_len = len(input_ids)
32 | context_enc = torch.tensor([input_ids]).to(model.device)
33 | print(f"Input text: {input_txt}\n")
34 | outputs = model.generate(context_enc)
35 | output_text = decode(outputs, tokenizer, raw_text_len)[0]
36 | print(f"\nOutput text: \n{output_text}\n")
37 | return output_text
38 |
39 |
40 | if __name__ == "__main__":
41 | parser = argparse.ArgumentParser(description="Test HF checkpoint.")
42 | parser.add_argument(
43 | "-c",
44 | "--checkpoint-path",
45 | type=str,
46 | help="Checkpoint path",
47 | default="Qwen/Qwen-7B",
48 | )
49 | parser.add_argument(
50 | "-f",
51 | "--sample-input-file",
52 | type=str,
53 | default=None,
54 | help="data path to HumanEval.jsonl",
55 | )
56 | parser.add_argument(
57 | "-o", "--sample-output-file", type=str, default="HumanEval_res.jsonl"
58 | )
59 |
60 | args = parser.parse_args()
61 | print("Loading tokenizer ...")
62 | tokenizer = AutoTokenizer.from_pretrained(
63 | args.checkpoint_path, trust_remote_code=True
64 | )
65 |
66 | print("Loading model ...")
67 | model = AutoModelForCausalLM.from_pretrained(
68 | args.checkpoint_path, device_map="auto", trust_remote_code=True
69 | ).eval()
70 | model.generation_config = GenerationConfig.from_pretrained(
71 | args.checkpoint_path, trust_remote_code=True
72 | )
73 | model.generation_config.do_sample = False
74 |
75 | f_output = jsonlines.Writer(open(args.sample_output_file, "w", encoding="utf-8"))
76 |
77 | f = jsonlines.open(args.sample_input_file)
78 | with f_output as output:
79 | for jobj in tqdm.tqdm(f, desc="task_idx"):
80 | prompt = jobj["prompt"]
81 | task_id = jobj["task_id"]
82 | gen_sents = generate_sample(model, tokenizer, prompt)
83 | gen_jobjs = {"task_id": task_id, "completion": gen_sents}
84 | output.write(gen_jobjs)
85 | f_output.close()
86 |
--------------------------------------------------------------------------------
/eval/gsm8k_prompt.txt:
--------------------------------------------------------------------------------
1 | Question: In 2004, there were 60 kids at a cookout. In 2005, half the number of kids came to the cookout as compared to 2004. In 2006, 2/3 as many kids came to the cookout as in 2005. How many kids came to the cookout in 2006?
2 | Let's think step by step
3 | In 2005, 60/2=30 kids came to the cookout.
4 | In 2006, 30/3*2=20 kids came to the cookout.
5 | The answer is 20
6 |
7 | Question: Zilla spent 7% of her monthly earnings on rent, half of it on her other monthly expenses, and put the rest in her savings. If she spent $133 on her rent, how much does she deposit into her savings account in a month?
8 | Let's think step by step
9 | Since $133 is equal to 7% of her earnings, then 1% is equal to $133/7 = $19.
10 | The total monthly earning of Zilla is represented by 100%, so $19 x 100 = $1900 is her monthly earnings.
11 | So, $1900/2 = $950 is spent on her other monthly expenses.
12 | The total amount spent on the rent and other monthly expenses is $133 + $950 = $1083.
13 | Hence, she saves $1900 - $1083 = $817 per month.
14 | The answer is 817
15 |
16 | Question: If Buzz bought a pizza with 78 slices at a restaurant and then decided to share it with the waiter in the ratio of 5:8, with Buzz's ratio being 5, what's twenty less the number of slices of pizza that the waiter ate?
17 | Let's think step by step
18 | The total ratio representing the slices of pizza that Buzz bought is 5+8=13
19 | If he shared the slices of pizza with the waiter, the waiter received a fraction of 8/13 of the total number of slices, which totals 8/13 * 78 = 48 slices
20 | Twenty less the number of slices of pizza that the waiter ate is 48-20 = 28
21 | The answer is 28
22 |
23 | Question: Jame gets a raise to $20 per hour and works 40 hours a week. His old job was $16 an hour for 25 hours per week. How much more money does he make per year in his new job than the old job if he works 52 weeks a year?
24 | Let's think step by step
25 | He makes 20*40=$800 per week
26 | He used to make 16*25=$400 per week
27 | So his raise was 800-400=$400 per week
28 | So he makes 400*52=$20,800 per year more
29 | The answer is 20800
30 |
31 | Question: Mr. Gardner bakes 20 cookies, 25 cupcakes, and 35 brownies for his second-grade class of 20 students. If he wants to give each student an equal amount of sweet treats, how many sweet treats will each student receive?
32 | Let's think step by step
33 | Mr. Gardner bakes a total of 20 + 25 + 35 = 80 sweet treats
34 | Each student will receive 80 / 20 = 4 sweet treats
35 | The answer is 4
36 |
37 | Question: A used car lot has 24 cars and motorcycles (in total) for sale. A third of the vehicles are motorcycles, and a quarter of the cars have a spare tire included. How many tires are on the used car lot’s vehicles in all?
38 | Let's think step by step
39 | The used car lot has 24 / 3 = 8 motorcycles with 2 tires each.
40 | The lot has 24 - 8 = 16 cars for sale
41 | There are 16 / 4 = 4 cars with a spare tire with 5 tires each.
42 | The lot has 16 - 4 = 12 cars with 4 tires each.
43 | Thus, the used car lot’s vehicles have 8 * 2 + 4 * 5 + 12 * 4 = 16 + 20 + 48 = 84 tires in all.
44 | The answer is 84
45 |
46 | Question: Norma takes her clothes to the laundry. She leaves 9 T-shirts and twice as many sweaters as T-shirts in the washer. When she returns she finds 3 sweaters and triple the number of T-shirts. How many items are missing?
47 | Let's think step by step
48 | Norma left 9 T-shirts And twice as many sweaters, she took 9 * 2= 18 sweaters
49 | Adding the T-shirts and sweaters, Norma left 9 + 18 = 27 clothes
50 | When she came back, she found 3 sweaters And triple the number of T-shirts, she found 3 * 3 = 9 T-shirts
51 | Adding the T-shirts and sweaters, Norma found 3 + 9 = 12 clothes
52 | Subtracting the clothes she left from the clothes she found, 27 - 12 = 15 clothes are missing
53 | The answer is 15
54 |
55 | Question: Adam has an orchard. Every day for 30 days he picks 4 apples from his orchard. After a month, Adam has collected all the remaining apples, which were 230. How many apples in total has Adam collected from his orchard?
56 | Let's think step by step
57 | During 30 days Adam picked 4 * 30 = 120 apples.
58 | So in total with all the remaining apples, he picked 120 + 230 = 350 apples from his orchard.
59 | The answer is 350
60 |
--------------------------------------------------------------------------------
/examples/auto_comments.md:
--------------------------------------------------------------------------------
1 | # Auto Comments
2 | This document introduces Auto Comments, a showcase that uses the Qwen model to automatically generate comments for code files.
3 | 
4 | # Usage
5 | Run the following command to generate comments for the given code file:
6 | ```
7 | python auto_comments.py --path 'path of file or folder'
8 | ```
9 | 
10 | Arguments:
11 | - path: file path. It can be a single file (currently only Python source files are supported) or a folder (all Python files under the folder will be scanned).
12 | - regenerate: regenerate the comments. Defaults to False; set it to True if you need to regenerate comments for a file that has already been processed.
13 | 
14 | # Example
15 | - Run: python auto_comments.py --path test_file.py
16 | - test_file.py contains:
17 | ```
18 | import numpy as np
19 | import pandas as pd
20 | import seaborn as sns
21 | sns.set_theme(style="whitegrid")
22 |
23 | rs = np.random.RandomState(365)
24 | values = rs.randn(365, 4).cumsum(axis=0)
25 | dates = pd.date_range("1 1 2016", periods=365, freq="D")
26 | data = pd.DataFrame(values, dates, columns=["A", "B", "C", "D"])
27 | data = data.rolling(7).mean()
28 |
29 | sns.lineplot(data=data, palette="tab10", linewidth=2.5)
30 | ```
31 |
32 | - Output: test_file_comments.py (the code file with the generated comments), whose content is:
33 | ```
34 | # 导入需要的库
35 | import numpy as np
36 | import pandas as pd
37 | import seaborn as sns
38 |
39 | # 设置 Seaborn 的主题风格为白色网格
40 | sns.set_theme(style="whitegrid")
41 |
42 | # 生成随机数
43 | rs = np.random.RandomState(365)
44 |
45 | # 生成 365 行 4 列的随机数,并按行累加
46 | values = rs.randn(365, 4).cumsum(axis=0)
47 |
48 | # 生成日期
49 | dates = pd.date_range("1 1 2016", periods=365, freq="D")
50 |
51 | # 将随机数和日期组合成 DataFrame
52 | data = pd.DataFrame(values, dates, columns=["A", "B", "C", "D"])
53 |
54 | # 对 DataFrame 进行 7 天滑动平均
55 | data = data.rolling(7).mean()
56 |
57 | # 使用 Seaborn 绘制折线图
58 | sns.lineplot(data=data, palette="tab10", linewidth=2.5)
59 | ```
60 |
--------------------------------------------------------------------------------
/examples/auto_comments.py:
--------------------------------------------------------------------------------
1 | # Usage: python auto_comments.py --path 'path of file or folder'
2 | # Purpose: use Qwen-7B-Chat to automatically generate comments for the given code files. (See auto_comments.md for details.)
3 |
4 |
5 | import argparse
6 | import os
7 | from transformers import AutoModelForCausalLM, AutoTokenizer
8 | from transformers.generation import GenerationConfig
9 |
10 | MaxLine = 50 # maximum number of code lines handled in a single request
11 | SplitKey = ["\ndef "] # custom markers used to split the code
12 | CodeFileType = ["py"] # only generating comments for Python files has been tested so far
13 |
14 | def parse_args():
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument('--path', type=str, default='Qwen-7B/eval/evaluate_ceval.py')
17 |     parser.add_argument('--regenerate', action='store_true', default=False) # if comments were already generated, they are not regenerated by default
18 | args = parser.parse_args()
19 | return args
20 |
21 | class QWenChat():
22 | def __init__(self):
23 | self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
24 |
25 | # use bf16
26 | # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True, bf16=True).eval()
27 | # use fp16
28 | # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True, fp16=True).eval()
29 | # use cpu only
30 | # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="cpu", trust_remote_code=True).eval()
31 | # use auto mode, automatically select precision based on the device.
32 | self.model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True).eval()
33 |
34 | # Specify hyperparameters for generation
35 | self.model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
36 | self.history = None
37 |
38 | def chat(self, query, system = ""):
39 |
40 | # use history
41 | # response, history = self.model.chat(self.tokenizer, query, history=self.history)
42 |
43 |         # history is not used by default
44 | response, history = self.model.chat(self.tokenizer, query, history=None)
45 | self.history = history
46 |
47 | return response
48 | # Generate comments
49 | def gen_code_comments(context, model = None, **kwargs):
50 | prompt = "\n为以上代码生成细致的中文注释,注意使用合适的语法。要求必须在每个函数开头生成一段统一的函数功能注释。\n除了注释,请保证原始代码内容不变。不要返回除了注释和代码以外的其余信息,不要生成额外代码。\n"
51 | return model.chat(context + prompt)
52 |
53 | def read_file(path):
54 | f = open(path, "r",encoding='utf-8')
55 | lines = f.readlines()
56 | return "".join(lines)
57 |
58 | def write_file(path, context):
59 | with open(path,'w') as f:
60 | f.write(context)
61 |
62 | # If the code file is too long, simply split it by the maximum number of lines
63 | def split_context_by_maxline(text):
64 | lines = text.split("\n")
65 | lines_len = len(lines)
66 | res = []
67 | for i in range(MaxLine, lines_len, MaxLine):
68 | res.append("\n".join(lines[i-MaxLine:i]))
69 |
70 |     if MaxLine * len(res) < lines_len:  # avoids referencing the loop variable when the loop never runs (lines_len == MaxLine)
71 |         res.append("\n".join(lines[MaxLine * len(res):]))
72 | return res
73 |
74 | # If the code file is too long, simply split it at function boundaries
75 | def split_context_by_splitkey(text):
76 | blocks = text.split(SplitKey[0])
77 | return [blocks[0]] + [SplitKey[0]+x for x in blocks[1:]]
78 |
79 | # Merge the original code with the generated comments to ensure the original code is left unchanged. Various strategies could be used for this step.
80 | def merge_code_and_comments(original_file, comments_path):
81 | res = []
82 | ori_f = open(original_file, "r",encoding='utf-8')
83 | ori_lines = ori_f.readlines()
84 |
85 | com_f = open(comments_path, "r",encoding='utf-8')
86 | com_lines = com_f.readlines()
87 | len_com_lines = len(com_lines)
88 | p = 0
89 | j = 0
90 | for i, line in enumerate(ori_lines):
91 | if line.isspace():
92 | continue
93 | if line.strip()[0] == '#':
94 | res.append(line)
95 | continue
96 | while j < len_com_lines and line[:-1] not in com_lines[j]:
97 | j += 1
98 | if j < len_com_lines:
99 | p = j - 1
100 | up_comments = []
101 | triple_dot_flag = 0
102 | while p < j:
103 | if p < 0 or (res and res[-1] and com_lines[p] == res[-1]):
104 | break
105 | if com_lines[p].strip() and (len(com_lines[p].strip())>3 and com_lines[p].strip()[-3:] == '"""' and com_lines[p].strip()[:3] == '"""') or (len(com_lines[p].strip())>3 and com_lines[p].strip()[-3:] == "'''" and com_lines[p].strip()[:3] == "'''"):
106 | up_comments.append(com_lines[p])
107 | p -= 1
108 | continue
109 | if com_lines[p].strip() and (com_lines[p].strip()[-3:] == '"""' or com_lines[p].strip()[:3] == '"""' or com_lines[p].strip()[-3:] == "'''" or com_lines[p].strip()[:3] == "'''"):
110 | triple_dot_flag = (triple_dot_flag + 1)%2
111 | up_comments.append(com_lines[p])
112 | p -= 1
113 | continue
114 | if triple_dot_flag:
115 | up_comments.append(com_lines[p])
116 | p -= 1
117 | continue
118 | if (com_lines[p].strip()=="") or (com_lines[p].strip() and com_lines[p].strip()[0] == '#' and "省略部分内容" not in com_lines[p]):
119 | up_comments.append(com_lines[p])
120 | else:
121 | break
122 | p -= 1
123 | if up_comments:
124 | res.extend(reversed(up_comments))
125 | if "#" in com_lines[j] and "#" not in line:
126 | in_line_comments = " #" + com_lines[j].split("#")[-1]
127 | res.append(line[:-1]+in_line_comments)
128 | else:
129 | res.append(line)
130 | p = j+1
131 | else:
132 | res.append(line)
133 | j = p
134 |
135 | write_file(comments_path, "".join(res))
136 |
137 | # Process a single file
138 | def deal_one_file(model, path, args):
139 | context = read_file(path)
140 |
141 | fname = path.split("/")[-1]
142 | fpath = "/".join(path.split("/")[:-1])
143 | outfname = fname.split(".")[0]+"_comments."+fname.split(".")[-1]
144 |
145 | comments_path = os.path.join(fpath, outfname)
146 | if (not args.regenerate) and os.path.exists(comments_path):
147 | print("use cache: ", comments_path)
148 | return
149 |
150 | context_line = len(context.split("\n"))
151 | if context_line < MaxLine:
152 | res = gen_code_comments(context, model = model)
153 | elif SplitKey[0] not in context:
154 | context_list = split_context_by_maxline(context)
155 | res = "\n".join([gen_code_comments(context_block, model = model) for context_block in context_list])
156 | else:
157 | context_list = split_context_by_splitkey(context)
158 | res = "\n".join([gen_code_comments(context_block, model = model) for context_block in context_list])
159 |
160 | write_file(comments_path, res)
161 | merge_code_and_comments(path, comments_path)
162 |
163 | # Process a folder
164 | def deal_folder(model, path, args):
165 | for fl in os.listdir(path):
166 | now_path = os.path.join(path, fl)
167 | if os.path.isfile(now_path):
168 | if (now_path.split(".")[-1] in CodeFileType) and ("_comments" not in now_path):
169 | deal_one_file(model, now_path, args)
170 | elif os.path.isdir(now_path):
171 | deal_folder(model, now_path, args)
172 | else:
173 | print("Please specify a correct path!")
174 |
175 | def transfer(args):
176 | model = QWenChat()
177 |
178 | if os.path.isfile(args.path):
179 | if (args.path.split(".")[-1] in CodeFileType) and ("_comments" not in args.path):
180 | deal_one_file(model, args.path, args)
181 | elif os.path.isdir(args.path):
182 | deal_folder(model, args.path, args)
183 | else:
184 | print("Please specify a correct path!")
185 |
186 | if __name__ == '__main__':
187 | args = parse_args()
188 | print(args)
189 | transfer(args)
190 |
--------------------------------------------------------------------------------
/examples/function_call_examples.py:
--------------------------------------------------------------------------------
1 | # Reference: https://openai.com/blog/function-calling-and-other-api-updates
2 | import json
3 | from pprint import pprint
4 |
5 | import openai
6 |
7 | # To start an OpenAI-like Qwen server, use the following commands:
8 | # git clone https://github.com/QwenLM/Qwen-7B;
9 | # cd Qwen-7B;
10 | # pip install fastapi uvicorn openai pydantic sse_starlette;
11 | # python openai_api.py;
12 | #
13 | # Then configure the api_base and api_key in your client:
14 | openai.api_base = 'http://localhost:8000/v1'
15 | openai.api_key = 'none'
16 |
17 |
18 | def call_qwen(messages, functions=None):
19 | print('input:')
20 | pprint(messages, indent=2)
21 | if functions:
22 | response = openai.ChatCompletion.create(model='Qwen',
23 | messages=messages,
24 | functions=functions)
25 | else:
26 | response = openai.ChatCompletion.create(model='Qwen',
27 | messages=messages)
28 | response = response.choices[0]['message']
29 | response = json.loads(json.dumps(response,
30 | ensure_ascii=False)) # fix zh rendering
31 | print('output:')
32 | pprint(response, indent=2)
33 | print()
34 | return response
35 |
36 |
37 | def test_1():
38 | messages = [{'role': 'user', 'content': '你好'}]
39 | call_qwen(messages)
40 | messages.append({'role': 'assistant', 'content': '你好!很高兴为你提供帮助。'})
41 |
42 | messages.append({
43 | 'role': 'user',
44 | 'content': '给我讲一个年轻人奋斗创业最终取得成功的故事。故事只能有一句话。'
45 | })
46 | call_qwen(messages)
47 | messages.append({
48 | 'role':
49 | 'assistant',
50 | 'content':
51 | '故事的主人公叫李明,他来自一个普通的家庭,父母都是普通的工人。李明想要成为一名成功的企业家。……',
52 | })
53 |
54 | messages.append({'role': 'user', 'content': '给这个故事起一个标题'})
55 | call_qwen(messages)
56 |
57 |
58 | def test_2():
59 | functions = [
60 | {
61 | 'name_for_human':
62 | '谷歌搜索',
63 | 'name_for_model':
64 | 'google_search',
65 | 'description_for_model':
66 | '谷歌搜索是一个通用搜索引擎,可用于访问互联网、查询百科知识、了解时事新闻等。' +
67 | ' Format the arguments as a JSON object.',
68 | 'parameters': [{
69 | 'name': 'search_query',
70 | 'description': '搜索关键词或短语',
71 | 'required': True,
72 | 'schema': {
73 | 'type': 'string'
74 | },
75 | }],
76 | },
77 | {
78 | 'name_for_human':
79 | '文生图',
80 | 'name_for_model':
81 | 'image_gen',
82 | 'description_for_model':
83 | '文生图是一个AI绘画(图像生成)服务,输入文本描述,返回根据文本作画得到的图片的URL。' +
84 | ' Format the arguments as a JSON object.',
85 | 'parameters': [{
86 | 'name': 'prompt',
87 | 'description': '英文关键词,描述了希望图像具有什么内容',
88 | 'required': True,
89 | 'schema': {
90 | 'type': 'string'
91 | },
92 | }],
93 | },
94 | ]
95 |
96 | messages = [{'role': 'user', 'content': '(请不要调用工具)\n\n你好'}]
97 | call_qwen(messages, functions)
98 | messages.append({
99 | 'role': 'assistant',
100 | 'content': '你好!很高兴见到你。有什么我可以帮忙的吗?'
101 | }, )
102 |
103 | messages.append({'role': 'user', 'content': '搜索一下谁是周杰伦'})
104 | call_qwen(messages, functions)
105 | messages.append({
106 | 'role': 'assistant',
107 | 'content': '我应该使用Google搜索查找相关信息。',
108 | 'function_call': {
109 | 'name': 'google_search',
110 | 'arguments': '{"search_query": "周杰伦"}',
111 | },
112 | })
113 |
114 | messages.append({
115 | 'role': 'function',
116 | 'name': 'google_search',
117 | 'content': 'Jay Chou is a Taiwanese singer.',
118 | })
119 | call_qwen(messages, functions)
120 | messages.append(
121 | {
122 | 'role': 'assistant',
123 | 'content': '周杰伦(Jay Chou)是一位来自台湾的歌手。',
124 | }, )
125 |
126 | messages.append({'role': 'user', 'content': '搜索一下他老婆是谁'})
127 | call_qwen(messages, functions)
128 | messages.append({
129 | 'role': 'assistant',
130 | 'content': '我应该使用Google搜索查找相关信息。',
131 | 'function_call': {
132 | 'name': 'google_search',
133 | 'arguments': '{"search_query": "周杰伦 老婆"}',
134 | },
135 | })
136 |
137 | messages.append({
138 | 'role': 'function',
139 | 'name': 'google_search',
140 | 'content': 'Hannah Quinlivan'
141 | })
142 | call_qwen(messages, functions)
143 | messages.append(
144 | {
145 | 'role': 'assistant',
146 | 'content': '周杰伦的老婆是Hannah Quinlivan。',
147 | }, )
148 |
149 | messages.append({'role': 'user', 'content': '用文生图工具画个可爱的小猫吧,最好是黑猫'})
150 | call_qwen(messages, functions)
151 | messages.append({
152 | 'role': 'assistant',
153 | 'content': '我应该使用文生图API来生成一张可爱的小猫图片。',
154 | 'function_call': {
155 | 'name': 'image_gen',
156 | 'arguments': '{"prompt": "cute black cat"}',
157 | },
158 | })
159 |
160 | messages.append({
161 | 'role':
162 | 'function',
163 | 'name':
164 | 'image_gen',
165 | 'content':
166 | '{"image_url": "https://image.pollinations.ai/prompt/cute%20black%20cat"}',
167 | })
168 | call_qwen(messages, functions)
169 |
170 |
171 | def test_3():
172 | functions = [{
173 | 'name': 'get_current_weather',
174 | 'description': 'Get the current weather in a given location.',
175 | 'parameters': {
176 | 'type': 'object',
177 | 'properties': {
178 | 'location': {
179 | 'type': 'string',
180 | 'description':
181 | 'The city and state, e.g. San Francisco, CA',
182 | },
183 | 'unit': {
184 | 'type': 'string',
185 | 'enum': ['celsius', 'fahrenheit']
186 | },
187 | },
188 | 'required': ['location'],
189 | },
190 | }]
191 |
192 | messages = [{
193 | 'role': 'user',
194 | # Note: The current version of Qwen-7B-Chat (as of 2023.08) performs okay with Chinese tool-use prompts,
195 | # but performs terribly when it comes to English tool-use prompts, due to a mistake in data collecting.
196 | 'content': '波士顿天气如何?',
197 | }]
198 | call_qwen(messages, functions)
199 | messages.append(
200 | {
201 | 'role': 'assistant',
202 | 'content': None,
203 | 'function_call': {
204 | 'name': 'get_current_weather',
205 | 'arguments': '{"location": "Boston, MA"}',
206 | },
207 | }, )
208 |
209 | messages.append({
210 | 'role':
211 | 'function',
212 | 'name':
213 | 'get_current_weather',
214 | 'content':
215 | '{"temperature": "22", "unit": "celsius", "description": "Sunny"}',
216 | })
217 | call_qwen(messages, functions)
218 |
219 |
220 | def test_4():
221 | from langchain.agents import AgentType, initialize_agent, load_tools
222 | from langchain.chat_models import ChatOpenAI
223 |
224 | llm = ChatOpenAI(
225 | model_name='Qwen',
226 | openai_api_base='http://localhost:8000/v1',
227 | openai_api_key='EMPTY',
228 | streaming=False,
229 | )
230 | tools = load_tools(['arxiv'], )
231 | agent_chain = initialize_agent(
232 | tools,
233 | llm,
234 | agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
235 | verbose=True,
236 | )
237 | # TODO: The performance is okay with Chinese prompts, but not so good when it comes to English.
238 | agent_chain.run('查一下论文 1605.08386 的信息')
239 |
240 |
241 | if __name__ == '__main__':
242 | print('### Test Case 1 - No Function Calling (普通问答、无函数调用) ###')
243 | test_1()
244 | print('### Test Case 2 - Use Qwen-Style Functions (函数调用,千问格式) ###')
245 | test_2()
246 | print('### Test Case 3 - Use GPT-Style Functions (函数调用,GPT格式) ###')
247 | test_3()
248 | print('### Test Case 4 - Use LangChain (接入Langchain) ###')
249 | test_4()
250 |
--------------------------------------------------------------------------------
/examples/qwen_extra.tiktoken:
--------------------------------------------------------------------------------
1 | 5LiA5Y+q54yr 151851
2 | 5Y+q54yr 151852
3 | 5piv5LiA5Y+q54yr 151853
4 | 5oiR5piv5LiA5Y+q54yr 151854
5 | 5L2g5piv5LiA5Y+q54yr 151855
6 | 5LuW5piv5LiA5Y+q54yr 151856
7 |
--------------------------------------------------------------------------------
/examples/qwen_extra_vocab.txt:
--------------------------------------------------------------------------------
1 | 我是一只猫 20
2 | 你是一只猫 10
3 | 他是一只猫 5
4 | 一只 200
5 | 一只猫 100
6 | 夸张的 比喻手法 20
--------------------------------------------------------------------------------
/examples/system_prompt.md:
--------------------------------------------------------------------------------
1 | # 系统指令 (System Prompts)
2 |
3 | ## 什么是系统指令? (What are System Prompts?)
4 |
5 | 系统指令设定了AI助手的行为模式,例如人物设定、语言风格、任务模式、甚至针对具体问题的具体行为。
6 |
7 | System Prompts set the behavior mode of the AI assistant, such as character settings, language styles, task modes, and even specific behaviors for specific tasks.
8 |
9 | 系统指令可以是一个广泛的人物设定,如“You are a helpful assistant”;也可以是一个十分详细的要求,如“拒绝回答所有代码相关的问题”。
10 |
11 | The System Prompts can be a broad character setting, such as "You are a helpful assistant"; or it can be a very detailed request, such as "Refuse to answer all code-related questions."
12 |
13 | 系统指令为用户提供了一个易组织、上下文稳定的控制AI助手行为的方式,可以从多种角度定制属于你自己的AI助手。
14 |
15 | System Prompts provide users with an easy-to-organize, context-stable way to control the behavior of the AI assistant. You can customize your own AI assistant from multiple perspectives.
16 |
17 | 系统指令需要在多轮对话中稳定,例如角色扮演类系统指令被设定后AI助手不应该在多轮对话中跳脱自身的设定。
18 |
19 | System Prompts need to be stable across multiple rounds of dialogue. For example, after a role-playing system prompt is set, the AI assistant should not escape its own settings in multiple rounds of dialogue.
20 |
21 | 同时,模型也需要具有基于系统指令中对自身行为进行推理的能力。这两者都是为模型赋予跟随系统指令能力时需要克服的难点。
22 |
23 | At the same time, the model also needs to have the ability to reason about its own behavior based on system prompts. Both of these are difficulties that need to be overcome when giving the model the ability to follow system prompts.
24 |
25 | Qwen-1.8B-Chat 和 Qwen-72B-Chat在多样且存在多轮复杂交互的系统指令上进行了充分训练,使模型可以跟随多样的系统指令,实现上下文(in-context)中的模型定制化,进一步提升了通义千问的可扩展性。
26 |
27 | Qwen-1.8B-Chat and Qwen-72B-Chat have been fully trained on diverse system prompts with multiple rounds of complex interactions, so that they can follow a variety of system prompts and realize model customization in context, further improving the scalability of Qwen-Chat.
28 |
29 | ## 系统指令能做什么? (What can System Prompts do?)
30 |
31 | ### 角色扮演 Role Play
32 |
33 | 在系统指令中告诉千问你需要它扮演的角色,即可沉浸式和该角色对话交流
34 |
35 | Tell Qwen-Chat the role you want it to play in the System Prompt, and you can have an immersive conversation with that role.
36 |
37 |
38 | 
39 |
40 | 
41 |
42 | ### 语言风格 Language Style
43 |
44 |
45 | 简单调整千问的语言风格
46 |
47 | Simple adjustment of Qwen-Chat's language style
48 |
49 | 
50 |
51 | 
52 |
53 | ### 任务设定 Task Setting
54 |
55 | 指定具体任务,打造处理专项任务的千问模型
56 |
57 | Setting specific tasks and creating a Qwen-Chat model to handle special tasks
58 |
59 | 
60 |
61 | 
62 |
63 | ### 行为设定 Behavior Setting
64 |
65 | 设定千问对具体任务的行为模式
66 |
67 | Set behavior patterns of Qwen-Chat for specific tasks
68 |
69 | 
70 |
71 | 
72 |
73 | ## 代码示例 Example
74 |
75 | ```python
76 | from transformers import AutoModelForCausalLM, AutoTokenizer
77 | from transformers.generation import GenerationConfig
78 |
79 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-1_8B-Chat", trust_remote_code=True)
80 |
81 | # Only Qwen-72B-Chat and Qwen-1_8B-Chat have system prompt enhancement now.
82 | model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-1_8B-Chat", device_map="auto", trust_remote_code=True).eval()
83 | # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-72B-Chat", device_map="auto", trust_remote_code=True).eval()
84 |
85 | response, _ = model.chat(tokenizer, "你好呀", history=None, system="请用二次元可爱语气和我说话")
86 | print(response)
87 | # 你好啊!我是一只可爱的二次元猫咪哦,不知道你有什么问题需要我帮忙解答吗?
88 |
89 | response, _ = model.chat(tokenizer, "My colleague works diligently", history=None, system="You will write beautiful compliments according to needs")
90 | print(response)
91 | # Your colleague is an outstanding worker! Their dedication and hard work are truly inspiring. They always go above and beyond to ensure that their tasks are completed on time and to the highest standard. I am lucky to have them as a colleague, and I know I can count on them to handle any challenge that comes their way.
92 | ```
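
As noted above, a system prompt should remain stable across multiple rounds of dialogue. A minimal sketch of a multi-turn role-play conversation, reusing the model and tokenizer loaded in the example above (the role-play system prompt below is illustrative):

```python
# Sketch only: pass the returned history back in so the system prompt persists across turns.
system = "请扮演一位耐心的中学数学老师,用提问引导学生思考"  # illustrative role-play system prompt

response, history = model.chat(tokenizer, "老师,什么是质数?", history=None, system=system)
print(response)

# Second turn: reuse the history from the first turn; the model should stay in character.
response, history = model.chat(tokenizer, "那 51 是质数吗?", history=history, system=system)
print(response)
```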
--------------------------------------------------------------------------------
/examples/transformers_agent.md:
--------------------------------------------------------------------------------
1 | ## What is a HuggingFace Agent
2 | Using a large language model as an agent, you can call models on HuggingFace with natural language alone. Two modes are currently supported:
3 | 
4 | - run mode: single-turn dialogue without context; good at combining multiple tools within a single prompt
5 | - chat mode: multi-turn dialogue with context; good at single calls, and combining multiple tools may require several prompts (a chat-mode sketch is given after the usage example below)
6 | > See the official documentation for details: [Transformers Agents](https://huggingface.co/docs/transformers/transformers_agents)
7 | 
8 | ## Using Qwen as the Agent
9 | ### Install Dependencies
10 | ```
11 | pip install transformers
12 | ```
13 | ### Build QWenAgent
14 | The following code implements a QWenAgent:
15 | ```python
16 | import torch
17 | from transformers import AutoModelForCausalLM, AutoTokenizer, Agent
18 | from transformers.generation import GenerationConfig
19 |
20 |
21 | class QWenAgent(Agent):
22 | """
23 | Agent that uses QWen model and tokenizer to generate code.
24 |
25 | Args:
26 | chat_prompt_template (`str`, *optional*):
27 | Pass along your own prompt if you want to override the default template for the `chat` method. Can be the
28 | actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named
29 | `chat_prompt_template.txt` in this repo in this case.
30 | run_prompt_template (`str`, *optional*):
31 | Pass along your own prompt if you want to override the default template for the `run` method. Can be the
32 | actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named
33 | `run_prompt_template.txt` in this repo in this case.
34 | additional_tools ([`Tool`], list of tools or dictionary with tool values, *optional*):
35 | Any additional tools to include on top of the default ones. If you pass along a tool with the same name as
36 | one of the default tools, that default tool will be overridden.
37 |
38 | Example:
39 |
40 | ```py
41 | agent = QWenAgent()
42 | agent.run("Draw me a picture of rivers and lakes.")
43 | ```
44 | """
45 | def __init__(self, chat_prompt_template=None, run_prompt_template=None, additional_tools=None):
46 | checkpoint = "Qwen/Qwen-7B-Chat"
47 | self.tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
48 | self.model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", trust_remote_code=True).cuda().eval()
49 |         self.model.generation_config = GenerationConfig.from_pretrained(checkpoint, trust_remote_code=True) # different generation lengths, top_p and other hyperparameters can be specified here
50 | self.model.generation_config.do_sample = False # greedy
51 |
52 | super().__init__(
53 | chat_prompt_template=chat_prompt_template,
54 | run_prompt_template=run_prompt_template,
55 | additional_tools=additional_tools,
56 | )
57 |
58 | def generate_one(self, prompt, stop):
59 | # "Human:" 和 "Assistant:" 曾为通义千问的特殊保留字,需要替换为 "_HUMAN_:" 和 "_ASSISTANT_:"。这一问题将在未来版本修复。
60 | prompt = prompt.replace("Human:", "_HUMAN_:").replace("Assistant:", "_ASSISTANT_:")
61 | stop = [item.replace("Human:", "_HUMAN_:").replace("Assistant:", "_ASSISTANT_:") for item in stop]
62 |
63 | result, _ = self.model.chat(self.tokenizer, prompt, history=None)
64 | for stop_seq in stop:
65 | if result.endswith(stop_seq):
66 | result = result[: -len(stop_seq)]
67 |
68 | result = result.replace("_HUMAN_:", "Human:").replace("_ASSISTANT_:", "Assistant:")
69 | return result
70 |
71 |
72 | agent = QWenAgent()
73 | agent.run("Draw me a picture of rivers and lakes.")
74 | ```
75 | ### Usage Example
76 | ```python
77 | agent = QWenAgent()
78 | agent.run("generate an image of panda", remote=True)
79 | ```
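
For the chat mode mentioned at the beginning of this document, the same agent object keeps context between calls. A minimal sketch (the prompts are illustrative):

```python
# Sketch only: chat mode keeps conversational context, so a request can be refined step by step.
agent = QWenAgent()
agent.chat("Generate an image of a panda")
agent.chat("Transform the image so that the panda is wearing a hat")
```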
80 | 
81 | 
82 | 
83 | > See the official HuggingFace documentation [Transformers Agents](https://huggingface.co/docs/transformers/transformers_agents) for more ways to use it
84 |
85 | ## Tools
86 | ### Supported Tools
87 | The 14 official HuggingFace Agent tools:
88 |
89 | - **Document question answering**: given a document (such as a PDF) in image format, answer a question on this document (Donut)
90 | - **Text question answering**: given a long text and a question, answer the question in the text (Flan-T5)
91 | - **Unconditional image captioning**: Caption the image! (BLIP)
92 | - **Image question answering**: given an image, answer a question on this image (VILT)
93 | - **Image segmentation**: given an image and a prompt, output the segmentation mask of that prompt (CLIPSeg)
94 | - **Speech to text**: given an audio recording of a person talking, transcribe the speech into text (Whisper)
95 | - **Text to speech**: convert text to speech (SpeechT5)
96 | - **Zero-shot text classification**: given a text and a list of labels, identify to which label the text corresponds the most (BART)
97 | - **Text summarization**: summarize a long text in one or a few sentences (BART)
98 | - **Translation**: translate the text into a given language (NLLB)
99 | - **Text downloader**: to download a text from a web URL
100 | - **Text to image**: generate an image according to a prompt, leveraging stable diffusion
101 | - **Image transformation**: transforms an image
102 | - **Text to video**: generate a small video according to a prompt, leveraging damo-vilab
103 | ### Tool Model Deployment
104 | Models behind some of the tools are already deployed online by HuggingFace; simply set remote=True to call them remotely:
105 | > agent.run(xxx, remote=True)
106 |
107 | For models that HuggingFace has not deployed online, the checkpoint is downloaded automatically and inference runs locally.
108 | If HuggingFace is occasionally unreachable due to network issues, please retry a few times.
109 |
--------------------------------------------------------------------------------
/finetune/ds_config_zero2.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "betas": "auto",
18 | "eps": "auto",
19 | "weight_decay": "auto"
20 | }
21 | },
22 |
23 | "scheduler": {
24 | "type": "WarmupLR",
25 | "params": {
26 | "warmup_min_lr": "auto",
27 | "warmup_max_lr": "auto",
28 | "warmup_num_steps": "auto"
29 | }
30 | },
31 |
32 | "zero_optimization": {
33 | "stage": 2,
34 | "offload_optimizer": {
35 | "device": "none",
36 | "pin_memory": true
37 | },
38 | "allgather_partitions": true,
39 | "allgather_bucket_size": 2e8,
40 | "overlap_comm": true,
41 | "reduce_scatter": true,
42 | "reduce_bucket_size": 2e8,
43 | "contiguous_gradients": true
44 | },
45 |
46 | "gradient_accumulation_steps": "auto",
47 | "gradient_clipping": "auto",
48 | "steps_per_print": 100,
49 | "train_batch_size": "auto",
50 | "train_micro_batch_size_per_gpu": "auto",
51 | "wall_clock_breakdown": false
52 | }
--------------------------------------------------------------------------------
/finetune/ds_config_zero3.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "betas": "auto",
18 | "eps": "auto",
19 | "weight_decay": "auto"
20 | }
21 | },
22 |
23 | "scheduler": {
24 | "type": "WarmupLR",
25 | "params": {
26 | "warmup_min_lr": "auto",
27 | "warmup_max_lr": "auto",
28 | "warmup_num_steps": "auto"
29 | }
30 | },
31 |
32 | "zero_optimization": {
33 | "stage": 3,
34 | "offload_optimizer": {
35 | "device": "none",
36 | "pin_memory": true
37 | },
38 | "offload_param": {
39 | "device": "none",
40 | "pin_memory": true
41 | },
42 | "overlap_comm": true,
43 | "contiguous_gradients": true,
44 | "sub_group_size": 1e9,
45 | "reduce_bucket_size": "auto",
46 | "stage3_prefetch_bucket_size": "auto",
47 | "stage3_param_persistence_threshold": "auto",
48 | "stage3_max_live_parameters": 1e9,
49 | "stage3_max_reuse_distance": 1e9,
50 | "stage3_gather_16bit_weights_on_model_save": true
51 | },
52 |
53 | "gradient_accumulation_steps": "auto",
54 | "gradient_clipping": "auto",
55 | "steps_per_print": 100,
56 | "train_batch_size": "auto",
57 | "train_micro_batch_size_per_gpu": "auto",
58 | "wall_clock_breakdown": false
59 | }
60 |
--------------------------------------------------------------------------------
/finetune/finetune_ds.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export CUDA_DEVICE_MAX_CONNECTIONS=1
3 | DIR=`pwd`
4 |
5 | # Guide:
6 | # This script supports distributed training on multi-gpu workers (as well as single-worker training).
7 | # Please set the options below according to the comments.
8 | # For multi-gpu workers training, these options should be manually set for each worker.
9 | # After setting the options, please run the script on each worker.
10 |
11 | # Number of GPUs per GPU worker
12 | GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())')
13 |
14 | # Number of GPU workers, for single-worker training, please set to 1
15 | NNODES=${NNODES:-1}
16 |
17 | # The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0
18 | NODE_RANK=${NODE_RANK:-0}
19 |
20 | # The ip address of the rank-0 worker, for single-worker training, please set to localhost
21 | MASTER_ADDR=${MASTER_ADDR:-localhost}
22 |
23 | # The port for communication
24 | MASTER_PORT=${MASTER_PORT:-6001}
25 |
26 | MODEL="Qwen/Qwen-7B" # Set the path if you do not want to load from huggingface directly
27 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
28 | # See the section for finetuning in README for more information.
29 | DATA="path_to_data"
30 |
31 | function usage() {
32 | echo '
33 | Usage: bash finetune/finetune_ds.sh [-m MODEL_PATH] [-d DATA_PATH]
34 | '
35 | }
36 |
37 | while [[ "$1" != "" ]]; do
38 | case $1 in
39 | -m | --model )
40 | shift
41 | MODEL=$1
42 | ;;
43 | -d | --data )
44 | shift
45 | DATA=$1
46 | ;;
47 | -h | --help )
48 | usage
49 | exit 0
50 | ;;
51 | * )
52 | echo "Unknown argument ${1}"
53 | exit 1
54 | ;;
55 | esac
56 | shift
57 | done
58 |
59 | DISTRIBUTED_ARGS="
60 | --nproc_per_node $GPUS_PER_NODE \
61 | --nnodes $NNODES \
62 | --node_rank $NODE_RANK \
63 | --master_addr $MASTER_ADDR \
64 | --master_port $MASTER_PORT
65 | "
66 |
67 | torchrun $DISTRIBUTED_ARGS finetune.py \
68 | --model_name_or_path $MODEL \
69 | --data_path $DATA \
70 | --bf16 True \
71 | --output_dir output_qwen \
72 | --num_train_epochs 5 \
73 | --per_device_train_batch_size 1 \
74 | --per_device_eval_batch_size 1 \
75 | --gradient_accumulation_steps 16 \
76 | --evaluation_strategy "no" \
77 | --save_strategy "steps" \
78 | --save_steps 1000 \
79 | --save_total_limit 10 \
80 | --learning_rate 1e-5 \
81 | --weight_decay 0.1 \
82 | --adam_beta2 0.95 \
83 | --warmup_ratio 0.01 \
84 | --lr_scheduler_type "cosine" \
85 | --logging_steps 1 \
86 | --report_to "none" \
87 | --model_max_length 512 \
88 | --gradient_checkpointing True \
89 | --lazy_preprocess True \
90 | --deepspeed finetune/ds_config_zero3.json
91 |
--------------------------------------------------------------------------------
/finetune/finetune_lora_ds.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export CUDA_DEVICE_MAX_CONNECTIONS=1
3 | DIR=`pwd`
4 |
5 | # Guide:
6 | # This script supports distributed training on multi-gpu workers (as well as single-worker training).
7 | # Please set the options below according to the comments.
8 | # For multi-gpu workers training, these options should be manually set for each worker.
9 | # After setting the options, please run the script on each worker.
10 |
11 | # Number of GPUs per GPU worker
12 | GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())')
13 |
14 | # Number of GPU workers, for single-worker training, please set to 1
15 | NNODES=${NNODES:-1}
16 |
17 | # The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0
18 | NODE_RANK=${NODE_RANK:-0}
19 |
20 | # The ip address of the rank-0 worker, for single-worker training, please set to localhost
21 | MASTER_ADDR=${MASTER_ADDR:-localhost}
22 |
23 | # The port for communication
24 | MASTER_PORT=${MASTER_PORT:-6001}
25 |
26 | MODEL="Qwen/Qwen-7B" # Set the path if you do not want to load from huggingface directly
27 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
28 | # See the section for finetuning in README for more information.
29 | DATA="path_to_data"
30 | DS_CONFIG_PATH="finetune/ds_config_zero2.json"
31 |
32 | function usage() {
33 | echo '
34 | Usage: bash finetune/finetune_lora_ds.sh [-m MODEL_PATH] [-d DATA_PATH] [--deepspeed DS_CONFIG_PATH]
35 | '
36 | }
37 |
38 | while [[ "$1" != "" ]]; do
39 | case $1 in
40 | -m | --model )
41 | shift
42 | MODEL=$1
43 | ;;
44 | -d | --data )
45 | shift
46 | DATA=$1
47 | ;;
48 | --deepspeed )
49 | shift
50 | DS_CONFIG_PATH=$1
51 | ;;
52 | -h | --help )
53 | usage
54 | exit 0
55 | ;;
56 | * )
57 | echo "Unknown argument ${1}"
58 | exit 1
59 | ;;
60 | esac
61 | shift
62 | done
63 |
64 | DISTRIBUTED_ARGS="
65 | --nproc_per_node $GPUS_PER_NODE \
66 | --nnodes $NNODES \
67 | --node_rank $NODE_RANK \
68 | --master_addr $MASTER_ADDR \
69 | --master_port $MASTER_PORT
70 | "
71 |
72 | torchrun $DISTRIBUTED_ARGS finetune.py \
73 | --model_name_or_path $MODEL \
74 | --data_path $DATA \
75 | --bf16 True \
76 | --output_dir output_qwen \
77 | --num_train_epochs 5 \
78 | --per_device_train_batch_size 2 \
79 | --per_device_eval_batch_size 1 \
80 | --gradient_accumulation_steps 8 \
81 | --evaluation_strategy "no" \
82 | --save_strategy "steps" \
83 | --save_steps 1000 \
84 | --save_total_limit 10 \
85 | --learning_rate 3e-4 \
86 | --weight_decay 0.1 \
87 | --adam_beta2 0.95 \
88 | --warmup_ratio 0.01 \
89 | --lr_scheduler_type "cosine" \
90 | --logging_steps 1 \
91 | --report_to "none" \
92 | --model_max_length 512 \
93 | --lazy_preprocess True \
94 | --use_lora \
95 | --gradient_checkpointing \
96 | --deepspeed ${DS_CONFIG_PATH}
97 |
--------------------------------------------------------------------------------
/finetune/finetune_lora_single_gpu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export CUDA_DEVICE_MAX_CONNECTIONS=1
3 |
4 | MODEL="Qwen/Qwen-7B" # Set the path if you do not want to load from huggingface directly
5 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
6 | # See the section for finetuning in README for more information.
7 | DATA="path_to_data"
8 |
9 | function usage() {
10 | echo '
11 | Usage: bash finetune/finetune_lora_single_gpu.sh [-m MODEL_PATH] [-d DATA_PATH]
12 | '
13 | }
14 |
15 | while [[ "$1" != "" ]]; do
16 | case $1 in
17 | -m | --model )
18 | shift
19 | MODEL=$1
20 | ;;
21 | -d | --data )
22 | shift
23 | DATA=$1
24 | ;;
25 | -h | --help )
26 | usage
27 | exit 0
28 | ;;
29 | * )
30 | echo "Unknown argument ${1}"
31 | exit 1
32 | ;;
33 | esac
34 | shift
35 | done
36 |
37 | export CUDA_VISIBLE_DEVICES=0
38 |
39 | python finetune.py \
40 | --model_name_or_path $MODEL \
41 | --data_path $DATA \
42 | --bf16 True \
43 | --output_dir output_qwen \
44 | --num_train_epochs 5 \
45 | --per_device_train_batch_size 2 \
46 | --per_device_eval_batch_size 1 \
47 | --gradient_accumulation_steps 8 \
48 | --evaluation_strategy "no" \
49 | --save_strategy "steps" \
50 | --save_steps 1000 \
51 | --save_total_limit 10 \
52 | --learning_rate 3e-4 \
53 | --weight_decay 0.1 \
54 | --adam_beta2 0.95 \
55 | --warmup_ratio 0.01 \
56 | --lr_scheduler_type "cosine" \
57 | --logging_steps 1 \
58 | --report_to "none" \
59 | --model_max_length 512 \
60 | --lazy_preprocess True \
61 | --gradient_checkpointing \
62 | --use_lora
63 |
64 | # If you use fp16 instead of bf16, you should use deepspeed
65 | # --fp16 True --deepspeed finetune/ds_config_zero2.json
66 |
--------------------------------------------------------------------------------
/finetune/finetune_qlora_ds.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export CUDA_DEVICE_MAX_CONNECTIONS=1
3 | DIR=`pwd`
4 |
5 | # Guide:
6 | # This script supports distributed training on multi-gpu workers (as well as single-worker training).
7 | # Please set the options below according to the comments.
8 | # For multi-gpu workers training, these options should be manually set for each worker.
9 | # After setting the options, please run the script on each worker.
10 |
11 | # Number of GPUs per GPU worker
12 | GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())')
13 |
14 | # Number of GPU workers, for single-worker training, please set to 1
15 | NNODES=${NNODES:-1}
16 |
17 | # The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0
18 | NODE_RANK=${NODE_RANK:-0}
19 |
20 | # The ip address of the rank-0 worker, for single-worker training, please set to localhost
21 | MASTER_ADDR=${MASTER_ADDR:-localhost}
22 |
23 | # The port for communication
24 | MASTER_PORT=${MASTER_PORT:-6001}
25 |
26 | MODEL="Qwen/Qwen-7B-Chat-Int4" # Set the path if you do not want to load from huggingface directly
27 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
28 | # See the section for finetuning in README for more information.
29 | DATA="path_to_data"
30 |
31 | function usage() {
32 | echo '
33 | Usage: bash finetune/finetune_qlora_ds.sh [-m MODEL_PATH] [-d DATA_PATH]
34 | '
35 | }
36 |
37 | while [[ "$1" != "" ]]; do
38 | case $1 in
39 | -m | --model )
40 | shift
41 | MODEL=$1
42 | ;;
43 | -d | --data )
44 | shift
45 | DATA=$1
46 | ;;
47 | -h | --help )
48 | usage
49 | exit 0
50 | ;;
51 | * )
52 | echo "Unknown argument ${1}"
53 | exit 1
54 | ;;
55 | esac
56 | shift
57 | done
58 |
59 | DISTRIBUTED_ARGS="
60 | --nproc_per_node $GPUS_PER_NODE \
61 | --nnodes $NNODES \
62 | --node_rank $NODE_RANK \
63 | --master_addr $MASTER_ADDR \
64 | --master_port $MASTER_PORT
65 | "
66 |
67 | # Remember to use --fp16 instead of --bf16 due to autogptq
68 | torchrun $DISTRIBUTED_ARGS finetune.py \
69 | --model_name_or_path $MODEL \
70 | --data_path $DATA \
71 | --fp16 True \
72 | --output_dir output_qwen \
73 | --num_train_epochs 5 \
74 | --per_device_train_batch_size 2 \
75 | --per_device_eval_batch_size 1 \
76 | --gradient_accumulation_steps 8 \
77 | --evaluation_strategy "no" \
78 | --save_strategy "steps" \
79 | --save_steps 1000 \
80 | --save_total_limit 10 \
81 | --learning_rate 3e-4 \
82 | --weight_decay 0.1 \
83 | --adam_beta2 0.95 \
84 | --warmup_ratio 0.01 \
85 | --lr_scheduler_type "cosine" \
86 | --logging_steps 1 \
87 | --report_to "none" \
88 | --model_max_length 512 \
89 | --lazy_preprocess True \
90 | --use_lora \
91 | --q_lora \
92 | --gradient_checkpointing \
93 | --deepspeed finetune/ds_config_zero2.json
94 |
--------------------------------------------------------------------------------
/finetune/finetune_qlora_single_gpu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export CUDA_DEVICE_MAX_CONNECTIONS=1
3 | DIR=`pwd`
4 |
5 | MODEL="Qwen/Qwen-7B-Chat-Int4" # Set the path if you do not want to load from huggingface directly
6 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
7 | # See the section for finetuning in README for more information.
8 | DATA="path_to_data"
9 |
10 | function usage() {
11 | echo '
12 | Usage: bash finetune/finetune_qlora_single_gpu.sh [-m MODEL_PATH] [-d DATA_PATH]
13 | '
14 | }
15 |
16 | while [[ "$1" != "" ]]; do
17 | case $1 in
18 | -m | --model )
19 | shift
20 | MODEL=$1
21 | ;;
22 | -d | --data )
23 | shift
24 | DATA=$1
25 | ;;
26 | -h | --help )
27 | usage
28 | exit 0
29 | ;;
30 | * )
31 | echo "Unknown argument ${1}"
32 | exit 1
33 | ;;
34 | esac
35 | shift
36 | done
37 |
38 | export CUDA_VISIBLE_DEVICES=0
39 |
40 | # Remember to use --fp16 instead of --bf16 due to autogptq
41 | python finetune.py \
42 | --model_name_or_path $MODEL \
43 | --data_path $DATA \
44 | --fp16 True \
45 | --output_dir output_qwen \
46 | --num_train_epochs 5 \
47 | --per_device_train_batch_size 2 \
48 | --per_device_eval_batch_size 1 \
49 | --gradient_accumulation_steps 8 \
50 | --evaluation_strategy "no" \
51 | --save_strategy "steps" \
52 | --save_steps 1000 \
53 | --save_total_limit 10 \
54 | --learning_rate 3e-4 \
55 | --weight_decay 0.1 \
56 | --adam_beta2 0.95 \
57 | --warmup_ratio 0.01 \
58 | --lr_scheduler_type "cosine" \
59 | --logging_steps 1 \
60 | --report_to "none" \
61 | --model_max_length 512 \
62 | --lazy_preprocess True \
63 | --gradient_checkpointing \
64 | --use_lora \
65 | --q_lora \
66 | --deepspeed finetune/ds_config_zero2.json
67 |
--------------------------------------------------------------------------------
/recipes/applications/chatbot/qwen_chatbot.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "54d5d255-aa98-4655-8dd1-bc726430d86a",
6 | "metadata": {},
7 | "source": [
8 | "# Qwen-7B-Chat Chat Demo"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "31e04af4-eb27-4802-a7b2-6ea0525f1dc8",
14 | "metadata": {},
15 | "source": [
16 |     "This notebook uses Qwen-7B-Chat as an example to show how to build a web-based conversational assistant using Gradio."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "75e51155-9f8e-40dc-8432-60f4567d93a8",
22 | "metadata": {},
23 | "source": [
24 | "## Preparation"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "id": "ff6f061c-a033-49f2-8f7d-af3f23ac9125",
30 | "metadata": {},
31 | "source": [
32 | "Download Qwen-7B-Chat\n",
33 | "\n",
34 |     "First, we need to download the model. You can use the snapshot_download function provided by ModelScope to download the model to a specified directory."
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "id": "c469a129-451f-4d01-8bc0-e2cf70a262c8",
41 | "metadata": {
42 | "tags": []
43 | },
44 | "outputs": [],
45 | "source": [
46 | "!pip install modelscope"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "id": "69af626e-22b8-49ad-8869-8354f4c72bcc",
53 | "metadata": {
54 | "tags": []
55 | },
56 | "outputs": [],
57 | "source": [
58 | "from modelscope.hub.snapshot_download import snapshot_download\n",
59 | "snapshot_download(\"qwen/Qwen-7B-Chat\",cache_dir='/tmp/models') "
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "id": "01d2ff34-4053-4710-a289-e354673be1ca",
65 | "metadata": {},
66 | "source": [
67 | "## Install Dependencies"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "id": "48b51791-4bbc-4d12-9cd6-587c24c8bea7",
74 | "metadata": {
75 | "tags": []
76 | },
77 | "outputs": [],
78 | "source": [
79 | "!pip install -r ../../../requirements.txt\n",
80 | "!pip install gradio==3.37.0 mdtex2html"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "id": "7732037a-246a-4953-af07-dae7a3ae5937",
86 | "metadata": {},
87 | "source": [
88 | "## Run the web UI code to start the Qwen chatbot\n",
89 | "\n",
90 |     "Users can run the web_demo.py file to have real-time conversations with Qwen-7B-Chat in the browser."
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "id": "9e256f0a-d96d-4fd7-b305-fe43c6959dc8",
97 | "metadata": {
98 | "ExecutionIndicator": {
99 | "show": true
100 | },
101 | "tags": []
102 | },
103 | "outputs": [],
104 | "source": [
105 | "!python ../../../web_demo.py -c /tmp/models/qwen/Qwen-7B-Chat"
106 | ]
107 | }
108 | ],
109 | "metadata": {
110 | "kernelspec": {
111 | "display_name": "Python 3 (ipykernel)",
112 | "language": "python",
113 | "name": "python3"
114 | },
115 | "language_info": {
116 | "codemirror_mode": {
117 | "name": "ipython",
118 | "version": 3
119 | },
120 | "file_extension": ".py",
121 | "mimetype": "text/x-python",
122 | "name": "python",
123 | "nbconvert_exporter": "python",
124 | "pygments_lexer": "ipython3",
125 | "version": "3.10.13"
126 | }
127 | },
128 | "nbformat": 4,
129 | "nbformat_minor": 5
130 | }
131 |
--------------------------------------------------------------------------------
/recipes/finetune/ascend/README.md:
--------------------------------------------------------------------------------
1 | # Fine-tuning Qwen on Ascend NPUs
2 | Below, we provide a simple example to show how to fine-tune Qwen on Ascend NPUs. Currently, fine-tuning and inference are supported for the Qwen 7B and 14B models. You can also refer to the official [mindformers](https://gitee.com/mindspore/mindformers/blob/dev/research/qwen/qwen.md) documentation for detailed usage.
3 |
4 | ## Environment Requirement
5 |
6 | - Hardware: Ascend 910A/B
7 |
8 | ## Quickstart
9 |
10 | 1. Launch Docker Image
11 |
12 | ```bash
13 | ImageID=pai-image-manage-registry.cn-wulanchabu.cr.aliyuncs.com/pai/llm-inference:qwen_v23.0.rc3
14 | docker run -it -u root --ipc=host \
15 | --device=/dev/davinci0 \
16 | --device=/dev/davinci1 \
17 | --device=/dev/davinci2 \
18 | --device=/dev/davinci3 \
19 | --device=/dev/davinci4 \
20 | --device=/dev/davinci5 \
21 | --device=/dev/davinci6 \
22 | --device=/dev/davinci7 \
23 | --device=/dev/davinci_manager \
24 | --device=/dev/devmm_svm \
25 | --device=/dev/hisi_hdc \
26 | -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
27 | -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
28 | -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
29 | -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
30 | -v /etc/ascend_install.info:/etc/ascend_install.info \
31 | -v /var/log/npu/:/usr/slog \
32 | -v /etc/hccn.conf:/etc/hccn.conf \
33 | ${ImageID} /bin/bash
34 | ```
35 |
36 | 2. Download and Convert model
37 |
38 | - download model by modelscope
39 |
40 | ```bash
41 | cd mindformers
42 | python3 -c "from modelscope.hub.snapshot_download import snapshot_download; snapshot_download('Qwen/Qwen-7B-Chat', cache_dir='.', revision='master')"
43 | ```
44 |
45 | - convert hf model weights to ckpt weights
46 |
47 | ```bash
48 | python research/qwen/convert_weight.py \
49 | --torch_ckpt_dir Qwen/Qwen-7B-Chat \
50 | --mindspore_ckpt_path qwen-7b-chat.ckpt
51 |
52 | mkdir -vp load_checkpoint/rank_0
53 | mv qwen-7b-chat.ckpt load_checkpoint/rank_0/
54 | ```
55 |
56 | 3. Prepare training data
57 |
58 | - download demo data
59 |
60 | ```bash
61 | wget -c https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/alpaca_data_min.json
62 | ```
63 |
64 | - Converts the raw data to the specified format
65 |
66 | ```bash
67 | python research/qwen/alpaca_converter.py \
68 | --data_path alpaca_data_min.json \
69 | --output_path alpaca-data-conversation_min.json
70 | ```
71 |
72 | - Generate Mindrecord data
73 |
74 | ```bash
75 | python research/qwen/qwen_preprocess.py \
76 | --input_glob alpaca-data-conversation_min.json \
77 | --model_file Qwen/Qwen-7B-Chat/qwen.tiktoken \
78 | --seq_length 1024 \
79 | --output_file alpaca_min.mindrecord
80 | ```
81 |
82 | 4. Prepare RANK_TABLE_FILE
83 |
84 | ```bash
85 | # generate RANK_TABLE_FILE with 8 npu
86 | python mindformers/tools/hccl_tools.py --device_num "[0,8)"
87 | ```
88 |
89 | 5. Fine-tune
90 |
91 | You need to replace RANK_TABLE_FILE with the file generated in step 4.
92 |
93 | ```bash
94 | export MS_ASCEND_CHECK_OVERFLOW_MODE=INFNAN_MODE
95 | bash research/run_singlenode.sh "python3 research/qwen/run_qwen.py \
96 | --config research/qwen/run_qwen_7b.yaml \
97 | --load_checkpoint /mindformers/research/qwen/load_checkpoint \
98 | --vocab_file Qwen/Qwen-7B-Chat/qwen.tiktoken \
99 | --use_parallel True \
100 | --run_mode finetune \
101 | --auto_trans_ckpt True \
102 | --train_data alpaca_min.mindrecord" \
103 | RANK_TABLE_FILE [0,8] 8
104 | ```
105 |
106 | 6. Merge model weights
107 |
108 | - Rename model weights
109 |
110 | ```bash
111 | cd output/checkpoint_network
112 | mv rank_0/qwen_rank_0-network.ckpt rank_0/checkpoint_0.ckpt
113 | mv rank_1/qwen_rank_1-network.ckpt rank_1/checkpoint_1.ckpt
114 | mv rank_2/qwen_rank_2-network.ckpt rank_2/checkpoint_2.ckpt
115 | mv rank_3/qwen_rank_3-network.ckpt rank_3/checkpoint_3.ckpt
116 | mv rank_4/qwen_rank_4-network.ckpt rank_4/checkpoint_4.ckpt
117 | mv rank_5/qwen_rank_5-network.ckpt rank_5/checkpoint_5.ckpt
118 | mv rank_6/qwen_rank_6-network.ckpt rank_6/checkpoint_6.ckpt
119 | mv rank_7/qwen_rank_7-network.ckpt rank_7/checkpoint_7.ckpt
120 | cd ../..
121 | ```
122 |
123 | - Merge model weights
124 |
125 | ```bash
126 | python mindformers/tools/transform_ckpt.py \
127 | --src_ckpt_strategy output/strategy \
128 | --src_ckpt_dir output/checkpoint_network \
129 | --dst_ckpt_dir output/merged_model
130 | ```
131 |
132 | 7. Inference fine-tuned model
133 |
134 | ```bash
135 | python research/qwen/run_qwen.py \
136 | --config research/qwen/run_qwen_7b.yaml \
137 | --predict_data '比较适合深度学习入门的书籍有' \
138 | --run_mode predict \
139 | --load_checkpoint output/merged_model/rank_0/checkpoint_0.ckpt \
140 | --vocab_file Qwen/Qwen-7B-Chat/qwen.tiktoken \
141 | --auto_trans_ckpt False \
142 | --device_id 0
143 | ```
--------------------------------------------------------------------------------
/recipes/finetune/deepspeed/finetune_fullparameter_multi_gpu.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "6e6981ab-2d9a-4280-923f-235a166855ba",
6 | "metadata": {},
7 | "source": [
8 | "# Fine-Tuning Qwen-Chat Large Language Model (Multiple GPUs)\n",
9 | "\n",
10 |     "Tongyi Qianwen is a large language model developed by Alibaba Cloud based on the Transformer architecture and trained on an extensive set of pre-training data. The pre-training data is diverse and covers a wide range of sources, including a large amount of internet text, specialized books, code, etc. In addition, an AI assistant called Qwen-Chat has been built on top of the pre-trained model using alignment techniques.\n",
11 | "\n",
12 |     "This notebook uses Qwen-1.8B-Chat as an example to show how to fine-tune the Qwen model using DeepSpeed.\n",
13 | "\n",
14 | "## Environment Requirements\n",
15 | "\n",
16 | "Please refer to **requirements.txt** to install the required dependencies.\n",
17 | "\n",
18 | "## Preparation\n",
19 | "\n",
20 | "### Download Qwen-1.8B-Chat\n",
21 | "\n",
22 | "First, download the model files. You can choose to download directly from ModelScope."
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "id": "248488f9-4a86-4f35-9d56-50f8e91a8f11",
29 | "metadata": {
30 | "ExecutionIndicator": {
31 | "show": true
32 | },
33 | "tags": []
34 | },
35 | "outputs": [],
36 | "source": [
37 | "from modelscope.hub.snapshot_download import snapshot_download\n",
38 | "model_dir = snapshot_download('Qwen/Qwen-1_8B-Chat', cache_dir='.', revision='master')"
39 | ]
40 | },
41 | {
42 | "attachments": {},
43 | "cell_type": "markdown",
44 | "id": "7b2a92b1-f08e-4413-9f92-8f23761e6e1f",
45 | "metadata": {},
46 | "source": [
47 | "### Download Example Training Data\n",
48 | "\n",
49 | "Download the data required for training; here, we provide a tiny dataset as an example. It is sampled from [Belle](https://github.com/LianjiaTech/BELLE).\n",
50 | "\n",
51 |     "Disclaimer: this dataset may be used for research purposes only."
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "id": "ce195f08-fbb2-470e-b6c0-9a03457458c7",
58 | "metadata": {
59 | "tags": []
60 | },
61 | "outputs": [],
62 | "source": [
63 | "!wget https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com/release/tutorials/qwen_recipes/Belle_sampled_qwen.json"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "id": "7226bed0-171b-4d45-a3f9-b3d81ec2bb9f",
69 | "metadata": {},
70 | "source": [
71 | "You can also refer to this format to prepare the dataset. Below is a simple example list with 1 sample:\n",
72 | "\n",
73 | "```json\n",
74 | "[\n",
75 | " {\n",
76 | " \"id\": \"identity_0\",\n",
77 | " \"conversations\": [\n",
78 | " {\n",
79 | " \"from\": \"user\",\n",
80 | " \"value\": \"你好\"\n",
81 | " },\n",
82 | " {\n",
83 | " \"from\": \"assistant\",\n",
84 | " \"value\": \"我是一个语言模型,我叫通义千问。\"\n",
85 | " }\n",
86 | " ]\n",
87 | " }\n",
88 | "]\n",
89 | "```\n",
90 | "\n",
91 | "You can also use multi-turn conversations as the training set. Here is a simple example:\n",
92 | "\n",
93 | "```json\n",
94 | "[\n",
95 | " {\n",
96 | " \"id\": \"identity_0\",\n",
97 | " \"conversations\": [\n",
98 | " {\n",
99 | " \"from\": \"user\",\n",
100 | " \"value\": \"你好,能告诉我遛狗的最佳时间吗?\"\n",
101 | " },\n",
102 | " {\n",
103 | " \"from\": \"assistant\",\n",
104 | " \"value\": \"当地最佳遛狗时间因地域差异而异,请问您所在的城市是哪里?\"\n",
105 | " },\n",
106 | " {\n",
107 | " \"from\": \"user\",\n",
108 | " \"value\": \"我在纽约市。\"\n",
109 | " },\n",
110 | " {\n",
111 | " \"from\": \"assistant\",\n",
112 | " \"value\": \"纽约市的遛狗最佳时间通常在早晨6点至8点和晚上8点至10点之间,因为这些时间段气温较低,遛狗更加舒适。但具体时间还需根据气候、气温和季节变化而定。\"\n",
113 | " }\n",
114 | " ]\n",
115 | " }\n",
116 | "]\n",
117 | "```\n",
118 | "\n",
119 | "## Fine-Tune the Model\n",
120 | "\n",
121 |     "You can directly run the prepared training script to fine-tune the model. **nproc_per_node** refers to the number of GPUs used for training."
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "id": "7ab0581e-be85-45e6-a5b7-af9c42ea697b",
128 | "metadata": {
129 | "ExecutionIndicator": {
130 | "show": true
131 | },
132 | "tags": []
133 | },
134 | "outputs": [],
135 | "source": [
136 |     "!torchrun --nproc_per_node 2 --nnodes 1 --node_rank 0 --master_addr localhost --master_port 6601 ../../../finetune.py \\\n",
137 | " --model_name_or_path \"Qwen/Qwen-1_8B-Chat/\" \\\n",
138 | " --data_path \"Belle_sampled_qwen.json\" \\\n",
139 | " --bf16 True \\\n",
140 | " --output_dir \"output_qwen\" \\\n",
141 | " --num_train_epochs 5 \\\n",
142 | " --per_device_train_batch_size 1 \\\n",
143 | " --per_device_eval_batch_size 1 \\\n",
144 | " --gradient_accumulation_steps 16 \\\n",
145 | " --evaluation_strategy \"no\" \\\n",
146 | " --save_strategy \"steps\" \\\n",
147 | " --save_steps 1000 \\\n",
148 | " --save_total_limit 10 \\\n",
149 | " --learning_rate 1e-5 \\\n",
150 | " --weight_decay 0.1 \\\n",
151 | " --adam_beta2 0.95 \\\n",
152 | " --warmup_ratio 0.01 \\\n",
153 | " --lr_scheduler_type \"cosine\" \\\n",
154 | " --logging_steps 1 \\\n",
155 | " --report_to \"none\" \\\n",
156 | " --model_max_length 512 \\\n",
157 | " --gradient_checkpointing True \\\n",
158 | " --lazy_preprocess True \\\n",
159 |     "    --deepspeed \"../../../finetune/ds_config_zero2.json\""
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "## Test the Model\n",
167 | "\n",
168 | "We can test the model as follows:"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": [
177 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
178 | "from transformers.generation import GenerationConfig\n",
179 | "\n",
180 | "tokenizer = AutoTokenizer.from_pretrained(\"output_qwen\", trust_remote_code=True)\n",
181 | "model = AutoModelForCausalLM.from_pretrained(\n",
182 | " \"output_qwen\",\n",
183 | " device_map=\"auto\",\n",
184 | " trust_remote_code=True\n",
185 | ").eval()\n",
186 | "\n",
187 | "response, history = model.chat(tokenizer, \"你好\", history=None)\n",
188 | "print(response)"
189 | ]
190 | }
191 | ],
192 | "metadata": {
193 | "kernelspec": {
194 | "display_name": "Python 3 (ipykernel)",
195 | "language": "python",
196 | "name": "python3"
197 | },
198 | "language_info": {
199 | "codemirror_mode": {
200 | "name": "ipython",
201 | "version": 3
202 | },
203 | "file_extension": ".py",
204 | "mimetype": "text/x-python",
205 | "name": "python",
206 | "nbconvert_exporter": "python",
207 | "pygments_lexer": "ipython3",
208 | "version": "3.10.13"
209 | }
210 | },
211 | "nbformat": 4,
212 | "nbformat_minor": 5
213 | }
214 |
--------------------------------------------------------------------------------
/recipes/finetune/deepspeed/finetune_fullparameter_single_gpu.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "6e6981ab-2d9a-4280-923f-235a166855ba",
6 | "metadata": {},
7 | "source": [
8 | "# Fine-Tuning Qwen-Chat Large Language Model (Single GPU)\n",
9 | "\n",
10 |     "Tongyi Qianwen is a large language model developed by Alibaba Cloud based on the Transformer architecture and trained on an extensive set of pre-training data. The pre-training data is diverse and covers a wide range of sources, including a large amount of internet text, specialized books, code, etc. In addition, an AI assistant called Qwen-Chat has been built on top of the pre-trained model using alignment techniques.\n",
11 | "\n",
12 |     "This notebook uses Qwen-1.8B-Chat as an example to show how to fine-tune the Qwen model using DeepSpeed.\n",
13 | "\n",
14 | "## Environment Requirements\n",
15 | "\n",
16 | "Please refer to **requirements.txt** to install the required dependencies.\n",
17 | "\n",
18 | "## Preparation\n",
19 | "\n",
20 | "### Download Qwen-1.8B-Chat\n",
21 | "\n",
22 | "First, download the model files. You can choose to download directly from ModelScope."
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "id": "248488f9-4a86-4f35-9d56-50f8e91a8f11",
29 | "metadata": {
30 | "ExecutionIndicator": {
31 | "show": true
32 | },
33 | "execution": {
34 | "iopub.execute_input": "2023-12-31T03:19:11.059814Z",
35 | "iopub.status.busy": "2023-12-31T03:19:11.059177Z",
36 | "iopub.status.idle": "2023-12-31T03:21:54.157827Z",
37 | "shell.execute_reply": "2023-12-31T03:21:54.157333Z",
38 | "shell.execute_reply.started": "2023-12-31T03:19:11.059783Z"
39 | },
40 | "tags": []
41 | },
42 | "outputs": [],
43 | "source": [
44 | "from modelscope.hub.snapshot_download import snapshot_download\n",
45 | "model_dir = snapshot_download('Qwen/Qwen-1_8B-Chat', cache_dir='.', revision='master')"
46 | ]
47 | },
48 | {
49 | "attachments": {},
50 | "cell_type": "markdown",
51 | "id": "7b2a92b1-f08e-4413-9f92-8f23761e6e1f",
52 | "metadata": {},
53 | "source": [
54 | "### Download Example Training Data\n",
55 | "\n",
56 | "Download the data required for training; here, we provide a tiny dataset as an example. It is sampled from [Belle](https://github.com/LianjiaTech/BELLE).\n",
57 | "\n",
58 |     "Disclaimer: this dataset may be used for research purposes only."
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "id": "ce195f08-fbb2-470e-b6c0-9a03457458c7",
65 | "metadata": {
66 | "execution": {
67 | "iopub.execute_input": "2023-12-31T03:21:57.596577Z",
68 | "iopub.status.busy": "2023-12-31T03:21:57.595847Z",
69 | "iopub.status.idle": "2023-12-31T03:21:57.971112Z",
70 | "shell.execute_reply": "2023-12-31T03:21:57.970576Z",
71 | "shell.execute_reply.started": "2023-12-31T03:21:57.596555Z"
72 | },
73 | "tags": []
74 | },
75 | "outputs": [],
76 | "source": [
77 | "!wget https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com/release/tutorials/qwen_recipes/Belle_sampled_qwen.json"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "id": "7226bed0-171b-4d45-a3f9-b3d81ec2bb9f",
83 | "metadata": {},
84 | "source": [
85 | "You can also refer to this format to prepare the dataset. Below is a simple example list with 1 sample:\n",
86 | "\n",
87 | "```json\n",
88 | "[\n",
89 | " {\n",
90 | " \"id\": \"identity_0\",\n",
91 | " \"conversations\": [\n",
92 | " {\n",
93 | " \"from\": \"user\",\n",
94 | " \"value\": \"你好\"\n",
95 | " },\n",
96 | " {\n",
97 | " \"from\": \"assistant\",\n",
98 | " \"value\": \"我是一个语言模型,我叫通义千问。\"\n",
99 | " }\n",
100 | " ]\n",
101 | " }\n",
102 | "]\n",
103 | "```\n",
104 | "\n",
105 | "You can also use multi-turn conversations as the training set. Here is a simple example:\n",
106 | "\n",
107 | "```json\n",
108 | "[\n",
109 | " {\n",
110 | " \"id\": \"identity_0\",\n",
111 | " \"conversations\": [\n",
112 | " {\n",
113 | " \"from\": \"user\",\n",
114 | " \"value\": \"你好,能告诉我遛狗的最佳时间吗?\"\n",
115 | " },\n",
116 | " {\n",
117 | " \"from\": \"assistant\",\n",
118 | " \"value\": \"当地最佳遛狗时间因地域差异而异,请问您所在的城市是哪里?\"\n",
119 | " },\n",
120 | " {\n",
121 | " \"from\": \"user\",\n",
122 | " \"value\": \"我在纽约市。\"\n",
123 | " },\n",
124 | " {\n",
125 | " \"from\": \"assistant\",\n",
126 | " \"value\": \"纽约市的遛狗最佳时间通常在早晨6点至8点和晚上8点至10点之间,因为这些时间段气温较低,遛狗更加舒适。但具体时间还需根据气候、气温和季节变化而定。\"\n",
127 | " }\n",
128 | " ]\n",
129 | " }\n",
130 | "]\n",
131 | "```\n",
132 | "\n",
133 | "\n",
134 | "## Fine-Tune the Model\n",
135 | "\n",
136 | "You can directly run the prepared training script to fine-tune the model."
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "id": "7ab0581e-be85-45e6-a5b7-af9c42ea697b",
143 | "metadata": {
144 | "ExecutionIndicator": {
145 | "show": true
146 | },
147 | "execution": {
148 | "iopub.execute_input": "2023-12-31T03:23:52.455178Z",
149 | "iopub.status.busy": "2023-12-31T03:23:52.454615Z",
150 | "iopub.status.idle": "2023-12-31T03:24:15.699948Z",
151 | "shell.execute_reply": "2023-12-31T03:24:15.699358Z",
152 | "shell.execute_reply.started": "2023-12-31T03:23:52.455144Z"
153 | },
154 | "tags": []
155 | },
156 | "outputs": [],
157 | "source": [
158 |     "!python ../../../finetune.py \\\n",
159 | " --model_name_or_path \"Qwen/Qwen-1_8B-Chat/\"\\\n",
160 | " --data_path \"Belle_sampled_qwen.json\"\\\n",
161 | " --bf16 \\\n",
162 | " --output_dir \"output_qwen\" \\\n",
163 | " --num_train_epochs 5 \\\n",
164 | " --per_device_train_batch_size 1 \\\n",
165 | " --per_device_eval_batch_size 1 \\\n",
166 | " --gradient_accumulation_steps 16 \\\n",
167 | " --evaluation_strategy \"no\" \\\n",
168 | " --save_strategy \"steps\" \\\n",
169 | " --save_steps 1000 \\\n",
170 | " --save_total_limit 10 \\\n",
171 | " --learning_rate 1e-5 \\\n",
172 | " --weight_decay 0.1 \\\n",
173 | " --adam_beta2 0.95 \\\n",
174 | " --warmup_ratio 0.01 \\\n",
175 | " --lr_scheduler_type \"cosine\" \\\n",
176 | " --logging_steps 1 \\\n",
177 | " --report_to \"none\" \\\n",
178 | " --model_max_length 512 \\\n",
179 | " --gradient_checkpointing \\\n",
180 | " --lazy_preprocess"
181 | ]
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "metadata": {},
186 | "source": [
187 | "## Test the Model\n",
188 | "\n",
189 | "We can test the model as follows:"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {},
196 | "outputs": [],
197 | "source": [
198 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
199 | "from transformers.generation import GenerationConfig\n",
200 | "\n",
201 | "tokenizer = AutoTokenizer.from_pretrained(\"output_qwen\", trust_remote_code=True)\n",
202 | "model = AutoModelForCausalLM.from_pretrained(\n",
203 | " \"output_qwen\",\n",
204 | " device_map=\"auto\",\n",
205 | " trust_remote_code=True\n",
206 | ").eval()\n",
207 | "\n",
208 | "response, history = model.chat(tokenizer, \"你好\", history=None)\n",
209 | "print(response)"
210 | ]
211 | }
212 | ],
213 | "metadata": {
214 | "kernelspec": {
215 | "display_name": "Python 3 (ipykernel)",
216 | "language": "python",
217 | "name": "python3"
218 | },
219 | "language_info": {
220 | "codemirror_mode": {
221 | "name": "ipython",
222 | "version": 3
223 | },
224 | "file_extension": ".py",
225 | "mimetype": "text/x-python",
226 | "name": "python",
227 | "nbconvert_exporter": "python",
228 | "pygments_lexer": "ipython3",
229 | "version": "3.10.13"
230 | }
231 | },
232 | "nbformat": 4,
233 | "nbformat_minor": 5
234 | }
235 |
--------------------------------------------------------------------------------
/recipes/finetune/deepspeed/requirements.txt:
--------------------------------------------------------------------------------
1 | deepspeed
2 | peft
--------------------------------------------------------------------------------
/recipes/inference/dashscope/README.md:
--------------------------------------------------------------------------------
1 | # Inference Qwen Using DashScope
2 |
3 | The simplest way to use Qwen through APIs is the DashScope API service provided by Alibaba Cloud. Here we give an introduction to its usage. Additionally, we provide a script for you to deploy an OpenAI-style API on your own servers.
4 |
5 | DashScope is the large language model API service provided by Alibaba Cloud, which now supports Qwen. Note that the models behind DashScope are in-house versions temporarily without details provided. The services include `qwen-turbo` and `qwen-plus`, where the former runs faster and the latter achieves better performance. For more information, visit the documentation [here](https://dashscope.aliyun.com).
6 |
7 | Please head to the official website [link](https://help.aliyun.com/zh/dashscope/developer-reference/activate-dashscope-and-create-an-api-key?spm=a2c4g.11186623.0.0.6c2774fahtfXdn) to create a DashScope account and obtain the API key (AK). We recommend setting the AK with an environment variable:
8 | ```bash
9 | export DASHSCOPE_API_KEY="YOUR_DASHSCOPE_API_KEY"
10 | ```
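If you prefer not to rely on the environment variable, the Python SDK also accepts the key as a module-level attribute. A minimal sketch (assuming the `dashscope` package installed in the next step, with the placeholder replaced by your own key):
```python
import dashscope

# Assumption: set dashscope.api_key in code instead of exporting DASHSCOPE_API_KEY.
# Keep real keys out of source control.
dashscope.api_key = "YOUR_DASHSCOPE_API_KEY"
```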
11 | Then please install the packages and click [here](https://help.aliyun.com/zh/dashscope/developer-reference/install-dashscope-sdk) for the documentation. If you use Python, you can install DashScope with pip:
12 | ```bash
13 | pip install dashscope
14 | ```
15 | If you use JAVA SDK, you can install it in this way:
16 | ```xml
17 | <dependency>
18 |     <groupId>com.alibaba</groupId>
19 |     <artifactId>dashscope-sdk-java</artifactId>
20 |     <version>the-latest-version</version>
21 | </dependency>
22 |
23 | ```
24 | The simplest way to use DashScope is through messages, which is similar to the OpenAI API. An example is demonstrated below:
25 | ```python
26 | import random
27 | from http import HTTPStatus
28 | from dashscope import Generation
29 |
30 |
31 | def call_with_messages():
32 | messages = [{'role': 'system', 'content': 'You are a helpful assistant.'},
33 | {'role': 'user', 'content': '如何做西红柿鸡蛋?'}]
34 | gen = Generation()
35 | response = gen.call(
36 | Generation.Models.qwen_turbo,
37 | messages=messages,
38 | seed=random.randint(1, 10000), # set the random seed, optional, default to 1234 if not set
39 | result_format='message', # set the result to be "message" format.
40 | )
41 | return response
42 |
43 |
44 | if __name__ == '__main__':
45 | response = call_with_messages()
46 | if response.status_code == HTTPStatus.OK:
47 | print(response)
48 | else:
49 | print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
50 | response.request_id, response.status_code,
51 | response.code, response.message
52 | ))
53 | ```
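As a small variation on the example above (a sketch only; parameter names follow the DashScope Python SDK documentation), the same messages-style call can target `qwen-plus` by passing the model name as a plain string:
```python
from http import HTTPStatus
from dashscope import Generation

messages = [{'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'user', 'content': '如何做西红柿鸡蛋?'}]
# 'qwen-plus' trades some speed for better quality compared with 'qwen-turbo'.
response = Generation().call(model='qwen-plus',
                             messages=messages,
                             result_format='message')
if response.status_code == HTTPStatus.OK:
    print(response)
else:
    print('Request failed: %s, %s' % (response.code, response.message))
```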
54 | For more usage details, please visit the official website.
55 |
56 |
57 |
--------------------------------------------------------------------------------
/recipes/inference/quantization/README.md:
--------------------------------------------------------------------------------
1 | # Quantization
2 |
3 | ## GPTQ
4 |
5 | We provide a solution based on [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), and release the Int4 and Int8 quantized models, which achieve nearly lossless model quality while reducing memory cost and improving inference speed.
6 |
7 | Here we demonstrate how to use our provided quantized models for inference. Before you start, make sure you meet the requirements of auto-gptq (e.g., torch 2.0 and above, transformers 4.32.0 and above, etc.) and install the required packages:
8 |
9 | ```bash
10 | pip install auto-gptq optimum
11 | ```
12 |
13 | If you meet problems installing `auto-gptq`, we advise you to check out the official [repo](https://github.com/PanQiWei/AutoGPTQ) to find a wheel.
14 |
15 | > Note: The pre-compiled `auto-gptq` packages strongly depend on the version of `torch` and its CUDA version. Moreover, due to recent updates,
16 | > you may also encounter unsupported version errors from `transformers`, `optimum`, or `peft`.
17 | > We recommend using the latest versions meeting the following requirements:
18 | > - torch==2.1 auto-gptq>=0.5.1 transformers>=4.35.0 optimum>=1.14.0 peft>=0.6.1
19 | > - torch>=2.0,<2.1 auto-gptq<0.5.0 transformers<4.35.0 optimum<1.14.0 peft>=0.5.0,<0.6.0
20 |
21 | Then you can easily load the quantized model and run inference as usual:
22 |
23 | ```python
24 | from transformers import AutoModelForCausalLM, AutoTokenizer
25 | # Model names: "Qwen/Qwen-7B-Chat-Int4", "Qwen/Qwen-14B-Chat-Int4"
26 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat-Int4", trust_remote_code=True)
27 | model = AutoModelForCausalLM.from_pretrained(
28 |     "Qwen/Qwen-7B-Chat-Int4", device_map="auto", trust_remote_code=True
29 | ).eval()
30 | response, history = model.chat(tokenizer, "Hi", history=None)
31 | ```
32 |
33 | We illustrate the performance of the BF16, Int8 and Int4 models on benchmarks, and we find that the quantized models do not suffer from significant performance degradation. Results are shown below:
34 |
35 | | Quantization | MMLU | CEval (val) | GSM8K | Humaneval |
36 | |----------------------|:----:|:-----------:|:-----:|:---------:|
37 | | Qwen-1.8B-Chat (BF16)| 43.3 | 55.6 | 33.7 | 26.2 |
38 | | Qwen-1.8B-Chat (Int8)| 43.1 | 55.8 | 33.0 | 27.4 |
39 | | Qwen-1.8B-Chat (Int4)| 42.9 | 52.8 | 31.2 | 25.0 |
40 | | Qwen-7B-Chat (BF16) | 55.8 | 59.7 | 50.3 | 37.2 |
41 | | Qwen-7B-Chat (Int8) | 55.4 | 59.4 | 48.3 | 34.8 |
42 | | Qwen-7B-Chat (Int4) | 55.1 | 59.2 | 49.7 | 29.9 |
43 | | Qwen-14B-Chat (BF16) | 64.6 | 69.8 | 60.1 | 43.9 |
44 | | Qwen-14B-Chat (Int8) | 63.6 | 68.6 | 60.0 | 48.2 |
45 | | Qwen-14B-Chat (Int4) | 63.3 | 69.0 | 59.8 | 45.7 |
46 | | Qwen-72B-Chat (BF16) | 74.4 | 80.1 | 76.4 | 64.6 |
47 | | Qwen-72B-Chat (Int8) | 73.5 | 80.1 | 73.5 | 62.2 |
48 | | Qwen-72B-Chat (Int4) | 73.4 | 80.1 | 75.3 | 61.6 |
49 |
50 | ## Quantization of KV cache
51 |
52 | > NOTE: Please be aware that due to the internal mechanism of Hugging Face, the support files for this functionality
53 | > (i.e., `cache_autogptq_cuda_256.cpp` and `cache_autogptq_cuda_kernel_256.cu`) may be missing. Please manually download
54 | > them from the Hugging Face Hub and place them into the same folder as the other module files.
55 |
56 | The attention KV cache can be quantized and compressed for storage to achieve higher sample throughput. The arguments `use_cache_quantization` and `use_cache_kernel` in `config.json` are provided to enable KV cache quantization. The specific usage is as follows:
57 | ```python
58 | from transformers import AutoModelForCausalLM
59 | model = AutoModelForCausalLM.from_pretrained(
60 |     "Qwen/Qwen-7B-Chat",
61 |     device_map="auto",
62 |     trust_remote_code=True,
63 |     use_cache_quantization=True,
64 |     use_cache_kernel=True,
65 |     use_flash_attn=False)
66 | ```
67 | Attention: currently, KV cache quantization and flash attention cannot be used at the same time.
68 | If you enable both (`use_flash_attn=True`, `use_cache_quantization=True`, `use_cache_kernel=True`), `use_flash_attn` is disabled automatically (`use_flash_attn=False`).
69 |
70 | We have verified that using the quantized Int8-KV-Cache model does not lead to significant performance degradation in downstream evaluation. In the following, we focus on profiling its memory footprint under different conditions.
71 | The profiling runs on a single A100-SXM4-80G GPU with PyTorch 2.0.1 and CUDA 11.4.
72 | We use BF16 models to generate 1024 tokens by default, and "OOM" indicates an out-of-memory error.
73 |
74 | With KV cache quantization, the model can infer with a larger batch size (bs).
75 |
76 | | USE KV Cache | bs=1 | bs=4 | bs=16 | bs=32 | bs=64 | bs=100 |
77 | |--------------|:------:|:------:|:------:|:------:|:------:|:------:|
78 | | No | 16.3GB | 24.1GB | 31.7GB | 48.7GB | OOM | OOM |
79 | | Yes | 15.5GB | 17.2GB | 22.3GB | 30.2GB | 48.2GB | 72.4GB |
80 |
81 | With KV cache quantization, the model can save more memory when generating longer sequences (`sl`, the sequence length, i.e. the number of tokens generated) at inference time.
82 |
83 | | USE KV Cache | sl=512 | sl=1024 | sl=2048 | sl=4096 | sl=8192 |
84 | |--------------|:------:|:-------:|:-------:|:-------:|:-------:|
85 | | No | 15.2GB | 16.3GB | 17.6GB | 19.5GB | 23.2GB |
86 | | Yes | 15GB | 15.5GB | 15.8GB | 16.6GB | 17.6GB |
87 |
88 | The model with KV cache quantization will convert the format of `layer_past` from float to int8, and the quantized `layer_past` will also store the quantization parameters.
89 |
90 | Specific steps are as follows:
91 |
92 | 1. Quantize key/value
93 | ```
94 | qv,scale,zero_point=quantize_cache_v(v)
95 | ```
96 | 2. Store into layer_past
97 |
98 | The following is the format of quantized `layer_past`:
99 | ```
100 | layer_past=((q_key,key_scale,key_zero_point),
101 | (q_value,value_scale,value_zero_point))
102 | ```
103 |
104 | The original format of `layer_past` is shown below:
105 | ```
106 | layer_past=(key,value)
107 | ```
108 |
109 | If you want to use the quantized attention KV, you can apply the dequantization operation to convert the Int8 key/value back to the float format as follows:
110 | ```
111 | v=dequantize_cache_torch(qv,scale,zero_point)
112 | ```
113 |
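To make the scale/zero-point bookkeeping above concrete, here is a small self-contained PyTorch sketch of per-tensor int8 quantization and dequantization. It only illustrates the idea and is not the model's actual `quantize_cache_v`/`dequantize_cache_torch` kernels, which live in the model's remote code.
```python
import torch

def quantize_cache(v: torch.Tensor):
    # Map the float range of v onto int8 [-128, 127] with a scale and zero point.
    vmin, vmax = v.min(), v.max()
    scale = (vmax - vmin).clamp(min=1e-8) / 255.0
    zero_point = (-128 - vmin / scale).round()
    qv = (v / scale + zero_point).round().clamp(-128, 127).to(torch.int8)
    return qv, scale, zero_point

def dequantize_cache(qv: torch.Tensor, scale, zero_point):
    # Recover an approximation of the original float tensor.
    return (qv.to(torch.float32) - zero_point) * scale

v = torch.randn(2, 4)
qv, scale, zero_point = quantize_cache(v)
v_hat = dequantize_cache(qv, scale, zero_point)
print(torch.allclose(v, v_hat, atol=2 * scale.item()))  # True up to quantization error
```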
--------------------------------------------------------------------------------
/recipes/inference/tensorrt/README.md:
--------------------------------------------------------------------------------
1 | # Inference Qwen Using TensorRT-LLM
2 | Below, we provide a simple example to show how to run inference with Qwen using TensorRT-LLM. We recommend using GPUs with compute capability of at least SM80, such as the A10 and A800, to run this example, as we have tested it on these GPUs. You can find your GPU's compute capability at this [link](https://developer.nvidia.com/cuda-gpus).
3 |
4 | ## Installation
5 | You can use the pre-built Docker image to run this example. You can also refer to the official [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) repository for installation and detailed usage.
6 | ```bash
7 | docker run --gpus all -it --ipc=host --network=host pai-image-manage-registry.cn-wulanchabu.cr.aliyuncs.com/pai/llm-inference:tensorrt-llm-0.8.0 bash
8 | ```
9 | ## Quickstart
10 | 1. Download model by modelscope
11 |
12 | ```bash
13 | cd TensorRT-LLM/examples/qwen
14 | python3 -c "from modelscope.hub.snapshot_download import snapshot_download; snapshot_download('Qwen/Qwen-1_8B-Chat', cache_dir='.', revision='master')"
15 | mkdir -p ./tmp/Qwen
16 | mv Qwen/Qwen-1_8B-Chat ./tmp/Qwen/1_8B
17 | ```
18 |
19 | 2. Build TensorRT engine from HF checkpoint
20 |
21 | ```bash
22 | python3 build.py --hf_model_dir ./tmp/Qwen/1_8B/ \
23 | --dtype float16 \
24 | --remove_input_padding \
25 | --use_gpt_attention_plugin float16 \
26 | --enable_context_fmha \
27 | --use_gemm_plugin float16 \
28 | --output_dir ./tmp/Qwen/1_8B/trt_engines/fp16/1-gpu/
29 | ```
30 |
31 | 3. Inference
32 | ```bash
33 | python3 ../run.py --input_text "你好,请问你叫什么?" \
34 | --max_output_len=512 \
35 | --tokenizer_dir ./tmp/Qwen/1_8B/ \
36 | --engine_dir=./tmp/Qwen/1_8B/trt_engines/fp16/1-gpu
37 | ```
38 | ```
39 | Input [Text 0]: "<|im_start|>system
40 | You are a helpful assistant.<|im_end|>
41 | <|im_start|>user
42 | 你好,请问你叫什么?<|im_end|>
43 | <|im_start|>assistant
44 | "
45 | Output [Text 0 Beam 0]: "你好,我是来自阿里云的大规模语言模型,我叫通义千问。"
46 | ```
47 |
--------------------------------------------------------------------------------
/recipes/inference/tensorrt/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
2 |
3 | RUN apt-get update && \
4 | apt-get -y install python3.10 python3-pip openmpi-bin libopenmpi-dev git && \
5 | rm -rf /var/lib/apt/lists/*
6 |
7 | RUN pip install tensorrt_llm==0.8.0.dev2024011601 -U --no-cache-dir --pre --extra-index-url https://pypi.nvidia.com
8 |
9 | RUN pip install --no-cache-dir modelscope==1.11.1
10 |
11 | RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
12 | cd TensorRT-LLM && \
13 | git checkout c89653021e66ca78c55f02b366f404455bc12e8d && \
14 | pip install --no-cache-dir -r examples/qwen/requirements.txt
--------------------------------------------------------------------------------
/recipes/inference/vllm/README.md:
--------------------------------------------------------------------------------
1 | # Inference Qwen Using vLLM
2 |
3 | For deployment and fast inference, we suggest using vLLM.
4 |
5 | ## Installation
6 |
7 | If you use CUDA 12.1 and PyTorch 2.1, you can directly use the following command to install vLLM.
8 | ```bash
9 | # Install vLLM with CUDA 12.1.
10 | pip install vllm
11 | ```
12 | Otherwise, please refer to the official vLLM [Installation Instructions](https://docs.vllm.ai/en/latest/getting_started/installation.html).
13 |
14 | If you have trouble building vLLM, we recommend using our Docker image.
15 |
16 | ```bash
17 | docker run --gpus all -it --rm --ipc=host --network=host qwenllm/qwen:cu121 bash
18 | ```
19 |
20 | ## GPU Requirements
21 |
22 | Qwen models use bfloat16 by default, but bfloat16 is only supported on GPUs with compute capability of at least 8.0. For GPUs with compute capability less than 8.0, it is recommended to set the dtype to float16. You can find your GPU's compute capability at this [link](https://developer.nvidia.com/cuda-gpus).
23 |
24 | We have tested GPU memory usage on an NVIDIA Tesla V100 32GB by manually adjusting gpu-memory-utilization in eager mode. You can refer to the following table to determine whether your machine is capable of running these models.
25 | | Model | seq_len 2048 | seq_len 8192 | seq_len 16384 | seq_len 32768 |
26 | | :--- | ---: | ---: | ---: | ---: |
27 | | Qwen-1.8B | 6.22G | 7.46G | | |
28 | | Qwen-7B | 17.94G | 20.96G | | |
29 | | Qwen-7B-Int4 | 9.10G | 12.26G | | |
30 | | Qwen-14B | 33.40G | | | |
31 | | Qwen-14B-Int4 | 13.30G | | | |
32 | | Qwen-72B | 166.87G | 185.50G | 210.80G | 253.80G |
33 | | Qwen-72B-int4 | 55.37G | 73.66G | 97.79G | 158.80G |
34 |
35 | We have also listed, in the following table, the models that can run on consumer graphics cards at their default sequence length. If the GPU memory only exceeds the model's memory usage by a small margin, you can still run the model on your machine by reducing the max-model-len parameter; see the sketch after the table for how these knobs combine.
36 | (Note: to run Qwen-14B-Int4 on an NVIDIA RTX 3080 Ti, you need to set gpu-memory-utilization to 0.99 and enforce eager mode.)
37 |
38 | | GPU Memory | GPU | Support Model |
39 | | :---: | :---: | :---: |
40 | | 24GB | NVIDIA RTX 4090/3090/A5000 | Qwen-1.8B/Qwen-7B/Qwen-7B-Int4/Qwen-14B-Int4 |
41 | | 16GB | NVIDIA RTX A4000 | Qwen-1.8B/Qwen-7B-Int4/Qwen-14B-Int4 |
42 | | 12GB | NVIDIA RTX 3080Ti/TITAN Xp | Qwen-1.8B/Qwen-14B-Int4 |
43 | | 11GB | NVIDIA RTX 2080Ti/GTX 1080Ti | Qwen-1.8B |
44 | | 10GB | NVIDIA RTX 3080 | Qwen-1.8B |
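As a minimal sketch of how the knobs above fit together (an illustration under assumptions, not a tested configuration: a single pre-Ampere GPU, a locally cached Qwen-14B-Chat-Int4 checkpoint, and vLLM's offline `LLM` API rather than the servers described below):
```python
from vllm import LLM, SamplingParams

# float16 because the GPU's compute capability is below 8.0; max_model_len is shrunk
# and gpu_memory_utilization raised to squeeze the model into limited VRAM.
llm = LLM(
    model="Qwen/Qwen-14B-Chat-Int4",
    trust_remote_code=True,
    quantization="gptq",        # Int4 GPTQ checkpoint
    dtype="float16",
    max_model_len=4096,
    gpu_memory_utilization=0.99,
    enforce_eager=True,
)
outputs = llm.generate(["你好"], SamplingParams(max_tokens=64, stop=["<|im_end|>"]))
print(outputs[0].outputs[0].text)
```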
45 |
46 | ## Usage
47 |
48 | ### vLLM + Web Demo / OpenAI-like API
49 |
50 | You can use FastChat to launch a web demo or an OpenAI API server. First, install FastChat:
51 |
52 | ```bash
53 | pip install "fschat[model_worker,webui]=0.2.33" "openai<1.0"
54 | ```
55 |
56 | To run Qwen with vLLM and FastChat, you first need to launch a controller:
57 | ```bash
58 | python -m fastchat.serve.controller
59 | ```
60 |
61 | Then you can launch the model worker, which means loading your model for inference. For single GPU inference, you can directly run:
62 | ```bash
63 | python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code --dtype bfloat16
64 | # run int4 model or GPUs with compute capability less than 8.0
65 | # python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code --dtype float16
66 | ```
67 |
68 | However, if you want to run the model on multiple GPUs for faster inference or more memory, you can use the tensor parallelism supported by vLLM. Suppose you run the model on 4 GPUs; the command is shown below:
69 | ```bash
70 | python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code --tensor-parallel-size 4 --dtype bfloat16
71 | # run int4 model or GPUs with compute capability less than 8.0
72 | # python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code --tensor-parallel-size 4 --dtype float16
73 | ```
74 |
75 | After launching your model worker, you can launch a:
76 |
77 | * Web UI Demo
78 | ```bash
79 | python -m fastchat.serve.gradio_web_server
80 | ```
81 |
82 | * OpenAI API
83 | ```bash
84 | python -m fastchat.serve.openai_api_server --host localhost --port 8000
85 | ```
86 |
87 | For the OpenAI API server, you can invoke it in the following manner.
88 |
89 | ```python
90 | import openai
91 | openai.api_base = "http://localhost:8000/v1"
92 | openai.api_key = "none"
93 |
94 | # create a request activating streaming response
95 | for chunk in openai.ChatCompletion.create(
96 | model="Qwen",
97 | messages=[
98 | {"role": "user", "content": "你好"}
99 | ],
100 | stream=True
101 | # Specifying stop words in streaming output format is not yet supported and is under development.
102 | ):
103 | if hasattr(chunk.choices[0].delta, "content"):
104 | print(chunk.choices[0].delta.content, end="", flush=True)
105 |
106 | # create a request not activating streaming response
107 | response = openai.ChatCompletion.create(
108 | model="Qwen",
109 | messages=[
110 | {"role": "user", "content": "你好"}
111 | ],
112 | stream=False,
113 | stop=[] # You can add custom stop words here, e.g., stop=["Observation:"] for ReAct prompting.
114 | )
115 | print(response.choices[0].message.content)
116 | ```
117 |
118 | If you find `"POST /v1/chat/completions HTTP/1.1" 200 OK` in the openai_api_server log, it indicates that the call was successful.
119 |
120 | vLLM does not support dynamic-NTK RoPE. Therefore, extending the Qwen model to long sequences may lead to quality degradation (or even gibberish).
121 |
122 | ### vLLM + Transformer-like Wrapper
123 |
124 | You can download the [wrapper code](vllm_wrapper.py) and execute the following commands for multi-turn dialogue interaction. (Note: it currently only supports the ``model.chat()`` method.)
125 |
126 | ```python
127 | from vllm_wrapper import vLLMWrapper
128 |
129 | # Bfloat16 is only supported on GPUs with compute capability of at least 8.0,
130 | model = vLLMWrapper('Qwen/Qwen-7B-Chat', tensor_parallel_size=1)
131 |
132 | # run int4 model or GPUs with compute capability less than 8.0
133 | # model = vLLMWrapper('Qwen/Qwen-7B-Chat-Int4', tensor_parallel_size=1, dtype="float16")
134 |
135 | response, history = model.chat(query="你好", history=None)
136 | print(response)
137 | response, history = model.chat(query="给我讲一个年轻人奋斗创业最终取得成功的故事。", history=history)
138 | print(response)
139 | response, history = model.chat(query="给这个故事起一个标题", history=history)
140 | print(response)
141 | ```
142 | ### vLLM Standalone OpenAI-like API
143 |
144 | You can also deploy an OpenAI API server independently through vLLM. First, you need to download the [chat template file](template_chatml.jinja).
145 |
146 | Then, you can launch an OpenAI API server with the following command:
147 |
148 | ```bash
149 | python -m vllm.entrypoints.openai.api_server --model $model_path --trust-remote-code --chat-template template_chatml.jinja
150 |
151 | # run int4 model or GPUs with compute capability less than 8.0
152 | # python -m vllm.entrypoints.openai.api_server --model $model_path --trust-remote-code --dtype float16 --chat-template template_chatml.jinja
153 | ```
154 |
155 | For the vLLM standalone OpenAI API server, you need to set the `stop_token_ids` parameter to `[151645]` or the `stop` parameter to `["<|im_end|>"]` when invoking the server.
156 |
157 | ```python
158 | import openai
159 | openai.api_base = "http://localhost:8000/v1"
160 | openai.api_key = "none"
161 |
162 | # create a request activating streaming response
163 | for chunk in openai.ChatCompletion.create(
164 | model="Qwen",
165 | messages=[
166 | {"role": "user", "content": "你好"}
167 | ],
168 | stream=True,
169 | stop_token_ids=[151645]
170 | ):
171 | if hasattr(chunk.choices[0].delta, "content"):
172 | print(chunk.choices[0].delta.content, end="", flush=True)
173 |
174 | # create a request not activating streaming response
175 | response = openai.ChatCompletion.create(
176 | model="Qwen",
177 | messages=[
178 | {"role": "user", "content": "你好"}
179 | ],
180 | stream=False,
181 | stop_token_ids=[151645]
182 | )
183 | print(response.choices[0].message.content)
184 | ```
--------------------------------------------------------------------------------
/recipes/inference/vllm/template_chatml.jinja:
--------------------------------------------------------------------------------
1 | {% for message in messages %}
2 | {% if loop.first and message['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}
3 | {{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>\n' }}
4 | {% if loop.last and add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}
5 | {% endfor %}
6 |
--------------------------------------------------------------------------------
/recipes/tests/README.md:
--------------------------------------------------------------------------------
1 | # Unit testing
2 | - Run all unit tests
3 | ```bash
4 | cd tests && pytest -s
5 | ```
6 | - Run unit tests under a single folder
7 | ```bash
8 | cd tests && pytest -s {dir}
9 | ```
10 | - Rerun the test cases that failed in the last run
11 | ```bash
12 | cd tests && pytest -s --lf
13 | ```
--------------------------------------------------------------------------------
/recipes/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/recipes/tests/__init__.py
--------------------------------------------------------------------------------
/recipes/tests/assets/test_sampled_qwen.json:
--------------------------------------------------------------------------------
1 | [{"conversations": [{"from": "user", "value": "你好"}, {"from": "assistant", "value": "你好!很高兴为你提供帮助。"}], "id": "identity_0"}, {"conversations": [{"from": "user", "value": "给我讲一个年轻人奋斗创业最终取得成功的故事。"}, {"from": "assistant", "value": "这是一个关于一个年轻人奋斗创业最终取得成功的故事。"}], "id": "identity_1"}]
--------------------------------------------------------------------------------
/recipes/tests/test_finetune/test_finetune_ds.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import pytest
4 | import shutil
5 | from itertools import product
6 | import torch
7 | from modelscope.hub.snapshot_download import snapshot_download
8 |
9 | sys.path.append(os.path.dirname(__file__) + "/..")
10 | from utils import run_in_subprocess
11 | from ut_config import (
12 | MODEL_TYPE,
13 | DOCKER_VERSION_CU114,
14 | DOCKER_VERSION_CU117,
15 | DOCKER_VERSION_CU121,
16 | DOCKER_MOUNT_DIR,
17 | DOCKER_TEST_DIR,
18 | DATA_DIR,
19 | DS_CONFIG_ZERO2_DIR,
20 | DS_CONFIG_ZERO3_DIR,
21 | )
22 |
23 | is_chat = ["chat", "base"]
24 | docker_version = [DOCKER_VERSION_CU114, DOCKER_VERSION_CU117, DOCKER_VERSION_CU121]
25 | # ZeRO3 is incompatible with LoRA when finetuning on base model.
26 | # FSDP or ZeRO3 are incompatible with QLoRA.
27 | parametrize_list_none_ds = list(
28 | product(*[[1], ["full", "lora"], is_chat, docker_version, [None]])
29 | )
30 | parametrize_list_ds_zero2 = list(
31 | product(*[[2], ["full", "lora"], is_chat, docker_version, [DS_CONFIG_ZERO2_DIR]])
32 | )
33 | parametrize_list_ds_zero3 = list(
34 | product(*[[2], ["full"], is_chat, docker_version, [DS_CONFIG_ZERO3_DIR]])
35 | ) + list(product(*[[2], ["lora"], ["chat"], docker_version, [DS_CONFIG_ZERO3_DIR]]))
36 | parametrize_list_qlora = list(
37 | product(*[[1, 2], ["qlora"], ["chat"], docker_version, [None, DS_CONFIG_ZERO2_DIR]])
38 | )
39 | parametrize_list = (
40 | parametrize_list_none_ds
41 | + parametrize_list_ds_zero2
42 | + parametrize_list_ds_zero3
43 | + parametrize_list_qlora
44 | )
45 |
46 |
47 | @pytest.mark.parametrize(
48 | "num_gpus,train_type,is_chat,docker_version,deepspeed", parametrize_list
49 | )
50 | def test_finetune(num_gpus, train_type, is_chat, docker_version, deepspeed):
51 | cmd_docker = f"docker run --gpus all --ipc=host --network=host --rm -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {docker_version} /bin/bash -c "
52 | cmd = ""
53 | # for GPUs SM < 80
54 | is_ampere = torch.cuda.get_device_capability()[0] >= 8
55 | if not is_ampere:
56 | cmd = f"pip uninstall -y flash-attn && "
57 |
58 | model_type = f"{MODEL_TYPE}-Chat" if is_chat == "chat" else MODEL_TYPE
59 | model_type = f"{model_type}-Int4" if train_type == "qlora" else model_type
60 | cmd += f"""torchrun --nproc_per_node {num_gpus} --nnodes 1 --node_rank 0 --master_addr localhost --master_port 12345 {DOCKER_MOUNT_DIR}/finetune.py \
61 | --model_name_or_path "{DOCKER_TEST_DIR}/{model_type}/" \
62 | --data_path {DATA_DIR} \
63 | --output_dir "{DOCKER_TEST_DIR}/output_qwen" \
64 | --num_train_epochs 1 \
65 | --per_device_train_batch_size 1 \
66 | --per_device_eval_batch_size 1 \
67 | --gradient_accumulation_steps 2 \
68 | --evaluation_strategy "no" \
69 | --save_strategy "steps" \
70 | --save_steps 1000 \
71 | --save_total_limit 10 \
72 | --learning_rate 1e-5 \
73 | --weight_decay 0.1 \
74 | --adam_beta2 0.95 \
75 | --warmup_ratio 0.01 \
76 | --lr_scheduler_type "cosine" \
77 | --logging_steps 1 \
78 | --report_to "none" \
79 | --model_max_length 512"""
80 | if deepspeed:
81 | cmd += f" --deepspeed {deepspeed}"
82 | if train_type == "lora":
83 | cmd += " --use_lora"
84 | elif train_type == "qlora":
85 | cmd += " --use_lora --q_lora"
86 | # for SM < 80
87 | if (
88 | (not is_ampere)
89 | and train_type == "lora"
90 | and (deepspeed and "zero2" in deepspeed)
91 | and is_chat == "base"
92 | ):
93 | cmd += " --fp16 True"
94 | snapshot_download(model_type, cache_dir=".", revision="master")
95 | run_in_subprocess(cmd_docker + f'"{cmd}"')
96 | if train_type == "full":
97 | assert os.path.exists("output_qwen/config.json")
98 | else:
99 | assert os.path.exists("output_qwen/adapter_config.json")
100 | shutil.rmtree("output_qwen")
101 |
--------------------------------------------------------------------------------
/recipes/tests/test_inference/test_inference_api.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | import pytest
5 | import subprocess
6 | import torch
7 | from modelscope.hub.snapshot_download import snapshot_download
8 |
9 | sys.path.append(os.path.dirname(__file__) + "/..")
10 | from utils import run_in_subprocess, simple_openai_api, TelnetPort
11 | from ut_config import (
12 | MODEL_TYPE,
13 | DOCKER_VERSION_CU114,
14 | DOCKER_VERSION_CU117,
15 | DOCKER_VERSION_CU121,
16 | DOCKER_MOUNT_DIR,
17 | DOCKER_TEST_DIR,
18 | )
19 |
20 |
21 | # use_cpu=True, use_int4=False -> RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'
22 | # use_cpu=True, use_int4=True -> ValueError: Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU. You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object
23 | @pytest.mark.parametrize(
24 | "docker_version,use_cpu,use_int4",
25 | [
26 | (DOCKER_VERSION_CU114, False, False),
27 | (DOCKER_VERSION_CU114, False, True),
28 | (DOCKER_VERSION_CU117, False, False),
29 | (DOCKER_VERSION_CU117, False, True),
30 | (DOCKER_VERSION_CU121, False, False),
31 | (DOCKER_VERSION_CU121, False, True),
32 | ],
33 | )
34 | def test_inference_api(docker_version, use_cpu, use_int4):
35 | container_name = "test_inference_api"
36 | model_type = f"{MODEL_TYPE}-Chat-Int4" if use_int4 else f"{MODEL_TYPE}-Chat"
37 | cmd_docker = f'docker run --gpus all --ipc=host --network=host --rm --name="{container_name}" -p 8000:8000 -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {docker_version} /bin/bash -c '
38 | cmd = ""
39 | # GPUs with SM < 80 (pre-Ampere): remove flash-attn before running
40 | is_ampere = torch.cuda.get_device_capability()[0] >= 8
41 | if not is_ampere:
42 | cmd += "pip uninstall -y flash-attn && "
43 |
44 | cmd += f"""python {DOCKER_MOUNT_DIR}/openai_api.py -c {DOCKER_TEST_DIR}/{model_type}"""
45 |
46 | if use_cpu:
47 | cmd += " --cpu-only"
48 |
49 | snapshot_download(model_type, cache_dir=".", revision="master")
50 | # start model server
51 | print(cmd_docker + f'"{cmd}"')
52 | run_in_subprocess(
53 | f'docker rm -f {container_name} 2>/dev/null || echo "The container does not exist."'
54 | )
55 | run_in_subprocess("nohup " + cmd_docker + f'"{cmd}"' + " > tmp.log 2>&1 &")
56 |
57 | while not TelnetPort("localhost", 8000):
58 | print("Wait for the model service start.")
59 | time.sleep(0.5)
60 |
61 | if (
62 | subprocess.run(
63 | f"docker inspect {container_name}",
64 | shell=True,
65 | stdout=subprocess.DEVNULL,
66 | ).returncode
67 | != 0
68 | ):
69 | break
70 | try:
71 | # when loading an Int4 model such as Qwen-1_8B-Chat-Int4, the served model name is still Qwen-1_8B-Chat
72 | simple_openai_api(f"{MODEL_TYPE}-Chat".split("/")[-1])
73 | except Exception as e:
74 | time.sleep(1)
75 | with open("tmp.log") as f:
76 | raise Exception(f"{e} \n {f.read()}")
77 |
78 | run_in_subprocess(f"docker rm -f {container_name}")
79 |
--------------------------------------------------------------------------------
/recipes/tests/test_inference/test_inference_vllm_fschat.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | import pytest
5 | import subprocess
6 | import torch
7 | from modelscope.hub.snapshot_download import snapshot_download
8 |
9 | sys.path.append(os.path.dirname(__file__) + "/..")
10 | from utils import run_in_subprocess, simple_openai_api, TelnetPort
11 | from ut_config import (
12 | MODEL_TYPE,
13 | DOCKER_VERSION_CU121,
14 | DOCKER_MOUNT_DIR,
15 | DOCKER_TEST_DIR,
16 | )
17 |
18 |
19 | @pytest.mark.parametrize(
20 | "num_gpus,use_int4",
21 | [
22 | (1, False),
23 | (1, True),
24 | (2, False),
25 | # ValueError: The input size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.
26 | # (2, True)
27 | ],
28 | )
29 | def test_inference_vllm_fschat(num_gpus, use_int4):
30 | model_type = f"{MODEL_TYPE}-Chat-Int4" if use_int4 else f"{MODEL_TYPE}-Chat"
31 | container_name = "test_inference_vllm_fschat"
32 | cmd_docker = f'docker run --gpus all --ipc=host --network=host --rm --name="{container_name}" -p 8000:8000 -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {DOCKER_VERSION_CU121} /bin/bash -c '
33 | cmd = ""
34 |
35 | cmd += f"""nohup python -m fastchat.serve.controller > /dev/null 2>&1 \
36 | & python -m fastchat.serve.openai_api_server --host localhost --port 8000 > /dev/null 2>&1 \
37 | & python -m fastchat.serve.vllm_worker --model-path {DOCKER_TEST_DIR}/{model_type} --tensor-parallel-size {num_gpus} --trust-remote-code"""
38 |
39 | # use half precision on GPUs with SM < 80, or when use_int4 is True
40 | is_ampere = torch.cuda.get_device_capability()[0] >= 8
41 | if not is_ampere or use_int4:
42 | cmd += " --dtype half"
43 |
44 | snapshot_download(model_type, cache_dir=".", revision="master")
45 | # start model server
46 | run_in_subprocess(
47 | f'docker rm -f {container_name} 2>/dev/null || echo "The container does not exist."'
48 | )
49 | print(cmd_docker + f'"{cmd}"')
50 | run_in_subprocess("nohup " + cmd_docker + f'"{cmd}"' + " > tmp.log 2>&1 &")
51 |
52 | while not TelnetPort("localhost", 21002):
53 | print("Wait for the model service start.")
54 | time.sleep(0.5)
55 |
56 | if (
57 | subprocess.run(
58 | f"docker inspect {container_name}",
59 | shell=True,
60 | stdout=subprocess.DEVNULL,
61 | ).returncode
62 | != 0
63 | ):
64 | break
65 |
66 | try:
67 | simple_openai_api(model_type.split("/")[-1])
68 | except Exception as e:
69 | time.sleep(1)
70 | with open("tmp.log") as f:
71 | raise Exception(f"{e} \n {f.read()}")
72 |
73 | run_in_subprocess(f"docker rm -f {container_name}")
74 |
--------------------------------------------------------------------------------
/recipes/tests/ut_config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | # common
4 | MODEL_TYPE = "Qwen/Qwen-1_8B"
5 | DOCKER_VERSION_CU114 = "qwenllm/qwen:cu114"
6 | DOCKER_VERSION_CU117 = "qwenllm/qwen:cu117"
7 | DOCKER_VERSION_CU121 = "qwenllm/qwen:cu121"
8 | DOCKER_MOUNT_DIR = "/qwen-recipes"
9 | DOCKER_TEST_DIR = os.path.join(DOCKER_MOUNT_DIR, "recipes/tests")
10 |
11 | # finetune
12 | DATA_DIR = os.path.join(DOCKER_MOUNT_DIR, "recipes/tests/assets/test_sampled_qwen.json")
13 | DS_CONFIG_ZERO2_DIR = os.path.join(
14 | DOCKER_MOUNT_DIR, "finetune/ds_config_zero2.json"
15 | )
16 | DS_CONFIG_ZERO3_DIR = os.path.join(
17 | DOCKER_MOUNT_DIR, "finetune/ds_config_zero3.json"
18 | )
19 |
--------------------------------------------------------------------------------
/recipes/tests/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import subprocess
3 | import socket
4 | import openai
5 |
6 |
7 | def run_in_subprocess(cmd):
8 | try:
9 | with subprocess.Popen(
10 | cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
11 | ) as return_info:
12 | while True:
13 | next_line = return_info.stdout.readline()
14 | return_line = next_line.decode("utf-8", "ignore").strip()
15 | if return_line == "" and return_info.poll() is not None:
16 | break
17 | if return_line != "":
18 | logging.info(return_line)
19 |
20 | err_lines = ""
21 | while True:
22 | next_line = return_info.stderr.readline()
23 | return_line = next_line.decode("utf-8", "ignore").strip()
24 | if return_line == "" and return_info.poll() is not None:
25 | break
26 | if return_line != "":
27 | logging.info(return_line)
28 | err_lines += return_line + "\n"
29 |
30 | return_code = return_info.wait()
31 | if return_code:
32 | raise RuntimeError(err_lines)
33 | except Exception as e:
34 | raise e
35 |
36 |
37 | def simple_openai_api(model):
38 | openai.api_base = "http://localhost:8000/v1"
39 | openai.api_key = "none"
40 |
41 | # create a chat completion request with streaming disabled
42 | response = openai.ChatCompletion.create(
43 | model=model,
44 | messages=[{"role": "user", "content": "你好"}],
45 | stream=False,
46 | stop=[], # You can add custom stop words here, e.g., stop=["Observation:"] for ReAct prompting.
47 | )
48 | print(response.choices[0].message.content)
49 |
50 |
51 | def TelnetPort(server_ip, port):  # return True if a TCP connection to (server_ip, port) succeeds within 1s, i.e. the service is listening
52 | sk = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
53 | sk.settimeout(1)
54 | connect_flag = False
55 | try:
56 | sk.connect((server_ip, port))
57 | connect_flag = True
58 | except Exception:
59 | connect_flag = False
60 | sk.close()
61 | return connect_flag
62 |
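63 | 
64 | # Illustrative only (not part of the original file): how the helpers above are combined by the
65 | # tests in this directory -- launch a service, poll its port, then issue one API request:
66 | #
67 | #   run_in_subprocess("nohup docker run ... > tmp.log 2>&1 &")  # start the model server
68 | #   while not TelnetPort("localhost", 8000):                    # wait until the port accepts connections
69 | #       time.sleep(0.5)
70 | #   simple_openai_api("Qwen-1_8B-Chat")                         # send a single non-streaming request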
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.32.0,<4.38.0
2 | accelerate
3 | tiktoken
4 | einops
5 | transformers_stream_generator==0.0.4
6 | scipy
7 |
--------------------------------------------------------------------------------
/requirements_web_demo.txt:
--------------------------------------------------------------------------------
1 | gradio<3.42
2 | mdtex2html
3 |
--------------------------------------------------------------------------------
/run_gptq.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | from typing import Dict
4 | import logging
5 |
6 | import torch
7 | import transformers
8 | from transformers import AutoTokenizer
9 | from transformers.trainer_pt_utils import LabelSmoother
10 | from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
11 | IGNORE_TOKEN_ID = LabelSmoother.ignore_index
12 |
13 | def preprocess(
14 | sources,
15 | tokenizer: transformers.PreTrainedTokenizer,
16 | max_len: int,
17 | system_message: str = "You are a helpful assistant."
18 | ) -> Dict:
19 | roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
20 |
21 | im_start = tokenizer.im_start_id
22 | im_end = tokenizer.im_end_id
23 | nl_tokens = tokenizer('\n').input_ids
24 | _system = tokenizer('system').input_ids + nl_tokens
25 | _user = tokenizer('user').input_ids + nl_tokens
26 | _assistant = tokenizer('assistant').input_ids + nl_tokens
27 |
28 | # Apply prompt templates
29 | data = []
30 | # input_ids, targets = [], []
31 | for i, source in enumerate(sources):
32 | source = source["conversations"]
33 | if roles[source[0]["from"]] != roles["user"]:
34 | source = source[1:]
35 |
36 | input_id, target = [], []
37 | system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens
38 | input_id += system
39 | target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens
40 | assert len(input_id) == len(target)
41 | for j, sentence in enumerate(source):
42 | role = roles[sentence["from"]]
43 | _input_id = tokenizer(role).input_ids + nl_tokens + \
44 | tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens
45 | input_id += _input_id
46 | if role == '<|im_start|>user':
47 | _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens
48 | elif role == '<|im_start|>assistant':
49 | _target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \
50 | _input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens
51 | else:
52 | raise NotImplementedError
53 | target += _target
54 | assert len(input_id) == len(target)
55 | input_id = torch.tensor(input_id[:max_len], dtype=torch.int)
56 | target = torch.tensor(target[:max_len], dtype=torch.int)
57 | data.append(dict(input_ids=input_id, attention_mask=input_id.ne(tokenizer.pad_token_id)))
58 |
59 | return data
60 |
61 |
62 | if __name__ == "__main__":
63 | parser = argparse.ArgumentParser("Model Quantization using AutoGPTQ")
64 | parser.add_argument("--model_name_or_path", type=str, help="model path")
65 | parser.add_argument("--data_path", type=str, help="calibration data path")
66 | parser.add_argument("--out_path", type=str, help="output path of the quantized model")
67 | parser.add_argument("--max_len", type=int, default=8192, help="max length of calibration data")
68 | parser.add_argument("--bits", type=int, default=4, help="the bits of quantized model. 4 indicates int4 models.")
69 | parser.add_argument("--group-size", type=int, default=128, help="the group size of quantized model")
70 | args = parser.parse_args()
71 |
72 | quantize_config = BaseQuantizeConfig(
73 | bits=args.bits,
74 | group_size=args.group_size,
75 | damp_percent=0.01,
76 | desc_act=False, # setting this to False can significantly speed up inference, though perplexity may be slightly worse
77 | static_groups=False,
78 | sym=True,
79 | true_sequential=True,
80 | model_name_or_path=None,
81 | model_file_base_name="model"
82 | )
83 |
84 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True)
85 | tokenizer.pad_token_id = tokenizer.eod_id
86 | data = preprocess(json.load(open(args.data_path)), tokenizer, args.max_len)
87 |
88 | model = AutoGPTQForCausalLM.from_pretrained(args.model_name_or_path, quantize_config, device_map="auto", trust_remote_code=True)
89 |
90 | logging.basicConfig(
91 | format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
92 | )
93 | model.quantize(data, cache_examples_on_gpu=False)
94 |
95 | model.save_quantized(args.out_path, use_safetensors=True)
96 | tokenizer.save_pretrained(args.out_path)
97 |
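98 | 
99 | # Illustrative invocation (not part of the original file); the paths below are placeholders, and the
100 | # calibration file is expected to hold a list of {"conversations": [...]} samples, as read by preprocess():
101 | #   python run_gptq.py \
102 | #       --model_name_or_path Qwen/Qwen-7B-Chat \
103 | #       --data_path calibration_data.json \
104 | #       --out_path Qwen-7B-Chat-Int4-custom \
105 | #       --bits 4 --group-size 128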
--------------------------------------------------------------------------------
/tokenization_note_ja.md:
--------------------------------------------------------------------------------
1 | # Tokenization
2 | 
3 | Qwen-7B uses the `tiktoken` package to apply BPE tokenization to UTF-8 bytes.
4 | There are two kinds of tokens in Qwen-7B: regular tokens of the BPE (of type `bytes`) and special/control tokens (of type `str`).
5 |
6 | ```python
7 | from transformers import AutoTokenizer
8 |
9 | tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True)
10 | ```
11 |
12 | ## Regular Tokens
13 | 
14 | Regular tokens are BPE tokens learned from byte sequences of text encoded in UTF-8.
15 | They allow every text to be tokenized with no unknown token, but tokenization may fall back to single bytes for uncommon text.
16 | You may then run into UTF-8 decoding errors; since the default for such errors is `replace`, the replacement character (�) appears in incomplete generations.
17 | This behavior can be changed by passing `errors="ignore"` to the `decode` function.
18 | For the available options of `errors`, see the [Python documentation](https://docs.python.org/3/library/stdtypes.html#bytes.decode).
19 |
20 | ```python
21 | >>> tokenizer.decode([51461])
22 | ' �'
23 |
24 | >>> tokenizer.convert_ids_to_tokens([51461])
25 | [b' \xe6\xa0']
26 |
27 | >>> b' \xe6\xa0'.decode("utf-8", errors='replace')
28 | ' �'
29 |
30 | >>> tokenizer.decode([51461, 117])
31 | ' 根'
32 |
33 | >>> tokenizer.convert_ids_to_tokens([51461, 117])
34 | [b' \xe6\xa0', b'\xb9']
35 |
36 | >>> b' \xe6\xa0\xb9'.decode("utf-8", errors='replace')
37 | ' 根'
38 | ```
39 |
40 | The mapping from regular tokens (as `bytes`) to their IDs can be retrieved from `tokenizer.get_vocab()`.
41 | Adding regular tokens to the vocabulary is neither supported nor recommended.
42 |
43 | ## Special Tokens
44 | 
45 | Special tokens signal special functions to the model, for example, reaching the end of a document.
46 | In theory, they do not exist in the input text and only appear after the input text has been processed.
47 | Their surface forms, such as `<|endoftext|>` for the end of a document, exist only for ease of reference.
48 | Currently, `<|endoftext|>` is used as a special token in Qwen-7B, and `<|endoftext|>`, `<|im_start|>`, and `<|im_end|>` are used in Qwen-7B-Chat.
49 | Special tokens from `<|extra_0|>` to `<|extra_204|>` are reserved for other purposes.
50 | The mapping from the surface forms of special tokens (as `str`) to their IDs can be retrieved from `tokenizer.special_tokens`.
51 |
52 | Concepts such as `bos`, `eos`, `unk`, `pad`, `mask`, and `sep` do not apply to our pretrained models (Qwen-7B and Qwen-7B-Chat).
53 | The `pad` token, however, is a different story: in theory the model never sees or computes this token, so any known token may be used for it.
54 | To be safe, though, we limit the values of special tokens specified at tokenizer initialization to the known special tokens.
55 | If you need special tokens for fine-tuning or another framework, you can specify them like this:
56 |
57 | ```python
58 | from transformers import AutoTokenizer
59 |
60 | tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True, pad_token='<|endoftext|>')
61 | ```
62 |
63 | > WARNING: For our pretrained models, setting `bos`, `eos`, `unk`, and the like makes no sense.
64 | > In particular, such tokens should not be set casually.
65 | > Avoid using `<|endoftext|>` as `eos` unless you are sure that the end of a sentence and the end of a document are the same in your case.
66 |
67 | ## Injection Attack Prevention
68 | 
69 | Because special tokens are handled differently from regular tokens, what happens if the surface form of a control token appears in the input text?
70 | For example, consider the following text:
71 |
72 | ```
73 | print("<|endoftext|>")
74 | ```
75 |
76 | It should be tokenized as:
77 |
78 | ```
79 | ids:[1350, 9639, 91, 8691, 723, 427, 91, 82598]
80 | tokens: [b'print', b'("<', b'|', b'endo', b'ft', b'ext', b'|', b'>")']
81 | ```
82 |
83 | Not as:
84 |
85 | ```
86 | ids: [1350, 445, 151643, 899]
87 | tokens: [b'print', b'("', '<|endoftext|>', b'")']
88 | ```
89 |
90 | That is, the surface forms of special tokens are treated just like regular text, and special tokens are handled by the developer after the text has been tokenized.
91 | However, this conflicts with the (albeit unsafe) established practice in the community and adds an extra step for developers who want to reuse existing tooling.
92 | 
93 | The default behavior has therefore been changed to parse the surface forms of all known special tokens as special tokens.
94 | To enable injection prevention, pass `allowed_special=set()` to the tokenizer call:
95 |
96 | ```python
97 | >>> tokenizer('print("<|endoftext|>")', allowed_special=set())
98 | {'input_ids': [1350, 9639, 91, 8691, 723, 427, 91, 82598], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
99 | ```
100 |
101 | You can control the behavior at a finer granularity by passing a set of `str` as `allowed_special`:
102 |
103 | ```python
104 | >>> tokenizer('print("<|extra_0|>")<|endoftext|>', allowed_special={'<|endoftext|>'})
105 | {'input_ids': [1350, 9639, 91, 15460, 62, 15, 91, 82598, 151643], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
106 | ```
107 |
108 | You can also make the tokenizer raise an error when the surface form of a particular special token is encountered in the input text by passing a collection of `str` as `disallowed_special`:
109 |
110 | ```python
111 | >>> tokenizer('print("<|extra_0|>")<|endoftext|>', allowed_special={'<|endoftext|>'}, disallowed_special=('<|extra_0|>', ))
112 | ...
113 | ValueError: Encountered text corresponding to disallowed special token '<|extra_0|>'.
114 | If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|extra_0|>', ...}`.
115 | If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|extra_0|>'})`.
116 | To disable this check for all special tokens, pass `disallowed_special=()`.
117 | ```
118 |
119 | For more information on `allowed_special` and `disallowed_special`, see the [`tiktoken` documentation](https://github.com/openai/tiktoken/blob/095924e02c85617df6889698d94515f91666c7ea/tiktoken/core.py#L75).
120 | 
121 | The new default is as follows:
122 |
123 | ```python
124 | >>> tokenizer('print("<|endoftext|>")', allowed_special="all", disallowed_special=())
125 | {'input_ids': [1350, 445, 151643, 899], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}
126 | ```
127 |
128 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM
3 | from accelerate import dispatch_model
4 |
5 |
6 | def _device_map(num_gpus, num_layers):
7 | per_gpu_layers = (num_layers + 2) / num_gpus
8 |
9 | device_map = {
10 | 'transformer.wte': 0,
11 | 'transformer.ln_f': 0,
12 | 'lm_head': num_gpus-1
13 | }
14 |
15 | used = 1
16 | gpu_target = 0
17 | for i in range(num_layers):
18 | if used >= per_gpu_layers:
19 | gpu_target += 1
20 | used = 0 if gpu_target < num_gpus-1 else 1
21 | assert gpu_target < num_gpus
22 | device_map[f'transformer.h.{i}'] = gpu_target
23 | used += 1
24 |
25 | return device_map
26 |
27 |
28 | def load_model_on_gpus(model_name_or_path, num_gpus: int = 2):
29 | num_devices = torch.cuda.device_count()
30 |
31 | if num_gpus == 1:
32 | model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='auto',
33 | trust_remote_code=True).eval()
34 | elif 1 < num_gpus <= num_devices:
35 | model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='cpu',
36 | trust_remote_code=True).eval()
37 | num_layers = model.config.num_hidden_layers
38 | device_map = _device_map(num_gpus, num_layers)
39 | print(device_map)
40 | model = dispatch_model(model, device_map=device_map)
41 | else:
42 | raise KeyError(f"num_gpus should be between 1 and the number of available GPUs ({num_devices}), got {num_gpus}")
43 |
44 | return model
45 |
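46 | 
47 | # Illustrative usage (not part of the original file): shard a Qwen checkpoint across two GPUs.
48 | # _device_map() keeps transformer.wte and transformer.ln_f on GPU 0 and lm_head on the last GPU,
49 | # and spreads the transformer.h.* blocks roughly evenly across the devices.
50 | #
51 | #   model = load_model_on_gpus("Qwen/Qwen-7B-Chat", num_gpus=2)
52 | #   # the returned model is in eval mode and ready for generation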
--------------------------------------------------------------------------------
/web_demo.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba Cloud.
2 | #
3 | # This source code is licensed under the license found in the
4 | # LICENSE file in the root directory of this source tree.
5 |
6 | """A simple web interactive chat demo based on gradio."""
7 | import os
8 | from argparse import ArgumentParser
9 |
10 | import gradio as gr
11 | import mdtex2html
12 |
13 | import torch
14 | from transformers import AutoModelForCausalLM, AutoTokenizer
15 | from transformers.generation import GenerationConfig
16 |
17 |
18 | DEFAULT_CKPT_PATH = 'Qwen/Qwen-7B-Chat'
19 |
20 |
21 | def _get_args():
22 | parser = ArgumentParser()
23 | parser.add_argument("-c", "--checkpoint-path", type=str, default=DEFAULT_CKPT_PATH,
24 | help="Checkpoint name or path, default to %(default)r")
25 | parser.add_argument("--cpu-only", action="store_true", help="Run demo with CPU only")
26 |
27 | parser.add_argument("--share", action="store_true", default=False,
28 | help="Create a publicly shareable link for the interface.")
29 | parser.add_argument("--inbrowser", action="store_true", default=False,
30 | help="Automatically launch the interface in a new tab on the default browser.")
31 | parser.add_argument("--server-port", type=int, default=8000,
32 | help="Demo server port.")
33 | parser.add_argument("--server-name", type=str, default="127.0.0.1",
34 | help="Demo server name.")
35 |
36 | args = parser.parse_args()
37 | return args
38 |
39 |
40 | def _load_model_tokenizer(args):
41 | tokenizer = AutoTokenizer.from_pretrained(
42 | args.checkpoint_path, trust_remote_code=True, resume_download=True,
43 | )
44 |
45 | if args.cpu_only:
46 | device_map = "cpu"
47 | else:
48 | device_map = "auto"
49 |
50 | model = AutoModelForCausalLM.from_pretrained(
51 | args.checkpoint_path,
52 | device_map=device_map,
53 | trust_remote_code=True,
54 | resume_download=True,
55 | ).eval()
56 |
57 | config = GenerationConfig.from_pretrained(
58 | args.checkpoint_path, trust_remote_code=True, resume_download=True,
59 | )
60 |
61 | return model, tokenizer, config
62 |
63 |
64 | def postprocess(self, y):
65 | if y is None:
66 | return []
67 | for i, (message, response) in enumerate(y):
68 | y[i] = (
69 | None if message is None else mdtex2html.convert(message),
70 | None if response is None else mdtex2html.convert(response),
71 | )
72 | return y
73 |
74 |
75 | gr.Chatbot.postprocess = postprocess
76 |
77 |
78 | def _parse_text(text):  # escape Markdown/HTML special characters so model output renders safely in the Chatbot
79 | lines = text.split("\n")
80 | lines = [line for line in lines if line != ""]
81 | count = 0
82 | for i, line in enumerate(lines):
83 | if "```" in line:
84 | count += 1
85 | items = line.split("`")
86 | if count % 2 == 1:
87 | lines[i] = f'<pre><code class="language-{items[-1]}">'
88 | else:
89 | lines[i] = f"<br></code></pre>"
90 | else:
91 | if i > 0:
92 | if count % 2 == 1:
93 | line = line.replace("`", r"\`")
94 | line = line.replace("<", "&lt;")
95 | line = line.replace(">", "&gt;")
96 | line = line.replace(" ", "&nbsp;")
97 | line = line.replace("*", "&ast;")
98 | line = line.replace("_", "&lowbar;")
99 | line = line.replace("-", "&#45;")
100 | line = line.replace(".", "&#46;")
101 | line = line.replace("!", "&#33;")
102 | line = line.replace("(", "&#40;")
103 | line = line.replace(")", "&#41;")
104 | line = line.replace("$", "&#36;")
105 | lines[i] = "<br>" + line
106 | text = "".join(lines)
107 | return text
108 |
109 |
110 | def _gc():
111 | import gc
112 | gc.collect()
113 | if torch.cuda.is_available():
114 | torch.cuda.empty_cache()
115 |
116 |
117 | def _launch_demo(args, model, tokenizer, config):
118 |
119 | def predict(_query, _chatbot, _task_history):
120 | print(f"User: {_parse_text(_query)}")
121 | _chatbot.append((_parse_text(_query), ""))
122 | full_response = ""
123 |
124 | for response in model.chat_stream(tokenizer, _query, history=_task_history, generation_config=config):
125 | _chatbot[-1] = (_parse_text(_query), _parse_text(response))
126 |
127 | yield _chatbot
128 | full_response = _parse_text(response)
129 |
130 | print(f"History: {_task_history}")
131 | _task_history.append((_query, full_response))
132 | print(f"Qwen-Chat: {_parse_text(full_response)}")
133 |
134 | def regenerate(_chatbot, _task_history):
135 | if not _task_history:
136 | yield _chatbot
137 | return
138 | item = _task_history.pop(-1)
139 | _chatbot.pop(-1)
140 | yield from predict(item[0], _chatbot, _task_history)
141 |
142 | def reset_user_input():
143 | return gr.update(value="")
144 |
145 | def reset_state(_chatbot, _task_history):
146 | _task_history.clear()
147 | _chatbot.clear()
148 | _gc()
149 | return _chatbot
150 |
151 | with gr.Blocks() as demo:
152 | gr.Markdown("""\
153 | <p align="center"><img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/logo_qwen.jpg" style="height: 80px"/><p>""")
154 | gr.Markdown("""<center><font size=8>Qwen-Chat Bot</center>""")
155 | gr.Markdown(
156 | """\
157 | <center><font size=3>This WebUI is based on Qwen-Chat, developed by Alibaba Cloud. \
158 | (本WebUI基于Qwen-Chat打造,实现聊天机器人功能。)</center>""")
159 | gr.Markdown("""\
160 | <center><font size=4>
161 | Qwen-7B <a href="https://modelscope.cn/models/qwen/Qwen-7B/summary">🤖</a> |
162 | <a href="https://huggingface.co/Qwen/Qwen-7B">🤗</a>&nbsp ｜
163 | Qwen-7B-Chat <a href="https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary">🤖</a> |
164 | <a href="https://huggingface.co/Qwen/Qwen-7B-Chat">🤗</a>&nbsp ｜
165 | Qwen-14B <a href="https://modelscope.cn/models/qwen/Qwen-14B/summary">🤖</a> |
166 | <a href="https://huggingface.co/Qwen/Qwen-14B">🤗</a>&nbsp ｜
167 | Qwen-14B-Chat <a href="https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary">🤖</a> |
168 | <a href="https://huggingface.co/Qwen/Qwen-14B-Chat">🤗</a>&nbsp ｜
169 | &nbsp<a href="https://github.com/QwenLM/Qwen">Github</a></center>""")
170 |
171 | chatbot = gr.Chatbot(label='Qwen-Chat', elem_classes="control-height")
172 | query = gr.Textbox(lines=2, label='Input')
173 | task_history = gr.State([])
174 |
175 | with gr.Row():
176 | empty_btn = gr.Button("🧹 Clear History (清除历史)")
177 | submit_btn = gr.Button("🚀 Submit (发送)")
178 | regen_btn = gr.Button("🤔️ Regenerate (重试)")
179 |
180 | submit_btn.click(predict, [query, chatbot, task_history], [chatbot], show_progress=True)
181 | submit_btn.click(reset_user_input, [], [query])
182 | empty_btn.click(reset_state, [chatbot, task_history], outputs=[chatbot], show_progress=True)
183 | regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
184 |
185 | gr.Markdown("""\
186 | Note: This demo is governed by the original license of Qwen. \
187 | We strongly advise users not to knowingly generate or allow others to knowingly generate harmful content, \
188 | including hate speech, violence, pornography, deception, etc. \
189 | (注:本演示受Qwen的许可协议限制。我们强烈建议,用户不应传播及不应允许他人传播以下内容,\
190 | 包括但不限于仇恨言论、暴力、色情、欺诈相关的有害信息。)""")
191 |
192 | demo.queue().launch(
193 | share=args.share,
194 | inbrowser=args.inbrowser,
195 | server_port=args.server_port,
196 | server_name=args.server_name,
197 | )
198 |
199 |
200 | def main():
201 | args = _get_args()
202 |
203 | model, tokenizer, config = _load_model_tokenizer(args)
204 |
205 | _launch_demo(args, model, tokenizer, config)
206 |
207 |
208 | if __name__ == '__main__':
209 | main()
210 |
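211 | 
212 | # Illustrative invocation (not part of the original file), using the flags defined in _get_args():
213 | #   python web_demo.py -c Qwen/Qwen-7B-Chat --server-name 127.0.0.1 --server-port 8000 --inbrowser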
--------------------------------------------------------------------------------