├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yaml │ ├── config.yaml │ └── feature_request.yaml └── workflows │ └── stale.yml ├── .gitignore ├── FAQ.md ├── FAQ_ja.md ├── FAQ_zh.md ├── LICENSE ├── NOTICE ├── QWEN_TECHNICAL_REPORT.pdf ├── README.md ├── README_CN.md ├── README_ES.md ├── README_FR.md ├── README_JA.md ├── Tongyi Qianwen LICENSE AGREEMENT ├── Tongyi Qianwen RESEARCH LICENSE AGREEMENT ├── ascend-support ├── README.md └── docker_qwen.sh ├── assets ├── cli_demo.gif ├── code_interpreter_showcase_001.jpg ├── hfagent_chat_1.png ├── hfagent_chat_2.png ├── hfagent_run.png ├── logo.jpg ├── openai_api.gif ├── performance.png ├── qwen_72b_needle_in_a_haystack.png ├── qwen_tokenizer.png ├── radar_14b.jpg ├── radar_72b.jpg ├── react_showcase_001.png ├── react_showcase_002.png ├── react_tutorial_001.png ├── react_tutorial_002.png ├── system_prompt_behavior_setting.png ├── system_prompt_behavior_setting_en.png ├── system_prompt_language_style.png ├── system_prompt_language_style_en.png ├── system_prompt_role_play.png ├── system_prompt_role_play_en.png ├── system_prompt_task_setting.png ├── system_prompt_task_setting_en.png ├── tokenizer.pdf ├── tokenizer.png ├── wanx_colorful_black.png ├── web_demo.gif └── wechat.png ├── cli_demo.py ├── dcu-support ├── README.md ├── cli_demo.py ├── cli_demo_batch.py ├── model.properties ├── package │ ├── fastllm_pytools │ │ ├── __init__.py │ │ ├── hf_model.py │ │ ├── llm.py │ │ └── torch2flm.py │ └── setup.py ├── qwen2flm.py ├── requirements.txt └── web_demo.py ├── docker ├── Dockerfile ├── Dockerfile-cu114 ├── Dockerfile-cu121 ├── docker_cli_demo.sh ├── docker_openai_api.sh └── docker_web_demo.sh ├── eval ├── EVALUATION.md ├── evaluate_ceval.py ├── evaluate_chat_ceval.py ├── evaluate_chat_gsm8k.py ├── evaluate_chat_humaneval.py ├── evaluate_chat_mmlu.py ├── evaluate_cmmlu.py ├── evaluate_gsm8k.py ├── evaluate_humaneval.py ├── evaluate_mmlu.py ├── evaluate_plugin.py └── gsm8k_prompt.txt ├── examples ├── add_merges.py ├── auto_comments.md ├── auto_comments.py ├── function_call_examples.py ├── function_call_finetune_examples.py ├── langchain_tooluse.ipynb ├── qwen_extra.tiktoken ├── qwen_extra_vocab.txt ├── react_demo.py ├── react_prompt.md ├── system_prompt.md ├── tokenizer_showcase.ipynb ├── transformers_agent.md └── vllm_wrapper.py ├── finetune.py ├── finetune ├── ds_config_zero2.json ├── ds_config_zero3.json ├── finetune_ds.sh ├── finetune_lora_ds.sh ├── finetune_lora_single_gpu.sh ├── finetune_qlora_ds.sh └── finetune_qlora_single_gpu.sh ├── openai_api.py ├── recipes ├── applications │ ├── chatbot │ │ └── qwen_chatbot.ipynb │ ├── domain_finetune │ │ └── qwen_domain_finetune.ipynb │ └── retrieval │ │ └── retrieval.ipynb ├── finetune │ ├── ascend │ │ └── README.md │ ├── deepspeed │ │ ├── finetune_fullparameter_multi_gpu.ipynb │ │ ├── finetune_fullparameter_single_gpu.ipynb │ │ ├── finetune_lora_multi_gpu.ipynb │ │ ├── finetune_lora_single_gpu.ipynb │ │ ├── finetune_qlora_multi_gpu.ipynb │ │ ├── finetune_qlora_single_gpu.ipynb │ │ ├── readme.md │ │ └── requirements.txt │ └── swift │ │ ├── README.md │ │ └── README_CN.md ├── inference │ ├── dashscope │ │ └── README.md │ ├── hf_modelscope │ │ └── README.md │ ├── quantization │ │ └── README.md │ ├── tensorrt │ │ ├── README.md │ │ └── docker │ │ │ └── Dockerfile │ └── vllm │ │ ├── README.md │ │ ├── template_chatml.jinja │ │ └── vllm_wrapper.py ├── quickstart │ └── qwen.ipynb └── tests │ ├── README.md │ ├── __init__.py │ ├── assets │ └── test_sampled_qwen.json │ ├── test_finetune │ └── 
test_finetune_ds.py │ ├── test_inference │ ├── test_inference_api.py │ └── test_inference_vllm_fschat.py │ ├── ut_config.py │ └── utils.py ├── requirements.txt ├── requirements_web_demo.txt ├── run_gptq.py ├── tech_memo.md ├── tokenization_note.md ├── tokenization_note_ja.md ├── tokenization_note_zh.md ├── utils.py └── web_demo.py /.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.so 3 | build 4 | .coverage_* 5 | *.egg-info 6 | *~ 7 | .vscode/ 8 | .idea/ 9 | .git/ 10 | .github/ 11 | .DS_Store 12 | 13 | /private/ 14 | /README-docker.md 15 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yaml: -------------------------------------------------------------------------------- 1 | name: 🐞 Bug 2 | description: 提交错误报告 | File a bug/issue 3 | title: "[BUG] " 4 | labels: [] 5 | body: 6 | - type: checkboxes 7 | attributes: 8 | label: 是否已有关于该错误的issue或讨论? | Is there an existing issue / discussion for this? 9 | description: | 10 | 请先搜索您遇到的错误是否在已有的issues或讨论中提到过。 11 | Please search to see if an issue / discussion already exists for the bug you encountered. 12 | [Issues](https://github.com/QwenLM/Qwen-7B/issues) 13 | [Discussions](https://github.com/QwenLM/Qwen-7B/discussions) 14 | options: 15 | - label: 我已经搜索过已有的issues和讨论 | I have searched the existing issues / discussions 16 | required: true 17 | - type: checkboxes 18 | attributes: 19 | label: 该问题是否在FAQ中有解答? | Is there an existing answer for this in FAQ? 20 | description: | 21 | 请先搜索您遇到的错误是否已在FAQ中有相关解答。 22 | Please search to see if an answer already exists in FAQ for the bug you encountered. 23 | [FAQ-en](https://github.com/QwenLM/Qwen-7B/blob/main/FAQ.md) 24 | [FAQ-zh](https://github.com/QwenLM/Qwen-7B/blob/main/FAQ_zh.md) 25 | options: 26 | - label: 我已经搜索过FAQ | I have searched FAQ 27 | required: true 28 | - type: textarea 29 | attributes: 30 | label: 当前行为 | Current Behavior 31 | description: | 32 | 准确描述遇到的行为。 33 | A concise description of what you're experiencing. 34 | validations: 35 | required: false 36 | - type: textarea 37 | attributes: 38 | label: 期望行为 | Expected Behavior 39 | description: | 40 | 准确描述预期的行为。 41 | A concise description of what you expected to happen. 42 | validations: 43 | required: false 44 | - type: textarea 45 | attributes: 46 | label: 复现方法 | Steps To Reproduce 47 | description: | 48 | 复现当前行为的详细步骤。 49 | Steps to reproduce the behavior. 50 | placeholder: | 51 | 1. In this environment... 52 | 2. With this config... 53 | 3. Run '...' 54 | 4. See error... 55 | validations: 56 | required: false 57 | - type: textarea 58 | attributes: 59 | label: 运行环境 | Environment 60 | description: | 61 | examples: 62 | - **OS**: Ubuntu 20.04 63 | - **Python**: 3.8 64 | - **Transformers**: 4.31.0 65 | - **PyTorch**: 2.0.1 66 | - **CUDA**: 11.4 67 | value: | 68 | - OS: 69 | - Python: 70 | - Transformers: 71 | - PyTorch: 72 | - CUDA (`python -c 'import torch; print(torch.version.cuda)'`): 73 | render: Markdown 74 | validations: 75 | required: false 76 | - type: textarea 77 | attributes: 78 | label: 备注 | Anything else? 79 | description: | 80 | 您可以在这里补充其他关于该问题背景信息的描述、链接或引用等。 81 | 82 | 您可以通过点击高亮此区域然后拖动文件的方式上传图片或日志文件。 83 | 84 | Links? References? Anything that will give us more context about the issue you are encountering! 85 | 86 | Tip: You can attach images or log files by clicking this area to highlight it and then dragging files in. 
87 | validations: 88 | required: false 89 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yaml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yaml: -------------------------------------------------------------------------------- 1 | name: "💡 Feature Request" 2 | description: 创建新功能请求 | Create a new ticket for a new feature request 3 | title: "💡 [REQUEST] - <title>" 4 | labels: [ 5 | "question" 6 | ] 7 | body: 8 | - type: input 9 | id: start_date 10 | attributes: 11 | label: "起始日期 | Start Date" 12 | description: | 13 | 起始开发日期 14 | Start of development 15 | placeholder: "month/day/year" 16 | validations: 17 | required: false 18 | - type: textarea 19 | id: implementation_pr 20 | attributes: 21 | label: "实现PR | Implementation PR" 22 | description: | 23 | 实现该功能的Pull request 24 | Pull request used 25 | placeholder: "#Pull Request ID" 26 | validations: 27 | required: false 28 | - type: textarea 29 | id: reference_issues 30 | attributes: 31 | label: "相关Issues | Reference Issues" 32 | description: | 33 | 与该功能相关的issues 34 | Common issues 35 | placeholder: "#Issues IDs" 36 | validations: 37 | required: false 38 | - type: textarea 39 | id: summary 40 | attributes: 41 | label: "摘要 | Summary" 42 | description: | 43 | 简要描述新功能的特点 44 | Provide a brief explanation of the feature 45 | placeholder: | 46 | Describe in a few lines your feature request 47 | validations: 48 | required: true 49 | - type: textarea 50 | id: basic_example 51 | attributes: 52 | label: "基本示例 | Basic Example" 53 | description: Indicate here some basic examples of your feature. 54 | placeholder: A few specific words about your feature request. 55 | validations: 56 | required: true 57 | - type: textarea 58 | id: drawbacks 59 | attributes: 60 | label: "缺陷 | Drawbacks" 61 | description: | 62 | 该新功能有哪些缺陷/可能造成哪些影响? 63 | What are the drawbacks/impacts of your feature request ? 64 | placeholder: | 65 | Identify the drawbacks and impacts while being neutral on your feature request 66 | validations: 67 | required: true 68 | - type: textarea 69 | id: unresolved_question 70 | attributes: 71 | label: "未解决问题 | Unresolved questions" 72 | description: | 73 | 有哪些尚未解决的问题? 74 | What questions still remain unresolved ? 75 | placeholder: | 76 | Identify any unresolved issues. 77 | validations: 78 | required: false -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: Close stale issues 2 | on: 3 | schedule: 4 | - cron: "0 8 * * *" 5 | 6 | jobs: 7 | close-issues: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | actions: write 11 | issues: write 12 | steps: 13 | - uses: actions/stale@v9 14 | with: 15 | days-before-issue-stale: 30 16 | days-before-issue-close: 7 17 | stale-issue-label: inactive 18 | stale-issue-message: This issue has been automatically marked as inactive due to lack of recent activity. 19 | Should you believe it remains unresolved and warrants attention, kindly leave a comment on this thread. 
20 | 21 | 此问题由于长期未有新进展而被系统自动标记为不活跃。如果您认为它仍有待解决,请在此帖下方留言以补充信息。 22 | days-before-pr-stale: -1 23 | days-before-pr-close: -1 24 | operations-per-run: 128 25 | repo-token: ${{ secrets.GITHUB_TOKEN }} 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.so 3 | build 4 | .coverage_* 5 | *.egg-info 6 | *~ 7 | .vscode/ 8 | .idea/ 9 | .DS_Store 10 | 11 | /private/ 12 | -------------------------------------------------------------------------------- /FAQ.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Installation & Environment 4 | 5 | #### Failure in installing flash attention 6 | 7 | Flash attention is an option for accelerating training and inference. Only NVIDIA GPUs of the Turing, Ampere, Ada, and Hopper architectures, e.g., H100, A100, RTX 3090, T4, and RTX 2080, support flash attention. **You can use our models without installing it.** 8 | 9 | #### Which version of transformers should I use? 10 | 11 | 4.32.0 is preferred. 12 | 13 | #### I downloaded the codes and checkpoints but I can't load the model locally. What should I do? 14 | 15 | Please check that you have updated the code to the latest version and correctly downloaded all the sharded checkpoint files. 16 | 17 | #### `qwen.tiktoken` is not found. What is it? 18 | 19 | This is the merge file of the tokenizer, and you have to download it. Note that if you just git clone the repo without [git-lfs](https://git-lfs.com), this file will not be downloaded. 20 | 21 | #### transformers_stream_generator/tiktoken/accelerate not found 22 | 23 | Run the command `pip install -r requirements.txt`. You can find the file at [https://github.com/QwenLM/Qwen/blob/main/requirements.txt](https://github.com/QwenLM/Qwen/blob/main/requirements.txt). 24 | <br><br> 25 | 26 | 27 | 28 | ## Demo & Inference 29 | 30 | #### Is there any demo? CLI demo and Web UI demo? 31 | 32 | Yes, see `web_demo.py` for the web demo and `cli_demo.py` for the CLI demo. See the README for more information. 33 | 34 | 35 | #### Can I use CPU only? 36 | 37 | Yes, running `python cli_demo.py --cpu-only` will load the model and run inference on CPU only. 38 | 39 | #### Can Qwen support streaming? 40 | 41 | Yes. See the function `chat_stream` in `modeling_qwen.py`. 42 | 43 | #### Gibberish in results when using chat_stream(). 44 | 45 | This is because tokens represent bytes, and a single token may be a meaningless string. We have updated the default setting of our tokenizer to avoid such decoding results. Please update the code to the latest version. 46 | 47 | #### It seems that the generation is not related to the instruction... 48 | 49 | Please check that you are loading Qwen-Chat rather than the base Qwen model. Qwen is the base model without alignment and behaves differently from the SFT/Chat model. 50 | 51 | #### Is quantization supported? 52 | 53 | Yes, quantization is supported via AutoGPTQ. 54 | 55 | 56 | #### Slow when processing long sequences 57 | 58 | Updating the code to the latest version can help. 59 | 60 | #### Unsatisfactory performance in processing long sequences 61 | 62 | Please ensure that NTK is applied. `use_dynamic_ntk` and `use_logn_attn` in `config.json` should be set to `true` (`true` by default). 63 | <br><br> 64 | 65 | 66 | 67 | ## Finetuning 68 | 69 | #### Can Qwen support SFT or even RLHF? 70 | 71 | Yes, we now support SFT, including full-parameter finetuning, LoRA, and Q-LoRA.
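For example, a single-GPU LoRA run can be launched with the bundled script roughly as follows; this is only a sketch, and the `-m`/`-d` flags plus the paths are placeholders that you should verify against `finetune/finetune_lora_single_gpu.sh` and the README before running:

```bash
# Minimal LoRA finetuning sketch (assumed flags; check the script header before use).
# -m: base checkpoint (local path or Hugging Face model id)
# -d: training data in the JSON conversation format described in the README
bash finetune/finetune_lora_single_gpu.sh -m Qwen/Qwen-7B-Chat -d path/to/your_data.json
```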
Also you can check other projects like [FastChat](**[https://github.com/lm-sys/FastChat](https://github.com/lm-sys/FastChat)), [Firefly]([https://github.com/yangjianxin1/Firefly](https://github.com/yangjianxin1/Firefly)), [**LLaMA Efficient Tuning**]([https://github.com/hiyouga/LLaMA-Efficient-Tuning](https://github.com/hiyouga/LLaMA-Efficient-Tuning)), etc. 72 | 73 | However, temporarily we do not support RLHF. We will provide the code in the near future. 74 | <br><br> 75 | 76 | 77 | 78 | ## Tokenizer 79 | 80 | #### bos_id/eos_id/pad_id not found 81 | 82 | In our training, we only use `<|endoftext|>` as the separator and padding token. You can set bos_id, eos_id, and pad_id to tokenizer.eod_id. Learn more about our tokenizer from our documents about the tokenizer. 83 | 84 | 85 | 86 | ## Docker 87 | 88 | #### Download official docker image is very slow 89 | 90 | When downloading our official docker image, you may have a slow download speed due to some network issues. You can refer to [Alibaba Cloud Container Image Service](https://help.aliyun.com/zh/acr/user-guide/accelerate-the-pulls-of-docker-official-images) to accelerate the download of official images. 91 | -------------------------------------------------------------------------------- /FAQ_ja.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## インストールと環境 4 | 5 | #### Flash attention 導入の失敗例 6 | 7 | Flash attention は、トレーニングと推論を加速するオプションです。H100、A100、RTX 3090、T4、RTX 2080 などの Turing、Ampere、Ada、および Hopper アーキテクチャの NVIDIA GPU だけが、flash attention をサポートできます。それをインストールせずに私たちのモデルを使用することができます。 8 | 9 | #### transformers のバージョンは? 10 | 11 | 4.32.0 が望ましいです。 12 | 13 | #### コードとチェックポイントをダウンロードしましたが、モデルをローカルにロードできません。どうすればよいでしょうか? 14 | 15 | コードを最新のものに更新し、すべてのシャードされたチェックポイントファイルを正しくダウンロードしたかどうか確認してください。 16 | 17 | #### `qwen.tiktoken` が見つかりません。これは何ですか? 18 | 19 | これはトークナイザーのマージファイルです。ダウンロードする必要があります。[git-lfs](https://git-lfs.com) を使わずにリポジトリを git clone しただけでは、このファイルをダウンロードできないことに注意してください。 20 | 21 | #### transformers_stream_generator/tiktoken/accelerate が見つかりません。 22 | 23 | コマンド `pip install -r requirements.txt` を実行してください。このファイルは [https://github.com/QwenLM/Qwen/blob/main/requirements.txt](https://github.com/QwenLM/Qwen/blob/main/requirements.txt) にあります。 24 | <br><br> 25 | 26 | 27 | 28 | ## デモと推論 29 | 30 | #### デモはありますか?CLI と Web UI のデモはありますか? 31 | 32 | はい、Web デモは `web_demo.py` を、CLI デモは `cli_demo.py` を参照してください。詳しくは README を参照してください。 33 | 34 | 35 | 36 | #### CPU のみを使うことはできますか? 37 | 38 | はい、`python cli_demo.py --cpu-only` を実行すると、CPU のみでモデルと推論をロードします。 39 | 40 | #### Qwen はストリーミングに対応していますか? 41 | 42 | `modeling_qwen.py` の `chat_stream` 関数を参照してください。 43 | 44 | #### chat_stream() を使用すると、結果に文字化けが発生します。 45 | 46 | これは、トークンがバイトを表し、単一のトークンが無意味な文字列である可能性があるためです。このようなデコード結果を避けるため、トークナイザのデフォルト設定を更新しました。コードを最新版に更新してください。 47 | 48 | #### インストラクションとは関係ないようですが... 49 | 50 | Qwen ではなく Qwen-Chat を読み込んでいないか確認してください。Qwen はアライメントなしのベースモデルで、SFT/Chat モデルとは挙動が異なります。 51 | 52 | #### 量子化はサポートされていますか? 53 | 54 | はい、量子化は AutoGPTQ でサポートされています。 55 | 56 | 57 | #### 長いシーケンスの処理に時間がかかる 58 | 59 | コードを最新版に更新することで解決します。 60 | 61 | #### 長いシーケンスの処理で不満足なパフォーマンス 62 | 63 | NTK が適用されていることを確認してください。`config.json` の `use_dynamc_ntk` と `use_logn_attn` を `true` に設定する必要があります(デフォルトでは `true`)。 64 | <br><br> 65 | 66 | 67 | 68 | ## ファインチューニング 69 | 70 | #### Qwen は SFT、あるいは RLHF に対応できますか? 
71 | 72 | SFTのコードは提供します。[FastChat](**[https://github.com/lm-sys/FastChat](https://github.com/lm-sys/FastChat))、[Firefly]([https://github.com/yangjianxin1/Firefly](https://github.com/yangjianxin1/Firefly))、[**LLaMA Efficient Tuning**]([https://github.com/hiyouga/LLaMA-Efficient-Tuning](https://github.com/hiyouga/LLaMA-Efficient-Tuning))など、いくつかのプロジェクトではファインチューニングをサポートしています。近日中に関連コードを更新する予定です。 73 | <br><br> 74 | 75 | 76 | 77 | ## トークナイザー 78 | 79 | #### bos_id/eos_id/pad_id が見つかりません。 80 | 81 | 私たちのトレーニングでは、セパレータとパディングトークンとして `<|endoftext|>` のみを使用しています。bos_id、eos_id、pad_id は tokenizer.eod_id に設定できます。私たちのトークナイザーについて詳しくは、トークナイザーについてのドキュメントをご覧ください。 82 | 83 | -------------------------------------------------------------------------------- /FAQ_zh.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## 安装&环境 4 | 5 | #### flash attention 安装失败 6 | 7 | flash attention是一个用于加速模型训练推理的可选项,且仅适用于Turing、Ampere、Ada、Hopper架构的Nvidia GPU显卡(如H100、A100、RTX 3090、T4、RTX 2080),您可以在不安装flash attention的情况下正常使用模型进行推理。 8 | 9 | #### 我应该用哪个transformers版本? 10 | 11 | 建议使用4.32.0。 12 | 13 | #### 我把模型和代码下到本地,按照教程无法使用,该怎么办? 14 | 15 | 答:别着急,先检查你的代码是不是更新到最新版本,然后确认你是否完整地将模型checkpoint下到本地。 16 | 17 | #### `qwen.tiktoken`这个文件找不到,怎么办? 18 | 19 | 这个是我们的tokenizer的merge文件,你必须下载它才能使用我们的tokenizer。注意,如果你使用git clone却没有使用git-lfs,这个文件不会被下载。如果你不了解git-lfs,可点击[官网](https://git-lfs.com/)了解。 20 | 21 | #### transformers_stream_generator/tiktoken/accelerate,这几个库提示找不到,怎么办? 22 | 23 | 运行如下命令:`pip install -r requirements.txt`。相关依赖库在[https://github.com/QwenLM/Qwen-7B/blob/main/requirements.txt](https://github.com/QwenLM/Qwen/blob/main/requirements.txt) 可以找到。 24 | <br><br> 25 | 26 | 27 | ## Demo & 推理 28 | 29 | #### 是否提供Demo?CLI Demo及Web UI Demo? 30 | 31 | `web_demo.py`和`cli_demo.py`分别提供了Web UI以及CLI的Demo。请查看README相关内容了解更多。 32 | 33 | #### 我没有GPU,只用CPU运行CLI demo可以吗? 34 | 35 | 可以的,运行`python cli_demo.py --cpu-only`命令即可将模型读取到CPU并使用CPU进行推理。 36 | 37 | #### Qwen支持流式推理吗? 38 | 39 | Qwen当前支持流式推理。见位于`modeling_qwen.py`的`chat_stream`函数。 40 | 41 | #### 使用`chat_stream()`生成混乱的内容及乱码,为什么? 42 | 43 | 这是由于模型生成过程中输出的部分token需要与后续token一起解码才能输出正常文本,单个token解码结果是无意义字符串,我们已经更新了tokenizer解码时的默认设置,避免这些字符串在生成结果中出现,如果仍有类似问题请更新模型至最新版本。 44 | 45 | #### 模型的输出看起来与输入无关/没有遵循指令/看起来呆呆的 46 | 47 | 请检查是否加载的是Qwen-Chat模型进行推理,Qwen模型是未经align的预训练基模型,不期望具备响应用户指令的能力。我们在模型最新版本已经对`chat`及`chat_stream`接口内进行了检查,避免您误将预训练模型作为SFT/Chat模型使用。 48 | 49 | #### 是否有量化版本模型 50 | 51 | 目前Qwen支持基于AutoGPTQ的4-bit的量化推理。 52 | 53 | #### 生成序列较长后速度显著变慢 54 | 55 | 请更新到最新代码。 56 | 57 | #### 处理长序列时效果有问题 58 | 59 | 请确认是否开启ntk。若要启用这些技巧,请将`config.json`里的`use_dynamc_ntk`和`use_logn_attn`设置为`true`。最新代码默认为`true`。 60 | <br><br> 61 | 62 | 63 | ## 微调 64 | 65 | #### 当前是否支持SFT和RLHF? 66 | 67 | 我们目前提供了SFT的代码,支持全参数微调、LoRA和Q-LoRA。此外,当前有多个外部项目也已实现支持,如[FastChat](**[https://github.com/lm-sys/FastChat](https://github.com/lm-sys/FastChat))、[Firefly]([https://github.com/yangjianxin1/Firefly](https://github.com/yangjianxin1/Firefly))、[**LLaMA Efficient Tuning**]([https://github.com/hiyouga/LLaMA-Efficient-Tuning](https://github.com/hiyouga/LLaMA-Efficient-Tuning))等。我们会尽快更新这部分代码和说明。 68 | 69 | 我们还没提供对RLHF训练的支持,敬请期待。 70 | <br><br> 71 | 72 | 73 | ## Tokenizer 74 | 75 | #### bos_id/eos_id/pad_id,这些token id不存在,为什么? 
76 | 77 | 在训练过程中,我们仅使用<|endoftext|>这一token作为sample/document之间的分隔符及padding位置占位符,你可以将bos_id, eos_id, pad_id均指向tokenizer.eod_id。请阅读我们关于tokenizer的文档,了解如何设置这些id。 78 | 79 | 80 | ## Docker 81 | 82 | #### 下载官方Docker镜像速度很慢 83 | 84 | 在下载官方镜像时,您可能由于某些网络原因导致下载速度变慢。可以参考[阿里云容器镜像服务](https://help.aliyun.com/zh/acr/user-guide/accelerate-the-pulls-of-docker-official-images)加速官方镜像的下载。 -------------------------------------------------------------------------------- /QWEN_TECHNICAL_REPORT.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/QWEN_TECHNICAL_REPORT.pdf -------------------------------------------------------------------------------- /Tongyi Qianwen LICENSE AGREEMENT: -------------------------------------------------------------------------------- 1 | Tongyi Qianwen LICENSE AGREEMENT 2 | 3 | Tongyi Qianwen Release Date: August 3, 2023 4 | 5 | By clicking to agree or by using or distributing any portion or element of the Tongyi Qianwen Materials, you will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately. 6 | 7 | 1. Definitions 8 | a. This Tongyi Qianwen LICENSE AGREEMENT (this "Agreement") shall mean the terms and conditions for use, reproduction, distribution and modification of the Materials as defined by this Agreement. 9 | b. "We"(or "Us") shall mean Alibaba Cloud. 10 | c. "You" (or "Your") shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Materials for any purpose and in any field of use. 11 | d. "Third Parties" shall mean individuals or legal entities that are not under common control with Us or You. 12 | e. "Tongyi Qianwen" shall mean the large language models (including Qwen model and Qwen-Chat model), and software and algorithms, consisting of trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Us. 13 | f. "Materials" shall mean, collectively, Alibaba Cloud's proprietary Tongyi Qianwen and Documentation (and any portion thereof) made available under this Agreement. 14 | g. "Source" form shall mean the preferred form for making modifications, including but not limited to model source code, documentation source, and configuration files. 15 | h. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, 16 | and conversions to other media types. 17 | 18 | 2. Grant of Rights 19 | You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Alibaba Cloud's intellectual property or other rights owned by Us embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials. 20 | 21 | 3. Redistribution 22 | You may reproduce and distribute copies of the Materials or derivative works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 23 | a. You shall give any other recipients of the Materials or derivative works a copy of this Agreement; 24 | b. You shall cause any modified files to carry prominent notices stating that You changed the files; 25 | c. 
You shall retain in all copies of the Materials that You distribute the following attribution notices within a "Notice" text file distributed as a part of such copies: "Tongyi Qianwen is licensed under the Tongyi Qianwen LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved."; and 26 | d. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such derivative works as a whole, provided Your use, reproduction, and distribution of the work otherwise complies with the terms and conditions of this Agreement. 27 | 28 | 4. Restrictions 29 | If you are commercially using the Materials, and your product or service has more than 100 million monthly active users, You shall request a license from Us. You cannot exercise your rights under this Agreement without our express authorization. 30 | 31 | 5. Rules of use 32 | a. The Materials may be subject to export controls or restrictions in China, the United States or other countries or regions. You shall comply with applicable laws and regulations in your use of the Materials. 33 | b. You can not use the Materials or any output therefrom to improve any other large language model (excluding Tongyi Qianwen or derivative works thereof). 34 | 35 | 6. Intellectual Property 36 | a. We retain ownership of all intellectual property rights in and to the Materials and derivatives made by or for Us. Conditioned upon compliance with the terms and conditions of this Agreement, with respect to any derivative works and modifications of the Materials that are made by you, you are and will be the owner of such derivative works and modifications. 37 | b. No trademark license is granted to use the trade names, trademarks, service marks, or product names of Us, except as required to fulfill notice requirements under this Agreement or as required for reasonable and customary use in describing and redistributing the Materials. 38 | c. If you commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any entity alleging that the Materials or any output therefrom, or any part of the foregoing, infringe any intellectual property or other right owned or licensable by you, then all licences granted to you under this Agreement shall terminate as of the date such lawsuit or other proceeding is commenced or brought. 39 | 40 | 7. Disclaimer of Warranty and Limitation of Liability 41 | 42 | a. We are not obligated to support, update, provide training for, or develop any further version of the Tongyi Qianwen Materials or to grant any license thereto. 43 | b. THE MATERIALS ARE PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. WE MAKE NO WARRANTY AND ASSUME NO RESPONSIBILITY FOR THE SAFETY OR STABILITY OF THE MATERIALS AND ANY OUTPUT THEREFROM. 44 | c. IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MATERIALS OR ANY OUTPUT OF IT, NO MATTER HOW IT’S CAUSED. 45 | d. You will defend, indemnify and hold harmless Us from and against any claim by any third party arising out of or related to your use or distribution of the Materials. 46 | 47 | 8. Survival and Termination. 48 | a. 
The term of this Agreement shall commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. 49 | b. We may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you must delete and cease use of the Materials. Sections 7 and 9 shall survive the termination of this Agreement. 50 | 51 | 9. Governing Law and Jurisdiction. 52 | a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. 53 | b. The People's Courts in Hangzhou City shall have exclusive jurisdiction over any dispute arising out of this Agreement. -------------------------------------------------------------------------------- /Tongyi Qianwen RESEARCH LICENSE AGREEMENT: -------------------------------------------------------------------------------- 1 | Tongyi Qianwen RESEARCH LICENSE AGREEMENT 2 | 3 | Tongyi Qianwen Release Date: November 30, 2023 4 | 5 | By clicking to agree or by using or distributing any portion or element of the Tongyi Qianwen Materials, you will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately. 6 | 7 | 1. Definitions 8 | a. This Tongyi Qianwen RESEARCH LICENSE AGREEMENT (this "Agreement") shall mean the terms and conditions for use, reproduction, distribution and modification of the Materials as defined by this Agreement. 9 | b. "We"(or "Us") shall mean Alibaba Cloud. 10 | c. "You" (or "Your") shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Materials for any purpose and in any field of use. 11 | d. "Third Parties" shall mean individuals or legal entities that are not under common control with Us or You. 12 | e. "Tongyi Qianwen" shall mean the large language models, and software and algorithms, consisting of trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Us. 13 | f. "Materials" shall mean, collectively, Alibaba Cloud's proprietary Tongyi Qianwen and Documentation (and any portion thereof) made available under this Agreement. 14 | g. "Source" form shall mean the preferred form for making modifications, including but not limited to model source code, documentation source, and configuration files. 15 | h. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, 16 | and conversions to other media types. 17 | i. "Non-Commercial" shall mean for research or evaluation purposes only. 18 | 19 | 2. Grant of Rights 20 | a. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Alibaba Cloud's intellectual property or other rights owned by Us embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials FOR NON-COMMERCIAL PURPOSES ONLY. 21 | b. If you are commercially using the Materials, You shall request a license from Us. 22 | 23 | 3. 
Redistribution 24 | You may reproduce and distribute copies of the Materials or derivative works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 25 | a. You shall give any other recipients of the Materials or derivative works a copy of this Agreement; 26 | b. You shall cause any modified files to carry prominent notices stating that You changed the files; 27 | c. You shall retain in all copies of the Materials that You distribute the following attribution notices within a "Notice" text file distributed as a part of such copies: "Tongyi Qianwen is licensed under the Tongyi Qianwen RESEARCH LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved."; and 28 | d. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such derivative works as a whole, provided Your use, reproduction, and distribution of the work otherwise complies with the terms and conditions of this Agreement. 29 | 30 | 4. Rules of use 31 | a. The Materials may be subject to export controls or restrictions in China, the United States or other countries or regions. You shall comply with applicable laws and regulations in your use of the Materials. 32 | b. You can not use the Materials or any output therefrom to improve any other large language model (excluding Tongyi Qianwen or derivative works thereof). 33 | 34 | 5. Intellectual Property 35 | a. We retain ownership of all intellectual property rights in and to the Materials and derivatives made by or for Us. Conditioned upon compliance with the terms and conditions of this Agreement, with respect to any derivative works and modifications of the Materials that are made by you, you are and will be the owner of such derivative works and modifications. 36 | b. No trademark license is granted to use the trade names, trademarks, service marks, or product names of Us, except as required to fulfill notice requirements under this Agreement or as required for reasonable and customary use in describing and redistributing the Materials. 37 | c. If you commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any entity alleging that the Materials or any output therefrom, or any part of the foregoing, infringe any intellectual property or other right owned or licensable by you, then all licences granted to you under this Agreement shall terminate as of the date such lawsuit or other proceeding is commenced or brought. 38 | 39 | 6. Disclaimer of Warranty and Limitation of Liability 40 | a. We are not obligated to support, update, provide training for, or develop any further version of the Tongyi Qianwen Materials or to grant any license thereto. 41 | b. THE MATERIALS ARE PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. WE MAKE NO WARRANTY AND ASSUME NO RESPONSIBILITY FOR THE SAFETY OR STABILITY OF THE MATERIALS AND ANY OUTPUT THEREFROM. 42 | c. IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MATERIALS OR ANY OUTPUT OF IT, NO MATTER HOW IT’S CAUSED. 43 | d. 
You will defend, indemnify and hold harmless Us from and against any claim by any third party arising out of or related to your use or distribution of the Materials. 44 | 45 | 7. Survival and Termination. 46 | a. The term of this Agreement shall commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. 47 | b. We may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you must delete and cease use of the Materials. Sections 6 and 8 shall survive the termination of this Agreement. 48 | 49 | 8. Governing Law and Jurisdiction. 50 | a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. 51 | b. The People's Courts in Hangzhou City shall have exclusive jurisdiction over any dispute arising out of this Agreement. 52 | 53 | 9. Other Terms and Conditions. 54 | a. Any arrangements, understandings, or agreements regarding the Material not stated herein are separate from and independent of the terms and conditions of this Agreement. You shall request a seperate license from Us, if You use the Materials in ways not expressly agreed to in this Agreement. 55 | b. We shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. 56 | -------------------------------------------------------------------------------- /ascend-support/README.md: -------------------------------------------------------------------------------- 1 | # 昇腾910架构基于mindformers推理Qwen-7B-Chat模型 2 | 3 | ## 环境要求 4 | 5 | - 硬件:Ascend 910A/B 6 | 7 | ## 运行步骤 8 | 9 | 首先参考Qwen README下载官方模型到`/path/to/Qwen-7B-Chat`。 10 | 11 | ### 下载并启动镜像 12 | 13 | ```bash 14 | docker pull qwenllm/qwen-mindspore:latest 15 | 16 | cd /path/to/Qwen/ascend-support 17 | 18 | # 下载模型到此处 19 | CHECKPOINT_PATH=/path/to/Qwen-7B-Chat 20 | 21 | cd ascend-support 22 | 23 | # 启动docker容器 24 | bash docker_qwen.sh -c ${CHECKPOINT_PATH} 25 | ``` 26 | 27 | ### 执行权重转换 28 | 29 | 在容器内执行下面的命令,将Qwen模型转换为适配`mindformers`的格式: 30 | 31 | ```bash 32 | python3 /data/qwen/mindformers/research/qwen/convert_weight.py 33 | ``` 34 | 35 | 转换后模型的输出位置为`${CHECKPOINT_PATH}/qwen-7b-chat.ckpt`。 36 | 37 | ### 执行推理 38 | 39 | 在容器内执行下面的命令,进行推理: 40 | 41 | ```bash 42 | cd /data/qwen/mindformers/research/qwen 43 | export PYTHONPATH=/data/qwen/mindformers:$PYTHONPATH 44 | python3 infer_qwen.py 45 | ``` 46 | -------------------------------------------------------------------------------- /ascend-support/docker_qwen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMAGE_NAME=qwenllm/qwen-mindspore:v23.0.RC3 4 | CONTAINER_NAME=qwen-mindspore 5 | CHECKPOINT_PATH='NOT_SET' 6 | 7 | DOCKER_CHECKPOINT_PATH=/data/qwen/models/Qwen-7B-Chat 8 | 9 | function usage() { 10 | echo ' 11 | Usage: bash ascend-support/docker_qwen.sh [-i IMAGE_NAME] -c [/path/to/Qwen-7B-Chat] [-n CONTAINER_NAME] 12 | ' 13 | } 14 | 15 | while [[ "$1" != "" ]]; do 16 | case $1 in 17 | -i | --image ) 18 | shift 19 | IMAGE_NAME=$1 20 | ;; 21 | -c | --checkpoint ) 22 | shift 23 | CHECKPOINT_PATH=$1 24 | ;; 25 | -n | --name ) 26 | shift 27 | CONTAINER_NAME=$1 28 | ;; 29 | -h ) 30 | usage 31 | exit 32 | ;; 33 | * ) 34 | echo "Unknown argument ${1}" 35 | 
exit 1 36 | ;; 37 | esac 38 | shift 39 | done 40 | 41 | docker run -it --rm -u root --network=host --ipc=host \ 42 | --device=/dev/davinci0 \ 43 | --device=/dev/davinci1 \ 44 | --device=/dev/davinci2 \ 45 | --device=/dev/davinci3 \ 46 | --device=/dev/davinci4 \ 47 | --device=/dev/davinci5 \ 48 | --device=/dev/davinci6 \ 49 | --device=/dev/davinci7 \ 50 | --name=${CONTAINER_NAME} \ 51 | --device=/dev/davinci_manager \ 52 | --device=/dev/devmm_svm \ 53 | --device=/dev/hisi_hdc \ 54 | -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ 55 | -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \ 56 | -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ 57 | -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \ 58 | -v /etc/ascend_install.info:/etc/ascend_install.info \ 59 | -v ${CHECKPOINT_PATH}:${DOCKER_CHECKPOINT_PATH} \ 60 | -v /var/log/npu/:/usr/slog \ 61 | ${IMAGE_NAME} /bin/bash 62 | -------------------------------------------------------------------------------- /assets/cli_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/cli_demo.gif -------------------------------------------------------------------------------- /assets/code_interpreter_showcase_001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/code_interpreter_showcase_001.jpg -------------------------------------------------------------------------------- /assets/hfagent_chat_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/hfagent_chat_1.png -------------------------------------------------------------------------------- /assets/hfagent_chat_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/hfagent_chat_2.png -------------------------------------------------------------------------------- /assets/hfagent_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/hfagent_run.png -------------------------------------------------------------------------------- /assets/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/logo.jpg -------------------------------------------------------------------------------- /assets/openai_api.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/openai_api.gif -------------------------------------------------------------------------------- /assets/performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/performance.png -------------------------------------------------------------------------------- /assets/qwen_72b_needle_in_a_haystack.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/qwen_72b_needle_in_a_haystack.png -------------------------------------------------------------------------------- /assets/qwen_tokenizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/qwen_tokenizer.png -------------------------------------------------------------------------------- /assets/radar_14b.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/radar_14b.jpg -------------------------------------------------------------------------------- /assets/radar_72b.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/radar_72b.jpg -------------------------------------------------------------------------------- /assets/react_showcase_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/react_showcase_001.png -------------------------------------------------------------------------------- /assets/react_showcase_002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/react_showcase_002.png -------------------------------------------------------------------------------- /assets/react_tutorial_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/react_tutorial_001.png -------------------------------------------------------------------------------- /assets/react_tutorial_002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/react_tutorial_002.png -------------------------------------------------------------------------------- /assets/system_prompt_behavior_setting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_behavior_setting.png -------------------------------------------------------------------------------- /assets/system_prompt_behavior_setting_en.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_behavior_setting_en.png -------------------------------------------------------------------------------- /assets/system_prompt_language_style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_language_style.png -------------------------------------------------------------------------------- /assets/system_prompt_language_style_en.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_language_style_en.png -------------------------------------------------------------------------------- /assets/system_prompt_role_play.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_role_play.png -------------------------------------------------------------------------------- /assets/system_prompt_role_play_en.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_role_play_en.png -------------------------------------------------------------------------------- /assets/system_prompt_task_setting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_task_setting.png -------------------------------------------------------------------------------- /assets/system_prompt_task_setting_en.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/system_prompt_task_setting_en.png -------------------------------------------------------------------------------- /assets/tokenizer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/tokenizer.pdf -------------------------------------------------------------------------------- /assets/tokenizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/tokenizer.png -------------------------------------------------------------------------------- /assets/wanx_colorful_black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/wanx_colorful_black.png -------------------------------------------------------------------------------- /assets/web_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/web_demo.gif -------------------------------------------------------------------------------- /assets/wechat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/assets/wechat.png -------------------------------------------------------------------------------- /cli_demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba Cloud. 2 | # 3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 
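# Usage: python cli_demo.py [-c CHECKPOINT_PATH] [-s SEED] [--cpu-only] (see the argparse setup in main() below).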
5 | 6 | """A simple command-line interactive chat demo.""" 7 | 8 | import argparse 9 | import os 10 | import platform 11 | import shutil 12 | from copy import deepcopy 13 | 14 | import torch 15 | from transformers import AutoModelForCausalLM, AutoTokenizer 16 | from transformers.generation import GenerationConfig 17 | from transformers.trainer_utils import set_seed 18 | 19 | DEFAULT_CKPT_PATH = 'Qwen/Qwen-7B-Chat' 20 | 21 | _WELCOME_MSG = '''\ 22 | Welcome to use Qwen-Chat model, type text to start chat, type :h to show command help. 23 | (欢迎使用 Qwen-Chat 模型,输入内容即可进行对话,:h 显示命令帮助。) 24 | 25 | Note: This demo is governed by the original license of Qwen. 26 | We strongly advise users not to knowingly generate or allow others to knowingly generate harmful content, including hate speech, violence, pornography, deception, etc. 27 | (注:本演示受Qwen的许可协议限制。我们强烈建议,用户不应传播及不应允许他人传播以下内容,包括但不限于仇恨言论、暴力、色情、欺诈相关的有害信息。) 28 | ''' 29 | _HELP_MSG = '''\ 30 | Commands: 31 | :help / :h Show this help message 显示帮助信息 32 | :exit / :quit / :q Exit the demo 退出Demo 33 | :clear / :cl Clear screen 清屏 34 | :clear-his / :clh Clear history 清除对话历史 35 | :history / :his Show history 显示对话历史 36 | :seed Show current random seed 显示当前随机种子 37 | :seed <N> Set random seed to <N> 设置随机种子 38 | :conf Show current generation config 显示生成配置 39 | :conf <key>=<value> Change generation config 修改生成配置 40 | :reset-conf Reset generation config 重置生成配置 41 | ''' 42 | 43 | 44 | def _load_model_tokenizer(args): 45 | tokenizer = AutoTokenizer.from_pretrained( 46 | args.checkpoint_path, trust_remote_code=True, resume_download=True, 47 | ) 48 | 49 | if args.cpu_only: 50 | device_map = "cpu" 51 | else: 52 | device_map = "auto" 53 | 54 | model = AutoModelForCausalLM.from_pretrained( 55 | args.checkpoint_path, 56 | device_map=device_map, 57 | trust_remote_code=True, 58 | resume_download=True, 59 | ).eval() 60 | 61 | config = GenerationConfig.from_pretrained( 62 | args.checkpoint_path, trust_remote_code=True, resume_download=True, 63 | ) 64 | 65 | return model, tokenizer, config 66 | 67 | 68 | def _gc(): 69 | import gc 70 | gc.collect() 71 | if torch.cuda.is_available(): 72 | torch.cuda.empty_cache() 73 | 74 | 75 | def _clear_screen(): 76 | if platform.system() == "Windows": 77 | os.system("cls") 78 | else: 79 | os.system("clear") 80 | 81 | 82 | def _print_history(history): 83 | terminal_width = shutil.get_terminal_size()[0] 84 | print(f'History ({len(history)})'.center(terminal_width, '=')) 85 | for index, (query, response) in enumerate(history): 86 | print(f'User[{index}]: {query}') 87 | print(f'QWen[{index}]: {response}') 88 | print('=' * terminal_width) 89 | 90 | 91 | def _get_input() -> str: 92 | while True: 93 | try: 94 | message = input('User> ').strip() 95 | except UnicodeDecodeError: 96 | print('[ERROR] Encoding error in input') 97 | continue 98 | except KeyboardInterrupt: 99 | exit(1) 100 | if message: 101 | return message 102 | print('[ERROR] Query is empty') 103 | 104 | 105 | def main(): 106 | parser = argparse.ArgumentParser( 107 | description='QWen-Chat command-line interactive chat demo.') 108 | parser.add_argument("-c", "--checkpoint-path", type=str, default=DEFAULT_CKPT_PATH, 109 | help="Checkpoint name or path, default to %(default)r") 110 | parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed") 111 | parser.add_argument("--cpu-only", action="store_true", help="Run demo with CPU only") 112 | args = parser.parse_args() 113 | 114 | history, response = [], '' 115 | 116 | model, tokenizer, config = _load_model_tokenizer(args) 
117 | orig_gen_config = deepcopy(model.generation_config) 118 | 119 | _clear_screen() 120 | print(_WELCOME_MSG) 121 | 122 | seed = args.seed 123 | 124 | while True: 125 | query = _get_input() 126 | 127 | # Process commands. 128 | if query.startswith(':'): 129 | command_words = query[1:].strip().split() 130 | if not command_words: 131 | command = '' 132 | else: 133 | command = command_words[0] 134 | 135 | if command in ['exit', 'quit', 'q']: 136 | break 137 | elif command in ['clear', 'cl']: 138 | _clear_screen() 139 | print(_WELCOME_MSG) 140 | _gc() 141 | continue 142 | elif command in ['clear-his', 'clh']: 143 | print(f'[INFO] All {len(history)} history cleared') 144 | history.clear() 145 | _gc() 146 | continue 147 | elif command in ['help', 'h']: 148 | print(_HELP_MSG) 149 | continue 150 | elif command in ['history', 'his']: 151 | _print_history(history) 152 | continue 153 | elif command in ['seed']: 154 | if len(command_words) == 1: 155 | print(f'[INFO] Current random seed: {seed}') 156 | continue 157 | else: 158 | new_seed_s = command_words[1] 159 | try: 160 | new_seed = int(new_seed_s) 161 | except ValueError: 162 | print(f'[WARNING] Fail to change random seed: {new_seed_s!r} is not a valid number') 163 | else: 164 | print(f'[INFO] Random seed changed to {new_seed}') 165 | seed = new_seed 166 | continue 167 | elif command in ['conf']: 168 | if len(command_words) == 1: 169 | print(model.generation_config) 170 | else: 171 | for key_value_pairs_str in command_words[1:]: 172 | eq_idx = key_value_pairs_str.find('=') 173 | if eq_idx == -1: 174 | print('[WARNING] format: <key>=<value>') 175 | continue 176 | conf_key, conf_value_str = key_value_pairs_str[:eq_idx], key_value_pairs_str[eq_idx + 1:] 177 | try: 178 | conf_value = eval(conf_value_str) 179 | except Exception as e: 180 | print(e) 181 | continue 182 | else: 183 | print(f'[INFO] Change config: model.generation_config.{conf_key} = {conf_value}') 184 | setattr(model.generation_config, conf_key, conf_value) 185 | continue 186 | elif command in ['reset-conf']: 187 | print('[INFO] Reset generation config') 188 | model.generation_config = deepcopy(orig_gen_config) 189 | print(model.generation_config) 190 | continue 191 | else: 192 | # As normal query. 193 | pass 194 | 195 | # Run chat. 
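# Re-apply the seed before each turn so that sampling is reproducible for a fixed seed.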
196 | set_seed(seed) 197 | try: 198 | for response in model.chat_stream(tokenizer, query, history=history, generation_config=config): 199 | _clear_screen() 200 | print(f"\nUser: {query}") 201 | print(f"\nQwen-Chat: {response}") 202 | except KeyboardInterrupt: 203 | print('[WARNING] Generation interrupted') 204 | continue 205 | 206 | history.append((query, response)) 207 | 208 | 209 | if __name__ == "__main__": 210 | main() 211 | -------------------------------------------------------------------------------- /dcu-support/README.md: -------------------------------------------------------------------------------- 1 | # DCU 架构基于 fastllm 推理 Qwen 模型 2 | 3 | 4 | ## 环境配置 5 | 6 | ### 环境准备 7 | 8 | ``` 9 | docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest 10 | ``` 11 | 12 | ### 容器启动 13 | 14 | 根据如下命令启动推理容器,其中需自定义一个容器名<container_name>,<project_path>即为本目录的路径: 15 | ``` 16 | # <container_name> 自定义容器名 17 | # <project_path> 当前工程所在路径 18 | docker run -it --name=<container_name> -v <project_path>:/work --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=16G --group-add 39 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest /bin/bash 19 | ``` 20 | 21 | ### 加载环境 22 | 23 | 进入容器后执行如下命令,加载运行环境变量 24 | 25 | ``` 26 | source /opt/dtk-23.04/cuda/env.sh 27 | ``` 28 | 29 | ### 安装方法 30 | 31 | ``` 32 | #进入本工程目录 33 | cd package 34 | python setup.py install 35 | ``` 36 | 37 | ## 推理 38 | 39 | ### 模型转换 40 | 41 | 首先参考Qwen README下载官方模型,并通过如下方式将模型转换为 fastllm 用于推理的形式: 42 | 43 | - 通过`pip install -r requirements.txt`安装模型转换所需依赖 44 | 45 | - 如果使用已经下载完成的模型或者自己finetune的模型需要修改qwen2flm.py文件中创建tokenizer, model时的模型存放路径 46 | 47 | ``` 48 | # 在本工程目录下执行: 49 | python3 qwen2flm.py qwen-7b-fp16.bin float16 # 导出fp16模型,参数为导出的模型路径 50 | ``` 51 | 52 | 53 | ### 模型推理 54 | 55 | ``` 56 | # 命令行聊天程序,使用了模型创建以及流式对话效果 57 | python cli_demo.py -p qwen-7b-fp16.bin 58 | 59 | # batch推理程序 60 | python cli_demo_batch.py -p qwen-7b-fp16.bin 61 | 62 | # 简易webui,需要先安装streamlit-chat 63 | streamlit run web_demo.py qwen-7b-fp16.bin 64 | ``` 65 | -------------------------------------------------------------------------------- /dcu-support/cli_demo.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import argparse 3 | from fastllm_pytools import llm 4 | 5 | def args_parser(): 6 | parser = argparse.ArgumentParser(description = 'qwen_chat_demo') 7 | parser.add_argument('-p', '--path', type = str, required = True, default = '', help = '模型文件的路径') 8 | args = parser.parse_args() 9 | return args 10 | 11 | if __name__ == "__main__": 12 | args = args_parser() 13 | model = llm.model(args.path) 14 | 15 | history = [] 16 | print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序") 17 | while True: 18 | query = input("\n用户:") 19 | if query.strip() == "stop": 20 | break 21 | if query.strip() == "clear": 22 | history = [] 23 | print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序") 24 | continue 25 | print("AI:", end = "") 26 | curResponse = "" 27 | for response in model.stream_response(query, history = history, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0): 28 | curResponse += response 29 | print(response, flush = True, end = "") 30 | history.append((query, curResponse)) -------------------------------------------------------------------------------- /dcu-support/cli_demo_batch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from 
fastllm_pytools import llm 3 | import time 4 | 5 | def args_parser(): 6 | parser = argparse.ArgumentParser(description = 'fastllm_chat_demo') 7 | parser.add_argument('-p', '--path', type = str, required = True, default = '', help = '模型文件的路径') 8 | args = parser.parse_args() 9 | return args 10 | 11 | if __name__ == "__main__": 12 | args = args_parser() 13 | 14 | model_path = args.path 15 | 16 | prompts = ["深圳有什么好玩的", "上海有什么好玩的", "晚上睡不着怎么办", "南京有什么好吃的"] * 2 17 | print(prompts) 18 | 19 | responses, historys = [], [] 20 | 21 | model = llm.model(model_path) 22 | 23 | t0 = time.time() 24 | responses, historys = model.response_batch(prompts) 25 | t1 = time.time() 26 | 27 | token_output_count = 0 28 | word_len = 0 29 | for i, res in enumerate(responses): 30 | tokens = model.tokenizer_encode_string(res) 31 | token_output_count += len(tokens) 32 | word_len += len(res) 33 | 34 | print("batch index: ", i) 35 | print(res) 36 | print("") 37 | 38 | print("\ntoken/s: {:.2f}, character/s: {:.2f}".format(token_output_count/(t1-t0), word_len/(t1-t0))) 39 | 40 | -------------------------------------------------------------------------------- /dcu-support/model.properties: -------------------------------------------------------------------------------- 1 | # 模型唯一标识 2 | modelCode = 411 3 | # 模型名称 4 | modelName=qwen-7b_fastllm 5 | # 模型描述 6 | modelDescription=qwen-7b是阿里云研发的通义千问大模型系列的70亿参数规模的模型 7 | # 应用场景 8 | appScenario=推理,对话问答,医疗,科研,金融,教育 9 | # 框架类型 10 | frameType=fastllm 11 | -------------------------------------------------------------------------------- /dcu-support/package/fastllm_pytools/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["llm"] -------------------------------------------------------------------------------- /dcu-support/package/fastllm_pytools/hf_model.py: -------------------------------------------------------------------------------- 1 | from fastllm_pytools import llm; 2 | import torch; 3 | import ctypes; 4 | import numpy as np; 5 | 6 | fastllm_data_type_dict = { 7 | "int4": 8, 8 | "int8": 3, 9 | "float16": 7 10 | } 11 | fastllm_weight_type_dict = { 12 | "linear": 1, 13 | "embedding": 2, 14 | "QuantizedLinear": 111 15 | } 16 | 17 | def create(model, 18 | tokenizer = None, 19 | pre_prompt = None, 20 | user_role = None, 21 | bot_role = None, 22 | history_sep = None, 23 | dtype = "float16"): 24 | if (dtype not in fastllm_data_type_dict): 25 | print("dtype should in ", list(fastllm_data_type_dict.keys())); 26 | exit(0); 27 | 28 | # 0.1 model info 29 | if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2": 30 | model.config.model_type = "chatglm3" 31 | modelInfo = model.config.__dict__ 32 | if model.generation_config is not None: 33 | modelInfo.update(model.generation_config.__dict__) 34 | if (pre_prompt): 35 | modelInfo["pre_prompt"] = pre_prompt; 36 | if (user_role): 37 | modelInfo["user_role"] = user_role; 38 | if (bot_role): 39 | modelInfo["bot_role"] = bot_role; 40 | if (history_sep): 41 | modelInfo["history_sep"] = history_sep; 42 | if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")): 43 | # Baichuan 2代 44 | modelInfo["use_alibi"] = "1"; 45 | modelInfo["pre_prompt"] = ""; 46 | modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""; 47 | modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + 
">") if hasattr(model.generation_config, "assistant_token_id") else ""; 48 | modelInfo["history_sep"] = ""; 49 | if (modelInfo["model_type"] == "qwen"): 50 | if modelInfo["chat_format"] == "chatml": 51 | modelInfo["im_end_id"] = tokenizer.im_end_id 52 | modelInfo["im_start_id"] = tokenizer.im_start_id 53 | 54 | 55 | weight_type_dict = {}; 56 | module_dict = {}; 57 | weight_bits = {}; 58 | for key, m in model.named_modules(): 59 | if (str(type(m)).find("QuantizedLinear") != -1): 60 | weight_type_dict[key + ".weight"] = "QuantizedLinear"; 61 | weight_bits[key + ".weight"] = m.weight_bit_width; 62 | if (isinstance(m, torch.nn.Linear)): 63 | weight_type_dict[key + ".weight"] = "linear"; 64 | module_dict[key + ".weight"] = m; 65 | if (isinstance(m, torch.nn.Embedding)): 66 | weight_type_dict[key] = "embedding"; 67 | 68 | peft_config = {} 69 | active_adapter = "" 70 | if hasattr(model, "peft_config"): 71 | peft_config = model.peft_config 72 | if hasattr(model, "active_adapter") and isinstance(model.active_adapter, str): 73 | # in transformers >= 4.33.0, active_adapter is a funtion in model, ignore it now 74 | active_adapter = model.active_adapter 75 | 76 | model = model.cpu(); 77 | dict = model.state_dict(); 78 | model_type = model.config.__dict__["model_type"]; 79 | model = llm.fastllm_lib.create_empty_llm_model(model_type.encode()); 80 | for it in modelInfo.keys(): 81 | llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode()); 82 | 83 | for adapter_name in peft_config.keys(): 84 | adapter_dict = peft_config[adapter_name].__dict__ 85 | for it in adapter_dict.keys(): 86 | llm.fastllm_lib.add_adapter_dict_llm_model(model, str(adapter_name).encode(), str(it).encode(), str(adapter_dict[it]).encode()) 87 | if len(active_adapter) != 0: 88 | llm.fastllm_lib.set_adapter(model, str(active_adapter).encode()) 89 | 90 | # 1. 
vocab 91 | if (tokenizer): 92 | if (hasattr(tokenizer, "tokenizer")): 93 | if modelInfo["model_type"] == "qwen": 94 | pass 95 | else: 96 | tokenizer = tokenizer.tokenizer; 97 | if (hasattr(tokenizer, "sp_model")): 98 | piece_size = tokenizer.sp_model.piece_size(); 99 | for i in range(piece_size): 100 | llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(), 101 | i, ctypes.c_float(tokenizer.sp_model.get_score(i))); 102 | else: 103 | vocab = tokenizer.get_vocab(); 104 | for v in vocab.keys(): 105 | if (modelInfo["model_type"] == "moss"): 106 | vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]; 107 | llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0)); 108 | elif (modelInfo["model_type"] == "qwen"): 109 | llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0)); 110 | else: 111 | llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0)); 112 | tot = 0; 113 | for key in dict: 114 | ori_data_type = 0; 115 | ori_np_data_type = np.float32; 116 | cur_weight_type = 0; 117 | if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict): 118 | cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]; 119 | to_data_type = 0; 120 | 121 | if (cur_weight_type == 1): 122 | to_data_type = fastllm_data_type_dict[dtype]; 123 | if (to_data_type == 7): 124 | ori_data_type = 7; 125 | ori_np_data_type = np.float16; 126 | elif (cur_weight_type == 2): 127 | # TODO bfloat 128 | to_data_type = 0; 129 | 130 | weight_name = key 131 | if peft_config is not None: 132 | weight_name = weight_name.replace('base_model.model.', '') 133 | if (cur_weight_type == 111): 134 | llm.fastllm_lib.add_qlinear_weight_llm_model(model, weight_name.encode(), 135 | len(dict[key].shape), 136 | (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)), 137 | weight_bits[key], 138 | dict[key + "_scale"].numpy().astype(np.float32).ctypes.data_as(ctypes.c_void_p), 139 | dict[key].numpy().ctypes.data_as(ctypes.c_void_p)); 140 | else: 141 | llm.fastllm_lib.add_weight_llm_model(model, weight_name.encode(), 142 | len(dict[key].shape), 143 | (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)), 144 | to_data_type, cur_weight_type, ori_data_type, 145 | dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p)); 146 | tot += 1; 147 | print("convert (", tot, "/", len(dict), end = " )\r"); 148 | 149 | print(""); 150 | llm.fastllm_lib.init_params_llm_model(model); 151 | llm.fastllm_lib.warmup_llm_model(model); 152 | ret = llm.model("", id = model); 153 | return ret; 154 | 155 | -------------------------------------------------------------------------------- /dcu-support/package/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup ( 4 | name = "fastllm_pytools", 5 | version = "0.0.1", 6 | description = "Fastllm pytools", 7 | packages = ['fastllm_pytools'], 8 | url = "https://developer.hpccube.com/codes/aicomponent/fastllm", 9 | package_data = { 10 | '': ['*.dll', '*.so'] 11 | } 12 | ) 13 | -------------------------------------------------------------------------------- /dcu-support/qwen2flm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | from transformers.generation import GenerationConfig 4 | from 
fastllm_pytools import torch2flm 5 | 6 | if __name__ == "__main__": 7 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True) 8 | model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="cpu", trust_remote_code=True, fp32=True).eval() 9 | model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参 10 | 11 | dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16" 12 | exportPath = sys.argv[1] if len(sys.argv) >= 2 else "qwen-7b-" + dtype + ".flm" 13 | torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype) -------------------------------------------------------------------------------- /dcu-support/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.32.0 2 | tiktoken 3 | streamlit>=1.24.0 4 | sentencepiece 5 | urllib3==1.26.16 6 | transformers_stream_generator==0.0.4 7 | accelerate 8 | einops 9 | #scipy 10 | -------------------------------------------------------------------------------- /dcu-support/web_demo.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from streamlit_chat import message 3 | from fastllm_pytools import llm 4 | import sys 5 | 6 | st.set_page_config( 7 | page_title="fastllm web demo", 8 | page_icon=":robot:" 9 | ) 10 | 11 | @st.cache_resource 12 | def get_model(): 13 | model = llm.model(sys.argv[1]) 14 | return model 15 | 16 | if "messages" not in st.session_state: 17 | st.session_state.messages = [] 18 | 19 | for i, (prompt, response) in enumerate(st.session_state.messages): 20 | with st.chat_message("user"): 21 | st.markdown(prompt) 22 | with st.chat_message("assistant"): 23 | st.markdown(response) 24 | 25 | if prompt := st.chat_input("请开始对话"): 26 | model = get_model() 27 | with st.chat_message("user"): 28 | st.markdown(prompt) 29 | 30 | with st.chat_message("assistant"): 31 | message_placeholder = st.empty() 32 | full_response = "" 33 | for chunk in model.stream_response(prompt, st.session_state.messages, one_by_one = True): 34 | full_response += chunk 35 | message_placeholder.markdown(full_response + "▌") 36 | message_placeholder.markdown(full_response) 37 | st.session_state.messages.append((prompt, full_response)) 38 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG CUDA_VERSION=11.7.1 2 | ARG from=nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 3 | 4 | FROM ${from} as base 5 | 6 | ARG from 7 | 8 | RUN <<EOF 9 | apt update -y && apt upgrade -y && apt install -y --no-install-recommends \ 10 | git \ 11 | git-lfs \ 12 | python3 \ 13 | python3-pip \ 14 | python3-dev \ 15 | wget \ 16 | vim \ 17 | && rm -rf /var/lib/apt/lists/* 18 | EOF 19 | 20 | RUN ln -s /usr/bin/python3 /usr/bin/python 21 | 22 | RUN git lfs install 23 | 24 | FROM base as dev 25 | 26 | WORKDIR / 27 | 28 | RUN mkdir -p /data/shared/Qwen 29 | 30 | WORKDIR /data/shared/Qwen/ 31 | 32 | # Users can also mount '/data/shared/Qwen/' to keep the data 33 | COPY ../requirements.txt ./ 34 | COPY ../requirements_web_demo.txt ./ 35 | 36 | FROM dev as bundle_req 37 | 38 | ARG BUNDLE_REQUIREMENTS=true 39 | 40 | RUN <<EOF 41 | if [ "$BUNDLE_REQUIREMENTS" = "true" ]; then 42 | cd /data/shared/Qwen 43 | pip3 install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 44 | pip3 install -r requirements.txt 45 | pip3 install 
-r requirements_web_demo.txt 46 | fi 47 | EOF 48 | 49 | FROM bundle_req as bundle_flash_attention 50 | ARG BUNDLE_FLASH_ATTENTION=true 51 | 52 | RUN <<EOF 53 | if [ "$BUNDLE_FLASH_ATTENTION" = "true" ]; then 54 | cd /data/shared/Qwen 55 | test -d flash-attention || git clone -b v2.3.3 https://github.com/Dao-AILab/flash-attention 56 | cd /data/shared/Qwen/flash-attention && 57 | pip3 install . && 58 | pip3 install csrc/layer_norm 59 | fi 60 | EOF 61 | 62 | FROM bundle_flash_attention as bundle_finetune 63 | ARG BUNDLE_FINETUNE=true 64 | 65 | RUN <<EOF 66 | if [ "$BUNDLE_FINETUNE" = "true" ]; then 67 | cd /data/shared/Qwen 68 | 69 | # Full-finetune / LoRA. 70 | pip3 install deepspeed "peft==0.5.0" 71 | 72 | # Q-LoRA. 73 | apt update -y && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \ 74 | libopenmpi-dev openmpi-bin \ 75 | && rm -rf /var/lib/apt/lists/* 76 | pip3 install "optimum==1.12.0" "auto-gptq==0.4.2" mpi4py 77 | fi 78 | EOF 79 | 80 | FROM bundle_finetune as bundle_openai_api 81 | ARG BUNDLE_OPENAI_API=true 82 | 83 | RUN <<EOF 84 | if [ "$BUNDLE_OPENAI_API" = "true" ]; then 85 | cd /data/shared/Qwen 86 | 87 | pip3 install fastapi uvicorn "openai<1.0.0" sse_starlette "pydantic<=1.10.13" 88 | fi 89 | EOF 90 | 91 | FROM bundle_openai_api as final 92 | ARG from 93 | 94 | COPY ../requirements.txt ./ 95 | COPY ../requirements_web_demo.txt ./ 96 | COPY ../cli_demo.py ./ 97 | COPY ../web_demo.py ./ 98 | COPY ../openai_api.py ./ 99 | COPY ../finetune.py ./ 100 | COPY ../utils.py ./ 101 | COPY ./examples/* ./examples/ 102 | COPY ./eval/* ./eval/ 103 | COPY ./finetune/* ./finetune/ 104 | 105 | EXPOSE 80 106 | 107 | WORKDIR /data/shared/Qwen/ 108 | 109 | CMD ["python3", "web_demo.py", "--server-port", "80", "--server-name", "0.0.0.0", "-c", "/data/shared/Qwen/Qwen-Chat/"] 110 | -------------------------------------------------------------------------------- /docker/Dockerfile-cu114: -------------------------------------------------------------------------------- 1 | ARG CUDA_VERSION=11.4.3 2 | ARG from=nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 3 | 4 | FROM ${from} as base 5 | 6 | ARG from 7 | 8 | RUN <<EOF 9 | apt update -y && apt upgrade -y && apt install -y --no-install-recommends \ 10 | git \ 11 | git-lfs \ 12 | python3 \ 13 | python3-pip \ 14 | python3-dev \ 15 | wget \ 16 | vim \ 17 | && rm -rf /var/lib/apt/lists/* 18 | EOF 19 | 20 | RUN ln -s /usr/bin/python3 /usr/bin/python 21 | 22 | RUN git lfs install 23 | 24 | FROM base as dev 25 | 26 | WORKDIR / 27 | 28 | RUN mkdir -p /data/shared/Qwen 29 | 30 | WORKDIR /data/shared/Qwen/ 31 | 32 | # Users can also mount '/data/shared/Qwen/' to keep the data 33 | COPY ../requirements.txt ./ 34 | COPY ../requirements_web_demo.txt ./ 35 | 36 | FROM dev as bundle_req 37 | 38 | ARG BUNDLE_REQUIREMENTS=true 39 | 40 | RUN <<EOF 41 | if [ "$BUNDLE_REQUIREMENTS" = "true" ]; then 42 | cd /data/shared/Qwen 43 | pip3 install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 44 | pip3 install -r requirements.txt 45 | pip3 install -r requirements_web_demo.txt 46 | fi 47 | EOF 48 | 49 | FROM bundle_req as bundle_flash_attention 50 | ARG BUNDLE_FLASH_ATTENTION=true 51 | 52 | RUN <<EOF 53 | if [ "$BUNDLE_FLASH_ATTENTION" = "true" ]; then 54 | echo "CUDA 11.4 does not support flash-attention, please try other images." 
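        # flash-attention is intentionally skipped on CUDA 11.4; use
        # docker/Dockerfile (cu117) or docker/Dockerfile-cu121 if you need it
        # bundled into the image.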
55 | fi 56 | EOF 57 | 58 | FROM bundle_flash_attention as bundle_finetune 59 | ARG BUNDLE_FINETUNE=true 60 | 61 | RUN <<EOF 62 | if [ "$BUNDLE_FINETUNE" = "true" ]; then 63 | cd /data/shared/Qwen 64 | 65 | # Full-finetune / LoRA. 66 | pip3 install deepspeed "peft==0.5.0" 67 | 68 | # Q-LoRA. 69 | apt update -y && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \ 70 | libopenmpi-dev openmpi-bin \ 71 | && rm -rf /var/lib/apt/lists/* 72 | pip3 install "optimum==1.12.0" "auto-gptq==0.4.2" mpi4py 73 | fi 74 | EOF 75 | 76 | FROM bundle_finetune as bundle_openai_api 77 | ARG BUNDLE_OPENAI_API=true 78 | 79 | RUN <<EOF 80 | if [ "$BUNDLE_OPENAI_API" = "true" ]; then 81 | cd /data/shared/Qwen 82 | 83 | pip3 install fastapi uvicorn "openai<1.0.0" sse_starlette "pydantic<=1.10.13" 84 | fi 85 | EOF 86 | 87 | FROM bundle_openai_api as final 88 | ARG from 89 | 90 | COPY ../requirements.txt ./ 91 | COPY ../requirements_web_demo.txt ./ 92 | COPY ../cli_demo.py ./ 93 | COPY ../web_demo.py ./ 94 | COPY ../openai_api.py ./ 95 | COPY ../finetune.py ./ 96 | COPY ../utils.py ./ 97 | COPY ./examples/* ./examples/ 98 | COPY ./eval/* ./eval/ 99 | COPY ./finetune/* ./finetune/ 100 | 101 | EXPOSE 80 102 | 103 | WORKDIR /data/shared/Qwen/ 104 | 105 | CMD ["python3", "web_demo.py", "--server-port", "80", "--server-name", "0.0.0.0", "-c", "/data/shared/Qwen/Qwen-Chat/"] 106 | -------------------------------------------------------------------------------- /docker/Dockerfile-cu121: -------------------------------------------------------------------------------- 1 | ARG CUDA_VERSION=12.1.0 2 | ARG from=nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 3 | 4 | FROM ${from} as base 5 | 6 | ARG from 7 | 8 | RUN <<EOF 9 | apt update -y && apt upgrade -y && apt install -y --no-install-recommends \ 10 | git \ 11 | git-lfs \ 12 | python3 \ 13 | python3-pip \ 14 | python3-dev \ 15 | wget \ 16 | vim \ 17 | && rm -rf /var/lib/apt/lists/* 18 | EOF 19 | 20 | RUN ln -s /usr/bin/python3 /usr/bin/python 21 | 22 | RUN git lfs install 23 | 24 | FROM base as dev 25 | 26 | WORKDIR / 27 | 28 | RUN mkdir -p /data/shared/Qwen 29 | 30 | WORKDIR /data/shared/Qwen/ 31 | 32 | # Users can also mount '/data/shared/Qwen/' to keep the data 33 | COPY ../requirements.txt ./ 34 | COPY ../requirements_web_demo.txt ./ 35 | 36 | FROM dev as bundle_req 37 | 38 | ARG BUNDLE_REQUIREMENTS=true 39 | 40 | RUN <<EOF 41 | if [ "$BUNDLE_REQUIREMENTS" = "true" ]; then 42 | cd /data/shared/Qwen 43 | pip3 install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121 44 | pip3 install -r requirements.txt 45 | pip3 install -r requirements_web_demo.txt 46 | 47 | pip3 install transformers==4.36.0 48 | fi 49 | EOF 50 | 51 | FROM bundle_req as bundle_flash_attention 52 | ARG BUNDLE_FLASH_ATTENTION=true 53 | 54 | RUN <<EOF 55 | if [ "$BUNDLE_FLASH_ATTENTION" = "true" ]; then 56 | cd /data/shared/Qwen 57 | test -d flash-attention || git clone -b v2.3.3 https://github.com/Dao-AILab/flash-attention 58 | cd /data/shared/Qwen/flash-attention && 59 | pip3 install . && 60 | pip3 install csrc/layer_norm 61 | fi 62 | EOF 63 | 64 | FROM bundle_flash_attention as bundle_finetune 65 | ARG BUNDLE_FINETUNE=true 66 | 67 | RUN <<EOF 68 | if [ "$BUNDLE_FINETUNE" = "true" ]; then 69 | cd /data/shared/Qwen 70 | 71 | # Full-finetune / LoRA. 72 | pip3 install "deepspeed==0.12.6" "peft==0.7.1" 73 | 74 | # Q-LoRA. 
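    # (mpi4py is typically built from source and needs the OpenMPI headers
    # installed by the apt step that follows.)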
75 | apt update -y && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \ 76 | libopenmpi-dev openmpi-bin \ 77 | && rm -rf /var/lib/apt/lists/* 78 | pip3 install "optimum==1.14.0" "auto-gptq==0.5.0" mpi4py 79 | fi 80 | EOF 81 | 82 | FROM bundle_finetune as bundle_openai_api 83 | ARG BUNDLE_OPENAI_API=true 84 | 85 | RUN <<EOF 86 | if [ "$BUNDLE_OPENAI_API" = "true" ]; then 87 | cd /data/shared/Qwen 88 | 89 | pip3 install fastapi uvicorn "openai<1.0.0" sse_starlette "pydantic<=1.10.13" 90 | fi 91 | EOF 92 | 93 | FROM bundle_openai_api as bundle_vllm 94 | ARG BUNDLE_VLLM=true 95 | 96 | RUN <<EOF 97 | if [ "$BUNDLE_VLLM" = "true" ]; then 98 | cd /data/shared/Qwen 99 | 100 | pip3 install vllm==0.2.7 "fschat[model_worker,webui]==0.2.33" 101 | fi 102 | EOF 103 | 104 | FROM bundle_vllm as final 105 | ARG from 106 | 107 | COPY ../requirements.txt ./ 108 | COPY ../requirements_web_demo.txt ./ 109 | COPY ../cli_demo.py ./ 110 | COPY ../web_demo.py ./ 111 | COPY ../openai_api.py ./ 112 | COPY ../finetune.py ./ 113 | COPY ../utils.py ./ 114 | COPY ./examples/* ./examples/ 115 | COPY ./eval/* ./eval/ 116 | COPY ./finetune/* ./finetune/ 117 | 118 | EXPOSE 80 119 | 120 | WORKDIR /data/shared/Qwen/ 121 | 122 | CMD ["python3", "web_demo.py", "--server-port", "80", "--server-name", "0.0.0.0", "-c", "/data/shared/Qwen/Qwen-Chat/"] 123 | -------------------------------------------------------------------------------- /docker/docker_cli_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # This script will automatically pull docker image from DockerHub, and start a container to run the Qwen-Chat cli-demo. 4 | 5 | IMAGE_NAME=qwenllm/qwen:cu117 6 | QWEN_CHECKPOINT_PATH=/path/to/Qwen-Chat 7 | CONTAINER_NAME=qwen 8 | 9 | function usage() { 10 | echo ' 11 | Usage: bash docker/docker_cli_demo.sh [-i IMAGE_NAME] -c [/path/to/Qwen-Chat] [-n CONTAINER_NAME] 12 | ' 13 | } 14 | 15 | while [[ "$1" != "" ]]; do 16 | case $1 in 17 | -i | --image-name ) 18 | shift 19 | IMAGE_NAME=$1 20 | ;; 21 | -c | --checkpoint ) 22 | shift 23 | QWEN_CHECKPOINT_PATH=$1 24 | ;; 25 | -n | --container-name ) 26 | shift 27 | CONTAINER_NAME=$1 28 | ;; 29 | -h | --help ) 30 | usage 31 | exit 0 32 | ;; 33 | * ) 34 | echo "Unknown argument ${1}" 35 | exit 1 36 | ;; 37 | esac 38 | shift 39 | done 40 | 41 | if [ ! -e ${QWEN_CHECKPOINT_PATH}/config.json ]; then 42 | echo "Checkpoint config.json file not found in ${QWEN_CHECKPOINT_PATH}, exit." 43 | exit 1 44 | fi 45 | 46 | sudo docker pull ${IMAGE_NAME} || { 47 | echo "Pulling image ${IMAGE_NAME} failed, exit." 48 | exit 1 49 | } 50 | 51 | sudo docker run --gpus all --rm --name ${CONTAINER_NAME} \ 52 | --mount type=bind,source=${QWEN_CHECKPOINT_PATH},target=/data/shared/Qwen/Qwen-Chat \ 53 | -it ${IMAGE_NAME} \ 54 | python cli_demo.py -c /data/shared/Qwen/Qwen-Chat/ 55 | -------------------------------------------------------------------------------- /docker/docker_openai_api.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # This script will automatically pull docker image from DockerHub, and start a daemon container to run the Qwen-Chat OpenAI API. 
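#
# Typical invocation (the checkpoint path and port below are placeholders):
#   bash docker/docker_openai_api.sh -c /path/to/Qwen-7B-Chat --port 8000
# The container exposes an OpenAI-compatible endpoint via openai_api.py, e.g.:
#   curl http://localhost:8000/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "Qwen", "messages": [{"role": "user", "content": "你好"}]}'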
4 | 5 | IMAGE_NAME=qwenllm/qwen:cu117 6 | QWEN_CHECKPOINT_PATH=/path/to/Qwen-Chat 7 | PORT=8000 8 | CONTAINER_NAME=qwen 9 | 10 | function usage() { 11 | echo ' 12 | Usage: bash docker/docker_openai_api.sh [-i IMAGE_NAME] -c [/path/to/Qwen-Chat] [-n CONTAINER_NAME] [--port PORT] 13 | ' 14 | } 15 | 16 | while [[ "$1" != "" ]]; do 17 | case $1 in 18 | -i | --image-name ) 19 | shift 20 | IMAGE_NAME=$1 21 | ;; 22 | -c | --checkpoint ) 23 | shift 24 | QWEN_CHECKPOINT_PATH=$1 25 | ;; 26 | -n | --container-name ) 27 | shift 28 | CONTAINER_NAME=$1 29 | ;; 30 | --port ) 31 | shift 32 | PORT=$1 33 | ;; 34 | -h | --help ) 35 | usage 36 | exit 0 37 | ;; 38 | * ) 39 | echo "Unknown argument ${1}" 40 | exit 1 41 | ;; 42 | esac 43 | shift 44 | done 45 | 46 | if [ ! -e ${QWEN_CHECKPOINT_PATH}/config.json ]; then 47 | echo "Checkpoint config.json file not found in ${QWEN_CHECKPOINT_PATH}, exit." 48 | exit 1 49 | fi 50 | 51 | sudo docker pull ${IMAGE_NAME} || { 52 | echo "Pulling image ${IMAGE_NAME} failed, exit." 53 | exit 1 54 | } 55 | 56 | sudo docker run --gpus all -d --restart always --name ${CONTAINER_NAME} \ 57 | -v /var/run/docker.sock:/var/run/docker.sock -p ${PORT}:80 \ 58 | --mount type=bind,source=${QWEN_CHECKPOINT_PATH},target=/data/shared/Qwen/Qwen-Chat \ 59 | -it ${IMAGE_NAME} \ 60 | python openai_api.py --server-port 80 --server-name 0.0.0.0 -c /data/shared/Qwen/Qwen-Chat/ && { 61 | echo "Successfully started OpenAI API server. Access 'http://localhost:${PORT}/v1' to try! 62 | Run \`docker logs ${CONTAINER_NAME}\` to check server status. 63 | Run \`docker rm -f ${CONTAINER_NAME}\` to stop and remove the server." 64 | } 65 | -------------------------------------------------------------------------------- /docker/docker_web_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # This script will automatically pull docker image from DockerHub, and start a daemon container to run the Qwen-Chat web-demo. 4 | 5 | IMAGE_NAME=qwenllm/qwen:cu117 6 | QWEN_CHECKPOINT_PATH=/path/to/Qwen-7B-Chat 7 | PORT=8901 8 | CONTAINER_NAME=qwen 9 | 10 | function usage() { 11 | echo ' 12 | Usage: bash docker/docker_web_demo.sh [-i IMAGE_NAME] -c [/path/to/Qwen-Chat] [-n CONTAINER_NAME] [--port PORT] 13 | ' 14 | } 15 | 16 | while [[ "$1" != "" ]]; do 17 | case $1 in 18 | -i | --image-name ) 19 | shift 20 | IMAGE_NAME=$1 21 | ;; 22 | -c | --checkpoint ) 23 | shift 24 | QWEN_CHECKPOINT_PATH=$1 25 | ;; 26 | -n | --container-name ) 27 | shift 28 | CONTAINER_NAME=$1 29 | ;; 30 | --port ) 31 | shift 32 | PORT=$1 33 | ;; 34 | -h | --help ) 35 | usage 36 | exit 0 37 | ;; 38 | * ) 39 | echo "Unknown argument ${1}" 40 | exit 1 41 | ;; 42 | esac 43 | shift 44 | done 45 | 46 | if [ ! -e ${QWEN_CHECKPOINT_PATH}/config.json ]; then 47 | echo "Checkpoint config.json file not found in ${QWEN_CHECKPOINT_PATH}, exit." 48 | exit 1 49 | fi 50 | 51 | sudo docker pull ${IMAGE_NAME} || { 52 | echo "Pulling image ${IMAGE_NAME} failed, exit." 53 | exit 1 54 | } 55 | 56 | sudo docker run --gpus all -d --restart always --name ${CONTAINER_NAME} \ 57 | -v /var/run/docker.sock:/var/run/docker.sock -p ${PORT}:80 \ 58 | --mount type=bind,source=${QWEN_CHECKPOINT_PATH},target=/data/shared/Qwen/Qwen-Chat \ 59 | -it ${IMAGE_NAME} \ 60 | python web_demo.py --server-port 80 --server-name 0.0.0.0 -c /data/shared/Qwen/Qwen-Chat/ && { 61 | echo "Successfully started web demo. Open 'http://localhost:${PORT}' to try! 62 | Run \`docker logs ${CONTAINER_NAME}\` to check demo status. 
63 | Run \`docker rm -f ${CONTAINER_NAME}\` to stop and remove the demo." 64 | } 65 | -------------------------------------------------------------------------------- /eval/EVALUATION.md: -------------------------------------------------------------------------------- 1 | ## 评测复现 2 | 3 | - CEVAL 4 | 5 | ```Shell 6 | wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip 7 | mkdir data/ceval 8 | mv ceval-exam.zip data/ceval 9 | cd data/ceval; unzip ceval-exam.zip 10 | cd ../../ 11 | 12 | # Qwen-7B 13 | python evaluate_ceval.py -d data/ceval/ 14 | 15 | # Qwen-7B-Chat (We only provide 0-shot reproduction scripts. 5-shot results are obtained by OpenCompass (https://github.com/InternLM/opencompass).) 16 | pip install thefuzz 17 | python evaluate_chat_ceval.py -d data/ceval/ 18 | ``` 19 | 20 | - MMLU 21 | 22 | ```Shell 23 | wget https://people.eecs.berkeley.edu/~hendrycks/data.tar 24 | mkdir data/mmlu 25 | mv data.tar data/mmlu 26 | cd data/mmlu; tar xf data.tar 27 | cd ../../ 28 | 29 | # Qwen-7B 30 | python evaluate_mmlu.py -d data/mmlu/data/ 31 | 32 | # Qwen-7B-Chat (We only provide 0-shot reproduction scripts. 5-shot results are obtained by OpenCompass (https://github.com/InternLM/opencompass).) 33 | pip install thefuzz 34 | python evaluate_chat_mmlu.py -d data/mmlu/data/ 35 | ``` 36 | 37 | - CMMLU 38 | 39 | ```Shell 40 | wget https://huggingface.co/datasets/haonan-li/cmmlu/resolve/main/cmmlu_v1_0_1.zip 41 | mkdir data/cmmlu 42 | mv cmmlu_v1_0_1.zip data/cmmlu 43 | cd data/cmmlu; unzip cmmlu_v1_0_1.zip 44 | cd ../../ 45 | 46 | # Qwen-7B 47 | python evaluate_cmmlu.py -d data/cmmlu/ 48 | ``` 49 | 50 | - HumanEval 51 | 52 | Get the HumanEval.jsonl file from [here](https://github.com/openai/human-eval/tree/master/data) 53 | 54 | ```Shell 55 | git clone https://github.com/openai/human-eval 56 | pip install -e human-eval 57 | 58 | # Qwen-7B 59 | python evaluate_humaneval.py -f HumanEval.jsonl -o HumanEval_res.jsonl 60 | evaluate_functional_correctness HumanEval_res.jsonl 61 | # Qwen-7B-Chat 62 | python evaluate_chat_humaneval.py -f HumanEval.jsonl -o HumanEval_res_chat.jsonl 63 | evaluate_functional_correctness HumanEval_res_chat.jsonl 64 | ``` 65 | 66 | When installing package human-eval, please note its following disclaimer: 67 | 68 | This program exists to run untrusted model-generated code. Users are strongly encouraged not to do so outside of a robust security sandbox. The execution call in execution.py is deliberately commented out to ensure users read this disclaimer before running code in a potentially unsafe manner. See the comment in execution.py for more information and instructions. 69 | 70 | - GSM8K 71 | 72 | ```Shell 73 | # Qwen-7B 74 | python evaluate_gsm8k.py 75 | 76 | # Qwen-7B-Chat (We only provide 0-shot reproduction scripts. 5-shot results are obtained by OpenCompass (https://github.com/InternLM/opencompass).) 77 | python evaluate_chat_gsm8k.py # zeroshot 78 | ``` 79 | 80 | - PLUGIN 81 | 82 | This script is used to reproduce the results of the ReAct and Hugging Face Agent in the Tool Usage section of the README document. 
83 | 84 | ```Shell 85 | # Qwen-7B-Chat 86 | mkdir data; 87 | cd data; 88 | ## Old Evaluation Dataset (Version 20230803) 89 | # wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v1/exam_plugin_v1_react_positive.jsonl; 90 | # wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v1/exam_plugin_v1_react_negative.jsonl; 91 | ## New Evaluation Dataset (Version 20231206) 92 | wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v20231206/exam_plugin_v20231206_react_positive.jsonl; 93 | wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v20231206/exam_plugin_v20231206_react_negative.jsonl;cd ..; 94 | pip install json5; 95 | pip install jsonlines; 96 | pip install rouge_score; 97 | python evaluate_plugin.py --eval-react-positive --eval-react-negative --eval-hfagent 98 | ``` 99 | -------------------------------------------------------------------------------- /eval/evaluate_chat_humaneval.py: -------------------------------------------------------------------------------- 1 | import re 2 | import textwrap 3 | import argparse 4 | from pathlib import Path 5 | import tqdm 6 | import jsonlines 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | from transformers.generation import GenerationConfig 9 | 10 | """ 11 | Get the HumanEval.jsonl file from [here](https://github.com/openai/human-eval/tree/master/data) 12 | 13 | python eval/evaluate_chat_humaneval.py -f HumanEval.jsonl -o HumanEval_res.jsonl 14 | git clone https://github.com/openai/human-eval 15 | pip install -e human-eval 16 | evaluate_functional_correctness HumanEval_res.jsonl 17 | """ 18 | 19 | DEVICE = "cuda:0" 20 | 21 | 22 | def extract_code(text, entry_point): 23 | # 正则表达式匹配代码块 24 | code_block_pattern = re.compile( 25 | rf"```(?:[Pp]ython\n)?.*?def\s+{entry_point}.*?:\n(.*?)\n```", re.DOTALL 26 | ) 27 | code_block = code_block_pattern.search(text) 28 | if code_block is None: 29 | code_block_pattern = re.compile( 30 | rf"def\s+{entry_point}.*?:\n(.*?)(?:\n(?!\n*(?: |\t))|$)", re.DOTALL 31 | ) 32 | code_block = code_block_pattern.search(text) 33 | if code_block is None: 34 | code_block_pattern = re.compile( 35 | r"def.*?:\n(.*?)(?:\n(?!\n*(?: |\t))|$)", re.DOTALL 36 | ) 37 | code_block = code_block_pattern.search(text) 38 | 39 | if code_block is not None: 40 | return code_block.group(1) 41 | 42 | # if no code block is found, assume the LM is simply filling the code 43 | return textwrap.indent(text, " " * 4) 44 | 45 | 46 | def generate_sample(model, tokenizer, question, entry_point): 47 | response, _ = model.chat( 48 | tokenizer, 49 | question, 50 | history=None, 51 | ) 52 | print(question) 53 | print(response) 54 | answer = extract_code(response, entry_point) 55 | return answer, response 56 | 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser(description="Test HF checkpoint.") 60 | parser.add_argument( 61 | "-c", 62 | "--checkpoint-path", 63 | type=Path, 64 | help="Checkpoint path", 65 | default="Qwen/Qwen-7B-Chat", 66 | ) 67 | parser.add_argument( 68 | "-f", 69 | "--sample-input-file", 70 | type=str, 71 | default=None, 72 | help="data path to HumanEval.jsonl", 73 | ) 74 | parser.add_argument( 75 | "-o", "--sample-output-file", type=str, default="HumanEval_res.jsonl" 76 | ) 77 | 78 | args = parser.parse_args() 79 | print("Loading tokenizer ...") 80 | tokenizer = AutoTokenizer.from_pretrained( 81 | args.checkpoint_path, trust_remote_code=True 82 | ) 83 | 84 | print("Loading model ...") 85 
| model = AutoModelForCausalLM.from_pretrained( 86 | args.checkpoint_path, 87 | device_map="auto", 88 | trust_remote_code=True, 89 | bf16=True, 90 | use_flash_attn=True, 91 | ).eval() 92 | model.generation_config = GenerationConfig.from_pretrained( 93 | args.checkpoint_path, trust_remote_code=True 94 | ) 95 | model.generation_config.do_sample = False # use greedy decoding 96 | model.generation_config.repetition_penalty = 1.0 # disable repetition penalty 97 | 98 | f_output = jsonlines.Writer(open(args.sample_output_file, "w", encoding="utf-8")) 99 | 100 | f = jsonlines.open(args.sample_input_file) 101 | with f_output as output: 102 | for jobj in tqdm.tqdm(f, desc="task_idx"): 103 | # use humanevalpack prompt 104 | signature = re.search( 105 | rf"def\s+({jobj['entry_point']}.*?):\s*\n", jobj["prompt"] 106 | ).group(1) 107 | description = "\n".join( 108 | [ 109 | line.strip() 110 | for line in re.search( 111 | rf"(?:\"\"\"|''')(.*?)(?:\"\"\"|''')", jobj["prompt"], re.DOTALL 112 | ) 113 | .group(1) 114 | .split("\n") 115 | ] 116 | ) 117 | prompt = ( 118 | f"Write a Python function `{signature}` to solve the following problem:\n" 119 | f"{description}\n" 120 | f"{jobj['prompt']}" 121 | ) 122 | 123 | task_id = jobj["task_id"] 124 | answer, response = generate_sample( 125 | model, tokenizer, prompt, jobj["entry_point"] 126 | ) 127 | gen_jobjs = {"task_id": task_id, "completion": answer, "response": response} 128 | output.write(gen_jobjs) 129 | f_output.close() 130 | -------------------------------------------------------------------------------- /eval/evaluate_gsm8k.py: -------------------------------------------------------------------------------- 1 | import re 2 | import torch 3 | import argparse 4 | import jsonlines 5 | import numpy as np 6 | import datasets 7 | from datasets import load_from_disk, load_dataset 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | from transformers.generation import GenerationConfig 10 | 11 | 12 | ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)") 13 | INVALID_ANS = "[invalid]" 14 | 15 | 16 | def doc_to_text(doc): 17 | return ( 18 | fewshot_prompt 19 | + "\nQuestion: " 20 | + doc["question"] 21 | + "\nLet's think step by step\n" 22 | ) 23 | 24 | 25 | def decode(tokens_list, tokenizer, raw_text_len): 26 | sents = [] 27 | # print(len(tokens_list)) 28 | for tokens in tokens_list: 29 | tokens = tokens.cpu().numpy().tolist() 30 | sent = tokenizer.tokenizer.decode(tokens[raw_text_len:]) 31 | sent = sent.split("<|endoftext|>")[0] 32 | sent = sent.split("\n\n\n")[0] 33 | sent = sent.split("\n\n")[0] 34 | sent = sent.split("Question:")[0] 35 | sents.append(sent) 36 | return sents 37 | 38 | 39 | def generate_sample(model, tokenizer, input_txt): 40 | input_ids = tokenizer.tokenizer.encode(input_txt) 41 | raw_text_len = len(input_ids) 42 | context_enc = torch.tensor([input_ids]).to(model.device) 43 | print(f"Input text: {input_txt}\n") 44 | outputs = model.generate(context_enc) 45 | output_text = decode(outputs, tokenizer, raw_text_len)[0] 46 | print(f"\nOutput text: {output_text}\n") 47 | return output_text 48 | 49 | 50 | def extract_answer_hf(completion): 51 | match = ANS_RE.search(completion) 52 | if match: 53 | match_str = match.group(1).strip() 54 | match_str = match_str.replace(",", "") 55 | return eval(match_str) 56 | else: 57 | return INVALID_ANS 58 | 59 | 60 | def extract_answer(completion): 61 | try: 62 | last_number = re.findall(r"\d+", completion)[-1] 63 | return eval(last_number) 64 | except: 65 | return INVALID_ANS 66 | 67 | 68 | def 
is_correct(completion, answer): 69 | gold = extract_answer_hf(answer) 70 | assert gold != INVALID_ANS, "No ground truth answer found in the document." 71 | return extract_answer(completion) == gold 72 | 73 | 74 | if __name__ == "__main__": 75 | parser = argparse.ArgumentParser(description="Test HF checkpoint.") 76 | parser.add_argument( 77 | "-c", 78 | "--checkpoint-path", 79 | type=str, 80 | help="Checkpoint path", 81 | default="Qwen/Qwen-7B", 82 | ) 83 | parser.add_argument("-f", "--sample-input-file", type=str, default=None) 84 | parser.add_argument( 85 | "-o", "--sample-output-file", type=str, default="gsm8k_res.jsonl" 86 | ) 87 | 88 | args = parser.parse_args() 89 | 90 | fewshot_prompt = open("gsm8k_prompt.txt").read() 91 | if args.sample_input_file is not None: 92 | dataset = load_from_disk(args.sample_input_file) 93 | else: 94 | config = datasets.DownloadConfig(resume_download=True, max_retries=100) 95 | dataset = load_dataset("gsm8k", "main", download_config=config) 96 | 97 | test = dataset["test"] 98 | 99 | print("Loading tokenizer ...") 100 | tokenizer = AutoTokenizer.from_pretrained( 101 | args.checkpoint_path, trust_remote_code=True 102 | ) 103 | 104 | print("Loading model ...") 105 | model = AutoModelForCausalLM.from_pretrained( 106 | args.checkpoint_path, device_map="auto", trust_remote_code=True 107 | ).eval() 108 | model.generation_config = GenerationConfig.from_pretrained( 109 | args.checkpoint_path, trust_remote_code=True 110 | ) 111 | model.generation_config.do_sample = False 112 | 113 | f_output = jsonlines.Writer(open(args.sample_output_file, "w", encoding="utf-8")) 114 | tot_length = test.num_rows 115 | acc_res = [] 116 | for doc in test: 117 | context = doc_to_text(doc) 118 | completion = generate_sample(model, tokenizer, context) 119 | answer = doc["answer"] 120 | acc = is_correct(completion, answer) 121 | doc["completion"] = completion 122 | doc["acc"] = acc 123 | f_output.write(doc) 124 | acc_res.append(acc) 125 | 126 | f_output.close() 127 | print("Acc: ", np.mean(acc_res)) 128 | -------------------------------------------------------------------------------- /eval/evaluate_humaneval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tqdm 3 | import torch 4 | import jsonlines 5 | from transformers import AutoModelForCausalLM, AutoTokenizer 6 | from transformers.generation import GenerationConfig 7 | 8 | """ 9 | git clone https://github.com/openai/human-eval 10 | $ pip install -e human-eval 11 | evaluate_functional_correctness sample-output-file 12 | """ 13 | 14 | 15 | def decode(tokens_list, tokenizer, raw_text_len): 16 | sents = [] 17 | # print(len(tokens_list)) 18 | for tokens in tokens_list: 19 | tokens = tokens.cpu().numpy().tolist() 20 | sent = tokenizer.tokenizer.decode(tokens[raw_text_len:]) 21 | sent = sent.split("<|endoftext|>")[0] 22 | sent = sent.split("\n\n\n")[0] 23 | sent = sent.split("\n\n")[0] 24 | sent = sent.split("def ")[0] 25 | sents.append(sent) 26 | return sents 27 | 28 | 29 | def generate_sample(model, tokenizer, input_txt): 30 | input_ids = tokenizer.tokenizer.encode(input_txt) 31 | raw_text_len = len(input_ids) 32 | context_enc = torch.tensor([input_ids]).to(model.device) 33 | print(f"Input text: {input_txt}\n") 34 | outputs = model.generate(context_enc) 35 | output_text = decode(outputs, tokenizer, raw_text_len)[0] 36 | print(f"\nOutput text: \n{output_text}\n") 37 | return output_text 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = 
argparse.ArgumentParser(description="Test HF checkpoint.") 42 | parser.add_argument( 43 | "-c", 44 | "--checkpoint-path", 45 | type=str, 46 | help="Checkpoint path", 47 | default="Qwen/Qwen-7B", 48 | ) 49 | parser.add_argument( 50 | "-f", 51 | "--sample-input-file", 52 | type=str, 53 | default=None, 54 | help="data path to HumanEval.jsonl", 55 | ) 56 | parser.add_argument( 57 | "-o", "--sample-output-file", type=str, default="HumanEval_res.jsonl" 58 | ) 59 | 60 | args = parser.parse_args() 61 | print("Loading tokenizer ...") 62 | tokenizer = AutoTokenizer.from_pretrained( 63 | args.checkpoint_path, trust_remote_code=True 64 | ) 65 | 66 | print("Loading model ...") 67 | model = AutoModelForCausalLM.from_pretrained( 68 | args.checkpoint_path, device_map="auto", trust_remote_code=True 69 | ).eval() 70 | model.generation_config = GenerationConfig.from_pretrained( 71 | args.checkpoint_path, trust_remote_code=True 72 | ) 73 | model.generation_config.do_sample = False 74 | 75 | f_output = jsonlines.Writer(open(args.sample_output_file, "w", encoding="utf-8")) 76 | 77 | f = jsonlines.open(args.sample_input_file) 78 | with f_output as output: 79 | for jobj in tqdm.tqdm(f, desc="task_idx"): 80 | prompt = jobj["prompt"] 81 | task_id = jobj["task_id"] 82 | gen_sents = generate_sample(model, tokenizer, prompt) 83 | gen_jobjs = {"task_id": task_id, "completion": gen_sents} 84 | output.write(gen_jobjs) 85 | f_output.close() 86 | -------------------------------------------------------------------------------- /eval/gsm8k_prompt.txt: -------------------------------------------------------------------------------- 1 | Question: In 2004, there were 60 kids at a cookout. In 2005, half the number of kids came to the cookout as compared to 2004. In 2006, 2/3 as many kids came to the cookout as in 2005. How many kids came to the cookout in 2006? 2 | Let's think step by step 3 | In 2005, 60/2=30 kids came to the cookout. 4 | In 2006, 30/3*2=20 kids came to the cookout. 5 | The answer is 20 6 | 7 | Question: Zilla spent 7% of her monthly earnings on rent, half of it on her other monthly expenses, and put the rest in her savings. If she spent $133 on her rent, how much does she deposit into her savings account in a month? 8 | Let's think step by step 9 | Since $133 is equal to 7% of her earnings, then 1% is equal to $133/7 = $19. 10 | The total monthly earning of Zilla is represented by 100%, so $19 x 100 = $1900 is her monthly earnings. 11 | So, $1900/2 = $950 is spent on her other monthly expenses. 12 | The total amount spent on the rent and other monthly expenses is $133 + $950 = $1083. 13 | Hence, she saves $1900 - $1083 = $817 per month. 14 | The answer is 817 15 | 16 | Question: If Buzz bought a pizza with 78 slices at a restaurant and then decided to share it with the waiter in the ratio of 5:8, with Buzz's ratio being 5, what's twenty less the number of slices of pizza that the waiter ate? 17 | Let's think step by step 18 | The total ratio representing the slices of pizza that Buzz bought is 5+8=13 19 | If he shared the slices of pizza with the waiter, the waiter received a fraction of 8/13 of the total number of slices, which totals 8/13 * 78 = 48 slices 20 | Twenty less the number of slices of pizza that the waiter ate is 48-20 = 28 21 | The answer is 28 22 | 23 | Question: Jame gets a raise to $20 per hour and works 40 hours a week. His old job was $16 an hour for 25 hours per week. How much more money does he make per year in his new job than the old job if he works 52 weeks a year? 
24 | Let's think step by step 25 | He makes 20*40=$800 per week 26 | He used to make 16*25=$400 per week 27 | So his raise was 800-400=$400 per week 28 | So he makes 400*52=$20,800 per year more 29 | The answer is 20800 30 | 31 | Question: Mr. Gardner bakes 20 cookies, 25 cupcakes, and 35 brownies for his second-grade class of 20 students. If he wants to give each student an equal amount of sweet treats, how many sweet treats will each student receive? 32 | Let's think step by step 33 | Mr. Gardner bakes a total of 20 + 25 + 35 = 80 sweet treats 34 | Each student will receive 80 / 20 = 4 sweet treats 35 | The answer is 4 36 | 37 | Question: A used car lot has 24 cars and motorcycles (in total) for sale. A third of the vehicles are motorcycles, and a quarter of the cars have a spare tire included. How many tires are on the used car lot’s vehicles in all? 38 | Let's think step by step 39 | The used car lot has 24 / 3 = 8 motorcycles with 2 tires each. 40 | The lot has 24 - 8 = 16 cars for sale 41 | There are 16 / 4 = 4 cars with a spare tire with 5 tires each. 42 | The lot has 16 - 4 = 12 cars with 4 tires each. 43 | Thus, the used car lot’s vehicles have 8 * 2 + 4 * 5 + 12 * 4 = 16 + 20 + 48 = 84 tires in all. 44 | The answer is 84 45 | 46 | Question: Norma takes her clothes to the laundry. She leaves 9 T-shirts and twice as many sweaters as T-shirts in the washer. When she returns she finds 3 sweaters and triple the number of T-shirts. How many items are missing? 47 | Let's think step by step 48 | Norma left 9 T-shirts And twice as many sweaters, she took 9 * 2= 18 sweaters 49 | Adding the T-shirts and sweaters, Norma left 9 + 18 = 27 clothes 50 | When she came back, she found 3 sweaters And triple the number of T-shirts, she found 3 * 3 = 9 T-shirts 51 | Adding the T-shirts and sweaters, Norma found 3 + 9 = 12 clothes 52 | Subtracting the clothes she left from the clothes she found, 27 - 12 = 15 clothes are missing 53 | The answer is 15 54 | 55 | Question: Adam has an orchard. Every day for 30 days he picks 4 apples from his orchard. After a month, Adam has collected all the remaining apples, which were 230. How many apples in total has Adam collected from his orchard? 56 | Let's think step by step 57 | During 30 days Adam picked 4 * 30 = 120 apples. 58 | So in total with all the remaining apples, he picked 120 + 230 = 350 apples from his orchard. 
59 | The answer is 350 60 | -------------------------------------------------------------------------------- /examples/auto_comments.md: -------------------------------------------------------------------------------- 1 | # Auto Comments 2 | 本文档介绍Auto Comments,这是一个利用Qwen模型为代码文件自动生成注释的使用案例。 3 | 4 | # 使用方法 5 | 您可以直接执行如下命令,为提供的代码文件生成注释: 6 | ``` 7 | python auto_comments.py --path 'path of file or folder' 8 | ``` 9 | 10 | 参数: 11 | - path:文件路径。可以是文件(目前支持python代码文件),也可以是文件夹(会扫描文件夹下所有python代码文件) 12 | - regenerate:重新生成。默认False,如果针对同一文件需要重新生成注释,请设置为True 13 | 14 | # 使用样例 15 | - 执行:python auto_comments.py --path test_file.py 16 | - test_file.py 内容为: 17 | ``` 18 | import numpy as np 19 | import pandas as pd 20 | import seaborn as sns 21 | sns.set_theme(style="whitegrid") 22 | 23 | rs = np.random.RandomState(365) 24 | values = rs.randn(365, 4).cumsum(axis=0) 25 | dates = pd.date_range("1 1 2016", periods=365, freq="D") 26 | data = pd.DataFrame(values, dates, columns=["A", "B", "C", "D"]) 27 | data = data.rolling(7).mean() 28 | 29 | sns.lineplot(data=data, palette="tab10", linewidth=2.5) 30 | ``` 31 | 32 | - 输出:test_file_comments.py(包含注释的代码文件),文件内容如下: 33 | ``` 34 | # 导入需要的库 35 | import numpy as np 36 | import pandas as pd 37 | import seaborn as sns 38 | 39 | # 设置 Seaborn 的主题风格为白色网格 40 | sns.set_theme(style="whitegrid") 41 | 42 | # 生成随机数 43 | rs = np.random.RandomState(365) 44 | 45 | # 生成 365 行 4 列的随机数,并按行累加 46 | values = rs.randn(365, 4).cumsum(axis=0) 47 | 48 | # 生成日期 49 | dates = pd.date_range("1 1 2016", periods=365, freq="D") 50 | 51 | # 将随机数和日期组合成 DataFrame 52 | data = pd.DataFrame(values, dates, columns=["A", "B", "C", "D"]) 53 | 54 | # 对 DataFrame 进行 7 天滑动平均 55 | data = data.rolling(7).mean() 56 | 57 | # 使用 Seaborn 绘制折线图 58 | sns.lineplot(data=data, palette="tab10", linewidth=2.5) 59 | ``` 60 | -------------------------------------------------------------------------------- /examples/auto_comments.py: -------------------------------------------------------------------------------- 1 | # 运行方式:python auto_comments.py --path 'path of file or folder' 2 | # 脚本功能:使用QWen-7B-Chat为提供的代码文件自动生成注释。(详见auto_comments.md) 3 | 4 | 5 | import argparse 6 | import os 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | from transformers.generation import GenerationConfig 9 | 10 | MaxLine = 50 # 限制单次处理最大代码行数 11 | SplitKey = ["\ndef "] # 自定义的切分代码标识 12 | CodeFileType = ["py"] # 目前仅测试过对python文件生成注释 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--path', type=str, default='Qwen-7B/eval/evaluate_ceval.py') 17 | parser.add_argument('--regenerate', action='store_true', default=False) #如果已经生成过注释,默认不会重新生成 18 | args = parser.parse_args() 19 | return args 20 | 21 | class QWenChat(): 22 | def __init__(self): 23 | self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True) 24 | 25 | # use bf16 26 | # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True, bf16=True).eval() 27 | # use fp16 28 | # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True, fp16=True).eval() 29 | # use cpu only 30 | # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="cpu", trust_remote_code=True).eval() 31 | # use auto mode, automatically select precision based on the device. 
32 | self.model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True).eval() 33 | 34 | # Specify hyperparameters for generation 35 | self.model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True) 36 | self.history = None 37 | 38 | def chat(self, query, system = ""): 39 | 40 | # use history 41 | # response, history = self.model.chat(self.tokenizer, query, history=self.history) 42 | 43 | # 默认不使用history 44 | response, history = self.model.chat(self.tokenizer, query, history=None) 45 | self.history = history 46 | 47 | return response 48 | # 生成注释 49 | def gen_code_comments(context, model = None, **kwargs): 50 | prompt = "\n为以上代码生成细致的中文注释,注意使用合适的语法。要求必须在每个函数开头生成一段统一的函数功能注释。\n除了注释,请保证原始代码内容不变。不要返回除了注释和代码以外的其余信息,不要生成额外代码。\n" 51 | return model.chat(context + prompt) 52 | 53 | def read_file(path): 54 | f = open(path, "r",encoding='utf-8') 55 | lines = f.readlines() 56 | return "".join(lines) 57 | 58 | def write_file(path, context): 59 | with open(path,'w') as f: 60 | f.write(context) 61 | 62 | # 如果代码文件过长,可以简单按照最大行数切分代码 63 | def split_context_by_maxline(text): 64 | lines = text.split("\n") 65 | lines_len = len(lines) 66 | res = [] 67 | for i in range(MaxLine, lines_len, MaxLine): 68 | res.append("\n".join(lines[i-MaxLine:i])) 69 | 70 | if i < lines_len: 71 | res.append("\n".join(lines[i:])) 72 | return res 73 | 74 | # 如果代码文件过长,可以简单按照函数切分代码 75 | def split_context_by_splitkey(text): 76 | blocks = text.split(SplitKey[0]) 77 | return [blocks[0]] + [SplitKey[0]+x for x in blocks[1:]] 78 | 79 | # merge原始代码和生成的注释,目的是保证原始代码不被更改。这部分可以使用各种不同的策略处理。 80 | def merge_code_and_comments(original_file, comments_path): 81 | res = [] 82 | ori_f = open(original_file, "r",encoding='utf-8') 83 | ori_lines = ori_f.readlines() 84 | 85 | com_f = open(comments_path, "r",encoding='utf-8') 86 | com_lines = com_f.readlines() 87 | len_com_lines = len(com_lines) 88 | p = 0 89 | j = 0 90 | for i, line in enumerate(ori_lines): 91 | if line.isspace(): 92 | continue 93 | if line.strip()[0] == '#': 94 | res.append(line) 95 | continue 96 | while j < len_com_lines and line[:-1] not in com_lines[j]: 97 | j += 1 98 | if j < len_com_lines: 99 | p = j - 1 100 | up_comments = [] 101 | triple_dot_flag = 0 102 | while p < j: 103 | if p < 0 or (res and res[-1] and com_lines[p] == res[-1]): 104 | break 105 | if com_lines[p].strip() and (len(com_lines[p].strip())>3 and com_lines[p].strip()[-3:] == '"""' and com_lines[p].strip()[:3] == '"""') or (len(com_lines[p].strip())>3 and com_lines[p].strip()[-3:] == "'''" and com_lines[p].strip()[:3] == "'''"): 106 | up_comments.append(com_lines[p]) 107 | p -= 1 108 | continue 109 | if com_lines[p].strip() and (com_lines[p].strip()[-3:] == '"""' or com_lines[p].strip()[:3] == '"""' or com_lines[p].strip()[-3:] == "'''" or com_lines[p].strip()[:3] == "'''"): 110 | triple_dot_flag = (triple_dot_flag + 1)%2 111 | up_comments.append(com_lines[p]) 112 | p -= 1 113 | continue 114 | if triple_dot_flag: 115 | up_comments.append(com_lines[p]) 116 | p -= 1 117 | continue 118 | if (com_lines[p].strip()=="") or (com_lines[p].strip() and com_lines[p].strip()[0] == '#' and "省略部分内容" not in com_lines[p]): 119 | up_comments.append(com_lines[p]) 120 | else: 121 | break 122 | p -= 1 123 | if up_comments: 124 | res.extend(reversed(up_comments)) 125 | if "#" in com_lines[j] and "#" not in line: 126 | in_line_comments = " #" + com_lines[j].split("#")[-1] 127 | res.append(line[:-1]+in_line_comments) 128 | else: 129 | res.append(line) 130 
| p = j+1 131 | else: 132 | res.append(line) 133 | j = p 134 | 135 | write_file(comments_path, "".join(res)) 136 | 137 | # 处理单个文件 138 | def deal_one_file(model, path, args): 139 | context = read_file(path) 140 | 141 | fname = path.split("/")[-1] 142 | fpath = "/".join(path.split("/")[:-1]) 143 | outfname = fname.split(".")[0]+"_comments."+fname.split(".")[-1] 144 | 145 | comments_path = os.path.join(fpath, outfname) 146 | if (not args.regenerate) and os.path.exists(comments_path): 147 | print("use cache: ", comments_path) 148 | return 149 | 150 | context_line = len(context.split("\n")) 151 | if context_line < MaxLine: 152 | res = gen_code_comments(context, model = model) 153 | elif SplitKey[0] not in context: 154 | context_list = split_context_by_maxline(context) 155 | res = "\n".join([gen_code_comments(context_block, model = model) for context_block in context_list]) 156 | else: 157 | context_list = split_context_by_splitkey(context) 158 | res = "\n".join([gen_code_comments(context_block, model = model) for context_block in context_list]) 159 | 160 | write_file(comments_path, res) 161 | merge_code_and_comments(path, comments_path) 162 | 163 | # 处理文件夹 164 | def deal_folder(model, path, args): 165 | for fl in os.listdir(path): 166 | now_path = os.path.join(path, fl) 167 | if os.path.isfile(now_path): 168 | if (now_path.split(".")[-1] in CodeFileType) and ("_comments" not in now_path): 169 | deal_one_file(model, now_path, args) 170 | elif os.path.isdir(now_path): 171 | deal_folder(model, now_path, args) 172 | else: 173 | print("Please specify a correct path!") 174 | 175 | def transfer(args): 176 | model = QWenChat() 177 | 178 | if os.path.isfile(args.path): 179 | if (args.path.split(".")[-1] in CodeFileType) and ("_comments" not in args.path): 180 | deal_one_file(model, args.path, args) 181 | elif os.path.isdir(args.path): 182 | deal_folder(model, args.path, args) 183 | else: 184 | print("Please specify a correct path!") 185 | 186 | if __name__ == '__main__': 187 | args = parse_args() 188 | print(args) 189 | transfer(args) 190 | -------------------------------------------------------------------------------- /examples/function_call_examples.py: -------------------------------------------------------------------------------- 1 | # Reference: https://openai.com/blog/function-calling-and-other-api-updates 2 | import json 3 | from pprint import pprint 4 | 5 | import openai 6 | 7 | # To start an OpenAI-like Qwen server, use the following commands: 8 | # git clone https://github.com/QwenLM/Qwen-7B; 9 | # cd Qwen-7B; 10 | # pip install fastapi uvicorn openai pydantic sse_starlette; 11 | # python openai_api.py; 12 | # 13 | # Then configure the api_base and api_key in your client: 14 | openai.api_base = 'http://localhost:8000/v1' 15 | openai.api_key = 'none' 16 | 17 | 18 | def call_qwen(messages, functions=None): 19 | print('input:') 20 | pprint(messages, indent=2) 21 | if functions: 22 | response = openai.ChatCompletion.create(model='Qwen', 23 | messages=messages, 24 | functions=functions) 25 | else: 26 | response = openai.ChatCompletion.create(model='Qwen', 27 | messages=messages) 28 | response = response.choices[0]['message'] 29 | response = json.loads(json.dumps(response, 30 | ensure_ascii=False)) # fix zh rendering 31 | print('output:') 32 | pprint(response, indent=2) 33 | print() 34 | return response 35 | 36 | 37 | def test_1(): 38 | messages = [{'role': 'user', 'content': '你好'}] 39 | call_qwen(messages) 40 | messages.append({'role': 'assistant', 'content': '你好!很高兴为你提供帮助。'}) 41 | 42 | 
messages.append({ 43 | 'role': 'user', 44 | 'content': '给我讲一个年轻人奋斗创业最终取得成功的故事。故事只能有一句话。' 45 | }) 46 | call_qwen(messages) 47 | messages.append({ 48 | 'role': 49 | 'assistant', 50 | 'content': 51 | '故事的主人公叫李明,他来自一个普通的家庭,父母都是普通的工人。李明想要成为一名成功的企业家。……', 52 | }) 53 | 54 | messages.append({'role': 'user', 'content': '给这个故事起一个标题'}) 55 | call_qwen(messages) 56 | 57 | 58 | def test_2(): 59 | functions = [ 60 | { 61 | 'name_for_human': 62 | '谷歌搜索', 63 | 'name_for_model': 64 | 'google_search', 65 | 'description_for_model': 66 | '谷歌搜索是一个通用搜索引擎,可用于访问互联网、查询百科知识、了解时事新闻等。' + 67 | ' Format the arguments as a JSON object.', 68 | 'parameters': [{ 69 | 'name': 'search_query', 70 | 'description': '搜索关键词或短语', 71 | 'required': True, 72 | 'schema': { 73 | 'type': 'string' 74 | }, 75 | }], 76 | }, 77 | { 78 | 'name_for_human': 79 | '文生图', 80 | 'name_for_model': 81 | 'image_gen', 82 | 'description_for_model': 83 | '文生图是一个AI绘画(图像生成)服务,输入文本描述,返回根据文本作画得到的图片的URL。' + 84 | ' Format the arguments as a JSON object.', 85 | 'parameters': [{ 86 | 'name': 'prompt', 87 | 'description': '英文关键词,描述了希望图像具有什么内容', 88 | 'required': True, 89 | 'schema': { 90 | 'type': 'string' 91 | }, 92 | }], 93 | }, 94 | ] 95 | 96 | messages = [{'role': 'user', 'content': '(请不要调用工具)\n\n你好'}] 97 | call_qwen(messages, functions) 98 | messages.append({ 99 | 'role': 'assistant', 100 | 'content': '你好!很高兴见到你。有什么我可以帮忙的吗?' 101 | }, ) 102 | 103 | messages.append({'role': 'user', 'content': '搜索一下谁是周杰伦'}) 104 | call_qwen(messages, functions) 105 | messages.append({ 106 | 'role': 'assistant', 107 | 'content': '我应该使用Google搜索查找相关信息。', 108 | 'function_call': { 109 | 'name': 'google_search', 110 | 'arguments': '{"search_query": "周杰伦"}', 111 | }, 112 | }) 113 | 114 | messages.append({ 115 | 'role': 'function', 116 | 'name': 'google_search', 117 | 'content': 'Jay Chou is a Taiwanese singer.', 118 | }) 119 | call_qwen(messages, functions) 120 | messages.append( 121 | { 122 | 'role': 'assistant', 123 | 'content': '周杰伦(Jay Chou)是一位来自台湾的歌手。', 124 | }, ) 125 | 126 | messages.append({'role': 'user', 'content': '搜索一下他老婆是谁'}) 127 | call_qwen(messages, functions) 128 | messages.append({ 129 | 'role': 'assistant', 130 | 'content': '我应该使用Google搜索查找相关信息。', 131 | 'function_call': { 132 | 'name': 'google_search', 133 | 'arguments': '{"search_query": "周杰伦 老婆"}', 134 | }, 135 | }) 136 | 137 | messages.append({ 138 | 'role': 'function', 139 | 'name': 'google_search', 140 | 'content': 'Hannah Quinlivan' 141 | }) 142 | call_qwen(messages, functions) 143 | messages.append( 144 | { 145 | 'role': 'assistant', 146 | 'content': '周杰伦的老婆是Hannah Quinlivan。', 147 | }, ) 148 | 149 | messages.append({'role': 'user', 'content': '用文生图工具画个可爱的小猫吧,最好是黑猫'}) 150 | call_qwen(messages, functions) 151 | messages.append({ 152 | 'role': 'assistant', 153 | 'content': '我应该使用文生图API来生成一张可爱的小猫图片。', 154 | 'function_call': { 155 | 'name': 'image_gen', 156 | 'arguments': '{"prompt": "cute black cat"}', 157 | }, 158 | }) 159 | 160 | messages.append({ 161 | 'role': 162 | 'function', 163 | 'name': 164 | 'image_gen', 165 | 'content': 166 | '{"image_url": "https://image.pollinations.ai/prompt/cute%20black%20cat"}', 167 | }) 168 | call_qwen(messages, functions) 169 | 170 | 171 | def test_3(): 172 | functions = [{ 173 | 'name': 'get_current_weather', 174 | 'description': 'Get the current weather in a given location.', 175 | 'parameters': { 176 | 'type': 'object', 177 | 'properties': { 178 | 'location': { 179 | 'type': 'string', 180 | 'description': 181 | 'The city and state, e.g. 
San Francisco, CA', 182 | }, 183 | 'unit': { 184 | 'type': 'string', 185 | 'enum': ['celsius', 'fahrenheit'] 186 | }, 187 | }, 188 | 'required': ['location'], 189 | }, 190 | }] 191 | 192 | messages = [{ 193 | 'role': 'user', 194 | # Note: The current version of Qwen-7B-Chat (as of 2023.08) performs okay with Chinese tool-use prompts, 195 | # but performs terribly when it comes to English tool-use prompts, due to a mistake in data collecting. 196 | 'content': '波士顿天气如何?', 197 | }] 198 | call_qwen(messages, functions) 199 | messages.append( 200 | { 201 | 'role': 'assistant', 202 | 'content': None, 203 | 'function_call': { 204 | 'name': 'get_current_weather', 205 | 'arguments': '{"location": "Boston, MA"}', 206 | }, 207 | }, ) 208 | 209 | messages.append({ 210 | 'role': 211 | 'function', 212 | 'name': 213 | 'get_current_weather', 214 | 'content': 215 | '{"temperature": "22", "unit": "celsius", "description": "Sunny"}', 216 | }) 217 | call_qwen(messages, functions) 218 | 219 | 220 | def test_4(): 221 | from langchain.agents import AgentType, initialize_agent, load_tools 222 | from langchain.chat_models import ChatOpenAI 223 | 224 | llm = ChatOpenAI( 225 | model_name='Qwen', 226 | openai_api_base='http://localhost:8000/v1', 227 | openai_api_key='EMPTY', 228 | streaming=False, 229 | ) 230 | tools = load_tools(['arxiv'], ) 231 | agent_chain = initialize_agent( 232 | tools, 233 | llm, 234 | agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, 235 | verbose=True, 236 | ) 237 | # TODO: The performance is okay with Chinese prompts, but not so good when it comes to English. 238 | agent_chain.run('查一下论文 1605.08386 的信息') 239 | 240 | 241 | if __name__ == '__main__': 242 | print('### Test Case 1 - No Function Calling (普通问答、无函数调用) ###') 243 | test_1() 244 | print('### Test Case 2 - Use Qwen-Style Functions (函数调用,千问格式) ###') 245 | test_2() 246 | print('### Test Case 3 - Use GPT-Style Functions (函数调用,GPT格式) ###') 247 | test_3() 248 | print('### Test Case 4 - Use LangChain (接入Langchain) ###') 249 | test_4() 250 | -------------------------------------------------------------------------------- /examples/qwen_extra.tiktoken: -------------------------------------------------------------------------------- 1 | 5LiA5Y+q54yr 151851 2 | 5Y+q54yr 151852 3 | 5piv5LiA5Y+q54yr 151853 4 | 5oiR5piv5LiA5Y+q54yr 151854 5 | 5L2g5piv5LiA5Y+q54yr 151855 6 | 5LuW5piv5LiA5Y+q54yr 151856 7 | -------------------------------------------------------------------------------- /examples/qwen_extra_vocab.txt: -------------------------------------------------------------------------------- 1 | 我是一只猫 20 2 | 你是一只猫 10 3 | 他是一只猫 5 4 | 一只 200 5 | 一只猫 100 6 | 夸张的 比喻手法 20 -------------------------------------------------------------------------------- /examples/system_prompt.md: -------------------------------------------------------------------------------- 1 | # 系统指令 (System Prompts) 2 | 3 | ## 什么是系统指令? (What is the System Prompts?) 4 | 5 | 系统指令设定了AI助手的行为模式,例如人物设定、语言风格、任务模式、甚至针对具体问题的具体行为。 6 | 7 | System Propmts set the behavior mode of the AI assistant, such as character settings, language styles, task modes, and even specific behaviors for specific tasks. 8 | 9 | 系统指令可以是一个广泛的人物设定,如“You are a helpful assistant”;也可以是一个十分详细的要求,如“拒绝回答所有代码相关的问题”。 10 | 11 | The System Prompts can be a broad character setting, such as "You are a helpful assistant"; or it can be a very detailed request, such as "Refuse to answer all code-related questions." 
12 | 13 | 系统指令为用户提供了一个易组织、上下文稳定的控制AI助手行为的方式,可以从多种角度定制属于你自己的AI助手。 14 | 15 | System Prompts provide users with an easy-to-organize, context-stable way to control the behavior of the AI assistant. You can customize your own AI assistant from multiple perspectives. 16 | 17 | 系统指令需要在多轮对话中稳定,例如角色扮演类系统指令被设定后AI助手不应该在多轮对话中跳脱自身的设定。 18 | 19 | System Prompts need to be stable across multiple rounds of dialogue. For example, after a role-playing system prompt is set, the AI assistant should not escape its own settings in multiple rounds of dialogue. 20 | 21 | 同时,模型也需要具有基于系统指令中对自身行为进行推理的能力。这两者都是为模型赋予跟随系统指令能力时需要克服的难点。 22 | 23 | At the same time, the model also needs to have the ability to reason about its own behavior based on system prompts. Both of these are difficulties that need to be overcome when giving the model the ability to follow system prompts. 24 | 25 | Qwen-1.8B-Chat 和 Qwen-72B-Chat在多样且存在多轮复杂交互的系统指令上进行了充分训练,使模型可以跟随多样的系统指令,实现上下文(in-context)中的模型定制化,进一步提升了通义千问的可扩展性。 26 | 27 | Qwen-1.8-Chat and Qwen-72B-Chat have been fully trained on diverse system prompts with multiple rounds of complex interactions, so that they can follow a variety of system prompts and realize model customization in context, further improving the scalability of Qwen-chat. 28 | 29 | ## 系统指令能做什么? (What can System Prompts do?) 30 | 31 | ### 角色扮演 Role Play 32 | 33 | 在系统指令中告诉千问你需要它扮演的角色,即可沉浸式和该角色对话交流 34 | 35 | Tell Qwen-Chat the role you want it to play in the System Prompt, and you can have an immersive conversation with that role. 36 | 37 | 38 | ![](../assets/system_prompt_role_play.png) 39 | 40 | ![](../assets/system_prompt_role_play_en.png) 41 | 42 | ### 语言风格 Language Style 43 | 44 | 45 | 简单调整千问的语言风格 46 | 47 | Simple adjustment of the Qwen-Chat's language style 48 | 49 | ![](../assets/system_prompt_language_style.png) 50 | 51 | ![](../assets/system_prompt_language_style_en.png) 52 | 53 | ### 任务设定 Task Setting 54 | 55 | 指定具体任务,打造处理专项任务的千问模型 56 | 57 | Setting specific tasks and creating a Qwen-Chat model to handle special tasks 58 | 59 | ![](../assets/system_prompt_task_setting.png) 60 | 61 | ![](../assets/system_prompt_task_setting_en.png) 62 | 63 | ### 行为设定 Behavior Setting 64 | 65 | 设定千问对具体任务的行为模式 66 | 67 | Set behavior patterns of Qwen-Chat for specific tasks 68 | 69 | ![](../assets/system_prompt_behavior_setting.png) 70 | 71 | ![](../assets/system_prompt_behavior_setting_en.png) 72 | 73 | ## 代码示例 Example 74 | 75 | ```python 76 | from transformers import AutoModelForCausalLM, AutoTokenizer 77 | from transformers.generation import GenerationConfig 78 | 79 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-1_8B-Chat", trust_remote_code=True) 80 | 81 | # Only Qwen-72B-Chat and Qwen-1_8B-Chat has system prompt enhancement now. 82 | model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-1_8B-Chat", device_map="auto", trust_remote_code=True).eval() 83 | # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-72B-Chat", device_map="auto", trust_remote_code=True).eval() 84 | 85 | response, _ = model.chat(tokenizer, "你好呀", history=None, system="请用二次元可爱语气和我说话") 86 | print(response) 87 | # 你好啊!我是一只可爱的二次元猫咪哦,不知道你有什么问题需要我帮忙解答吗? 88 | 89 | response, _ = model.chat(tokenizer, "My colleague works diligently", history=None, system="You will write beautiful compliments according to needs") 90 | print(response) 91 | # Your colleague is an outstanding worker! Their dedication and hard work are truly inspiring. They always go above and beyond to ensure that their tasks are completed on time and to the highest standard. 
I am lucky to have them as a colleague, and I know I can count on them to handle any challenge that comes their way. 92 | ``` -------------------------------------------------------------------------------- /examples/transformers_agent.md: -------------------------------------------------------------------------------- 1 | ## 什么是HuggingFace Agent 2 | 使用大模型作为Agent,仅需自然语言就可调用HuggingFace中的模型,目前支持两种模式: 3 | 4 | - run模式:单轮对话,没有上下文,单个prompt多tool组合调用能力好 5 | - chat模式:多轮对话,有上下文,单次调用能力好,可能需要多次prompt实现多tool组合调用 6 | > 详见官方文档:[Transformers Agents](https://huggingface.co/docs/transformers/transformers_agents) 7 | 8 | ## 使用通义千问作为Agent 9 | ### 安装依赖 10 | ``` 11 | pip install transformers 12 | ``` 13 | ### 构建QWenAgent 14 | 以下代码便可实现QWenAgent: 15 | ```python 16 | import torch 17 | from transformers import AutoModelForCausalLM, AutoTokenizer, Agent 18 | from transformers.generation import GenerationConfig 19 | 20 | 21 | class QWenAgent(Agent): 22 | """ 23 | Agent that uses QWen model and tokenizer to generate code. 24 | 25 | Args: 26 | chat_prompt_template (`str`, *optional*): 27 | Pass along your own prompt if you want to override the default template for the `chat` method. Can be the 28 | actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named 29 | `chat_prompt_template.txt` in this repo in this case. 30 | run_prompt_template (`str`, *optional*): 31 | Pass along your own prompt if you want to override the default template for the `run` method. Can be the 32 | actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named 33 | `run_prompt_template.txt` in this repo in this case. 34 | additional_tools ([`Tool`], list of tools or dictionary with tool values, *optional*): 35 | Any additional tools to include on top of the default ones. If you pass along a tool with the same name as 36 | one of the default tools, that default tool will be overridden. 
37 | 38 | Example: 39 | 40 | ```py 41 | agent = QWenAgent() 42 | agent.run("Draw me a picture of rivers and lakes.") 43 | ``` 44 | """ 45 | def __init__(self, chat_prompt_template=None, run_prompt_template=None, additional_tools=None): 46 | checkpoint = "Qwen/Qwen-7B-Chat" 47 | self.tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True) 48 | self.model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", trust_remote_code=True).cuda().eval() 49 | self.model.generation_config = GenerationConfig.from_pretrained(checkpoint, trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参 50 | self.model.generation_config.do_sample = False # greedy 51 | 52 | super().__init__( 53 | chat_prompt_template=chat_prompt_template, 54 | run_prompt_template=run_prompt_template, 55 | additional_tools=additional_tools, 56 | ) 57 | 58 | def generate_one(self, prompt, stop): 59 | # "Human:" 和 "Assistant:" 曾为通义千问的特殊保留字,需要替换为 "_HUMAN_:" 和 "_ASSISTANT_:"。这一问题将在未来版本修复。 60 | prompt = prompt.replace("Human:", "_HUMAN_:").replace("Assistant:", "_ASSISTANT_:") 61 | stop = [item.replace("Human:", "_HUMAN_:").replace("Assistant:", "_ASSISTANT_:") for item in stop] 62 | 63 | result, _ = self.model.chat(self.tokenizer, prompt, history=None) 64 | for stop_seq in stop: 65 | if result.endswith(stop_seq): 66 | result = result[: -len(stop_seq)] 67 | 68 | result = result.replace("_HUMAN_:", "Human:").replace("_ASSISTANT_:", "Assistant:") 69 | return result 70 | 71 | 72 | agent = QWenAgent() 73 | agent.run("Draw me a picture of rivers and lakes.") 74 | ``` 75 | ### 使用示例 76 | ```python 77 | agent = QWenAgent() 78 | agent.run("generate an image of panda", remote=True) 79 | ``` 80 | ![](../assets/hfagent_run.png) 81 | ![](../assets/hfagent_chat_1.png) 82 | ![](../assets/hfagent_chat_2.png) 83 | > 更多玩法参考HuggingFace官方文档[Transformers Agents](https://huggingface.co/docs/transformers/transformers_agents) 84 | 85 | ## Tools 86 | ### Tools支持 87 | HuggingFace Agent官方14个tool: 88 | 89 | - **Document question answering**: given a document (such as a PDF) in image format, answer a question on this document (Donut) 90 | - **Text question answering**: given a long text and a question, answer the question in the text (Flan-T5) 91 | - **Unconditional image captioning**: Caption the image! 
(BLIP) 92 | - **Image question answering**: given an image, answer a question on this image (VILT) 93 | - **Image segmentation**: given an image and a prompt, output the segmentation mask of that prompt (CLIPSeg) 94 | - **Speech to text**: given an audio recording of a person talking, transcribe the speech into text (Whisper) 95 | - **Text to speech**: convert text to speech (SpeechT5) 96 | - **Zero-shot text classification**: given a text and a list of labels, identify to which label the text corresponds the most (BART) 97 | - **Text summarization**: summarize a long text in one or a few sentences (BART) 98 | - **Translation**: translate the text into a given language (NLLB) 99 | - **Text downloader**: to download a text from a web URL 100 | - **Text to image**: generate an image according to a prompt, leveraging stable diffusion 101 | - **Image transformation**: transforms an image 102 | - **Text to video**: generate a small video according to a prompt, leveraging damo-vilab 103 | ### Tools模型部署 104 | 部分工具涉及的模型HuggingFace已进行在线部署,仅需设置remote=True便可实现在线调用: 105 | > agent.run(xxx, remote=True) 106 | 107 | HuggingFace没有在线部署的模型会自动下载checkpoint进行本地inference 108 | 网络原因偶尔连不上HuggingFace,请多次尝试 109 | -------------------------------------------------------------------------------- /finetune/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | 23 | "scheduler": { 24 | "type": "WarmupLR", 25 | "params": { 26 | "warmup_min_lr": "auto", 27 | "warmup_max_lr": "auto", 28 | "warmup_num_steps": "auto" 29 | } 30 | }, 31 | 32 | "zero_optimization": { 33 | "stage": 2, 34 | "offload_optimizer": { 35 | "device": "none", 36 | "pin_memory": true 37 | }, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 2e8, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 2e8, 43 | "contiguous_gradients": true 44 | }, 45 | 46 | "gradient_accumulation_steps": "auto", 47 | "gradient_clipping": "auto", 48 | "steps_per_print": 100, 49 | "train_batch_size": "auto", 50 | "train_micro_batch_size_per_gpu": "auto", 51 | "wall_clock_breakdown": false 52 | } -------------------------------------------------------------------------------- /finetune/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | 23 | "scheduler": { 24 | "type": "WarmupLR", 25 | "params": { 26 | "warmup_min_lr": "auto", 27 | "warmup_max_lr": "auto", 28 | "warmup_num_steps": "auto" 29 | } 30 | }, 31 | 32 | "zero_optimization": { 33 | "stage": 3, 34 | "offload_optimizer": { 35 | "device": "none", 36 | "pin_memory": true 37 | }, 38 | "offload_param": { 39 | "device": "none", 40 | "pin_memory": true 41 | }, 42 | "overlap_comm": true, 43 | "contiguous_gradients": 
true, 44 | "sub_group_size": 1e9, 45 | "reduce_bucket_size": "auto", 46 | "stage3_prefetch_bucket_size": "auto", 47 | "stage3_param_persistence_threshold": "auto", 48 | "stage3_max_live_parameters": 1e9, 49 | "stage3_max_reuse_distance": 1e9, 50 | "stage3_gather_16bit_weights_on_model_save": true 51 | }, 52 | 53 | "gradient_accumulation_steps": "auto", 54 | "gradient_clipping": "auto", 55 | "steps_per_print": 100, 56 | "train_batch_size": "auto", 57 | "train_micro_batch_size_per_gpu": "auto", 58 | "wall_clock_breakdown": false 59 | } 60 | -------------------------------------------------------------------------------- /finetune/finetune_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | # Guide: 6 | # This script supports distributed training on multi-gpu workers (as well as single-worker training). 7 | # Please set the options below according to the comments. 8 | # For multi-gpu workers training, these options should be manually set for each worker. 9 | # After setting the options, please run the script on each worker. 10 | 11 | # Number of GPUs per GPU worker 12 | GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())') 13 | 14 | # Number of GPU workers, for single-worker training, please set to 1 15 | NNODES=${NNODES:-1} 16 | 17 | # The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0 18 | NODE_RANK=${NODE_RANK:-0} 19 | 20 | # The ip address of the rank-0 worker, for single-worker training, please set to localhost 21 | MASTER_ADDR=${MASTER_ADDR:-localhost} 22 | 23 | # The port for communication 24 | MASTER_PORT=${MASTER_PORT:-6001} 25 | 26 | MODEL="Qwen/Qwen-7B" # Set the path if you do not want to load from huggingface directly 27 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 28 | # See the section for finetuning in README for more information. 
29 | DATA="path_to_data" 30 | 31 | function usage() { 32 | echo ' 33 | Usage: bash finetune/finetune_ds.sh [-m MODEL_PATH] [-d DATA_PATH] 34 | ' 35 | } 36 | 37 | while [[ "$1" != "" ]]; do 38 | case $1 in 39 | -m | --model ) 40 | shift 41 | MODEL=$1 42 | ;; 43 | -d | --data ) 44 | shift 45 | DATA=$1 46 | ;; 47 | -h | --help ) 48 | usage 49 | exit 0 50 | ;; 51 | * ) 52 | echo "Unknown argument ${1}" 53 | exit 1 54 | ;; 55 | esac 56 | shift 57 | done 58 | 59 | DISTRIBUTED_ARGS=" 60 | --nproc_per_node $GPUS_PER_NODE \ 61 | --nnodes $NNODES \ 62 | --node_rank $NODE_RANK \ 63 | --master_addr $MASTER_ADDR \ 64 | --master_port $MASTER_PORT 65 | " 66 | 67 | torchrun $DISTRIBUTED_ARGS finetune.py \ 68 | --model_name_or_path $MODEL \ 69 | --data_path $DATA \ 70 | --bf16 True \ 71 | --output_dir output_qwen \ 72 | --num_train_epochs 5 \ 73 | --per_device_train_batch_size 1 \ 74 | --per_device_eval_batch_size 1 \ 75 | --gradient_accumulation_steps 16 \ 76 | --evaluation_strategy "no" \ 77 | --save_strategy "steps" \ 78 | --save_steps 1000 \ 79 | --save_total_limit 10 \ 80 | --learning_rate 1e-5 \ 81 | --weight_decay 0.1 \ 82 | --adam_beta2 0.95 \ 83 | --warmup_ratio 0.01 \ 84 | --lr_scheduler_type "cosine" \ 85 | --logging_steps 1 \ 86 | --report_to "none" \ 87 | --model_max_length 512 \ 88 | --gradient_checkpointing True \ 89 | --lazy_preprocess True \ 90 | --deepspeed finetune/ds_config_zero3.json 91 | -------------------------------------------------------------------------------- /finetune/finetune_lora_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | # Guide: 6 | # This script supports distributed training on multi-gpu workers (as well as single-worker training). 7 | # Please set the options below according to the comments. 8 | # For multi-gpu workers training, these options should be manually set for each worker. 9 | # After setting the options, please run the script on each worker. 10 | 11 | # Number of GPUs per GPU worker 12 | GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())') 13 | 14 | # Number of GPU workers, for single-worker training, please set to 1 15 | NNODES=${NNODES:-1} 16 | 17 | # The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0 18 | NODE_RANK=${NODE_RANK:-0} 19 | 20 | # The ip address of the rank-0 worker, for single-worker training, please set to localhost 21 | MASTER_ADDR=${MASTER_ADDR:-localhost} 22 | 23 | # The port for communication 24 | MASTER_PORT=${MASTER_PORT:-6001} 25 | 26 | MODEL="Qwen/Qwen-7B" # Set the path if you do not want to load from huggingface directly 27 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 28 | # See the section for finetuning in README for more information. 
29 | DATA="path_to_data" 30 | DS_CONFIG_PATH="finetune/ds_config_zero2.json" 31 | 32 | function usage() { 33 | echo ' 34 | Usage: bash finetune/finetune_lora_ds.sh [-m MODEL_PATH] [-d DATA_PATH] [--deepspeed DS_CONFIG_PATH] 35 | ' 36 | } 37 | 38 | while [[ "$1" != "" ]]; do 39 | case $1 in 40 | -m | --model ) 41 | shift 42 | MODEL=$1 43 | ;; 44 | -d | --data ) 45 | shift 46 | DATA=$1 47 | ;; 48 | --deepspeed ) 49 | shift 50 | DS_CONFIG_PATH=$1 51 | ;; 52 | -h | --help ) 53 | usage 54 | exit 0 55 | ;; 56 | * ) 57 | echo "Unknown argument ${1}" 58 | exit 1 59 | ;; 60 | esac 61 | shift 62 | done 63 | 64 | DISTRIBUTED_ARGS=" 65 | --nproc_per_node $GPUS_PER_NODE \ 66 | --nnodes $NNODES \ 67 | --node_rank $NODE_RANK \ 68 | --master_addr $MASTER_ADDR \ 69 | --master_port $MASTER_PORT 70 | " 71 | 72 | torchrun $DISTRIBUTED_ARGS finetune.py \ 73 | --model_name_or_path $MODEL \ 74 | --data_path $DATA \ 75 | --bf16 True \ 76 | --output_dir output_qwen \ 77 | --num_train_epochs 5 \ 78 | --per_device_train_batch_size 2 \ 79 | --per_device_eval_batch_size 1 \ 80 | --gradient_accumulation_steps 8 \ 81 | --evaluation_strategy "no" \ 82 | --save_strategy "steps" \ 83 | --save_steps 1000 \ 84 | --save_total_limit 10 \ 85 | --learning_rate 3e-4 \ 86 | --weight_decay 0.1 \ 87 | --adam_beta2 0.95 \ 88 | --warmup_ratio 0.01 \ 89 | --lr_scheduler_type "cosine" \ 90 | --logging_steps 1 \ 91 | --report_to "none" \ 92 | --model_max_length 512 \ 93 | --lazy_preprocess True \ 94 | --use_lora \ 95 | --gradient_checkpointing \ 96 | --deepspeed ${DS_CONFIG_PATH} 97 | -------------------------------------------------------------------------------- /finetune/finetune_lora_single_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | 4 | MODEL="Qwen/Qwen-7B" # Set the path if you do not want to load from huggingface directly 5 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 6 | # See the section for finetuning in README for more information. 
7 | DATA="path_to_data" 8 | 9 | function usage() { 10 | echo ' 11 | Usage: bash finetune/finetune_lora_single_gpu.sh [-m MODEL_PATH] [-d DATA_PATH] 12 | ' 13 | } 14 | 15 | while [[ "$1" != "" ]]; do 16 | case $1 in 17 | -m | --model ) 18 | shift 19 | MODEL=$1 20 | ;; 21 | -d | --data ) 22 | shift 23 | DATA=$1 24 | ;; 25 | -h | --help ) 26 | usage 27 | exit 0 28 | ;; 29 | * ) 30 | echo "Unknown argument ${1}" 31 | exit 1 32 | ;; 33 | esac 34 | shift 35 | done 36 | 37 | export CUDA_VISIBLE_DEVICES=0 38 | 39 | python finetune.py \ 40 | --model_name_or_path $MODEL \ 41 | --data_path $DATA \ 42 | --bf16 True \ 43 | --output_dir output_qwen \ 44 | --num_train_epochs 5 \ 45 | --per_device_train_batch_size 2 \ 46 | --per_device_eval_batch_size 1 \ 47 | --gradient_accumulation_steps 8 \ 48 | --evaluation_strategy "no" \ 49 | --save_strategy "steps" \ 50 | --save_steps 1000 \ 51 | --save_total_limit 10 \ 52 | --learning_rate 3e-4 \ 53 | --weight_decay 0.1 \ 54 | --adam_beta2 0.95 \ 55 | --warmup_ratio 0.01 \ 56 | --lr_scheduler_type "cosine" \ 57 | --logging_steps 1 \ 58 | --report_to "none" \ 59 | --model_max_length 512 \ 60 | --lazy_preprocess True \ 61 | --gradient_checkpointing \ 62 | --use_lora 63 | 64 | # If you use fp16 instead of bf16, you should use deepspeed 65 | # --fp16 True --deepspeed finetune/ds_config_zero2.json 66 | -------------------------------------------------------------------------------- /finetune/finetune_qlora_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | # Guide: 6 | # This script supports distributed training on multi-gpu workers (as well as single-worker training). 7 | # Please set the options below according to the comments. 8 | # For multi-gpu workers training, these options should be manually set for each worker. 9 | # After setting the options, please run the script on each worker. 10 | 11 | # Number of GPUs per GPU worker 12 | GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())') 13 | 14 | # Number of GPU workers, for single-worker training, please set to 1 15 | NNODES=${NNODES:-1} 16 | 17 | # The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0 18 | NODE_RANK=${NODE_RANK:-0} 19 | 20 | # The ip address of the rank-0 worker, for single-worker training, please set to localhost 21 | MASTER_ADDR=${MASTER_ADDR:-localhost} 22 | 23 | # The port for communication 24 | MASTER_PORT=${MASTER_PORT:-6001} 25 | 26 | MODEL="Qwen/Qwen-7B-Chat-Int4" # Set the path if you do not want to load from huggingface directly 27 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 28 | # See the section for finetuning in README for more information. 
29 | DATA="path_to_data" 30 | 31 | function usage() { 32 | echo ' 33 | Usage: bash finetune/finetune_qlora_ds.sh [-m MODEL_PATH] [-d DATA_PATH] 34 | ' 35 | } 36 | 37 | while [[ "$1" != "" ]]; do 38 | case $1 in 39 | -m | --model ) 40 | shift 41 | MODEL=$1 42 | ;; 43 | -d | --data ) 44 | shift 45 | DATA=$1 46 | ;; 47 | -h | --help ) 48 | usage 49 | exit 0 50 | ;; 51 | * ) 52 | echo "Unknown argument ${1}" 53 | exit 1 54 | ;; 55 | esac 56 | shift 57 | done 58 | 59 | DISTRIBUTED_ARGS=" 60 | --nproc_per_node $GPUS_PER_NODE \ 61 | --nnodes $NNODES \ 62 | --node_rank $NODE_RANK \ 63 | --master_addr $MASTER_ADDR \ 64 | --master_port $MASTER_PORT 65 | " 66 | 67 | # Remember to use --fp16 instead of --bf16 due to autogptq 68 | torchrun $DISTRIBUTED_ARGS finetune.py \ 69 | --model_name_or_path $MODEL \ 70 | --data_path $DATA \ 71 | --fp16 True \ 72 | --output_dir output_qwen \ 73 | --num_train_epochs 5 \ 74 | --per_device_train_batch_size 2 \ 75 | --per_device_eval_batch_size 1 \ 76 | --gradient_accumulation_steps 8 \ 77 | --evaluation_strategy "no" \ 78 | --save_strategy "steps" \ 79 | --save_steps 1000 \ 80 | --save_total_limit 10 \ 81 | --learning_rate 3e-4 \ 82 | --weight_decay 0.1 \ 83 | --adam_beta2 0.95 \ 84 | --warmup_ratio 0.01 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --report_to "none" \ 88 | --model_max_length 512 \ 89 | --lazy_preprocess True \ 90 | --use_lora \ 91 | --q_lora \ 92 | --gradient_checkpointing \ 93 | --deepspeed finetune/ds_config_zero2.json 94 | -------------------------------------------------------------------------------- /finetune/finetune_qlora_single_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | MODEL="Qwen/Qwen-7B-Chat-Int4" # Set the path if you do not want to load from huggingface directly 6 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 7 | # See the section for finetuning in README for more information. 
8 | DATA="path_to_data" 9 | 10 | function usage() { 11 | echo ' 12 | Usage: bash finetune/finetune_qlora_single_gpu.sh [-m MODEL_PATH] [-d DATA_PATH] 13 | ' 14 | } 15 | 16 | while [[ "$1" != "" ]]; do 17 | case $1 in 18 | -m | --model ) 19 | shift 20 | MODEL=$1 21 | ;; 22 | -d | --data ) 23 | shift 24 | DATA=$1 25 | ;; 26 | -h | --help ) 27 | usage 28 | exit 0 29 | ;; 30 | * ) 31 | echo "Unknown argument ${1}" 32 | exit 1 33 | ;; 34 | esac 35 | shift 36 | done 37 | 38 | export CUDA_VISIBLE_DEVICES=0 39 | 40 | # Remember to use --fp16 instead of --bf16 due to autogptq 41 | python finetune.py \ 42 | --model_name_or_path $MODEL \ 43 | --data_path $DATA \ 44 | --fp16 True \ 45 | --output_dir output_qwen \ 46 | --num_train_epochs 5 \ 47 | --per_device_train_batch_size 2 \ 48 | --per_device_eval_batch_size 1 \ 49 | --gradient_accumulation_steps 8 \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 1000 \ 53 | --save_total_limit 10 \ 54 | --learning_rate 3e-4 \ 55 | --weight_decay 0.1 \ 56 | --adam_beta2 0.95 \ 57 | --warmup_ratio 0.01 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --report_to "none" \ 61 | --model_max_length 512 \ 62 | --lazy_preprocess True \ 63 | --gradient_checkpointing \ 64 | --use_lora \ 65 | --q_lora \ 66 | --deepspeed finetune/ds_config_zero2.json 67 | -------------------------------------------------------------------------------- /recipes/applications/chatbot/qwen_chatbot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "54d5d255-aa98-4655-8dd1-bc726430d86a", 6 | "metadata": {}, 7 | "source": [ 8 | "# Qwen-7B-Chat Chat Demo" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "31e04af4-eb27-4802-a7b2-6ea0525f1dc8", 14 | "metadata": {}, 15 | "source": [ 16 | "This notebook uses Qwen-7B-Chat as an example to introduce you to how to build a web-based conversational assistant using Gradio." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "75e51155-9f8e-40dc-8432-60f4567d93a8", 22 | "metadata": {}, 23 | "source": [ 24 | "## Preparation" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "ff6f061c-a033-49f2-8f7d-af3f23ac9125", 30 | "metadata": {}, 31 | "source": [ 32 | "Download Qwen-7B-Chat\n", 33 | "\n", 34 | "Firstly, we need to download the model. You can use the snapshot_download that comes with modelscope to download the model to a specified directory." 
35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "c469a129-451f-4d01-8bc0-e2cf70a262c8", 41 | "metadata": { 42 | "tags": [] 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "!pip install modelscope" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "69af626e-22b8-49ad-8869-8354f4c72bcc", 53 | "metadata": { 54 | "tags": [] 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "from modelscope.hub.snapshot_download import snapshot_download\n", 59 | "snapshot_download(\"qwen/Qwen-7B-Chat\",cache_dir='/tmp/models') " 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "id": "01d2ff34-4053-4710-a289-e354673be1ca", 65 | "metadata": {}, 66 | "source": [ 67 | "## Install Dependencies" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "48b51791-4bbc-4d12-9cd6-587c24c8bea7", 74 | "metadata": { 75 | "tags": [] 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "!pip install -r ../../../requirements.txt\n", 80 | "!pip install gradio==3.37.0 mdtex2html" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "7732037a-246a-4953-af07-dae7a3ae5937", 86 | "metadata": {}, 87 | "source": [ 88 | "## Run the web UI code to start the Qwen chatbot\n", 89 | "\n", 90 | "Users can run the web_demo.py file to have real-time conversations with Qwen-7b-chat on the webpage." 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "9e256f0a-d96d-4fd7-b305-fe43c6959dc8", 97 | "metadata": { 98 | "ExecutionIndicator": { 99 | "show": true 100 | }, 101 | "tags": [] 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "!python ../../../web_demo.py -c /tmp/models/qwen/Qwen-7B-Chat" 106 | ] 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "display_name": "Python 3 (ipykernel)", 112 | "language": "python", 113 | "name": "python3" 114 | }, 115 | "language_info": { 116 | "codemirror_mode": { 117 | "name": "ipython", 118 | "version": 3 119 | }, 120 | "file_extension": ".py", 121 | "mimetype": "text/x-python", 122 | "name": "python", 123 | "nbconvert_exporter": "python", 124 | "pygments_lexer": "ipython3", 125 | "version": "3.10.13" 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 5 130 | } 131 | -------------------------------------------------------------------------------- /recipes/finetune/ascend/README.md: -------------------------------------------------------------------------------- 1 | # Fine-tuning Qwen by Ascend NPU 2 | Below, we provide a simple example to show how to finetune Qwen by Ascend NPU. Currently, fine-tuning and inference are supported for Qwen 7B and 14B models. You can also refer to the official [mindformers](https://gitee.com/mindspore/mindformers/blob/dev/research/qwen/qwen.md) for detailed usage. 3 | 4 | ## Environment Requirement 5 | 6 | - Hardware: Ascend 910A/B 7 | 8 | ## Quickstart 9 | 10 | 1. 
Launch Docker Image 11 | 12 | ```bash 13 | ImageID=pai-image-manage-registry.cn-wulanchabu.cr.aliyuncs.com/pai/llm-inference:qwen_v23.0.rc3 14 | docker run -it -u root --ipc=host \ 15 | --device=/dev/davinci0 \ 16 | --device=/dev/davinci1 \ 17 | --device=/dev/davinci2 \ 18 | --device=/dev/davinci3 \ 19 | --device=/dev/davinci4 \ 20 | --device=/dev/davinci5 \ 21 | --device=/dev/davinci6 \ 22 | --device=/dev/davinci7 \ 23 | --device=/dev/davinci_manager \ 24 | --device=/dev/devmm_svm \ 25 | --device=/dev/hisi_hdc \ 26 | -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ 27 | -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \ 28 | -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ 29 | -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \ 30 | -v /etc/ascend_install.info:/etc/ascend_install.info \ 31 | -v /var/log/npu/:/usr/slog \ 32 | -v /etc/hccn.conf:/etc/hccn.conf \ 33 | ${ImageID} /bin/bash 34 | ``` 35 | 36 | 2. Download and Convert model 37 | 38 | - Download model by modelscope 39 | 40 | ```bash 41 | cd mindformers 42 | python3 -c "from modelscope.hub.snapshot_download import snapshot_download; snapshot_download('Qwen/Qwen-7B-Chat', cache_dir='.', revision='master')" 43 | ``` 44 | 45 | - Convert HF model weights to ckpt weights 46 | 47 | ```bash 48 | python research/qwen/convert_weight.py \ 49 | --torch_ckpt_dir Qwen/Qwen-7B-Chat \ 50 | --mindspore_ckpt_path qwen-7b-chat.ckpt 51 | 52 | mkdir -vp load_checkpoint/rank_0 53 | mv qwen-7b-chat.ckpt load_checkpoint/rank_0/ 54 | ``` 55 | 56 | 3. Prepare training data 57 | 58 | - Download demo data 59 | 60 | ```bash 61 | wget -c https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/alpaca_data_min.json 62 | ``` 63 | 64 | - Convert the raw data to the specified format 65 | 66 | ```bash 67 | python research/qwen/alpaca_converter.py \ 68 | --data_path alpaca_data_min.json \ 69 | --output_path alpaca-data-conversation_min.json 70 | ``` 71 | 72 | - Generate Mindrecord data 73 | 74 | ```bash 75 | python research/qwen/qwen_preprocess.py \ 76 | --input_glob alpaca-data-conversation_min.json \ 77 | --model_file Qwen/Qwen-7B-Chat/qwen.tiktoken \ 78 | --seq_length 1024 \ 79 | --output_file alpaca_min.mindrecord 80 | ``` 81 | 82 | 4. Prepare RANK_TABLE_FILE 83 | 84 | ```bash 85 | # generate RANK_TABLE_FILE with 8 NPUs 86 | python mindformers/tools/hccl_tools.py --device_num "[0,8)" 87 | ``` 88 | 89 | 5. Fine-tune 90 | 91 | You need to replace RANK_TABLE_FILE with the file generated in step 4. 92 | 93 | ```bash 94 | export MS_ASCEND_CHECK_OVERFLOW_MODE=INFNAN_MODE 95 | bash research/run_singlenode.sh "python3 research/qwen/run_qwen.py \ 96 | --config research/qwen/run_qwen_7b.yaml \ 97 | --load_checkpoint /mindformers/research/qwen/load_checkpoint \ 98 | --vocab_file Qwen/Qwen-7B-Chat/qwen.tiktoken \ 99 | --use_parallel True \ 100 | --run_mode finetune \ 101 | --auto_trans_ckpt True \ 102 | --train_data alpaca_min.mindrecord" \ 103 | RANK_TABLE_FILE [0,8] 8 104 | ``` 105 | 106 | 6.
Merge model weights 107 | 108 | - Rename model weights 109 | 110 | ```bash 111 | cd output/checkpoint_network 112 | mv rank_0/qwen_rank_0-network.ckpt rank_0/checkpoint_0.ckpt 113 | mv rank_1/qwen_rank_1-network.ckpt rank_1/checkpoint_1.ckpt 114 | mv rank_2/qwen_rank_2-network.ckpt rank_2/checkpoint_2.ckpt 115 | mv rank_3/qwen_rank_3-network.ckpt rank_3/checkpoint_3.ckpt 116 | mv rank_4/qwen_rank_4-network.ckpt rank_4/checkpoint_4.ckpt 117 | mv rank_5/qwen_rank_5-network.ckpt rank_5/checkpoint_5.ckpt 118 | mv rank_6/qwen_rank_6-network.ckpt rank_6/checkpoint_6.ckpt 119 | mv rank_7/qwen_rank_7-network.ckpt rank_7/checkpoint_7.ckpt 120 | cd ../.. 121 | ``` 122 | 123 | - Merge model weights 124 | 125 | ```bash 126 | python mindformers/tools/transform_ckpt.py \ 127 | --src_ckpt_strategy output/strategy \ 128 | --src_ckpt_dir output/checkpoint_network \ 129 | --dst_ckpt_dir output/merged_model 130 | ``` 131 | 132 | 7. Inference fine-tuned model 133 | 134 | ```bash 135 | python research/qwen/run_qwen.py \ 136 | --config research/qwen/run_qwen_7b.yaml \ 137 | --predict_data '比较适合深度学习入门的书籍有' \ 138 | --run_mode predict \ 139 | --load_checkpoint output/merged_model/rank_0/checkpoint_0.ckpt \ 140 | --vocab_file Qwen/Qwen-7B-Chat/qwen.tiktoken \ 141 | --auto_trans_ckpt False \ 142 | --device_id 0 143 | ``` -------------------------------------------------------------------------------- /recipes/finetune/deepspeed/finetune_fullparameter_multi_gpu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "6e6981ab-2d9a-4280-923f-235a166855ba", 6 | "metadata": {}, 7 | "source": [ 8 | "# Fine-Tuning Qwen-Chat Large Language Model (Multiple GPUs)\n", 9 | "\n", 10 | "Tongyi Qianwen is a large language model developed by Alibaba Cloud based on the Transformer architecture, trained on an extensive set of pre-training data. The pre-training data is diverse and covers a wide range, including a large amount of internet text, specialized books, code, etc. In addition, an AI assistant called Qwen-Chat has been created based on the pre-trained model using alignment mechanism.\n", 11 | "\n", 12 | "This notebook uses Qwen-1.8B-Chat as an example to introduce how to fine-tune the Qianwen model using Deepspeed.\n", 13 | "\n", 14 | "## Environment Requirements\n", 15 | "\n", 16 | "Please refer to **requirements.txt** to install the required dependencies.\n", 17 | "\n", 18 | "## Preparation\n", 19 | "\n", 20 | "### Download Qwen-1.8B-Chat\n", 21 | "\n", 22 | "First, download the model files. You can choose to download directly from ModelScope." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "248488f9-4a86-4f35-9d56-50f8e91a8f11", 29 | "metadata": { 30 | "ExecutionIndicator": { 31 | "show": true 32 | }, 33 | "tags": [] 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from modelscope.hub.snapshot_download import snapshot_download\n", 38 | "model_dir = snapshot_download('Qwen/Qwen-1_8B-Chat', cache_dir='.', revision='master')" 39 | ] 40 | }, 41 | { 42 | "attachments": {}, 43 | "cell_type": "markdown", 44 | "id": "7b2a92b1-f08e-4413-9f92-8f23761e6e1f", 45 | "metadata": {}, 46 | "source": [ 47 | "### Download Example Training Data\n", 48 | "\n", 49 | "Download the data required for training; here, we provide a tiny dataset as an example. 
It is sampled from [Belle](https://github.com/LianjiaTech/BELLE).\n", 50 | "\n", 51 | "Disclaimer: the dataset can be only used for the research purpose." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "ce195f08-fbb2-470e-b6c0-9a03457458c7", 58 | "metadata": { 59 | "tags": [] 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "!wget https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com/release/tutorials/qwen_recipes/Belle_sampled_qwen.json" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "id": "7226bed0-171b-4d45-a3f9-b3d81ec2bb9f", 69 | "metadata": {}, 70 | "source": [ 71 | "You can also refer to this format to prepare the dataset. Below is a simple example list with 1 sample:\n", 72 | "\n", 73 | "```json\n", 74 | "[\n", 75 | " {\n", 76 | " \"id\": \"identity_0\",\n", 77 | " \"conversations\": [\n", 78 | " {\n", 79 | " \"from\": \"user\",\n", 80 | " \"value\": \"你好\"\n", 81 | " },\n", 82 | " {\n", 83 | " \"from\": \"assistant\",\n", 84 | " \"value\": \"我是一个语言模型,我叫通义千问。\"\n", 85 | " }\n", 86 | " ]\n", 87 | " }\n", 88 | "]\n", 89 | "```\n", 90 | "\n", 91 | "You can also use multi-turn conversations as the training set. Here is a simple example:\n", 92 | "\n", 93 | "```json\n", 94 | "[\n", 95 | " {\n", 96 | " \"id\": \"identity_0\",\n", 97 | " \"conversations\": [\n", 98 | " {\n", 99 | " \"from\": \"user\",\n", 100 | " \"value\": \"你好,能告诉我遛狗的最佳时间吗?\"\n", 101 | " },\n", 102 | " {\n", 103 | " \"from\": \"assistant\",\n", 104 | " \"value\": \"当地最佳遛狗时间因地域差异而异,请问您所在的城市是哪里?\"\n", 105 | " },\n", 106 | " {\n", 107 | " \"from\": \"user\",\n", 108 | " \"value\": \"我在纽约市。\"\n", 109 | " },\n", 110 | " {\n", 111 | " \"from\": \"assistant\",\n", 112 | " \"value\": \"纽约市的遛狗最佳时间通常在早晨6点至8点和晚上8点至10点之间,因为这些时间段气温较低,遛狗更加舒适。但具体时间还需根据气候、气温和季节变化而定。\"\n", 113 | " }\n", 114 | " ]\n", 115 | " }\n", 116 | "]\n", 117 | "```\n", 118 | "\n", 119 | "## Fine-Tune the Model\n", 120 | "\n", 121 | "You can directly run the prepared training script to fine-tune the model. **nproc_per_node** refers to the number of GPUs used fro training." 
122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "7ab0581e-be85-45e6-a5b7-af9c42ea697b", 128 | "metadata": { 129 | "ExecutionIndicator": { 130 | "show": true 131 | }, 132 | "tags": [] 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "!torchrun --nproc_per_node 2 --nnodes 1 --node_rank 0 --master_addr localhost --master_port 6601 ../../finetune.py \\\n", 137 | " --model_name_or_path \"Qwen/Qwen-1_8B-Chat/\" \\\n", 138 | " --data_path \"Belle_sampled_qwen.json\" \\\n", 139 | " --bf16 True \\\n", 140 | " --output_dir \"output_qwen\" \\\n", 141 | " --num_train_epochs 5 \\\n", 142 | " --per_device_train_batch_size 1 \\\n", 143 | " --per_device_eval_batch_size 1 \\\n", 144 | " --gradient_accumulation_steps 16 \\\n", 145 | " --evaluation_strategy \"no\" \\\n", 146 | " --save_strategy \"steps\" \\\n", 147 | " --save_steps 1000 \\\n", 148 | " --save_total_limit 10 \\\n", 149 | " --learning_rate 1e-5 \\\n", 150 | " --weight_decay 0.1 \\\n", 151 | " --adam_beta2 0.95 \\\n", 152 | " --warmup_ratio 0.01 \\\n", 153 | " --lr_scheduler_type \"cosine\" \\\n", 154 | " --logging_steps 1 \\\n", 155 | " --report_to \"none\" \\\n", 156 | " --model_max_length 512 \\\n", 157 | " --gradient_checkpointing True \\\n", 158 | " --lazy_preprocess True \\\n", 159 | " --deepspeed \"../../finetune/ds_config_zero2.json\"" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "## Test the Model\n", 167 | "\n", 168 | "We can test the model as follows:" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n", 178 | "from transformers.generation import GenerationConfig\n", 179 | "\n", 180 | "tokenizer = AutoTokenizer.from_pretrained(\"output_qwen\", trust_remote_code=True)\n", 181 | "model = AutoModelForCausalLM.from_pretrained(\n", 182 | " \"output_qwen\",\n", 183 | " device_map=\"auto\",\n", 184 | " trust_remote_code=True\n", 185 | ").eval()\n", 186 | "\n", 187 | "response, history = model.chat(tokenizer, \"你好\", history=None)\n", 188 | "print(response)" 189 | ] 190 | } 191 | ], 192 | "metadata": { 193 | "kernelspec": { 194 | "display_name": "Python 3 (ipykernel)", 195 | "language": "python", 196 | "name": "python3" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 3 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython3", 208 | "version": "3.10.13" 209 | } 210 | }, 211 | "nbformat": 4, 212 | "nbformat_minor": 5 213 | } 214 | -------------------------------------------------------------------------------- /recipes/finetune/deepspeed/finetune_fullparameter_single_gpu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "6e6981ab-2d9a-4280-923f-235a166855ba", 6 | "metadata": {}, 7 | "source": [ 8 | "# Fine-Tuning Qwen-Chat Large Language Model (Single GPU)\n", 9 | "\n", 10 | "Tongyi Qianwen is a large language model developed by Alibaba Cloud based on the Transformer architecture, trained on an extensive set of pre-training data. The pre-training data is diverse and covers a wide range, including a large amount of internet text, specialized books, code, etc. 
In addition, an AI assistant called Qwen-Chat has been created based on the pre-trained model using alignment mechanism.\n", 11 | "\n", 12 | "This notebook uses Qwen-1.8B-Chat as an example to introduce how to fine-tune the Qianwen model using Deepspeed.\n", 13 | "\n", 14 | "## Environment Requirements\n", 15 | "\n", 16 | "Please refer to **requirements.txt** to install the required dependencies.\n", 17 | "\n", 18 | "## Preparation\n", 19 | "\n", 20 | "### Download Qwen-1.8B-Chat\n", 21 | "\n", 22 | "First, download the model files. You can choose to download directly from ModelScope." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "248488f9-4a86-4f35-9d56-50f8e91a8f11", 29 | "metadata": { 30 | "ExecutionIndicator": { 31 | "show": true 32 | }, 33 | "execution": { 34 | "iopub.execute_input": "2023-12-31T03:19:11.059814Z", 35 | "iopub.status.busy": "2023-12-31T03:19:11.059177Z", 36 | "iopub.status.idle": "2023-12-31T03:21:54.157827Z", 37 | "shell.execute_reply": "2023-12-31T03:21:54.157333Z", 38 | "shell.execute_reply.started": "2023-12-31T03:19:11.059783Z" 39 | }, 40 | "tags": [] 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "from modelscope.hub.snapshot_download import snapshot_download\n", 45 | "model_dir = snapshot_download('Qwen/Qwen-1_8B-Chat', cache_dir='.', revision='master')" 46 | ] 47 | }, 48 | { 49 | "attachments": {}, 50 | "cell_type": "markdown", 51 | "id": "7b2a92b1-f08e-4413-9f92-8f23761e6e1f", 52 | "metadata": {}, 53 | "source": [ 54 | "### Download Example Training Data\n", 55 | "\n", 56 | "Download the data required for training; here, we provide a tiny dataset as an example. It is sampled from [Belle](https://github.com/LianjiaTech/BELLE).\n", 57 | "\n", 58 | "Disclaimer: the dataset can be only used for the research purpose." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "ce195f08-fbb2-470e-b6c0-9a03457458c7", 65 | "metadata": { 66 | "execution": { 67 | "iopub.execute_input": "2023-12-31T03:21:57.596577Z", 68 | "iopub.status.busy": "2023-12-31T03:21:57.595847Z", 69 | "iopub.status.idle": "2023-12-31T03:21:57.971112Z", 70 | "shell.execute_reply": "2023-12-31T03:21:57.970576Z", 71 | "shell.execute_reply.started": "2023-12-31T03:21:57.596555Z" 72 | }, 73 | "tags": [] 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "!wget https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com/release/tutorials/qwen_recipes/Belle_sampled_qwen.json" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "7226bed0-171b-4d45-a3f9-b3d81ec2bb9f", 83 | "metadata": {}, 84 | "source": [ 85 | "You can also refer to this format to prepare the dataset. Below is a simple example list with 1 sample:\n", 86 | "\n", 87 | "```json\n", 88 | "[\n", 89 | " {\n", 90 | " \"id\": \"identity_0\",\n", 91 | " \"conversations\": [\n", 92 | " {\n", 93 | " \"from\": \"user\",\n", 94 | " \"value\": \"你好\"\n", 95 | " },\n", 96 | " {\n", 97 | " \"from\": \"assistant\",\n", 98 | " \"value\": \"我是一个语言模型,我叫通义千问。\"\n", 99 | " }\n", 100 | " ]\n", 101 | " }\n", 102 | "]\n", 103 | "```\n", 104 | "\n", 105 | "You can also use multi-turn conversations as the training set. 
Here is a simple example:\n", 106 | "\n", 107 | "```json\n", 108 | "[\n", 109 | " {\n", 110 | " \"id\": \"identity_0\",\n", 111 | " \"conversations\": [\n", 112 | " {\n", 113 | " \"from\": \"user\",\n", 114 | " \"value\": \"你好,能告诉我遛狗的最佳时间吗?\"\n", 115 | " },\n", 116 | " {\n", 117 | " \"from\": \"assistant\",\n", 118 | " \"value\": \"当地最佳遛狗时间因地域差异而异,请问您所在的城市是哪里?\"\n", 119 | " },\n", 120 | " {\n", 121 | " \"from\": \"user\",\n", 122 | " \"value\": \"我在纽约市。\"\n", 123 | " },\n", 124 | " {\n", 125 | " \"from\": \"assistant\",\n", 126 | " \"value\": \"纽约市的遛狗最佳时间通常在早晨6点至8点和晚上8点至10点之间,因为这些时间段气温较低,遛狗更加舒适。但具体时间还需根据气候、气温和季节变化而定。\"\n", 127 | " }\n", 128 | " ]\n", 129 | " }\n", 130 | "]\n", 131 | "```\n", 132 | "\n", 133 | "\n", 134 | "## Fine-Tune the Model\n", 135 | "\n", 136 | "You can directly run the prepared training script to fine-tune the model." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "id": "7ab0581e-be85-45e6-a5b7-af9c42ea697b", 143 | "metadata": { 144 | "ExecutionIndicator": { 145 | "show": true 146 | }, 147 | "execution": { 148 | "iopub.execute_input": "2023-12-31T03:23:52.455178Z", 149 | "iopub.status.busy": "2023-12-31T03:23:52.454615Z", 150 | "iopub.status.idle": "2023-12-31T03:24:15.699948Z", 151 | "shell.execute_reply": "2023-12-31T03:24:15.699358Z", 152 | "shell.execute_reply.started": "2023-12-31T03:23:52.455144Z" 153 | }, 154 | "tags": [] 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "!python ../../finetune.py \\\n", 159 | " --model_name_or_path \"Qwen/Qwen-1_8B-Chat/\"\\\n", 160 | " --data_path \"Belle_sampled_qwen.json\"\\\n", 161 | " --bf16 \\\n", 162 | " --output_dir \"output_qwen\" \\\n", 163 | " --num_train_epochs 5 \\\n", 164 | " --per_device_train_batch_size 1 \\\n", 165 | " --per_device_eval_batch_size 1 \\\n", 166 | " --gradient_accumulation_steps 16 \\\n", 167 | " --evaluation_strategy \"no\" \\\n", 168 | " --save_strategy \"steps\" \\\n", 169 | " --save_steps 1000 \\\n", 170 | " --save_total_limit 10 \\\n", 171 | " --learning_rate 1e-5 \\\n", 172 | " --weight_decay 0.1 \\\n", 173 | " --adam_beta2 0.95 \\\n", 174 | " --warmup_ratio 0.01 \\\n", 175 | " --lr_scheduler_type \"cosine\" \\\n", 176 | " --logging_steps 1 \\\n", 177 | " --report_to \"none\" \\\n", 178 | " --model_max_length 512 \\\n", 179 | " --gradient_checkpointing \\\n", 180 | " --lazy_preprocess" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "## Test the Model\n", 188 | "\n", 189 | "We can test the model as follows:" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n", 199 | "from transformers.generation import GenerationConfig\n", 200 | "\n", 201 | "tokenizer = AutoTokenizer.from_pretrained(\"output_qwen\", trust_remote_code=True)\n", 202 | "model = AutoModelForCausalLM.from_pretrained(\n", 203 | " \"output_qwen\",\n", 204 | " device_map=\"auto\",\n", 205 | " trust_remote_code=True\n", 206 | ").eval()\n", 207 | "\n", 208 | "response, history = model.chat(tokenizer, \"你好\", history=None)\n", 209 | "print(response)" 210 | ] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 3 (ipykernel)", 216 | "language": "python", 217 | "name": "python3" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 3 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": 
"text/x-python", 226 | "name": "python", 227 | "nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython3", 229 | "version": "3.10.13" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 5 234 | } 235 | -------------------------------------------------------------------------------- /recipes/finetune/deepspeed/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed 2 | peft -------------------------------------------------------------------------------- /recipes/inference/dashscope/README.md: -------------------------------------------------------------------------------- 1 | # Inference Qwen Using DashScope 2 | 3 | The most simple way to use Qwen through APIs is DashScope API service through Alibaba Cloud. We give an introduction to the usage. Additionally, we provide a script for you to deploy an OpenAI-style API on your own servers. 4 | 5 | DashScope is the large language model API service provided by Alibaba Cloud, which now supports Qwen. Note that the models behind DashScope are in-house versions temporarily without details provided. The services include `qwen-turbo` and `qwen-plus`, where the former one runs faster and the latter achieves better performance. For more information, visit the documentation [here](https://dashscope.aliyun.com). 6 | 7 | Please head to the official website [link](https://help.aliyun.com/zh/dashscope/developer-reference/activate-dashscope-and-create-an-api-key?spm=a2c4g.11186623.0.0.6c2774fahtfXdn) to create a DashScope account and obtain the API key (AK). We recommend setting the AK with an environment variable: 8 | ```bash 9 | export DASHSCOPE_API_KEY="YOUR_DASHSCOPE_API_KEY" 10 | ``` 11 | Then please install the packages and click [here](https://help.aliyun.com/zh/dashscope/developer-reference/install-dashscope-sdk) for the documentation. If you use Python, you can install DashScope with pip: 12 | ```bash 13 | pip install dashscope 14 | ``` 15 | If you use JAVA SDK, you can install it in this way: 16 | ```xml 17 | <!-- https://mvnrepository.com/artifact/com.alibaba/dashscope-sdk-java --> 18 | <dependency> 19 | <groupId>com.alibaba</groupId> 20 | <artifactId>dashscope-sdk-java</artifactId> 21 | <version>the-latest-version</version> 22 | </dependency> 23 | ``` 24 | The simplest way to use DashScope is the usage with messages, which is similar to OpenAI API. The example is demonstrated below: 25 | ```python 26 | import random 27 | from http import HTTPStatus 28 | from dashscope import Generation 29 | 30 | 31 | def call_with_messages(): 32 | messages = [{'role': 'system', 'content': 'You are a helpful assistant.'}, 33 | {'role': 'user', 'content': '如何做西红柿鸡蛋?'}] 34 | gen = Generation() 35 | response = gen.call( 36 | Generation.Models.qwen_turbo, 37 | messages=messages, 38 | seed=random.randint(1, 10000), # set the random seed, optional, default to 1234 if not set 39 | result_format='message', # set the result to be "message" format. 40 | ) 41 | return response 42 | 43 | 44 | if __name__ == '__main__': 45 | response = call_with_messages() 46 | if response.status_code == HTTPStatus.OK: 47 | print(response) 48 | else: 49 | print('Request id: %s, Status code: %s, error code: %s, error message: %s' % ( 50 | response.request_id, response.status_code, 51 | response.code, response.message 52 | )) 53 | ``` 54 | For more usages, please visit the official website for more details. 
55 | <br><br> 56 | 57 | -------------------------------------------------------------------------------- /recipes/inference/quantization/README.md: -------------------------------------------------------------------------------- 1 | # Quantization 2 | 3 | ## GPTQ 4 | 5 | We provide a solution based on [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), and release the Int4 and Int8 quantized models, which achieve nearly lossless model effects but improved performance on both memory costs and inference speed. 6 | 7 | Here we demonstrate how to use our provided quantized models for inference. Before you start, make sure you meet the requirements of auto-gptq (e.g., torch 2.0 and above, transformers 4.32.0 and above, etc.) and install the required packages: 8 | 9 | ```bash 10 | pip install auto-gptq optimum 11 | ``` 12 | 13 | If you meet problems installing `auto-gptq`, we advise you to check out the official [repo](https://github.com/PanQiWei/AutoGPTQ) to find a wheel. 14 | 15 | > Note: The pre-compiled `auto-gptq` packages strongly depend on the version of `torch` and its CUDA version. Moreover, due to recent update, 16 | > you may also encounter unsupported version errors from `transformers`, `optimum`, or `peft`. 17 | > We recommend using the latest versions meeting the following requirements: 18 | > - torch==2.1 auto-gptq>=0.5.1 transformers>=4.35.0 optimum>=1.14.0 peft>=0.6.1 19 | > - torch>=2.0,<2.1 auto-gptq<0.5.0 transformers<4.35.0 optimum<1.14.0 peft>=0.5.0,<0.6.0 20 | 21 | Then you can load the quantized model easily and run inference as same as usual: 22 | 23 | ```python 24 | # Model names: "Qwen/Qwen-7B-Chat-Int4", "Qwen/Qwen-14B-Chat-Int4" 25 | model = AutoModelForCausalLM.from_pretrained( 26 | "Qwen/Qwen-7B-Chat-Int4", 27 | device_map="auto", 28 | trust_remote_code=True 29 | ).eval() 30 | response, history = model.chat(tokenizer, "Hi", history=None) 31 | ``` 32 | 33 | We illustrate the model performance of both BF16, Int8 and Int4 models on the benchmark, and we find that the quantized model does not suffer from significant performance degradation. Results are shown below: 34 | 35 | | Quantization | MMLU | CEval (val) | GSM8K | Humaneval | 36 | |----------------------|:----:|:-----------:|:-----:|:---------:| 37 | | Qwen-1.8B-Chat (BF16)| 43.3 | 55.6 | 33.7 | 26.2 | 38 | | Qwen-1.8B-Chat (Int8)| 43.1 | 55.8 | 33.0 | 27.4 | 39 | | Qwen-1.8B-Chat (Int4)| 42.9 | 52.8 | 31.2 | 25.0 | 40 | | Qwen-7B-Chat (BF16) | 55.8 | 59.7 | 50.3 | 37.2 | 41 | | Qwen-7B-Chat (Int8) | 55.4 | 59.4 | 48.3 | 34.8 | 42 | | Qwen-7B-Chat (Int4) | 55.1 | 59.2 | 49.7 | 29.9 | 43 | | Qwen-14B-Chat (BF16) | 64.6 | 69.8 | 60.1 | 43.9 | 44 | | Qwen-14B-Chat (Int8) | 63.6 | 68.6 | 60.0 | 48.2 | 45 | | Qwen-14B-Chat (Int4) | 63.3 | 69.0 | 59.8 | 45.7 | 46 | | Qwen-72B-Chat (BF16) | 74.4 | 80.1 | 76.4 | 64.6 | 47 | | Qwen-72B-Chat (Int8) | 73.5 | 80.1 | 73.5 | 62.2 | 48 | | Qwen-72B-Chat (Int4) | 73.4 | 80.1 | 75.3 | 61.6 | 49 | 50 | ## Quantization of KV cache 51 | 52 | > NOTE: Please be aware that due to the internal mechanism of Hugging Face, the support files for this functionality 53 | > (i.e., `cache_autogptq_cuda_256.cpp` and `cache_autogptq_cuda_kernel_256.cu`) may be missing. Please manually download 54 | > them from the Hugging Face Hub and place them into the same folder as the other module files. 55 | 56 | The attention KV cache can be quantized and compressed for storage, to get a higher sample throughput. 
The arguments `use_cache_quantization` and `use_cache_kernel` in `config.json` are provided to enable KV cache quantization. The specific use method is as follows: 57 | ```python 58 | model = AutoModelForCausalLM.from_pretrained( 59 | "Qwen/Qwen-7B-Chat", 60 | device_map="auto", 61 | trust_remote_code=True, 62 | use_cache_quantization=True, 63 | use_cache_kernel=True, 64 | use_flash_attn=False 65 | ) 66 | ``` 67 | Attention: Currently, KV cache quantization and flash attention cannot be used at the same time. 68 | If you enable KV cache quantization and flash attention at the same time (`use_flash_attn=True`, `use_cache_quantization=True`, `use_cache_kernel=True`), `use_flash_attn` is disabled by default (`use_flash_attn=false`). 69 | 70 | We have verified that the use of the quantized Int8-KV-Cache model does not suffer from significant performance degradation in downstream evaluation. In the following, we focus on profiling its memory footprint in different conditions. 71 | The profiling runs on a single A100-SXM4-80G GPU with PyTorch 2.0.1 and CUDA 11.4. 72 | We use BF16 models to generate 1024 tokens by default, and "OOM" indicates out-of-memory error. 73 | 74 | With KV cache quantization, the model can infer with a larger batch size (bs). 75 | 76 | | USE KV Cache | bs=1 | bs=4 | bs=16 | bs=32 | bs=64 | bs=100 | 77 | |--------------|:------:|:------:|:------:|:------:|:------:|:------:| 78 | | No | 16.3GB | 24.1GB | 31.7GB | 48.7GB | OOM | OOM | 79 | | Yes | 15.5GB | 17.2GB | 22.3GB | 30.2GB | 48.2GB | 72.4GB | 80 | 81 | With KV cache quantization the model can save more memory when generating longer sequence (`sl`, sequence length, referring to the number of tokens generated) at the stage of inference. 82 | 83 | | USE KV Cache | sl=512 | sl=1024 | sl=2048 | sl=4096 | sl=8192 | 84 | |--------------|:------:|:-------:|:-------:|:-------:|:-------:| 85 | | No | 15.2GB | 16.3GB | 17.6GB | 19.5GB | 23.2GB | 86 | | Yes | 15GB | 15.5GB | 15.8GB | 16.6GB | 17.6GB | 87 | 88 | The model with KV cache quantization will convert the format of `layer_past` from float to int8, and meanwhile the quantized `layer-past` will also store the quantization parameters. 89 | 90 | Specific steps are as follows: 91 | 92 | 1. Quantize key/value 93 | ``` 94 | qv,scale,zero_point=quantize_cache_v(v) 95 | ``` 96 | 2. Store into layer_past 97 | 98 | The following is the format of quantized `layer_past`: 99 | ``` 100 | layer_past=((q_key,key_scale,key_zero_point), 101 | (q_value,value_scale,value_zero_point)) 102 | ``` 103 | 104 | The original format of `layer_past` is shown below: 105 | ``` 106 | layer_past=(key,value) 107 | ``` 108 | 109 | If you want to use the attention KV which is quantized, you can use the dequantization operation to convert the Int8 key/value back to the float format as follows: 110 | ``` 111 | v=dequantize_cache_torch(qv,scale,zero_point) 112 | ``` 113 | <br> -------------------------------------------------------------------------------- /recipes/inference/tensorrt/README.md: -------------------------------------------------------------------------------- 1 | # Inference Qwen Using TensorRT-LLM 2 | Below, we provide a simple example to show how to inference Qwen by TensorRT-LLM. We recommend using GPUs with compute capability of at least SM_80 such as A10 and A800 to run this example, as we have tested on these GPUs. You can find your gpu compute capability on this [link](https://developer.nvidia.com/cuda-gpus). 
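If you are unsure whether your GPU meets the SM_80 recommendation, you can also query the compute capability locally with PyTorch (assuming a CUDA-enabled PyTorch installation); the NVIDIA page linked above remains the authoritative reference. A minimal check looks like this:
```python
import torch

# SM_80 corresponds to a (major, minor) compute capability of (8, 0).
major, minor = torch.cuda.get_device_capability()
print(f"Compute capability: SM_{major}{minor}")
if (major, minor) < (8, 0):
    print("This GPU is below SM_80; the steps below are untested on it.")
```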
3 | 4 | ## Installation 5 | You can use pre-built docker image to run this example. Simultaneously, You can also refer to the official [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) for installation and detailed usage. 6 | ```bash 7 | docker run --gpus all -it --ipc=host --network=host pai-image-manage-registry.cn-wulanchabu.cr.aliyuncs.com/pai/llm-inference:tensorrt-llm-0.8.0 bash 8 | ``` 9 | ## Quickstart 10 | 1. Download model by modelscope 11 | 12 | ```bash 13 | cd TensorRT-LLM/examples/qwen 14 | python3 -c "from modelscope.hub.snapshot_download import snapshot_download; snapshot_download('Qwen/Qwen-1_8B-Chat', cache_dir='.', revision='master')" 15 | mkdir -p ./tmp/Qwen 16 | mv Qwen/Qwen-1_8B-Chat ./tmp/Qwen/1_8B 17 | ``` 18 | 19 | 2. Build TensorRT engine from HF checkpoint 20 | 21 | ```bash 22 | python3 build.py --hf_model_dir ./tmp/Qwen/1_8B/ \ 23 | --dtype float16 \ 24 | --remove_input_padding \ 25 | --use_gpt_attention_plugin float16 \ 26 | --enable_context_fmha \ 27 | --use_gemm_plugin float16 \ 28 | --output_dir ./tmp/Qwen/1_8B/trt_engines/fp16/1-gpu/ 29 | ``` 30 | 31 | 3. Inference 32 | ```bash 33 | python3 ../run.py --input_text "你好,请问你叫什么?" \ 34 | --max_output_len=512 \ 35 | --tokenizer_dir ./tmp/Qwen/1_8B/ \ 36 | --engine_dir=./tmp/Qwen/1_8B/trt_engines/fp16/1-gpu 37 | ``` 38 | ``` 39 | Input [Text 0]: "<|im_start|>system 40 | You are a helpful assistant.<|im_end|> 41 | <|im_start|>user 42 | 你好,请问你叫什么?<|im_end|> 43 | <|im_start|>assistant 44 | " 45 | Output [Text 0 Beam 0]: "你好,我是来自阿里云的大规模语言模型,我叫通义千问。" 46 | ``` 47 | -------------------------------------------------------------------------------- /recipes/inference/tensorrt/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 2 | 3 | RUN apt-get update && \ 4 | apt-get -y install python3.10 python3-pip openmpi-bin libopenmpi-dev git && \ 5 | rm -rf /var/lib/apt/lists/* 6 | 7 | RUN pip install tensorrt_llm==0.8.0.dev2024011601 -U --no-cache-dir --pre --extra-index-url https://pypi.nvidia.com 8 | 9 | RUN pip install --no-cache-dir modelscope==1.11.1 10 | 11 | RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \ 12 | cd TensorRT-LLM && \ 13 | git checkout c89653021e66ca78c55f02b366f404455bc12e8d && \ 14 | pip install --no-cache-dir -r examples/qwen/requirements.txt -------------------------------------------------------------------------------- /recipes/inference/vllm/README.md: -------------------------------------------------------------------------------- 1 | # Inference Qwen Using vLLM 2 | 3 | For deployment and fast inference, we suggest using vLLM. 4 | 5 | ## Installation 6 | 7 | If you use cuda 12.1 and pytorch 2.1, you can directly use the following command to install vLLM. 8 | ```bash 9 | # Install vLLM with CUDA 12.1. 10 | pip install vllm 11 | ``` 12 | Otherwise, please refer to the official vLLM [Installation Instructions](https://docs.vllm.ai/en/latest/getting_started/installation.html). 13 | 14 | If you have trouble building vLLM, we recommend using Docker image. 15 | 16 | ```bash 17 | docker run --gpus all -it --rm --ipc=host --network=host qwenllm/qwen:cu121 bash 18 | ``` 19 | 20 | ## GPU Requirements 21 | 22 | Qwen model use Bfloat16 by default, but Bfloat16 is only supported on GPUs with compute capability of at least 8. For GPUs with compute capability less than 8.0, it is recommended to set the dtype to float16. 
You can find your gpu compute capability on this [link](https://developer.nvidia.com/cuda-gpus). 23 | 24 | We have tested the GPU memory usage on NVIDIA Tesla V100 32GB by manually adjusting gpu-memory-utilization in eager mode, you can refer to the following table to determine whether your machine is capable of running these models. 25 | | Model | seq_len 2048 | seq_len 8192 | seq_len 16384 | seq_len 32768 | 26 | | :--- | ---: | ---: | ---: | ---: | 27 | | Qwen-1.8B | 6.22G | 7.46G | | | 28 | | Qwen-7B | 17.94G | 20.96G | | | 29 | | Qwen-7B-Int4 | 9.10G | 12.26G | | | 30 | | Qwen-14B | 33.40G | | | | 31 | | Qwen-14B-Int4 | 13.30G | | | | 32 | | Qwen-72B | 166.87G | 185.50G | 210.80G | 253.80G | 33 | | Qwen-72B-int4 | 55.37G | 73.66G | 97.79G | 158.80G | 34 | 35 | We have also listed the models that can run on consumer graphics cards by default sequence length in the following table. If the GPU memory only exceeds the model's memory usage by a small margin, you can make the model run on your machine by reducing the max-model-len parameter.</br> 36 | (ps: To run Qwen-14B-Int4 on NVIDIA RTX 3080Ti, you need to set gpu-memory-utilization as 0.99 and enforce eager mode) 37 | 38 | | GPU Memory | GPU | Support Model | 39 | | :---: | :---: | :---: | 40 | | 24GB | NVIDIA RTX 4090/3090/A5000 | Qwen-1.8B/Qwen-7B/Qwen-7B-Int4/Qwen-14B-Int4 | 41 | | 16GB | NVIDIA RTX A4000 | Qwen-1.8B/Qwen-7B-Int4/Qwen-14B-Int4 | 42 | | 12GB | NVIDIA RTX 3080Ti/TITAN Xp | Qwen-1.8B/Qwen-14B-Int4 | 43 | | 11GB | NVIDIA RTX 2080Ti/GTX 1080Ti | Qwen-1.8B | 44 | | 10GB | NVIDIA RTX 3080 | Qwen-1.8B | 45 | 46 | ## Usage 47 | 48 | ### vLLM + Web Demo / OpenAI-like API 49 | 50 | You can use FastChat to launch a web demo or an OpenAI API server. First, install FastChat: 51 | 52 | ```bash 53 | pip install "fschat[model_worker,webui]=0.2.33" "openai<1.0" 54 | ``` 55 | 56 | To run Qwen with vLLM and FastChat, you need launch a controller by: 57 | ```bash 58 | python -m fastchat.serve.controller 59 | ``` 60 | 61 | Then you can launch the model worker, which means loading your model for inference. For single GPU inference, you can directly run: 62 | ```bash 63 | python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code --dtype bfloat16 64 | # run int4 model or GPUs with compute capability less than 8.0 65 | # python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code --dtype float16 66 | ``` 67 | 68 | However, if you hope to run the model on multiple GPUs for faster inference or larger memory, you can use tensor parallelism supported by vLLM. Suppose you run the model on 4 GPUs, the command is shown below: 69 | ```bash 70 | python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code --tensor-parallel-size 4 --dtype bfloat16 71 | # run int4 model or GPUs with compute capability less than 8.0 72 | # python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code --tensor-parallel-size 4 --dtype float16 73 | ``` 74 | 75 | After launching your model worker, you can launch a: 76 | 77 | * Web UI Demo 78 | ```bash 79 | python -m fastchat.serve.gradio_web_server 80 | ``` 81 | 82 | * OpenAI API 83 | ```bash 84 | python -m fastchat.serve.openai_api_server --host localhost --port 8000 85 | ``` 86 | 87 | For OpenAI API server, you can invoke the server in the following manner. 
88 | 89 | ```python 90 | import openai 91 | openai.api_base = "http://localhost:8000/v1" 92 | openai.api_key = "none" 93 | 94 | # create a request activating streaming response 95 | for chunk in openai.ChatCompletion.create( 96 | model="Qwen", 97 | messages=[ 98 | {"role": "user", "content": "你好"} 99 | ], 100 | stream=True 101 | # Specifying stop words in streaming output format is not yet supported and is under development. 102 | ): 103 | if hasattr(chunk.choices[0].delta, "content"): 104 | print(chunk.choices[0].delta.content, end="", flush=True) 105 | 106 | # create a request not activating streaming response 107 | response = openai.ChatCompletion.create( 108 | model="Qwen", 109 | messages=[ 110 | {"role": "user", "content": "你好"} 111 | ], 112 | stream=False, 113 | stop=[] # You can add custom stop words here, e.g., stop=["Observation:"] for ReAct prompting. 114 | ) 115 | print(response.choices[0].message.content) 116 | ``` 117 | 118 | If you find `"POST /v1/chat/completions HTTP/1.1" 200 OK` in openai_api_server log, it indicates that the call was successful. 119 | 120 | vLLM does not support dynamic-NTK ROPE. Therefore, extending long sequences for Qwen model may lead to quality degradation(even gibberish). 121 | 122 | ### vLLM + Transformer-like Wrapper 123 | 124 | You can download the [wrapper codes](vllm_wrapper.py) and execute the following commands for multiple rounds of dialogue interaction. (Note: It currently only supports the ``model.chat()`` method.) 125 | 126 | ```python 127 | from vllm_wrapper import vLLMWrapper 128 | 129 | # Bfloat16 is only supported on GPUs with compute capability of at least 8.0, 130 | model = vLLMWrapper('Qwen/Qwen-7B-Chat', tensor_parallel_size=1) 131 | 132 | # run int4 model or GPUs with compute capability less than 8.0 133 | # model = vLLMWrapper('Qwen/Qwen-7B-Chat-Int4', tensor_parallel_size=1, dtype="float16") 134 | 135 | response, history = model.chat(query="你好", history=None) 136 | print(response) 137 | response, history = model.chat(query="给我讲一个年轻人奋斗创业最终取得成功的故事。", history=history) 138 | print(response) 139 | response, history = model.chat(query="给这个故事起一个标题", history=history) 140 | print(response) 141 | ``` 142 | ### vLLM Standalone OpenAI-like API 143 | 144 | You can also deploy an OpenAI API server independently through vLLM. First, you need to download [chat template file](template_chatml.jinja). 145 | 146 | Then, you can launch an OpenAI API server by following command: 147 | 148 | ```bash 149 | python -m vllm.entrypoints.openai.api_server --model $model_path --trust-remote-code --chat-template template_chatml.jinja 150 | 151 | # run int4 model or GPUs with compute capability less than 8.0 152 | # python -m vllm.entrypoints.openai.api_server --model $model_path --trust-remote-code --dtype float16 --chat-template template_chatml.jinja 153 | ``` 154 | 155 | For vLLM standalone OpenAI API server, You need to set the `stop_token_ids` parameter to `[151645]` or `stop` parameter to `["<|im_end|>"]` when invoking the server. 
156 | 157 | ```python 158 | import openai 159 | openai.api_base = "http://localhost:8000/v1" 160 | openai.api_key = "none" 161 | 162 | # create a request activating streaming response 163 | for chunk in openai.ChatCompletion.create( 164 | model="Qwen", 165 | messages=[ 166 | {"role": "user", "content": "你好"} 167 | ], 168 | stream=True, 169 | stop_token_ids=[151645] 170 | ): 171 | if hasattr(chunk.choices[0].delta, "content"): 172 | print(chunk.choices[0].delta.content, end="", flush=True) 173 | 174 | # create a request not activating streaming response 175 | response = openai.ChatCompletion.create( 176 | model="Qwen", 177 | messages=[ 178 | {"role": "user", "content": "你好"} 179 | ], 180 | stream=False, 181 | stop_token_ids=[151645] 182 | ) 183 | print(response.choices[0].message.content) 184 | ``` -------------------------------------------------------------------------------- /recipes/inference/vllm/template_chatml.jinja: -------------------------------------------------------------------------------- 1 | {% for message in messages %} 2 | {% if loop.first and message['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %} 3 | {{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>\n' }} 4 | {% if loop.last and add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %} 5 | {% endfor %} 6 | -------------------------------------------------------------------------------- /recipes/tests/README.md: -------------------------------------------------------------------------------- 1 | # Unit testing 2 | - Run all unit testing 3 | ```bash 4 | cd tests && pytest -s 5 | ``` 6 | - Run unit testing under a single folder 7 | ```bash 8 | cd tests && pytest -s {dir} 9 | ``` 10 | - Rerun the test cases that failed in the last run 11 | ```bash 12 | cd tests && pytest -s --lf 13 | ``` -------------------------------------------------------------------------------- /recipes/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen/df5866f1be1fb3055c4585ae7518e1518ac81199/recipes/tests/__init__.py -------------------------------------------------------------------------------- /recipes/tests/assets/test_sampled_qwen.json: -------------------------------------------------------------------------------- 1 | [{"conversations": [{"from": "user", "value": "你好"}, {"from": "assistant", "value": "你好!很高兴为你提供帮助。"}], "id": "identity_0"}, {"conversations": [{"from": "user", "value": "给我讲一个年轻人奋斗创业最终取得成功的故事。"}, {"from": "assistant", "value": "这是一个关于一个年轻人奋斗创业最终取得成功的故事。"}], "id": "identity_1"}] -------------------------------------------------------------------------------- /recipes/tests/test_finetune/test_finetune_ds.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pytest 4 | import shutil 5 | from itertools import product 6 | import torch 7 | from modelscope.hub.snapshot_download import snapshot_download 8 | 9 | sys.path.append(os.path.dirname(__file__) + "/..") 10 | from utils import run_in_subprocess 11 | from ut_config import ( 12 | MODEL_TYPE, 13 | DOCKER_VERSION_CU114, 14 | DOCKER_VERSION_CU117, 15 | DOCKER_VERSION_CU121, 16 | DOCKER_MOUNT_DIR, 17 | DOCKER_TEST_DIR, 18 | DATA_DIR, 19 | DS_CONFIG_ZERO2_DIR, 20 | DS_CONFIG_ZERO3_DIR, 21 | ) 22 | 23 | is_chat = ["chat", "base"] 24 | docker_version = [DOCKER_VERSION_CU114, DOCKER_VERSION_CU117, DOCKER_VERSION_CU121] 25 | # ZeRO3 is 
incompatible with LoRA when finetuning on base model. 26 | # FSDP or ZeRO3 are incompatible with QLoRA. 27 | parametrize_list_none_ds = list( 28 | product(*[[1], ["full", "lora"], is_chat, docker_version, [None]]) 29 | ) 30 | parametrize_list_ds_zero2 = list( 31 | product(*[[2], ["full", "lora"], is_chat, docker_version, [DS_CONFIG_ZERO2_DIR]]) 32 | ) 33 | parametrize_list_ds_zero3 = list( 34 | product(*[[2], ["full"], is_chat, docker_version, [DS_CONFIG_ZERO3_DIR]]) 35 | ) + list(product(*[[2], ["lora"], ["chat"], docker_version, [DS_CONFIG_ZERO3_DIR]])) 36 | parametrize_list_qlora = list( 37 | product(*[[1, 2], ["qlora"], ["chat"], docker_version, [None, DS_CONFIG_ZERO2_DIR]]) 38 | ) 39 | parametrize_list = ( 40 | parametrize_list_none_ds 41 | + parametrize_list_ds_zero2 42 | + parametrize_list_ds_zero3 43 | + parametrize_list_qlora 44 | ) 45 | 46 | 47 | @pytest.mark.parametrize( 48 | "num_gpus,train_type,is_chat,docker_version,deepspeed", parametrize_list 49 | ) 50 | def test_finetune(num_gpus, train_type, is_chat, docker_version, deepspeed): 51 | cmd_docker = f"docker run --gpus all --ipc=host --network=host --rm -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {docker_version} /bin/bash -c " 52 | cmd = "" 53 | # for GPUs SM < 80 54 | is_ampere = torch.cuda.get_device_capability()[0] >= 8 55 | if not is_ampere: 56 | cmd = f"pip uninstall -y flash-attn && " 57 | 58 | model_type = f"{MODEL_TYPE}-Chat" if is_chat == "chat" else MODEL_TYPE 59 | model_type = f"{model_type}-Int4" if train_type == "qlora" else model_type 60 | cmd += f"""torchrun --nproc_per_node {num_gpus} --nnodes 1 --node_rank 0 --master_addr localhost --master_port 12345 {DOCKER_MOUNT_DIR}/finetune.py \ 61 | --model_name_or_path "{DOCKER_TEST_DIR}/{model_type}/" \ 62 | --data_path {DATA_DIR} \ 63 | --output_dir "{DOCKER_TEST_DIR}/output_qwen" \ 64 | --num_train_epochs 1 \ 65 | --per_device_train_batch_size 1 \ 66 | --per_device_eval_batch_size 1 \ 67 | --gradient_accumulation_steps 2 \ 68 | --evaluation_strategy "no" \ 69 | --save_strategy "steps" \ 70 | --save_steps 1000 \ 71 | --save_total_limit 10 \ 72 | --learning_rate 1e-5 \ 73 | --weight_decay 0.1 \ 74 | --adam_beta2 0.95 \ 75 | --warmup_ratio 0.01 \ 76 | --lr_scheduler_type "cosine" \ 77 | --logging_steps 1 \ 78 | --report_to "none" \ 79 | --model_max_length 512""" 80 | if deepspeed: 81 | cmd += f" --deepspeed {deepspeed}" 82 | if train_type == "lora": 83 | cmd += " --use_lora" 84 | elif train_type == "qlora": 85 | cmd += " --use_lora --q_lora" 86 | # for SM < 80 87 | if ( 88 | (not is_ampere) 89 | and train_type == "lora" 90 | and (deepspeed and "zero2" in deepspeed) 91 | and is_chat == "base" 92 | ): 93 | cmd += " --fp16 True" 94 | snapshot_download(model_type, cache_dir=".", revision="master") 95 | run_in_subprocess(cmd_docker + f'"{cmd}"') 96 | if train_type == "full": 97 | assert os.path.exists("output_qwen/config.json") 98 | else: 99 | assert os.path.exists("output_qwen/adapter_config.json") 100 | shutil.rmtree("output_qwen") 101 | -------------------------------------------------------------------------------- /recipes/tests/test_inference/test_inference_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import pytest 5 | import subprocess 6 | import torch 7 | from modelscope.hub.snapshot_download import snapshot_download 8 | 9 | sys.path.append(os.path.dirname(__file__) + "/..") 10 | from utils import run_in_subprocess, simple_openai_api, TelnetPort 11 | from ut_config 
import ( 12 | MODEL_TYPE, 13 | DOCKER_VERSION_CU114, 14 | DOCKER_VERSION_CU117, 15 | DOCKER_VERSION_CU121, 16 | DOCKER_MOUNT_DIR, 17 | DOCKER_TEST_DIR, 18 | ) 19 | 20 | 21 | # use_cpu=True,use_int=False RuntimeError: "addmm_impl_cpu_" not implemented for 'Half' 22 | # use_cpu=True,use_int4=True ValueError: Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU.You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object 23 | @pytest.mark.parametrize( 24 | "docker_version,use_cpu,use_int4", 25 | [ 26 | (DOCKER_VERSION_CU114, False, False), 27 | (DOCKER_VERSION_CU114, False, True), 28 | (DOCKER_VERSION_CU117, False, False), 29 | (DOCKER_VERSION_CU117, False, True), 30 | (DOCKER_VERSION_CU121, False, False), 31 | (DOCKER_VERSION_CU121, False, True), 32 | ], 33 | ) 34 | def test_inference_api(docker_version, use_cpu, use_int4): 35 | container_name = "test_inference_api" 36 | model_type = f"{MODEL_TYPE}-Chat-Int4" if use_int4 else f"{MODEL_TYPE}-Chat" 37 | cmd_docker = f'docker run --gpus all --ipc=host --network=host --rm --name="{container_name}" -p 8000:8000 -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {docker_version} /bin/bash -c ' 38 | cmd = "" 39 | # for GPUs SM < 80 40 | is_ampere = torch.cuda.get_device_capability()[0] >= 8 41 | if not is_ampere: 42 | cmd += f"pip uninstall -y flash-attn && " 43 | 44 | cmd += f"""python {DOCKER_MOUNT_DIR}/openai_api.py -c {DOCKER_TEST_DIR}/{model_type}""" 45 | 46 | if use_cpu: 47 | cmd += " --cpu-only" 48 | 49 | snapshot_download(model_type, cache_dir=".", revision="master") 50 | # start model server 51 | print(cmd_docker + f'"{cmd}"') 52 | run_in_subprocess( 53 | f'docker rm -f {container_name} 2>/dev/null || echo "The container does not exist."' 54 | ) 55 | run_in_subprocess("nohup " + cmd_docker + f'"{cmd}"' + " > tmp.log 2>&1 &") 56 | 57 | while not TelnetPort("localhost", 8000): 58 | print("Wait for the model service start.") 59 | time.sleep(0.5) 60 | 61 | if ( 62 | subprocess.run( 63 | f"docker inspect {container_name}", 64 | shell=True, 65 | stdout=subprocess.DEVNULL, 66 | ).returncode 67 | != 0 68 | ): 69 | break 70 | try: 71 | # while load int4 model such as Qwen-1_8B-Chat-Int4, the model name is Qwen-1_8B-Chat 72 | simple_openai_api(f"{MODEL_TYPE}-Chat".split("/")[-1]) 73 | except Exception as e: 74 | time.sleep(1) 75 | with open("tmp.log") as f: 76 | raise Exception(f"{e} \n {f.read()}") 77 | 78 | run_in_subprocess(f"docker rm -f {container_name}") 79 | -------------------------------------------------------------------------------- /recipes/tests/test_inference/test_inference_vllm_fschat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import pytest 5 | import subprocess 6 | import torch 7 | from modelscope.hub.snapshot_download import snapshot_download 8 | 9 | sys.path.append(os.path.dirname(__file__) + "/..") 10 | from utils import run_in_subprocess, simple_openai_api, TelnetPort 11 | from ut_config import ( 12 | MODEL_TYPE, 13 | DOCKER_VERSION_CU121, 14 | DOCKER_MOUNT_DIR, 15 | DOCKER_TEST_DIR, 16 | ) 17 | 18 | 19 | @pytest.mark.parametrize( 20 | "num_gpus,use_int4", 21 | [ 22 | (1, False), 23 | (1, True), 24 | (2, False), 25 | # ValueError: The input size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size. 
26 | # (2, True) 27 | ], 28 | ) 29 | def test_inference_vllm_fschat(num_gpus, use_int4): 30 | model_type = f"{MODEL_TYPE}-Chat-Int4" if use_int4 else f"{MODEL_TYPE}-Chat" 31 | container_name = "test_inference_vllm_fschat" 32 | cmd_docker = f'docker run --gpus all --ipc=host --network=host --rm --name="{container_name}" -p 8000:8000 -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {DOCKER_VERSION_CU121} /bin/bash -c ' 33 | cmd = "" 34 | 35 | cmd += f"""nohup python -m fastchat.serve.controller > /dev/null 2>&1 \ 36 | & python -m fastchat.serve.openai_api_server --host localhost --port 8000 > /dev/null 2>&1 \ 37 | & python -m fastchat.serve.vllm_worker --model-path {DOCKER_TEST_DIR}/{model_type} --tensor-parallel-size {num_gpus} --trust-remote-code""" 38 | 39 | # for GPUS SM < 80 and use_int==True 40 | is_ampere = torch.cuda.get_device_capability()[0] >= 8 41 | if not is_ampere or use_int4: 42 | cmd += " --dtype half" 43 | 44 | snapshot_download(model_type, cache_dir=".", revision="master") 45 | # start model server 46 | run_in_subprocess( 47 | f'docker rm -f {container_name} 2>/dev/null || echo "The container does not exist."' 48 | ) 49 | print(cmd_docker + f'"{cmd}"') 50 | run_in_subprocess("nohup " + cmd_docker + f'"{cmd}"' + " > tmp.log 2>&1 &") 51 | 52 | while not TelnetPort("localhost", 21002): 53 | print("Wait for the model service start.") 54 | time.sleep(0.5) 55 | 56 | if ( 57 | subprocess.run( 58 | f"docker inspect {container_name}", 59 | shell=True, 60 | stdout=subprocess.DEVNULL, 61 | ).returncode 62 | != 0 63 | ): 64 | break 65 | 66 | try: 67 | simple_openai_api(model_type.split("/")[-1]) 68 | except Exception as e: 69 | time.sleep(1) 70 | with open("tmp.log") as f: 71 | raise Exception(f"{e} \n {f.read()}") 72 | 73 | run_in_subprocess(f"docker rm -f {container_name}") 74 | -------------------------------------------------------------------------------- /recipes/tests/ut_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # common 4 | MODEL_TYPE = "Qwen/Qwen-1_8B" 5 | DOCKER_VERSION_CU114 = "qwenllm/qwen:cu114" 6 | DOCKER_VERSION_CU117 = "qwenllm/qwen:cu117" 7 | DOCKER_VERSION_CU121 = "qwenllm/qwen:cu121" 8 | DOCKER_MOUNT_DIR = "/qwen-recipes" 9 | DOCKER_TEST_DIR = os.path.join(DOCKER_MOUNT_DIR, "recipes/tests") 10 | 11 | # finetune 12 | DATA_DIR = os.path.join(DOCKER_MOUNT_DIR, "recipes/tests/assets/test_sampled_qwen.json") 13 | DS_CONFIG_ZERO2_DIR = os.path.join( 14 | DOCKER_MOUNT_DIR, "finetune/ds_config_zero2.json" 15 | ) 16 | DS_CONFIG_ZERO3_DIR = os.path.join( 17 | DOCKER_MOUNT_DIR, "finetune/ds_config_zero3.json" 18 | ) 19 | -------------------------------------------------------------------------------- /recipes/tests/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | import socket 4 | import openai 5 | 6 | 7 | def run_in_subprocess(cmd): 8 | try: 9 | with subprocess.Popen( 10 | cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE 11 | ) as return_info: 12 | while True: 13 | next_line = return_info.stdout.readline() 14 | return_line = next_line.decode("utf-8", "ignore").strip() 15 | if return_line == "" and return_info.poll() != None: 16 | break 17 | if return_line != "": 18 | logging.info(return_line) 19 | 20 | err_lines = "" 21 | while True: 22 | next_line = return_info.stderr.readline() 23 | return_line = next_line.decode("utf-8", "ignore").strip() 24 | if return_line == "" and return_info.poll() != None: 25 | 
break 26 | if return_line != "": 27 | logging.info(return_line) 28 | err_lines += return_line + "\n" 29 | 30 | return_code = return_info.wait() 31 | if return_code: 32 | raise RuntimeError(err_lines) 33 | except Exception as e: 34 | raise e 35 | 36 | 37 | def simple_openai_api(model): 38 | openai.api_base = "http://localhost:8000/v1" 39 | openai.api_key = "none" 40 | 41 | # create a request not activating streaming response 42 | response = openai.ChatCompletion.create( 43 | model=model, 44 | messages=[{"role": "user", "content": "你好"}], 45 | stream=False, 46 | stop=[], # You can add custom stop words here, e.g., stop=["Observation:"] for ReAct prompting. 47 | ) 48 | print(response.choices[0].message.content) 49 | 50 | 51 | def TelnetPort(server_ip, port): 52 | sk = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 53 | sk.settimeout(1) 54 | connect_flag = False 55 | try: 56 | sk.connect((server_ip, port)) 57 | connect_flag = True 58 | except Exception: 59 | connect_flag = False 60 | sk.close() 61 | return connect_flag 62 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.32.0,<4.38.0 2 | accelerate 3 | tiktoken 4 | einops 5 | transformers_stream_generator==0.0.4 6 | scipy 7 | -------------------------------------------------------------------------------- /requirements_web_demo.txt: -------------------------------------------------------------------------------- 1 | gradio<3.42 2 | mdtex2html 3 | -------------------------------------------------------------------------------- /run_gptq.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from typing import Dict 4 | import logging 5 | 6 | import torch 7 | import transformers 8 | from transformers import AutoTokenizer 9 | from transformers.trainer_pt_utils import LabelSmoother 10 | from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig 11 | IGNORE_TOKEN_ID = LabelSmoother.ignore_index 12 | 13 | def preprocess( 14 | sources, 15 | tokenizer: transformers.PreTrainedTokenizer, 16 | max_len: int, 17 | system_message: str = "You are a helpful assistant." 
18 | ) -> Dict: 19 | roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"} 20 | 21 | im_start = tokenizer.im_start_id 22 | im_end = tokenizer.im_end_id 23 | nl_tokens = tokenizer('\n').input_ids 24 | _system = tokenizer('system').input_ids + nl_tokens 25 | _user = tokenizer('user').input_ids + nl_tokens 26 | _assistant = tokenizer('assistant').input_ids + nl_tokens 27 | 28 | # Apply prompt templates 29 | data = [] 30 | # input_ids, targets = [], [] 31 | for i, source in enumerate(sources): 32 | source = source["conversations"] 33 | if roles[source[0]["from"]] != roles["user"]: 34 | source = source[1:] 35 | 36 | input_id, target = [], [] 37 | system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens 38 | input_id += system 39 | target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens 40 | assert len(input_id) == len(target) 41 | for j, sentence in enumerate(source): 42 | role = roles[sentence["from"]] 43 | _input_id = tokenizer(role).input_ids + nl_tokens + \ 44 | tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens 45 | input_id += _input_id 46 | if role == '<|im_start|>user': 47 | _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens 48 | elif role == '<|im_start|>assistant': 49 | _target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \ 50 | _input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens 51 | else: 52 | raise NotImplementedError 53 | target += _target 54 | assert len(input_id) == len(target) 55 | input_id = torch.tensor(input_id[:max_len], dtype=torch.int) 56 | target = torch.tensor(target[:max_len], dtype=torch.int) 57 | data.append(dict(input_ids=input_id, attention_mask=input_id.ne(tokenizer.pad_token_id))) 58 | 59 | return data 60 | 61 | 62 | if __name__ == "__main__": 63 | parser = argparse.ArgumentParser("Model Quantization using AutoGPTQ") 64 | parser.add_argument("--model_name_or_path", type=str, help="model path") 65 | parser.add_argument("--data_path", type=str, help="calibration data path") 66 | parser.add_argument("--out_path", type=str, help="output path of the quantized model") 67 | parser.add_argument("--max_len", type=int, default=8192, help="max length of calibration data") 68 | parser.add_argument("--bits", type=int, default=4, help="the bits of quantized model. 
4 indicates int4 models.") 69 | parser.add_argument("--group-size", type=int, default=128, help="the group size of quantized model") 70 | args = parser.parse_args() 71 | 72 | quantize_config = BaseQuantizeConfig( 73 | bits=args.bits, 74 | group_size=args.group_size, 75 | damp_percent=0.01, 76 | desc_act=False, # set to False can significantly speed up inference but the perplexity may slightly bad 77 | static_groups=False, 78 | sym=True, 79 | true_sequential=True, 80 | model_name_or_path=None, 81 | model_file_base_name="model" 82 | ) 83 | 84 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True) 85 | tokenizer.pad_token_id = tokenizer.eod_id 86 | data = preprocess(json.load(open(args.data_path)), tokenizer, args.max_len) 87 | 88 | model = AutoGPTQForCausalLM.from_pretrained(args.model_name_or_path, quantize_config, device_map="auto", trust_remote_code=True) 89 | 90 | logging.basicConfig( 91 | format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S" 92 | ) 93 | model.quantize(data, cache_examples_on_gpu=False) 94 | 95 | model.save_quantized(args.out_path, use_safetensors=True) 96 | tokenizer.save_pretrained(args.out_path) 97 | -------------------------------------------------------------------------------- /tokenization_note_ja.md: -------------------------------------------------------------------------------- 1 | # トークン化 2 | 3 | Qwen-7B は `tiktoken` パッケージを使用して、UTF-8 バイトを BPE トークン化します。 4 | Qwen-7B には 2 種類のトークンがあります。BPE の通常のトークン (`bytes` 型) と特殊/制御トークン (`str` 型) です。 5 | 6 | ```python 7 | from transformers import AutoTokenizer 8 | 9 | tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True) 10 | ``` 11 | 12 | ## 通常のトークン 13 | 14 | 通常のトークンは、UTF-8 エンコーディングでエンコードされたテキストのバイト列から学習した BPE トークンです。 15 | これによってすべてのテキストをトークン化することができ、未知のトークンは存在しませんが、一般的でないテキストをトークン化するときにシングルバイトを使用するようにフォールバックすることがあります。 16 | UTF-8 のデコードエラーに遭遇することがあり、そのエラーのデフォルトは `replace` であるため、不完全な生成では置換文字 (�) が使用されます。 17 | この動作は `errors="ignore"` を `decode` 関数に渡すことで変更することができる。 18 | `errors` のオプションについては、[Python ドキュメント](https://docs.python.org/3/library/stdtypes.html#bytes.decode) を参照してください。 19 | 20 | ```python 21 | >>> tokenizer.decode([51461]) 22 | ' �' 23 | 24 | >>> tokenizer.convert_ids_to_tokens([51461]) 25 | [b' \xe6\xa0'] 26 | 27 | >>> b' \xe6\xa0'.decode("utf-8", errors='replace') 28 | ' �' 29 | 30 | >>> tokenizer.decode([51461, 117]) 31 | ' 根' 32 | 33 | >>> tokenizer.convert_ids_to_tokens([51461, 117]) 34 | [b' \xe6\xa0', b'\xb9'] 35 | 36 | >>> b' \xe6\xa0\xb9'.decode("utf-8", errors='replace') 37 | ' 根' 38 | ``` 39 | 40 | 通常のトークン (`bytes` 単位) からその ID へのマッピングは `tokenizer.get_vocab()` から取得できます。 41 | 通常のトークンを語彙に追加することはサポートしていませんし、推奨もしていません。 42 | 43 | ## 特別なトークン 44 | 45 | 特別なトークンは、例えば文書の最後に到達するなど、モデルにとって特別な機能を意味します。 46 | 理論的には、これらは入力テキストには存在せず、入力テキストが処理された後にのみ現れます。 47 | 例えば、文書の終わりを表す `<|endoftext|>` のような表面的な形は、参照を容易にするためだけのものである。 48 | 現在、Qwen-7B では `<|endoftext|>` が、Qwen-7B-Chat では `<|endoftext|>`, `<|im_start|>`, `<|im_end|>` が特殊トークンとして使われています。 49 | 他の目的のために、`<|extra_0|>` から `<|extra_204|>` までの特別なトークンを保持しています。 50 | 特殊トークンの表面形式 (`str` 内) から ID へのマッピングは `tokenizer.special_tokens` から取得できます。 51 | 52 | `bos`、`eos`、`unk`、`pad`、`mask`、`sep` などの概念は学習済みモデル(Qwen-7B と Qwen-7B-Chat)には適用できません。 53 | しかし、`pad` トークンは話が別です。理論的には、モデルがこのトークンを見たり計算したりすることはないので、既知のトークンを使用することができます。 54 | しかし、安全のために、トークナイザーの初期化で指定する特別なトークンの値は、既知の特別なトークンに限定します。 55 | 微調整やその他のフレームワークで特別なトークンを必要とする場合は、次のように指定できます 56 | 57 | ```python 58 | from transformers import 
AutoTokenizer 59 | 60 | tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True, pad_token='<|endoftext|>') 61 | ``` 62 | 63 | > 警告: 私たちが事前に学習したモデルでは、`bos`, `eos`, `unk` などを設定しても意味がありません。 64 | > 特に、`<<endoftext|>` を `eos` のように使ってはいけません。 65 | > 特に `<|endoftext|>` を `eos` として使用することは、文末と文末が同じであると確信できる場合を除き、避けるべきです。 66 | 67 | ## インジェクション攻撃の防止 68 | 69 | 特殊トークンは通常のトークンとは異なるため、コントロールトークンの表面形が入力テキストに現れるとどうなるでしょうか? 70 | 例えば、次のようなテキストがあるとします 71 | 72 | ``` 73 | print("<|endoftext|>") 74 | ``` 75 | 76 | これは次のようにしてトークン化する必要があります 77 | 78 | ``` 79 | ids:[1350, 9639, 91, 8691, 723, 427, 91, 82598] 80 | tokens: [b'print', b'("<', b'|', b'endo', b'ft', b'ext', b'|', b'>")'] 81 | ``` 82 | 83 | こちらではありません 84 | 85 | ``` 86 | ids: [1350, 445, 151643, 899] 87 | tokens: [b'print', b'("', '<|endoftext|>', b'")'] 88 | ``` 89 | 90 | つまり、特殊トークンの表面形は通常のテキストと同じように扱い、特殊トークンはテキストのトークン化後に開発者が処理するというものです。 91 | しかし、これはコミュニティにおける(安全ではないとはいえ)慣習に抵触し、開発者が車輪を再利用するための新たなステップを追加することになります。 92 | 93 | デフォルトの動作は、すべての既知の特殊トークンの表面形を特殊トークンとして解析するように変更されました。 94 | インジェクション防止を有効にするには、トークナイザーの呼び出しに `allowed_special=set()` を渡します: 95 | 96 | ```python 97 | >>> tokenizer('print("<|endoftext|>")', allowed_special=set()) 98 | {'input_ids': [1350, 9639, 91, 8691, 723, 427, 91, 82598], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]} 99 | ``` 100 | 101 | `str` のセットを `allowed_special` として渡すことで、きめ細かく動作を制御することができます 102 | 103 | ```python 104 | >>> tokenizer('print("<|extra_0|>")<|endoftext|>', allowed_special={'<|endoftext|>'}) 105 | {'input_ids': [1350, 9639, 91, 15460, 62, 15, 91, 82598, 151643], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]} 106 | ``` 107 | 108 | `str` のコレクションを `disallowed_special` として渡すことで、特定の特殊なトークンの表形式が入力テキストで遭遇した場合にトークナイザーがエラーを発生するようにすることもできます 109 | 110 | ```python 111 | >>> tokenizer('print("<|extra_0|>")<|endoftext|>', allowed_special={'<|endoftext|>'}, disallowed_special=('<|extra_0|>', )) 112 | ... 113 | ValueError: Encountered text corresponding to disallowed special token '<|extra_0|>'. 114 | If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|extra_0|>', ...}`. 115 | If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|extra_0|>'})`. 116 | To disable this check for all special tokens, pass `disallowed_special=()`. 
117 | ``` 118 | 119 | `allowed_special` と `disallowed_special` の詳細については、[`tiktoken` ドキュメント](https://github.com/openai/tiktoken/blob/095924e02c85617df6889698d94515f91666c7ea/tiktoken/core.py#L75)を参照してください。 120 | 121 | 新しいデフォルトは以下の通り 122 | 123 | ```python 124 | >>> tokenizer('print("<|endoftext|>")', allowed_special="all", disallowed_special=()) 125 | {'input_ids': [1350, 445, 151643, 899], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]} 126 | ``` 127 | 128 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM 3 | from accelerate import dispatch_model 4 | 5 | 6 | def _device_map(num_gpus, num_layers): 7 | per_gpu_layers = (num_layers + 2) / num_gpus 8 | 9 | device_map = { 10 | 'transformer.wte': 0, 11 | 'transformer.ln_f': 0, 12 | 'lm_head': num_gpus-1 13 | } 14 | 15 | used = 1 16 | gpu_target = 0 17 | for i in range(num_layers): 18 | if used >= per_gpu_layers: 19 | gpu_target += 1 20 | used = 0 if gpu_target < num_gpus-1 else 1 21 | assert gpu_target < num_gpus 22 | device_map[f'transformer.h.{i}'] = gpu_target 23 | used += 1 24 | 25 | return device_map 26 | 27 | 28 | def load_model_on_gpus(model_name_or_path, num_gpus: int = 2): 29 | num_devices = torch.cuda.device_count() 30 | 31 | if num_gpus == 1: 32 | model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='auto', 33 | trust_remote_code=True).eval() 34 | elif 1 < num_gpus <= num_devices: 35 | model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='cpu', 36 | trust_remote_code=True).eval() 37 | num_layers = model.config.num_hidden_layers 38 | device_map = _device_map(num_gpus, num_layers) 39 | print(device_map) 40 | model = dispatch_model(model, device_map=device_map) 41 | else: 42 | raise KeyError 43 | 44 | return model 45 | -------------------------------------------------------------------------------- /web_demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba Cloud. 2 | # 3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | """A simple web interactive chat demo based on gradio.""" 7 | import os 8 | from argparse import ArgumentParser 9 | 10 | import gradio as gr 11 | import mdtex2html 12 | 13 | import torch 14 | from transformers import AutoModelForCausalLM, AutoTokenizer 15 | from transformers.generation import GenerationConfig 16 | 17 | 18 | DEFAULT_CKPT_PATH = 'Qwen/Qwen-7B-Chat' 19 | 20 | 21 | def _get_args(): 22 | parser = ArgumentParser() 23 | parser.add_argument("-c", "--checkpoint-path", type=str, default=DEFAULT_CKPT_PATH, 24 | help="Checkpoint name or path, default to %(default)r") 25 | parser.add_argument("--cpu-only", action="store_true", help="Run demo with CPU only") 26 | 27 | parser.add_argument("--share", action="store_true", default=False, 28 | help="Create a publicly shareable link for the interface.") 29 | parser.add_argument("--inbrowser", action="store_true", default=False, 30 | help="Automatically launch the interface in a new tab on the default browser.") 31 | parser.add_argument("--server-port", type=int, default=8000, 32 | help="Demo server port.") 33 | parser.add_argument("--server-name", type=str, default="127.0.0.1", 34 | help="Demo server name.") 35 | 36 | args = parser.parse_args() 37 | return args 38 | 39 | 40 | def _load_model_tokenizer(args): 41 | tokenizer = AutoTokenizer.from_pretrained( 42 | args.checkpoint_path, trust_remote_code=True, resume_download=True, 43 | ) 44 | 45 | if args.cpu_only: 46 | device_map = "cpu" 47 | else: 48 | device_map = "auto" 49 | 50 | model = AutoModelForCausalLM.from_pretrained( 51 | args.checkpoint_path, 52 | device_map=device_map, 53 | trust_remote_code=True, 54 | resume_download=True, 55 | ).eval() 56 | 57 | config = GenerationConfig.from_pretrained( 58 | args.checkpoint_path, trust_remote_code=True, resume_download=True, 59 | ) 60 | 61 | return model, tokenizer, config 62 | 63 | 64 | def postprocess(self, y): 65 | if y is None: 66 | return [] 67 | for i, (message, response) in enumerate(y): 68 | y[i] = ( 69 | None if message is None else mdtex2html.convert(message), 70 | None if response is None else mdtex2html.convert(response), 71 | ) 72 | return y 73 | 74 | 75 | gr.Chatbot.postprocess = postprocess 76 | 77 | 78 | def _parse_text(text): 79 | lines = text.split("\n") 80 | lines = [line for line in lines if line != ""] 81 | count = 0 82 | for i, line in enumerate(lines): 83 | if "```" in line: 84 | count += 1 85 | items = line.split("`") 86 | if count % 2 == 1: 87 | lines[i] = f'<pre><code class="language-{items[-1]}">' 88 | else: 89 | lines[i] = f"<br></code></pre>" 90 | else: 91 | if i > 0: 92 | if count % 2 == 1: 93 | line = line.replace("`", r"\`") 94 | line = line.replace("<", "<") 95 | line = line.replace(">", ">") 96 | line = line.replace(" ", " ") 97 | line = line.replace("*", "*") 98 | line = line.replace("_", "_") 99 | line = line.replace("-", "-") 100 | line = line.replace(".", ".") 101 | line = line.replace("!", "!") 102 | line = line.replace("(", "(") 103 | line = line.replace(")", ")") 104 | line = line.replace("$", "$") 105 | lines[i] = "<br>" + line 106 | text = "".join(lines) 107 | return text 108 | 109 | 110 | def _gc(): 111 | import gc 112 | gc.collect() 113 | if torch.cuda.is_available(): 114 | torch.cuda.empty_cache() 115 | 116 | 117 | def _launch_demo(args, model, tokenizer, config): 118 | 119 | def predict(_query, _chatbot, _task_history): 120 | print(f"User: {_parse_text(_query)}") 121 | _chatbot.append((_parse_text(_query), "")) 122 | full_response = "" 123 | 124 | for response in model.chat_stream(tokenizer, 
_query, history=_task_history, generation_config=config): 125 | _chatbot[-1] = (_parse_text(_query), _parse_text(response)) 126 | 127 | yield _chatbot 128 | full_response = _parse_text(response) 129 | 130 | print(f"History: {_task_history}") 131 | _task_history.append((_query, full_response)) 132 | print(f"Qwen-Chat: {_parse_text(full_response)}") 133 | 134 | def regenerate(_chatbot, _task_history): 135 | if not _task_history: 136 | yield _chatbot 137 | return 138 | item = _task_history.pop(-1) 139 | _chatbot.pop(-1) 140 | yield from predict(item[0], _chatbot, _task_history) 141 | 142 | def reset_user_input(): 143 | return gr.update(value="") 144 | 145 | def reset_state(_chatbot, _task_history): 146 | _task_history.clear() 147 | _chatbot.clear() 148 | _gc() 149 | return _chatbot 150 | 151 | with gr.Blocks() as demo: 152 | gr.Markdown("""\ 153 | <p align="center"><img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/logo_qwen.jpg" style="height: 80px"/><p>""") 154 | gr.Markdown("""<center><font size=8>Qwen-Chat Bot</center>""") 155 | gr.Markdown( 156 | """\ 157 | <center><font size=3>This WebUI is based on Qwen-Chat, developed by Alibaba Cloud. \ 158 | (本WebUI基于Qwen-Chat打造,实现聊天机器人功能。)</center>""") 159 | gr.Markdown("""\ 160 | <center><font size=4> 161 | Qwen-7B <a href="https://modelscope.cn/models/qwen/Qwen-7B/summary">🤖 </a> | 162 | <a href="https://huggingface.co/Qwen/Qwen-7B">🤗</a>  | 163 | Qwen-7B-Chat <a href="https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary">🤖 </a> | 164 | <a href="https://huggingface.co/Qwen/Qwen-7B-Chat">🤗</a>  | 165 | Qwen-14B <a href="https://modelscope.cn/models/qwen/Qwen-14B/summary">🤖 </a> | 166 | <a href="https://huggingface.co/Qwen/Qwen-14B">🤗</a>  | 167 | Qwen-14B-Chat <a href="https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary">🤖 </a> | 168 | <a href="https://huggingface.co/Qwen/Qwen-14B-Chat">🤗</a>  | 169 |  <a href="https://github.com/QwenLM/Qwen">Github</a></center>""") 170 | 171 | chatbot = gr.Chatbot(label='Qwen-Chat', elem_classes="control-height") 172 | query = gr.Textbox(lines=2, label='Input') 173 | task_history = gr.State([]) 174 | 175 | with gr.Row(): 176 | empty_btn = gr.Button("🧹 Clear History (清除历史)") 177 | submit_btn = gr.Button("🚀 Submit (发送)") 178 | regen_btn = gr.Button("🤔️ Regenerate (重试)") 179 | 180 | submit_btn.click(predict, [query, chatbot, task_history], [chatbot], show_progress=True) 181 | submit_btn.click(reset_user_input, [], [query]) 182 | empty_btn.click(reset_state, [chatbot, task_history], outputs=[chatbot], show_progress=True) 183 | regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True) 184 | 185 | gr.Markdown("""\ 186 | <font size=2>Note: This demo is governed by the original license of Qwen. \ 187 | We strongly advise users not to knowingly generate or allow others to knowingly generate harmful content, \ 188 | including hate speech, violence, pornography, deception, etc. \ 189 | (注:本演示受Qwen的许可协议限制。我们强烈建议,用户不应传播及不应允许他人传播以下内容,\ 190 | 包括但不限于仇恨言论、暴力、色情、欺诈相关的有害信息。)""") 191 | 192 | demo.queue().launch( 193 | share=args.share, 194 | inbrowser=args.inbrowser, 195 | server_port=args.server_port, 196 | server_name=args.server_name, 197 | ) 198 | 199 | 200 | def main(): 201 | args = _get_args() 202 | 203 | model, tokenizer, config = _load_model_tokenizer(args) 204 | 205 | _launch_demo(args, model, tokenizer, config) 206 | 207 | 208 | if __name__ == '__main__': 209 | main() 210 | --------------------------------------------------------------------------------