├── .gitignore ├── LICENSE ├── README.md └── qwen2_sft ├── __init__.py ├── dataset ├── __init__.py ├── advgen_toy.json └── alpaca_gpt4_data_zh.json ├── ft_qwen2 ├── __init__.py ├── config.py ├── evaluation.py ├── post_api.py ├── predict.py └── train.py ├── image_aadvgen_loss_success_7b_5e5_bfloat16_pt.png ├── image_advgen_dev_success_7b_2e5_cosine_bs2_fp16.png ├── image_advgen_loss_nan_7b_3e4_constranr_bs4_fp16.png ├── models ├── __init__.py └── qwen2 │ ├── __init__.py │ ├── cache_utils.py │ ├── configuration_qwen2.py │ ├── modeling_attn_mask_utils.py │ ├── modeling_qwen2.py │ ├── pytorch_utils.py │ ├── tokenization_qwen2.py │ └── tokenization_qwen2_fast.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# Qwen1.5-SFT
Qwen1.5-SFT (Alibaba): fine-tuning (transformers) / LoRA (peft) / inference for Qwen_Qwen1.5-2B-Chat and Qwen_Qwen1.5-7B-Chat

## Project address
- [https://github.com/yongzhuo/qwen1.5-sft](https://github.com/yongzhuo/qwen1.5-sft)
- The default data type is bfloat16
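The project fine-tunes Qwen1.5-Chat with transformers plus a peft/LoRA adapter, as summarized above. The snippet below is only a minimal sketch of that setup, not the repo's train.py: it uses the generic AutoModelForCausalLM/AutoTokenizer classes for brevity (the repo imports Qwen2ForCausalLM/Qwen2Tokenizer directly), and the LoRA hyperparameters simply mirror the defaults in qwen2_sft/ft_qwen2/config.py.

```python
# Minimal LoRA setup sketch (illustrative only; see qwen2_sft/ft_qwen2/train.py for the real flow).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

model_id = "Qwen/Qwen1.5-7B-Chat"  # REPO_ID in config.py
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.bfloat16)  # bf16, not fp16 (see Notes)

lora_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.1,            # LORA_R / LORA_ALPHA / LORA_DROPOUT
                         target_modules=["q_proj", "k_proj", "v_proj"],   # TARGET_MODULES
                         task_type="CAUSAL_LM", bias="none")
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```

Training only the q/k/v projections keeps the trainable parameter count small; o_proj and the MLP projections are left commented out in config.py and can be added to TARGET_MODULES for more capacity.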
## Notes
```python
1. Very important: use bfloat16/fp32/tf32 for the weights (the common consensus for second-generation LLMs); do not use fp16, which very easily yields loss=NaN;
2. For SFT it is best to compute the loss over both input and output, as in pretraining;
3. transformers 4.37.0 or newer is required;
4. For Qwen1.5, the 0.5/1.8/4/7B models have a vocabulary size of 151936 (divisible by 128) and the 14/72B models 152064 (divisible by 256); the tokenizer is multilingual and covers both Traditional and Simplified Chinese;
5. The Qwen1.5 network architecture is the same as Llama;
6. prompt:
   6.1 standard format:
       text_input + text_output + <|endoftext|>
   6.2 chat prompt format:
       <|im_start|>system
       {text_prompt}<|im_end|>
       <|im_start|>user
       {text_input}
       <|im_end|>
       <|im_start|>assistant
       {text_output}
       <|im_end|>
7. Fine-tuning input/output:
   input:  "<|im_start|>system\n{text_prompt}<|im_end|>\n<|im_start|>user\n{text_input}\n<|im_end|>"
   output: "<|im_start|>assistant\n{text_output}\n<|im_end|>"
8. Inference input/output (note the different placement of assistant\n):
   input:  "<|im_start|>system\n{text_prompt}<|im_end|>\n<|im_start|>user\n{text_input}\n<|im_end|>\n<|im_start|>assistant"
   output: "{text_output}\n<|im_end|>"
9. Layer names:
   ('base_model.model.model.embed_tokens.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.self_attn.q_proj.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.self_attn.q_proj.bias', torch.bfloat16, False)
   ('base_model.model.model.layers.31.self_attn.q_proj.lora_A.default.weight', torch.float32, True)
   ('base_model.model.model.layers.31.self_attn.q_proj.lora_B.default.weight', torch.float32, True)
   ('base_model.model.model.layers.31.self_attn.k_proj.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.self_attn.k_proj.bias', torch.bfloat16, False)
   ('base_model.model.model.layers.31.self_attn.k_proj.lora_A.default.weight', torch.float32, True)
   ('base_model.model.model.layers.31.self_attn.k_proj.lora_B.default.weight', torch.float32, True)
   ('base_model.model.model.layers.31.self_attn.v_proj.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.self_attn.v_proj.bias', torch.bfloat16, False)
   ('base_model.model.model.layers.31.self_attn.v_proj.lora_A.default.weight', torch.float32, True)
   ('base_model.model.model.layers.31.self_attn.v_proj.lora_B.default.weight', torch.float32, True)
   ('base_model.model.model.layers.31.self_attn.o_proj.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.mlp.gate_proj.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.mlp.up_proj.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.mlp.down_proj.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.input_layernorm.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.post_attention_layernorm.weight', torch.bfloat16, False)
   ('base_model.model.model.norm.weight', torch.bfloat16, False)
   ('base_model.model.lm_head.weight', torch.bfloat16, False)
10. RuntimeError: unscale_() has already been called on this optimizer since the last update().
    Usually caused by the fine-tuning corpus being too small.
```
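The fine-tuning and inference formats in notes 7 and 8 are plain string templates. Below is a small illustrative sketch; the helper names build_sft_pair and build_infer_prompt are made up here and do not exist in the repo (the real preprocessing lives in qwen2_sft/ft_qwen2/train.py and predict.py).

```python
# Sketch of the prompt construction described in notes 6-8 (illustrative helpers, not repo code).
SYSTEM = "You are a helpful assistant."

def build_sft_pair(text_input: str, text_output: str, text_prompt: str = SYSTEM):
    """Fine-tuning: source/target split as in note 7."""
    src = f"<|im_start|>system\n{text_prompt}<|im_end|>\n<|im_start|>user\n{text_input}\n<|im_end|>"
    tgt = f"<|im_start|>assistant\n{text_output}\n<|im_end|>"
    return src, tgt

def build_infer_prompt(text_input: str, text_prompt: str = SYSTEM) -> str:
    """Inference: the prompt ends right after the trailing '<|im_start|>assistant' (note 8)."""
    return (f"<|im_start|>system\n{text_prompt}<|im_end|>\n"
            f"<|im_start|>user\n{text_input}\n<|im_end|>\n<|im_start|>assistant")

src, tgt = build_sft_pair("1+1=", "2")
print(src)
print(tgt)
```

At inference time the model is expected to continue with "{text_output}\n<|im_end|>", which is why evaluation.py strips <|endoftext|>, <|im_end|> and <|im_start|> from the decoded generations.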
## Environment
```shell
transformers>=4.37.0
torch>=1.11.0
safetensors>=0.4.1
accelerate==0.27.1
fsspec==2023.9.2
rouge==1.0.1
nltk==3.6.6
peft>=0.2.0
numpy
tqdm
```

## Fine-tuning
```shell
Path: qwen2_sft/ft_qwen2

Config:   qwen2_sft/ft_qwen2/config.py
Train:    python train.py
Predict:  python predict.py
Evaluate: python evaluation.py
API:      python post_api.py
```

## Datasets (Chinese)
- [https://huggingface.co/datasets/JosephusCheung/GuanacoDataset](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
- [https://huggingface.co/datasets/shareAI/shareGPT_cn](https://huggingface.co/datasets/shareAI/shareGPT_cn)
- [https://huggingface.co/datasets/Mutonix/RefGPT-Fact](https://huggingface.co/datasets/Mutonix/RefGPT-Fact)
- [https://huggingface.co/datasets/BAAI/COIG](https://huggingface.co/datasets/BAAI/COIG)
- [https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
- [https://github.com/carbonz0/alpaca-chinese-dataset](https://github.com/carbonz0/alpaca-chinese-dataset)
- [https://github.com/LianjiaTech/BELLE](https://github.com/LianjiaTech/BELLE)
- [https://github.com/PhoebusSi/Alpaca-CoT](https://github.com/PhoebusSi/Alpaca-CoT)
- [https://github.com/Hello-SimpleAI/chatgpt-comparison-detection](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection)
- [https://github.com/yangjianxin1/Firefly](https://github.com/yangjianxin1/Firefly)
- [https://github.com/XueFuzhao/InstructionWild](https://github.com/XueFuzhao/InstructionWild)
- [https://github.com/OpenLMLab/MOSS](https://github.com/OpenLMLab/MOSS)
- [https://github.com/thu-coai/Safety-Prompts](https://github.com/thu-coai/Safety-Prompts)
- [https://github.com/LAION-AI/Open-Assistant](https://github.com/LAION-AI/Open-Assistant)
- [https://github.com/TigerResearch/TigerBot](https://github.com/TigerResearch/TigerBot)


## References / Acknowledgements
- [https://github.com/QwenLM/Qwen1.5](https://github.com/QwenLM/Qwen1.5)
- [https://github.com/google/gemma_pytorch](https://github.com/google/gemma_pytorch)
- [https://huggingface.co/google/gemma-2b-it](https://huggingface.co/google/gemma-2b-it)
- [https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
- [https://github.com/THUDM/ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B)
- [https://github.com/THUDM/GLM](https://github.com/THUDM/GLM)
- [https://github.com/tatsu-lab/stanford_alpaca](https://github.com/tatsu-lab/stanford_alpaca)
- [https://github.com/LianjiaTech/BELLE](https://github.com/LianjiaTech/BELLE)
- [https://github.com/huggingface/peft](https://github.com/huggingface/peft)
- [https://github.com/mymusise/ChatGLM-Tuning](https://github.com/mymusise/ChatGLM-Tuning)
- [https://github.com/bojone/bert4keras](https://github.com/bojone/bert4keras)
- [trl](https://github.com/lvwerra/trl)
- [math23k](https://aclanthology.org/D17-1088)


## Fine-tuning logs (advgen)
### The default bf16 setting trains without issues; it is best to also compute the loss over the full sequence (same as pretraining, input/output)
![qwen2_sft/image_aadvgen_loss_success_7b_5e5_bfloat16_pt.png](qwen2_sft/image_aadvgen_loss_success_7b_5e5_bfloat16_pt.png)

### With fp16 the loss becomes NaN, image_advgen_loss_nan_7b_3e4_constranr_bs4_fp16
![qwen2_sft/image_advgen_loss_nan_7b_3e4_constranr_bs4_fp16.png](qwen2_sft/image_advgen_loss_nan_7b_3e4_constranr_bs4_fp16.png) 124 | 125 | ### fp16细调超参数偶尔训练成功但loss很高, image_advgen_dev_success_7b_2e5_cosine_bs2_fp16 126 | ![qwen2_sft/image_advgen_dev_success_7b_2e5_cosine_bs2_fp16.png](qwen2_sft/image_advgen_dev_success_7b_2e5_cosine_bs2_fp16.png) 127 | 128 | 129 | ## 推理日志-advgen 130 | ```cpu 131 | ('base_model.model.base_model.model.model.layers.31.self_attn.k_proj.bias', torch.float16, False, tensor([ 0.1016, -0.0640, 0.1729, ..., -1.0312, -0.0028, 0.3477], 132 | device='cuda:0', dtype=torch.float16)) 133 | ('base_model.model.base_model.model.model.layers.31.self_attn.k_proj.lora_A.default.weight', torch.float16, False, tensor([[-0.0045, -0.0027, 0.0057, ..., 0.0131, 0.0074, -0.0049], 134 | [-0.0049, 0.0048, -0.0067, ..., 0.0118, 0.0080, -0.0105], 135 | [-0.0052, 0.0086, 0.0070, ..., -0.0102, -0.0005, 0.0166], 136 | ..., 137 | [-0.0097, -0.0158, -0.0116, ..., 0.0051, 0.0046, 0.0015], 138 | [-0.0148, 0.0001, -0.0152, ..., 0.0063, 0.0139, 0.0026], 139 | [-0.0092, 0.0036, 0.0003, ..., -0.0082, 0.0063, -0.0058]], 140 | device='cuda:0', dtype=torch.float16)) 141 | ('base_model.model.base_model.model.model.layers.31.self_attn.k_proj.lora_B.default.weight', torch.float16, False, tensor([[ 1.2102e-03, 1.9951e-03, -2.2488e-03, ..., 1.2598e-03, 142 | -1.4429e-03, -2.0466e-03], 143 | [-5.9128e-05, 4.4274e-04, 1.4553e-03, ..., -1.9350e-03, 144 | -8.8406e-04, 1.9855e-03], 145 | [ 1.5755e-03, 2.3174e-03, -2.6073e-03, ..., -4.1466e-03, 146 | -2.8076e-03, 6.1464e-04], 147 | ..., 148 | [-1.5554e-03, -3.0460e-03, 4.4136e-03, ..., 3.6411e-03, 149 | 6.6280e-04, 5.5790e-04], 150 | [-1.5869e-03, -1.6050e-03, -1.0443e-03, ..., 6.1393e-06, 151 | 1.7271e-03, -1.2022e-04], 152 | [-3.7169e-04, 6.4754e-04, 7.2241e-04, ..., 4.6730e-04, 153 | 7.0667e-04, -1.1092e-04]], device='cuda:0', dtype=torch.float16)) 154 | ...... 155 | ('base_model.model.base_model.model.lm_head.weight', torch.float16, False, tensor([[ 0.0095, -0.0017, -0.0134, ..., 0.0036, 0.0232, -0.0112], 156 | [ 0.0089, -0.0062, 0.0004, ..., -0.0079, 0.0175, -0.0098], 157 | [ 0.0055, 0.0182, 0.0088, ..., 0.0059, 0.0034, -0.0164], 158 | ..., 159 | [-0.0034, -0.0049, 0.0033, ..., -0.0022, -0.0066, 0.0031], 160 | [-0.0034, -0.0071, 0.0032, ..., -0.0022, -0.0066, 0.0013], 161 | [-0.0034, -0.0049, 0.0033, ..., -0.0016, -0.0066, 0.0020]], 162 | device='cuda:0', dtype=torch.float16)) 163 | trainable params: 0 || all params: 7733907456 || trainable%: 0.0 164 | 151645 165 | 151644 166 | 151645 167 | 1699 168 | 872 169 | 8948 170 | {'instruction': '类型#裙*版型#宽松*版型#显瘦*颜色#黑色*图案#撞色*裙型#直筒裙*裙款式#拼接', 'input': '', 'output': ''} 171 | tensor([[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 172 | 151645, 198, 151644, 872, 198, 31905, 2, 102807, 9, 173 | 40301, 24300, 2, 109285, 9, 40301, 24300, 2, 99580, 174 | 102372, 9, 102284, 2, 104723, 9, 108108, 2, 101340, 175 | 38035, 9, 102807, 24300, 2, 73145, 106000, 102807, 9, 176 | 102807, 108756, 2, 100278, 29077, 151645, 198, 151644, 77091]], 177 | device='cuda:0') 178 | <|im_start|>system 179 | You are a helpful assistant.<|im_end|> 180 | <|im_start|>user 181 | 类型#裙*版型#宽松*版型#显瘦*颜色#黑色*图案#撞色*裙型#直筒裙*裙款式#拼接<|im_end|> 182 | <|im_start|>assistant宽松显瘦的黑色直筒裙是女性衣橱里必不可少的单品,这款裙子采用经典的黑色调,十分显瘦显腿长。裙摆处采用撞色的拼接设计,增添层次感的同时又不失时尚气息。裙型采用直筒裙的版型,更加修饰腿型,遮肉效果极佳。无论是职场还是休闲都能轻松驾驭,轻松展现女性的优雅气质。<|im_end|> 183 | 请输入: 184 | 类型#上衣*材质#棉*颜色#白色*风格#休闲*图案#刺绣*衣样式#衬衫*衣领型#尖领*衣袖长#七分袖 185 | 请稍等... 
186 | ################################################################################################################################ 187 | {'instruction': '类型#上衣*材质#棉*颜色#白色*风格#休闲*图案#刺绣*衣样式#衬衫*衣领型#尖领*衣袖长#七分袖', 'input': '', 'output': ''} 188 | tensor([[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 189 | 151645, 198, 151644, 872, 198, 31905, 2, 17447, 99741, 190 | 9, 106149, 2, 102430, 9, 102284, 2, 102440, 9, 191 | 104040, 2, 104443, 9, 108108, 2, 100253, 103084, 9, 192 | 99741, 112453, 2, 113727, 9, 99741, 99213, 24300, 2, 193 | 100512, 99213, 9, 99741, 102885, 45861, 2, 99612, 17177, 194 | 102885, 151645, 198, 151644, 77091]], device='cuda:0') 195 | <|im_start|>system 196 | You are a helpful assistant.<|im_end|> 197 | <|im_start|>user 198 | 类型#上衣*材质#棉*颜色#白色*风格#休闲*图案#刺绣*衣样式#衬衫*衣领型#尖领*衣袖长#七分袖<|im_end|> 199 | <|im_start|>assistant这款白色刺绣休闲衬衫采用优质的纯棉材质,亲肤透气,穿着舒适亲肤。经典的尖领设计,修饰脸型,展现个性风格。七分袖的设计,既能凸显出修长的手型,又能带来一丝休闲的韵味。精致的刺绣图案,层次丰富,细节精致,凸显出独特的个性品味。这款衬衫无论是日常出街还是职场穿搭,都能展现出独特的休闲风格。<|im_end|> 200 | 27.06706738471985 201 | 请输入: 202 | 类型#裙*版型#显瘦*材质#牛仔布*风格#复古*风格#性感*图案#复古*裙型#牛仔裙*裙型#包臀裙*裙下摆#开叉*裙下摆#毛边*裙长#半身裙 203 | 请稍等... 204 | ################################################################################################################################ 205 | {'instruction': '类型#裙*版型#显瘦*材质#牛仔布*风格#复古*风格#性感*图案#复古*裙型#牛仔裙*裙型#包臀裙*裙下摆#开叉*裙下摆#毛边*裙长#半身裙', 'input': '', 'output': ''} 206 | tensor([[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 207 | 151645, 198, 151644, 872, 198, 31905, 2, 102807, 9, 208 | 40301, 24300, 2, 99580, 102372, 9, 106149, 2, 100664, 209 | 102437, 51827, 9, 104040, 2, 111455, 9, 104040, 2, 210 | 110803, 9, 108108, 2, 111455, 9, 102807, 24300, 2, 211 | 100664, 102437, 102807, 9, 102807, 24300, 2, 67279, 117318, 212 | 102807, 9, 102807, 16872, 100805, 2, 29767, 102940, 9, 213 | 102807, 16872, 100805, 2, 99676, 99319, 9, 102807, 45861, 214 | 2, 99369, 95256, 102807, 151645, 198, 151644, 77091]], 215 | device='cuda:0') 216 | <|im_start|>system 217 | You are a helpful assistant.<|im_end|> 218 | <|im_start|>user 219 | 类型#裙*版型#显瘦*材质#牛仔布*风格#复古*风格#性感*图案#复古*裙型#牛仔裙*裙型#包臀裙*裙下摆#开叉*裙下摆#毛边*裙长#半身裙<|im_end|> 220 | <|im_start|>assistant这款牛仔裙采用了复古风格的设计,十分的复古又有性感的感觉。牛仔布的材质十分的柔软亲肤,而且牛仔的风格十分的显瘦,很好的修饰身材线条,凸显出女性的优雅气质。这款半身裙的设计十分的别致,裙摆处采用开叉的设计,十分的个性时尚,凸显出女性的个性魅力。裙摆处采用毛边的设计,十分的个性时尚,而且层次感十足,凸显出女性的优雅气质。而且这款半裙还是包臀裙的设计,很好的修饰出女性的身材线条,凸显出女性的优雅气质 221 | 14.573514223098755 222 | 请输入: 223 | 类型#裤*材质#牛仔布*颜色#白色*风格#简约*图案#线条*裤长#短裤*裤型#阔腿裤*裤腰型#高腰*裤口#毛边 224 | 请稍等... 
225 | ################################################################################################################################ 226 | {'instruction': '类型#裤*材质#牛仔布*颜色#白色*风格#简约*图案#线条*裤长#短裤*裤型#阔腿裤*裤腰型#高腰*裤口#毛边', 'input': '', 'output': ''} 227 | tensor([[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 228 | 151645, 198, 151644, 872, 198, 31905, 2, 102693, 9, 229 | 106149, 2, 100664, 102437, 51827, 9, 102284, 2, 102440, 230 | 9, 104040, 2, 109962, 9, 108108, 2, 108236, 9, 231 | 102693, 45861, 2, 99534, 102693, 9, 102693, 24300, 2, 232 | 100607, 100447, 102693, 9, 102693, 102113, 24300, 2, 44636, 233 | 102113, 9, 102693, 39426, 2, 99676, 99319, 151645, 198, 234 | 151644, 77091]], device='cuda:0') 235 | <|im_start|>system 236 | You are a helpful assistant.<|im_end|> 237 | <|im_start|>user 238 | 类型#裤*材质#牛仔布*颜色#白色*风格#简约*图案#线条*裤长#短裤*裤型#阔腿裤*裤腰型#高腰*裤口#毛边<|im_end|> 239 | <|im_start|>assistant这款牛仔裤的设计风格十分简约,线条流畅,十分显瘦,而且高腰的设计十分拉长腿型,显高显瘦。裤子采用了白色的色系,十分清新甜美,而且十分百搭。阔腿裤的版型十分修饰腿型,十分显腿长,而且十分遮肉显瘦。裤脚采用了毛边的设计,十分复古有个性,而且十分甜美可爱。这款牛仔短裤十分百搭,而且十分显腿长,而且十分显瘦,而且十分甜美甜美,十分凸显青春活力的气息。<|im_end|> 240 | 22.39077615737915 241 | 请输入: 242 | 类型#裤*版型#显瘦*材质#牛仔布*颜色#深蓝色*裤腰型#高腰 243 | 请稍等... 244 | ################################################################################################################################ 245 | {'instruction': '类型#裤*版型#显瘦*材质#牛仔布*颜色#深蓝色*裤腰型#高腰', 'input': '', 'output': ''} 246 | tensor([[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 247 | 151645, 198, 151644, 872, 198, 31905, 2, 102693, 9, 248 | 40301, 24300, 2, 99580, 102372, 9, 106149, 2, 100664, 249 | 102437, 51827, 9, 102284, 2, 99194, 105681, 9, 102693, 250 | 102113, 24300, 2, 44636, 102113, 151645, 198, 151644, 77091]], 251 | device='cuda:0') 252 | <|im_start|>system 253 | You are a helpful assistant.<|im_end|> 254 | <|im_start|>user 255 | 类型#裤*版型#显瘦*材质#牛仔布*颜色#深蓝色*裤腰型#高腰<|im_end|> 256 | <|im_start|>assistant这款牛仔裤采用深蓝色的色调,显得沉稳大方,同时也十分的显瘦。高腰的版型更是修饰了腿型,显高显腿长。牛仔布的材质十分的舒适亲肤,而且牛仔的质地也能凸显出个性的感觉,十分的青春活力。无论是日常的休闲搭配,还是职场的职场风格,都能很好的展现出女性的优雅知性。<|im_end|> 257 | 8.462041139602661 258 | ``` 259 | 260 | 261 | -------------------------------------------------------------------------------- /qwen2_sft/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2024/3/7 15:40 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /qwen2_sft/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/3/16 21:50 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /qwen2_sft/dataset/advgen_toy.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"input": "类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞", "output": "简约而不简单的牛仔外套,白色的衣身十分百搭。衣身多处有做旧破洞设计,打破单调乏味,增加一丝造型看点。衣身后背处有趣味刺绣装饰,丰富层次感,彰显别样时尚。"}, 3 | {"input": "类型#裙*材质#针织*颜色#纯色*风格#复古*风格#文艺*风格#简约*图案#格子*图案#纯色*图案#复古*裙型#背带裙*裙长#连衣裙*裙领型#半高领", "output": "这款BRAND针织两件套连衣裙,简约的纯色半高领针织上衣,修饰着颈部线,尽显优雅气质。同时搭配叠穿起一条背带式的复古格纹裙,整体散发着一股怀旧的时髦魅力,很是文艺范。"}, 4 | {"input": "类型#上衣*风格#嘻哈*图案#卡通*图案#印花*图案#撞色*衣样式#卫衣*衣款式#连帽", "output": "嘻哈玩转童年,随时,没错,出街还是要靠卫衣来装酷哦!时尚个性的连帽设计,率性有范还防风保暖。还有胸前撞色的卡通印花设计,靓丽抢眼更富有趣味性,加上前幅大容量又时尚美观的袋鼠兜,简直就是孩子耍帅装酷必备的利器。"}, 5 | {"input": "类型#裤*风格#英伦*风格#简约", "output": 
"裤子是简约大方的版型设计,带来一种极简主义风格而且不乏舒适优雅感,是衣橱必不可少的一件百搭单品。标志性的logo可以体现出一股子浓郁的英伦风情,轻而易举带来独一无二的体验。"}, 6 | {"input": "类型#裙*裙下摆#弧形*裙腰型#高腰*裙长#半身裙*裙款式#不规则*裙款式#收腰", "output": "这款来自梵凯的半身裙富有十足的设计感,采用了别致的不规则设计,凸显出时尚前卫的格调,再搭配俏皮的高腰设计,收腰提臀的同时还勾勒出优美迷人的身材曲线,而且还帮你拉长腿部比例,释放出优雅娇俏的小女人味。并且独特的弧形下摆还富有流畅的线条美,一颦一动间展现出灵动柔美的气质。"}, 7 | {"input": "类型#上衣*版型#宽松*版型#显瘦*图案#线条*衣样式#衬衫*衣袖型#泡泡袖*衣款式#抽绳", "output": "这件衬衫的款式非常的宽松,利落的线条可以很好的隐藏身材上的小缺点,穿在身上有着很好的显瘦效果。领口装饰了一个可爱的抽绳,漂亮的绳结展现出了十足的个性,配合时尚的泡泡袖型,尽显女性甜美可爱的气息。"}, 8 | {"input": "类型#裙*材质#蕾丝*风格#宫廷*图案#刺绣*图案#蕾丝*裙型#大裙摆*裙下摆#花边*裙袖型#泡泡袖", "output": "宫廷风的甜美蕾丝设计,清醒的蕾丝拼缝处,刺绣定制的贝壳花边,增添了裙子的精致感觉。超大的裙摆,加上精细的小花边设计,上身后既带着仙气撩人又很有女人味。泡泡袖上的提花面料,在细节处增加了浪漫感,春日的仙女姐姐。浪漫蕾丝布满整个裙身,美丽明艳,气质超仙。"}, 9 | {"input": "类型#裤*版型#显瘦*颜色#黑色*风格#简约*裤长#九分裤", "output": "个性化的九分裤型,穿着在身上,能够从视觉上拉长你的身体比例,让你看起来更加的有范。简约的黑色系列,极具时尚的韵味,充分凸显你专属的成熟韵味。修身的立体廓形,为你塑造修长的曲线。"}, 10 | {"input": "类型#裙*版型#显瘦*风格#文艺*风格#简约*图案#印花*图案#撞色*裙下摆#压褶*裙长#连衣裙*裙领型#圆领", "output": "文艺个性的印花连衣裙,藏青色底蕴,低调又大气,撞色太阳花分布整个裙身,绚丽而美好,带来时尚减龄的气质。基础款的舒适圆领,简约不失大方,勾勒精致脸庞。领后是一粒包布扣固定,穿脱十分方便。前片立体的打褶设计,搭配后片压褶的做工,增添层次和空间感,显瘦又有型。"}, 11 | {"input": "类型#裙*颜色#蓝色*风格#清新*图案#蝴蝶结", "output": "裙身处采用立体蝴蝶结装饰辅以蓝色条带点缀,令衣身造型饱满富有层次的同时为其注入一丝甜美气息。将女孩清新娇俏的一面衬托而出。"} 12 | ] -------------------------------------------------------------------------------- /qwen2_sft/dataset/alpaca_gpt4_data_zh.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "instruction": "保持健康的三个提示。", 4 | "input": "", 5 | "output": "以下是保持健康的三个提示:\n\n1. 保持身体活动。每天做适当的身体运动,如散步、跑步或游泳,能促进心血管健康,增强肌肉力量,并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物,避免高糖、高脂肪和加工食品,以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要,成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力,促进身体恢复,并提高注意力和记忆力。" 6 | }, 7 | { 8 | "instruction": "三原色是什么?", 9 | "input": "", 10 | "output": "三原色通常指的是红色、绿色和蓝色(RGB)。它们是通过加色混合原理创建色彩的三种基础颜色。在以发光为基础的显示设备中(如电视、计算机显示器、智能手机和平板电脑显示屏), 三原色可混合产生大量色彩。其中红色和绿色可以混合生成黄色,红色和蓝色可以混合生成品红色,蓝色和绿色可以混合生成青色。当红色、绿色和蓝色按相等比例混合时,可以产生白色或灰色。\n\n此外,在印刷和绘画中,三原色指的是以颜料为基础的红、黄和蓝颜色(RYB)。这三种颜色用以通过减色混合原理来创建色彩。不过,三原色的具体定义并不唯一,不同的颜色系统可能会采用不同的三原色。" 11 | }, 12 | { 13 | "instruction": "描述原子的结构。", 14 | "input": "", 15 | "output": "原子是物质的基本单位,它由三种基本粒子组成:质子、中子和电子。质子和中子形成原子核,位于原子中心,核外的电子围绕着原子核运动。\n\n原子结构具有层次性。原子核中,质子带正电,中子不带电(中性)。原子核非常小且致密,占据了原子总质量的绝大部分。电子带负电,通常围绕核运动,形成若干层次,称为壳层或电子层。电子数量与质子数量相等,使原子呈电中性。\n\n电子在每个壳层中都呈规律分布,并且不同壳层所能容纳的电子数也不同。在最里面的壳层一般只能容纳2个电子,其次一层最多可容纳8个电子,再往外的壳层可容纳的电子数逐层递增。\n\n原子核主要受到两种相互作用力的影响:强力和电磁力。强力的作用范围非常小,主要限制在原子核内,具有极强的吸引作用,使核子(质子和中子)紧密结合在一起。电磁力的作用范围较大,主要通过核外的电子与原子核相互作用,发挥作用。\n\n这就是原子的基本结构。原子内部结构复杂多样,不同元素的原子核中质子、中子数量不同,核外电子排布分布也不同,形成了丰富多彩的化学世界。" 16 | }, 17 | { 18 | "instruction": "如何减少空气污染?", 19 | "input": "", 20 | "output": "有很多方法来减少空气污染:\n\n1. 减少机动车排放: 通过支持可再生能源来驱动的交通工具,如电动汽车、混合动力汽车,使用公共交通工具,骑自行车或步行。\n2. 鼓励清洁能源: 政府和企业可以通过提供优惠政策和补贴来鼓励人们使用清洁能源。\n3. 减少工业排放: 工厂可以通过采用更清洁的生产方法,使用减排技术来减少工业废气的排放。\n4. 绿化城市: 种植更多的树木和植物可以吸收空气中的污染物和二氧化碳,有助于减少空气污染。\n5. 减少家庭能耗: 通过使用节能电器、关闭不必要的电源开关、减少用电量,能够减少家庭用电量,从而减少空气污染。\n6. 
垃圾分类和回收利用: 妥善处理垃圾,避免焚烧垃圾,对垃圾进行分类和回收利用,可以减少污染物排放。\n\n总之,减少空气污染需要政府、企业和个人共同努力。我们每个人都要承担责任,通过自己的实际行动来保护环境。" 21 | }, 22 | { 23 | "instruction": "描述一次你不得不做出困难决定的经历。", 24 | "input": "", 25 | "output": "作为一名人工智能助手,我是由程序控制的,没有感情和个人意识,我的一切决策都是基于我的算法和程序的设定。但在某种情况下,我需要根据我的程序判断做出决定。\n\n例如,当我处理多个任务时,如果有多个任务具有相同的优先级,那么我需要依据我的程序进行判断,选择哪个任务来处理。\n\n在这种情况下,我会将任务进行比较,考虑它们的时间紧迫性、资源需求、风险和收益等因素,最后做出一个最佳的决定。\n\n虽然这并不是一个真正的困难决定,因为我只是执行我的程序,但它确实需要我进行一定的判断,以确保我能够做出最好的决定。" 26 | }, 27 | { 28 | "instruction": "辨认不同之处。 ", 29 | "input": "输入:Twitter,Instagram,Telegram。", 30 | "output": "Twitter,Instagram 和 Telegram 都是流行的社交媒体平台,但它们之间也有一些不同之处:\n\n- Twitter: 是一个微博社交媒体平台,用户可以发布 280 字符以内的消息(即推文)并与其他用户互动。它主要关注文字交流,尽管用户也可以分享照片和视频。\n- Instagram: 是一个图片和视频分享的社交媒体平台,用户可以在 Instagram 上发布照片和视频并与其他用户互动。它主要关注图片和视频的分享,尽管用户也可以添加文字说明。\n- Telegram: 是一个即时通讯和社交媒体平台,用户可以通过文本、语音、图片和视频与其他用户即时交流。与 Twitter 和 Instagram 不同,Telegram 更侧重于即时通讯,它还提供了端到端加密的私人聊天功能。" 31 | }, 32 | { 33 | "instruction": "解释为什么以下分数等同于1/4", 34 | "input": "输入:4/16", 35 | "output": "4/16等于1/4是因为我们可以约分分子分母都除以他们的最大公约数4,得到(4÷4)/ (16÷4)=1/4。分数的约分是用分子和分母除以相同的非零整数,来表示分数的一个相同的值,这因为分数实际上表示了分子除以分母,所以即使两个数同时除以同一个非零整数,分数的值也不会改变。所以4/16 和1/4是两种不同的书写形式,但它们的值相等。" 36 | }, 37 | { 38 | "instruction": "写一个关于主人公必须做出重要职业决定的第三人称叙述的短故事。", 39 | "input": "", 40 | "output": "杰克是一位年轻有为的律师,他生活在纽约市,并在一家知名的法律事务所工作。杰克的生活似乎很完美,他有自己的公寓,工作稳定,薪水丰厚。然而,内心深处,杰克一直有一个愿望,那就是放弃目前的工作,去旅行并帮助那些需要帮助的人。\n\n一天,杰克的老板找到他,让他在两个选项之间做出一个决定:一是获得一份更高职位的提升,机会难得;二是离开这个公司去寻找其他的机会。杰克知道这是他人生中的一个重要时刻,他必须做出一个他将永远无法改变的决定。\n\n杰克开始思考这两个选项,仔细权衡利弊。他热爱自己的工作,同时也知道这份提升对他的职业生涯有着重大的影响。然而,他内心深处的渴望让他无法拒绝外面的世界。\n\n最终,杰克做出了决定。他选择离开这家公司,去追求他内心深处的愿望。他的老板尊重他的决定,并为他送行。\n\n杰克的职业生涯并没有因此终止,他在旅途中遇到了许多人,帮助了许多需要帮助的人。他发现,作出这个决定是他人生中最正确的选择。\n\n这个故事讲述了一个年轻人面临重要职" 41 | }, 42 | { 43 | "instruction": "渲染一座房子的三维模型", 44 | "input": "", 45 | "output": "对不起,作为一个语言模型,我无法渲染三维模型。但我可以为您提供一些建议。\n\n在制作三维模型时,您需要使用专业的三维建模软件,例如 Blender、Maya 或 Sketchup。这些软件提供了丰富的建模工具,可以让您轻松创建三维模型。\n\n在渲染三维模型时,您可以使用光照和材质来增强模型的真实感。常见的渲染器有 Blender 的 Cycles 渲染器和 Maya 的 Arnold 渲染器,它们都能产生逼真的渲染结果。\n\n如果您是初学者,建议您搜索相关教程学习三维建模和渲染技术。这些技能可以让您更快地掌握如何渲染一座房子的三维模型。" 46 | }, 47 | { 48 | "instruction": "朱利叶斯·凯撒是如何死亡的?", 49 | "input": "", 50 | "output": "朱利叶斯·凯撒,又称尤利乌斯·恺撒(Julius Caesar)是古罗马的政治家、军事家和作家。他于公元前44年3月15日被刺杀。 \n\n根据历史记载,当时罗马元老院里一些参议员联合起来策划了对恺撒的刺杀行动,因为他们担心恺撒的统治将给罗马共和制带来威胁。在公元前44年3月15日(又称“3月的艾达之日”),恺撒去参加元老院会议时,被一群参议员包围并被攻击致死。据记载,他身中23刀,其中一刀最终致命。" 51 | }, 52 | { 53 | "instruction": "法国的首都是什么?", 54 | "input": "", 55 | "output": "法国的首都是巴黎。" 56 | }, 57 | { 58 | "instruction": "生成一个人们去野营旅行可能需要的十件物品的清单。", 59 | "input": "", 60 | "output": "1. 帐篷: 帐篷是野营必备的物品,可以提供庇护,避免风吹日晒,还能够防止野生动物的干扰。\n2. 睡袋: 睡袋能够为您提供一个温暖舒适的睡眠环境,确保您在野外可以拥有一个良好的睡眠。\n3. 防水夹克: 在野外,天气变化无常,因此携带一件防水夹克十分必要,可以防止您被雨水淋湿。\n4. 照明设备: 如手电筒、头灯等,在野外夜晚黑暗的情况下为您提供照明,确保您的安全。\n5. 烹饪用具: 如便携式炉灶、锅碗瓢盆等,方便您在野外烹饪食物。\n6. 刀具: 如多功能军刀、折叠刀等,能够满足您在野外的各种需求。\n7. 防蚊液: 在野外,蚊虫十分繁多,因此携带防蚊液能够有效避免蚊虫叮咬。\n8. 防晒霜: 日晒对皮肤有害,携带防晒霜能够有效防护紫外线,避免晒伤。\n9. 水壶: 在野外,水是必不可少的,携带一个水壶能够方便您随时饮用。\n10. 
随身背包: 一个轻" 61 | }, 62 | { 63 | "instruction": "讨论大萧条的原因", 64 | "input": "", 65 | "output": "大萧条是20世纪初期发生在全球范围内的经济衰退现象,其中最为严重的地区当属美国。这场经济危机持续了近十年之久,它的起因较为复杂,涉及多方面因素。\n\n首先,我们要提到1929年的股市崩盘。股票价格大幅下跌,导致投资者损失惨重,进而影响了消费信心。\n\n其次,当时的金融政策也是一个重要因素。为了维持金本位制度,联邦储备系统采取紧缩政策,却进一步加剧了资金短缺的问题。\n\n此外,当时政府对失业率的应对不力,社会保障制度不健全,也为经济衰退的进一步恶化埋下了祸根。\n\n最后,国际贸易也受到了影响。由于当时许多国家采取贸易保护主义政策,全球贸易额大幅下降,给各国经济造成了巨大冲击。\n\n综上所述,大萧条的原因是多方面的,涉及到金融政策,国际贸易,社会保障制度等多个领域。它给全球经济发展带来了严重的挑战,也为我们提供了深刻的历史经验教训。" 66 | }, 67 | { 68 | "instruction": "将以下内容分类为动物、植物和矿物质:橡树、铜矿石、大象。", 69 | "input": "", 70 | "output": "植物: 橡树\n动物:大象\n矿物质:铜矿石" 71 | }, 72 | { 73 | "instruction": "自然语言处理中词嵌入的用途。", 74 | "input": "", 75 | "output": "词嵌入是一种自然语言处理技术,用于表示文本数据。它把单词映射到一个高维的、连续的数值向量空间,并保留了单词之间的语义关系。\n\n词嵌入在自然语言处理中有多种用途,包括但不限于:\n\n1. 提取文本中的特征:词嵌入能够捕获文本数据中的词义和结构信息,帮助提取文本的关键特征。\n\n2. 降维:自然语言处理的任务通常会涉及大量的单词。词嵌入能够将高维的单词空间压缩到一个较低维度的连续向量空间中,大大减少计算量。\n\n3. 相似性搜索:词嵌入算法能够保留单词之间的语义关系。计算相似性时,查找词向量空间中的最近邻居可用来实现相关文本的自动检索。\n\n4. 文本分类:词嵌入向量可作为文本分类器的输入数据。它能够提取文本语义和结构信息,并为文本分类模型提供有用的特征。\n\n5. 机器翻译:词嵌入技术也可以用于机器翻译。利用词嵌入向量,可以将源语言单词映射到目标语言的单词向量空间中,实现跨语言文本检索和翻译。\n\n6. 自然语言生成:词嵌入也用于自然语言生成任务。在文本生成过程中,可以使用词嵌入来帮助生成人类可读的、语义连" 76 | }, 77 | { 78 | "instruction": "描述计算机主板的功能", 79 | "input": "", 80 | "output": "计算机主板,也称为主板、系统板或系统板,是计算机中心组件之一,承担着连接并控制计算机中各硬件部件之间的通讯与数据传输功能。\n\n主板上布置了各种总线,包括数据总线、地址总线、控制总线等,通过这些总线,主板连接了中央处理器(CPU)、内存(RAM)、硬盘驱动器(HDD或SSD)、光驱、网卡、声卡、视频卡等硬件设备,协调它们之间的工作。\n\n主板还搭载了各种芯片组,起到调节和管理各个硬件部件运作的作用。比如,南桥芯片组负责管理主板上的外部设备接口,如USB接口、串口、并口等,而北桥芯片组则主要负责管理处理器、内存和PCI总线之间的数据交互。\n\n此外,主板上还设有BIOS芯片,存储着计算机的基本输入输出系统,控制计算机启动时硬件自检和操作系统的初始化工作。\n\n总之,计算机主板充当着计算机硬件系统的枢纽,负责各个部件之间的协调工作,是计算机运行不可缺少的重要部件。" 81 | }, 82 | { 83 | "instruction": "逆向工程此代码以创建新版本。", 84 | "input": "", 85 | "output": "逆向工程是指通过分析现有软件或系统来充分理解其设计和工作原理,并基于这些理解来创建新版本。由于你没有提供具体的代码,我无法给出确切的步骤来逆向工程它。\n\n但是,一般来说,逆向工程代码通常涉及以下步骤:\n\n1. 阅读和分析代码:这是逆向工程中最重要的一步。你需要仔细阅读代码,了解它如何工作,以及它使用的算法和数据结构。\n\n2. 理解代码的功能和目的:在阅读代码时,应该注意了解代码的功能和目的,以便在创建新版本时能保持原有功能。\n\n3. 提取关键信息:通过分析代码,提取出关键信息,包括算法,数据结构,输入输出等。\n\n4. 根据提取的信息创建新版本:使用提取的信息,创建新版本的代码,可以进行优化,添加新功能等。\n\n需要注意的是,逆向工程可能是一项复杂且费时的工程,需要具备良好的代码阅读能力和分析能力。此外,在逆向工程代码时,应注意遵守相关法律法规,不要侵犯他人的知识产权。" 86 | }, 87 | { 88 | "instruction": "提出数据隐私问题的道德解决方案", 89 | "input": "", 90 | "output": "数据隐私问题的解决方案从道德角度考虑,可以包括以下几个方面:\n\n1. 公开透明: 企业和组织需要更加公开透明地收集,处理和存储个人信息。数据收集和使用的政策应该明确表明,个人信息将如何被使用,谁可以访问,以及在什么情况下可能会被分享。\n\n2. 提供选择: 个人应该有选择是否提交个人信息,以及控制其信息如何被使用的权利。这意味着企业需要提供简单明了的选择:“退出”或“不同意”的选项。\n\n3. 保护个人信息: 企业和组织有责任采取适当的技术和组织措施来保护个人信息免受未经授权访问和泄露。这包括对信息的加密,访问控制和监控。\n\n4. 法律监管: 政府需要制定并执行严格的数据保护法律,以保护公民免受隐私侵犯。这些法律应规定企业如何收集,使用和分享个人信息,以及应对违规行为的惩罚措施。\n\n5. 道德责任: 企业和组织应认识到,保护数据隐私不仅仅是法律责任,更是一种道德责任。这意味着他们应该以一种合乎道德的方式收集,使用和处理个人信息,尊重个人权利和自主权。\n\n上述举措为解决数据隐私问题提供了道德指南,但通过不断努力和跟进实施才能真正保护人们的" 91 | }, 92 | { 93 | "instruction": "生成三个与“道歉”意思相同的动词。", 94 | "input": "", 95 | "output": "1. 抱歉\n2. 致歉\n3. 认错." 
96 | }, 97 | { 98 | "instruction": "DNA代表什么?", 99 | "input": "", 100 | "output": "DNA是脱氧核糖核酸(Deoxyribonucleic Acid)的缩写。它是一种生物大分子,负责编码遗传信息并传递遗产特征。 DNA分子由四种核苷酸:腺嘌呤(A),鸟嘌呤(G),胞嘧啶(C)和胸腺嘧啶(T)组成,以特定的序列排列,共同构成某个物种特有的基因组。 DNA的双螺旋结构和碱基配对机制使其能很好地复制和完成遗传信息的转录和翻译。" 101 | }, 102 | { 103 | "instruction": "比较古巴导弹危机和越南战争。", 104 | "input": "", 105 | "output": "古巴导弹危机和越南战争都是20世纪冷战期间发生的重大事件,它们在当时对世界的政治格局产生了深远的影响。但除此之外,两者之间还有许多不同之处。\n\n古巴导弹危机,又称“加勒比海危机”,发生在1962年,是指苏联在古巴部署核武器,导致美国和苏联两个超级大国之间的关系紧张,最终双方通过谈判,平息了这场危机。这一事件被认为是冷战中最危险的时刻,是世界上最接近核战争的时刻。\n\n越南战争,发生在1955年到1975年,是一场旷日持久的战争。它是由北越和南越之间的内战开始的,但最终美国和其他西方国家介入,将战争升级成了东西方意识形态的对抗。这场战争导致了巨大的人道灾难,数百万人死亡,越南、老挝和柬埔寨三国都受到了巨大的影响。\n\n在政治影响上,古巴导弹危机是一场外交危机,它使得美国和苏联两个超级大国关系紧张,但最终通过谈判化解了危机。越南战争则是一场旷日持久的战争,使美国在国际上的形象受损,也对美国内部政治造成了巨大的影响。\n\n在持续时间" 106 | }, 107 | { 108 | "instruction": "生成随机单词列表。", 109 | "input": "", 110 | "output": "[\"hippopotamus\", \"giraffe\", \"rhinoceros\", \"cheetah\", \"platypus\", \"parrot\", \"jellyfish\", \"octopus\", \"dolphin\", \"panther\"]" 111 | }, 112 | { 113 | "instruction": "把关于滑铁卢历史的录音转录成文字。", 114 | "input": "", 115 | "output": "很抱歉,但我无法直接转录录音。您需要提供录音文件的链接或将录音文本提供给我,然后我才能将其转录成文字。" 116 | }, 117 | { 118 | "instruction": "世界上最著名的画家是谁?", 119 | "input": "", 120 | "output": "世界上有许多著名的画家,其中最著名的可能是意大利文艺复兴时期的画家列奥纳多·达·芬奇(Leonardo da Vinci)。他的 杰作包括《蒙娜丽莎》(Mona Lisa)和《最后的晚餐》(The Last Supper)。但是,这个问题并没有确定的答案,因为每个人都有自己的看法。其他著名的画家还有毕加索(Pablo Picasso)、梵高(Vincent van Gogh)、米开朗基罗(Michelangelo)、雷普尔(Raphael)、罗伯特(Rubens)等。" 121 | }, 122 | { 123 | "instruction": "解释齿轮抵抗力的概念。", 124 | "input": "", 125 | "output": "齿轮抵抗力是用来描述齿轮传动系统中各组件受到的摩擦力及惯性力的总和。它主要包括以下几个方面:\n\n1. 齿轮啮合的滑动摩擦力:当齿轮的齿面相互啮合时,会产生相对滑动,从而导致一定的滑动摩擦力;\n\n2. 齿间间隙的碰撞力:齿间间隙存在一定的间隙,在两齿传动过程中会产生碰撞,产生碰撞力;\n\n3. 轴承摩擦力:轴承承受轴和齿轮的转矩作用,会产生一定的摩擦力;\n\n4. 惯性力:齿轮旋转时,其自身及齿间有相对运动,会产生各种类型的惯性力,包括离心惯性力、切向惯性力等。\n\n齿轮抵抗力会影响齿轮传动的效率,并导致热量的产生和噪音的增加。通过采用合理的齿轮结构设计,选择合适的材料和润滑剂,可有效降低齿轮抵抗力,使齿轮传动更加平稳高效。" 126 | }, 127 | { 128 | "instruction": "查水的沸点。", 129 | "input": "", 130 | "output": "水的沸点是指水在大气压强下由液态变成气态的温度。在标准大气压(101.325 kPa)下,水的沸点约为 100°C (摄氏度) 或 212°F(华氏度)。 需要注意的是,水的沸点会随着海拔的改变而改变,例如,在高海拔地区,空气压强降低,水的沸点也会降低。此外,水中溶解物质的多少也会影响沸点,如盐水的沸点要高于纯水的沸点。" 131 | }, 132 | { 133 | "instruction": "描述查理二世国王的生活和统治。", 134 | "input": "", 135 | "output": "查理二世是英格兰、苏格兰和爱尔兰的国王,他在位时间为1660年至1685年。他是斯图亚特王朝的一员,他的父亲是查理一世,在内战中被判处死刑,因此他在青年时期度过了一段流亡生活。1660年,查理二世在王政复辟中重新登上王位。\n\n查理二世统治时期,政治上的一个重要特点是他确立了君主与议会的共治局面,使英国走上了君主立宪制的道路。与此同时,他在宗教政策上采取了一种宽容和谦和的态度,允许天主教徒和清教徒在英格兰存在,并鼓励与其他欧洲国家建立外交关系。\n\n查理二世的统治时期也面临一些挑战,比如1665年的鼠疫大流行和1666年的伦敦大火。但查理二世积极应对这些挑战,下令疏散城市、建造医院和改善卫生环境,推动了伦敦重建。此外,他还重视文化艺术的发展,推动了英国文艺复兴运动的展开。\n\n在个人生活中,查理二世以魅力和风趣著称,他热爱社交活动,赛马和音乐,曾养过许多宠物猎犬和猎鹰。他与妻子凯瑟琳·布拉甘萨没有子嗣,但他公开承认了13" 136 | }, 137 | { 138 | "instruction": "确定经线,将北美洲和南美洲分开。", 139 | "input": "", 140 | "output": "您好,北美洲和南美洲分别位于西半球。这两大洲被一条东西走向的大陆分界线分隔开来。这条分界线被称为巴拿马分水岭。它位于巴拿马地峡的最窄处,将巴拿马的加勒比海海岸与太平洋海岸分隔开来。近似上,我们可以用80°W经线来确定分界线,将北美洲以北地区与南美洲以南地区分隔开来。" 141 | }, 142 | { 143 | "instruction": "解释植物细胞呼吸的过程。", 144 | "input": "", 145 | "output": "植物细胞呼吸可以定义为细胞通过氧气氧化有机分子并产生能量的过程。这个过程发生在线粒体中,并分为三个主要部分:\n\n1. 糖酵解(glycolysis):这个过程在细胞质中进行,将葡萄糖(或其他糖)分解成两个三碳分子的丙酮酸(pyruvate),并释放少量的能量。\n2. 柠檬酸循环(citric acid cycle):这个过程在线粒体中进行,丙酮酸会转化为乙酰辅酶A(acetyl-CoA)并进入柠檬酸循环。柠檬酸循环中,乙酰辅酶A会被氧化,释放二氧化碳和能量。\n3. 电子传递链(electron transport chain):这个过程也在线粒体中进行,通过氧化还原反应将电子从高能级传递到低能级,释放能量。最终,氧气接收电子并结合氢离子形成水。\n\n在这个过程中,能量以ATP(三磷酸腺苷)的形式储存并释放。植物细胞呼吸与动物细胞呼吸非常相似,区别主要在于植物细胞具有叶绿体进行光合作用,可以自己产生糖分供呼吸作用使用。" 146 | }, 147 | { 148 | "instruction": "生成一个形容词列表,描述一个勇敢的人。", 149 | "input": "", 150 | "output": "1. 勇敢\n2. 英勇\n3. 英勇无畏\n4. 坚定\n5. 无畏\n6. 勇猛\n7. 刚毅\n8. 毅然\n9. 
果敢\n10. 胆大\n11. 勇往直前\n12. 勇敢坚定\n13. 勇气十足\n14. 不屈不挠\n15. 不怕困难" 151 | }, 152 | { 153 | "instruction": "概述森林砍伐的后果。", 154 | "input": "", 155 | "output": "森林砍伐是指大规模砍伐森林以获取木材或者开辟土地进行其他活动。这一行为会造成许多环境和社会后果。\n\n从环境方面来看,森林砍伐会使得土地流失肥沃的表土,导致土壤流失和干旱。同时,森林砍伐会破坏动植物的栖息地,威胁生物多样性。森林的减少还会改变气候模式,引发干旱,洪水等自然灾害。\n\n社会方面,森林砍伐会影响到当地社区的经济和生活方式。许多社区依赖森林的资源,如木材,水果和药材,森林砍伐可能会破坏他们的生计。森林砍伐还可能导致土地纠纷和冲突。\n\n此外,森林砍伐还会影响全球气候变化。森林是重要的碳汇,减少森林会使大量的二氧化碳释放进大气层,加剧全球变暖。\n\nIn conclusion,砍伐森林会带来多种环境和社会问题,为了可持续发展,应当采取措施控制砍伐行为,保护森林。" 156 | }, 157 | { 158 | "instruction": "制定一个减少家庭用电的计划。", 159 | "input": "", 160 | "output": "制定减少家庭用电的计划可以通过以下步骤实现:\n\n1. 了解家庭用电情况:首先,通过查看电费账单和电表读数了解家庭的用电情况,找出哪些家电设备用电量较大,哪些时间段用电量较高。\n\n2. 提高用电效率:购买节能型家电,选择能效等级高的产品,可以有效降低用电量。而对于已有的家电,使用时应注意节能,例如在使用空调时保持室内温度适宜,不要过低或过高。\n\n3. 改变用电习惯:鼓励家庭成员改变用电习惯,例如及时关闭不用的电源开关,避免长时间待机耗电。尽量利用自然光线,减少白天开灯的时间。\n\n4. 考虑可再生能源:如果条件允许,可以考虑安装太阳能发电系统,利用太阳能发电来降低家庭用电量。\n\n5. 定期检查电器:定期检查电器是否正常运行,及时维修不良电器,以免造成浪费。\n\n最后,要持续关注家庭用电情况,定期评估减少用电的效果,不断调整计划,以适应家庭的实际需求。" 161 | }, 162 | { 163 | "instruction": "快速棕色的狐狸跳了。", 164 | "input": "", 165 | "output": "This sentence in Chinese roughly translates to \"The quick brown fox jumps.\" in English. Is there something specific you would like me to do with this information?" 166 | }, 167 | { 168 | "instruction": "分析给定文本的语气。 ", 169 | "input": "输入:COVID-19大流行对世界产生了巨大影响,它已经彻底改变了我们的生活。", 170 | "output": "本句话的语气是严肃认真的。" 171 | }, 172 | { 173 | "instruction": "使用给定数据计算中位数。", 174 | "input": "输入:\n[2, 3, 7, 8, 10]", 175 | "output": "中位数是指在一组数据中,这组数据从小到大排列后,位于中间位置的数。如果数据的数量为奇数,那么中位数就是中间那个数。如果数据的数量为偶数,那么中位数为中间两个数的平均值。\n在这个例子中,给定的数据集是[2, 3, 7, 8, 10],已经从小到大排列。 数据数量是5,是一个奇数,所以中位数就是正中间的那个数,即7。所以这组数据的中位数是7。" 176 | } 177 | ] 178 | -------------------------------------------------------------------------------- /qwen2_sft/ft_qwen2/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2024/3/7 16:12 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /qwen2_sft/ft_qwen2/config.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/3/5 21:29 4 | # @author : Mo 5 | # @function: config of qwen1.5 6 | 7 | 8 | # optimized for RTX 4090. for larger GPUs, increase some of these? 
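# Note: the effective optimizer batch size is MICRO_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS;
# with the defaults below, GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE = 128 // 4 = 32,
# so each optimizer update accumulates 128 samples while only 4 are resident on the GPU at a time.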
9 | MICRO_BATCH_SIZE = 4 # 4 # default=4 # this could actually be 5 but i like powers of 2 10 | BATCH_SIZE = 128 # 128 11 | GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE 12 | LEARNING_RATE = 5e-5 ### 1e-5 13 | # LEARNING_RATE = 3e-4 # default=3e-4 # the Karpathy constant 14 | EPOCHS = 3 # default=3 # we don't always need 3 tbh 15 | # LORA_DROPOUT = 0.1 16 | # LORA_ALPHA = 32 17 | # LORA_R = 32 18 | WEIGHT_DECAY = 0.01 19 | LORA_DROPOUT = 0.1 20 | LORA_ALPHA = 16 21 | LORA_R = 8 22 | SAVE_STEPS = 384 23 | VAL_SET_SIZE = 0 24 | MAX_LENGTH_Q = 128 - 1 # default=128 - 2 25 | MAX_LENGTH_A = 128 - 1 # default=128 - 2 26 | MAX_LENGTH_QA = MAX_LENGTH_Q + MAX_LENGTH_A + 2 27 | TARGET_MODULES = ["q_proj", 28 | "k_proj", 29 | "v_proj", 30 | # "o_proj", 31 | # "down_proj", 32 | # "gate_proj", 33 | # "up_proj", 34 | ] 35 | 36 | PATH_MODEL_PRETRAIN = "" 37 | REPO_ID = "Qwen/Qwen1.5-7B-Chat" 38 | PATH_MODEL_PRETRAIN = PATH_MODEL_PRETRAIN if PATH_MODEL_PRETRAIN else REPO_ID 39 | DATA_PATH = "../dataset/alpaca_gpt4_data_zh.json" 40 | MODEL_SAVE_DIR = "model_sft" 41 | 42 | IS_PARALLELIZABLE = True 43 | MODEL_PARALLEL = True 44 | USE_CACHE = False 45 | CUDA_VISIBLE_DEVICES = "0" 46 | USE_TORCH = "1" 47 | CPU_NUMS = "9" 48 | USE_CUDA = False if CUDA_VISIBLE_DEVICES == "-1" else True 49 | USE_ALL_LOSS = True # 计算loss时是否计算全部(False则为) 50 | 51 | """ 52 | ('base_model.model.model.embed_tokens.weight', torch.bfloat16, False) 53 | ('base_model.model.model.layers.31.self_attn.q_proj.weight', torch.bfloat16, False) 54 | ('base_model.model.model.layers.31.self_attn.q_proj.bias', torch.bfloat16, False) 55 | ('base_model.model.model.layers.31.self_attn.q_proj.lora_A.default.weight', torch.float32, True) 56 | ('base_model.model.model.layers.31.self_attn.q_proj.lora_B.default.weight', torch.float32, True) 57 | ('base_model.model.model.layers.31.self_attn.k_proj.weight', torch.bfloat16, False) 58 | ('base_model.model.model.layers.31.self_attn.k_proj.bias', torch.bfloat16, False) 59 | ('base_model.model.model.layers.31.self_attn.k_proj.lora_A.default.weight', torch.float32, True) 60 | ('base_model.model.model.layers.31.self_attn.k_proj.lora_B.default.weight', torch.float32, True) 61 | ('base_model.model.model.layers.31.self_attn.v_proj.weight', torch.bfloat16, False) 62 | ('base_model.model.model.layers.31.self_attn.v_proj.bias', torch.bfloat16, False) 63 | ('base_model.model.model.layers.31.self_attn.v_proj.lora_A.default.weight', torch.float32, True) 64 | ('base_model.model.model.layers.31.self_attn.v_proj.lora_B.default.weight', torch.float32, True) 65 | ('base_model.model.model.layers.31.self_attn.o_proj.weight', torch.bfloat16, False) 66 | ('base_model.model.model.layers.31.mlp.gate_proj.weight', torch.bfloat16, False) 67 | ('base_model.model.model.layers.31.mlp.up_proj.weight', torch.bfloat16, False) 68 | ('base_model.model.model.layers.31.mlp.down_proj.weight', torch.bfloat16, False) 69 | ('base_model.model.model.layers.31.input_layernorm.weight', torch.bfloat16, False) 70 | ('base_model.model.model.layers.31.post_attention_layernorm.weight', torch.bfloat16, False) 71 | ('base_model.model.model.norm.weight', torch.bfloat16, False) 72 | ('base_model.model.lm_head.weight', torch.bfloat16, False) 73 | """ 74 | # layernorm_s = ["post_attention_layernorm", 75 | # "input_layernorm", 76 | # "norm" 77 | # ] 78 | -------------------------------------------------------------------------------- /qwen2_sft/ft_qwen2/evaluation.py: -------------------------------------------------------------------------------- 1 | # 
!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/3/25 21:56 4 | # @author : Mo 5 | # @function: 验证评估 6 | 7 | 8 | import logging as logger 9 | import traceback 10 | import logging 11 | import random 12 | import time 13 | import json 14 | import sys 15 | import os 16 | 17 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 18 | print(path_root) 19 | sys.path.append(path_root) 20 | from qwen2_sft.ft_qwen2.config import CUDA_VISIBLE_DEVICES, USE_TORCH, CPU_NUMS # from config 21 | os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:3072" 22 | os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES 23 | os.environ["USE_TORCH"] = USE_TORCH 24 | os.environ["OMP_NUM_THREADS"] = CPU_NUMS # export OMP_NUM_THREADS=1 25 | os.environ["OPENBLAS_NUM_THREADS"] = CPU_NUMS # export OPENBLAS_NUM_THREADS=1 26 | os.environ["MKL_NUM_THREADS"] = CPU_NUMS # export MKL_NUM_THREADS=1 27 | os.environ["VECLIB_MAXIMUM_THREADS"] = CPU_NUMS # export VECLIB_MAXIMUM_THREADS=1 28 | os.environ["NUMEXPR_NUM_THREADS"] = CPU_NUMS # export NUMEXPR_NUM_THREADS=1 29 | 30 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 31 | from peft import prepare_model_for_int8_training 32 | from peft import LoraConfig, get_peft_model 33 | from transformers import GenerationConfig 34 | from pydantic import BaseModel 35 | from rouge import Rouge # pip install rouge 36 | from tqdm import tqdm 37 | import torch 38 | import jieba 39 | 40 | # from qwen2_sft.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as LLMTokenizer 41 | # from qwen2_sft.models.qwen2.configuration_qwen2 import Qwen2Config as LLMConfig 42 | # from qwen2_sft.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM as LLMModel 43 | from transformers import Qwen2Tokenizer as LLMTokenizer 44 | from transformers import Qwen2ForCausalLM as LLMModel 45 | from transformers import Qwen2Config as LLMConfig 46 | from qwen2_sft.ft_qwen2.config import PATH_MODEL_PRETRAIN, DATA_PATH, MODEL_SAVE_DIR, REPO_ID 47 | from qwen2_sft.ft_qwen2.config import MICRO_BATCH_SIZE, BATCH_SIZE, GRADIENT_ACCUMULATION_STEPS 48 | from qwen2_sft.ft_qwen2.config import LEARNING_RATE, EPOCHS, SAVE_STEPS, VAL_SET_SIZE, TARGET_MODULES 49 | from qwen2_sft.ft_qwen2.config import IS_PARALLELIZABLE, MODEL_PARALLEL, USE_CACHE 50 | from qwen2_sft.ft_qwen2.config import MAX_LENGTH_Q, MAX_LENGTH_A, MAX_LENGTH_QA 51 | from qwen2_sft.ft_qwen2.config import LORA_DROPOUT, LORA_ALPHA, LORA_R 52 | from qwen2_sft.ft_qwen2.config import USE_CUDA 53 | 54 | 55 | # # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 56 | # world_size = int(os.environ.get("WORLD_SIZE", 1)) 57 | # ddp = world_size != 1 58 | # device_map = "auto" 59 | # # USE_CUDA = True 60 | # print(device_map) 61 | # print(ddp) 62 | 63 | 64 | def load_model_state(model, model_save_dir="./", model_name="adapter_model.safetensors", device="cpu"): 65 | """ 仅加载模型参数(推荐使用) """ 66 | try: 67 | path_model = os.path.join(model_save_dir, model_name) 68 | peft_config = LoraConfig.from_pretrained(model_save_dir) 69 | peft_config.inference_mode = True 70 | model = get_peft_model(model, peft_config) 71 | 72 | try: 73 | if path_model.endswith(".safetensors"): 74 | from safetensors.torch import load_file, save_file 75 | from safetensors import safe_open 76 | state_dict = {} 77 | with safe_open(path_model, framework="pt", device="cpu") as f: 78 | for k in f.keys(): 79 | state_dict[k] = f.get_tensor(k) 80 | ### if path_model.endswith(".bin") or path_model.endswith(".pt"): 81 | else: 82 | state_dict = 
torch.load(path_model, map_location=torch.device(device)) 83 | except Exception as e: 84 | print(traceback.print_exc()) 85 | ### 全部训练完的话会用这个, 即便是.safetensors 86 | state_dict = torch.load(path_model, map_location=torch.device(device)) 87 | 88 | # print(state_dict.keys()) 89 | state_dict = {"base_model.model." + k.replace("_orig_mod.", "") 90 | .replace(".lora_A.weight", ".lora_A.default.weight") 91 | .replace(".lora_B.weight", ".lora_B.default.weight") 92 | : v for k, v in state_dict.items()} 93 | print(state_dict.keys()) 94 | print("#" * 128) 95 | ### 排查不存在model.keys的 state_dict.key 96 | name_dict = {name: 0 for name, param in model.named_parameters()} 97 | print(name_dict.keys()) 98 | print("#" * 128) 99 | for state_dict_key in state_dict.keys(): 100 | if state_dict_key not in name_dict: 101 | print("{} is not exist!".format(state_dict_key)) 102 | model.load_state_dict(state_dict, strict=False) 103 | # model.to(device) 104 | print("******model loaded success******") 105 | print("self.device: {}".format(device)) 106 | except Exception as e: 107 | print(str(e)) 108 | raise Exception("******load model error******") 109 | return model 110 | def save_model_state(model, config=None, model_save_dir="./", model_name="pytorch_model.bin"): 111 | """ 仅保存模型参数(推荐使用) """ 112 | if not os.path.exists(model_save_dir): 113 | os.makedirs(model_save_dir) 114 | if config: 115 | config.save_pretrained(model_save_dir) 116 | # save model 117 | path_model = os.path.join(model_save_dir, model_name) 118 | # torch.save(model.state_dict(), path_model) 119 | grad_params_dict = {k: v.to("cpu") for k, v in model.named_parameters() 120 | if v.requires_grad == True} 121 | torch.save(grad_params_dict, path_model) 122 | print("******model_save_path is {}******".format(path_model)) 123 | def prepare_model_for_half_training(model, output_embedding_layer_name="lm_head", 124 | use_gradient_checkpointing=True, layer_norm_names=["layer_norm"]): 125 | r""" 126 | This method wrapps the entire protocol for preparing a model before running a training. 
This includes: 127 | 1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm 128 | head to fp32 129 | 130 | Args: 131 | model, (`transformers.PreTrainedModel`): 132 | The loaded model from `transformers` 133 | """ 134 | # 不要使用 model.half(), 这样会先截取精度再训练了, 最初data就要保持half 135 | for name, param in model.named_parameters(): 136 | # freeze base model's layers 137 | param.requires_grad = False 138 | # cast layer norm in fp32 for stability for 8bit models 139 | if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names): 140 | param.data = param.data.to(torch.float32) 141 | elif output_embedding_layer_name in name: # lm_head也需要是tf.float32(最后一层) 142 | param.data = param.data.to(torch.float32) 143 | else: 144 | param.data = param.data.to(torch.half) 145 | 146 | if use_gradient_checkpointing: 147 | # For backward compatibility 148 | if hasattr(model, "enable_input_require_grads"): 149 | model.enable_input_require_grads() 150 | else: 151 | def make_inputs_require_grad(module, input, output): 152 | output.requires_grad_(True) 153 | model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) 154 | # enable gradient checkpointing for memory efficiency 155 | model.gradient_checkpointing_enable() 156 | return model 157 | def print_named_parameters(model, use_print_data=False): 158 | """ 打印模型训练参数/数据类型信息 """ 159 | trainable_params = 0 160 | all_param = 0 161 | for name, param in model.named_parameters(): 162 | if use_print_data: 163 | print((name, param.data.dtype, param.requires_grad, param.data)) 164 | else: 165 | print((name, param.data.dtype, param.requires_grad)) 166 | num_params = param.numel() 167 | # if using DS Zero 3 and the weights are initialized empty 168 | if num_params == 0 and hasattr(param, "ds_numel"): 169 | num_params = param.ds_numel 170 | all_param += num_params 171 | if param.requires_grad: 172 | trainable_params += num_params 173 | print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}") 174 | def txt_read(path, encode_type="utf-8", errors=None): 175 | """ 176 | 读取txt文件,默认utf8格式, 不能有空行 177 | Args: 178 | path[String]: path of file of read, eg. "corpus/xuexiqiangguo.txt" 179 | encode_type[String]: data encode type of file, eg. "utf-8", "gbk" 180 | errors[String]: specifies how encoding errors handled, eg. "ignore", strict 181 | Returns: 182 | lines[List]: output lines 183 | """ 184 | lines = [] 185 | try: 186 | file = open(path, "r", encoding=encode_type, errors=errors) 187 | lines = file.readlines() 188 | file.close() 189 | except Exception as e: 190 | logger.info(str(e)) 191 | finally: 192 | return lines 193 | def load_json(path: str, encoding: str="utf-8"): 194 | """ 195 | Read Line of List form file 196 | Args: 197 | path: path of save file, such as "txt" 198 | encoding: type of encoding, such as "utf-8", "gbk" 199 | Returns: 200 | model_json: dict of word2vec, eg. 
[{"大漠帝国":132}] 201 | """ 202 | with open(path, "r", encoding=encoding) as fj: 203 | model_json = json.load(fj) 204 | fj.close() 205 | return model_json 206 | def generate_prompt(data_point, is_logger=False): 207 | """ 指令微调: 208 | """ 209 | text_input = data_point.get("instruction", "") + "\t" + data_point.get("input", "") 210 | text_out = data_point.get("output", "") 211 | prompt_text_1 = """<|im_start|>system 212 | You are a helpful assistant.<|im_end|> 213 | <|im_start|>user 214 | {}<|im_end|> 215 | <|im_start|>assistant""" 216 | tokens = tokenizer.tokenize(prompt_text_1) 217 | max_len = MAX_LENGTH_QA - len(tokens) 218 | text_1 = prompt_text_1.format(text_input[:max_len]) 219 | x = tokenizer.encode(text_1, add_special_tokens=False) 220 | if len(x) > (MAX_LENGTH_Q + MAX_LENGTH_A): 221 | x = x[:MAX_LENGTH_Q] + [ID_EOS] 222 | out = {"input_ids": x, "labels": []} 223 | if is_logger: 224 | print(text_1) 225 | print(out) 226 | return out 227 | 228 | 229 | model = LLMModel.from_pretrained(PATH_MODEL_PRETRAIN) 230 | # model = prepare_model_for_half_training(model, 231 | # use_gradient_checkpointing=False, 232 | # output_embedding_layer_name="lm_head", 233 | # layer_norm_names=["post_attention_layernorm", 234 | # "input_layernorm", 235 | # "norm" 236 | # ], 237 | # ) 238 | config = LoraConfig(target_modules=TARGET_MODULES, 239 | lora_dropout=LORA_DROPOUT, 240 | lora_alpha=LORA_ALPHA, 241 | task_type="CAUSAL_LM", 242 | bias="none", 243 | r=LORA_R, 244 | ) 245 | model = get_peft_model(model, config) 246 | model = load_model_state(model=model, model_save_dir=MODEL_SAVE_DIR) 247 | if USE_CUDA: 248 | model = model.cuda() 249 | else: 250 | model = model.bfloat16() 251 | print_named_parameters(model, use_print_data=True) 252 | # print_named_parameters(model) 253 | 254 | tokenizer = LLMTokenizer.from_pretrained(PATH_MODEL_PRETRAIN, add_eos_token=True) 255 | ID_END = 151643 256 | ID_BOS = 151644 257 | ID_EOS = 151645 258 | ID_PAD = ID_EOS 259 | ID_BR = 1699 # "\\n" 260 | ID_USER = 872 261 | ID_MODEL = 8948 262 | tokenizer.pad_token_id = ID_EOS 263 | tokenizer.eos_token_id = ID_EOS 264 | tokenizer.padding_side = "left" 265 | # tokenizer.padding_side = "right" # NO use attention-mask 266 | print(ID_PAD) 267 | print(ID_BOS) 268 | print(ID_EOS) 269 | print(ID_BR) 270 | print(ID_USER) 271 | print(ID_MODEL) 272 | """ 273 | "<|endoftext|>": 151643, 274 | "<|im_start|>": 151644, 275 | "<|im_end|>": 151645 276 | "system": 8948, 277 | "user": 872, 278 | "\\n": 1699, 279 | """ 280 | STOP_WORDS_IDS = [[ID_BOS], [ID_EOS], [ID_END]] 281 | 282 | 283 | def predict(data_point, generation_config): 284 | """ 推理 """ 285 | prompt_dict = generate_prompt(data_point) 286 | # inputs = tokenizer([text_1], return_tensors="pt", padding=True) 287 | input_ids = prompt_dict.get("input_ids") 288 | input_ids = torch.tensor([input_ids], dtype=torch.long) 289 | if USE_CUDA: 290 | input_ids = input_ids.cuda() 291 | generation_config = GenerationConfig(**generation_config) 292 | with torch.no_grad(): 293 | generation_output = model.generate( 294 | input_ids=input_ids, 295 | generation_config=generation_config, 296 | # stop_words_ids=STOP_WORDS_IDS, 297 | return_dict_in_generate=True, 298 | output_scores=True, 299 | ) 300 | s = generation_output.sequences[0] 301 | output = tokenizer.decode(s) 302 | print(input_ids) 303 | print(s) 304 | print(output) 305 | # output = output.strip() 306 | return output 307 | def text_generate(request_data): 308 | instruction = request_data.instruction 309 | text = request_data.text 310 | penalty_alpha = 
request_data.penalty_alpha 311 | max_new_tokens = request_data.max_new_tokens 312 | temperature = request_data.temperature 313 | do_sample = request_data.do_sample 314 | num_beams = request_data.num_beams 315 | top_p = request_data.top_p 316 | top_k = request_data.top_k 317 | 318 | generation_dict = vars(request_data) 319 | print(generation_dict) 320 | generation_dict.pop("max_new_tokens") 321 | generation_dict.pop("instruction") 322 | generation_dict.pop("text") 323 | data_point = {"instruction": instruction, "input": text, "output": ""} 324 | generation_config = {"temperature": temperature, 325 | "top_p": top_p, 326 | "top_k": top_k, 327 | "num_beams": num_beams, 328 | "do_sample": do_sample, 329 | "penalty_alpha": penalty_alpha, 330 | "max_new_tokens": max_new_tokens, 331 | "pad_token_id": ID_PAD, 332 | "eos_token_id": ID_EOS, 333 | } 334 | try: # 数据预处理, 模型预测 335 | response = predict(data_point, generation_config) 336 | except Exception as e: 337 | logger.info(traceback.print_exc()) 338 | response = "<|endoftext|>" 339 | return response 340 | class Item(BaseModel): 341 | instruction: str = "" 342 | text: str = "1+1=" 343 | penalty_alpha: float = 1.0 344 | max_new_tokens: int = 512 345 | temperature: float = 0.8 # 0.95 # 0.35 # 0.95 346 | do_sample: bool = True 347 | num_beams: int = 1 348 | top_p: float = 0.8 # 0.75 349 | top_k: int = 50 350 | 351 | 352 | if __name__ == '__main__': 353 | 354 | text = "1+1=" 355 | item_config = Item() 356 | item_config.text = text 357 | response = text_generate(item_config) 358 | print(response) 359 | 360 | smooth = SmoothingFunction().method1 361 | rouge = Rouge() 362 | best_bleu = 0. 363 | 364 | fw = open(DATA_PATH + ".qwen_eval_rouge_blue.512.json", "a+", encoding="utf-8") 365 | rouge_1_p, rouge_2_p, rouge_l_p = 0, 0, 0 366 | rouge_1_r, rouge_2_r, rouge_l_r = 0, 0, 0 367 | rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0 368 | total = 0 369 | time_start = time.time() 370 | datas = load_json(DATA_PATH) 371 | # datas = datas[:1024] 372 | datas = datas[:8] 373 | for d_json in tqdm(datas, desc="data"): 374 | try: 375 | instruction = d_json.get("instruction", "") 376 | text_input = d_json.get("input", "") 377 | text_output = d_json.get("output", "") 378 | # qtext, qans 379 | total += 1 380 | item_config = Item() 381 | item_config.instruction = instruction 382 | item_config.text = text_input 383 | text_output = " ".join(jieba.lcut(text_output)) 384 | text_gen_str = text_generate(item_config) 385 | text_pred = " ".join(jieba.lcut(text_gen_str.replace("<|endoftext|>", "").replace("<|im_end|>", "") 386 | .replace("<|im_start|>", "").strip().lower())) 387 | line = {"input": text_input, "output": text_output.replace(" ", ""), 388 | "pred": text_pred.replace(" ", "")} 389 | line_str = json.dumps(line, ensure_ascii=False) + "\n" 390 | fw.write(line_str) 391 | if text_pred.strip(): 392 | scores = rouge.get_scores(hyps=text_pred, refs=text_output) 393 | rouge_1 += scores[0]['rouge-1']['f'] 394 | rouge_2 += scores[0]['rouge-2']['f'] 395 | rouge_l += scores[0]['rouge-l']['f'] 396 | 397 | rouge_1_p += scores[0]['rouge-1']['p'] 398 | rouge_2_p += scores[0]['rouge-2']['p'] 399 | rouge_l_p += scores[0]['rouge-l']['p'] 400 | rouge_1_r += scores[0]['rouge-1']['r'] 401 | rouge_2_r += scores[0]['rouge-2']['r'] 402 | rouge_l_r += scores[0]['rouge-l']['r'] 403 | 404 | bleu += sentence_bleu(references=[list(text_output.replace(" ", ""))], 405 | hypothesis=list(text_pred.replace(" ", "")), 406 | smoothing_function=smooth) 407 | mertics_i = {'rouge-1': rouge_1, 'rouge-2': rouge_2, 
'rouge-l': rouge_l, 408 | 'bleu': bleu, 409 | 'rouge-1_p': rouge_1_p, 'rouge-2_p': rouge_2_p, 'rouge-l_p': rouge_l_p, 410 | 'rouge-1_r': rouge_1_r, 'rouge-2_r': rouge_2_r, 'rouge-l_r': rouge_l_r, } 411 | if total < 5: 412 | print(text_output.replace(" ", "")) 413 | print(text_pred.replace(" ", "")) 414 | print(mertics_i) 415 | except Exception as e: 416 | print(traceback.print_exc()) 417 | continue 418 | time_end = time.time() 419 | lost_time = time_end - time_start 420 | lost_time_avg = lost_time / total 421 | rouge_1, rouge_2, rouge_l, bleu = rouge_1 / total, rouge_2 / total, rouge_l / total, bleu / total 422 | rouge_1_p, rouge_2_p, rouge_l_p = rouge_1_p / total, rouge_2_p / total, rouge_l_p / total 423 | rouge_1_r, rouge_2_r, rouge_l_r = rouge_1_r / total, rouge_2_r / total, rouge_l_r / total 424 | 425 | mertics = {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu, 426 | "lost_time": lost_time, "lost_time_avg": lost_time_avg, 427 | 'rouge-1_p': rouge_1_p, 'rouge-2_p': rouge_2_p, 'rouge-l_p': rouge_l_p, 428 | 'rouge-1_r': rouge_1_r, 'rouge-2_r': rouge_2_r, 'rouge-l_r': rouge_l_r, 429 | } 430 | mertics = {k: round(v, 4) for k, v in mertics.items()} 431 | print(mertics, lost_time, lost_time_avg) 432 | fw.write(json.dumps(mertics, ensure_ascii=False) + "\n") 433 | fw.close() 434 | 435 | 436 | """ 437 | # nohup python evaluation.py > tc.evaluation.py.log 2>&1 & 438 | # tail -n 1000 -f tc.evaluation.py.log 439 | # |myz| 440 | """ 441 | 442 | -------------------------------------------------------------------------------- /qwen2_sft/ft_qwen2/post_api.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/3/25 21:56 4 | # @author : Mo 5 | # @function: fastapi-post接口 6 | 7 | 8 | import traceback 9 | import logging 10 | import random 11 | import time 12 | import json 13 | import sys 14 | import os 15 | 16 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 17 | print(path_root) 18 | sys.path.append(path_root) 19 | from qwen2_sft.ft_qwen2.config import CUDA_VISIBLE_DEVICES, USE_TORCH, CPU_NUMS # from config 20 | os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:3072" 21 | os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES 22 | os.environ["USE_TORCH"] = USE_TORCH 23 | os.environ["OMP_NUM_THREADS"] = CPU_NUMS # export OMP_NUM_THREADS=1 24 | os.environ["OPENBLAS_NUM_THREADS"] = CPU_NUMS # export OPENBLAS_NUM_THREADS=1 25 | os.environ["MKL_NUM_THREADS"] = CPU_NUMS # export MKL_NUM_THREADS=1 26 | os.environ["VECLIB_MAXIMUM_THREADS"] = CPU_NUMS # export VECLIB_MAXIMUM_THREADS=1 27 | os.environ["NUMEXPR_NUM_THREADS"] = CPU_NUMS # export NUMEXPR_NUM_THREADS=1 28 | 29 | # from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 30 | from peft import prepare_model_for_int8_training 31 | from peft import LoraConfig, get_peft_model 32 | from transformers import GenerationConfig 33 | from pydantic import BaseModel 34 | from rouge import Rouge # pip install rouge 35 | from tqdm import tqdm 36 | import torch 37 | 38 | from pydantic import BaseModel 39 | from fastapi import FastAPI 40 | import time 41 | 42 | 43 | # from qwen2_sft.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as LLMTokenizer 44 | # from qwen2_sft.models.qwen2.configuration_qwen2 import Qwen2Config as LLMConfig 45 | # from qwen2_sft.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM as LLMModel 46 | from transformers import Qwen2Tokenizer as LLMTokenizer 47 | 
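# Note: the commented-out imports above pull the vendored modeling/tokenization files from
# qwen2_sft/models/qwen2, while the active `from transformers import ...` lines rely on the
# Qwen2 classes bundled with transformers itself (present in recent releases, roughly 4.37+,
# as far as I know); either choice exposes the same Tokenizer / CausalLM / Config interface.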
from transformers import Qwen2ForCausalLM as LLMModel 48 | from transformers import Qwen2Config as LLMConfig 49 | from qwen2_sft.ft_qwen2.config import PATH_MODEL_PRETRAIN, DATA_PATH, MODEL_SAVE_DIR, REPO_ID 50 | from qwen2_sft.ft_qwen2.config import MICRO_BATCH_SIZE, BATCH_SIZE, GRADIENT_ACCUMULATION_STEPS 51 | from qwen2_sft.ft_qwen2.config import LEARNING_RATE, EPOCHS, SAVE_STEPS, VAL_SET_SIZE, TARGET_MODULES 52 | from qwen2_sft.ft_qwen2.config import IS_PARALLELIZABLE, MODEL_PARALLEL, USE_CACHE 53 | from qwen2_sft.ft_qwen2.config import MAX_LENGTH_Q, MAX_LENGTH_A, MAX_LENGTH_QA 54 | from qwen2_sft.ft_qwen2.config import LORA_DROPOUT, LORA_ALPHA, LORA_R 55 | from qwen2_sft.ft_qwen2.config import USE_CUDA 56 | 57 | 58 | app = FastAPI() # 日志文件名,为启动时的日期, 全局日志格式 59 | logger_level = logging.INFO 60 | logging.basicConfig(format="%(asctime)s - %(filename)s[line:%(lineno)d] " 61 | "- %(levelname)s: %(message)s", 62 | level=logger_level) 63 | logger = logging.getLogger("ft-llama") 64 | console = logging.StreamHandler() 65 | console.setLevel(logger_level) 66 | logger.addHandler(console) 67 | 68 | 69 | # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 70 | # world_size = int(os.environ.get("WORLD_SIZE", 1)) 71 | # ddp = world_size != 1 72 | # device_map = "auto" 73 | # # USE_CUDA = True 74 | # print(device_map) 75 | # print(ddp) 76 | 77 | 78 | def load_model_state(model, model_save_dir="./", model_name="adapter_model.safetensors", device="cpu"): 79 | """ 仅加载模型参数(推荐使用) """ 80 | try: 81 | path_model = os.path.join(model_save_dir, model_name) 82 | peft_config = LoraConfig.from_pretrained(model_save_dir) 83 | peft_config.inference_mode = True 84 | model = get_peft_model(model, peft_config) 85 | 86 | try: 87 | if path_model.endswith(".safetensors"): 88 | from safetensors.torch import load_file, save_file 89 | from safetensors import safe_open 90 | state_dict = {} 91 | with safe_open(path_model, framework="pt", device="cpu") as f: 92 | for k in f.keys(): 93 | state_dict[k] = f.get_tensor(k) 94 | ### if path_model.endswith(".bin") or path_model.endswith(".pt"): 95 | else: 96 | state_dict = torch.load(path_model, map_location=torch.device(device)) 97 | except Exception as e: 98 | print(traceback.print_exc()) 99 | ### 全部训练完的话会用这个, 即便是.safetensors 100 | state_dict = torch.load(path_model, map_location=torch.device(device)) 101 | 102 | # print(state_dict.keys()) 103 | state_dict = {"base_model.model." 
+ k.replace("_orig_mod.", "") 104 | .replace(".lora_A.weight", ".lora_A.default.weight") 105 | .replace(".lora_B.weight", ".lora_B.default.weight") 106 | : v for k, v in state_dict.items()} 107 | print(state_dict.keys()) 108 | print("#" * 128) 109 | ### 排查不存在model.keys的 state_dict.key 110 | name_dict = {name: 0 for name, param in model.named_parameters()} 111 | print(name_dict.keys()) 112 | print("#" * 128) 113 | for state_dict_key in state_dict.keys(): 114 | if state_dict_key not in name_dict: 115 | print("{} is not exist!".format(state_dict_key)) 116 | model.load_state_dict(state_dict, strict=False) 117 | # model.to(device) 118 | print("******model loaded success******") 119 | print("self.device: {}".format(device)) 120 | except Exception as e: 121 | print(str(e)) 122 | raise Exception("******load model error******") 123 | return model 124 | def save_model_state(model, config=None, model_save_dir="./", model_name="pytorch_model.bin"): 125 | """ 仅保存模型参数(推荐使用) """ 126 | if not os.path.exists(model_save_dir): 127 | os.makedirs(model_save_dir) 128 | if config: 129 | config.save_pretrained(model_save_dir) 130 | # save model 131 | path_model = os.path.join(model_save_dir, model_name) 132 | # torch.save(model.state_dict(), path_model) 133 | grad_params_dict = {k: v.to("cpu") for k, v in model.named_parameters() 134 | if v.requires_grad == True} 135 | torch.save(grad_params_dict, path_model) 136 | print("******model_save_path is {}******".format(path_model)) 137 | def prepare_model_for_half_training(model, output_embedding_layer_name="lm_head", 138 | use_gradient_checkpointing=True, layer_norm_names=["layer_norm"]): 139 | r""" 140 | This method wrapps the entire protocol for preparing a model before running a training. This includes: 141 | 1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm 142 | head to fp32 143 | 144 | Args: 145 | model, (`transformers.PreTrainedModel`): 146 | The loaded model from `transformers` 147 | """ 148 | # 不要使用 model.half(), 这样会先截取精度再训练了, 最初data就要保持half 149 | for name, param in model.named_parameters(): 150 | # freeze base model's layers 151 | param.requires_grad = False 152 | # cast layer norm in fp32 for stability for 8bit models 153 | if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names): 154 | param.data = param.data.to(torch.float32) 155 | elif output_embedding_layer_name in name: # lm_head也需要是tf.float32(最后一层) 156 | param.data = param.data.to(torch.float32) 157 | else: 158 | param.data = param.data.to(torch.half) 159 | 160 | if use_gradient_checkpointing: 161 | # For backward compatibility 162 | if hasattr(model, "enable_input_require_grads"): 163 | model.enable_input_require_grads() 164 | else: 165 | def make_inputs_require_grad(module, input, output): 166 | output.requires_grad_(True) 167 | model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) 168 | # enable gradient checkpointing for memory efficiency 169 | model.gradient_checkpointing_enable() 170 | return model 171 | def print_named_parameters(model, use_print_data=False): 172 | """ 打印模型训练参数/数据类型信息 """ 173 | trainable_params = 0 174 | all_param = 0 175 | for name, param in model.named_parameters(): 176 | if use_print_data: 177 | print((name, param.data.dtype, param.requires_grad, param.data)) 178 | else: 179 | print((name, param.data.dtype, param.requires_grad)) 180 | num_params = param.numel() 181 | # if using DS Zero 3 and the weights are initialized empty 182 | if num_params == 0 and 
hasattr(param, "ds_numel"): 183 | num_params = param.ds_numel 184 | all_param += num_params 185 | if param.requires_grad: 186 | trainable_params += num_params 187 | print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}") 188 | def txt_read(path, encode_type="utf-8", errors=None): 189 | """ 190 | 读取txt文件,默认utf8格式, 不能有空行 191 | Args: 192 | path[String]: path of file of read, eg. "corpus/xuexiqiangguo.txt" 193 | encode_type[String]: data encode type of file, eg. "utf-8", "gbk" 194 | errors[String]: specifies how encoding errors handled, eg. "ignore", strict 195 | Returns: 196 | lines[List]: output lines 197 | """ 198 | lines = [] 199 | try: 200 | file = open(path, "r", encoding=encode_type, errors=errors) 201 | lines = file.readlines() 202 | file.close() 203 | except Exception as e: 204 | logger.info(str(e)) 205 | finally: 206 | return lines 207 | def load_json(path: str, encoding: str="utf-8"): 208 | """ 209 | Read Line of List form file 210 | Args: 211 | path: path of save file, such as "txt" 212 | encoding: type of encoding, such as "utf-8", "gbk" 213 | Returns: 214 | model_json: dict of word2vec, eg. [{"大漠帝国":132}] 215 | """ 216 | with open(path, "r", encoding=encoding) as fj: 217 | model_json = json.load(fj) 218 | fj.close() 219 | return model_json 220 | def generate_prompt(data_point, is_logger=False): 221 | """ 指令微调: 222 | """ 223 | 224 | text_input = data_point.get("instruction", "") + "\t" + data_point.get("input", "") 225 | text_out = data_point.get("output", "") 226 | prompt_text_1 = """<|im_start|>system 227 | You are a helpful assistant.<|im_end|> 228 | <|im_start|>user 229 | {}<|im_end|> 230 | <|im_start|>assistant""" 231 | tokens = tokenizer.tokenize(prompt_text_1) 232 | max_len = MAX_LENGTH_QA - len(tokens) 233 | text_1 = prompt_text_1.format(text_input[:max_len]) 234 | x = tokenizer.encode(text_1, add_special_tokens=False) 235 | if len(x) > (MAX_LENGTH_Q + MAX_LENGTH_A): 236 | x = x[:MAX_LENGTH_Q] + [ID_EOS] 237 | out = {"input_ids": x, "labels": []} 238 | if is_logger: 239 | print(text_1) 240 | print(out) 241 | return out 242 | 243 | 244 | model = LLMModel.from_pretrained(PATH_MODEL_PRETRAIN) 245 | # model = prepare_model_for_half_training(model, 246 | # use_gradient_checkpointing=False, 247 | # output_embedding_layer_name="lm_head", 248 | # layer_norm_names=["post_attention_layernorm", 249 | # "input_layernorm", 250 | # "norm" 251 | # ], 252 | # ) 253 | config = LoraConfig(target_modules=TARGET_MODULES, 254 | lora_dropout=LORA_DROPOUT, 255 | lora_alpha=LORA_ALPHA, 256 | task_type="CAUSAL_LM", 257 | bias="none", 258 | r=LORA_R, 259 | ) 260 | model = get_peft_model(model, config) 261 | model = load_model_state(model=model, model_save_dir=MODEL_SAVE_DIR) 262 | if USE_CUDA: 263 | model = model.half().cuda() 264 | else: 265 | model = model.bfloat16() 266 | print_named_parameters(model, use_print_data=True) 267 | # print_named_parameters(model) 268 | 269 | tokenizer = LLMTokenizer.from_pretrained(PATH_MODEL_PRETRAIN, add_eos_token=True) 270 | ID_END = 151643 271 | ID_BOS = 151644 272 | ID_EOS = 151645 273 | ID_PAD = ID_EOS 274 | ID_BR = 1699 # "\\n" 275 | ID_USER = 872 276 | ID_MODEL = 8948 277 | tokenizer.pad_token_id = ID_EOS 278 | tokenizer.eos_token_id = ID_EOS 279 | tokenizer.padding_side = "left" 280 | # tokenizer.padding_side = "right" # NO use attention-mask 281 | print(ID_PAD) 282 | print(ID_BOS) 283 | print(ID_EOS) 284 | print(ID_BR) 285 | print(ID_USER) 286 | print(ID_MODEL) 287 | """ 288 | 
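(added reference note: the special-token ids listed below can be re-derived from the tokenizer
loaded above, e.g. tokenizer.convert_tokens_to_ids("<|im_end|>") is expected to return 151645
for the Qwen2 vocabulary)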
"<|endoftext|>": 151643, 289 | "<|im_start|>": 151644, 290 | "<|im_end|>": 151645 291 | "system": 8948, 292 | "user": 872, 293 | "\\n": 1699, 294 | """ 295 | STOP_WORDS_IDS = [[ID_BOS], [ID_EOS], [ID_END]] 296 | 297 | 298 | 299 | def predict(data_point, generation_config): 300 | """ 推理 """ 301 | prompt_dict = generate_prompt(data_point) 302 | # inputs = tokenizer([text_1], return_tensors="pt", padding=True) 303 | input_ids = prompt_dict.get("input_ids") 304 | input_ids = torch.tensor([input_ids], dtype=torch.long) 305 | if USE_CUDA: 306 | input_ids = input_ids.cuda() 307 | generation_config = GenerationConfig(**generation_config) 308 | with torch.no_grad(): 309 | generation_output = model.generate( 310 | input_ids=input_ids, 311 | generation_config=generation_config, 312 | # stop_words_ids=STOP_WORDS_IDS, 313 | return_dict_in_generate=True, 314 | output_scores=True, 315 | ) 316 | s = generation_output.sequences[0] 317 | output = tokenizer.decode(s) 318 | print(input_ids) 319 | print(s) 320 | print(output) 321 | return output 322 | class Item(BaseModel): 323 | instruction: str = "" 324 | text: str = "1+1=" 325 | penalty_alpha: float = 1.0 326 | max_new_tokens: int = 128 327 | temperature: float = 0.8 # 0.95 # 0.35 # 0.95 328 | do_sample: bool = True 329 | num_beams: int = 1 330 | top_p: float = 0.8 # 0.75 331 | top_k: int = 50 332 | 333 | 334 | @app.post("/nlg/text_generate") 335 | def text_generate(request_data: Item): 336 | instruction = request_data.instruction 337 | text = request_data.text 338 | penalty_alpha = request_data.penalty_alpha 339 | max_new_tokens = request_data.max_new_tokens 340 | temperature = request_data.temperature 341 | do_sample = request_data.do_sample 342 | num_beams = request_data.num_beams 343 | top_p = request_data.top_p 344 | top_k = request_data.top_k 345 | 346 | generation_dict = vars(request_data) 347 | print(generation_dict) 348 | generation_dict.pop("max_new_tokens") 349 | generation_dict.pop("instruction") 350 | generation_dict.pop("text") 351 | data_point = {"instruction": instruction, "input": text, "output": ""} 352 | generation_config = {"temperature": temperature, 353 | "top_p": top_p, 354 | "top_k": top_k, 355 | "num_beams": num_beams, 356 | "do_sample": do_sample, 357 | "penalty_alpha": penalty_alpha, 358 | "max_new_tokens": max_new_tokens, 359 | "pad_token_id": ID_PAD, 360 | "eos_token_id": ID_EOS, 361 | } 362 | try: # 数据预处理, 模型预测 363 | response = predict(data_point, generation_config) 364 | except Exception as e: 365 | logger.info(traceback.print_exc()) 366 | response = "<|im_end|>" 367 | return {"response": response} 368 | 369 | 370 | if __name__ == '__main__': 371 | import uvicorn 372 | uvicorn.run(app=app, 373 | host="0.0.0.0", 374 | port=8036, 375 | workers=1) 376 | 377 | 378 | """ 379 | # nohup python post_api.py > tc.post_api.py.log 2>&1 & 380 | # tail -n 1000 -f tc.post_api.py.log 381 | # |myz| 382 | 383 | 可以在浏览器生成界面直接访问: http://localhost:8036/docs 384 | """ 385 | 386 | -------------------------------------------------------------------------------- /qwen2_sft/ft_qwen2/predict.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/3/25 21:56 4 | # @author : Mo 5 | # @function: 推理 6 | 7 | 8 | import random 9 | import time 10 | import sys 11 | import os 12 | import traceback 13 | 14 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 15 | print(path_root) 16 | sys.path.append(path_root) 17 | 18 | path_root = 
os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 19 | print(path_root) 20 | sys.path.append(path_root) 21 | from qwen2_sft.ft_qwen2.config import CUDA_VISIBLE_DEVICES, USE_TORCH, CPU_NUMS # from config 22 | os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:3072" 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" #CUDA_VISIBLE_DEVICES 24 | os.environ["USE_TORCH"] = USE_TORCH 25 | os.environ["OMP_NUM_THREADS"] = CPU_NUMS # export OMP_NUM_THREADS=1 26 | os.environ["OPENBLAS_NUM_THREADS"] = CPU_NUMS # export OPENBLAS_NUM_THREADS=1 27 | os.environ["MKL_NUM_THREADS"] = CPU_NUMS # export MKL_NUM_THREADS=1 28 | os.environ["VECLIB_MAXIMUM_THREADS"] = CPU_NUMS # export VECLIB_MAXIMUM_THREADS=1 29 | os.environ["NUMEXPR_NUM_THREADS"] = CPU_NUMS # export NUMEXPR_NUM_THREADS=1 30 | 31 | from peft import LoraConfig, get_peft_model 32 | from transformers import GenerationConfig 33 | from pydantic import BaseModel 34 | from rouge import Rouge # pip install rouge 35 | from tqdm import tqdm 36 | import torch 37 | import jieba 38 | 39 | # from qwen2_sft.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as LLMTokenizer 40 | # from qwen2_sft.models.qwen2.configuration_qwen2 import Qwen2Config as LLMConfig 41 | # from qwen2_sft.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM as LLMModel 42 | from transformers import Qwen2Tokenizer as LLMTokenizer 43 | from transformers import Qwen2ForCausalLM as LLMModel 44 | from transformers import Qwen2Config as LLMConfig 45 | from qwen2_sft.ft_qwen2.config import PATH_MODEL_PRETRAIN, DATA_PATH, MODEL_SAVE_DIR, REPO_ID 46 | from qwen2_sft.ft_qwen2.config import MICRO_BATCH_SIZE, BATCH_SIZE, GRADIENT_ACCUMULATION_STEPS 47 | from qwen2_sft.ft_qwen2.config import LEARNING_RATE, EPOCHS, SAVE_STEPS, VAL_SET_SIZE, TARGET_MODULES 48 | from qwen2_sft.ft_qwen2.config import IS_PARALLELIZABLE, MODEL_PARALLEL, USE_CACHE 49 | from qwen2_sft.ft_qwen2.config import MAX_LENGTH_Q, MAX_LENGTH_A, MAX_LENGTH_QA 50 | from qwen2_sft.ft_qwen2.config import LORA_DROPOUT, LORA_ALPHA, LORA_R 51 | from qwen2_sft.ft_qwen2.config import USE_CUDA 52 | 53 | 54 | # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 55 | # world_size = int(os.environ.get("WORLD_SIZE", 1)) 56 | # ddp = world_size != 1 57 | # device_map = "auto" 58 | # # USE_CUDA = True 59 | # print(device_map) 60 | # print(ddp) 61 | 62 | def load_model_state(model, model_save_dir="./", model_name="adapter_model.safetensors", device="cpu"): 63 | """ 仅加载模型参数(推荐使用) """ 64 | try: 65 | path_model = os.path.join(model_save_dir, model_name) 66 | peft_config = LoraConfig.from_pretrained(model_save_dir) 67 | peft_config.inference_mode = True 68 | model = get_peft_model(model, peft_config) 69 | 70 | try: 71 | if path_model.endswith(".safetensors"): 72 | from safetensors.torch import load_file, save_file 73 | from safetensors import safe_open 74 | state_dict = {} 75 | with safe_open(path_model, framework="pt", device="cpu") as f: 76 | for k in f.keys(): 77 | state_dict[k] = f.get_tensor(k) 78 | ### if path_model.endswith(".bin") or path_model.endswith(".pt"): 79 | else: 80 | state_dict = torch.load(path_model, map_location=torch.device(device)) 81 | except Exception as e: 82 | print(traceback.print_exc()) 83 | ### 全部训练完的话会用这个, 即便是.safetensors 84 | state_dict = torch.load(path_model, map_location=torch.device(device)) 85 | 86 | # state_dict = torch.load(path_model, map_location=torch.device(device)) 87 | # print(state_dict.keys()) 88 | state_dict = {"base_model.model." 
+ k.replace("_orig_mod.", "") 89 | .replace(".lora_A.weight", ".lora_A.default.weight") 90 | .replace(".lora_B.weight", ".lora_B.default.weight") 91 | : v for k, v in state_dict.items()} 92 | print(state_dict.keys()) 93 | print("#" * 128) 94 | ### 排查不存在model.keys的 state_dict.key 95 | name_dict = {name: 0 for name, param in model.named_parameters()} 96 | print(name_dict.keys()) 97 | print("#" * 128) 98 | for state_dict_key in state_dict.keys(): 99 | if state_dict_key not in name_dict: 100 | print("{} is not exist!".format(state_dict_key)) 101 | model.load_state_dict(state_dict, strict=False) 102 | # model.to(device) 103 | print("******model loaded success******") 104 | print("self.device: {}".format(device)) 105 | except Exception as e: 106 | print(str(e)) 107 | print(traceback.print_exc()) 108 | return model 109 | def save_model_state(model, config=None, model_save_dir="./", model_name="adapter_model.safetensors"): 110 | """ 仅保存模型参数(推荐使用) """ 111 | if not os.path.exists(model_save_dir): 112 | os.makedirs(model_save_dir) 113 | if config: 114 | config.save_pretrained(model_save_dir) 115 | # save model 116 | path_model = os.path.join(model_save_dir, model_name) 117 | # torch.save(model.state_dict(), path_model) 118 | grad_params_dict = {k: v.to("cpu") for k, v in model.named_parameters() 119 | if v.requires_grad == True} 120 | torch.save(grad_params_dict, path_model) 121 | print("******model_save_path is {}******".format(path_model)) 122 | def prepare_model_for_half_training(model, output_embedding_layer_name="lm_head", 123 | use_gradient_checkpointing=True, layer_norm_names=["layer_norm"]): 124 | r""" 125 | This method wrapps the entire protocol for preparing a model before running a training. This includes: 126 | 1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm 127 | head to fp32 128 | 129 | Args: 130 | model, (`transformers.PreTrainedModel`): 131 | The loaded model from `transformers` 132 | """ 133 | # 不要使用 model.half(), 这样会先截取精度再训练了, 最初data就要保持half 134 | for name, param in model.named_parameters(): 135 | # freeze base model's layers 136 | param.requires_grad = False 137 | # cast layer norm in fp32 for stability for 8bit models 138 | if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names): 139 | param.data = param.data.to(torch.float32) 140 | elif output_embedding_layer_name in name: # lm_head也需要是tf.float32(最后一层) 141 | param.data = param.data.to(torch.float32) 142 | else: 143 | param.data = param.data.to(torch.half) 144 | 145 | if use_gradient_checkpointing: 146 | # For backward compatibility 147 | if hasattr(model, "enable_input_require_grads"): 148 | model.enable_input_require_grads() 149 | else: 150 | def make_inputs_require_grad(module, input, output): 151 | output.requires_grad_(True) 152 | model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) 153 | # enable gradient checkpointing for memory efficiency 154 | model.gradient_checkpointing_enable() 155 | return model 156 | def print_named_parameters(model, use_print_data=True): 157 | """ 打印模型训练参数/数据类型信息 """ 158 | trainable_params = 0 159 | all_param = 0 160 | for name, param in model.named_parameters(): 161 | if use_print_data: 162 | print((name, param.data.dtype, param.requires_grad, param.data)) 163 | else: 164 | print((name, param.data.dtype, param.requires_grad)) 165 | num_params = param.numel() 166 | # if using DS Zero 3 and the weights are initialized empty 167 | if num_params == 0 and hasattr(param, "ds_numel"): 
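# Under DeepSpeed ZeRO stage 3 the parameter storage is partitioned across processes, so
# param.numel() can be 0 locally; DeepSpeed exposes the full element count as param.ds_numel.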
168 | num_params = param.ds_numel 169 | all_param += num_params 170 | if param.requires_grad: 171 | trainable_params += num_params 172 | print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}") 173 | def generate_prompt(data_point, is_logger=False): 174 | """ 指令微调: 175 | """ 176 | 177 | text_input = data_point.get("instruction", "") + "\t" + data_point.get("input", "") 178 | text_out = data_point.get("output", "") 179 | prompt_text_1 = """<|im_start|>system 180 | You are a helpful assistant.<|im_end|> 181 | <|im_start|>user 182 | {}<|im_end|> 183 | <|im_start|>assistant""" 184 | tokens = tokenizer.tokenize(prompt_text_1) 185 | max_len = MAX_LENGTH_QA - len(tokens) 186 | text_1 = prompt_text_1.format(text_input[:max_len]) 187 | x = tokenizer.encode(text_1, add_special_tokens=False) 188 | if len(x) > (MAX_LENGTH_Q + MAX_LENGTH_A): 189 | x = x[:MAX_LENGTH_Q] + [ID_EOS] 190 | out = {"input_ids": x, "labels": []} 191 | if is_logger: 192 | print(text_1) 193 | print(out) 194 | return out 195 | 196 | 197 | model = LLMModel.from_pretrained(PATH_MODEL_PRETRAIN) 198 | # model = prepare_model_for_half_training(model, 199 | # use_gradient_checkpointing=False, 200 | # output_embedding_layer_name="lm_head", 201 | # layer_norm_names=["post_attention_layernorm", 202 | # "input_layernorm", 203 | # "norm" 204 | # ], 205 | # ) 206 | config = LoraConfig(target_modules=TARGET_MODULES, 207 | lora_dropout=LORA_DROPOUT, 208 | lora_alpha=LORA_ALPHA, 209 | task_type="CAUSAL_LM", 210 | bias="none", 211 | r=LORA_R, 212 | ) 213 | model = get_peft_model(model, config) 214 | model = load_model_state(model=model, model_save_dir=MODEL_SAVE_DIR) 215 | if USE_CUDA: 216 | model = model.half().cuda() 217 | else: 218 | model = model.bfloat16() 219 | print_named_parameters(model, use_print_data=True) 220 | # print_named_parameters(model) 221 | 222 | tokenizer = LLMTokenizer.from_pretrained(PATH_MODEL_PRETRAIN, add_eos_token=True) 223 | ID_END = 151643 224 | ID_BOS = 151644 225 | ID_EOS = 151645 226 | ID_PAD = ID_EOS 227 | ID_BR = 1699 # "\\n" 228 | ID_USER = 872 229 | ID_MODEL = 8948 230 | tokenizer.pad_token_id = ID_EOS 231 | tokenizer.eos_token_id = ID_EOS 232 | tokenizer.padding_side = "left" 233 | # tokenizer.padding_side = "right" # NO use attention-mask 234 | print(ID_PAD) 235 | print(ID_BOS) 236 | print(ID_EOS) 237 | print(ID_BR) 238 | print(ID_USER) 239 | print(ID_MODEL) 240 | """ 241 | "<|endoftext|>": 151643, 242 | "<|im_start|>": 151644, 243 | "<|im_end|>": 151645 244 | "system": 8948, 245 | "user": 872, 246 | "\\n": 1699, 247 | """ 248 | STOP_WORDS_IDS = [[ID_BOS], [ID_EOS], [ID_END]] 249 | 250 | 251 | def predict(data_dict): 252 | """ 推理 """ 253 | prompt_dict = generate_prompt(data_dict) 254 | # inputs = tokenizer([text_1], return_tensors="pt", padding=True) 255 | input_ids = prompt_dict.get("input_ids") 256 | input_ids = torch.tensor([input_ids], dtype=torch.long) 257 | if USE_CUDA: 258 | input_ids = input_ids.cuda() 259 | generation_config = GenerationConfig( 260 | # temperature=0.8, 261 | # top_p=0.8, 262 | temperature=0.95, 263 | top_p=0.75, 264 | top_k=50, 265 | num_beams=1, 266 | do_sample=True, 267 | max_new_tokens=256, 268 | # penalty_alpha=1.5, 269 | pad_token_id=ID_PAD, 270 | eos_token_id=ID_EOS, 271 | ) 272 | with torch.no_grad(): 273 | generation_output = model.generate( 274 | input_ids=input_ids, 275 | generation_config=generation_config, 276 | # stop_words_ids=STOP_WORDS_IDS, 277 | # stop_words_ids=[[ID_EOS]], 278 | 
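# With return_dict_in_generate=True, generate() returns a structured output object whose
# .sequences field holds the prompt plus the newly generated token ids (decoded just below).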
return_dict_in_generate=True, 279 | # return_dict_in_generate=True, 280 | # output_scores=True, 281 | # max_new_tokens=512, 282 | ) 283 | s = generation_output.sequences[0] 284 | output = tokenizer.decode(s) 285 | print(data_dict) 286 | print(input_ids) 287 | print(output) 288 | # output = output.split("答:")[-1] 289 | return output 290 | 291 | 292 | 293 | if __name__ == '__main__': 294 | data_dict = {"instruction": "类型#裙*版型#宽松*版型#显瘦*颜色#黑色*图案#撞色*裙型#直筒裙*裙款式#拼接", 295 | "input": "", 296 | "output": "" 297 | } 298 | res = predict(data_dict) 299 | print(res) 300 | while True: 301 | time_start = time.time() 302 | history = [] 303 | print("请输入:") 304 | ques = input() 305 | print("请稍等...") 306 | try: 307 | if ques.strip().upper() == "CLEAR": 308 | history = [] 309 | print("clear ok") 310 | continue 311 | else: 312 | print("#" * 128) 313 | ques_dict = {"instruction": ques, "input": "", "output": ""} 314 | res = predict(ques_dict) 315 | print(res) 316 | except Exception as e: 317 | print(str(e)) 318 | print(time.time() - time_start) 319 | 320 | """ 321 | python predict.py 322 | 323 | """ -------------------------------------------------------------------------------- /qwen2_sft/ft_qwen2/train.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/3/5 21:04 4 | # @author : Mo 5 | # @function: qwen1.5 6 | 7 | 8 | import traceback 9 | import random 10 | import sys 11 | import os 12 | 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | print(path_root) 15 | sys.path.append(path_root) 16 | from qwen2_sft.ft_qwen2.config import CUDA_VISIBLE_DEVICES, USE_TORCH, CPU_NUMS # from config 17 | os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:3072" 18 | os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES 19 | os.environ["USE_TORCH"] = USE_TORCH 20 | os.environ["OMP_NUM_THREADS"] = CPU_NUMS # export OMP_NUM_THREADS=1 21 | os.environ["OPENBLAS_NUM_THREADS"] = CPU_NUMS # export OPENBLAS_NUM_THREADS=1 22 | os.environ["MKL_NUM_THREADS"] = CPU_NUMS # export MKL_NUM_THREADS=1 23 | os.environ["VECLIB_MAXIMUM_THREADS"] = CPU_NUMS # export VECLIB_MAXIMUM_THREADS=1 24 | os.environ["NUMEXPR_NUM_THREADS"] = CPU_NUMS # export NUMEXPR_NUM_THREADS=1 25 | 26 | from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES 27 | from transformers.modeling_utils import unwrap_model 28 | from peft import LoraConfig, get_peft_model 29 | from transformers import GenerationConfig 30 | from tensorboardX import SummaryWriter 31 | from datasets import load_dataset 32 | from pydantic import BaseModel 33 | from rouge import Rouge # pip install rouge 34 | from tqdm import tqdm 35 | import transformers 36 | import torch 37 | 38 | # from qwen2_sft.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as LLMTokenizer 39 | # from qwen2_sft.models.qwen2.configuration_qwen2 import Qwen2Config as LLMConfig 40 | # from qwen2_sft.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM as LLMModel 41 | from transformers import Qwen2Tokenizer as LLMTokenizer 42 | from transformers import Qwen2ForCausalLM as LLMModel 43 | from transformers import Qwen2Config as LLMConfig 44 | from qwen2_sft.ft_qwen2.config import PATH_MODEL_PRETRAIN, DATA_PATH, MODEL_SAVE_DIR, REPO_ID 45 | from qwen2_sft.ft_qwen2.config import MICRO_BATCH_SIZE, BATCH_SIZE, GRADIENT_ACCUMULATION_STEPS 46 | from qwen2_sft.ft_qwen2.config import LEARNING_RATE, EPOCHS, SAVE_STEPS, VAL_SET_SIZE, TARGET_MODULES 47 | from 
qwen2_sft.ft_qwen2.config import IS_PARALLELIZABLE, MODEL_PARALLEL, USE_CACHE 48 | from qwen2_sft.ft_qwen2.config import MAX_LENGTH_Q, MAX_LENGTH_A, MAX_LENGTH_QA 49 | from qwen2_sft.ft_qwen2.config import LORA_DROPOUT, LORA_ALPHA, LORA_R 50 | from qwen2_sft.ft_qwen2.config import USE_CUDA, WEIGHT_DECAY 51 | from qwen2_sft.ft_qwen2.config import USE_ALL_LOSS 52 | 53 | 54 | tensorboardx_witer = SummaryWriter(logdir=MODEL_SAVE_DIR) 55 | # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 56 | use_all_loss = USE_ALL_LOSS or True 57 | # world_size = int(os.environ.get("WORLD_SIZE", 1)) 58 | # device_map = "auto" 59 | # ddp = world_size != 1 60 | # print(device_map) 61 | # print(ddp) 62 | 63 | 64 | def save_model_state(model, config=None, model_save_dir="./", model_name="adapter_model.safetensors"): 65 | """ 仅保存 有梯度 的 模型参数(推荐使用) """ 66 | if not os.path.exists(model_save_dir): 67 | os.makedirs(model_save_dir) 68 | # save config 69 | if config: 70 | config.save_pretrained(model_save_dir) 71 | # config.to_dict() 72 | # save model 73 | path_model = os.path.join(model_save_dir, model_name) 74 | grad_params_dict = {k: v.to("cpu") for k, v in model.named_parameters() 75 | if v.requires_grad == True} 76 | torch.save(grad_params_dict, path_model) 77 | print("******model_save_path is {}******".format(path_model)) 78 | def print_named_parameters(model, use_print_data=False): 79 | """ 打印模型训练参数/数据类型信息 """ 80 | trainable_params = 0 81 | all_param = 0 82 | for name, param in model.named_parameters(): 83 | if use_print_data: 84 | print((name, param.data.dtype, param.requires_grad, param.data)) 85 | else: 86 | print((name, param.data.dtype, param.requires_grad)) 87 | num_params = param.numel() 88 | # if using DS Zero 3 and the weights are initialized empty 89 | if num_params == 0 and hasattr(param, "ds_numel"): 90 | num_params = param.ds_numel 91 | all_param += num_params 92 | if param.requires_grad: 93 | trainable_params += num_params 94 | print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}") 95 | def prepare_model_for_half_training(model, output_embedding_layer_name="lm_head", 96 | use_gradient_checkpointing=True, layer_norm_names=["layer_norm"]): 97 | r""" 98 | This method wrapps the entire protocol for preparing a model before running a training. 
This includes: 99 | 1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm 100 | head to fp32 101 | 102 | Args: 103 | model, (`transformers.PreTrainedModel`): 104 | The loaded model from `transformers` 105 | """ 106 | # 不要使用 model.half(), 这样会先截取精度再训练了, 最初data就要保持half 107 | for name, param in model.named_parameters(): 108 | # freeze base model's layers 109 | param.requires_grad = False 110 | # cast layer norm in fp32 for stability for 8bit models 111 | if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names): 112 | param.data = param.data.to(torch.float32) 113 | elif output_embedding_layer_name in name: # lm_head也需要是tf.float32(最后一层) 114 | param.data = param.data.to(torch.float32) 115 | else: 116 | param.data = param.data.to(torch.half) 117 | 118 | if use_gradient_checkpointing: 119 | # For backward compatibility 120 | if hasattr(model, "enable_input_require_grads"): 121 | model.enable_input_require_grads() 122 | else: 123 | def make_inputs_require_grad(module, input, output): 124 | output.requires_grad_(True) 125 | model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) 126 | # enable gradient checkpointing for memory efficiency 127 | model.gradient_checkpointing_enable() 128 | return model 129 | def generate_prompt(data_point, is_logger=False): 130 | """ 指令微调: 131 | 普通句子续写: bos + text + eos 132 | 带 prompt: 133 | ‘user 134 | Knock knock. 135 | model 136 | Who’s there?model 137 | user 138 | Gemma. 139 | model 140 | Gemma who?model’ 141 | """ 142 | text_input = data_point.get("content", "") 143 | text_out = data_point.get("summary", "") 144 | 145 | prompt_text_1 = """<|im_start|>system 146 | You are a helpful assistant.<|im_end|> 147 | <|im_start|>user 148 | {}<|im_end|>""" 149 | prompt_text_2 = """\n<|im_start|>assistant 150 | {}<|im_end|>""" 151 | text_1 = prompt_text_1.format(text_input.strip()) 152 | text_2 = prompt_text_2.format(text_out.strip()) 153 | 154 | x = tokenizer.encode(text_1, add_special_tokens=False) 155 | y = tokenizer.encode(text_2, add_special_tokens=False) 156 | if len(x) + len(y) > (MAX_LENGTH_Q + MAX_LENGTH_A): 157 | x = x[:MAX_LENGTH_Q] + [ID_EOS] 158 | y = y[:MAX_LENGTH_A] + [ID_EOS] 159 | out = {"input_ids": x, "labels": y} 160 | if is_logger: 161 | print(text_1) 162 | print(text_2) 163 | print(out) 164 | return out 165 | 166 | def data_collator(batch): 167 | # there's probably a way to do this with the tokenizer settings 168 | len_max_batch = [len(batch[i].get("input_ids")) + len(batch[i].get("labels")) 169 | for i in range(len(batch))] 170 | len_max_batch = min(MAX_LENGTH_QA, max(len_max_batch)) 171 | batch_attention_mask = [] 172 | batch_input_ids = [] 173 | batch_labels = [] 174 | for ba in batch: 175 | x, y = ba.get("input_ids"), ba.get("labels") 176 | len_padding = len_max_batch - len(x) - len(y) 177 | if not use_all_loss: ### 部分loss参与计算, output 178 | if tokenizer.padding_side and tokenizer.padding_side == "left": 179 | labels = [-100] * len_padding + [-100] * len(x) + y 180 | input_ids = [ID_PAD] * len_padding + x + y 181 | attention_mask = [0] * len_padding + [1] * (len_max_batch - len_padding) 182 | else: 183 | labels = [-100] * len(x) + y + [-100] * len_padding 184 | input_ids = x + y + [ID_PAD] * len_padding 185 | attention_mask = [1] * (len(x)+len(y)) + [0] * len_padding 186 | else: ### 全部loss参与计算, input + output 187 | if tokenizer.padding_side and tokenizer.padding_side == "left": 188 | labels = [-100] * len_padding + x + y 189 | input_ids = 
[ID_PAD] * len_padding + x + y 190 | attention_mask = [0] * len_padding + [1] * (len_max_batch - len_padding) 191 | else: 192 | labels = x + y + [-100] * len_padding 193 | input_ids = x + y + [ID_PAD] * len_padding 194 | attention_mask = [1] * (len(x)+len(y)) + [0] * len_padding 195 | tensor_attention_mask = torch.tensor(attention_mask, dtype=torch.long) 196 | tensor_input_ids = torch.tensor(input_ids, dtype=torch.long) 197 | tensor_labels = torch.tensor(labels, dtype=torch.long) 198 | batch_attention_mask.append(tensor_attention_mask) 199 | batch_input_ids.append(tensor_input_ids) 200 | batch_labels.append(tensor_labels) 201 | batch_attention_mask = torch.stack(batch_attention_mask) 202 | batch_input_ids = torch.stack(batch_input_ids) 203 | batch_labels = torch.stack(batch_labels) 204 | input_dict = {"attention_mask": batch_attention_mask, # no use 205 | "input_ids": batch_input_ids, 206 | "labels": batch_labels, 207 | } 208 | return input_dict 209 | def dfs_file(path_dir): 210 | """ 211 | 递归获取某个目录下的所有文件(所有层, 包括子目录) 212 | Args: 213 | path_dir[String]:, path of dir, eg. "/home/data" 214 | Returns: 215 | data[List]: data of input, eg. ["2020_01_08.txt"] 216 | """ 217 | path_files = [] 218 | for root, dirs, files in os.walk(path_dir): # 分别代表根目录、文件夹、文件 219 | for file in files: # 遍历文件 220 | file_path = os.path.join(root, file) # 获取文件绝对路径 221 | path_files.append(file_path) # 将文件路径添加进列表 222 | files = list(set(path_files)) 223 | files.sort() # the same list 224 | return files 225 | 226 | 227 | model = LLMModel.from_pretrained(PATH_MODEL_PRETRAIN, torch_dtype=torch.bfloat16) 228 | 229 | # model = prepare_model_for_half_training(model, 230 | # use_gradient_checkpointing=True, 231 | # output_embedding_layer_name="lm_head", 232 | # layer_norm_names=["post_attention_layernorm", 233 | # "input_layernorm", 234 | # "norm" 235 | # ], 236 | # ) 237 | model.gradient_checkpointing_enable() 238 | model.enable_input_require_grads() 239 | model.is_parallelizable = IS_PARALLELIZABLE 240 | model.model_parallel = MODEL_PARALLEL 241 | model.config.use_cache = USE_CACHE 242 | config = LoraConfig(target_modules=TARGET_MODULES, 243 | lora_dropout=LORA_DROPOUT, 244 | lora_alpha=LORA_ALPHA, 245 | task_type="CAUSAL_LM", 246 | bias="none", 247 | r=LORA_R, 248 | ) 249 | model = get_peft_model(model, config) 250 | print_named_parameters(model) 251 | model = model.cuda() 252 | # for param in filter(lambda p: p.requires_grad, model.parameters()): 253 | # param.data = param.data.to(torch.float32) 254 | 255 | for name, param in model.named_parameters(): 256 | if "LoR" in name: # 某些peft版本默认dtype=fp16, 这里全部转为 fp32 257 | param.data = param.data.to(torch.float32) 258 | 259 | print_named_parameters(model) 260 | 261 | tokenizer = LLMTokenizer.from_pretrained(PATH_MODEL_PRETRAIN, add_eos_token=True) 262 | ID_END = 151643 263 | ID_BOS = 151644 264 | ID_EOS = 151645 265 | ID_PAD = ID_EOS 266 | ID_BR = 1699 # "\\n" 267 | ID_USER = 872 268 | ID_MODEL = 8948 269 | tokenizer.pad_token_id = ID_EOS 270 | tokenizer.eos_token_id = ID_EOS 271 | tokenizer.padding_side = "left" 272 | # tokenizer.padding_side = "right" # NO use attention-mask 273 | print(ID_PAD) 274 | print(ID_BOS) 275 | print(ID_EOS) 276 | print(ID_BR) 277 | print(ID_USER) 278 | print(ID_MODEL) 279 | """ 280 | "<|endoftext|>": 151643, 281 | "<|im_start|>": 151644, 282 | "<|im_end|>": 151645 283 | "system": 8948, 284 | "user": 872, 285 | "\\n": 1699, 286 | """ 287 | STOP_WORDS_IDS = [[ID_BOS], [ID_EOS], [ID_END]] 288 | 289 | 290 | 291 | ### 只有一个train的情况 292 | data = 
load_dataset("json", data_files=DATA_PATH) 293 | if VAL_SET_SIZE > 0: 294 | # train_val = data["train"].train_test_split(test_size=min(VAL_SET_SIZE, 295 | # int(len(data["train"])/10000)), shuffle=True, seed=42) 296 | VAL_SET_SIZE = max(min(VAL_SET_SIZE, int(len(data["train"])/10000)), 1) 297 | generate_prompt(data["train"][0], is_logger=True) 298 | train_val = data["train"].train_test_split(test_size=VAL_SET_SIZE, shuffle=True, seed=42) 299 | train_data = train_val["train"].shuffle().map(generate_prompt) 300 | val_data = train_val["test"].shuffle().map(generate_prompt) 301 | else: 302 | generate_prompt(data["train"][0], is_logger=True) 303 | train_data = data["train"].shuffle().map(generate_prompt) 304 | val_data = None 305 | 306 | 307 | class CustomTrainer(transformers.Trainer): 308 | def compute_loss(self, model, inputs, return_outputs=False): 309 | """ newest loss """ 310 | inputs = {k: v.cuda() for k, v in inputs.items()} 311 | outputs = model(**inputs) # if contain labels, will calculate loss 312 | 313 | try: 314 | logs = {} 315 | tr_loss_scalar = self._nested_gather(outputs.loss.detach()).mean().item() 316 | logs["loss"] = round(tr_loss_scalar, 4) 317 | logs["lr"] = self.lr_scheduler.get_last_lr()[0] 318 | step = self.state.global_step 319 | for k, v in logs.items(): 320 | tensorboardx_witer.add_scalar(k, v, step) 321 | self.log(logs) 322 | except Exception as e: 323 | print(traceback.print_exc()) 324 | print(logs) 325 | 326 | if self.label_smoother is not None and "labels" in inputs: 327 | labels = inputs.pop("labels") 328 | else: 329 | labels = None 330 | # Save past state if it exists 331 | # TODO: this needs to be fixed and made cleaner later. 332 | if self.args.past_index >= 0: 333 | self._past = outputs[self.args.past_index] 334 | 335 | if labels is not None: 336 | if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): 337 | loss = self.label_smoother(outputs, labels, shift_labels=True) 338 | else: 339 | loss = self.label_smoother(outputs, labels) 340 | else: 341 | if isinstance(outputs, dict) and "loss" not in outputs: 342 | raise ValueError( 343 | "The model did not return a loss from the inputs, only the following keys: " 344 | f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." 345 | ) 346 | # We don't use .loss here since the model may return tuples instead of ModelOutput. 
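# In this setup no label_smoothing_factor is configured in the TrainingArguments below, so
# self.label_smoother is None, `labels` stays None above, and the loss already computed by
# the model from the collator's `labels` tensor is the one picked up here.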
347 | loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] 348 | return (loss, outputs) if return_outputs else loss 349 | 350 | 351 | trainer = CustomTrainer( 352 | # data_collator=transformers.DataCollatorForSeq2Seq( 353 | # tokenizer, pad_to_multiple_of=8, 354 | # return_tensors="pt", padding=True 355 | # ), 356 | data_collator=data_collator, 357 | train_dataset=train_data, 358 | eval_dataset=val_data, 359 | model=model, 360 | args=transformers.TrainingArguments( 361 | weight_decay=WEIGHT_DECAY, 362 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, 363 | per_device_train_batch_size=MICRO_BATCH_SIZE, 364 | learning_rate=LEARNING_RATE, 365 | num_train_epochs=EPOCHS, 366 | max_grad_norm=1.0, 367 | logging_steps=8, 368 | # warmup_steps=382, # 618 369 | # warmup_ratio=0.01, 370 | warmup_steps=1, # 618 371 | evaluation_strategy="no", 372 | lr_scheduler_type="cosine", # "constant", #'constant', # "cosine", 373 | logging_first_step=True, 374 | # evaluation_strategy="steps" if VAL_SET_SIZE > 0 else "no", 375 | # eval_steps=SAVE_STEPS if VAL_SET_SIZE > 0 else None, 376 | save_strategy="steps", 377 | save_total_limit=12, 378 | save_steps=SAVE_STEPS, 379 | # load_best_model_at_end=True if VAL_SET_SIZE > 0 else False, 380 | ddp_find_unused_parameters=None, 381 | gradient_checkpointing=True, 382 | # group_by_length=True, # group together samples of roughly the same length in training 383 | output_dir=MODEL_SAVE_DIR, 384 | report_to=[], # ["tensorboard"], # [], ["wandb"] 385 | optim="adamw_torch", # "adamw_hf", 386 | # optim="adafactor", 387 | # fp16=True, 388 | ) 389 | ) 390 | 391 | if torch.__version__ >= "2" and sys.platform != "win32": 392 | model = torch.compile(model) 393 | 394 | 395 | ## 加载训练好的权重 396 | # files = dfs_file(MODEL_SAVE_DIR) 397 | # files_name_str = str(files) 398 | # flag_checkpoint = True if files and "checkpoint" in files_name_str else False 399 | flag_checkpoint = False 400 | trainer.train(resume_from_checkpoint=flag_checkpoint) 401 | save_model_state(model=model, config=config, model_save_dir=MODEL_SAVE_DIR) 402 | print_named_parameters(model, use_print_data=True) # 查看LoRA层权重是不是为NAN溢出 403 | 404 | 405 | # nohup python train.py > tc.train.py.log 2>&1 & 406 | # tail -n 1000 -f tc.train.py.log 407 | # |myz| 408 | 409 | -------------------------------------------------------------------------------- /qwen2_sft/image_aadvgen_loss_success_7b_5e5_bfloat16_pt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/qwen2-sft/91d4dd876d31f65911fe46216298c590233b67b0/qwen2_sft/image_aadvgen_loss_success_7b_5e5_bfloat16_pt.png -------------------------------------------------------------------------------- /qwen2_sft/image_advgen_dev_success_7b_2e5_cosine_bs2_fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/qwen2-sft/91d4dd876d31f65911fe46216298c590233b67b0/qwen2_sft/image_advgen_dev_success_7b_2e5_cosine_bs2_fp16.png -------------------------------------------------------------------------------- /qwen2_sft/image_advgen_loss_nan_7b_3e4_constranr_bs4_fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/qwen2-sft/91d4dd876d31f65911fe46216298c590233b67b0/qwen2_sft/image_advgen_loss_nan_7b_3e4_constranr_bs4_fp16.png -------------------------------------------------------------------------------- /qwen2_sft/models/__init__.py: 
-------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/8/4 15:21 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /qwen2_sft/models/qwen2/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2024/3/7 15:44 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /qwen2_sft/models/qwen2/cache_utils.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, Dict, List, Optional, Tuple 3 | 4 | import torch 5 | 6 | from transformers.configuration_utils import PretrainedConfig 7 | 8 | 9 | @dataclass 10 | class Cache: 11 | """ 12 | Base, abstract class for all caches. The actual data structure is specific to each subclass. 13 | """ 14 | 15 | def update( 16 | self, 17 | key_states: torch.Tensor, 18 | value_states: torch.Tensor, 19 | layer_idx: int, 20 | cache_kwargs: Optional[Dict[str, Any]] = None, 21 | ) -> Tuple[torch.Tensor, torch.Tensor]: 22 | """ 23 | Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. 24 | 25 | Parameters: 26 | key_states (`torch.Tensor`): 27 | The new key states to cache. 28 | value_states (`torch.Tensor`): 29 | The new value states to cache. 30 | layer_idx (`int`): 31 | The index of the layer to cache the states for. 32 | cache_kwargs (`Dict[str, Any]`, `optional`): 33 | Additional arguments for the cache subclass. These are specific to each subclass and allow new types of 34 | cache to be created. 35 | 36 | Return: 37 | A tuple containing the updated key and value states. 38 | """ 39 | raise NotImplementedError("Make sure to implement `update` in a subclass.") 40 | 41 | def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: 42 | """Returns the sequence length of the cached states. A layer index can be optionally passed.""" 43 | raise NotImplementedError("Make sure to implement `get_seq_length` in a subclass.") 44 | 45 | def get_max_length(self) -> Optional[int]: 46 | """Returns the maximum sequence length of the cached states, if there is any.""" 47 | raise NotImplementedError("Make sure to implement `get_max_length` in a subclass.") 48 | 49 | def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int: 50 | """Given the sequence length of the new inputs, returns the usable length of the cache.""" 51 | # Cache without size limit -> all cache is usable 52 | # Cache with size limit -> if the length cache plus the length of the new inputs is larger the maximum cache 53 | # length, we will need to evict part of the cache (and thus not all cache is usable) 54 | max_length = self.get_max_length() 55 | previous_seq_length = self.get_seq_length(layer_idx) 56 | if max_length is not None and previous_seq_length + new_seq_length > max_length: 57 | return max_length - new_seq_length 58 | return previous_seq_length 59 | 60 | 61 | class DynamicCache(Cache): 62 | """ 63 | A cache that grows dynamically as more tokens are generated. This is the default for generative models. 64 | 65 | It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is 66 | `[batch_size, num_heads, seq_len, head_dim]`. 
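    Example (an illustrative sketch added for clarity; the tensor sizes are arbitrary):

        import torch
        cache = DynamicCache()
        k = torch.randn(1, 2, 4, 8)  # [batch_size, num_heads, seq_len, head_dim]
        v = torch.randn(1, 2, 4, 8)
        cache.update(k, v, layer_idx=0)   # first call for layer 0 just stores the states
        cache.get_seq_length(0)           # -> 4
        cache.update(torch.randn(1, 2, 1, 8), torch.randn(1, 2, 1, 8), layer_idx=0)
        cache.get_seq_length(0)           # -> 5, new states are concatenated along the seq dim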
67 | """ 68 | 69 | def __init__(self) -> None: 70 | self.key_cache: List[torch.Tensor] = [] 71 | self.value_cache: List[torch.Tensor] = [] 72 | self.seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen 73 | 74 | def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]: 75 | """ 76 | Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the 77 | sequence length. 78 | """ 79 | if layer_idx < len(self): 80 | return (self.key_cache[layer_idx], self.value_cache[layer_idx]) 81 | else: 82 | raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") 83 | 84 | def __iter__(self): 85 | """ 86 | Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over 87 | keys and values 88 | """ 89 | for layer_idx in range(len(self)): 90 | yield (self.key_cache[layer_idx], self.value_cache[layer_idx]) 91 | 92 | def __len__(self): 93 | """ 94 | Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds 95 | to the number of layers in the model. 96 | """ 97 | return len(self.key_cache) 98 | 99 | def update( 100 | self, 101 | key_states: torch.Tensor, 102 | value_states: torch.Tensor, 103 | layer_idx: int, 104 | cache_kwargs: Optional[Dict[str, Any]] = None, 105 | ) -> Tuple[torch.Tensor, torch.Tensor]: 106 | """ 107 | Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. 108 | 109 | Parameters: 110 | key_states (`torch.Tensor`): 111 | The new key states to cache. 112 | value_states (`torch.Tensor`): 113 | The new value states to cache. 114 | layer_idx (`int`): 115 | The index of the layer to cache the states for. 116 | cache_kwargs (`Dict[str, Any]`, `optional`): 117 | Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`. 118 | 119 | Return: 120 | A tuple containing the updated key and value states. 121 | """ 122 | # Update the number of seen tokens 123 | if layer_idx == 0: 124 | self.seen_tokens += key_states.shape[-2] 125 | 126 | # Update the cache 127 | if len(self.key_cache) <= layer_idx: 128 | self.key_cache.append(key_states) 129 | self.value_cache.append(value_states) 130 | else: 131 | self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2) 132 | self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2) 133 | 134 | return self.key_cache[layer_idx], self.value_cache[layer_idx] 135 | 136 | def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: 137 | """Returns the sequence length of the cached states. A layer index can be optionally passed.""" 138 | if len(self.key_cache) <= layer_idx: 139 | return 0 140 | return self.key_cache[layer_idx].shape[-2] 141 | 142 | def get_max_length(self) -> Optional[int]: 143 | """Returns the maximum sequence length of the cached states. 
DynamicCache does not have a maximum length.""" 144 | return None 145 | 146 | def reorder_cache(self, beam_idx: torch.LongTensor): 147 | """Reorders the cache for beam search, given the selected beam indices.""" 148 | for layer_idx in range(len(self.key_cache)): 149 | device = self.key_cache[layer_idx].device 150 | self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) 151 | device = self.value_cache[layer_idx].device 152 | self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) 153 | 154 | def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: 155 | """Converts the `DynamicCache` instance into the its equivalent in the legacy cache format.""" 156 | legacy_cache = () 157 | for layer_idx in range(len(self)): 158 | legacy_cache += ((self.key_cache[layer_idx], self.value_cache[layer_idx]),) 159 | return legacy_cache 160 | 161 | @classmethod 162 | def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache": 163 | """Converts a cache in the legacy cache format into an equivalent `DynamicCache`.""" 164 | cache = cls() 165 | if past_key_values is not None: 166 | for layer_idx in range(len(past_key_values)): 167 | key_states, value_states = past_key_values[layer_idx] 168 | cache.update(key_states, value_states, layer_idx) 169 | return cache 170 | 171 | 172 | class SinkCache(Cache): 173 | """ 174 | A cache that as described in the [Attention Sinks paper](https://arxiv.org/abs/2309.17453). It allows the model to 175 | generate beyond the length of its context window, without losing fluency in the conversation. As it discards past 176 | tokens, the model will lose the ability to generate tokens that depend on the context that was discarded. 177 | 178 | It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is 179 | `[batch_size, num_heads, seq_len, head_dim]`. 180 | 181 | Parameters: 182 | window_length (`int`): 183 | The length of the context window. 184 | num_sink_tokens (`int`): 185 | The number of sink tokens. See the original paper for more information. 
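For orientation, a hedged usage sketch for the `DynamicCache` defined above, with toy tensor sizes and assuming the `qwen2_sft` package is on the Python path:

```python
import torch
from qwen2_sft.models.qwen2.cache_utils import DynamicCache

cache = DynamicCache()
# [batch_size=1, num_heads=2, seq_len=3, head_dim=4]
k = torch.randn(1, 2, 3, 4)
v = torch.randn(1, 2, 3, 4)

cache.update(k, v, layer_idx=0)      # first call creates the layer entry
cache.update(k, v, layer_idx=0)      # second call concatenates along seq_len
print(cache.get_seq_length(0))       # 6
print(len(cache))                    # 1 layer so far

# Round-trip through the legacy tuple-of-tuples format.
legacy = cache.to_legacy_cache()
restored = DynamicCache.from_legacy_cache(legacy)
print(restored.key_cache[0].shape)   # torch.Size([1, 2, 6, 4])
```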
186 | """ 187 | 188 | def __init__(self, window_length: int, num_sink_tokens: int) -> None: 189 | self.key_cache: List[torch.Tensor] = [] 190 | self.value_cache: List[torch.Tensor] = [] 191 | self.window_length = window_length 192 | self.num_sink_tokens = num_sink_tokens 193 | self.cos_sin_cache = {} 194 | self.seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen 195 | 196 | @staticmethod 197 | def _rotate_half(x): 198 | x1 = x[..., : x.shape[-1] // 2] 199 | x2 = x[..., x.shape[-1] // 2 :] 200 | return torch.cat((-x2, x1), dim=-1) 201 | 202 | def _apply_key_rotary_pos_emb( 203 | self, key_states: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor 204 | ) -> torch.Tensor: 205 | rotated_key_states = (key_states * cos) + (self._rotate_half(key_states) * sin) 206 | return rotated_key_states 207 | 208 | def _get_rerotation_cos_sin( 209 | self, key_states: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor 210 | ) -> Tuple[torch.Tensor, torch.Tensor]: 211 | if key_states.shape[-2] not in self.cos_sin_cache: 212 | # Upcast to float32 temporarily for better accuracy 213 | cos = cos.to(torch.float32) 214 | sin = sin.to(torch.float32) 215 | 216 | # Compute the cos and sin required for back- and forward-rotating to one position earlier in the sequence 217 | original_cos = cos[self.num_sink_tokens + key_states.shape[-2] :] 218 | shifted_cos = cos[self.num_sink_tokens : -key_states.shape[-2]] 219 | original_sin = sin[self.num_sink_tokens + key_states.shape[-2] :] 220 | shifted_sin = sin[self.num_sink_tokens : -key_states.shape[-2]] 221 | rerotation_cos = original_cos * shifted_cos + original_sin * shifted_sin 222 | rerotation_sin = -original_sin * shifted_cos + original_cos * shifted_sin 223 | 224 | self.cos_sin_cache[key_states.shape[-2]] = ( 225 | rerotation_cos.to(key_states.dtype).unsqueeze(0), 226 | rerotation_sin.to(key_states.dtype).unsqueeze(0), 227 | ) 228 | return self.cos_sin_cache[key_states.shape[-2]] 229 | 230 | def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: 231 | """Returns the sequence length of the cached states. A layer index can be optionally passed.""" 232 | # Workaround to make 'key_states.shape[-2] + past_key_value.get_seq_length(self.layer_idx)' <= window_length 233 | if len(self.key_cache) <= layer_idx: 234 | return 0 235 | return self.key_cache[layer_idx].shape[-2] 236 | 237 | def get_max_length(self) -> Optional[int]: 238 | """Returns the maximum sequence length of the cached states.""" 239 | return self.window_length 240 | 241 | def update( 242 | self, 243 | key_states: torch.Tensor, 244 | value_states: torch.Tensor, 245 | layer_idx: int, 246 | cache_kwargs: Optional[Dict[str, Any]] = None, 247 | ) -> Tuple[torch.Tensor, torch.Tensor]: 248 | """ 249 | Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. 250 | 251 | Parameters: 252 | key_states (`torch.Tensor`): 253 | The new key states to cache. 254 | value_states (`torch.Tensor`): 255 | The new value states to cache. 256 | layer_idx (`int`): 257 | The index of the layer to cache the states for. 258 | cache_kwargs (`Dict[str, Any]`, `optional`): 259 | Additional arguments for the cache subclass. The following arguments can be used in `SinkCache`: `sin`, 260 | `cos` and `partial_rotation_size`. These arguments are used with models using RoPE, to recompute the 261 | rotation as the tokens are shifted. 262 | 263 | Return: 264 | A tuple containing the updated key and value states. 
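A hedged sketch of how the `SinkCache` above caps its length at `window_length` during decoding. No RoPE re-rotation is exercised here, so `cache_kwargs` carries no `sin`/`cos`; a real decoder layer would pass them so that the shifted keys can be re-rotated.

```python
import torch
from qwen2_sft.models.qwen2.cache_utils import SinkCache


def kv(n):
    # [batch_size=1, num_heads=2, seq_len=n, head_dim=8]
    return torch.randn(1, 2, n, 8)


cache = SinkCache(window_length=4, num_sink_tokens=1)
cache.update(kv(3), kv(3), layer_idx=0, cache_kwargs={})   # "prompt" of 3 tokens

for _ in range(3):                                          # decode 3 more tokens
    cache.update(kv(1), kv(1), layer_idx=0, cache_kwargs={})
    print(cache.get_seq_length(0))   # stays capped at window_length: 4, 4, 4
```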
265 | """ 266 | # Optional kwargs for `SinkCache` -- needed on models using RoPE. `partial_rotation_size` is used on models 267 | # with partially rotated position embeddings, like Phi or Persimmon. 268 | sin = cache_kwargs.get("sin") 269 | cos = cache_kwargs.get("cos") 270 | partial_rotation_size = cache_kwargs.get("partial_rotation_size") 271 | using_rope = cos is not None and sin is not None 272 | 273 | # Update the number of seen tokens 274 | if layer_idx == 0: 275 | self.seen_tokens += key_states.shape[-2] 276 | 277 | # [bsz, num_heads, seq_len, head_dim] 278 | if len(self.key_cache) <= layer_idx: 279 | # Empty cache 280 | self.key_cache.append(key_states) 281 | self.value_cache.append(value_states) 282 | 283 | elif key_states.shape[-2] + self.get_seq_length(layer_idx) < self.window_length: 284 | # Growing cache 285 | self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2) 286 | self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2) 287 | 288 | else: 289 | # Shifting cache 290 | keys_to_keep = self.key_cache[layer_idx][ 291 | :, :, -self.window_length + self.num_sink_tokens + key_states.shape[-2] : 292 | ] 293 | 294 | # On RoPE models, we need to recompute the Key rotation as the tokens are shifted 295 | if using_rope: 296 | rerotation_cos, rerotation_sin = self._get_rerotation_cos_sin( 297 | key_states, cos[: self.window_length], sin[: self.window_length] 298 | ) 299 | if partial_rotation_size is not None: 300 | keys_to_keep, keys_pass = ( 301 | keys_to_keep[..., :partial_rotation_size], 302 | keys_to_keep[..., partial_rotation_size:], 303 | ) 304 | keys_to_keep = self._apply_key_rotary_pos_emb(keys_to_keep, rerotation_cos, rerotation_sin) 305 | if partial_rotation_size is not None: 306 | keys_to_keep = torch.cat((keys_to_keep, keys_pass), dim=-1) 307 | 308 | # Concatenate sink tokens, shifted & rotated tokens (if needed), and new tokens 309 | sink_keys = self.key_cache[layer_idx][:, :, : self.num_sink_tokens] 310 | self.key_cache[layer_idx] = torch.cat([sink_keys, keys_to_keep, key_states], dim=-2) 311 | 312 | sink_values = self.value_cache[layer_idx][:, :, : self.num_sink_tokens] 313 | values_to_keep = self.value_cache[layer_idx][ 314 | :, :, -self.window_length + self.num_sink_tokens + value_states.shape[-2] : 315 | ] 316 | self.value_cache[layer_idx] = torch.cat([sink_values, values_to_keep, value_states], dim=-2) 317 | 318 | return self.key_cache[layer_idx], self.value_cache[layer_idx] 319 | 320 | def reorder_cache(self, beam_idx: torch.LongTensor): 321 | """Reorders the cache for beam search, given the selected beam indices.""" 322 | for layer_idx in range(len(self.key_cache)): 323 | device = self.key_cache[layer_idx].device 324 | self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) 325 | device = self.value_cache[layer_idx].device 326 | self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) 327 | 328 | 329 | class StaticCache(Cache): 330 | """ 331 | Static Cache class to be used with `torch.compile(model)`. 332 | 333 | Parameters: 334 | config (`PretrainedConfig): 335 | The configuration file defining the `max_position_embeddings`, `hidden_size` and `num_attention_heads` 336 | required to initialize the static cache. 337 | max_batch_size (`int`): 338 | The maximum batch size with which the model will be used. 339 | max_cache_len (`int`): 340 | The maximum sequence length with which the model will be used. 
341 | device (`torch.device`): 342 | The device on which the cache should be initialized. Should be the same as the layer. 343 | dtype (*optional*, defaults to `torch.float32`): 344 | The default `dtype` to use when initializing the layer. 345 | """ 346 | 347 | def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device, dtype=None) -> None: 348 | super().__init__() 349 | self.max_batch_size = max_batch_size 350 | self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len 351 | # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads 352 | self.head_dim = ( 353 | config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads 354 | ) 355 | 356 | self.dtype = dtype if dtype is not None else torch.float32 357 | self.num_key_value_heads = ( 358 | config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads 359 | ) 360 | 361 | cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim) 362 | self.key_cache: torch.Tensor = torch.zeros(cache_shape, dtype=self.dtype, device=device) 363 | self.value_cache: torch.Tensor = torch.zeros(cache_shape, dtype=self.dtype, device=device) 364 | 365 | def update( 366 | self, 367 | key_states: torch.Tensor, 368 | value_states: torch.Tensor, 369 | layer_idx: int, 370 | cache_kwargs: Optional[Dict[str, Any]] = None, 371 | ) -> Tuple[torch.Tensor, torch.Tensor]: 372 | """ 373 | Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. 374 | It is VERY important to index using a tensor, otherwise you introduce a copy to the device. 375 | 376 | Parameters: 377 | key_states (`torch.Tensor`): 378 | The new key states to cache. 379 | value_states (`torch.Tensor`): 380 | The new value states to cache. 381 | layer_idx (`int`): 382 | The index of the layer to cache the states for. Kept for backward compatibility 383 | cache_kwargs (`Dict[str, Any]`, `optional`): 384 | Additional arguments for the cache subclass. The `StaticCache` just needs the `q_len` 385 | to know how much of the cache it should overwrite. 386 | 387 | Return: 388 | A tuple containing the updated key and value states. 389 | """ 390 | new_cache_positions = cache_kwargs.get("cache_position") 391 | k_out = self.key_cache 392 | v_out = self.value_cache 393 | 394 | k_out[:, :, new_cache_positions] = key_states 395 | v_out[:, :, new_cache_positions] = value_states 396 | 397 | return k_out, v_out 398 | 399 | def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: 400 | """Returns the sequence length of the cached states that were seen by the model. `layer_idx` kept for BC""" 401 | # TODO: Fix once the stateful `int` bug in PyTorch is fixed. 402 | raise ValueError( 403 | "get_seq_length is not implemented for StaticCache. Please refer to https://github.com/huggingface/transformers/pull/29114." 404 | ) 405 | 406 | def get_usable_length(self, new_sequence_length=None, layer_idx: Optional[int] = 0) -> int: 407 | # TODO: Fix once the stateful `int` bug in PyTorch is fixed. 408 | raise ValueError( 409 | "get_seq_length is not implemented for StaticCache. Please refer to https://github.com/huggingface/transformers/pull/29114." 410 | ) 411 | 412 | def get_max_length(self) -> Optional[int]: 413 | """Returns the maximum sequence length of the cached states. 
DynamicCache does not have a maximum length.""" 414 | return self.max_cache_len 415 | 416 | def reorder_cache(self, beam_idx: torch.LongTensor): 417 | """Reorders the cache for beam search, given the selected beam indices.""" 418 | device = self.key_cache.device 419 | self.key_cache = self.key_cache.index_select(0, beam_idx.to(device)) 420 | device = self.value_cache.device 421 | self.value_cache = self.value_cache.index_select(0, beam_idx.to(device)) 422 | 423 | def to_legacy_cache(self): 424 | """Dummy function for BC. We have to keep it because otherwise the call in the forward of models will break it""" 425 | return None 426 | 427 | -------------------------------------------------------------------------------- /qwen2_sft/models/qwen2/configuration_qwen2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Qwen2 model configuration""" 16 | 17 | from transformers.configuration_utils import PretrainedConfig 18 | from transformers.utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { 24 | "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", 25 | } 26 | 27 | 28 | class Qwen2Config(PretrainedConfig): 29 | r""" 30 | This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a 31 | Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration 32 | with the defaults will yield a similar configuration to that of 33 | Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). 34 | 35 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 36 | documentation from [`PretrainedConfig`] for more information. 37 | 38 | 39 | Args: 40 | vocab_size (`int`, *optional*, defaults to 151936): 41 | Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the 42 | `inputs_ids` passed when calling [`Qwen2Model`] 43 | hidden_size (`int`, *optional*, defaults to 4096): 44 | Dimension of the hidden representations. 45 | intermediate_size (`int`, *optional*, defaults to 22016): 46 | Dimension of the MLP representations. 47 | num_hidden_layers (`int`, *optional*, defaults to 32): 48 | Number of hidden layers in the Transformer encoder. 49 | num_attention_heads (`int`, *optional*, defaults to 32): 50 | Number of attention heads for each attention layer in the Transformer encoder. 51 | num_key_value_heads (`int`, *optional*, defaults to 32): 52 | This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If 53 | `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if 54 | `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When 55 | converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed 56 | by meanpooling all the original heads within that group. For more details checkout [this 57 | paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. 58 | hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): 59 | The non-linear activation function (function or string) in the decoder. 60 | max_position_embeddings (`int`, *optional*, defaults to 32768): 61 | The maximum sequence length that this model might ever be used with. 62 | initializer_range (`float`, *optional*, defaults to 0.02): 63 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 64 | rms_norm_eps (`float`, *optional*, defaults to 1e-06): 65 | The epsilon used by the rms normalization layers. 66 | use_cache (`bool`, *optional*, defaults to `True`): 67 | Whether or not the model should return the last key/values attentions (not used by all models). Only 68 | relevant if `config.is_decoder=True`. 69 | tie_word_embeddings (`bool`, *optional*, defaults to `False`): 70 | Whether the model's input and output word embeddings should be tied. 71 | rope_theta (`float`, *optional*, defaults to 10000.0): 72 | The base period of the RoPE embeddings. 73 | use_sliding_window (`bool`, *optional*, defaults to `False`): 74 | Whether to use sliding window attention. 75 | sliding_window (`int`, *optional*, defaults to 4096): 76 | Sliding window attention (SWA) window size. If not specified, will default to `4096`. 77 | max_window_layers (`int`, *optional*, defaults to 28): 78 | The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. 79 | attention_dropout (`float`, *optional*, defaults to 0.0): 80 | The dropout ratio for the attention probabilities. 
81 | 82 | ```python 83 | >>> from transformers import Qwen2Model, Qwen2Config 84 | 85 | >>> # Initializing a Qwen2 style configuration 86 | >>> configuration = Qwen2Config() 87 | 88 | >>> # Initializing a model from the Qwen2-7B style configuration 89 | >>> model = Qwen2Model(configuration) 90 | 91 | >>> # Accessing the model configuration 92 | >>> configuration = model.config 93 | ```""" 94 | 95 | model_type = "qwen2" 96 | keys_to_ignore_at_inference = ["past_key_values"] 97 | 98 | def __init__( 99 | self, 100 | vocab_size=151936, 101 | hidden_size=4096, 102 | intermediate_size=22016, 103 | num_hidden_layers=32, 104 | num_attention_heads=32, 105 | num_key_value_heads=32, 106 | hidden_act="silu", 107 | max_position_embeddings=32768, 108 | initializer_range=0.02, 109 | rms_norm_eps=1e-6, 110 | use_cache=True, 111 | tie_word_embeddings=False, 112 | rope_theta=10000.0, 113 | use_sliding_window=False, 114 | sliding_window=4096, 115 | max_window_layers=28, 116 | attention_dropout=0.0, 117 | **kwargs, 118 | ): 119 | self.vocab_size = vocab_size 120 | self.max_position_embeddings = max_position_embeddings 121 | self.hidden_size = hidden_size 122 | self.intermediate_size = intermediate_size 123 | self.num_hidden_layers = num_hidden_layers 124 | self.num_attention_heads = num_attention_heads 125 | self.use_sliding_window = use_sliding_window 126 | self.sliding_window = sliding_window 127 | self.max_window_layers = max_window_layers 128 | 129 | # for backward compatibility 130 | if num_key_value_heads is None: 131 | num_key_value_heads = num_attention_heads 132 | 133 | self.num_key_value_heads = num_key_value_heads 134 | self.hidden_act = hidden_act 135 | self.initializer_range = initializer_range 136 | self.rms_norm_eps = rms_norm_eps 137 | self.use_cache = use_cache 138 | self.rope_theta = rope_theta 139 | self.attention_dropout = attention_dropout 140 | 141 | 142 | super().__init__( 143 | tie_word_embeddings=tie_word_embeddings, 144 | **kwargs, 145 | ) -------------------------------------------------------------------------------- /qwen2_sft/models/qwen2/modeling_attn_mask_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
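Before moving on to the attention-mask utilities, a hedged sketch tying the two files above together: a deliberately tiny `Qwen2Config` used to allocate a `StaticCache` from cache_utils.py. The sizes are illustrative only.

```python
import torch
from qwen2_sft.models.qwen2.configuration_qwen2 import Qwen2Config
from qwen2_sft.models.qwen2.cache_utils import StaticCache

config = Qwen2Config(
    vocab_size=1000, hidden_size=64, intermediate_size=128,
    num_hidden_layers=2, num_attention_heads=4, num_key_value_heads=2,
    max_position_embeddings=128,
)

cache = StaticCache(config, max_batch_size=1, max_cache_len=16, device="cpu", dtype=torch.float32)
# [bsz, kv_heads, max_cache_len, head_dim] with head_dim = hidden_size // num_attention_heads
print(cache.key_cache.shape)   # torch.Size([1, 2, 16, 16])

# Writes go to explicit positions; indexing with a tensor avoids a device copy.
pos = torch.arange(4)
k = torch.randn(1, 2, 4, 16)
v = torch.randn(1, 2, 4, 16)
cache.update(k, v, layer_idx=0, cache_kwargs={"cache_position": pos})
print(cache.get_max_length())  # 16
```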
14 | from dataclasses import dataclass 15 | from typing import List, Optional, Tuple, Union 16 | 17 | import torch 18 | 19 | 20 | @dataclass 21 | class AttentionMaskConverter: 22 | """ 23 | A utility attention mask class that allows one to: 24 | - Create a causal 4d mask 25 | - Create a causal 4d mask with slided window 26 | - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length, 27 | key_value_length) that can be multiplied with attention scores 28 | 29 | Examples: 30 | 31 | ```python 32 | >>> import torch 33 | >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter 34 | 35 | >>> converter = AttentionMaskConverter(True) 36 | >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32) 37 | tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], 38 | [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], 39 | [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], 40 | [-3.4028e+38, -3.4028e+38, -3.4028e+38, 0.0000e+00, -3.4028e+38], 41 | [-3.4028e+38, -3.4028e+38, -3.4028e+38, 0.0000e+00, 0.0000e+00]]]]) 42 | ``` 43 | 44 | Parameters: 45 | is_causal (`bool`): 46 | Whether the attention mask should be a uni-directional (causal) or bi-directional mask. 47 | 48 | sliding_window (`int`, *optional*): 49 | Optionally, the sliding window masks can be created if `sliding_window` is defined to a positive integer. 50 | """ 51 | 52 | is_causal: bool 53 | sliding_window: int 54 | 55 | def __init__(self, is_causal: bool, sliding_window: Optional[int] = None): 56 | self.is_causal = is_causal 57 | self.sliding_window = sliding_window 58 | 59 | if self.sliding_window is not None and self.sliding_window <= 0: 60 | raise ValueError( 61 | f"Make sure that when passing `sliding_window` that its value is a strictly positive integer, not `{self.sliding_window}`" 62 | ) 63 | 64 | def to_causal_4d( 65 | self, 66 | batch_size: int, 67 | query_length: int, 68 | key_value_length: int, 69 | dtype: torch.dtype, 70 | device: Union[torch.device, "str"] = "cpu", 71 | ) -> Optional[torch.Tensor]: 72 | """ 73 | Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative 74 | bias to upper right hand triangular matrix (causal mask). 75 | """ 76 | if not self.is_causal: 77 | raise ValueError(f"Please use `to_causal_4d` only if {self.__class__} has `is_causal` set to True.") 78 | 79 | # If shape is not cached, create a new causal mask and cache it 80 | input_shape = (batch_size, query_length) 81 | past_key_values_length = key_value_length - query_length 82 | 83 | # create causal mask 84 | # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] 85 | causal_4d_mask = None 86 | if input_shape[-1] > 1 or self.sliding_window is not None: 87 | causal_4d_mask = self._make_causal_mask( 88 | input_shape, 89 | dtype, 90 | device=device, 91 | past_key_values_length=past_key_values_length, 92 | sliding_window=self.sliding_window, 93 | ) 94 | 95 | return causal_4d_mask 96 | 97 | def to_4d( 98 | self, 99 | attention_mask_2d: torch.Tensor, 100 | query_length: int, 101 | dtype: torch.dtype, 102 | key_value_length: Optional[int] = None, 103 | ) -> torch.Tensor: 104 | """ 105 | Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length, 106 | key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is 107 | causal, a causal mask will be added. 
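A small, hedged example of `to_causal_4d` with a sliding window, complementing the `to_4d` example in the class docstring above; it imports from this local module rather than from `transformers`.

```python
import torch
from qwen2_sft.models.qwen2.modeling_attn_mask_utils import AttentionMaskConverter

converter = AttentionMaskConverter(is_causal=True, sliding_window=3)
mask = converter.to_causal_4d(
    batch_size=1, query_length=5, key_value_length=5,
    dtype=torch.float32, device="cpu",
)
print(mask.shape)   # torch.Size([1, 1, 5, 5])

# 0.0 where attention is allowed, a large negative value where it is masked;
# with sliding_window=3 each token sees at most itself and the 2 previous tokens.
print((mask == 0).int()[0, 0])
```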
108 | """ 109 | input_shape = (attention_mask_2d.shape[0], query_length) 110 | 111 | # create causal mask 112 | # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] 113 | causal_4d_mask = None 114 | if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal: 115 | if key_value_length is None: 116 | raise ValueError( 117 | "This attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask." 118 | ) 119 | 120 | past_key_values_length = key_value_length - query_length 121 | causal_4d_mask = self._make_causal_mask( 122 | input_shape, 123 | dtype, 124 | device=attention_mask_2d.device, 125 | past_key_values_length=past_key_values_length, 126 | sliding_window=self.sliding_window, 127 | ) 128 | elif self.sliding_window is not None: 129 | raise NotImplementedError("Sliding window is currently only implemented for causal masking") 130 | 131 | # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] 132 | expanded_attn_mask = self._expand_mask(attention_mask_2d, dtype, tgt_len=input_shape[-1]).to( 133 | attention_mask_2d.device 134 | ) 135 | 136 | if causal_4d_mask is not None: 137 | expanded_attn_mask = causal_4d_mask.masked_fill(expanded_attn_mask.bool(), torch.finfo(dtype).min) 138 | 139 | # expanded_attn_mask + causal_4d_mask can cause some overflow 140 | expanded_4d_mask = expanded_attn_mask 141 | 142 | return expanded_4d_mask 143 | 144 | @staticmethod 145 | def _make_causal_mask( 146 | input_ids_shape: torch.Size, 147 | dtype: torch.dtype, 148 | device: torch.device, 149 | past_key_values_length: int = 0, 150 | sliding_window: Optional[int] = None, 151 | ): 152 | """ 153 | Make causal mask used for bi-directional self-attention. 154 | """ 155 | bsz, tgt_len = input_ids_shape 156 | mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) 157 | mask_cond = torch.arange(mask.size(-1), device=device) 158 | mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) 159 | 160 | mask = mask.to(dtype) 161 | 162 | if past_key_values_length > 0: 163 | mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) 164 | 165 | # add lower triangular sliding window mask if necessary 166 | if sliding_window is not None: 167 | diagonal = past_key_values_length - sliding_window + 1 168 | 169 | context_mask = 1 - torch.triu(torch.ones_like(mask, dtype=torch.int), diagonal=diagonal) 170 | mask.masked_fill_(context_mask.bool(), torch.finfo(dtype).min) 171 | 172 | return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) 173 | 174 | @staticmethod 175 | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): 176 | """ 177 | Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 178 | """ 179 | bsz, src_len = mask.size() 180 | tgt_len = tgt_len if tgt_len is not None else src_len 181 | 182 | expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) 183 | 184 | inverted_mask = 1.0 - expanded_mask 185 | 186 | return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) 187 | 188 | @staticmethod 189 | def _unmask_unattended( 190 | expanded_mask: torch.FloatTensor, 191 | min_dtype: float, 192 | ): 193 | # fmt: off 194 | """ 195 | Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when 196 | using left padding. 
This is required by F.scaled_dot_product_attention memory-efficient attention path. 197 | Details: https://github.com/pytorch/pytorch/issues/110213 198 | 199 | `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len]. 200 | `attention_mask` is [bsz, src_seq_len]. 201 | 202 | The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias. 203 | 204 | For example, if `expanded_mask` is (e.g. here left-padding case) 205 | ``` 206 | [[[[0, 0, 0], 207 | [0, 0, 0], 208 | [0, 0, 1]]], 209 | [[[1, 0, 0], 210 | [1, 1, 0], 211 | [1, 1, 1]]], 212 | [[[0, 0, 0], 213 | [0, 1, 0], 214 | [0, 1, 1]]]] 215 | ``` 216 | then the modified `expanded_mask` will be 217 | ``` 218 | [[[[1, 1, 1], <-- modified 219 | [1, 1, 1], <-- modified 220 | [0, 0, 1]]], 221 | [[[1, 0, 0], 222 | [1, 1, 0], 223 | [1, 1, 1]]], 224 | [[[1, 1, 1], <-- modified 225 | [0, 1, 0], 226 | [0, 1, 1]]]] 227 | ``` 228 | """ 229 | # fmt: on 230 | if expanded_mask.dtype == torch.bool: 231 | raise ValueError( 232 | "AttentionMaskConverter._unmask_unattended expects a float `expanded_mask`, got a BoolTensor." 233 | ) 234 | 235 | return expanded_mask.mul(~torch.all(expanded_mask == min_dtype, dim=-1, keepdim=True)) 236 | 237 | 238 | def _prepare_4d_causal_attention_mask( 239 | attention_mask: Optional[torch.Tensor], 240 | input_shape: Union[torch.Size, Tuple, List], 241 | inputs_embeds: torch.Tensor, 242 | past_key_values_length: int, 243 | sliding_window: Optional[int] = None, 244 | ): 245 | """ 246 | Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape 247 | `(batch_size, key_value_length)` 248 | 249 | Args: 250 | attention_mask (`torch.Tensor` or `None`): 251 | A 2D attention mask of shape `(batch_size, key_value_length)` 252 | input_shape (`tuple(int)` or `list(int)` or `torch.Size`): 253 | The input shape should be a tuple that defines `(batch_size, query_length)`. 254 | inputs_embeds (`torch.Tensor`): 255 | The embedded inputs as a torch Tensor. 256 | past_key_values_length (`int`): 257 | The length of the key value cache. 258 | sliding_window (`int`, *optional*): 259 | If the model uses windowed attention, a sliding window should be passed. 260 | """ 261 | attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window) 262 | 263 | key_value_length = input_shape[-1] + past_key_values_length 264 | 265 | # 4d mask is passed through the layers 266 | if attention_mask is not None and len(attention_mask.shape) == 2: 267 | attention_mask = attn_mask_converter.to_4d( 268 | attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype 269 | ) 270 | elif attention_mask is not None and len(attention_mask.shape) == 4: 271 | expected_shape = (input_shape[0], 1, input_shape[1], key_value_length) 272 | if tuple(attention_mask.shape) != expected_shape: 273 | raise ValueError( 274 | f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}." 
275 | ) 276 | else: 277 | # if the 4D mask has correct shape - invert it and fill with negative infinity 278 | inverted_mask = 1.0 - attention_mask 279 | attention_mask = inverted_mask.masked_fill( 280 | inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min 281 | ) 282 | else: 283 | attention_mask = attn_mask_converter.to_causal_4d( 284 | input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device 285 | ) 286 | 287 | return attention_mask 288 | 289 | 290 | # Adapted from _prepare_4d_causal_attention_mask 291 | def _prepare_4d_causal_attention_mask_for_sdpa( 292 | attention_mask: Optional[torch.Tensor], 293 | input_shape: Union[torch.Size, Tuple, List], 294 | inputs_embeds: torch.Tensor, 295 | past_key_values_length: int, 296 | sliding_window: Optional[int] = None, 297 | ): 298 | """ 299 | Prepares the correct `attn_mask` argument to be used by `torch.nn.functional.scaled_dot_product_attention`. 300 | 301 | In case no token is masked in the `attention_mask` argument, we simply set it to `None` for the cases `query_length == 1` and 302 | `key_value_length == query_length`, and rely instead on SDPA `is_causal` argument to use causal/non-causal masks, 303 | allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed). 304 | """ 305 | attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window) 306 | 307 | key_value_length = input_shape[-1] + past_key_values_length 308 | batch_size, query_length = input_shape 309 | 310 | # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True are unable to capture the controlflow `is_causal=attention_mask is None and q_len > 1` 311 | # used as an SDPA argument. We keep compatibility with these tracing tools by always using SDPA's `attn_mask` argument in case we are tracing. 312 | # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400). 313 | is_tracing = ( 314 | torch.jit.is_tracing() 315 | or isinstance(inputs_embeds, torch.fx.Proxy) 316 | or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()) 317 | ) 318 | 319 | if attention_mask is not None: 320 | # 4d mask is passed through 321 | if len(attention_mask.shape) == 4: 322 | expected_shape = (input_shape[0], 1, input_shape[1], key_value_length) 323 | if tuple(attention_mask.shape) != expected_shape: 324 | raise ValueError( 325 | f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}." 326 | ) 327 | else: 328 | # if the 4D mask has correct shape - invert it and fill with negative infinity 329 | inverted_mask = 1.0 - attention_mask.to(inputs_embeds.dtype) 330 | attention_mask = inverted_mask.masked_fill( 331 | inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min 332 | ) 333 | return attention_mask 334 | 335 | elif not is_tracing and torch.all(attention_mask == 1): 336 | if query_length == 1: 337 | # For query_length == 1, causal attention and bi-directional attention are the same. 338 | attention_mask = None 339 | elif key_value_length == query_length: 340 | attention_mask = None 341 | else: 342 | # Unfortunately, for query_length > 1 and key_value_length != query_length, we cannot generally ignore the attention mask, as SDPA causal mask generation 343 | # may be wrong. We will set `is_causal=False` in SDPA and rely on Transformers attention_mask instead, hence not setting it to None here. 
344 | # Reference: https://github.com/pytorch/pytorch/issues/108108 345 | pass 346 | elif query_length > 1 and key_value_length != query_length: 347 | # See the comment above (https://github.com/pytorch/pytorch/issues/108108). 348 | # Ugly: we set it to True here to dispatch in the following controlflow to `to_causal_4d`. 349 | attention_mask = True 350 | elif is_tracing: 351 | raise ValueError( 352 | 'Attention using SDPA can not be traced with torch.jit.trace when no attention_mask is provided. To solve this issue, please either load your model with the argument `attn_implementation="eager"` or pass an attention_mask input when tracing the model.' 353 | ) 354 | 355 | if attention_mask is None: 356 | expanded_4d_mask = None 357 | elif attention_mask is True: 358 | expanded_4d_mask = attn_mask_converter.to_causal_4d( 359 | input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device 360 | ) 361 | else: 362 | expanded_4d_mask = attn_mask_converter.to_4d( 363 | attention_mask, 364 | input_shape[-1], 365 | dtype=inputs_embeds.dtype, 366 | key_value_length=key_value_length, 367 | ) 368 | 369 | # Attend to all tokens in masked rows from the causal_mask, for example the relevant first rows when 370 | # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 371 | # Details: https://github.com/pytorch/pytorch/issues/110213 372 | if not is_tracing and expanded_4d_mask.device.type == "cuda": 373 | expanded_4d_mask = AttentionMaskConverter._unmask_unattended( 374 | expanded_4d_mask, min_dtype=torch.finfo(inputs_embeds.dtype).min 375 | ) 376 | 377 | return expanded_4d_mask 378 | 379 | 380 | def _prepare_4d_attention_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): 381 | """ 382 | Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape 383 | `(batch_size, key_value_length)` 384 | 385 | Args: 386 | mask (`torch.Tensor` or `None`): 387 | A 2D attention mask of shape `(batch_size, key_value_length)` 388 | dtype (`torch.dtype`): 389 | The torch dtype the created mask shall have. 390 | tgt_len (`int`): 391 | The target length or query length the created mask shall have. 392 | """ 393 | return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) 394 | 395 | 396 | def _prepare_4d_attention_mask_for_sdpa(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): 397 | """ 398 | Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape 399 | `(batch_size, key_value_length)` 400 | 401 | Args: 402 | mask (`torch.Tensor` or `None`): 403 | A 2D attention mask of shape `(batch_size, key_value_length)` 404 | dtype (`torch.dtype`): 405 | The torch dtype the created mask shall have. 406 | tgt_len (`int`): 407 | The target length or query length the created mask shall have. 408 | """ 409 | batch_size, key_value_length = mask.shape 410 | tgt_len = tgt_len if tgt_len is not None else key_value_length 411 | 412 | # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True are unable to capture the controlflow `is_causal=attention_mask is None and q_len > 1` 413 | # used as an SDPA argument. We keep compatibility with these tracing tools by always using SDPA's `attn_mask` argument in case we are tracing. 414 | # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400). 
415 | is_tracing = ( 416 | torch.jit.is_tracing() 417 | or isinstance(mask, torch.fx.Proxy) 418 | or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()) 419 | ) 420 | 421 | if torch.all(mask == 1): 422 | if is_tracing: 423 | pass 424 | elif tgt_len == 1: 425 | # For query_length == 1, causal attention and bi-directional attention are the same. 426 | return None 427 | elif key_value_length == tgt_len: 428 | return None 429 | else: 430 | # Unfortunately, for query_length > 1 and key_value_length != query_length, we can not generally ignore the attention mask, as SDPA causal mask generation 431 | # may be wrong. We will set is_causal=False in SDPA and rely on Transformers attention_mask instead, hence not setting it to None here. 432 | # Reference: https://github.com/pytorch/pytorch/issues/108108 433 | return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) 434 | else: 435 | return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) 436 | 437 | 438 | def _create_4d_causal_attention_mask( 439 | input_shape: Union[torch.Size, Tuple, List], 440 | dtype: torch.dtype, 441 | device: torch.device, 442 | past_key_values_length: int = 0, 443 | sliding_window: Optional[int] = None, 444 | ) -> Optional[torch.Tensor]: 445 | """ 446 | Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` 447 | 448 | Args: 449 | input_shape (`tuple(int)` or `list(int)` or `torch.Size`): 450 | The input shape should be a tuple that defines `(batch_size, query_length)`. 451 | dtype (`torch.dtype`): 452 | The torch dtype the created mask shall have. 453 | device (`int`): 454 | The torch device the created mask shall have. 455 | sliding_window (`int`, *optional*): 456 | If the model uses windowed attention, a sliding window should be passed. 457 | """ 458 | attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window) 459 | 460 | key_value_length = past_key_values_length + input_shape[-1] 461 | attention_mask = attn_mask_converter.to_causal_4d( 462 | input_shape[0], input_shape[-1], key_value_length, dtype=dtype, device=device 463 | ) 464 | 465 | return attention_mask -------------------------------------------------------------------------------- /qwen2_sft/models/qwen2/pytorch_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
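A hedged sketch of the `_prepare_4d_causal_attention_mask` helper defined above, showing how a left-padded 2D mask becomes the additive 4D mask consumed by the decoder layers. Shapes are toy-sized.

```python
import torch
from qwen2_sft.models.qwen2.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask

batch_size, seq_len, hidden = 2, 4, 8
inputs_embeds = torch.randn(batch_size, seq_len, hidden)
# The second sequence is left-padded by one token.
attention_mask_2d = torch.tensor([[1, 1, 1, 1],
                                  [0, 1, 1, 1]])

mask_4d = _prepare_4d_causal_attention_mask(
    attention_mask_2d,
    (batch_size, seq_len),
    inputs_embeds,
    past_key_values_length=0,
)
print(mask_4d.shape)                # torch.Size([2, 1, 4, 4])
# For the padded sample, column 0 is masked for every query row,
# on top of the usual causal (upper-triangular) masking.
print((mask_4d[1, 0] == 0).int())
```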
14 | import inspect 15 | from typing import Callable, List, Optional, Set, Tuple, Union 16 | 17 | import torch 18 | from packaging import version 19 | from safetensors.torch import storage_ptr, storage_size 20 | from torch import nn 21 | 22 | from transformers.utils import is_torch_tpu_available, logging 23 | 24 | 25 | ALL_LAYERNORM_LAYERS = [nn.LayerNorm] 26 | 27 | logger = logging.get_logger(__name__) 28 | 29 | parsed_torch_version_base = version.parse(version.parse(torch.__version__).base_version) 30 | 31 | is_torch_greater_or_equal_than_2_2 = parsed_torch_version_base >= version.parse("2.2") 32 | is_torch_greater_or_equal_than_2_1 = parsed_torch_version_base >= version.parse("2.1") 33 | is_torch_greater_or_equal_than_2_0 = parsed_torch_version_base >= version.parse("2.0") 34 | is_torch_greater_or_equal_than_1_13 = parsed_torch_version_base >= version.parse("1.13") 35 | is_torch_greater_or_equal_than_1_12 = parsed_torch_version_base >= version.parse("1.12") 36 | 37 | 38 | def softmax_backward_data(parent, grad_output, output, dim, self): 39 | """ 40 | A function that calls the internal `_softmax_backward_data` PyTorch method and that adjusts the arguments according 41 | to the torch version detected. 42 | """ 43 | 44 | from torch import _softmax_backward_data 45 | 46 | return _softmax_backward_data(grad_output, output, parent.dim, self.dtype) 47 | 48 | 49 | def prune_linear_layer(layer: nn.Linear, index: torch.LongTensor, dim: int = 0) -> nn.Linear: 50 | """ 51 | Prune a linear layer to keep only entries in index. 52 | 53 | Used to remove heads. 54 | 55 | Args: 56 | layer (`torch.nn.Linear`): The layer to prune. 57 | index (`torch.LongTensor`): The indices to keep in the layer. 58 | dim (`int`, *optional*, defaults to 0): The dimension on which to keep the indices. 59 | 60 | Returns: 61 | `torch.nn.Linear`: The pruned layer as a new layer with `requires_grad=True`. 62 | """ 63 | index = index.to(layer.weight.device) 64 | W = layer.weight.index_select(dim, index).clone().detach() 65 | if layer.bias is not None: 66 | if dim == 1: 67 | b = layer.bias.clone().detach() 68 | else: 69 | b = layer.bias[index].clone().detach() 70 | new_size = list(layer.weight.size()) 71 | new_size[dim] = len(index) 72 | new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) 73 | new_layer.weight.requires_grad = False 74 | new_layer.weight.copy_(W.contiguous()) 75 | new_layer.weight.requires_grad = True 76 | if layer.bias is not None: 77 | new_layer.bias.requires_grad = False 78 | new_layer.bias.copy_(b.contiguous()) 79 | new_layer.bias.requires_grad = True 80 | return new_layer 81 | 82 | 83 | class Conv1D(nn.Module): 84 | """ 85 | 1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2). 86 | 87 | Basically works like a linear layer but the weights are transposed. 88 | 89 | Args: 90 | nf (`int`): The number of output features. 91 | nx (`int`): The number of input features. 
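A hedged usage sketch for `prune_linear_layer` above: keeping a subset of output rows, the pattern used when pruning attention heads. The index values are arbitrary.

```python
import torch
from torch import nn
from qwen2_sft.models.qwen2.pytorch_utils import prune_linear_layer

layer = nn.Linear(in_features=8, out_features=6)
keep = torch.tensor([0, 2, 5])            # indices of output units to keep
pruned = prune_linear_layer(layer, keep, dim=0)

print(pruned.weight.shape)                # torch.Size([3, 8])

# The surviving rows produce the same outputs as before pruning.
x = torch.randn(1, 8)
assert torch.allclose(pruned(x), layer(x)[:, keep], atol=1e-6)
```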
92 | """ 93 | 94 | def __init__(self, nf, nx): 95 | super().__init__() 96 | self.nf = nf 97 | self.weight = nn.Parameter(torch.empty(nx, nf)) 98 | self.bias = nn.Parameter(torch.zeros(nf)) 99 | nn.init.normal_(self.weight, std=0.02) 100 | 101 | def forward(self, x): 102 | size_out = x.size()[:-1] + (self.nf,) 103 | x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) 104 | x = x.view(size_out) 105 | return x 106 | 107 | 108 | def prune_conv1d_layer(layer: Conv1D, index: torch.LongTensor, dim: int = 1) -> Conv1D: 109 | """ 110 | Prune a Conv1D layer to keep only entries in index. A Conv1D work as a Linear layer (see e.g. BERT) but the weights 111 | are transposed. 112 | 113 | Used to remove heads. 114 | 115 | Args: 116 | layer ([`~pytorch_utils.Conv1D`]): The layer to prune. 117 | index (`torch.LongTensor`): The indices to keep in the layer. 118 | dim (`int`, *optional*, defaults to 1): The dimension on which to keep the indices. 119 | 120 | Returns: 121 | [`~pytorch_utils.Conv1D`]: The pruned layer as a new layer with `requires_grad=True`. 122 | """ 123 | index = index.to(layer.weight.device) 124 | W = layer.weight.index_select(dim, index).clone().detach() 125 | if dim == 0: 126 | b = layer.bias.clone().detach() 127 | else: 128 | b = layer.bias[index].clone().detach() 129 | new_size = list(layer.weight.size()) 130 | new_size[dim] = len(index) 131 | new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device) 132 | new_layer.weight.requires_grad = False 133 | new_layer.weight.copy_(W.contiguous()) 134 | new_layer.weight.requires_grad = True 135 | new_layer.bias.requires_grad = False 136 | new_layer.bias.copy_(b.contiguous()) 137 | new_layer.bias.requires_grad = True 138 | return new_layer 139 | 140 | 141 | def prune_layer( 142 | layer: Union[nn.Linear, Conv1D], index: torch.LongTensor, dim: Optional[int] = None 143 | ) -> Union[nn.Linear, Conv1D]: 144 | """ 145 | Prune a Conv1D or linear layer to keep only entries in index. 146 | 147 | Used to remove heads. 148 | 149 | Args: 150 | layer (`Union[torch.nn.Linear, Conv1D]`): The layer to prune. 151 | index (`torch.LongTensor`): The indices to keep in the layer. 152 | dim (`int`, *optional*): The dimension on which to keep the indices. 153 | 154 | Returns: 155 | `torch.nn.Linear` or [`~pytorch_utils.Conv1D`]: The pruned layer as a new layer with `requires_grad=True`. 156 | """ 157 | if isinstance(layer, nn.Linear): 158 | return prune_linear_layer(layer, index, dim=0 if dim is None else dim) 159 | elif isinstance(layer, Conv1D): 160 | return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim) 161 | else: 162 | raise ValueError(f"Can't prune layer of class {layer.__class__}") 163 | 164 | 165 | def apply_chunking_to_forward( 166 | forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors 167 | ) -> torch.Tensor: 168 | """ 169 | This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension 170 | `chunk_dim`. It then applies a layer `forward_fn` to each chunk independently to save memory. 171 | 172 | If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as directly 173 | applying `forward_fn` to `input_tensors`. 174 | 175 | Args: 176 | forward_fn (`Callable[..., torch.Tensor]`): 177 | The forward function of the model. 178 | chunk_size (`int`): 179 | The chunk size of a chunked tensor: `num_chunks = len(input_tensors[0]) / chunk_size`. 
180 | chunk_dim (`int`): 181 | The dimension over which the `input_tensors` should be chunked. 182 | input_tensors (`Tuple[torch.Tensor]`): 183 | The input tensors of `forward_fn` which will be chunked 184 | 185 | Returns: 186 | `torch.Tensor`: A tensor with the same shape as the `forward_fn` would have given if applied`. 187 | 188 | 189 | Examples: 190 | 191 | ```python 192 | # rename the usual forward() fn to forward_chunk() 193 | def forward_chunk(self, hidden_states): 194 | hidden_states = self.decoder(hidden_states) 195 | return hidden_states 196 | 197 | 198 | # implement a chunked forward function 199 | def forward(self, hidden_states): 200 | return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states) 201 | ```""" 202 | 203 | assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors" 204 | 205 | # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility 206 | num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) 207 | if num_args_in_forward_chunk_fn != len(input_tensors): 208 | raise ValueError( 209 | f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " 210 | "tensors are given" 211 | ) 212 | 213 | if chunk_size > 0: 214 | tensor_shape = input_tensors[0].shape[chunk_dim] 215 | for input_tensor in input_tensors: 216 | if input_tensor.shape[chunk_dim] != tensor_shape: 217 | raise ValueError( 218 | f"All input tenors have to be of the same shape: {tensor_shape}, " 219 | f"found shape {input_tensor.shape[chunk_dim]}" 220 | ) 221 | 222 | if input_tensors[0].shape[chunk_dim] % chunk_size != 0: 223 | raise ValueError( 224 | f"The dimension to be chunked {input_tensors[0].shape[chunk_dim]} has to be a multiple of the chunk " 225 | f"size {chunk_size}" 226 | ) 227 | 228 | num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size 229 | 230 | # chunk input tensor into tuples 231 | input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors) 232 | # apply forward fn to every tuple 233 | output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks)) 234 | # concatenate output at same dimension 235 | return torch.cat(output_chunks, dim=chunk_dim) 236 | 237 | return forward_fn(*input_tensors) 238 | 239 | 240 | def find_pruneable_heads_and_indices( 241 | heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int] 242 | ) -> Tuple[Set[int], torch.LongTensor]: 243 | """ 244 | Finds the heads and their indices taking `already_pruned_heads` into account. 245 | 246 | Args: 247 | heads (`List[int]`): List of the indices of heads to prune. 248 | n_heads (`int`): The number of heads in the model. 249 | head_size (`int`): The size of each head. 250 | already_pruned_heads (`Set[int]`): A set of already pruned heads. 251 | 252 | Returns: 253 | `Tuple[Set[int], torch.LongTensor]`: A tuple with the indices of heads to prune taking `already_pruned_heads` 254 | into account and the indices of rows/columns to keep in the layer weight. 
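A hedged, self-contained sketch of `apply_chunking_to_forward` (defined above): chunking the sequence dimension should reproduce the unchunked result when the forward is independent along that dimension. `ff.forward` is passed rather than the module itself so the argument-count check sees a single parameter.

```python
import torch
from torch import nn
from qwen2_sft.models.qwen2.pytorch_utils import apply_chunking_to_forward

ff = nn.Linear(8, 8)
hidden_states = torch.randn(2, 12, 8)     # [batch, seq_len, hidden]

# chunk_size=4 over chunk_dim=1 -> three chunks of length 4 are processed in turn.
chunked = apply_chunking_to_forward(ff.forward, 4, 1, hidden_states)

assert chunked.shape == (2, 12, 8)
assert torch.allclose(chunked, ff(hidden_states), atol=1e-6)
```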
255 | """ 256 | mask = torch.ones(n_heads, head_size) 257 | heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads 258 | for head in heads: 259 | # Compute how many pruned heads are before the head and move the index accordingly 260 | head = head - sum(1 if h < head else 0 for h in already_pruned_heads) 261 | mask[head] = 0 262 | mask = mask.view(-1).contiguous().eq(1) 263 | index: torch.LongTensor = torch.arange(len(mask))[mask].long() 264 | return heads, index 265 | 266 | 267 | def meshgrid( 268 | *tensors: Union[torch.Tensor, List[torch.Tensor]], indexing: Optional[str] = None 269 | ) -> Tuple[torch.Tensor, ...]: 270 | """ 271 | Wrapper around torch.meshgrid to avoid warning messages about the introduced `indexing` argument. 272 | 273 | Reference: https://pytorch.org/docs/1.13/generated/torch.meshgrid.html 274 | """ 275 | return torch.meshgrid(*tensors, indexing=indexing) 276 | 277 | 278 | def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: 279 | """ 280 | Unique identifier to a tensor storage. Multiple different tensors can share the same underlying storage. For 281 | example, "meta" tensors all share the same storage, and thus their identifier will all be equal. This identifier is 282 | guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with 283 | non-overlapping lifetimes may have the same id. 284 | """ 285 | if tensor.device.type == "xla" and is_torch_tpu_available(): 286 | # NOTE: xla tensors dont have storage 287 | # use some other unique id to distinguish. 288 | # this is a XLA tensor, it must be created using torch_xla's 289 | # device. So the following import is safe: 290 | import torch_xla 291 | 292 | unique_id = torch_xla._XLAC._xla_get_tensor_id(tensor) 293 | else: 294 | unique_id = storage_ptr(tensor) 295 | 296 | return tensor.device, unique_id, storage_size(tensor) 297 | -------------------------------------------------------------------------------- /qwen2_sft/models/qwen2/tokenization_qwen2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
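A hedged sketch of `id_tensor_storage` from pytorch_utils.py above: tensors that view the same underlying storage share an identifier, which is how tied or shared weights can be detected before saving.

```python
import torch
from qwen2_sft.models.qwen2.pytorch_utils import id_tensor_storage

base = torch.randn(4, 4)
view = base[:2]                # shares storage with `base`
other = torch.randn(4, 4)      # separate storage

print(id_tensor_storage(base) == id_tensor_storage(view))    # True
print(id_tensor_storage(base) == id_tensor_storage(other))   # False
```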
15 | """Tokenization classes for Qwen2.""" 16 | 17 | import json 18 | import os 19 | import unicodedata 20 | from functools import lru_cache 21 | from typing import Optional, Tuple 22 | 23 | import regex as re 24 | 25 | from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer 26 | from transformers.utils import logging 27 | 28 | 29 | logger = logging.get_logger(__name__) 30 | 31 | VOCAB_FILES_NAMES = { 32 | "vocab_file": "vocab.json", 33 | "merges_file": "merges.txt", 34 | } 35 | 36 | PRETRAINED_VOCAB_FILES_MAP = { 37 | "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"}, 38 | "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"}, 39 | } 40 | 41 | MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} 42 | 43 | PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" 44 | 45 | 46 | @lru_cache() 47 | # Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode 48 | def bytes_to_unicode(): 49 | """ 50 | Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control 51 | characters the bpe code barfs on. 52 | 53 | The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab 54 | if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for 55 | decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup 56 | tables between utf-8 bytes and unicode strings. 57 | """ 58 | bs = ( 59 | list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) 60 | ) 61 | cs = bs[:] 62 | n = 0 63 | for b in range(2**8): 64 | if b not in bs: 65 | bs.append(b) 66 | cs.append(2**8 + n) 67 | n += 1 68 | cs = [chr(n) for n in cs] 69 | return dict(zip(bs, cs)) 70 | 71 | 72 | # Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs 73 | def get_pairs(word): 74 | """ 75 | Return set of symbol pairs in a word. 76 | 77 | Word is represented as tuple of symbols (symbols being variable-length strings). 78 | """ 79 | pairs = set() 80 | prev_char = word[0] 81 | for char in word[1:]: 82 | pairs.add((prev_char, char)) 83 | prev_char = char 84 | return pairs 85 | 86 | 87 | class Qwen2Tokenizer(PreTrainedTokenizer): 88 | """ 89 | Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding. 90 | 91 | Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will 92 | be encoded differently whether it is at the beginning of the sentence (without space) or not: 93 | 94 | ```python 95 | >>> from transformers import Qwen2Tokenizer 96 | 97 | >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer") 98 | >>> tokenizer("Hello world")["input_ids"] 99 | [9707, 1879] 100 | 101 | >>> tokenizer(" Hello world")["input_ids"] 102 | [21927, 1879] 103 | ``` 104 | This is expected. 105 | 106 | You should not use GPT2Tokenizer instead, because of the different pretokenization rules. 107 | 108 | This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to 109 | this superclass for more information regarding those methods. 110 | 111 | Args: 112 | vocab_file (`str`): 113 | Path to the vocabulary file. 
114 |         merges_file (`str`):
115 |             Path to the merges file.
116 |         errors (`str`, *optional*, defaults to `"replace"`):
117 |             Paradigm to follow when decoding bytes to UTF-8. See
118 |             [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
119 |         unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
120 |             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
121 |             token instead.
122 |         bos_token (`str`, *optional*):
123 |             The beginning of sequence token. Not applicable for this tokenizer.
124 |         eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
125 |             The end of sequence token.
126 |         pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
127 |             The token used for padding, for example when batching sequences of different lengths.
128 |         clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
129 |             Whether or not the model should clean up the spaces that were added when splitting the input text during the
130 |             tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
131 |         split_special_tokens (`bool`, *optional*, defaults to `False`):
132 |             Whether or not the special tokens should be split during the tokenization process. The default behavior is
133 |             to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>")`
134 |             returns `['<|endoftext|>']`. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will give
135 |             `['<', '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
136 |     """
137 | 
138 |     vocab_files_names = VOCAB_FILES_NAMES
139 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
140 |     max_model_input_sizes = MAX_MODEL_INPUT_SIZES
141 |     model_input_names = ["input_ids", "attention_mask"]
142 | 
143 |     def __init__(
144 |         self,
145 |         vocab_file,
146 |         merges_file,
147 |         errors="replace",
148 |         unk_token="<|endoftext|>",
149 |         bos_token=None,
150 |         eos_token="<|endoftext|>",
151 |         pad_token="<|endoftext|>",
152 |         clean_up_tokenization_spaces=False,
153 |         split_special_tokens=False,
154 |         **kwargs,
155 |     ):
156 |         # Qwen vocab does not contain control tokens; added tokens need to be special
157 |         bos_token = (
158 |             AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
159 |             if isinstance(bos_token, str)
160 |             else bos_token
161 |         )
162 |         eos_token = (
163 |             AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
164 |             if isinstance(eos_token, str)
165 |             else eos_token
166 |         )
167 |         unk_token = (
168 |             AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
169 |             if isinstance(unk_token, str)
170 |             else unk_token
171 |         )
172 |         pad_token = (
173 |             AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
174 |             if isinstance(pad_token, str)
175 |             else pad_token
176 |         )
177 | 
178 |         with open(vocab_file, encoding="utf-8") as vocab_handle:
179 |             self.encoder = json.load(vocab_handle)
180 |         self.decoder = {v: k for k, v in self.encoder.items()}
181 |         self.errors = errors  # how to handle errors in decoding
182 |         self.byte_encoder = bytes_to_unicode()
183 |         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
184 |         bpe_merges = []
185 |         if merges_file:
186 |             with open(merges_file, encoding="utf-8") as merges_handle:
187 |                 for line in merges_handle:
188 |                     line = line.strip()
189 |                     if not line or line.startswith("#"):
190 |                         continue
191 |                     bpe_merges.append(tuple(line.split()))
192 |         self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
193 |         # NOTE: the cache can grow without bound and will get really large for long running processes
194 |         # (esp. for texts in languages that do not use spaces between words, e.g. Chinese); technically
195 |         # not a memory leak but appears as one.
196 |         # GPT2Tokenizer has the same problem, so let's be consistent.
197 |         self.cache = {}
198 | 
199 |         self.pat = re.compile(PRETOKENIZE_REGEX)
200 | 
201 |         if kwargs.get("add_prefix_space", False):
202 |             logger.warning_once(
203 |                 f"{self.__class__.__name__} does not support `add_prefix_space`, setting it to True has no effect."
204 |             )
205 | 
206 |         super().__init__(
207 |             errors=errors,
208 |             bos_token=bos_token,
209 |             eos_token=eos_token,
210 |             pad_token=pad_token,
211 |             unk_token=unk_token,
212 |             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
213 |             split_special_tokens=split_special_tokens,
214 |             **kwargs,
215 |         )
216 | 
217 |     @property
218 |     def vocab_size(self) -> int:
219 |         return len(self.encoder)
220 | 
221 |     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
222 |     def get_vocab(self):
223 |         return dict(self.encoder, **self.added_tokens_encoder)
224 | 
225 |     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
226 |     def bpe(self, token):
227 |         if token in self.cache:
228 |             return self.cache[token]
229 |         word = tuple(token)
230 |         pairs = get_pairs(word)
231 | 
232 |         if not pairs:
233 |             return token
234 | 
235 |         while True:
236 |             bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
237 |             if bigram not in self.bpe_ranks:
238 |                 break
239 |             first, second = bigram
240 |             new_word = []
241 |             i = 0
242 |             while i < len(word):
243 |                 try:
244 |                     j = word.index(first, i)
245 |                 except ValueError:
246 |                     new_word.extend(word[i:])
247 |                     break
248 |                 else:
249 |                     new_word.extend(word[i:j])
250 |                     i = j
251 | 
252 |                 if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
253 |                     new_word.append(first + second)
254 |                     i += 2
255 |                 else:
256 |                     new_word.append(word[i])
257 |                     i += 1
258 |             new_word = tuple(new_word)
259 |             word = new_word
260 |             if len(word) == 1:
261 |                 break
262 |             else:
263 |                 pairs = get_pairs(word)
264 |         word = " ".join(word)
265 |         self.cache[token] = word
266 |         return word
267 | 
268 |     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
269 |     def _tokenize(self, text):
270 |         """Tokenize a string."""
271 |         bpe_tokens = []
272 |         for token in re.findall(self.pat, text):
273 |             token = "".join(
274 |                 self.byte_encoder[b] for b in token.encode("utf-8")
275 |             )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
276 |             bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
277 |         return bpe_tokens
278 | 
279 |     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
280 |     def _convert_token_to_id(self, token):
281 |         """Converts a token (str) to an id using the vocab."""
282 |         return self.encoder.get(token, self.encoder.get(self.unk_token))
283 | 
284 |     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
285 |     def _convert_id_to_token(self, index):
286 |         """Converts an index (integer) to a token (str) using the vocab."""
287 |         return self.decoder.get(index)
288 | 
289 |     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
290 |     def convert_tokens_to_string(self, tokens):
291 |         """Converts a sequence of tokens (string) into a single string."""
292 |         text = "".join(tokens)
293 |         text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
294 |         return text
295 | 
296 |     def decode(
297 |         self,
298 |         token_ids,
299 |         skip_special_tokens: bool = False,
300 |         clean_up_tokenization_spaces: Optional[bool] = False,
301 |         spaces_between_special_tokens: bool = False,
302 |         **kwargs,
303 |     ) -> str:
304 |         # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
305 |         # and cannot be configured elsewhere, but it should default to False for Qwen2Tokenizer
306 |         return super().decode(
307 |             token_ids,
308 |             skip_special_tokens=skip_special_tokens,
309 |             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
310 |             spaces_between_special_tokens=spaces_between_special_tokens,
311 |             **kwargs,
312 |         )
313 | 
314 |     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
315 |     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
316 |         if not os.path.isdir(save_directory):
317 |             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
318 |             return
319 |         vocab_file = os.path.join(
320 |             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
321 |         )
322 |         merge_file = os.path.join(
323 |             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
324 |         )
325 | 
326 |         with open(vocab_file, "w", encoding="utf-8") as f:
327 |             f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
328 | 
329 |         index = 0
330 |         with open(merge_file, "w", encoding="utf-8") as writer:
331 |             writer.write("#version: 0.2\n")
332 |             for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
333 |                 if index != token_index:
334 |                     logger.warning(
335 |                         f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
336 |                         " Please check that the tokenizer is not corrupted!"
337 |                     )
338 |                     index = token_index
339 |                 writer.write(" ".join(bpe_tokens) + "\n")
340 |                 index += 1
341 | 
342 |         return vocab_file, merge_file
343 | 
344 |     def prepare_for_tokenization(self, text, **kwargs):
345 |         text = unicodedata.normalize("NFC", text)
346 |         return (text, kwargs)
347 | 
348 | 
349 | def save_json(jsons, json_path, indent=4):
350 |     """
351 |     Save a JSON-serializable object to a file.
352 |     Args:
353 |         jsons[Json]: input data to serialize, e.g. [{"桂林": 132}]
354 |         json_path[String]: path of the output file, e.g. "corpus/xuexiqiangguo.lib"
355 |         indent[int]: pretty-print with this indent level, e.g. 4
356 |     Returns:
357 |         None
358 |     """
359 |     with open(json_path, "w", encoding="utf-8") as fj:
360 |         fj.write(json.dumps(jsons, ensure_ascii=False, indent=indent))
361 | 
362 | 
363 | def load_json(path, parse_int=None):
364 |     """
365 |     Load JSON data from a file.
366 |     Args:
367 |         path[String]: path of the file to load, e.g. "corpus/xuexiqiangguo.lib"
368 |         parse_int[Callable]: optional function applied to every JSON integer string, e.g. int
369 |     Returns:
370 |         data[Any]
371 |     """
372 |     with open(path, mode="r", encoding="utf-8") as fj:
373 |         model_json = json.load(fj, parse_int=parse_int)
374 |     return model_json
375 | 
376 | 
377 | if __name__ == '__main__':
378 | 
379 |     path = "E:/DATA/bert-model/00_pytorch/LLM/Qwen_Qwen1.5-0.5B-Chat"
380 |     tokenizer = Qwen2Tokenizer.from_pretrained(path)
381 |     # tokenizer.save_vocabulary()
382 |     vocab_dict = tokenizer.get_vocab()
383 |     save_json(vocab_dict, "qwen1.5_vocab_dict.json")
384 | 
385 |     prompt = "Give me a short introduction to large language model."
386 |     messages = [
387 |         {"role": "system", "content": "You are a helpful assistant."},
388 |         {"role": "user", "content": prompt}
389 |     ]
390 |     tokens11 = tokenizer.apply_chat_template(
391 |         messages,
392 |         tokenize=False,
393 |         add_generation_prompt=True
394 |     )
395 |     print(tokens11)
396 | 
397 |     tokens = tokenizer.tokenize(tokens11)
398 |     print(tokens)
399 |     tokens = tokenizer("Hello world")["input_ids"]
400 |     print(tokens)
401 | 
402 |     # compare against the upstream implementation shipped with `transformers`
403 |     from transformers import Qwen2Tokenizer as HFQwen2Tokenizer
404 | 
405 |     path = "E:/DATA/bert-model/00_pytorch/LLM/Qwen_Qwen1.5-14B-Chat"
406 |     tokenizer2 = HFQwen2Tokenizer.from_pretrained(path)
407 |     tokens = tokenizer2("Hello world")["input_ids"]
408 |     print(tokens)
409 |     tokens = tokenizer2.tokenize(tokens11)
410 |     print(tokens)
411 | 
412 |     """
413 |     "<|endoftext|>": 151643,
414 |     "<|im_start|>": 151644,
415 |     "<|im_end|>": 151645
416 | 
417 |     [151644, 82, 88, 82, 83, 68, 76, 198, 56, 78, 84, 220, 64, 81, 68, 220, 64, 220, 71, 68, 75, 79, 69, 84, 75, 220, 64, 82, 82, 72, 82, 83, 64, 77, 83, 13, 151645, 198, 151644, 84, 82, 68, 81, 198, 151645, 198, 151644, 64, 82, 82, 72, 82, 83, 64, 77, 83, 198]
418 |     """
419 | 
-------------------------------------------------------------------------------- /qwen2_sft/models/qwen2/tokenization_qwen2_fast.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for Qwen2.""" 16 | 17 | from typing import Optional, Tuple 18 | 19 | from transformers.tokenization_utils import AddedToken 20 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast 21 | from transformers.utils import logging 22 | 23 | from qwen2_sft.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer 24 | 25 | 26 | logger = logging.get_logger(__name__) 27 | 28 | VOCAB_FILES_NAMES = { 29 | "vocab_file": "vocab.json", 30 | "merges_file": "merges.txt", 31 | "tokenizer_file": "tokenizer.json", 32 | } 33 | 34 | PRETRAINED_VOCAB_FILES_MAP = { 35 | "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"}, 36 | "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"}, 37 | "tokenizer_file": { 38 | "qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/tokenizer.json" 39 | }, 40 | } 41 | 42 | MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} 43 | 44 | 45 | class Qwen2TokenizerFast(PreTrainedTokenizerFast): 46 | """ 47 | Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level 48 | Byte-Pair-Encoding. 49 | 50 | Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will 51 | be encoded differently whether it is at the beginning of the sentence (without space) or not: 52 | 53 | ```python 54 | >>> from transformers import Qwen2TokenizerFast 55 | 56 | >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer") 57 | >>> tokenizer("Hello world")["input_ids"] 58 | [9707, 1879] 59 | 60 | >>> tokenizer(" Hello world")["input_ids"] 61 | [21927, 1879] 62 | ``` 63 | This is expected. 64 | 65 | This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should 66 | refer to this superclass for more information regarding those methods. 67 | 68 | Args: 69 | vocab_file (`str`, *optional*): 70 | Path to the vocabulary file. 71 | merges_file (`str`, *optional*): 72 | Path to the merges file. 73 | tokenizer_file (`str`, *optional*): 74 | Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that 75 | contains everything needed to load the tokenizer. 76 | unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 77 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 78 | token instead. Not applicable to this tokenizer. 79 | bos_token (`str`, *optional*): 80 | The beginning of sequence token. Not applicable for this tokenizer. 81 | eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 82 | The end of sequence token. 83 | pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 84 | The token used for padding, for example when batching sequences of different lengths. 
85 | """ 86 | 87 | vocab_files_names = VOCAB_FILES_NAMES 88 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 89 | max_model_input_sizes = MAX_MODEL_INPUT_SIZES 90 | model_input_names = ["input_ids", "attention_mask"] 91 | slow_tokenizer_class = Qwen2Tokenizer 92 | 93 | def __init__( 94 | self, 95 | vocab_file=None, 96 | merges_file=None, 97 | tokenizer_file=None, 98 | unk_token="<|endoftext|>", 99 | bos_token=None, 100 | eos_token="<|endoftext|>", 101 | pad_token="<|endoftext|>", 102 | **kwargs, 103 | ): 104 | # We need to at least pass vocab_file and merges_file to base class 105 | # in case a slow tokenizer needs to be initialized; other can be 106 | # configured through files. 107 | # following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token 108 | 109 | bos_token = ( 110 | AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False) 111 | if isinstance(bos_token, str) 112 | else bos_token 113 | ) 114 | eos_token = ( 115 | AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False) 116 | if isinstance(eos_token, str) 117 | else eos_token 118 | ) 119 | unk_token = ( 120 | AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False) 121 | if isinstance(unk_token, str) 122 | else unk_token 123 | ) 124 | pad_token = ( 125 | AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False) 126 | if isinstance(pad_token, str) 127 | else pad_token 128 | ) 129 | 130 | super().__init__( 131 | vocab_file, 132 | merges_file, 133 | tokenizer_file=tokenizer_file, 134 | unk_token=unk_token, 135 | bos_token=bos_token, 136 | eos_token=eos_token, 137 | pad_token=pad_token, 138 | **kwargs, 139 | ) 140 | 141 | # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary 142 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: 143 | files = self._tokenizer.model.save(save_directory, name=filename_prefix) 144 | return tuple(files) -------------------------------------------------------------------------------- /qwen2_sft/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.37.0 2 | torch>=1.13.1 3 | 4 | safetensors>=0.4.1 5 | accelerate==0.27.1 6 | fsspec==2023.9.2 7 | tensorboardX==2.4 8 | rouge==1.0.1 9 | nltk==3.6.6 10 | numpy 11 | tqdm --------------------------------------------------------------------------------