├── .gitignore ├── LICENSE ├── README.md └── qwen2_sft ├── __init__.py ├── dataset ├── __init__.py ├── advgen_toy.json └── alpaca_gpt4_data_zh.json ├── ft_qwen2 ├── __init__.py ├── config.py ├── evaluation.py ├── post_api.py ├── predict.py └── train.py ├── image_aadvgen_loss_success_7b_5e5_bfloat16_pt.png ├── image_advgen_dev_success_7b_2e5_cosine_bs2_fp16.png ├── image_advgen_loss_nan_7b_3e4_constranr_bs4_fp16.png ├── models ├── __init__.py └── qwen2 │ ├── __init__.py │ ├── cache_utils.py │ ├── configuration_qwen2.py │ ├── modeling_attn_mask_utils.py │ ├── modeling_qwen2.py │ ├── pytorch_utils.py │ ├── tokenization_qwen2.py │ └── tokenization_qwen2_fast.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# Qwen1.5-SFT
Qwen1.5-SFT (Alibaba): fine-tuning (transformers) / LoRA (peft) / inference for Qwen_Qwen1.5-2B-Chat and Qwen_Qwen1.5-7B-Chat

## Project address
- [https://github.com/yongzhuo/qwen1.5-sft](https://github.com/yongzhuo/qwen1.5-sft)
- The default data type is bfloat16
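The project fine-tunes Qwen1.5-Chat with transformers plus a peft/LoRA adapter, as summarized above. The snippet below is only a minimal sketch of that setup, not the repo's train.py: it uses the generic AutoModelForCausalLM/AutoTokenizer classes for brevity (the repo imports Qwen2ForCausalLM/Qwen2Tokenizer directly), and the LoRA hyperparameters simply mirror the defaults in qwen2_sft/ft_qwen2/config.py.

```python
# Minimal LoRA setup sketch (illustrative only; see qwen2_sft/ft_qwen2/train.py for the real flow).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

model_id = "Qwen/Qwen1.5-7B-Chat"  # REPO_ID in config.py
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.bfloat16)  # bf16, not fp16 (see Notes)

lora_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.1,            # LORA_R / LORA_ALPHA / LORA_DROPOUT
                         target_modules=["q_proj", "k_proj", "v_proj"],   # TARGET_MODULES
                         task_type="CAUSAL_LM", bias="none")
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```

Training only the q/k/v projections keeps the trainable parameter count small; o_proj and the MLP projections are left commented out in config.py and can be added to TARGET_MODULES for more capacity.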
## Notes
```python
1. Very important: use bfloat16/fp32/tf32 for the weights (the common consensus for second-generation LLMs); do not use fp16, which very easily yields loss=NaN;
2. For SFT it is best to compute the loss over both input and output, as in pretraining;
3. transformers 4.37.0 or newer is required;
4. For Qwen1.5, the 0.5/1.8/4/7B models have a vocabulary size of 151936 (divisible by 128) and the 14/72B models 152064 (divisible by 256); the tokenizer is multilingual and covers both Traditional and Simplified Chinese;
5. The Qwen1.5 network architecture is the same as Llama;
6. prompt:
   6.1 standard format:
       text_input + text_output + <|endoftext|>
   6.2 chat prompt format:
       <|im_start|>system
       {text_prompt}<|im_end|>
       <|im_start|>user
       {text_input}
       <|im_end|>
       <|im_start|>assistant
       {text_output}
       <|im_end|>
7. Fine-tuning input/output:
   input:  "<|im_start|>system\n{text_prompt}<|im_end|>\n<|im_start|>user\n{text_input}\n<|im_end|>"
   output: "<|im_start|>assistant\n{text_output}\n<|im_end|>"
8. Inference input/output (note the different placement of assistant\n):
   input:  "<|im_start|>system\n{text_prompt}<|im_end|>\n<|im_start|>user\n{text_input}\n<|im_end|>\n<|im_start|>assistant"
   output: "{text_output}\n<|im_end|>"
9. Layer names:
   ('base_model.model.model.embed_tokens.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.self_attn.q_proj.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.self_attn.q_proj.bias', torch.bfloat16, False)
   ('base_model.model.model.layers.31.self_attn.q_proj.lora_A.default.weight', torch.float32, True)
   ('base_model.model.model.layers.31.self_attn.q_proj.lora_B.default.weight', torch.float32, True)
   ('base_model.model.model.layers.31.self_attn.k_proj.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.self_attn.k_proj.bias', torch.bfloat16, False)
   ('base_model.model.model.layers.31.self_attn.k_proj.lora_A.default.weight', torch.float32, True)
   ('base_model.model.model.layers.31.self_attn.k_proj.lora_B.default.weight', torch.float32, True)
   ('base_model.model.model.layers.31.self_attn.v_proj.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.self_attn.v_proj.bias', torch.bfloat16, False)
   ('base_model.model.model.layers.31.self_attn.v_proj.lora_A.default.weight', torch.float32, True)
   ('base_model.model.model.layers.31.self_attn.v_proj.lora_B.default.weight', torch.float32, True)
   ('base_model.model.model.layers.31.self_attn.o_proj.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.mlp.gate_proj.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.mlp.up_proj.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.mlp.down_proj.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.input_layernorm.weight', torch.bfloat16, False)
   ('base_model.model.model.layers.31.post_attention_layernorm.weight', torch.bfloat16, False)
   ('base_model.model.model.norm.weight', torch.bfloat16, False)
   ('base_model.model.lm_head.weight', torch.bfloat16, False)
10. RuntimeError: unscale_() has already been called on this optimizer since the last update().
    Usually caused by the fine-tuning corpus being too small.
```
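The fine-tuning and inference formats in notes 7 and 8 are plain string templates. Below is a small illustrative sketch; the helper names build_sft_pair and build_infer_prompt are made up here and do not exist in the repo (the real preprocessing lives in qwen2_sft/ft_qwen2/train.py and predict.py).

```python
# Sketch of the prompt construction described in notes 6-8 (illustrative helpers, not repo code).
SYSTEM = "You are a helpful assistant."

def build_sft_pair(text_input: str, text_output: str, text_prompt: str = SYSTEM):
    """Fine-tuning: source/target split as in note 7."""
    src = f"<|im_start|>system\n{text_prompt}<|im_end|>\n<|im_start|>user\n{text_input}\n<|im_end|>"
    tgt = f"<|im_start|>assistant\n{text_output}\n<|im_end|>"
    return src, tgt

def build_infer_prompt(text_input: str, text_prompt: str = SYSTEM) -> str:
    """Inference: the prompt ends right after the trailing '<|im_start|>assistant' (note 8)."""
    return (f"<|im_start|>system\n{text_prompt}<|im_end|>\n"
            f"<|im_start|>user\n{text_input}\n<|im_end|>\n<|im_start|>assistant")

src, tgt = build_sft_pair("1+1=", "2")
print(src)
print(tgt)
```

At inference time the model is expected to continue with "{text_output}\n<|im_end|>", which is why evaluation.py strips <|endoftext|>, <|im_end|> and <|im_start|> from the decoded generations.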
## Environment
```shell
transformers>=4.37.0
torch>=1.11.0
safetensors>=0.4.1
accelerate==0.27.1
fsspec==2023.9.2
rouge==1.0.1
nltk==3.6.6
peft>=0.2.0
numpy
tqdm
```

## Fine-tuning
```shell
Path: qwen2_sft/ft_qwen2

Config:   qwen2_sft/ft_qwen2/config.py
Train:    python train.py
Predict:  python predict.py
Evaluate: python evaluation.py
API:      python post_api.py
```

## Datasets (Chinese)
- [https://huggingface.co/datasets/JosephusCheung/GuanacoDataset](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
- [https://huggingface.co/datasets/shareAI/shareGPT_cn](https://huggingface.co/datasets/shareAI/shareGPT_cn)
- [https://huggingface.co/datasets/Mutonix/RefGPT-Fact](https://huggingface.co/datasets/Mutonix/RefGPT-Fact)
- [https://huggingface.co/datasets/BAAI/COIG](https://huggingface.co/datasets/BAAI/COIG)
- [https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
- [https://github.com/carbonz0/alpaca-chinese-dataset](https://github.com/carbonz0/alpaca-chinese-dataset)
- [https://github.com/LianjiaTech/BELLE](https://github.com/LianjiaTech/BELLE)
- [https://github.com/PhoebusSi/Alpaca-CoT](https://github.com/PhoebusSi/Alpaca-CoT)
- [https://github.com/Hello-SimpleAI/chatgpt-comparison-detection](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection)
- [https://github.com/yangjianxin1/Firefly](https://github.com/yangjianxin1/Firefly)
- [https://github.com/XueFuzhao/InstructionWild](https://github.com/XueFuzhao/InstructionWild)
- [https://github.com/OpenLMLab/MOSS](https://github.com/OpenLMLab/MOSS)
- [https://github.com/thu-coai/Safety-Prompts](https://github.com/thu-coai/Safety-Prompts)
- [https://github.com/LAION-AI/Open-Assistant](https://github.com/LAION-AI/Open-Assistant)
- [https://github.com/TigerResearch/TigerBot](https://github.com/TigerResearch/TigerBot)


## References / Acknowledgements
- [https://github.com/QwenLM/Qwen1.5](https://github.com/QwenLM/Qwen1.5)
- [https://github.com/google/gemma_pytorch](https://github.com/google/gemma_pytorch)
- [https://huggingface.co/google/gemma-2b-it](https://huggingface.co/google/gemma-2b-it)
- [https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
- [https://github.com/THUDM/ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B)
- [https://github.com/THUDM/GLM](https://github.com/THUDM/GLM)
- [https://github.com/tatsu-lab/stanford_alpaca](https://github.com/tatsu-lab/stanford_alpaca)
- [https://github.com/LianjiaTech/BELLE](https://github.com/LianjiaTech/BELLE)
- [https://github.com/huggingface/peft](https://github.com/huggingface/peft)
- [https://github.com/mymusise/ChatGLM-Tuning](https://github.com/mymusise/ChatGLM-Tuning)
- [https://github.com/bojone/bert4keras](https://github.com/bojone/bert4keras)
- [trl](https://github.com/lvwerra/trl)
- [math23k](https://aclanthology.org/D17-1088)


## Fine-tuning logs (advgen)
### The default bf16 setting trains without issues; it is best to also compute the loss over the full sequence (same as pretraining, input/output)
![qwen2_sft/image_aadvgen_loss_success_7b_5e5_bfloat16_pt.png](qwen2_sft/image_aadvgen_loss_success_7b_5e5_bfloat16_pt.png)

### With fp16 the loss becomes NaN, image_advgen_loss_nan_7b_3e4_constranr_bs4_fp16
![qwen2_sft/image_advgen_loss_nan_7b_3e4_constranr_bs4_fp16.png](qwen2_sft/image_advgen_loss_nan_7b_3e4_constranr_bs4_fp16.png) 124 | 125 | ### fp16细调超参数偶尔训练成功但loss很高, image_advgen_dev_success_7b_2e5_cosine_bs2_fp16 126 | ![qwen2_sft/image_advgen_dev_success_7b_2e5_cosine_bs2_fp16.png](qwen2_sft/image_advgen_dev_success_7b_2e5_cosine_bs2_fp16.png) 127 | 128 | 129 | ## 推理日志-advgen 130 | ```cpu 131 | ('base_model.model.base_model.model.model.layers.31.self_attn.k_proj.bias', torch.float16, False, tensor([ 0.1016, -0.0640, 0.1729, ..., -1.0312, -0.0028, 0.3477], 132 | device='cuda:0', dtype=torch.float16)) 133 | ('base_model.model.base_model.model.model.layers.31.self_attn.k_proj.lora_A.default.weight', torch.float16, False, tensor([[-0.0045, -0.0027, 0.0057, ..., 0.0131, 0.0074, -0.0049], 134 | [-0.0049, 0.0048, -0.0067, ..., 0.0118, 0.0080, -0.0105], 135 | [-0.0052, 0.0086, 0.0070, ..., -0.0102, -0.0005, 0.0166], 136 | ..., 137 | [-0.0097, -0.0158, -0.0116, ..., 0.0051, 0.0046, 0.0015], 138 | [-0.0148, 0.0001, -0.0152, ..., 0.0063, 0.0139, 0.0026], 139 | [-0.0092, 0.0036, 0.0003, ..., -0.0082, 0.0063, -0.0058]], 140 | device='cuda:0', dtype=torch.float16)) 141 | ('base_model.model.base_model.model.model.layers.31.self_attn.k_proj.lora_B.default.weight', torch.float16, False, tensor([[ 1.2102e-03, 1.9951e-03, -2.2488e-03, ..., 1.2598e-03, 142 | -1.4429e-03, -2.0466e-03], 143 | [-5.9128e-05, 4.4274e-04, 1.4553e-03, ..., -1.9350e-03, 144 | -8.8406e-04, 1.9855e-03], 145 | [ 1.5755e-03, 2.3174e-03, -2.6073e-03, ..., -4.1466e-03, 146 | -2.8076e-03, 6.1464e-04], 147 | ..., 148 | [-1.5554e-03, -3.0460e-03, 4.4136e-03, ..., 3.6411e-03, 149 | 6.6280e-04, 5.5790e-04], 150 | [-1.5869e-03, -1.6050e-03, -1.0443e-03, ..., 6.1393e-06, 151 | 1.7271e-03, -1.2022e-04], 152 | [-3.7169e-04, 6.4754e-04, 7.2241e-04, ..., 4.6730e-04, 153 | 7.0667e-04, -1.1092e-04]], device='cuda:0', dtype=torch.float16)) 154 | ...... 155 | ('base_model.model.base_model.model.lm_head.weight', torch.float16, False, tensor([[ 0.0095, -0.0017, -0.0134, ..., 0.0036, 0.0232, -0.0112], 156 | [ 0.0089, -0.0062, 0.0004, ..., -0.0079, 0.0175, -0.0098], 157 | [ 0.0055, 0.0182, 0.0088, ..., 0.0059, 0.0034, -0.0164], 158 | ..., 159 | [-0.0034, -0.0049, 0.0033, ..., -0.0022, -0.0066, 0.0031], 160 | [-0.0034, -0.0071, 0.0032, ..., -0.0022, -0.0066, 0.0013], 161 | [-0.0034, -0.0049, 0.0033, ..., -0.0016, -0.0066, 0.0020]], 162 | device='cuda:0', dtype=torch.float16)) 163 | trainable params: 0 || all params: 7733907456 || trainable%: 0.0 164 | 151645 165 | 151644 166 | 151645 167 | 1699 168 | 872 169 | 8948 170 | {'instruction': '类型#裙*版型#宽松*版型#显瘦*颜色#黑色*图案#撞色*裙型#直筒裙*裙款式#拼接', 'input': '', 'output': ''} 171 | tensor([[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 172 | 151645, 198, 151644, 872, 198, 31905, 2, 102807, 9, 173 | 40301, 24300, 2, 109285, 9, 40301, 24300, 2, 99580, 174 | 102372, 9, 102284, 2, 104723, 9, 108108, 2, 101340, 175 | 38035, 9, 102807, 24300, 2, 73145, 106000, 102807, 9, 176 | 102807, 108756, 2, 100278, 29077, 151645, 198, 151644, 77091]], 177 | device='cuda:0') 178 | <|im_start|>system 179 | You are a helpful assistant.<|im_end|> 180 | <|im_start|>user 181 | 类型#裙*版型#宽松*版型#显瘦*颜色#黑色*图案#撞色*裙型#直筒裙*裙款式#拼接<|im_end|> 182 | <|im_start|>assistant宽松显瘦的黑色直筒裙是女性衣橱里必不可少的单品,这款裙子采用经典的黑色调,十分显瘦显腿长。裙摆处采用撞色的拼接设计,增添层次感的同时又不失时尚气息。裙型采用直筒裙的版型,更加修饰腿型,遮肉效果极佳。无论是职场还是休闲都能轻松驾驭,轻松展现女性的优雅气质。<|im_end|> 183 | 请输入: 184 | 类型#上衣*材质#棉*颜色#白色*风格#休闲*图案#刺绣*衣样式#衬衫*衣领型#尖领*衣袖长#七分袖 185 | 请稍等... 
186 | ################################################################################################################################ 187 | {'instruction': '类型#上衣*材质#棉*颜色#白色*风格#休闲*图案#刺绣*衣样式#衬衫*衣领型#尖领*衣袖长#七分袖', 'input': '', 'output': ''} 188 | tensor([[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 189 | 151645, 198, 151644, 872, 198, 31905, 2, 17447, 99741, 190 | 9, 106149, 2, 102430, 9, 102284, 2, 102440, 9, 191 | 104040, 2, 104443, 9, 108108, 2, 100253, 103084, 9, 192 | 99741, 112453, 2, 113727, 9, 99741, 99213, 24300, 2, 193 | 100512, 99213, 9, 99741, 102885, 45861, 2, 99612, 17177, 194 | 102885, 151645, 198, 151644, 77091]], device='cuda:0') 195 | <|im_start|>system 196 | You are a helpful assistant.<|im_end|> 197 | <|im_start|>user 198 | 类型#上衣*材质#棉*颜色#白色*风格#休闲*图案#刺绣*衣样式#衬衫*衣领型#尖领*衣袖长#七分袖<|im_end|> 199 | <|im_start|>assistant这款白色刺绣休闲衬衫采用优质的纯棉材质,亲肤透气,穿着舒适亲肤。经典的尖领设计,修饰脸型,展现个性风格。七分袖的设计,既能凸显出修长的手型,又能带来一丝休闲的韵味。精致的刺绣图案,层次丰富,细节精致,凸显出独特的个性品味。这款衬衫无论是日常出街还是职场穿搭,都能展现出独特的休闲风格。<|im_end|> 200 | 27.06706738471985 201 | 请输入: 202 | 类型#裙*版型#显瘦*材质#牛仔布*风格#复古*风格#性感*图案#复古*裙型#牛仔裙*裙型#包臀裙*裙下摆#开叉*裙下摆#毛边*裙长#半身裙 203 | 请稍等... 204 | ################################################################################################################################ 205 | {'instruction': '类型#裙*版型#显瘦*材质#牛仔布*风格#复古*风格#性感*图案#复古*裙型#牛仔裙*裙型#包臀裙*裙下摆#开叉*裙下摆#毛边*裙长#半身裙', 'input': '', 'output': ''} 206 | tensor([[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 207 | 151645, 198, 151644, 872, 198, 31905, 2, 102807, 9, 208 | 40301, 24300, 2, 99580, 102372, 9, 106149, 2, 100664, 209 | 102437, 51827, 9, 104040, 2, 111455, 9, 104040, 2, 210 | 110803, 9, 108108, 2, 111455, 9, 102807, 24300, 2, 211 | 100664, 102437, 102807, 9, 102807, 24300, 2, 67279, 117318, 212 | 102807, 9, 102807, 16872, 100805, 2, 29767, 102940, 9, 213 | 102807, 16872, 100805, 2, 99676, 99319, 9, 102807, 45861, 214 | 2, 99369, 95256, 102807, 151645, 198, 151644, 77091]], 215 | device='cuda:0') 216 | <|im_start|>system 217 | You are a helpful assistant.<|im_end|> 218 | <|im_start|>user 219 | 类型#裙*版型#显瘦*材质#牛仔布*风格#复古*风格#性感*图案#复古*裙型#牛仔裙*裙型#包臀裙*裙下摆#开叉*裙下摆#毛边*裙长#半身裙<|im_end|> 220 | <|im_start|>assistant这款牛仔裙采用了复古风格的设计,十分的复古又有性感的感觉。牛仔布的材质十分的柔软亲肤,而且牛仔的风格十分的显瘦,很好的修饰身材线条,凸显出女性的优雅气质。这款半身裙的设计十分的别致,裙摆处采用开叉的设计,十分的个性时尚,凸显出女性的个性魅力。裙摆处采用毛边的设计,十分的个性时尚,而且层次感十足,凸显出女性的优雅气质。而且这款半裙还是包臀裙的设计,很好的修饰出女性的身材线条,凸显出女性的优雅气质 221 | 14.573514223098755 222 | 请输入: 223 | 类型#裤*材质#牛仔布*颜色#白色*风格#简约*图案#线条*裤长#短裤*裤型#阔腿裤*裤腰型#高腰*裤口#毛边 224 | 请稍等... 
225 | ################################################################################################################################ 226 | {'instruction': '类型#裤*材质#牛仔布*颜色#白色*风格#简约*图案#线条*裤长#短裤*裤型#阔腿裤*裤腰型#高腰*裤口#毛边', 'input': '', 'output': ''} 227 | tensor([[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 228 | 151645, 198, 151644, 872, 198, 31905, 2, 102693, 9, 229 | 106149, 2, 100664, 102437, 51827, 9, 102284, 2, 102440, 230 | 9, 104040, 2, 109962, 9, 108108, 2, 108236, 9, 231 | 102693, 45861, 2, 99534, 102693, 9, 102693, 24300, 2, 232 | 100607, 100447, 102693, 9, 102693, 102113, 24300, 2, 44636, 233 | 102113, 9, 102693, 39426, 2, 99676, 99319, 151645, 198, 234 | 151644, 77091]], device='cuda:0') 235 | <|im_start|>system 236 | You are a helpful assistant.<|im_end|> 237 | <|im_start|>user 238 | 类型#裤*材质#牛仔布*颜色#白色*风格#简约*图案#线条*裤长#短裤*裤型#阔腿裤*裤腰型#高腰*裤口#毛边<|im_end|> 239 | <|im_start|>assistant这款牛仔裤的设计风格十分简约,线条流畅,十分显瘦,而且高腰的设计十分拉长腿型,显高显瘦。裤子采用了白色的色系,十分清新甜美,而且十分百搭。阔腿裤的版型十分修饰腿型,十分显腿长,而且十分遮肉显瘦。裤脚采用了毛边的设计,十分复古有个性,而且十分甜美可爱。这款牛仔短裤十分百搭,而且十分显腿长,而且十分显瘦,而且十分甜美甜美,十分凸显青春活力的气息。<|im_end|> 240 | 22.39077615737915 241 | 请输入: 242 | 类型#裤*版型#显瘦*材质#牛仔布*颜色#深蓝色*裤腰型#高腰 243 | 请稍等... 244 | ################################################################################################################################ 245 | {'instruction': '类型#裤*版型#显瘦*材质#牛仔布*颜色#深蓝色*裤腰型#高腰', 'input': '', 'output': ''} 246 | tensor([[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 247 | 151645, 198, 151644, 872, 198, 31905, 2, 102693, 9, 248 | 40301, 24300, 2, 99580, 102372, 9, 106149, 2, 100664, 249 | 102437, 51827, 9, 102284, 2, 99194, 105681, 9, 102693, 250 | 102113, 24300, 2, 44636, 102113, 151645, 198, 151644, 77091]], 251 | device='cuda:0') 252 | <|im_start|>system 253 | You are a helpful assistant.<|im_end|> 254 | <|im_start|>user 255 | 类型#裤*版型#显瘦*材质#牛仔布*颜色#深蓝色*裤腰型#高腰<|im_end|> 256 | <|im_start|>assistant这款牛仔裤采用深蓝色的色调,显得沉稳大方,同时也十分的显瘦。高腰的版型更是修饰了腿型,显高显腿长。牛仔布的材质十分的舒适亲肤,而且牛仔的质地也能凸显出个性的感觉,十分的青春活力。无论是日常的休闲搭配,还是职场的职场风格,都能很好的展现出女性的优雅知性。<|im_end|> 257 | 8.462041139602661 258 | ``` 259 | 260 | 261 | -------------------------------------------------------------------------------- /qwen2_sft/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2024/3/7 15:40 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /qwen2_sft/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/3/16 21:50 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /qwen2_sft/dataset/advgen_toy.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"input": "类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞", "output": "简约而不简单的牛仔外套,白色的衣身十分百搭。衣身多处有做旧破洞设计,打破单调乏味,增加一丝造型看点。衣身后背处有趣味刺绣装饰,丰富层次感,彰显别样时尚。"}, 3 | {"input": "类型#裙*材质#针织*颜色#纯色*风格#复古*风格#文艺*风格#简约*图案#格子*图案#纯色*图案#复古*裙型#背带裙*裙长#连衣裙*裙领型#半高领", "output": "这款BRAND针织两件套连衣裙,简约的纯色半高领针织上衣,修饰着颈部线,尽显优雅气质。同时搭配叠穿起一条背带式的复古格纹裙,整体散发着一股怀旧的时髦魅力,很是文艺范。"}, 4 | {"input": "类型#上衣*风格#嘻哈*图案#卡通*图案#印花*图案#撞色*衣样式#卫衣*衣款式#连帽", "output": "嘻哈玩转童年,随时,没错,出街还是要靠卫衣来装酷哦!时尚个性的连帽设计,率性有范还防风保暖。还有胸前撞色的卡通印花设计,靓丽抢眼更富有趣味性,加上前幅大容量又时尚美观的袋鼠兜,简直就是孩子耍帅装酷必备的利器。"}, 5 | {"input": "类型#裤*风格#英伦*风格#简约", "output": 
"裤子是简约大方的版型设计,带来一种极简主义风格而且不乏舒适优雅感,是衣橱必不可少的一件百搭单品。标志性的logo可以体现出一股子浓郁的英伦风情,轻而易举带来独一无二的体验。"}, 6 | {"input": "类型#裙*裙下摆#弧形*裙腰型#高腰*裙长#半身裙*裙款式#不规则*裙款式#收腰", "output": "这款来自梵凯的半身裙富有十足的设计感,采用了别致的不规则设计,凸显出时尚前卫的格调,再搭配俏皮的高腰设计,收腰提臀的同时还勾勒出优美迷人的身材曲线,而且还帮你拉长腿部比例,释放出优雅娇俏的小女人味。并且独特的弧形下摆还富有流畅的线条美,一颦一动间展现出灵动柔美的气质。"}, 7 | {"input": "类型#上衣*版型#宽松*版型#显瘦*图案#线条*衣样式#衬衫*衣袖型#泡泡袖*衣款式#抽绳", "output": "这件衬衫的款式非常的宽松,利落的线条可以很好的隐藏身材上的小缺点,穿在身上有着很好的显瘦效果。领口装饰了一个可爱的抽绳,漂亮的绳结展现出了十足的个性,配合时尚的泡泡袖型,尽显女性甜美可爱的气息。"}, 8 | {"input": "类型#裙*材质#蕾丝*风格#宫廷*图案#刺绣*图案#蕾丝*裙型#大裙摆*裙下摆#花边*裙袖型#泡泡袖", "output": "宫廷风的甜美蕾丝设计,清醒的蕾丝拼缝处,刺绣定制的贝壳花边,增添了裙子的精致感觉。超大的裙摆,加上精细的小花边设计,上身后既带着仙气撩人又很有女人味。泡泡袖上的提花面料,在细节处增加了浪漫感,春日的仙女姐姐。浪漫蕾丝布满整个裙身,美丽明艳,气质超仙。"}, 9 | {"input": "类型#裤*版型#显瘦*颜色#黑色*风格#简约*裤长#九分裤", "output": "个性化的九分裤型,穿着在身上,能够从视觉上拉长你的身体比例,让你看起来更加的有范。简约的黑色系列,极具时尚的韵味,充分凸显你专属的成熟韵味。修身的立体廓形,为你塑造修长的曲线。"}, 10 | {"input": "类型#裙*版型#显瘦*风格#文艺*风格#简约*图案#印花*图案#撞色*裙下摆#压褶*裙长#连衣裙*裙领型#圆领", "output": "文艺个性的印花连衣裙,藏青色底蕴,低调又大气,撞色太阳花分布整个裙身,绚丽而美好,带来时尚减龄的气质。基础款的舒适圆领,简约不失大方,勾勒精致脸庞。领后是一粒包布扣固定,穿脱十分方便。前片立体的打褶设计,搭配后片压褶的做工,增添层次和空间感,显瘦又有型。"}, 11 | {"input": "类型#裙*颜色#蓝色*风格#清新*图案#蝴蝶结", "output": "裙身处采用立体蝴蝶结装饰辅以蓝色条带点缀,令衣身造型饱满富有层次的同时为其注入一丝甜美气息。将女孩清新娇俏的一面衬托而出。"} 12 | ] -------------------------------------------------------------------------------- /qwen2_sft/dataset/alpaca_gpt4_data_zh.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "instruction": "保持健康的三个提示。", 4 | "input": "", 5 | "output": "以下是保持健康的三个提示:\n\n1. 保持身体活动。每天做适当的身体运动,如散步、跑步或游泳,能促进心血管健康,增强肌肉力量,并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物,避免高糖、高脂肪和加工食品,以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要,成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力,促进身体恢复,并提高注意力和记忆力。" 6 | }, 7 | { 8 | "instruction": "三原色是什么?", 9 | "input": "", 10 | "output": "三原色通常指的是红色、绿色和蓝色(RGB)。它们是通过加色混合原理创建色彩的三种基础颜色。在以发光为基础的显示设备中(如电视、计算机显示器、智能手机和平板电脑显示屏), 三原色可混合产生大量色彩。其中红色和绿色可以混合生成黄色,红色和蓝色可以混合生成品红色,蓝色和绿色可以混合生成青色。当红色、绿色和蓝色按相等比例混合时,可以产生白色或灰色。\n\n此外,在印刷和绘画中,三原色指的是以颜料为基础的红、黄和蓝颜色(RYB)。这三种颜色用以通过减色混合原理来创建色彩。不过,三原色的具体定义并不唯一,不同的颜色系统可能会采用不同的三原色。" 11 | }, 12 | { 13 | "instruction": "描述原子的结构。", 14 | "input": "", 15 | "output": "原子是物质的基本单位,它由三种基本粒子组成:质子、中子和电子。质子和中子形成原子核,位于原子中心,核外的电子围绕着原子核运动。\n\n原子结构具有层次性。原子核中,质子带正电,中子不带电(中性)。原子核非常小且致密,占据了原子总质量的绝大部分。电子带负电,通常围绕核运动,形成若干层次,称为壳层或电子层。电子数量与质子数量相等,使原子呈电中性。\n\n电子在每个壳层中都呈规律分布,并且不同壳层所能容纳的电子数也不同。在最里面的壳层一般只能容纳2个电子,其次一层最多可容纳8个电子,再往外的壳层可容纳的电子数逐层递增。\n\n原子核主要受到两种相互作用力的影响:强力和电磁力。强力的作用范围非常小,主要限制在原子核内,具有极强的吸引作用,使核子(质子和中子)紧密结合在一起。电磁力的作用范围较大,主要通过核外的电子与原子核相互作用,发挥作用。\n\n这就是原子的基本结构。原子内部结构复杂多样,不同元素的原子核中质子、中子数量不同,核外电子排布分布也不同,形成了丰富多彩的化学世界。" 16 | }, 17 | { 18 | "instruction": "如何减少空气污染?", 19 | "input": "", 20 | "output": "有很多方法来减少空气污染:\n\n1. 减少机动车排放: 通过支持可再生能源来驱动的交通工具,如电动汽车、混合动力汽车,使用公共交通工具,骑自行车或步行。\n2. 鼓励清洁能源: 政府和企业可以通过提供优惠政策和补贴来鼓励人们使用清洁能源。\n3. 减少工业排放: 工厂可以通过采用更清洁的生产方法,使用减排技术来减少工业废气的排放。\n4. 绿化城市: 种植更多的树木和植物可以吸收空气中的污染物和二氧化碳,有助于减少空气污染。\n5. 减少家庭能耗: 通过使用节能电器、关闭不必要的电源开关、减少用电量,能够减少家庭用电量,从而减少空气污染。\n6. 
垃圾分类和回收利用: 妥善处理垃圾,避免焚烧垃圾,对垃圾进行分类和回收利用,可以减少污染物排放。\n\n总之,减少空气污染需要政府、企业和个人共同努力。我们每个人都要承担责任,通过自己的实际行动来保护环境。" 21 | }, 22 | { 23 | "instruction": "描述一次你不得不做出困难决定的经历。", 24 | "input": "", 25 | "output": "作为一名人工智能助手,我是由程序控制的,没有感情和个人意识,我的一切决策都是基于我的算法和程序的设定。但在某种情况下,我需要根据我的程序判断做出决定。\n\n例如,当我处理多个任务时,如果有多个任务具有相同的优先级,那么我需要依据我的程序进行判断,选择哪个任务来处理。\n\n在这种情况下,我会将任务进行比较,考虑它们的时间紧迫性、资源需求、风险和收益等因素,最后做出一个最佳的决定。\n\n虽然这并不是一个真正的困难决定,因为我只是执行我的程序,但它确实需要我进行一定的判断,以确保我能够做出最好的决定。" 26 | }, 27 | { 28 | "instruction": "辨认不同之处。 ", 29 | "input": "输入:Twitter,Instagram,Telegram。", 30 | "output": "Twitter,Instagram 和 Telegram 都是流行的社交媒体平台,但它们之间也有一些不同之处:\n\n- Twitter: 是一个微博社交媒体平台,用户可以发布 280 字符以内的消息(即推文)并与其他用户互动。它主要关注文字交流,尽管用户也可以分享照片和视频。\n- Instagram: 是一个图片和视频分享的社交媒体平台,用户可以在 Instagram 上发布照片和视频并与其他用户互动。它主要关注图片和视频的分享,尽管用户也可以添加文字说明。\n- Telegram: 是一个即时通讯和社交媒体平台,用户可以通过文本、语音、图片和视频与其他用户即时交流。与 Twitter 和 Instagram 不同,Telegram 更侧重于即时通讯,它还提供了端到端加密的私人聊天功能。" 31 | }, 32 | { 33 | "instruction": "解释为什么以下分数等同于1/4", 34 | "input": "输入:4/16", 35 | "output": "4/16等于1/4是因为我们可以约分分子分母都除以他们的最大公约数4,得到(4÷4)/ (16÷4)=1/4。分数的约分是用分子和分母除以相同的非零整数,来表示分数的一个相同的值,这因为分数实际上表示了分子除以分母,所以即使两个数同时除以同一个非零整数,分数的值也不会改变。所以4/16 和1/4是两种不同的书写形式,但它们的值相等。" 36 | }, 37 | { 38 | "instruction": "写一个关于主人公必须做出重要职业决定的第三人称叙述的短故事。", 39 | "input": "", 40 | "output": "杰克是一位年轻有为的律师,他生活在纽约市,并在一家知名的法律事务所工作。杰克的生活似乎很完美,他有自己的公寓,工作稳定,薪水丰厚。然而,内心深处,杰克一直有一个愿望,那就是放弃目前的工作,去旅行并帮助那些需要帮助的人。\n\n一天,杰克的老板找到他,让他在两个选项之间做出一个决定:一是获得一份更高职位的提升,机会难得;二是离开这个公司去寻找其他的机会。杰克知道这是他人生中的一个重要时刻,他必须做出一个他将永远无法改变的决定。\n\n杰克开始思考这两个选项,仔细权衡利弊。他热爱自己的工作,同时也知道这份提升对他的职业生涯有着重大的影响。然而,他内心深处的渴望让他无法拒绝外面的世界。\n\n最终,杰克做出了决定。他选择离开这家公司,去追求他内心深处的愿望。他的老板尊重他的决定,并为他送行。\n\n杰克的职业生涯并没有因此终止,他在旅途中遇到了许多人,帮助了许多需要帮助的人。他发现,作出这个决定是他人生中最正确的选择。\n\n这个故事讲述了一个年轻人面临重要职" 41 | }, 42 | { 43 | "instruction": "渲染一座房子的三维模型", 44 | "input": "", 45 | "output": "对不起,作为一个语言模型,我无法渲染三维模型。但我可以为您提供一些建议。\n\n在制作三维模型时,您需要使用专业的三维建模软件,例如 Blender、Maya 或 Sketchup。这些软件提供了丰富的建模工具,可以让您轻松创建三维模型。\n\n在渲染三维模型时,您可以使用光照和材质来增强模型的真实感。常见的渲染器有 Blender 的 Cycles 渲染器和 Maya 的 Arnold 渲染器,它们都能产生逼真的渲染结果。\n\n如果您是初学者,建议您搜索相关教程学习三维建模和渲染技术。这些技能可以让您更快地掌握如何渲染一座房子的三维模型。" 46 | }, 47 | { 48 | "instruction": "朱利叶斯·凯撒是如何死亡的?", 49 | "input": "", 50 | "output": "朱利叶斯·凯撒,又称尤利乌斯·恺撒(Julius Caesar)是古罗马的政治家、军事家和作家。他于公元前44年3月15日被刺杀。 \n\n根据历史记载,当时罗马元老院里一些参议员联合起来策划了对恺撒的刺杀行动,因为他们担心恺撒的统治将给罗马共和制带来威胁。在公元前44年3月15日(又称“3月的艾达之日”),恺撒去参加元老院会议时,被一群参议员包围并被攻击致死。据记载,他身中23刀,其中一刀最终致命。" 51 | }, 52 | { 53 | "instruction": "法国的首都是什么?", 54 | "input": "", 55 | "output": "法国的首都是巴黎。" 56 | }, 57 | { 58 | "instruction": "生成一个人们去野营旅行可能需要的十件物品的清单。", 59 | "input": "", 60 | "output": "1. 帐篷: 帐篷是野营必备的物品,可以提供庇护,避免风吹日晒,还能够防止野生动物的干扰。\n2. 睡袋: 睡袋能够为您提供一个温暖舒适的睡眠环境,确保您在野外可以拥有一个良好的睡眠。\n3. 防水夹克: 在野外,天气变化无常,因此携带一件防水夹克十分必要,可以防止您被雨水淋湿。\n4. 照明设备: 如手电筒、头灯等,在野外夜晚黑暗的情况下为您提供照明,确保您的安全。\n5. 烹饪用具: 如便携式炉灶、锅碗瓢盆等,方便您在野外烹饪食物。\n6. 刀具: 如多功能军刀、折叠刀等,能够满足您在野外的各种需求。\n7. 防蚊液: 在野外,蚊虫十分繁多,因此携带防蚊液能够有效避免蚊虫叮咬。\n8. 防晒霜: 日晒对皮肤有害,携带防晒霜能够有效防护紫外线,避免晒伤。\n9. 水壶: 在野外,水是必不可少的,携带一个水壶能够方便您随时饮用。\n10. 
随身背包: 一个轻" 61 | }, 62 | { 63 | "instruction": "讨论大萧条的原因", 64 | "input": "", 65 | "output": "大萧条是20世纪初期发生在全球范围内的经济衰退现象,其中最为严重的地区当属美国。这场经济危机持续了近十年之久,它的起因较为复杂,涉及多方面因素。\n\n首先,我们要提到1929年的股市崩盘。股票价格大幅下跌,导致投资者损失惨重,进而影响了消费信心。\n\n其次,当时的金融政策也是一个重要因素。为了维持金本位制度,联邦储备系统采取紧缩政策,却进一步加剧了资金短缺的问题。\n\n此外,当时政府对失业率的应对不力,社会保障制度不健全,也为经济衰退的进一步恶化埋下了祸根。\n\n最后,国际贸易也受到了影响。由于当时许多国家采取贸易保护主义政策,全球贸易额大幅下降,给各国经济造成了巨大冲击。\n\n综上所述,大萧条的原因是多方面的,涉及到金融政策,国际贸易,社会保障制度等多个领域。它给全球经济发展带来了严重的挑战,也为我们提供了深刻的历史经验教训。" 66 | }, 67 | { 68 | "instruction": "将以下内容分类为动物、植物和矿物质:橡树、铜矿石、大象。", 69 | "input": "", 70 | "output": "植物: 橡树\n动物:大象\n矿物质:铜矿石" 71 | }, 72 | { 73 | "instruction": "自然语言处理中词嵌入的用途。", 74 | "input": "", 75 | "output": "词嵌入是一种自然语言处理技术,用于表示文本数据。它把单词映射到一个高维的、连续的数值向量空间,并保留了单词之间的语义关系。\n\n词嵌入在自然语言处理中有多种用途,包括但不限于:\n\n1. 提取文本中的特征:词嵌入能够捕获文本数据中的词义和结构信息,帮助提取文本的关键特征。\n\n2. 降维:自然语言处理的任务通常会涉及大量的单词。词嵌入能够将高维的单词空间压缩到一个较低维度的连续向量空间中,大大减少计算量。\n\n3. 相似性搜索:词嵌入算法能够保留单词之间的语义关系。计算相似性时,查找词向量空间中的最近邻居可用来实现相关文本的自动检索。\n\n4. 文本分类:词嵌入向量可作为文本分类器的输入数据。它能够提取文本语义和结构信息,并为文本分类模型提供有用的特征。\n\n5. 机器翻译:词嵌入技术也可以用于机器翻译。利用词嵌入向量,可以将源语言单词映射到目标语言的单词向量空间中,实现跨语言文本检索和翻译。\n\n6. 自然语言生成:词嵌入也用于自然语言生成任务。在文本生成过程中,可以使用词嵌入来帮助生成人类可读的、语义连" 76 | }, 77 | { 78 | "instruction": "描述计算机主板的功能", 79 | "input": "", 80 | "output": "计算机主板,也称为主板、系统板或系统板,是计算机中心组件之一,承担着连接并控制计算机中各硬件部件之间的通讯与数据传输功能。\n\n主板上布置了各种总线,包括数据总线、地址总线、控制总线等,通过这些总线,主板连接了中央处理器(CPU)、内存(RAM)、硬盘驱动器(HDD或SSD)、光驱、网卡、声卡、视频卡等硬件设备,协调它们之间的工作。\n\n主板还搭载了各种芯片组,起到调节和管理各个硬件部件运作的作用。比如,南桥芯片组负责管理主板上的外部设备接口,如USB接口、串口、并口等,而北桥芯片组则主要负责管理处理器、内存和PCI总线之间的数据交互。\n\n此外,主板上还设有BIOS芯片,存储着计算机的基本输入输出系统,控制计算机启动时硬件自检和操作系统的初始化工作。\n\n总之,计算机主板充当着计算机硬件系统的枢纽,负责各个部件之间的协调工作,是计算机运行不可缺少的重要部件。" 81 | }, 82 | { 83 | "instruction": "逆向工程此代码以创建新版本。", 84 | "input": "", 85 | "output": "逆向工程是指通过分析现有软件或系统来充分理解其设计和工作原理,并基于这些理解来创建新版本。由于你没有提供具体的代码,我无法给出确切的步骤来逆向工程它。\n\n但是,一般来说,逆向工程代码通常涉及以下步骤:\n\n1. 阅读和分析代码:这是逆向工程中最重要的一步。你需要仔细阅读代码,了解它如何工作,以及它使用的算法和数据结构。\n\n2. 理解代码的功能和目的:在阅读代码时,应该注意了解代码的功能和目的,以便在创建新版本时能保持原有功能。\n\n3. 提取关键信息:通过分析代码,提取出关键信息,包括算法,数据结构,输入输出等。\n\n4. 根据提取的信息创建新版本:使用提取的信息,创建新版本的代码,可以进行优化,添加新功能等。\n\n需要注意的是,逆向工程可能是一项复杂且费时的工程,需要具备良好的代码阅读能力和分析能力。此外,在逆向工程代码时,应注意遵守相关法律法规,不要侵犯他人的知识产权。" 86 | }, 87 | { 88 | "instruction": "提出数据隐私问题的道德解决方案", 89 | "input": "", 90 | "output": "数据隐私问题的解决方案从道德角度考虑,可以包括以下几个方面:\n\n1. 公开透明: 企业和组织需要更加公开透明地收集,处理和存储个人信息。数据收集和使用的政策应该明确表明,个人信息将如何被使用,谁可以访问,以及在什么情况下可能会被分享。\n\n2. 提供选择: 个人应该有选择是否提交个人信息,以及控制其信息如何被使用的权利。这意味着企业需要提供简单明了的选择:“退出”或“不同意”的选项。\n\n3. 保护个人信息: 企业和组织有责任采取适当的技术和组织措施来保护个人信息免受未经授权访问和泄露。这包括对信息的加密,访问控制和监控。\n\n4. 法律监管: 政府需要制定并执行严格的数据保护法律,以保护公民免受隐私侵犯。这些法律应规定企业如何收集,使用和分享个人信息,以及应对违规行为的惩罚措施。\n\n5. 道德责任: 企业和组织应认识到,保护数据隐私不仅仅是法律责任,更是一种道德责任。这意味着他们应该以一种合乎道德的方式收集,使用和处理个人信息,尊重个人权利和自主权。\n\n上述举措为解决数据隐私问题提供了道德指南,但通过不断努力和跟进实施才能真正保护人们的" 91 | }, 92 | { 93 | "instruction": "生成三个与“道歉”意思相同的动词。", 94 | "input": "", 95 | "output": "1. 抱歉\n2. 致歉\n3. 认错." 
96 | }, 97 | { 98 | "instruction": "DNA代表什么?", 99 | "input": "", 100 | "output": "DNA是脱氧核糖核酸(Deoxyribonucleic Acid)的缩写。它是一种生物大分子,负责编码遗传信息并传递遗产特征。 DNA分子由四种核苷酸:腺嘌呤(A),鸟嘌呤(G),胞嘧啶(C)和胸腺嘧啶(T)组成,以特定的序列排列,共同构成某个物种特有的基因组。 DNA的双螺旋结构和碱基配对机制使其能很好地复制和完成遗传信息的转录和翻译。" 101 | }, 102 | { 103 | "instruction": "比较古巴导弹危机和越南战争。", 104 | "input": "", 105 | "output": "古巴导弹危机和越南战争都是20世纪冷战期间发生的重大事件,它们在当时对世界的政治格局产生了深远的影响。但除此之外,两者之间还有许多不同之处。\n\n古巴导弹危机,又称“加勒比海危机”,发生在1962年,是指苏联在古巴部署核武器,导致美国和苏联两个超级大国之间的关系紧张,最终双方通过谈判,平息了这场危机。这一事件被认为是冷战中最危险的时刻,是世界上最接近核战争的时刻。\n\n越南战争,发生在1955年到1975年,是一场旷日持久的战争。它是由北越和南越之间的内战开始的,但最终美国和其他西方国家介入,将战争升级成了东西方意识形态的对抗。这场战争导致了巨大的人道灾难,数百万人死亡,越南、老挝和柬埔寨三国都受到了巨大的影响。\n\n在政治影响上,古巴导弹危机是一场外交危机,它使得美国和苏联两个超级大国关系紧张,但最终通过谈判化解了危机。越南战争则是一场旷日持久的战争,使美国在国际上的形象受损,也对美国内部政治造成了巨大的影响。\n\n在持续时间" 106 | }, 107 | { 108 | "instruction": "生成随机单词列表。", 109 | "input": "", 110 | "output": "[\"hippopotamus\", \"giraffe\", \"rhinoceros\", \"cheetah\", \"platypus\", \"parrot\", \"jellyfish\", \"octopus\", \"dolphin\", \"panther\"]" 111 | }, 112 | { 113 | "instruction": "把关于滑铁卢历史的录音转录成文字。", 114 | "input": "", 115 | "output": "很抱歉,但我无法直接转录录音。您需要提供录音文件的链接或将录音文本提供给我,然后我才能将其转录成文字。" 116 | }, 117 | { 118 | "instruction": "世界上最著名的画家是谁?", 119 | "input": "", 120 | "output": "世界上有许多著名的画家,其中最著名的可能是意大利文艺复兴时期的画家列奥纳多·达·芬奇(Leonardo da Vinci)。他的 杰作包括《蒙娜丽莎》(Mona Lisa)和《最后的晚餐》(The Last Supper)。但是,这个问题并没有确定的答案,因为每个人都有自己的看法。其他著名的画家还有毕加索(Pablo Picasso)、梵高(Vincent van Gogh)、米开朗基罗(Michelangelo)、雷普尔(Raphael)、罗伯特(Rubens)等。" 121 | }, 122 | { 123 | "instruction": "解释齿轮抵抗力的概念。", 124 | "input": "", 125 | "output": "齿轮抵抗力是用来描述齿轮传动系统中各组件受到的摩擦力及惯性力的总和。它主要包括以下几个方面:\n\n1. 齿轮啮合的滑动摩擦力:当齿轮的齿面相互啮合时,会产生相对滑动,从而导致一定的滑动摩擦力;\n\n2. 齿间间隙的碰撞力:齿间间隙存在一定的间隙,在两齿传动过程中会产生碰撞,产生碰撞力;\n\n3. 轴承摩擦力:轴承承受轴和齿轮的转矩作用,会产生一定的摩擦力;\n\n4. 惯性力:齿轮旋转时,其自身及齿间有相对运动,会产生各种类型的惯性力,包括离心惯性力、切向惯性力等。\n\n齿轮抵抗力会影响齿轮传动的效率,并导致热量的产生和噪音的增加。通过采用合理的齿轮结构设计,选择合适的材料和润滑剂,可有效降低齿轮抵抗力,使齿轮传动更加平稳高效。" 126 | }, 127 | { 128 | "instruction": "查水的沸点。", 129 | "input": "", 130 | "output": "水的沸点是指水在大气压强下由液态变成气态的温度。在标准大气压(101.325 kPa)下,水的沸点约为 100°C (摄氏度) 或 212°F(华氏度)。 需要注意的是,水的沸点会随着海拔的改变而改变,例如,在高海拔地区,空气压强降低,水的沸点也会降低。此外,水中溶解物质的多少也会影响沸点,如盐水的沸点要高于纯水的沸点。" 131 | }, 132 | { 133 | "instruction": "描述查理二世国王的生活和统治。", 134 | "input": "", 135 | "output": "查理二世是英格兰、苏格兰和爱尔兰的国王,他在位时间为1660年至1685年。他是斯图亚特王朝的一员,他的父亲是查理一世,在内战中被判处死刑,因此他在青年时期度过了一段流亡生活。1660年,查理二世在王政复辟中重新登上王位。\n\n查理二世统治时期,政治上的一个重要特点是他确立了君主与议会的共治局面,使英国走上了君主立宪制的道路。与此同时,他在宗教政策上采取了一种宽容和谦和的态度,允许天主教徒和清教徒在英格兰存在,并鼓励与其他欧洲国家建立外交关系。\n\n查理二世的统治时期也面临一些挑战,比如1665年的鼠疫大流行和1666年的伦敦大火。但查理二世积极应对这些挑战,下令疏散城市、建造医院和改善卫生环境,推动了伦敦重建。此外,他还重视文化艺术的发展,推动了英国文艺复兴运动的展开。\n\n在个人生活中,查理二世以魅力和风趣著称,他热爱社交活动,赛马和音乐,曾养过许多宠物猎犬和猎鹰。他与妻子凯瑟琳·布拉甘萨没有子嗣,但他公开承认了13" 136 | }, 137 | { 138 | "instruction": "确定经线,将北美洲和南美洲分开。", 139 | "input": "", 140 | "output": "您好,北美洲和南美洲分别位于西半球。这两大洲被一条东西走向的大陆分界线分隔开来。这条分界线被称为巴拿马分水岭。它位于巴拿马地峡的最窄处,将巴拿马的加勒比海海岸与太平洋海岸分隔开来。近似上,我们可以用80°W经线来确定分界线,将北美洲以北地区与南美洲以南地区分隔开来。" 141 | }, 142 | { 143 | "instruction": "解释植物细胞呼吸的过程。", 144 | "input": "", 145 | "output": "植物细胞呼吸可以定义为细胞通过氧气氧化有机分子并产生能量的过程。这个过程发生在线粒体中,并分为三个主要部分:\n\n1. 糖酵解(glycolysis):这个过程在细胞质中进行,将葡萄糖(或其他糖)分解成两个三碳分子的丙酮酸(pyruvate),并释放少量的能量。\n2. 柠檬酸循环(citric acid cycle):这个过程在线粒体中进行,丙酮酸会转化为乙酰辅酶A(acetyl-CoA)并进入柠檬酸循环。柠檬酸循环中,乙酰辅酶A会被氧化,释放二氧化碳和能量。\n3. 电子传递链(electron transport chain):这个过程也在线粒体中进行,通过氧化还原反应将电子从高能级传递到低能级,释放能量。最终,氧气接收电子并结合氢离子形成水。\n\n在这个过程中,能量以ATP(三磷酸腺苷)的形式储存并释放。植物细胞呼吸与动物细胞呼吸非常相似,区别主要在于植物细胞具有叶绿体进行光合作用,可以自己产生糖分供呼吸作用使用。" 146 | }, 147 | { 148 | "instruction": "生成一个形容词列表,描述一个勇敢的人。", 149 | "input": "", 150 | "output": "1. 勇敢\n2. 英勇\n3. 英勇无畏\n4. 坚定\n5. 无畏\n6. 勇猛\n7. 刚毅\n8. 毅然\n9. 
果敢\n10. 胆大\n11. 勇往直前\n12. 勇敢坚定\n13. 勇气十足\n14. 不屈不挠\n15. 不怕困难" 151 | }, 152 | { 153 | "instruction": "概述森林砍伐的后果。", 154 | "input": "", 155 | "output": "森林砍伐是指大规模砍伐森林以获取木材或者开辟土地进行其他活动。这一行为会造成许多环境和社会后果。\n\n从环境方面来看,森林砍伐会使得土地流失肥沃的表土,导致土壤流失和干旱。同时,森林砍伐会破坏动植物的栖息地,威胁生物多样性。森林的减少还会改变气候模式,引发干旱,洪水等自然灾害。\n\n社会方面,森林砍伐会影响到当地社区的经济和生活方式。许多社区依赖森林的资源,如木材,水果和药材,森林砍伐可能会破坏他们的生计。森林砍伐还可能导致土地纠纷和冲突。\n\n此外,森林砍伐还会影响全球气候变化。森林是重要的碳汇,减少森林会使大量的二氧化碳释放进大气层,加剧全球变暖。\n\nIn conclusion,砍伐森林会带来多种环境和社会问题,为了可持续发展,应当采取措施控制砍伐行为,保护森林。" 156 | }, 157 | { 158 | "instruction": "制定一个减少家庭用电的计划。", 159 | "input": "", 160 | "output": "制定减少家庭用电的计划可以通过以下步骤实现:\n\n1. 了解家庭用电情况:首先,通过查看电费账单和电表读数了解家庭的用电情况,找出哪些家电设备用电量较大,哪些时间段用电量较高。\n\n2. 提高用电效率:购买节能型家电,选择能效等级高的产品,可以有效降低用电量。而对于已有的家电,使用时应注意节能,例如在使用空调时保持室内温度适宜,不要过低或过高。\n\n3. 改变用电习惯:鼓励家庭成员改变用电习惯,例如及时关闭不用的电源开关,避免长时间待机耗电。尽量利用自然光线,减少白天开灯的时间。\n\n4. 考虑可再生能源:如果条件允许,可以考虑安装太阳能发电系统,利用太阳能发电来降低家庭用电量。\n\n5. 定期检查电器:定期检查电器是否正常运行,及时维修不良电器,以免造成浪费。\n\n最后,要持续关注家庭用电情况,定期评估减少用电的效果,不断调整计划,以适应家庭的实际需求。" 161 | }, 162 | { 163 | "instruction": "快速棕色的狐狸跳了。", 164 | "input": "", 165 | "output": "This sentence in Chinese roughly translates to \"The quick brown fox jumps.\" in English. Is there something specific you would like me to do with this information?" 166 | }, 167 | { 168 | "instruction": "分析给定文本的语气。 ", 169 | "input": "输入:COVID-19大流行对世界产生了巨大影响,它已经彻底改变了我们的生活。", 170 | "output": "本句话的语气是严肃认真的。" 171 | }, 172 | { 173 | "instruction": "使用给定数据计算中位数。", 174 | "input": "输入:\n[2, 3, 7, 8, 10]", 175 | "output": "中位数是指在一组数据中,这组数据从小到大排列后,位于中间位置的数。如果数据的数量为奇数,那么中位数就是中间那个数。如果数据的数量为偶数,那么中位数为中间两个数的平均值。\n在这个例子中,给定的数据集是[2, 3, 7, 8, 10],已经从小到大排列。 数据数量是5,是一个奇数,所以中位数就是正中间的那个数,即7。所以这组数据的中位数是7。" 176 | } 177 | ] 178 | -------------------------------------------------------------------------------- /qwen2_sft/ft_qwen2/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2024/3/7 16:12 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /qwen2_sft/ft_qwen2/config.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/3/5 21:29 4 | # @author : Mo 5 | # @function: config of qwen1.5 6 | 7 | 8 | # optimized for RTX 4090. for larger GPUs, increase some of these? 
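# Note: the effective optimizer batch size is MICRO_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS;
# with the defaults below, GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE = 128 // 4 = 32,
# so each optimizer update accumulates 128 samples while only 4 are resident on the GPU at a time.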
9 | MICRO_BATCH_SIZE = 4 # 4 # default=4 # this could actually be 5 but i like powers of 2 10 | BATCH_SIZE = 128 # 128 11 | GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE 12 | LEARNING_RATE = 5e-5 ### 1e-5 13 | # LEARNING_RATE = 3e-4 # default=3e-4 # the Karpathy constant 14 | EPOCHS = 3 # default=3 # we don't always need 3 tbh 15 | # LORA_DROPOUT = 0.1 16 | # LORA_ALPHA = 32 17 | # LORA_R = 32 18 | WEIGHT_DECAY = 0.01 19 | LORA_DROPOUT = 0.1 20 | LORA_ALPHA = 16 21 | LORA_R = 8 22 | SAVE_STEPS = 384 23 | VAL_SET_SIZE = 0 24 | MAX_LENGTH_Q = 128 - 1 # default=128 - 2 25 | MAX_LENGTH_A = 128 - 1 # default=128 - 2 26 | MAX_LENGTH_QA = MAX_LENGTH_Q + MAX_LENGTH_A + 2 27 | TARGET_MODULES = ["q_proj", 28 | "k_proj", 29 | "v_proj", 30 | # "o_proj", 31 | # "down_proj", 32 | # "gate_proj", 33 | # "up_proj", 34 | ] 35 | 36 | PATH_MODEL_PRETRAIN = "" 37 | REPO_ID = "Qwen/Qwen1.5-7B-Chat" 38 | PATH_MODEL_PRETRAIN = PATH_MODEL_PRETRAIN if PATH_MODEL_PRETRAIN else REPO_ID 39 | DATA_PATH = "../dataset/alpaca_gpt4_data_zh.json" 40 | MODEL_SAVE_DIR = "model_sft" 41 | 42 | IS_PARALLELIZABLE = True 43 | MODEL_PARALLEL = True 44 | USE_CACHE = False 45 | CUDA_VISIBLE_DEVICES = "0" 46 | USE_TORCH = "1" 47 | CPU_NUMS = "9" 48 | USE_CUDA = False if CUDA_VISIBLE_DEVICES == "-1" else True 49 | USE_ALL_LOSS = True # 计算loss时是否计算全部(False则为) 50 | 51 | """ 52 | ('base_model.model.model.embed_tokens.weight', torch.bfloat16, False) 53 | ('base_model.model.model.layers.31.self_attn.q_proj.weight', torch.bfloat16, False) 54 | ('base_model.model.model.layers.31.self_attn.q_proj.bias', torch.bfloat16, False) 55 | ('base_model.model.model.layers.31.self_attn.q_proj.lora_A.default.weight', torch.float32, True) 56 | ('base_model.model.model.layers.31.self_attn.q_proj.lora_B.default.weight', torch.float32, True) 57 | ('base_model.model.model.layers.31.self_attn.k_proj.weight', torch.bfloat16, False) 58 | ('base_model.model.model.layers.31.self_attn.k_proj.bias', torch.bfloat16, False) 59 | ('base_model.model.model.layers.31.self_attn.k_proj.lora_A.default.weight', torch.float32, True) 60 | ('base_model.model.model.layers.31.self_attn.k_proj.lora_B.default.weight', torch.float32, True) 61 | ('base_model.model.model.layers.31.self_attn.v_proj.weight', torch.bfloat16, False) 62 | ('base_model.model.model.layers.31.self_attn.v_proj.bias', torch.bfloat16, False) 63 | ('base_model.model.model.layers.31.self_attn.v_proj.lora_A.default.weight', torch.float32, True) 64 | ('base_model.model.model.layers.31.self_attn.v_proj.lora_B.default.weight', torch.float32, True) 65 | ('base_model.model.model.layers.31.self_attn.o_proj.weight', torch.bfloat16, False) 66 | ('base_model.model.model.layers.31.mlp.gate_proj.weight', torch.bfloat16, False) 67 | ('base_model.model.model.layers.31.mlp.up_proj.weight', torch.bfloat16, False) 68 | ('base_model.model.model.layers.31.mlp.down_proj.weight', torch.bfloat16, False) 69 | ('base_model.model.model.layers.31.input_layernorm.weight', torch.bfloat16, False) 70 | ('base_model.model.model.layers.31.post_attention_layernorm.weight', torch.bfloat16, False) 71 | ('base_model.model.model.norm.weight', torch.bfloat16, False) 72 | ('base_model.model.lm_head.weight', torch.bfloat16, False) 73 | """ 74 | # layernorm_s = ["post_attention_layernorm", 75 | # "input_layernorm", 76 | # "norm" 77 | # ] 78 | -------------------------------------------------------------------------------- /qwen2_sft/ft_qwen2/evaluation.py: -------------------------------------------------------------------------------- 1 | # 
!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/3/25 21:56 4 | # @author : Mo 5 | # @function: 验证评估 6 | 7 | 8 | import logging as logger 9 | import traceback 10 | import logging 11 | import random 12 | import time 13 | import json 14 | import sys 15 | import os 16 | 17 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 18 | print(path_root) 19 | sys.path.append(path_root) 20 | from qwen2_sft.ft_qwen2.config import CUDA_VISIBLE_DEVICES, USE_TORCH, CPU_NUMS # from config 21 | os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:3072" 22 | os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES 23 | os.environ["USE_TORCH"] = USE_TORCH 24 | os.environ["OMP_NUM_THREADS"] = CPU_NUMS # export OMP_NUM_THREADS=1 25 | os.environ["OPENBLAS_NUM_THREADS"] = CPU_NUMS # export OPENBLAS_NUM_THREADS=1 26 | os.environ["MKL_NUM_THREADS"] = CPU_NUMS # export MKL_NUM_THREADS=1 27 | os.environ["VECLIB_MAXIMUM_THREADS"] = CPU_NUMS # export VECLIB_MAXIMUM_THREADS=1 28 | os.environ["NUMEXPR_NUM_THREADS"] = CPU_NUMS # export NUMEXPR_NUM_THREADS=1 29 | 30 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 31 | from peft import prepare_model_for_int8_training 32 | from peft import LoraConfig, get_peft_model 33 | from transformers import GenerationConfig 34 | from pydantic import BaseModel 35 | from rouge import Rouge # pip install rouge 36 | from tqdm import tqdm 37 | import torch 38 | import jieba 39 | 40 | # from qwen2_sft.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as LLMTokenizer 41 | # from qwen2_sft.models.qwen2.configuration_qwen2 import Qwen2Config as LLMConfig 42 | # from qwen2_sft.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM as LLMModel 43 | from transformers import Qwen2Tokenizer as LLMTokenizer 44 | from transformers import Qwen2ForCausalLM as LLMModel 45 | from transformers import Qwen2Config as LLMConfig 46 | from qwen2_sft.ft_qwen2.config import PATH_MODEL_PRETRAIN, DATA_PATH, MODEL_SAVE_DIR, REPO_ID 47 | from qwen2_sft.ft_qwen2.config import MICRO_BATCH_SIZE, BATCH_SIZE, GRADIENT_ACCUMULATION_STEPS 48 | from qwen2_sft.ft_qwen2.config import LEARNING_RATE, EPOCHS, SAVE_STEPS, VAL_SET_SIZE, TARGET_MODULES 49 | from qwen2_sft.ft_qwen2.config import IS_PARALLELIZABLE, MODEL_PARALLEL, USE_CACHE 50 | from qwen2_sft.ft_qwen2.config import MAX_LENGTH_Q, MAX_LENGTH_A, MAX_LENGTH_QA 51 | from qwen2_sft.ft_qwen2.config import LORA_DROPOUT, LORA_ALPHA, LORA_R 52 | from qwen2_sft.ft_qwen2.config import USE_CUDA 53 | 54 | 55 | # # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 56 | # world_size = int(os.environ.get("WORLD_SIZE", 1)) 57 | # ddp = world_size != 1 58 | # device_map = "auto" 59 | # # USE_CUDA = True 60 | # print(device_map) 61 | # print(ddp) 62 | 63 | 64 | def load_model_state(model, model_save_dir="./", model_name="adapter_model.safetensors", device="cpu"): 65 | """ 仅加载模型参数(推荐使用) """ 66 | try: 67 | path_model = os.path.join(model_save_dir, model_name) 68 | peft_config = LoraConfig.from_pretrained(model_save_dir) 69 | peft_config.inference_mode = True 70 | model = get_peft_model(model, peft_config) 71 | 72 | try: 73 | if path_model.endswith(".safetensors"): 74 | from safetensors.torch import load_file, save_file 75 | from safetensors import safe_open 76 | state_dict = {} 77 | with safe_open(path_model, framework="pt", device="cpu") as f: 78 | for k in f.keys(): 79 | state_dict[k] = f.get_tensor(k) 80 | ### if path_model.endswith(".bin") or path_model.endswith(".pt"): 81 | else: 82 | state_dict = 
torch.load(path_model, map_location=torch.device(device)) 83 | except Exception as e: 84 | print(traceback.print_exc()) 85 | ### 全部训练完的话会用这个, 即便是.safetensors 86 | state_dict = torch.load(path_model, map_location=torch.device(device)) 87 | 88 | # print(state_dict.keys()) 89 | state_dict = {"base_model.model." + k.replace("_orig_mod.", "") 90 | .replace(".lora_A.weight", ".lora_A.default.weight") 91 | .replace(".lora_B.weight", ".lora_B.default.weight") 92 | : v for k, v in state_dict.items()} 93 | print(state_dict.keys()) 94 | print("#" * 128) 95 | ### 排查不存在model.keys的 state_dict.key 96 | name_dict = {name: 0 for name, param in model.named_parameters()} 97 | print(name_dict.keys()) 98 | print("#" * 128) 99 | for state_dict_key in state_dict.keys(): 100 | if state_dict_key not in name_dict: 101 | print("{} is not exist!".format(state_dict_key)) 102 | model.load_state_dict(state_dict, strict=False) 103 | # model.to(device) 104 | print("******model loaded success******") 105 | print("self.device: {}".format(device)) 106 | except Exception as e: 107 | print(str(e)) 108 | raise Exception("******load model error******") 109 | return model 110 | def save_model_state(model, config=None, model_save_dir="./", model_name="pytorch_model.bin"): 111 | """ 仅保存模型参数(推荐使用) """ 112 | if not os.path.exists(model_save_dir): 113 | os.makedirs(model_save_dir) 114 | if config: 115 | config.save_pretrained(model_save_dir) 116 | # save model 117 | path_model = os.path.join(model_save_dir, model_name) 118 | # torch.save(model.state_dict(), path_model) 119 | grad_params_dict = {k: v.to("cpu") for k, v in model.named_parameters() 120 | if v.requires_grad == True} 121 | torch.save(grad_params_dict, path_model) 122 | print("******model_save_path is {}******".format(path_model)) 123 | def prepare_model_for_half_training(model, output_embedding_layer_name="lm_head", 124 | use_gradient_checkpointing=True, layer_norm_names=["layer_norm"]): 125 | r""" 126 | This method wrapps the entire protocol for preparing a model before running a training. 
This includes: 127 | 1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm 128 | head to fp32 129 | 130 | Args: 131 | model, (`transformers.PreTrainedModel`): 132 | The loaded model from `transformers` 133 | """ 134 | # 不要使用 model.half(), 这样会先截取精度再训练了, 最初data就要保持half 135 | for name, param in model.named_parameters(): 136 | # freeze base model's layers 137 | param.requires_grad = False 138 | # cast layer norm in fp32 for stability for 8bit models 139 | if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names): 140 | param.data = param.data.to(torch.float32) 141 | elif output_embedding_layer_name in name: # lm_head也需要是tf.float32(最后一层) 142 | param.data = param.data.to(torch.float32) 143 | else: 144 | param.data = param.data.to(torch.half) 145 | 146 | if use_gradient_checkpointing: 147 | # For backward compatibility 148 | if hasattr(model, "enable_input_require_grads"): 149 | model.enable_input_require_grads() 150 | else: 151 | def make_inputs_require_grad(module, input, output): 152 | output.requires_grad_(True) 153 | model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) 154 | # enable gradient checkpointing for memory efficiency 155 | model.gradient_checkpointing_enable() 156 | return model 157 | def print_named_parameters(model, use_print_data=False): 158 | """ 打印模型训练参数/数据类型信息 """ 159 | trainable_params = 0 160 | all_param = 0 161 | for name, param in model.named_parameters(): 162 | if use_print_data: 163 | print((name, param.data.dtype, param.requires_grad, param.data)) 164 | else: 165 | print((name, param.data.dtype, param.requires_grad)) 166 | num_params = param.numel() 167 | # if using DS Zero 3 and the weights are initialized empty 168 | if num_params == 0 and hasattr(param, "ds_numel"): 169 | num_params = param.ds_numel 170 | all_param += num_params 171 | if param.requires_grad: 172 | trainable_params += num_params 173 | print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}") 174 | def txt_read(path, encode_type="utf-8", errors=None): 175 | """ 176 | 读取txt文件,默认utf8格式, 不能有空行 177 | Args: 178 | path[String]: path of file of read, eg. "corpus/xuexiqiangguo.txt" 179 | encode_type[String]: data encode type of file, eg. "utf-8", "gbk" 180 | errors[String]: specifies how encoding errors handled, eg. "ignore", strict 181 | Returns: 182 | lines[List]: output lines 183 | """ 184 | lines = [] 185 | try: 186 | file = open(path, "r", encoding=encode_type, errors=errors) 187 | lines = file.readlines() 188 | file.close() 189 | except Exception as e: 190 | logger.info(str(e)) 191 | finally: 192 | return lines 193 | def load_json(path: str, encoding: str="utf-8"): 194 | """ 195 | Read Line of List form file 196 | Args: 197 | path: path of save file, such as "txt" 198 | encoding: type of encoding, such as "utf-8", "gbk" 199 | Returns: 200 | model_json: dict of word2vec, eg. 
[{"大漠帝国":132}] 201 | """ 202 | with open(path, "r", encoding=encoding) as fj: 203 | model_json = json.load(fj) 204 | fj.close() 205 | return model_json 206 | def generate_prompt(data_point, is_logger=False): 207 | """ 指令微调: 208 | """ 209 | text_input = data_point.get("instruction", "") + "\t" + data_point.get("input", "") 210 | text_out = data_point.get("output", "") 211 | prompt_text_1 = """<|im_start|>system 212 | You are a helpful assistant.<|im_end|> 213 | <|im_start|>user 214 | {}<|im_end|> 215 | <|im_start|>assistant""" 216 | tokens = tokenizer.tokenize(prompt_text_1) 217 | max_len = MAX_LENGTH_QA - len(tokens) 218 | text_1 = prompt_text_1.format(text_input[:max_len]) 219 | x = tokenizer.encode(text_1, add_special_tokens=False) 220 | if len(x) > (MAX_LENGTH_Q + MAX_LENGTH_A): 221 | x = x[:MAX_LENGTH_Q] + [ID_EOS] 222 | out = {"input_ids": x, "labels": []} 223 | if is_logger: 224 | print(text_1) 225 | print(out) 226 | return out 227 | 228 | 229 | model = LLMModel.from_pretrained(PATH_MODEL_PRETRAIN) 230 | # model = prepare_model_for_half_training(model, 231 | # use_gradient_checkpointing=False, 232 | # output_embedding_layer_name="lm_head", 233 | # layer_norm_names=["post_attention_layernorm", 234 | # "input_layernorm", 235 | # "norm" 236 | # ], 237 | # ) 238 | config = LoraConfig(target_modules=TARGET_MODULES, 239 | lora_dropout=LORA_DROPOUT, 240 | lora_alpha=LORA_ALPHA, 241 | task_type="CAUSAL_LM", 242 | bias="none", 243 | r=LORA_R, 244 | ) 245 | model = get_peft_model(model, config) 246 | model = load_model_state(model=model, model_save_dir=MODEL_SAVE_DIR) 247 | if USE_CUDA: 248 | model = model.cuda() 249 | else: 250 | model = model.bfloat16() 251 | print_named_parameters(model, use_print_data=True) 252 | # print_named_parameters(model) 253 | 254 | tokenizer = LLMTokenizer.from_pretrained(PATH_MODEL_PRETRAIN, add_eos_token=True) 255 | ID_END = 151643 256 | ID_BOS = 151644 257 | ID_EOS = 151645 258 | ID_PAD = ID_EOS 259 | ID_BR = 1699 # "\\n" 260 | ID_USER = 872 261 | ID_MODEL = 8948 262 | tokenizer.pad_token_id = ID_EOS 263 | tokenizer.eos_token_id = ID_EOS 264 | tokenizer.padding_side = "left" 265 | # tokenizer.padding_side = "right" # NO use attention-mask 266 | print(ID_PAD) 267 | print(ID_BOS) 268 | print(ID_EOS) 269 | print(ID_BR) 270 | print(ID_USER) 271 | print(ID_MODEL) 272 | """ 273 | "<|endoftext|>": 151643, 274 | "<|im_start|>": 151644, 275 | "<|im_end|>": 151645 276 | "system": 8948, 277 | "user": 872, 278 | "\\n": 1699, 279 | """ 280 | STOP_WORDS_IDS = [[ID_BOS], [ID_EOS], [ID_END]] 281 | 282 | 283 | def predict(data_point, generation_config): 284 | """ 推理 """ 285 | prompt_dict = generate_prompt(data_point) 286 | # inputs = tokenizer([text_1], return_tensors="pt", padding=True) 287 | input_ids = prompt_dict.get("input_ids") 288 | input_ids = torch.tensor([input_ids], dtype=torch.long) 289 | if USE_CUDA: 290 | input_ids = input_ids.cuda() 291 | generation_config = GenerationConfig(**generation_config) 292 | with torch.no_grad(): 293 | generation_output = model.generate( 294 | input_ids=input_ids, 295 | generation_config=generation_config, 296 | # stop_words_ids=STOP_WORDS_IDS, 297 | return_dict_in_generate=True, 298 | output_scores=True, 299 | ) 300 | s = generation_output.sequences[0] 301 | output = tokenizer.decode(s) 302 | print(input_ids) 303 | print(s) 304 | print(output) 305 | # output = output.strip() 306 | return output 307 | def text_generate(request_data): 308 | instruction = request_data.instruction 309 | text = request_data.text 310 | penalty_alpha = 
request_data.penalty_alpha 311 | max_new_tokens = request_data.max_new_tokens 312 | temperature = request_data.temperature 313 | do_sample = request_data.do_sample 314 | num_beams = request_data.num_beams 315 | top_p = request_data.top_p 316 | top_k = request_data.top_k 317 | 318 | generation_dict = vars(request_data) 319 | print(generation_dict) 320 | generation_dict.pop("max_new_tokens") 321 | generation_dict.pop("instruction") 322 | generation_dict.pop("text") 323 | data_point = {"instruction": instruction, "input": text, "output": ""} 324 | generation_config = {"temperature": temperature, 325 | "top_p": top_p, 326 | "top_k": top_k, 327 | "num_beams": num_beams, 328 | "do_sample": do_sample, 329 | "penalty_alpha": penalty_alpha, 330 | "max_new_tokens": max_new_tokens, 331 | "pad_token_id": ID_PAD, 332 | "eos_token_id": ID_EOS, 333 | } 334 | try: # 数据预处理, 模型预测 335 | response = predict(data_point, generation_config) 336 | except Exception as e: 337 | logger.info(traceback.print_exc()) 338 | response = "<|endoftext|>" 339 | return response 340 | class Item(BaseModel): 341 | instruction: str = "" 342 | text: str = "1+1=" 343 | penalty_alpha: float = 1.0 344 | max_new_tokens: int = 512 345 | temperature: float = 0.8 # 0.95 # 0.35 # 0.95 346 | do_sample: bool = True 347 | num_beams: int = 1 348 | top_p: float = 0.8 # 0.75 349 | top_k: int = 50 350 | 351 | 352 | if __name__ == '__main__': 353 | 354 | text = "1+1=" 355 | item_config = Item() 356 | item_config.text = text 357 | response = text_generate(item_config) 358 | print(response) 359 | 360 | smooth = SmoothingFunction().method1 361 | rouge = Rouge() 362 | best_bleu = 0. 363 | 364 | fw = open(DATA_PATH + ".qwen_eval_rouge_blue.512.json", "a+", encoding="utf-8") 365 | rouge_1_p, rouge_2_p, rouge_l_p = 0, 0, 0 366 | rouge_1_r, rouge_2_r, rouge_l_r = 0, 0, 0 367 | rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0 368 | total = 0 369 | time_start = time.time() 370 | datas = load_json(DATA_PATH) 371 | # datas = datas[:1024] 372 | datas = datas[:8] 373 | for d_json in tqdm(datas, desc="data"): 374 | try: 375 | instruction = d_json.get("instruction", "") 376 | text_input = d_json.get("input", "") 377 | text_output = d_json.get("output", "") 378 | # qtext, qans 379 | total += 1 380 | item_config = Item() 381 | item_config.instruction = instruction 382 | item_config.text = text_input 383 | text_output = " ".join(jieba.lcut(text_output)) 384 | text_gen_str = text_generate(item_config) 385 | text_pred = " ".join(jieba.lcut(text_gen_str.replace("<|endoftext|>", "").replace("<|im_end|>", "") 386 | .replace("<|im_start|>", "").strip().lower())) 387 | line = {"input": text_input, "output": text_output.replace(" ", ""), 388 | "pred": text_pred.replace(" ", "")} 389 | line_str = json.dumps(line, ensure_ascii=False) + "\n" 390 | fw.write(line_str) 391 | if text_pred.strip(): 392 | scores = rouge.get_scores(hyps=text_pred, refs=text_output) 393 | rouge_1 += scores[0]['rouge-1']['f'] 394 | rouge_2 += scores[0]['rouge-2']['f'] 395 | rouge_l += scores[0]['rouge-l']['f'] 396 | 397 | rouge_1_p += scores[0]['rouge-1']['p'] 398 | rouge_2_p += scores[0]['rouge-2']['p'] 399 | rouge_l_p += scores[0]['rouge-l']['p'] 400 | rouge_1_r += scores[0]['rouge-1']['r'] 401 | rouge_2_r += scores[0]['rouge-2']['r'] 402 | rouge_l_r += scores[0]['rouge-l']['r'] 403 | 404 | bleu += sentence_bleu(references=[list(text_output.replace(" ", ""))], 405 | hypothesis=list(text_pred.replace(" ", "")), 406 | smoothing_function=smooth) 407 | mertics_i = {'rouge-1': rouge_1, 'rouge-2': rouge_2, 
'rouge-l': rouge_l, 408 | 'bleu': bleu, 409 | 'rouge-1_p': rouge_1_p, 'rouge-2_p': rouge_2_p, 'rouge-l_p': rouge_l_p, 410 | 'rouge-1_r': rouge_1_r, 'rouge-2_r': rouge_2_r, 'rouge-l_r': rouge_l_r, } 411 | if total < 5: 412 | print(text_output.replace(" ", "")) 413 | print(text_pred.replace(" ", "")) 414 | print(mertics_i) 415 | except Exception as e: 416 | print(traceback.print_exc()) 417 | continue 418 | time_end = time.time() 419 | lost_time = time_end - time_start 420 | lost_time_avg = lost_time / total 421 | rouge_1, rouge_2, rouge_l, bleu = rouge_1 / total, rouge_2 / total, rouge_l / total, bleu / total 422 | rouge_1_p, rouge_2_p, rouge_l_p = rouge_1_p / total, rouge_2_p / total, rouge_l_p / total 423 | rouge_1_r, rouge_2_r, rouge_l_r = rouge_1_r / total, rouge_2_r / total, rouge_l_r / total 424 | 425 | mertics = {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu, 426 | "lost_time": lost_time, "lost_time_avg": lost_time_avg, 427 | 'rouge-1_p': rouge_1_p, 'rouge-2_p': rouge_2_p, 'rouge-l_p': rouge_l_p, 428 | 'rouge-1_r': rouge_1_r, 'rouge-2_r': rouge_2_r, 'rouge-l_r': rouge_l_r, 429 | } 430 | mertics = {k: round(v, 4) for k, v in mertics.items()} 431 | print(mertics, lost_time, lost_time_avg) 432 | fw.write(json.dumps(mertics, ensure_ascii=False) + "\n") 433 | fw.close() 434 | 435 | 436 | """ 437 | # nohup python evaluation.py > tc.evaluation.py.log 2>&1 & 438 | # tail -n 1000 -f tc.evaluation.py.log 439 | # |myz| 440 | """ 441 | 442 | -------------------------------------------------------------------------------- /qwen2_sft/ft_qwen2/post_api.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/3/25 21:56 4 | # @author : Mo 5 | # @function: fastapi-post接口 6 | 7 | 8 | import traceback 9 | import logging 10 | import random 11 | import time 12 | import json 13 | import sys 14 | import os 15 | 16 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 17 | print(path_root) 18 | sys.path.append(path_root) 19 | from qwen2_sft.ft_qwen2.config import CUDA_VISIBLE_DEVICES, USE_TORCH, CPU_NUMS # from config 20 | os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:3072" 21 | os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES 22 | os.environ["USE_TORCH"] = USE_TORCH 23 | os.environ["OMP_NUM_THREADS"] = CPU_NUMS # export OMP_NUM_THREADS=1 24 | os.environ["OPENBLAS_NUM_THREADS"] = CPU_NUMS # export OPENBLAS_NUM_THREADS=1 25 | os.environ["MKL_NUM_THREADS"] = CPU_NUMS # export MKL_NUM_THREADS=1 26 | os.environ["VECLIB_MAXIMUM_THREADS"] = CPU_NUMS # export VECLIB_MAXIMUM_THREADS=1 27 | os.environ["NUMEXPR_NUM_THREADS"] = CPU_NUMS # export NUMEXPR_NUM_THREADS=1 28 | 29 | # from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 30 | from peft import prepare_model_for_int8_training 31 | from peft import LoraConfig, get_peft_model 32 | from transformers import GenerationConfig 33 | from pydantic import BaseModel 34 | from rouge import Rouge # pip install rouge 35 | from tqdm import tqdm 36 | import torch 37 | 38 | from pydantic import BaseModel 39 | from fastapi import FastAPI 40 | import time 41 | 42 | 43 | # from qwen2_sft.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as LLMTokenizer 44 | # from qwen2_sft.models.qwen2.configuration_qwen2 import Qwen2Config as LLMConfig 45 | # from qwen2_sft.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM as LLMModel 46 | from transformers import Qwen2Tokenizer as LLMTokenizer 47 | 
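# Note: the commented-out imports above pull the vendored modeling/tokenization files from
# qwen2_sft/models/qwen2, while the active `from transformers import ...` lines rely on the
# Qwen2 classes bundled with transformers itself (present in recent releases, roughly 4.37+,
# as far as I know); either choice exposes the same Tokenizer / CausalLM / Config interface.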
from transformers import Qwen2ForCausalLM as LLMModel 48 | from transformers import Qwen2Config as LLMConfig 49 | from qwen2_sft.ft_qwen2.config import PATH_MODEL_PRETRAIN, DATA_PATH, MODEL_SAVE_DIR, REPO_ID 50 | from qwen2_sft.ft_qwen2.config import MICRO_BATCH_SIZE, BATCH_SIZE, GRADIENT_ACCUMULATION_STEPS 51 | from qwen2_sft.ft_qwen2.config import LEARNING_RATE, EPOCHS, SAVE_STEPS, VAL_SET_SIZE, TARGET_MODULES 52 | from qwen2_sft.ft_qwen2.config import IS_PARALLELIZABLE, MODEL_PARALLEL, USE_CACHE 53 | from qwen2_sft.ft_qwen2.config import MAX_LENGTH_Q, MAX_LENGTH_A, MAX_LENGTH_QA 54 | from qwen2_sft.ft_qwen2.config import LORA_DROPOUT, LORA_ALPHA, LORA_R 55 | from qwen2_sft.ft_qwen2.config import USE_CUDA 56 | 57 | 58 | app = FastAPI() # 日志文件名,为启动时的日期, 全局日志格式 59 | logger_level = logging.INFO 60 | logging.basicConfig(format="%(asctime)s - %(filename)s[line:%(lineno)d] " 61 | "- %(levelname)s: %(message)s", 62 | level=logger_level) 63 | logger = logging.getLogger("ft-llama") 64 | console = logging.StreamHandler() 65 | console.setLevel(logger_level) 66 | logger.addHandler(console) 67 | 68 | 69 | # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 70 | # world_size = int(os.environ.get("WORLD_SIZE", 1)) 71 | # ddp = world_size != 1 72 | # device_map = "auto" 73 | # # USE_CUDA = True 74 | # print(device_map) 75 | # print(ddp) 76 | 77 | 78 | def load_model_state(model, model_save_dir="./", model_name="adapter_model.safetensors", device="cpu"): 79 | """ 仅加载模型参数(推荐使用) """ 80 | try: 81 | path_model = os.path.join(model_save_dir, model_name) 82 | peft_config = LoraConfig.from_pretrained(model_save_dir) 83 | peft_config.inference_mode = True 84 | model = get_peft_model(model, peft_config) 85 | 86 | try: 87 | if path_model.endswith(".safetensors"): 88 | from safetensors.torch import load_file, save_file 89 | from safetensors import safe_open 90 | state_dict = {} 91 | with safe_open(path_model, framework="pt", device="cpu") as f: 92 | for k in f.keys(): 93 | state_dict[k] = f.get_tensor(k) 94 | ### if path_model.endswith(".bin") or path_model.endswith(".pt"): 95 | else: 96 | state_dict = torch.load(path_model, map_location=torch.device(device)) 97 | except Exception as e: 98 | print(traceback.print_exc()) 99 | ### 全部训练完的话会用这个, 即便是.safetensors 100 | state_dict = torch.load(path_model, map_location=torch.device(device)) 101 | 102 | # print(state_dict.keys()) 103 | state_dict = {"base_model.model." 
+ k.replace("_orig_mod.", "") 104 | .replace(".lora_A.weight", ".lora_A.default.weight") 105 | .replace(".lora_B.weight", ".lora_B.default.weight") 106 | : v for k, v in state_dict.items()} 107 | print(state_dict.keys()) 108 | print("#" * 128) 109 | ### 排查不存在model.keys的 state_dict.key 110 | name_dict = {name: 0 for name, param in model.named_parameters()} 111 | print(name_dict.keys()) 112 | print("#" * 128) 113 | for state_dict_key in state_dict.keys(): 114 | if state_dict_key not in name_dict: 115 | print("{} is not exist!".format(state_dict_key)) 116 | model.load_state_dict(state_dict, strict=False) 117 | # model.to(device) 118 | print("******model loaded success******") 119 | print("self.device: {}".format(device)) 120 | except Exception as e: 121 | print(str(e)) 122 | raise Exception("******load model error******") 123 | return model 124 | def save_model_state(model, config=None, model_save_dir="./", model_name="pytorch_model.bin"): 125 | """ 仅保存模型参数(推荐使用) """ 126 | if not os.path.exists(model_save_dir): 127 | os.makedirs(model_save_dir) 128 | if config: 129 | config.save_pretrained(model_save_dir) 130 | # save model 131 | path_model = os.path.join(model_save_dir, model_name) 132 | # torch.save(model.state_dict(), path_model) 133 | grad_params_dict = {k: v.to("cpu") for k, v in model.named_parameters() 134 | if v.requires_grad == True} 135 | torch.save(grad_params_dict, path_model) 136 | print("******model_save_path is {}******".format(path_model)) 137 | def prepare_model_for_half_training(model, output_embedding_layer_name="lm_head", 138 | use_gradient_checkpointing=True, layer_norm_names=["layer_norm"]): 139 | r""" 140 | This method wrapps the entire protocol for preparing a model before running a training. This includes: 141 | 1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm 142 | head to fp32 143 | 144 | Args: 145 | model, (`transformers.PreTrainedModel`): 146 | The loaded model from `transformers` 147 | """ 148 | # 不要使用 model.half(), 这样会先截取精度再训练了, 最初data就要保持half 149 | for name, param in model.named_parameters(): 150 | # freeze base model's layers 151 | param.requires_grad = False 152 | # cast layer norm in fp32 for stability for 8bit models 153 | if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names): 154 | param.data = param.data.to(torch.float32) 155 | elif output_embedding_layer_name in name: # lm_head也需要是tf.float32(最后一层) 156 | param.data = param.data.to(torch.float32) 157 | else: 158 | param.data = param.data.to(torch.half) 159 | 160 | if use_gradient_checkpointing: 161 | # For backward compatibility 162 | if hasattr(model, "enable_input_require_grads"): 163 | model.enable_input_require_grads() 164 | else: 165 | def make_inputs_require_grad(module, input, output): 166 | output.requires_grad_(True) 167 | model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) 168 | # enable gradient checkpointing for memory efficiency 169 | model.gradient_checkpointing_enable() 170 | return model 171 | def print_named_parameters(model, use_print_data=False): 172 | """ 打印模型训练参数/数据类型信息 """ 173 | trainable_params = 0 174 | all_param = 0 175 | for name, param in model.named_parameters(): 176 | if use_print_data: 177 | print((name, param.data.dtype, param.requires_grad, param.data)) 178 | else: 179 | print((name, param.data.dtype, param.requires_grad)) 180 | num_params = param.numel() 181 | # if using DS Zero 3 and the weights are initialized empty 182 | if num_params == 0 and 
hasattr(param, "ds_numel"): 183 | num_params = param.ds_numel 184 | all_param += num_params 185 | if param.requires_grad: 186 | trainable_params += num_params 187 | print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}") 188 | def txt_read(path, encode_type="utf-8", errors=None): 189 | """ 190 | 读取txt文件,默认utf8格式, 不能有空行 191 | Args: 192 | path[String]: path of file of read, eg. "corpus/xuexiqiangguo.txt" 193 | encode_type[String]: data encode type of file, eg. "utf-8", "gbk" 194 | errors[String]: specifies how encoding errors handled, eg. "ignore", strict 195 | Returns: 196 | lines[List]: output lines 197 | """ 198 | lines = [] 199 | try: 200 | file = open(path, "r", encoding=encode_type, errors=errors) 201 | lines = file.readlines() 202 | file.close() 203 | except Exception as e: 204 | logger.info(str(e)) 205 | finally: 206 | return lines 207 | def load_json(path: str, encoding: str="utf-8"): 208 | """ 209 | Read Line of List form file 210 | Args: 211 | path: path of save file, such as "txt" 212 | encoding: type of encoding, such as "utf-8", "gbk" 213 | Returns: 214 | model_json: dict of word2vec, eg. [{"大漠帝国":132}] 215 | """ 216 | with open(path, "r", encoding=encoding) as fj: 217 | model_json = json.load(fj) 218 | fj.close() 219 | return model_json 220 | def generate_prompt(data_point, is_logger=False): 221 | """ 指令微调: 222 | """ 223 | 224 | text_input = data_point.get("instruction", "") + "\t" + data_point.get("input", "") 225 | text_out = data_point.get("output", "") 226 | prompt_text_1 = """<|im_start|>system 227 | You are a helpful assistant.<|im_end|> 228 | <|im_start|>user 229 | {}<|im_end|> 230 | <|im_start|>assistant""" 231 | tokens = tokenizer.tokenize(prompt_text_1) 232 | max_len = MAX_LENGTH_QA - len(tokens) 233 | text_1 = prompt_text_1.format(text_input[:max_len]) 234 | x = tokenizer.encode(text_1, add_special_tokens=False) 235 | if len(x) > (MAX_LENGTH_Q + MAX_LENGTH_A): 236 | x = x[:MAX_LENGTH_Q] + [ID_EOS] 237 | out = {"input_ids": x, "labels": []} 238 | if is_logger: 239 | print(text_1) 240 | print(out) 241 | return out 242 | 243 | 244 | model = LLMModel.from_pretrained(PATH_MODEL_PRETRAIN) 245 | # model = prepare_model_for_half_training(model, 246 | # use_gradient_checkpointing=False, 247 | # output_embedding_layer_name="lm_head", 248 | # layer_norm_names=["post_attention_layernorm", 249 | # "input_layernorm", 250 | # "norm" 251 | # ], 252 | # ) 253 | config = LoraConfig(target_modules=TARGET_MODULES, 254 | lora_dropout=LORA_DROPOUT, 255 | lora_alpha=LORA_ALPHA, 256 | task_type="CAUSAL_LM", 257 | bias="none", 258 | r=LORA_R, 259 | ) 260 | model = get_peft_model(model, config) 261 | model = load_model_state(model=model, model_save_dir=MODEL_SAVE_DIR) 262 | if USE_CUDA: 263 | model = model.half().cuda() 264 | else: 265 | model = model.bfloat16() 266 | print_named_parameters(model, use_print_data=True) 267 | # print_named_parameters(model) 268 | 269 | tokenizer = LLMTokenizer.from_pretrained(PATH_MODEL_PRETRAIN, add_eos_token=True) 270 | ID_END = 151643 271 | ID_BOS = 151644 272 | ID_EOS = 151645 273 | ID_PAD = ID_EOS 274 | ID_BR = 1699 # "\\n" 275 | ID_USER = 872 276 | ID_MODEL = 8948 277 | tokenizer.pad_token_id = ID_EOS 278 | tokenizer.eos_token_id = ID_EOS 279 | tokenizer.padding_side = "left" 280 | # tokenizer.padding_side = "right" # NO use attention-mask 281 | print(ID_PAD) 282 | print(ID_BOS) 283 | print(ID_EOS) 284 | print(ID_BR) 285 | print(ID_USER) 286 | print(ID_MODEL) 287 | """ 288 | 
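(added reference note: the special-token ids listed below can be re-derived from the tokenizer
loaded above, e.g. tokenizer.convert_tokens_to_ids("<|im_end|>") is expected to return 151645
for the Qwen2 vocabulary)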
"<|endoftext|>": 151643, 289 | "<|im_start|>": 151644, 290 | "<|im_end|>": 151645 291 | "system": 8948, 292 | "user": 872, 293 | "\\n": 1699, 294 | """ 295 | STOP_WORDS_IDS = [[ID_BOS], [ID_EOS], [ID_END]] 296 | 297 | 298 | 299 | def predict(data_point, generation_config): 300 | """ 推理 """ 301 | prompt_dict = generate_prompt(data_point) 302 | # inputs = tokenizer([text_1], return_tensors="pt", padding=True) 303 | input_ids = prompt_dict.get("input_ids") 304 | input_ids = torch.tensor([input_ids], dtype=torch.long) 305 | if USE_CUDA: 306 | input_ids = input_ids.cuda() 307 | generation_config = GenerationConfig(**generation_config) 308 | with torch.no_grad(): 309 | generation_output = model.generate( 310 | input_ids=input_ids, 311 | generation_config=generation_config, 312 | # stop_words_ids=STOP_WORDS_IDS, 313 | return_dict_in_generate=True, 314 | output_scores=True, 315 | ) 316 | s = generation_output.sequences[0] 317 | output = tokenizer.decode(s) 318 | print(input_ids) 319 | print(s) 320 | print(output) 321 | return output 322 | class Item(BaseModel): 323 | instruction: str = "" 324 | text: str = "1+1=" 325 | penalty_alpha: float = 1.0 326 | max_new_tokens: int = 128 327 | temperature: float = 0.8 # 0.95 # 0.35 # 0.95 328 | do_sample: bool = True 329 | num_beams: int = 1 330 | top_p: float = 0.8 # 0.75 331 | top_k: int = 50 332 | 333 | 334 | @app.post("/nlg/text_generate") 335 | def text_generate(request_data: Item): 336 | instruction = request_data.instruction 337 | text = request_data.text 338 | penalty_alpha = request_data.penalty_alpha 339 | max_new_tokens = request_data.max_new_tokens 340 | temperature = request_data.temperature 341 | do_sample = request_data.do_sample 342 | num_beams = request_data.num_beams 343 | top_p = request_data.top_p 344 | top_k = request_data.top_k 345 | 346 | generation_dict = vars(request_data) 347 | print(generation_dict) 348 | generation_dict.pop("max_new_tokens") 349 | generation_dict.pop("instruction") 350 | generation_dict.pop("text") 351 | data_point = {"instruction": instruction, "input": text, "output": ""} 352 | generation_config = {"temperature": temperature, 353 | "top_p": top_p, 354 | "top_k": top_k, 355 | "num_beams": num_beams, 356 | "do_sample": do_sample, 357 | "penalty_alpha": penalty_alpha, 358 | "max_new_tokens": max_new_tokens, 359 | "pad_token_id": ID_PAD, 360 | "eos_token_id": ID_EOS, 361 | } 362 | try: # 数据预处理, 模型预测 363 | response = predict(data_point, generation_config) 364 | except Exception as e: 365 | logger.info(traceback.print_exc()) 366 | response = "<|im_end|>" 367 | return {"response": response} 368 | 369 | 370 | if __name__ == '__main__': 371 | import uvicorn 372 | uvicorn.run(app=app, 373 | host="0.0.0.0", 374 | port=8036, 375 | workers=1) 376 | 377 | 378 | """ 379 | # nohup python post_api.py > tc.post_api.py.log 2>&1 & 380 | # tail -n 1000 -f tc.post_api.py.log 381 | # |myz| 382 | 383 | 可以在浏览器生成界面直接访问: http://localhost:8036/docs 384 | """ 385 | 386 | -------------------------------------------------------------------------------- /qwen2_sft/ft_qwen2/predict.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/3/25 21:56 4 | # @author : Mo 5 | # @function: 推理 6 | 7 | 8 | import random 9 | import time 10 | import sys 11 | import os 12 | import traceback 13 | 14 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 15 | print(path_root) 16 | sys.path.append(path_root) 17 | 18 | path_root = 
os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 19 | print(path_root) 20 | sys.path.append(path_root) 21 | from qwen2_sft.ft_qwen2.config import CUDA_VISIBLE_DEVICES, USE_TORCH, CPU_NUMS # from config 22 | os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:3072" 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" #CUDA_VISIBLE_DEVICES 24 | os.environ["USE_TORCH"] = USE_TORCH 25 | os.environ["OMP_NUM_THREADS"] = CPU_NUMS # export OMP_NUM_THREADS=1 26 | os.environ["OPENBLAS_NUM_THREADS"] = CPU_NUMS # export OPENBLAS_NUM_THREADS=1 27 | os.environ["MKL_NUM_THREADS"] = CPU_NUMS # export MKL_NUM_THREADS=1 28 | os.environ["VECLIB_MAXIMUM_THREADS"] = CPU_NUMS # export VECLIB_MAXIMUM_THREADS=1 29 | os.environ["NUMEXPR_NUM_THREADS"] = CPU_NUMS # export NUMEXPR_NUM_THREADS=1 30 | 31 | from peft import LoraConfig, get_peft_model 32 | from transformers import GenerationConfig 33 | from pydantic import BaseModel 34 | from rouge import Rouge # pip install rouge 35 | from tqdm import tqdm 36 | import torch 37 | import jieba 38 | 39 | # from qwen2_sft.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as LLMTokenizer 40 | # from qwen2_sft.models.qwen2.configuration_qwen2 import Qwen2Config as LLMConfig 41 | # from qwen2_sft.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM as LLMModel 42 | from transformers import Qwen2Tokenizer as LLMTokenizer 43 | from transformers import Qwen2ForCausalLM as LLMModel 44 | from transformers import Qwen2Config as LLMConfig 45 | from qwen2_sft.ft_qwen2.config import PATH_MODEL_PRETRAIN, DATA_PATH, MODEL_SAVE_DIR, REPO_ID 46 | from qwen2_sft.ft_qwen2.config import MICRO_BATCH_SIZE, BATCH_SIZE, GRADIENT_ACCUMULATION_STEPS 47 | from qwen2_sft.ft_qwen2.config import LEARNING_RATE, EPOCHS, SAVE_STEPS, VAL_SET_SIZE, TARGET_MODULES 48 | from qwen2_sft.ft_qwen2.config import IS_PARALLELIZABLE, MODEL_PARALLEL, USE_CACHE 49 | from qwen2_sft.ft_qwen2.config import MAX_LENGTH_Q, MAX_LENGTH_A, MAX_LENGTH_QA 50 | from qwen2_sft.ft_qwen2.config import LORA_DROPOUT, LORA_ALPHA, LORA_R 51 | from qwen2_sft.ft_qwen2.config import USE_CUDA 52 | 53 | 54 | # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 55 | # world_size = int(os.environ.get("WORLD_SIZE", 1)) 56 | # ddp = world_size != 1 57 | # device_map = "auto" 58 | # # USE_CUDA = True 59 | # print(device_map) 60 | # print(ddp) 61 | 62 | def load_model_state(model, model_save_dir="./", model_name="adapter_model.safetensors", device="cpu"): 63 | """ 仅加载模型参数(推荐使用) """ 64 | try: 65 | path_model = os.path.join(model_save_dir, model_name) 66 | peft_config = LoraConfig.from_pretrained(model_save_dir) 67 | peft_config.inference_mode = True 68 | model = get_peft_model(model, peft_config) 69 | 70 | try: 71 | if path_model.endswith(".safetensors"): 72 | from safetensors.torch import load_file, save_file 73 | from safetensors import safe_open 74 | state_dict = {} 75 | with safe_open(path_model, framework="pt", device="cpu") as f: 76 | for k in f.keys(): 77 | state_dict[k] = f.get_tensor(k) 78 | ### if path_model.endswith(".bin") or path_model.endswith(".pt"): 79 | else: 80 | state_dict = torch.load(path_model, map_location=torch.device(device)) 81 | except Exception as e: 82 | print(traceback.print_exc()) 83 | ### 全部训练完的话会用这个, 即便是.safetensors 84 | state_dict = torch.load(path_model, map_location=torch.device(device)) 85 | 86 | # state_dict = torch.load(path_model, map_location=torch.device(device)) 87 | # print(state_dict.keys()) 88 | state_dict = {"base_model.model." 
+ k.replace("_orig_mod.", "") 89 | .replace(".lora_A.weight", ".lora_A.default.weight") 90 | .replace(".lora_B.weight", ".lora_B.default.weight") 91 | : v for k, v in state_dict.items()} 92 | print(state_dict.keys()) 93 | print("#" * 128) 94 | ### 排查不存在model.keys的 state_dict.key 95 | name_dict = {name: 0 for name, param in model.named_parameters()} 96 | print(name_dict.keys()) 97 | print("#" * 128) 98 | for state_dict_key in state_dict.keys(): 99 | if state_dict_key not in name_dict: 100 | print("{} is not exist!".format(state_dict_key)) 101 | model.load_state_dict(state_dict, strict=False) 102 | # model.to(device) 103 | print("******model loaded success******") 104 | print("self.device: {}".format(device)) 105 | except Exception as e: 106 | print(str(e)) 107 | print(traceback.print_exc()) 108 | return model 109 | def save_model_state(model, config=None, model_save_dir="./", model_name="adapter_model.safetensors"): 110 | """ 仅保存模型参数(推荐使用) """ 111 | if not os.path.exists(model_save_dir): 112 | os.makedirs(model_save_dir) 113 | if config: 114 | config.save_pretrained(model_save_dir) 115 | # save model 116 | path_model = os.path.join(model_save_dir, model_name) 117 | # torch.save(model.state_dict(), path_model) 118 | grad_params_dict = {k: v.to("cpu") for k, v in model.named_parameters() 119 | if v.requires_grad == True} 120 | torch.save(grad_params_dict, path_model) 121 | print("******model_save_path is {}******".format(path_model)) 122 | def prepare_model_for_half_training(model, output_embedding_layer_name="lm_head", 123 | use_gradient_checkpointing=True, layer_norm_names=["layer_norm"]): 124 | r""" 125 | This method wrapps the entire protocol for preparing a model before running a training. This includes: 126 | 1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm 127 | head to fp32 128 | 129 | Args: 130 | model, (`transformers.PreTrainedModel`): 131 | The loaded model from `transformers` 132 | """ 133 | # 不要使用 model.half(), 这样会先截取精度再训练了, 最初data就要保持half 134 | for name, param in model.named_parameters(): 135 | # freeze base model's layers 136 | param.requires_grad = False 137 | # cast layer norm in fp32 for stability for 8bit models 138 | if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names): 139 | param.data = param.data.to(torch.float32) 140 | elif output_embedding_layer_name in name: # lm_head也需要是tf.float32(最后一层) 141 | param.data = param.data.to(torch.float32) 142 | else: 143 | param.data = param.data.to(torch.half) 144 | 145 | if use_gradient_checkpointing: 146 | # For backward compatibility 147 | if hasattr(model, "enable_input_require_grads"): 148 | model.enable_input_require_grads() 149 | else: 150 | def make_inputs_require_grad(module, input, output): 151 | output.requires_grad_(True) 152 | model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) 153 | # enable gradient checkpointing for memory efficiency 154 | model.gradient_checkpointing_enable() 155 | return model 156 | def print_named_parameters(model, use_print_data=True): 157 | """ 打印模型训练参数/数据类型信息 """ 158 | trainable_params = 0 159 | all_param = 0 160 | for name, param in model.named_parameters(): 161 | if use_print_data: 162 | print((name, param.data.dtype, param.requires_grad, param.data)) 163 | else: 164 | print((name, param.data.dtype, param.requires_grad)) 165 | num_params = param.numel() 166 | # if using DS Zero 3 and the weights are initialized empty 167 | if num_params == 0 and hasattr(param, "ds_numel"): 
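# Under DeepSpeed ZeRO stage 3 the parameter storage is partitioned across processes, so
# param.numel() can be 0 locally; DeepSpeed exposes the full element count as param.ds_numel.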
168 | num_params = param.ds_numel 169 | all_param += num_params 170 | if param.requires_grad: 171 | trainable_params += num_params 172 | print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}") 173 | def generate_prompt(data_point, is_logger=False): 174 | """ 指令微调: 175 | """ 176 | 177 | text_input = data_point.get("instruction", "") + "\t" + data_point.get("input", "") 178 | text_out = data_point.get("output", "") 179 | prompt_text_1 = """<|im_start|>system 180 | You are a helpful assistant.<|im_end|> 181 | <|im_start|>user 182 | {}<|im_end|> 183 | <|im_start|>assistant""" 184 | tokens = tokenizer.tokenize(prompt_text_1) 185 | max_len = MAX_LENGTH_QA - len(tokens) 186 | text_1 = prompt_text_1.format(text_input[:max_len]) 187 | x = tokenizer.encode(text_1, add_special_tokens=False) 188 | if len(x) > (MAX_LENGTH_Q + MAX_LENGTH_A): 189 | x = x[:MAX_LENGTH_Q] + [ID_EOS] 190 | out = {"input_ids": x, "labels": []} 191 | if is_logger: 192 | print(text_1) 193 | print(out) 194 | return out 195 | 196 | 197 | model = LLMModel.from_pretrained(PATH_MODEL_PRETRAIN) 198 | # model = prepare_model_for_half_training(model, 199 | # use_gradient_checkpointing=False, 200 | # output_embedding_layer_name="lm_head", 201 | # layer_norm_names=["post_attention_layernorm", 202 | # "input_layernorm", 203 | # "norm" 204 | # ], 205 | # ) 206 | config = LoraConfig(target_modules=TARGET_MODULES, 207 | lora_dropout=LORA_DROPOUT, 208 | lora_alpha=LORA_ALPHA, 209 | task_type="CAUSAL_LM", 210 | bias="none", 211 | r=LORA_R, 212 | ) 213 | model = get_peft_model(model, config) 214 | model = load_model_state(model=model, model_save_dir=MODEL_SAVE_DIR) 215 | if USE_CUDA: 216 | model = model.half().cuda() 217 | else: 218 | model = model.bfloat16() 219 | print_named_parameters(model, use_print_data=True) 220 | # print_named_parameters(model) 221 | 222 | tokenizer = LLMTokenizer.from_pretrained(PATH_MODEL_PRETRAIN, add_eos_token=True) 223 | ID_END = 151643 224 | ID_BOS = 151644 225 | ID_EOS = 151645 226 | ID_PAD = ID_EOS 227 | ID_BR = 1699 # "\\n" 228 | ID_USER = 872 229 | ID_MODEL = 8948 230 | tokenizer.pad_token_id = ID_EOS 231 | tokenizer.eos_token_id = ID_EOS 232 | tokenizer.padding_side = "left" 233 | # tokenizer.padding_side = "right" # NO use attention-mask 234 | print(ID_PAD) 235 | print(ID_BOS) 236 | print(ID_EOS) 237 | print(ID_BR) 238 | print(ID_USER) 239 | print(ID_MODEL) 240 | """ 241 | "<|endoftext|>": 151643, 242 | "<|im_start|>": 151644, 243 | "<|im_end|>": 151645 244 | "system": 8948, 245 | "user": 872, 246 | "\\n": 1699, 247 | """ 248 | STOP_WORDS_IDS = [[ID_BOS], [ID_EOS], [ID_END]] 249 | 250 | 251 | def predict(data_dict): 252 | """ 推理 """ 253 | prompt_dict = generate_prompt(data_dict) 254 | # inputs = tokenizer([text_1], return_tensors="pt", padding=True) 255 | input_ids = prompt_dict.get("input_ids") 256 | input_ids = torch.tensor([input_ids], dtype=torch.long) 257 | if USE_CUDA: 258 | input_ids = input_ids.cuda() 259 | generation_config = GenerationConfig( 260 | # temperature=0.8, 261 | # top_p=0.8, 262 | temperature=0.95, 263 | top_p=0.75, 264 | top_k=50, 265 | num_beams=1, 266 | do_sample=True, 267 | max_new_tokens=256, 268 | # penalty_alpha=1.5, 269 | pad_token_id=ID_PAD, 270 | eos_token_id=ID_EOS, 271 | ) 272 | with torch.no_grad(): 273 | generation_output = model.generate( 274 | input_ids=input_ids, 275 | generation_config=generation_config, 276 | # stop_words_ids=STOP_WORDS_IDS, 277 | # stop_words_ids=[[ID_EOS]], 278 | 
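# With return_dict_in_generate=True, generate() returns a structured output object whose
# .sequences field holds the prompt plus the newly generated token ids (decoded just below).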
return_dict_in_generate=True, 279 | # return_dict_in_generate=True, 280 | # output_scores=True, 281 | # max_new_tokens=512, 282 | ) 283 | s = generation_output.sequences[0] 284 | output = tokenizer.decode(s) 285 | print(data_dict) 286 | print(input_ids) 287 | print(output) 288 | # output = output.split("答:")[-1] 289 | return output 290 | 291 | 292 | 293 | if __name__ == '__main__': 294 | data_dict = {"instruction": "类型#裙*版型#宽松*版型#显瘦*颜色#黑色*图案#撞色*裙型#直筒裙*裙款式#拼接", 295 | "input": "", 296 | "output": "" 297 | } 298 | res = predict(data_dict) 299 | print(res) 300 | while True: 301 | time_start = time.time() 302 | history = [] 303 | print("请输入:") 304 | ques = input() 305 | print("请稍等...") 306 | try: 307 | if ques.strip().upper() == "CLEAR": 308 | history = [] 309 | print("clear ok") 310 | continue 311 | else: 312 | print("#" * 128) 313 | ques_dict = {"instruction": ques, "input": "", "output": ""} 314 | res = predict(ques_dict) 315 | print(res) 316 | except Exception as e: 317 | print(str(e)) 318 | print(time.time() - time_start) 319 | 320 | """ 321 | python predict.py 322 | 323 | """ -------------------------------------------------------------------------------- /qwen2_sft/ft_qwen2/train.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/3/5 21:04 4 | # @author : Mo 5 | # @function: qwen1.5 6 | 7 | 8 | import traceback 9 | import random 10 | import sys 11 | import os 12 | 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | print(path_root) 15 | sys.path.append(path_root) 16 | from qwen2_sft.ft_qwen2.config import CUDA_VISIBLE_DEVICES, USE_TORCH, CPU_NUMS # from config 17 | os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:3072" 18 | os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES 19 | os.environ["USE_TORCH"] = USE_TORCH 20 | os.environ["OMP_NUM_THREADS"] = CPU_NUMS # export OMP_NUM_THREADS=1 21 | os.environ["OPENBLAS_NUM_THREADS"] = CPU_NUMS # export OPENBLAS_NUM_THREADS=1 22 | os.environ["MKL_NUM_THREADS"] = CPU_NUMS # export MKL_NUM_THREADS=1 23 | os.environ["VECLIB_MAXIMUM_THREADS"] = CPU_NUMS # export VECLIB_MAXIMUM_THREADS=1 24 | os.environ["NUMEXPR_NUM_THREADS"] = CPU_NUMS # export NUMEXPR_NUM_THREADS=1 25 | 26 | from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES 27 | from transformers.modeling_utils import unwrap_model 28 | from peft import LoraConfig, get_peft_model 29 | from transformers import GenerationConfig 30 | from tensorboardX import SummaryWriter 31 | from datasets import load_dataset 32 | from pydantic import BaseModel 33 | from rouge import Rouge # pip install rouge 34 | from tqdm import tqdm 35 | import transformers 36 | import torch 37 | 38 | # from qwen2_sft.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as LLMTokenizer 39 | # from qwen2_sft.models.qwen2.configuration_qwen2 import Qwen2Config as LLMConfig 40 | # from qwen2_sft.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM as LLMModel 41 | from transformers import Qwen2Tokenizer as LLMTokenizer 42 | from transformers import Qwen2ForCausalLM as LLMModel 43 | from transformers import Qwen2Config as LLMConfig 44 | from qwen2_sft.ft_qwen2.config import PATH_MODEL_PRETRAIN, DATA_PATH, MODEL_SAVE_DIR, REPO_ID 45 | from qwen2_sft.ft_qwen2.config import MICRO_BATCH_SIZE, BATCH_SIZE, GRADIENT_ACCUMULATION_STEPS 46 | from qwen2_sft.ft_qwen2.config import LEARNING_RATE, EPOCHS, SAVE_STEPS, VAL_SET_SIZE, TARGET_MODULES 47 | from 
qwen2_sft.ft_qwen2.config import IS_PARALLELIZABLE, MODEL_PARALLEL, USE_CACHE 48 | from qwen2_sft.ft_qwen2.config import MAX_LENGTH_Q, MAX_LENGTH_A, MAX_LENGTH_QA 49 | from qwen2_sft.ft_qwen2.config import LORA_DROPOUT, LORA_ALPHA, LORA_R 50 | from qwen2_sft.ft_qwen2.config import USE_CUDA, WEIGHT_DECAY 51 | from qwen2_sft.ft_qwen2.config import USE_ALL_LOSS 52 | 53 | 54 | tensorboardx_witer = SummaryWriter(logdir=MODEL_SAVE_DIR) 55 | # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 56 | use_all_loss = USE_ALL_LOSS or True 57 | # world_size = int(os.environ.get("WORLD_SIZE", 1)) 58 | # device_map = "auto" 59 | # ddp = world_size != 1 60 | # print(device_map) 61 | # print(ddp) 62 | 63 | 64 | def save_model_state(model, config=None, model_save_dir="./", model_name="adapter_model.safetensors"): 65 | """ 仅保存 有梯度 的 模型参数(推荐使用) """ 66 | if not os.path.exists(model_save_dir): 67 | os.makedirs(model_save_dir) 68 | # save config 69 | if config: 70 | config.save_pretrained(model_save_dir) 71 | # config.to_dict() 72 | # save model 73 | path_model = os.path.join(model_save_dir, model_name) 74 | grad_params_dict = {k: v.to("cpu") for k, v in model.named_parameters() 75 | if v.requires_grad == True} 76 | torch.save(grad_params_dict, path_model) 77 | print("******model_save_path is {}******".format(path_model)) 78 | def print_named_parameters(model, use_print_data=False): 79 | """ 打印模型训练参数/数据类型信息 """ 80 | trainable_params = 0 81 | all_param = 0 82 | for name, param in model.named_parameters(): 83 | if use_print_data: 84 | print((name, param.data.dtype, param.requires_grad, param.data)) 85 | else: 86 | print((name, param.data.dtype, param.requires_grad)) 87 | num_params = param.numel() 88 | # if using DS Zero 3 and the weights are initialized empty 89 | if num_params == 0 and hasattr(param, "ds_numel"): 90 | num_params = param.ds_numel 91 | all_param += num_params 92 | if param.requires_grad: 93 | trainable_params += num_params 94 | print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}") 95 | def prepare_model_for_half_training(model, output_embedding_layer_name="lm_head", 96 | use_gradient_checkpointing=True, layer_norm_names=["layer_norm"]): 97 | r""" 98 | This method wrapps the entire protocol for preparing a model before running a training. 
This includes: 99 | 1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm 100 | head to fp32 101 | 102 | Args: 103 | model, (`transformers.PreTrainedModel`): 104 | The loaded model from `transformers` 105 | """ 106 | # 不要使用 model.half(), 这样会先截取精度再训练了, 最初data就要保持half 107 | for name, param in model.named_parameters(): 108 | # freeze base model's layers 109 | param.requires_grad = False 110 | # cast layer norm in fp32 for stability for 8bit models 111 | if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names): 112 | param.data = param.data.to(torch.float32) 113 | elif output_embedding_layer_name in name: # lm_head也需要是tf.float32(最后一层) 114 | param.data = param.data.to(torch.float32) 115 | else: 116 | param.data = param.data.to(torch.half) 117 | 118 | if use_gradient_checkpointing: 119 | # For backward compatibility 120 | if hasattr(model, "enable_input_require_grads"): 121 | model.enable_input_require_grads() 122 | else: 123 | def make_inputs_require_grad(module, input, output): 124 | output.requires_grad_(True) 125 | model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) 126 | # enable gradient checkpointing for memory efficiency 127 | model.gradient_checkpointing_enable() 128 | return model 129 | def generate_prompt(data_point, is_logger=False): 130 | """ 指令微调: 131 | 普通句子续写: bos + text + eos 132 | 带 prompt: 133 | ‘user 134 | Knock knock. 135 | model 136 | Who’s there?model 137 | user 138 | Gemma. 139 | model 140 | Gemma who?model’ 141 | """ 142 | text_input = data_point.get("content", "") 143 | text_out = data_point.get("summary", "") 144 | 145 | prompt_text_1 = """<|im_start|>system 146 | You are a helpful assistant.<|im_end|> 147 | <|im_start|>user 148 | {}<|im_end|>""" 149 | prompt_text_2 = """\n<|im_start|>assistant 150 | {}<|im_end|>""" 151 | text_1 = prompt_text_1.format(text_input.strip()) 152 | text_2 = prompt_text_2.format(text_out.strip()) 153 | 154 | x = tokenizer.encode(text_1, add_special_tokens=False) 155 | y = tokenizer.encode(text_2, add_special_tokens=False) 156 | if len(x) + len(y) > (MAX_LENGTH_Q + MAX_LENGTH_A): 157 | x = x[:MAX_LENGTH_Q] + [ID_EOS] 158 | y = y[:MAX_LENGTH_A] + [ID_EOS] 159 | out = {"input_ids": x, "labels": y} 160 | if is_logger: 161 | print(text_1) 162 | print(text_2) 163 | print(out) 164 | return out 165 | 166 | def data_collator(batch): 167 | # there's probably a way to do this with the tokenizer settings 168 | len_max_batch = [len(batch[i].get("input_ids")) + len(batch[i].get("labels")) 169 | for i in range(len(batch))] 170 | len_max_batch = min(MAX_LENGTH_QA, max(len_max_batch)) 171 | batch_attention_mask = [] 172 | batch_input_ids = [] 173 | batch_labels = [] 174 | for ba in batch: 175 | x, y = ba.get("input_ids"), ba.get("labels") 176 | len_padding = len_max_batch - len(x) - len(y) 177 | if not use_all_loss: ### 部分loss参与计算, output 178 | if tokenizer.padding_side and tokenizer.padding_side == "left": 179 | labels = [-100] * len_padding + [-100] * len(x) + y 180 | input_ids = [ID_PAD] * len_padding + x + y 181 | attention_mask = [0] * len_padding + [1] * (len_max_batch - len_padding) 182 | else: 183 | labels = [-100] * len(x) + y + [-100] * len_padding 184 | input_ids = x + y + [ID_PAD] * len_padding 185 | attention_mask = [1] * (len(x)+len(y)) + [0] * len_padding 186 | else: ### 全部loss参与计算, input + output 187 | if tokenizer.padding_side and tokenizer.padding_side == "left": 188 | labels = [-100] * len_padding + x + y 189 | input_ids = 
[ID_PAD] * len_padding + x + y 190 | attention_mask = [0] * len_padding + [1] * (len_max_batch - len_padding) 191 | else: 192 | labels = x + y + [-100] * len_padding 193 | input_ids = x + y + [ID_PAD] * len_padding 194 | attention_mask = [1] * (len(x)+len(y)) + [0] * len_padding 195 | tensor_attention_mask = torch.tensor(attention_mask, dtype=torch.long) 196 | tensor_input_ids = torch.tensor(input_ids, dtype=torch.long) 197 | tensor_labels = torch.tensor(labels, dtype=torch.long) 198 | batch_attention_mask.append(tensor_attention_mask) 199 | batch_input_ids.append(tensor_input_ids) 200 | batch_labels.append(tensor_labels) 201 | batch_attention_mask = torch.stack(batch_attention_mask) 202 | batch_input_ids = torch.stack(batch_input_ids) 203 | batch_labels = torch.stack(batch_labels) 204 | input_dict = {"attention_mask": batch_attention_mask, # no use 205 | "input_ids": batch_input_ids, 206 | "labels": batch_labels, 207 | } 208 | return input_dict 209 | def dfs_file(path_dir): 210 | """ 211 | 递归获取某个目录下的所有文件(所有层, 包括子目录) 212 | Args: 213 | path_dir[String]:, path of dir, eg. "/home/data" 214 | Returns: 215 | data[List]: data of input, eg. ["2020_01_08.txt"] 216 | """ 217 | path_files = [] 218 | for root, dirs, files in os.walk(path_dir): # 分别代表根目录、文件夹、文件 219 | for file in files: # 遍历文件 220 | file_path = os.path.join(root, file) # 获取文件绝对路径 221 | path_files.append(file_path) # 将文件路径添加进列表 222 | files = list(set(path_files)) 223 | files.sort() # the same list 224 | return files 225 | 226 | 227 | model = LLMModel.from_pretrained(PATH_MODEL_PRETRAIN, torch_dtype=torch.bfloat16) 228 | 229 | # model = prepare_model_for_half_training(model, 230 | # use_gradient_checkpointing=True, 231 | # output_embedding_layer_name="lm_head", 232 | # layer_norm_names=["post_attention_layernorm", 233 | # "input_layernorm", 234 | # "norm" 235 | # ], 236 | # ) 237 | model.gradient_checkpointing_enable() 238 | model.enable_input_require_grads() 239 | model.is_parallelizable = IS_PARALLELIZABLE 240 | model.model_parallel = MODEL_PARALLEL 241 | model.config.use_cache = USE_CACHE 242 | config = LoraConfig(target_modules=TARGET_MODULES, 243 | lora_dropout=LORA_DROPOUT, 244 | lora_alpha=LORA_ALPHA, 245 | task_type="CAUSAL_LM", 246 | bias="none", 247 | r=LORA_R, 248 | ) 249 | model = get_peft_model(model, config) 250 | print_named_parameters(model) 251 | model = model.cuda() 252 | # for param in filter(lambda p: p.requires_grad, model.parameters()): 253 | # param.data = param.data.to(torch.float32) 254 | 255 | for name, param in model.named_parameters(): 256 | if "LoR" in name: # 某些peft版本默认dtype=fp16, 这里全部转为 fp32 257 | param.data = param.data.to(torch.float32) 258 | 259 | print_named_parameters(model) 260 | 261 | tokenizer = LLMTokenizer.from_pretrained(PATH_MODEL_PRETRAIN, add_eos_token=True) 262 | ID_END = 151643 263 | ID_BOS = 151644 264 | ID_EOS = 151645 265 | ID_PAD = ID_EOS 266 | ID_BR = 1699 # "\\n" 267 | ID_USER = 872 268 | ID_MODEL = 8948 269 | tokenizer.pad_token_id = ID_EOS 270 | tokenizer.eos_token_id = ID_EOS 271 | tokenizer.padding_side = "left" 272 | # tokenizer.padding_side = "right" # NO use attention-mask 273 | print(ID_PAD) 274 | print(ID_BOS) 275 | print(ID_EOS) 276 | print(ID_BR) 277 | print(ID_USER) 278 | print(ID_MODEL) 279 | """ 280 | "<|endoftext|>": 151643, 281 | "<|im_start|>": 151644, 282 | "<|im_end|>": 151645 283 | "system": 8948, 284 | "user": 872, 285 | "\\n": 1699, 286 | """ 287 | STOP_WORDS_IDS = [[ID_BOS], [ID_EOS], [ID_END]] 288 | 289 | 290 | 291 | ### 只有一个train的情况 292 | data = 
load_dataset("json", data_files=DATA_PATH) 293 | if VAL_SET_SIZE > 0: 294 | # train_val = data["train"].train_test_split(test_size=min(VAL_SET_SIZE, 295 | # int(len(data["train"])/10000)), shuffle=True, seed=42) 296 | VAL_SET_SIZE = max(min(VAL_SET_SIZE, int(len(data["train"])/10000)), 1) 297 | generate_prompt(data["train"][0], is_logger=True) 298 | train_val = data["train"].train_test_split(test_size=VAL_SET_SIZE, shuffle=True, seed=42) 299 | train_data = train_val["train"].shuffle().map(generate_prompt) 300 | val_data = train_val["test"].shuffle().map(generate_prompt) 301 | else: 302 | generate_prompt(data["train"][0], is_logger=True) 303 | train_data = data["train"].shuffle().map(generate_prompt) 304 | val_data = None 305 | 306 | 307 | class CustomTrainer(transformers.Trainer): 308 | def compute_loss(self, model, inputs, return_outputs=False): 309 | """ newest loss """ 310 | inputs = {k: v.cuda() for k, v in inputs.items()} 311 | outputs = model(**inputs) # if contain labels, will calculate loss 312 | 313 | try: 314 | logs = {} 315 | tr_loss_scalar = self._nested_gather(outputs.loss.detach()).mean().item() 316 | logs["loss"] = round(tr_loss_scalar, 4) 317 | logs["lr"] = self.lr_scheduler.get_last_lr()[0] 318 | step = self.state.global_step 319 | for k, v in logs.items(): 320 | tensorboardx_witer.add_scalar(k, v, step) 321 | self.log(logs) 322 | except Exception as e: 323 | print(traceback.print_exc()) 324 | print(logs) 325 | 326 | if self.label_smoother is not None and "labels" in inputs: 327 | labels = inputs.pop("labels") 328 | else: 329 | labels = None 330 | # Save past state if it exists 331 | # TODO: this needs to be fixed and made cleaner later. 332 | if self.args.past_index >= 0: 333 | self._past = outputs[self.args.past_index] 334 | 335 | if labels is not None: 336 | if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): 337 | loss = self.label_smoother(outputs, labels, shift_labels=True) 338 | else: 339 | loss = self.label_smoother(outputs, labels) 340 | else: 341 | if isinstance(outputs, dict) and "loss" not in outputs: 342 | raise ValueError( 343 | "The model did not return a loss from the inputs, only the following keys: " 344 | f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." 345 | ) 346 | # We don't use .loss here since the model may return tuples instead of ModelOutput. 
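# In this setup no label_smoothing_factor is configured in the TrainingArguments below, so
# self.label_smoother is None, `labels` stays None above, and the loss already computed by
# the model from the collator's `labels` tensor is the one picked up here.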
347 | loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] 348 | return (loss, outputs) if return_outputs else loss 349 | 350 | 351 | trainer = CustomTrainer( 352 | # data_collator=transformers.DataCollatorForSeq2Seq( 353 | # tokenizer, pad_to_multiple_of=8, 354 | # return_tensors="pt", padding=True 355 | # ), 356 | data_collator=data_collator, 357 | train_dataset=train_data, 358 | eval_dataset=val_data, 359 | model=model, 360 | args=transformers.TrainingArguments( 361 | weight_decay=WEIGHT_DECAY, 362 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, 363 | per_device_train_batch_size=MICRO_BATCH_SIZE, 364 | learning_rate=LEARNING_RATE, 365 | num_train_epochs=EPOCHS, 366 | max_grad_norm=1.0, 367 | logging_steps=8, 368 | # warmup_steps=382, # 618 369 | # warmup_ratio=0.01, 370 | warmup_steps=1, # 618 371 | evaluation_strategy="no", 372 | lr_scheduler_type="cosine", # "constant", #'constant', # "cosine", 373 | logging_first_step=True, 374 | # evaluation_strategy="steps" if VAL_SET_SIZE > 0 else "no", 375 | # eval_steps=SAVE_STEPS if VAL_SET_SIZE > 0 else None, 376 | save_strategy="steps", 377 | save_total_limit=12, 378 | save_steps=SAVE_STEPS, 379 | # load_best_model_at_end=True if VAL_SET_SIZE > 0 else False, 380 | ddp_find_unused_parameters=None, 381 | gradient_checkpointing=True, 382 | # group_by_length=True, # group together samples of roughly the same length in training 383 | output_dir=MODEL_SAVE_DIR, 384 | report_to=[], # ["tensorboard"], # [], ["wandb"] 385 | optim="adamw_torch", # "adamw_hf", 386 | # optim="adafactor", 387 | # fp16=True, 388 | ) 389 | ) 390 | 391 | if torch.__version__ >= "2" and sys.platform != "win32": 392 | model = torch.compile(model) 393 | 394 | 395 | ## 加载训练好的权重 396 | # files = dfs_file(MODEL_SAVE_DIR) 397 | # files_name_str = str(files) 398 | # flag_checkpoint = True if files and "checkpoint" in files_name_str else False 399 | flag_checkpoint = False 400 | trainer.train(resume_from_checkpoint=flag_checkpoint) 401 | save_model_state(model=model, config=config, model_save_dir=MODEL_SAVE_DIR) 402 | print_named_parameters(model, use_print_data=True) # 查看LoRA层权重是不是为NAN溢出 403 | 404 | 405 | # nohup python train.py > tc.train.py.log 2>&1 & 406 | # tail -n 1000 -f tc.train.py.log 407 | # |myz| 408 | 409 | -------------------------------------------------------------------------------- /qwen2_sft/image_aadvgen_loss_success_7b_5e5_bfloat16_pt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/qwen2-sft/91d4dd876d31f65911fe46216298c590233b67b0/qwen2_sft/image_aadvgen_loss_success_7b_5e5_bfloat16_pt.png -------------------------------------------------------------------------------- /qwen2_sft/image_advgen_dev_success_7b_2e5_cosine_bs2_fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/qwen2-sft/91d4dd876d31f65911fe46216298c590233b67b0/qwen2_sft/image_advgen_dev_success_7b_2e5_cosine_bs2_fp16.png -------------------------------------------------------------------------------- /qwen2_sft/image_advgen_loss_nan_7b_3e4_constranr_bs4_fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/qwen2-sft/91d4dd876d31f65911fe46216298c590233b67b0/qwen2_sft/image_advgen_loss_nan_7b_3e4_constranr_bs4_fp16.png -------------------------------------------------------------------------------- /qwen2_sft/models/__init__.py: 
-------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/8/4 15:21 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /qwen2_sft/models/qwen2/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2024/3/7 15:44 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /qwen2_sft/models/qwen2/cache_utils.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, Dict, List, Optional, Tuple 3 | 4 | import torch 5 | 6 | from transformers.configuration_utils import PretrainedConfig 7 | 8 | 9 | @dataclass 10 | class Cache: 11 | """ 12 | Base, abstract class for all caches. The actual data structure is specific to each subclass. 13 | """ 14 | 15 | def update( 16 | self, 17 | key_states: torch.Tensor, 18 | value_states: torch.Tensor, 19 | layer_idx: int, 20 | cache_kwargs: Optional[Dict[str, Any]] = None, 21 | ) -> Tuple[torch.Tensor, torch.Tensor]: 22 | """ 23 | Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. 24 | 25 | Parameters: 26 | key_states (`torch.Tensor`): 27 | The new key states to cache. 28 | value_states (`torch.Tensor`): 29 | The new value states to cache. 30 | layer_idx (`int`): 31 | The index of the layer to cache the states for. 32 | cache_kwargs (`Dict[str, Any]`, `optional`): 33 | Additional arguments for the cache subclass. These are specific to each subclass and allow new types of 34 | cache to be created. 35 | 36 | Return: 37 | A tuple containing the updated key and value states. 38 | """ 39 | raise NotImplementedError("Make sure to implement `update` in a subclass.") 40 | 41 | def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: 42 | """Returns the sequence length of the cached states. A layer index can be optionally passed.""" 43 | raise NotImplementedError("Make sure to implement `get_seq_length` in a subclass.") 44 | 45 | def get_max_length(self) -> Optional[int]: 46 | """Returns the maximum sequence length of the cached states, if there is any.""" 47 | raise NotImplementedError("Make sure to implement `get_max_length` in a subclass.") 48 | 49 | def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int: 50 | """Given the sequence length of the new inputs, returns the usable length of the cache.""" 51 | # Cache without size limit -> all cache is usable 52 | # Cache with size limit -> if the length cache plus the length of the new inputs is larger the maximum cache 53 | # length, we will need to evict part of the cache (and thus not all cache is usable) 54 | max_length = self.get_max_length() 55 | previous_seq_length = self.get_seq_length(layer_idx) 56 | if max_length is not None and previous_seq_length + new_seq_length > max_length: 57 | return max_length - new_seq_length 58 | return previous_seq_length 59 | 60 | 61 | class DynamicCache(Cache): 62 | """ 63 | A cache that grows dynamically as more tokens are generated. This is the default for generative models. 64 | 65 | It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is 66 | `[batch_size, num_heads, seq_len, head_dim]`. 
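    Example (an illustrative sketch added for clarity; the tensor sizes are arbitrary):

        import torch
        cache = DynamicCache()
        k = torch.randn(1, 2, 4, 8)  # [batch_size, num_heads, seq_len, head_dim]
        v = torch.randn(1, 2, 4, 8)
        cache.update(k, v, layer_idx=0)   # first call for layer 0 just stores the states
        cache.get_seq_length(0)           # -> 4
        cache.update(torch.randn(1, 2, 1, 8), torch.randn(1, 2, 1, 8), layer_idx=0)
        cache.get_seq_length(0)           # -> 5, new states are concatenated along the seq dim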
67 | """ 68 | 69 | def __init__(self) -> None: 70 | self.key_cache: List[torch.Tensor] = [] 71 | self.value_cache: List[torch.Tensor] = [] 72 | self.seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen 73 | 74 | def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]: 75 | """ 76 | Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the 77 | sequence length. 78 | """ 79 | if layer_idx < len(self): 80 | return (self.key_cache[layer_idx], self.value_cache[layer_idx]) 81 | else: 82 | raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") 83 | 84 | def __iter__(self): 85 | """ 86 | Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over 87 | keys and values 88 | """ 89 | for layer_idx in range(len(self)): 90 | yield (self.key_cache[layer_idx], self.value_cache[layer_idx]) 91 | 92 | def __len__(self): 93 | """ 94 | Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds 95 | to the number of layers in the model. 96 | """ 97 | return len(self.key_cache) 98 | 99 | def update( 100 | self, 101 | key_states: torch.Tensor, 102 | value_states: torch.Tensor, 103 | layer_idx: int, 104 | cache_kwargs: Optional[Dict[str, Any]] = None, 105 | ) -> Tuple[torch.Tensor, torch.Tensor]: 106 | """ 107 | Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. 108 | 109 | Parameters: 110 | key_states (`torch.Tensor`): 111 | The new key states to cache. 112 | value_states (`torch.Tensor`): 113 | The new value states to cache. 114 | layer_idx (`int`): 115 | The index of the layer to cache the states for. 116 | cache_kwargs (`Dict[str, Any]`, `optional`): 117 | Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`. 118 | 119 | Return: 120 | A tuple containing the updated key and value states. 121 | """ 122 | # Update the number of seen tokens 123 | if layer_idx == 0: 124 | self.seen_tokens += key_states.shape[-2] 125 | 126 | # Update the cache 127 | if len(self.key_cache) <= layer_idx: 128 | self.key_cache.append(key_states) 129 | self.value_cache.append(value_states) 130 | else: 131 | self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2) 132 | self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2) 133 | 134 | return self.key_cache[layer_idx], self.value_cache[layer_idx] 135 | 136 | def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: 137 | """Returns the sequence length of the cached states. A layer index can be optionally passed.""" 138 | if len(self.key_cache) <= layer_idx: 139 | return 0 140 | return self.key_cache[layer_idx].shape[-2] 141 | 142 | def get_max_length(self) -> Optional[int]: 143 | """Returns the maximum sequence length of the cached states. 
DynamicCache does not have a maximum length.""" 144 | return None 145 | 146 | def reorder_cache(self, beam_idx: torch.LongTensor): 147 | """Reorders the cache for beam search, given the selected beam indices.""" 148 | for layer_idx in range(len(self.key_cache)): 149 | device = self.key_cache[layer_idx].device 150 | self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) 151 | device = self.value_cache[layer_idx].device 152 | self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) 153 | 154 | def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: 155 | """Converts the `DynamicCache` instance into the its equivalent in the legacy cache format.""" 156 | legacy_cache = () 157 | for layer_idx in range(len(self)): 158 | legacy_cache += ((self.key_cache[layer_idx], self.value_cache[layer_idx]),) 159 | return legacy_cache 160 | 161 | @classmethod 162 | def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache": 163 | """Converts a cache in the legacy cache format into an equivalent `DynamicCache`.""" 164 | cache = cls() 165 | if past_key_values is not None: 166 | for layer_idx in range(len(past_key_values)): 167 | key_states, value_states = past_key_values[layer_idx] 168 | cache.update(key_states, value_states, layer_idx) 169 | return cache 170 | 171 | 172 | class SinkCache(Cache): 173 | """ 174 | A cache that as described in the [Attention Sinks paper](https://arxiv.org/abs/2309.17453). It allows the model to 175 | generate beyond the length of its context window, without losing fluency in the conversation. As it discards past 176 | tokens, the model will lose the ability to generate tokens that depend on the context that was discarded. 177 | 178 | It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is 179 | `[batch_size, num_heads, seq_len, head_dim]`. 180 | 181 | Parameters: 182 | window_length (`int`): 183 | The length of the context window. 184 | num_sink_tokens (`int`): 185 | The number of sink tokens. See the original paper for more information. 
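For orientation, a hedged usage sketch for the `DynamicCache` defined above, with toy tensor sizes and assuming the `qwen2_sft` package is on the Python path:

```python
import torch
from qwen2_sft.models.qwen2.cache_utils import DynamicCache

cache = DynamicCache()
# [batch_size=1, num_heads=2, seq_len=3, head_dim=4]
k = torch.randn(1, 2, 3, 4)
v = torch.randn(1, 2, 3, 4)

cache.update(k, v, layer_idx=0)      # first call creates the layer entry
cache.update(k, v, layer_idx=0)      # second call concatenates along seq_len
print(cache.get_seq_length(0))       # 6
print(len(cache))                    # 1 layer so far

# Round-trip through the legacy tuple-of-tuples format.
legacy = cache.to_legacy_cache()
restored = DynamicCache.from_legacy_cache(legacy)
print(restored.key_cache[0].shape)   # torch.Size([1, 2, 6, 4])
```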
186 | """ 187 | 188 | def __init__(self, window_length: int, num_sink_tokens: int) -> None: 189 | self.key_cache: List[torch.Tensor] = [] 190 | self.value_cache: List[torch.Tensor] = [] 191 | self.window_length = window_length 192 | self.num_sink_tokens = num_sink_tokens 193 | self.cos_sin_cache = {} 194 | self.seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen 195 | 196 | @staticmethod 197 | def _rotate_half(x): 198 | x1 = x[..., : x.shape[-1] // 2] 199 | x2 = x[..., x.shape[-1] // 2 :] 200 | return torch.cat((-x2, x1), dim=-1) 201 | 202 | def _apply_key_rotary_pos_emb( 203 | self, key_states: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor 204 | ) -> torch.Tensor: 205 | rotated_key_states = (key_states * cos) + (self._rotate_half(key_states) * sin) 206 | return rotated_key_states 207 | 208 | def _get_rerotation_cos_sin( 209 | self, key_states: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor 210 | ) -> Tuple[torch.Tensor, torch.Tensor]: 211 | if key_states.shape[-2] not in self.cos_sin_cache: 212 | # Upcast to float32 temporarily for better accuracy 213 | cos = cos.to(torch.float32) 214 | sin = sin.to(torch.float32) 215 | 216 | # Compute the cos and sin required for back- and forward-rotating to one position earlier in the sequence 217 | original_cos = cos[self.num_sink_tokens + key_states.shape[-2] :] 218 | shifted_cos = cos[self.num_sink_tokens : -key_states.shape[-2]] 219 | original_sin = sin[self.num_sink_tokens + key_states.shape[-2] :] 220 | shifted_sin = sin[self.num_sink_tokens : -key_states.shape[-2]] 221 | rerotation_cos = original_cos * shifted_cos + original_sin * shifted_sin 222 | rerotation_sin = -original_sin * shifted_cos + original_cos * shifted_sin 223 | 224 | self.cos_sin_cache[key_states.shape[-2]] = ( 225 | rerotation_cos.to(key_states.dtype).unsqueeze(0), 226 | rerotation_sin.to(key_states.dtype).unsqueeze(0), 227 | ) 228 | return self.cos_sin_cache[key_states.shape[-2]] 229 | 230 | def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: 231 | """Returns the sequence length of the cached states. A layer index can be optionally passed.""" 232 | # Workaround to make 'key_states.shape[-2] + past_key_value.get_seq_length(self.layer_idx)' <= window_length 233 | if len(self.key_cache) <= layer_idx: 234 | return 0 235 | return self.key_cache[layer_idx].shape[-2] 236 | 237 | def get_max_length(self) -> Optional[int]: 238 | """Returns the maximum sequence length of the cached states.""" 239 | return self.window_length 240 | 241 | def update( 242 | self, 243 | key_states: torch.Tensor, 244 | value_states: torch.Tensor, 245 | layer_idx: int, 246 | cache_kwargs: Optional[Dict[str, Any]] = None, 247 | ) -> Tuple[torch.Tensor, torch.Tensor]: 248 | """ 249 | Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. 250 | 251 | Parameters: 252 | key_states (`torch.Tensor`): 253 | The new key states to cache. 254 | value_states (`torch.Tensor`): 255 | The new value states to cache. 256 | layer_idx (`int`): 257 | The index of the layer to cache the states for. 258 | cache_kwargs (`Dict[str, Any]`, `optional`): 259 | Additional arguments for the cache subclass. The following arguments can be used in `SinkCache`: `sin`, 260 | `cos` and `partial_rotation_size`. These arguments are used with models using RoPE, to recompute the 261 | rotation as the tokens are shifted. 262 | 263 | Return: 264 | A tuple containing the updated key and value states. 
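A hedged sketch of how the `SinkCache` above caps its length at `window_length` during decoding. No RoPE re-rotation is exercised here, so `cache_kwargs` carries no `sin`/`cos`; a real decoder layer would pass them so that the shifted keys can be re-rotated.

```python
import torch
from qwen2_sft.models.qwen2.cache_utils import SinkCache


def kv(n):
    # [batch_size=1, num_heads=2, seq_len=n, head_dim=8]
    return torch.randn(1, 2, n, 8)


cache = SinkCache(window_length=4, num_sink_tokens=1)
cache.update(kv(3), kv(3), layer_idx=0, cache_kwargs={})   # "prompt" of 3 tokens

for _ in range(3):                                          # decode 3 more tokens
    cache.update(kv(1), kv(1), layer_idx=0, cache_kwargs={})
    print(cache.get_seq_length(0))   # stays capped at window_length: 4, 4, 4
```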
265 | """ 266 | # Optional kwargs for `SinkCache` -- needed on models using RoPE. `partial_rotation_size` is used on models 267 | # with partially rotated position embeddings, like Phi or Persimmon. 268 | sin = cache_kwargs.get("sin") 269 | cos = cache_kwargs.get("cos") 270 | partial_rotation_size = cache_kwargs.get("partial_rotation_size") 271 | using_rope = cos is not None and sin is not None 272 | 273 | # Update the number of seen tokens 274 | if layer_idx == 0: 275 | self.seen_tokens += key_states.shape[-2] 276 | 277 | # [bsz, num_heads, seq_len, head_dim] 278 | if len(self.key_cache) <= layer_idx: 279 | # Empty cache 280 | self.key_cache.append(key_states) 281 | self.value_cache.append(value_states) 282 | 283 | elif key_states.shape[-2] + self.get_seq_length(layer_idx) < self.window_length: 284 | # Growing cache 285 | self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2) 286 | self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2) 287 | 288 | else: 289 | # Shifting cache 290 | keys_to_keep = self.key_cache[layer_idx][ 291 | :, :, -self.window_length + self.num_sink_tokens + key_states.shape[-2] : 292 | ] 293 | 294 | # On RoPE models, we need to recompute the Key rotation as the tokens are shifted 295 | if using_rope: 296 | rerotation_cos, rerotation_sin = self._get_rerotation_cos_sin( 297 | key_states, cos[: self.window_length], sin[: self.window_length] 298 | ) 299 | if partial_rotation_size is not None: 300 | keys_to_keep, keys_pass = ( 301 | keys_to_keep[..., :partial_rotation_size], 302 | keys_to_keep[..., partial_rotation_size:], 303 | ) 304 | keys_to_keep = self._apply_key_rotary_pos_emb(keys_to_keep, rerotation_cos, rerotation_sin) 305 | if partial_rotation_size is not None: 306 | keys_to_keep = torch.cat((keys_to_keep, keys_pass), dim=-1) 307 | 308 | # Concatenate sink tokens, shifted & rotated tokens (if needed), and new tokens 309 | sink_keys = self.key_cache[layer_idx][:, :, : self.num_sink_tokens] 310 | self.key_cache[layer_idx] = torch.cat([sink_keys, keys_to_keep, key_states], dim=-2) 311 | 312 | sink_values = self.value_cache[layer_idx][:, :, : self.num_sink_tokens] 313 | values_to_keep = self.value_cache[layer_idx][ 314 | :, :, -self.window_length + self.num_sink_tokens + value_states.shape[-2] : 315 | ] 316 | self.value_cache[layer_idx] = torch.cat([sink_values, values_to_keep, value_states], dim=-2) 317 | 318 | return self.key_cache[layer_idx], self.value_cache[layer_idx] 319 | 320 | def reorder_cache(self, beam_idx: torch.LongTensor): 321 | """Reorders the cache for beam search, given the selected beam indices.""" 322 | for layer_idx in range(len(self.key_cache)): 323 | device = self.key_cache[layer_idx].device 324 | self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) 325 | device = self.value_cache[layer_idx].device 326 | self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) 327 | 328 | 329 | class StaticCache(Cache): 330 | """ 331 | Static Cache class to be used with `torch.compile(model)`. 332 | 333 | Parameters: 334 | config (`PretrainedConfig): 335 | The configuration file defining the `max_position_embeddings`, `hidden_size` and `num_attention_heads` 336 | required to initialize the static cache. 337 | max_batch_size (`int`): 338 | The maximum batch size with which the model will be used. 339 | max_cache_len (`int`): 340 | The maximum sequence length with which the model will be used. 
341 | device (`torch.device`): 342 | The device on which the cache should be initialized. Should be the same as the layer. 343 | dtype (*optional*, defaults to `torch.float32`): 344 | The default `dtype` to use when initializing the layer. 345 | """ 346 | 347 | def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device, dtype=None) -> None: 348 | super().__init__() 349 | self.max_batch_size = max_batch_size 350 | self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len 351 | # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads 352 | self.head_dim = ( 353 | config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads 354 | ) 355 | 356 | self.dtype = dtype if dtype is not None else torch.float32 357 | self.num_key_value_heads = ( 358 | config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads 359 | ) 360 | 361 | cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim) 362 | self.key_cache: torch.Tensor = torch.zeros(cache_shape, dtype=self.dtype, device=device) 363 | self.value_cache: torch.Tensor = torch.zeros(cache_shape, dtype=self.dtype, device=device) 364 | 365 | def update( 366 | self, 367 | key_states: torch.Tensor, 368 | value_states: torch.Tensor, 369 | layer_idx: int, 370 | cache_kwargs: Optional[Dict[str, Any]] = None, 371 | ) -> Tuple[torch.Tensor, torch.Tensor]: 372 | """ 373 | Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. 374 | It is VERY important to index using a tensor, otherwise you introduce a copy to the device. 375 | 376 | Parameters: 377 | key_states (`torch.Tensor`): 378 | The new key states to cache. 379 | value_states (`torch.Tensor`): 380 | The new value states to cache. 381 | layer_idx (`int`): 382 | The index of the layer to cache the states for. Kept for backward compatibility 383 | cache_kwargs (`Dict[str, Any]`, `optional`): 384 | Additional arguments for the cache subclass. The `StaticCache` just needs the `q_len` 385 | to know how much of the cache it should overwrite. 386 | 387 | Return: 388 | A tuple containing the updated key and value states. 389 | """ 390 | new_cache_positions = cache_kwargs.get("cache_position") 391 | k_out = self.key_cache 392 | v_out = self.value_cache 393 | 394 | k_out[:, :, new_cache_positions] = key_states 395 | v_out[:, :, new_cache_positions] = value_states 396 | 397 | return k_out, v_out 398 | 399 | def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: 400 | """Returns the sequence length of the cached states that were seen by the model. `layer_idx` kept for BC""" 401 | # TODO: Fix once the stateful `int` bug in PyTorch is fixed. 402 | raise ValueError( 403 | "get_seq_length is not implemented for StaticCache. Please refer to https://github.com/huggingface/transformers/pull/29114." 404 | ) 405 | 406 | def get_usable_length(self, new_sequence_length=None, layer_idx: Optional[int] = 0) -> int: 407 | # TODO: Fix once the stateful `int` bug in PyTorch is fixed. 408 | raise ValueError( 409 | "get_seq_length is not implemented for StaticCache. Please refer to https://github.com/huggingface/transformers/pull/29114." 410 | ) 411 | 412 | def get_max_length(self) -> Optional[int]: 413 | """Returns the maximum sequence length of the cached states. 
DynamicCache does not have a maximum length.""" 414 | return self.max_cache_len 415 | 416 | def reorder_cache(self, beam_idx: torch.LongTensor): 417 | """Reorders the cache for beam search, given the selected beam indices.""" 418 | device = self.key_cache.device 419 | self.key_cache = self.key_cache.index_select(0, beam_idx.to(device)) 420 | device = self.value_cache.device 421 | self.value_cache = self.value_cache.index_select(0, beam_idx.to(device)) 422 | 423 | def to_legacy_cache(self): 424 | """Dummy function for BC. We have to keep it because otherwise the call in the forward of models will break it""" 425 | return None 426 | 427 | -------------------------------------------------------------------------------- /qwen2_sft/models/qwen2/configuration_qwen2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Qwen2 model configuration""" 16 | 17 | from transformers.configuration_utils import PretrainedConfig 18 | from transformers.utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { 24 | "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", 25 | } 26 | 27 | 28 | class Qwen2Config(PretrainedConfig): 29 | r""" 30 | This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a 31 | Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration 32 | with the defaults will yield a similar configuration to that of 33 | Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). 34 | 35 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 36 | documentation from [`PretrainedConfig`] for more information. 37 | 38 | 39 | Args: 40 | vocab_size (`int`, *optional*, defaults to 151936): 41 | Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the 42 | `inputs_ids` passed when calling [`Qwen2Model`] 43 | hidden_size (`int`, *optional*, defaults to 4096): 44 | Dimension of the hidden representations. 45 | intermediate_size (`int`, *optional*, defaults to 22016): 46 | Dimension of the MLP representations. 47 | num_hidden_layers (`int`, *optional*, defaults to 32): 48 | Number of hidden layers in the Transformer encoder. 49 | num_attention_heads (`int`, *optional*, defaults to 32): 50 | Number of attention heads for each attention layer in the Transformer encoder. 51 | num_key_value_heads (`int`, *optional*, defaults to 32): 52 | This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If 53 | `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if 54 | `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When 55 | converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed 56 | by meanpooling all the original heads within that group. For more details checkout [this 57 | paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. 58 | hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): 59 | The non-linear activation function (function or string) in the decoder. 60 | max_position_embeddings (`int`, *optional*, defaults to 32768): 61 | The maximum sequence length that this model might ever be used with. 62 | initializer_range (`float`, *optional*, defaults to 0.02): 63 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 64 | rms_norm_eps (`float`, *optional*, defaults to 1e-06): 65 | The epsilon used by the rms normalization layers. 66 | use_cache (`bool`, *optional*, defaults to `True`): 67 | Whether or not the model should return the last key/values attentions (not used by all models). Only 68 | relevant if `config.is_decoder=True`. 69 | tie_word_embeddings (`bool`, *optional*, defaults to `False`): 70 | Whether the model's input and output word embeddings should be tied. 71 | rope_theta (`float`, *optional*, defaults to 10000.0): 72 | The base period of the RoPE embeddings. 73 | use_sliding_window (`bool`, *optional*, defaults to `False`): 74 | Whether to use sliding window attention. 75 | sliding_window (`int`, *optional*, defaults to 4096): 76 | Sliding window attention (SWA) window size. If not specified, will default to `4096`. 77 | max_window_layers (`int`, *optional*, defaults to 28): 78 | The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. 79 | attention_dropout (`float`, *optional*, defaults to 0.0): 80 | The dropout ratio for the attention probabilities. 
81 | 82 | ```python 83 | >>> from transformers import Qwen2Model, Qwen2Config 84 | 85 | >>> # Initializing a Qwen2 style configuration 86 | >>> configuration = Qwen2Config() 87 | 88 | >>> # Initializing a model from the Qwen2-7B style configuration 89 | >>> model = Qwen2Model(configuration) 90 | 91 | >>> # Accessing the model configuration 92 | >>> configuration = model.config 93 | ```""" 94 | 95 | model_type = "qwen2" 96 | keys_to_ignore_at_inference = ["past_key_values"] 97 | 98 | def __init__( 99 | self, 100 | vocab_size=151936, 101 | hidden_size=4096, 102 | intermediate_size=22016, 103 | num_hidden_layers=32, 104 | num_attention_heads=32, 105 | num_key_value_heads=32, 106 | hidden_act="silu", 107 | max_position_embeddings=32768, 108 | initializer_range=0.02, 109 | rms_norm_eps=1e-6, 110 | use_cache=True, 111 | tie_word_embeddings=False, 112 | rope_theta=10000.0, 113 | use_sliding_window=False, 114 | sliding_window=4096, 115 | max_window_layers=28, 116 | attention_dropout=0.0, 117 | **kwargs, 118 | ): 119 | self.vocab_size = vocab_size 120 | self.max_position_embeddings = max_position_embeddings 121 | self.hidden_size = hidden_size 122 | self.intermediate_size = intermediate_size 123 | self.num_hidden_layers = num_hidden_layers 124 | self.num_attention_heads = num_attention_heads 125 | self.use_sliding_window = use_sliding_window 126 | self.sliding_window = sliding_window 127 | self.max_window_layers = max_window_layers 128 | 129 | # for backward compatibility 130 | if num_key_value_heads is None: 131 | num_key_value_heads = num_attention_heads 132 | 133 | self.num_key_value_heads = num_key_value_heads 134 | self.hidden_act = hidden_act 135 | self.initializer_range = initializer_range 136 | self.rms_norm_eps = rms_norm_eps 137 | self.use_cache = use_cache 138 | self.rope_theta = rope_theta 139 | self.attention_dropout = attention_dropout 140 | 141 | 142 | super().__init__( 143 | tie_word_embeddings=tie_word_embeddings, 144 | **kwargs, 145 | ) -------------------------------------------------------------------------------- /qwen2_sft/models/qwen2/modeling_attn_mask_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
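Before moving on to the attention-mask utilities, a hedged sketch tying the two files above together: a deliberately tiny `Qwen2Config` used to allocate a `StaticCache` from cache_utils.py. The sizes are illustrative only.

```python
import torch
from qwen2_sft.models.qwen2.configuration_qwen2 import Qwen2Config
from qwen2_sft.models.qwen2.cache_utils import StaticCache

config = Qwen2Config(
    vocab_size=1000, hidden_size=64, intermediate_size=128,
    num_hidden_layers=2, num_attention_heads=4, num_key_value_heads=2,
    max_position_embeddings=128,
)

cache = StaticCache(config, max_batch_size=1, max_cache_len=16, device="cpu", dtype=torch.float32)
# [bsz, kv_heads, max_cache_len, head_dim] with head_dim = hidden_size // num_attention_heads
print(cache.key_cache.shape)   # torch.Size([1, 2, 16, 16])

# Writes go to explicit positions; indexing with a tensor avoids a device copy.
pos = torch.arange(4)
k = torch.randn(1, 2, 4, 16)
v = torch.randn(1, 2, 4, 16)
cache.update(k, v, layer_idx=0, cache_kwargs={"cache_position": pos})
print(cache.get_max_length())  # 16
```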
14 | from dataclasses import dataclass 15 | from typing import List, Optional, Tuple, Union 16 | 17 | import torch 18 | 19 | 20 | @dataclass 21 | class AttentionMaskConverter: 22 | """ 23 | A utility attention mask class that allows one to: 24 | - Create a causal 4d mask 25 | - Create a causal 4d mask with slided window 26 | - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length, 27 | key_value_length) that can be multiplied with attention scores 28 | 29 | Examples: 30 | 31 | ```python 32 | >>> import torch 33 | >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter 34 | 35 | >>> converter = AttentionMaskConverter(True) 36 | >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32) 37 | tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], 38 | [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], 39 | [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], 40 | [-3.4028e+38, -3.4028e+38, -3.4028e+38, 0.0000e+00, -3.4028e+38], 41 | [-3.4028e+38, -3.4028e+38, -3.4028e+38, 0.0000e+00, 0.0000e+00]]]]) 42 | ``` 43 | 44 | Parameters: 45 | is_causal (`bool`): 46 | Whether the attention mask should be a uni-directional (causal) or bi-directional mask. 47 | 48 | sliding_window (`int`, *optional*): 49 | Optionally, the sliding window masks can be created if `sliding_window` is defined to a positive integer. 50 | """ 51 | 52 | is_causal: bool 53 | sliding_window: int 54 | 55 | def __init__(self, is_causal: bool, sliding_window: Optional[int] = None): 56 | self.is_causal = is_causal 57 | self.sliding_window = sliding_window 58 | 59 | if self.sliding_window is not None and self.sliding_window <= 0: 60 | raise ValueError( 61 | f"Make sure that when passing `sliding_window` that its value is a strictly positive integer, not `{self.sliding_window}`" 62 | ) 63 | 64 | def to_causal_4d( 65 | self, 66 | batch_size: int, 67 | query_length: int, 68 | key_value_length: int, 69 | dtype: torch.dtype, 70 | device: Union[torch.device, "str"] = "cpu", 71 | ) -> Optional[torch.Tensor]: 72 | """ 73 | Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative 74 | bias to upper right hand triangular matrix (causal mask). 75 | """ 76 | if not self.is_causal: 77 | raise ValueError(f"Please use `to_causal_4d` only if {self.__class__} has `is_causal` set to True.") 78 | 79 | # If shape is not cached, create a new causal mask and cache it 80 | input_shape = (batch_size, query_length) 81 | past_key_values_length = key_value_length - query_length 82 | 83 | # create causal mask 84 | # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] 85 | causal_4d_mask = None 86 | if input_shape[-1] > 1 or self.sliding_window is not None: 87 | causal_4d_mask = self._make_causal_mask( 88 | input_shape, 89 | dtype, 90 | device=device, 91 | past_key_values_length=past_key_values_length, 92 | sliding_window=self.sliding_window, 93 | ) 94 | 95 | return causal_4d_mask 96 | 97 | def to_4d( 98 | self, 99 | attention_mask_2d: torch.Tensor, 100 | query_length: int, 101 | dtype: torch.dtype, 102 | key_value_length: Optional[int] = None, 103 | ) -> torch.Tensor: 104 | """ 105 | Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length, 106 | key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is 107 | causal, a causal mask will be added. 
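A small, hedged example of `to_causal_4d` with a sliding window, complementing the `to_4d` example in the class docstring above; it imports from this local module rather than from `transformers`.

```python
import torch
from qwen2_sft.models.qwen2.modeling_attn_mask_utils import AttentionMaskConverter

converter = AttentionMaskConverter(is_causal=True, sliding_window=3)
mask = converter.to_causal_4d(
    batch_size=1, query_length=5, key_value_length=5,
    dtype=torch.float32, device="cpu",
)
print(mask.shape)   # torch.Size([1, 1, 5, 5])

# 0.0 where attention is allowed, a large negative value where it is masked;
# with sliding_window=3 each token sees at most itself and the 2 previous tokens.
print((mask == 0).int()[0, 0])
```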
108 | """ 109 | input_shape = (attention_mask_2d.shape[0], query_length) 110 | 111 | # create causal mask 112 | # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] 113 | causal_4d_mask = None 114 | if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal: 115 | if key_value_length is None: 116 | raise ValueError( 117 | "This attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask." 118 | ) 119 | 120 | past_key_values_length = key_value_length - query_length 121 | causal_4d_mask = self._make_causal_mask( 122 | input_shape, 123 | dtype, 124 | device=attention_mask_2d.device, 125 | past_key_values_length=past_key_values_length, 126 | sliding_window=self.sliding_window, 127 | ) 128 | elif self.sliding_window is not None: 129 | raise NotImplementedError("Sliding window is currently only implemented for causal masking") 130 | 131 | # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] 132 | expanded_attn_mask = self._expand_mask(attention_mask_2d, dtype, tgt_len=input_shape[-1]).to( 133 | attention_mask_2d.device 134 | ) 135 | 136 | if causal_4d_mask is not None: 137 | expanded_attn_mask = causal_4d_mask.masked_fill(expanded_attn_mask.bool(), torch.finfo(dtype).min) 138 | 139 | # expanded_attn_mask + causal_4d_mask can cause some overflow 140 | expanded_4d_mask = expanded_attn_mask 141 | 142 | return expanded_4d_mask 143 | 144 | @staticmethod 145 | def _make_causal_mask( 146 | input_ids_shape: torch.Size, 147 | dtype: torch.dtype, 148 | device: torch.device, 149 | past_key_values_length: int = 0, 150 | sliding_window: Optional[int] = None, 151 | ): 152 | """ 153 | Make causal mask used for bi-directional self-attention. 154 | """ 155 | bsz, tgt_len = input_ids_shape 156 | mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) 157 | mask_cond = torch.arange(mask.size(-1), device=device) 158 | mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) 159 | 160 | mask = mask.to(dtype) 161 | 162 | if past_key_values_length > 0: 163 | mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) 164 | 165 | # add lower triangular sliding window mask if necessary 166 | if sliding_window is not None: 167 | diagonal = past_key_values_length - sliding_window + 1 168 | 169 | context_mask = 1 - torch.triu(torch.ones_like(mask, dtype=torch.int), diagonal=diagonal) 170 | mask.masked_fill_(context_mask.bool(), torch.finfo(dtype).min) 171 | 172 | return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) 173 | 174 | @staticmethod 175 | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): 176 | """ 177 | Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 178 | """ 179 | bsz, src_len = mask.size() 180 | tgt_len = tgt_len if tgt_len is not None else src_len 181 | 182 | expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) 183 | 184 | inverted_mask = 1.0 - expanded_mask 185 | 186 | return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) 187 | 188 | @staticmethod 189 | def _unmask_unattended( 190 | expanded_mask: torch.FloatTensor, 191 | min_dtype: float, 192 | ): 193 | # fmt: off 194 | """ 195 | Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when 196 | using left padding. 
This is required by F.scaled_dot_product_attention memory-efficient attention path. 197 | Details: https://github.com/pytorch/pytorch/issues/110213 198 | 199 | `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len]. 200 | `attention_mask` is [bsz, src_seq_len]. 201 | 202 | The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias. 203 | 204 | For example, if `expanded_mask` is (e.g. here left-padding case) 205 | ``` 206 | [[[[0, 0, 0], 207 | [0, 0, 0], 208 | [0, 0, 1]]], 209 | [[[1, 0, 0], 210 | [1, 1, 0], 211 | [1, 1, 1]]], 212 | [[[0, 0, 0], 213 | [0, 1, 0], 214 | [0, 1, 1]]]] 215 | ``` 216 | then the modified `expanded_mask` will be 217 | ``` 218 | [[[[1, 1, 1], <-- modified 219 | [1, 1, 1], <-- modified 220 | [0, 0, 1]]], 221 | [[[1, 0, 0], 222 | [1, 1, 0], 223 | [1, 1, 1]]], 224 | [[[1, 1, 1], <-- modified 225 | [0, 1, 0], 226 | [0, 1, 1]]]] 227 | ``` 228 | """ 229 | # fmt: on 230 | if expanded_mask.dtype == torch.bool: 231 | raise ValueError( 232 | "AttentionMaskConverter._unmask_unattended expects a float `expanded_mask`, got a BoolTensor." 233 | ) 234 | 235 | return expanded_mask.mul(~torch.all(expanded_mask == min_dtype, dim=-1, keepdim=True)) 236 | 237 | 238 | def _prepare_4d_causal_attention_mask( 239 | attention_mask: Optional[torch.Tensor], 240 | input_shape: Union[torch.Size, Tuple, List], 241 | inputs_embeds: torch.Tensor, 242 | past_key_values_length: int, 243 | sliding_window: Optional[int] = None, 244 | ): 245 | """ 246 | Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape 247 | `(batch_size, key_value_length)` 248 | 249 | Args: 250 | attention_mask (`torch.Tensor` or `None`): 251 | A 2D attention mask of shape `(batch_size, key_value_length)` 252 | input_shape (`tuple(int)` or `list(int)` or `torch.Size`): 253 | The input shape should be a tuple that defines `(batch_size, query_length)`. 254 | inputs_embeds (`torch.Tensor`): 255 | The embedded inputs as a torch Tensor. 256 | past_key_values_length (`int`): 257 | The length of the key value cache. 258 | sliding_window (`int`, *optional*): 259 | If the model uses windowed attention, a sliding window should be passed. 260 | """ 261 | attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window) 262 | 263 | key_value_length = input_shape[-1] + past_key_values_length 264 | 265 | # 4d mask is passed through the layers 266 | if attention_mask is not None and len(attention_mask.shape) == 2: 267 | attention_mask = attn_mask_converter.to_4d( 268 | attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype 269 | ) 270 | elif attention_mask is not None and len(attention_mask.shape) == 4: 271 | expected_shape = (input_shape[0], 1, input_shape[1], key_value_length) 272 | if tuple(attention_mask.shape) != expected_shape: 273 | raise ValueError( 274 | f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}." 
275 | ) 276 | else: 277 | # if the 4D mask has correct shape - invert it and fill with negative infinity 278 | inverted_mask = 1.0 - attention_mask 279 | attention_mask = inverted_mask.masked_fill( 280 | inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min 281 | ) 282 | else: 283 | attention_mask = attn_mask_converter.to_causal_4d( 284 | input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device 285 | ) 286 | 287 | return attention_mask 288 | 289 | 290 | # Adapted from _prepare_4d_causal_attention_mask 291 | def _prepare_4d_causal_attention_mask_for_sdpa( 292 | attention_mask: Optional[torch.Tensor], 293 | input_shape: Union[torch.Size, Tuple, List], 294 | inputs_embeds: torch.Tensor, 295 | past_key_values_length: int, 296 | sliding_window: Optional[int] = None, 297 | ): 298 | """ 299 | Prepares the correct `attn_mask` argument to be used by `torch.nn.functional.scaled_dot_product_attention`. 300 | 301 | In case no token is masked in the `attention_mask` argument, we simply set it to `None` for the cases `query_length == 1` and 302 | `key_value_length == query_length`, and rely instead on SDPA `is_causal` argument to use causal/non-causal masks, 303 | allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed). 304 | """ 305 | attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window) 306 | 307 | key_value_length = input_shape[-1] + past_key_values_length 308 | batch_size, query_length = input_shape 309 | 310 | # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True are unable to capture the controlflow `is_causal=attention_mask is None and q_len > 1` 311 | # used as an SDPA argument. We keep compatibility with these tracing tools by always using SDPA's `attn_mask` argument in case we are tracing. 312 | # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400). 313 | is_tracing = ( 314 | torch.jit.is_tracing() 315 | or isinstance(inputs_embeds, torch.fx.Proxy) 316 | or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()) 317 | ) 318 | 319 | if attention_mask is not None: 320 | # 4d mask is passed through 321 | if len(attention_mask.shape) == 4: 322 | expected_shape = (input_shape[0], 1, input_shape[1], key_value_length) 323 | if tuple(attention_mask.shape) != expected_shape: 324 | raise ValueError( 325 | f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}." 326 | ) 327 | else: 328 | # if the 4D mask has correct shape - invert it and fill with negative infinity 329 | inverted_mask = 1.0 - attention_mask.to(inputs_embeds.dtype) 330 | attention_mask = inverted_mask.masked_fill( 331 | inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min 332 | ) 333 | return attention_mask 334 | 335 | elif not is_tracing and torch.all(attention_mask == 1): 336 | if query_length == 1: 337 | # For query_length == 1, causal attention and bi-directional attention are the same. 338 | attention_mask = None 339 | elif key_value_length == query_length: 340 | attention_mask = None 341 | else: 342 | # Unfortunately, for query_length > 1 and key_value_length != query_length, we cannot generally ignore the attention mask, as SDPA causal mask generation 343 | # may be wrong. We will set `is_causal=False` in SDPA and rely on Transformers attention_mask instead, hence not setting it to None here. 
344 | # Reference: https://github.com/pytorch/pytorch/issues/108108 345 | pass 346 | elif query_length > 1 and key_value_length != query_length: 347 | # See the comment above (https://github.com/pytorch/pytorch/issues/108108). 348 | # Ugly: we set it to True here to dispatch in the following controlflow to `to_causal_4d`. 349 | attention_mask = True 350 | elif is_tracing: 351 | raise ValueError( 352 | 'Attention using SDPA can not be traced with torch.jit.trace when no attention_mask is provided. To solve this issue, please either load your model with the argument `attn_implementation="eager"` or pass an attention_mask input when tracing the model.' 353 | ) 354 | 355 | if attention_mask is None: 356 | expanded_4d_mask = None 357 | elif attention_mask is True: 358 | expanded_4d_mask = attn_mask_converter.to_causal_4d( 359 | input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device 360 | ) 361 | else: 362 | expanded_4d_mask = attn_mask_converter.to_4d( 363 | attention_mask, 364 | input_shape[-1], 365 | dtype=inputs_embeds.dtype, 366 | key_value_length=key_value_length, 367 | ) 368 | 369 | # Attend to all tokens in masked rows from the causal_mask, for example the relevant first rows when 370 | # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 371 | # Details: https://github.com/pytorch/pytorch/issues/110213 372 | if not is_tracing and expanded_4d_mask.device.type == "cuda": 373 | expanded_4d_mask = AttentionMaskConverter._unmask_unattended( 374 | expanded_4d_mask, min_dtype=torch.finfo(inputs_embeds.dtype).min 375 | ) 376 | 377 | return expanded_4d_mask 378 | 379 | 380 | def _prepare_4d_attention_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): 381 | """ 382 | Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape 383 | `(batch_size, key_value_length)` 384 | 385 | Args: 386 | mask (`torch.Tensor` or `None`): 387 | A 2D attention mask of shape `(batch_size, key_value_length)` 388 | dtype (`torch.dtype`): 389 | The torch dtype the created mask shall have. 390 | tgt_len (`int`): 391 | The target length or query length the created mask shall have. 392 | """ 393 | return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) 394 | 395 | 396 | def _prepare_4d_attention_mask_for_sdpa(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): 397 | """ 398 | Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape 399 | `(batch_size, key_value_length)` 400 | 401 | Args: 402 | mask (`torch.Tensor` or `None`): 403 | A 2D attention mask of shape `(batch_size, key_value_length)` 404 | dtype (`torch.dtype`): 405 | The torch dtype the created mask shall have. 406 | tgt_len (`int`): 407 | The target length or query length the created mask shall have. 408 | """ 409 | batch_size, key_value_length = mask.shape 410 | tgt_len = tgt_len if tgt_len is not None else key_value_length 411 | 412 | # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True are unable to capture the controlflow `is_causal=attention_mask is None and q_len > 1` 413 | # used as an SDPA argument. We keep compatibility with these tracing tools by always using SDPA's `attn_mask` argument in case we are tracing. 414 | # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400). 
415 | is_tracing = ( 416 | torch.jit.is_tracing() 417 | or isinstance(mask, torch.fx.Proxy) 418 | or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()) 419 | ) 420 | 421 | if torch.all(mask == 1): 422 | if is_tracing: 423 | pass 424 | elif tgt_len == 1: 425 | # For query_length == 1, causal attention and bi-directional attention are the same. 426 | return None 427 | elif key_value_length == tgt_len: 428 | return None 429 | else: 430 | # Unfortunately, for query_length > 1 and key_value_length != query_length, we can not generally ignore the attention mask, as SDPA causal mask generation 431 | # may be wrong. We will set is_causal=False in SDPA and rely on Transformers attention_mask instead, hence not setting it to None here. 432 | # Reference: https://github.com/pytorch/pytorch/issues/108108 433 | return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) 434 | else: 435 | return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) 436 | 437 | 438 | def _create_4d_causal_attention_mask( 439 | input_shape: Union[torch.Size, Tuple, List], 440 | dtype: torch.dtype, 441 | device: torch.device, 442 | past_key_values_length: int = 0, 443 | sliding_window: Optional[int] = None, 444 | ) -> Optional[torch.Tensor]: 445 | """ 446 | Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` 447 | 448 | Args: 449 | input_shape (`tuple(int)` or `list(int)` or `torch.Size`): 450 | The input shape should be a tuple that defines `(batch_size, query_length)`. 451 | dtype (`torch.dtype`): 452 | The torch dtype the created mask shall have. 453 | device (`int`): 454 | The torch device the created mask shall have. 455 | sliding_window (`int`, *optional*): 456 | If the model uses windowed attention, a sliding window should be passed. 457 | """ 458 | attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window) 459 | 460 | key_value_length = past_key_values_length + input_shape[-1] 461 | attention_mask = attn_mask_converter.to_causal_4d( 462 | input_shape[0], input_shape[-1], key_value_length, dtype=dtype, device=device 463 | ) 464 | 465 | return attention_mask -------------------------------------------------------------------------------- /qwen2_sft/models/qwen2/pytorch_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
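A hedged sketch of the `_prepare_4d_causal_attention_mask` helper defined above, showing how a left-padded 2D mask becomes the additive 4D mask consumed by the decoder layers. Shapes are toy-sized.

```python
import torch
from qwen2_sft.models.qwen2.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask

batch_size, seq_len, hidden = 2, 4, 8
inputs_embeds = torch.randn(batch_size, seq_len, hidden)
# The second sequence is left-padded by one token.
attention_mask_2d = torch.tensor([[1, 1, 1, 1],
                                  [0, 1, 1, 1]])

mask_4d = _prepare_4d_causal_attention_mask(
    attention_mask_2d,
    (batch_size, seq_len),
    inputs_embeds,
    past_key_values_length=0,
)
print(mask_4d.shape)                # torch.Size([2, 1, 4, 4])
# For the padded sample, column 0 is masked for every query row,
# on top of the usual causal (upper-triangular) masking.
print((mask_4d[1, 0] == 0).int())
```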
14 | import inspect 15 | from typing import Callable, List, Optional, Set, Tuple, Union 16 | 17 | import torch 18 | from packaging import version 19 | from safetensors.torch import storage_ptr, storage_size 20 | from torch import nn 21 | 22 | from transformers.utils import is_torch_tpu_available, logging 23 | 24 | 25 | ALL_LAYERNORM_LAYERS = [nn.LayerNorm] 26 | 27 | logger = logging.get_logger(__name__) 28 | 29 | parsed_torch_version_base = version.parse(version.parse(torch.__version__).base_version) 30 | 31 | is_torch_greater_or_equal_than_2_2 = parsed_torch_version_base >= version.parse("2.2") 32 | is_torch_greater_or_equal_than_2_1 = parsed_torch_version_base >= version.parse("2.1") 33 | is_torch_greater_or_equal_than_2_0 = parsed_torch_version_base >= version.parse("2.0") 34 | is_torch_greater_or_equal_than_1_13 = parsed_torch_version_base >= version.parse("1.13") 35 | is_torch_greater_or_equal_than_1_12 = parsed_torch_version_base >= version.parse("1.12") 36 | 37 | 38 | def softmax_backward_data(parent, grad_output, output, dim, self): 39 | """ 40 | A function that calls the internal `_softmax_backward_data` PyTorch method and that adjusts the arguments according 41 | to the torch version detected. 42 | """ 43 | 44 | from torch import _softmax_backward_data 45 | 46 | return _softmax_backward_data(grad_output, output, parent.dim, self.dtype) 47 | 48 | 49 | def prune_linear_layer(layer: nn.Linear, index: torch.LongTensor, dim: int = 0) -> nn.Linear: 50 | """ 51 | Prune a linear layer to keep only entries in index. 52 | 53 | Used to remove heads. 54 | 55 | Args: 56 | layer (`torch.nn.Linear`): The layer to prune. 57 | index (`torch.LongTensor`): The indices to keep in the layer. 58 | dim (`int`, *optional*, defaults to 0): The dimension on which to keep the indices. 59 | 60 | Returns: 61 | `torch.nn.Linear`: The pruned layer as a new layer with `requires_grad=True`. 62 | """ 63 | index = index.to(layer.weight.device) 64 | W = layer.weight.index_select(dim, index).clone().detach() 65 | if layer.bias is not None: 66 | if dim == 1: 67 | b = layer.bias.clone().detach() 68 | else: 69 | b = layer.bias[index].clone().detach() 70 | new_size = list(layer.weight.size()) 71 | new_size[dim] = len(index) 72 | new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) 73 | new_layer.weight.requires_grad = False 74 | new_layer.weight.copy_(W.contiguous()) 75 | new_layer.weight.requires_grad = True 76 | if layer.bias is not None: 77 | new_layer.bias.requires_grad = False 78 | new_layer.bias.copy_(b.contiguous()) 79 | new_layer.bias.requires_grad = True 80 | return new_layer 81 | 82 | 83 | class Conv1D(nn.Module): 84 | """ 85 | 1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2). 86 | 87 | Basically works like a linear layer but the weights are transposed. 88 | 89 | Args: 90 | nf (`int`): The number of output features. 91 | nx (`int`): The number of input features. 
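A hedged usage sketch for `prune_linear_layer` above: keeping a subset of output rows, the pattern used when pruning attention heads. The index values are arbitrary.

```python
import torch
from torch import nn
from qwen2_sft.models.qwen2.pytorch_utils import prune_linear_layer

layer = nn.Linear(in_features=8, out_features=6)
keep = torch.tensor([0, 2, 5])            # indices of output units to keep
pruned = prune_linear_layer(layer, keep, dim=0)

print(pruned.weight.shape)                # torch.Size([3, 8])

# The surviving rows produce the same outputs as before pruning.
x = torch.randn(1, 8)
assert torch.allclose(pruned(x), layer(x)[:, keep], atol=1e-6)
```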
92 | """ 93 | 94 | def __init__(self, nf, nx): 95 | super().__init__() 96 | self.nf = nf 97 | self.weight = nn.Parameter(torch.empty(nx, nf)) 98 | self.bias = nn.Parameter(torch.zeros(nf)) 99 | nn.init.normal_(self.weight, std=0.02) 100 | 101 | def forward(self, x): 102 | size_out = x.size()[:-1] + (self.nf,) 103 | x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) 104 | x = x.view(size_out) 105 | return x 106 | 107 | 108 | def prune_conv1d_layer(layer: Conv1D, index: torch.LongTensor, dim: int = 1) -> Conv1D: 109 | """ 110 | Prune a Conv1D layer to keep only entries in index. A Conv1D work as a Linear layer (see e.g. BERT) but the weights 111 | are transposed. 112 | 113 | Used to remove heads. 114 | 115 | Args: 116 | layer ([`~pytorch_utils.Conv1D`]): The layer to prune. 117 | index (`torch.LongTensor`): The indices to keep in the layer. 118 | dim (`int`, *optional*, defaults to 1): The dimension on which to keep the indices. 119 | 120 | Returns: 121 | [`~pytorch_utils.Conv1D`]: The pruned layer as a new layer with `requires_grad=True`. 122 | """ 123 | index = index.to(layer.weight.device) 124 | W = layer.weight.index_select(dim, index).clone().detach() 125 | if dim == 0: 126 | b = layer.bias.clone().detach() 127 | else: 128 | b = layer.bias[index].clone().detach() 129 | new_size = list(layer.weight.size()) 130 | new_size[dim] = len(index) 131 | new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device) 132 | new_layer.weight.requires_grad = False 133 | new_layer.weight.copy_(W.contiguous()) 134 | new_layer.weight.requires_grad = True 135 | new_layer.bias.requires_grad = False 136 | new_layer.bias.copy_(b.contiguous()) 137 | new_layer.bias.requires_grad = True 138 | return new_layer 139 | 140 | 141 | def prune_layer( 142 | layer: Union[nn.Linear, Conv1D], index: torch.LongTensor, dim: Optional[int] = None 143 | ) -> Union[nn.Linear, Conv1D]: 144 | """ 145 | Prune a Conv1D or linear layer to keep only entries in index. 146 | 147 | Used to remove heads. 148 | 149 | Args: 150 | layer (`Union[torch.nn.Linear, Conv1D]`): The layer to prune. 151 | index (`torch.LongTensor`): The indices to keep in the layer. 152 | dim (`int`, *optional*): The dimension on which to keep the indices. 153 | 154 | Returns: 155 | `torch.nn.Linear` or [`~pytorch_utils.Conv1D`]: The pruned layer as a new layer with `requires_grad=True`. 156 | """ 157 | if isinstance(layer, nn.Linear): 158 | return prune_linear_layer(layer, index, dim=0 if dim is None else dim) 159 | elif isinstance(layer, Conv1D): 160 | return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim) 161 | else: 162 | raise ValueError(f"Can't prune layer of class {layer.__class__}") 163 | 164 | 165 | def apply_chunking_to_forward( 166 | forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors 167 | ) -> torch.Tensor: 168 | """ 169 | This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension 170 | `chunk_dim`. It then applies a layer `forward_fn` to each chunk independently to save memory. 171 | 172 | If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as directly 173 | applying `forward_fn` to `input_tensors`. 174 | 175 | Args: 176 | forward_fn (`Callable[..., torch.Tensor]`): 177 | The forward function of the model. 178 | chunk_size (`int`): 179 | The chunk size of a chunked tensor: `num_chunks = len(input_tensors[0]) / chunk_size`. 
180 | chunk_dim (`int`): 181 | The dimension over which the `input_tensors` should be chunked. 182 | input_tensors (`Tuple[torch.Tensor]`): 183 | The input tensors of `forward_fn` which will be chunked 184 | 185 | Returns: 186 | `torch.Tensor`: A tensor with the same shape as the `forward_fn` would have given if applied`. 187 | 188 | 189 | Examples: 190 | 191 | ```python 192 | # rename the usual forward() fn to forward_chunk() 193 | def forward_chunk(self, hidden_states): 194 | hidden_states = self.decoder(hidden_states) 195 | return hidden_states 196 | 197 | 198 | # implement a chunked forward function 199 | def forward(self, hidden_states): 200 | return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states) 201 | ```""" 202 | 203 | assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors" 204 | 205 | # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility 206 | num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) 207 | if num_args_in_forward_chunk_fn != len(input_tensors): 208 | raise ValueError( 209 | f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " 210 | "tensors are given" 211 | ) 212 | 213 | if chunk_size > 0: 214 | tensor_shape = input_tensors[0].shape[chunk_dim] 215 | for input_tensor in input_tensors: 216 | if input_tensor.shape[chunk_dim] != tensor_shape: 217 | raise ValueError( 218 | f"All input tenors have to be of the same shape: {tensor_shape}, " 219 | f"found shape {input_tensor.shape[chunk_dim]}" 220 | ) 221 | 222 | if input_tensors[0].shape[chunk_dim] % chunk_size != 0: 223 | raise ValueError( 224 | f"The dimension to be chunked {input_tensors[0].shape[chunk_dim]} has to be a multiple of the chunk " 225 | f"size {chunk_size}" 226 | ) 227 | 228 | num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size 229 | 230 | # chunk input tensor into tuples 231 | input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors) 232 | # apply forward fn to every tuple 233 | output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks)) 234 | # concatenate output at same dimension 235 | return torch.cat(output_chunks, dim=chunk_dim) 236 | 237 | return forward_fn(*input_tensors) 238 | 239 | 240 | def find_pruneable_heads_and_indices( 241 | heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int] 242 | ) -> Tuple[Set[int], torch.LongTensor]: 243 | """ 244 | Finds the heads and their indices taking `already_pruned_heads` into account. 245 | 246 | Args: 247 | heads (`List[int]`): List of the indices of heads to prune. 248 | n_heads (`int`): The number of heads in the model. 249 | head_size (`int`): The size of each head. 250 | already_pruned_heads (`Set[int]`): A set of already pruned heads. 251 | 252 | Returns: 253 | `Tuple[Set[int], torch.LongTensor]`: A tuple with the indices of heads to prune taking `already_pruned_heads` 254 | into account and the indices of rows/columns to keep in the layer weight. 
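A hedged, self-contained sketch of `apply_chunking_to_forward` (defined above): chunking the sequence dimension should reproduce the unchunked result when the forward is independent along that dimension. `ff.forward` is passed rather than the module itself so the argument-count check sees a single parameter.

```python
import torch
from torch import nn
from qwen2_sft.models.qwen2.pytorch_utils import apply_chunking_to_forward

ff = nn.Linear(8, 8)
hidden_states = torch.randn(2, 12, 8)     # [batch, seq_len, hidden]

# chunk_size=4 over chunk_dim=1 -> three chunks of length 4 are processed in turn.
chunked = apply_chunking_to_forward(ff.forward, 4, 1, hidden_states)

assert chunked.shape == (2, 12, 8)
assert torch.allclose(chunked, ff(hidden_states), atol=1e-6)
```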
255 | """ 256 | mask = torch.ones(n_heads, head_size) 257 | heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads 258 | for head in heads: 259 | # Compute how many pruned heads are before the head and move the index accordingly 260 | head = head - sum(1 if h < head else 0 for h in already_pruned_heads) 261 | mask[head] = 0 262 | mask = mask.view(-1).contiguous().eq(1) 263 | index: torch.LongTensor = torch.arange(len(mask))[mask].long() 264 | return heads, index 265 | 266 | 267 | def meshgrid( 268 | *tensors: Union[torch.Tensor, List[torch.Tensor]], indexing: Optional[str] = None 269 | ) -> Tuple[torch.Tensor, ...]: 270 | """ 271 | Wrapper around torch.meshgrid to avoid warning messages about the introduced `indexing` argument. 272 | 273 | Reference: https://pytorch.org/docs/1.13/generated/torch.meshgrid.html 274 | """ 275 | return torch.meshgrid(*tensors, indexing=indexing) 276 | 277 | 278 | def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: 279 | """ 280 | Unique identifier to a tensor storage. Multiple different tensors can share the same underlying storage. For 281 | example, "meta" tensors all share the same storage, and thus their identifier will all be equal. This identifier is 282 | guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with 283 | non-overlapping lifetimes may have the same id. 284 | """ 285 | if tensor.device.type == "xla" and is_torch_tpu_available(): 286 | # NOTE: xla tensors dont have storage 287 | # use some other unique id to distinguish. 288 | # this is a XLA tensor, it must be created using torch_xla's 289 | # device. So the following import is safe: 290 | import torch_xla 291 | 292 | unique_id = torch_xla._XLAC._xla_get_tensor_id(tensor) 293 | else: 294 | unique_id = storage_ptr(tensor) 295 | 296 | return tensor.device, unique_id, storage_size(tensor) 297 | -------------------------------------------------------------------------------- /qwen2_sft/models/qwen2/tokenization_qwen2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
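A hedged sketch of `id_tensor_storage` from pytorch_utils.py above: tensors that view the same underlying storage share an identifier, which is how tied or shared weights can be detected before saving.

```python
import torch
from qwen2_sft.models.qwen2.pytorch_utils import id_tensor_storage

base = torch.randn(4, 4)
view = base[:2]                # shares storage with `base`
other = torch.randn(4, 4)      # separate storage

print(id_tensor_storage(base) == id_tensor_storage(view))    # True
print(id_tensor_storage(base) == id_tensor_storage(other))   # False
```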
15 | """Tokenization classes for Qwen2.""" 16 | 17 | import json 18 | import os 19 | import unicodedata 20 | from functools import lru_cache 21 | from typing import Optional, Tuple 22 | 23 | import regex as re 24 | 25 | from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer 26 | from transformers.utils import logging 27 | 28 | 29 | logger = logging.get_logger(__name__) 30 | 31 | VOCAB_FILES_NAMES = { 32 | "vocab_file": "vocab.json", 33 | "merges_file": "merges.txt", 34 | } 35 | 36 | PRETRAINED_VOCAB_FILES_MAP = { 37 | "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"}, 38 | "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"}, 39 | } 40 | 41 | MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} 42 | 43 | PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" 44 | 45 | 46 | @lru_cache() 47 | # Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode 48 | def bytes_to_unicode(): 49 | """ 50 | Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control 51 | characters the bpe code barfs on. 52 | 53 | The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab 54 | if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for 55 | decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup 56 | tables between utf-8 bytes and unicode strings. 57 | """ 58 | bs = ( 59 | list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) 60 | ) 61 | cs = bs[:] 62 | n = 0 63 | for b in range(2**8): 64 | if b not in bs: 65 | bs.append(b) 66 | cs.append(2**8 + n) 67 | n += 1 68 | cs = [chr(n) for n in cs] 69 | return dict(zip(bs, cs)) 70 | 71 | 72 | # Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs 73 | def get_pairs(word): 74 | """ 75 | Return set of symbol pairs in a word. 76 | 77 | Word is represented as tuple of symbols (symbols being variable-length strings). 78 | """ 79 | pairs = set() 80 | prev_char = word[0] 81 | for char in word[1:]: 82 | pairs.add((prev_char, char)) 83 | prev_char = char 84 | return pairs 85 | 86 | 87 | class Qwen2Tokenizer(PreTrainedTokenizer): 88 | """ 89 | Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding. 90 | 91 | Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will 92 | be encoded differently whether it is at the beginning of the sentence (without space) or not: 93 | 94 | ```python 95 | >>> from transformers import Qwen2Tokenizer 96 | 97 | >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer") 98 | >>> tokenizer("Hello world")["input_ids"] 99 | [9707, 1879] 100 | 101 | >>> tokenizer(" Hello world")["input_ids"] 102 | [21927, 1879] 103 | ``` 104 | This is expected. 105 | 106 | You should not use GPT2Tokenizer instead, because of the different pretokenization rules. 107 | 108 | This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to 109 | this superclass for more information regarding those methods. 110 | 111 | Args: 112 | vocab_file (`str`): 113 | Path to the vocabulary file. 
114 |         merges_file (`str`):
115 |             Path to the merges file.
116 |         errors (`str`, *optional*, defaults to `"replace"`):
117 |             Paradigm to follow when decoding bytes to UTF-8. See
118 |             [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
119 |         unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
120 |             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
121 |             token instead.
122 |         bos_token (`str`, *optional*):
123 |             The beginning of sequence token. Not applicable for this tokenizer.
124 |         eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
125 |             The end of sequence token.
126 |         pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
127 |             The token used for padding, for example when batching sequences of different lengths.
128 |         clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
129 |             Whether or not the model should clean up the spaces that were added when splitting the input text during the
130 |             tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
131 |         split_special_tokens (`bool`, *optional*, defaults to `False`):
132 |             Whether or not the special tokens should be split during the tokenization process. The default behavior is
133 |             to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>")`
134 |             returns `['<|endoftext|>']`. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will give
135 |             `['<', '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
136 |     """
137 | 
138 |     vocab_files_names = VOCAB_FILES_NAMES
139 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
140 |     max_model_input_sizes = MAX_MODEL_INPUT_SIZES
141 |     model_input_names = ["input_ids", "attention_mask"]
142 | 
143 |     def __init__(
144 |         self,
145 |         vocab_file,
146 |         merges_file,
147 |         errors="replace",
148 |         unk_token="<|endoftext|>",
149 |         bos_token=None,
150 |         eos_token="<|endoftext|>",
151 |         pad_token="<|endoftext|>",
152 |         clean_up_tokenization_spaces=False,
153 |         split_special_tokens=False,
154 |         **kwargs,
155 |     ):
156 |         # Qwen vocab does not contain control tokens; added tokens need to be special
157 |         bos_token = (
158 |             AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
159 |             if isinstance(bos_token, str)
160 |             else bos_token
161 |         )
162 |         eos_token = (
163 |             AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
164 |             if isinstance(eos_token, str)
165 |             else eos_token
166 |         )
167 |         unk_token = (
168 |             AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
169 |             if isinstance(unk_token, str)
170 |             else unk_token
171 |         )
172 |         pad_token = (
173 |             AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
174 |             if isinstance(pad_token, str)
175 |             else pad_token
176 |         )
177 | 
178 |         with open(vocab_file, encoding="utf-8") as vocab_handle:
179 |             self.encoder = json.load(vocab_handle)
180 |         self.decoder = {v: k for k, v in self.encoder.items()}
181 |         self.errors = errors  # how to handle errors in decoding
182 |         self.byte_encoder = bytes_to_unicode()
183 |         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
184 |         bpe_merges = []
185 |         if merges_file:
186 |             with open(merges_file, encoding="utf-8") as merges_handle:
187 |                 for line in merges_handle:
188 |                     line = line.strip()
189 |                     if not line or line.startswith("#"):
190 |                         continue
191 |                     bpe_merges.append(tuple(line.split()))
192 |         self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
193 |         # NOTE: the cache can grow without bound and will get really large for long running processes
194 |         # (esp. for texts in languages that do not use spaces between words, e.g. Chinese); technically
195 |         # not a memory leak but appears as one.
196 |         # GPT2Tokenizer has the same problem, so let's be consistent.
197 |         self.cache = {}
198 | 
199 |         self.pat = re.compile(PRETOKENIZE_REGEX)
200 | 
201 |         if kwargs.get("add_prefix_space", False):
202 |             logger.warning_once(
203 |                 f"{self.__class__.__name__} does not support `add_prefix_space`, setting it to True has no effect."
204 |             )
205 | 
206 |         super().__init__(
207 |             errors=errors,
208 |             bos_token=bos_token,
209 |             eos_token=eos_token,
210 |             pad_token=pad_token,
211 |             unk_token=unk_token,
212 |             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
213 |             split_special_tokens=split_special_tokens,
214 |             **kwargs,
215 |         )
216 | 
217 |     @property
218 |     def vocab_size(self) -> int:
219 |         return len(self.encoder)
220 | 
221 |     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
222 |     def get_vocab(self):
223 |         return dict(self.encoder, **self.added_tokens_encoder)
224 | 
225 |     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
226 |     def bpe(self, token):
227 |         if token in self.cache:
228 |             return self.cache[token]
229 |         word = tuple(token)
230 |         pairs = get_pairs(word)
231 | 
232 |         if not pairs:
233 |             return token
234 | 
235 |         while True:
236 |             bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
237 |             if bigram not in self.bpe_ranks:
238 |                 break
239 |             first, second = bigram
240 |             new_word = []
241 |             i = 0
242 |             while i < len(word):
243 |                 try:
244 |                     j = word.index(first, i)
245 |                 except ValueError:
246 |                     new_word.extend(word[i:])
247 |                     break
248 |                 else:
249 |                     new_word.extend(word[i:j])
250 |                     i = j
251 | 
252 |                 if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
253 |                     new_word.append(first + second)
254 |                     i += 2
255 |                 else:
256 |                     new_word.append(word[i])
257 |                     i += 1
258 |             new_word = tuple(new_word)
259 |             word = new_word
260 |             if len(word) == 1:
261 |                 break
262 |             else:
263 |                 pairs = get_pairs(word)
264 |         word = " ".join(word)
265 |         self.cache[token] = word
266 |         return word
267 | 
268 |     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
269 |     def _tokenize(self, text):
270 |         """Tokenize a string."""
271 |         bpe_tokens = []
272 |         for token in re.findall(self.pat, text):
273 |             token = "".join(
274 |                 self.byte_encoder[b] for b in token.encode("utf-8")
275 |             )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
276 |             bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
277 |         return bpe_tokens
278 | 
279 |     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
280 |     def _convert_token_to_id(self, token):
281 |         """Converts a token (str) to an id using the vocab."""
282 |         return self.encoder.get(token, self.encoder.get(self.unk_token))
283 | 
284 |     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
285 |     def _convert_id_to_token(self, index):
286 |         """Converts an index (integer) to a token (str) using the vocab."""
287 |         return self.decoder.get(index)
288 | 
289 |     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
290 |     def convert_tokens_to_string(self, tokens):
291 |         """Converts a sequence of tokens (string) into a single string."""
292 |         text = "".join(tokens)
293 |         text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
294 |         return text
295 | 
296 |     def decode(
297 |         self,
298 |         token_ids,
299 |         skip_special_tokens: bool = False,
300 |         clean_up_tokenization_spaces: Optional[bool] = False,
301 |         spaces_between_special_tokens: bool = False,
302 |         **kwargs,
303 |     ) -> str:
304 |         # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
305 |         # and cannot be configured elsewhere, but it should default to False for Qwen2Tokenizer
306 |         return super().decode(
307 |             token_ids,
308 |             skip_special_tokens=skip_special_tokens,
309 |             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
310 |             spaces_between_special_tokens=spaces_between_special_tokens,
311 |             **kwargs,
312 |         )
313 | 
314 |     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
315 |     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
316 |         if not os.path.isdir(save_directory):
317 |             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
318 |             return
319 |         vocab_file = os.path.join(
320 |             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
321 |         )
322 |         merge_file = os.path.join(
323 |             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
324 |         )
325 | 
326 |         with open(vocab_file, "w", encoding="utf-8") as f:
327 |             f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
328 | 
329 |         index = 0
330 |         with open(merge_file, "w", encoding="utf-8") as writer:
331 |             writer.write("#version: 0.2\n")
332 |             for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
333 |                 if index != token_index:
334 |                     logger.warning(
335 |                         f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
336 |                         " Please check that the tokenizer is not corrupted!"
337 |                     )
338 |                     index = token_index
339 |                 writer.write(" ".join(bpe_tokens) + "\n")
340 |                 index += 1
341 | 
342 |         return vocab_file, merge_file
343 | 
344 |     def prepare_for_tokenization(self, text, **kwargs):
345 |         text = unicodedata.normalize("NFC", text)
346 |         return (text, kwargs)
347 | 
348 | 
349 | def save_json(jsons, json_path, indent=4):
350 |     """
351 |     Save a JSON-serializable object to a file.
352 |     Args:
353 |         jsons[Json]: input data to serialize, e.g. [{"桂林": 132}]
354 |         json_path[String]: path of the output file, e.g. "corpus/xuexiqiangguo.lib"
355 |         indent[int]: pretty-print with this indent level, e.g. 4
356 |     Returns:
357 |         None
358 |     """
359 |     with open(json_path, "w", encoding="utf-8") as fj:
360 |         fj.write(json.dumps(jsons, ensure_ascii=False, indent=indent))
361 | 
362 | 
363 | def load_json(path, parse_int=None):
364 |     """
365 |     Load JSON data from a file.
366 |     Args:
367 |         path[String]: path of the file to load, e.g. "corpus/xuexiqiangguo.lib"
368 |         parse_int[Callable]: optional function applied to every JSON integer string, e.g. int
369 |     Returns:
370 |         data[Any]
371 |     """
372 |     with open(path, mode="r", encoding="utf-8") as fj:
373 |         model_json = json.load(fj, parse_int=parse_int)
374 |     return model_json
375 | 
376 | 
377 | if __name__ == '__main__':
378 | 
379 |     path = "E:/DATA/bert-model/00_pytorch/LLM/Qwen_Qwen1.5-0.5B-Chat"
380 |     tokenizer = Qwen2Tokenizer.from_pretrained(path)
381 |     # tokenizer.save_vocabulary()
382 |     vocab_dict = tokenizer.get_vocab()
383 |     save_json(vocab_dict, "qwen1.5_vocab_dict.json")
384 | 
385 |     prompt = "Give me a short introduction to large language model."
386 |     messages = [
387 |         {"role": "system", "content": "You are a helpful assistant."},
388 |         {"role": "user", "content": prompt}
389 |     ]
390 |     tokens11 = tokenizer.apply_chat_template(
391 |         messages,
392 |         tokenize=False,
393 |         add_generation_prompt=True
394 |     )
395 |     print(tokens11)
396 | 
397 |     tokens = tokenizer.tokenize(tokens11)
398 |     print(tokens)
399 |     tokens = tokenizer("Hello world")["input_ids"]
400 |     print(tokens)
401 | 
402 |     # compare against the upstream implementation shipped with `transformers`
403 |     from transformers import Qwen2Tokenizer as HFQwen2Tokenizer
404 | 
405 |     path = "E:/DATA/bert-model/00_pytorch/LLM/Qwen_Qwen1.5-14B-Chat"
406 |     tokenizer2 = HFQwen2Tokenizer.from_pretrained(path)
407 |     tokens = tokenizer2("Hello world")["input_ids"]
408 |     print(tokens)
409 |     tokens = tokenizer2.tokenize(tokens11)
410 |     print(tokens)
411 | 
412 |     """
413 |     "<|endoftext|>": 151643,
414 |     "<|im_start|>": 151644,
415 |     "<|im_end|>": 151645
416 | 
417 |     [151644, 82, 88, 82, 83, 68, 76, 198, 56, 78, 84, 220, 64, 81, 68, 220, 64, 220, 71, 68, 75, 79, 69, 84, 75, 220, 64, 82, 82, 72, 82, 83, 64, 77, 83, 13, 151645, 198, 151644, 84, 82, 68, 81, 198, 151645, 198, 151644, 64, 82, 82, 72, 82, 83, 64, 77, 83, 198]
418 |     """
419 | 
-------------------------------------------------------------------------------- /qwen2_sft/models/qwen2/tokenization_qwen2_fast.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for Qwen2.""" 16 | 17 | from typing import Optional, Tuple 18 | 19 | from transformers.tokenization_utils import AddedToken 20 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast 21 | from transformers.utils import logging 22 | 23 | from qwen2_sft.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer 24 | 25 | 26 | logger = logging.get_logger(__name__) 27 | 28 | VOCAB_FILES_NAMES = { 29 | "vocab_file": "vocab.json", 30 | "merges_file": "merges.txt", 31 | "tokenizer_file": "tokenizer.json", 32 | } 33 | 34 | PRETRAINED_VOCAB_FILES_MAP = { 35 | "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"}, 36 | "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"}, 37 | "tokenizer_file": { 38 | "qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/tokenizer.json" 39 | }, 40 | } 41 | 42 | MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} 43 | 44 | 45 | class Qwen2TokenizerFast(PreTrainedTokenizerFast): 46 | """ 47 | Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level 48 | Byte-Pair-Encoding. 49 | 50 | Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will 51 | be encoded differently whether it is at the beginning of the sentence (without space) or not: 52 | 53 | ```python 54 | >>> from transformers import Qwen2TokenizerFast 55 | 56 | >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer") 57 | >>> tokenizer("Hello world")["input_ids"] 58 | [9707, 1879] 59 | 60 | >>> tokenizer(" Hello world")["input_ids"] 61 | [21927, 1879] 62 | ``` 63 | This is expected. 64 | 65 | This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should 66 | refer to this superclass for more information regarding those methods. 67 | 68 | Args: 69 | vocab_file (`str`, *optional*): 70 | Path to the vocabulary file. 71 | merges_file (`str`, *optional*): 72 | Path to the merges file. 73 | tokenizer_file (`str`, *optional*): 74 | Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that 75 | contains everything needed to load the tokenizer. 76 | unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 77 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 78 | token instead. Not applicable to this tokenizer. 79 | bos_token (`str`, *optional*): 80 | The beginning of sequence token. Not applicable for this tokenizer. 81 | eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 82 | The end of sequence token. 83 | pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 84 | The token used for padding, for example when batching sequences of different lengths. 
85 | """ 86 | 87 | vocab_files_names = VOCAB_FILES_NAMES 88 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 89 | max_model_input_sizes = MAX_MODEL_INPUT_SIZES 90 | model_input_names = ["input_ids", "attention_mask"] 91 | slow_tokenizer_class = Qwen2Tokenizer 92 | 93 | def __init__( 94 | self, 95 | vocab_file=None, 96 | merges_file=None, 97 | tokenizer_file=None, 98 | unk_token="<|endoftext|>", 99 | bos_token=None, 100 | eos_token="<|endoftext|>", 101 | pad_token="<|endoftext|>", 102 | **kwargs, 103 | ): 104 | # We need to at least pass vocab_file and merges_file to base class 105 | # in case a slow tokenizer needs to be initialized; other can be 106 | # configured through files. 107 | # following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token 108 | 109 | bos_token = ( 110 | AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False) 111 | if isinstance(bos_token, str) 112 | else bos_token 113 | ) 114 | eos_token = ( 115 | AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False) 116 | if isinstance(eos_token, str) 117 | else eos_token 118 | ) 119 | unk_token = ( 120 | AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False) 121 | if isinstance(unk_token, str) 122 | else unk_token 123 | ) 124 | pad_token = ( 125 | AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False) 126 | if isinstance(pad_token, str) 127 | else pad_token 128 | ) 129 | 130 | super().__init__( 131 | vocab_file, 132 | merges_file, 133 | tokenizer_file=tokenizer_file, 134 | unk_token=unk_token, 135 | bos_token=bos_token, 136 | eos_token=eos_token, 137 | pad_token=pad_token, 138 | **kwargs, 139 | ) 140 | 141 | # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary 142 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: 143 | files = self._tokenizer.model.save(save_directory, name=filename_prefix) 144 | return tuple(files) -------------------------------------------------------------------------------- /qwen2_sft/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.37.0 2 | torch>=1.13.1 3 | 4 | safetensors>=0.4.1 5 | accelerate==0.27.1 6 | fsspec==2023.9.2 7 | tensorboardX==2.4 8 | rouge==1.0.1 9 | nltk==3.6.6 10 | numpy 11 | tqdm --------------------------------------------------------------------------------