├── .DS_Store
├── img
├── roadmap_engineer.png
├── roadmap_scientist.png
├── roadmap_fundamentals.png
└── colab.svg
├── 4_bit_LLM_Quantization_with_GPTQ.ipynb
├── LICENSE
├── README.md
├── Fine_tune_a_Mistral_7b_model_with_DPO.ipynb
└── Mergekit.ipynb
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/friendmine/llm-course-chn/HEAD/.DS_Store
--------------------------------------------------------------------------------
/img/roadmap_engineer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/friendmine/llm-course-chn/HEAD/img/roadmap_engineer.png
--------------------------------------------------------------------------------
/img/roadmap_scientist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/friendmine/llm-course-chn/HEAD/img/roadmap_scientist.png
--------------------------------------------------------------------------------
/img/roadmap_fundamentals.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/friendmine/llm-course-chn/HEAD/img/roadmap_fundamentals.png
--------------------------------------------------------------------------------
/img/colab.svg:
--------------------------------------------------------------------------------
1 | Open in Colab Open in Colab
2 |
--------------------------------------------------------------------------------
/4_bit_LLM_Quantization_with_GPTQ.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "view-in-github"
8 | },
9 | "source": [
10 | " "
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "# 使用GPTQ进行LLM 4-bit 量化\n",
18 | "> 🗣️ [大型语言模型课程](https://github.com/mlabonne/llm-course)\n",
19 | "\n",
20 | "❤️ 由[@maximelabonne](https://twitter.com/maximelabonne)创建。\n",
21 | "\n",
22 | "与笔记配合的是此文章中的代码:https://mlabonne.github.io/blog/4bit_quantization/"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {
29 | "id": "BhufqqQAaz6e"
30 | },
31 | "outputs": [],
32 | "source": [
33 | "!BUILD_CUDA_EXT=0 pip install -q auto-gptq transformers"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {
40 | "id": "dg8NyBL0ZNyw"
41 | },
42 | "outputs": [],
43 | "source": [
44 | "import random\n",
45 | "\n",
46 | "from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig\n",
47 | "from datasets import load_dataset\n",
48 | "import torch\n",
49 | "from transformers import AutoTokenizer\n",
50 | "\n",
51 | "\n",
52 | "# Define base model and output directory\n",
53 | "model_id = \"gpt2\"\n",
54 | "out_dir = model_id + \"-GPTQ\""
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {
61 | "id": "C9352jN0ZP6I"
62 | },
63 | "outputs": [],
64 | "source": [
65 | "# Load quantize config, model and tokenizer\n",
66 | "quantize_config = BaseQuantizeConfig(\n",
67 | " bits=4,\n",
68 | " group_size=128,\n",
69 | " damp_percent=0.01,\n",
70 | " desc_act=False,\n",
71 | ")\n",
72 | "model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config)\n",
73 | "tokenizer = AutoTokenizer.from_pretrained(model_id)"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {
80 | "colab": {
81 | "base_uri": "https://localhost:8080/"
82 | },
83 | "id": "6wuBLe6aZSe-",
84 | "outputId": "e4ebd71a-2854-4347-cebe-08cf040d1eb6"
85 | },
86 | "outputs": [
87 | {
88 | "name": "stderr",
89 | "output_type": "stream",
90 | "text": [
91 | "WARNING:datasets.builder:Found cached dataset json (/root/.cache/huggingface/datasets/allenai___json/allenai--c4-6e494e9c0ee1404e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)\n",
92 | "Token indices sequence length is longer than the specified maximum sequence length for this model (2441065 > 1024). Running this sequence through the model will result in indexing errors\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "# Load data and tokenize examples\n",
98 | "n_samples = 1024\n",
99 | "data = load_dataset(\"allenai/c4\", data_files=\"en/c4-train.00001-of-01024.json.gz\", split=f\"train[:{n_samples*5}]\")\n",
100 | "tokenized_data = tokenizer(\"\\n\\n\".join(data['text']), return_tensors='pt')\n",
101 | "\n",
102 | "# Format tokenized examples\n",
103 | "examples_ids = []\n",
104 | "for _ in range(n_samples):\n",
105 | " i = random.randint(0, tokenized_data.input_ids.shape[1] - tokenizer.model_max_length - 1)\n",
106 | " j = i + tokenizer.model_max_length\n",
107 | " input_ids = tokenized_data.input_ids[:, i:j]\n",
108 | " attention_mask = torch.ones_like(input_ids)\n",
109 | " examples_ids.append({'input_ids': input_ids, 'attention_mask': attention_mask})"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {
116 | "colab": {
117 | "base_uri": "https://localhost:8080/"
118 | },
119 | "id": "ETsG2iYrXaUg",
120 | "outputId": "e48b825e-0ebc-4a73-dbfd-b5571cafd24e"
121 | },
122 | "outputs": [
123 | {
124 | "name": "stdout",
125 | "output_type": "stream",
126 | "text": [
127 | "CPU times: user 4min 35s, sys: 3.49 s, total: 4min 39s\n",
128 | "Wall time: 5min 8s\n"
129 | ]
130 | },
131 | {
132 | "data": {
133 | "text/plain": [
134 | "('gpt2-GPTQ/tokenizer_config.json',\n",
135 | " 'gpt2-GPTQ/special_tokens_map.json',\n",
136 | " 'gpt2-GPTQ/vocab.json',\n",
137 | " 'gpt2-GPTQ/merges.txt',\n",
138 | " 'gpt2-GPTQ/added_tokens.json',\n",
139 | " 'gpt2-GPTQ/tokenizer.json')"
140 | ]
141 | },
142 | "execution_count": 5,
143 | "metadata": {},
144 | "output_type": "execute_result"
145 | }
146 | ],
147 | "source": [
148 | "%%time\n",
149 | "\n",
150 | "# Quantize with GPTQ\n",
151 | "model.quantize(\n",
152 | " examples_ids,\n",
153 | " batch_size=1,\n",
154 | " use_triton=True,\n",
155 | ")\n",
156 | "\n",
157 | "# Save model and tokenizer\n",
158 | "model.save_quantized(out_dir, use_safetensors=True)\n",
159 | "tokenizer.save_pretrained(out_dir)"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {
166 | "colab": {
167 | "base_uri": "https://localhost:8080/"
168 | },
169 | "id": "nktu1FsdZ9sd",
170 | "outputId": "9943c829-1b58-474a-f245-6aefa09d81dc"
171 | },
172 | "outputs": [
173 | {
174 | "name": "stderr",
175 | "output_type": "stream",
176 | "text": [
177 | "WARNING:accelerate.utils.modeling:The safetensors archive passed at gpt2-GPTQ/gptq_model-4bit-128g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.\n",
178 | "WARNING:auto_gptq.modeling._base:GPT2GPTQForCausalLM hasn't fused attention module yet, will skip inject fused attention.\n",
179 | "WARNING:auto_gptq.modeling._base:GPT2GPTQForCausalLM hasn't fused mlp module yet, will skip inject fused mlp.\n"
180 | ]
181 | }
182 | ],
183 | "source": [
184 | "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
185 | "\n",
186 | "# Reload model and tokenizer\n",
187 | "model = AutoGPTQForCausalLM.from_quantized(\n",
188 | " out_dir,\n",
189 | " device=device,\n",
190 | " use_triton=True,\n",
191 | " use_safetensors=True,\n",
192 | ")\n",
193 | "tokenizer = AutoTokenizer.from_pretrained(out_dir)"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {
200 | "colab": {
201 | "base_uri": "https://localhost:8080/"
202 | },
203 | "id": "cRhIGrXdiFdt",
204 | "outputId": "6dca2078-6f01-44da-9895-3a03bdfb4b5b"
205 | },
206 | "outputs": [
207 | {
208 | "name": "stderr",
209 | "output_type": "stream",
210 | "text": [
211 | "The model 'GPT2GPTQForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RobertaPreLayerNormForCausalLM', 'RoCBertForCausalLM', 'RoFormerForCausalLM', 'RwkvForCausalLM', 'Speech2Text2ForCausalLM', 'TransfoXLLMHeadModel', 'TrOCRForCausalLM', 'XGLMForCausalLM', 'XLMWithLMHeadModel', 'XLMProphetNetForCausalLM', 'XLMRobertaForCausalLM', 'XLMRobertaXLForCausalLM', 'XLNetLMHeadModel', 'XmodForCausalLM'].\n",
212 | "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
213 | ]
214 | },
215 | {
216 | "name": "stdout",
217 | "output_type": "stream",
218 | "text": [
219 | "I have a dream,\" she told CNN last week. \"I have this dream of helping my mother find her own. But, to tell that for the first time, now that I'm seeing my mother now, just knowing how wonderful it is that\n"
220 | ]
221 | }
222 | ],
223 | "source": [
224 | "from transformers import pipeline\n",
225 | "\n",
226 | "generator = pipeline('text-generation', model=model, tokenizer=tokenizer)\n",
227 | "result = generator(\"I have a dream\", do_sample=True, max_length=50)[0]['generated_text']\n",
228 | "print(result)"
229 | ]
230 | }
231 | ],
232 | "metadata": {
233 | "accelerator": "GPU",
234 | "colab": {
235 | "authorship_tag": "ABX9TyOS2QEuJ1BDI/3IFsLsFIZo",
236 | "gpuType": "T4",
237 | "include_colab_link": true,
238 | "provenance": []
239 | },
240 | "kernelspec": {
241 | "display_name": "Python 3",
242 | "name": "python3"
243 | },
244 | "language_info": {
245 | "name": "python"
246 | }
247 | },
248 | "nbformat": 4,
249 | "nbformat_minor": 0
250 | }
251 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # llm-course-chn
2 |
3 | # 许可
4 | ```
5 | 向mlabonne申请了中文的翻译许可!
6 | https://github.com/mlabonne/llm-course/issues/57
7 | ```
8 |
9 |
19 |
20 |
21 |
22 | 大语言模型课程分为三个部分:
23 |
24 | 1. 🧩 **LLM基础** 涵盖关于数学、Python和神经网络的基础知识。
25 | 2. 🧑🔬 **LLM科学家** 侧重于使用最新技术构建最佳的LLM。
26 | 3. 👷 **LLM工程师** 侧重于创建基于LLM的应用程序并部署它们。
27 |
28 | ## 📝 笔记本
29 |
30 | 与大语言模型相关的笔记本和文章列表。
31 |
32 | ### 工具
33 |
34 | | 笔记本 | 描述 | 笔记本 |
35 | |----------|-------------|----------|
36 | | 🧐 [LLM自动评估](https://github.com/mlabonne/llm-autoeval) | 使用RunPod自动评估您的LLM | |
37 | | 🥱 LazyMergekit | 使用mergekit一键轻松合并模型。 | |
38 | | 🦎 LazyAxolotl | 一键在云上微调模型。 | |
39 | | ⚡ AutoGGUF | 一键将LLM量化为GGUF格式。 | |
40 | | 🌳 模型家族树 | 可视化合并模型的家族树。 | |
41 |
42 |
43 |
44 | ### 微调
45 |
46 | | 笔记本 | 描述 | 文章 | 笔记本 |
47 | |---------------------------------------|-------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------|
48 | | 在Google Colab中微调Llama 2 | 一步一步指导如何微调您的第一个Llama 2模型。 | [文章](https://mlabonne.github.io/blog/posts/Fine_Tune_Your_Own_Llama_2_Model_in_a_Colab_Notebook.html) | |
49 | | 使用Axolotl微调LLM | 微调的最新工具的端到端指南。 | [文章](https://mlabonne.github.io/blog/posts/A_Beginners_Guide_to_LLM_Finetuning.html) | |
50 | | 使用DPO微调Mistral-7b | 通过DPO提升监督微调模型的性能。 | [文章](https://medium.com/towards-data-science/fine-tune-a-mistral-7b-model-with-direct-preference-optimization-708042745aac) | |
51 |
52 |
53 | ### 量化
54 |
55 | | 笔记本 | 描述 | 文章 | 笔记本 |
56 | |---------------------------------------|-------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------|
57 | | 1. 量化入门 | 使用8位量化优化大型语言模型。 | [文章](https://mlabonne.github.io/blog/posts/Introduction_to_Weight_Quantization.html) | |
58 | | 2. 使用GPTQ进行4位量化 | 将您自己的开源LLM量化,以便在消费硬件上运行。 | [文章](https://mlabonne.github.io/blog/4bit_quantization/) | |
59 | | 3. 使用GGUF和llama.cpp量化 | 使用llama.cpp量化Llama 2模型,并将GGUF版本上传到HF Hub。 | [文章](https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html) | |
60 | | 4. ExLlamaV2:最快的运行LLM库 | 量化并运行EXL2模型,并将它们上传到HF Hub。 | [文章](https://mlabonne.github.io/blog/posts/ExLlamaV2_The_Fastest_Library_to_Run%C2%A0LLMs.html) | |
61 |
62 | ### 其他
63 |
64 | | 笔记本 | 描述 | 文章 | 笔记本 |
65 | |---------------------------------------|-------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------|
66 | | 大型语言模型中的解码策略 | 从束搜索到核心抽样的文本生成指南 | [文章](https://mlabonne.github.io/blog/posts/2022-06-07-Decoding_strategies.html) | |
67 | | 可视化GPT-2的损失 | 基于权重扰动的损失3D图。 | [推文](https://twitter.com/maximelabonne/status/1667618081844219904) | |
68 | | 用知识图谱改善ChatGPT | 用知识图谱增强ChatGPT的回答。 | [文章](https://mlabonne.github.io/blog/posts/Article_Improve_ChatGPT_with_Knowledge_Graphs.html) | |
69 | | 使用mergekit合并LLM | 轻松创建您自己的模型,无需GPU! | [文章](https://towardsdatascience.com/merge-large-language-models-with-mergekit-2118fb392b54) | |
70 |
71 |
72 | ## 🧩 LLM基础
73 |
74 | 
75 |
76 | ### 1. 机器学习的数学
77 |
78 | 在掌握机器学习之前,了解驱动这些算法的基本数学概念非常重要。
79 |
80 | - **线性代数**:这对于理解许多算法尤其是在深度学习中使用的算法至关重要。关键概念包括向量、矩阵、行列式、特征值和特征向量、向量空间和线性变换。
81 | - **微积分**:许多机器学习算法涉及连续函数的优化,这需要理解导数、积分、极限和级数。多变量微积分和梯度概念也很重要。
82 | - **概率与统计**:这些对于理解模型如何从数据中学习并做出预测至关重要。关键概念包括概率论、随机变量、概率分布、期望、方差、协方差、相关性、假设检验、置信区间、最大似然估计和贝叶斯推断。
83 |
84 | 📚 资源:
85 |
86 | - [3Blue1Brown - 线性代数的精髓](https://www.youtube.com/watch?v=fNk_zzaMoSs&list=PLZHQObOWTQDPD3MizzM2xVFitgF8hE_ab):一系列视频,提供了这些概念的几何直观感受。
87 | - [StatQuest with Josh Starmer - 统计学基础](https://www.youtube.com/watch?v=qBigTkBLU6g&list=PLblh5JKOoLUK0FLuzwntyYI10UQFUhsY9):为许多统计概念提供简单明了的解释。
88 | - [AP Statistics Intuition by Ms Aerin](https://automata88.medium.com/list/cacc224d5e7d):Medium文章列表,提供了每个概率分布背后的直觉理解。
89 | - [沉浸式线性代数](https://immersivemath.com/ila/learnmore.html):线性代数的另一种视觉解读。
90 | - [Khan Academy - 线性代数](https://www.khanacademy.org/math/linear-algebra):非常适合初学者,以非常直观的方式解释概念。
91 | - [Khan Academy - 微积分](https://www.khanacademy.org/math/calculus-1):一个涵盖所有基础微积分概念的互动课程。
92 | - [Khan Academy - 概率与统计](https://www.khanacademy.org/math/statistics-probability):以易于理解的格式提供材料。
93 |
94 | ---
95 |
96 | ### 2. 机器学习的Python
97 |
98 | Python是一种功能强大且灵活的编程语言,特别适用于机器学习,这得益于其可读性、一致性和健壮的数据科学库生态系统。
99 |
100 | - **Python基础**:Python编程需要理解基本语法、数据类型、错误处理和面向对象编程。
101 | - **数据科学库**:熟悉NumPy进行数值操作,用Pandas进行数据处理和分析,用Matplotlib和Seaborn做数据可视化。
102 | - **数据预处理**:涉及特征缩放和规范化、处理缺失数据、异常检测、类别数据编码,以及将数据分割为训练、验证和测试集。
103 | - **机器学习库**:熟练使用Scikit-learn,这是一个提供广泛的监督和无监督学习算法的python库,至关重要。理解如何实现线性回归、逻辑回归、决策树、随机森林、最近邻(K-NN)和K均值聚类等算法很重要。降维技术如PCA和t-SNE对于可视化高维数据也很有帮助。
104 |
105 | 📚 资源:
106 |
107 | - [Real Python](https://realpython.com/):一个全面的资源,提供针对Python初学者和高级概念的文章和教程。
108 | - [freeCodeCamp - 学习Python](https://www.youtube.com/watch?v=rfscVS0vtbw):提供Python所有核心概念全面介绍的长视频。
109 | - [Python数据科学手册](https://jakevdp.github.io/PythonDataScienceHandbook/):一个很好的学习pandas、NumPy、Matplotlib和Seaborn的免费数字书。
110 | - [freeCodeCamp - 机器学习入门](https://youtu.be/i_LwzRVP7bg):为初学者提供不同机器学习算法的实用介绍。
111 | - [Udacity - 机器学习入门](https://www.udacity.com/course/intro-to-machine-learning--ud120):一个免费课程,涵盖了PCA和其他几个机器学习概
112 |
113 |
114 | ### 3. 神经网络
115 |
116 | 神经网络是许多机器学习模型的基础部分,特别是在深度学习领域。为了有效地利用它们,对它们的设计和机制有一个全面的理解是必要的。
117 |
118 | - **基础知识**:这包括了解神经网络的结构,如层、权重、偏差和激活函数(sigmoid、tanh、ReLU等)。
119 | - **训练和优化**:熟悉反向传播和不同类型的损失函数,如均方误差(MSE)和交叉熵。理解各种优化算法,如梯度下降、随机梯度下降、RMSprop和Adam。
120 | - **过拟合**:理解过拟合的概念(模型在训练数据上表现良好但在未见数据上表现不佳)并学习各种正则化技术(dropout、L1/L2正则化、提前停止、数据增强)来防止它。
121 | - **实现多层感知机(MLP)**:使用PyTorch构建一个MLP,也称为全连接网络。
122 |
123 | 📚 资源:
124 |
125 | - [3Blue1Brown - 神经网络是什么?](https://www.youtube.com/watch?v=aircAruvnKk):这个视频直观地解释了神经网络及其内部工作原理。
126 | - [freeCodeCamp - 深度学习速成课](https://www.youtube.com/watch?v=VyWAvY2CF9c):这个视频高效地介绍了深度学习中所有最重要的概念。
127 | - [Fast.ai - 实用深度学习](https://course.fast.ai/):为有编程经验想学习深度学习的人设计的免费课程。
128 | - [Patrick Loeber - PyTorch教程](https://www.youtube.com/playlist?list=PLqnslRFeH2UrcDBWF5mfPGpqQDSta6VK4):一系列视频,让完全的初学者学习PyTorch。
129 |
130 | ---
131 |
132 | ### 4. 自然语言处理(NLP)
133 |
134 | NLP是人工智能的一个迷人分支,它桥接了人类语言与机器理解之间的差距。从简单的文本处理到理解语言细微差别,NLP在许多应用中发挥着关键作用,如翻译、情感分析、聊天机器人等等。
135 |
136 | - **文本预处理**:学习各种文本预处理步骤,如分词(将文本分割成单词或句子)、词干提取(将词汇还原到其根形式)、词形还原(类似于词干提取但考虑上下文)、停用词移除等。
137 | - **特征提取技术**:熟悉将文本数据转换为机器学习算法能理解的格式的技术。关键方法包括词袋(BoW)、词频-逆文档频率(TF-IDF)和n-gram。
138 | - **词嵌入**:词嵌入是一种词表示方式,允许意义相近的词有相似的表示。关键方法包括Word2Vec、GloVe和FastText。
139 | - **循环神经网络(RNNs)**:理解RNN的工作原理,RNN是一种设计用来处理序列数据的神经网络。探索LSTM和GRU,这两种RNN变体能够学习长期依赖。
140 |
141 | 📚 资源:
142 |
143 | - [RealPython - 使用spaCy进行Python自然语言处理](https://realpython.com/natural-language-processing-spacy-python/):关于Python中spaCy库进行NLP任务的详尽指南。
144 | - [Kaggle - NLP指南](https://www.kaggle.com/learn-guide/natural-language-processing):一些笔记本和资源,用于Python中NLP的实践解释。
145 | - [Jay Alammar - Word2Vec图解](https://jalammar.github.io/illustrated-word2vec/):理解著名的Word2Vec架构的好参考。
146 | - [Jake Tae - 从零开始的PyTorch RNN](https://jaketae.github.io/study/pytorch-rnn/):在PyTorch中实践和简单实现RNN、LSTM和GRU模型。
147 | - [colah的博客 - 理解LSTM网络](https://colah.github.io/posts/2015-08-Understanding-LSTMs/):关于LSTM网络的更理论化文章。
148 |
149 | ## 🧑🔬 LLM科学家
150 |
151 | 本课程部分侧重于学习如何使用最新技术构建最佳的LLM。
152 |
153 | 
154 |
155 | ### 1. LLM架构
156 |
157 | 虽然不需要深入了解Transformer架构,但重要的是要很好地理解它的输入(令牌)和输出(logits)。原始注意力机制是另一个需要掌握的关键组成部分,因为后续会介绍它的改进版本。
158 |
159 | * **高层视图**:回顾编码器-解码器Transformer架构,更具体地说是在每个现代LLM中使用的仅解码器GPT架构。
160 | * **令牌化**:理解如何将原始文本数据转换成模型可以理解的格式,这涉及将文本分割成令牌(通常是单词或子词)。
161 | * **注意力机制**:掌握注意力机制背后的理论,包括自注意力和缩放点积注意力,这使得模型在产生输出时能够关注输入的不同部分。
162 | * **文本生成**:了解模型可以生成输出序列的不同方式。常见策略包括贪婪解码、束搜索、Top-k采样和核采样。
163 |
164 | 📚 **参考资料**:
165 | - [插图式Transformer](https://jalammar.github.io/illustrated-transformer/) 作者Jay Alammar:对Transformer模型的直观和视觉解释。
166 | - [插图式GPT-2](https://jalammar.github.io/illustrated-gpt2/) 作者Jay Alammar:比前一篇文章更重要,它专注于GPT架构,与Llama非常相似。
167 | - [LLM可视化](https://bbycroft.net/llm) 作者Brendan Bycroft:令人难以置信的LLM内部3D可视化。
168 | * [nanoGPT](https://www.youtube.com/watch?v=kCc8FmEb1nY) 作者Andrej Karpathy:一个2小时长的YouTube视频,从头开始重新实现GPT(面向程序员)。
169 | * [注意力?注意力!](https://lilianweng.github.io/posts/2018-06-24-attention/) 作者Lilian Weng:以更正式的方式介绍注意力的需求。
170 | * [LLM中的解码策略](https://mlabonne.github.io/blog/posts/2023-06-07-Decoding_strategies.html):提供代码和直观介绍不同文本生成解码策略。
171 |
172 | ---
173 | ### 2. 构建指令数据集
174 |
175 | 虽然从维基百科和其他网站找到原始数据很容易,但在野外收集指令和答案的配对却很难。就像在传统机器学习中一样,数据集的质量将直接影响模型的质量,这就是为什么它可能是微调过程中最重要的组件。
176 |
177 | * **[Alpaca](https://crfm.stanford.edu/2023/03/13/alpaca.html)-样数据集**:使用OpenAI API(GPT)从头生成合成数据。您可以指定种子和系统提示来创建多样化的数据集。
178 | * **高级技术**:了解如何使用[Evol-Instruct](https://arxiv.org/abs/2304.12244)改进现有数据集,如何像[Orca](https://arxiv.org/abs/2306.02707)和[phi-1](https://arxiv.org/abs/2306.11644)论文中那样生成高质量合成数据。
179 | * **过滤数据**:传统技术包括使用正则表达式、删除近似重复项、专注于含有大量令牌的答案等。
180 | * **提示模板**:没有真正的标准格式来格式化指令和答案,这就是为什么了解不同的聊天模板很重要,例如[ChatML](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/chatgpt?tabs=python&pivots=programming-language-chat-ml)、[Alpaca](https://crfm.stanford.edu/2023/03/13/alpaca.html)等。
181 |
182 | 📚 **参考资料**:
183 | * [为指令调优准备数据集](https://wandb.ai/capecape/alpaca_ft/reports/How-to-Fine-Tune-an-LLM-Part-1-Preparing-a-Dataset-for-Instruction-Tuning--Vmlldzo1NTcxNzE2) 作者Thomas Capelle:探索Alpaca和Alpaca-GPT4数据集及其格式化方式。
184 | * [生成葡萄牙语临床指令数据集](https://medium.com/mlearning-ai/generating-a-clinical-instruction-dataset-in-portuguese-with-langchain-and-gpt-4-6ee9abfa41ae) 作者Solano Todeschini:教程,介绍如何使用GPT-4创建合成指令数据集。
185 | * [使用GPT 3.5进行新闻分类](https://medium.com/@kshitiz.sahay26/how-i-created-an-instruction-dataset-using-gpt-3-5-to-fine-tune-llama-2-for-news-classification-ed02fe41c81f) 作者Kshitiz Sahay:使用GPT 3.5创建指令数据集,以微调Llama 2进行新闻分类。
186 | * [微调LLM的数据集创建](https://colab.research.google.com/drive/1GH8PW9-zAe4cXEZyOIE-T9uHXblIldAg?usp=sharing):包含一些过滤数据集并上传结果的技术的笔记本。
187 | * [聊天模板](https://huggingface.co/blog/chat-templates) 作者Matthew Carrigan:Hugging Face关于提示模板的页面。
188 |
189 | ---
190 | ### 3. 预训练模型
191 |
192 | 预训练是一个非常漫长且成本高昂的过程,这就是为什么这不是本课程的重点。了解预训练期间发生的事情的某种程度是好的,但不需要亲手实践。
193 |
194 | * **数据管道**:预训练需要巨大的数据集(例如,[Llama 2](https://arxiv.org/abs/2307.09288)是在2万亿令牌上训练的)需要过滤、令牌化,并与预定义的词汇表协同。
195 | * **因果语言建模**:了解因果和掩码语言建模的区别,以及在这种情况下使用的损失函数。为了高效预训练,进一步了解[Megatron-LM](https://github.com/NVIDIA/Megatron-LM)或[gpt-neox](https://github.com/EleutherAI/gpt-neox)。
196 | * **规模定律**:[规模定律](https://arxiv.org/pdf/2001.08361.pdf)根据模型大小、数据集大小和用于训练的计算量描述了预期的模型性能。
197 | * **高性能计算**:这里不在讨论范围内,但如果你计划从头开始创建自己的LLM(硬件、分布式工作负载等),更多关于HPC的知识是基础。
198 |
199 | 📚 **参考资料**:
200 | * [LLMDataHub](https://github.com/Zjh-819/LLMDataHub) 作者Junhao Zhao:为预训练、微调和RLHF精心策划的数据集列表。
201 | * [从头开始预训练因果语言模型](https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt) 作者Hugging Face:使用transformers库从头开始预训练GPT-2模型。
202 | * [TinyLlama](https://github.com/jzhang38/TinyLlama) 作者Zhang等:查看此项目以深入了解如何从头开始训练Llama模型。
203 | * [因果语言建模](https://huggingface.co/docs/transformers/tasks/language_modeling) 作者Hugging Face:解释因果和掩码语言建模的区别,以及如何快速微调DistilGPT-2模型。
204 | * [Chinchilla的狂野含义](https://www.lesswrong.com/posts/6Fpvch8RR29qLEWNH/chinchilla-s-wild-implications) 作者nostalgebraist:讨论规模定律并解释它们对LLM通常意味着什么。
205 | * [BLOOM](https://bigscience.notion.site/BLOOM-BigScience-176B-Model-ad073ca07cdf479398d5f95d88e218c4) 作者BigScience:Notion页面描述了如何构建BLOOM模型,包含大量关于工程部分和遇到的问题的有用信息。
206 | * [OPT-175日志](https://github.com/facebookresearch/metaseq/blob/main/projects/OPT/chronicles/OPT175B_Logbook.pdf) 作者Meta:研究日志显示了什么出了问题,什么做得对。如果你计划预训练一个非常大的语言模型(在这种情况下,175B参数),这非常有用。
207 | * [LLM 360](https://www.llm360.ai/):一个开源LLM的框架,包含训练和数据准备代码、数据、指标和模型。
208 |
209 | ---
210 | ### 4. 监督式微调
211 |
212 | 预训练模型仅在下一个令牌预测任务上进行训练,这就是为什么它们不是有用的助手。SFT允许您调整它们以响应指令。此外,它允许您在任何数据上(私有的,GPT-4未见过的等)微调模型,并在不必支付OpenAI等API费用的情况下使用它。
213 |
214 | * **全面微调**:全面微调指的是训练模型中的所有参数。这不是一种高效的技术,但它产生略微更好的结果。
215 | * [**LoRA**](https://arxiv.org/abs/2106.09685):一种基于低秩适配器的参数高效技术(PEFT)。我们不是训练所有参数,而只训练这些适配器。
216 | * [**QLoRA**](https://arxiv.org/abs/2305.14314):另一种基于LoRA的PEFT,它还将模型的权重量化为4比特,并引入分页优化器来管理内存峰值。结合[Unsloth](https://github.com/unslothai/unsloth)在免费的Colab笔记本上高效运行。
217 | * **[Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl)**:一个用户友好且功能强大的微调工具,用于许多最先进的开源模型。
218 | * [**DeepSpeed**](https://www.deepspeed.ai/):高效的预训练和微调LLM,适用于多GPU和多节点设置(在Axolotl中实现)。
219 |
220 | 📚 **参考资料**:
221 | * [新手LLM训练指南](https://rentry.org/llm-training) 作者Alpin:微调LLM时考虑的主要概念和参数的概览。
222 | * [LoRA洞见](https://lightning.ai/pages/community/lora-insights/) 作者Sebastian Raschka:关于LoRA的实践见解以及如何选择最佳参数。
223 | * [微调你自己的Llama 2模型](https://mlabonne.github.io/blog/posts/Fine_Tune_Your_Own_Llama_2_Model_in_a_Colab_Notebook.html):手把手教程,介绍如何使用Hugging Face库微调Llama 2模型。
224 | * [为因果LLM填充训练示例](https://towardsdatascience.com/padding-large-language-models-examples-with-llama-2-199fb10df8ff) 作者Benjamin Marie:因果LLM填充训练示例的最佳实践
225 | * [LLM微调入门指南](https://mlabonne.github.io/blog/posts/A_Beginners_Guide_to_LLM_Finetuning.html):教程,介绍如何使用Axolotl微调CodeLlama模型。
226 |
227 | ---
228 |
229 | ### 5. 人类反馈的强化学习
230 |
231 | 在监督式微调之后,RLHF是用来将LLM的答案与人类期望对齐的一步。其思想是从人类(或人工)反馈中学习偏好,这可以用来减少偏见、审查模型或使其以更有用的方式行动。它比SFT更复杂,通常被视为可选的。
232 |
233 | * **偏好数据集**:这些数据集通常包含有某种排名的几个答案,这使得它们比指令数据集更难产出。
234 | * [**近端策略优化**](https://arxiv.org/abs/1707.06347):这种算法利用一个奖励模型,预测给定文本是否由人类高度评价。然后,这个预测被用来优化SFT模型,基于KL散度的惩罚。
235 | * **[直接偏好优化](https://arxiv.org/abs/2305.18290)**:DPO通过将问题重构为一个分类问题来简化过程。它使用一个参考模型而不是奖励模型(无需训练),只需要一个超参数,使其更稳定和高效。
236 |
237 | 📚 **参考资料**:
238 | * [使用RLHF训练LLM的介绍](https://wandb.ai/ayush-thakur/Intro-RLAIF/reports/An-Introduction-to-Training-LLMs-Using-Reinforcement-Learning-From-Human-Feedback-RLHF---VmlldzozMzYyNjcy) 作者Ayush Thakur:解释为什么使用RLHF减少偏见和提高LLM的性能是可取的。
239 | * [RLHF插图](https://huggingface.co/blog/rlhf) 作者Hugging Face:使用奖励模型训练和强化学习微调的RLHF介绍。
240 | * [StackLLaMA](https://huggingface.co/blog/stackllama) 作者Hugging Face:使用transformers库高效对齐LLaMA模型与RLHF的教程。
241 | * [LLM训练:RLHF及其替代方案](https://substack.com/profile/27393275-sebastian-raschka-phd) 作者Sebastian Rashcka:RLHF过程及其替代方案如RLAIF的概览。
242 | * [使用DPO微调Mistral-7b](https://huggingface.co/blog/dpo-trl):使用DPO微调Mistral-7b模型并复制[NeuralHermes-2.5](https://huggingface.co/mlabonne/NeuralHermes-2.5-Mistral-7B)的教程。
243 |
244 | ---
245 | ### 6. 评估
246 |
247 | 评估LLM是流程中一个被低估的部分,它既耗时又相对可靠。您的下游任务应该决定您想要评估什么,但总是记得好特德法则:“当一个度量成为目标时,它就不再是一个好的度量。”
248 |
249 | * **传统指标**:像困惑度和BLEU分数这样的指标因为在大多数情境下都存在缺陷而不再流行。了解它们并知道它们何时可以应用仍然很重要。
250 | * **通用基准**:基于[语言模型评估工具包](https://github.com/EleutherAI/lm-evaluation-harness),[Open LLM排行榜](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)是通用LLM(如ChatGPT)的主要基准。还有其他流行的基准,如[BigBench](https://github.com/google/BIG-bench)、[MT-Bench](https://arxiv.org/abs/2306.05685)等。
251 | * **任务特定基准**:如摘要、翻译和问答任务有专门的基准、指标,甚至子领域(医学、金融等),例如生物医学问答的[PubMedQA](https://pubmedqa.github.io/)。
252 | * **人类评估**:最可靠的评估是用户的接受率或人类进行的比较。如果您想知道模型是否表现良好,最简单但最确实的方式是自己使用它。
253 |
254 | 📚 **参考资料**:
255 | * [固定长度模型的困惑度](https://huggingface.co/docs/transformers/perplexity) 作者Hugging Face:使用transformers库实现困惑度的概览及代码。
256 | * [自担风险的BLEU](https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213) 作者Rachael Tatman:BLEU分数及其众多问题的概览及示例。
257 | * [LLM评估综述](https://arxiv.org/abs/2307.03109) 作者Chang等:关于评估什么、在哪里评估以及如何评估的综合论文。
258 | * [聊天机器人竞技场排行榜](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) 作者lmsys:基于人类比较的通用用途LLM的Elo评分。
259 |
260 | ---
261 | ### 7. 量化
262 |
263 | 量化是使用较低精度转换模型的权重(和激活)的过程。例如,使用16位存储的权重可以转换为4位表示。这项技术变得越来越重要,以减少与LLM相关的计算和内存成本。
264 |
265 | * **基础技术**:了解不同精度级别(FP32、FP16、INT8等)以及如何使用absmax和零点技术进行简单量化。
266 | * **GGUF和llama.cpp**:最初设计用于在CPU上运行,[llama.cpp](https://github.com/ggerganov/llama.cpp)和GGUF格式已成为在消费级硬件上运行LLM的最受欢迎的工具。
267 | * **GPTQ和EXL2**:[GPTQ](https://arxiv.org/abs/2210.17323)特别是[EXL2](https://github.com/turboderp/exllamav2)格式提供了令人难以置信的速度,但只能在GPU上运行。模型量化也需要很长时间。
268 | * **AWQ**:这种新格式比GPTQ更精确(困惑度更低),但使用的VRAM更多,不一定更快。
269 |
270 | 📚 **参考资料**:
271 | * [量化入门](https://mlabonne.github.io/blog/posts/Introduction_to_Weight_Quantization.html):量化的概览,absmax和零点量化,以及LLM.int8()的代码。
272 | * [使用llama.cpp量化Llama模型](https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html):使用llama.cpp和GGUF格式量化Llama 2模型的教程。
273 | * [使用GPTQ进行4位LLM量化](https://mlabonne.github.io/blog/posts/Introduction_to_Weight_Quantization.html):使用GPTQ算法和AutoGPTQ量化LLM的教程。
274 | * [ExLlamaV2:运行LLM的最快库](https://mlabonne.github.io/blog/posts/ExLlamaV2_The_Fastest_Library_to_Run%C2%A0LLMs.html):使用EXL2格式量化Mistral模型并使用ExLlamaV2库运行它的指南。
275 | * [了解激活感知权重量化](https://medium.com/friendliai/understanding-activation-aware-weight-quantization-awq-boosting-inference-serving-efficiency-in-10bb0faf63a8) 作者FriendliAI:AWQ技术及其优点的概览。
276 |
277 | ---
278 | ### 8. 新趋势
279 |
280 | * **位置嵌入**:了解LLM如何编码位置,尤其是像[RoPE](https://arxiv.org/abs/2104.09864)这样的相对位置编码方案。实现[YaRN](https://arxiv.org/abs/2309.00071)(通过温度因子乘以注意力矩阵)或[ALiBi](https://arxiv.org/abs/2108.12409)(基于令牌距离的注意力惩罚)以扩展上下文长度。
281 | * **模型合并**:合并训练好的模型已成为一种流行的创建性能模型而无需任何微调的方式。流行的[mergekit](https://github.com/cg123/mergekit)库实现了最流行的合并方法,如SLERP、[DARE](https://arxiv.org/abs/2311.03099)和[TIES](https://arxiv.org/abs/2311.03099)。
282 | * **专家混合**:[Mixtral](https://arxiv.org/abs/2401.04088)由于其出色的性能重新使MoE架构流行起来。同时,通过合并模型(如[Phixtral](https://huggingface.co/mlabonne/phixtral-2x2_8))在OSS社区中出现了一种frankenMoE,这是一个更便宜且高性能的选项。
283 | * **多模态模型**:这些模型(如[CLIP](https://openai.com/research/clip)、[Stable Diffusion](https://stability.ai/stable-image)或[LLaVA](https://llava-vl.github.io/))能够使用统一的嵌入空间处理多种类型的输入(文本、图像、音频等),解锁了强大的应用,如文本到图像。
284 |
285 | 📚 **参考资料**:
286 | * [扩展RoPE](https://blog.eleuther.ai/yarn/) 作者EleutherAI:总结不同位置编码技术的文章。
287 | * [理解YaRN](https://medium.com/@rcrajatchawla/understanding-yarn-extending-context-window-of-llms-3f21e3522465) 作者Rajat Chawla:YaRN介绍。
288 | * [使用mergekit合并LLM](https://mlabonne.github.io/blog/posts/2024-01-08_Merge_LLMs_with_mergekit.html):使用mergekit进行模型合并的教程。
289 | * [专家混合解释](https://huggingface.co/blog/moe) 作者Hugging Face:关于MoE及其工作原理的详尽指南。
290 | * [大型多模态模型](https://huyenchip.com/2023/10/10/multimodal.html) 作者Chip Huyen:多模态系统及该领域近期历史的概览。
291 |
292 |
293 | ## 👷 LLM工程师
294 |
295 | 本课程部分专注于学习如何构建LLM驱动的可以用于生产的应用程序,重点是增强模型和部署它们。
296 |
297 | 
298 |
299 | ### 1. 运行LLM
300 |
301 | 由于高硬件要求,运行LLM可能很困难。根据您的用例,您可能希望简单地通过API(如GPT-4)消费模型,或者在本地运行它。无论哪种情况,额外的提示和引导技术都可以改善和约束您应用程序的输出。
302 |
303 | * **LLM API**:API是部署LLM的便捷方式。这个领域分为私有LLM([OpenAI](https://platform.openai.com/)、[Google](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/overview)、[Anthropic](https://docs.anthropic.com/claude/reference/getting-started-with-the-api)、[Cohere](https://docs.cohere.com/docs)等)和开源LLM([OpenRouter](https://openrouter.ai/)、[Hugging Face](https://huggingface.co/inference-api)、[Together AI](https://www.together.ai/)等)。
304 | * **开源LLM**:[Hugging Face Hub](https://huggingface.co/models)是寻找LLM的好地方。您可以直接在[Hugging Face Spaces](https://huggingface.co/spaces)中运行其中一些,或者下载并在诸如[LM Studio](https://lmstudio.ai/)的应用中或通过CLI与[llama.cpp](https://github.com/ggerganov/llama.cpp)或[Ollama](https://ollama.ai/)在本地运行它们。
305 | * **提示工程**:常见技术包括零次提示、少次提示、思维链和ReAct。它们在更大的模型上效果更好,但可以适应较小的模型。
306 | * **结构化输出**:许多任务需要结构化输出,如严格的模板或JSON格式。库如[LMQL](https://lmql.ai/)、[Outlines](https://github.com/outlines-dev/outlines)、[Guidance](https://github.com/guidance-ai/guidance)等可用于引导生成并遵循给定结构。
307 |
308 | 📚 **参考资料**:
309 | * [使用LM Studio在本地运行LLM](https://www.kdnuggets.com/run-an-llm-locally-with-lm-studio) 作者Nisha Arya:如何使用LM Studio的简短指南。
310 | * [提示工程指南](https://www.promptingguide.ai/) 作者DAIR.AI:带有示例的提示技术的详尽列表。
311 | * [Outlines - 快速入门](https://outlines-dev.github.io/outlines/quickstart/):Outlines启用的引导生成技术列表。
312 | * [LMQL - 概览](https://lmql.ai/docs/language/overview.html):LMQL语言的介绍。
313 |
314 | ---
315 | ### 2. 构建向量存储
316 |
317 | 创建向量存储是构建检索增强生成(RAG)管道的第一步。文档被加载、拆分,相关块被用来生成向量表示(嵌入),这些向量表示存储以备将来在推理中使用。
318 |
319 | * **文档摄入**:文档加载器是方便的包装器,可以处理多种格式:PDF、JSON、HTML、Markdown等。它们还可以直接从一些数据库和API(GitHub、Reddit、Google Drive等)检索数据。
320 | * **文档拆分**:文本拆分器将文档拆分成较小的、语义上有意义的块。与在*n*个字符后拆分文本相比,最好是按标题或递归拆分,附带一些额外的元数据。
321 | * **嵌入模型**:嵌入模型将文本转换为向量表示。这对于进行语义搜索是必要的,因为它允许更深入、更细腻的理解语言。
322 | * **向量数据库**:向量数据库(如[Chroma](https://www.trychroma.com/)、[Pinecone](https://www.pinecone.io/)、[Milvus](https://milvus.io/)、[FAISS](https://faiss.ai/)、[Annoy](https://github.com/spotify/annoy)等)被设计用于存储嵌入向量。它们使得基于向量相似度快速检索与查询“最相似”的数据成为可能。
323 |
324 | 📚 **参考资料**:
325 | * [LangChain - 文本拆分器](https://python.langchain.com/docs/modules/data_connection/document_transformers/):LangChain中实现的不同文本拆分器的列表。
326 | * [句子转换器库](https://www.sbert.net/):嵌入模型的流行库。
327 | * [MTEB排行榜](https://huggingface.co/spaces/mteb/leaderboard):嵌入模型的排行榜。
328 | * [前5大向量数据库](https://www.datacamp.com/blog/the-top-5-vector-databases) 作者Moez Ali:最佳和最流行的向量数据库的比较。
329 |
330 | ---
331 | ### 3. 检索增强生成
332 |
333 | 使用RAG,LLM从数据库检索上下文文档以提高其回答的准确性。RAG是一种增强模型知识而无需任何微调的流行方式。
334 |
335 | * **协调器**:协调器(如[LangChain](https://python.langchain.com/docs/get_started/introduction)、[LlamaIndex](https://docs.llamaindex.ai/en/stable/)、[FastRAG](https://github.com/IntelLabs/fastRAG)等)是流行的框架,用于将LLM与工具、数据库、记忆等连接并增强其能力。
336 | * **检索器**:用户指令不是为检索优化的。可以应用不同技术(例如,多查询检索器、[HyDE](https://arxiv.org/abs/2212.10496)等)来重述/扩展它们并提高性能。
337 | * **记忆**:为了记住之前的指令和答案,像ChatGPT这样的LLM和聊天机器人将这些历史添加到其上下文窗口中。这个缓冲区可以通过摘要(例如,使用较小的LLM)、向量存储+RAG等进行改进。
338 | * **评估**:我们需要评估文档检索(上下文的精确度和召回率)和生成阶段(忠实度和答案相关性)。可以使用工具[Ragas](https://github.com/explodinggradients/ragas/tree/main)和[DeepEval](https://github.com/confident-ai/deepeval)来简化这一过程。
339 |
340 | 📚 **参考资料**:
341 | * [Llamaindex - 高层概念](https://docs.llamaindex.ai/en/stable/getting_started/concepts.html):构建RAG管道时需要了解的主要概念。
342 | * [Pinecone - 检索增强](https://www.pinecone.io/learn/series/langchain/langchain-retrieval-augmentation/):检索增强过程的概览。
343 | * [LangChain - 问答与RAG](https://python.langchain.com/docs/use_cases/question_answering/quickstart):构建典型RAG管道的分步教程。
344 | * [LangChain - 记忆类型](https://python.langchain.com/docs/modules/memory/types/):不同类型记忆及其相关用途的列表。
345 | * [RAG管道 - 指标](https://docs.ragas.io/en/stable/concepts/metrics/index.html):用于评估RAG管道的主要指标概览。
346 |
347 |
348 | ---
349 | ### 4. 高级 RAG
350 |
351 | 现实生活中的应用可能需要复杂的流程,包括 SQL 或图数据库,以及自动选择相关工具和 API。这些高级技术可以改进基线解决方案,并提供额外的功能。
352 |
353 | * **查询构建**:存储在传统数据库中的结构化数据需要特定的查询语言,如 SQL、Cypher、元数据等。我们可以直接将用户指令翻译成查询语言,以查询构建方式访问数据。
354 | * **代理和工具**:代理通过自动选择最相关的工具来增强 LLM 的功能,以提供答案。这些工具可以简单到使用 Google 或 Wikipedia,也可以复杂到使用 Python 解释器或 Jira。
355 | * **后处理**:喂给 LLM 的输入的最终处理步骤。它通过重排序、[RAG-融合](https://github.com/Raudaschl/rag-fusion) 和分类来提高检索文档的相关性和多样性。
356 |
357 | 📚 **参考资料**:
358 | * [LangChain - 查询构建](https://blog.langchain.dev/query-construction/):关于不同类型查询构建的博客文章。
359 | * [LangChain - SQL](https://python.langchain.com/docs/use_cases/qa_structured/sql):关于如何使用 LLM 与 SQL 数据库交互的教程,包括 Text-to-SQL 和一个可选的 SQL 代理。
360 | * [Pinecone - LLM 代理](https://www.pinecone.io/learn/series/langchain/langchain-agents/):介绍代理和工具的不同类型。
361 | * [由 Lilian Weng 编写的 LLM 动力自主代理](https://lilianweng.github.io/posts/2023-06-23-agent/):关于 LLM 代理的更理论性文章。
362 | * [LangChain - OpenAI 的 RAG](https://blog.langchain.dev/applying-openai-rag/):概述 OpenAI 采用的 RAG 策略,包括后处理。
363 |
364 | ---
365 | ### 5. 推理优化
366 |
367 | 文本生成是一个成本高昂的过程,需要昂贵的硬件。除了量化,还提出了各种技术来最大化吞吐量和减少推理成本。
368 |
369 | * **快速注意力**:优化注意力机制,将其复杂性从二次方降低到线性,加速训练和推理。
370 | * **键值缓存**:了解键值缓存及在 [多查询注意力](https://arxiv.org/abs/1911.02150)(MQA)和 [分组查询注意力](https://arxiv.org/abs/2305.13245)(GQA)中引入的改进。
371 | * **推测性解码**:使用小模型产生草稿,然后由更大的模型复审,以加速文本生成。
372 |
373 | 📚 **参考资料**:
374 | * [Hugging Face 的 GPU 推理](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one):解释如何在 GPU 上优化推理。
375 | * [Databricks 的 LLM 推理](https://www.databricks.com/blog/llm-inference-performance-engineering-best-practices):关于如何在生产中优化 LLM 推理的最佳实践。
376 | * [为速度和内存优化 LLM](https://huggingface.co/docs/transformers/main/en/llm_tutorial_optimization) 由 Hugging Face 提供:解释三种主要优化速度和内存的技术,即量化、快速注意力和架构创新。
377 | * [辅助生成](https://huggingface.co/blog/assisted-generation) 由 Hugging Face 提供:HF 版本的推测性解码,这是一个有趣的博客文章,介绍了它是如何工作的,并提供了实现它的代码。
378 |
379 | ---
380 | ### 6. 部署 LLM
381 |
382 | 在多个GPU 集群上部署 LLM 是一项工程壮举。在其他场景中,演示和本地应用可以通过更低的复杂性来实现。
383 |
384 | * **本地部署**:隐私是开源 LLM 相对于私有 LLM 的一个重要优势。本地 LLM 服务器(如 [LM Studio](https://lmstudio.ai/)、[Ollama](https://ollama.ai/)、[oobabooga](https://github.com/oobabooga/text-generation-webui)、[kobold.cpp](https://github.com/LostRuins/koboldcpp) 等)利用这一优势为本地应用提供动力。
385 | * **演示部署**:像 [Gradio](https://www.gradio.app/) 和 [Streamlit](https://docs.streamlit.io/) 这样的框架有助于原型应用的开发和演示分享。你也可以轻松地在线托管它们,例如使用 [Hugging Face Spaces](https://huggingface.co/spaces)。
386 | * **服务器部署**:在大规模部署 LLM 需要云(参见 [SkyPilot](https://skypilot.readthedocs.io/en/latest/))或本地基础设施,并且经常利用优化的文本生成框架,如 [TGI](https://github.com/huggingface/text-generation-inference)、[vLLM](https://github.com/vllm-project/vllm/tree/main) 等。
387 | * **边缘部署**:在受限环境中,如 [MLC LLM](https://github.com/mlc-ai/mlc-llm) 和 [mnn-llm](https://github.com/wangzhaode/mnn-llm/blob/master/README_en.md) 这样的高性能框架可以在网络浏览器、Android 和 iOS 中部署 LLM。
388 |
389 | 📚 **参考资料**:
390 | * [Streamlit - 构建基础 LLM 应用](https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps):使用 Streamlit 构建基本的 ChatGPT 类应用的教程。
391 | * [HF LLM 推理容器](https://huggingface.co/blog/sagemaker-huggingface-llm):使用 Hugging Face 的推理容器在 Amazon SageMaker 上部署 LLM。
392 | * [Philschmid 博客](https://www.philschmid.de/) 由 Philipp Schmid 编写:关于使用 Amazon SageMaker 部署 LLM 的高质量文章集合。
393 | * [优化延迟](https://hamel.dev/notes/llm/inference/03_inference.html) 由 Hamel Husain 编写:比较了 TGI、vLLM、CTranslate2 和 mlc 在吞吐量和延迟方面的优势。
394 |
395 | ---
396 |
397 | ---
398 | ### 7. 保护LLM
399 |
400 | 除了与软件相关的传统安全问题外,由于LLM的训练和提示方式,它们还存在独特的弱点。
401 |
402 | * **提示攻击**:与提示工程相关的不同技术,包括提示注入(额外的指令以劫持模型的答案)、数据/提示泄露(检索其原始数据/提示)和越狱(制作提示以绕过安全特性)。
403 | * **后门**:攻击向量可以直接针对训练数据本身,通过污染训练数据(例如,使用错误信息)或创建后门(在推理期间改变模型行为的秘密触发器)。
404 | * **防御措施**:保护您的LLM应用程序的最佳方式是对这些漏洞进行测试(例如,使用红队测试和像[garak](https://github.com/leondz/garak/)这样的检查)并在生产中观察它们(使用像[langfuse](https://github.com/langfuse/langfuse)这样的框架)。
405 |
406 | 📚 **参考资料**:
407 | * [OWASP LLM十大安全问题](https://owasp.org/www-project-top-10-for-large-language-model-applications/) 作者HEGO Wiki:LLM应用程序中观察到的10个最严重的安全漏洞列表。
408 | * [提示注入入门](https://github.com/jthack/PIPE) 作者Joseph Thacker:专为工程师准备的提示注入简短指南。
409 | * [LLM安全](https://llmsecurity.net/) 作者[@llm_sec](https://twitter.com/llm_sec):与LLM安全相关的广泛资源列表。
410 | * [LLM红队测试](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/red-teaming) 作者Microsoft:如何对LLM进行红队测试的指南。
411 |
412 |
413 |
414 | ## 鸣谢
415 |
416 | 这份路线图的灵感来自于Milan Milanović和Romano Roth出色的[DevOps路线图](https://github.com/milanm/DevOps-Roadmap)。
417 |
418 | 特别感谢:
419 |
420 | * Thomas Thelen激励我创建这份路线图
421 | * André Frade对初稿的投入和审阅
422 | * Dino Dunn提供有关LLM安全性的资源
423 |
424 | *免责声明:我与此处列出的任何来源无关。*
425 |
426 | ---
427 |
428 |
429 |
430 |
431 |
--------------------------------------------------------------------------------
/Fine_tune_a_Mistral_7b_model_with_DPO.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "machine_shape": "hm",
8 | "gpuType": "A100",
9 | "authorship_tag": "ABX9TyNuIN7/ICiXCX5xELzN1Y3R",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | },
16 | "language_info": {
17 | "name": "python"
18 | },
19 | "widgets": {
20 | "application/vnd.jupyter.widget-state+json": {
21 | "22773c721a7c4221a9c14cd388461d4c": {
22 | "model_module": "@jupyter-widgets/controls",
23 | "model_name": "HBoxModel",
24 | "model_module_version": "1.5.0",
25 | "state": {
26 | "_dom_classes": [],
27 | "_model_module": "@jupyter-widgets/controls",
28 | "_model_module_version": "1.5.0",
29 | "_model_name": "HBoxModel",
30 | "_view_count": null,
31 | "_view_module": "@jupyter-widgets/controls",
32 | "_view_module_version": "1.5.0",
33 | "_view_name": "HBoxView",
34 | "box_style": "",
35 | "children": [
36 | "IPY_MODEL_6b54841f5de1482694c360095dae3039",
37 | "IPY_MODEL_448ccbc85e624ec3b3e71931a7ee4ff6",
38 | "IPY_MODEL_173769f6f465485f8848a11bf269850b"
39 | ],
40 | "layout": "IPY_MODEL_60978b9b4e8348f0a71ce3e35c73bcff"
41 | }
42 | },
43 | "6b54841f5de1482694c360095dae3039": {
44 | "model_module": "@jupyter-widgets/controls",
45 | "model_name": "HTMLModel",
46 | "model_module_version": "1.5.0",
47 | "state": {
48 | "_dom_classes": [],
49 | "_model_module": "@jupyter-widgets/controls",
50 | "_model_module_version": "1.5.0",
51 | "_model_name": "HTMLModel",
52 | "_view_count": null,
53 | "_view_module": "@jupyter-widgets/controls",
54 | "_view_module_version": "1.5.0",
55 | "_view_name": "HTMLView",
56 | "description": "",
57 | "description_tooltip": null,
58 | "layout": "IPY_MODEL_6a38dcbaf4674b448329ac0a16587d2a",
59 | "placeholder": "",
60 | "style": "IPY_MODEL_7eaeada2158e493189449af91f643553",
61 | "value": "Loading checkpoint shards: 100%"
62 | }
63 | },
64 | "448ccbc85e624ec3b3e71931a7ee4ff6": {
65 | "model_module": "@jupyter-widgets/controls",
66 | "model_name": "FloatProgressModel",
67 | "model_module_version": "1.5.0",
68 | "state": {
69 | "_dom_classes": [],
70 | "_model_module": "@jupyter-widgets/controls",
71 | "_model_module_version": "1.5.0",
72 | "_model_name": "FloatProgressModel",
73 | "_view_count": null,
74 | "_view_module": "@jupyter-widgets/controls",
75 | "_view_module_version": "1.5.0",
76 | "_view_name": "ProgressView",
77 | "bar_style": "success",
78 | "description": "",
79 | "description_tooltip": null,
80 | "layout": "IPY_MODEL_6e32854952b340008edca0139d3471d6",
81 | "max": 3,
82 | "min": 0,
83 | "orientation": "horizontal",
84 | "style": "IPY_MODEL_db6d7cfcdade4b4baa213a5d0abc07d7",
85 | "value": 3
86 | }
87 | },
88 | "173769f6f465485f8848a11bf269850b": {
89 | "model_module": "@jupyter-widgets/controls",
90 | "model_name": "HTMLModel",
91 | "model_module_version": "1.5.0",
92 | "state": {
93 | "_dom_classes": [],
94 | "_model_module": "@jupyter-widgets/controls",
95 | "_model_module_version": "1.5.0",
96 | "_model_name": "HTMLModel",
97 | "_view_count": null,
98 | "_view_module": "@jupyter-widgets/controls",
99 | "_view_module_version": "1.5.0",
100 | "_view_name": "HTMLView",
101 | "description": "",
102 | "description_tooltip": null,
103 | "layout": "IPY_MODEL_9083029642744c43b7705532cbe0cf79",
104 | "placeholder": "",
105 | "style": "IPY_MODEL_d028a98caa13425b907ceb513119006e",
106 | "value": " 3/3 [00:11<00:00, 2.89s/it]"
107 | }
108 | },
109 | "60978b9b4e8348f0a71ce3e35c73bcff": {
110 | "model_module": "@jupyter-widgets/base",
111 | "model_name": "LayoutModel",
112 | "model_module_version": "1.2.0",
113 | "state": {
114 | "_model_module": "@jupyter-widgets/base",
115 | "_model_module_version": "1.2.0",
116 | "_model_name": "LayoutModel",
117 | "_view_count": null,
118 | "_view_module": "@jupyter-widgets/base",
119 | "_view_module_version": "1.2.0",
120 | "_view_name": "LayoutView",
121 | "align_content": null,
122 | "align_items": null,
123 | "align_self": null,
124 | "border": null,
125 | "bottom": null,
126 | "display": null,
127 | "flex": null,
128 | "flex_flow": null,
129 | "grid_area": null,
130 | "grid_auto_columns": null,
131 | "grid_auto_flow": null,
132 | "grid_auto_rows": null,
133 | "grid_column": null,
134 | "grid_gap": null,
135 | "grid_row": null,
136 | "grid_template_areas": null,
137 | "grid_template_columns": null,
138 | "grid_template_rows": null,
139 | "height": null,
140 | "justify_content": null,
141 | "justify_items": null,
142 | "left": null,
143 | "margin": null,
144 | "max_height": null,
145 | "max_width": null,
146 | "min_height": null,
147 | "min_width": null,
148 | "object_fit": null,
149 | "object_position": null,
150 | "order": null,
151 | "overflow": null,
152 | "overflow_x": null,
153 | "overflow_y": null,
154 | "padding": null,
155 | "right": null,
156 | "top": null,
157 | "visibility": null,
158 | "width": null
159 | }
160 | },
161 | "6a38dcbaf4674b448329ac0a16587d2a": {
162 | "model_module": "@jupyter-widgets/base",
163 | "model_name": "LayoutModel",
164 | "model_module_version": "1.2.0",
165 | "state": {
166 | "_model_module": "@jupyter-widgets/base",
167 | "_model_module_version": "1.2.0",
168 | "_model_name": "LayoutModel",
169 | "_view_count": null,
170 | "_view_module": "@jupyter-widgets/base",
171 | "_view_module_version": "1.2.0",
172 | "_view_name": "LayoutView",
173 | "align_content": null,
174 | "align_items": null,
175 | "align_self": null,
176 | "border": null,
177 | "bottom": null,
178 | "display": null,
179 | "flex": null,
180 | "flex_flow": null,
181 | "grid_area": null,
182 | "grid_auto_columns": null,
183 | "grid_auto_flow": null,
184 | "grid_auto_rows": null,
185 | "grid_column": null,
186 | "grid_gap": null,
187 | "grid_row": null,
188 | "grid_template_areas": null,
189 | "grid_template_columns": null,
190 | "grid_template_rows": null,
191 | "height": null,
192 | "justify_content": null,
193 | "justify_items": null,
194 | "left": null,
195 | "margin": null,
196 | "max_height": null,
197 | "max_width": null,
198 | "min_height": null,
199 | "min_width": null,
200 | "object_fit": null,
201 | "object_position": null,
202 | "order": null,
203 | "overflow": null,
204 | "overflow_x": null,
205 | "overflow_y": null,
206 | "padding": null,
207 | "right": null,
208 | "top": null,
209 | "visibility": null,
210 | "width": null
211 | }
212 | },
213 | "7eaeada2158e493189449af91f643553": {
214 | "model_module": "@jupyter-widgets/controls",
215 | "model_name": "DescriptionStyleModel",
216 | "model_module_version": "1.5.0",
217 | "state": {
218 | "_model_module": "@jupyter-widgets/controls",
219 | "_model_module_version": "1.5.0",
220 | "_model_name": "DescriptionStyleModel",
221 | "_view_count": null,
222 | "_view_module": "@jupyter-widgets/base",
223 | "_view_module_version": "1.2.0",
224 | "_view_name": "StyleView",
225 | "description_width": ""
226 | }
227 | },
228 | "6e32854952b340008edca0139d3471d6": {
229 | "model_module": "@jupyter-widgets/base",
230 | "model_name": "LayoutModel",
231 | "model_module_version": "1.2.0",
232 | "state": {
233 | "_model_module": "@jupyter-widgets/base",
234 | "_model_module_version": "1.2.0",
235 | "_model_name": "LayoutModel",
236 | "_view_count": null,
237 | "_view_module": "@jupyter-widgets/base",
238 | "_view_module_version": "1.2.0",
239 | "_view_name": "LayoutView",
240 | "align_content": null,
241 | "align_items": null,
242 | "align_self": null,
243 | "border": null,
244 | "bottom": null,
245 | "display": null,
246 | "flex": null,
247 | "flex_flow": null,
248 | "grid_area": null,
249 | "grid_auto_columns": null,
250 | "grid_auto_flow": null,
251 | "grid_auto_rows": null,
252 | "grid_column": null,
253 | "grid_gap": null,
254 | "grid_row": null,
255 | "grid_template_areas": null,
256 | "grid_template_columns": null,
257 | "grid_template_rows": null,
258 | "height": null,
259 | "justify_content": null,
260 | "justify_items": null,
261 | "left": null,
262 | "margin": null,
263 | "max_height": null,
264 | "max_width": null,
265 | "min_height": null,
266 | "min_width": null,
267 | "object_fit": null,
268 | "object_position": null,
269 | "order": null,
270 | "overflow": null,
271 | "overflow_x": null,
272 | "overflow_y": null,
273 | "padding": null,
274 | "right": null,
275 | "top": null,
276 | "visibility": null,
277 | "width": null
278 | }
279 | },
280 | "db6d7cfcdade4b4baa213a5d0abc07d7": {
281 | "model_module": "@jupyter-widgets/controls",
282 | "model_name": "ProgressStyleModel",
283 | "model_module_version": "1.5.0",
284 | "state": {
285 | "_model_module": "@jupyter-widgets/controls",
286 | "_model_module_version": "1.5.0",
287 | "_model_name": "ProgressStyleModel",
288 | "_view_count": null,
289 | "_view_module": "@jupyter-widgets/base",
290 | "_view_module_version": "1.2.0",
291 | "_view_name": "StyleView",
292 | "bar_color": null,
293 | "description_width": ""
294 | }
295 | },
296 | "9083029642744c43b7705532cbe0cf79": {
297 | "model_module": "@jupyter-widgets/base",
298 | "model_name": "LayoutModel",
299 | "model_module_version": "1.2.0",
300 | "state": {
301 | "_model_module": "@jupyter-widgets/base",
302 | "_model_module_version": "1.2.0",
303 | "_model_name": "LayoutModel",
304 | "_view_count": null,
305 | "_view_module": "@jupyter-widgets/base",
306 | "_view_module_version": "1.2.0",
307 | "_view_name": "LayoutView",
308 | "align_content": null,
309 | "align_items": null,
310 | "align_self": null,
311 | "border": null,
312 | "bottom": null,
313 | "display": null,
314 | "flex": null,
315 | "flex_flow": null,
316 | "grid_area": null,
317 | "grid_auto_columns": null,
318 | "grid_auto_flow": null,
319 | "grid_auto_rows": null,
320 | "grid_column": null,
321 | "grid_gap": null,
322 | "grid_row": null,
323 | "grid_template_areas": null,
324 | "grid_template_columns": null,
325 | "grid_template_rows": null,
326 | "height": null,
327 | "justify_content": null,
328 | "justify_items": null,
329 | "left": null,
330 | "margin": null,
331 | "max_height": null,
332 | "max_width": null,
333 | "min_height": null,
334 | "min_width": null,
335 | "object_fit": null,
336 | "object_position": null,
337 | "order": null,
338 | "overflow": null,
339 | "overflow_x": null,
340 | "overflow_y": null,
341 | "padding": null,
342 | "right": null,
343 | "top": null,
344 | "visibility": null,
345 | "width": null
346 | }
347 | },
348 | "d028a98caa13425b907ceb513119006e": {
349 | "model_module": "@jupyter-widgets/controls",
350 | "model_name": "DescriptionStyleModel",
351 | "model_module_version": "1.5.0",
352 | "state": {
353 | "_model_module": "@jupyter-widgets/controls",
354 | "_model_module_version": "1.5.0",
355 | "_model_name": "DescriptionStyleModel",
356 | "_view_count": null,
357 | "_view_module": "@jupyter-widgets/base",
358 | "_view_module_version": "1.2.0",
359 | "_view_name": "StyleView",
360 | "description_width": ""
361 | }
362 | }
363 | }
364 | },
365 | "accelerator": "GPU"
366 | },
367 | "cells": [
368 | {
369 | "cell_type": "markdown",
370 | "metadata": {
371 | "id": "view-in-github",
372 | "colab_type": "text"
373 | },
374 | "source": [
375 | " "
376 | ]
377 | },
378 | {
379 | "cell_type": "markdown",
380 | "source": [
381 | "# Fine-tune a Mistral-7b model with DPO\n",
382 | "\n",
383 | "> 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)\n",
384 | "\n",
385 | "❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne)."
386 | ],
387 | "metadata": {
388 | "id": "Pa8905-YsHAn"
389 | }
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "metadata": {
395 | "id": "_zIBL8IssExG"
396 | },
397 | "outputs": [],
398 | "source": [
399 | "!pip install -q datasets trl peft bitsandbytes sentencepiece wandb"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "source": [
405 | "import os\n",
406 | "import gc\n",
407 | "import torch\n",
408 | "\n",
409 | "import transformers\n",
410 | "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig\n",
411 | "from datasets import load_dataset\n",
412 | "from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training\n",
413 | "from trl import DPOTrainer\n",
414 | "import bitsandbytes as bnb\n",
415 | "from google.colab import userdata\n",
416 | "import wandb\n",
417 | "\n",
418 | "# Defined in the secrets tab in Google Colab\n",
419 | "hf_token = userdata.get('huggingface')\n",
420 | "wb_token = userdata.get('wandb')\n",
421 | "wandb.login(key=wb_token)\n",
422 | "\n",
423 | "model_name = \"teknium/OpenHermes-2.5-Mistral-7B\"\n",
424 | "new_model = \"NeuralHermes-2.5-Mistral-7B\""
425 | ],
426 | "metadata": {
427 | "colab": {
428 | "base_uri": "https://localhost:8080/"
429 | },
430 | "id": "YpdkZsMNylvp",
431 | "outputId": "6c2df234-1ce7-4cd2-a7e3-567e7536319f"
432 | },
433 | "execution_count": null,
434 | "outputs": [
435 | {
436 | "output_type": "stream",
437 | "name": "stderr",
438 | "text": [
439 | "/usr/local/lib/python3.10/dist-packages/trl/trainer/ppo_config.py:141: UserWarning: The `optimize_cuda_cache` arguement will be deprecated soon, please use `optimize_device_cache` instead.\n",
440 | " warnings.warn(\n",
441 | "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mmlabonne\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
442 | "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n",
443 | "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n",
444 | "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n"
445 | ]
446 | }
447 | ]
448 | },
449 | {
450 | "cell_type": "markdown",
451 | "source": [
452 | "## Format dataset"
453 | ],
454 | "metadata": {
455 | "id": "d8CvUgROUDw-"
456 | }
457 | },
458 | {
459 | "cell_type": "code",
460 | "source": [
461 | "def chatml_format(example):\n",
462 | " # Format system\n",
463 | " if len(example['system']) > 0:\n",
464 | " message = {\"role\": \"system\", \"content\": example['system']}\n",
465 | " system = tokenizer.apply_chat_template([message], tokenize=False)\n",
466 | " else:\n",
467 | " system = \"\"\n",
468 | "\n",
469 | " # Format instruction\n",
470 | " message = {\"role\": \"user\", \"content\": example['question']}\n",
471 | " prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)\n",
472 | "\n",
473 | " # Format chosen answer\n",
474 | " chosen = example['chosen'] + \"<|im_end|>\\n\"\n",
475 | "\n",
476 | " # Format rejected answer\n",
477 | " rejected = example['rejected'] + \"<|im_end|>\\n\"\n",
478 | "\n",
479 | " return {\n",
480 | " \"prompt\": system + prompt,\n",
481 | " \"chosen\": chosen,\n",
482 | " \"rejected\": rejected,\n",
483 | " }\n",
484 | "\n",
485 | "# Load dataset\n",
486 | "dataset = load_dataset(\"Intel/orca_dpo_pairs\")['train']\n",
487 | "\n",
488 | "# Save columns\n",
489 | "original_columns = dataset.column_names\n",
490 | "\n",
491 | "# Tokenizer\n",
492 | "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
493 | "tokenizer.pad_token = tokenizer.eos_token\n",
494 | "tokenizer.padding_side = \"left\"\n",
495 | "\n",
496 | "# Format dataset\n",
497 | "dataset = dataset.map(\n",
498 | " chatml_format,\n",
499 | " remove_columns=original_columns\n",
500 | ")\n",
501 | "\n",
502 | "# Print sample\n",
503 | "dataset[1]"
504 | ],
505 | "metadata": {
506 | "colab": {
507 | "base_uri": "https://localhost:8080/"
508 | },
509 | "id": "MCD77GZ60DOT",
510 | "outputId": "c7c6773c-5545-4fee-bfa3-6fa6d69c0f3f"
511 | },
512 | "execution_count": null,
513 | "outputs": [
514 | {
515 | "output_type": "stream",
516 | "name": "stderr",
517 | "text": [
518 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
519 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
520 | ]
521 | },
522 | {
523 | "output_type": "execute_result",
524 | "data": {
525 | "text/plain": [
526 | "{'prompt': '<|im_start|>system\\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.<|im_end|>\\n<|im_start|>user\\nGenerate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One<|im_end|>\\n<|im_start|>assistant\\n',\n",
527 | " 'chosen': 'Midsummer House is a moderately priced Chinese restaurant with a 3/5 customer rating, located near All Bar One.<|im_end|>\\n',\n",
528 | " 'rejected': ' Sure! Here\\'s a sentence that describes all the data you provided:\\n\\n\"Midsummer House is a moderately priced Chinese restaurant with a customer rating of 3 out of 5, located near All Bar One, offering a variety of delicious dishes.\"<|im_end|>\\n'}"
529 | ]
530 | },
531 | "metadata": {},
532 | "execution_count": 3
533 | }
534 | ]
535 | },
536 | {
537 | "cell_type": "markdown",
538 | "source": [
539 | "## Train model with DPO"
540 | ],
541 | "metadata": {
542 | "id": "DeT5eUK_UJgK"
543 | }
544 | },
545 | {
546 | "cell_type": "code",
547 | "source": [
548 | "# LoRA configuration\n",
549 | "peft_config = LoraConfig(\n",
550 | " r=16,\n",
551 | " lora_alpha=16,\n",
552 | " lora_dropout=0.05,\n",
553 | " bias=\"none\",\n",
554 | " task_type=\"CAUSAL_LM\",\n",
555 | " target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']\n",
556 | ")\n",
557 | "\n",
558 | "# Model to fine-tune\n",
559 | "model = AutoModelForCausalLM.from_pretrained(\n",
560 | " model_name,\n",
561 | " torch_dtype=torch.float16,\n",
562 | " load_in_4bit=True\n",
563 | ")\n",
564 | "model.config.use_cache = False\n",
565 | "\n",
566 | "# Training arguments\n",
567 | "training_args = TrainingArguments(\n",
568 | " per_device_train_batch_size=4,\n",
569 | " gradient_accumulation_steps=4,\n",
570 | " gradient_checkpointing=True,\n",
571 | " learning_rate=5e-5,\n",
572 | " lr_scheduler_type=\"cosine\",\n",
573 | " max_steps=200,\n",
574 | " save_strategy=\"no\",\n",
575 | " logging_steps=1,\n",
576 | " output_dir=new_model,\n",
577 | " optim=\"paged_adamw_32bit\",\n",
578 | " warmup_steps=100,\n",
579 | " bf16=True,\n",
580 | " report_to=\"wandb\",\n",
581 | ")\n",
582 | "\n",
583 | "# Create DPO trainer\n",
584 | "dpo_trainer = DPOTrainer(\n",
585 | " model,\n",
586 | " args=training_args,\n",
587 | " train_dataset=dataset,\n",
588 | " tokenizer=tokenizer,\n",
589 | " peft_config=peft_config,\n",
590 | " beta=0.1,\n",
591 | " max_prompt_length=1024,\n",
592 | " max_length=1536,\n",
593 | ")\n",
594 | "\n",
595 | "# Fine-tune model with DPO\n",
596 | "dpo_trainer.train()"
597 | ],
598 | "metadata": {
599 | "id": "rKPILNOLR-aK"
600 | },
601 | "execution_count": null,
602 | "outputs": []
603 | },
604 | {
605 | "cell_type": "markdown",
606 | "source": [
607 | "## Upload model"
608 | ],
609 | "metadata": {
610 | "id": "3LdhPpcrUM3H"
611 | }
612 | },
613 | {
614 | "cell_type": "code",
615 | "source": [
616 | "# Save artifacts\n",
617 | "dpo_trainer.model.save_pretrained(\"final_checkpoint\")\n",
618 | "tokenizer.save_pretrained(\"final_checkpoint\")\n",
619 | "\n",
620 | "# Flush memory\n",
621 | "del dpo_trainer, model\n",
622 | "gc.collect()\n",
623 | "torch.cuda.empty_cache()\n",
624 | "\n",
625 | "# Reload model in FP16 (instead of NF4)\n",
626 | "base_model = AutoModelForCausalLM.from_pretrained(\n",
627 | " model_name,\n",
628 | " return_dict=True,\n",
629 | " torch_dtype=torch.float16,\n",
630 | ")\n",
631 | "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
632 | "\n",
633 | "# Merge base model with the adapter\n",
634 | "model = PeftModel.from_pretrained(base_model, \"final_checkpoint\")\n",
635 | "model = model.merge_and_unload()\n",
636 | "\n",
637 | "# Save model and tokenizer\n",
638 | "model.save_pretrained(new_model)\n",
639 | "tokenizer.save_pretrained(new_model)\n",
640 | "\n",
641 | "# Push them to the HF Hub\n",
642 | "model.push_to_hub(new_model, use_temp_dir=False, token=hf_token)\n",
643 | "tokenizer.push_to_hub(new_model, use_temp_dir=False, token=hf_token)"
644 | ],
645 | "metadata": {
646 | "id": "h7cIvxcTfBC4"
647 | },
648 | "execution_count": null,
649 | "outputs": []
650 | },
651 | {
652 | "cell_type": "markdown",
653 | "source": [
654 | "## Inference"
655 | ],
656 | "metadata": {
657 | "id": "G6EFsmS4UOgV"
658 | }
659 | },
660 | {
661 | "cell_type": "code",
662 | "source": [
663 | "# Format prompt\n",
664 | "message = [\n",
665 | " {\"role\": \"system\", \"content\": \"You are a helpful assistant chatbot.\"},\n",
666 | " {\"role\": \"user\", \"content\": \"What is a Large Language Model?\"}\n",
667 | "]\n",
668 | "tokenizer = AutoTokenizer.from_pretrained(new_model)\n",
669 | "prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)\n",
670 | "\n",
671 | "# Create pipeline\n",
672 | "pipeline = transformers.pipeline(\n",
673 | " \"text-generation\",\n",
674 | " model=new_model,\n",
675 | " tokenizer=tokenizer\n",
676 | ")\n",
677 | "\n",
678 | "# Generate text\n",
679 | "sequences = pipeline(\n",
680 | " prompt,\n",
681 | " do_sample=True,\n",
682 | " temperature=0.7,\n",
683 | " top_p=0.9,\n",
684 | " num_return_sequences=1,\n",
685 | " max_length=200,\n",
686 | ")\n",
687 | "print(sequences[0]['generated_text'])"
688 | ],
689 | "metadata": {
690 | "colab": {
691 | "base_uri": "https://localhost:8080/",
692 | "height": 251,
693 | "referenced_widgets": [
694 | "22773c721a7c4221a9c14cd388461d4c",
695 | "6b54841f5de1482694c360095dae3039",
696 | "448ccbc85e624ec3b3e71931a7ee4ff6",
697 | "173769f6f465485f8848a11bf269850b",
698 | "60978b9b4e8348f0a71ce3e35c73bcff",
699 | "6a38dcbaf4674b448329ac0a16587d2a",
700 | "7eaeada2158e493189449af91f643553",
701 | "6e32854952b340008edca0139d3471d6",
702 | "db6d7cfcdade4b4baa213a5d0abc07d7",
703 | "9083029642744c43b7705532cbe0cf79",
704 | "d028a98caa13425b907ceb513119006e"
705 | ]
706 | },
707 | "id": "LAEUZFjvlJOv",
708 | "outputId": "9b5720c7-49ef-45c7-e5a7-f38d64899b1e"
709 | },
710 | "execution_count": null,
711 | "outputs": [
712 | {
713 | "output_type": "stream",
714 | "name": "stderr",
715 | "text": [
716 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
717 | ]
718 | },
719 | {
720 | "output_type": "display_data",
721 | "data": {
722 | "text/plain": [
723 | "Loading checkpoint shards: 0%| | 0/3 [00:00, ?it/s]"
724 | ],
725 | "application/vnd.jupyter.widget-view+json": {
726 | "version_major": 2,
727 | "version_minor": 0,
728 | "model_id": "22773c721a7c4221a9c14cd388461d4c"
729 | }
730 | },
731 | "metadata": {}
732 | },
733 | {
734 | "output_type": "stream",
735 | "name": "stderr",
736 | "text": [
737 | "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1473: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )\n",
738 | " warnings.warn(\n",
739 | "Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.\n"
740 | ]
741 | },
742 | {
743 | "output_type": "stream",
744 | "name": "stdout",
745 | "text": [
746 | "<|im_start|>system\n",
747 | "You are a helpful assistant chatbot.<|im_end|>\n",
748 | "<|im_start|>user\n",
749 | "What is a Large Language Model?<|im_end|>\n",
750 | "<|im_start|>assistant\n",
751 | "A large language model is a type of artificial intelligence (AI) system that has been trained on vast amounts of text data. These models are designed to understand and generate human language, allowing them to perform various natural language processing tasks, such as text generation, language translation, and question answering. Large language models typically use deep learning techniques, like recurrent neural networks (RNNs) or transformers, to learn patterns and relationships in the data, enabling them to generate coherent and contextually relevant responses. The size of these models, in terms of the number of parameters and the volume of data they are trained on, plays a significant role in their ability to comprehend and produce complex language structures.\n"
752 | ]
753 | }
754 | ]
755 | }
756 | ]
757 | }
--------------------------------------------------------------------------------
/Mergekit.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "machine_shape": "hm",
8 | "authorship_tag": "ABX9TyO11tndDoFEdL4+/FDgLad9",
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "language_info": {
16 | "name": "python"
17 | },
18 | "widgets": {
19 | "application/vnd.jupyter.widget-state+json": {
20 | "de24d272f2b842c5b01eedb3f536b810": {
21 | "model_module": "@jupyter-widgets/controls",
22 | "model_name": "HBoxModel",
23 | "model_module_version": "1.5.0",
24 | "state": {
25 | "_dom_classes": [],
26 | "_model_module": "@jupyter-widgets/controls",
27 | "_model_module_version": "1.5.0",
28 | "_model_name": "HBoxModel",
29 | "_view_count": null,
30 | "_view_module": "@jupyter-widgets/controls",
31 | "_view_module_version": "1.5.0",
32 | "_view_name": "HBoxView",
33 | "box_style": "",
34 | "children": [
35 | "IPY_MODEL_0c5dab2657b2473385a424d90f3d4664",
36 | "IPY_MODEL_57efe36e546c473d8be34102f6ba9a58",
37 | "IPY_MODEL_871bad1d905d4877a9eaa242cfd54c4e"
38 | ],
39 | "layout": "IPY_MODEL_8951f6b2edf64464869391197c900f84"
40 | }
41 | },
42 | "0c5dab2657b2473385a424d90f3d4664": {
43 | "model_module": "@jupyter-widgets/controls",
44 | "model_name": "HTMLModel",
45 | "model_module_version": "1.5.0",
46 | "state": {
47 | "_dom_classes": [],
48 | "_model_module": "@jupyter-widgets/controls",
49 | "_model_module_version": "1.5.0",
50 | "_model_name": "HTMLModel",
51 | "_view_count": null,
52 | "_view_module": "@jupyter-widgets/controls",
53 | "_view_module_version": "1.5.0",
54 | "_view_name": "HTMLView",
55 | "description": "",
56 | "description_tooltip": null,
57 | "layout": "IPY_MODEL_69a61ad28d5141dcbaea44060bc5ebf7",
58 | "placeholder": "",
59 | "style": "IPY_MODEL_76c2fbf005ae4a5790edfeb499b387b7",
60 | "value": "tokenizer.model: 100%"
61 | }
62 | },
63 | "57efe36e546c473d8be34102f6ba9a58": {
64 | "model_module": "@jupyter-widgets/controls",
65 | "model_name": "FloatProgressModel",
66 | "model_module_version": "1.5.0",
67 | "state": {
68 | "_dom_classes": [],
69 | "_model_module": "@jupyter-widgets/controls",
70 | "_model_module_version": "1.5.0",
71 | "_model_name": "FloatProgressModel",
72 | "_view_count": null,
73 | "_view_module": "@jupyter-widgets/controls",
74 | "_view_module_version": "1.5.0",
75 | "_view_name": "ProgressView",
76 | "bar_style": "success",
77 | "description": "",
78 | "description_tooltip": null,
79 | "layout": "IPY_MODEL_116964f328dc45d991d895d684ac1216",
80 | "max": 493443,
81 | "min": 0,
82 | "orientation": "horizontal",
83 | "style": "IPY_MODEL_1ecec5ba4424498082a5f64cf3d7faf8",
84 | "value": 493443
85 | }
86 | },
87 | "871bad1d905d4877a9eaa242cfd54c4e": {
88 | "model_module": "@jupyter-widgets/controls",
89 | "model_name": "HTMLModel",
90 | "model_module_version": "1.5.0",
91 | "state": {
92 | "_dom_classes": [],
93 | "_model_module": "@jupyter-widgets/controls",
94 | "_model_module_version": "1.5.0",
95 | "_model_name": "HTMLModel",
96 | "_view_count": null,
97 | "_view_module": "@jupyter-widgets/controls",
98 | "_view_module_version": "1.5.0",
99 | "_view_name": "HTMLView",
100 | "description": "",
101 | "description_tooltip": null,
102 | "layout": "IPY_MODEL_fc4edcef273b4e75894f4b512122de94",
103 | "placeholder": "",
104 | "style": "IPY_MODEL_ca2323b142f54998985d30481d5cfabe",
105 | "value": " 493k/493k [00:00<00:00, 42.2kB/s]"
106 | }
107 | },
108 | "8951f6b2edf64464869391197c900f84": {
109 | "model_module": "@jupyter-widgets/base",
110 | "model_name": "LayoutModel",
111 | "model_module_version": "1.2.0",
112 | "state": {
113 | "_model_module": "@jupyter-widgets/base",
114 | "_model_module_version": "1.2.0",
115 | "_model_name": "LayoutModel",
116 | "_view_count": null,
117 | "_view_module": "@jupyter-widgets/base",
118 | "_view_module_version": "1.2.0",
119 | "_view_name": "LayoutView",
120 | "align_content": null,
121 | "align_items": null,
122 | "align_self": null,
123 | "border": null,
124 | "bottom": null,
125 | "display": null,
126 | "flex": null,
127 | "flex_flow": null,
128 | "grid_area": null,
129 | "grid_auto_columns": null,
130 | "grid_auto_flow": null,
131 | "grid_auto_rows": null,
132 | "grid_column": null,
133 | "grid_gap": null,
134 | "grid_row": null,
135 | "grid_template_areas": null,
136 | "grid_template_columns": null,
137 | "grid_template_rows": null,
138 | "height": null,
139 | "justify_content": null,
140 | "justify_items": null,
141 | "left": null,
142 | "margin": null,
143 | "max_height": null,
144 | "max_width": null,
145 | "min_height": null,
146 | "min_width": null,
147 | "object_fit": null,
148 | "object_position": null,
149 | "order": null,
150 | "overflow": null,
151 | "overflow_x": null,
152 | "overflow_y": null,
153 | "padding": null,
154 | "right": null,
155 | "top": null,
156 | "visibility": null,
157 | "width": null
158 | }
159 | },
160 | "69a61ad28d5141dcbaea44060bc5ebf7": {
161 | "model_module": "@jupyter-widgets/base",
162 | "model_name": "LayoutModel",
163 | "model_module_version": "1.2.0",
164 | "state": {
165 | "_model_module": "@jupyter-widgets/base",
166 | "_model_module_version": "1.2.0",
167 | "_model_name": "LayoutModel",
168 | "_view_count": null,
169 | "_view_module": "@jupyter-widgets/base",
170 | "_view_module_version": "1.2.0",
171 | "_view_name": "LayoutView",
172 | "align_content": null,
173 | "align_items": null,
174 | "align_self": null,
175 | "border": null,
176 | "bottom": null,
177 | "display": null,
178 | "flex": null,
179 | "flex_flow": null,
180 | "grid_area": null,
181 | "grid_auto_columns": null,
182 | "grid_auto_flow": null,
183 | "grid_auto_rows": null,
184 | "grid_column": null,
185 | "grid_gap": null,
186 | "grid_row": null,
187 | "grid_template_areas": null,
188 | "grid_template_columns": null,
189 | "grid_template_rows": null,
190 | "height": null,
191 | "justify_content": null,
192 | "justify_items": null,
193 | "left": null,
194 | "margin": null,
195 | "max_height": null,
196 | "max_width": null,
197 | "min_height": null,
198 | "min_width": null,
199 | "object_fit": null,
200 | "object_position": null,
201 | "order": null,
202 | "overflow": null,
203 | "overflow_x": null,
204 | "overflow_y": null,
205 | "padding": null,
206 | "right": null,
207 | "top": null,
208 | "visibility": null,
209 | "width": null
210 | }
211 | },
212 | "76c2fbf005ae4a5790edfeb499b387b7": {
213 | "model_module": "@jupyter-widgets/controls",
214 | "model_name": "DescriptionStyleModel",
215 | "model_module_version": "1.5.0",
216 | "state": {
217 | "_model_module": "@jupyter-widgets/controls",
218 | "_model_module_version": "1.5.0",
219 | "_model_name": "DescriptionStyleModel",
220 | "_view_count": null,
221 | "_view_module": "@jupyter-widgets/base",
222 | "_view_module_version": "1.2.0",
223 | "_view_name": "StyleView",
224 | "description_width": ""
225 | }
226 | },
227 | "116964f328dc45d991d895d684ac1216": {
228 | "model_module": "@jupyter-widgets/base",
229 | "model_name": "LayoutModel",
230 | "model_module_version": "1.2.0",
231 | "state": {
232 | "_model_module": "@jupyter-widgets/base",
233 | "_model_module_version": "1.2.0",
234 | "_model_name": "LayoutModel",
235 | "_view_count": null,
236 | "_view_module": "@jupyter-widgets/base",
237 | "_view_module_version": "1.2.0",
238 | "_view_name": "LayoutView",
239 | "align_content": null,
240 | "align_items": null,
241 | "align_self": null,
242 | "border": null,
243 | "bottom": null,
244 | "display": null,
245 | "flex": null,
246 | "flex_flow": null,
247 | "grid_area": null,
248 | "grid_auto_columns": null,
249 | "grid_auto_flow": null,
250 | "grid_auto_rows": null,
251 | "grid_column": null,
252 | "grid_gap": null,
253 | "grid_row": null,
254 | "grid_template_areas": null,
255 | "grid_template_columns": null,
256 | "grid_template_rows": null,
257 | "height": null,
258 | "justify_content": null,
259 | "justify_items": null,
260 | "left": null,
261 | "margin": null,
262 | "max_height": null,
263 | "max_width": null,
264 | "min_height": null,
265 | "min_width": null,
266 | "object_fit": null,
267 | "object_position": null,
268 | "order": null,
269 | "overflow": null,
270 | "overflow_x": null,
271 | "overflow_y": null,
272 | "padding": null,
273 | "right": null,
274 | "top": null,
275 | "visibility": null,
276 | "width": null
277 | }
278 | },
279 | "1ecec5ba4424498082a5f64cf3d7faf8": {
280 | "model_module": "@jupyter-widgets/controls",
281 | "model_name": "ProgressStyleModel",
282 | "model_module_version": "1.5.0",
283 | "state": {
284 | "_model_module": "@jupyter-widgets/controls",
285 | "_model_module_version": "1.5.0",
286 | "_model_name": "ProgressStyleModel",
287 | "_view_count": null,
288 | "_view_module": "@jupyter-widgets/base",
289 | "_view_module_version": "1.2.0",
290 | "_view_name": "StyleView",
291 | "bar_color": null,
292 | "description_width": ""
293 | }
294 | },
295 | "fc4edcef273b4e75894f4b512122de94": {
296 | "model_module": "@jupyter-widgets/base",
297 | "model_name": "LayoutModel",
298 | "model_module_version": "1.2.0",
299 | "state": {
300 | "_model_module": "@jupyter-widgets/base",
301 | "_model_module_version": "1.2.0",
302 | "_model_name": "LayoutModel",
303 | "_view_count": null,
304 | "_view_module": "@jupyter-widgets/base",
305 | "_view_module_version": "1.2.0",
306 | "_view_name": "LayoutView",
307 | "align_content": null,
308 | "align_items": null,
309 | "align_self": null,
310 | "border": null,
311 | "bottom": null,
312 | "display": null,
313 | "flex": null,
314 | "flex_flow": null,
315 | "grid_area": null,
316 | "grid_auto_columns": null,
317 | "grid_auto_flow": null,
318 | "grid_auto_rows": null,
319 | "grid_column": null,
320 | "grid_gap": null,
321 | "grid_row": null,
322 | "grid_template_areas": null,
323 | "grid_template_columns": null,
324 | "grid_template_rows": null,
325 | "height": null,
326 | "justify_content": null,
327 | "justify_items": null,
328 | "left": null,
329 | "margin": null,
330 | "max_height": null,
331 | "max_width": null,
332 | "min_height": null,
333 | "min_width": null,
334 | "object_fit": null,
335 | "object_position": null,
336 | "order": null,
337 | "overflow": null,
338 | "overflow_x": null,
339 | "overflow_y": null,
340 | "padding": null,
341 | "right": null,
342 | "top": null,
343 | "visibility": null,
344 | "width": null
345 | }
346 | },
347 | "ca2323b142f54998985d30481d5cfabe": {
348 | "model_module": "@jupyter-widgets/controls",
349 | "model_name": "DescriptionStyleModel",
350 | "model_module_version": "1.5.0",
351 | "state": {
352 | "_model_module": "@jupyter-widgets/controls",
353 | "_model_module_version": "1.5.0",
354 | "_model_name": "DescriptionStyleModel",
355 | "_view_count": null,
356 | "_view_module": "@jupyter-widgets/base",
357 | "_view_module_version": "1.2.0",
358 | "_view_name": "StyleView",
359 | "description_width": ""
360 | }
361 | },
362 | "63626ac2d0f546188c07512a04c71417": {
363 | "model_module": "@jupyter-widgets/controls",
364 | "model_name": "HBoxModel",
365 | "model_module_version": "1.5.0",
366 | "state": {
367 | "_dom_classes": [],
368 | "_model_module": "@jupyter-widgets/controls",
369 | "_model_module_version": "1.5.0",
370 | "_model_name": "HBoxModel",
371 | "_view_count": null,
372 | "_view_module": "@jupyter-widgets/controls",
373 | "_view_module_version": "1.5.0",
374 | "_view_name": "HBoxView",
375 | "box_style": "",
376 | "children": [
377 | "IPY_MODEL_decd91747fd04ce39f3e2b733bc7f477",
378 | "IPY_MODEL_7140e4c154424fcab846a71889e99ed2",
379 | "IPY_MODEL_2264d8b75251425e94e635558af4e223"
380 | ],
381 | "layout": "IPY_MODEL_c37478198217457cb30c6649203cf4dc"
382 | }
383 | },
384 | "decd91747fd04ce39f3e2b733bc7f477": {
385 | "model_module": "@jupyter-widgets/controls",
386 | "model_name": "HTMLModel",
387 | "model_module_version": "1.5.0",
388 | "state": {
389 | "_dom_classes": [],
390 | "_model_module": "@jupyter-widgets/controls",
391 | "_model_module_version": "1.5.0",
392 | "_model_name": "HTMLModel",
393 | "_view_count": null,
394 | "_view_module": "@jupyter-widgets/controls",
395 | "_view_module_version": "1.5.0",
396 | "_view_name": "HTMLView",
397 | "description": "",
398 | "description_tooltip": null,
399 | "layout": "IPY_MODEL_4918769e4e984dfda924776e2373154c",
400 | "placeholder": "",
401 | "style": "IPY_MODEL_9b48494c94cf49b5835489d97f7a24c5",
402 | "value": "model-00001-of-00002.safetensors: 100%"
403 | }
404 | },
405 | "7140e4c154424fcab846a71889e99ed2": {
406 | "model_module": "@jupyter-widgets/controls",
407 | "model_name": "FloatProgressModel",
408 | "model_module_version": "1.5.0",
409 | "state": {
410 | "_dom_classes": [],
411 | "_model_module": "@jupyter-widgets/controls",
412 | "_model_module_version": "1.5.0",
413 | "_model_name": "FloatProgressModel",
414 | "_view_count": null,
415 | "_view_module": "@jupyter-widgets/controls",
416 | "_view_module_version": "1.5.0",
417 | "_view_name": "ProgressView",
418 | "bar_style": "success",
419 | "description": "",
420 | "description_tooltip": null,
421 | "layout": "IPY_MODEL_6ed844da52fe466eb1c10c814489448c",
422 | "max": 9942990000,
423 | "min": 0,
424 | "orientation": "horizontal",
425 | "style": "IPY_MODEL_9c60efa02e80423e828628190dd13bc3",
426 | "value": 9942990000
427 | }
428 | },
429 | "2264d8b75251425e94e635558af4e223": {
430 | "model_module": "@jupyter-widgets/controls",
431 | "model_name": "HTMLModel",
432 | "model_module_version": "1.5.0",
433 | "state": {
434 | "_dom_classes": [],
435 | "_model_module": "@jupyter-widgets/controls",
436 | "_model_module_version": "1.5.0",
437 | "_model_name": "HTMLModel",
438 | "_view_count": null,
439 | "_view_module": "@jupyter-widgets/controls",
440 | "_view_module_version": "1.5.0",
441 | "_view_name": "HTMLView",
442 | "description": "",
443 | "description_tooltip": null,
444 | "layout": "IPY_MODEL_0170e8cc57d94041956f7afbf2eef449",
445 | "placeholder": "",
446 | "style": "IPY_MODEL_220c2ba5f2524271b24fe049431a474c",
447 | "value": " 9.94G/9.94G [04:04<00:00, 36.9MB/s]"
448 | }
449 | },
450 | "c37478198217457cb30c6649203cf4dc": {
451 | "model_module": "@jupyter-widgets/base",
452 | "model_name": "LayoutModel",
453 | "model_module_version": "1.2.0",
454 | "state": {
455 | "_model_module": "@jupyter-widgets/base",
456 | "_model_module_version": "1.2.0",
457 | "_model_name": "LayoutModel",
458 | "_view_count": null,
459 | "_view_module": "@jupyter-widgets/base",
460 | "_view_module_version": "1.2.0",
461 | "_view_name": "LayoutView",
462 | "align_content": null,
463 | "align_items": null,
464 | "align_self": null,
465 | "border": null,
466 | "bottom": null,
467 | "display": null,
468 | "flex": null,
469 | "flex_flow": null,
470 | "grid_area": null,
471 | "grid_auto_columns": null,
472 | "grid_auto_flow": null,
473 | "grid_auto_rows": null,
474 | "grid_column": null,
475 | "grid_gap": null,
476 | "grid_row": null,
477 | "grid_template_areas": null,
478 | "grid_template_columns": null,
479 | "grid_template_rows": null,
480 | "height": null,
481 | "justify_content": null,
482 | "justify_items": null,
483 | "left": null,
484 | "margin": null,
485 | "max_height": null,
486 | "max_width": null,
487 | "min_height": null,
488 | "min_width": null,
489 | "object_fit": null,
490 | "object_position": null,
491 | "order": null,
492 | "overflow": null,
493 | "overflow_x": null,
494 | "overflow_y": null,
495 | "padding": null,
496 | "right": null,
497 | "top": null,
498 | "visibility": null,
499 | "width": null
500 | }
501 | },
502 | "4918769e4e984dfda924776e2373154c": {
503 | "model_module": "@jupyter-widgets/base",
504 | "model_name": "LayoutModel",
505 | "model_module_version": "1.2.0",
506 | "state": {
507 | "_model_module": "@jupyter-widgets/base",
508 | "_model_module_version": "1.2.0",
509 | "_model_name": "LayoutModel",
510 | "_view_count": null,
511 | "_view_module": "@jupyter-widgets/base",
512 | "_view_module_version": "1.2.0",
513 | "_view_name": "LayoutView",
514 | "align_content": null,
515 | "align_items": null,
516 | "align_self": null,
517 | "border": null,
518 | "bottom": null,
519 | "display": null,
520 | "flex": null,
521 | "flex_flow": null,
522 | "grid_area": null,
523 | "grid_auto_columns": null,
524 | "grid_auto_flow": null,
525 | "grid_auto_rows": null,
526 | "grid_column": null,
527 | "grid_gap": null,
528 | "grid_row": null,
529 | "grid_template_areas": null,
530 | "grid_template_columns": null,
531 | "grid_template_rows": null,
532 | "height": null,
533 | "justify_content": null,
534 | "justify_items": null,
535 | "left": null,
536 | "margin": null,
537 | "max_height": null,
538 | "max_width": null,
539 | "min_height": null,
540 | "min_width": null,
541 | "object_fit": null,
542 | "object_position": null,
543 | "order": null,
544 | "overflow": null,
545 | "overflow_x": null,
546 | "overflow_y": null,
547 | "padding": null,
548 | "right": null,
549 | "top": null,
550 | "visibility": null,
551 | "width": null
552 | }
553 | },
554 | "9b48494c94cf49b5835489d97f7a24c5": {
555 | "model_module": "@jupyter-widgets/controls",
556 | "model_name": "DescriptionStyleModel",
557 | "model_module_version": "1.5.0",
558 | "state": {
559 | "_model_module": "@jupyter-widgets/controls",
560 | "_model_module_version": "1.5.0",
561 | "_model_name": "DescriptionStyleModel",
562 | "_view_count": null,
563 | "_view_module": "@jupyter-widgets/base",
564 | "_view_module_version": "1.2.0",
565 | "_view_name": "StyleView",
566 | "description_width": ""
567 | }
568 | },
569 | "6ed844da52fe466eb1c10c814489448c": {
570 | "model_module": "@jupyter-widgets/base",
571 | "model_name": "LayoutModel",
572 | "model_module_version": "1.2.0",
573 | "state": {
574 | "_model_module": "@jupyter-widgets/base",
575 | "_model_module_version": "1.2.0",
576 | "_model_name": "LayoutModel",
577 | "_view_count": null,
578 | "_view_module": "@jupyter-widgets/base",
579 | "_view_module_version": "1.2.0",
580 | "_view_name": "LayoutView",
581 | "align_content": null,
582 | "align_items": null,
583 | "align_self": null,
584 | "border": null,
585 | "bottom": null,
586 | "display": null,
587 | "flex": null,
588 | "flex_flow": null,
589 | "grid_area": null,
590 | "grid_auto_columns": null,
591 | "grid_auto_flow": null,
592 | "grid_auto_rows": null,
593 | "grid_column": null,
594 | "grid_gap": null,
595 | "grid_row": null,
596 | "grid_template_areas": null,
597 | "grid_template_columns": null,
598 | "grid_template_rows": null,
599 | "height": null,
600 | "justify_content": null,
601 | "justify_items": null,
602 | "left": null,
603 | "margin": null,
604 | "max_height": null,
605 | "max_width": null,
606 | "min_height": null,
607 | "min_width": null,
608 | "object_fit": null,
609 | "object_position": null,
610 | "order": null,
611 | "overflow": null,
612 | "overflow_x": null,
613 | "overflow_y": null,
614 | "padding": null,
615 | "right": null,
616 | "top": null,
617 | "visibility": null,
618 | "width": null
619 | }
620 | },
621 | "9c60efa02e80423e828628190dd13bc3": {
622 | "model_module": "@jupyter-widgets/controls",
623 | "model_name": "ProgressStyleModel",
624 | "model_module_version": "1.5.0",
625 | "state": {
626 | "_model_module": "@jupyter-widgets/controls",
627 | "_model_module_version": "1.5.0",
628 | "_model_name": "ProgressStyleModel",
629 | "_view_count": null,
630 | "_view_module": "@jupyter-widgets/base",
631 | "_view_module_version": "1.2.0",
632 | "_view_name": "StyleView",
633 | "bar_color": null,
634 | "description_width": ""
635 | }
636 | },
637 | "0170e8cc57d94041956f7afbf2eef449": {
638 | "model_module": "@jupyter-widgets/base",
639 | "model_name": "LayoutModel",
640 | "model_module_version": "1.2.0",
641 | "state": {
642 | "_model_module": "@jupyter-widgets/base",
643 | "_model_module_version": "1.2.0",
644 | "_model_name": "LayoutModel",
645 | "_view_count": null,
646 | "_view_module": "@jupyter-widgets/base",
647 | "_view_module_version": "1.2.0",
648 | "_view_name": "LayoutView",
649 | "align_content": null,
650 | "align_items": null,
651 | "align_self": null,
652 | "border": null,
653 | "bottom": null,
654 | "display": null,
655 | "flex": null,
656 | "flex_flow": null,
657 | "grid_area": null,
658 | "grid_auto_columns": null,
659 | "grid_auto_flow": null,
660 | "grid_auto_rows": null,
661 | "grid_column": null,
662 | "grid_gap": null,
663 | "grid_row": null,
664 | "grid_template_areas": null,
665 | "grid_template_columns": null,
666 | "grid_template_rows": null,
667 | "height": null,
668 | "justify_content": null,
669 | "justify_items": null,
670 | "left": null,
671 | "margin": null,
672 | "max_height": null,
673 | "max_width": null,
674 | "min_height": null,
675 | "min_width": null,
676 | "object_fit": null,
677 | "object_position": null,
678 | "order": null,
679 | "overflow": null,
680 | "overflow_x": null,
681 | "overflow_y": null,
682 | "padding": null,
683 | "right": null,
684 | "top": null,
685 | "visibility": null,
686 | "width": null
687 | }
688 | },
689 | "220c2ba5f2524271b24fe049431a474c": {
690 | "model_module": "@jupyter-widgets/controls",
691 | "model_name": "DescriptionStyleModel",
692 | "model_module_version": "1.5.0",
693 | "state": {
694 | "_model_module": "@jupyter-widgets/controls",
695 | "_model_module_version": "1.5.0",
696 | "_model_name": "DescriptionStyleModel",
697 | "_view_count": null,
698 | "_view_module": "@jupyter-widgets/base",
699 | "_view_module_version": "1.2.0",
700 | "_view_name": "StyleView",
701 | "description_width": ""
702 | }
703 | },
704 | "a6f99dd0662846f9a381d2d507a7b447": {
705 | "model_module": "@jupyter-widgets/controls",
706 | "model_name": "HBoxModel",
707 | "model_module_version": "1.5.0",
708 | "state": {
709 | "_dom_classes": [],
710 | "_model_module": "@jupyter-widgets/controls",
711 | "_model_module_version": "1.5.0",
712 | "_model_name": "HBoxModel",
713 | "_view_count": null,
714 | "_view_module": "@jupyter-widgets/controls",
715 | "_view_module_version": "1.5.0",
716 | "_view_name": "HBoxView",
717 | "box_style": "",
718 | "children": [
719 | "IPY_MODEL_900b9fcb70a84781bd5b4213df54626d",
720 | "IPY_MODEL_0ea83f270e164795b64f23b143efb300",
721 | "IPY_MODEL_318dcdeac8fb40f88fa60114f1c6a7c1"
722 | ],
723 | "layout": "IPY_MODEL_af89cf715e0e4c5e9f59943a255394c1"
724 | }
725 | },
726 | "900b9fcb70a84781bd5b4213df54626d": {
727 | "model_module": "@jupyter-widgets/controls",
728 | "model_name": "HTMLModel",
729 | "model_module_version": "1.5.0",
730 | "state": {
731 | "_dom_classes": [],
732 | "_model_module": "@jupyter-widgets/controls",
733 | "_model_module_version": "1.5.0",
734 | "_model_name": "HTMLModel",
735 | "_view_count": null,
736 | "_view_module": "@jupyter-widgets/controls",
737 | "_view_module_version": "1.5.0",
738 | "_view_name": "HTMLView",
739 | "description": "",
740 | "description_tooltip": null,
741 | "layout": "IPY_MODEL_40e23e35299d45d499432b8f1a9bc924",
742 | "placeholder": "",
743 | "style": "IPY_MODEL_126b374e286747768ef7218454534640",
744 | "value": "Upload 3 LFS files: 100%"
745 | }
746 | },
747 | "0ea83f270e164795b64f23b143efb300": {
748 | "model_module": "@jupyter-widgets/controls",
749 | "model_name": "FloatProgressModel",
750 | "model_module_version": "1.5.0",
751 | "state": {
752 | "_dom_classes": [],
753 | "_model_module": "@jupyter-widgets/controls",
754 | "_model_module_version": "1.5.0",
755 | "_model_name": "FloatProgressModel",
756 | "_view_count": null,
757 | "_view_module": "@jupyter-widgets/controls",
758 | "_view_module_version": "1.5.0",
759 | "_view_name": "ProgressView",
760 | "bar_style": "success",
761 | "description": "",
762 | "description_tooltip": null,
763 | "layout": "IPY_MODEL_bdd26e54eed5477f99b135552e5f3450",
764 | "max": 3,
765 | "min": 0,
766 | "orientation": "horizontal",
767 | "style": "IPY_MODEL_163a6fd878134e1eb5f193d1ebfff1c1",
768 | "value": 3
769 | }
770 | },
771 | "318dcdeac8fb40f88fa60114f1c6a7c1": {
772 | "model_module": "@jupyter-widgets/controls",
773 | "model_name": "HTMLModel",
774 | "model_module_version": "1.5.0",
775 | "state": {
776 | "_dom_classes": [],
777 | "_model_module": "@jupyter-widgets/controls",
778 | "_model_module_version": "1.5.0",
779 | "_model_name": "HTMLModel",
780 | "_view_count": null,
781 | "_view_module": "@jupyter-widgets/controls",
782 | "_view_module_version": "1.5.0",
783 | "_view_name": "HTMLView",
784 | "description": "",
785 | "description_tooltip": null,
786 | "layout": "IPY_MODEL_953d7c014f76413c9805a2ef8c2c9356",
787 | "placeholder": "",
788 | "style": "IPY_MODEL_348879bf76d1471f9c79c1ec2dc07c1d",
789 | "value": " 3/3 [04:05<00:00, 245.46s/it]"
790 | }
791 | },
792 | "af89cf715e0e4c5e9f59943a255394c1": {
793 | "model_module": "@jupyter-widgets/base",
794 | "model_name": "LayoutModel",
795 | "model_module_version": "1.2.0",
796 | "state": {
797 | "_model_module": "@jupyter-widgets/base",
798 | "_model_module_version": "1.2.0",
799 | "_model_name": "LayoutModel",
800 | "_view_count": null,
801 | "_view_module": "@jupyter-widgets/base",
802 | "_view_module_version": "1.2.0",
803 | "_view_name": "LayoutView",
804 | "align_content": null,
805 | "align_items": null,
806 | "align_self": null,
807 | "border": null,
808 | "bottom": null,
809 | "display": null,
810 | "flex": null,
811 | "flex_flow": null,
812 | "grid_area": null,
813 | "grid_auto_columns": null,
814 | "grid_auto_flow": null,
815 | "grid_auto_rows": null,
816 | "grid_column": null,
817 | "grid_gap": null,
818 | "grid_row": null,
819 | "grid_template_areas": null,
820 | "grid_template_columns": null,
821 | "grid_template_rows": null,
822 | "height": null,
823 | "justify_content": null,
824 | "justify_items": null,
825 | "left": null,
826 | "margin": null,
827 | "max_height": null,
828 | "max_width": null,
829 | "min_height": null,
830 | "min_width": null,
831 | "object_fit": null,
832 | "object_position": null,
833 | "order": null,
834 | "overflow": null,
835 | "overflow_x": null,
836 | "overflow_y": null,
837 | "padding": null,
838 | "right": null,
839 | "top": null,
840 | "visibility": null,
841 | "width": null
842 | }
843 | },
844 | "40e23e35299d45d499432b8f1a9bc924": {
845 | "model_module": "@jupyter-widgets/base",
846 | "model_name": "LayoutModel",
847 | "model_module_version": "1.2.0",
848 | "state": {
849 | "_model_module": "@jupyter-widgets/base",
850 | "_model_module_version": "1.2.0",
851 | "_model_name": "LayoutModel",
852 | "_view_count": null,
853 | "_view_module": "@jupyter-widgets/base",
854 | "_view_module_version": "1.2.0",
855 | "_view_name": "LayoutView",
856 | "align_content": null,
857 | "align_items": null,
858 | "align_self": null,
859 | "border": null,
860 | "bottom": null,
861 | "display": null,
862 | "flex": null,
863 | "flex_flow": null,
864 | "grid_area": null,
865 | "grid_auto_columns": null,
866 | "grid_auto_flow": null,
867 | "grid_auto_rows": null,
868 | "grid_column": null,
869 | "grid_gap": null,
870 | "grid_row": null,
871 | "grid_template_areas": null,
872 | "grid_template_columns": null,
873 | "grid_template_rows": null,
874 | "height": null,
875 | "justify_content": null,
876 | "justify_items": null,
877 | "left": null,
878 | "margin": null,
879 | "max_height": null,
880 | "max_width": null,
881 | "min_height": null,
882 | "min_width": null,
883 | "object_fit": null,
884 | "object_position": null,
885 | "order": null,
886 | "overflow": null,
887 | "overflow_x": null,
888 | "overflow_y": null,
889 | "padding": null,
890 | "right": null,
891 | "top": null,
892 | "visibility": null,
893 | "width": null
894 | }
895 | },
896 | "126b374e286747768ef7218454534640": {
897 | "model_module": "@jupyter-widgets/controls",
898 | "model_name": "DescriptionStyleModel",
899 | "model_module_version": "1.5.0",
900 | "state": {
901 | "_model_module": "@jupyter-widgets/controls",
902 | "_model_module_version": "1.5.0",
903 | "_model_name": "DescriptionStyleModel",
904 | "_view_count": null,
905 | "_view_module": "@jupyter-widgets/base",
906 | "_view_module_version": "1.2.0",
907 | "_view_name": "StyleView",
908 | "description_width": ""
909 | }
910 | },
911 | "bdd26e54eed5477f99b135552e5f3450": {
912 | "model_module": "@jupyter-widgets/base",
913 | "model_name": "LayoutModel",
914 | "model_module_version": "1.2.0",
915 | "state": {
916 | "_model_module": "@jupyter-widgets/base",
917 | "_model_module_version": "1.2.0",
918 | "_model_name": "LayoutModel",
919 | "_view_count": null,
920 | "_view_module": "@jupyter-widgets/base",
921 | "_view_module_version": "1.2.0",
922 | "_view_name": "LayoutView",
923 | "align_content": null,
924 | "align_items": null,
925 | "align_self": null,
926 | "border": null,
927 | "bottom": null,
928 | "display": null,
929 | "flex": null,
930 | "flex_flow": null,
931 | "grid_area": null,
932 | "grid_auto_columns": null,
933 | "grid_auto_flow": null,
934 | "grid_auto_rows": null,
935 | "grid_column": null,
936 | "grid_gap": null,
937 | "grid_row": null,
938 | "grid_template_areas": null,
939 | "grid_template_columns": null,
940 | "grid_template_rows": null,
941 | "height": null,
942 | "justify_content": null,
943 | "justify_items": null,
944 | "left": null,
945 | "margin": null,
946 | "max_height": null,
947 | "max_width": null,
948 | "min_height": null,
949 | "min_width": null,
950 | "object_fit": null,
951 | "object_position": null,
952 | "order": null,
953 | "overflow": null,
954 | "overflow_x": null,
955 | "overflow_y": null,
956 | "padding": null,
957 | "right": null,
958 | "top": null,
959 | "visibility": null,
960 | "width": null
961 | }
962 | },
963 | "163a6fd878134e1eb5f193d1ebfff1c1": {
964 | "model_module": "@jupyter-widgets/controls",
965 | "model_name": "ProgressStyleModel",
966 | "model_module_version": "1.5.0",
967 | "state": {
968 | "_model_module": "@jupyter-widgets/controls",
969 | "_model_module_version": "1.5.0",
970 | "_model_name": "ProgressStyleModel",
971 | "_view_count": null,
972 | "_view_module": "@jupyter-widgets/base",
973 | "_view_module_version": "1.2.0",
974 | "_view_name": "StyleView",
975 | "bar_color": null,
976 | "description_width": ""
977 | }
978 | },
979 | "953d7c014f76413c9805a2ef8c2c9356": {
980 | "model_module": "@jupyter-widgets/base",
981 | "model_name": "LayoutModel",
982 | "model_module_version": "1.2.0",
983 | "state": {
984 | "_model_module": "@jupyter-widgets/base",
985 | "_model_module_version": "1.2.0",
986 | "_model_name": "LayoutModel",
987 | "_view_count": null,
988 | "_view_module": "@jupyter-widgets/base",
989 | "_view_module_version": "1.2.0",
990 | "_view_name": "LayoutView",
991 | "align_content": null,
992 | "align_items": null,
993 | "align_self": null,
994 | "border": null,
995 | "bottom": null,
996 | "display": null,
997 | "flex": null,
998 | "flex_flow": null,
999 | "grid_area": null,
1000 | "grid_auto_columns": null,
1001 | "grid_auto_flow": null,
1002 | "grid_auto_rows": null,
1003 | "grid_column": null,
1004 | "grid_gap": null,
1005 | "grid_row": null,
1006 | "grid_template_areas": null,
1007 | "grid_template_columns": null,
1008 | "grid_template_rows": null,
1009 | "height": null,
1010 | "justify_content": null,
1011 | "justify_items": null,
1012 | "left": null,
1013 | "margin": null,
1014 | "max_height": null,
1015 | "max_width": null,
1016 | "min_height": null,
1017 | "min_width": null,
1018 | "object_fit": null,
1019 | "object_position": null,
1020 | "order": null,
1021 | "overflow": null,
1022 | "overflow_x": null,
1023 | "overflow_y": null,
1024 | "padding": null,
1025 | "right": null,
1026 | "top": null,
1027 | "visibility": null,
1028 | "width": null
1029 | }
1030 | },
1031 | "348879bf76d1471f9c79c1ec2dc07c1d": {
1032 | "model_module": "@jupyter-widgets/controls",
1033 | "model_name": "DescriptionStyleModel",
1034 | "model_module_version": "1.5.0",
1035 | "state": {
1036 | "_model_module": "@jupyter-widgets/controls",
1037 | "_model_module_version": "1.5.0",
1038 | "_model_name": "DescriptionStyleModel",
1039 | "_view_count": null,
1040 | "_view_module": "@jupyter-widgets/base",
1041 | "_view_module_version": "1.2.0",
1042 | "_view_name": "StyleView",
1043 | "description_width": ""
1044 | }
1045 | },
1046 | "8d54ae0d028b40e7b018454187db1a1c": {
1047 | "model_module": "@jupyter-widgets/controls",
1048 | "model_name": "HBoxModel",
1049 | "model_module_version": "1.5.0",
1050 | "state": {
1051 | "_dom_classes": [],
1052 | "_model_module": "@jupyter-widgets/controls",
1053 | "_model_module_version": "1.5.0",
1054 | "_model_name": "HBoxModel",
1055 | "_view_count": null,
1056 | "_view_module": "@jupyter-widgets/controls",
1057 | "_view_module_version": "1.5.0",
1058 | "_view_name": "HBoxView",
1059 | "box_style": "",
1060 | "children": [
1061 | "IPY_MODEL_562353040be54593b23734390f49927c",
1062 | "IPY_MODEL_00cbebe6df7d4995913f20e39fc71b15",
1063 | "IPY_MODEL_aee3c563fdc54f9cb3ebc2630c84a9e6"
1064 | ],
1065 | "layout": "IPY_MODEL_b74e307a751844ffab9f7f3df367774b"
1066 | }
1067 | },
1068 | "562353040be54593b23734390f49927c": {
1069 | "model_module": "@jupyter-widgets/controls",
1070 | "model_name": "HTMLModel",
1071 | "model_module_version": "1.5.0",
1072 | "state": {
1073 | "_dom_classes": [],
1074 | "_model_module": "@jupyter-widgets/controls",
1075 | "_model_module_version": "1.5.0",
1076 | "_model_name": "HTMLModel",
1077 | "_view_count": null,
1078 | "_view_module": "@jupyter-widgets/controls",
1079 | "_view_module_version": "1.5.0",
1080 | "_view_name": "HTMLView",
1081 | "description": "",
1082 | "description_tooltip": null,
1083 | "layout": "IPY_MODEL_8e6142e41f714fe9abe6a5bb72c071f9",
1084 | "placeholder": "",
1085 | "style": "IPY_MODEL_49cd1c5663404fb5a307c345e7e970c3",
1086 | "value": "model-00002-of-00002.safetensors: 100%"
1087 | }
1088 | },
1089 | "00cbebe6df7d4995913f20e39fc71b15": {
1090 | "model_module": "@jupyter-widgets/controls",
1091 | "model_name": "FloatProgressModel",
1092 | "model_module_version": "1.5.0",
1093 | "state": {
1094 | "_dom_classes": [],
1095 | "_model_module": "@jupyter-widgets/controls",
1096 | "_model_module_version": "1.5.0",
1097 | "_model_name": "FloatProgressModel",
1098 | "_view_count": null,
1099 | "_view_module": "@jupyter-widgets/controls",
1100 | "_view_module_version": "1.5.0",
1101 | "_view_name": "ProgressView",
1102 | "bar_style": "success",
1103 | "description": "",
1104 | "description_tooltip": null,
1105 | "layout": "IPY_MODEL_920ef8e509d24ccda930f4c47eff158c",
1106 | "max": 8030324832,
1107 | "min": 0,
1108 | "orientation": "horizontal",
1109 | "style": "IPY_MODEL_c8828d61b26a47ac97a1541e14c00f62",
1110 | "value": 8030324832
1111 | }
1112 | },
1113 | "aee3c563fdc54f9cb3ebc2630c84a9e6": {
1114 | "model_module": "@jupyter-widgets/controls",
1115 | "model_name": "HTMLModel",
1116 | "model_module_version": "1.5.0",
1117 | "state": {
1118 | "_dom_classes": [],
1119 | "_model_module": "@jupyter-widgets/controls",
1120 | "_model_module_version": "1.5.0",
1121 | "_model_name": "HTMLModel",
1122 | "_view_count": null,
1123 | "_view_module": "@jupyter-widgets/controls",
1124 | "_view_module_version": "1.5.0",
1125 | "_view_name": "HTMLView",
1126 | "description": "",
1127 | "description_tooltip": null,
1128 | "layout": "IPY_MODEL_a3d7e352222647a99be79935b1ebd86a",
1129 | "placeholder": "",
1130 | "style": "IPY_MODEL_80666ef5f07641c482a23618a767791d",
1131 | "value": " 8.03G/8.03G [03:13<00:00, 54.0MB/s]"
1132 | }
1133 | },
1134 | "b74e307a751844ffab9f7f3df367774b": {
1135 | "model_module": "@jupyter-widgets/base",
1136 | "model_name": "LayoutModel",
1137 | "model_module_version": "1.2.0",
1138 | "state": {
1139 | "_model_module": "@jupyter-widgets/base",
1140 | "_model_module_version": "1.2.0",
1141 | "_model_name": "LayoutModel",
1142 | "_view_count": null,
1143 | "_view_module": "@jupyter-widgets/base",
1144 | "_view_module_version": "1.2.0",
1145 | "_view_name": "LayoutView",
1146 | "align_content": null,
1147 | "align_items": null,
1148 | "align_self": null,
1149 | "border": null,
1150 | "bottom": null,
1151 | "display": null,
1152 | "flex": null,
1153 | "flex_flow": null,
1154 | "grid_area": null,
1155 | "grid_auto_columns": null,
1156 | "grid_auto_flow": null,
1157 | "grid_auto_rows": null,
1158 | "grid_column": null,
1159 | "grid_gap": null,
1160 | "grid_row": null,
1161 | "grid_template_areas": null,
1162 | "grid_template_columns": null,
1163 | "grid_template_rows": null,
1164 | "height": null,
1165 | "justify_content": null,
1166 | "justify_items": null,
1167 | "left": null,
1168 | "margin": null,
1169 | "max_height": null,
1170 | "max_width": null,
1171 | "min_height": null,
1172 | "min_width": null,
1173 | "object_fit": null,
1174 | "object_position": null,
1175 | "order": null,
1176 | "overflow": null,
1177 | "overflow_x": null,
1178 | "overflow_y": null,
1179 | "padding": null,
1180 | "right": null,
1181 | "top": null,
1182 | "visibility": null,
1183 | "width": null
1184 | }
1185 | },
1186 | "8e6142e41f714fe9abe6a5bb72c071f9": {
1187 | "model_module": "@jupyter-widgets/base",
1188 | "model_name": "LayoutModel",
1189 | "model_module_version": "1.2.0",
1190 | "state": {
1191 | "_model_module": "@jupyter-widgets/base",
1192 | "_model_module_version": "1.2.0",
1193 | "_model_name": "LayoutModel",
1194 | "_view_count": null,
1195 | "_view_module": "@jupyter-widgets/base",
1196 | "_view_module_version": "1.2.0",
1197 | "_view_name": "LayoutView",
1198 | "align_content": null,
1199 | "align_items": null,
1200 | "align_self": null,
1201 | "border": null,
1202 | "bottom": null,
1203 | "display": null,
1204 | "flex": null,
1205 | "flex_flow": null,
1206 | "grid_area": null,
1207 | "grid_auto_columns": null,
1208 | "grid_auto_flow": null,
1209 | "grid_auto_rows": null,
1210 | "grid_column": null,
1211 | "grid_gap": null,
1212 | "grid_row": null,
1213 | "grid_template_areas": null,
1214 | "grid_template_columns": null,
1215 | "grid_template_rows": null,
1216 | "height": null,
1217 | "justify_content": null,
1218 | "justify_items": null,
1219 | "left": null,
1220 | "margin": null,
1221 | "max_height": null,
1222 | "max_width": null,
1223 | "min_height": null,
1224 | "min_width": null,
1225 | "object_fit": null,
1226 | "object_position": null,
1227 | "order": null,
1228 | "overflow": null,
1229 | "overflow_x": null,
1230 | "overflow_y": null,
1231 | "padding": null,
1232 | "right": null,
1233 | "top": null,
1234 | "visibility": null,
1235 | "width": null
1236 | }
1237 | },
1238 | "49cd1c5663404fb5a307c345e7e970c3": {
1239 | "model_module": "@jupyter-widgets/controls",
1240 | "model_name": "DescriptionStyleModel",
1241 | "model_module_version": "1.5.0",
1242 | "state": {
1243 | "_model_module": "@jupyter-widgets/controls",
1244 | "_model_module_version": "1.5.0",
1245 | "_model_name": "DescriptionStyleModel",
1246 | "_view_count": null,
1247 | "_view_module": "@jupyter-widgets/base",
1248 | "_view_module_version": "1.2.0",
1249 | "_view_name": "StyleView",
1250 | "description_width": ""
1251 | }
1252 | },
1253 | "920ef8e509d24ccda930f4c47eff158c": {
1254 | "model_module": "@jupyter-widgets/base",
1255 | "model_name": "LayoutModel",
1256 | "model_module_version": "1.2.0",
1257 | "state": {
1258 | "_model_module": "@jupyter-widgets/base",
1259 | "_model_module_version": "1.2.0",
1260 | "_model_name": "LayoutModel",
1261 | "_view_count": null,
1262 | "_view_module": "@jupyter-widgets/base",
1263 | "_view_module_version": "1.2.0",
1264 | "_view_name": "LayoutView",
1265 | "align_content": null,
1266 | "align_items": null,
1267 | "align_self": null,
1268 | "border": null,
1269 | "bottom": null,
1270 | "display": null,
1271 | "flex": null,
1272 | "flex_flow": null,
1273 | "grid_area": null,
1274 | "grid_auto_columns": null,
1275 | "grid_auto_flow": null,
1276 | "grid_auto_rows": null,
1277 | "grid_column": null,
1278 | "grid_gap": null,
1279 | "grid_row": null,
1280 | "grid_template_areas": null,
1281 | "grid_template_columns": null,
1282 | "grid_template_rows": null,
1283 | "height": null,
1284 | "justify_content": null,
1285 | "justify_items": null,
1286 | "left": null,
1287 | "margin": null,
1288 | "max_height": null,
1289 | "max_width": null,
1290 | "min_height": null,
1291 | "min_width": null,
1292 | "object_fit": null,
1293 | "object_position": null,
1294 | "order": null,
1295 | "overflow": null,
1296 | "overflow_x": null,
1297 | "overflow_y": null,
1298 | "padding": null,
1299 | "right": null,
1300 | "top": null,
1301 | "visibility": null,
1302 | "width": null
1303 | }
1304 | },
1305 | "c8828d61b26a47ac97a1541e14c00f62": {
1306 | "model_module": "@jupyter-widgets/controls",
1307 | "model_name": "ProgressStyleModel",
1308 | "model_module_version": "1.5.0",
1309 | "state": {
1310 | "_model_module": "@jupyter-widgets/controls",
1311 | "_model_module_version": "1.5.0",
1312 | "_model_name": "ProgressStyleModel",
1313 | "_view_count": null,
1314 | "_view_module": "@jupyter-widgets/base",
1315 | "_view_module_version": "1.2.0",
1316 | "_view_name": "StyleView",
1317 | "bar_color": null,
1318 | "description_width": ""
1319 | }
1320 | },
1321 | "a3d7e352222647a99be79935b1ebd86a": {
1322 | "model_module": "@jupyter-widgets/base",
1323 | "model_name": "LayoutModel",
1324 | "model_module_version": "1.2.0",
1325 | "state": {
1326 | "_model_module": "@jupyter-widgets/base",
1327 | "_model_module_version": "1.2.0",
1328 | "_model_name": "LayoutModel",
1329 | "_view_count": null,
1330 | "_view_module": "@jupyter-widgets/base",
1331 | "_view_module_version": "1.2.0",
1332 | "_view_name": "LayoutView",
1333 | "align_content": null,
1334 | "align_items": null,
1335 | "align_self": null,
1336 | "border": null,
1337 | "bottom": null,
1338 | "display": null,
1339 | "flex": null,
1340 | "flex_flow": null,
1341 | "grid_area": null,
1342 | "grid_auto_columns": null,
1343 | "grid_auto_flow": null,
1344 | "grid_auto_rows": null,
1345 | "grid_column": null,
1346 | "grid_gap": null,
1347 | "grid_row": null,
1348 | "grid_template_areas": null,
1349 | "grid_template_columns": null,
1350 | "grid_template_rows": null,
1351 | "height": null,
1352 | "justify_content": null,
1353 | "justify_items": null,
1354 | "left": null,
1355 | "margin": null,
1356 | "max_height": null,
1357 | "max_width": null,
1358 | "min_height": null,
1359 | "min_width": null,
1360 | "object_fit": null,
1361 | "object_position": null,
1362 | "order": null,
1363 | "overflow": null,
1364 | "overflow_x": null,
1365 | "overflow_y": null,
1366 | "padding": null,
1367 | "right": null,
1368 | "top": null,
1369 | "visibility": null,
1370 | "width": null
1371 | }
1372 | },
1373 | "80666ef5f07641c482a23618a767791d": {
1374 | "model_module": "@jupyter-widgets/controls",
1375 | "model_name": "DescriptionStyleModel",
1376 | "model_module_version": "1.5.0",
1377 | "state": {
1378 | "_model_module": "@jupyter-widgets/controls",
1379 | "_model_module_version": "1.5.0",
1380 | "_model_name": "DescriptionStyleModel",
1381 | "_view_count": null,
1382 | "_view_module": "@jupyter-widgets/base",
1383 | "_view_module_version": "1.2.0",
1384 | "_view_name": "StyleView",
1385 | "description_width": ""
1386 | }
1387 | }
1388 | }
1389 | }
1390 | },
1391 | "cells": [
1392 | {
1393 | "cell_type": "markdown",
1394 | "metadata": {
1395 | "id": "view-in-github",
1396 | "colab_type": "text"
1397 | },
1398 | "source": [
1399 | " "
1400 | ]
1401 | },
1402 | {
1403 | "cell_type": "markdown",
1404 | "source": [
1405 | "# Merge Large Language Models with mergekit\n",
1406 | "> 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)\n",
1407 | "\n",
1408 | "❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).\n",
1409 | "\n",
1410 | "Model merging only requires a lot of RAM. With a free Google Colab account, you should be able to run it using a T4 GPU (VRAM offloading).\n",
1411 | "\n",
1412 | "Examples of merge configurations:\n",
1413 | "\n",
1414 | "### TIES-Merging\n",
1415 | "\n",
1416 | "```yaml\n",
1417 | "models:\n",
1418 | " - model: mistralai/Mistral-7B-v0.1\n",
1419 | " # no parameters necessary for base model\n",
1420 | " - model: OpenPipe/mistral-ft-optimized-1218\n",
1421 | " parameters:\n",
1422 | " density: 0.5\n",
1423 | " weight: 0.5\n",
1424 | " - model: mlabonne/NeuralHermes-2.5-Mistral-7B\n",
1425 | " parameters:\n",
1426 | " density: 0.5\n",
1427 | " weight: 0.3\n",
1428 | "merge_method: ties\n",
1429 | "base_model: mistralai/Mistral-7B-v0.1\n",
1430 | "parameters:\n",
1431 | " normalize: true\n",
1432 | "dtype: float16\n",
1433 | "```\n",
1434 | "\n",
1435 | "You can find the final model on the Hugging Face Hub at [mlabonne/NeuralPipe-7B-ties](https://huggingface.co/mlabonne/NeuralPipe-7B-ties).\n",
1436 | "\n",
1437 | "### SLERP\n",
1438 | "\n",
1439 | "```yaml\n",
1440 | "slices:\n",
1441 | " - sources:\n",
1442 | " - model: OpenPipe/mistral-ft-optimized-1218\n",
1443 | " layer_range: [0, 32]\n",
1444 | " - model: mlabonne/NeuralHermes-2.5-Mistral-7B\n",
1445 | " layer_range: [0, 32]\n",
1446 | "merge_method: slerp\n",
1447 | "base_model: OpenPipe/mistral-ft-optimized-1218\n",
1448 | "parameters:\n",
1449 | " t:\n",
1450 | " - filter: self_attn\n",
1451 | " value: [0, 0.5, 0.3, 0.7, 1]\n",
1452 | " - filter: mlp\n",
1453 | " value: [1, 0.5, 0.7, 0.3, 0]\n",
1454 | " - value: 0.5\n",
1455 | "dtype: bfloat16\n",
1456 | "```\n",
1457 | "\n",
1458 | "You can find the final model on the Hugging Face Hub at [mlabonne/NeuralPipe-7B-slerp](https://huggingface.co/mlabonne/NeuralPipe-7B-slerp).\n",
1459 | "\n",
1460 | "### Passthrough\n",
1461 | "\n",
1462 | "```yaml\n",
1463 | "slices:\n",
1464 | " - sources:\n",
1465 | " - model: OpenPipe/mistral-ft-optimized-1218\n",
1466 | " layer_range: [0, 32]\n",
1467 | " - sources:\n",
1468 | " - model: mlabonne/NeuralHermes-2.5-Mistral-7B\n",
1469 | " layer_range: [24, 32]\n",
1470 | "merge_method: passthrough\n",
1471 | "dtype: bfloat16\n",
1472 | "```\n",
1473 | "\n",
1474 | "You can find the final model on the Hugging Face Hub at [mlabonne/NeuralPipe-9B-merged](https://huggingface.co/mlabonne/NeuralPipe-9B-merged)."
1475 | ],
1476 | "metadata": {
1477 | "id": "o12O0YjJvvLW"
1478 | }
1479 | },
1480 | {
1481 | "cell_type": "code",
1482 | "execution_count": null,
1483 | "metadata": {
1484 | "id": "NPNPie5Eo3EZ"
1485 | },
1486 | "outputs": [],
1487 | "source": [
1488 | "!git clone https://github.com/cg123/mergekit.git\n",
1489 | "!cd mergekit && pip install -q -e ."
1490 | ]
1491 | },
1492 | {
1493 | "cell_type": "code",
1494 | "source": [
1495 | "import yaml\n",
1496 | "\n",
1497 | "MODEL_NAME = \"Marcoro14-7B-slerp\"\n",
1498 | "yaml_config = \"\"\"\n",
1499 | "slices:\n",
1500 | " - sources:\n",
1501 | " - model: AIDC-ai-business/Marcoroni-7B-v3\n",
1502 | " layer_range: [0, 32]\n",
1503 | " - model: EmbeddedLLM/Mistral-7B-Merge-14-v0.1\n",
1504 | " layer_range: [0, 32]\n",
1505 | "merge_method: slerp\n",
1506 | "base_model: AIDC-ai-business/Marcoroni-7B-v3\n",
1507 | "parameters:\n",
1508 | " t:\n",
1509 | " - filter: self_attn\n",
1510 | " value: [0, 0.5, 0.3, 0.7, 1]\n",
1511 | " - filter: mlp\n",
1512 | " value: [1, 0.5, 0.7, 0.3, 0]\n",
1513 | " - value: 0.5\n",
1514 | "dtype: bfloat16\n",
1515 | "\n",
1516 | "\"\"\"\n",
1517 | "\n",
1518 | "# Save config as yaml file\n",
1519 | "with open('config.yaml', 'w', encoding=\"utf-8\") as f:\n",
1520 | " f.write(yaml_config)"
1521 | ],
1522 | "metadata": {
1523 | "id": "LGd7jlfCpNcg"
1524 | },
1525 | "execution_count": null,
1526 | "outputs": []
1527 | },
1528 | {
1529 | "cell_type": "code",
1530 | "source": [
1531 | "# Merge models\n",
1532 | "!mergekit-yaml config.yaml merge --copy-tokenizer --allow-crimes --out-shard-size 1B --lazy-unpickle"
1533 | ],
1534 | "metadata": {
1535 | "id": "d5mYzDo1q96y"
1536 | },
1537 | "execution_count": null,
1538 | "outputs": []
1539 | },
1540 | {
1541 | "cell_type": "code",
1542 | "source": [
1543 | "!pip install -qU huggingface_hub\n",
1544 | "\n",
1545 | "from huggingface_hub import ModelCard, ModelCardData\n",
1546 | "from jinja2 import Template\n",
1547 | "\n",
1548 | "username = \"mlabonne\"\n",
1549 | "\n",
1550 | "template_text = \"\"\"\n",
1551 | "---\n",
1552 | "license: apache-2.0\n",
1553 | "tags:\n",
1554 | "- merge\n",
1555 | "- mergekit\n",
1556 | "- lazymergekit\n",
1557 | "{%- for model in models %}\n",
1558 | "- {{ model }}\n",
1559 | "{%- endfor %}\n",
1560 | "---\n",
1561 | "\n",
1562 | "# {{ model_name }}\n",
1563 | "\n",
1564 | "{{ model_name }} is a merge of the following models using [mergekit](https://github.com/cg123/mergekit):\n",
1565 | "\n",
1566 | "{%- for model in models %}\n",
1567 | "* [{{ model }}](https://huggingface.co/{{ model }})\n",
1568 | "{%- endfor %}\n",
1569 | "\n",
1570 | "## 🧩 Configuration\n",
1571 | "\n",
1572 | "```yaml\n",
1573 | "{{- yaml_config -}}\n",
1574 | "```\n",
1575 | "\"\"\"\n",
1576 | "\n",
1577 | "# Create a Jinja template object\n",
1578 | "jinja_template = Template(template_text.strip())\n",
1579 | "\n",
1580 | "# Get list of models from config\n",
1581 | "data = yaml.safe_load(yaml_config)\n",
1582 | "if \"models\" in data:\n",
1583 | " models = [data[\"models\"][i][\"model\"] for i in range(len(data[\"models\"])) if \"parameters\" in data[\"models\"][i]]\n",
1584 | "elif \"parameters\" in data:\n",
1585 | " models = [data[\"slices\"][0][\"sources\"][i][\"model\"] for i in range(len(data[\"slices\"][0][\"sources\"]))]\n",
1586 | "elif \"slices\" in data:\n",
1587 | " models = [data[\"slices\"][i][\"sources\"][0][\"model\"] for i in range(len(data[\"slices\"]))]\n",
1588 | "else:\n",
1589 | " raise Exception(\"No models or slices found in yaml config\")\n",
1590 | "\n",
1591 | "# Fill the template\n",
1592 | "content = jinja_template.render(\n",
1593 | " model_name=MODEL_NAME,\n",
1594 | " models=models,\n",
1595 | " yaml_config=yaml_config,\n",
1596 | " username=username,\n",
1597 | ")\n",
1598 | "\n",
1599 | "# Save the model card\n",
1600 | "card = ModelCard(content)\n",
1601 | "card.save('merge/README.md')"
1602 | ],
1603 | "metadata": {
1604 | "colab": {
1605 | "base_uri": "https://localhost:8080/"
1606 | },
1607 | "id": "w-RNKev373lI",
1608 | "outputId": "fccbbd1d-295f-4def-a398-f226813294bb"
1609 | },
1610 | "execution_count": null,
1611 | "outputs": [
1612 | {
1613 | "output_type": "stream",
1614 | "name": "stdout",
1615 | "text": [
1616 | "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/330.1 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.6/330.1 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m330.1/330.1 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1617 | "\u001b[?25h"
1618 | ]
1619 | }
1620 | ]
1621 | },
1622 | {
1623 | "cell_type": "code",
1624 | "source": [
1625 | "from google.colab import userdata\n",
1626 | "from huggingface_hub import HfApi\n",
1627 | "\n",
1628 | "username = \"mlabonne\"\n",
1629 | "\n",
1630 | "# Defined in the secrets tab in Google Colab\n",
1631 | "api = HfApi(token=userdata.get(\"HF_TOKEN\"))\n",
1632 | "\n",
1633 | "api.create_repo(\n",
1634 | " repo_id=f\"{username}/{MODEL_NAME}\",\n",
1635 | " repo_type=\"model\"\n",
1636 | ")\n",
1637 | "api.upload_folder(\n",
1638 | " repo_id=f\"{username}/{MODEL_NAME}\",\n",
1639 | " folder_path=\"merge\",\n",
1640 | ")"
1641 | ],
1642 | "metadata": {
1643 | "colab": {
1644 | "base_uri": "https://localhost:8080/",
1645 | "height": 164,
1646 | "referenced_widgets": [
1647 | "de24d272f2b842c5b01eedb3f536b810",
1648 | "0c5dab2657b2473385a424d90f3d4664",
1649 | "57efe36e546c473d8be34102f6ba9a58",
1650 | "871bad1d905d4877a9eaa242cfd54c4e",
1651 | "8951f6b2edf64464869391197c900f84",
1652 | "69a61ad28d5141dcbaea44060bc5ebf7",
1653 | "76c2fbf005ae4a5790edfeb499b387b7",
1654 | "116964f328dc45d991d895d684ac1216",
1655 | "1ecec5ba4424498082a5f64cf3d7faf8",
1656 | "fc4edcef273b4e75894f4b512122de94",
1657 | "ca2323b142f54998985d30481d5cfabe",
1658 | "63626ac2d0f546188c07512a04c71417",
1659 | "decd91747fd04ce39f3e2b733bc7f477",
1660 | "7140e4c154424fcab846a71889e99ed2",
1661 | "2264d8b75251425e94e635558af4e223",
1662 | "c37478198217457cb30c6649203cf4dc",
1663 | "4918769e4e984dfda924776e2373154c",
1664 | "9b48494c94cf49b5835489d97f7a24c5",
1665 | "6ed844da52fe466eb1c10c814489448c",
1666 | "9c60efa02e80423e828628190dd13bc3",
1667 | "0170e8cc57d94041956f7afbf2eef449",
1668 | "220c2ba5f2524271b24fe049431a474c",
1669 | "a6f99dd0662846f9a381d2d507a7b447",
1670 | "900b9fcb70a84781bd5b4213df54626d",
1671 | "0ea83f270e164795b64f23b143efb300",
1672 | "318dcdeac8fb40f88fa60114f1c6a7c1",
1673 | "af89cf715e0e4c5e9f59943a255394c1",
1674 | "40e23e35299d45d499432b8f1a9bc924",
1675 | "126b374e286747768ef7218454534640",
1676 | "bdd26e54eed5477f99b135552e5f3450",
1677 | "163a6fd878134e1eb5f193d1ebfff1c1",
1678 | "953d7c014f76413c9805a2ef8c2c9356",
1679 | "348879bf76d1471f9c79c1ec2dc07c1d",
1680 | "8d54ae0d028b40e7b018454187db1a1c",
1681 | "562353040be54593b23734390f49927c",
1682 | "00cbebe6df7d4995913f20e39fc71b15",
1683 | "aee3c563fdc54f9cb3ebc2630c84a9e6",
1684 | "b74e307a751844ffab9f7f3df367774b",
1685 | "8e6142e41f714fe9abe6a5bb72c071f9",
1686 | "49cd1c5663404fb5a307c345e7e970c3",
1687 | "920ef8e509d24ccda930f4c47eff158c",
1688 | "c8828d61b26a47ac97a1541e14c00f62",
1689 | "a3d7e352222647a99be79935b1ebd86a",
1690 | "80666ef5f07641c482a23618a767791d"
1691 | ]
1692 | },
1693 | "id": "ik0V0dF55gfU",
1694 | "outputId": "9f6c605c-6b51-473d-c1fa-b103e9208785"
1695 | },
1696 | "execution_count": null,
1697 | "outputs": [
1698 | {
1699 | "output_type": "display_data",
1700 | "data": {
1701 | "text/plain": [
1702 | "tokenizer.model: 0%| | 0.00/493k [00:00, ?B/s]"
1703 | ],
1704 | "application/vnd.jupyter.widget-view+json": {
1705 | "version_major": 2,
1706 | "version_minor": 0,
1707 | "model_id": "de24d272f2b842c5b01eedb3f536b810"
1708 | }
1709 | },
1710 | "metadata": {}
1711 | },
1712 | {
1713 | "output_type": "display_data",
1714 | "data": {
1715 | "text/plain": [
1716 | "model-00001-of-00002.safetensors: 0%| | 0.00/9.94G [00:00, ?B/s]"
1717 | ],
1718 | "application/vnd.jupyter.widget-view+json": {
1719 | "version_major": 2,
1720 | "version_minor": 0,
1721 | "model_id": "63626ac2d0f546188c07512a04c71417"
1722 | }
1723 | },
1724 | "metadata": {}
1725 | },
1726 | {
1727 | "output_type": "display_data",
1728 | "data": {
1729 | "text/plain": [
1730 | "Upload 3 LFS files: 0%| | 0/3 [00:00, ?it/s]"
1731 | ],
1732 | "application/vnd.jupyter.widget-view+json": {
1733 | "version_major": 2,
1734 | "version_minor": 0,
1735 | "model_id": "a6f99dd0662846f9a381d2d507a7b447"
1736 | }
1737 | },
1738 | "metadata": {}
1739 | },
1740 | {
1741 | "output_type": "display_data",
1742 | "data": {
1743 | "text/plain": [
1744 | "model-00002-of-00002.safetensors: 0%| | 0.00/8.03G [00:00, ?B/s]"
1745 | ],
1746 | "application/vnd.jupyter.widget-view+json": {
1747 | "version_major": 2,
1748 | "version_minor": 0,
1749 | "model_id": "8d54ae0d028b40e7b018454187db1a1c"
1750 | }
1751 | },
1752 | "metadata": {}
1753 | },
1754 | {
1755 | "output_type": "execute_result",
1756 | "data": {
1757 | "text/plain": [
1758 | "'https://huggingface.co/mlabonne/NeuralPipe-9B-merged/tree/main/'"
1759 | ],
1760 | "application/vnd.google.colaboratory.intrinsic+json": {
1761 | "type": "string"
1762 | }
1763 | },
1764 | "metadata": {},
1765 | "execution_count": 5
1766 | }
1767 | ]
1768 | }
1769 | ]
1770 | }
--------------------------------------------------------------------------------