├── 4_bit_LLM_Quantization_with_GPTQ.ipynb ├── Decoding_Strategies_in_Large_Language Models.ipynb ├── Fine_tune_Llama_2_in_Google_Colab.ipynb ├── Fine_tune_a_Mistral_7b_model_with_DPO.ipynb ├── Improve_ChatGPT_with_Knowledge_Graphs.ipynb ├── Introduction_to_Weight_Quantization.ipynb ├── LICENSE ├── Mergekit.ipynb ├── Quantize_Llama_2_models_using_GGUF_and_llama_cpp.ipynb ├── Quantize_models_with_ExLlamaV2.ipynb ├── README.md ├── Visualizing_GPT_2's_Loss_Landscape.ipynb ├── images ├── colab.svg ├── roadmap_fundamentals.png └── roadmap_scientist.png └── nanoLoRA.ipynb /4_bit_LLM_Quantization_with_GPTQ.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4", 8 | "authorship_tag": "ABX9TyOS2QEuJ1BDI/3IFsLsFIZo", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | }, 18 | "accelerator": "GPU" 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "view-in-github", 25 | "colab_type": "text" 26 | }, 27 | "source": [ 28 | "\"Open" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "source": [ 34 | "# 4-bit LLM Quantization with GPTQ\n", 35 | "> 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)\n", 36 | "\n", 37 | "❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).\n", 38 | "\n", 39 | "Companion notebook to execute the code from the following article: https://mlabonne.github.io/blog/4bit_quantization/" 40 | ], 41 | "metadata": { 42 | "id": "yezrHxYvg_wR" 43 | } 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "id": "BhufqqQAaz6e" 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "!BUILD_CUDA_EXT=0 pip install -q auto-gptq transformers" 54 | ] 55 | }, 56 | { 57 | "cell_type": 
# %% Setup: imports, base model and output directory
import random

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset
import torch
from transformers import AutoTokenizer


# Define base model and output directory
model_id = "gpt2"
out_dir = model_id + "-GPTQ"

# %% Load quantize config, model and tokenizer
# 4-bit quantization with a group size of 128; desc_act is left disabled.
quantize_config = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    damp_percent=0.01,
    desc_act=False,
)
model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# %% Load data and tokenize examples
n_samples = 1024
seed = 0  # fixed so that Restart & Run All draws the same calibration windows

# Only the first n_samples*5 documents of a single C4 shard are loaded.
data = load_dataset("allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split=f"train[:{n_samples*5}]")

# All documents are joined and tokenized as ONE long sequence. The tokenizer
# warns that it exceeds the model maximum; that is expected here because the
# sequence is only sliced into windows below, never fed to the model whole.
tokenized_data = tokenizer("\n\n".join(data['text']), return_tensors='pt')

# Format tokenized examples
window = tokenizer.model_max_length            # length of every calibration example
n_tokens = tokenized_data.input_ids.shape[1]   # total tokens in the joined corpus
assert n_tokens > window, "calibration corpus is shorter than one model context window"

# Seed inside this cell (not in the setup cell) so re-running the cell alone
# is idempotent as well.
random.seed(seed)

examples_ids = []
for _ in range(n_samples):
    # Random start offset; randint is inclusive on both ends, so the window
    # [i, i + window) always stays inside the tokenized corpus.
    i = random.randint(0, n_tokens - window - 1)
    j = i + window
    input_ids = tokenized_data.input_ids[:, i:j]
    attention_mask = torch.ones_like(input_ids)  # no padding: every position is a real token
    examples_ids.append({'input_ids': input_ids, 'attention_mask': attention_mask})
# %%
# %%time

# Run GPTQ over the calibration examples, one example per batch
model.quantize(examples_ids, batch_size=1, use_triton=True)

# Write the quantized weights (as safetensors) and the tokenizer files to out_dir.
# The tokenizer call stays last so the cell displays the list of saved files.
model.save_quantized(out_dir, use_safetensors=True)
tokenizer.save_pretrained(out_dir)

# %%
# Use the first CUDA device when one is available, otherwise fall back to CPU
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Reload the quantized model and its tokenizer from out_dir, replacing the
# in-memory objects that were just quantized
model = AutoGPTQForCausalLM.from_quantized(out_dir, device=device, use_triton=True, use_safetensors=True)
tokenizer = AutoTokenizer.from_pretrained(out_dir)

# %%
from transformers import pipeline

# Sample a short continuation to check that the reloaded model still generates text
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
outputs = generator("I have a dream", do_sample=True, max_length=50)
result = outputs[0]['generated_text']
print(result)
Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RobertaPreLayerNormForCausalLM', 'RoCBertForCausalLM', 'RoFormerForCausalLM', 'RwkvForCausalLM', 'Speech2Text2ForCausalLM', 'TransfoXLLMHeadModel', 'TrOCRForCausalLM', 'XGLMForCausalLM', 'XLMWithLMHeadModel', 'XLMProphetNetForCausalLM', 'XLMRobertaForCausalLM', 'XLMRobertaXLForCausalLM', 'XLNetLMHeadModel', 'XmodForCausalLM'].\n", 239 | "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" 240 | ] 241 | }, 242 | { 243 | "output_type": "stream", 244 | "name": "stdout", 245 | "text": [ 246 | "I have a dream,\" she told CNN last week. \"I have this dream of helping my mother find her own. 
But, to tell that for the first time, now that I'm seeing my mother now, just knowing how wonderful it is that\n" 247 | ] 248 | } 249 | ] 250 | } 251 | ] 252 | } -------------------------------------------------------------------------------- /Fine_tune_a_Mistral_7b_model_with_DPO.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "machine_shape": "hm", 8 | "gpuType": "A100", 9 | "authorship_tag": "ABX9TyOJJCuqxZQnS1q+Fvz5+URG", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | }, 19 | "widgets": { 20 | "application/vnd.jupyter.widget-state+json": { 21 | "22773c721a7c4221a9c14cd388461d4c": { 22 | "model_module": "@jupyter-widgets/controls", 23 | "model_name": "HBoxModel", 24 | "model_module_version": "1.5.0", 25 | "state": { 26 | "_dom_classes": [], 27 | "_model_module": "@jupyter-widgets/controls", 28 | "_model_module_version": "1.5.0", 29 | "_model_name": "HBoxModel", 30 | "_view_count": null, 31 | "_view_module": "@jupyter-widgets/controls", 32 | "_view_module_version": "1.5.0", 33 | "_view_name": "HBoxView", 34 | "box_style": "", 35 | "children": [ 36 | "IPY_MODEL_6b54841f5de1482694c360095dae3039", 37 | "IPY_MODEL_448ccbc85e624ec3b3e71931a7ee4ff6", 38 | "IPY_MODEL_173769f6f465485f8848a11bf269850b" 39 | ], 40 | "layout": "IPY_MODEL_60978b9b4e8348f0a71ce3e35c73bcff" 41 | } 42 | }, 43 | "6b54841f5de1482694c360095dae3039": { 44 | "model_module": "@jupyter-widgets/controls", 45 | "model_name": "HTMLModel", 46 | "model_module_version": "1.5.0", 47 | "state": { 48 | "_dom_classes": [], 49 | "_model_module": "@jupyter-widgets/controls", 50 | "_model_module_version": "1.5.0", 51 | "_model_name": "HTMLModel", 52 | "_view_count": null, 53 | "_view_module": "@jupyter-widgets/controls", 54 | 
"_view_module_version": "1.5.0", 55 | "_view_name": "HTMLView", 56 | "description": "", 57 | "description_tooltip": null, 58 | "layout": "IPY_MODEL_6a38dcbaf4674b448329ac0a16587d2a", 59 | "placeholder": "​", 60 | "style": "IPY_MODEL_7eaeada2158e493189449af91f643553", 61 | "value": "Loading checkpoint shards: 100%" 62 | } 63 | }, 64 | "448ccbc85e624ec3b3e71931a7ee4ff6": { 65 | "model_module": "@jupyter-widgets/controls", 66 | "model_name": "FloatProgressModel", 67 | "model_module_version": "1.5.0", 68 | "state": { 69 | "_dom_classes": [], 70 | "_model_module": "@jupyter-widgets/controls", 71 | "_model_module_version": "1.5.0", 72 | "_model_name": "FloatProgressModel", 73 | "_view_count": null, 74 | "_view_module": "@jupyter-widgets/controls", 75 | "_view_module_version": "1.5.0", 76 | "_view_name": "ProgressView", 77 | "bar_style": "success", 78 | "description": "", 79 | "description_tooltip": null, 80 | "layout": "IPY_MODEL_6e32854952b340008edca0139d3471d6", 81 | "max": 3, 82 | "min": 0, 83 | "orientation": "horizontal", 84 | "style": "IPY_MODEL_db6d7cfcdade4b4baa213a5d0abc07d7", 85 | "value": 3 86 | } 87 | }, 88 | "173769f6f465485f8848a11bf269850b": { 89 | "model_module": "@jupyter-widgets/controls", 90 | "model_name": "HTMLModel", 91 | "model_module_version": "1.5.0", 92 | "state": { 93 | "_dom_classes": [], 94 | "_model_module": "@jupyter-widgets/controls", 95 | "_model_module_version": "1.5.0", 96 | "_model_name": "HTMLModel", 97 | "_view_count": null, 98 | "_view_module": "@jupyter-widgets/controls", 99 | "_view_module_version": "1.5.0", 100 | "_view_name": "HTMLView", 101 | "description": "", 102 | "description_tooltip": null, 103 | "layout": "IPY_MODEL_9083029642744c43b7705532cbe0cf79", 104 | "placeholder": "​", 105 | "style": "IPY_MODEL_d028a98caa13425b907ceb513119006e", 106 | "value": " 3/3 [00:11<00:00, 2.89s/it]" 107 | } 108 | }, 109 | "60978b9b4e8348f0a71ce3e35c73bcff": { 110 | "model_module": "@jupyter-widgets/base", 111 | "model_name": "LayoutModel", 
112 | "model_module_version": "1.2.0", 113 | "state": { 114 | "_model_module": "@jupyter-widgets/base", 115 | "_model_module_version": "1.2.0", 116 | "_model_name": "LayoutModel", 117 | "_view_count": null, 118 | "_view_module": "@jupyter-widgets/base", 119 | "_view_module_version": "1.2.0", 120 | "_view_name": "LayoutView", 121 | "align_content": null, 122 | "align_items": null, 123 | "align_self": null, 124 | "border": null, 125 | "bottom": null, 126 | "display": null, 127 | "flex": null, 128 | "flex_flow": null, 129 | "grid_area": null, 130 | "grid_auto_columns": null, 131 | "grid_auto_flow": null, 132 | "grid_auto_rows": null, 133 | "grid_column": null, 134 | "grid_gap": null, 135 | "grid_row": null, 136 | "grid_template_areas": null, 137 | "grid_template_columns": null, 138 | "grid_template_rows": null, 139 | "height": null, 140 | "justify_content": null, 141 | "justify_items": null, 142 | "left": null, 143 | "margin": null, 144 | "max_height": null, 145 | "max_width": null, 146 | "min_height": null, 147 | "min_width": null, 148 | "object_fit": null, 149 | "object_position": null, 150 | "order": null, 151 | "overflow": null, 152 | "overflow_x": null, 153 | "overflow_y": null, 154 | "padding": null, 155 | "right": null, 156 | "top": null, 157 | "visibility": null, 158 | "width": null 159 | } 160 | }, 161 | "6a38dcbaf4674b448329ac0a16587d2a": { 162 | "model_module": "@jupyter-widgets/base", 163 | "model_name": "LayoutModel", 164 | "model_module_version": "1.2.0", 165 | "state": { 166 | "_model_module": "@jupyter-widgets/base", 167 | "_model_module_version": "1.2.0", 168 | "_model_name": "LayoutModel", 169 | "_view_count": null, 170 | "_view_module": "@jupyter-widgets/base", 171 | "_view_module_version": "1.2.0", 172 | "_view_name": "LayoutView", 173 | "align_content": null, 174 | "align_items": null, 175 | "align_self": null, 176 | "border": null, 177 | "bottom": null, 178 | "display": null, 179 | "flex": null, 180 | "flex_flow": null, 181 | "grid_area": null, 
182 | "grid_auto_columns": null, 183 | "grid_auto_flow": null, 184 | "grid_auto_rows": null, 185 | "grid_column": null, 186 | "grid_gap": null, 187 | "grid_row": null, 188 | "grid_template_areas": null, 189 | "grid_template_columns": null, 190 | "grid_template_rows": null, 191 | "height": null, 192 | "justify_content": null, 193 | "justify_items": null, 194 | "left": null, 195 | "margin": null, 196 | "max_height": null, 197 | "max_width": null, 198 | "min_height": null, 199 | "min_width": null, 200 | "object_fit": null, 201 | "object_position": null, 202 | "order": null, 203 | "overflow": null, 204 | "overflow_x": null, 205 | "overflow_y": null, 206 | "padding": null, 207 | "right": null, 208 | "top": null, 209 | "visibility": null, 210 | "width": null 211 | } 212 | }, 213 | "7eaeada2158e493189449af91f643553": { 214 | "model_module": "@jupyter-widgets/controls", 215 | "model_name": "DescriptionStyleModel", 216 | "model_module_version": "1.5.0", 217 | "state": { 218 | "_model_module": "@jupyter-widgets/controls", 219 | "_model_module_version": "1.5.0", 220 | "_model_name": "DescriptionStyleModel", 221 | "_view_count": null, 222 | "_view_module": "@jupyter-widgets/base", 223 | "_view_module_version": "1.2.0", 224 | "_view_name": "StyleView", 225 | "description_width": "" 226 | } 227 | }, 228 | "6e32854952b340008edca0139d3471d6": { 229 | "model_module": "@jupyter-widgets/base", 230 | "model_name": "LayoutModel", 231 | "model_module_version": "1.2.0", 232 | "state": { 233 | "_model_module": "@jupyter-widgets/base", 234 | "_model_module_version": "1.2.0", 235 | "_model_name": "LayoutModel", 236 | "_view_count": null, 237 | "_view_module": "@jupyter-widgets/base", 238 | "_view_module_version": "1.2.0", 239 | "_view_name": "LayoutView", 240 | "align_content": null, 241 | "align_items": null, 242 | "align_self": null, 243 | "border": null, 244 | "bottom": null, 245 | "display": null, 246 | "flex": null, 247 | "flex_flow": null, 248 | "grid_area": null, 249 | 
"grid_auto_columns": null, 250 | "grid_auto_flow": null, 251 | "grid_auto_rows": null, 252 | "grid_column": null, 253 | "grid_gap": null, 254 | "grid_row": null, 255 | "grid_template_areas": null, 256 | "grid_template_columns": null, 257 | "grid_template_rows": null, 258 | "height": null, 259 | "justify_content": null, 260 | "justify_items": null, 261 | "left": null, 262 | "margin": null, 263 | "max_height": null, 264 | "max_width": null, 265 | "min_height": null, 266 | "min_width": null, 267 | "object_fit": null, 268 | "object_position": null, 269 | "order": null, 270 | "overflow": null, 271 | "overflow_x": null, 272 | "overflow_y": null, 273 | "padding": null, 274 | "right": null, 275 | "top": null, 276 | "visibility": null, 277 | "width": null 278 | } 279 | }, 280 | "db6d7cfcdade4b4baa213a5d0abc07d7": { 281 | "model_module": "@jupyter-widgets/controls", 282 | "model_name": "ProgressStyleModel", 283 | "model_module_version": "1.5.0", 284 | "state": { 285 | "_model_module": "@jupyter-widgets/controls", 286 | "_model_module_version": "1.5.0", 287 | "_model_name": "ProgressStyleModel", 288 | "_view_count": null, 289 | "_view_module": "@jupyter-widgets/base", 290 | "_view_module_version": "1.2.0", 291 | "_view_name": "StyleView", 292 | "bar_color": null, 293 | "description_width": "" 294 | } 295 | }, 296 | "9083029642744c43b7705532cbe0cf79": { 297 | "model_module": "@jupyter-widgets/base", 298 | "model_name": "LayoutModel", 299 | "model_module_version": "1.2.0", 300 | "state": { 301 | "_model_module": "@jupyter-widgets/base", 302 | "_model_module_version": "1.2.0", 303 | "_model_name": "LayoutModel", 304 | "_view_count": null, 305 | "_view_module": "@jupyter-widgets/base", 306 | "_view_module_version": "1.2.0", 307 | "_view_name": "LayoutView", 308 | "align_content": null, 309 | "align_items": null, 310 | "align_self": null, 311 | "border": null, 312 | "bottom": null, 313 | "display": null, 314 | "flex": null, 315 | "flex_flow": null, 316 | "grid_area": null, 317 | 
"grid_auto_columns": null, 318 | "grid_auto_flow": null, 319 | "grid_auto_rows": null, 320 | "grid_column": null, 321 | "grid_gap": null, 322 | "grid_row": null, 323 | "grid_template_areas": null, 324 | "grid_template_columns": null, 325 | "grid_template_rows": null, 326 | "height": null, 327 | "justify_content": null, 328 | "justify_items": null, 329 | "left": null, 330 | "margin": null, 331 | "max_height": null, 332 | "max_width": null, 333 | "min_height": null, 334 | "min_width": null, 335 | "object_fit": null, 336 | "object_position": null, 337 | "order": null, 338 | "overflow": null, 339 | "overflow_x": null, 340 | "overflow_y": null, 341 | "padding": null, 342 | "right": null, 343 | "top": null, 344 | "visibility": null, 345 | "width": null 346 | } 347 | }, 348 | "d028a98caa13425b907ceb513119006e": { 349 | "model_module": "@jupyter-widgets/controls", 350 | "model_name": "DescriptionStyleModel", 351 | "model_module_version": "1.5.0", 352 | "state": { 353 | "_model_module": "@jupyter-widgets/controls", 354 | "_model_module_version": "1.5.0", 355 | "_model_name": "DescriptionStyleModel", 356 | "_view_count": null, 357 | "_view_module": "@jupyter-widgets/base", 358 | "_view_module_version": "1.2.0", 359 | "_view_name": "StyleView", 360 | "description_width": "" 361 | } 362 | } 363 | } 364 | }, 365 | "accelerator": "GPU" 366 | }, 367 | "cells": [ 368 | { 369 | "cell_type": "markdown", 370 | "metadata": { 371 | "id": "view-in-github", 372 | "colab_type": "text" 373 | }, 374 | "source": [ 375 | "\"Open" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "source": [ 381 | "# Fine-tune a Mistral-7b model with DPO\n", 382 | "\n", 383 | "❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne)." 
# %%
# !pip install -q datasets trl peft bitsandbytes sentencepiece wandb

# %%
# Standard library
import gc
import os

# Third-party
import bitsandbytes as bnb
import torch
import transformers
import wandb
from datasets import load_dataset
from google.colab import userdata
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import DPOTrainer

# Both tokens are read from the secrets tab in Google Colab, so no credential
# is written into the notebook itself
hf_token = userdata.get('huggingface')
wb_token = userdata.get('wandb')
wandb.login(key=wb_token)

# Base checkpoint to fine-tune, and the name the result is saved and pushed under
model_name = "teknium/OpenHermes-2.5-Mistral-7B"
new_model = "NeuralHermes-2.5-Mistral-7B"
# %% [markdown]
# ## Format dataset

# %%
def chatml_format(example):
    """Convert one preference row into a prompt/chosen/rejected record.

    Reads the 'system', 'question', 'chatgpt' and 'llama2-13b-chat' fields of
    ``example`` and renders the prompt with the chat template of the
    notebook-level ``tokenizer``, which must exist before this is called.

    Returns:
        dict with the keys 'prompt', 'chosen' and 'rejected'.
    """
    # The system turn is rendered only when the row has a non-empty system message
    system = ""
    if len(example['system']) > 0:
        system_turn = {"role": "system", "content": example['system']}
        system = tokenizer.apply_chat_template([system_turn], tokenize=False)

    # The user turn is rendered with the generation prompt appended
    user_turn = {"role": "user", "content": example['question']}
    prompt = tokenizer.apply_chat_template([user_turn], tokenize=False, add_generation_prompt=True)

    # Both answers are closed with the same end-of-turn marker
    end_of_turn = "<|im_end|>\n"
    return {
        "prompt": system + prompt,
        "chosen": example['chatgpt'] + end_of_turn,
        "rejected": example['llama2-13b-chat'] + end_of_turn,
    }

# Load the train split and remember its columns so they can be dropped after mapping
dataset = load_dataset("Intel/orca_dpo_pairs")['train']
original_columns = dataset.column_names

# Tokenizer of the base model: pad with the EOS token, on the left
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Replace the original columns with prompt / chosen / rejected
dataset = dataset.map(chatml_format, remove_columns=original_columns)

# Show one formatted sample
dataset[1]
Here\\'s a sentence that describes all the data you provided:\\n\\n\"Midsummer House is a moderately priced Chinese restaurant with a customer rating of 3 out of 5, located near All Bar One, offering a variety of delicious dishes.\"<|im_end|>\\n'}" 527 | ] 528 | }, 529 | "metadata": {}, 530 | "execution_count": 3 531 | } 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "source": [ 537 | "## Train model with DPO" 538 | ], 539 | "metadata": { 540 | "id": "DeT5eUK_UJgK" 541 | } 542 | }, 543 | { 544 | "cell_type": "code", 545 | "source": [ 546 | "# LoRA configuration\n", 547 | "peft_config = LoraConfig(\n", 548 | " r=16,\n", 549 | " lora_alpha=16,\n", 550 | " lora_dropout=0.05,\n", 551 | " bias=\"none\",\n", 552 | " task_type=\"CAUSAL_LM\",\n", 553 | " target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']\n", 554 | ")\n", 555 | "\n", 556 | "# Model to fine-tune\n", 557 | "model = AutoModelForCausalLM.from_pretrained(\n", 558 | " model_name,\n", 559 | " torch_dtype=torch.float16,\n", 560 | " load_in_4bit=True\n", 561 | ")\n", 562 | "model.config.use_cache = False\n", 563 | "\n", 564 | "# Reference model\n", 565 | "ref_model = AutoModelForCausalLM.from_pretrained(\n", 566 | " model_name,\n", 567 | " torch_dtype=torch.float16,\n", 568 | " load_in_4bit=True\n", 569 | ")\n", 570 | "\n", 571 | "# Training arguments\n", 572 | "training_args = TrainingArguments(\n", 573 | " per_device_train_batch_size=4,\n", 574 | " gradient_accumulation_steps=4,\n", 575 | " gradient_checkpointing=True,\n", 576 | " learning_rate=5e-5,\n", 577 | " lr_scheduler_type=\"cosine\",\n", 578 | " max_steps=200,\n", 579 | " save_strategy=\"no\",\n", 580 | " logging_steps=1,\n", 581 | " output_dir=new_model,\n", 582 | " optim=\"paged_adamw_32bit\",\n", 583 | " warmup_steps=100,\n", 584 | " bf16=True,\n", 585 | " report_to=\"wandb\",\n", 586 | ")\n", 587 | "\n", 588 | "# Create DPO trainer\n", 589 | "dpo_trainer = DPOTrainer(\n", 590 | " model,\n", 591 | 
" ref_model,\n", 592 | " args=training_args,\n", 593 | " train_dataset=dataset,\n", 594 | " tokenizer=tokenizer,\n", 595 | " peft_config=peft_config,\n", 596 | " beta=0.1,\n", 597 | " max_prompt_length=1024,\n", 598 | " max_length=1536,\n", 599 | ")\n", 600 | "\n", 601 | "# Fine-tune model with DPO\n", 602 | "dpo_trainer.train()" 603 | ], 604 | "metadata": { 605 | "id": "rKPILNOLR-aK" 606 | }, 607 | "execution_count": null, 608 | "outputs": [] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "source": [ 613 | "## Upload model" 614 | ], 615 | "metadata": { 616 | "id": "3LdhPpcrUM3H" 617 | } 618 | }, 619 | { 620 | "cell_type": "code", 621 | "source": [ 622 | "# Save artifacts\n", 623 | "dpo_trainer.model.save_pretrained(\"final_checkpoint\")\n", 624 | "tokenizer.save_pretrained(\"final_checkpoint\")\n", 625 | "\n", 626 | "# Flush memory\n", 627 | "del dpo_trainer, model, ref_model\n", 628 | "gc.collect()\n", 629 | "torch.cuda.empty_cache()\n", 630 | "\n", 631 | "# Reload model in FP16 (instead of NF4)\n", 632 | "base_model = AutoModelForCausalLM.from_pretrained(\n", 633 | " model_name,\n", 634 | " return_dict=True,\n", 635 | " torch_dtype=torch.float16,\n", 636 | ")\n", 637 | "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", 638 | "\n", 639 | "# Merge base model with the adapter\n", 640 | "model = PeftModel.from_pretrained(base_model, \"final_checkpoint\")\n", 641 | "model = model.merge_and_unload()\n", 642 | "\n", 643 | "# Save model and tokenizer\n", 644 | "model.save_pretrained(new_model)\n", 645 | "tokenizer.save_pretrained(new_model)\n", 646 | "\n", 647 | "# Push them to the HF Hub\n", 648 | "model.push_to_hub(new_model, use_temp_dir=False, token=hf_token)\n", 649 | "tokenizer.push_to_hub(new_model, use_temp_dir=False, token=hf_token)" 650 | ], 651 | "metadata": { 652 | "id": "h7cIvxcTfBC4" 653 | }, 654 | "execution_count": null, 655 | "outputs": [] 656 | }, 657 | { 658 | "cell_type": "markdown", 659 | "source": [ 660 | "## Inference" 661 | ], 662 
# %%
# Build a two-turn conversation and render it with the chat template of the
# fine-tuned model's tokenizer, leaving the assistant turn open
message = [
    {"role": "system", "content": "You are a helpful assistant chatbot."},
    {"role": "user", "content": "What is a Large Language Model?"}
]
tokenizer = AutoTokenizer.from_pretrained(new_model)
prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)

# Text-generation pipeline over the saved model
pipeline = transformers.pipeline("text-generation", model=new_model, tokenizer=tokenizer)

# Sampling settings for a single completion; max_length is passed through as-is
generation_kwargs = {
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "num_return_sequences": 1,
    "max_length": 200,
}
sequences = pipeline(prompt, **generation_kwargs)
print(sequences[0]['generated_text'])
{ 726 | "output_type": "display_data", 727 | "data": { 728 | "text/plain": [ 729 | "Loading checkpoint shards: 0%| | 0/3 [00:00system\n", 753 | "You are a helpful assistant chatbot.<|im_end|>\n", 754 | "<|im_start|>user\n", 755 | "What is a Large Language Model?<|im_end|>\n", 756 | "<|im_start|>assistant\n", 757 | "A large language model is a type of artificial intelligence (AI) system that has been trained on vast amounts of text data. These models are designed to understand and generate human language, allowing them to perform various natural language processing tasks, such as text generation, language translation, and question answering. Large language models typically use deep learning techniques, like recurrent neural networks (RNNs) or transformers, to learn patterns and relationships in the data, enabling them to generate coherent and contextually relevant responses. The size of these models, in terms of the number of parameters and the volume of data they are trained on, plays a significant role in their ability to comprehend and produce complex language structures.\n" 758 | ] 759 | } 760 | ] 761 | } 762 | ] 763 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Mergekit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "machine_shape": "hm", 8 | "authorship_tag": "ABX9TyNkCdo3uzEUbLA4CS6VfaEM", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | }, 18 | "widgets": { 19 | "application/vnd.jupyter.widget-state+json": { 20 | "de24d272f2b842c5b01eedb3f536b810": { 21 | "model_module": "@jupyter-widgets/controls", 22 | "model_name": "HBoxModel", 23 | "model_module_version": "1.5.0", 24 | "state": { 25 | "_dom_classes": [], 26 | "_model_module": "@jupyter-widgets/controls", 27 | "_model_module_version": "1.5.0", 28 | "_model_name": "HBoxModel", 29 | "_view_count": null, 30 | "_view_module": "@jupyter-widgets/controls", 31 | "_view_module_version": "1.5.0", 32 | "_view_name": "HBoxView", 33 | "box_style": "", 34 | "children": [ 35 | 
"IPY_MODEL_0c5dab2657b2473385a424d90f3d4664", 36 | "IPY_MODEL_57efe36e546c473d8be34102f6ba9a58", 37 | "IPY_MODEL_871bad1d905d4877a9eaa242cfd54c4e" 38 | ], 39 | "layout": "IPY_MODEL_8951f6b2edf64464869391197c900f84" 40 | } 41 | }, 42 | "0c5dab2657b2473385a424d90f3d4664": { 43 | "model_module": "@jupyter-widgets/controls", 44 | "model_name": "HTMLModel", 45 | "model_module_version": "1.5.0", 46 | "state": { 47 | "_dom_classes": [], 48 | "_model_module": "@jupyter-widgets/controls", 49 | "_model_module_version": "1.5.0", 50 | "_model_name": "HTMLModel", 51 | "_view_count": null, 52 | "_view_module": "@jupyter-widgets/controls", 53 | "_view_module_version": "1.5.0", 54 | "_view_name": "HTMLView", 55 | "description": "", 56 | "description_tooltip": null, 57 | "layout": "IPY_MODEL_69a61ad28d5141dcbaea44060bc5ebf7", 58 | "placeholder": "​", 59 | "style": "IPY_MODEL_76c2fbf005ae4a5790edfeb499b387b7", 60 | "value": "tokenizer.model: 100%" 61 | } 62 | }, 63 | "57efe36e546c473d8be34102f6ba9a58": { 64 | "model_module": "@jupyter-widgets/controls", 65 | "model_name": "FloatProgressModel", 66 | "model_module_version": "1.5.0", 67 | "state": { 68 | "_dom_classes": [], 69 | "_model_module": "@jupyter-widgets/controls", 70 | "_model_module_version": "1.5.0", 71 | "_model_name": "FloatProgressModel", 72 | "_view_count": null, 73 | "_view_module": "@jupyter-widgets/controls", 74 | "_view_module_version": "1.5.0", 75 | "_view_name": "ProgressView", 76 | "bar_style": "success", 77 | "description": "", 78 | "description_tooltip": null, 79 | "layout": "IPY_MODEL_116964f328dc45d991d895d684ac1216", 80 | "max": 493443, 81 | "min": 0, 82 | "orientation": "horizontal", 83 | "style": "IPY_MODEL_1ecec5ba4424498082a5f64cf3d7faf8", 84 | "value": 493443 85 | } 86 | }, 87 | "871bad1d905d4877a9eaa242cfd54c4e": { 88 | "model_module": "@jupyter-widgets/controls", 89 | "model_name": "HTMLModel", 90 | "model_module_version": "1.5.0", 91 | "state": { 92 | "_dom_classes": [], 93 | "_model_module": 
"@jupyter-widgets/controls", 94 | "_model_module_version": "1.5.0", 95 | "_model_name": "HTMLModel", 96 | "_view_count": null, 97 | "_view_module": "@jupyter-widgets/controls", 98 | "_view_module_version": "1.5.0", 99 | "_view_name": "HTMLView", 100 | "description": "", 101 | "description_tooltip": null, 102 | "layout": "IPY_MODEL_fc4edcef273b4e75894f4b512122de94", 103 | "placeholder": "​", 104 | "style": "IPY_MODEL_ca2323b142f54998985d30481d5cfabe", 105 | "value": " 493k/493k [00:00<00:00, 42.2kB/s]" 106 | } 107 | }, 108 | "8951f6b2edf64464869391197c900f84": { 109 | "model_module": "@jupyter-widgets/base", 110 | "model_name": "LayoutModel", 111 | "model_module_version": "1.2.0", 112 | "state": { 113 | "_model_module": "@jupyter-widgets/base", 114 | "_model_module_version": "1.2.0", 115 | "_model_name": "LayoutModel", 116 | "_view_count": null, 117 | "_view_module": "@jupyter-widgets/base", 118 | "_view_module_version": "1.2.0", 119 | "_view_name": "LayoutView", 120 | "align_content": null, 121 | "align_items": null, 122 | "align_self": null, 123 | "border": null, 124 | "bottom": null, 125 | "display": null, 126 | "flex": null, 127 | "flex_flow": null, 128 | "grid_area": null, 129 | "grid_auto_columns": null, 130 | "grid_auto_flow": null, 131 | "grid_auto_rows": null, 132 | "grid_column": null, 133 | "grid_gap": null, 134 | "grid_row": null, 135 | "grid_template_areas": null, 136 | "grid_template_columns": null, 137 | "grid_template_rows": null, 138 | "height": null, 139 | "justify_content": null, 140 | "justify_items": null, 141 | "left": null, 142 | "margin": null, 143 | "max_height": null, 144 | "max_width": null, 145 | "min_height": null, 146 | "min_width": null, 147 | "object_fit": null, 148 | "object_position": null, 149 | "order": null, 150 | "overflow": null, 151 | "overflow_x": null, 152 | "overflow_y": null, 153 | "padding": null, 154 | "right": null, 155 | "top": null, 156 | "visibility": null, 157 | "width": null 158 | } 159 | }, 160 | 
"69a61ad28d5141dcbaea44060bc5ebf7": { 161 | "model_module": "@jupyter-widgets/base", 162 | "model_name": "LayoutModel", 163 | "model_module_version": "1.2.0", 164 | "state": { 165 | "_model_module": "@jupyter-widgets/base", 166 | "_model_module_version": "1.2.0", 167 | "_model_name": "LayoutModel", 168 | "_view_count": null, 169 | "_view_module": "@jupyter-widgets/base", 170 | "_view_module_version": "1.2.0", 171 | "_view_name": "LayoutView", 172 | "align_content": null, 173 | "align_items": null, 174 | "align_self": null, 175 | "border": null, 176 | "bottom": null, 177 | "display": null, 178 | "flex": null, 179 | "flex_flow": null, 180 | "grid_area": null, 181 | "grid_auto_columns": null, 182 | "grid_auto_flow": null, 183 | "grid_auto_rows": null, 184 | "grid_column": null, 185 | "grid_gap": null, 186 | "grid_row": null, 187 | "grid_template_areas": null, 188 | "grid_template_columns": null, 189 | "grid_template_rows": null, 190 | "height": null, 191 | "justify_content": null, 192 | "justify_items": null, 193 | "left": null, 194 | "margin": null, 195 | "max_height": null, 196 | "max_width": null, 197 | "min_height": null, 198 | "min_width": null, 199 | "object_fit": null, 200 | "object_position": null, 201 | "order": null, 202 | "overflow": null, 203 | "overflow_x": null, 204 | "overflow_y": null, 205 | "padding": null, 206 | "right": null, 207 | "top": null, 208 | "visibility": null, 209 | "width": null 210 | } 211 | }, 212 | "76c2fbf005ae4a5790edfeb499b387b7": { 213 | "model_module": "@jupyter-widgets/controls", 214 | "model_name": "DescriptionStyleModel", 215 | "model_module_version": "1.5.0", 216 | "state": { 217 | "_model_module": "@jupyter-widgets/controls", 218 | "_model_module_version": "1.5.0", 219 | "_model_name": "DescriptionStyleModel", 220 | "_view_count": null, 221 | "_view_module": "@jupyter-widgets/base", 222 | "_view_module_version": "1.2.0", 223 | "_view_name": "StyleView", 224 | "description_width": "" 225 | } 226 | }, 227 | 
"116964f328dc45d991d895d684ac1216": { 228 | "model_module": "@jupyter-widgets/base", 229 | "model_name": "LayoutModel", 230 | "model_module_version": "1.2.0", 231 | "state": { 232 | "_model_module": "@jupyter-widgets/base", 233 | "_model_module_version": "1.2.0", 234 | "_model_name": "LayoutModel", 235 | "_view_count": null, 236 | "_view_module": "@jupyter-widgets/base", 237 | "_view_module_version": "1.2.0", 238 | "_view_name": "LayoutView", 239 | "align_content": null, 240 | "align_items": null, 241 | "align_self": null, 242 | "border": null, 243 | "bottom": null, 244 | "display": null, 245 | "flex": null, 246 | "flex_flow": null, 247 | "grid_area": null, 248 | "grid_auto_columns": null, 249 | "grid_auto_flow": null, 250 | "grid_auto_rows": null, 251 | "grid_column": null, 252 | "grid_gap": null, 253 | "grid_row": null, 254 | "grid_template_areas": null, 255 | "grid_template_columns": null, 256 | "grid_template_rows": null, 257 | "height": null, 258 | "justify_content": null, 259 | "justify_items": null, 260 | "left": null, 261 | "margin": null, 262 | "max_height": null, 263 | "max_width": null, 264 | "min_height": null, 265 | "min_width": null, 266 | "object_fit": null, 267 | "object_position": null, 268 | "order": null, 269 | "overflow": null, 270 | "overflow_x": null, 271 | "overflow_y": null, 272 | "padding": null, 273 | "right": null, 274 | "top": null, 275 | "visibility": null, 276 | "width": null 277 | } 278 | }, 279 | "1ecec5ba4424498082a5f64cf3d7faf8": { 280 | "model_module": "@jupyter-widgets/controls", 281 | "model_name": "ProgressStyleModel", 282 | "model_module_version": "1.5.0", 283 | "state": { 284 | "_model_module": "@jupyter-widgets/controls", 285 | "_model_module_version": "1.5.0", 286 | "_model_name": "ProgressStyleModel", 287 | "_view_count": null, 288 | "_view_module": "@jupyter-widgets/base", 289 | "_view_module_version": "1.2.0", 290 | "_view_name": "StyleView", 291 | "bar_color": null, 292 | "description_width": "" 293 | } 294 | }, 295 | 
"fc4edcef273b4e75894f4b512122de94": { 296 | "model_module": "@jupyter-widgets/base", 297 | "model_name": "LayoutModel", 298 | "model_module_version": "1.2.0", 299 | "state": { 300 | "_model_module": "@jupyter-widgets/base", 301 | "_model_module_version": "1.2.0", 302 | "_model_name": "LayoutModel", 303 | "_view_count": null, 304 | "_view_module": "@jupyter-widgets/base", 305 | "_view_module_version": "1.2.0", 306 | "_view_name": "LayoutView", 307 | "align_content": null, 308 | "align_items": null, 309 | "align_self": null, 310 | "border": null, 311 | "bottom": null, 312 | "display": null, 313 | "flex": null, 314 | "flex_flow": null, 315 | "grid_area": null, 316 | "grid_auto_columns": null, 317 | "grid_auto_flow": null, 318 | "grid_auto_rows": null, 319 | "grid_column": null, 320 | "grid_gap": null, 321 | "grid_row": null, 322 | "grid_template_areas": null, 323 | "grid_template_columns": null, 324 | "grid_template_rows": null, 325 | "height": null, 326 | "justify_content": null, 327 | "justify_items": null, 328 | "left": null, 329 | "margin": null, 330 | "max_height": null, 331 | "max_width": null, 332 | "min_height": null, 333 | "min_width": null, 334 | "object_fit": null, 335 | "object_position": null, 336 | "order": null, 337 | "overflow": null, 338 | "overflow_x": null, 339 | "overflow_y": null, 340 | "padding": null, 341 | "right": null, 342 | "top": null, 343 | "visibility": null, 344 | "width": null 345 | } 346 | }, 347 | "ca2323b142f54998985d30481d5cfabe": { 348 | "model_module": "@jupyter-widgets/controls", 349 | "model_name": "DescriptionStyleModel", 350 | "model_module_version": "1.5.0", 351 | "state": { 352 | "_model_module": "@jupyter-widgets/controls", 353 | "_model_module_version": "1.5.0", 354 | "_model_name": "DescriptionStyleModel", 355 | "_view_count": null, 356 | "_view_module": "@jupyter-widgets/base", 357 | "_view_module_version": "1.2.0", 358 | "_view_name": "StyleView", 359 | "description_width": "" 360 | } 361 | }, 362 | 
"63626ac2d0f546188c07512a04c71417": { 363 | "model_module": "@jupyter-widgets/controls", 364 | "model_name": "HBoxModel", 365 | "model_module_version": "1.5.0", 366 | "state": { 367 | "_dom_classes": [], 368 | "_model_module": "@jupyter-widgets/controls", 369 | "_model_module_version": "1.5.0", 370 | "_model_name": "HBoxModel", 371 | "_view_count": null, 372 | "_view_module": "@jupyter-widgets/controls", 373 | "_view_module_version": "1.5.0", 374 | "_view_name": "HBoxView", 375 | "box_style": "", 376 | "children": [ 377 | "IPY_MODEL_decd91747fd04ce39f3e2b733bc7f477", 378 | "IPY_MODEL_7140e4c154424fcab846a71889e99ed2", 379 | "IPY_MODEL_2264d8b75251425e94e635558af4e223" 380 | ], 381 | "layout": "IPY_MODEL_c37478198217457cb30c6649203cf4dc" 382 | } 383 | }, 384 | "decd91747fd04ce39f3e2b733bc7f477": { 385 | "model_module": "@jupyter-widgets/controls", 386 | "model_name": "HTMLModel", 387 | "model_module_version": "1.5.0", 388 | "state": { 389 | "_dom_classes": [], 390 | "_model_module": "@jupyter-widgets/controls", 391 | "_model_module_version": "1.5.0", 392 | "_model_name": "HTMLModel", 393 | "_view_count": null, 394 | "_view_module": "@jupyter-widgets/controls", 395 | "_view_module_version": "1.5.0", 396 | "_view_name": "HTMLView", 397 | "description": "", 398 | "description_tooltip": null, 399 | "layout": "IPY_MODEL_4918769e4e984dfda924776e2373154c", 400 | "placeholder": "​", 401 | "style": "IPY_MODEL_9b48494c94cf49b5835489d97f7a24c5", 402 | "value": "model-00001-of-00002.safetensors: 100%" 403 | } 404 | }, 405 | "7140e4c154424fcab846a71889e99ed2": { 406 | "model_module": "@jupyter-widgets/controls", 407 | "model_name": "FloatProgressModel", 408 | "model_module_version": "1.5.0", 409 | "state": { 410 | "_dom_classes": [], 411 | "_model_module": "@jupyter-widgets/controls", 412 | "_model_module_version": "1.5.0", 413 | "_model_name": "FloatProgressModel", 414 | "_view_count": null, 415 | "_view_module": "@jupyter-widgets/controls", 416 | "_view_module_version": 
"1.5.0", 417 | "_view_name": "ProgressView", 418 | "bar_style": "success", 419 | "description": "", 420 | "description_tooltip": null, 421 | "layout": "IPY_MODEL_6ed844da52fe466eb1c10c814489448c", 422 | "max": 9942990000, 423 | "min": 0, 424 | "orientation": "horizontal", 425 | "style": "IPY_MODEL_9c60efa02e80423e828628190dd13bc3", 426 | "value": 9942990000 427 | } 428 | }, 429 | "2264d8b75251425e94e635558af4e223": { 430 | "model_module": "@jupyter-widgets/controls", 431 | "model_name": "HTMLModel", 432 | "model_module_version": "1.5.0", 433 | "state": { 434 | "_dom_classes": [], 435 | "_model_module": "@jupyter-widgets/controls", 436 | "_model_module_version": "1.5.0", 437 | "_model_name": "HTMLModel", 438 | "_view_count": null, 439 | "_view_module": "@jupyter-widgets/controls", 440 | "_view_module_version": "1.5.0", 441 | "_view_name": "HTMLView", 442 | "description": "", 443 | "description_tooltip": null, 444 | "layout": "IPY_MODEL_0170e8cc57d94041956f7afbf2eef449", 445 | "placeholder": "​", 446 | "style": "IPY_MODEL_220c2ba5f2524271b24fe049431a474c", 447 | "value": " 9.94G/9.94G [04:04<00:00, 36.9MB/s]" 448 | } 449 | }, 450 | "c37478198217457cb30c6649203cf4dc": { 451 | "model_module": "@jupyter-widgets/base", 452 | "model_name": "LayoutModel", 453 | "model_module_version": "1.2.0", 454 | "state": { 455 | "_model_module": "@jupyter-widgets/base", 456 | "_model_module_version": "1.2.0", 457 | "_model_name": "LayoutModel", 458 | "_view_count": null, 459 | "_view_module": "@jupyter-widgets/base", 460 | "_view_module_version": "1.2.0", 461 | "_view_name": "LayoutView", 462 | "align_content": null, 463 | "align_items": null, 464 | "align_self": null, 465 | "border": null, 466 | "bottom": null, 467 | "display": null, 468 | "flex": null, 469 | "flex_flow": null, 470 | "grid_area": null, 471 | "grid_auto_columns": null, 472 | "grid_auto_flow": null, 473 | "grid_auto_rows": null, 474 | "grid_column": null, 475 | "grid_gap": null, 476 | "grid_row": null, 477 | 
"grid_template_areas": null, 478 | "grid_template_columns": null, 479 | "grid_template_rows": null, 480 | "height": null, 481 | "justify_content": null, 482 | "justify_items": null, 483 | "left": null, 484 | "margin": null, 485 | "max_height": null, 486 | "max_width": null, 487 | "min_height": null, 488 | "min_width": null, 489 | "object_fit": null, 490 | "object_position": null, 491 | "order": null, 492 | "overflow": null, 493 | "overflow_x": null, 494 | "overflow_y": null, 495 | "padding": null, 496 | "right": null, 497 | "top": null, 498 | "visibility": null, 499 | "width": null 500 | } 501 | }, 502 | "4918769e4e984dfda924776e2373154c": { 503 | "model_module": "@jupyter-widgets/base", 504 | "model_name": "LayoutModel", 505 | "model_module_version": "1.2.0", 506 | "state": { 507 | "_model_module": "@jupyter-widgets/base", 508 | "_model_module_version": "1.2.0", 509 | "_model_name": "LayoutModel", 510 | "_view_count": null, 511 | "_view_module": "@jupyter-widgets/base", 512 | "_view_module_version": "1.2.0", 513 | "_view_name": "LayoutView", 514 | "align_content": null, 515 | "align_items": null, 516 | "align_self": null, 517 | "border": null, 518 | "bottom": null, 519 | "display": null, 520 | "flex": null, 521 | "flex_flow": null, 522 | "grid_area": null, 523 | "grid_auto_columns": null, 524 | "grid_auto_flow": null, 525 | "grid_auto_rows": null, 526 | "grid_column": null, 527 | "grid_gap": null, 528 | "grid_row": null, 529 | "grid_template_areas": null, 530 | "grid_template_columns": null, 531 | "grid_template_rows": null, 532 | "height": null, 533 | "justify_content": null, 534 | "justify_items": null, 535 | "left": null, 536 | "margin": null, 537 | "max_height": null, 538 | "max_width": null, 539 | "min_height": null, 540 | "min_width": null, 541 | "object_fit": null, 542 | "object_position": null, 543 | "order": null, 544 | "overflow": null, 545 | "overflow_x": null, 546 | "overflow_y": null, 547 | "padding": null, 548 | "right": null, 549 | "top": null, 550 
| "visibility": null, 551 | "width": null 552 | } 553 | }, 554 | "9b48494c94cf49b5835489d97f7a24c5": { 555 | "model_module": "@jupyter-widgets/controls", 556 | "model_name": "DescriptionStyleModel", 557 | "model_module_version": "1.5.0", 558 | "state": { 559 | "_model_module": "@jupyter-widgets/controls", 560 | "_model_module_version": "1.5.0", 561 | "_model_name": "DescriptionStyleModel", 562 | "_view_count": null, 563 | "_view_module": "@jupyter-widgets/base", 564 | "_view_module_version": "1.2.0", 565 | "_view_name": "StyleView", 566 | "description_width": "" 567 | } 568 | }, 569 | "6ed844da52fe466eb1c10c814489448c": { 570 | "model_module": "@jupyter-widgets/base", 571 | "model_name": "LayoutModel", 572 | "model_module_version": "1.2.0", 573 | "state": { 574 | "_model_module": "@jupyter-widgets/base", 575 | "_model_module_version": "1.2.0", 576 | "_model_name": "LayoutModel", 577 | "_view_count": null, 578 | "_view_module": "@jupyter-widgets/base", 579 | "_view_module_version": "1.2.0", 580 | "_view_name": "LayoutView", 581 | "align_content": null, 582 | "align_items": null, 583 | "align_self": null, 584 | "border": null, 585 | "bottom": null, 586 | "display": null, 587 | "flex": null, 588 | "flex_flow": null, 589 | "grid_area": null, 590 | "grid_auto_columns": null, 591 | "grid_auto_flow": null, 592 | "grid_auto_rows": null, 593 | "grid_column": null, 594 | "grid_gap": null, 595 | "grid_row": null, 596 | "grid_template_areas": null, 597 | "grid_template_columns": null, 598 | "grid_template_rows": null, 599 | "height": null, 600 | "justify_content": null, 601 | "justify_items": null, 602 | "left": null, 603 | "margin": null, 604 | "max_height": null, 605 | "max_width": null, 606 | "min_height": null, 607 | "min_width": null, 608 | "object_fit": null, 609 | "object_position": null, 610 | "order": null, 611 | "overflow": null, 612 | "overflow_x": null, 613 | "overflow_y": null, 614 | "padding": null, 615 | "right": null, 616 | "top": null, 617 | "visibility": 
null, 618 | "width": null 619 | } 620 | }, 621 | "9c60efa02e80423e828628190dd13bc3": { 622 | "model_module": "@jupyter-widgets/controls", 623 | "model_name": "ProgressStyleModel", 624 | "model_module_version": "1.5.0", 625 | "state": { 626 | "_model_module": "@jupyter-widgets/controls", 627 | "_model_module_version": "1.5.0", 628 | "_model_name": "ProgressStyleModel", 629 | "_view_count": null, 630 | "_view_module": "@jupyter-widgets/base", 631 | "_view_module_version": "1.2.0", 632 | "_view_name": "StyleView", 633 | "bar_color": null, 634 | "description_width": "" 635 | } 636 | }, 637 | "0170e8cc57d94041956f7afbf2eef449": { 638 | "model_module": "@jupyter-widgets/base", 639 | "model_name": "LayoutModel", 640 | "model_module_version": "1.2.0", 641 | "state": { 642 | "_model_module": "@jupyter-widgets/base", 643 | "_model_module_version": "1.2.0", 644 | "_model_name": "LayoutModel", 645 | "_view_count": null, 646 | "_view_module": "@jupyter-widgets/base", 647 | "_view_module_version": "1.2.0", 648 | "_view_name": "LayoutView", 649 | "align_content": null, 650 | "align_items": null, 651 | "align_self": null, 652 | "border": null, 653 | "bottom": null, 654 | "display": null, 655 | "flex": null, 656 | "flex_flow": null, 657 | "grid_area": null, 658 | "grid_auto_columns": null, 659 | "grid_auto_flow": null, 660 | "grid_auto_rows": null, 661 | "grid_column": null, 662 | "grid_gap": null, 663 | "grid_row": null, 664 | "grid_template_areas": null, 665 | "grid_template_columns": null, 666 | "grid_template_rows": null, 667 | "height": null, 668 | "justify_content": null, 669 | "justify_items": null, 670 | "left": null, 671 | "margin": null, 672 | "max_height": null, 673 | "max_width": null, 674 | "min_height": null, 675 | "min_width": null, 676 | "object_fit": null, 677 | "object_position": null, 678 | "order": null, 679 | "overflow": null, 680 | "overflow_x": null, 681 | "overflow_y": null, 682 | "padding": null, 683 | "right": null, 684 | "top": null, 685 | "visibility": 
null, 686 | "width": null 687 | } 688 | }, 689 | "220c2ba5f2524271b24fe049431a474c": { 690 | "model_module": "@jupyter-widgets/controls", 691 | "model_name": "DescriptionStyleModel", 692 | "model_module_version": "1.5.0", 693 | "state": { 694 | "_model_module": "@jupyter-widgets/controls", 695 | "_model_module_version": "1.5.0", 696 | "_model_name": "DescriptionStyleModel", 697 | "_view_count": null, 698 | "_view_module": "@jupyter-widgets/base", 699 | "_view_module_version": "1.2.0", 700 | "_view_name": "StyleView", 701 | "description_width": "" 702 | } 703 | }, 704 | "a6f99dd0662846f9a381d2d507a7b447": { 705 | "model_module": "@jupyter-widgets/controls", 706 | "model_name": "HBoxModel", 707 | "model_module_version": "1.5.0", 708 | "state": { 709 | "_dom_classes": [], 710 | "_model_module": "@jupyter-widgets/controls", 711 | "_model_module_version": "1.5.0", 712 | "_model_name": "HBoxModel", 713 | "_view_count": null, 714 | "_view_module": "@jupyter-widgets/controls", 715 | "_view_module_version": "1.5.0", 716 | "_view_name": "HBoxView", 717 | "box_style": "", 718 | "children": [ 719 | "IPY_MODEL_900b9fcb70a84781bd5b4213df54626d", 720 | "IPY_MODEL_0ea83f270e164795b64f23b143efb300", 721 | "IPY_MODEL_318dcdeac8fb40f88fa60114f1c6a7c1" 722 | ], 723 | "layout": "IPY_MODEL_af89cf715e0e4c5e9f59943a255394c1" 724 | } 725 | }, 726 | "900b9fcb70a84781bd5b4213df54626d": { 727 | "model_module": "@jupyter-widgets/controls", 728 | "model_name": "HTMLModel", 729 | "model_module_version": "1.5.0", 730 | "state": { 731 | "_dom_classes": [], 732 | "_model_module": "@jupyter-widgets/controls", 733 | "_model_module_version": "1.5.0", 734 | "_model_name": "HTMLModel", 735 | "_view_count": null, 736 | "_view_module": "@jupyter-widgets/controls", 737 | "_view_module_version": "1.5.0", 738 | "_view_name": "HTMLView", 739 | "description": "", 740 | "description_tooltip": null, 741 | "layout": "IPY_MODEL_40e23e35299d45d499432b8f1a9bc924", 742 | "placeholder": "​", 743 | "style": 
"IPY_MODEL_126b374e286747768ef7218454534640", 744 | "value": "Upload 3 LFS files: 100%" 745 | } 746 | }, 747 | "0ea83f270e164795b64f23b143efb300": { 748 | "model_module": "@jupyter-widgets/controls", 749 | "model_name": "FloatProgressModel", 750 | "model_module_version": "1.5.0", 751 | "state": { 752 | "_dom_classes": [], 753 | "_model_module": "@jupyter-widgets/controls", 754 | "_model_module_version": "1.5.0", 755 | "_model_name": "FloatProgressModel", 756 | "_view_count": null, 757 | "_view_module": "@jupyter-widgets/controls", 758 | "_view_module_version": "1.5.0", 759 | "_view_name": "ProgressView", 760 | "bar_style": "success", 761 | "description": "", 762 | "description_tooltip": null, 763 | "layout": "IPY_MODEL_bdd26e54eed5477f99b135552e5f3450", 764 | "max": 3, 765 | "min": 0, 766 | "orientation": "horizontal", 767 | "style": "IPY_MODEL_163a6fd878134e1eb5f193d1ebfff1c1", 768 | "value": 3 769 | } 770 | }, 771 | "318dcdeac8fb40f88fa60114f1c6a7c1": { 772 | "model_module": "@jupyter-widgets/controls", 773 | "model_name": "HTMLModel", 774 | "model_module_version": "1.5.0", 775 | "state": { 776 | "_dom_classes": [], 777 | "_model_module": "@jupyter-widgets/controls", 778 | "_model_module_version": "1.5.0", 779 | "_model_name": "HTMLModel", 780 | "_view_count": null, 781 | "_view_module": "@jupyter-widgets/controls", 782 | "_view_module_version": "1.5.0", 783 | "_view_name": "HTMLView", 784 | "description": "", 785 | "description_tooltip": null, 786 | "layout": "IPY_MODEL_953d7c014f76413c9805a2ef8c2c9356", 787 | "placeholder": "​", 788 | "style": "IPY_MODEL_348879bf76d1471f9c79c1ec2dc07c1d", 789 | "value": " 3/3 [04:05<00:00, 245.46s/it]" 790 | } 791 | }, 792 | "af89cf715e0e4c5e9f59943a255394c1": { 793 | "model_module": "@jupyter-widgets/base", 794 | "model_name": "LayoutModel", 795 | "model_module_version": "1.2.0", 796 | "state": { 797 | "_model_module": "@jupyter-widgets/base", 798 | "_model_module_version": "1.2.0", 799 | "_model_name": "LayoutModel", 800 | 
"_view_count": null, 801 | "_view_module": "@jupyter-widgets/base", 802 | "_view_module_version": "1.2.0", 803 | "_view_name": "LayoutView", 804 | "align_content": null, 805 | "align_items": null, 806 | "align_self": null, 807 | "border": null, 808 | "bottom": null, 809 | "display": null, 810 | "flex": null, 811 | "flex_flow": null, 812 | "grid_area": null, 813 | "grid_auto_columns": null, 814 | "grid_auto_flow": null, 815 | "grid_auto_rows": null, 816 | "grid_column": null, 817 | "grid_gap": null, 818 | "grid_row": null, 819 | "grid_template_areas": null, 820 | "grid_template_columns": null, 821 | "grid_template_rows": null, 822 | "height": null, 823 | "justify_content": null, 824 | "justify_items": null, 825 | "left": null, 826 | "margin": null, 827 | "max_height": null, 828 | "max_width": null, 829 | "min_height": null, 830 | "min_width": null, 831 | "object_fit": null, 832 | "object_position": null, 833 | "order": null, 834 | "overflow": null, 835 | "overflow_x": null, 836 | "overflow_y": null, 837 | "padding": null, 838 | "right": null, 839 | "top": null, 840 | "visibility": null, 841 | "width": null 842 | } 843 | }, 844 | "40e23e35299d45d499432b8f1a9bc924": { 845 | "model_module": "@jupyter-widgets/base", 846 | "model_name": "LayoutModel", 847 | "model_module_version": "1.2.0", 848 | "state": { 849 | "_model_module": "@jupyter-widgets/base", 850 | "_model_module_version": "1.2.0", 851 | "_model_name": "LayoutModel", 852 | "_view_count": null, 853 | "_view_module": "@jupyter-widgets/base", 854 | "_view_module_version": "1.2.0", 855 | "_view_name": "LayoutView", 856 | "align_content": null, 857 | "align_items": null, 858 | "align_self": null, 859 | "border": null, 860 | "bottom": null, 861 | "display": null, 862 | "flex": null, 863 | "flex_flow": null, 864 | "grid_area": null, 865 | "grid_auto_columns": null, 866 | "grid_auto_flow": null, 867 | "grid_auto_rows": null, 868 | "grid_column": null, 869 | "grid_gap": null, 870 | "grid_row": null, 871 | 
"grid_template_areas": null, 872 | "grid_template_columns": null, 873 | "grid_template_rows": null, 874 | "height": null, 875 | "justify_content": null, 876 | "justify_items": null, 877 | "left": null, 878 | "margin": null, 879 | "max_height": null, 880 | "max_width": null, 881 | "min_height": null, 882 | "min_width": null, 883 | "object_fit": null, 884 | "object_position": null, 885 | "order": null, 886 | "overflow": null, 887 | "overflow_x": null, 888 | "overflow_y": null, 889 | "padding": null, 890 | "right": null, 891 | "top": null, 892 | "visibility": null, 893 | "width": null 894 | } 895 | }, 896 | "126b374e286747768ef7218454534640": { 897 | "model_module": "@jupyter-widgets/controls", 898 | "model_name": "DescriptionStyleModel", 899 | "model_module_version": "1.5.0", 900 | "state": { 901 | "_model_module": "@jupyter-widgets/controls", 902 | "_model_module_version": "1.5.0", 903 | "_model_name": "DescriptionStyleModel", 904 | "_view_count": null, 905 | "_view_module": "@jupyter-widgets/base", 906 | "_view_module_version": "1.2.0", 907 | "_view_name": "StyleView", 908 | "description_width": "" 909 | } 910 | }, 911 | "bdd26e54eed5477f99b135552e5f3450": { 912 | "model_module": "@jupyter-widgets/base", 913 | "model_name": "LayoutModel", 914 | "model_module_version": "1.2.0", 915 | "state": { 916 | "_model_module": "@jupyter-widgets/base", 917 | "_model_module_version": "1.2.0", 918 | "_model_name": "LayoutModel", 919 | "_view_count": null, 920 | "_view_module": "@jupyter-widgets/base", 921 | "_view_module_version": "1.2.0", 922 | "_view_name": "LayoutView", 923 | "align_content": null, 924 | "align_items": null, 925 | "align_self": null, 926 | "border": null, 927 | "bottom": null, 928 | "display": null, 929 | "flex": null, 930 | "flex_flow": null, 931 | "grid_area": null, 932 | "grid_auto_columns": null, 933 | "grid_auto_flow": null, 934 | "grid_auto_rows": null, 935 | "grid_column": null, 936 | "grid_gap": null, 937 | "grid_row": null, 938 | 
"grid_template_areas": null, 939 | "grid_template_columns": null, 940 | "grid_template_rows": null, 941 | "height": null, 942 | "justify_content": null, 943 | "justify_items": null, 944 | "left": null, 945 | "margin": null, 946 | "max_height": null, 947 | "max_width": null, 948 | "min_height": null, 949 | "min_width": null, 950 | "object_fit": null, 951 | "object_position": null, 952 | "order": null, 953 | "overflow": null, 954 | "overflow_x": null, 955 | "overflow_y": null, 956 | "padding": null, 957 | "right": null, 958 | "top": null, 959 | "visibility": null, 960 | "width": null 961 | } 962 | }, 963 | "163a6fd878134e1eb5f193d1ebfff1c1": { 964 | "model_module": "@jupyter-widgets/controls", 965 | "model_name": "ProgressStyleModel", 966 | "model_module_version": "1.5.0", 967 | "state": { 968 | "_model_module": "@jupyter-widgets/controls", 969 | "_model_module_version": "1.5.0", 970 | "_model_name": "ProgressStyleModel", 971 | "_view_count": null, 972 | "_view_module": "@jupyter-widgets/base", 973 | "_view_module_version": "1.2.0", 974 | "_view_name": "StyleView", 975 | "bar_color": null, 976 | "description_width": "" 977 | } 978 | }, 979 | "953d7c014f76413c9805a2ef8c2c9356": { 980 | "model_module": "@jupyter-widgets/base", 981 | "model_name": "LayoutModel", 982 | "model_module_version": "1.2.0", 983 | "state": { 984 | "_model_module": "@jupyter-widgets/base", 985 | "_model_module_version": "1.2.0", 986 | "_model_name": "LayoutModel", 987 | "_view_count": null, 988 | "_view_module": "@jupyter-widgets/base", 989 | "_view_module_version": "1.2.0", 990 | "_view_name": "LayoutView", 991 | "align_content": null, 992 | "align_items": null, 993 | "align_self": null, 994 | "border": null, 995 | "bottom": null, 996 | "display": null, 997 | "flex": null, 998 | "flex_flow": null, 999 | "grid_area": null, 1000 | "grid_auto_columns": null, 1001 | "grid_auto_flow": null, 1002 | "grid_auto_rows": null, 1003 | "grid_column": null, 1004 | "grid_gap": null, 1005 | "grid_row": null, 
1006 | "grid_template_areas": null, 1007 | "grid_template_columns": null, 1008 | "grid_template_rows": null, 1009 | "height": null, 1010 | "justify_content": null, 1011 | "justify_items": null, 1012 | "left": null, 1013 | "margin": null, 1014 | "max_height": null, 1015 | "max_width": null, 1016 | "min_height": null, 1017 | "min_width": null, 1018 | "object_fit": null, 1019 | "object_position": null, 1020 | "order": null, 1021 | "overflow": null, 1022 | "overflow_x": null, 1023 | "overflow_y": null, 1024 | "padding": null, 1025 | "right": null, 1026 | "top": null, 1027 | "visibility": null, 1028 | "width": null 1029 | } 1030 | }, 1031 | "348879bf76d1471f9c79c1ec2dc07c1d": { 1032 | "model_module": "@jupyter-widgets/controls", 1033 | "model_name": "DescriptionStyleModel", 1034 | "model_module_version": "1.5.0", 1035 | "state": { 1036 | "_model_module": "@jupyter-widgets/controls", 1037 | "_model_module_version": "1.5.0", 1038 | "_model_name": "DescriptionStyleModel", 1039 | "_view_count": null, 1040 | "_view_module": "@jupyter-widgets/base", 1041 | "_view_module_version": "1.2.0", 1042 | "_view_name": "StyleView", 1043 | "description_width": "" 1044 | } 1045 | }, 1046 | "8d54ae0d028b40e7b018454187db1a1c": { 1047 | "model_module": "@jupyter-widgets/controls", 1048 | "model_name": "HBoxModel", 1049 | "model_module_version": "1.5.0", 1050 | "state": { 1051 | "_dom_classes": [], 1052 | "_model_module": "@jupyter-widgets/controls", 1053 | "_model_module_version": "1.5.0", 1054 | "_model_name": "HBoxModel", 1055 | "_view_count": null, 1056 | "_view_module": "@jupyter-widgets/controls", 1057 | "_view_module_version": "1.5.0", 1058 | "_view_name": "HBoxView", 1059 | "box_style": "", 1060 | "children": [ 1061 | "IPY_MODEL_562353040be54593b23734390f49927c", 1062 | "IPY_MODEL_00cbebe6df7d4995913f20e39fc71b15", 1063 | "IPY_MODEL_aee3c563fdc54f9cb3ebc2630c84a9e6" 1064 | ], 1065 | "layout": "IPY_MODEL_b74e307a751844ffab9f7f3df367774b" 1066 | } 1067 | }, 1068 | 
"562353040be54593b23734390f49927c": { 1069 | "model_module": "@jupyter-widgets/controls", 1070 | "model_name": "HTMLModel", 1071 | "model_module_version": "1.5.0", 1072 | "state": { 1073 | "_dom_classes": [], 1074 | "_model_module": "@jupyter-widgets/controls", 1075 | "_model_module_version": "1.5.0", 1076 | "_model_name": "HTMLModel", 1077 | "_view_count": null, 1078 | "_view_module": "@jupyter-widgets/controls", 1079 | "_view_module_version": "1.5.0", 1080 | "_view_name": "HTMLView", 1081 | "description": "", 1082 | "description_tooltip": null, 1083 | "layout": "IPY_MODEL_8e6142e41f714fe9abe6a5bb72c071f9", 1084 | "placeholder": "​", 1085 | "style": "IPY_MODEL_49cd1c5663404fb5a307c345e7e970c3", 1086 | "value": "model-00002-of-00002.safetensors: 100%" 1087 | } 1088 | }, 1089 | "00cbebe6df7d4995913f20e39fc71b15": { 1090 | "model_module": "@jupyter-widgets/controls", 1091 | "model_name": "FloatProgressModel", 1092 | "model_module_version": "1.5.0", 1093 | "state": { 1094 | "_dom_classes": [], 1095 | "_model_module": "@jupyter-widgets/controls", 1096 | "_model_module_version": "1.5.0", 1097 | "_model_name": "FloatProgressModel", 1098 | "_view_count": null, 1099 | "_view_module": "@jupyter-widgets/controls", 1100 | "_view_module_version": "1.5.0", 1101 | "_view_name": "ProgressView", 1102 | "bar_style": "success", 1103 | "description": "", 1104 | "description_tooltip": null, 1105 | "layout": "IPY_MODEL_920ef8e509d24ccda930f4c47eff158c", 1106 | "max": 8030324832, 1107 | "min": 0, 1108 | "orientation": "horizontal", 1109 | "style": "IPY_MODEL_c8828d61b26a47ac97a1541e14c00f62", 1110 | "value": 8030324832 1111 | } 1112 | }, 1113 | "aee3c563fdc54f9cb3ebc2630c84a9e6": { 1114 | "model_module": "@jupyter-widgets/controls", 1115 | "model_name": "HTMLModel", 1116 | "model_module_version": "1.5.0", 1117 | "state": { 1118 | "_dom_classes": [], 1119 | "_model_module": "@jupyter-widgets/controls", 1120 | "_model_module_version": "1.5.0", 1121 | "_model_name": "HTMLModel", 1122 | 
"_view_count": null, 1123 | "_view_module": "@jupyter-widgets/controls", 1124 | "_view_module_version": "1.5.0", 1125 | "_view_name": "HTMLView", 1126 | "description": "", 1127 | "description_tooltip": null, 1128 | "layout": "IPY_MODEL_a3d7e352222647a99be79935b1ebd86a", 1129 | "placeholder": "​", 1130 | "style": "IPY_MODEL_80666ef5f07641c482a23618a767791d", 1131 | "value": " 8.03G/8.03G [03:13<00:00, 54.0MB/s]" 1132 | } 1133 | }, 1134 | "b74e307a751844ffab9f7f3df367774b": { 1135 | "model_module": "@jupyter-widgets/base", 1136 | "model_name": "LayoutModel", 1137 | "model_module_version": "1.2.0", 1138 | "state": { 1139 | "_model_module": "@jupyter-widgets/base", 1140 | "_model_module_version": "1.2.0", 1141 | "_model_name": "LayoutModel", 1142 | "_view_count": null, 1143 | "_view_module": "@jupyter-widgets/base", 1144 | "_view_module_version": "1.2.0", 1145 | "_view_name": "LayoutView", 1146 | "align_content": null, 1147 | "align_items": null, 1148 | "align_self": null, 1149 | "border": null, 1150 | "bottom": null, 1151 | "display": null, 1152 | "flex": null, 1153 | "flex_flow": null, 1154 | "grid_area": null, 1155 | "grid_auto_columns": null, 1156 | "grid_auto_flow": null, 1157 | "grid_auto_rows": null, 1158 | "grid_column": null, 1159 | "grid_gap": null, 1160 | "grid_row": null, 1161 | "grid_template_areas": null, 1162 | "grid_template_columns": null, 1163 | "grid_template_rows": null, 1164 | "height": null, 1165 | "justify_content": null, 1166 | "justify_items": null, 1167 | "left": null, 1168 | "margin": null, 1169 | "max_height": null, 1170 | "max_width": null, 1171 | "min_height": null, 1172 | "min_width": null, 1173 | "object_fit": null, 1174 | "object_position": null, 1175 | "order": null, 1176 | "overflow": null, 1177 | "overflow_x": null, 1178 | "overflow_y": null, 1179 | "padding": null, 1180 | "right": null, 1181 | "top": null, 1182 | "visibility": null, 1183 | "width": null 1184 | } 1185 | }, 1186 | "8e6142e41f714fe9abe6a5bb72c071f9": { 1187 | 
"model_module": "@jupyter-widgets/base", 1188 | "model_name": "LayoutModel", 1189 | "model_module_version": "1.2.0", 1190 | "state": { 1191 | "_model_module": "@jupyter-widgets/base", 1192 | "_model_module_version": "1.2.0", 1193 | "_model_name": "LayoutModel", 1194 | "_view_count": null, 1195 | "_view_module": "@jupyter-widgets/base", 1196 | "_view_module_version": "1.2.0", 1197 | "_view_name": "LayoutView", 1198 | "align_content": null, 1199 | "align_items": null, 1200 | "align_self": null, 1201 | "border": null, 1202 | "bottom": null, 1203 | "display": null, 1204 | "flex": null, 1205 | "flex_flow": null, 1206 | "grid_area": null, 1207 | "grid_auto_columns": null, 1208 | "grid_auto_flow": null, 1209 | "grid_auto_rows": null, 1210 | "grid_column": null, 1211 | "grid_gap": null, 1212 | "grid_row": null, 1213 | "grid_template_areas": null, 1214 | "grid_template_columns": null, 1215 | "grid_template_rows": null, 1216 | "height": null, 1217 | "justify_content": null, 1218 | "justify_items": null, 1219 | "left": null, 1220 | "margin": null, 1221 | "max_height": null, 1222 | "max_width": null, 1223 | "min_height": null, 1224 | "min_width": null, 1225 | "object_fit": null, 1226 | "object_position": null, 1227 | "order": null, 1228 | "overflow": null, 1229 | "overflow_x": null, 1230 | "overflow_y": null, 1231 | "padding": null, 1232 | "right": null, 1233 | "top": null, 1234 | "visibility": null, 1235 | "width": null 1236 | } 1237 | }, 1238 | "49cd1c5663404fb5a307c345e7e970c3": { 1239 | "model_module": "@jupyter-widgets/controls", 1240 | "model_name": "DescriptionStyleModel", 1241 | "model_module_version": "1.5.0", 1242 | "state": { 1243 | "_model_module": "@jupyter-widgets/controls", 1244 | "_model_module_version": "1.5.0", 1245 | "_model_name": "DescriptionStyleModel", 1246 | "_view_count": null, 1247 | "_view_module": "@jupyter-widgets/base", 1248 | "_view_module_version": "1.2.0", 1249 | "_view_name": "StyleView", 1250 | "description_width": "" 1251 | } 1252 | }, 1253 
| "920ef8e509d24ccda930f4c47eff158c": { 1254 | "model_module": "@jupyter-widgets/base", 1255 | "model_name": "LayoutModel", 1256 | "model_module_version": "1.2.0", 1257 | "state": { 1258 | "_model_module": "@jupyter-widgets/base", 1259 | "_model_module_version": "1.2.0", 1260 | "_model_name": "LayoutModel", 1261 | "_view_count": null, 1262 | "_view_module": "@jupyter-widgets/base", 1263 | "_view_module_version": "1.2.0", 1264 | "_view_name": "LayoutView", 1265 | "align_content": null, 1266 | "align_items": null, 1267 | "align_self": null, 1268 | "border": null, 1269 | "bottom": null, 1270 | "display": null, 1271 | "flex": null, 1272 | "flex_flow": null, 1273 | "grid_area": null, 1274 | "grid_auto_columns": null, 1275 | "grid_auto_flow": null, 1276 | "grid_auto_rows": null, 1277 | "grid_column": null, 1278 | "grid_gap": null, 1279 | "grid_row": null, 1280 | "grid_template_areas": null, 1281 | "grid_template_columns": null, 1282 | "grid_template_rows": null, 1283 | "height": null, 1284 | "justify_content": null, 1285 | "justify_items": null, 1286 | "left": null, 1287 | "margin": null, 1288 | "max_height": null, 1289 | "max_width": null, 1290 | "min_height": null, 1291 | "min_width": null, 1292 | "object_fit": null, 1293 | "object_position": null, 1294 | "order": null, 1295 | "overflow": null, 1296 | "overflow_x": null, 1297 | "overflow_y": null, 1298 | "padding": null, 1299 | "right": null, 1300 | "top": null, 1301 | "visibility": null, 1302 | "width": null 1303 | } 1304 | }, 1305 | "c8828d61b26a47ac97a1541e14c00f62": { 1306 | "model_module": "@jupyter-widgets/controls", 1307 | "model_name": "ProgressStyleModel", 1308 | "model_module_version": "1.5.0", 1309 | "state": { 1310 | "_model_module": "@jupyter-widgets/controls", 1311 | "_model_module_version": "1.5.0", 1312 | "_model_name": "ProgressStyleModel", 1313 | "_view_count": null, 1314 | "_view_module": "@jupyter-widgets/base", 1315 | "_view_module_version": "1.2.0", 1316 | "_view_name": "StyleView", 1317 | 
"bar_color": null, 1318 | "description_width": "" 1319 | } 1320 | }, 1321 | "a3d7e352222647a99be79935b1ebd86a": { 1322 | "model_module": "@jupyter-widgets/base", 1323 | "model_name": "LayoutModel", 1324 | "model_module_version": "1.2.0", 1325 | "state": { 1326 | "_model_module": "@jupyter-widgets/base", 1327 | "_model_module_version": "1.2.0", 1328 | "_model_name": "LayoutModel", 1329 | "_view_count": null, 1330 | "_view_module": "@jupyter-widgets/base", 1331 | "_view_module_version": "1.2.0", 1332 | "_view_name": "LayoutView", 1333 | "align_content": null, 1334 | "align_items": null, 1335 | "align_self": null, 1336 | "border": null, 1337 | "bottom": null, 1338 | "display": null, 1339 | "flex": null, 1340 | "flex_flow": null, 1341 | "grid_area": null, 1342 | "grid_auto_columns": null, 1343 | "grid_auto_flow": null, 1344 | "grid_auto_rows": null, 1345 | "grid_column": null, 1346 | "grid_gap": null, 1347 | "grid_row": null, 1348 | "grid_template_areas": null, 1349 | "grid_template_columns": null, 1350 | "grid_template_rows": null, 1351 | "height": null, 1352 | "justify_content": null, 1353 | "justify_items": null, 1354 | "left": null, 1355 | "margin": null, 1356 | "max_height": null, 1357 | "max_width": null, 1358 | "min_height": null, 1359 | "min_width": null, 1360 | "object_fit": null, 1361 | "object_position": null, 1362 | "order": null, 1363 | "overflow": null, 1364 | "overflow_x": null, 1365 | "overflow_y": null, 1366 | "padding": null, 1367 | "right": null, 1368 | "top": null, 1369 | "visibility": null, 1370 | "width": null 1371 | } 1372 | }, 1373 | "80666ef5f07641c482a23618a767791d": { 1374 | "model_module": "@jupyter-widgets/controls", 1375 | "model_name": "DescriptionStyleModel", 1376 | "model_module_version": "1.5.0", 1377 | "state": { 1378 | "_model_module": "@jupyter-widgets/controls", 1379 | "_model_module_version": "1.5.0", 1380 | "_model_name": "DescriptionStyleModel", 1381 | "_view_count": null, 1382 | "_view_module": "@jupyter-widgets/base", 1383 | 
"_view_module_version": "1.2.0", 1384 | "_view_name": "StyleView", 1385 | "description_width": "" 1386 | } 1387 | } 1388 | } 1389 | } 1390 | }, 1391 | "cells": [ 1392 | { 1393 | "cell_type": "markdown", 1394 | "metadata": { 1395 | "id": "view-in-github", 1396 | "colab_type": "text" 1397 | }, 1398 | "source": [ 1399 | "\"Open" 1400 | ] 1401 | }, 1402 | { 1403 | "cell_type": "markdown", 1404 | "source": [ 1405 | "# Merge Large Language Models with mergekit\n", 1406 | "> 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)\n", 1407 | "\n", 1408 | "❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).\n", 1409 | "\n", 1410 | "Model merging does not require a GPU, only a lot of RAM. With a free Google Colab account, you should still be able to run it by using a T4 GPU for VRAM offloading.\n", 1411 | "\n", 1412 | "Examples of merge configurations:\n", 1413 | "\n", 1414 | "### TIES-Merging\n", 1415 | "\n", 1416 | "```yaml\n", 1417 | "models:\n", 1418 | " - model: mistralai/Mistral-7B-v0.1\n", 1419 | " # no parameters necessary for base model\n", 1420 | " - model: OpenPipe/mistral-ft-optimized-1218\n", 1421 | " parameters:\n", 1422 | " density: 0.5\n", 1423 | " weight: 0.5\n", 1424 | " - model: mlabonne/NeuralHermes-2.5-Mistral-7B\n", 1425 | " parameters:\n", 1426 | " density: 0.5\n", 1427 | " weight: 0.3\n", 1428 | "merge_method: ties\n", 1429 | "base_model: mistralai/Mistral-7B-v0.1\n", 1430 | "parameters:\n", 1431 | " normalize: true\n", 1432 | "dtype: float16\n", 1433 | "```\n", 1434 | "\n", 1435 | "You can find the final model on the Hugging Face Hub at [mlabonne/NeuralPipe-7B-ties](https://huggingface.co/mlabonne/NeuralPipe-7B-ties).\n", 1436 | "\n", 1437 | "### SLERP\n", 1438 | "\n", 1439 | "```yaml\n", 1440 | "slices:\n", 1441 | " - sources:\n", 1442 | " - model: OpenPipe/mistral-ft-optimized-1218\n", 1443 | " layer_range: [0, 32]\n", 1444 | " - model: mlabonne/NeuralHermes-2.5-Mistral-7B\n", 1445 | " layer_range: [0, 32]\n", 1446 | "merge_method: slerp\n", 
1447 | "base_model: OpenPipe/mistral-ft-optimized-1218\n", 1448 | "parameters:\n", 1449 | " t:\n", 1450 | " - filter: self_attn\n", 1451 | " value: [0, 0.5, 0.3, 0.7, 1]\n", 1452 | " - filter: mlp\n", 1453 | " value: [1, 0.5, 0.7, 0.3, 0]\n", 1454 | " - value: 0.5\n", 1455 | "dtype: bfloat16\n", 1456 | "```\n", 1457 | "\n", 1458 | "You can find the final model on the Hugging Face Hub at [mlabonne/NeuralPipe-7B-slerp](https://huggingface.co/mlabonne/NeuralPipe-7B-slerp).\n", 1459 | "\n", 1460 | "### Passthrough\n", 1461 | "\n", 1462 | "```yaml\n", 1463 | "slices:\n", 1464 | " - sources:\n", 1465 | " - model: OpenPipe/mistral-ft-optimized-1218\n", 1466 | " layer_range: [0, 32]\n", 1467 | " - sources:\n", 1468 | " - model: mlabonne/NeuralHermes-2.5-Mistral-7B\n", 1469 | " layer_range: [24, 32]\n", 1470 | "merge_method: passthrough\n", 1471 | "dtype: bfloat16\n", 1472 | "```\n", 1473 | "\n", 1474 | "You can find the final model on the Hugging Face Hub at [mlabonne/NeuralPipe-9B-merged](https://huggingface.co/mlabonne/NeuralPipe-9B-merged)." 1475 | ], 1476 | "metadata": { 1477 | "id": "o12O0YjJvvLW" 1478 | } 1479 | }, 1480 | { 1481 | "cell_type": "code", 1482 | "execution_count": null, 1483 | "metadata": { 1484 | "id": "NPNPie5Eo3EZ" 1485 | }, 1486 | "outputs": [], 1487 | "source": [ 1488 | "!git clone https://github.com/cg123/mergekit.git\n", 1489 | "%cd mergekit\n", 1490 | "!pip install -e ." 
1491 | ] 1492 | }, 1493 | { 1494 | "cell_type": "code", 1495 | "source": [ 1496 | "MODEL_NAME = \"Marcoro14-7B-slerp\"\n", 1497 | "yaml_config = \"\"\"\n", 1498 | "slices:\n", 1499 | " - sources:\n", 1500 | " - model: AIDC-ai-business/Marcoroni-7B-v3\n", 1501 | " layer_range: [0, 32]\n", 1502 | " - model: EmbeddedLLM/Mistral-7B-Merge-14-v0.1\n", 1503 | " layer_range: [0, 32]\n", 1504 | "merge_method: slerp\n", 1505 | "base_model: AIDC-ai-business/Marcoroni-7B-v3\n", 1506 | "parameters:\n", 1507 | " t:\n", 1508 | " - filter: self_attn\n", 1509 | " value: [0, 0.5, 0.3, 0.7, 1]\n", 1510 | " - filter: mlp\n", 1511 | " value: [1, 0.5, 0.7, 0.3, 0]\n", 1512 | " - value: 0.5\n", 1513 | "dtype: bfloat16\n", 1514 | "\"\"\"" 1515 | ], 1516 | "metadata": { 1517 | "id": "LGd7jlfCpNcg" 1518 | }, 1519 | "execution_count": 21, 1520 | "outputs": [] 1521 | }, 1522 | { 1523 | "cell_type": "code", 1524 | "source": [ 1525 | "import yaml\n", 1526 | "import torch\n", 1527 | "\n", 1528 | "from mergekit.config import MergeConfiguration\n", 1529 | "from mergekit.merge import MergeOptions, run_merge\n", 1530 | "\n", 1531 | "# Load merge config\n", 1532 | "data = yaml.safe_load(yaml_config)\n", 1533 | "merge_config = MergeConfiguration.model_validate(data)\n", 1534 | "\n", 1535 | "# Merge models\n", 1536 | "run_merge(\n", 1537 | " merge_config,\n", 1538 | " \"./merged\",\n", 1539 | " options=MergeOptions(\n", 1540 | " lora_merge_cache='/tmp',\n", 1541 | " cuda=torch.cuda.is_available(),\n", 1542 | " low_cpu_memory=True, # VRAM offloading\n", 1543 | " copy_tokenizer=True,\n", 1544 | " trust_remote_code=False,\n", 1545 | " lazy_unpickle=False,\n", 1546 | " ),\n", 1547 | ")" 1548 | ], 1549 | "metadata": { 1550 | "id": "d5mYzDo1q96y" 1551 | }, 1552 | "execution_count": null, 1553 | "outputs": [] 1554 | }, 1555 | { 1556 | "cell_type": "code", 1557 | "source": [ 1558 | "!pip install -qU huggingface_hub\n", 1559 | "\n", 1560 | "from huggingface_hub import ModelCard, ModelCardData\n", 1561 | "from 
jinja2 import Template\n", 1562 | "\n", 1563 | "template_text = \"\"\"\n", 1564 | "---\n", 1565 | "license: apache-2.0\n", 1566 | "tags:\n", 1567 | "- merge\n", 1568 | "- mergekit\n", 1569 | "---\n", 1570 | "\n", 1571 | "# {{ model_name }}\n", 1572 | "\n", 1573 | "This model is a merge of the following models made with [mergekit](https://github.com/cg123/mergekit):\n", 1574 | "\n", 1575 | "{%- for model in models %}\n", 1576 | " * [{{ model }}](https://huggingface.co/{{ model }})\n", 1577 | "{%- endfor %}\n", 1578 | "\n", 1579 | "## 🧩 Configuration\n", 1580 | "\n", 1581 | "```yaml\n", 1582 | "{{- yaml_config -}}\n", 1583 | "```\n", 1584 | "\"\"\"\n", 1585 | "\n", 1586 | "# Create a Jinja template object\n", 1587 | "jinja_template = Template(template_text.strip())\n", 1588 | "\n", 1589 | "# Get list of models from config\n", 1590 | "if \"models\" in data:\n", 1591 | " models = [data[\"models\"][i][\"model\"] for i in range(len(data[\"models\"])) if \"parameters\" in data[\"models\"][i]]\n", 1592 | "elif \"parameters\" in data:\n", 1593 | " models = [data[\"slices\"][0][\"sources\"][i][\"model\"] for i in range(len(data[\"slices\"][0][\"sources\"]))]\n", 1594 | "elif \"slices\" in data:\n", 1595 | " models = [data[\"slices\"][i][\"sources\"][0][\"model\"] for i in range(len(data[\"slices\"]))]\n", 1596 | "else:\n", 1597 | " raise Exception(\"No models or slices found in yaml config\")\n", 1598 | "\n", 1599 | "# Fill the template\n", 1600 | "content = jinja_template.render(\n", 1601 | " model_name=MODEL_NAME,\n", 1602 | " models=models,\n", 1603 | " yaml_config=yaml_config,\n", 1604 | ")\n", 1605 | "\n", 1606 | "# Save the model card\n", 1607 | "card = ModelCard(content)\n", 1608 | "card.save('merged/README.md')" 1609 | ], 1610 | "metadata": { 1611 | "colab": { 1612 | "base_uri": "https://localhost:8080/" 1613 | }, 1614 | "id": "w-RNKev373lI", 1615 | "outputId": "fccbbd1d-295f-4def-a398-f226813294bb" 1616 | }, 1617 | "execution_count": null, 1618 | "outputs": [ 1619 | 
{ 1620 | "output_type": "stream", 1621 | "name": "stdout", 1622 | "text": [ 1623 | "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/330.1 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.6/330.1 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m330.1/330.1 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1624 | "\u001b[?25h" 1625 | ] 1626 | } 1627 | ] 1628 | }, 1629 | { 1630 | "cell_type": "code", 1631 | "source": [ 1632 | "from google.colab import userdata\n", 1633 | "from huggingface_hub import HfApi\n", 1634 | "\n", 1635 | "username = \"mlabonne\"\n", 1636 | "\n", 1637 | "# Defined in the secrets tab in Google Colab\n", 1638 | "api = HfApi(token=userdata.get(\"HF_TOKEN\"))\n", 1639 | "\n", 1640 | "api.create_repo(\n", 1641 | " repo_id=f\"{username}/{MODEL_NAME}\",\n", 1642 | " repo_type=\"model\"\n", 1643 | ")\n", 1644 | "api.upload_folder(\n", 1645 | " repo_id=f\"{username}/{MODEL_NAME}\",\n", 1646 | " folder_path=\"merged\",\n", 1647 | ")" 1648 | ], 1649 | "metadata": { 1650 | "colab": { 1651 | "base_uri": "https://localhost:8080/", 1652 | "height": 164, 1653 | "referenced_widgets": [ 1654 | "de24d272f2b842c5b01eedb3f536b810", 1655 | "0c5dab2657b2473385a424d90f3d4664", 1656 | "57efe36e546c473d8be34102f6ba9a58", 1657 | "871bad1d905d4877a9eaa242cfd54c4e", 1658 | "8951f6b2edf64464869391197c900f84", 1659 | "69a61ad28d5141dcbaea44060bc5ebf7", 1660 | "76c2fbf005ae4a5790edfeb499b387b7", 1661 | "116964f328dc45d991d895d684ac1216", 1662 | "1ecec5ba4424498082a5f64cf3d7faf8", 1663 | "fc4edcef273b4e75894f4b512122de94", 1664 | "ca2323b142f54998985d30481d5cfabe", 1665 | "63626ac2d0f546188c07512a04c71417", 1666 | "decd91747fd04ce39f3e2b733bc7f477", 1667 | 
"7140e4c154424fcab846a71889e99ed2", 1668 | "2264d8b75251425e94e635558af4e223", 1669 | "c37478198217457cb30c6649203cf4dc", 1670 | "4918769e4e984dfda924776e2373154c", 1671 | "9b48494c94cf49b5835489d97f7a24c5", 1672 | "6ed844da52fe466eb1c10c814489448c", 1673 | "9c60efa02e80423e828628190dd13bc3", 1674 | "0170e8cc57d94041956f7afbf2eef449", 1675 | "220c2ba5f2524271b24fe049431a474c", 1676 | "a6f99dd0662846f9a381d2d507a7b447", 1677 | "900b9fcb70a84781bd5b4213df54626d", 1678 | "0ea83f270e164795b64f23b143efb300", 1679 | "318dcdeac8fb40f88fa60114f1c6a7c1", 1680 | "af89cf715e0e4c5e9f59943a255394c1", 1681 | "40e23e35299d45d499432b8f1a9bc924", 1682 | "126b374e286747768ef7218454534640", 1683 | "bdd26e54eed5477f99b135552e5f3450", 1684 | "163a6fd878134e1eb5f193d1ebfff1c1", 1685 | "953d7c014f76413c9805a2ef8c2c9356", 1686 | "348879bf76d1471f9c79c1ec2dc07c1d", 1687 | "8d54ae0d028b40e7b018454187db1a1c", 1688 | "562353040be54593b23734390f49927c", 1689 | "00cbebe6df7d4995913f20e39fc71b15", 1690 | "aee3c563fdc54f9cb3ebc2630c84a9e6", 1691 | "b74e307a751844ffab9f7f3df367774b", 1692 | "8e6142e41f714fe9abe6a5bb72c071f9", 1693 | "49cd1c5663404fb5a307c345e7e970c3", 1694 | "920ef8e509d24ccda930f4c47eff158c", 1695 | "c8828d61b26a47ac97a1541e14c00f62", 1696 | "a3d7e352222647a99be79935b1ebd86a", 1697 | "80666ef5f07641c482a23618a767791d" 1698 | ] 1699 | }, 1700 | "id": "ik0V0dF55gfU", 1701 | "outputId": "9f6c605c-6b51-473d-c1fa-b103e9208785" 1702 | }, 1703 | "execution_count": null, 1704 | "outputs": [ 1705 | { 1706 | "output_type": "display_data", 1707 | "data": { 1708 | "text/plain": [ 1709 | "tokenizer.model: 0%| | 0.00/493k [00:00Follow me on XBlogHands-on GNN

4 | 5 | The LLM course is divided into three parts: 6 | 7 | 1. 🧩 **LLM Fundamentals** covers essential knowledge about mathematics, Python, and neural networks. 8 | 2. 🧑‍🔬 **The LLM Scientist** focuses on learning how to build the best possible LLMs using the latest techniques 9 | 3. 👷 **The LLM Engineer** focuses on how to create LLM-based solutions and deploy them. 10 | 11 | ## 📝 Notebooks 12 | 13 | A list of notebooks and articles related to large language models. 14 | 15 | ### Fine-tuning 16 | 17 | | Notebook | Description | Article | Notebook | 18 | |---------------------------------------|-------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------| 19 | | Fine-tune Llama 2 in Google Colab | Step-by-step guide to fine-tune your first Llama 2 model. | [Article](https://mlabonne.github.io/blog/posts/Fine_Tune_Your_Own_Llama_2_Model_in_a_Colab_Notebook.html) | Open In Colab | 20 | | Fine-tune LLMs with Axolotl | End-to-end guide to the state-of-the-art tool for fine-tuning. | [Article](https://mlabonne.github.io/blog/posts/A_Beginners_Guide_to_LLM_Finetuning.html) | W.I.P. | 21 | | Fine-tune a Mistral-7b model with DPO | Boost the performance of supervised fine-tuned models with DPO. 
| [Tweet](https://twitter.com/maximelabonne/status/1729936514107290022) | Open In Colab | 22 | 23 | ### Quantization 24 | 25 | | Notebook | Description | Article | Notebook | 26 | |---------------------------------------|-------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------| 27 | | 1. Introduction to Weight Quantization | Large language model optimization using 8-bit quantization. | [Article](https://mlabonne.github.io/blog/posts/Introduction_to_Weight_Quantization.html) | Open In Colab | 28 | | 2. 4-bit LLM Quantization using GPTQ | Quantize your own open-source LLMs to run them on consumer hardware. | [Article](https://mlabonne.github.io/blog/4bit_quantization/) | Open In Colab | 29 | | 3. Quantize Llama 2 models with GGUF and llama.cpp | Quantize Llama 2 models with llama.cpp and upload GGUF versions to the HF Hub. | [Article](https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html) | Open In Colab | 30 | | 4. ExLlamaV2: The Fastest Library to Run LLMs | Quantize and run EXL2 models and upload them to the HF Hub. 
| [Article](https://mlabonne.github.io/blog/posts/ExLlamaV2_The_Fastest_Library_to_Run%C2%A0LLMs.html) | Open In Colab | 31 | 32 | ### Other 33 | 34 | | Notebook | Description | Article | Notebook | 35 | |---------------------------------------|-------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------| 36 | | Merge LLMs with Mergekit | Combine multiple LLMs and create your own Frankenstein models | [Tweet](https://twitter.com/maximelabonne/status/1740732104554807676) | Open In Colab | 37 | | Decoding Strategies in Large Language Models | A guide to text generation from beam search to nucleus sampling | [Article](https://mlabonne.github.io/blog/posts/2022-06-07-Decoding_strategies.html) | Open In Colab | 38 | | Visualizing GPT-2's Loss Landscape | 3D plot of the loss landscape based on weight pertubations. | [Tweet](https://twitter.com/maximelabonne/status/1667618081844219904) | Open In Colab | 39 | | Improve ChatGPT with Knowledge Graphs | Augment ChatGPT's answers with knowledge graphs. | [Article](https://mlabonne.github.io/blog/posts/Article_Improve_ChatGPT_with_Knowledge_Graphs.html) | Open In Colab | 40 | 41 | ## 🧩 LLM Fundamentals 42 | 43 | ![](images/roadmap_fundamentals.png) 44 | 45 | ### 1. Mathematics for Machine Learning 46 | 47 | Before mastering machine learning, it is important to understand the fundamental mathematical concepts that power these algorithms. 48 | 49 | - **Linear Algebra**: This is crucial for understanding many algorithms, especially those used in deep learning. Key concepts include vectors, matrices, determinants, eigenvalues and eigenvectors, vector spaces, and linear transformations. 
50 | - **Calculus**: Many machine learning algorithms involve the optimization of continuous functions, which requires an understanding of derivatives, integrals, limits, and series. Multivariable calculus and the concept of gradients are also important. 51 | - **Probability and Statistics**: These are crucial for understanding how models learn from data and make predictions. Key concepts include probability theory, random variables, probability distributions, expectations, variance, covariance, correlation, hypothesis testing, confidence intervals, maximum likelihood estimation, and Bayesian inference. 52 | 53 | 📚 Resources: 54 | 55 | - [3Blue1Brown - The Essence of Linear Algebra](https://www.youtube.com/watch?v=fNk_zzaMoSs&list=PLZHQObOWTQDPD3MizzM2xVFitgF8hE_ab): Series of videos that give a geometric intuition to these concepts. 56 | - [StatQuest with Josh Starmer - Statistics Fundamentals](https://www.youtube.com/watch?v=qBigTkBLU6g&list=PLblh5JKOoLUK0FLuzwntyYI10UQFUhsY9): Offers simple and clear explanations for many statistical concepts. 57 | - [AP Statistics Intuition by Ms Aerin](https://automata88.medium.com/list/cacc224d5e7d): List of Medium articles that provide the intuition behind every probability distribution. 58 | - [Immersive Linear Algebra](https://immersivemath.com/ila/learnmore.html): Another visual interpretation of linear algebra. 59 | - [Khan Academy - Linear Algebra](https://www.khanacademy.org/math/linear-algebra): Great for beginners as it explains the concepts in a very intuitive way. 60 | - [Khan Academy - Calculus](https://www.khanacademy.org/math/calculus-1): An interactive course that covers all the basics of calculus. 61 | - [Khan Academy - Probability and Statistics](https://www.khanacademy.org/math/statistics-probability): Delivers the material in an easy-to-understand format. 62 | 63 | --- 64 | 65 | ### 2. 
Python for Machine Learning 66 | 67 | Python is a powerful and flexible programming language that's particularly good for machine learning, thanks to its readability, consistency, and robust ecosystem of data science libraries. 68 | 69 | - **Python Basics**: Understanding of Python's basic syntax, data types, error handling, and object-oriented programming is crucial. 70 | - **Data Science Libraries**: Familiarity with NumPy for numerical operations, Pandas for data manipulation and analysis, Matplotlib and Seaborn for data visualization is a must. 71 | - **Data Preprocessing**: This involves feature scaling and normalization, handling missing data, outlier detection, categorical data encoding, and splitting data into training, validation, and test sets. 72 | - **Machine Learning Libraries**: Proficiency with Scikit-learn, a library providing a wide selection of supervised and unsupervised learning algorithms, is vital. Understanding how to implement algorithms like linear regression, logistic regression, decision trees, random forests, k-nearest neighbors (K-NN), and K-means clustering is important. Dimensionality reduction techniques like PCA and t-SNE are also very helpful for visualizing high-dimensional data. 73 | 74 | 📚 Resources: 75 | 76 | - [Real Python](https://realpython.com/): A comprehensive resource with articles and tutorials for both beginner and advanced Python concepts. 77 | - [freeCodeCamp - Learn Python](https://www.youtube.com/watch?v=rfscVS0vtbw): Long video that provides a full introduction into all of the core concepts in Python. 78 | - [Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/): Free digital book that is a great resource for learning pandas, NumPy, matplotlib, and Seaborn. 79 | - [freeCodeCamp - Machine Learning for Everybody](https://youtu.be/i_LwzRVP7bg): Practical introduction to different machine learning algorithms for beginners. 
80 | - [Udacity - Intro to Machine Learning](https://www.udacity.com/course/intro-to-machine-learning--ud120): Free course that covers PCA and several other machine learning concepts. 81 | 82 | --- 83 | 84 | ### 3. Neural Networks 85 | 86 | Neural networks are a fundamental part of many machine learning models, particularly in the realm of deep learning. To utilize them effectively, a comprehensive understanding of their design and mechanics is essential. 87 | 88 | - **Fundamentals**: This includes understanding the structure of a neural network such as layers, weights, biases, activation functions (sigmoid, tanh, ReLU, etc.) 89 | - **Training and Optimization**: Familiarize yourself with backpropagation and different types of loss functions, like Mean Squared Error (MSE) and Cross-Entropy. Understand various optimization algorithms like Gradient Descent, Stochastic Gradient Descent, RMSprop, and Adam. 90 | - **Overfitting**: It's crucial to comprehend the concept of overfitting (where a model performs well on training data but poorly on unseen data) and various regularization techniques to prevent it. Techniques include dropout, L1/L2 regularization, early stopping, and data augmentation. 91 | - **Implement a Multilayer Perceptron (MLP)**: Build an MLP, also known as a fully connected network, using PyTorch. 92 | 93 | 📚 Resources: 94 | 95 | - [3Blue1Brown - But what is a Neural Network?](https://www.youtube.com/watch?v=aircAruvnKk): This video gives an intuitive explanation of neural networks and their inner workings. 96 | - [freeCodeCamp - Deep Learning Crash Course](https://www.youtube.com/watch?v=VyWAvY2CF9c): This video efficiently introduces all the most important concepts in deep learning. 97 | - [Fast.ai - Practical Deep Learning](https://course.fast.ai/): Free course designed for people with coding experience who want to learn about deep learning. 
98 | - [Patrick Loeber - PyTorch Tutorials](https://www.youtube.com/playlist?list=PLqnslRFeH2UrcDBWF5mfPGpqQDSta6VK4): Series of videos for complete beginners to learn about PyTorch. 99 | 100 | --- 101 | 102 | ### 4. Natural Language Processing (NLP) 103 | 104 | NLP is a fascinating branch of artificial intelligence that bridges the gap between human language and machine understanding. From simple text processing to understanding linguistic nuances, NLP plays a crucial role in many applications like translation, sentiment analysis, chatbots, and much more. 105 | 106 | - **Text Preprocessing**: Learn various text preprocessing steps like tokenization (splitting text into words or sentences), stemming (reducing words to their root form), lemmatization (similar to stemming but considers the context), stop word removal, etc. 107 | - **Feature Extraction Techniques**: Become familiar with techniques to convert text data into a format that can be understood by machine learning algorithms. Key methods include Bag-of-words (BoW), Term Frequency-Inverse Document Frequency (TF-IDF), and n-grams. 108 | - **Word Embeddings**: Word embeddings are a type of word representation that allows words with similar meanings to have similar representations. Key methods include Word2Vec, GloVe, and FastText. 109 | - **Recurrent Neural Networks (RNNs)**: Understand the working of RNNs, a type of neural network designed to work with sequence data. Explore LSTMs and GRUs, two RNN variants that are capable of learning long-term dependencies. 110 | 111 | 📚 Resources: 112 | 113 | - [RealPython - NLP with spaCy in Python](https://realpython.com/natural-language-processing-spacy-python/): Exhaustive guide about the spaCy library for NLP tasks in Python. 114 | - [Kaggle - NLP Guide](https://www.kaggle.com/learn-guide/natural-language-processing): A few notebooks and resources for a hands-on explanation of NLP in Python. 
115 | - [Jay Alammar - The Illustrated Word2Vec](https://jalammar.github.io/illustrated-word2vec/): A good reference to understand the famous Word2Vec architecture. 116 | - [Jake Tae - PyTorch RNN from Scratch](https://jaketae.github.io/study/pytorch-rnn/): Practical and simple implementation of RNN, LSTM, and GRU models in PyTorch. 117 | - [colah's blog - Understanding LSTM Networks](https://colah.github.io/posts/2015-08-Understanding-LSTMs/): A more theoretical article about the LSTM network. 118 | 119 | ## 🧑‍🔬 The LLM Scientist 120 | 121 | ![](images/roadmap_scientist.png) 122 | 123 | ### 1. The LLM architecture 124 | 125 | While an in-depth knowledge about the Transformer architecture is not required, it is important to have a good understanding of its inputs (tokens) and outputs (logits). The vanilla attention mechanism is another crucial component to master, as improved versions of it are introduced later on. 126 | 127 | * **High-level view**: Revisit the encoder-decoder Transformer architecture, and more specifically the decoder-only GPT architecture, which is used in every modern LLM. 128 | * **Tokenization**: Understand how to convert raw text data into a format that the model can understand, which involves splitting the text into tokens (usually words or subwords). 129 | * **Attention mechanisms**: Grasp the theory behind attention mechanisms, including self-attention and scaled dot-product attention, which allows the model to focus on different parts of the input when producing an output. 130 | * **Text generation**: Learn about the different ways the model can generate output sequences. Common strategies include greedy decoding, beam search, top-k sampling, and nucleus sampling. 131 | 132 | 📚 **References**: 133 | - [The Illustrated Transformer](https://jalammar.github.io/illustrated-transformer/) by Jay Alammar: A visual and intuitive explanation of the Transformer model.
134 | - [The Illustrated GPT-2](https://jalammar.github.io/illustrated-gpt2/) by Jay Alammar: Even more important than the previous article, it is focused on the GPT architecture, which is very similar to Llama's. 135 | * [nanoGPT](https://www.youtube.com/watch?v=kCc8FmEb1nY) by Andrej Karpathy: A 2h-long YouTube video to reimplement GPT from scratch (for programmers). 136 | * [Attention? Attention!](https://lilianweng.github.io/posts/2018-06-24-attention/) by Lilian Weng: Introduce the need for attention in a more formal way. 137 | * [Decoding Strategies in LLMs](https://mlabonne.github.io/blog/posts/2023-06-07-Decoding_strategies.html): Provide code and a visual introduction to the different decoding strategies to generate text. 138 | 139 | --- 140 | ### 2. Building an instruction dataset 141 | 142 | While it's easy to find raw data from Wikipedia and other websites, it's difficult to collect pairs of instructions and answers in the wild. Like in traditional machine learning, the quality of the dataset will directly influence the quality of the model, which is why it might be the most important component in the fine-tuning process. 143 | 144 | * **[Alpaca](https://crfm.stanford.edu/2023/03/13/alpaca.html)-like dataset**: Generate synthetic data from scratch with the OpenAI API (GPT). You can specify seeds and system prompts to create a diverse dataset. 145 | * **Advanced techniques**: Learn how to improve existing datasets with [Evol-Instruct](https://arxiv.org/abs/2304.12244), how to generate high-quality synthetic data like in the [Orca](https://arxiv.org/abs/2306.02707) and [phi-1](https://arxiv.org/abs/2306.11644) papers. 146 | * **Filtering data**: Traditional techniques involving regex, removing near-duplicates, focusing on answers with a high number of tokens, etc. 
147 | * **Prompt templates**: There's no true standard way of formatting instructions and answers, which is why it's important to know about the different chat templates, such as [ChatML](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/chatgpt?tabs=python&pivots=programming-language-chat-ml), [Alpaca](https://crfm.stanford.edu/2023/03/13/alpaca.html), etc. 148 | 149 | 📚 **References**: 150 | * [Preparing a Dataset for Instruction tuning](https://wandb.ai/capecape/alpaca_ft/reports/How-to-Fine-Tune-an-LLM-Part-1-Preparing-a-Dataset-for-Instruction-Tuning--Vmlldzo1NTcxNzE2) by Thomas Capelle: Exploration of the Alpaca and Alpaca-GPT4 datasets and how to format them. 151 | * [Generating a Clinical Instruction Dataset](https://medium.com/mlearning-ai/generating-a-clinical-instruction-dataset-in-portuguese-with-langchain-and-gpt-4-6ee9abfa41ae) by Solano Todeschini: Tutorial on how to create a synthetic instruction dataset using GPT-4. 152 | * [GPT 3.5 for news classification](https://medium.com/@kshitiz.sahay26/how-i-created-an-instruction-dataset-using-gpt-3-5-to-fine-tune-llama-2-for-news-classification-ed02fe41c81f) by Kshitiz Sahay: Use GPT 3.5 to create an instruction dataset to fine-tune Llama 2 for news classification. 153 | * [Dataset creation for fine-tuning LLM](https://colab.research.google.com/drive/1GH8PW9-zAe4cXEZyOIE-T9uHXblIldAg?usp=sharing): Notebook that contains a few techniques to filter a dataset and upload the result. 154 | * [Chat Template](https://huggingface.co/blog/chat-templates) by Matthew Carrigan: Hugging Face's page about prompt templates 155 | 156 | --- 157 | ### 3. Pre-training models 158 | 159 | Pre-training is a very long and costly process, which is why this is not the focus of this course. It's good to have some level of understanding of what happens during pre-training, but hands-on experience is not required. 
160 | 161 | * **Data pipeline**: Pre-training requires huge datasets (e.g., [Llama 2](https://arxiv.org/abs/2307.09288) was trained on 2 trillion tokens) that need to be filtered, tokenized, and collated with a pre-defined vocabulary. 162 | * **Causal language modeling**: Learn the difference between causal and masked language modeling, as well as the loss function used in this case. For efficient pre-training, learn more about [Megatron-LM](https://github.com/NVIDIA/Megatron-LM). 163 | * **Scaling laws**: The [scaling laws](https://arxiv.org/pdf/2001.08361.pdf) describe the expected model performance based on the model size, dataset size, and the amount of compute used for training. 164 | * **High-Performance Computing**: Out of scope here, but more knowledge about HPC is fundamental if you're planning to create your own LLM from scratch (hardware, distributed workload, etc.). 165 | 166 | 📚 **References**: 167 | * [LLMDataHub](https://github.com/Zjh-819/LLMDataHub) by Junhao Zhao: Curated list of datasets for pre-training, fine-tuning, and RLHF. 168 | * [Training a causal language model from scratch](https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt) by Hugging Face: Pre-train a GPT-2 model from scratch using the transformers library. 169 | * [Megatron-LM](https://github.com/NVIDIA/Megatron-LM): State-of-the-art library to efficiently pre-train models. 170 | * [TinyLlama](https://github.com/jzhang38/TinyLlama) by Zhang et al.: Check this project to get a good understanding of how a Llama model is trained from scratch. 171 | * [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) by Hugging Face: Explain the difference between causal and masked language modeling and how to quickly fine-tune a DistilGPT-2 model. 
172 | * [Chinchilla's wild implications](https://www.lesswrong.com/posts/6Fpvch8RR29qLEWNH/chinchilla-s-wild-implications) by nostalgebraist: Discuss the scaling laws and explain what they mean to LLMs in general. 173 | * [BLOOM](https://bigscience.notion.site/BLOOM-BigScience-176B-Model-ad073ca07cdf479398d5f95d88e218c4) by BigScience: Notion page that describes how the BLOOM model was built, with a lot of useful information about the engineering part and the problems that were encountered. 174 | * [OPT-175B Logbook](https://github.com/facebookresearch/metaseq/blob/main/projects/OPT/chronicles/OPT175B_Logbook.pdf) by Meta: Research logs showing what went wrong and what went right. Useful if you're planning to pre-train a very large language model (in this case, 175B parameters). 175 | 176 | --- 177 | ### 4. Supervised Fine-Tuning 178 | 179 | Pre-trained models are only trained on a next-token prediction task, which is why they're not helpful assistants. SFT allows you to tweak them into responding to instructions. Moreover, it allows you to fine-tune your model on any data (private, not seen by GPT-4, etc.) and use it without having to pay for an API like OpenAI's. 180 | 181 | * **Full fine-tuning**: Full fine-tuning refers to training all the parameters in the model. It is not an efficient technique, but it produces slightly better results. 182 | * [**LoRA**](https://arxiv.org/abs/2106.09685): A parameter-efficient technique (PEFT) based on low-rank adapters. Instead of training all the parameters, we only train these adapters. 183 | * [**QLoRA**](https://arxiv.org/abs/2305.14314): Another PEFT based on LoRA, which also quantizes the weights of the model in 4 bits and introduces paged optimizers to manage memory spikes. 184 | * **[Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl)**: A user-friendly and powerful fine-tuning tool that is used in a lot of state-of-the-art open-source models.
185 | * [**DeepSpeed**](https://www.deepspeed.ai/): Efficient pre-training and fine-tuning of LLMs for multi-GPU and multi-node settings (implemented in Axolotl). 186 | 187 | 📚 **References**: 188 | * [The Novice's LLM Training Guide](https://rentry.org/llm-training) by Alpin: Overview of the main concepts and parameters to consider when fine-tuning LLMs. 189 | * [LoRA insights](https://lightning.ai/pages/community/lora-insights/) by Sebastian Raschka: Practical insights about LoRA and how to select the best parameters. 190 | * [Fine-Tune Your Own Llama 2 Model](https://mlabonne.github.io/blog/posts/Fine_Tune_Your_Own_Llama_2_Model_in_a_Colab_Notebook.html): Hands-on tutorial on how to fine-tune a Llama 2 model using Hugging Face libraries. 191 | * [Padding Large Language Models](https://towardsdatascience.com/padding-large-language-models-examples-with-llama-2-199fb10df8ff) by Benjamin Marie: Best practices to pad training examples for causal LLMs 192 | * [A Beginner's Guide to LLM Fine-Tuning](https://mlabonne.github.io/blog/posts/A_Beginners_Guide_to_LLM_Finetuning.html): Tutorial on how to fine-tune a CodeLlama model using Axolotl. 193 | 194 | --- 195 | ### 5. Reinforcement Learning from Human Feedback 196 | 197 | After supervised fine-tuning, RLHF is a step used to align the LLM's answers with human expectations. The idea is to learn preferences from human (or artificial) feedback, which can be used to reduce biases, censor models, or make them act in a more useful way. It is more complex than SFT and often seen as optional. 198 | 199 | * **Preference datasets**: These datasets typically contain several answers with some kind of ranking, which makes them more difficult to produce than instruction datasets. 200 | * [**Proximal Policy Optimization**](https://arxiv.org/abs/1707.06347): This algorithm leverages a reward model that predicts whether a given text is highly ranked by humans. 
This prediction is then used to optimize the SFT model with a penalty based on KL divergence. 201 | * **[Direct Preference Optimization](https://arxiv.org/abs/2305.18290)**: DPO simplifies the process by reframing it as a classification problem. It uses a reference model instead of a reward model (no training needed) and only requires one hyperparameter, making it more stable and efficient. 202 | 203 | 📚 **References**: 204 | * [An Introduction to Training LLMs using RLHF](https://wandb.ai/ayush-thakur/Intro-RLAIF/reports/An-Introduction-to-Training-LLMs-Using-Reinforcement-Learning-From-Human-Feedback-RLHF---VmlldzozMzYyNjcy) by Ayush Thakur: Explain why RLHF is desirable to reduce bias and increase performance in LLMs. 205 | * [Illustrating RLHF](https://huggingface.co/blog/rlhf) by Hugging Face: Introduction to RLHF with reward model training and fine-tuning with reinforcement learning. 206 | * [StackLLaMA](https://huggingface.co/blog/stackllama) by Hugging Face: Tutorial to efficiently align a LLaMA model with RLHF using the transformers library. 207 | * [Fine-tune Llama 2 with DPO](https://huggingface.co/blog/dpo-trl) by Hugging Face: Tutorial to fine-tune a Llama 2 model with DPO. 208 | * [LLM Training: RLHF and Its Alternatives](https://substack.com/profile/27393275-sebastian-raschka-phd) by Sebastian Raschka: Overview of the RLHF process and alternatives like RLAIF. 209 | 210 | --- 211 | ### 6. Evaluation 212 | 213 | Evaluating LLMs is an undervalued part of the pipeline, which is time-consuming and moderately reliable. Your downstream task should dictate what you want to evaluate, but always remember Goodhart's law: "when a measure becomes a target, it ceases to be a good measure." 214 | 215 | * **Traditional metrics**: Metrics like perplexity and BLEU score are not as popular as they once were because they're flawed in most contexts. It is still important to understand them and when they can be applied.
216 | * **General benchmarks**: Based on the [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) is the main benchmark for general-purpose LLMs (like ChatGPT). There are other popular benchmarks like [BigBench](https://github.com/google/BIG-bench), [MT-Bench](https://arxiv.org/abs/2306.05685), etc. 217 | * **Task-specific benchmarks**: Tasks like summarization, translation, question answering have dedicated benchmarks, metrics, and even subdomains (medical, financial, etc.), such as [PubMedQA](https://pubmedqa.github.io/) for biomedical question answering. 218 | * **Human evaluation**: The most reliable evaluation is the acceptance rate by users or comparisons made by humans. If you want to know if a model performs well, the simplest but surest way is to use it yourself. 219 | 220 | 📚 **References**: 221 | * [Perplexity of fixed-length models](https://huggingface.co/docs/transformers/perplexity) by Hugging Face: Overview of perplexity with code to implement it with the transformers library. 222 | * [BLEU at your own risk](https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213) by Rachael Tatman: Overview of the BLEU score and its many issues with examples. 223 | * [A Survey on Evaluation of LLMs](https://arxiv.org/abs/2307.03109) by Chang et al.: Comprehensive paper about what to evaluate, where to evaluate, and how to evaluate. 224 | * [Chatbot Arena Leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) by lmsys: Elo rating of general-purpose LLMs, based on comparisons made by humans. 225 | 226 | --- 227 | ### 7. Quantization 228 | 229 | Quantization is the process of converting the weights (and activations) of a model using a lower precision. For example, weights stored using 16 bits can be converted into a 4-bit representation. 
This technique has become increasingly important to reduce the computational and memory costs associated with LLMs. 230 | 231 | * **Base techniques**: Learn the different levels of precision (FP32, FP16, INT8, etc.) and how to perform naïve quantization with absmax and zero-point techniques. 232 | * **GGUF and llama.cpp**: Originally designed to run on CPUs, [llama.cpp](https://github.com/ggerganov/llama.cpp) and the GGUF format have become the most popular tools to run LLMs on consumer-grade hardware. 233 | * **GPTQ and EXL2**: [GPTQ](https://arxiv.org/abs/2210.17323) and, more specifically, the [EXL2](https://github.com/turboderp/exllamav2) format offer an incredible speed but can only run on GPUs. Models also take a long time to be quantized. 234 | * **AWQ**: This new format is more accurate than GPTQ (lower perplexity) but uses a lot more VRAM and is not necessarily faster. 235 | 236 | 📚 **References**: 237 | * [Introduction to quantization](https://mlabonne.github.io/blog/posts/Introduction_to_Weight_Quantization.html): Overview of quantization, absmax and zero-point quantization, and LLM.int8() with code. 238 | * [Quantize Llama models with llama.cpp](https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html): Tutorial on how to quantize a Llama 2 model using llama.cpp and the GGUF format. 239 | * [4-bit LLM Quantization with GPTQ](https://mlabonne.github.io/blog/4bit_quantization/): Tutorial on how to quantize an LLM using the GPTQ algorithm with AutoGPTQ. 240 | * [ExLlamaV2: The Fastest Library to Run LLMs](https://mlabonne.github.io/blog/posts/ExLlamaV2_The_Fastest_Library_to_Run%C2%A0LLMs.html): Guide on how to quantize a Mistral model using the EXL2 format and run it with the ExLlamaV2 library.
241 | * [Understanding Activation-Aware Weight Quantization](https://medium.com/friendliai/understanding-activation-aware-weight-quantization-awq-boosting-inference-serving-efficiency-in-10bb0faf63a8) by FriendliAI: Overview of the AWQ technique and its benefits. 242 | 243 | --- 244 | ### 8. Inference optimization 245 | 246 | * **Flash Attention**: Optimization of the attention mechanism that reduces its memory complexity from quadratic to linear in the sequence length, speeding up both training and inference. 247 | * **Key-value cache**: Understand the key-value cache and the improvements introduced in [Multi-Query Attention](https://arxiv.org/abs/1911.02150) (MQA) and [Grouped-Query Attention](https://arxiv.org/abs/2305.13245) (GQA). 248 | * **Speculative decoding**: Use a small model to produce drafts that are then reviewed by a larger model to speed up text generation. 249 | * **Positional encoding**: Understand positional encodings in transformers, particularly relative schemes like [RoPE](https://arxiv.org/abs/2104.09864), [ALiBi](https://arxiv.org/abs/2108.12409), and [YaRN](https://arxiv.org/abs/2309.00071). (Not directly connected to inference optimization but to longer context windows.) 250 | 251 | 📚 **References**: 252 | * [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one) by Hugging Face: Explain how to optimize inference on GPUs. 253 | * [Optimizing LLMs for Speed and Memory](https://huggingface.co/docs/transformers/main/en/llm_tutorial_optimization) by Hugging Face: Explain three main techniques to optimize speed and memory, namely quantization, Flash Attention, and architectural innovations. 254 | * [Assisted Generation](https://huggingface.co/blog/assisted-generation) by Hugging Face: HF's version of speculative decoding, it's an interesting blog post about how it works with code to implement it. 255 | * [Extending the RoPE](https://blog.eleuther.ai/yarn/) by EleutherAI: Article that summarizes the different position-encoding techniques.
256 | * [Extending Context is Hard... but not Impossible](https://kaiokendev.github.io/context) by kaiokendev: This blog post introduces the SuperHOT technique and provides an excellent survey of related work. 257 | 258 | ## 👷 The LLM Engineer 259 | 260 | W.I.P. 261 | 262 | --- 263 | ### Contributions 264 | 265 | Feel free to contact me if you think other topics should be mentioned or if the current architecture can be improved. 266 | 267 | ### Acknowledgements 268 | 269 | This roadmap was inspired by the excellent [DevOps Roadmap](https://github.com/milanm/DevOps-Roadmap) from Milan Milanović and Romano Roth. 270 | 271 | Special thanks to Thomas Thelen for motivating me to create a roadmap, and André Frade for his input and review of the first draft. 272 | 273 | *Disclaimer: I am not affiliated with any sources listed here.* 274 | 275 | ## Star History 276 | 277 | [![Star History Chart](https://api.star-history.com/svg?repos=mlabonne/llm-course&type=Date)](https://star-history.com/#mlabonne/llm-course&Date) 278 | -------------------------------------------------------------------------------- /Visualizing_GPT_2's_Loss_Landscape.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "source": [ 16 | "# Visualizing GPT-2's Loss Landscape\n", 17 | "\n", 18 | "❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).\n", 19 | "\n", 20 | "Simple perturbation-based calculation of the negative log-likelihood loss in two directions, given \"I have a dream\" as input.\n", 21 | "\n", 22 | "Reference: [Visualizing the Loss Landscape of Neural Nets](https://arxiv.org/abs/1712.09913), by Li et al. 
(2018)" 23 | ], 24 | "metadata": { 25 | "id": "Dgptqrg0zEY5" 26 | } 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": { 32 | "id": "lIYdn1woOS1n", 33 | "colab": { 34 | "base_uri": "https://localhost:8080/" 35 | }, 36 | "outputId": "ffd96eb4-2861-4658-e89d-383eaeb36c3b" 37 | }, 38 | "outputs": [ 39 | { 40 | "output_type": "stream", 41 | "name": "stdout", 42 | "text": [ 43 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.2/7.2 MB\u001b[0m \u001b[31m47.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 44 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.8/236.8 kB\u001b[0m \u001b[31m14.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 45 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m75.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 46 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m67.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 47 | "\u001b[?25h" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "!pip install -q transformers" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "source": [ 58 | "%%time\n", 59 | "\n", 60 | "import torch\n", 61 | "from transformers import GPT2LMHeadModel, GPT2Tokenizer\n", 62 | "import numpy as np\n", 63 | "import plotly.graph_objects as go\n", 64 | "from tqdm import tqdm\n", 65 | "import imageio\n", 66 | "import os\n", 67 | "\n", 68 | "# Load pre-trained model\n", 69 | "model_name = 'gpt2'\n", 70 | "model = GPT2LMHeadModel.from_pretrained(model_name)\n", 71 | "tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n", 72 | "\n", 73 | "# Set model to evaluation mode\n", 74 | "model.eval()\n", 75 | "\n", 76 | "# Define our input\n", 77 | "input_text = \"I have a dream\"\n", 78 | "inputs = tokenizer.encode_plus(input_text, return_tensors=\"pt\")\n", 79 | "\n", 80 | "# Compute the original loss\n", 
81 | "outputs = model(**inputs, labels=inputs[\"input_ids\"])\n", 82 | "original_loss = outputs.loss.item()\n", 83 | "\n", 84 | "# Define two random directions\n", 85 | "direction1 = [torch.randn_like(p) for p in model.parameters()]\n", 86 | "direction2 = [torch.randn_like(p) for p in model.parameters()]\n", 87 | "\n", 88 | "# Normalize vectors\n", 89 | "for p, d1, d2 in zip(model.parameters(), direction1, direction2):\n", 90 | " norm_p = torch.linalg.norm(p.flatten())\n", 91 | " d1.div_(torch.linalg.norm(d1.flatten())).mul_(norm_p)\n", 92 | " d2.div_(torch.linalg.norm(d2.flatten())).mul_(norm_p)\n", 93 | "\n", 94 | "# Define the range to explore\n", 95 | "x = np.linspace(-1, 1, 20)\n", 96 | "y = np.linspace(-1, 1, 20)\n", 97 | "X, Y = np.meshgrid(x, y)\n", 98 | "\n", 99 | "# Prepare to collect the losses\n", 100 | "Z = np.zeros_like(X)\n", 101 | "\n", 102 | "# Compute loss for each direction\n", 103 | "for i in tqdm(range(x.size), desc=\"x progress\"):\n", 104 | " for j in tqdm(range(y.size), desc=\"y progress\", leave=False):\n", 105 | " # Perturb the model parameters\n", 106 | " for p, d1, d2 in zip(model.parameters(), direction1, direction2):\n", 107 | " p.data.add_(x[i]*d1 + y[j]*d2)\n", 108 | " \n", 109 | " # Compute the loss\n", 110 | " outputs = model(**inputs, labels=inputs['input_ids'])\n", 111 | " Z[i, j] = outputs.loss.item()\n", 112 | " \n", 113 | " # Revert the model parameters\n", 114 | " for p, d1, d2 in zip(model.parameters(), direction1, direction2):\n", 115 | " p.data.sub_(x[i]*d1 + y[j]*d2)\n" 116 | ], 117 | "metadata": { 118 | "colab": { 119 | "base_uri": "https://localhost:8080/" 120 | }, 121 | "id": "4Wud37sZa1Y7", 122 | "outputId": "c4e9c839-2938-4df8-c54f-96f5792cc0ed" 123 | }, 124 | "execution_count": 11, 125 | "outputs": [ 126 | { 127 | "output_type": "stream", 128 | "name": "stderr", 129 | "text": [ 130 | "100%|██████████| 20/20 [12:42<00:00, 38.11s/it]\n" 131 | ] 132 | } 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | 
# Create 3D plot of the loss surface.
# Expects `go` (plotly.graph_objects) and the arrays X, Y, Z to exist already;
# presumably they come from the preceding compute cell -- run that cell first.
surface = go.Surface(
    z=Z,
    x=X,
    y=Y,
    showscale=False,  # hide the colour bar; the z-axis already conveys the value
)
fig = go.Figure(data=[surface])
fig.update_layout(
    title="GPT-2's Loss Landscape",
    autosize=True,
    width=1000,
    height=600,
    # Label the axes so the figure can be read on its own. The horizontal axes are
    # the grid coordinates passed as X and Y; the vertical axis is the value in Z.
    scene=dict(
        xaxis_title="Perturbation scale (X)",
        yaxis_title="Perturbation scale (Y)",
        zaxis_title="Loss",
    ),
)
fig.show()
\n", 172 | "
\n", 197 | "\n", 198 | "" 199 | ] 200 | }, 201 | "metadata": {} 202 | } 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "source": [], 208 | "metadata": { 209 | "id": "5Gvy1Bkut13M" 210 | }, 211 | "execution_count": null, 212 | "outputs": [] 213 | } 214 | ], 215 | "metadata": { 216 | "colab": { 217 | "provenance": [], 218 | "gpuType": "T4", 219 | "include_colab_link": true 220 | }, 221 | "kernelspec": { 222 | "display_name": "Python 3", 223 | "name": "python3" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 0 228 | } -------------------------------------------------------------------------------- /images/colab.svg: -------------------------------------------------------------------------------- 1 | Open in ColabOpen in Colab 2 | -------------------------------------------------------------------------------- /images/roadmap_fundamentals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/llm-course/85b8056dd58b94e0eb7065425eaefc6f1b625404/images/roadmap_fundamentals.png -------------------------------------------------------------------------------- /images/roadmap_scientist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/llm-course/85b8056dd58b94e0eb7065425eaefc6f1b625404/images/roadmap_scientist.png --------------------------------------------------------------------------------