├── Finetune_AnyLLM.ipynb
├── Finetune_Mixtral_lora.ipynb
├── LICENSE.txt
├── README.md
└── inference.ipynb
/Finetune_Mixtral_lora.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# License\n",
8 | "This notebook is licensed under the MIT License - see the LICENSE file in [this repository](https://github.com/PrakharSaxena24/RepoForLLMs/) for details.\n"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {
14 | "id": "-MlkIQ0pLSrY"
15 | },
16 | "source": [
17 | "# Fine-tune Mixtral 8x7B with LoRA\n",
18 | "This notebook was run on a single A100 (40GB) GPU.\n"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "id": "S1CJwtl7J2Eg"
26 | },
27 | "outputs": [],
28 | "source": [
29 | "# Use the %pip magic (not !pip) so the packages are installed into the environment of the running kernel.\n%pip install -q -U bitsandbytes transformers peft accelerate datasets scipy\n"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {
36 | "id": "NBPD8rQUkmTg"
37 | },
38 | "outputs": [],
39 | "source": [
40 | "import torch\n",
41 | "import transformers\n",
42 | "from datasets import load_dataset\n",
43 | "from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments\n",
44 | "from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {
51 | "colab": {
52 | "base_uri": "https://localhost:8080/",
53 | "height": 49,
54 | "referenced_widgets": [
55 | "442bd77df1b4493794f8a2e49fecd5d1",
56 | "ce646a2be8d84eefb8910a2b62d0e831",
57 | "9283af3600a24f5fbb630ecf529fe045",
58 | "046c8d8c2822419eafe2ecc5e015d7b2",
59 | "7fb8fec91f84454bbcd5dcba9001d731",
60 | "87ecefbbf8be4e5fbf1719aed14af9ad",
61 | "a54b7839a9454671a9f3d275fb338e6c",
62 | "a275e5f970094dfa9408445452b12b3e",
63 | "a30e1efe178743108b95fb5bfe32b0f0",
64 | "87e2d5e8da95478b92a7a725c9b100a6",
65 | "326790700c434c6098c386101cc9e6c4"
66 | ]
67 | },
68 | "id": "E8JNln8MknxJ",
69 | "outputId": "ea33aedd-e100-4e21-fff8-40d8b22dcdf9"
70 | },
71 | "outputs": [
72 | {
73 | "data": {
74 | "application/vnd.jupyter.widget-view+json": {
75 | "model_id": "442bd77df1b4493794f8a2e49fecd5d1",
76 | "version_major": 2,
77 | "version_minor": 0
78 | },
79 | "text/plain": [
80 | "Loading checkpoint shards: 0%| | 0/19 [00:00, ?it/s]"
81 | ]
82 | },
83 | "metadata": {},
84 | "output_type": "display_data"
85 | }
86 | ],
87 | "source": [
88 | "tokenizer = AutoTokenizer.from_pretrained(\"mistralai/Mixtral-8x7B-Instruct-v0.1\") #Tokenizer matching the base checkpoint loaded below\n",
89 | "model = AutoModelForCausalLM.from_pretrained(\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
90 | "    quantization_config=transformers.BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16), #4-bit weights; explicit fp16 compute dtype because the fp32 default is slow with fp16 inputs\n",
91 | "    torch_dtype=torch.float16,\n",
92 | "    device_map=\"auto\",\n",
93 | "    # attn_implementation=\"flash_attention_2\", #You can use flash attention on your local GPU with specific libraries\n",
94 | "    )"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {
101 | "id": "IeRJMgM3ko6B"
102 | },
103 | "outputs": [],
104 | "source": [
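105 | "tokenizer.pad_token = \"!\" #Not EOS: the LM data collator masks every pad-token position out of the loss, so padding with EOS would also mask the real end-of-sequence token (literal \"!\" tokens in the text are masked the same way)."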
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {
112 | "id": "u5AHv0KfnQs2"
113 | },
114 | "outputs": [],
115 | "source": [
116 | "CUTOFF_LEN = 256 #Our dataset has short text\n",
117 | "LORA_R = 8\n",
118 | "LORA_ALPHA = 2 * LORA_R\n",
119 | "LORA_DROPOUT = 0.1"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {
126 | "id": "rcX9e6_8mojQ"
127 | },
128 | "outputs": [],
129 | "source": [
130 | "config = LoraConfig(\n",
131 | " r=LORA_R,\n",
132 | " lora_alpha=LORA_ALPHA,\n",
133 | " target_modules=[ \"w1\", \"w2\", \"w3\"], #Only Training the \"expert\" layers\n",
134 | " lora_dropout=LORA_DROPOUT,\n",
135 | " bias=\"none\",\n",
136 | " task_type=\"CAUSAL_LM\"\n",
137 | ")\n",
138 | "\n",
139 | "model = get_peft_model(model, config)"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {
146 | "colab": {
147 | "base_uri": "https://localhost:8080/"
148 | },
149 | "id": "fKRjCSQPod8Q",
150 | "outputId": "7ea3d8b0-ccab-49e6-ed6e-cc47d09af527"
151 | },
152 | "outputs": [
153 | {
154 | "name": "stdout",
155 | "output_type": "stream",
156 | "text": [
157 | "trainable params: 113246208 || all params: 23595847680 || trainable%: 0.4799412571898752\n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 | "def print_trainable_parameters(m):\n",
163 | "    \"\"\"Print the number of parameters of `m` that require gradients, the total parameter count, and the trainable percentage.\"\"\"\n",
164 | "    n_trainable, n_total = sum(w.numel() for w in m.parameters() if w.requires_grad), sum(w.numel() for w in m.parameters())\n",
165 | "    print(f\"trainable params: {n_trainable} || all params: {n_total} || trainable%: {100 * n_trainable / n_total}\")\n",
166 | "\n",
167 | "print_trainable_parameters(model)"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {
174 | "colab": {
175 | "base_uri": "https://localhost:8080/"
176 | },
177 | "id": "8Zagq7JrohqK",
178 | "outputId": "00ad6477-0e89-4027-8bc4-62846236cacc"
179 | },
180 | "outputs": [
181 | {
182 | "name": "stdout",
183 | "output_type": "stream",
184 | "text": [
185 | "dataset DatasetDict({\n",
186 | " train: Dataset({\n",
187 | " features: ['modern', 'shakespearean'],\n",
188 | " num_rows: 274\n",
189 | " })\n",
190 | "})\n"
191 | ]
192 | }
193 | ],
194 | "source": [
195 | "dataset = load_dataset(\"harpreetsahota/modern-to-shakesperean-translation\") #Found a good small dataset for a quick test run!\n",
196 | "print(\"dataset\", dataset)\n",
197 | "train_data = dataset[\"train\"] # Not using evaluation data"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "metadata": {
204 | "id": "qjQhqAn4o_Of"
205 | },
206 | "outputs": [],
207 | "source": [
208 | "def generate_prompt(user_query, sep=\"\\n\\n### \"):\n",
209 | "    \"\"\"Return the training text for one example: ' [INST]' + instruction + newline + modern text + '[/INST]' + Shakespearean text. Per the author's note the format follows the official Mixtral Hugging Face page; `sep` is accepted but unused.\"\"\"\n",
210 | "    instruction = \"Translate the given text to Shakespearean style.\"\n",
211 | "    return \"\".join([\" [INST]\", instruction, \"\\n\", user_query[\"modern\"], \"[/INST]\", user_query[\"shakespearean\"]])"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {
218 | "id": "z5BN_OW-qTyr"
219 | },
220 | "outputs": [],
221 | "source": [
222 | "def tokenize(prompt):\n",
223 | "    \"\"\"Encode `prompt` followed by the tokenizer's EOS token, truncating and padding the result to CUTOFF_LEN tokens.\"\"\"\n",
224 | "    text_with_eos = prompt + tokenizer.eos_token\n",
225 | "    encode_options = dict(\n",
226 | "        truncation=True, max_length=CUTOFF_LEN, padding=\"max_length\"\n",
227 | "    )\n",
228 | "    return tokenizer(text_with_eos, **encode_options)"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {
235 | "colab": {
236 | "base_uri": "https://localhost:8080/",
237 | "height": 49,
238 | "referenced_widgets": [
239 | "82023d1657324e7dbd604a02ed9a8565",
240 | "c2465ae3c2df4af1b80def5cd6e3daa3",
241 | "a00e1e7c8dc74e928a42ec4fcd7c110f",
242 | "49024a3b2fc746d7b89ec2297077badb",
243 | "0bdfbbbf29c7478298e7006189e40d3c",
244 | "a0c07ee1d65d4d70b983fb33ae78dd36",
245 | "56145d6b220e4228ba37fe38b1d1a08a",
246 | "f8bb1f285c0f4fdda057fb64a3379262",
247 | "b1610a3654634afabcaa455e12911c9c",
248 | "1733e70e3664473985e530372f856ca7",
249 | "e7c559612c0f4ffcb7467e01e4d33b2d"
250 | ]
251 | },
252 | "id": "ov8PNhNPqtlq",
253 | "outputId": "de260c66-4a80-4d11-937f-bf53590ca678"
254 | },
255 | "outputs": [
256 | {
257 | "data": {
258 | "application/vnd.jupyter.widget-view+json": {
259 | "model_id": "82023d1657324e7dbd604a02ed9a8565",
260 | "version_major": 2,
261 | "version_minor": 0
262 | },
263 | "text/plain": [
264 | "Map: 0%| | 0/274 [00:00, ? examples/s]"
265 | ]
266 | },
267 | "metadata": {},
268 | "output_type": "display_data"
269 | }
270 | ],
271 | "source": [
272 | "train_data = train_data.shuffle(seed=42).map(lambda x: tokenize(generate_prompt(x)), remove_columns=[\"modern\" , \"shakespearean\"]) #Fixed shuffle seed so the example order is reproducible across runs; raw text columns are dropped after tokenization"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": null,
278 | "metadata": {
279 | "id": "2TeVtAVbq04C"
280 | },
281 | "outputs": [],
282 | "source": [
283 | "trainer = Trainer(\n",
284 | " model=model,\n",
285 | " train_dataset=train_data,\n",
286 | " args=TrainingArguments(\n",
287 | " per_device_train_batch_size=1,\n",
288 | " gradient_accumulation_steps=4,\n",
289 | " num_train_epochs=6,\n",
290 | " learning_rate=1e-4,\n",
291 | " logging_steps=2,\n",
292 | " optim=\"adamw_torch\",\n",
293 | " save_strategy=\"epoch\",\n",
294 | " output_dir=\"mixtral-moe-lora-instruct-shapeskeare\"\n",
295 | " ),\n",
296 | " data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)\n",
297 | ")\n",
298 | "model.config.use_cache = False\n"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {
305 | "colab": {
306 | "base_uri": "https://localhost:8080/",
307 | "height": 964
308 | },
309 | "id": "L683Bua0rCjN",
310 | "outputId": "9d7d7879-2270-4a3b-ccbd-df3baa5bc7c3"
311 | },
312 | "outputs": [
313 | {
314 | "name": "stderr",
315 | "output_type": "stream",
316 | "text": [
317 | "You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
318 | "/usr/local/lib/python3.10/dist-packages/bitsandbytes/nn/modules.py:226: UserWarning: Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default). This will lead to slow inference or training speed.\n",
319 | " warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default). This will lead to slow inference or training speed.')\n"
320 | ]
321 | },
322 | {
323 | "data": {
324 | "text/html": [
325 | "\n",
326 | "
\n",
327 | " \n",
328 | "
\n",
329 | " [ 55/408 07:57 < 53:00, 0.11 it/s, Epoch 0.79/6]\n",
330 | "
\n",
331 | " \n",
332 | " \n",
333 | " \n",
334 | " Step | \n",
335 | " Training Loss | \n",
336 | "
\n",
337 | " \n",
338 | " \n",
339 | " \n",
340 | " 2 | \n",
341 | " 6.575100 | \n",
342 | "
\n",
343 | " \n",
344 | " 4 | \n",
345 | " 5.346300 | \n",
346 | "
\n",
347 | " \n",
348 | " 6 | \n",
349 | " 3.909500 | \n",
350 | "
\n",
351 | " \n",
352 | " 8 | \n",
353 | " 3.360000 | \n",
354 | "
\n",
355 | " \n",
356 | " 10 | \n",
357 | " 2.603800 | \n",
358 | "
\n",
359 | " \n",
360 | " 12 | \n",
361 | " 2.199500 | \n",
362 | "
\n",
363 | " \n",
364 | " 14 | \n",
365 | " 2.069200 | \n",
366 | "
\n",
367 | " \n",
368 | " 16 | \n",
369 | " 1.869600 | \n",
370 | "
\n",
371 | " \n",
372 | " 18 | \n",
373 | " 1.914100 | \n",
374 | "
\n",
375 | " \n",
376 | " 20 | \n",
377 | " 1.813700 | \n",
378 | "
\n",
379 | " \n",
380 | " 22 | \n",
381 | " 1.680800 | \n",
382 | "
\n",
383 | " \n",
384 | " 24 | \n",
385 | " 1.706800 | \n",
386 | "
\n",
387 | " \n",
388 | " 26 | \n",
389 | " 1.474800 | \n",
390 | "
\n",
391 | " \n",
392 | " 28 | \n",
393 | " 1.683900 | \n",
394 | "
\n",
395 | " \n",
396 | " 30 | \n",
397 | " 1.678800 | \n",
398 | "
\n",
399 | " \n",
400 | " 32 | \n",
401 | " 1.478400 | \n",
402 | "
\n",
403 | " \n",
404 | " 34 | \n",
405 | " 1.557600 | \n",
406 | "
\n",
407 | " \n",
408 | " 36 | \n",
409 | " 1.424500 | \n",
410 | "
\n",
411 | " \n",
412 | " 38 | \n",
413 | " 1.405000 | \n",
414 | "
\n",
415 | " \n",
416 | " 40 | \n",
417 | " 1.498200 | \n",
418 | "
\n",
419 | " \n",
420 | " 42 | \n",
421 | " 1.407700 | \n",
422 | "
\n",
423 | " \n",
424 | " 44 | \n",
425 | " 1.307800 | \n",
426 | "
\n",
427 | " \n",
428 | " 46 | \n",
429 | " 1.285800 | \n",
430 | "
\n",
431 | " \n",
432 | " 48 | \n",
433 | " 1.359700 | \n",
434 | "
\n",
435 | " \n",
436 | " 50 | \n",
437 | " 1.449500 | \n",
438 | "
\n",
439 | " \n",
440 | " 52 | \n",
441 | " 1.409000 | \n",
442 | "
\n",
443 | " \n",
444 | "
"
445 | ],
446 | "text/plain": [
447 | ""
448 | ]
449 | },
450 | "metadata": {},
451 | "output_type": "display_data"
452 | }
453 | ],
454 | "source": [
455 | "trainer.train()"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": null,
461 | "metadata": {
462 | "id": "agczdazarIdk"
463 | },
464 | "outputs": [],
465 | "source": []
466 | }
467 | ],
468 | "metadata": {
469 | "accelerator": "GPU",
470 | "colab": {
471 | "gpuType": "A100",
472 | "machine_shape": "hm",
473 | "provenance": []
474 | },
475 | "kernelspec": {
476 | "display_name": "Python 3",
477 | "name": "python3"
478 | },
479 | "language_info": {
480 | "name": "python"
481 | },
482 | "widgets": {
483 | "application/vnd.jupyter.widget-state+json": {
484 | "046c8d8c2822419eafe2ecc5e015d7b2": {
485 | "model_module": "@jupyter-widgets/controls",
486 | "model_module_version": "1.5.0",
487 | "model_name": "HTMLModel",
488 | "state": {
489 | "_dom_classes": [],
490 | "_model_module": "@jupyter-widgets/controls",
491 | "_model_module_version": "1.5.0",
492 | "_model_name": "HTMLModel",
493 | "_view_count": null,
494 | "_view_module": "@jupyter-widgets/controls",
495 | "_view_module_version": "1.5.0",
496 | "_view_name": "HTMLView",
497 | "description": "",
498 | "description_tooltip": null,
499 | "layout": "IPY_MODEL_87e2d5e8da95478b92a7a725c9b100a6",
500 | "placeholder": "",
501 | "style": "IPY_MODEL_326790700c434c6098c386101cc9e6c4",
502 | "value": " 19/19 [06:15<00:00, 18.74s/it]"
503 | }
504 | },
505 | "0bdfbbbf29c7478298e7006189e40d3c": {
506 | "model_module": "@jupyter-widgets/base",
507 | "model_module_version": "1.2.0",
508 | "model_name": "LayoutModel",
509 | "state": {
510 | "_model_module": "@jupyter-widgets/base",
511 | "_model_module_version": "1.2.0",
512 | "_model_name": "LayoutModel",
513 | "_view_count": null,
514 | "_view_module": "@jupyter-widgets/base",
515 | "_view_module_version": "1.2.0",
516 | "_view_name": "LayoutView",
517 | "align_content": null,
518 | "align_items": null,
519 | "align_self": null,
520 | "border": null,
521 | "bottom": null,
522 | "display": null,
523 | "flex": null,
524 | "flex_flow": null,
525 | "grid_area": null,
526 | "grid_auto_columns": null,
527 | "grid_auto_flow": null,
528 | "grid_auto_rows": null,
529 | "grid_column": null,
530 | "grid_gap": null,
531 | "grid_row": null,
532 | "grid_template_areas": null,
533 | "grid_template_columns": null,
534 | "grid_template_rows": null,
535 | "height": null,
536 | "justify_content": null,
537 | "justify_items": null,
538 | "left": null,
539 | "margin": null,
540 | "max_height": null,
541 | "max_width": null,
542 | "min_height": null,
543 | "min_width": null,
544 | "object_fit": null,
545 | "object_position": null,
546 | "order": null,
547 | "overflow": null,
548 | "overflow_x": null,
549 | "overflow_y": null,
550 | "padding": null,
551 | "right": null,
552 | "top": null,
553 | "visibility": null,
554 | "width": null
555 | }
556 | },
557 | "1733e70e3664473985e530372f856ca7": {
558 | "model_module": "@jupyter-widgets/base",
559 | "model_module_version": "1.2.0",
560 | "model_name": "LayoutModel",
561 | "state": {
562 | "_model_module": "@jupyter-widgets/base",
563 | "_model_module_version": "1.2.0",
564 | "_model_name": "LayoutModel",
565 | "_view_count": null,
566 | "_view_module": "@jupyter-widgets/base",
567 | "_view_module_version": "1.2.0",
568 | "_view_name": "LayoutView",
569 | "align_content": null,
570 | "align_items": null,
571 | "align_self": null,
572 | "border": null,
573 | "bottom": null,
574 | "display": null,
575 | "flex": null,
576 | "flex_flow": null,
577 | "grid_area": null,
578 | "grid_auto_columns": null,
579 | "grid_auto_flow": null,
580 | "grid_auto_rows": null,
581 | "grid_column": null,
582 | "grid_gap": null,
583 | "grid_row": null,
584 | "grid_template_areas": null,
585 | "grid_template_columns": null,
586 | "grid_template_rows": null,
587 | "height": null,
588 | "justify_content": null,
589 | "justify_items": null,
590 | "left": null,
591 | "margin": null,
592 | "max_height": null,
593 | "max_width": null,
594 | "min_height": null,
595 | "min_width": null,
596 | "object_fit": null,
597 | "object_position": null,
598 | "order": null,
599 | "overflow": null,
600 | "overflow_x": null,
601 | "overflow_y": null,
602 | "padding": null,
603 | "right": null,
604 | "top": null,
605 | "visibility": null,
606 | "width": null
607 | }
608 | },
609 | "326790700c434c6098c386101cc9e6c4": {
610 | "model_module": "@jupyter-widgets/controls",
611 | "model_module_version": "1.5.0",
612 | "model_name": "DescriptionStyleModel",
613 | "state": {
614 | "_model_module": "@jupyter-widgets/controls",
615 | "_model_module_version": "1.5.0",
616 | "_model_name": "DescriptionStyleModel",
617 | "_view_count": null,
618 | "_view_module": "@jupyter-widgets/base",
619 | "_view_module_version": "1.2.0",
620 | "_view_name": "StyleView",
621 | "description_width": ""
622 | }
623 | },
624 | "442bd77df1b4493794f8a2e49fecd5d1": {
625 | "model_module": "@jupyter-widgets/controls",
626 | "model_module_version": "1.5.0",
627 | "model_name": "HBoxModel",
628 | "state": {
629 | "_dom_classes": [],
630 | "_model_module": "@jupyter-widgets/controls",
631 | "_model_module_version": "1.5.0",
632 | "_model_name": "HBoxModel",
633 | "_view_count": null,
634 | "_view_module": "@jupyter-widgets/controls",
635 | "_view_module_version": "1.5.0",
636 | "_view_name": "HBoxView",
637 | "box_style": "",
638 | "children": [
639 | "IPY_MODEL_ce646a2be8d84eefb8910a2b62d0e831",
640 | "IPY_MODEL_9283af3600a24f5fbb630ecf529fe045",
641 | "IPY_MODEL_046c8d8c2822419eafe2ecc5e015d7b2"
642 | ],
643 | "layout": "IPY_MODEL_7fb8fec91f84454bbcd5dcba9001d731"
644 | }
645 | },
646 | "49024a3b2fc746d7b89ec2297077badb": {
647 | "model_module": "@jupyter-widgets/controls",
648 | "model_module_version": "1.5.0",
649 | "model_name": "HTMLModel",
650 | "state": {
651 | "_dom_classes": [],
652 | "_model_module": "@jupyter-widgets/controls",
653 | "_model_module_version": "1.5.0",
654 | "_model_name": "HTMLModel",
655 | "_view_count": null,
656 | "_view_module": "@jupyter-widgets/controls",
657 | "_view_module_version": "1.5.0",
658 | "_view_name": "HTMLView",
659 | "description": "",
660 | "description_tooltip": null,
661 | "layout": "IPY_MODEL_1733e70e3664473985e530372f856ca7",
662 | "placeholder": "",
663 | "style": "IPY_MODEL_e7c559612c0f4ffcb7467e01e4d33b2d",
664 | "value": " 274/274 [00:00<00:00, 2056.92 examples/s]"
665 | }
666 | },
667 | "56145d6b220e4228ba37fe38b1d1a08a": {
668 | "model_module": "@jupyter-widgets/controls",
669 | "model_module_version": "1.5.0",
670 | "model_name": "DescriptionStyleModel",
671 | "state": {
672 | "_model_module": "@jupyter-widgets/controls",
673 | "_model_module_version": "1.5.0",
674 | "_model_name": "DescriptionStyleModel",
675 | "_view_count": null,
676 | "_view_module": "@jupyter-widgets/base",
677 | "_view_module_version": "1.2.0",
678 | "_view_name": "StyleView",
679 | "description_width": ""
680 | }
681 | },
682 | "7fb8fec91f84454bbcd5dcba9001d731": {
683 | "model_module": "@jupyter-widgets/base",
684 | "model_module_version": "1.2.0",
685 | "model_name": "LayoutModel",
686 | "state": {
687 | "_model_module": "@jupyter-widgets/base",
688 | "_model_module_version": "1.2.0",
689 | "_model_name": "LayoutModel",
690 | "_view_count": null,
691 | "_view_module": "@jupyter-widgets/base",
692 | "_view_module_version": "1.2.0",
693 | "_view_name": "LayoutView",
694 | "align_content": null,
695 | "align_items": null,
696 | "align_self": null,
697 | "border": null,
698 | "bottom": null,
699 | "display": null,
700 | "flex": null,
701 | "flex_flow": null,
702 | "grid_area": null,
703 | "grid_auto_columns": null,
704 | "grid_auto_flow": null,
705 | "grid_auto_rows": null,
706 | "grid_column": null,
707 | "grid_gap": null,
708 | "grid_row": null,
709 | "grid_template_areas": null,
710 | "grid_template_columns": null,
711 | "grid_template_rows": null,
712 | "height": null,
713 | "justify_content": null,
714 | "justify_items": null,
715 | "left": null,
716 | "margin": null,
717 | "max_height": null,
718 | "max_width": null,
719 | "min_height": null,
720 | "min_width": null,
721 | "object_fit": null,
722 | "object_position": null,
723 | "order": null,
724 | "overflow": null,
725 | "overflow_x": null,
726 | "overflow_y": null,
727 | "padding": null,
728 | "right": null,
729 | "top": null,
730 | "visibility": null,
731 | "width": null
732 | }
733 | },
734 | "82023d1657324e7dbd604a02ed9a8565": {
735 | "model_module": "@jupyter-widgets/controls",
736 | "model_module_version": "1.5.0",
737 | "model_name": "HBoxModel",
738 | "state": {
739 | "_dom_classes": [],
740 | "_model_module": "@jupyter-widgets/controls",
741 | "_model_module_version": "1.5.0",
742 | "_model_name": "HBoxModel",
743 | "_view_count": null,
744 | "_view_module": "@jupyter-widgets/controls",
745 | "_view_module_version": "1.5.0",
746 | "_view_name": "HBoxView",
747 | "box_style": "",
748 | "children": [
749 | "IPY_MODEL_c2465ae3c2df4af1b80def5cd6e3daa3",
750 | "IPY_MODEL_a00e1e7c8dc74e928a42ec4fcd7c110f",
751 | "IPY_MODEL_49024a3b2fc746d7b89ec2297077badb"
752 | ],
753 | "layout": "IPY_MODEL_0bdfbbbf29c7478298e7006189e40d3c"
754 | }
755 | },
756 | "87e2d5e8da95478b92a7a725c9b100a6": {
757 | "model_module": "@jupyter-widgets/base",
758 | "model_module_version": "1.2.0",
759 | "model_name": "LayoutModel",
760 | "state": {
761 | "_model_module": "@jupyter-widgets/base",
762 | "_model_module_version": "1.2.0",
763 | "_model_name": "LayoutModel",
764 | "_view_count": null,
765 | "_view_module": "@jupyter-widgets/base",
766 | "_view_module_version": "1.2.0",
767 | "_view_name": "LayoutView",
768 | "align_content": null,
769 | "align_items": null,
770 | "align_self": null,
771 | "border": null,
772 | "bottom": null,
773 | "display": null,
774 | "flex": null,
775 | "flex_flow": null,
776 | "grid_area": null,
777 | "grid_auto_columns": null,
778 | "grid_auto_flow": null,
779 | "grid_auto_rows": null,
780 | "grid_column": null,
781 | "grid_gap": null,
782 | "grid_row": null,
783 | "grid_template_areas": null,
784 | "grid_template_columns": null,
785 | "grid_template_rows": null,
786 | "height": null,
787 | "justify_content": null,
788 | "justify_items": null,
789 | "left": null,
790 | "margin": null,
791 | "max_height": null,
792 | "max_width": null,
793 | "min_height": null,
794 | "min_width": null,
795 | "object_fit": null,
796 | "object_position": null,
797 | "order": null,
798 | "overflow": null,
799 | "overflow_x": null,
800 | "overflow_y": null,
801 | "padding": null,
802 | "right": null,
803 | "top": null,
804 | "visibility": null,
805 | "width": null
806 | }
807 | },
808 | "87ecefbbf8be4e5fbf1719aed14af9ad": {
809 | "model_module": "@jupyter-widgets/base",
810 | "model_module_version": "1.2.0",
811 | "model_name": "LayoutModel",
812 | "state": {
813 | "_model_module": "@jupyter-widgets/base",
814 | "_model_module_version": "1.2.0",
815 | "_model_name": "LayoutModel",
816 | "_view_count": null,
817 | "_view_module": "@jupyter-widgets/base",
818 | "_view_module_version": "1.2.0",
819 | "_view_name": "LayoutView",
820 | "align_content": null,
821 | "align_items": null,
822 | "align_self": null,
823 | "border": null,
824 | "bottom": null,
825 | "display": null,
826 | "flex": null,
827 | "flex_flow": null,
828 | "grid_area": null,
829 | "grid_auto_columns": null,
830 | "grid_auto_flow": null,
831 | "grid_auto_rows": null,
832 | "grid_column": null,
833 | "grid_gap": null,
834 | "grid_row": null,
835 | "grid_template_areas": null,
836 | "grid_template_columns": null,
837 | "grid_template_rows": null,
838 | "height": null,
839 | "justify_content": null,
840 | "justify_items": null,
841 | "left": null,
842 | "margin": null,
843 | "max_height": null,
844 | "max_width": null,
845 | "min_height": null,
846 | "min_width": null,
847 | "object_fit": null,
848 | "object_position": null,
849 | "order": null,
850 | "overflow": null,
851 | "overflow_x": null,
852 | "overflow_y": null,
853 | "padding": null,
854 | "right": null,
855 | "top": null,
856 | "visibility": null,
857 | "width": null
858 | }
859 | },
860 | "9283af3600a24f5fbb630ecf529fe045": {
861 | "model_module": "@jupyter-widgets/controls",
862 | "model_module_version": "1.5.0",
863 | "model_name": "FloatProgressModel",
864 | "state": {
865 | "_dom_classes": [],
866 | "_model_module": "@jupyter-widgets/controls",
867 | "_model_module_version": "1.5.0",
868 | "_model_name": "FloatProgressModel",
869 | "_view_count": null,
870 | "_view_module": "@jupyter-widgets/controls",
871 | "_view_module_version": "1.5.0",
872 | "_view_name": "ProgressView",
873 | "bar_style": "success",
874 | "description": "",
875 | "description_tooltip": null,
876 | "layout": "IPY_MODEL_a275e5f970094dfa9408445452b12b3e",
877 | "max": 19,
878 | "min": 0,
879 | "orientation": "horizontal",
880 | "style": "IPY_MODEL_a30e1efe178743108b95fb5bfe32b0f0",
881 | "value": 19
882 | }
883 | },
884 | "a00e1e7c8dc74e928a42ec4fcd7c110f": {
885 | "model_module": "@jupyter-widgets/controls",
886 | "model_module_version": "1.5.0",
887 | "model_name": "FloatProgressModel",
888 | "state": {
889 | "_dom_classes": [],
890 | "_model_module": "@jupyter-widgets/controls",
891 | "_model_module_version": "1.5.0",
892 | "_model_name": "FloatProgressModel",
893 | "_view_count": null,
894 | "_view_module": "@jupyter-widgets/controls",
895 | "_view_module_version": "1.5.0",
896 | "_view_name": "ProgressView",
897 | "bar_style": "success",
898 | "description": "",
899 | "description_tooltip": null,
900 | "layout": "IPY_MODEL_f8bb1f285c0f4fdda057fb64a3379262",
901 | "max": 274,
902 | "min": 0,
903 | "orientation": "horizontal",
904 | "style": "IPY_MODEL_b1610a3654634afabcaa455e12911c9c",
905 | "value": 274
906 | }
907 | },
908 | "a0c07ee1d65d4d70b983fb33ae78dd36": {
909 | "model_module": "@jupyter-widgets/base",
910 | "model_module_version": "1.2.0",
911 | "model_name": "LayoutModel",
912 | "state": {
913 | "_model_module": "@jupyter-widgets/base",
914 | "_model_module_version": "1.2.0",
915 | "_model_name": "LayoutModel",
916 | "_view_count": null,
917 | "_view_module": "@jupyter-widgets/base",
918 | "_view_module_version": "1.2.0",
919 | "_view_name": "LayoutView",
920 | "align_content": null,
921 | "align_items": null,
922 | "align_self": null,
923 | "border": null,
924 | "bottom": null,
925 | "display": null,
926 | "flex": null,
927 | "flex_flow": null,
928 | "grid_area": null,
929 | "grid_auto_columns": null,
930 | "grid_auto_flow": null,
931 | "grid_auto_rows": null,
932 | "grid_column": null,
933 | "grid_gap": null,
934 | "grid_row": null,
935 | "grid_template_areas": null,
936 | "grid_template_columns": null,
937 | "grid_template_rows": null,
938 | "height": null,
939 | "justify_content": null,
940 | "justify_items": null,
941 | "left": null,
942 | "margin": null,
943 | "max_height": null,
944 | "max_width": null,
945 | "min_height": null,
946 | "min_width": null,
947 | "object_fit": null,
948 | "object_position": null,
949 | "order": null,
950 | "overflow": null,
951 | "overflow_x": null,
952 | "overflow_y": null,
953 | "padding": null,
954 | "right": null,
955 | "top": null,
956 | "visibility": null,
957 | "width": null
958 | }
959 | },
960 | "a275e5f970094dfa9408445452b12b3e": {
961 | "model_module": "@jupyter-widgets/base",
962 | "model_module_version": "1.2.0",
963 | "model_name": "LayoutModel",
964 | "state": {
965 | "_model_module": "@jupyter-widgets/base",
966 | "_model_module_version": "1.2.0",
967 | "_model_name": "LayoutModel",
968 | "_view_count": null,
969 | "_view_module": "@jupyter-widgets/base",
970 | "_view_module_version": "1.2.0",
971 | "_view_name": "LayoutView",
972 | "align_content": null,
973 | "align_items": null,
974 | "align_self": null,
975 | "border": null,
976 | "bottom": null,
977 | "display": null,
978 | "flex": null,
979 | "flex_flow": null,
980 | "grid_area": null,
981 | "grid_auto_columns": null,
982 | "grid_auto_flow": null,
983 | "grid_auto_rows": null,
984 | "grid_column": null,
985 | "grid_gap": null,
986 | "grid_row": null,
987 | "grid_template_areas": null,
988 | "grid_template_columns": null,
989 | "grid_template_rows": null,
990 | "height": null,
991 | "justify_content": null,
992 | "justify_items": null,
993 | "left": null,
994 | "margin": null,
995 | "max_height": null,
996 | "max_width": null,
997 | "min_height": null,
998 | "min_width": null,
999 | "object_fit": null,
1000 | "object_position": null,
1001 | "order": null,
1002 | "overflow": null,
1003 | "overflow_x": null,
1004 | "overflow_y": null,
1005 | "padding": null,
1006 | "right": null,
1007 | "top": null,
1008 | "visibility": null,
1009 | "width": null
1010 | }
1011 | },
1012 | "a30e1efe178743108b95fb5bfe32b0f0": {
1013 | "model_module": "@jupyter-widgets/controls",
1014 | "model_module_version": "1.5.0",
1015 | "model_name": "ProgressStyleModel",
1016 | "state": {
1017 | "_model_module": "@jupyter-widgets/controls",
1018 | "_model_module_version": "1.5.0",
1019 | "_model_name": "ProgressStyleModel",
1020 | "_view_count": null,
1021 | "_view_module": "@jupyter-widgets/base",
1022 | "_view_module_version": "1.2.0",
1023 | "_view_name": "StyleView",
1024 | "bar_color": null,
1025 | "description_width": ""
1026 | }
1027 | },
1028 | "a54b7839a9454671a9f3d275fb338e6c": {
1029 | "model_module": "@jupyter-widgets/controls",
1030 | "model_module_version": "1.5.0",
1031 | "model_name": "DescriptionStyleModel",
1032 | "state": {
1033 | "_model_module": "@jupyter-widgets/controls",
1034 | "_model_module_version": "1.5.0",
1035 | "_model_name": "DescriptionStyleModel",
1036 | "_view_count": null,
1037 | "_view_module": "@jupyter-widgets/base",
1038 | "_view_module_version": "1.2.0",
1039 | "_view_name": "StyleView",
1040 | "description_width": ""
1041 | }
1042 | },
1043 | "b1610a3654634afabcaa455e12911c9c": {
1044 | "model_module": "@jupyter-widgets/controls",
1045 | "model_module_version": "1.5.0",
1046 | "model_name": "ProgressStyleModel",
1047 | "state": {
1048 | "_model_module": "@jupyter-widgets/controls",
1049 | "_model_module_version": "1.5.0",
1050 | "_model_name": "ProgressStyleModel",
1051 | "_view_count": null,
1052 | "_view_module": "@jupyter-widgets/base",
1053 | "_view_module_version": "1.2.0",
1054 | "_view_name": "StyleView",
1055 | "bar_color": null,
1056 | "description_width": ""
1057 | }
1058 | },
1059 | "c2465ae3c2df4af1b80def5cd6e3daa3": {
1060 | "model_module": "@jupyter-widgets/controls",
1061 | "model_module_version": "1.5.0",
1062 | "model_name": "HTMLModel",
1063 | "state": {
1064 | "_dom_classes": [],
1065 | "_model_module": "@jupyter-widgets/controls",
1066 | "_model_module_version": "1.5.0",
1067 | "_model_name": "HTMLModel",
1068 | "_view_count": null,
1069 | "_view_module": "@jupyter-widgets/controls",
1070 | "_view_module_version": "1.5.0",
1071 | "_view_name": "HTMLView",
1072 | "description": "",
1073 | "description_tooltip": null,
1074 | "layout": "IPY_MODEL_a0c07ee1d65d4d70b983fb33ae78dd36",
1075 | "placeholder": "",
1076 | "style": "IPY_MODEL_56145d6b220e4228ba37fe38b1d1a08a",
1077 | "value": "Map: 100%"
1078 | }
1079 | },
1080 | "ce646a2be8d84eefb8910a2b62d0e831": {
1081 | "model_module": "@jupyter-widgets/controls",
1082 | "model_module_version": "1.5.0",
1083 | "model_name": "HTMLModel",
1084 | "state": {
1085 | "_dom_classes": [],
1086 | "_model_module": "@jupyter-widgets/controls",
1087 | "_model_module_version": "1.5.0",
1088 | "_model_name": "HTMLModel",
1089 | "_view_count": null,
1090 | "_view_module": "@jupyter-widgets/controls",
1091 | "_view_module_version": "1.5.0",
1092 | "_view_name": "HTMLView",
1093 | "description": "",
1094 | "description_tooltip": null,
1095 | "layout": "IPY_MODEL_87ecefbbf8be4e5fbf1719aed14af9ad",
1096 | "placeholder": "",
1097 | "style": "IPY_MODEL_a54b7839a9454671a9f3d275fb338e6c",
1098 | "value": "Loading checkpoint shards: 100%"
1099 | }
1100 | },
1101 | "e7c559612c0f4ffcb7467e01e4d33b2d": {
1102 | "model_module": "@jupyter-widgets/controls",
1103 | "model_module_version": "1.5.0",
1104 | "model_name": "DescriptionStyleModel",
1105 | "state": {
1106 | "_model_module": "@jupyter-widgets/controls",
1107 | "_model_module_version": "1.5.0",
1108 | "_model_name": "DescriptionStyleModel",
1109 | "_view_count": null,
1110 | "_view_module": "@jupyter-widgets/base",
1111 | "_view_module_version": "1.2.0",
1112 | "_view_name": "StyleView",
1113 | "description_width": ""
1114 | }
1115 | },
1116 | "f8bb1f285c0f4fdda057fb64a3379262": {
1117 | "model_module": "@jupyter-widgets/base",
1118 | "model_module_version": "1.2.0",
1119 | "model_name": "LayoutModel",
1120 | "state": {
1121 | "_model_module": "@jupyter-widgets/base",
1122 | "_model_module_version": "1.2.0",
1123 | "_model_name": "LayoutModel",
1124 | "_view_count": null,
1125 | "_view_module": "@jupyter-widgets/base",
1126 | "_view_module_version": "1.2.0",
1127 | "_view_name": "LayoutView",
1128 | "align_content": null,
1129 | "align_items": null,
1130 | "align_self": null,
1131 | "border": null,
1132 | "bottom": null,
1133 | "display": null,
1134 | "flex": null,
1135 | "flex_flow": null,
1136 | "grid_area": null,
1137 | "grid_auto_columns": null,
1138 | "grid_auto_flow": null,
1139 | "grid_auto_rows": null,
1140 | "grid_column": null,
1141 | "grid_gap": null,
1142 | "grid_row": null,
1143 | "grid_template_areas": null,
1144 | "grid_template_columns": null,
1145 | "grid_template_rows": null,
1146 | "height": null,
1147 | "justify_content": null,
1148 | "justify_items": null,
1149 | "left": null,
1150 | "margin": null,
1151 | "max_height": null,
1152 | "max_width": null,
1153 | "min_height": null,
1154 | "min_width": null,
1155 | "object_fit": null,
1156 | "object_position": null,
1157 | "order": null,
1158 | "overflow": null,
1159 | "overflow_x": null,
1160 | "overflow_y": null,
1161 | "padding": null,
1162 | "right": null,
1163 | "top": null,
1164 | "visibility": null,
1165 | "width": null
1166 | }
1167 | }
1168 | }
1169 | }
1170 | },
1171 | "nbformat": 4,
1172 | "nbformat_minor": 0
1173 | }
1174 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Prakhar Saxena
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # About LLMs Repository
2 |
3 | ## Overview
4 | Easy tutorials on LLMs. This repository is dedicated to sharing insights and knowledge about LLMs, including less-discussed topics such as tokenizers and data collators.
5 |
6 | ## Features
7 | - Insights on various aspects of LLMs.
8 | - Regular updates with new content.
9 |
10 | Stay tuned for more!
11 |
12 | ## License
13 | This project is licensed under the MIT License - see the [LICENSE](LICENSE.txt) file for details.
14 |
--------------------------------------------------------------------------------
/inference.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\" # set this according to your GPU setup\n",
11 | "import os\n",
12 | "import torch\n",
13 | "import transformers\n",
14 | "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
15 | "from peft import LoraConfig, get_peft_model, PeftModel\n"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": null,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "seed = 42\n",
25 | "torch.manual_seed(seed)\n",
26 | "torch.cuda.manual_seed_all(seed)\n",
27 | "\n",
28 | "# Use the same base model that you fine-tuned; it can be Llama, Mixtral, etc.\n",
29 | "model_name = \"mistralai/Mistral-7B-Instruct-v0.2\"\n",
30 | "\n",
31 | "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
32 | "model = AutoModelForCausalLM.from_pretrained(\n",
33 | " model_name,\n",
34 | " load_in_8bit=True,\n",
35 | " device_map=\"auto\",\n",
36 | " torch_dtype=torch.float16\n",
37 | ")\n",
38 | "print(model)\n",
39 | "\n",
40 | "# Path to the LoRA checkpoint folder; check your path, as the checkpoint number might be different.\n",
41 | "lora = \"./mistral-lora-instruct-shapeskeare/checkpoint-32/\"\n",
42 | "\n",
43 | "model = PeftModel.from_pretrained(model, lora)\n",
44 | "\n"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "def generate_prompt(user_query): # The prompt format is taken from the official Mistral Hugging Face page; the format for each model might differ\n",
54 | " return \"[INST]\" + user_query + \"[/INST]\" \n",
55 | "\n",
56 | "def evaluate(question):\n",
57 | "\n",
58 | " prompt= generate_prompt(question)\n",
59 | " inputs = tokenizer(prompt, add_special_tokens=False, return_tensors=\"pt\")\n",
60 | " input_ids = inputs[\"input_ids\"].cuda()\n",
61 | "\n",
62 | " with torch.no_grad():\n",
63 | " gen_tokens = model.generate(\n",
64 | " input_ids=input_ids,\n",
65 | " attention_mask=inputs[\"attention_mask\"].cuda(),\n",
66 | " max_new_tokens=512,\n",
67 | " do_sample=True,\n",
68 | " temperature=0.8,\n",
69 | " top_p=0.95,\n",
70 | " eos_token_id=2,\n",
71 | " )\n",
72 | "\n",
73 | " out=tokenizer.decode(gen_tokens[0],\n",
74 | " skip_special_tokens=False)\n",
75 | "\n",
76 | " return out\n",
77 | "\n"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "questions_eval = [\n",
87 | " \"What is your favorite color?\",\n",
88 | " \"How many continents are there in the world?\",\n",
89 | " \"What year was the first iPhone released?\",\n",
90 | " \"What is the capital of France?\",\n",
91 | " \"Who wrote 'To Kill a Mockingbird'?\",\n",
92 | " \"What gas do plants breathe in that humans and animals breathe out?\",\n",
93 | " \"How many planets are in our solar system?\",\n",
94 | " \"What is the boiling point of water?\",\n",
95 | " \"What is the largest ocean on Earth?\",\n",
96 | " \"Who is the current president of the United States?\"\n",
97 | "]\n",
98 | "\n",
99 | "\n",
100 | "for question in questions_eval:\n",
101 | " a=evaluate(question)\n",
102 | " print(a)\n",
103 | " print()\n",
104 | " print(\"-\"*50)"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": []
113 | }
114 | ],
115 | "metadata": {
116 | "kernelspec": {
117 | "display_name": "repe",
118 | "language": "python",
119 | "name": "python3"
120 | },
121 | "language_info": {
122 | "codemirror_mode": {
123 | "name": "ipython",
124 | "version": 3
125 | },
126 | "file_extension": ".py",
127 | "mimetype": "text/x-python",
128 | "name": "python",
129 | "nbconvert_exporter": "python",
130 | "pygments_lexer": "ipython3",
131 | "version": "3.10.14"
132 | }
133 | },
134 | "nbformat": 4,
135 | "nbformat_minor": 2
136 | }
137 |
--------------------------------------------------------------------------------