├── Pretraining
│   ├── dataset_prev.jpg
│   └── custom_llm_model.py
├── Finetuning
│   ├── dataset_finetune_prev.jpg
│   └── instruction_finetuning.ipynb
├── README.md
└── RLHF_DPO
    └── dpo.ipynb

/Pretraining/dataset_prev.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samadon1/LLM-From-Scratch/HEAD/Pretraining/dataset_prev.jpg
--------------------------------------------------------------------------------
/Finetuning/dataset_finetune_prev.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samadon1/LLM-From-Scratch/HEAD/Finetuning/dataset_finetune_prev.jpg
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Medical LLM Project
2 | 
3 | A language model for medical applications, built in three stages: pretraining on medical text, instruction fine-tuning, and Direct Preference Optimization (DPO).
4 | 
5 | ## Datasets
6 | 
7 | 1. Pretraining: [Medical Text Dataset](https://www.kaggle.com/datasets/chaitanyakck/medical-text) (Kaggle)
8 | 2. Fine-tuning: [PMC LLaMA Instructions](https://huggingface.co/datasets/axiong/pmc_llama_instructions) (Hugging Face)
9 | 
10 | ## Project Stages
11 | 
12 | 1. **Pretraining**
13 |    - Trained a custom GPT-style model on medical texts
14 | 
15 | 2. **Instruction Fine-tuning**
16 |    - LoRA fine-tuning with LitGPT on the instruction dataset
17 | 
18 | 3. **Direct Preference Optimization (DPO)**
19 |    - Generated response variants with the fine-tuned model
20 |    - Created preference pairs by ranking variants by Levenshtein distance to the reference answer
21 | 
22 | ## Key Features
23 | 
24 | - Customized for the medical domain
25 | - Progression from a general language model to an instruction-following one
26 | - Experimentation with preference optimization
27 | 
28 | ## Future Work
29 | 
30 | - Larger medical datasets
31 | - Advanced DPO techniques
32 | - Multi-task learning in the medical domain
33 | - Benchmark evaluation:
34 |   - Compare against established medical NLP models
35 |   - Evaluate on standardized medical QA datasets
36 |   - Assess performance on clinical decision support tasks
37 | 
--------------------------------------------------------------------------------
/RLHF_DPO/dpo.ipynb:
--------------------------------------------------------------------------------
1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### DPO Data Generation with Levenshtein Distance" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 3, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import torch\n", 17 | "from transformers import AutoTokenizer, AutoModelForCausalLM\n", 18 | "from Levenshtein import distance as levenshtein_distance\n", 19 | "\n", 20 | "def generate_variants(model, tokenizer, input_text, num_variants=2, temperature=0.7, top_k=50):\n", 21 | "    input_ids = tokenizer.encode(input_text, return_tensors=\"pt\")\n", 22 | "    \n", 23 | "    outputs = []\n", 24 | "    for _ in range(num_variants):\n", 25 | "        output = model.generate(\n", 26 | "            input_ids, \n", 27 | "            max_length=100,\n", 28 | "            do_sample=True,\n", 29 | "            temperature=temperature,\n", 30 | "            top_k=top_k,\n", 31 | "            num_return_sequences=1\n", 32 | "        )\n", 33 | "        decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)\n", 34 | "        outputs.append(decoded_output)\n", 35 | "    \n", 36 | "    return outputs\n", 37 | "\n", 38 | "def create_dpo_dataset(data, model, tokenizer):\n", 39 | "    dpo_data = []\n", 
40 | " \n", 41 | " for item in data:\n", 42 | " input_text = item['instruction'] + ('\\n' + item['input'] if item['input'] else '')\n", 43 | " ground_truth = item['output']\n", 44 | " \n", 45 | " variants = generate_variants(model, tokenizer, input_text)\n", 46 | " \n", 47 | " distances = [levenshtein_distance(v, ground_truth) for v in variants]\n", 48 | " \n", 49 | " if distances[0] <= distances[1]:\n", 50 | " chosen, rejected = variants[0], variants[1]\n", 51 | " else:\n", 52 | " chosen, rejected = variants[1], variants[0]\n", 53 | " \n", 54 | " dpo_data.append({\n", 55 | " 'prompt': input_text,\n", 56 | " 'chosen': chosen,\n", 57 | " 'rejected': rejected,\n", 58 | " 'ground_truth': ground_truth\n", 59 | " })\n", 60 | " \n", 61 | " return dpo_data" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [] 70 | } 71 | ], 72 | "metadata": { 73 | "language_info": { 74 | "name": "python" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 2 79 | } 80 | -------------------------------------------------------------------------------- /Pretraining/custom_llm_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.utils.data import Dataset, DataLoader 4 | import tiktoken 5 | import matplotlib.pyplot as plt 6 | from matplotlib.ticker import MaxNLocator 7 | import random 8 | 9 | class GPTDatasetV1(Dataset): 10 | def __init__(self, txt, tokenizer, max_length, stride, augment_prob=0.1): 11 | self.tokenizer = tokenizer 12 | self.input_ids = [] 13 | self.target_ids = [] 14 | self.augment_prob = augment_prob 15 | 16 | token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"}) 17 | 18 | for i in range(0, len(token_ids) - max_length, stride): 19 | input_chunk = token_ids[i:i + max_length] 20 | target_chunk = token_ids[i + 1: i + max_length + 1] 21 | self.input_ids.append(torch.tensor(input_chunk)) 22 | self.target_ids.append(torch.tensor(target_chunk)) 23 | 24 | def __len__(self): 25 | return len(self.input_ids) 26 | 27 | def __getitem__(self, idx): 28 | input_ids, target_ids = self.input_ids[idx], self.target_ids[idx] 29 | if random.random() < self.augment_prob: 30 | input_ids = self.augment_sequence(input_ids) 31 | return input_ids, target_ids 32 | 33 | def augment_sequence(self, sequence): 34 | mask_idx = random.randint(0, len(sequence) - 1) 35 | sequence[mask_idx] = self.tokenizer.encode("[MASK]")[0] 36 | return sequence 37 | 38 | def create_dataloader_v1(txt, batch_size=4, max_length=256, 39 | stride=128, shuffle=True, drop_last=True, num_workers=0): 40 | tokenizer = tiktoken.get_encoding("gpt2") 41 | dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) 42 | dataloader = DataLoader( 43 | dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers) 44 | return dataloader 45 | 46 | class Swish(nn.Module): 47 | def forward(self, x): 48 | return x * torch.sigmoid(x) 49 | 50 | class MultiHeadAttention(nn.Module): 51 | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): 52 | super().__init__() 53 | assert d_out % num_heads == 0, "d_out must be divisible by num_heads" 54 | 55 | self.d_out = d_out 56 | self.num_heads = num_heads 57 | self.head_dim = d_out // num_heads 58 | 59 | self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) 60 | self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) 61 | self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) 62 | 
self.out_proj = nn.Linear(d_out, d_out) 63 | self.dropout = nn.Dropout(dropout) 64 | self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) 65 | self.relative_position_encoding = nn.Parameter(torch.randn(2 * context_length - 1, self.head_dim)) 66 | 67 | def forward(self, x): 68 | b, num_tokens, d_in = x.shape 69 | 70 | keys = self.W_key(x) 71 | queries = self.W_query(x) 72 | values = self.W_value(x) 73 | 74 | keys = keys.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2) 75 | queries = queries.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2) 76 | values = values.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2) 77 | 78 | attn_scores = queries @ keys.transpose(2, 3) 79 | 80 | relative_position = self._get_relative_positions(num_tokens) 81 | rel_attn_scores = self._relative_attention_scores(queries, relative_position) 82 | attn_scores += rel_attn_scores 83 | 84 | mask_bool = self.mask.bool()[:num_tokens, :num_tokens] 85 | attn_scores.masked_fill_(mask_bool, -torch.inf) 86 | 87 | attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) 88 | attn_weights = self.dropout(attn_weights) 89 | 90 | context_vec = (attn_weights @ values).transpose(1, 2) 91 | context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out) 92 | context_vec = self.out_proj(context_vec) 93 | 94 | return context_vec 95 | 96 | def _get_relative_positions(self, length): 97 | range_vec = torch.arange(length) 98 | range_mat = range_vec.unsqueeze(0).repeat(length, 1) 99 | distance_mat = range_mat - range_mat.T 100 | return distance_mat + length - 1 101 | 102 | def _relative_attention_scores(self, queries, relative_position): 103 | embeddings = self.relative_position_encoding[relative_position] 104 | return torch.einsum('bhld,lrd->bhlr', queries, embeddings) 105 | 106 | class FeedForward(nn.Module): 107 | def __init__(self, cfg): 108 | super().__init__() 109 | self.layers = nn.Sequential( 110 | nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), 111 | Swish(), 112 | nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), 113 | ) 114 | 115 | def forward(self, x): 116 | return self.layers(x) 117 | 118 | class TransformerBlock(nn.Module): 119 | def __init__(self, cfg): 120 | super().__init__() 121 | self.att = MultiHeadAttention( 122 | d_in=cfg["emb_dim"], 123 | d_out=cfg["emb_dim"], 124 | context_length=cfg["context_length"], 125 | num_heads=cfg["n_heads"], 126 | dropout=cfg["drop_rate"], 127 | qkv_bias=cfg["qkv_bias"]) 128 | self.ff = FeedForward(cfg) 129 | self.norm1 = nn.LayerNorm(cfg["emb_dim"]) 130 | self.norm2 = nn.LayerNorm(cfg["emb_dim"]) 131 | self.drop_shortcut = nn.Dropout(cfg["drop_rate"]) 132 | self.conv = nn.Conv1d(cfg["emb_dim"], cfg["emb_dim"], kernel_size=3, padding=1) 133 | 134 | def forward(self, x): 135 | shortcut = x 136 | x = self.norm1(x) 137 | x = self.att(x) 138 | x = self.drop_shortcut(x) 139 | x = x + shortcut 140 | 141 | shortcut = x 142 | x = self.norm2(x) 143 | x = self.ff(x) 144 | x = self.drop_shortcut(x) 145 | x = x + shortcut 146 | 147 | x = x + self.conv(x.transpose(1, 2)).transpose(1, 2) 148 | 149 | return x 150 | 151 | class GPTModel(nn.Module): 152 | def __init__(self, cfg): 153 | super().__init__() 154 | self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"]) 155 | self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"]) 156 | self.drop_emb = nn.Dropout(cfg["drop_rate"]) 157 | self.token_type_emb = nn.Embedding(cfg["num_token_types"], cfg["emb_dim"]) 158 | 159 | 
self.trf_blocks = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg["n_layers"])]) 160 | 161 | self.final_norm = nn.LayerNorm(cfg["emb_dim"]) 162 | self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False) 163 | 164 | def forward(self, in_idx, token_type_ids=None): 165 | batch_size, seq_len = in_idx.shape 166 | tok_embeds = self.tok_emb(in_idx) 167 | pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device)) 168 | 169 | if token_type_ids is None: 170 | token_type_ids = torch.zeros_like(in_idx) 171 | token_type_embeds = self.token_type_emb(token_type_ids) 172 | 173 | x = tok_embeds + pos_embeds + token_type_embeds 174 | x = self.drop_emb(x) 175 | 176 | for block in self.trf_blocks: 177 | x = block(x) 178 | 179 | x = self.final_norm(x) 180 | logits = self.out_head(x) 181 | return logits 182 | 183 | def focal_loss(logits, targets, alpha=0.25, gamma=2): 184 | ce_loss = torch.nn.functional.cross_entropy(logits, targets, reduction='none') 185 | pt = torch.exp(-ce_loss) 186 | focal_loss = alpha * (1-pt)**gamma * ce_loss 187 | return focal_loss.mean() 188 | 189 | def calc_loss_batch(input_batch, target_batch, model, device): 190 | input_batch, target_batch = input_batch.to(device), target_batch.to(device) 191 | logits = model(input_batch) 192 | loss = focal_loss(logits.flatten(0, 1), target_batch.flatten()) 193 | return loss 194 | 195 | def top_k_sampling(logits, k=10): 196 | v, _ = torch.topk(logits, k) 197 | logits[logits < v[:, [-1]]] = float('-inf') 198 | probas = torch.softmax(logits, dim=-1) 199 | return probas 200 | 201 | def generate_text_simple(model, idx, max_new_tokens, context_size): 202 | for _ in range(max_new_tokens): 203 | idx_cond = idx[:, -context_size:] 204 | with torch.no_grad(): 205 | logits = model(idx_cond) 206 | logits = logits[:, -1, :] 207 | probas = top_k_sampling(logits) 208 | idx_next = torch.multinomial(probas, num_samples=1) 209 | idx = torch.cat((idx, idx_next), dim=1) 210 | return idx 211 | 212 | def visualize_attention(attention_weights): 213 | plt.figure(figsize=(10, 8)) 214 | plt.imshow(attention_weights, cmap='viridis') 215 | plt.colorbar() 216 | plt.title("Attention Weights") 217 | plt.show() 218 | -------------------------------------------------------------------------------- /Finetuning/instruction_finetuning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Instruction Finetuning " 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "![alt text](dataset_finetune_prev.jpg) \n", 15 | "Dataset from: https://huggingface.co/datasets/axiong/pmc_llama_instructions" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "#### Setup and Imports" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "matplotlib version: 3.8.2\n", 35 | "numpy version: 1.26.4\n", 36 | "tiktoken version: 0.7.0\n", 37 | "torch version: 2.2.1+cu121\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "import torch\n", 43 | "import tiktoken\n", 44 | "from matplotlib import pyplot as plt\n", 45 | "from litgpt import LLM\n", 46 | "import json\n", 47 | "from tqdm import tqdm\n", 48 | "\n", 49 | "from importlib.metadata import version\n", 50 | "pkgs = [\"matplotlib\", \"numpy\", \"tiktoken\", \"torch\"]\n", 51 | 
"for p in pkgs:\n", 52 | " print(f\"{p} version: {version(p)}\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "#### Load and Preprocess Dataset" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "Number of entries: 1100\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "\n", 77 | "with open(\"instruction_data.json\", \"r\") as file:\n", 78 | " data = json.load(file)\n", 79 | "\n", 80 | "processed_data = [\n", 81 | " {\n", 82 | " \"instruction\": item[\"instruction\"],\n", 83 | " \"input\": item[\"input\"],\n", 84 | " \"output\": item[\"output\"]\n", 85 | " }\n", 86 | " for item in data\n", 87 | "]\n", 88 | "\n", 89 | "print(\"Number of entries:\", len(processed_data))" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "#### Create Training and Test Sets" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 3, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "Training set length: 935\n", 109 | "Test set length: 165\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "import random\n", 115 | "\n", 116 | "random.shuffle(processed_data)\n", 117 | "\n", 118 | "train_ratio = 0.85\n", 119 | "train_size = int(len(processed_data) * train_ratio)\n", 120 | "\n", 121 | "train_data = processed_data[:train_size]\n", 122 | "test_data = processed_data[train_size:]\n", 123 | "\n", 124 | "print(\"Training set length:\", len(train_data))\n", 125 | "print(\"Test set length:\", len(test_data))\n", 126 | "\n", 127 | "\n", 128 | "with open(\"train.json\", \"w\") as json_file:\n", 129 | " json.dump(train_data, json_file, indent=4)\n", 130 | " \n", 131 | "with open(\"test.json\", \"w\") as json_file:\n", 132 | " json.dump(test_data, json_file, indent=4)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "#### Finetuning" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 4, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "{'checkpoint_dir': PosixPath('checkpoints/microsoft/phi-2'),\n", 152 | " 'data': JSON(json_path=PosixPath('train.json'),\n", 153 | " mask_prompt=False,\n", 154 | " val_split_fraction=0.1,\n", 155 | " prompt_style=,\n", 156 | " ignore_index=-100,\n", 157 | " seed=42,\n", 158 | " num_workers=4),\n", 159 | " 'devices': 1,\n", 160 | " 'eval': EvalArgs(interval=100,\n", 161 | " max_new_tokens=100,\n", 162 | " max_iters=100,\n", 163 | " initial_validation=False,\n", 164 | " final_validation=True),\n", 165 | " 'logger_name': 'csv',\n", 166 | " 'lora_alpha': 16,\n", 167 | " 'lora_dropout': 0.05,\n", 168 | " 'lora_head': False,\n", 169 | " 'lora_key': False,\n", 170 | " 'lora_mlp': False,\n", 171 | " 'lora_projection': False,\n", 172 | " 'lora_query': True,\n", 173 | " 'lora_r': 8,\n", 174 | " 'lora_value': True,\n", 175 | " 'optimizer': 'AdamW',\n", 176 | " 'out_dir': PosixPath('out/finetune/lora'),\n", 177 | " 'precision': None,\n", 178 | " 'quantize': None,\n", 179 | " 'seed': 1337,\n", 180 | " 'train': TrainArgs(save_interval=1000,\n", 181 | " log_interval=100,\n", 182 | " global_batch_size=16,\n", 183 | " micro_batch_size=1,\n", 184 | " lr_warmup_steps=100,\n", 185 | " lr_warmup_fraction=None,\n", 186 | " epochs=3,\n", 
187 | " max_tokens=None,\n", 188 | " max_steps=None,\n", 189 | " max_seq_length=None,\n", 190 | " tie_embeddings=None,\n", 191 | " max_norm=None,\n", 192 | " min_lr=6e-05)}\n", 193 | "Using bfloat16 Automatic Mixed Precision (AMP)\n", 194 | "Seed set to 1337\n", 195 | "Number of trainable parameters: 2,621,440\n", 196 | "Number of non-trainable parameters: 2,779,683,840\n", 197 | "The longest sequence length in the train data is 101, the model's maximum sequence length is 101 and context length is 2048\n", 198 | "Verifying settings ...\n", 199 | "Missing logger folder: /teamspace/studios/this_studio/out/finetune/lora/logs/csv\n", 200 | "Epoch 1 | iter 100 step 6 | loss train: 2.393, val: n/a | iter time: 82.21 ms\n", 201 | "Epoch 1 | iter 200 step 12 | loss train: 2.138, val: n/a | iter time: 83.16 ms\n", 202 | "Epoch 1 | iter 300 step 18 | loss train: 1.908, val: n/a | iter time: 80.77 ms\n", 203 | "Epoch 1 | iter 400 step 25 | loss train: 1.449, val: n/a | iter time: 85.55 ms (step)\n", 204 | "Epoch 1 | iter 500 step 31 | loss train: 0.931, val: n/a | iter time: 81.96 ms\n", 205 | "Epoch 1 | iter 600 step 37 | loss train: 0.657, val: n/a | iter time: 81.52 ms\n", 206 | "Epoch 1 | iter 700 step 43 | loss train: 0.579, val: n/a | iter time: 81.38 ms\n", 207 | "Epoch 1 | iter 800 step 50 | loss train: 0.695, val: n/a | iter time: 84.98 ms (step)\n", 208 | "Epoch 2 | iter 900 step 56 | loss train: 0.537, val: n/a | iter time: 82.43 ms\n", 209 | "Epoch 2 | iter 1000 step 62 | loss train: 0.473, val: n/a | iter time: 80.72 ms\n", 210 | "Epoch 2 | iter 1100 step 68 | loss train: 0.462, val: n/a | iter time: 80.93 ms\n", 211 | "Epoch 2 | iter 1200 step 75 | loss train: 0.527, val: n/a | iter time: 81.72 ms (step)\n", 212 | "Epoch 2 | iter 1300 step 81 | loss train: 0.505, val: n/a | iter time: 79.20 ms\n", 213 | "Epoch 2 | iter 1400 step 87 | loss train: 0.510, val: n/a | iter time: 80.38 ms\n", 214 | "Epoch 2 | iter 1500 step 93 | loss train: 0.578, val: n/a | iter time: 82.84 ms\n", 215 | "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", 216 | " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n", 217 | "Epoch 2 | iter 1600 step 100 | loss train: 0.529, val: n/a | iter time: 82.59 ms (step)\n", 218 | "Validating ...\n", 219 | "Recommend a movie for me to watch during the weekend and explain the reason.\n", 220 | "Length of encoded instruction (45) and eval.max_new_tokens (100) exceeds model.max_seq_length (101) used for training. Skipping example generation for efficiency. 
The model's supported context size (post-training) is 2048.\n", 221 | "iter 1600: val loss 0.4743, val time: 5395.57 ms\n", 222 | "Epoch 3 | iter 1700 step 106 | loss train: 0.478, val: 0.474 | iter time: 79.69 ms\n", 223 | "Epoch 3 | iter 1800 step 112 | loss train: 0.617, val: 0.474 | iter time: 78.80 ms\n", 224 | "Epoch 3 | iter 1900 step 118 | loss train: 1.122, val: 0.474 | iter time: 79.49 ms\n", 225 | "Epoch 3 | iter 2000 step 125 | loss train: 0.773, val: 0.474 | iter time: 84.16 ms (step)\n", 226 | "Epoch 3 | iter 2100 step 131 | loss train: 0.683, val: 0.474 | iter time: 79.49 ms\n", 227 | "Epoch 3 | iter 2200 step 137 | loss train: 0.746, val: 0.474 | iter time: 82.68 ms\n", 228 | "Epoch 3 | iter 2300 step 143 | loss train: 0.710, val: 0.474 | iter time: 82.90 ms\n", 229 | "Epoch 3 | iter 2400 step 150 | loss train: 0.664, val: 0.474 | iter time: 81.16 ms (step)\n", 230 | "Epoch 3 | iter 2500 step 156 | loss train: 0.697, val: 0.474 | iter time: 86.41 ms\n", 231 | "Training time: 213.92s\n", 232 | "Memory used: 16.76 GB\n", 233 | "Validating ...\n", 234 | "Final evaluation | val loss: 0.640 | val ppl: 1.896\n", 235 | "Saving LoRA weights to '/teamspace/studios/this_studio/out/finetune/lora/final/lit_model.pth.lora'\n", 236 | "{'checkpoint_dir': PosixPath('/teamspace/studios/this_studio/out/finetune/lora/final'),\n", 237 | " 'precision': None,\n", 238 | " 'pretrained_checkpoint_dir': None}\n", 239 | "Saved merged weights to '/teamspace/studios/this_studio/out/finetune/lora/final/lit_model.pth'\n" 240 | ] 241 | } 242 | ], 243 | "source": [ 244 | "\n", 245 | "!litgpt finetune_lora microsoft/phi-2 \\\n", 246 | "--data JSON \\\n", 247 | "--data.val_split_fraction 0.1 \\\n", 248 | "--data.json_path train.json \\\n", 249 | "--train.epochs 3 \\\n", 250 | "--train.log_interval 100" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 5, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stderr", 260 | "output_type": "stream", 261 | "text": [ 262 | "100%|██████████| 165/165 [00:51<00:00, 3.20it/s]\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "def format_input(entry):\n", 268 | " instruction_text = (\n", 269 | " f\"Below is an instruction that describes a task. 
\"\n", 270 | " f\"Write a response that appropriately completes the request.\"\n", 271 | " f\"\\n\\n### Instruction:\\n{entry['instruction']}\"\n", 272 | " )\n", 273 | " input_text = f\"\\n\\n### Input:\\n{entry['input']}\" if entry[\"input\"] else \"\"\n", 274 | " return instruction_text + input_text\n", 275 | "\n", 276 | "llm = LLM.load(\"microsoft/phi-2\")\n", 277 | "\n", 278 | "for i in tqdm(range(len(test_data))):\n", 279 | " response = llm.generate(format_input(test_data[i]))\n", 280 | " test_data[i][\"base_model\"] = response\n", 281 | "\n", 282 | "with open(\"test_base_model.json\", \"w\") as json_file:\n", 283 | " json.dump(test_data, json_file, indent=4)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 12, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "name": "stderr", 293 | "output_type": "stream", 294 | "text": [ 295 | " 72%|███████▏ | 118/165 [00:34<00:12, 3.81it/s]" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "# del llm\n", 301 | "llm_finetuned = LLM.load(\"/teamspace/studios/this_studio/out/finetune/lora/final/\")\n", 302 | "\n", 303 | "for i in tqdm(range(len(test_data))):\n", 304 | " response = llm_finetuned.generate(format_input(test_data[i]))\n", 305 | " test_data[i][\"finetuned_model\"] = response\n", 306 | "\n", 307 | "\n", 308 | "with open(\"test_base_and_finetuned_model.json\", \"w\") as json_file:\n", 309 | " json.dump(test_data, json_file, indent=4)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 13, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "Sample 1:\n", 322 | "Instruction: Arrange the following events in chronological order: Invention of the airplane, Fall of the Berlin Wall, Discovery of America.\n", 323 | "Base model output: The correct order of these events is Discovery of America, Invention of the airplane, Fall of the Berlin Wall.\n", 324 | "\n", 325 | "Finetuned model output: Italicize the correct order in the following sentence.\n", 326 | "\n", 327 | "### Response:\n", 328 | "Concentrate on providing a good answer that appropriately completes the request.\n", 329 | "\n", 330 | "\n", 331 | "Sample 2:\n", 332 | "Instruction: Find a synonym for the given verb.\n", 333 | "Base model output: Start\n", 334 | "\n", 335 | "Finetuned model output: Prologue\n", 336 | "\n", 337 | "\n", 338 | "Sample 3:\n", 339 | "Instruction: Translate the phrase 'Life is beautiful' into Italian.\n", 340 | "Base model output: L'aiei sono belli.\n", 341 | "\n", 342 | "Finetuned model output: Lifo semplice\n", 343 | "\n", 344 | "\n", 345 | "Sample 4:\n", 346 | "Instruction: Convert the following verb to its gerund form: 'eat'\n", 347 | "Base model output: Eating\n", 348 | "\n", 349 | "Finetuned model output: Eating\n", 350 | "\n", 351 | "\n", 352 | "\n", 353 | "Sample 5:\n", 354 | "Instruction: Look up the freezing point of water.\n", 355 | "Base model output: The freezing point of water is 0 degrees Celsius or 32 degrees Fahrenheit.\n", 356 | "\n", 357 | "Finetuned model output: Here is an input that provides a statement that describes a task.\n", 358 | "\n", 359 | "###Irlchemia has written a text that begins with 'What is the freezing point of water?'\n", 360 | "\n", 361 | "### Response:\n", 362 | "The freezing point of water is 0 degrees\n", 363 | "\n", 364 | "\n" 365 | ] 366 | } 367 | ], 368 | "source": [ 369 | "for i in range(5):\n", 370 | " print(f\"Sample {i+1}:\")\n", 371 | " print(\"Instruction:\", 
test_data[i][\"instruction\"])\n", 372 | " print(\"Base model output:\", test_data[i][\"base_model\"])\n", 373 | " print(\"Finetuned model output:\", test_data[i][\"finetuned_model\"])\n", 374 | " print(\"\\n\")" 375 | ] 376 | } 377 | ], 378 | "metadata": { 379 | "language_info": { 380 | "name": "python" 381 | } 382 | }, 383 | "nbformat": 4, 384 | "nbformat_minor": 2 385 | } 386 | --------------------------------------------------------------------------------