├── Finetune-Bloom7B-tagger.py ├── Finetune-meta-Llama-2-7b-hf.py ├── Qwen2.5_3B_GRPO.py ├── README.md ├── call_finetune_intel_neural_chat7B.py ├── call_finetune_mistral_7b.py ├── finetune_Llama-7b_with_only_lora.py ├── finetune_NousResearch_Nous-Hermes-2-SOLAR-10.7B.py ├── finetune_Yi_34B.py ├── finetune_intel_neural_chat7B.py ├── finetune_llama_3_1_8B-Instruct.py ├── finetune_meta_llama3_8B_instruct.py ├── finetune_microsoft-Phi-3-mini-128k-instruct.py ├── finetune_microsoft-phi-2.py ├── finetune_mistral_7b.py ├── finetune_starling-LM-7B-alpha.py ├── web_chat_bot_finetuned_mistral_7b.py ├── web_chat_bot_llama3_8b_instruct.py └── web_microsoft-Phi-3-mini-128k-instruct.py /Finetune-Bloom7B-tagger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import bitsandbytes as bnb 5 | import transformers as transformers 6 | from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM 7 | from transformers import BitsAndBytesConfig 8 | from peft import LoraConfig, get_peft_model 9 | from datasets import load_dataset 10 | 11 | 12 | #Setup the model 13 | model_id="bigscience/bloom-1b7" 14 | tokenizer = AutoTokenizer.from_pretrained(model_id) 15 | model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True) 16 | 17 | print(model.get_memory_footprint()) 18 | 19 | ''' 20 | Change the compute dtype 21 | The compute dtype is used to change the dtype that will be used during computation. 22 | For example, hidden states could be in float32 but computation can be set to bf16 for speedups. By default, the compute dtype is set to float32. 23 | 24 | quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16) 25 | 26 | ''' 27 | 28 | ''' 29 | Using NF4 (Normal Float 4) data type 30 | You can also use the NF4 data type, which is a new 4bit datatype adapted for weights that have been initialized using a normal distribution. For that run: 31 | 32 | nf4_config = BitsAndBytesConfig( 33 | load_in_4bit=True, 34 | bnb_4bit_quant_type="nf4", 35 | ) 36 | 37 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config) 38 | ''' 39 | 40 | ''' 41 | Use nested quantization for more memory efficient inference 42 | We also advise users to use the nested quantization technique. This saves more memory at no additional performance - from our empirical observations, 43 | this enables fine-tuning llama-13b model on an NVIDIA-T4 16GB with a sequence length of 1024, batch size of 1 and gradient accumulation steps of 4. 
44 | 45 | double_quant_config = BitsAndBytesConfig( 46 | load_in_4bit=True, 47 | bnb_4bit_use_double_quant=True, 48 | ) 49 | 50 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=double_quant_config) 51 | ''' 52 | 53 | #Freezing the original weights 54 | for param in model.parameters(): 55 | param.requires_grad = False 56 | if param.ndim ==1: 57 | param.data = param.data.to(torch.float32) 58 | model.gradient_checkpointing_enable() 59 | model.enable_input_require_grads() 60 | 61 | class CastOutputToFloat(nn.Sequential): 62 | def forward(self, x): return super().forward(x).to(torch.float32) 63 | model.lm_head = CastOutputToFloat(model.lm_head) 64 | 65 | #Setting up the LoRa Adapters 66 | def print_trainable_parameters(model): 67 | trainable_params = 0 68 | all_param = 0 69 | for _, param in model.named_parameters(): 70 | all_param += param.numel() 71 | if param.requires_grad: 72 | trainable_params += param.numel() 73 | print( 74 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" 75 | ) 76 | 77 | config = LoraConfig( 78 | r=16, 79 | lora_alpha=32, 80 | lora_dropout=0.05, 81 | bias = 'none', 82 | task_type="CAUSAL_LM" 83 | ) 84 | 85 | model = get_peft_model(model, config) 86 | print_trainable_parameters(model) 87 | data = load_dataset("Abirate/english_quotes") 88 | 89 | def merge_colunms(example): 90 | example['prediction'] = example['quote'] + " ->: " + str(example["tags"]) 91 | return example 92 | 93 | data['train'] = data['train'].map(merge_colunms) 94 | print(data['train']["prediction"][:5]) 95 | print(data['train'][0]) 96 | 97 | data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True) 98 | 99 | print(data) 100 | 101 | #Training 102 | trainer = transformers.Trainer( 103 | model=model, 104 | train_dataset=data['train'], 105 | args=transformers.TrainingArguments( 106 | per_gpu_train_batch_size=4, 107 | gradient_accumulation_steps=4, 108 | warmup_steps=100, 109 | max_steps=200, 110 | learning_rate=2e-4, 111 | fp16=True, 112 | logging_steps=1, 113 | output_dir='outputs' 114 | ), 115 | data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False) 116 | ) 117 | 118 | model.config.use_cache = False 119 | trainer.train() 120 | 121 | model.push_to_hub("meetrais/bloom-7b1-lora-tagger", 122 | token="HuggingFace-app-key", 123 | commit_message="basic training", 124 | private=True) -------------------------------------------------------------------------------- /Finetune-meta-Llama-2-7b-hf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import bitsandbytes as bnb 5 | import transformers as transformers 6 | from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM 7 | from transformers import BitsAndBytesConfig 8 | from peft import LoraConfig, get_peft_model 9 | from datasets import load_dataset 10 | 11 | 12 | #Setup the model 13 | model_id="meta-llama/Llama-2-7b-hf" 14 | tokenizer = AutoTokenizer.from_pretrained(model_id) 15 | model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True) 16 | 17 | print(model.get_memory_footprint()) 18 | #Freezing the original weights 19 | for param in model.parameters(): 20 | param.requires_grad = False 21 | if param.ndim ==1: 22 | param.data = param.data.to(torch.float32) 23 | model.gradient_checkpointing_enable() 24 | model.enable_input_require_grads() 25 | 26 | class 
CastOutputToFloat(nn.Sequential):
 27 |     def forward(self, x): return super().forward(x).to(torch.float32)
 28 | model.lm_head = CastOutputToFloat(model.lm_head)
 29 | 
 30 | # Setting up the LoRA adapters
 31 | def print_trainable_parameters(model):
 32 |     trainable_params = 0
 33 |     all_param = 0
 34 |     for _, param in model.named_parameters():
 35 |         all_param += param.numel()
 36 |         if param.requires_grad:
 37 |             trainable_params += param.numel()
 38 |     print(
 39 |         f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
 40 |     )
 41 | 
 42 | config = LoraConfig(
 43 |     r=16,
 44 |     lora_alpha=32,
 45 |     lora_dropout=0.05,
 46 |     bias = 'none',
 47 |     task_type="CAUSAL_LM"
 48 | )
 49 | 
 50 | model = get_peft_model(model, config)
 51 | print_trainable_parameters(model)
 52 | data = load_dataset("Abirate/english_quotes")
 53 | 
 54 | def merge_columns(example):
 55 |     example['prediction'] = example['quote'] + " ->: " + str(example["tags"])
 56 |     return example
 57 | 
 58 | data['train'] = data['train'].map(merge_columns)
 59 | print(data['train']["prediction"][:5])
 60 | print(data['train'][0])
 61 | 
 62 | data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True)
 63 | 
 64 | print(data)
 65 | 
 66 | # Training
 67 | trainer = transformers.Trainer(
 68 |     model=model,
 69 |     train_dataset=data['train'],
 70 |     args=transformers.TrainingArguments(
 71 |         per_device_train_batch_size=4,
 72 |         gradient_accumulation_steps=4,
 73 |         warmup_steps=100,
 74 |         max_steps=200,
 75 |         learning_rate=2e-4,
 76 |         fp16=True,
 77 |         logging_steps=1,
 78 |         output_dir='outputs'
 79 |     ),
 80 |     data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
 81 | )
 82 | 
 83 | model.config.use_cache = False
 84 | trainer.train()
 85 | 
 86 | model.push_to_hub("meetrais/meta-Llama-2-7b-hf-finetuned",
 87 |                   token="HuggingFace-app-key",
 88 |                   commit_message="basic training",
 89 |                   private=True)
 90 | 
--------------------------------------------------------------------------------
/Qwen2.5_3B_GRPO.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import re
  3 | from datasets import load_dataset, Dataset
  4 | from unsloth import FastLanguageModel, PatchFastRL, is_bfloat16_supported
  5 | from trl import GRPOConfig, GRPOTrainer
  6 | 
  7 | PatchFastRL("GRPO", FastLanguageModel)
  8 | torch.distributed.launch=True
  9 | 
 10 | max_seq_length = 256 # Can increase for longer reasoning traces
 11 | lora_rank = 32 # Larger rank = smarter, but slower
 12 | 
 13 | model, tokenizer = FastLanguageModel.from_pretrained(
 14 |     model_name = "Qwen/Qwen2.5-3B-Instruct",
 15 |     max_seq_length = max_seq_length,
 16 |     load_in_4bit = True, # False for LoRA 16bit
 17 |     fast_inference = False, # Disable vLLM fast inference for LoRA training
 18 |     max_lora_rank = lora_rank,
 19 |     gpu_memory_utilization = 0.9, # Reduce if out of memory
 20 | )
 21 | 
 22 | model = FastLanguageModel.get_peft_model(
 23 |     model,
 24 |     r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
 25 |     target_modules = [
 26 |         "q_proj", "k_proj", "v_proj", "o_proj",
 27 |     ], # Remove QKVO if out of memory
 28 |     lora_alpha = lora_rank,
 29 |     use_gradient_checkpointing = "unsloth", # Enable long context finetuning
 30 |     random_state = 3407,
 31 | )
 32 | 
 33 | # Load and prep dataset
 34 | SYSTEM_PROMPT = """
 35 | Respond in the following format:
 36 | <reasoning>
 37 | ...
 38 | </reasoning>
 39 | <answer>
 40 | ...
 41 | </answer>
 42 | """
 43 | 
 44 | XML_COT_FORMAT = """\
 45 | <reasoning>
 46 | {reasoning}
 47 | </reasoning>
 48 | <answer>
 49 | {answer}
 50 | </answer>
 51 | """
 52 | 
 53 | def extract_xml_answer(text: str) -> str:
 54 |     answer = text.split("<answer>")[-1]
 55 |     answer = answer.split("</answer>")[0]
 56 |     return answer.strip()
 57 | 
 58 | def extract_hash_answer(text: str) -> str | None:
 59 |     if "####" not in text:
 60 |         return None
 61 |     return text.split("####")[1].strip()
 62 | 
 63 | # uncomment middle messages for 1-shot prompting
 64 | def get_gsm8k_questions(split = "train") -> Dataset:
 65 |     data = load_dataset('openai/gsm8k', 'main')[split] # type: ignore
 66 |     data = data.map(lambda x: { # type: ignore
 67 |         'prompt': [
 68 |             {'role': 'system', 'content': SYSTEM_PROMPT},
 69 |             {'role': 'user', 'content': x['question']}
 70 |         ],
 71 |         'answer': extract_hash_answer(x['answer'])
 72 |     }) # type: ignore
 73 |     return data # type: ignore
 74 | 
 75 | dataset = get_gsm8k_questions()
 76 | 
 77 | # Reward functions
 78 | def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
 79 |     responses = [completion[0]['content'] for completion in completions]
 80 |     q = prompts[0][-1]['content']
 81 |     extracted_responses = [extract_xml_answer(r) for r in responses]
 82 |     print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
 83 |     return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]
 84 | 
 85 | def int_reward_func(completions, **kwargs) -> list[float]:
 86 |     responses = [completion[0]['content'] for completion in completions]
 87 |     extracted_responses = [extract_xml_answer(r) for r in responses]
 88 |     return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]
 89 | 
 90 | def strict_format_reward_func(completions, **kwargs) -> list[float]:
 91 |     """Reward function that checks if the completion has a specific format."""
 92 |     pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
 93 |     responses = [completion[0]["content"] for completion in completions]
 94 |     matches = [re.match(pattern, r) for r in responses]
 95 |     return [0.5 if match else 0.0 for match in matches]
 96 | 
 97 | def soft_format_reward_func(completions, **kwargs) -> list[float]:
 98 |     """Reward function that checks if the completion has a specific format."""
 99 |     pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
100 |     responses = [completion[0]["content"] for completion in completions]
101 |     matches = [re.match(pattern, r) for r in responses]
102 |     return [0.5 if match else 0.0 for match in matches]
103 | 
104 | def count_xml(text) -> float:
105 |     count = 0.0
106 |     if text.count("<reasoning>\n") == 1:
107 |         count += 0.125
108 |     if text.count("\n</reasoning>\n") == 1:
109 |         count += 0.125
110 |     if text.count("\n<answer>\n") == 1:
111 |         count += 0.125
112 |         count -= len(text.split("\n</answer>\n")[-1])*0.001
113 |     if text.count("\n</answer>") == 1:
114 |         count += 0.125
115 |         count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
116 |     return count
117 | 
118 | def xmlcount_reward_func(completions, **kwargs) -> list[float]:
119 |     contents = [completion[0]["content"] for completion in completions]
120 |     return [count_xml(c) for c in contents]
121 | 
122 | ####### Training #########################
123 | training_args = GRPOConfig(
124 |     use_vllm = False, # disable vLLM to avoid device conflicts
125 |     learning_rate = 5e-6,
126 |     adam_beta1 = 0.9,
127 |     adam_beta2 = 0.99,
128 |     weight_decay = 0.1,
129 |     warmup_ratio = 0.1,
130 |     lr_scheduler_type = "cosine",
131 |     optim = "adamw_8bit",
132 |     logging_steps = 1,
133 |     bf16 = is_bfloat16_supported(),
134 |     fp16 = not is_bfloat16_supported(),
135 |     per_device_train_batch_size = 1,
136 |     gradient_accumulation_steps = 1, # Increase to 4 for smoother training
137 |     num_generations = 8, # Decrease if out of memory
138 |     max_prompt_length = 256,
139 |     max_completion_length = 200,
140 |     # num_train_epochs = 1, # Set to 1 for a full training run
141 |     max_steps = 250,
142 |     save_steps = 250,
143 |     max_grad_norm = 0.1,
144 |     report_to = "none", # Can use Weights & Biases
145 |     output_dir = "outputs",
146 | )
147 | 
148 | trainer = GRPOTrainer(
149 |     model = model,
150 |     processing_class = tokenizer,
151 |     reward_funcs = [
152 |         xmlcount_reward_func,
153 |         soft_format_reward_func,
154 |         strict_format_reward_func,
155 |         int_reward_func,
156 |         correctness_reward_func,
157 |     ],
158 |     args = training_args,
159 |     train_dataset = dataset,
160 | )
161 | trainer.train()
162 | 
163 | ########## Inference without the GRPO-trained LoRA #####################
164 | text = tokenizer.apply_chat_template([
165 |     {"role" : "user", "content" : "How many r's are in strawberry?"},
166 | ], tokenize = False, add_generation_prompt = True)
167 | 
168 | from vllm import SamplingParams
169 | sampling_params = SamplingParams(
170 |     temperature = 0.8,
171 |     top_p = 0.95,
172 |     max_tokens = 1024,
173 | )
174 | output = model.fast_generate(
175 |     [text],
176 |     sampling_params = sampling_params,
177 |     lora_request = None,
178 | )[0].outputs[0].text
179 | 
180 | print(output)
181 | 
182 | # Saving model
183 | model.save_lora("grpo_saved_lora")
184 | 
185 | ########## Inference with the GRPO-trained LoRA #####################
186 | text = tokenizer.apply_chat_template([
187 |     {"role" : "system", "content" : SYSTEM_PROMPT},
188 |     {"role" : "user", "content" : "How many r's are in strawberry?"},
189 | ], tokenize = False, add_generation_prompt = True)
190 | 
191 | from vllm import SamplingParams
192 | sampling_params = SamplingParams(
193 |     temperature = 0.8,
194 |     top_p = 0.95,
195 |     max_tokens = 1024,
196 | )
197 | output = model.fast_generate(
198 |     text,
199 |     sampling_params = sampling_params,
200 |     lora_request = model.load_lora("grpo_saved_lora"),
201 | )[0].outputs[0].text
202 | 
203 | print(output)
204 | 
205 | 
206 | 
207 | 
--------------------------------------------------------------------------------
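Before launching a full GRPO run, it can be worth confirming that a sample completion really matches the <reasoning>/<answer> format that the reward functions in Qwen2.5_3B_GRPO.py score against. The snippet below is a minimal, standalone sanity check (an illustrative sketch, not part of the original script); it duplicates the strict-format regex and the answer-extraction logic from the file above.

import re

# Example completion in the expected format (the trailing newline matters for the strict check).
sample = "<reasoning>\n3 + 4 = 7\n</reasoning>\n<answer>\n7\n</answer>\n"

# Same pattern as strict_format_reward_func above.
strict_pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
print("strict format ok:", bool(re.match(strict_pattern, sample)))  # True

# Same extraction logic as extract_xml_answer above.
extracted = sample.split("<answer>")[-1].split("</answer>")[0].strip()
print("extracted answer:", extracted)  # 7

/README.md: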
--------------------------------------------------------------------------------
 1 | # LLM-Fine-Tuning
 2 | This GitHub repository contains several examples of fine-tuning open-source large language models. It demonstrates how to fine-tune and quantize large language models using parameter-efficient fine-tuning (PEFT) techniques such as LoRA and QLoRA.
 3 | 
 4 | Reference -> https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
--------------------------------------------------------------------------------
/call_finetune_intel_neural_chat7B.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from peft import PeftModel, PeftConfig
  3 | from transformers import AutoModelForCausalLM, AutoTokenizer
  4 | from transformers import BitsAndBytesConfig
  5 | import time
  6 | 
  7 | peft_model_id = "meetrais/finetuned-neural-chat-7b-v3-1"
  8 | config = PeftConfig.from_pretrained(peft_model_id)
  9 | bnb_config = BitsAndBytesConfig(
 10 |     load_in_4bit=True,
 11 |     bnb_4bit_use_double_quant=True,
 12 |     bnb_4bit_quant_type="nf4",
 13 |     bnb_4bit_compute_dtype=torch.bfloat16
 14 | )
 15 | model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=bnb_config, device_map='auto')
 16 | #model = AutoModelForCausalLM.from_pretrained(peft_model_id, load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True, device_map='auto')
 17 | tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
 18 | 
 19 | if tokenizer.pad_token is None:
 20 |     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
 21 | text = "Capital of USA is"
 22 | device = "cuda:0"
 23 | 
 24 | inputs = tokenizer(text, return_tensors="pt").to(device)
 25 | max_new_tokens = 30
 26 | start = time.time()
 27 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)
 28 | end = time.time()
 29 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 30 | 
 31 | tokens_per_second = max_new_tokens/(end-start)
 32 | print(f"Tokens per second: {tokens_per_second}")
 33 | 
--------------------------------------------------------------------------------
/call_finetune_mistral_7b.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from peft import PeftModel, PeftConfig
  3 | from transformers import AutoModelForCausalLM, AutoTokenizer
  4 | from transformers import BitsAndBytesConfig
  5 | 
  6 | peft_model_id = "meetrais/finetuned_mistral_7b"
  7 | config = PeftConfig.from_pretrained(peft_model_id)
  8 | bnb_config = BitsAndBytesConfig(
  9 |     load_in_4bit=True,
 10 |     bnb_4bit_use_double_quant=True,
 11 |     bnb_4bit_quant_type="nf4",
 12 |     bnb_4bit_compute_dtype=torch.bfloat16
 13 | )
 14 | model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=bnb_config, device_map='auto')
 15 | tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
 16 | 
 17 | if tokenizer.pad_token is None:
 18 |     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
 19 | text = "Capital of USA is"
 20 | device = "cuda:0"
 21 | 
 22 | inputs = tokenizer(text, return_tensors="pt").to(device)
 23 | 
 24 | outputs = model.generate(**inputs, max_new_tokens=20)
 25 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
--------------------------------------------------------------------------------
/finetune_Llama-7b_with_only_lora.py:
--------------------------------------------------------------------------------
  1 | import transformers as 
transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | model_id = "mistralai/Mistral-7B-v0.1" 11 | bnb_config = BitsAndBytesConfig( 12 | load_in_4bit=True, 13 | bnb_4bit_use_double_quant=True, 14 | bnb_4bit_quant_type="nf4", 15 | bnb_4bit_compute_dtype=torch.bfloat16, 16 | 17 | ) 18 | 19 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) 20 | 21 | tokenizer = AutoTokenizer.from_pretrained(model_id) 22 | if tokenizer.pad_token is None: 23 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 24 | 25 | config = LoraConfig( 26 | r=16, 27 | lora_alpha=32, 28 | lora_dropout=0.05, 29 | bias = 'none', 30 | task_type="CAUSAL_LM", 31 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"] 32 | ) 33 | 34 | 35 | model = get_peft_model(model, config) 36 | #print_trainable_parameters(model) 37 | 38 | total_parameters = 0 39 | for name, param in model.named_parameters(): 40 | total_parameters += param.numel() 41 | 42 | print(f"Total parameters: {total_parameters}") 43 | 44 | # Freeze the non-Lora parameters 45 | for name, param in model.named_parameters(): 46 | if 'lora' not in name: 47 | print(f'Freezing non-LoRA parameter {name}') 48 | param.requires_grad = False 49 | 50 | #Training 51 | training_arguments = TrainingArguments( 52 | output_dir= "./results", 53 | num_train_epochs= 2, 54 | per_device_train_batch_size= 1, 55 | gradient_accumulation_steps= 1, 56 | optim = "paged_adamw_8bit", 57 | save_steps= 100, 58 | logging_steps= 30, 59 | learning_rate= 2e-4, 60 | weight_decay= 0.001, 61 | fp16= True, 62 | bf16= False, 63 | max_grad_norm= 0.3, 64 | max_steps= -1, 65 | warmup_ratio= 0.3, 66 | group_by_length= True, 67 | lr_scheduler_type= "constant" 68 | ) 69 | # Setting sft parameters 70 | dataset = load_dataset("imdb", split="train") 71 | trainer = SFTTrainer( 72 | train_dataset=dataset, 73 | model=model, 74 | max_seq_length= 20, 75 | dataset_text_field="text", 76 | tokenizer=tokenizer, 77 | args=training_arguments, 78 | packing= False, 79 | ) 80 | 81 | model.config.use_cache = False 82 | trainer.train() 83 | 84 | total_parameters = 0 85 | for name, param in model.named_parameters(): 86 | total_parameters += param.numel() 87 | 88 | print(f"Total parameters after Freeze: {total_parameters}") 89 | -------------------------------------------------------------------------------- /finetune_NousResearch_Nous-Hermes-2-SOLAR-10.7B.py: -------------------------------------------------------------------------------- 1 | import transformers as transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | model_id = "NousResearch/Nous-Hermes-2-SOLAR-10.7B" 11 | tokenizer = AutoTokenizer.from_pretrained(model_id) 12 | if tokenizer.pad_token is None: 13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 14 | text = "Capital of USA is" 15 | device = "cuda:0" 16 | 17 | inputs = tokenizer(text, return_tensors="pt").to(device) 18 | 19 | bnb_config = BitsAndBytesConfig( 20 | load_in_4bit=True, 21 | bnb_4bit_use_double_quant=True, 22 | 
bnb_4bit_quant_type="nf4", 23 | bnb_4bit_compute_dtype=torch.bfloat16, 24 | 25 | ) 26 | 27 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) 28 | 29 | config = LoraConfig( 30 | r=16, 31 | lora_alpha=32, 32 | lora_dropout=0.05, 33 | bias = 'none', 34 | task_type="CAUSAL_LM", 35 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"] 36 | ) 37 | 38 | #print(model) 39 | 40 | model = get_peft_model(model, config) 41 | outputs = model.generate(**inputs, max_new_tokens=30) 42 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 43 | """ 44 | model.push_to_hub("meetrais/finetuned-neural-chat-7b-v3-1", 45 | token="Your-Hugging-Face-Token-Here", 46 | commit_message="basic training", 47 | private=True) 48 | """ 49 | 50 | 51 | -------------------------------------------------------------------------------- /finetune_Yi_34B.py: -------------------------------------------------------------------------------- 1 | import transformers as transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | model_id = "01-ai/Yi-34B" 11 | tokenizer = AutoTokenizer.from_pretrained(model_id) 12 | if tokenizer.pad_token is None: 13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 14 | text = "Capital of USA is" 15 | device = "cuda:0" 16 | 17 | inputs = tokenizer(text, return_tensors="pt").to(device) 18 | 19 | bnb_config = BitsAndBytesConfig( 20 | load_in_4bit=True, 21 | bnb_4bit_use_double_quant=True, 22 | bnb_4bit_quant_type="nf4", 23 | bnb_4bit_compute_dtype=torch.bfloat16 24 | ) 25 | 26 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) 27 | 28 | 29 | config = LoraConfig( 30 | r=16, 31 | lora_alpha=32, 32 | lora_dropout=0.05, 33 | bias = 'none', 34 | task_type="CAUSAL_LM" 35 | ) 36 | model = get_peft_model(model, config) 37 | 38 | 39 | outputs = model.generate(**inputs, max_new_tokens=20) 40 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 41 | """ 42 | model.push_to_hub("meetrais/finetuned_mistral_7b", 43 | token="Your-Hugging-Face-Token-Here", 44 | commit_message="basic training", 45 | private=True) 46 | """ 47 | 48 | 49 | -------------------------------------------------------------------------------- /finetune_intel_neural_chat7B.py: -------------------------------------------------------------------------------- 1 | import transformers as transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | model_id = "Intel/neural-chat-7b-v3-1" 11 | tokenizer = AutoTokenizer.from_pretrained(model_id) 12 | if tokenizer.pad_token is None: 13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 14 | text = "Capital of USA is" 15 | device = "cuda:0" 16 | 17 | inputs = tokenizer(text, return_tensors="pt").to(device) 18 | 19 | bnb_config = BitsAndBytesConfig( 20 | load_in_4bit=True, 21 | bnb_4bit_use_double_quant=True, 22 | bnb_4bit_quant_type="nf4", 23 | bnb_4bit_compute_dtype=torch.bfloat16, 24 | 25 | ) 26 | 27 | model = AutoModelForCausalLM.from_pretrained(model_id, 
quantization_config=bnb_config) 28 | 29 | config = LoraConfig( 30 | r=16, 31 | lora_alpha=32, 32 | lora_dropout=0.05, 33 | bias = 'none', 34 | task_type="CAUSAL_LM", 35 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"] 36 | ) 37 | 38 | #print(model) 39 | 40 | model = get_peft_model(model, config) 41 | outputs = model.generate(**inputs, max_new_tokens=30) 42 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 43 | """ 44 | model.push_to_hub("meetrais/finetuned-neural-chat-7b-v3-1", 45 | token="Your-Hugging-Face-Token-Here", 46 | commit_message="basic training", 47 | private=True) 48 | """ 49 | 50 | 51 | -------------------------------------------------------------------------------- /finetune_llama_3_1_8B-Instruct.py: -------------------------------------------------------------------------------- 1 | import transformers as transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct" 11 | tokenizer = AutoTokenizer.from_pretrained(model_id) 12 | if tokenizer.pad_token is None: 13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 14 | text = "Capital of USA is" 15 | device = "cuda:0" 16 | 17 | inputs = tokenizer(text, return_tensors="pt").to(device) 18 | 19 | bnb_config = BitsAndBytesConfig( 20 | load_in_4bit=True, 21 | bnb_4bit_use_double_quant=True, 22 | bnb_4bit_quant_type="nf4", 23 | bnb_4bit_compute_dtype=torch.bfloat16, 24 | load_in_8bit=False 25 | ) 26 | 27 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) 28 | 29 | config = LoraConfig( 30 | r=16, 31 | lora_alpha=32, 32 | lora_dropout=0.05, 33 | bias = 'none', 34 | task_type="CAUSAL_LM", 35 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"] 36 | ) 37 | 38 | model = get_peft_model(model, config) 39 | outputs = model.generate(**inputs, max_new_tokens=30) 40 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 41 | 42 | model.push_to_hub("meetrais/finetuned_Meta-Llama-3.1-8B-Instruct", 43 | token="Your_HF_Token", 44 | commit_message="basic training", 45 | private=True) 46 | -------------------------------------------------------------------------------- /finetune_meta_llama3_8B_instruct.py: -------------------------------------------------------------------------------- 1 | import transformers as transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | print("Torch version:",torch.__version__) 11 | 12 | print("Is CUDA enabled?",torch.cuda.is_available()) 13 | 14 | model_id = "meta-llama/Meta-Llama-3-8B-Instruct" 15 | tokenizer = AutoTokenizer.from_pretrained(model_id) 16 | if tokenizer.pad_token is None: 17 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 18 | text = "Capital of USA is" 19 | device = "cuda:0" 20 | 21 | inputs = tokenizer(text, return_tensors="pt").to(device) 22 | 23 | bnb_config = BitsAndBytesConfig( 24 | load_in_4bit=True, 25 | bnb_4bit_use_double_quant=True, 26 | bnb_4bit_quant_type="nf4", 27 | 
bnb_4bit_compute_dtype=torch.bfloat16, 28 | load_in_8bit=False 29 | ) 30 | 31 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) 32 | 33 | config = LoraConfig( 34 | r=16, 35 | lora_alpha=32, 36 | lora_dropout=0.05, 37 | bias = 'none', 38 | task_type="CAUSAL_LM", 39 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"] 40 | ) 41 | 42 | model = get_peft_model(model, config) 43 | outputs = model.generate(**inputs, max_new_tokens=30) 44 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 45 | 46 | 47 | model.push_to_hub("meetrais/Meta-Llama-3-8B-Instruct-NIM-LORA", 48 | token="HF-Access-Key", 49 | commit_message="basic training", 50 | private=True) 51 | 52 | 53 | -------------------------------------------------------------------------------- /finetune_microsoft-Phi-3-mini-128k-instruct.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 3 | 4 | torch.random.manual_seed(0) 5 | 6 | model = AutoModelForCausalLM.from_pretrained( 7 | "microsoft/Phi-3-mini-128k-instruct", 8 | device_map="cuda", 9 | torch_dtype="auto", 10 | trust_remote_code=True 11 | ) 12 | tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct") 13 | 14 | messages = [ 15 | {"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."}, 16 | {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}, 17 | {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."}, 18 | {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"}, 19 | ] 20 | 21 | pipe = pipeline( 22 | "text-generation", 23 | model=model, 24 | tokenizer=tokenizer, 25 | ) 26 | 27 | generation_args = { 28 | "max_new_tokens": 500, 29 | "return_full_text": False, 30 | "temperature": 0.6, 31 | "do_sample": False, 32 | } 33 | 34 | output = pipe(messages, **generation_args) 35 | print(output[0]['generated_text']) 36 | -------------------------------------------------------------------------------- /finetune_microsoft-phi-2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | 4 | model_id = "microsoft/phi-2" 5 | tokenizer = AutoTokenizer.from_pretrained(model_id) 6 | 7 | text = "What is Capital of USA?" 
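# Editor's note (hedged sketch): the from_pretrained() call below loads phi-2 in its
# native precision. If GPU memory is tight, a 4-bit NF4 load in the same style as the
# other scripts in this repo is one option. The config below is illustrative only and
# has no effect unless passed as quantization_config=optional_bnb_config in that call.
from transformers import BitsAndBytesConfig
optional_bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)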
8 | inputs = tokenizer(text, return_tensors="pt").to(0) 9 | 10 | model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype="auto", device_map="cuda", trust_remote_code=True) 11 | outputs = model.generate(**inputs, max_new_tokens=50) 12 | 13 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 14 | -------------------------------------------------------------------------------- /finetune_mistral_7b.py: -------------------------------------------------------------------------------- 1 | import transformers as transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | model_id = "mistralai/Mistral-7B-v0.1" 11 | tokenizer = AutoTokenizer.from_pretrained(model_id) 12 | if tokenizer.pad_token is None: 13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 14 | text = "Capital of USA is" 15 | device = "cuda:0" 16 | 17 | inputs = tokenizer(text, return_tensors="pt").to(device) 18 | 19 | bnb_config = BitsAndBytesConfig( 20 | load_in_4bit=True, 21 | bnb_4bit_use_double_quant=True, 22 | bnb_4bit_quant_type="nf4", 23 | bnb_4bit_compute_dtype=torch.bfloat16, 24 | load_in_8bit=False 25 | ) 26 | 27 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) 28 | 29 | """ 30 | #Freezing the original weights 31 | for param in model.parameters(): 32 | param.requires_grad = False 33 | if param.ndim ==1: 34 | param.data = param.data.to(torch.float32) 35 | model.gradient_checkpointing_enable() 36 | model.enable_input_require_grads() 37 | 38 | class CastOutputToFloat(nn.Sequential): 39 | def forward(self, x): return super().forward(x).to(torch.float32) 40 | model.lm_head = CastOutputToFloat(model.lm_head) 41 | 42 | #Setting up the LoRa Adapters 43 | def print_trainable_parameters(model): 44 | trainable_params = 0 45 | all_param = 0 46 | for _, param in model.named_parameters(): 47 | all_param += param.numel() 48 | if param.requires_grad: 49 | trainable_params += param.numel() 50 | print( 51 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" 52 | ) 53 | 54 | config = LoraConfig( 55 | r=16, 56 | lora_alpha=32, 57 | lora_dropout=0.05, 58 | bias = 'none', 59 | task_type="CAUSAL_LM" 60 | ) 61 | 62 | model = get_peft_model(model, config) 63 | print_trainable_parameters(model) 64 | dataset_name = "gathnex/Gath_baize" 65 | dataset = load_dataset(dataset_name, split="train[:1000]") 66 | dataset["chat_sample"][0] 67 | 68 | def merge_colunms(example): 69 | example['prediction'] = example['quote'] + " ->: " + str(example["tags"]) 70 | return example 71 | 72 | 73 | #data['train'] = data['train'].map(merge_colunms) 74 | #print(data['train'][0]) 75 | 76 | #data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True) 77 | 78 | #print(data) 79 | 80 | #Training 81 | training_arguments = TrainingArguments( 82 | output_dir= "./results", 83 | num_train_epochs= 10, 84 | per_device_train_batch_size= 8, 85 | gradient_accumulation_steps= 2, 86 | optim = "paged_adamw_8bit", 87 | save_steps= 100, 88 | logging_steps= 30, 89 | learning_rate= 2e-4, 90 | weight_decay= 0.001, 91 | fp16= True, 92 | bf16= False, 93 | max_grad_norm= 0.3, 94 | max_steps= -1, 95 | warmup_ratio= 0.3, 96 | group_by_length= True, 97 | lr_scheduler_type= "constant" 98 | 
) 99 | # Setting sft parameters 100 | trainer = SFTTrainer( 101 | model=model, 102 | train_dataset=dataset, 103 | max_seq_length= 20, 104 | dataset_text_field="chat_sample", 105 | tokenizer=tokenizer, 106 | args=training_arguments, 107 | packing= False, 108 | ) 109 | 110 | model.config.use_cache = False 111 | trainer.train() 112 | """ 113 | #print(model) 114 | 115 | config = LoraConfig( 116 | r=16, 117 | lora_alpha=32, 118 | lora_dropout=0.05, 119 | bias = 'none', 120 | task_type="CAUSAL_LM", 121 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"] 122 | ) 123 | 124 | model = get_peft_model(model, config) 125 | outputs = model.generate(**inputs, max_new_tokens=30) 126 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 127 | 128 | """ 129 | model.push_to_hub("meetrais/finetuned_mistral_7b", 130 | token="Your-Hugging-Face-Token-Here", 131 | commit_message="basic training", 132 | private=True) 133 | 134 | """ 135 | -------------------------------------------------------------------------------- /finetune_starling-LM-7B-alpha.py: -------------------------------------------------------------------------------- 1 | import transformers as transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | model_id = "berkeley-nest/Starling-LM-7B-alpha" 11 | tokenizer = AutoTokenizer.from_pretrained(model_id) 12 | if tokenizer.pad_token is None: 13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 14 | text = "Capital of USA is" 15 | device = "cuda:0" 16 | 17 | inputs = tokenizer(text, return_tensors="pt").to(device) 18 | 19 | bnb_config = BitsAndBytesConfig( 20 | load_in_4bit=True, 21 | bnb_4bit_use_double_quant=True, 22 | bnb_4bit_quant_type="nf4", 23 | bnb_4bit_compute_dtype=torch.bfloat16, 24 | load_in_8bit=False 25 | ) 26 | 27 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) 28 | 29 | """ 30 | #Freezing the original weights 31 | for param in model.parameters(): 32 | param.requires_grad = False 33 | if param.ndim ==1: 34 | param.data = param.data.to(torch.float32) 35 | model.gradient_checkpointing_enable() 36 | model.enable_input_require_grads() 37 | 38 | class CastOutputToFloat(nn.Sequential): 39 | def forward(self, x): return super().forward(x).to(torch.float32) 40 | model.lm_head = CastOutputToFloat(model.lm_head) 41 | 42 | #Setting up the LoRa Adapters 43 | def print_trainable_parameters(model): 44 | trainable_params = 0 45 | all_param = 0 46 | for _, param in model.named_parameters(): 47 | all_param += param.numel() 48 | if param.requires_grad: 49 | trainable_params += param.numel() 50 | print( 51 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" 52 | ) 53 | 54 | config = LoraConfig( 55 | r=16, 56 | lora_alpha=32, 57 | lora_dropout=0.05, 58 | bias = 'none', 59 | task_type="CAUSAL_LM" 60 | ) 61 | 62 | model = get_peft_model(model, config) 63 | print_trainable_parameters(model) 64 | dataset_name = "gathnex/Gath_baize" 65 | dataset = load_dataset(dataset_name, split="train[:1000]") 66 | dataset["chat_sample"][0] 67 | 68 | def merge_colunms(example): 69 | example['prediction'] = example['quote'] + " ->: " + str(example["tags"]) 70 | return example 71 | 72 | 73 | #data['train'] = 
data['train'].map(merge_colunms) 74 | #print(data['train'][0]) 75 | 76 | #data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True) 77 | 78 | #print(data) 79 | 80 | #Training 81 | training_arguments = TrainingArguments( 82 | output_dir= "./results", 83 | num_train_epochs= 10, 84 | per_device_train_batch_size= 8, 85 | gradient_accumulation_steps= 2, 86 | optim = "paged_adamw_8bit", 87 | save_steps= 100, 88 | logging_steps= 30, 89 | learning_rate= 2e-4, 90 | weight_decay= 0.001, 91 | fp16= True, 92 | bf16= False, 93 | max_grad_norm= 0.3, 94 | max_steps= -1, 95 | warmup_ratio= 0.3, 96 | group_by_length= True, 97 | lr_scheduler_type= "constant" 98 | ) 99 | # Setting sft parameters 100 | trainer = SFTTrainer( 101 | model=model, 102 | train_dataset=dataset, 103 | max_seq_length= 20, 104 | dataset_text_field="chat_sample", 105 | tokenizer=tokenizer, 106 | args=training_arguments, 107 | packing= False, 108 | ) 109 | 110 | model.config.use_cache = False 111 | trainer.train() 112 | """ 113 | config = LoraConfig( 114 | r=16, 115 | lora_alpha=32, 116 | lora_dropout=0.05, 117 | bias = 'none', 118 | task_type="CAUSAL_LM", 119 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"] 120 | ) 121 | 122 | #print(model) 123 | model = get_peft_model(model, config) 124 | outputs = model.generate(**inputs, max_new_tokens=30) 125 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 126 | 127 | """ 128 | model.push_to_hub("meetrais/finetuned_mistral_7b", 129 | token="Your-Hugging-Face-Token-Here", 130 | commit_message="basic training", 131 | private=True) 132 | 133 | """ 134 | -------------------------------------------------------------------------------- /web_chat_bot_finetuned_mistral_7b.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | from threading import Thread 3 | import argparse 4 | import os 5 | import torch 6 | import gradio as gr 7 | from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer 8 | from peft import PeftModel, PeftConfig 9 | from transformers import BitsAndBytesConfig 10 | 11 | peft_model_id = "meetrais/finetuned_mistral_7b" 12 | config = PeftConfig.from_pretrained(peft_model_id) 13 | bnb_config = BitsAndBytesConfig( 14 | load_in_4bit=True, 15 | bnb_4bit_use_double_quant=True, 16 | bnb_4bit_quant_type="nf4", 17 | bnb_4bit_compute_dtype=torch.bfloat16 18 | ) 19 | model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=bnb_config, device_map='auto') 20 | tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path) 21 | 22 | if tokenizer.pad_token is None: 23 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 24 | 25 | usingAdapter = True 26 | 27 | device = "cuda:0" 28 | # Function to run the text generation process 29 | def run_generation(user_text, top_p, temperature, top_k, max_new_tokens): 30 | #template = "### Text: {}\n### The tone is:\n" 31 | #model_inputs = tokenizer(template.format(user_text) if usingAdapter else user_text, return_tensors="pt") 32 | #model_inputs = model_inputs.to(device) 33 | model_inputs= tokenizer(user_text, return_tensors="pt").to(device) 34 | 35 | # Generate text in a separate thread 36 | streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True) 37 | generate_kwargs = dict( 38 | **model_inputs, 39 | max_new_tokens=max_new_tokens, 40 | pad_token_id=tokenizer.eos_token_id, 41 | streamer=streamer, 42 | ) 43 | 
thread = Thread(target=model.generate, kwargs=generate_kwargs) 44 | thread.start() 45 | 46 | # Retrieve and yield the generated text 47 | model_output = "" 48 | for new_text in streamer: 49 | model_output += new_text 50 | yield model_output 51 | return model_output 52 | 53 | # Gradio UI setup 54 | with gr.Blocks() as demo: 55 | with gr.Row(): 56 | with gr.Column(scale=4): 57 | user_text = gr.Textbox(placeholder="Write your question here", label="User input") 58 | model_output = gr.Textbox(label="Model output", lines=10, interactive=False) 59 | button_submit = gr.Button(value="Submit") 60 | 61 | with gr.Column(scale=1): 62 | max_new_tokens = gr.Slider(minimum=1, maximum=1000, value=250, step=1, label="Max New Tokens") 63 | top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)") 64 | top_k = gr.Slider(minimum=1, maximum=50, value=50, step=1, label="Top-k") 65 | temperature = gr.Slider(minimum=0.1, maximum=5.0, value=0.8, step=0.1, label="Temperature") 66 | 67 | user_text.submit(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output) 68 | button_submit.click(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output) 69 | 70 | demo.queue(max_size=32).launch(server_port=8082) 71 | -------------------------------------------------------------------------------- /web_chat_bot_llama3_8b_instruct.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | import torch 3 | import gradio as gr 4 | 5 | device = "cuda:0" 6 | # Function to run the text generation process 7 | def run_generation(user_text, top_p, temperature, top_k, max_new_tokens): 8 | model_id = "meta-llama/Meta-Llama-3-8B-Instruct" 9 | device = "cuda:0" 10 | pipeline = transformers.pipeline( 11 | "text-generation", 12 | model=model_id, 13 | model_kwargs={"torch_dtype": torch.bfloat16}, 14 | device=device, 15 | ) 16 | 17 | messages = [ 18 | {"role": "system", "content": "You are a helpfull assistant."}, 19 | {"role": "user", "content": user_text}, 20 | ] 21 | 22 | prompt = pipeline.tokenizer.apply_chat_template( 23 | messages, 24 | tokenize=False, 25 | add_generation_prompt=True 26 | ) 27 | 28 | terminators = [ 29 | pipeline.tokenizer.eos_token_id, 30 | pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>") 31 | ] 32 | 33 | outputs = pipeline( 34 | prompt, 35 | max_new_tokens=max_new_tokens, 36 | eos_token_id=terminators, 37 | do_sample=True, 38 | temperature=temperature, 39 | top_p=top_p, 40 | top_k=top_k 41 | ) 42 | return outputs[0]["generated_text"][len(prompt):] 43 | 44 | # Gradio UI setup 45 | with gr.Blocks() as demo: 46 | with gr.Row(): 47 | with gr.Column(scale=4): 48 | user_text = gr.Textbox(placeholder="Write your question here", label="User input") 49 | model_output = gr.Textbox(label="Model output", lines=10, interactive=False) 50 | button_submit = gr.Button(value="Submit") 51 | 52 | with gr.Column(scale=1): 53 | max_new_tokens = gr.Slider(minimum=1, maximum=1000, value=250, step=1, label="Max New Tokens") 54 | top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)") 55 | top_k = gr.Slider(minimum=1, maximum=50, value=50, step=1, label="Top-k") 56 | temperature = gr.Slider(minimum=0.1, maximum=5.0, value=0.8, step=0.1, label="Temperature") 57 | 58 | user_text.submit(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output) 59 | button_submit.click(run_generation, [user_text, top_p, temperature, 
top_k, max_new_tokens], model_output) 60 | 61 | demo.queue(max_size=32).launch(server_port=8082) -------------------------------------------------------------------------------- /web_microsoft-Phi-3-mini-128k-instruct.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 3 | import gradio as gr 4 | 5 | device = "cuda:0" 6 | # Function to run the text generation process 7 | def run_generation(user_text, top_p, temperature, top_k, max_new_tokens): 8 | torch.random.manual_seed(0) 9 | 10 | model = AutoModelForCausalLM.from_pretrained( 11 | "microsoft/Phi-3-mini-128k-instruct", 12 | device_map="cuda", 13 | torch_dtype="auto", 14 | trust_remote_code=True 15 | ) 16 | tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct") 17 | 18 | messages = [ 19 | {"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."}, 20 | {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}, 21 | {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."}, 22 | {"role": "user", "content": "{0}".format(user_text)}, 23 | ] 24 | 25 | pipe = pipeline( 26 | "text-generation", 27 | model=model, 28 | tokenizer=tokenizer, 29 | ) 30 | 31 | generation_args = { 32 | "max_new_tokens": max_new_tokens, 33 | "return_full_text": False, 34 | "temperature": temperature, 35 | "do_sample": False, 36 | } 37 | 38 | output = pipe(messages, **generation_args) 39 | return output[0]['generated_text'] 40 | 41 | # Gradio UI setup 42 | with gr.Blocks() as demo: 43 | with gr.Row(): 44 | with gr.Column(scale=4): 45 | user_text = gr.Textbox(placeholder="Write your question here", label="User input") 46 | model_output = gr.Textbox(label="Model output", lines=10, interactive=False) 47 | button_submit = gr.Button(value="Submit") 48 | 49 | with gr.Column(scale=1): 50 | max_new_tokens = gr.Slider(minimum=1, maximum=1000, value=250, step=1, label="Max New Tokens") 51 | top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)") 52 | top_k = gr.Slider(minimum=1, maximum=50, value=50, step=1, label="Top-k") 53 | temperature = gr.Slider(minimum=0.1, maximum=5.0, value=0.8, step=0.1, label="Temperature") 54 | 55 | user_text.submit(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output) 56 | button_submit.click(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output) 57 | 58 | demo.queue(max_size=32).launch(server_port=8082) --------------------------------------------------------------------------------
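
Several of the scripts above push LoRA adapters to the Hugging Face Hub and later reload them with PeftConfig/PeftModel for inference. A common follow-up step is folding a trained adapter back into its base model so it can be served without the peft wrapper. The sketch below shows one way to do that with the peft API; the adapter id is taken from the scripts above and is assumed to be accessible, and the output directory is a placeholder.

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

adapter_id = "meetrais/finetuned_mistral_7b"  # any adapter pushed by the scripts above
config = PeftConfig.from_pretrained(adapter_id)

# Load the base model in half precision; merging into a quantized (4-bit/8-bit) base
# is generally not recommended.
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, adapter_id)

# Fold the LoRA deltas into the base weights and drop the adapter modules.
merged_model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
merged_model.save_pretrained("merged-model")  # placeholder output directory
tokenizer.save_pretrained("merged-model")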