├── Finetune-Bloom7B-tagger.py ├── Finetune-meta-Llama-2-7b-hf.py ├── Qwen2.5_3B_GRPO.py ├── README.md ├── call_finetune_intel_neural_chat7B.py ├── call_finetune_mistral_7b.py ├── finetune_Llama-7b_with_only_lora.py ├── finetune_NousResearch_Nous-Hermes-2-SOLAR-10.7B.py ├── finetune_Yi_34B.py ├── finetune_intel_neural_chat7B.py ├── finetune_llama_3_1_8B-Instruct.py ├── finetune_meta_llama3_8B_instruct.py ├── finetune_microsoft-Phi-3-mini-128k-instruct.py ├── finetune_microsoft-phi-2.py ├── finetune_mistral_7b.py ├── finetune_starling-LM-7B-alpha.py ├── web_chat_bot_finetuned_mistral_7b.py ├── web_chat_bot_llama3_8b_instruct.py └── web_microsoft-Phi-3-mini-128k-instruct.py /Finetune-Bloom7B-tagger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import bitsandbytes as bnb 5 | import transformers as transformers 6 | from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM 7 | from transformers import BitsAndBytesConfig 8 | from peft import LoraConfig, get_peft_model 9 | from datasets import load_dataset 10 | 11 | 12 | #Setup the model 13 | model_id="bigscience/bloom-1b7" 14 | tokenizer = AutoTokenizer.from_pretrained(model_id) 15 | model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True) 16 | 17 | print(model.get_memory_footprint()) 18 | 19 | ''' 20 | Change the compute dtype 21 | The compute dtype is used to change the dtype that will be used during computation. 22 | For example, hidden states could be in float32 but computation can be set to bf16 for speedups. By default, the compute dtype is set to float32. 23 | 24 | quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16) 25 | 26 | ''' 27 | 28 | ''' 29 | Using NF4 (Normal Float 4) data type 30 | You can also use the NF4 data type, which is a new 4bit datatype adapted for weights that have been initialized using a normal distribution. For that run: 31 | 32 | nf4_config = BitsAndBytesConfig( 33 | load_in_4bit=True, 34 | bnb_4bit_quant_type="nf4", 35 | ) 36 | 37 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config) 38 | ''' 39 | 40 | ''' 41 | Use nested quantization for more memory efficient inference 42 | We also advise users to use the nested quantization technique. This saves more memory at no additional performance - from our empirical observations, 43 | this enables fine-tuning llama-13b model on an NVIDIA-T4 16GB with a sequence length of 1024, batch size of 1 and gradient accumulation steps of 4. 
44 | 45 | double_quant_config = BitsAndBytesConfig( 46 | load_in_4bit=True, 47 | bnb_4bit_use_double_quant=True, 48 | ) 49 | 50 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=double_quant_config) 51 | ''' 52 | 53 | #Freezing the original weights 54 | for param in model.parameters(): 55 | param.requires_grad = False 56 | if param.ndim ==1: 57 | param.data = param.data.to(torch.float32) 58 | model.gradient_checkpointing_enable() 59 | model.enable_input_require_grads() 60 | 61 | class CastOutputToFloat(nn.Sequential): 62 | def forward(self, x): return super().forward(x).to(torch.float32) 63 | model.lm_head = CastOutputToFloat(model.lm_head) 64 | 65 | #Setting up the LoRa Adapters 66 | def print_trainable_parameters(model): 67 | trainable_params = 0 68 | all_param = 0 69 | for _, param in model.named_parameters(): 70 | all_param += param.numel() 71 | if param.requires_grad: 72 | trainable_params += param.numel() 73 | print( 74 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" 75 | ) 76 | 77 | config = LoraConfig( 78 | r=16, 79 | lora_alpha=32, 80 | lora_dropout=0.05, 81 | bias = 'none', 82 | task_type="CAUSAL_LM" 83 | ) 84 | 85 | model = get_peft_model(model, config) 86 | print_trainable_parameters(model) 87 | data = load_dataset("Abirate/english_quotes") 88 | 89 | def merge_colunms(example): 90 | example['prediction'] = example['quote'] + " ->: " + str(example["tags"]) 91 | return example 92 | 93 | data['train'] = data['train'].map(merge_colunms) 94 | print(data['train']["prediction"][:5]) 95 | print(data['train'][0]) 96 | 97 | data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True) 98 | 99 | print(data) 100 | 101 | #Training 102 | trainer = transformers.Trainer( 103 | model=model, 104 | train_dataset=data['train'], 105 | args=transformers.TrainingArguments( 106 | per_gpu_train_batch_size=4, 107 | gradient_accumulation_steps=4, 108 | warmup_steps=100, 109 | max_steps=200, 110 | learning_rate=2e-4, 111 | fp16=True, 112 | logging_steps=1, 113 | output_dir='outputs' 114 | ), 115 | data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False) 116 | ) 117 | 118 | model.config.use_cache = False 119 | trainer.train() 120 | 121 | model.push_to_hub("meetrais/bloom-7b1-lora-tagger", 122 | token="HuggingFace-app-key", 123 | commit_message="basic training", 124 | private=True) -------------------------------------------------------------------------------- /Finetune-meta-Llama-2-7b-hf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import bitsandbytes as bnb 5 | import transformers as transformers 6 | from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM 7 | from transformers import BitsAndBytesConfig 8 | from peft import LoraConfig, get_peft_model 9 | from datasets import load_dataset 10 | 11 | 12 | #Setup the model 13 | model_id="meta-llama/Llama-2-7b-hf" 14 | tokenizer = AutoTokenizer.from_pretrained(model_id) 15 | model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True) 16 | 17 | print(model.get_memory_footprint()) 18 | #Freezing the original weights 19 | for param in model.parameters(): 20 | param.requires_grad = False 21 | if param.ndim ==1: 22 | param.data = param.data.to(torch.float32) 23 | model.gradient_checkpointing_enable() 24 | model.enable_input_require_grads() 25 | 26 | class 
CastOutputToFloat(nn.Sequential):
 27 |     def forward(self, x): return super().forward(x).to(torch.float32)
 28 | model.lm_head = CastOutputToFloat(model.lm_head)
 29 | 
 30 | # Setting up the LoRA adapters
 31 | def print_trainable_parameters(model):
 32 |     trainable_params = 0
 33 |     all_param = 0
 34 |     for _, param in model.named_parameters():
 35 |         all_param += param.numel()
 36 |         if param.requires_grad:
 37 |             trainable_params += param.numel()
 38 |     print(
 39 |         f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
 40 |     )
 41 | 
 42 | config = LoraConfig(
 43 |     r=16,
 44 |     lora_alpha=32,
 45 |     lora_dropout=0.05,
 46 |     bias = 'none',
 47 |     task_type="CAUSAL_LM"
 48 | )
 49 | 
 50 | model = get_peft_model(model, config)
 51 | print_trainable_parameters(model)
 52 | data = load_dataset("Abirate/english_quotes")
 53 | 
 54 | def merge_columns(example):
 55 |     example['prediction'] = example['quote'] + " ->: " + str(example["tags"])
 56 |     return example
 57 | 
 58 | data['train'] = data['train'].map(merge_columns)
 59 | print(data['train']["prediction"][:5])
 60 | print(data['train'][0])
 61 | 
 62 | data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True)
 63 | 
 64 | print(data)
 65 | 
 66 | # Training
 67 | trainer = transformers.Trainer(
 68 |     model=model,
 69 |     train_dataset=data['train'],
 70 |     args=transformers.TrainingArguments(
 71 |         per_device_train_batch_size=4,
 72 |         gradient_accumulation_steps=4,
 73 |         warmup_steps=100,
 74 |         max_steps=200,
 75 |         learning_rate=2e-4,
 76 |         fp16=True,
 77 |         logging_steps=1,
 78 |         output_dir='outputs'
 79 |     ),
 80 |     data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
 81 | )
 82 | 
 83 | model.config.use_cache = False
 84 | trainer.train()
 85 | 
 86 | model.push_to_hub("meetrais/meta-Llama-2-7b-hf-finetuned",
 87 |                   token="HuggingFace-app-key",
 88 |                   commit_message="basic training",
 89 |                   private=True)
 90 | 
--------------------------------------------------------------------------------
/Qwen2.5_3B_GRPO.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import re
  3 | from datasets import load_dataset, Dataset
  4 | from unsloth import FastLanguageModel, PatchFastRL, is_bfloat16_supported
  5 | from trl import GRPOConfig, GRPOTrainer
  6 | 
  7 | PatchFastRL("GRPO", FastLanguageModel)
  8 | torch.distributed.launch=True
  9 | 
 10 | max_seq_length = 256 # Can increase for longer reasoning traces
 11 | lora_rank = 32 # Larger rank = smarter, but slower
 12 | 
 13 | model, tokenizer = FastLanguageModel.from_pretrained(
 14 |     model_name = "Qwen/Qwen2.5-3B-Instruct",
 15 |     max_seq_length = max_seq_length,
 16 |     load_in_4bit = True, # False for LoRA 16bit
 17 |     fast_inference = False, # Disable vLLM fast inference for LoRA training
 18 |     max_lora_rank = lora_rank,
 19 |     gpu_memory_utilization = 0.9, # Reduce if out of memory
 20 | )
 21 | 
 22 | model = FastLanguageModel.get_peft_model(
 23 |     model,
 24 |     r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
 25 |     target_modules = [
 26 |         "q_proj", "k_proj", "v_proj", "o_proj",
 27 |     ], # Remove QKVO if out of memory
 28 |     lora_alpha = lora_rank,
 29 |     use_gradient_checkpointing = "unsloth", # Enable long context finetuning
 30 |     random_state = 3407,
 31 | )
 32 | 
 33 | # Load and prep dataset
 34 | SYSTEM_PROMPT = """
 35 | Respond in the following format:
 36 | <reasoning>
 37 | ...
 38 | </reasoning>
 39 | <answer>
 40 | ...
 41 | </answer>
 42 | """
 43 | 
 44 | XML_COT_FORMAT = """\
 45 | <reasoning>
 46 | {reasoning}
 47 | </reasoning>
 48 | <answer>
 49 | {answer}
 50 | </answer>
 51 | """
 52 | 
 53 | def extract_xml_answer(text: str) -> str:
 54 |     answer = text.split("<answer>")[-1]
 55 |     answer = answer.split("</answer>")[0]
 56 |     return answer.strip()
 57 | 
 58 | def extract_hash_answer(text: str) -> str | None:
 59 |     if "####" not in text:
 60 |         return None
 61 |     return text.split("####")[1].strip()
 62 | 
 63 | # uncomment middle messages for 1-shot prompting
 64 | def get_gsm8k_questions(split = "train") -> Dataset:
 65 |     data = load_dataset('openai/gsm8k', 'main')[split] # type: ignore
 66 |     data = data.map(lambda x: { # type: ignore
 67 |         'prompt': [
 68 |             {'role': 'system', 'content': SYSTEM_PROMPT},
 69 |             {'role': 'user', 'content': x['question']}
 70 |         ],
 71 |         'answer': extract_hash_answer(x['answer'])
 72 |     }) # type: ignore
 73 |     return data # type: ignore
 74 | 
 75 | dataset = get_gsm8k_questions()
 76 | 
 77 | # Reward functions
 78 | def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
 79 |     responses = [completion[0]['content'] for completion in completions]
 80 |     q = prompts[0][-1]['content']
 81 |     extracted_responses = [extract_xml_answer(r) for r in responses]
 82 |     print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
 83 |     return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]
 84 | 
 85 | def int_reward_func(completions, **kwargs) -> list[float]:
 86 |     responses = [completion[0]['content'] for completion in completions]
 87 |     extracted_responses = [extract_xml_answer(r) for r in responses]
 88 |     return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]
 89 | 
 90 | def strict_format_reward_func(completions, **kwargs) -> list[float]:
 91 |     """Reward function that checks if the completion has a specific format."""
 92 |     pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
 93 |     responses = [completion[0]["content"] for completion in completions]
 94 |     matches = [re.match(pattern, r) for r in responses]
 95 |     return [0.5 if match else 0.0 for match in matches]
 96 | 
 97 | def soft_format_reward_func(completions, **kwargs) -> list[float]:
 98 |     """Reward function that checks if the completion has a specific format."""
 99 |     pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
100 |     responses = [completion[0]["content"] for completion in completions]
101 |     matches = [re.match(pattern, r) for r in responses]
102 |     return [0.5 if match else 0.0 for match in matches]
103 | 
104 | def count_xml(text) -> float:
105 |     count = 0.0
106 |     if text.count("<reasoning>\n") == 1:
107 |         count += 0.125
108 |     if text.count("\n</reasoning>\n") == 1:
109 |         count += 0.125
110 |     if text.count("\n<answer>\n") == 1:
111 |         count += 0.125
112 |         count -= len(text.split("\n</answer>\n")[-1])*0.001
113 |     if text.count("\n</answer>") == 1:
114 |         count += 0.125
115 |         count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
116 |     return count
117 | 
118 | def xmlcount_reward_func(completions, **kwargs) -> list[float]:
119 |     contents = [completion[0]["content"] for completion in completions]
120 |     return [count_xml(c) for c in contents]
121 | 
122 | ####### Training #########################
123 | training_args = GRPOConfig(
124 |     use_vllm = False, # disable vLLM to avoid device conflicts
125 |     learning_rate = 5e-6,
126 |     adam_beta1 = 0.9,
127 |     adam_beta2 = 0.99,
128 |     weight_decay = 0.1,
129 |     warmup_ratio = 0.1,
130 |     lr_scheduler_type = "cosine",
131 |     optim = "adamw_8bit",
132 |     logging_steps = 1,
133 |     bf16 = is_bfloat16_supported(),
134 |     fp16 = not is_bfloat16_supported(),
135 |     per_device_train_batch_size = 1,
136 |     gradient_accumulation_steps = 1, # Increase to 4 for smoother training
137 |     num_generations = 8, # Decrease if out of memory
138 |     max_prompt_length = 256,
139 |     max_completion_length = 200,
140 |     # num_train_epochs = 1, # Set to 1 for a full training run
141 |     max_steps = 250,
142 |     save_steps = 250,
143 |     max_grad_norm = 0.1,
144 |     report_to = "none", # Can use Weights & Biases
145 |     output_dir = "outputs",
146 | )
147 | 
148 | trainer = GRPOTrainer(
149 |     model = model,
150 |     processing_class = tokenizer,
151 |     reward_funcs = [
152 |         xmlcount_reward_func,
153 |         soft_format_reward_func,
154 |         strict_format_reward_func,
155 |         int_reward_func,
156 |         correctness_reward_func,
157 |     ],
158 |     args = training_args,
159 |     train_dataset = dataset,
160 | )
161 | trainer.train()
162 | 
163 | ########## Inference without the GRPO-trained LoRA #####################
164 | text = tokenizer.apply_chat_template([
165 |     {"role" : "user", "content" : "How many r's are in strawberry?"},
166 | ], tokenize = False, add_generation_prompt = True)
167 | 
168 | from vllm import SamplingParams
169 | sampling_params = SamplingParams(
170 |     temperature = 0.8,
171 |     top_p = 0.95,
172 |     max_tokens = 1024,
173 | )
174 | output = model.fast_generate(
175 |     [text],
176 |     sampling_params = sampling_params,
177 |     lora_request = None,
178 | )[0].outputs[0].text
179 | 
180 | print(output)
181 | 
182 | # Saving model
183 | model.save_lora("grpo_saved_lora")
184 | 
185 | ########## Inference with the GRPO-trained LoRA #####################
186 | text = tokenizer.apply_chat_template([
187 |     {"role" : "system", "content" : SYSTEM_PROMPT},
188 |     {"role" : "user", "content" : "How many r's are in strawberry?"},
189 | ], tokenize = False, add_generation_prompt = True)
190 | 
191 | from vllm import SamplingParams
192 | sampling_params = SamplingParams(
193 |     temperature = 0.8,
194 |     top_p = 0.95,
195 |     max_tokens = 1024,
196 | )
197 | output = model.fast_generate(
198 |     text,
199 |     sampling_params = sampling_params,
200 |     lora_request = model.load_lora("grpo_saved_lora"),
201 | )[0].outputs[0].text
202 | 
203 | print(output)
204 | 
205 | 
206 | 
207 | 
--------------------------------------------------------------------------------
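Before launching a full GRPO run, it can be worth confirming that a sample completion really matches the <reasoning>/<answer> format that the reward functions in Qwen2.5_3B_GRPO.py score against. The snippet below is a minimal, standalone sanity check (an illustrative sketch, not part of the original script); it duplicates the strict-format regex and the answer-extraction logic from the file above.

import re

# Example completion in the expected format (the trailing newline matters for the strict check).
sample = "<reasoning>\n3 + 4 = 7\n</reasoning>\n<answer>\n7\n</answer>\n"

# Same pattern as strict_format_reward_func above.
strict_pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
print("strict format ok:", bool(re.match(strict_pattern, sample)))  # True

# Same extraction logic as extract_xml_answer above.
extracted = sample.split("<answer>")[-1].split("</answer>")[0].strip()
print("extracted answer:", extracted)  # 7

/README.md: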
--------------------------------------------------------------------------------
 1 | # LLM-Fine-Tuning
 2 | This GitHub repository contains several examples of fine-tuning open-source large language models. It demonstrates how to fine-tune and quantize large language models using parameter-efficient fine-tuning (PEFT) techniques such as LoRA and QLoRA.
 3 | 
 4 | Reference -> https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
--------------------------------------------------------------------------------
/call_finetune_intel_neural_chat7B.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from peft import PeftModel, PeftConfig
  3 | from transformers import AutoModelForCausalLM, AutoTokenizer
  4 | from transformers import BitsAndBytesConfig
  5 | import time
  6 | 
  7 | peft_model_id = "meetrais/finetuned-neural-chat-7b-v3-1"
  8 | config = PeftConfig.from_pretrained(peft_model_id)
  9 | bnb_config = BitsAndBytesConfig(
 10 |     load_in_4bit=True,
 11 |     bnb_4bit_use_double_quant=True,
 12 |     bnb_4bit_quant_type="nf4",
 13 |     bnb_4bit_compute_dtype=torch.bfloat16
 14 | )
 15 | model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=bnb_config, device_map='auto')
 16 | #model = AutoModelForCausalLM.from_pretrained(peft_model_id, load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True, device_map='auto')
 17 | tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
 18 | 
 19 | if tokenizer.pad_token is None:
 20 |     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
 21 | text = "Capital of USA is"
 22 | device = "cuda:0"
 23 | 
 24 | inputs = tokenizer(text, return_tensors="pt").to(device)
 25 | max_new_tokens = 30
 26 | start = time.time()
 27 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)
 28 | end = time.time()
 29 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 30 | 
 31 | tokens_per_second = max_new_tokens/(end-start)
 32 | print(f"Tokens per second: {tokens_per_second}")
 33 | 
--------------------------------------------------------------------------------
/call_finetune_mistral_7b.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from peft import PeftModel, PeftConfig
  3 | from transformers import AutoModelForCausalLM, AutoTokenizer
  4 | from transformers import BitsAndBytesConfig
  5 | 
  6 | peft_model_id = "meetrais/finetuned_mistral_7b"
  7 | config = PeftConfig.from_pretrained(peft_model_id)
  8 | bnb_config = BitsAndBytesConfig(
  9 |     load_in_4bit=True,
 10 |     bnb_4bit_use_double_quant=True,
 11 |     bnb_4bit_quant_type="nf4",
 12 |     bnb_4bit_compute_dtype=torch.bfloat16
 13 | )
 14 | model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=bnb_config, device_map='auto')
 15 | tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
 16 | 
 17 | if tokenizer.pad_token is None:
 18 |     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
 19 | text = "Capital of USA is"
 20 | device = "cuda:0"
 21 | 
 22 | inputs = tokenizer(text, return_tensors="pt").to(device)
 23 | 
 24 | outputs = model.generate(**inputs, max_new_tokens=20)
 25 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
--------------------------------------------------------------------------------
/finetune_Llama-7b_with_only_lora.py:
--------------------------------------------------------------------------------
  1 | import transformers as 
transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | model_id = "mistralai/Mistral-7B-v0.1" 11 | bnb_config = BitsAndBytesConfig( 12 | load_in_4bit=True, 13 | bnb_4bit_use_double_quant=True, 14 | bnb_4bit_quant_type="nf4", 15 | bnb_4bit_compute_dtype=torch.bfloat16, 16 | 17 | ) 18 | 19 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) 20 | 21 | tokenizer = AutoTokenizer.from_pretrained(model_id) 22 | if tokenizer.pad_token is None: 23 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 24 | 25 | config = LoraConfig( 26 | r=16, 27 | lora_alpha=32, 28 | lora_dropout=0.05, 29 | bias = 'none', 30 | task_type="CAUSAL_LM", 31 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"] 32 | ) 33 | 34 | 35 | model = get_peft_model(model, config) 36 | #print_trainable_parameters(model) 37 | 38 | total_parameters = 0 39 | for name, param in model.named_parameters(): 40 | total_parameters += param.numel() 41 | 42 | print(f"Total parameters: {total_parameters}") 43 | 44 | # Freeze the non-Lora parameters 45 | for name, param in model.named_parameters(): 46 | if 'lora' not in name: 47 | print(f'Freezing non-LoRA parameter {name}') 48 | param.requires_grad = False 49 | 50 | #Training 51 | training_arguments = TrainingArguments( 52 | output_dir= "./results", 53 | num_train_epochs= 2, 54 | per_device_train_batch_size= 1, 55 | gradient_accumulation_steps= 1, 56 | optim = "paged_adamw_8bit", 57 | save_steps= 100, 58 | logging_steps= 30, 59 | learning_rate= 2e-4, 60 | weight_decay= 0.001, 61 | fp16= True, 62 | bf16= False, 63 | max_grad_norm= 0.3, 64 | max_steps= -1, 65 | warmup_ratio= 0.3, 66 | group_by_length= True, 67 | lr_scheduler_type= "constant" 68 | ) 69 | # Setting sft parameters 70 | dataset = load_dataset("imdb", split="train") 71 | trainer = SFTTrainer( 72 | train_dataset=dataset, 73 | model=model, 74 | max_seq_length= 20, 75 | dataset_text_field="text", 76 | tokenizer=tokenizer, 77 | args=training_arguments, 78 | packing= False, 79 | ) 80 | 81 | model.config.use_cache = False 82 | trainer.train() 83 | 84 | total_parameters = 0 85 | for name, param in model.named_parameters(): 86 | total_parameters += param.numel() 87 | 88 | print(f"Total parameters after Freeze: {total_parameters}") 89 | -------------------------------------------------------------------------------- /finetune_NousResearch_Nous-Hermes-2-SOLAR-10.7B.py: -------------------------------------------------------------------------------- 1 | import transformers as transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | model_id = "NousResearch/Nous-Hermes-2-SOLAR-10.7B" 11 | tokenizer = AutoTokenizer.from_pretrained(model_id) 12 | if tokenizer.pad_token is None: 13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 14 | text = "Capital of USA is" 15 | device = "cuda:0" 16 | 17 | inputs = tokenizer(text, return_tensors="pt").to(device) 18 | 19 | bnb_config = BitsAndBytesConfig( 20 | load_in_4bit=True, 21 | bnb_4bit_use_double_quant=True, 22 | 
bnb_4bit_quant_type="nf4", 23 | bnb_4bit_compute_dtype=torch.bfloat16, 24 | 25 | ) 26 | 27 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) 28 | 29 | config = LoraConfig( 30 | r=16, 31 | lora_alpha=32, 32 | lora_dropout=0.05, 33 | bias = 'none', 34 | task_type="CAUSAL_LM", 35 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"] 36 | ) 37 | 38 | #print(model) 39 | 40 | model = get_peft_model(model, config) 41 | outputs = model.generate(**inputs, max_new_tokens=30) 42 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 43 | """ 44 | model.push_to_hub("meetrais/finetuned-neural-chat-7b-v3-1", 45 | token="Your-Hugging-Face-Token-Here", 46 | commit_message="basic training", 47 | private=True) 48 | """ 49 | 50 | 51 | -------------------------------------------------------------------------------- /finetune_Yi_34B.py: -------------------------------------------------------------------------------- 1 | import transformers as transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | model_id = "01-ai/Yi-34B" 11 | tokenizer = AutoTokenizer.from_pretrained(model_id) 12 | if tokenizer.pad_token is None: 13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 14 | text = "Capital of USA is" 15 | device = "cuda:0" 16 | 17 | inputs = tokenizer(text, return_tensors="pt").to(device) 18 | 19 | bnb_config = BitsAndBytesConfig( 20 | load_in_4bit=True, 21 | bnb_4bit_use_double_quant=True, 22 | bnb_4bit_quant_type="nf4", 23 | bnb_4bit_compute_dtype=torch.bfloat16 24 | ) 25 | 26 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) 27 | 28 | 29 | config = LoraConfig( 30 | r=16, 31 | lora_alpha=32, 32 | lora_dropout=0.05, 33 | bias = 'none', 34 | task_type="CAUSAL_LM" 35 | ) 36 | model = get_peft_model(model, config) 37 | 38 | 39 | outputs = model.generate(**inputs, max_new_tokens=20) 40 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 41 | """ 42 | model.push_to_hub("meetrais/finetuned_mistral_7b", 43 | token="Your-Hugging-Face-Token-Here", 44 | commit_message="basic training", 45 | private=True) 46 | """ 47 | 48 | 49 | -------------------------------------------------------------------------------- /finetune_intel_neural_chat7B.py: -------------------------------------------------------------------------------- 1 | import transformers as transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | model_id = "Intel/neural-chat-7b-v3-1" 11 | tokenizer = AutoTokenizer.from_pretrained(model_id) 12 | if tokenizer.pad_token is None: 13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 14 | text = "Capital of USA is" 15 | device = "cuda:0" 16 | 17 | inputs = tokenizer(text, return_tensors="pt").to(device) 18 | 19 | bnb_config = BitsAndBytesConfig( 20 | load_in_4bit=True, 21 | bnb_4bit_use_double_quant=True, 22 | bnb_4bit_quant_type="nf4", 23 | bnb_4bit_compute_dtype=torch.bfloat16, 24 | 25 | ) 26 | 27 | model = AutoModelForCausalLM.from_pretrained(model_id, 
quantization_config=bnb_config) 28 | 29 | config = LoraConfig( 30 | r=16, 31 | lora_alpha=32, 32 | lora_dropout=0.05, 33 | bias = 'none', 34 | task_type="CAUSAL_LM", 35 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"] 36 | ) 37 | 38 | #print(model) 39 | 40 | model = get_peft_model(model, config) 41 | outputs = model.generate(**inputs, max_new_tokens=30) 42 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 43 | """ 44 | model.push_to_hub("meetrais/finetuned-neural-chat-7b-v3-1", 45 | token="Your-Hugging-Face-Token-Here", 46 | commit_message="basic training", 47 | private=True) 48 | """ 49 | 50 | 51 | -------------------------------------------------------------------------------- /finetune_llama_3_1_8B-Instruct.py: -------------------------------------------------------------------------------- 1 | import transformers as transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct" 11 | tokenizer = AutoTokenizer.from_pretrained(model_id) 12 | if tokenizer.pad_token is None: 13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 14 | text = "Capital of USA is" 15 | device = "cuda:0" 16 | 17 | inputs = tokenizer(text, return_tensors="pt").to(device) 18 | 19 | bnb_config = BitsAndBytesConfig( 20 | load_in_4bit=True, 21 | bnb_4bit_use_double_quant=True, 22 | bnb_4bit_quant_type="nf4", 23 | bnb_4bit_compute_dtype=torch.bfloat16, 24 | load_in_8bit=False 25 | ) 26 | 27 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) 28 | 29 | config = LoraConfig( 30 | r=16, 31 | lora_alpha=32, 32 | lora_dropout=0.05, 33 | bias = 'none', 34 | task_type="CAUSAL_LM", 35 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"] 36 | ) 37 | 38 | model = get_peft_model(model, config) 39 | outputs = model.generate(**inputs, max_new_tokens=30) 40 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 41 | 42 | model.push_to_hub("meetrais/finetuned_Meta-Llama-3.1-8B-Instruct", 43 | token="Your_HF_Token", 44 | commit_message="basic training", 45 | private=True) 46 | -------------------------------------------------------------------------------- /finetune_meta_llama3_8B_instruct.py: -------------------------------------------------------------------------------- 1 | import transformers as transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | print("Torch version:",torch.__version__) 11 | 12 | print("Is CUDA enabled?",torch.cuda.is_available()) 13 | 14 | model_id = "meta-llama/Meta-Llama-3-8B-Instruct" 15 | tokenizer = AutoTokenizer.from_pretrained(model_id) 16 | if tokenizer.pad_token is None: 17 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 18 | text = "Capital of USA is" 19 | device = "cuda:0" 20 | 21 | inputs = tokenizer(text, return_tensors="pt").to(device) 22 | 23 | bnb_config = BitsAndBytesConfig( 24 | load_in_4bit=True, 25 | bnb_4bit_use_double_quant=True, 26 | bnb_4bit_quant_type="nf4", 27 | 
bnb_4bit_compute_dtype=torch.bfloat16, 28 | load_in_8bit=False 29 | ) 30 | 31 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) 32 | 33 | config = LoraConfig( 34 | r=16, 35 | lora_alpha=32, 36 | lora_dropout=0.05, 37 | bias = 'none', 38 | task_type="CAUSAL_LM", 39 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"] 40 | ) 41 | 42 | model = get_peft_model(model, config) 43 | outputs = model.generate(**inputs, max_new_tokens=30) 44 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 45 | 46 | 47 | model.push_to_hub("meetrais/Meta-Llama-3-8B-Instruct-NIM-LORA", 48 | token="HF-Access-Key", 49 | commit_message="basic training", 50 | private=True) 51 | 52 | 53 | -------------------------------------------------------------------------------- /finetune_microsoft-Phi-3-mini-128k-instruct.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 3 | 4 | torch.random.manual_seed(0) 5 | 6 | model = AutoModelForCausalLM.from_pretrained( 7 | "microsoft/Phi-3-mini-128k-instruct", 8 | device_map="cuda", 9 | torch_dtype="auto", 10 | trust_remote_code=True 11 | ) 12 | tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct") 13 | 14 | messages = [ 15 | {"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."}, 16 | {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}, 17 | {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."}, 18 | {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"}, 19 | ] 20 | 21 | pipe = pipeline( 22 | "text-generation", 23 | model=model, 24 | tokenizer=tokenizer, 25 | ) 26 | 27 | generation_args = { 28 | "max_new_tokens": 500, 29 | "return_full_text": False, 30 | "temperature": 0.6, 31 | "do_sample": False, 32 | } 33 | 34 | output = pipe(messages, **generation_args) 35 | print(output[0]['generated_text']) 36 | -------------------------------------------------------------------------------- /finetune_microsoft-phi-2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | 4 | model_id = "microsoft/phi-2" 5 | tokenizer = AutoTokenizer.from_pretrained(model_id) 6 | 7 | text = "What is Capital of USA?" 
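# Editor's note (hedged sketch): the from_pretrained() call below loads phi-2 in its
# native precision. If GPU memory is tight, a 4-bit NF4 load in the same style as the
# other scripts in this repo is one option. The config below is illustrative only and
# has no effect unless passed as quantization_config=optional_bnb_config in that call.
from transformers import BitsAndBytesConfig
optional_bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)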
8 | inputs = tokenizer(text, return_tensors="pt").to(0) 9 | 10 | model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype="auto", device_map="cuda", trust_remote_code=True) 11 | outputs = model.generate(**inputs, max_new_tokens=50) 12 | 13 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 14 | -------------------------------------------------------------------------------- /finetune_mistral_7b.py: -------------------------------------------------------------------------------- 1 | import transformers as transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | model_id = "mistralai/Mistral-7B-v0.1" 11 | tokenizer = AutoTokenizer.from_pretrained(model_id) 12 | if tokenizer.pad_token is None: 13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 14 | text = "Capital of USA is" 15 | device = "cuda:0" 16 | 17 | inputs = tokenizer(text, return_tensors="pt").to(device) 18 | 19 | bnb_config = BitsAndBytesConfig( 20 | load_in_4bit=True, 21 | bnb_4bit_use_double_quant=True, 22 | bnb_4bit_quant_type="nf4", 23 | bnb_4bit_compute_dtype=torch.bfloat16, 24 | load_in_8bit=False 25 | ) 26 | 27 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) 28 | 29 | """ 30 | #Freezing the original weights 31 | for param in model.parameters(): 32 | param.requires_grad = False 33 | if param.ndim ==1: 34 | param.data = param.data.to(torch.float32) 35 | model.gradient_checkpointing_enable() 36 | model.enable_input_require_grads() 37 | 38 | class CastOutputToFloat(nn.Sequential): 39 | def forward(self, x): return super().forward(x).to(torch.float32) 40 | model.lm_head = CastOutputToFloat(model.lm_head) 41 | 42 | #Setting up the LoRa Adapters 43 | def print_trainable_parameters(model): 44 | trainable_params = 0 45 | all_param = 0 46 | for _, param in model.named_parameters(): 47 | all_param += param.numel() 48 | if param.requires_grad: 49 | trainable_params += param.numel() 50 | print( 51 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" 52 | ) 53 | 54 | config = LoraConfig( 55 | r=16, 56 | lora_alpha=32, 57 | lora_dropout=0.05, 58 | bias = 'none', 59 | task_type="CAUSAL_LM" 60 | ) 61 | 62 | model = get_peft_model(model, config) 63 | print_trainable_parameters(model) 64 | dataset_name = "gathnex/Gath_baize" 65 | dataset = load_dataset(dataset_name, split="train[:1000]") 66 | dataset["chat_sample"][0] 67 | 68 | def merge_colunms(example): 69 | example['prediction'] = example['quote'] + " ->: " + str(example["tags"]) 70 | return example 71 | 72 | 73 | #data['train'] = data['train'].map(merge_colunms) 74 | #print(data['train'][0]) 75 | 76 | #data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True) 77 | 78 | #print(data) 79 | 80 | #Training 81 | training_arguments = TrainingArguments( 82 | output_dir= "./results", 83 | num_train_epochs= 10, 84 | per_device_train_batch_size= 8, 85 | gradient_accumulation_steps= 2, 86 | optim = "paged_adamw_8bit", 87 | save_steps= 100, 88 | logging_steps= 30, 89 | learning_rate= 2e-4, 90 | weight_decay= 0.001, 91 | fp16= True, 92 | bf16= False, 93 | max_grad_norm= 0.3, 94 | max_steps= -1, 95 | warmup_ratio= 0.3, 96 | group_by_length= True, 97 | lr_scheduler_type= "constant" 98 | 
) 99 | # Setting sft parameters 100 | trainer = SFTTrainer( 101 | model=model, 102 | train_dataset=dataset, 103 | max_seq_length= 20, 104 | dataset_text_field="chat_sample", 105 | tokenizer=tokenizer, 106 | args=training_arguments, 107 | packing= False, 108 | ) 109 | 110 | model.config.use_cache = False 111 | trainer.train() 112 | """ 113 | #print(model) 114 | 115 | config = LoraConfig( 116 | r=16, 117 | lora_alpha=32, 118 | lora_dropout=0.05, 119 | bias = 'none', 120 | task_type="CAUSAL_LM", 121 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"] 122 | ) 123 | 124 | model = get_peft_model(model, config) 125 | outputs = model.generate(**inputs, max_new_tokens=30) 126 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 127 | 128 | """ 129 | model.push_to_hub("meetrais/finetuned_mistral_7b", 130 | token="Your-Hugging-Face-Token-Here", 131 | commit_message="basic training", 132 | private=True) 133 | 134 | """ 135 | -------------------------------------------------------------------------------- /finetune_starling-LM-7B-alpha.py: -------------------------------------------------------------------------------- 1 | import transformers as transformers 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments 3 | from trl import SFTTrainer 4 | from transformers import BitsAndBytesConfig 5 | import torch 6 | import torch.nn as nn 7 | from peft import LoraConfig, get_peft_model 8 | from datasets import load_dataset 9 | 10 | model_id = "berkeley-nest/Starling-LM-7B-alpha" 11 | tokenizer = AutoTokenizer.from_pretrained(model_id) 12 | if tokenizer.pad_token is None: 13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 14 | text = "Capital of USA is" 15 | device = "cuda:0" 16 | 17 | inputs = tokenizer(text, return_tensors="pt").to(device) 18 | 19 | bnb_config = BitsAndBytesConfig( 20 | load_in_4bit=True, 21 | bnb_4bit_use_double_quant=True, 22 | bnb_4bit_quant_type="nf4", 23 | bnb_4bit_compute_dtype=torch.bfloat16, 24 | load_in_8bit=False 25 | ) 26 | 27 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) 28 | 29 | """ 30 | #Freezing the original weights 31 | for param in model.parameters(): 32 | param.requires_grad = False 33 | if param.ndim ==1: 34 | param.data = param.data.to(torch.float32) 35 | model.gradient_checkpointing_enable() 36 | model.enable_input_require_grads() 37 | 38 | class CastOutputToFloat(nn.Sequential): 39 | def forward(self, x): return super().forward(x).to(torch.float32) 40 | model.lm_head = CastOutputToFloat(model.lm_head) 41 | 42 | #Setting up the LoRa Adapters 43 | def print_trainable_parameters(model): 44 | trainable_params = 0 45 | all_param = 0 46 | for _, param in model.named_parameters(): 47 | all_param += param.numel() 48 | if param.requires_grad: 49 | trainable_params += param.numel() 50 | print( 51 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" 52 | ) 53 | 54 | config = LoraConfig( 55 | r=16, 56 | lora_alpha=32, 57 | lora_dropout=0.05, 58 | bias = 'none', 59 | task_type="CAUSAL_LM" 60 | ) 61 | 62 | model = get_peft_model(model, config) 63 | print_trainable_parameters(model) 64 | dataset_name = "gathnex/Gath_baize" 65 | dataset = load_dataset(dataset_name, split="train[:1000]") 66 | dataset["chat_sample"][0] 67 | 68 | def merge_colunms(example): 69 | example['prediction'] = example['quote'] + " ->: " + str(example["tags"]) 70 | return example 71 | 72 | 73 | #data['train'] = 
data['train'].map(merge_colunms) 74 | #print(data['train'][0]) 75 | 76 | #data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True) 77 | 78 | #print(data) 79 | 80 | #Training 81 | training_arguments = TrainingArguments( 82 | output_dir= "./results", 83 | num_train_epochs= 10, 84 | per_device_train_batch_size= 8, 85 | gradient_accumulation_steps= 2, 86 | optim = "paged_adamw_8bit", 87 | save_steps= 100, 88 | logging_steps= 30, 89 | learning_rate= 2e-4, 90 | weight_decay= 0.001, 91 | fp16= True, 92 | bf16= False, 93 | max_grad_norm= 0.3, 94 | max_steps= -1, 95 | warmup_ratio= 0.3, 96 | group_by_length= True, 97 | lr_scheduler_type= "constant" 98 | ) 99 | # Setting sft parameters 100 | trainer = SFTTrainer( 101 | model=model, 102 | train_dataset=dataset, 103 | max_seq_length= 20, 104 | dataset_text_field="chat_sample", 105 | tokenizer=tokenizer, 106 | args=training_arguments, 107 | packing= False, 108 | ) 109 | 110 | model.config.use_cache = False 111 | trainer.train() 112 | """ 113 | config = LoraConfig( 114 | r=16, 115 | lora_alpha=32, 116 | lora_dropout=0.05, 117 | bias = 'none', 118 | task_type="CAUSAL_LM", 119 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"] 120 | ) 121 | 122 | #print(model) 123 | model = get_peft_model(model, config) 124 | outputs = model.generate(**inputs, max_new_tokens=30) 125 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 126 | 127 | """ 128 | model.push_to_hub("meetrais/finetuned_mistral_7b", 129 | token="Your-Hugging-Face-Token-Here", 130 | commit_message="basic training", 131 | private=True) 132 | 133 | """ 134 | -------------------------------------------------------------------------------- /web_chat_bot_finetuned_mistral_7b.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | from threading import Thread 3 | import argparse 4 | import os 5 | import torch 6 | import gradio as gr 7 | from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer 8 | from peft import PeftModel, PeftConfig 9 | from transformers import BitsAndBytesConfig 10 | 11 | peft_model_id = "meetrais/finetuned_mistral_7b" 12 | config = PeftConfig.from_pretrained(peft_model_id) 13 | bnb_config = BitsAndBytesConfig( 14 | load_in_4bit=True, 15 | bnb_4bit_use_double_quant=True, 16 | bnb_4bit_quant_type="nf4", 17 | bnb_4bit_compute_dtype=torch.bfloat16 18 | ) 19 | model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=bnb_config, device_map='auto') 20 | tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path) 21 | 22 | if tokenizer.pad_token is None: 23 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 24 | 25 | usingAdapter = True 26 | 27 | device = "cuda:0" 28 | # Function to run the text generation process 29 | def run_generation(user_text, top_p, temperature, top_k, max_new_tokens): 30 | #template = "### Text: {}\n### The tone is:\n" 31 | #model_inputs = tokenizer(template.format(user_text) if usingAdapter else user_text, return_tensors="pt") 32 | #model_inputs = model_inputs.to(device) 33 | model_inputs= tokenizer(user_text, return_tensors="pt").to(device) 34 | 35 | # Generate text in a separate thread 36 | streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True) 37 | generate_kwargs = dict( 38 | **model_inputs, 39 | max_new_tokens=max_new_tokens, 40 | pad_token_id=tokenizer.eos_token_id, 41 | streamer=streamer, 42 | ) 43 | 
thread = Thread(target=model.generate, kwargs=generate_kwargs) 44 | thread.start() 45 | 46 | # Retrieve and yield the generated text 47 | model_output = "" 48 | for new_text in streamer: 49 | model_output += new_text 50 | yield model_output 51 | return model_output 52 | 53 | # Gradio UI setup 54 | with gr.Blocks() as demo: 55 | with gr.Row(): 56 | with gr.Column(scale=4): 57 | user_text = gr.Textbox(placeholder="Write your question here", label="User input") 58 | model_output = gr.Textbox(label="Model output", lines=10, interactive=False) 59 | button_submit = gr.Button(value="Submit") 60 | 61 | with gr.Column(scale=1): 62 | max_new_tokens = gr.Slider(minimum=1, maximum=1000, value=250, step=1, label="Max New Tokens") 63 | top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)") 64 | top_k = gr.Slider(minimum=1, maximum=50, value=50, step=1, label="Top-k") 65 | temperature = gr.Slider(minimum=0.1, maximum=5.0, value=0.8, step=0.1, label="Temperature") 66 | 67 | user_text.submit(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output) 68 | button_submit.click(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output) 69 | 70 | demo.queue(max_size=32).launch(server_port=8082) 71 | -------------------------------------------------------------------------------- /web_chat_bot_llama3_8b_instruct.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | import torch 3 | import gradio as gr 4 | 5 | device = "cuda:0" 6 | # Function to run the text generation process 7 | def run_generation(user_text, top_p, temperature, top_k, max_new_tokens): 8 | model_id = "meta-llama/Meta-Llama-3-8B-Instruct" 9 | device = "cuda:0" 10 | pipeline = transformers.pipeline( 11 | "text-generation", 12 | model=model_id, 13 | model_kwargs={"torch_dtype": torch.bfloat16}, 14 | device=device, 15 | ) 16 | 17 | messages = [ 18 | {"role": "system", "content": "You are a helpfull assistant."}, 19 | {"role": "user", "content": user_text}, 20 | ] 21 | 22 | prompt = pipeline.tokenizer.apply_chat_template( 23 | messages, 24 | tokenize=False, 25 | add_generation_prompt=True 26 | ) 27 | 28 | terminators = [ 29 | pipeline.tokenizer.eos_token_id, 30 | pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>") 31 | ] 32 | 33 | outputs = pipeline( 34 | prompt, 35 | max_new_tokens=max_new_tokens, 36 | eos_token_id=terminators, 37 | do_sample=True, 38 | temperature=temperature, 39 | top_p=top_p, 40 | top_k=top_k 41 | ) 42 | return outputs[0]["generated_text"][len(prompt):] 43 | 44 | # Gradio UI setup 45 | with gr.Blocks() as demo: 46 | with gr.Row(): 47 | with gr.Column(scale=4): 48 | user_text = gr.Textbox(placeholder="Write your question here", label="User input") 49 | model_output = gr.Textbox(label="Model output", lines=10, interactive=False) 50 | button_submit = gr.Button(value="Submit") 51 | 52 | with gr.Column(scale=1): 53 | max_new_tokens = gr.Slider(minimum=1, maximum=1000, value=250, step=1, label="Max New Tokens") 54 | top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)") 55 | top_k = gr.Slider(minimum=1, maximum=50, value=50, step=1, label="Top-k") 56 | temperature = gr.Slider(minimum=0.1, maximum=5.0, value=0.8, step=0.1, label="Temperature") 57 | 58 | user_text.submit(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output) 59 | button_submit.click(run_generation, [user_text, top_p, temperature, 
top_k, max_new_tokens], model_output) 60 | 61 | demo.queue(max_size=32).launch(server_port=8082) -------------------------------------------------------------------------------- /web_microsoft-Phi-3-mini-128k-instruct.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 3 | import gradio as gr 4 | 5 | device = "cuda:0" 6 | # Function to run the text generation process 7 | def run_generation(user_text, top_p, temperature, top_k, max_new_tokens): 8 | torch.random.manual_seed(0) 9 | 10 | model = AutoModelForCausalLM.from_pretrained( 11 | "microsoft/Phi-3-mini-128k-instruct", 12 | device_map="cuda", 13 | torch_dtype="auto", 14 | trust_remote_code=True 15 | ) 16 | tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct") 17 | 18 | messages = [ 19 | {"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."}, 20 | {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}, 21 | {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."}, 22 | {"role": "user", "content": "{0}".format(user_text)}, 23 | ] 24 | 25 | pipe = pipeline( 26 | "text-generation", 27 | model=model, 28 | tokenizer=tokenizer, 29 | ) 30 | 31 | generation_args = { 32 | "max_new_tokens": max_new_tokens, 33 | "return_full_text": False, 34 | "temperature": temperature, 35 | "do_sample": False, 36 | } 37 | 38 | output = pipe(messages, **generation_args) 39 | return output[0]['generated_text'] 40 | 41 | # Gradio UI setup 42 | with gr.Blocks() as demo: 43 | with gr.Row(): 44 | with gr.Column(scale=4): 45 | user_text = gr.Textbox(placeholder="Write your question here", label="User input") 46 | model_output = gr.Textbox(label="Model output", lines=10, interactive=False) 47 | button_submit = gr.Button(value="Submit") 48 | 49 | with gr.Column(scale=1): 50 | max_new_tokens = gr.Slider(minimum=1, maximum=1000, value=250, step=1, label="Max New Tokens") 51 | top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)") 52 | top_k = gr.Slider(minimum=1, maximum=50, value=50, step=1, label="Top-k") 53 | temperature = gr.Slider(minimum=0.1, maximum=5.0, value=0.8, step=0.1, label="Temperature") 54 | 55 | user_text.submit(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output) 56 | button_submit.click(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output) 57 | 58 | demo.queue(max_size=32).launch(server_port=8082) --------------------------------------------------------------------------------
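
Several of the scripts above push LoRA adapters to the Hugging Face Hub and later reload them with PeftConfig/PeftModel for inference. A common follow-up step is folding a trained adapter back into its base model so it can be served without the peft wrapper. The sketch below shows one way to do that with the peft API; the adapter id is taken from the scripts above and is assumed to be accessible, and the output directory is a placeholder.

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

adapter_id = "meetrais/finetuned_mistral_7b"  # any adapter pushed by the scripts above
config = PeftConfig.from_pretrained(adapter_id)

# Load the base model in half precision; merging into a quantized (4-bit/8-bit) base
# is generally not recommended.
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, adapter_id)

# Fold the LoRA deltas into the base weights and drop the adapter modules.
merged_model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
merged_model.save_pretrained("merged-model")  # placeholder output directory
tokenizer.save_pretrained("merged-model")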