├── Finetune-Bloom7B-tagger.py
├── Finetune-meta-Llama-2-7b-hf.py
├── Qwen2.5_3B_GRPO.py
├── README.md
├── call_finetune_intel_neural_chat7B.py
├── call_finetune_mistral_7b.py
├── finetune_Llama-7b_with_only_lora.py
├── finetune_NousResearch_Nous-Hermes-2-SOLAR-10.7B.py
├── finetune_Yi_34B.py
├── finetune_intel_neural_chat7B.py
├── finetune_llama_3_1_8B-Instruct.py
├── finetune_meta_llama3_8B_instruct.py
├── finetune_microsoft-Phi-3-mini-128k-instruct.py
├── finetune_microsoft-phi-2.py
├── finetune_mistral_7b.py
├── finetune_starling-LM-7B-alpha.py
├── web_chat_bot_finetuned_mistral_7b.py
├── web_chat_bot_llama3_8b_instruct.py
└── web_microsoft-Phi-3-mini-128k-instruct.py
/Finetune-Bloom7B-tagger.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import torch.nn as nn
4 | import bitsandbytes as bnb
5 | import transformers as transformers
6 | from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
7 | from transformers import BitsAndBytesConfig
8 | from peft import LoraConfig, get_peft_model
9 | from datasets import load_dataset
10 |
11 |
 12 | # Set up the model
13 | model_id="bigscience/bloom-1b7"
14 | tokenizer = AutoTokenizer.from_pretrained(model_id)
15 | model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True)
16 |
17 | print(model.get_memory_footprint())
18 |
19 | '''
20 | Change the compute dtype
21 | The compute dtype is used to change the dtype that will be used during computation.
22 | For example, hidden states could be in float32 but computation can be set to bf16 for speedups. By default, the compute dtype is set to float32.
23 |
24 | quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
25 |
26 | '''
27 |
28 | '''
29 | Using NF4 (Normal Float 4) data type
30 | You can also use the NF4 data type, which is a new 4bit datatype adapted for weights that have been initialized using a normal distribution. For that run:
31 |
32 | nf4_config = BitsAndBytesConfig(
33 | load_in_4bit=True,
34 | bnb_4bit_quant_type="nf4",
35 | )
36 |
37 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)
38 | '''
39 |
40 | '''
41 | Use nested quantization for more memory efficient inference
 42 | We also advise users to use the nested quantization technique. This saves more memory at no additional performance cost - from our empirical observations,
 43 | this enables fine-tuning a llama-13b model on an NVIDIA T4 16GB with a sequence length of 1024, a batch size of 1 and gradient accumulation steps of 4.
44 |
45 | double_quant_config = BitsAndBytesConfig(
46 | load_in_4bit=True,
47 | bnb_4bit_use_double_quant=True,
48 | )
49 |
50 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=double_quant_config)
51 | '''
52 |
53 | #Freezing the original weights
54 | for param in model.parameters():
55 | param.requires_grad = False
56 | if param.ndim ==1:
57 | param.data = param.data.to(torch.float32)
58 | model.gradient_checkpointing_enable()
59 | model.enable_input_require_grads()
60 |
61 | class CastOutputToFloat(nn.Sequential):
62 | def forward(self, x): return super().forward(x).to(torch.float32)
63 | model.lm_head = CastOutputToFloat(model.lm_head)
64 |
 65 | # Setting up the LoRA adapters
66 | def print_trainable_parameters(model):
67 | trainable_params = 0
68 | all_param = 0
69 | for _, param in model.named_parameters():
70 | all_param += param.numel()
71 | if param.requires_grad:
72 | trainable_params += param.numel()
73 | print(
74 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
75 | )
76 |
77 | config = LoraConfig(
78 | r=16,
79 | lora_alpha=32,
80 | lora_dropout=0.05,
81 | bias = 'none',
82 | task_type="CAUSAL_LM"
83 | )
84 |
85 | model = get_peft_model(model, config)
86 | print_trainable_parameters(model)
87 | data = load_dataset("Abirate/english_quotes")
88 |
 89 | def merge_columns(example):
90 | example['prediction'] = example['quote'] + " ->: " + str(example["tags"])
91 | return example
92 |
 93 | data['train'] = data['train'].map(merge_columns)
94 | print(data['train']["prediction"][:5])
95 | print(data['train'][0])
96 |
97 | data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True)
98 |
99 | print(data)
100 |
101 | #Training
102 | trainer = transformers.Trainer(
103 | model=model,
104 | train_dataset=data['train'],
105 | args=transformers.TrainingArguments(
106 |         per_device_train_batch_size=4,
107 | gradient_accumulation_steps=4,
108 | warmup_steps=100,
109 | max_steps=200,
110 | learning_rate=2e-4,
111 | fp16=True,
112 | logging_steps=1,
113 | output_dir='outputs'
114 | ),
115 | data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
116 | )
117 |
118 | model.config.use_cache = False
119 | trainer.train()
120 |
121 | model.push_to_hub("meetrais/bloom-7b1-lora-tagger",
122 | token="HuggingFace-app-key",
123 | commit_message="basic training",
124 | private=True)
--------------------------------------------------------------------------------
/Finetune-meta-Llama-2-7b-hf.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import torch.nn as nn
4 | import bitsandbytes as bnb
5 | import transformers as transformers
6 | from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
7 | from transformers import BitsAndBytesConfig
8 | from peft import LoraConfig, get_peft_model
9 | from datasets import load_dataset
10 |
11 |
 12 | # Set up the model
13 | model_id="meta-llama/Llama-2-7b-hf"
14 | tokenizer = AutoTokenizer.from_pretrained(model_id)
15 | model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True)
16 |
17 | print(model.get_memory_footprint())
18 | #Freezing the original weights
19 | for param in model.parameters():
20 | param.requires_grad = False
21 | if param.ndim ==1:
22 | param.data = param.data.to(torch.float32)
23 | model.gradient_checkpointing_enable()
24 | model.enable_input_require_grads()
25 |
26 | class CastOutputToFloat(nn.Sequential):
27 | def forward(self, x): return super().forward(x).to(torch.float32)
28 | model.lm_head = CastOutputToFloat(model.lm_head)
29 |
 30 | # Setting up the LoRA adapters
31 | def print_trainable_parameters(model):
32 | trainable_params = 0
33 | all_param = 0
34 | for _, param in model.named_parameters():
35 | all_param += param.numel()
36 | if param.requires_grad:
37 | trainable_params += param.numel()
38 | print(
39 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
40 | )
41 |
42 | config = LoraConfig(
43 | r=16,
44 | lora_alpha=32,
45 | lora_dropout=0.05,
46 | bias = 'none',
47 | task_type="CAUSAL_LM"
48 | )
49 |
50 | model = get_peft_model(model, config)
51 | print_trainable_parameters(model)
52 | data = load_dataset("Abirate/english_quotes")
53 |
 54 | def merge_columns(example):
55 | example['prediction'] = example['quote'] + " ->: " + str(example["tags"])
56 | return example
57 |
 58 | data['train'] = data['train'].map(merge_columns)
59 | print(data['train']["prediction"][:5])
60 | print(data['train'][0])
61 |
62 | data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True)
63 |
64 | print(data)
65 |
66 | #Training
67 | trainer = transformers.Trainer(
68 | model=model,
69 | train_dataset=data['train'],
70 | args=transformers.TrainingArguments(
 71 |         per_device_train_batch_size=4,
72 | gradient_accumulation_steps=4,
73 | warmup_steps=100,
74 | max_steps=200,
75 | learning_rate=2e-4,
76 | fp16=True,
77 | logging_steps=1,
78 | output_dir='outputs'
79 | ),
80 | data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
81 | )
82 |
83 | model.config.use_cache = False
84 | trainer.train()
85 |
86 | model.push_to_hub("meetrais/meta-Llama-2-7b-hf-finetuned",
87 | token="HuggingFace-app-key",
88 | commit_message="basic training",
89 | private=True)
90 |
--------------------------------------------------------------------------------
/Qwen2.5_3B_GRPO.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import re
3 | from datasets import load_dataset, Dataset
4 | from unsloth import FastLanguageModel, PatchFastRL, is_bfloat16_supported
5 | from trl import GRPOConfig, GRPOTrainer
6 |
7 | PatchFastRL("GRPO", FastLanguageModel)
8 | torch.distributed.launch=True
9 |
10 | max_seq_length = 256 # Can increase for longer reasoning traces
11 | lora_rank = 32 # Larger rank = smarter, but slower
12 |
13 | model, tokenizer = FastLanguageModel.from_pretrained(
14 | model_name = "Qwen/Qwen2.5-3B-Instruct",
15 | max_seq_length = max_seq_length,
16 | load_in_4bit = True, # False for LoRA 16bit
17 | fast_inference = False, # Disable vLLM fast inference for LoRA training
18 | max_lora_rank = lora_rank,
19 | gpu_memory_utilization = 0.9, # Reduce if out of memory,
20 | )
21 |
22 | model = FastLanguageModel.get_peft_model(
23 | model,
24 | r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
25 | target_modules = [
26 | "q_proj", "k_proj", "v_proj", "o_proj",
27 | ], # Remove QKVO if out of memory
28 | lora_alpha = lora_rank,
29 | use_gradient_checkpointing = "unsloth", # Enable long context finetuning
30 | random_state = 3407,
31 | )
32 |
33 | # Load and prep dataset
 34 | SYSTEM_PROMPT = """
 35 | Respond in the following format:
 36 | <reasoning>
 37 | ...
 38 | </reasoning>
 39 | <answer>
 40 | ...
 41 | </answer>
 42 | """
43 |
 44 | XML_COT_FORMAT = """\
 45 | <reasoning>
 46 | {reasoning}
 47 | </reasoning>
 48 | <answer>
 49 | {answer}
 50 | </answer>
 51 | """
52 |
53 | def extract_xml_answer(text: str) -> str:
 54 |     answer = text.split("<answer>")[-1]
 55 |     answer = answer.split("</answer>")[0]
56 | return answer.strip()
57 |
58 | def extract_hash_answer(text: str) -> str | None:
59 | if "####" not in text:
60 | return None
61 | return text.split("####")[1].strip()
62 |
63 | # uncomment middle messages for 1-shot prompting
64 | def get_gsm8k_questions(split = "train") -> Dataset:
65 | data = load_dataset('openai/gsm8k', 'main')[split] # type: ignore
66 | data = data.map(lambda x: { # type: ignore
67 | 'prompt': [
68 | {'role': 'system', 'content': SYSTEM_PROMPT},
69 | {'role': 'user', 'content': x['question']}
70 | ],
71 | 'answer': extract_hash_answer(x['answer'])
72 | }) # type: ignore
73 | return data # type: ignore
74 |
75 | dataset = get_gsm8k_questions()
76 |
77 | # Reward functions
78 | def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
79 | responses = [completion[0]['content'] for completion in completions]
80 | q = prompts[0][-1]['content']
81 | extracted_responses = [extract_xml_answer(r) for r in responses]
82 | print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
83 | return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]
84 |
85 | def int_reward_func(completions, **kwargs) -> list[float]:
86 | responses = [completion[0]['content'] for completion in completions]
87 | extracted_responses = [extract_xml_answer(r) for r in responses]
88 | return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]
89 |
90 | def strict_format_reward_func(completions, **kwargs) -> list[float]:
91 | """Reward function that checks if the completion has a specific format."""
 92 |     pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
93 | responses = [completion[0]["content"] for completion in completions]
94 | matches = [re.match(pattern, r) for r in responses]
95 | return [0.5 if match else 0.0 for match in matches]
96 |
97 | def soft_format_reward_func(completions, **kwargs) -> list[float]:
98 | """Reward function that checks if the completion has a specific format."""
 99 |     pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
100 | responses = [completion[0]["content"] for completion in completions]
101 | matches = [re.match(pattern, r) for r in responses]
102 | return [0.5 if match else 0.0 for match in matches]
103 |
104 | def count_xml(text) -> float:
105 | count = 0.0
106 |     if text.count("<reasoning>\n") == 1:
107 |         count += 0.125
108 |     if text.count("\n</reasoning>\n") == 1:
109 |         count += 0.125
110 |     if text.count("\n<answer>\n") == 1:
111 |         count += 0.125
112 |         count -= len(text.split("\n</answer>\n")[-1])*0.001
113 |     if text.count("\n</answer>") == 1:
114 |         count += 0.125
115 |         count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
116 | return count
117 |
118 | def xmlcount_reward_func(completions, **kwargs) -> list[float]:
119 | contents = [completion[0]["content"] for completion in completions]
120 | return [count_xml(c) for c in contents]
121 |
122 | ####### Training #########################
123 | training_args = GRPOConfig(
124 | use_vllm = False, # disable vLLM to avoid device conflicts
125 | learning_rate = 5e-6,
126 | adam_beta1 = 0.9,
127 | adam_beta2 = 0.99,
128 | weight_decay = 0.1,
129 | warmup_ratio = 0.1,
130 | lr_scheduler_type = "cosine",
131 | optim = "adamw_8bit",
132 | logging_steps = 1,
133 | bf16 = is_bfloat16_supported(),
134 | fp16 = not is_bfloat16_supported(),
135 | per_device_train_batch_size = 1,
136 | gradient_accumulation_steps = 1, # Increase to 4 for smoother training
137 | num_generations = 8, # Decrease if out of memory
138 | max_prompt_length = 256,
139 | max_completion_length = 200,
140 | # num_train_epochs = 1, # Set to 1 for a full training run
141 | max_steps = 250,
142 | save_steps = 250,
143 | max_grad_norm = 0.1,
144 | report_to = "none", # Can use Weights & Biases
145 | output_dir = "outputs",
146 | )
147 |
148 | trainer = GRPOTrainer(
149 | model = model,
150 | processing_class = tokenizer,
151 | reward_funcs = [
152 | xmlcount_reward_func,
153 | soft_format_reward_func,
154 | strict_format_reward_func,
155 | int_reward_func,
156 | correctness_reward_func,
157 | ],
158 | args = training_args,
159 | train_dataset = dataset,
160 | )
161 | trainer.train()
162 |
163 | ########## Inference without GRPO training ##########
164 | text = tokenizer.apply_chat_template([
165 | {"role" : "user", "content" : "How many r's are in strawberry?"},
166 | ], tokenize = False, add_generation_prompt = True)
167 |
168 | from vllm import SamplingParams
169 | sampling_params = SamplingParams(
170 | temperature = 0.8,
171 | top_p = 0.95,
172 | max_tokens = 1024,
173 | )
174 | output = model.fast_generate(
175 | [text],
176 | sampling_params = sampling_params,
177 | lora_request = None,
178 | )[0].outputs[0].text
179 |
180 | print(output)
181 |
182 | #Saving Model
183 | model.save_lora("grpo_saved_lora")
184 |
185 | ########## Inference with the GRPO-trained LoRA ##########
186 | text = tokenizer.apply_chat_template([
187 | {"role" : "system", "content" : SYSTEM_PROMPT},
188 | {"role" : "user", "content" : "How many r's are in strawberry?"},
189 | ], tokenize = False, add_generation_prompt = True)
190 |
191 | from vllm import SamplingParams
192 | sampling_params = SamplingParams(
193 | temperature = 0.8,
194 | top_p = 0.95,
195 | max_tokens = 1024,
196 | )
197 | output = model.fast_generate(
198 | text,
199 | sampling_params = sampling_params,
200 | lora_request = model.load_lora("grpo_saved_lora"),
201 | )[0].outputs[0].text
202 |
203 | print(output)
204 |
205 |
206 |
207 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LLM-Fine-Tuning
  2 | This repository contains several examples of fine-tuning open-source large language models. It demonstrates how to quantize and fine-tune LLMs with parameter-efficient fine-tuning (PEFT) techniques such as LoRA and QLoRA; a minimal sketch of the common pattern is included below.
3 |
4 | Reference -> https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
5 |
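  6 | Most scripts here follow the same basic QLoRA recipe: load the base model with 4-bit NF4 quantization via `BitsAndBytesConfig`, then attach LoRA adapters with `peft` so that only the low-rank adapter weights are trained. The sketch below illustrates that shared pattern; the model id and LoRA hyperparameters are placeholders rather than a specific script from this repo.
  7 |
  8 | ```python
  9 | import torch
 10 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 11 | from peft import LoraConfig, get_peft_model
 12 |
 13 | model_id = "mistralai/Mistral-7B-v0.1"  # placeholder: any causal LM used in the scripts above
 14 |
 15 | # QLoRA-style loading: 4-bit NF4 weights with nested (double) quantization
 16 | bnb_config = BitsAndBytesConfig(
 17 |     load_in_4bit=True,
 18 |     bnb_4bit_use_double_quant=True,
 19 |     bnb_4bit_quant_type="nf4",
 20 |     bnb_4bit_compute_dtype=torch.bfloat16,
 21 | )
 22 |
 23 | tokenizer = AutoTokenizer.from_pretrained(model_id)
 24 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
 25 |
 26 | # Attach LoRA adapters; only these low-rank matrices receive gradients during fine-tuning
 27 | lora_config = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM")
 28 | model = get_peft_model(model, lora_config)
 29 | model.print_trainable_parameters()
 30 | ```
 31 |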
--------------------------------------------------------------------------------
/call_finetune_intel_neural_chat7B.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from peft import PeftModel, PeftConfig
3 | from transformers import AutoModelForCausalLM, AutoTokenizer
4 | from transformers import BitsAndBytesConfig
5 | import time
6 |
7 | peft_model_id = "meetrais/finetuned-neural-chat-7b-v3-1"
8 | config = PeftConfig.from_pretrained(peft_model_id)
9 | bnb_config = BitsAndBytesConfig(
10 | load_in_4bit=True,
11 | bnb_4bit_use_double_quant=True,
12 | bnb_4bit_quant_type="nf4",
13 | bnb_4bit_compute_dtype=torch.bfloat16
14 | )
15 | model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=bnb_config, device_map='auto')
16 | #model = AutoModelForCausalLM.from_pretrained(peft_model_id, load_in_4bit=True,bnb_4bit_compute_type=torch.float16, bnb_4bit_use_double_quant=True, device_map='auto')
17 | tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
18 |
19 | if tokenizer.pad_token is None:
20 | tokenizer.add_special_tokens({'pad_token': '[PAD]'})
21 | text = "Capital of USA is"
22 | device = "cuda:0"
23 |
24 | inputs = tokenizer(text, return_tensors="pt").to(device)
25 | max_new_tokens=30
26 | start = time.time()
27 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)
28 | end = time.time()
29 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
30 |
 31 | tokens_per_second = max_new_tokens/(end-start)
 32 | print(f"Tokens per second: {tokens_per_second}")
33 |
--------------------------------------------------------------------------------
/call_finetune_mistral_7b.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from peft import PeftModel, PeftConfig
3 | from transformers import AutoModelForCausalLM, AutoTokenizer
4 | from transformers import BitsAndBytesConfig
5 |
6 | peft_model_id = "meetrais/finetuned_mistral_7b"
7 | config = PeftConfig.from_pretrained(peft_model_id)
8 | bnb_config = BitsAndBytesConfig(
9 | load_in_4bit=True,
10 | bnb_4bit_use_double_quant=True,
11 | bnb_4bit_quant_type="nf4",
12 | bnb_4bit_compute_dtype=torch.bfloat16
13 | )
14 | model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=bnb_config, device_map='auto')
15 | tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
16 |
17 | if tokenizer.pad_token is None:
18 | tokenizer.add_special_tokens({'pad_token': '[PAD]'})
19 | text = "Capital of USA is"
20 | device = "cuda:0"
21 |
22 | inputs = tokenizer(text, return_tensors="pt").to(device)
23 |
24 | outputs = model.generate(**inputs, max_new_tokens=20)
25 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
--------------------------------------------------------------------------------
/finetune_Llama-7b_with_only_lora.py:
--------------------------------------------------------------------------------
1 | import transformers as transformers
2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
3 | from trl import SFTTrainer
4 | from transformers import BitsAndBytesConfig
5 | import torch
6 | import torch.nn as nn
7 | from peft import LoraConfig, get_peft_model
8 | from datasets import load_dataset
9 |
10 | model_id = "mistralai/Mistral-7B-v0.1"
11 | bnb_config = BitsAndBytesConfig(
12 | load_in_4bit=True,
13 | bnb_4bit_use_double_quant=True,
14 | bnb_4bit_quant_type="nf4",
15 | bnb_4bit_compute_dtype=torch.bfloat16,
16 |
17 | )
18 |
19 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
20 |
21 | tokenizer = AutoTokenizer.from_pretrained(model_id)
22 | if tokenizer.pad_token is None:
23 | tokenizer.add_special_tokens({'pad_token': '[PAD]'})
24 |
25 | config = LoraConfig(
26 | r=16,
27 | lora_alpha=32,
28 | lora_dropout=0.05,
29 | bias = 'none',
30 | task_type="CAUSAL_LM",
31 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"]
32 | )
33 |
34 |
35 | model = get_peft_model(model, config)
36 | #print_trainable_parameters(model)
37 |
38 | total_parameters = 0
39 | for name, param in model.named_parameters():
40 | total_parameters += param.numel()
41 |
42 | print(f"Total parameters: {total_parameters}")
43 |
44 | # Freeze the non-Lora parameters
45 | for name, param in model.named_parameters():
46 | if 'lora' not in name:
47 | print(f'Freezing non-LoRA parameter {name}')
48 | param.requires_grad = False
49 |
50 | #Training
51 | training_arguments = TrainingArguments(
52 | output_dir= "./results",
53 | num_train_epochs= 2,
54 | per_device_train_batch_size= 1,
55 | gradient_accumulation_steps= 1,
56 | optim = "paged_adamw_8bit",
57 | save_steps= 100,
58 | logging_steps= 30,
59 | learning_rate= 2e-4,
60 | weight_decay= 0.001,
61 | fp16= True,
62 | bf16= False,
63 | max_grad_norm= 0.3,
64 | max_steps= -1,
65 | warmup_ratio= 0.3,
66 | group_by_length= True,
67 | lr_scheduler_type= "constant"
68 | )
69 | # Setting sft parameters
70 | dataset = load_dataset("imdb", split="train")
71 | trainer = SFTTrainer(
72 | train_dataset=dataset,
73 | model=model,
74 | max_seq_length= 20,
75 | dataset_text_field="text",
76 | tokenizer=tokenizer,
77 | args=training_arguments,
78 | packing= False,
79 | )
80 |
81 | model.config.use_cache = False
82 | trainer.train()
83 |
84 | total_parameters = 0
85 | for name, param in model.named_parameters():
86 | total_parameters += param.numel()
87 |
88 | print(f"Total parameters after Freeze: {total_parameters}")
89 |
--------------------------------------------------------------------------------
/finetune_NousResearch_Nous-Hermes-2-SOLAR-10.7B.py:
--------------------------------------------------------------------------------
1 | import transformers as transformers
2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
3 | from trl import SFTTrainer
4 | from transformers import BitsAndBytesConfig
5 | import torch
6 | import torch.nn as nn
7 | from peft import LoraConfig, get_peft_model
8 | from datasets import load_dataset
9 |
10 | model_id = "NousResearch/Nous-Hermes-2-SOLAR-10.7B"
11 | tokenizer = AutoTokenizer.from_pretrained(model_id)
12 | if tokenizer.pad_token is None:
13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'})
14 | text = "Capital of USA is"
15 | device = "cuda:0"
16 |
17 | inputs = tokenizer(text, return_tensors="pt").to(device)
18 |
19 | bnb_config = BitsAndBytesConfig(
20 | load_in_4bit=True,
21 | bnb_4bit_use_double_quant=True,
22 | bnb_4bit_quant_type="nf4",
23 | bnb_4bit_compute_dtype=torch.bfloat16,
24 |
25 | )
26 |
27 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
28 |
29 | config = LoraConfig(
30 | r=16,
31 | lora_alpha=32,
32 | lora_dropout=0.05,
33 | bias = 'none',
34 | task_type="CAUSAL_LM",
35 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"]
36 | )
37 |
38 | #print(model)
39 |
40 | model = get_peft_model(model, config)
41 | outputs = model.generate(**inputs, max_new_tokens=30)
42 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
43 | """
44 | model.push_to_hub("meetrais/finetuned-neural-chat-7b-v3-1",
45 | token="Your-Hugging-Face-Token-Here",
46 | commit_message="basic training",
47 | private=True)
48 | """
49 |
50 |
51 |
--------------------------------------------------------------------------------
/finetune_Yi_34B.py:
--------------------------------------------------------------------------------
1 | import transformers as transformers
2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
3 | from trl import SFTTrainer
4 | from transformers import BitsAndBytesConfig
5 | import torch
6 | import torch.nn as nn
7 | from peft import LoraConfig, get_peft_model
8 | from datasets import load_dataset
9 |
10 | model_id = "01-ai/Yi-34B"
11 | tokenizer = AutoTokenizer.from_pretrained(model_id)
12 | if tokenizer.pad_token is None:
13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'})
14 | text = "Capital of USA is"
15 | device = "cuda:0"
16 |
17 | inputs = tokenizer(text, return_tensors="pt").to(device)
18 |
19 | bnb_config = BitsAndBytesConfig(
20 | load_in_4bit=True,
21 | bnb_4bit_use_double_quant=True,
22 | bnb_4bit_quant_type="nf4",
23 | bnb_4bit_compute_dtype=torch.bfloat16
24 | )
25 |
26 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
27 |
28 |
29 | config = LoraConfig(
30 | r=16,
31 | lora_alpha=32,
32 | lora_dropout=0.05,
33 | bias = 'none',
34 | task_type="CAUSAL_LM"
35 | )
36 | model = get_peft_model(model, config)
37 |
38 |
39 | outputs = model.generate(**inputs, max_new_tokens=20)
40 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
41 | """
42 | model.push_to_hub("meetrais/finetuned_mistral_7b",
43 | token="Your-Hugging-Face-Token-Here",
44 | commit_message="basic training",
45 | private=True)
46 | """
47 |
48 |
49 |
--------------------------------------------------------------------------------
/finetune_intel_neural_chat7B.py:
--------------------------------------------------------------------------------
1 | import transformers as transformers
2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
3 | from trl import SFTTrainer
4 | from transformers import BitsAndBytesConfig
5 | import torch
6 | import torch.nn as nn
7 | from peft import LoraConfig, get_peft_model
8 | from datasets import load_dataset
9 |
10 | model_id = "Intel/neural-chat-7b-v3-1"
11 | tokenizer = AutoTokenizer.from_pretrained(model_id)
12 | if tokenizer.pad_token is None:
13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'})
14 | text = "Capital of USA is"
15 | device = "cuda:0"
16 |
17 | inputs = tokenizer(text, return_tensors="pt").to(device)
18 |
19 | bnb_config = BitsAndBytesConfig(
20 | load_in_4bit=True,
21 | bnb_4bit_use_double_quant=True,
22 | bnb_4bit_quant_type="nf4",
23 | bnb_4bit_compute_dtype=torch.bfloat16,
24 |
25 | )
26 |
27 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
28 |
29 | config = LoraConfig(
30 | r=16,
31 | lora_alpha=32,
32 | lora_dropout=0.05,
33 | bias = 'none',
34 | task_type="CAUSAL_LM",
35 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"]
36 | )
37 |
38 | #print(model)
39 |
40 | model = get_peft_model(model, config)
41 | outputs = model.generate(**inputs, max_new_tokens=30)
42 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
43 | """
44 | model.push_to_hub("meetrais/finetuned-neural-chat-7b-v3-1",
45 | token="Your-Hugging-Face-Token-Here",
46 | commit_message="basic training",
47 | private=True)
48 | """
49 |
50 |
51 |
--------------------------------------------------------------------------------
/finetune_llama_3_1_8B-Instruct.py:
--------------------------------------------------------------------------------
1 | import transformers as transformers
2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
3 | from trl import SFTTrainer
4 | from transformers import BitsAndBytesConfig
5 | import torch
6 | import torch.nn as nn
7 | from peft import LoraConfig, get_peft_model
8 | from datasets import load_dataset
9 |
10 | model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
11 | tokenizer = AutoTokenizer.from_pretrained(model_id)
12 | if tokenizer.pad_token is None:
13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'})
14 | text = "Capital of USA is"
15 | device = "cuda:0"
16 |
17 | inputs = tokenizer(text, return_tensors="pt").to(device)
18 |
19 | bnb_config = BitsAndBytesConfig(
20 | load_in_4bit=True,
21 | bnb_4bit_use_double_quant=True,
22 | bnb_4bit_quant_type="nf4",
23 | bnb_4bit_compute_dtype=torch.bfloat16,
24 | load_in_8bit=False
25 | )
26 |
27 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
28 |
29 | config = LoraConfig(
30 | r=16,
31 | lora_alpha=32,
32 | lora_dropout=0.05,
33 | bias = 'none',
34 | task_type="CAUSAL_LM",
35 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"]
36 | )
37 |
38 | model = get_peft_model(model, config)
39 | outputs = model.generate(**inputs, max_new_tokens=30)
40 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
41 |
42 | model.push_to_hub("meetrais/finetuned_Meta-Llama-3.1-8B-Instruct",
43 | token="Your_HF_Token",
44 | commit_message="basic training",
45 | private=True)
46 |
--------------------------------------------------------------------------------
/finetune_meta_llama3_8B_instruct.py:
--------------------------------------------------------------------------------
1 | import transformers as transformers
2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
3 | from trl import SFTTrainer
4 | from transformers import BitsAndBytesConfig
5 | import torch
6 | import torch.nn as nn
7 | from peft import LoraConfig, get_peft_model
8 | from datasets import load_dataset
9 |
10 | print("Torch version:",torch.__version__)
11 |
12 | print("Is CUDA enabled?",torch.cuda.is_available())
13 |
14 | model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
15 | tokenizer = AutoTokenizer.from_pretrained(model_id)
16 | if tokenizer.pad_token is None:
17 | tokenizer.add_special_tokens({'pad_token': '[PAD]'})
18 | text = "Capital of USA is"
19 | device = "cuda:0"
20 |
21 | inputs = tokenizer(text, return_tensors="pt").to(device)
22 |
23 | bnb_config = BitsAndBytesConfig(
24 | load_in_4bit=True,
25 | bnb_4bit_use_double_quant=True,
26 | bnb_4bit_quant_type="nf4",
27 | bnb_4bit_compute_dtype=torch.bfloat16,
28 | load_in_8bit=False
29 | )
30 |
31 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
32 |
33 | config = LoraConfig(
34 | r=16,
35 | lora_alpha=32,
36 | lora_dropout=0.05,
37 | bias = 'none',
38 | task_type="CAUSAL_LM",
39 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"]
40 | )
41 |
42 | model = get_peft_model(model, config)
43 | outputs = model.generate(**inputs, max_new_tokens=30)
44 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
45 |
46 |
47 | model.push_to_hub("meetrais/Meta-Llama-3-8B-Instruct-NIM-LORA",
48 | token="HF-Access-Key",
49 | commit_message="basic training",
50 | private=True)
51 |
52 |
53 |
--------------------------------------------------------------------------------
/finetune_microsoft-Phi-3-mini-128k-instruct.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
3 |
4 | torch.random.manual_seed(0)
5 |
6 | model = AutoModelForCausalLM.from_pretrained(
7 | "microsoft/Phi-3-mini-128k-instruct",
8 | device_map="cuda",
9 | torch_dtype="auto",
10 | trust_remote_code=True
11 | )
12 | tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
13 |
14 | messages = [
15 | {"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."},
16 | {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
17 | {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
 18 |     {"role": "user", "content": "What about solving a 2x + 3 = 7 equation?"},
19 | ]
20 |
21 | pipe = pipeline(
22 | "text-generation",
23 | model=model,
24 | tokenizer=tokenizer,
25 | )
26 |
27 | generation_args = {
28 | "max_new_tokens": 500,
29 | "return_full_text": False,
30 | "temperature": 0.6,
31 | "do_sample": False,
32 | }
33 |
34 | output = pipe(messages, **generation_args)
35 | print(output[0]['generated_text'])
36 |
--------------------------------------------------------------------------------
/finetune_microsoft-phi-2.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer
3 |
4 | model_id = "microsoft/phi-2"
5 | tokenizer = AutoTokenizer.from_pretrained(model_id)
6 |
7 | text = "What is Capital of USA?"
8 | inputs = tokenizer(text, return_tensors="pt").to(0)
9 |
10 | model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype="auto", device_map="cuda", trust_remote_code=True)
11 | outputs = model.generate(**inputs, max_new_tokens=50)
12 |
13 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
14 |
--------------------------------------------------------------------------------
/finetune_mistral_7b.py:
--------------------------------------------------------------------------------
1 | import transformers as transformers
2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
3 | from trl import SFTTrainer
4 | from transformers import BitsAndBytesConfig
5 | import torch
6 | import torch.nn as nn
7 | from peft import LoraConfig, get_peft_model
8 | from datasets import load_dataset
9 |
10 | model_id = "mistralai/Mistral-7B-v0.1"
11 | tokenizer = AutoTokenizer.from_pretrained(model_id)
12 | if tokenizer.pad_token is None:
13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'})
14 | text = "Capital of USA is"
15 | device = "cuda:0"
16 |
17 | inputs = tokenizer(text, return_tensors="pt").to(device)
18 |
19 | bnb_config = BitsAndBytesConfig(
20 | load_in_4bit=True,
21 | bnb_4bit_use_double_quant=True,
22 | bnb_4bit_quant_type="nf4",
23 | bnb_4bit_compute_dtype=torch.bfloat16,
24 | load_in_8bit=False
25 | )
26 |
27 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
28 |
29 | """
30 | #Freezing the original weights
31 | for param in model.parameters():
32 | param.requires_grad = False
33 | if param.ndim ==1:
34 | param.data = param.data.to(torch.float32)
35 | model.gradient_checkpointing_enable()
36 | model.enable_input_require_grads()
37 |
38 | class CastOutputToFloat(nn.Sequential):
39 | def forward(self, x): return super().forward(x).to(torch.float32)
40 | model.lm_head = CastOutputToFloat(model.lm_head)
41 |
 42 | # Setting up the LoRA adapters
43 | def print_trainable_parameters(model):
44 | trainable_params = 0
45 | all_param = 0
46 | for _, param in model.named_parameters():
47 | all_param += param.numel()
48 | if param.requires_grad:
49 | trainable_params += param.numel()
50 | print(
51 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
52 | )
53 |
54 | config = LoraConfig(
55 | r=16,
56 | lora_alpha=32,
57 | lora_dropout=0.05,
58 | bias = 'none',
59 | task_type="CAUSAL_LM"
60 | )
61 |
62 | model = get_peft_model(model, config)
63 | print_trainable_parameters(model)
64 | dataset_name = "gathnex/Gath_baize"
65 | dataset = load_dataset(dataset_name, split="train[:1000]")
66 | dataset["chat_sample"][0]
67 |
 68 | def merge_columns(example):
69 | example['prediction'] = example['quote'] + " ->: " + str(example["tags"])
70 | return example
71 |
72 |
 73 | #data['train'] = data['train'].map(merge_columns)
74 | #print(data['train'][0])
75 |
76 | #data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True)
77 |
78 | #print(data)
79 |
80 | #Training
81 | training_arguments = TrainingArguments(
82 | output_dir= "./results",
83 | num_train_epochs= 10,
84 | per_device_train_batch_size= 8,
85 | gradient_accumulation_steps= 2,
86 | optim = "paged_adamw_8bit",
87 | save_steps= 100,
88 | logging_steps= 30,
89 | learning_rate= 2e-4,
90 | weight_decay= 0.001,
91 | fp16= True,
92 | bf16= False,
93 | max_grad_norm= 0.3,
94 | max_steps= -1,
95 | warmup_ratio= 0.3,
96 | group_by_length= True,
97 | lr_scheduler_type= "constant"
98 | )
99 | # Setting sft parameters
100 | trainer = SFTTrainer(
101 | model=model,
102 | train_dataset=dataset,
103 | max_seq_length= 20,
104 | dataset_text_field="chat_sample",
105 | tokenizer=tokenizer,
106 | args=training_arguments,
107 | packing= False,
108 | )
109 |
110 | model.config.use_cache = False
111 | trainer.train()
112 | """
113 | #print(model)
114 |
115 | config = LoraConfig(
116 | r=16,
117 | lora_alpha=32,
118 | lora_dropout=0.05,
119 | bias = 'none',
120 | task_type="CAUSAL_LM",
121 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"]
122 | )
123 |
124 | model = get_peft_model(model, config)
125 | outputs = model.generate(**inputs, max_new_tokens=30)
126 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
127 |
128 | """
129 | model.push_to_hub("meetrais/finetuned_mistral_7b",
130 | token="Your-Hugging-Face-Token-Here",
131 | commit_message="basic training",
132 | private=True)
133 |
134 | """
135 |
--------------------------------------------------------------------------------
/finetune_starling-LM-7B-alpha.py:
--------------------------------------------------------------------------------
1 | import transformers as transformers
2 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
3 | from trl import SFTTrainer
4 | from transformers import BitsAndBytesConfig
5 | import torch
6 | import torch.nn as nn
7 | from peft import LoraConfig, get_peft_model
8 | from datasets import load_dataset
9 |
10 | model_id = "berkeley-nest/Starling-LM-7B-alpha"
11 | tokenizer = AutoTokenizer.from_pretrained(model_id)
12 | if tokenizer.pad_token is None:
13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'})
14 | text = "Capital of USA is"
15 | device = "cuda:0"
16 |
17 | inputs = tokenizer(text, return_tensors="pt").to(device)
18 |
19 | bnb_config = BitsAndBytesConfig(
20 | load_in_4bit=True,
21 | bnb_4bit_use_double_quant=True,
22 | bnb_4bit_quant_type="nf4",
23 | bnb_4bit_compute_dtype=torch.bfloat16,
24 | load_in_8bit=False
25 | )
26 |
27 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
28 |
29 | """
30 | #Freezing the original weights
31 | for param in model.parameters():
32 | param.requires_grad = False
33 | if param.ndim ==1:
34 | param.data = param.data.to(torch.float32)
35 | model.gradient_checkpointing_enable()
36 | model.enable_input_require_grads()
37 |
38 | class CastOutputToFloat(nn.Sequential):
39 | def forward(self, x): return super().forward(x).to(torch.float32)
40 | model.lm_head = CastOutputToFloat(model.lm_head)
41 |
 42 | # Setting up the LoRA adapters
43 | def print_trainable_parameters(model):
44 | trainable_params = 0
45 | all_param = 0
46 | for _, param in model.named_parameters():
47 | all_param += param.numel()
48 | if param.requires_grad:
49 | trainable_params += param.numel()
50 | print(
51 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
52 | )
53 |
54 | config = LoraConfig(
55 | r=16,
56 | lora_alpha=32,
57 | lora_dropout=0.05,
58 | bias = 'none',
59 | task_type="CAUSAL_LM"
60 | )
61 |
62 | model = get_peft_model(model, config)
63 | print_trainable_parameters(model)
64 | dataset_name = "gathnex/Gath_baize"
65 | dataset = load_dataset(dataset_name, split="train[:1000]")
66 | dataset["chat_sample"][0]
67 |
 68 | def merge_columns(example):
69 | example['prediction'] = example['quote'] + " ->: " + str(example["tags"])
70 | return example
71 |
72 |
 73 | #data['train'] = data['train'].map(merge_columns)
74 | #print(data['train'][0])
75 |
76 | #data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True)
77 |
78 | #print(data)
79 |
80 | #Training
81 | training_arguments = TrainingArguments(
82 | output_dir= "./results",
83 | num_train_epochs= 10,
84 | per_device_train_batch_size= 8,
85 | gradient_accumulation_steps= 2,
86 | optim = "paged_adamw_8bit",
87 | save_steps= 100,
88 | logging_steps= 30,
89 | learning_rate= 2e-4,
90 | weight_decay= 0.001,
91 | fp16= True,
92 | bf16= False,
93 | max_grad_norm= 0.3,
94 | max_steps= -1,
95 | warmup_ratio= 0.3,
96 | group_by_length= True,
97 | lr_scheduler_type= "constant"
98 | )
99 | # Setting sft parameters
100 | trainer = SFTTrainer(
101 | model=model,
102 | train_dataset=dataset,
103 | max_seq_length= 20,
104 | dataset_text_field="chat_sample",
105 | tokenizer=tokenizer,
106 | args=training_arguments,
107 | packing= False,
108 | )
109 |
110 | model.config.use_cache = False
111 | trainer.train()
112 | """
113 | config = LoraConfig(
114 | r=16,
115 | lora_alpha=32,
116 | lora_dropout=0.05,
117 | bias = 'none',
118 | task_type="CAUSAL_LM",
119 | target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"]
120 | )
121 |
122 | #print(model)
123 | model = get_peft_model(model, config)
124 | outputs = model.generate(**inputs, max_new_tokens=30)
125 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
126 |
127 | """
128 | model.push_to_hub("meetrais/finetuned_mistral_7b",
129 | token="Your-Hugging-Face-Token-Here",
130 | commit_message="basic training",
131 | private=True)
132 |
133 | """
134 |
--------------------------------------------------------------------------------
/web_chat_bot_finetuned_mistral_7b.py:
--------------------------------------------------------------------------------
1 | # Import necessary libraries
2 | from threading import Thread
3 | import argparse
4 | import os
5 | import torch
6 | import gradio as gr
7 | from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
8 | from peft import PeftModel, PeftConfig
9 | from transformers import BitsAndBytesConfig
10 |
11 | peft_model_id = "meetrais/finetuned_mistral_7b"
12 | config = PeftConfig.from_pretrained(peft_model_id)
13 | bnb_config = BitsAndBytesConfig(
14 | load_in_4bit=True,
15 | bnb_4bit_use_double_quant=True,
16 | bnb_4bit_quant_type="nf4",
17 | bnb_4bit_compute_dtype=torch.bfloat16
18 | )
19 | model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=bnb_config, device_map='auto')
20 | tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
21 |
22 | if tokenizer.pad_token is None:
23 | tokenizer.add_special_tokens({'pad_token': '[PAD]'})
24 |
25 | usingAdapter = True
26 |
27 | device = "cuda:0"
28 | # Function to run the text generation process
29 | def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
30 | #template = "### Text: {}\n### The tone is:\n"
31 | #model_inputs = tokenizer(template.format(user_text) if usingAdapter else user_text, return_tensors="pt")
32 | #model_inputs = model_inputs.to(device)
33 | model_inputs= tokenizer(user_text, return_tensors="pt").to(device)
34 |
35 | # Generate text in a separate thread
36 | streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
 37 |     generate_kwargs = dict(
 38 |         **model_inputs,
 39 |         max_new_tokens=max_new_tokens,
 40 |         do_sample=True, top_p=top_p, top_k=int(top_k), temperature=temperature,
 41 |         pad_token_id=tokenizer.eos_token_id, streamer=streamer,
 42 |     )
43 | thread = Thread(target=model.generate, kwargs=generate_kwargs)
44 | thread.start()
45 |
46 | # Retrieve and yield the generated text
47 | model_output = ""
48 | for new_text in streamer:
49 | model_output += new_text
50 | yield model_output
51 | return model_output
52 |
53 | # Gradio UI setup
54 | with gr.Blocks() as demo:
55 | with gr.Row():
56 | with gr.Column(scale=4):
57 | user_text = gr.Textbox(placeholder="Write your question here", label="User input")
58 | model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
59 | button_submit = gr.Button(value="Submit")
60 |
61 | with gr.Column(scale=1):
62 | max_new_tokens = gr.Slider(minimum=1, maximum=1000, value=250, step=1, label="Max New Tokens")
63 | top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
64 | top_k = gr.Slider(minimum=1, maximum=50, value=50, step=1, label="Top-k")
65 | temperature = gr.Slider(minimum=0.1, maximum=5.0, value=0.8, step=0.1, label="Temperature")
66 |
67 | user_text.submit(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)
68 | button_submit.click(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)
69 |
70 | demo.queue(max_size=32).launch(server_port=8082)
71 |
--------------------------------------------------------------------------------
/web_chat_bot_llama3_8b_instruct.py:
--------------------------------------------------------------------------------
1 | import transformers
2 | import torch
3 | import gradio as gr
4 |
5 | device = "cuda:0"
6 | # Function to run the text generation process
7 | def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
8 | model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
9 | device = "cuda:0"
10 | pipeline = transformers.pipeline(
11 | "text-generation",
12 | model=model_id,
13 | model_kwargs={"torch_dtype": torch.bfloat16},
14 | device=device,
15 | )
16 |
17 | messages = [
 18 |         {"role": "system", "content": "You are a helpful assistant."},
19 | {"role": "user", "content": user_text},
20 | ]
21 |
22 | prompt = pipeline.tokenizer.apply_chat_template(
23 | messages,
24 | tokenize=False,
25 | add_generation_prompt=True
26 | )
27 |
28 | terminators = [
29 | pipeline.tokenizer.eos_token_id,
30 | pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
31 | ]
32 |
33 | outputs = pipeline(
34 | prompt,
35 | max_new_tokens=max_new_tokens,
36 | eos_token_id=terminators,
37 | do_sample=True,
38 | temperature=temperature,
39 | top_p=top_p,
40 | top_k=top_k
41 | )
42 | return outputs[0]["generated_text"][len(prompt):]
43 |
44 | # Gradio UI setup
45 | with gr.Blocks() as demo:
46 | with gr.Row():
47 | with gr.Column(scale=4):
48 | user_text = gr.Textbox(placeholder="Write your question here", label="User input")
49 | model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
50 | button_submit = gr.Button(value="Submit")
51 |
52 | with gr.Column(scale=1):
53 | max_new_tokens = gr.Slider(minimum=1, maximum=1000, value=250, step=1, label="Max New Tokens")
54 | top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
55 | top_k = gr.Slider(minimum=1, maximum=50, value=50, step=1, label="Top-k")
56 | temperature = gr.Slider(minimum=0.1, maximum=5.0, value=0.8, step=0.1, label="Temperature")
57 |
58 | user_text.submit(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)
59 | button_submit.click(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)
60 |
61 | demo.queue(max_size=32).launch(server_port=8082)
--------------------------------------------------------------------------------
/web_microsoft-Phi-3-mini-128k-instruct.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
3 | import gradio as gr
4 |
5 | device = "cuda:0"
6 | # Function to run the text generation process
7 | def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
8 | torch.random.manual_seed(0)
9 |
10 | model = AutoModelForCausalLM.from_pretrained(
11 | "microsoft/Phi-3-mini-128k-instruct",
12 | device_map="cuda",
13 | torch_dtype="auto",
14 | trust_remote_code=True
15 | )
16 | tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
17 |
18 | messages = [
19 | {"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."},
20 | {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
21 | {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
22 | {"role": "user", "content": "{0}".format(user_text)},
23 | ]
24 |
25 | pipe = pipeline(
26 | "text-generation",
27 | model=model,
28 | tokenizer=tokenizer,
29 | )
30 |
31 | generation_args = {
32 | "max_new_tokens": max_new_tokens,
33 | "return_full_text": False,
34 | "temperature": temperature,
35 | "do_sample": False,
36 | }
37 |
38 | output = pipe(messages, **generation_args)
39 | return output[0]['generated_text']
40 |
41 | # Gradio UI setup
42 | with gr.Blocks() as demo:
43 | with gr.Row():
44 | with gr.Column(scale=4):
45 | user_text = gr.Textbox(placeholder="Write your question here", label="User input")
46 | model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
47 | button_submit = gr.Button(value="Submit")
48 |
49 | with gr.Column(scale=1):
50 | max_new_tokens = gr.Slider(minimum=1, maximum=1000, value=250, step=1, label="Max New Tokens")
51 | top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
52 | top_k = gr.Slider(minimum=1, maximum=50, value=50, step=1, label="Top-k")
53 | temperature = gr.Slider(minimum=0.1, maximum=5.0, value=0.8, step=0.1, label="Temperature")
54 |
55 | user_text.submit(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)
56 | button_submit.click(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)
57 |
58 | demo.queue(max_size=32).launch(server_port=8082)
--------------------------------------------------------------------------------