├── .Rhistory ├── llmtune ├── __init__.py ├── engine │ ├── __init__.py │ ├── lora │ │ ├── __init__.py │ │ ├── peft.py │ │ ├── config.py │ │ └── utils.py │ ├── quant │ │ ├── __init__.py │ │ ├── gptq │ │ │ ├── __init__.py │ │ │ ├── quantizer.py │ │ │ ├── extras.py │ │ │ └── algorithm.py │ │ ├── algorithm.py │ │ ├── converter.py │ │ └── config.py │ └── inference │ │ ├── __init__.py │ │ ├── matmult.py │ │ ├── cuda │ │ └── quant_cuda.cpp │ │ ├── modules.py │ │ └── autograd.py ├── llms │ ├── __init__.py │ ├── opt │ │ ├── __init__.py │ │ ├── config.py │ │ └── model.py │ ├── bloom │ │ ├── __init__.py │ │ └── model.py │ ├── llama │ │ ├── __init__.py │ │ ├── config.py │ │ └── model.py │ ├── config.py │ └── autollm.py ├── utils.py ├── data │ ├── abstract.py │ ├── __init__.py │ ├── alpaca.py │ ├── text.py │ ├── gpt4all.py │ └── calibration.py ├── config.py ├── executor.py └── run.py ├── .DS_Store ├── finetune ├── samsum-llama │ ├── llama_lora_samsum.json │ ├── utils.py │ ├── eval_samsum_4bit_bnb.py │ ├── data.py │ ├── eval_samsum_4bit_llmtune.py │ ├── train_samsum_4bit.py │ └── train_samsum_4bit_bnb.py ├── samsum-opt │ ├── llama_lora_samsum.json │ ├── utils.py │ ├── data.py │ ├── eval_samsum_opt_4bit_llmtune.py │ └── train_samsum_opt_4bit_llmtune.py ├── mnli-llama │ ├── llama_lora_mnli_label.json │ ├── utils.py │ ├── data_mnli_label.py │ ├── eval_mnli_llmtune.py │ └── train_mnli_llmtune_label.py └── bbh-eval │ ├── main_dev.py │ └── bbh_dev.py ├── examples ├── push_to_hub.py ├── quantize.py ├── generate.py ├── generate-after-lora.py └── finetune.py ├── LICENSE └── README.md /.Rhistory: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/engine/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/llms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/llms/opt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/engine/lora/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/engine/quant/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/llms/bloom/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/llms/llama/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/engine/inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/llmtune/engine/quant/gptq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kuleshov-group/MODULoRA-Experiment/HEAD/.DS_Store -------------------------------------------------------------------------------- /llmtune/engine/quant/algorithm.py: -------------------------------------------------------------------------------- 1 | from llmtune.engine.quant.config import QuantConfig 2 | 3 | class QuantizationAlgorithm(): 4 | """Quantization algorthim abstract class""" 5 | def __init__(self, config: QuantConfig): 6 | self.config = config 7 | 8 | def quantize(self, model, dataloader): 9 | raise NotImplementedError -------------------------------------------------------------------------------- /finetune/samsum-llama/llama_lora_samsum.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Template used by LLAMA-SAMSUM.", 3 | "prompts_input": [ 4 | "### Summarize this: {instruction}\n ### Output: " 5 | ], 6 | "prompts_no_input": [ 7 | "### Summarize this: {instruction}\n ### Output: " 8 | ], 9 | "output_separator": "### Output: " 10 | } -------------------------------------------------------------------------------- /finetune/samsum-opt/llama_lora_samsum.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Template used by LLAMA-SAMSUM.", 3 | "prompts_input": [ 4 | "### Summarize this: {instruction}\n ### Output: " 5 | ], 6 | "prompts_no_input": [ 7 | "### Summarize this: {instruction}\n ### Output: " 8 | ], 9 | "output_separator": "### Output: " 10 | } -------------------------------------------------------------------------------- /examples/push_to_hub.py: -------------------------------------------------------------------------------- 1 | from llmtune.llms.autollm import AutoLLMForCausalLM 2 | 3 | # load model 4 | model_dir = './llama-7b-quantized' # can generate this via quantize.py 5 | llm = AutoLLMForCausalLM.from_pretrained(model_dir) 6 | 7 | # push to hub 8 | llm.push_to_hub( 9 | repo_id='', 10 | save_dir=model_dir, 11 | commit_message='first commit' 12 | ) 13 | -------------------------------------------------------------------------------- /finetune/mnli-llama/llama_lora_mnli_label.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Template used by LLAMA-MNLI-m output label.", 3 | "prompts_input": [ 4 | "### Premise: {instruction}\n ### Hypothesis: {hypothesis}\n ### Genre: {genre} ### Label:" 5 | ], 6 | "prompts_no_input": [ 7 | "### Premise: {instruction}\n ### Hypothesis: {hypothesis}\n ### Genre: {genre} ### Label:" 8 | ], 9 | "output_separator": "### Label:" 10 | } -------------------------------------------------------------------------------- /llmtune/engine/lora/peft.py: -------------------------------------------------------------------------------- 1 | """Wraps around PEFT to use QuantLoraModel instead of regular LoraModel.""" 2 | 3 | import peft as quant_peft 4 | from llmtune.engine.lora.lora import QuantLoraModel 5 | 6 | # monkey patch peft to use QuantLoraModel 7 | quant_peft.tuners.lora.LoraModel = QuantLoraModel 8 | quant_peft.peft_model.LoraModel = QuantLoraModel 9 | 10 | # the above works for PEFT at the time of writing this code; 11 | # when 
upgrading to a newer PEFT, use this insted: 12 | # quant_peft.peft_model.PEFT_TYPE_TO_MODEL_MAPPING[ 13 | # quant_peft.utils.PeftType.LORA 14 | # ] = QuantLoraModel -------------------------------------------------------------------------------- /finetune/bbh-eval/main_dev.py: -------------------------------------------------------------------------------- 1 | from fire import Fire 2 | 3 | import bbh_dev 4 | 5 | def main(task_name: str, **kwargs): 6 | task_map = dict( 7 | bbh=bbh_dev.main, 8 | ) 9 | 10 | if task_name == "all": 11 | results = {} 12 | for name, task_fn in task_map.items(): 13 | score = task_fn(**kwargs) 14 | results[name] = score 15 | else: 16 | task_fn = task_map.get(task_name) 17 | if task_fn is None: 18 | raise ValueError(f"{task_name}. Choose from {list(task_map.keys())}") 19 | score = task_fn(**kwargs) 20 | results = {task_name: score} 21 | 22 | results = {name: round(score * 100, 2) for name, score in results.items()} 23 | print(results) 24 | return results 25 | 26 | if __name__ == "__main__": 27 | Fire(main) 28 | -------------------------------------------------------------------------------- /llmtune/llms/opt/config.py: -------------------------------------------------------------------------------- 1 | # from llmtune.llms.config import AutoQuantConfig, LLMType 2 | 3 | OPT_MODELS = [ 4 | "opt-6.7b-4bit", "opt-13b-4bit", 5 | "opt-6.7b-3bit", "opt-13b-3bit", 6 | ] 7 | 8 | def get_opt_config(model): 9 | if '4bit' in model: 10 | bits = 4 11 | elif '3bit' in model: 12 | bits = 3 13 | elif '2bit' in model: 14 | bits = 2 15 | 16 | if '6.7b' in model: 17 | hf_config_name = "facebook/opt-6.7b" 18 | elif '13b' in model: 19 | hf_config_name = "facebook/opt-13b" 20 | 21 | raise NotImplementedError() 22 | 23 | llm_config = AutoQuantConfig( 24 | name=model, 25 | model_type=LLMType.OPT, 26 | hf_config_name=hf_config_name, 27 | hf_tokenizer_config="", 28 | bits=bits 29 | ) 30 | return llm_config 31 | -------------------------------------------------------------------------------- /llmtune/utils.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import urllib.request 3 | 4 | def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): 5 | if type(module) in layers: 6 | return {name: module} 7 | res = {} 8 | for name1, child in module.named_children(): 9 | res.update(find_layers( 10 | child, layers=layers, name=name + '.' 
+ name1 if name != '' else name1 11 | )) 12 | return res 13 | 14 | def to_half_precision(model): 15 | for n, m in model.named_modules(): 16 | if '4bit' in str(type(m)) or 'QuantLinear' in str(type(m)): 17 | # m.zeros = m.zeros.half() 18 | m.scales = m.scales.half() 19 | if m.bias is not None: 20 | m.bias = m.bias.half() 21 | return model 22 | 23 | def download_file(url, path): 24 | print('Starting download') 25 | urllib.request.urlretrieve(url, path) 26 | print('Done') -------------------------------------------------------------------------------- /llmtune/data/abstract.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict, Any 3 | 4 | 5 | # Abstract train data loader 6 | class AbstractTrainData(ABC): 7 | """ 8 | """ 9 | @abstractmethod 10 | def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len: int) -> None: 11 | """ 12 | Args: 13 | dataset (str): Path to dataset 14 | val_set_size (int) : Size of validation set 15 | tokenizer (_type_): Tokenizer 16 | """ 17 | self.tokenizer = tokenizer 18 | self.dataset = dataset 19 | self.val_set_size = val_set_size 20 | self.cutoff_len = cutoff_len 21 | self.train_data = None 22 | self.val_data = None 23 | 24 | @abstractmethod 25 | def tokenize(self, prompt: str) -> Dict[str, Any]: 26 | pass 27 | 28 | @abstractmethod 29 | def prepare_data(self) -> None: 30 | """Loads dataset from file and prepares train_data for trainer.""" 31 | pass 32 | -------------------------------------------------------------------------------- /llmtune/data/__init__.py: -------------------------------------------------------------------------------- 1 | from llmtune.data.text import TrainTxt 2 | from llmtune.data.alpaca import TrainSAD 3 | from llmtune.data.gpt4all import TrainGPT4All 4 | 5 | def load_finetuning_data(tune_config, tokenizer): 6 | if tune_config.ds_type == "alpaca": 7 | data = TrainSAD( 8 | tune_config.dataset, 9 | tune_config.val_set_size, 10 | tokenizer, 11 | tune_config.cutoff_len 12 | ) 13 | elif tune_config.ds_type == "gpt4all": 14 | raise NotImplementedError('GPT4All dataset currently not supported') 15 | data = TrainGPT4All( 16 | tune_config.dataset, 17 | tune_config.val_set_size, 18 | tokenizer, 19 | tune_config.cutoff_len 20 | ) 21 | else: 22 | raise ValueError(f"Invalid data name: {tune_config.ds_type}") 23 | # data.prepare_data( 24 | # thd=tune_config.txt_row_thd, use_eos_token=tune_config.use_eos_token 25 | # ) 26 | data.prepare_data() 27 | return data -------------------------------------------------------------------------------- /llmtune/engine/quant/converter.py: -------------------------------------------------------------------------------- 1 | from llmtune.engine.inference.modules import QuantLinear 2 | 3 | def make_quant( 4 | module, names, bits, groupsize=-1, name='', is_cuda=True 5 | ): 6 | if isinstance(module, QuantLinear): 7 | return 8 | for attr in dir(module): 9 | tmp = getattr(module, attr) 10 | name1 = name + '.' + attr if name != '' else attr 11 | if name1 in names: 12 | setattr( 13 | module, attr, QuantLinear( 14 | bits=bits, 15 | groupsize=groupsize, 16 | in_features=tmp.in_features, 17 | out_features=tmp.out_features, 18 | bias=tmp.bias, 19 | is_cuda=is_cuda, 20 | ) 21 | ) 22 | for name1, child in module.named_children(): 23 | make_quant( 24 | child, 25 | names, 26 | bits=bits, 27 | name=name + '.' 
+ name1 if name != '' else name1, 28 | groupsize=groupsize, 29 | is_cuda=is_cuda 30 | ) 31 | -------------------------------------------------------------------------------- /examples/quantize.py: -------------------------------------------------------------------------------- 1 | from llmtune.llms.autollm import AutoLLMForCausalLM 2 | from llmtune.engine.quant.config import QuantConfig 3 | from llmtune.engine.quant.gptq.executor import GPTQAlgorithm 4 | from llmtune.data.calibration import get_calibration_loaders 5 | 6 | # load model 7 | model_name = 'decapoda-research/llama-7b-hf' 8 | llm = AutoLLMForCausalLM.from_pretrained(model_name) 9 | llm.eval() 10 | 11 | # set up quantization config 12 | config = QuantConfig( 13 | bits=4, 14 | dataset='c4', 15 | seed=0, 16 | nsamples=128, 17 | percdamp=.01, 18 | groupsize=64, 19 | act_order=True, 20 | nearest=False, 21 | save='./llama-7b-quantized' 22 | ) 23 | 24 | # load gptq calibration data 25 | dataloader, _ = get_calibration_loaders( 26 | config.dataset, 27 | nsamples=config.nsamples, 28 | seed=config.seed, 29 | model=llm.base_model.name_or_path, 30 | seqlen=llm.base_model.seqlen 31 | ) 32 | 33 | # create quantization algorithm 34 | gptq = GPTQAlgorithm(config) 35 | llm = gptq.quantize(llm, dataloader) 36 | 37 | llm.save_pretrained(config.save) 38 | print(f'Model weights saved to: {config.save}') -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 oscarscaro 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /llmtune/llms/llama/config.py: -------------------------------------------------------------------------------- 1 | # from llmtune.llms.config import AutoQuantConfig, LLMType 2 | 3 | LLAMA_MODELS = [ 4 | "llama-7b-4bit", "llama-13b-4bit", "llama-30b-4bit", "llama-65b-4bit", 5 | "llama-7b-3bit", "llama-13b-3bit", "llama-30b-3bit", "llama-65b-3bit", 6 | "llama-7b-2bit", "llama-65b-2bit", 7 | ] 8 | 9 | def get_llama_config(model): 10 | if '4bit' in model: 11 | bits = 4 12 | elif '3bit' in model: 13 | bits = 3 14 | elif '2bit' in model: 15 | bits = 2 16 | 17 | if '7b' in model: 18 | hf_config_name = "decapoda-research/llama-7b-hf" 19 | elif '13b' in model: 20 | hf_config_name = "decapoda-research/llama-13b-hf" 21 | elif '30b' in model: 22 | hf_config_name = "decapoda-research/llama-30b-hf" 23 | elif '65b' in model: 24 | hf_config_name = "decapoda-research/llama-65b-hf" 25 | 26 | raise NotImplementedError() 27 | 28 | llm_config = AutoQuantConfig( 29 | name=model, 30 | model_type=LLMType.LLAMA, 31 | hf_config_name=hf_config_name, 32 | hf_tokenizer_config="huggyllama/llama-13b", 33 | bits=bits 34 | ) 35 | return llm_config 36 | -------------------------------------------------------------------------------- /examples/generate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer 3 | from llmtune.llms.autollm import AutoLLMForCausalLM 4 | from llmtune.utils import to_half_precision 5 | 6 | # model config 7 | model_name = '' 8 | # model_name = './llama-7b-quantized' # can generate local dir via quantize.py 9 | tokenizer_name = 'huggyllama/llama-13b' 10 | DEV = 'cuda' 11 | 12 | # load model 13 | llm = AutoLLMForCausalLM.from_pretrained(model_name).to(DEV) 14 | llm.eval() 15 | llm = to_half_precision(llm) 16 | 17 | # load tokenizer 18 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) 19 | 20 | # encode prompt 21 | prompt = 'The pyramids were built by' 22 | input_ids = tokenizer.encode(prompt, return_tensors="pt").to(DEV) 23 | 24 | # generation config 25 | min_length=10 26 | max_length=200 27 | top_p=.95 28 | top_k=25 29 | temperature=1.0 30 | 31 | # generate text 32 | with torch.no_grad(): 33 | generated_ids = llm.generate( 34 | inputs=input_ids, 35 | do_sample=True, 36 | min_length=min_length, 37 | max_length=max_length, 38 | top_p=top_p, 39 | top_k=top_k, 40 | temperature=temperature, 41 | ) 42 | 43 | # decode and print 44 | output = tokenizer.decode([el.item() for el in generated_ids[0]]) 45 | print(output) 46 | -------------------------------------------------------------------------------- /llmtune/engine/quant/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from dataclasses import dataclass 4 | from transformers.utils.hub import PushToHubMixin, cached_file 5 | 6 | @dataclass 7 | class QuantConfig(PushToHubMixin): 8 | dataset: str 9 | bits: int 10 | nsamples: int 11 | groupsize: int 12 | act_order: bool 13 | percdamp: float 14 | seed: int 15 | nearest: bool 16 | save: str 17 | 18 | def save_pretrained(self, save_dir: str, **kwargs): 19 | config_path = os.path.join(save_dir, "quant_config.json") 20 | with open(config_path, "w", encoding="utf-8") as f: 21 | json.dump(self.to_dict(), f, indent=2) 22 | 23 | @classmethod 24 | def from_pretrained(cls, save_dir: str, **kwargs): 25 | config_filename = "quant_config.json" 26 | if 
os.path.isdir(save_dir): 27 | config_path = os.path.join(save_dir, config_filename) 28 | else: 29 | config_path = cached_file(save_dir, config_filename) 30 | with open(config_path, "r", encoding="utf-8") as f: 31 | return cls(**json.load(f)) 32 | 33 | def to_dict(self): 34 | return { 35 | 'dataset': self.dataset, 36 | 'bits': self.bits, 37 | 'nsamples': self.nsamples, 38 | 'groupsize': self.groupsize, 39 | 'act_order': self.act_order, 40 | 'percdamp': self.percdamp, 41 | 'seed': self.seed, 42 | 'nearest': self.nearest, 43 | 'save': self.save, 44 | } -------------------------------------------------------------------------------- /examples/generate-after-lora.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, GenerationConfig 3 | from llmtune.llms.autollm import AutoLLMForCausalLM 4 | from llmtune.utils import to_half_precision 5 | from llmtune.engine.lora.peft import quant_peft 6 | 7 | # model config 8 | model_name = '' 9 | # model_name = './llama-7b-quantized' # can generate local dir via quantize.py 10 | tokenizer_name = 'huggyllama/llama-7b' 11 | DEV = 'cuda' 12 | 13 | # load model 14 | llm = AutoLLMForCausalLM.from_pretrained(model_name).to(DEV) 15 | llm.eval() 16 | llm = to_half_precision(llm) 17 | 18 | # load tokenizer 19 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) 20 | 21 | # load lora from existing checkpoint 22 | adapter_path = './llama-7b-quantized-lora' # can generate this via finetune.py 23 | model = quant_peft.PeftModel.from_pretrained( 24 | llm, adapter_path, 25 | device_map='auto' 26 | ) 27 | print(adapter_path, 'loaded') 28 | 29 | # encode prompt 30 | prompt = 'Write a detailed step-by-step recipe for a blueberry lasagna dish' 31 | input_ids = tokenizer.encode(prompt, return_tensors="pt").to(DEV) 32 | 33 | # generation config 34 | min_length=10 35 | max_length=200 36 | top_p=.95 37 | top_k=25 38 | temperature=1.0 39 | 40 | # generate text 41 | with torch.no_grad(): 42 | generated_ids = model.generate( 43 | inputs=input_ids, 44 | generation_config=GenerationConfig( 45 | do_sample=True, 46 | min_length=min_length, 47 | max_length=max_length, 48 | top_p=top_p, 49 | top_k=top_k, 50 | temperature=temperature, 51 | ) 52 | ) 53 | 54 | # decode and print 55 | output = tokenizer.decode([el.item() for el in generated_ids[0]]) 56 | print(output) 57 | -------------------------------------------------------------------------------- /llmtune/engine/lora/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class FinetuneConfig: 4 | """Config holder for finetuning""" 5 | def __init__( 6 | self, dataset: str, ds_type: str, 7 | lora_out_dir: str, 8 | mbatch_size: int, batch_size: int, 9 | epochs: int, lr: float, 10 | cutoff_len: int, 11 | lora_r: int, lora_alpha: int, lora_dropout: float, 12 | val_set_size: float, 13 | warmup_steps: int, save_steps: int, 14 | save_total_limit: int, logging_steps: int, 15 | ): 16 | self.dataset = dataset 17 | self.ds_type = ds_type 18 | self.lora_out_dir = lora_out_dir 19 | self.mbatch_size = mbatch_size 20 | self.batch_size = batch_size 21 | self.gradient_accumulation_steps = self.batch_size // self.mbatch_size 22 | self.epochs = epochs 23 | self.lr = lr 24 | self.cutoff_len = cutoff_len 25 | self.lora_r = lora_r 26 | self.lora_alpha = lora_alpha 27 | # self.lora_dropout = 0 if gradient_checkpointing else lora_dropout 28 | self.lora_dropout = lora_dropout 29 | self.val_set_size = 
int(val_set_size) if val_set_size > 1.0 else float(val_set_size) 30 | self.warmup_steps = warmup_steps 31 | self.save_steps = save_steps 32 | self.save_total_limit = save_total_limit 33 | self.logging_steps = logging_steps 34 | self.world_size = int(os.environ.get("WORLD_SIZE", 1)) 35 | self.local_rank = int(os.environ.get("LOCAL_RANK", 0)) 36 | self.ddp = self.world_size != 1 37 | self.device_map = "auto" if not self.ddp else {"": self.local_rank} 38 | if self.ddp: 39 | self.gradient_accumulation_steps = self.gradient_accumulation_steps // self.world_size -------------------------------------------------------------------------------- /llmtune/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from llmtune.llms.config import AutoConfig 3 | from llmtune.llms.opt.config import OPT_MODELS 4 | from llmtune.llms.llama.config import LLAMA_MODELS 5 | from llmtune.engine.lora.config import FinetuneConfig 6 | from llmtune.engine.quant.config import QuantConfig 7 | 8 | # ---------------------------------------------------------------------------- 9 | 10 | # define some constants 11 | DEV = torch.device('cuda') 12 | LLM_MODELS = LLAMA_MODELS + OPT_MODELS 13 | 14 | # ---------------------------------------------------------------------------- 15 | 16 | # helpers for loading configs 17 | def get_finetune_config(args): 18 | return FinetuneConfig( 19 | dataset=args.dataset, 20 | ds_type=args.data_type, 21 | lora_out_dir=args.adapter, 22 | mbatch_size=args.mbatch_size, 23 | batch_size=args.batch_size, 24 | epochs=args.epochs, 25 | lr=args.lr, 26 | cutoff_len=args.cutoff_len, 27 | lora_r=args.lora_r, 28 | lora_alpha=args.lora_alpha, 29 | lora_dropout=args.lora_dropout, 30 | val_set_size=args.val_set_size, 31 | warmup_steps=args.warmup_steps, 32 | save_steps=args.save_steps, 33 | save_total_limit=args.save_total_limit, 34 | logging_steps=args.logging_steps, 35 | ) 36 | 37 | def get_quant_config(args): 38 | return QuantConfig( 39 | dataset=args.dataset, 40 | bits=args.bits, 41 | nsamples=args.nsamples, 42 | groupsize=args.groupsize, 43 | act_order=args.act_order, 44 | percdamp=args.percdamp, 45 | seed=args.seed, 46 | nearest=args.nearest, 47 | save=args.save, 48 | ) 49 | 50 | def get_llm_config(model_name_or_path): 51 | return AutoConfig(model_name_or_path) -------------------------------------------------------------------------------- /llmtune/llms/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from enum import Enum 4 | from transformers import PretrainedConfig, AutoConfig 5 | from transformers.utils.hub import PushToHubMixin, cached_file 6 | from llmtune.engine.quant.config import QuantConfig 7 | 8 | class LLMType(Enum): 9 | LLAMA = 'llama' 10 | OPT = 'opt' 11 | BLOOM = 'bloom' 12 | 13 | class AutoLLMConfig(PretrainedConfig,PushToHubMixin): 14 | def __init__( 15 | self, 16 | base_config: PretrainedConfig, 17 | quant_config: QuantConfig = None 18 | ): 19 | self.base_config = base_config 20 | self.quant_config = None 21 | if quant_config is not None: 22 | self.quant_config = quant_config 23 | 24 | @property 25 | def is_quantized(self): 26 | return self.quant_config is not None 27 | 28 | def set_quant_config(self, quant_config): 29 | if self.quant_config is not None: 30 | raise RuntimeError('quant_config already set') 31 | self.quant_config = quant_config 32 | 33 | @property 34 | def model_type(self): 35 | return self.base_config.model_type 36 | 37 | def 
save_pretrained(self, save_dir: str, **kwargs): 38 | self.base_config.save_pretrained(save_dir, **kwargs) 39 | if self.is_quantized: 40 | self.quant_config.save_pretrained(save_dir, **kwargs) 41 | 42 | @classmethod 43 | def from_pretrained(cls, save_dir: str): 44 | # load config 45 | base_config = AutoConfig.from_pretrained(save_dir) 46 | 47 | # check if quantized model and config are available 48 | try: 49 | quant_config = ( 50 | QuantConfig.from_pretrained(save_dir) 51 | ) 52 | except: 53 | quant_config = None 54 | 55 | # check if it's a valid model 56 | if base_config.model_type not in [e.value for e in LLMType]: 57 | raise NotImplementedError( 58 | f"Model type {base_config.model_type} currently not supported" 59 | ) 60 | 61 | return cls(base_config, quant_config) 62 | -------------------------------------------------------------------------------- /llmtune/llms/bloom/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from llmtune.utils import find_layers 5 | from llmtune.engine.quant.converter import make_quant 6 | 7 | def load_bloom_unquantized(llm_config): 8 | import torch 9 | from transformers import BloomForCausalLM 10 | def skip(*args, **kwargs): 11 | pass 12 | torch.nn.init.kaiming_uniform_ = skip 13 | torch.nn.init.uniform_ = skip 14 | torch.nn.init.normal_ = skip 15 | model = BloomForCausalLM.from_pretrained( 16 | llm_config.base_config.name_or_path, torch_dtype='auto' 17 | ) 18 | return model 19 | 20 | def load_bloom_quantized(llm_config, quantized_weights_path): 21 | import transformers, accelerate 22 | from transformers import BloomConfig, BloomForCausalLM 23 | 24 | with accelerate.init_empty_weights(): 25 | config = BloomConfig.from_pretrained( 26 | llm_config.base_config.name_or_path 27 | ) 28 | torch.set_default_dtype(torch.half) 29 | transformers.modeling_utils._init_weights = False 30 | torch.set_default_dtype(torch.half) 31 | model = BloomForCausalLM(config) 32 | torch.set_default_dtype(torch.float) 33 | model = model.eval() 34 | layers = find_layers(model) 35 | for name in ['lm_head']: 36 | if name in layers: 37 | del layers[name] 38 | make_quant( 39 | model, layers, llm_config.quant_config.bits, 40 | groupsize=llm_config.quant_config.groupsize 41 | ) 42 | model = accelerate.load_checkpoint_and_dispatch( 43 | model=model, 44 | checkpoint=quantized_weights_path, 45 | device_map="auto", 46 | # device_map={'': 0}, 47 | no_split_module_classes=["LlamaDecoderLayer"] 48 | ) 49 | return model 50 | 51 | def load_bloom(llm_config, quantized_weights_path): 52 | if quantized_weights_path is None: 53 | model = load_bloom_unquantized(llm_config) 54 | else: 55 | model = load_bloom_quantized( 56 | llm_config, quantized_weights_path 57 | ) 58 | model.seqlen = 2048 59 | return model 60 | 61 | def load_bloom_tokenizer(name_or_path): 62 | from transformers import BloomTokenizer 63 | 64 | tokenizer = BloomTokenizer.from_pretrained( 65 | name_or_path 66 | ) 67 | tokenizer.truncation_side = 'left' 68 | return tokenizer 69 | -------------------------------------------------------------------------------- /llmtune/llms/llama/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from llmtune.utils import find_layers 5 | from llmtune.engine.quant.converter import make_quant 6 | 7 | def load_llama_unquantized(llm_config): 8 | import torch 9 | from transformers import LlamaForCausalLM 10 | def skip(*args, **kwargs): 11 
| pass 12 | torch.nn.init.kaiming_uniform_ = skip 13 | torch.nn.init.uniform_ = skip 14 | torch.nn.init.normal_ = skip 15 | model = LlamaForCausalLM.from_pretrained( 16 | llm_config.base_config.name_or_path, torch_dtype='auto' 17 | ) 18 | return model 19 | 20 | def load_llama_quantized(llm_config, quantized_weights_path): 21 | import transformers, accelerate 22 | from transformers import LlamaConfig, LlamaForCausalLM 23 | 24 | with accelerate.init_empty_weights(): 25 | config = LlamaConfig.from_pretrained( 26 | llm_config.base_config.name_or_path 27 | ) 28 | torch.set_default_dtype(torch.half) 29 | transformers.modeling_utils._init_weights = False 30 | torch.set_default_dtype(torch.half) 31 | model = LlamaForCausalLM(config) 32 | torch.set_default_dtype(torch.float) 33 | model = model.eval() 34 | layers = find_layers(model) 35 | for name in ['lm_head']: 36 | if name in layers: 37 | del layers[name] 38 | make_quant( 39 | model, layers, llm_config.quant_config.bits, 40 | groupsize=llm_config.quant_config.groupsize 41 | ) 42 | model = accelerate.load_checkpoint_and_dispatch( 43 | model=model, 44 | checkpoint=quantized_weights_path, 45 | device_map="auto", 46 | # device_map={'': 0}, 47 | no_split_module_classes=["LlamaDecoderLayer"] 48 | ) 49 | return model 50 | 51 | def load_llama(llm_config, quantized_weights_path): 52 | if quantized_weights_path is None: 53 | model = load_llama_unquantized(llm_config) 54 | else: 55 | model = load_llama_quantized( 56 | llm_config, quantized_weights_path 57 | ) 58 | model.seqlen = 2048 59 | return model 60 | 61 | def load_llama_tokenizer(name_or_path): 62 | from transformers import LlamaTokenizer 63 | 64 | tokenizer = LlamaTokenizer.from_pretrained( 65 | name_or_path 66 | ) 67 | tokenizer.truncation_side = 'left' 68 | return tokenizer 69 | -------------------------------------------------------------------------------- /llmtune/engine/lora/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023-present the HuggingFace Inc. team. 3 | # Edite by Volodymyr Kuleshov 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import torch 18 | 19 | 20 | def prepare_model_for_int4_training( 21 | model, output_embedding_layer_name="lm_head", use_gradient_checkpointing=False, layer_norm_names=["layer_norm"] 22 | ): 23 | r""" 24 | This method wrapps the entire protocol for preparing a model before running a training. 
This includes: 25 | 1- casting the layernorm in fp32, 2- making the output embedding layer require grads, 3- adding the upcasting of the lm 26 | head to fp32. 27 | Args: 28 | model, (`transformers.PreTrainedModel`): 29 | The loaded model from `transformers` 30 | """ 31 | # loaded_in_8bit = getattr(model, "is_loaded_in_8bit", False) 32 | loaded_in_4bit = True 33 | 34 | for name, param in model.named_parameters(): 35 | # freeze base model's layers 36 | param.requires_grad = False 37 | 38 | if loaded_in_4bit: 39 | # cast layer norm in fp32 for stability for 4bit models 40 | if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names): 41 | param.data = param.data.to(torch.float32) 42 | 43 | if loaded_in_4bit and use_gradient_checkpointing: 44 | raise NotImplementedError() 45 | 46 | if hasattr(model, output_embedding_layer_name): 47 | output_embedding_layer = getattr(model, output_embedding_layer_name) 48 | input_dtype = output_embedding_layer.weight.dtype 49 | 50 | class CastOutputToFloat(torch.nn.Sequential): 51 | r""" 52 | Manually cast to the expected dtype of the lm_head as sometimes there is a final layer norm that is cast 53 | in fp32 54 | """ 55 | 56 | def forward(self, x): 57 | return super().forward(x.to(input_dtype)).to(torch.float32) 58 | 59 | setattr(model, output_embedding_layer_name, CastOutputToFloat(output_embedding_layer)) 60 | 61 | return model 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ModuLoRA 2 | Code repository (experiment) for the paper "ModuLoRA: Finetuning 3-Bit LLMs on Consumer GPUs by Integrating with Modular Quantizers", [ArXiv](https://arxiv.org/abs/2309.16119). 3 | 4 | **This repo builds on [LLMtools](https://github.com/kuleshov-group/llmtools), with added support for custom dataset preparation and evaluation to reproduce our experiments.** 5 | 6 | **Abstract:** We propose a memory-efficient finetuning algorithm for large language models (LLMs) that supports 7 | finetuning LLMs with 65B parameters in 3-bit or 4-bit precision on as little as one 48GB GPU. Our 8 | method, modular low-rank adaptation (MODULORA), integrates any user-specified weight quantizer 9 | with finetuning via low-rank adapters (LoRAs). Our approach relies on a simple quantization-agnostic 10 | backward pass that adaptively materializes low-precision LLM weights from a custom black-box 11 | quantization module. This approach enables finetuning 3-bit LLMs for the first time—leveraging 12 | state-of-the-art 3-bit OPTQ quantization often outperforms finetuning that relies on less sophisticated 13 | 4-bit and 8-bit methods. In our experiments, MODULORA attains competitive performance on text 14 | classification, natural language inference, and instruction following tasks using significantly less 15 | memory than existing approaches, and we also surpass the state-of-the-art ROUGE score on a popular 16 | summarization task. We release MODULORA together with a series of low-precision models— 17 | including the first family of 3-bit instruction following Alpaca LLMs—as part of LLMTOOLS, a 18 | user-friendly library for quantizing, running, and finetuning LLMs on consumer GPUs. 
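To make the core idea concrete, here is a minimal, self-contained sketch of a quantization-agnostic LoRA layer. It is an illustration only, not the library's implementation (see `llmtune/engine/inference/` and `llmtune/engine/lora/` for the real code), and the `FakeQuantizedWeight` class is a made-up stand-in for an arbitrary black-box quantizer such as OPTQ. The frozen base weight stays in low precision and is re-materialized on the fly in both the forward and the backward pass, so only the LoRA factors receive gradients:

```python
import torch
import torch.nn as nn

class FakeQuantizedWeight:
    """Toy stand-in for a black-box quantizer (e.g. OPTQ): any object works as
    long as it can hand back a dense weight matrix via dequantize()."""
    def __init__(self, weight: torch.Tensor):
        self.scale = weight.abs().max() / 127.0
        self.q = torch.round(weight / self.scale).to(torch.int8)  # low-precision storage

    def dequantize(self) -> torch.Tensor:
        return self.q.float() * self.scale

class QuantMatMul(torch.autograd.Function):
    """Quantization-agnostic matmul: the dense weight is materialized on the fly
    in both passes and immediately discarded, so only the quantized form persists."""
    @staticmethod
    def forward(ctx, x, qweight):
        ctx.qweight = qweight
        return x @ qweight.dequantize().t()

    @staticmethod
    def backward(ctx, grad_out):
        # re-materialize the dense weight just for this step; the frozen
        # base weight itself receives no gradient
        return grad_out @ ctx.qweight.dequantize(), None

class QuantLoRALinear(nn.Module):
    """Frozen quantized base layer plus a trainable low-rank update (B @ A)."""
    def __init__(self, weight: torch.Tensor, r: int = 8, alpha: int = 16):
        super().__init__()
        out_features, in_features = weight.shape
        self.qweight = FakeQuantizedWeight(weight)
        self.lora_A = nn.Parameter(torch.randn(r, in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.zeros(out_features, r))
        self.scaling = alpha / r

    def forward(self, x):
        base = QuantMatMul.apply(x, self.qweight)
        return base + (x @ self.lora_A.t() @ self.lora_B.t()) * self.scaling

# toy usage: gradients flow only into the LoRA factors
layer = QuantLoRALinear(torch.randn(32, 64))
layer(torch.randn(4, 64)).sum().backward()
print(layer.lora_A.grad.shape, layer.lora_B.grad.shape)  # (8, 64) and (32, 8)
```

Re-dequantizing in the backward pass trades a little extra compute for memory, which is what lets 65B-parameter models be finetuned on a single 48GB GPU.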
19 | 20 | 21 | # Repository Overview 22 | 23 | There are several directories in this repo: 24 | * [llmtune/](llmtune) contains the source code for the package `llmtune`, which needs to be installed to run the examples we provide; 25 | * [examples/](examples/) contains example implementations of 4-bit and 3-bit quantization using OPTQ, finetuning with the Alpaca dataset, and model generation after applying finetuned LoRA adapter weights; 26 | * [finetune/samsum-llama/](finetune/samsum-llama) contains the implementation of finetuning LLaMA models on the SAMSum benchmark with LoRA using our package and bitsandbytes, which can be used to reproduce the results in our paper; 27 | * [finetune/mnli-llama/](finetune/mnli-llama) contains the implementation of finetuning LLaMA models on the MNLI benchmark with LoRA using our package and bitsandbytes, which produces competitive results compared to SOTA; 28 | * Other finetuning scripts can also be found in the same directory: [OPT](finetune/samsum-opt), [BLOOM](finetune/mnli-bloom); 29 | * See how we train `MODULoRA` 3-bit / 4-bit models in [SAMSum-LLAMA](finetune/samsum-llama/train_samsum_4bit.py), [MNLI-LLAMA](finetune/mnli-llama/train_mnli_llmtune_label.py), and [BBH-LLAMA](finetune/mnli-llama/modeling_roberta.py) 30 | * See how we evaluate `MODULoRA` results in [SAMSum-LLAMA](finetune/samsum-llama/eval_samsum_4bit_llmtune.py), [MNLI-LLAMA](finetune/mnli-llama/eval_mnli_llmtune.py), and [BBH-LLAMA](finetune/bbh-eval/main_dev.py) 31 | 32 | -------------------------------------------------------------------------------- /llmtune/data/alpaca.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any 2 | from datasets import load_dataset 3 | from llmtune.data.abstract import AbstractTrainData 4 | 5 | DEFAULT_HF_PATH = "kuleshov/alpaca-data" 6 | 7 | class TrainSAD(AbstractTrainData): 8 | def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len) -> None: 9 | super().__init__(dataset, val_set_size, tokenizer, cutoff_len) 10 | 11 | def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]: 12 | # there's probably a way to do this with the tokenizer settings 13 | # but again, gotta move fast 14 | if use_eos_token: 15 | result = self.tokenizer( 16 | prompt + self.tokenizer.eos_token, 17 | truncation=True, 18 | max_length=self.cutoff_len, 19 | padding=False, 20 | ) 21 | if ( 22 | result["input_ids"][-1] != self.tokenizer.eos_token_id 23 | and len(result["input_ids"]) < self.cutoff_len 24 | ): 25 | result["input_ids"].append(self.tokenizer.eos_token_id) 26 | result["attention_mask"].append(1) 27 | return result 28 | else: 29 | result = self.tokenizer( 30 | prompt, 31 | truncation=True, 32 | max_length=self.cutoff_len + 1, 33 | padding="max_length", 34 | ) 35 | return { 36 | "input_ids": result["input_ids"][:-1], 37 | "attention_mask": result["attention_mask"][:-1], 38 | } 39 | 40 | def prepare_data(self, use_eos_token=True, **kwargs) -> None: 41 | if self.dataset: 42 | data = load_dataset("json", data_files=self.dataset) 43 | else: 44 | data = load_dataset(DEFAULT_HF_PATH) 45 | 46 | if self.val_set_size > 0: 47 | train_val = data["train"].train_test_split( 48 | test_size=self.val_set_size, shuffle=True, seed=42 49 | ) 50 | self.train_data = train_val["train"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token)) 51 | self.val_data = train_val["test"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token)) 52 | 
else: 53 | self.train_data = data["train"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token)) 54 | self.val_data = None 55 | 56 | # Auxiliary methods 57 | def generate_prompt(self, data_point, **kwargs): 58 | return make_prompt( 59 | data_point["instruction"], 60 | data_point["input"], 61 | data_point["output"] 62 | ) 63 | 64 | 65 | def generate_and_tokenize_prompt(self, data_point, **kwargs): 66 | prompt = self.generate_prompt(data_point, **kwargs) 67 | return self.tokenize(prompt, **kwargs) 68 | 69 | def make_prompt(instruction, input_, output=""): 70 | return "{0}\n\n{1}\n{2}\n\n{3}\n{4}\n\n{5}\n{6}".format( 71 | "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.", 72 | "### Instruction:", 73 | instruction, 74 | "### Input:", 75 | input_, 76 | "### Response:", 77 | output 78 | ) 79 | 80 | def make_output(raw_output): 81 | return raw_output.split("### Response:")[1].strip() -------------------------------------------------------------------------------- /llmtune/data/text.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, Any 3 | from datasets import Dataset 4 | from torch.utils.data import DataLoader 5 | from llmtune.data.abstract import AbstractTrainData 6 | 7 | # LLaMA txt train data loader 8 | class TrainTxt(AbstractTrainData): 9 | def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len): 10 | super().__init__(dataset, val_set_size, tokenizer, cutoff_len) # TODO: Validation size isn't used 11 | self.cutoff_len = cutoff_len 12 | self.exceed_count = 0 13 | 14 | def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]: 15 | # there's probably a way to do this with the tokenizer settings 16 | # but again, gotta move fast 17 | if use_eos_token: 18 | result = self.tokenizer( 19 | prompt + self.tokenizer.eos_token, 20 | truncation=True, 21 | max_length=self.cutoff_len, 22 | padding=False, 23 | ) 24 | d = { 25 | "input_ids": result["input_ids"], 26 | "attention_mask": result["attention_mask"], 27 | } 28 | if ( 29 | d["input_ids"][-1] != self.tokenizer.eos_token_id 30 | and len(d["input_ids"]) < self.cutoff_len 31 | ): 32 | d["input_ids"].append(self.tokenizer.eos_token_id) 33 | d["attention_mask"].append(1) 34 | else: 35 | result = self.tokenizer( 36 | prompt, 37 | truncation=True, 38 | max_length=self.cutoff_len + 1, 39 | padding="max_length", 40 | ) 41 | d = { 42 | "input_ids": result["input_ids"][:-1], 43 | "attention_mask": result["attention_mask"][:-1], 44 | } 45 | if sum(d['attention_mask']) >= self.cutoff_len: 46 | self.exceed_count += 1 47 | return d 48 | 49 | @classmethod 50 | def format_new_rows(cls, rows, thd=128): 51 | r_b = '' 52 | new_rows = [] 53 | for row in rows: 54 | if len(r_b) == 0: 55 | r_b += row 56 | else: 57 | r_b += '\n' + row 58 | if len(r_b) > thd: 59 | new_rows.append(r_b) 60 | r_b = '' 61 | if len(r_b) > thd: 62 | new_rows.append(r_b) 63 | r_b = '' 64 | return new_rows 65 | 66 | def prepare_data(self, thd=-1, use_eos_token=True, **kwargs): 67 | if os.path.isdir(self.dataset): 68 | rows = [] 69 | for filename in os.listdir(self.dataset): 70 | with open(self.dataset + filename, 'r', encoding='utf8') as file: 71 | txt = file.read() 72 | txt = txt.replace('\r\n', '\n').replace('\u3000', ' ') 73 | rows += [r for r in txt.split('\n') if r != ''] 74 | else: 75 | with open(self.dataset, 'r', encoding='utf8') 
as file: 76 | txt = file.read() 77 | txt = txt.replace('\r\n', '\n') 78 | rows = [r for r in txt.split('\n') if r != ''] 79 | if thd != -1: 80 | rows = self.format_new_rows(rows, thd=thd) 81 | data = Dataset.from_dict({"input": rows}) 82 | data = data.shuffle().map(lambda x: self.tokenize(x["input"], use_eos_token=use_eos_token)) 83 | print('Train Data: {:.2f}%'.format(self.exceed_count / len(data) * 100), 'outliers') 84 | self.train_data = data 85 | -------------------------------------------------------------------------------- /llmtune/llms/opt/model.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | from llmtune.utils import find_layers 5 | from llmtune.engine.quant.converter import make_quant 6 | 7 | def load_opt_unquantized(llm_config): 8 | from transformers import OPTForCausalLM 9 | def skip(*args, **kwargs): 10 | pass 11 | torch.nn.init.kaiming_uniform_ = skip 12 | torch.nn.init.uniform_ = skip 13 | torch.nn.init.normal_ = skip 14 | model = OPTForCausalLM.from_pretrained( 15 | llm_config.base_config.name_or_path, torch_dtype='auto' 16 | ) 17 | return model 18 | 19 | def load_opt_quantized(llm_config, quantized_weights_path): 20 | import transformers, accelerate 21 | from transformers import OPTConfig, OPTForCausalLM 22 | 23 | with accelerate.init_empty_weights(): 24 | config = OPTConfig.from_pretrained( 25 | llm_config.base_config.name_or_path 26 | ) 27 | torch.set_default_dtype(torch.half) 28 | transformers.modeling_utils._init_weights = False 29 | torch.set_default_dtype(torch.half) 30 | model = OPTForCausalLM(config) 31 | torch.set_default_dtype(torch.float) 32 | model = model.eval() 33 | layers = find_layers(model) 34 | for name in [ 35 | 'model.decoder.project_out', 36 | 'model.decoder.project_in', 'lm_head' 37 | ]: 38 | if name in layers: 39 | del layers[name] 40 | make_quant( 41 | model, layers, llm_config.quant_config.bits, 42 | groupsize=llm_config.quant_config.groupsize 43 | ) 44 | model = accelerate.load_checkpoint_and_dispatch( 45 | model=model, 46 | checkpoint=quantized_weights_path, 47 | device_map="auto", 48 | # device_map={'': 0}, 49 | no_split_module_classes=["OPTDecoderLayer"] 50 | ) 51 | return model 52 | 53 | def load_opt_quantized_old(llm_config, checkpoint): 54 | import transformers 55 | from transformers import OPTConfig, OPTForCausalLM 56 | def noop(*args, **kwargs): 57 | pass 58 | 59 | config = OPTConfig.from_pretrained( 60 | llm_config.base_config.name_or_path 61 | ) 62 | torch.nn.init.kaiming_uniform_ = noop 63 | torch.nn.init.uniform_ = noop 64 | torch.nn.init.normal_ = noop 65 | 66 | torch.set_default_dtype(torch.half) 67 | transformers.modeling_utils._init_weights = False 68 | torch.set_default_dtype(torch.half) 69 | model = OPTForCausalLM(config) 70 | torch.set_default_dtype(torch.float) 71 | model = model.eval() 72 | layers = find_layers(model) 73 | for name in [ 74 | 'model.decoder.project_out', 75 | 'model.decoder.project_in', 'lm_head' 76 | ]: 77 | if name in layers: 78 | del layers[name] 79 | make_quant(model, layers, llm_config.quant_config.bits) 80 | 81 | print('Loading OPT model') 82 | model.load_state_dict(torch.load(checkpoint)) 83 | model.seqlen = 2048 84 | print('Done') 85 | 86 | return model 87 | 88 | def load_opt(llm_config, quantized_weights_path): 89 | if quantized_weights_path is None: 90 | model = load_opt_unquantized(llm_config) 91 | else: 92 | model = load_opt_quantized( 93 | llm_config, quantized_weights_path 94 | ) 95 | model.seqlen = 2048 96 | return model 97 | 98 | 
| def load_opt_tokenizer(name_or_path): 99 | from transformers import AutoTokenizer 100 | tokenizer = AutoTokenizer.from_pretrained( 101 | name_or_path 102 | ) 103 | tokenizer.truncation_side = 'left' 104 | return tokenizer -------------------------------------------------------------------------------- /finetune/mnli-llama/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import torch 5 | import numpy as np 6 | 7 | 8 | def set_random_seed(seed): 9 | random.seed(seed) 10 | np.random.seed(seed) 11 | os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2" 12 | os.environ["PL_GLOBAL_SEED"] = str(seed) 13 | os.environ["PYTHONHASHSEED"] = str(seed) 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed_all(seed) 16 | torch.backends.cudnn.benchmark = False 17 | torch.backends.cudnn.deterministic = True 18 | 19 | 20 | def fix_tokenizer(tokenizer): 21 | # Fixing broken tokenizers 22 | special_tokens = dict() 23 | for token_id in range(1000): 24 | token = tokenizer.convert_ids_to_tokens(token_id) 25 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad" in token: 26 | special_tokens["pad_token"] = token 27 | if tokenizer.bos_token_id in (None, tokenizer.vocab_size) and "<s>" in token: 28 | special_tokens["bos_token"] = token 29 | if tokenizer.eos_token_id in (None, tokenizer.vocab_size) and "</s>" in token: 30 | special_tokens["eos_token"] = token 31 | if tokenizer.unk_token_id in (None, tokenizer.vocab_size) and "unk" in token: 32 | special_tokens["unk_token"] = token 33 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep" in token: 34 | special_tokens["sep_token"] = token 35 | 36 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "bos_token" in special_tokens: 37 | special_tokens["sep_token"] = special_tokens["bos_token"] 38 | 39 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad_token" not in special_tokens: 40 | if tokenizer.unk_token_id is not None: 41 | special_tokens["pad_token"] = tokenizer.unk_token 42 | else: 43 | special_tokens["pad_token"] = "<|pad|>" 44 | 45 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep_token" not in special_tokens: 46 | if tokenizer.bos_token_id is not None: 47 | special_tokens["sep_token"] = tokenizer.bos_token 48 | else: 49 | special_tokens["sep_token"] = "<|sep|>" 50 | print(special_tokens) 51 | tokenizer.add_special_tokens(special_tokens) 52 | 53 | print("Vocab size: ", tokenizer.vocab_size) 54 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token) 55 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token) 56 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token) 57 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token) 58 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token) 59 | return tokenizer 60 | 61 | 62 | def fix_model(model, tokenizer, use_resize=True): 63 | model.config.pad_token_id = tokenizer.pad_token_id 64 | assert model.config.pad_token_id is not None 65 | 66 | bos_candidates = ( 67 | tokenizer.bos_token_id, 68 | tokenizer.cls_token_id, 69 | tokenizer.sep_token_id, 70 | tokenizer.unk_token_id 71 | ) 72 | for bos_candidate in bos_candidates: 73 | model.config.bos_token_id = bos_candidate 74 | if bos_candidate is not None: 75 | break 76 | assert model.config.bos_token_id is not None 77 | model.config.decoder_start_token_id = model.config.bos_token_id 78 | 79 | eos_candidates = (tokenizer.eos_token_id, tokenizer.sep_token_id) 80 | for eos_candidate in eos_candidates: 
81 | model.config.eos_token_id = eos_candidate 82 | if eos_candidate is not None: 83 | break 84 | assert model.config.eos_token_id is not None 85 | 86 | if use_resize: 87 | model.resize_token_embeddings(len(tokenizer)) 88 | 89 | return model 90 | 91 | 92 | def gen_batch(records, batch_size): 93 | batch_start = 0 94 | while batch_start < len(records): 95 | batch_end = batch_start + batch_size 96 | batch = records[batch_start: batch_end] 97 | batch_start = batch_end 98 | yield batch -------------------------------------------------------------------------------- /examples/finetune.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import transformers 4 | from transformers import AutoTokenizer 5 | from llmtune.llms.autollm import AutoLLMForCausalLM 6 | from llmtune.engine.lora.config import FinetuneConfig 7 | from llmtune.data import TrainSAD 8 | from llmtune.engine.lora.peft import quant_peft 9 | from llmtune.utils import to_half_precision 10 | 11 | # model config 12 | model_name = '' 13 | # model_name = './llama-7b-quantized' # can generate local dir via quantize.py 14 | tokenizer_name = 'huggyllama/llama-13b' 15 | DEV = 'cuda' 16 | 17 | # load model 18 | transformers.logging.set_verbosity_info() 19 | llm = AutoLLMForCausalLM.from_pretrained(model_name) 20 | llm.eval() 21 | llm = llm.to(DEV) 22 | llm = to_half_precision(llm) 23 | 24 | # load tokenizer 25 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) 26 | tokenizer.pad_token_id = 0 27 | 28 | # finetune training config 29 | mbatch_size=1 30 | batch_size=2 31 | epochs=3 32 | lr=2e-4 33 | cutoff_len=256 34 | lora_r=8 35 | lora_alpha=16 36 | lora_dropout=0.05 37 | val_set_size=0.2 38 | warmup_steps=50 39 | save_steps=50 40 | save_total_limit=3 41 | logging_steps=10 42 | 43 | data_type = 'alpaca' 44 | dataset = None # will load alpaca from HF 45 | adapter_path = './llama-7b-quantized-lora' 46 | 47 | # set up finetuning config 48 | tune_config = FinetuneConfig( 49 | dataset=dataset, 50 | ds_type=data_type, 51 | lora_out_dir=adapter_path, 52 | mbatch_size=mbatch_size, 53 | batch_size=batch_size, 54 | epochs=epochs, 55 | lr=lr, 56 | cutoff_len=cutoff_len, 57 | lora_r=lora_r, 58 | lora_alpha=lora_alpha, 59 | lora_dropout=lora_dropout, 60 | val_set_size=val_set_size, 61 | warmup_steps=warmup_steps, 62 | save_steps=save_steps, 63 | save_total_limit=save_total_limit, 64 | logging_steps=logging_steps, 65 | ) 66 | 67 | # set up lora config 68 | lora_config = quant_peft.LoraConfig( 69 | r=tune_config.lora_r, 70 | lora_alpha=tune_config.lora_alpha, 71 | target_modules=["q_proj", "v_proj"], 72 | lora_dropout=tune_config.lora_dropout, 73 | bias="none", 74 | task_type="CAUSAL_LM", 75 | ) 76 | 77 | # create a new lora from config 78 | model = quant_peft.get_peft_model(llm, lora_config) 79 | 80 | # load stanford alpaca data 81 | data = TrainSAD( 82 | tune_config.dataset, 83 | tune_config.val_set_size, 84 | tokenizer, 85 | tune_config.cutoff_len 86 | ) 87 | data.prepare_data() # this tokenizes the dataset 88 | 89 | # training args 90 | training_arguments = transformers.TrainingArguments( 91 | per_device_train_batch_size=tune_config.mbatch_size, 92 | gradient_accumulation_steps=tune_config.gradient_accumulation_steps, 93 | warmup_steps=tune_config.warmup_steps, 94 | num_train_epochs=tune_config.epochs, 95 | learning_rate=tune_config.lr, 96 | fp16=True, 97 | logging_steps=tune_config.logging_steps, 98 | evaluation_strategy="no", 99 | save_strategy="steps", 100 | eval_steps=None, 101 
| save_steps=tune_config.save_steps, 102 | output_dir=tune_config.lora_out_dir, 103 | save_total_limit=tune_config.save_total_limit, 104 | load_best_model_at_end=False, 105 | ddp_find_unused_parameters=False if tune_config.ddp else None, 106 | ) 107 | 108 | # start trainer 109 | trainer = transformers.Trainer( 110 | model=model, 111 | train_dataset=data.train_data, 112 | eval_dataset=data.val_data, 113 | args=training_arguments, 114 | data_collator=transformers.DataCollatorForLanguageModeling( 115 | tokenizer, mlm=False 116 | ), 117 | ) 118 | print(training_arguments.parallel_mode) 119 | model.config.use_cache = False 120 | 121 | # use half precision 122 | model = to_half_precision(model) 123 | 124 | # start training 125 | checkpoint_dir = tune_config.lora_out_dir 126 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir): 127 | trainer.train(resume_from_checkpoint=True) 128 | else: 129 | trainer.train() 130 | 131 | # Save Model 132 | model.save_pretrained(tune_config.lora_out_dir) 133 | 134 | -------------------------------------------------------------------------------- /llmtune/data/gpt4all.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import Dict, Any 3 | from datasets import load_dataset 4 | from llmtune.data.abstract import AbstractTrainData 5 | 6 | # GPT4All-like Data 7 | class TrainGPT4All(AbstractTrainData): 8 | def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len) -> None: 9 | super().__init__(dataset, val_set_size, tokenizer, cutoff_len) 10 | 11 | def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]: 12 | pass 13 | 14 | def tokenize_inputs(self, examples): 15 | max_length = self.cutoff_len 16 | input_ids = torch.full((len(examples["prompt"]), max_length), self.tokenizer.pad_token_id) 17 | # ignore bos 18 | newline_tokens = self.tokenizer("\n", return_tensors="pt")["input_ids"][0, 1:] 19 | 20 | out = {"labels": [], "attention_mask": []} 21 | for i, (prompt, response) in enumerate(zip(examples["prompt"], examples["response"])): 22 | input_tokens = self.tokenizer(prompt, truncation=True, max_length=max_length // 2, return_tensors="pt")["input_ids"].squeeze() 23 | if input_tokens.dim() == 0: 24 | input_tokens = input_tokens.unsqueeze(0) 25 | 26 | input_len = len(input_tokens) 27 | 28 | # plus one since we remove bos from response 29 | # but we subtract one since we want to add eos token 30 | remaining_tokens = max_length - input_len - len(newline_tokens) + 1 31 | # remove bos 32 | target_tokens = self.tokenizer(response, truncation=True, max_length=remaining_tokens, return_tensors="pt")["input_ids"].squeeze()[1:] 33 | 34 | input_ids[i, :input_len] = input_tokens 35 | # add newline between prompt and response 36 | newline_plus_inputs = input_len + len(newline_tokens) 37 | input_ids[i, input_len: newline_plus_inputs] = newline_tokens 38 | 39 | # add target tokens, remove bos 40 | input_ids[i, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens 41 | # add eos token, enforce stopping if we don't truncate 42 | # we don't want long code to stop generating if truncated during training 43 | if newline_plus_inputs + len(target_tokens) < max_length: 44 | input_ids[i, newline_plus_inputs + len(target_tokens)] = self.tokenizer.eos_token_id 45 | 46 | labels = input_ids[i].clone() 47 | labels[: newline_plus_inputs] = -100 48 | labels[labels == self.tokenizer.pad_token_id] = -100 49 | # to debug this, can set all values == -100 to the pad 
token, then assert that tokenizer.decode(labels, skip_special_tokens=True).strip() == response 50 | 51 | attention_mask = input_ids[i].ne(self.tokenizer.pad_token_id).int() 52 | 53 | out["labels"].append(labels) 54 | out["attention_mask"].append(attention_mask) 55 | 56 | out["input_ids"] = input_ids 57 | 58 | out = {k: torch.stack(v) if isinstance(v, list) else v for k, v in out.items()} 59 | 60 | return out 61 | 62 | def prepare_data(self, **kwargs) -> None: 63 | dataset = load_dataset("json", data_files=self.dataset) 64 | 65 | self.val_data = None 66 | if self.val_set_size > 0: 67 | dataset = dataset["train"].train_test_split( 68 | test_size=self.val_set_size, shuffle=True, seed=42 # ! Seed = 42 (?) 69 | ) 70 | train_dataset, val_dataset = dataset["train"], dataset["test"] 71 | 72 | # tokenize inputs and return labels and attention mask 73 | val_dataset = val_dataset.map( 74 | lambda ele: self.tokenize_inputs(ele), 75 | batched=True, 76 | remove_columns=["source", "prompt"], 77 | ) 78 | self.val_data = val_dataset.with_format("torch") 79 | else: 80 | train_dataset = dataset["train"] 81 | 82 | train_dataset = train_dataset.map( 83 | lambda ele: self.tokenize_inputs(ele), 84 | batched=True, 85 | remove_columns=["source", "prompt"], 86 | ) 87 | self.train_data = train_dataset.with_format("torch") 88 | -------------------------------------------------------------------------------- /finetune/bbh-eval/bbh_dev.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from argparse import Namespace 4 | from typing import List 5 | 6 | from datasets import load_dataset, get_dataset_config_names 7 | from fire import Fire 8 | from pydantic import BaseModel 9 | from tqdm import tqdm 10 | 11 | from modeling_dev import select_model, EvalModel 12 | 13 | 14 | class BBHSample(BaseModel): 15 | input: str 16 | target: str 17 | 18 | def as_prompt(self, include_answer: bool = True): 19 | prompt = self.input 20 | prompt += "\nAnswer:" 21 | if include_answer: 22 | prompt += " {}\n\n".format(self.target) 23 | return prompt 24 | 25 | 26 | class BBHData(BaseModel): 27 | samples: List[BBHSample] 28 | 29 | @classmethod 30 | def get_config_names(cls, path: str = "lukaemon/bbh") -> List[str]: 31 | return get_dataset_config_names(path) 32 | 33 | @classmethod 34 | def load_from_huggingface( 35 | cls, path: str = "lukaemon/bbh", config: str = "", split: str = "test" 36 | ): 37 | data = load_dataset(path, config, split=split) 38 | samples = [BBHSample(**raw) for raw in tqdm(data, desc=str((path, split)))] 39 | return cls(samples=samples) 40 | 41 | 42 | def gen_prompt(data: BBHData, k=-1): 43 | prompt = "" 44 | if k == -1: 45 | k = len(data.samples) 46 | for i in range(k): 47 | prompt += data.samples[i].as_prompt() 48 | return prompt 49 | 50 | 51 | def evaluate(model: EvalModel, data: BBHData, ntrain: int) -> dict: 52 | data_train = BBHData(samples=data.samples[:ntrain]) 53 | data_test = BBHData(samples=data.samples[ntrain:]) 54 | is_correct = [] 55 | 56 | for i in range(len(data_test.samples)): 57 | # get prompt and make sure it fits 58 | k = int(ntrain) 59 | prompt_end = data_test.samples[i].as_prompt(include_answer=False) 60 | train_prompt = gen_prompt(data_train, k) 61 | prompt = train_prompt + prompt_end 62 | 63 | while not model.check_valid_length(prompt) and k > 0: 64 | k -= 1 65 | train_prompt = gen_prompt(data_train, k) 66 | prompt = train_prompt + prompt_end 67 | 68 | label = data_test.samples[i].target 69 | pred = model.run(prompt) 70 | 
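For reference, a standalone sketch of the few-shot prompt layout that `BBHSample.as_prompt()` and `gen_prompt()` above assemble; the task text here is made up and only illustrates the format:

```python
# Mirrors the prompt construction in bbh_dev.py; inputs are invented examples.
def as_prompt(input_text, target=None):
    prompt = input_text + "\nAnswer:"
    if target is not None:
        prompt += " {}\n\n".format(target)
    return prompt

train_shot = as_prompt("not ( True ) and ( True ) is", "False")   # in-context example
test_query = as_prompt("True and not not ( not False ) is")       # query, answer left blank
print(train_shot + test_query)
# not ( True ) and ( True ) is
# Answer: False
#
# True and not not ( not False ) is
# Answer:
```

The evaluation loop then scores a prediction as correct when the model's continuation starts with the gold target string.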
is_correct.append(pred.strip().startswith(label)) 71 | if i == 0: 72 | print(dict(prompt=prompt, label=label, pred=pred)) 73 | 74 | return dict(score=sum(is_correct) / len(is_correct)) 75 | 76 | 77 | def main(data_dir: str = "lukaemon/bbh", ntrain: int = 3, **kwargs): 78 | args = Namespace(**locals()) 79 | model = select_model(max_input_length=2048, max_output_length=32, **kwargs) 80 | print(locals()) 81 | 82 | if 'load_4bit' in kwargs: 83 | loadin_4bit = 'true' 84 | else: 85 | loadin_4bit = 'false' 86 | 87 | if 'load_8bit' in kwargs: 88 | loadin_8bit = 'true' 89 | else: 90 | loadin_8bit = 'false' 91 | 92 | if 'lora_path' in kwargs: 93 | file_name = f"all_results_{kwargs['model_path'].replace('/', '-')}_{kwargs['lora_path'].replace('/', '-')}_4bit_{loadin_4bit}_8bit_{loadin_8bit}.txt" 94 | else: 95 | file_name = f"all_results_{kwargs['model_path'].replace('/', '-')}_4bit_{loadin_4bit}_8bit_{loadin_8bit}.txt" 96 | 97 | all_results = [] 98 | if os.path.exists(file_name): 99 | with open(file_name, "r") as f: 100 | print(f"Loading {file_name}") 101 | all_results = json.load(f) 102 | print(all_results) 103 | 104 | start = len(all_results) 105 | for name in tqdm(BBHData.get_config_names()[start:]): 106 | data = BBHData.load_from_huggingface(config=name) 107 | result = evaluate(model, data, ntrain=ntrain) 108 | all_results.append(result) 109 | print(dict(name=name, **result)) 110 | 111 | # Save the state of all_results after each iteration 112 | with open(file_name, "w") as f: 113 | json.dump(all_results, f) 114 | 115 | score = sum(res["score"] for res in all_results) / len(all_results) 116 | print(dict(average=score)) 117 | return score 118 | 119 | 120 | if __name__ == "__main__": 121 | Fire() 122 | -------------------------------------------------------------------------------- /llmtune/data/calibration.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def set_seed(seed): 6 | np.random.seed(seed) 7 | torch.random.manual_seed(seed) 8 | 9 | 10 | def get_wikitext2(nsamples, seed, seqlen, model): 11 | from datasets import load_dataset 12 | traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train') 13 | testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') 14 | 15 | from transformers import AutoTokenizer 16 | tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) 17 | trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt') 18 | testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt') 19 | 20 | import random 21 | random.seed(seed) 22 | trainloader = [] 23 | for _ in range(nsamples): 24 | i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) 25 | j = i + seqlen 26 | inp = trainenc.input_ids[:, i:j] 27 | tar = inp.clone() 28 | tar[:, :-1] = -100 29 | trainloader.append((inp, tar)) 30 | return trainloader, testenc 31 | 32 | def get_ptb(nsamples, seed, seqlen, model): 33 | from datasets import load_dataset 34 | traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') 35 | valdata = load_dataset('ptb_text_only', 'penn_treebank', split='validation') 36 | 37 | from transformers import AutoTokenizer 38 | tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) 39 | trainenc = tokenizer("\n\n".join(traindata['sentence']), return_tensors='pt') 40 | testenc = tokenizer("\n\n".join(valdata['sentence']), return_tensors='pt') 41 | 42 | import random 43 | random.seed(seed) 44 | trainloader = [] 45 | for _ in 
range(nsamples): 46 | i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) 47 | j = i + seqlen 48 | inp = trainenc.input_ids[:, i:j] 49 | tar = inp.clone() 50 | tar[:, :-1] = -100 51 | trainloader.append((inp, tar)) 52 | return trainloader, testenc 53 | 54 | def get_c4(nsamples, seed, seqlen, model): 55 | from datasets import load_dataset 56 | traindata = load_dataset( 57 | 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train', use_auth_token=True 58 | ) 59 | valdata = load_dataset( 60 | 'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation',use_auth_token=True 61 | ) 62 | 63 | from transformers import AutoTokenizer 64 | tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) 65 | 66 | import random 67 | random.seed(seed) 68 | trainloader = [] 69 | for _ in range(nsamples): 70 | while True: 71 | i = random.randint(0, len(traindata) - 1) 72 | trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') 73 | if trainenc.input_ids.shape[1] >= seqlen: 74 | break 75 | i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) 76 | j = i + seqlen 77 | inp = trainenc.input_ids[:, i:j] 78 | tar = inp.clone() 79 | tar[:, :-1] = -100 80 | trainloader.append((inp, tar)) 81 | 82 | import random 83 | random.seed(0) 84 | valenc = [] 85 | for _ in range(256): 86 | while True: 87 | i = random.randint(0, len(valdata) - 1) 88 | tmp = tokenizer(valdata[i]['text'], return_tensors='pt') 89 | if tmp.input_ids.shape[1] >= seqlen: 90 | break 91 | i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1) 92 | j = i + seqlen 93 | valenc.append(tmp.input_ids[:, i:j]) 94 | valenc = torch.hstack(valenc) 95 | class TokenizerWrapper: 96 | def __init__(self, input_ids): 97 | self.input_ids = input_ids 98 | valenc = TokenizerWrapper(valenc) 99 | 100 | return trainloader, valenc 101 | 102 | 103 | def get_calibration_loaders( 104 | name, nsamples=128, seed=0, seqlen=2048, model='' 105 | ): 106 | if 'wikitext2' in name: 107 | return get_wikitext2(nsamples, seed, seqlen, model) 108 | if 'ptb' in name: 109 | return get_ptb(nsamples, seed, seqlen, model) 110 | if 'c4' in name: 111 | return get_c4(nsamples, seed, seqlen, model) 112 | -------------------------------------------------------------------------------- /llmtune/engine/inference/matmult.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | try: 4 | import quant_cuda 5 | except: 6 | print('CUDA extension not installed. 
Inference will not work.') 7 | 8 | # Global Buffer 9 | buffer_mat_dic = {} 10 | use_new = True 11 | auto_switch = True 12 | auto_switch_thd = 8 13 | debug = False 14 | cache_buffer = True 15 | 16 | def get_buffer(shape_of_qweight, dtype=torch.float16, device='cuda', bits=4): 17 | target_shape = (shape_of_qweight[0] * (32 // bits), shape_of_qweight[1]) 18 | if not cache_buffer: 19 | return torch.zeros(target_shape, dtype=dtype, device=device) 20 | if target_shape not in buffer_mat_dic.keys(): 21 | buffer_mat_dic[target_shape] = torch.zeros(target_shape, dtype=dtype, device=device) 22 | else: 23 | if buffer_mat_dic[target_shape].device != device: 24 | buffer_mat_dic[target_shape] = buffer_mat_dic[target_shape].to(device) 25 | if buffer_mat_dic[target_shape].dtype != dtype: 26 | buffer_mat_dic[target_shape] = buffer_mat_dic[target_shape].to(dtype=dtype) 27 | return buffer_mat_dic[target_shape] 28 | 29 | def _matmul4bit_v1_recons(x, qweight, scales, zeros, transpose=False): 30 | if debug: 31 | print('_matmul4bit_v1_recons') 32 | if not transpose: 33 | assert qweight.shape[0] * 8 == x.shape[-1] 34 | else: 35 | assert qweight.shape[1] == x.shape[-1] 36 | buffer = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device) 37 | quant_cuda.vecquant4recons_v1(qweight, buffer, scales, zeros) 38 | # dtype = x.dtype 39 | # x = x.float() 40 | if not transpose: 41 | output = torch.matmul(x, buffer) 42 | else: 43 | output = torch.matmul(x, buffer.T) 44 | # output = output.to(dtype) 45 | return output 46 | 47 | 48 | def _matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx, transpose=False): 49 | if debug: 50 | print('_matmul4bit_v2_recons') 51 | if not transpose: 52 | assert qweight.shape[0] * 8 == x.shape[-1] 53 | else: 54 | assert qweight.shape[1] == x.shape[-1] 55 | buffer = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device) 56 | quant_cuda.vecquant4recons_v2(qweight, buffer, scales, zeros, g_idx) 57 | if not transpose: 58 | output = torch.matmul(x, buffer) 59 | else: 60 | output = torch.matmul(x, buffer.T) 61 | return output 62 | 63 | 64 | def _matmul2bit_v2_recons(x, qweight, scales, zeros, g_idx, transpose=False): 65 | if debug: 66 | print('_matmul2bit_v2_recons') 67 | if not transpose: 68 | assert qweight.shape[0] * 16 == x.shape[-1] 69 | else: 70 | assert qweight.shape[1] == x.shape[-1] 71 | buffer = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device, bits=2) 72 | quant_cuda.vecquant2recons_v2(qweight, buffer, scales, zeros, g_idx) 73 | if not transpose: 74 | output = torch.matmul(x, buffer) 75 | else: 76 | output = torch.matmul(x, buffer.T) 77 | return output 78 | 79 | 80 | def matmul4bit(x, qweight, scales, zeros, g_idx=None): 81 | raise NotImplementedError() 82 | # detect if zeros is int32 83 | if zeros.dtype != torch.int32: 84 | # use v1 85 | if use_new: 86 | if auto_switch: 87 | if np.prod(x.shape[:-1]) > auto_switch_thd: 88 | output = _matmul4bit_v1_recons(x.half(), qweight, scales.half(), zeros.half()) 89 | else: 90 | output = _matmul4bit_v1(x, qweight, scales, zeros) 91 | else: 92 | output = _matmul4bit_v1(x, qweight, scales, zeros) 93 | else: 94 | if g_idx is None: 95 | g_idx = torch.zeros(qweight.shape[0] * 8, dtype=torch.int32, device=x.device) 96 | # use v2 97 | if use_new: 98 | if auto_switch: 99 | if np.prod(x.shape[:-1]) > auto_switch_thd: 100 | output = _matmul4bit_v2_recons(x.half(), qweight, scales.half(), zeros, g_idx) 101 | else: 102 | output = _matmul4bit_v2(x, qweight, scales, zeros, g_idx) 103 | else: 104 | output = 
_matmul4bit_v2(x, qweight, scales, zeros, g_idx) 105 | return output 106 | 107 | 108 | def matmul3bit(x, qweight, scales, zeros, g_idx, outfeatures): 109 | out_shape = x.shape[:-1] + (outfeatures, ) 110 | x = x.reshape(-1,x.shape[-1]) 111 | output = torch.zeros((x.shape[0], outfeatures), device=x.device, dtype=torch.float32) 112 | quant_cuda.vecquant3matmul(x.float(), qweight, output, scales.float(), zeros, g_idx) 113 | output = output.reshape(out_shape) 114 | return output -------------------------------------------------------------------------------- /llmtune/engine/quant/gptq/quantizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | def quantize(x, scale, zero, maxq): 7 | if maxq < 0: 8 | return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero 9 | q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) 10 | return scale * (q - zero) 11 | 12 | class Quantizer(nn.Module): 13 | def __init__(self, shape=1): 14 | super(Quantizer, self).__init__() 15 | self.register_buffer('maxq', torch.tensor(0)) 16 | self.register_buffer('scale', torch.zeros(shape)) 17 | self.register_buffer('zero', torch.zeros(shape)) 18 | 19 | def configure( 20 | self, 21 | bits, perchannel=False, sym=True, 22 | mse=False, norm=2.4, grid=100, maxshrink=.8, 23 | trits=False 24 | ): 25 | 26 | self.maxq = torch.tensor(2 ** bits - 1) 27 | self.perchannel = perchannel 28 | self.sym = sym 29 | self.mse = mse 30 | self.norm = norm 31 | self.grid = grid 32 | self.maxshrink = maxshrink 33 | if trits: 34 | self.maxq = torch.tensor(-1) 35 | 36 | def find_params(self, x, weight=False): 37 | dev = x.device 38 | self.maxq = self.maxq.to(dev) 39 | 40 | shape = x.shape 41 | if self.perchannel: 42 | if weight: 43 | x = x.flatten(1) 44 | else: 45 | if len(shape) == 4: 46 | x = x.permute([1, 0, 2, 3]) 47 | x = x.flatten(1) 48 | if len(shape) == 3: 49 | x = x.reshape((-1, shape[-1])).t() 50 | if len(shape) == 2: 51 | x = x.t() 52 | else: 53 | x = x.flatten().unsqueeze(0) 54 | 55 | tmp = torch.zeros(x.shape[0], device=dev) 56 | xmin = torch.minimum(x.min(1)[0], tmp) 57 | xmax = torch.maximum(x.max(1)[0], tmp) 58 | 59 | if self.sym: 60 | xmax = torch.maximum(torch.abs(xmin), xmax) 61 | tmp = xmin < 0 62 | if torch.any(tmp): 63 | xmin[tmp] = -xmax[tmp] 64 | tmp = (xmin == 0) & (xmax == 0) 65 | xmin[tmp] = -1 66 | xmax[tmp] = +1 67 | 68 | if self.maxq < 0: 69 | self.scale = xmax 70 | self.zero = xmin 71 | else: 72 | self.scale = (xmax - xmin) / self.maxq 73 | if self.sym: 74 | self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) 75 | else: 76 | self.zero = torch.round(-xmin / self.scale) 77 | 78 | if self.mse: 79 | best = torch.full([x.shape[0]], float('inf'), device=dev) 80 | for i in range(int(self.maxshrink * self.grid)): 81 | p = 1 - i / self.grid 82 | xmin1 = p * xmin 83 | xmax1 = p * xmax 84 | scale1 = (xmax1 - xmin1) / self.maxq 85 | zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero 86 | q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq) 87 | q -= x 88 | q.abs_() 89 | q.pow_(self.norm) 90 | err = torch.sum(q, 1) 91 | tmp = err < best 92 | if torch.any(tmp): 93 | best[tmp] = err[tmp] 94 | self.scale[tmp] = scale1[tmp] 95 | self.zero[tmp] = zero1[tmp] 96 | if not self.perchannel: 97 | if weight: 98 | tmp = shape[0] 99 | else: 100 | tmp = shape[1] if len(shape) != 3 else shape[2] 101 | self.scale = self.scale.repeat(tmp) 102 | self.zero = self.zero.repeat(tmp) 103 | 104 
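For intuition, the `quantize` helper at the top of `quantizer.py` is plain affine quantization, `q = clamp(round(x / scale) + zero, 0, maxq)`, dequantized as `scale * (q - zero)`. A tiny numeric illustration with arbitrary values, mirroring the asymmetric branch of `find_params` (the `maxq < 0` trits branch is ignored here):

```python
import torch

def quantize(x, scale, zero, maxq):
    # same formula as the helper above, minus the trits special case
    q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
    return scale * (q - zero)

x = torch.tensor([-0.9, -0.1, 0.0, 0.4, 1.2])
maxq = 15                                        # 4-bit grid: 2**4 - 1
scale = torch.tensor((1.2 - (-0.9)) / maxq)      # (xmax - xmin) / maxq, as in find_params
zero = torch.round(torch.tensor(0.9) / scale)    # round(-xmin / scale) -> 6
print(quantize(x, scale, zero, maxq))            # values snapped to the 16-level grid, e.g. -0.9 -> -0.84
```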
| if weight: 105 | shape = [-1] + [1] * (len(shape) - 1) 106 | self.scale = self.scale.reshape(shape) 107 | self.zero = self.zero.reshape(shape) 108 | return 109 | if len(shape) == 4: 110 | self.scale = self.scale.reshape((1, -1, 1, 1)) 111 | self.zero = self.zero.reshape((1, -1, 1, 1)) 112 | if len(shape) == 3: 113 | self.scale = self.scale.reshape((1, 1, -1)) 114 | self.zero = self.zero.reshape((1, 1, -1)) 115 | if len(shape) == 2: 116 | self.scale = self.scale.unsqueeze(0) 117 | self.zero = self.zero.unsqueeze(0) 118 | 119 | def quantize(self, x): 120 | if self.ready(): 121 | return quantize(x, self.scale, self.zero, self.maxq) 122 | return x 123 | 124 | def enabled(self): 125 | return self.maxq > 0 126 | 127 | def ready(self): 128 | return torch.all(self.scale != 0) 129 | -------------------------------------------------------------------------------- /llmtune/engine/inference/cuda/quant_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/all.h> 2 | #include <torch/python.h> 3 | #include <c10/cuda/CUDAGuard.h> 4 | 5 | // standard forward operations 6 | 7 | void vecquant2matmul_cuda( 8 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 9 | torch::Tensor scales, torch::Tensor zeros, 10 | torch::Tensor g_idx 11 | ); 12 | 13 | void vecquant2matmul( 14 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 15 | torch::Tensor scales, torch::Tensor zeros, 16 | torch::Tensor g_idx 17 | ) { 18 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 19 | vecquant2matmul_cuda(vec, mat, mul, scales, zeros, g_idx); 20 | } 21 | 22 | void vecquant3matmul_cuda( 23 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 24 | torch::Tensor scales, torch::Tensor zeros, 25 | torch::Tensor g_idx 26 | ); 27 | 28 | void vecquant3matmul( 29 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 30 | torch::Tensor scales, torch::Tensor zeros, 31 | torch::Tensor g_idx 32 | ) { 33 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 34 | vecquant3matmul_cuda(vec, mat, mul, scales, zeros, g_idx); 35 | } 36 | 37 | void vecquant4matmul_cuda( 38 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 39 | torch::Tensor scales, torch::Tensor zeros, 40 | torch::Tensor g_idx 41 | ); 42 | 43 | void vecquant4matmul( 44 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 45 | torch::Tensor scales, torch::Tensor zeros, 46 | torch::Tensor g_idx 47 | ) { 48 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 49 | vecquant4matmul_cuda(vec, mat, mul, scales, zeros, g_idx); 50 | } 51 | 52 | void vecquant8matmul_cuda( 53 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 54 | torch::Tensor scales, torch::Tensor zeros, 55 | torch::Tensor g_idx 56 | ); 57 | 58 | void vecquant8matmul( 59 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 60 | torch::Tensor scales, torch::Tensor zeros, 61 | torch::Tensor g_idx 62 | ) { 63 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 64 | vecquant8matmul_cuda(vec, mat, mul, scales, zeros, g_idx); 65 | } 66 | 67 | // methods based on reconstruction (unpacking) 68 | 69 | void vecquant4recons_v1_cuda( 70 | torch::Tensor mat, torch::Tensor res, torch::Tensor scales, torch::Tensor zeros 71 | ); 72 | 73 | void vecquant4recons_v1( 74 | torch::Tensor mat, torch::Tensor res, 75 | torch::Tensor scales, torch::Tensor zeros 76 | ) { 77 | const at::cuda::OptionalCUDAGuard device_guard(device_of(scales)); 78 | vecquant4recons_v1_cuda(mat, res, scales, zeros); 79 | } 80 | 81 | void vecquant4recons_v2_cuda( 82
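The reconstruction kernels bound here unpack the int32-packed quantized weights into a full fp16 matrix before a regular matmul; `get_buffer` in `matmult.py` sizes that scratch buffer as `(rows * 32 // bits, cols)`. A small sketch of the shape arithmetic, with made-up dimensions:

```python
# Shape bookkeeping only; the actual unpacking is done by the CUDA kernels above.
bits = 4
in_features, out_features = 4096, 4096        # hypothetical projection size
packed_rows = in_features * bits // 32        # each int32 row packs 32 // bits = 8 values
qweight_shape = (packed_rows, out_features)   # (512, 4096) int32
buffer_shape = (qweight_shape[0] * (32 // bits), qweight_shape[1])  # (4096, 4096) fp16
assert buffer_shape == (in_features, out_features)
```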
| torch::Tensor mat, torch::Tensor res, 83 | torch::Tensor scales, torch::Tensor zeros, 84 | torch::Tensor g_idx 85 | ); 86 | 87 | void vecquant4recons_v2( 88 | torch::Tensor mat, torch::Tensor res, torch::Tensor scales, torch::Tensor zeros, torch::Tensor g_idx 89 | ) { 90 | const at::cuda::OptionalCUDAGuard device_guard(device_of(scales)); 91 | vecquant4recons_v2_cuda(mat, res, scales, zeros, g_idx); 92 | } 93 | 94 | void vecquant2recons_v2_cuda( 95 | torch::Tensor mat, torch::Tensor res, 96 | torch::Tensor scales, torch::Tensor zeros, 97 | torch::Tensor g_idx 98 | ); 99 | 100 | void vecquant2recons_v2( 101 | torch::Tensor mat, torch::Tensor res, torch::Tensor scales, torch::Tensor zeros, torch::Tensor g_idx 102 | ) { 103 | const at::cuda::OptionalCUDAGuard device_guard(device_of(scales)); 104 | vecquant2recons_v2_cuda(mat, res, scales, zeros, g_idx); 105 | } 106 | 107 | void vecquant4matmul_v1_faster_cuda( 108 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 109 | torch::Tensor scales, torch::Tensor zeros 110 | ); 111 | 112 | void vecquant4matmul_v1_faster( 113 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 114 | torch::Tensor scales, torch::Tensor zeros 115 | ) { 116 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 117 | vecquant4matmul_v1_faster_cuda(vec, mat, mul, scales, zeros); 118 | } 119 | 120 | 121 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 122 | m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA)"); 123 | m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA)"); 124 | m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA)"); 125 | m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA)"); 126 | 127 | // Reconstruction Kernel 128 | m.def("vecquant4recons_v1", &vecquant4recons_v1, "Vector 4-bit Quantized Matrix Reconstruction (CUDA)"); 129 | m.def("vecquant4recons_v2", &vecquant4recons_v2, "Vector 4-bit Quantized Matrix Reconstruction (CUDA) with group-size support"); 130 | m.def("vecquant2recons_v2", &vecquant2recons_v2, "Vector 2-bit Quantized Matrix Reconstruction (CUDA) with group-size support"); 131 | } -------------------------------------------------------------------------------- /llmtune/executor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | 5 | from llmtune.config import DEV 6 | from llmtune.utils import to_half_precision 7 | 8 | def load_llm(model_name_or_path): 9 | from llmtune.llms.autollm import AutoLLMForCausalLM 10 | llm = AutoLLMForCausalLM.from_pretrained(model_name_or_path) 11 | return llm 12 | 13 | def load_tokenizer(model_name_or_path, llm_config=None): 14 | from llmtune.llms.autollm import get_default_tokenizer 15 | if llm_config is not None: 16 | model_type = llm_config.model_type 17 | else: 18 | model_type = None 19 | return get_default_tokenizer(model_name_or_path, model_type) 20 | 21 | def load_adapter(llm, adapter_path=None, lora_config=None): 22 | from llmtune.engine.lora.peft import quant_peft 23 | if adapter_path is None and lora_config is not None: 24 | model = quant_peft.get_peft_model(llm, lora_config) 25 | elif adapter_path is not None and lora_config is None: 26 | model = quant_peft.PeftModel.from_pretrained( 27 | llm, adapter_path, 28 | device_map='auto', 29 | torch_dtype=torch.float32 30 | ) 31 | print(adapter_path, 'loaded') 32 | else: 33 | 
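A minimal sketch of how the loader helpers in `executor.py` compose; the model directory and adapter path are placeholders:

```python
# Paths are hypothetical; load_llm / load_tokenizer / load_adapter are the helpers defined above.
from llmtune.executor import load_llm, load_tokenizer, load_adapter

llm = load_llm('./llama-7b-quantized')                   # quantized base model (e.g. produced by quantize.py)
tokenizer = load_tokenizer('huggyllama/llama-7b')        # default tokenizer for the base checkpoint
model = load_adapter(llm, adapter_path='./llama-7b-quantized-lora')   # attach a saved LoRA adapter
```

Note that the `else` branch of `load_adapter` that follows appears to construct the `ValueError` without raising it, so callers presumably need to pass exactly one of `adapter_path` or `lora_config`.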
ValueError('Need to specify adapter_path or lora_config') 34 | return model 35 | 36 | def generate( 37 | llm, tokenizer, prompt, min_length, max_length, temperature, top_k, top_p 38 | ): 39 | llm.to(DEV) 40 | llm = to_half_precision(llm) 41 | input_ids = tokenizer.encode(prompt, return_tensors="pt").to(DEV) 42 | 43 | with torch.no_grad(): 44 | generated_ids = llm.generate( 45 | inputs=input_ids, 46 | do_sample=True, 47 | min_length=min_length, 48 | max_length=max_length, 49 | top_p=top_p, 50 | top_k=top_k, 51 | temperature=temperature, 52 | ) 53 | return tokenizer.decode([el.item() for el in generated_ids[0]]) 54 | 55 | def finetune(llm, tokenizer, tune_config): 56 | import transformers 57 | from llmtune.data import load_finetuning_data 58 | from llmtune.engine.lora.peft import quant_peft 59 | transformers.logging.set_verbosity_info() 60 | tokenizer.pad_token_id = 0 61 | 62 | lora_config = quant_peft.LoraConfig( 63 | r=tune_config.lora_r, 64 | lora_alpha=tune_config.lora_alpha, 65 | target_modules=["q_proj", "v_proj"], 66 | lora_dropout=tune_config.lora_dropout, 67 | bias="none", 68 | task_type="CAUSAL_LM", 69 | ) 70 | model = load_adapter(llm, lora_config=lora_config) 71 | model.print_trainable_parameters() 72 | 73 | data = load_finetuning_data(tune_config, tokenizer) 74 | 75 | training_arguments = transformers.TrainingArguments( 76 | per_device_train_batch_size=tune_config.mbatch_size, 77 | gradient_accumulation_steps=tune_config.gradient_accumulation_steps, 78 | warmup_steps=tune_config.warmup_steps, 79 | num_train_epochs=tune_config.epochs, 80 | learning_rate=tune_config.lr, 81 | fp16=True, 82 | logging_steps=tune_config.logging_steps, 83 | evaluation_strategy="no", 84 | save_strategy="steps", 85 | eval_steps=None, 86 | save_steps=tune_config.save_steps, 87 | output_dir=tune_config.lora_out_dir, 88 | save_total_limit=tune_config.save_total_limit, 89 | load_best_model_at_end=False, 90 | ddp_find_unused_parameters=False if tune_config.ddp else None, 91 | ) 92 | 93 | trainer = transformers.Trainer( 94 | model=model, 95 | train_dataset=data.train_data, 96 | eval_dataset=data.val_data, 97 | args=training_arguments, 98 | data_collator=transformers.DataCollatorForLanguageModeling( 99 | tokenizer, mlm=False 100 | ), 101 | ) 102 | print(training_arguments.parallel_mode) 103 | model.config.use_cache = False 104 | 105 | # use half precision 106 | model = to_half_precision(model) 107 | 108 | # start training 109 | checkpoint_dir = tune_config.lora_out_dir 110 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir): 111 | trainer.train(resume_from_checkpoint=True) 112 | else: 113 | trainer.train() 114 | 115 | # Save Model 116 | model.save_pretrained(tune_config.lora_out_dir) 117 | 118 | def quantize(llm, config): 119 | from llmtune.data.calibration import get_calibration_loaders 120 | from llmtune.engine.quant.gptq.executor import GPTQAlgorithm 121 | 122 | llm.eval() 123 | dataloader, _ = get_calibration_loaders( 124 | config.dataset, 125 | nsamples=config.nsamples, 126 | seed=config.seed, 127 | model=llm.base_model.name_or_path, 128 | seqlen=llm.base_model.seqlen 129 | ) 130 | 131 | gptq = GPTQAlgorithm(config) 132 | llm = gptq.quantize(llm, dataloader) 133 | 134 | llm.save_pretrained(config.save) 135 | print(f'Model weights saved to: {config.save}') 136 | 137 | -------------------------------------------------------------------------------- /llmtune/engine/quant/gptq/extras.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import 
torch.nn as nn 3 | from llmtune.engine.quant.gptq.algorithm import GPTQ 4 | from llmtune.engine.quant.gptq.quantizer import Quantizer 5 | from llmtune.engine.quant.converter import make_quant 6 | from llmtune.engine.inference.modules import QuantLinear 7 | from llmtune.utils import find_layers 8 | 9 | @torch.no_grad() 10 | def quantize_opt( 11 | model, dataloader, bits, groupsize, act_order, nsamples, percdamp, 12 | sym=False, true_sequential=False, nearest=False, trits=False, dev='cuda' 13 | ): 14 | print('Starting ...') 15 | if nearest is True or true_sequential is True: 16 | raise NotImplementedError() 17 | 18 | use_cache = model.config.use_cache 19 | model.config.use_cache = False 20 | layers = model.model.decoder.layers 21 | 22 | model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) 23 | model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) 24 | if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: 25 | model.model.decoder.project_out = model.model.decoder.project_out.to(dev) 26 | if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: 27 | model.model.decoder.project_in = model.model.decoder.project_in.to(dev) 28 | layers[0] = layers[0].to(dev) 29 | 30 | dtype = next(iter(model.parameters())).dtype 31 | inps = torch.zeros( 32 | (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev 33 | ) 34 | cache = {'i': 0, 'attention_mask': None} 35 | 36 | class Catcher(nn.Module): 37 | def __init__(self, module): 38 | super().__init__() 39 | self.module = module 40 | def forward(self, inp, **kwargs): 41 | inps[cache['i']] = inp 42 | cache['i'] += 1 43 | cache['attention_mask'] = kwargs['attention_mask'] 44 | raise ValueError 45 | layers[0] = Catcher(layers[0]) 46 | for batch in dataloader: 47 | try: 48 | model(batch[0].to(dev)) 49 | except ValueError: 50 | pass 51 | layers[0] = layers[0].module 52 | 53 | layers[0] = layers[0].cpu() 54 | model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() 55 | model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() 56 | if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: 57 | model.model.decoder.project_out = model.model.decoder.project_out.cpu() 58 | if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: 59 | model.model.decoder.project_in = model.model.decoder.project_in.cpu() 60 | torch.cuda.empty_cache() 61 | 62 | outs = torch.zeros_like(inps) 63 | attention_mask = cache['attention_mask'] 64 | 65 | print('Ready.') 66 | 67 | quantizers = {} 68 | for i in range(len(layers)): 69 | layer = layers[i].to(dev) 70 | 71 | subset = find_layers(layer) 72 | gptq = {} 73 | for name in subset: 74 | gptq[name] = GPTQ(subset[name]) 75 | gptq[name].quantizer = Quantizer() 76 | gptq[name].quantizer.configure( bits, perchannel=True, sym=sym, mse=False, trits=trits ) 77 | 78 | def add_batch(name): 79 | def tmp(_, inp, out): 80 | gptq[name].add_batch(inp[0].data, out.data) 81 | return tmp 82 | 83 | handles = [] 84 | for name in subset: 85 | handles.append(subset[name].register_forward_hook(add_batch(name))) 86 | 87 | for j in range(nsamples): 88 | outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] 89 | 90 | for h in handles: 91 | h.remove() 92 | 93 | for name in subset: 94 | print(f'Quantizing {name} in layer {i+1}/{len(layers)}...') 95 | scale,zero,g_idx = gptq[name].fasterquant(percdamp=percdamp, groupsize=groupsize, 
actorder=act_order) 96 | quantizers['model.decoder.layers.%d.%s' % (i, name)] = (gptq[name].quantizer.cpu(),scale.cpu(),zero.cpu(),g_idx.cpu()) 97 | gptq[name].free() 98 | 99 | for j in range(nsamples): 100 | outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] 101 | 102 | layers[i] = layer.cpu() 103 | del layer 104 | del gptq 105 | torch.cuda.empty_cache() 106 | 107 | inps, outs = outs, inps 108 | 109 | model.config.use_cache = use_cache 110 | 111 | return quantizers 112 | 113 | def pack_opt(model, quantizers, wbits, groupsize): 114 | layers = find_layers(model) 115 | layers = {n: layers[n] for n in quantizers} 116 | make_quant(model, quantizers, wbits, groupsize) 117 | qlayers = find_layers(model, [QuantLinear]) 118 | print('Packing ...') 119 | for name in qlayers: 120 | print(name) 121 | quantizers[name],scale,zero,g_idx = quantizers[name] 122 | qlayers[name].pack(layers[name], scale, zero, g_idx) 123 | print('Done.') 124 | return model -------------------------------------------------------------------------------- /finetune/samsum-llama/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import torch 5 | import numpy as np 6 | 7 | 8 | def set_random_seed(seed): 9 | random.seed(seed) 10 | np.random.seed(seed) 11 | os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2" 12 | os.environ["PL_GLOBAL_SEED"] = str(seed) 13 | os.environ["PYTHONHASHSEED"] = str(seed) 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed_all(seed) 16 | torch.backends.cudnn.benchmark = False 17 | torch.backends.cudnn.deterministic = True 18 | 19 | 20 | def fix_tokenizer(tokenizer): 21 | # Fixing broken tokenizers 22 | special_tokens = dict() 23 | for token_id in range(1000): 24 | token = tokenizer.convert_ids_to_tokens(token_id) 25 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad" in token: 26 | special_tokens["pad_token"] = token 27 | if tokenizer.bos_token_id in (None, tokenizer.vocab_size) and "" in token: 28 | special_tokens["bos_token"] = token 29 | if tokenizer.eos_token_id in (None, tokenizer.vocab_size) and "" in token: 30 | special_tokens["eos_token"] = token 31 | if tokenizer.unk_token_id in (None, tokenizer.vocab_size) and "unk" in token: 32 | special_tokens["unk_token"] = token 33 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep" in token: 34 | special_tokens["sep_token"] = token 35 | 36 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "bos_token" in special_tokens: 37 | special_tokens["sep_token"] = special_tokens["bos_token"] 38 | 39 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad_token" not in special_tokens: 40 | if tokenizer.unk_token_id is not None: 41 | special_tokens["pad_token"] = tokenizer.unk_token 42 | else: 43 | special_tokens["pad_token"] = "<|pad|>" 44 | 45 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep_token" not in special_tokens: 46 | if tokenizer.bos_token_id is not None: 47 | special_tokens["sep_token"] = tokenizer.bos_token 48 | else: 49 | special_tokens["sep_token"] = "<|sep|>" 50 | print(special_tokens) 51 | tokenizer.add_special_tokens(special_tokens) 52 | 53 | print("Vocab size: ", tokenizer.vocab_size) 54 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token) 55 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token) 56 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token) 57 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token) 58 | print("SEP: ", 
tokenizer.sep_token_id, tokenizer.sep_token) 59 | return tokenizer 60 | 61 | 62 | def fix_model(model, tokenizer, use_resize=True): 63 | model.config.pad_token_id = tokenizer.pad_token_id 64 | assert model.config.pad_token_id is not None 65 | 66 | bos_candidates = ( 67 | tokenizer.bos_token_id, 68 | tokenizer.cls_token_id, 69 | tokenizer.sep_token_id, 70 | tokenizer.unk_token_id 71 | ) 72 | for bos_candidate in bos_candidates: 73 | model.config.bos_token_id = bos_candidate 74 | if bos_candidate is not None: 75 | break 76 | assert model.config.bos_token_id is not None 77 | model.config.decoder_start_token_id = model.config.bos_token_id 78 | 79 | eos_candidates = (tokenizer.eos_token_id, tokenizer.sep_token_id) 80 | for eos_candidate in eos_candidates: 81 | model.config.eos_token_id = eos_candidate 82 | if eos_candidate is not None: 83 | break 84 | assert model.config.eos_token_id is not None 85 | 86 | if use_resize: 87 | model.resize_token_embeddings(len(tokenizer)) 88 | 89 | return model 90 | 91 | 92 | def gen_batch(records, batch_size): 93 | batch_start = 0 94 | while batch_start < len(records): 95 | batch_end = batch_start + batch_size 96 | batch = records[batch_start: batch_end] 97 | batch_start = batch_end 98 | yield batch 99 | 100 | 101 | def print_special_tokens(tokenizer): 102 | print("Vocab size: ", tokenizer.vocab_size) 103 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token) 104 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token) 105 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token) 106 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token) 107 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token) 108 | return tokenizer 109 | 110 | # PAD: 0 111 | # BOS: 1 112 | # EOS: 2 113 | # UNK: 0 114 | # SEP: 1 115 | 116 | def fix_tokenizer_opt(tokenizer): 117 | # Fixing broken tokenizers 118 | special_tokens = { 119 | 'pad_token': '', 120 | 'bos_token': '', 121 | 'eos_token': '', 122 | 'unk_token': '', 123 | 'sep_token': '' 124 | 125 | } 126 | 127 | tokenizer.add_special_tokens(special_tokens) 128 | 129 | print("Vocab size: ", tokenizer.vocab_size) 130 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token) 131 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token) 132 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token) 133 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token) 134 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token) 135 | return tokenizer -------------------------------------------------------------------------------- /finetune/samsum-opt/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import torch 5 | import numpy as np 6 | 7 | 8 | def set_random_seed(seed): 9 | random.seed(seed) 10 | np.random.seed(seed) 11 | os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2" 12 | os.environ["PL_GLOBAL_SEED"] = str(seed) 13 | os.environ["PYTHONHASHSEED"] = str(seed) 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed_all(seed) 16 | torch.backends.cudnn.benchmark = False 17 | torch.backends.cudnn.deterministic = True 18 | 19 | 20 | def fix_tokenizer(tokenizer): 21 | # Fixing broken tokenizers 22 | special_tokens = dict() 23 | for token_id in range(1000): 24 | token = tokenizer.convert_ids_to_tokens(token_id) 25 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad" in token: 26 | special_tokens["pad_token"] = token 27 | if tokenizer.bos_token_id in (None, tokenizer.vocab_size) and "" in token: 28 | 
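The tokenizer and model fix-ups above are typically applied back to back before training or evaluation, as the eval scripts later in this directory do; a short sketch with a placeholder checkpoint name:

```python
# Placeholder checkpoint; fix_tokenizer / fix_model are the helpers defined in utils.py above.
from transformers import AutoTokenizer, AutoModelForCausalLM
from utils import fix_tokenizer, fix_model

tokenizer = fix_tokenizer(AutoTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False))
model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b")
model = fix_model(model, tokenizer, use_resize=False)   # sync pad/bos/eos ids onto the model config
```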
special_tokens["bos_token"] = token 29 | if tokenizer.eos_token_id in (None, tokenizer.vocab_size) and "" in token: 30 | special_tokens["eos_token"] = token 31 | if tokenizer.unk_token_id in (None, tokenizer.vocab_size) and "unk" in token: 32 | special_tokens["unk_token"] = token 33 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep" in token: 34 | special_tokens["sep_token"] = token 35 | 36 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "bos_token" in special_tokens: 37 | special_tokens["sep_token"] = special_tokens["bos_token"] 38 | 39 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad_token" not in special_tokens: 40 | if tokenizer.unk_token_id is not None: 41 | special_tokens["pad_token"] = tokenizer.unk_token 42 | else: 43 | special_tokens["pad_token"] = "<|pad|>" 44 | 45 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep_token" not in special_tokens: 46 | if tokenizer.bos_token_id is not None: 47 | special_tokens["sep_token"] = tokenizer.bos_token 48 | else: 49 | special_tokens["sep_token"] = "<|sep|>" 50 | print(special_tokens) 51 | tokenizer.add_special_tokens(special_tokens) 52 | 53 | print("Vocab size: ", tokenizer.vocab_size) 54 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token) 55 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token) 56 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token) 57 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token) 58 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token) 59 | return tokenizer 60 | 61 | 62 | def fix_model(model, tokenizer, use_resize=True): 63 | model.config.pad_token_id = tokenizer.pad_token_id 64 | assert model.config.pad_token_id is not None 65 | 66 | bos_candidates = ( 67 | tokenizer.bos_token_id, 68 | tokenizer.cls_token_id, 69 | tokenizer.sep_token_id, 70 | tokenizer.unk_token_id 71 | ) 72 | for bos_candidate in bos_candidates: 73 | model.config.bos_token_id = bos_candidate 74 | if bos_candidate is not None: 75 | break 76 | assert model.config.bos_token_id is not None 77 | model.config.decoder_start_token_id = model.config.bos_token_id 78 | 79 | eos_candidates = (tokenizer.eos_token_id, tokenizer.sep_token_id) 80 | for eos_candidate in eos_candidates: 81 | model.config.eos_token_id = eos_candidate 82 | if eos_candidate is not None: 83 | break 84 | assert model.config.eos_token_id is not None 85 | 86 | if use_resize: 87 | model.resize_token_embeddings(len(tokenizer)) 88 | 89 | return model 90 | 91 | 92 | def gen_batch(records, batch_size): 93 | batch_start = 0 94 | while batch_start < len(records): 95 | batch_end = batch_start + batch_size 96 | batch = records[batch_start: batch_end] 97 | batch_start = batch_end 98 | yield batch 99 | 100 | 101 | def print_special_tokens(tokenizer): 102 | print("Vocab size: ", tokenizer.vocab_size) 103 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token) 104 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token) 105 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token) 106 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token) 107 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token) 108 | return tokenizer 109 | 110 | # PAD: 0 111 | # BOS: 1 112 | # EOS: 2 113 | # UNK: 0 114 | # SEP: 1 115 | 116 | def fix_tokenizer_opt(tokenizer): 117 | # Fixing broken tokenizers 118 | special_tokens = { 119 | 'pad_token': '', 120 | 'bos_token': '', 121 | 'eos_token': '', 122 | 'unk_token': '', 123 | 'sep_token': '' 124 | 125 | } 126 | 127 | 
tokenizer.add_special_tokens(special_tokens) 128 | 129 | print("Vocab size: ", tokenizer.vocab_size) 130 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token) 131 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token) 132 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token) 133 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token) 134 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token) 135 | return tokenizer -------------------------------------------------------------------------------- /finetune/samsum-llama/eval_samsum_4bit_bnb.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True) 6 | parser.add_argument('--adapter', type=str, help='adapter ID for huggingface', required=True) 7 | parser.add_argument('--file_name', type=str, help='backup file name', required=True) 8 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 9 | 10 | # Parse the arguments 11 | args = parser.parse_args() 12 | 13 | # Use the command line arguments in your script 14 | print('Model Name:', args.model_name) 15 | print('Adapter Name: ', args.adapter) 16 | print('Output file:', args.file_name) 17 | print('Seed: ', args.seed) 18 | 19 | import random 20 | import json 21 | import os 22 | 23 | # import wandb 24 | import torch 25 | import numpy as np 26 | # import bitsandbytes as bnb 27 | from tqdm import tqdm 28 | import transformers 29 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 30 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 31 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 32 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training, PeftModel 33 | from datasets import load_dataset 34 | 35 | from utils import * 36 | from data import * 37 | 38 | import evaluate 39 | import numpy as np 40 | from datasets import load_from_disk 41 | from tqdm import tqdm 42 | 43 | 44 | output_dir = args.adapter 45 | model_name = args.model_name 46 | seed = args.seed 47 | train_sample_rate = 1.0 48 | val_sample_rate = 1.0 49 | local_rank = 0 50 | 51 | set_random_seed(seed) 52 | logging.set_verbosity_info() 53 | 54 | # with open(config_file, "r") as r: 55 | # config = json.load(r) 56 | 57 | os.environ["WANDB_DISABLED"] = "true" 58 | 59 | device_map = "auto" 60 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 61 | ddp = world_size != 1 62 | 63 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 64 | tokenizer = fix_tokenizer(tokenizer) 65 | # tokenizer.save_pretrained(output_dir) 66 | 67 | dataset = load_dataset('samsum') 68 | val_records = dataset['test'] 69 | 70 | ## Config for llama 7-b 71 | model_type = "causal" 72 | templates_path = "llama_lora_samsum.json" 73 | only_target_loss = False 74 | mode = "instruct" 75 | 76 | model_types = { 77 | "causal": AutoModelForCausalLM, 78 | "seq2seq": AutoModelForSeq2SeqLM 79 | } 80 | load_in_8bit = False 81 | load_in_4bit = True 82 | if load_in_8bit: 83 | assert not load_in_4bit 84 | model = model_types[model_type].from_pretrained( 85 | model_name, 86 | load_in_8bit=True, 87 | device_map=device_map 88 | ) 89 | elif load_in_4bit: 
90 | assert not load_in_8bit 91 | #use_bf16 = trainer_config.get("bf16", False) 92 | use_bf16 = True 93 | compute_dtype = torch.bfloat16 if use_bf16 else torch.float16 94 | model = model_types[model_type].from_pretrained( 95 | model_name, 96 | load_in_4bit=True, 97 | device_map=device_map, 98 | quantization_config=BitsAndBytesConfig( 99 | load_in_4bit=True, 100 | bnb_4bit_compute_dtype=compute_dtype, 101 | bnb_4bit_use_double_quant=True, 102 | ), 103 | torch_dtype=torch.bfloat16 if use_bf16 else torch.float32 104 | ) 105 | else: 106 | model = model_types[model_type].from_pretrained(model_name) 107 | 108 | # Default model generation params 109 | model = fix_model(model, tokenizer, use_resize=False) 110 | model.config.num_beams = 5 111 | 112 | 113 | peft_model_id = args.adapter 114 | model = PeftModel.from_pretrained(model, peft_model_id) 115 | 116 | # Metric 117 | metric = evaluate.load("rouge") 118 | 119 | def evaluate_peft_model(sample,max_target_length=45): 120 | # Load dataset from the hub and get a sample 121 | sample_word = f"### Summarize this: {sample}\n ### Output: " 122 | with torch.inference_mode(), torch.autocast("cuda"): 123 | input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda() 124 | outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens = 45) 125 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"") 126 | print(f"Output:\n{output}") 127 | # Some simple post-processing 128 | return output 129 | 130 | # run predictions 131 | # this can take ~45 minutes 132 | predictions = [] 133 | for sample in tqdm(dataset['test']['dialogue']): 134 | p = evaluate_peft_model(sample) 135 | predictions.append(p) 136 | 137 | # compute metric 138 | rogue = metric.compute(predictions=predictions, references=dataset['test']['summary'], use_stemmer=True) 139 | 140 | # print results 141 | print(f'Seed: {seed}') 142 | print(f"Rogue1: {rogue['rouge1']* 100:2f}%") 143 | print(f"rouge2: {rogue['rouge2']* 100:2f}%") 144 | print(f"rougeL: {rogue['rougeL']* 100:2f}%") 145 | print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%") 146 | 147 | file_name = args.file_name 148 | with open(file_name, 'w') as f: 149 | for item in predictions: 150 | # write each item on a new line 151 | f.write("%s\n" % item) 152 | f.write(f'Seed: {seed}') 153 | f.write(f"Rogue1: {rogue['rouge1']* 100:2f}%") 154 | f.write(f"rouge2: {rogue['rouge2']* 100:2f}%") 155 | f.write(f"rougeL: {rogue['rougeL']* 100:2f}%") 156 | f.write(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%") -------------------------------------------------------------------------------- /finetune/samsum-llama/data.py: -------------------------------------------------------------------------------- 1 | import random 2 | import json 3 | from typing import Optional 4 | from dataclasses import dataclass 5 | from typing import List, Dict, Tuple, Any 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.utils.data import Dataset 11 | from transformers import AutoTokenizer, PreTrainedTokenizerBase 12 | from tqdm import tqdm 13 | 14 | 15 | class InstructDataset(Dataset): 16 | def __init__( 17 | self, 18 | original_records: List[Dict], 19 | tokenizer: AutoTokenizer, 20 | max_source_tokens_count: int, 21 | max_target_tokens_count: int, 22 | templates_path: str, 23 | sample_rate: float = 1.0, 24 | only_target_loss: bool = True, 25 | input_type: str = "causal", 26 | target_field: str = "human_reference", 27 | 
source_field: str = "input", 28 | use_padding: bool = False 29 | ): 30 | self.original_records = original_records 31 | self.sample_rate = sample_rate 32 | self.tokenizer = tokenizer 33 | self.max_source_tokens_count = max_source_tokens_count 34 | self.max_target_tokens_count = max_target_tokens_count 35 | self.only_target_loss = only_target_loss 36 | self.input_type = input_type 37 | self.target_field = target_field 38 | self.source_field = source_field 39 | self.use_padding = use_padding 40 | self.is_printed = False 41 | 42 | with open(templates_path) as r: 43 | self.templates = json.load(r) 44 | 45 | self.records = [] 46 | for record in tqdm(original_records): #original dataset 47 | if random.random() > self.sample_rate: 48 | continue 49 | tensors = self.convert_record(record) 50 | if tensors is None: 51 | continue 52 | self.records.append(tensors) 53 | 54 | def __len__(self): 55 | return len(self.records) 56 | 57 | def __getitem__(self, index): 58 | return self.records[index] 59 | 60 | def convert_record(self, record): 61 | instruction = record["dialogue"] 62 | #inp = record[self.source_field] #basically no use 63 | out = record[self.target_field] 64 | # if inp.strip() != "" and False: 65 | # templates = self.templates["prompts_input"] 66 | # prompt_template = random.choice(templates) 67 | # source = prompt_template.format(instruction=instruction.strip(), inp=inp.strip()) 68 | # else: 69 | templates = self.templates["prompts_no_input"] ## This is what we want 70 | prompt_template = random.choice(templates) 71 | source = prompt_template.format(instruction=instruction.strip()) ## put the prompt inside 72 | target = out.strip() 73 | if not self.is_printed: 74 | print("Source and target examples") 75 | print(source) 76 | print(target) 77 | self.is_printed = True 78 | if self.input_type == "causal": 79 | return self.convert_causal(source, target) 80 | elif self.input_type == "seq2seq": 81 | return self.convert_seq2seq(source, target) 82 | else: 83 | assert False 84 | 85 | def convert_causal(self, source, target=None): 86 | source_tokens = self.tokenizer( 87 | source, 88 | add_special_tokens=False, 89 | max_length=self.max_source_tokens_count, 90 | padding=False, 91 | truncation=True 92 | )["input_ids"] 93 | ## added the box_token id 94 | if self.tokenizer.bos_token_id: 95 | source_tokens.insert(0, self.tokenizer.bos_token_id) ## box_token_id 96 | input_ids = source_tokens[:] 97 | actual_length = len(input_ids) 98 | max_length = self.max_source_tokens_count + self.max_target_tokens_count + 2 99 | if target is not None: 100 | target_tokens = self.tokenizer( 101 | target, 102 | add_special_tokens=False, 103 | max_length=self.max_target_tokens_count, 104 | padding=False, 105 | truncation=True 106 | )["input_ids"] 107 | input_ids += target_tokens + [self.tokenizer.eos_token_id] ## eos_token_id 108 | actual_length = len(input_ids) 109 | if self.use_padding: 110 | padding = [self.tokenizer.pad_token_id for i in range(len(input_ids), max_length)] 111 | input_ids.extend(padding) 112 | 113 | input_ids = torch.LongTensor(input_ids) 114 | labels = input_ids.clone() 115 | attention_mask = input_ids.new_ones(input_ids.size()) 116 | if self.use_padding: 117 | labels[actual_length:] = -100 118 | attention_mask[actual_length:] = 0 119 | if self.only_target_loss: 120 | labels[:len(source_tokens)] = -100 121 | assert input_ids.size(0) == labels.size(0) == attention_mask.size(0) <= max_length 122 | 123 | return { 124 | "input_ids": input_ids, 125 | "labels": labels, 126 | "attention_mask": attention_mask 127 | 
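To make the `convert_causal` bookkeeping above concrete, a sketch of the resulting tensors for a toy source/target pair; the token ids are invented and only illustrate the masking and padding scheme:

```python
# Invented token ids purely to illustrate the masking/padding bookkeeping above.
import torch

bos, eos, pad = 1, 2, 0
source_tokens = [bos, 101, 102, 103]            # prompt tokens with bos prepended
target_tokens = [201, 202]                      # reference summary tokens
input_ids = source_tokens + target_tokens + [eos]
actual_length = len(input_ids)                  # 7
max_length = 10
input_ids = input_ids + [pad] * (max_length - actual_length)    # use_padding=True branch

labels = list(input_ids)
labels[actual_length:] = [-100] * (max_length - actual_length)  # padded positions ignored by the loss
# with only_target_loss=True the prompt positions would be masked as well:
# labels[:len(source_tokens)] = [-100] * len(source_tokens)
attention_mask = [1] * actual_length + [0] * (max_length - actual_length)
print(torch.LongTensor(input_ids), torch.LongTensor(labels), torch.LongTensor(attention_mask))
```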
} 128 | 129 | def convert_seq2seq(self, source, target=None): 130 | inputs = self.tokenizer( 131 | source, 132 | add_special_tokens=True, 133 | max_length=self.max_source_tokens_count, 134 | padding=False, 135 | truncation=True, 136 | return_tensors="pt" 137 | ) 138 | inputs = {k: v.squeeze(0) for k, v in inputs.items()} 139 | if target is not None: 140 | outputs = self.tokenizer( 141 | target, 142 | add_special_tokens=True, 143 | max_length=self.max_target_tokens_count, 144 | padding=False, 145 | truncation=True, 146 | return_tensors="pt" 147 | ) 148 | labels = outputs["input_ids"].squeeze(0).tolist() 149 | if labels[-1] != self.tokenizer.eos_token_id: 150 | labels.append(self.tokenizer.eos_token_id) 151 | inputs["labels"] = torch.LongTensor(labels) 152 | return inputs 153 | -------------------------------------------------------------------------------- /finetune/samsum-opt/data.py: -------------------------------------------------------------------------------- 1 | import random 2 | import json 3 | from typing import Optional 4 | from dataclasses import dataclass 5 | from typing import List, Dict, Tuple, Any 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.utils.data import Dataset 11 | from transformers import AutoTokenizer, PreTrainedTokenizerBase 12 | from tqdm import tqdm 13 | 14 | 15 | class InstructDataset(Dataset): 16 | def __init__( 17 | self, 18 | original_records: List[Dict], 19 | tokenizer: AutoTokenizer, 20 | max_source_tokens_count: int, 21 | max_target_tokens_count: int, 22 | templates_path: str, 23 | sample_rate: float = 1.0, 24 | only_target_loss: bool = True, 25 | input_type: str = "causal", 26 | target_field: str = "human_reference", 27 | source_field: str = "input", 28 | use_padding: bool = False 29 | ): 30 | self.original_records = original_records 31 | self.sample_rate = sample_rate 32 | self.tokenizer = tokenizer 33 | self.max_source_tokens_count = max_source_tokens_count 34 | self.max_target_tokens_count = max_target_tokens_count 35 | self.only_target_loss = only_target_loss 36 | self.input_type = input_type 37 | self.target_field = target_field 38 | self.source_field = source_field 39 | self.use_padding = use_padding 40 | self.is_printed = False 41 | 42 | with open(templates_path) as r: 43 | self.templates = json.load(r) 44 | 45 | self.records = [] 46 | for record in tqdm(original_records): #original dataset 47 | if random.random() > self.sample_rate: 48 | continue 49 | tensors = self.convert_record(record) 50 | if tensors is None: 51 | continue 52 | self.records.append(tensors) 53 | 54 | def __len__(self): 55 | return len(self.records) 56 | 57 | def __getitem__(self, index): 58 | return self.records[index] 59 | 60 | def convert_record(self, record): 61 | instruction = record["dialogue"] 62 | #inp = record[self.source_field] #basically no use 63 | out = record[self.target_field] 64 | # if inp.strip() != "" and False: 65 | # templates = self.templates["prompts_input"] 66 | # prompt_template = random.choice(templates) 67 | # source = prompt_template.format(instruction=instruction.strip(), inp=inp.strip()) 68 | # else: 69 | templates = self.templates["prompts_no_input"] ## This is what we want 70 | prompt_template = random.choice(templates) 71 | source = prompt_template.format(instruction=instruction.strip()) ## put the prompt inside 72 | target = out.strip() 73 | if not self.is_printed: 74 | print("Source and target examples") 75 | print(source) 76 | print(target) 77 | self.is_printed = True 78 | if self.input_type == 
"causal": 79 | return self.convert_causal(source, target) 80 | elif self.input_type == "seq2seq": 81 | return self.convert_seq2seq(source, target) 82 | else: 83 | assert False 84 | 85 | def convert_causal(self, source, target=None): 86 | source_tokens = self.tokenizer( 87 | source, 88 | add_special_tokens=False, 89 | max_length=self.max_source_tokens_count, 90 | padding=False, 91 | truncation=True 92 | )["input_ids"] 93 | ## added the box_token id 94 | if self.tokenizer.bos_token_id: 95 | source_tokens.insert(0, self.tokenizer.bos_token_id) ## box_token_id 96 | input_ids = source_tokens[:] 97 | actual_length = len(input_ids) 98 | max_length = self.max_source_tokens_count + self.max_target_tokens_count + 2 99 | if target is not None: 100 | target_tokens = self.tokenizer( 101 | target, 102 | add_special_tokens=False, 103 | max_length=self.max_target_tokens_count, 104 | padding=False, 105 | truncation=True 106 | )["input_ids"] 107 | input_ids += target_tokens + [self.tokenizer.eos_token_id] ## eos_token_id 108 | actual_length = len(input_ids) 109 | if self.use_padding: 110 | padding = [self.tokenizer.pad_token_id for i in range(len(input_ids), max_length)] 111 | input_ids.extend(padding) 112 | 113 | input_ids = torch.LongTensor(input_ids) 114 | labels = input_ids.clone() 115 | attention_mask = input_ids.new_ones(input_ids.size()) 116 | if self.use_padding: 117 | labels[actual_length:] = -100 118 | attention_mask[actual_length:] = 0 119 | if self.only_target_loss: 120 | labels[:len(source_tokens)] = -100 121 | assert input_ids.size(0) == labels.size(0) == attention_mask.size(0) <= max_length 122 | 123 | return { 124 | "input_ids": input_ids, 125 | "labels": labels, 126 | "attention_mask": attention_mask 127 | } 128 | 129 | def convert_seq2seq(self, source, target=None): 130 | inputs = self.tokenizer( 131 | source, 132 | add_special_tokens=True, 133 | max_length=self.max_source_tokens_count, 134 | padding=False, 135 | truncation=True, 136 | return_tensors="pt" 137 | ) 138 | inputs = {k: v.squeeze(0) for k, v in inputs.items()} 139 | if target is not None: 140 | outputs = self.tokenizer( 141 | target, 142 | add_special_tokens=True, 143 | max_length=self.max_target_tokens_count, 144 | padding=False, 145 | truncation=True, 146 | return_tensors="pt" 147 | ) 148 | labels = outputs["input_ids"].squeeze(0).tolist() 149 | if labels[-1] != self.tokenizer.eos_token_id: 150 | labels.append(self.tokenizer.eos_token_id) 151 | inputs["labels"] = torch.LongTensor(labels) 152 | return inputs 153 | -------------------------------------------------------------------------------- /finetune/samsum-opt/eval_samsum_opt_4bit_llmtune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='HF model name with your user', required=True) 6 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True) 7 | parser.add_argument('--file_name', type=str, help='backup file name', required=True) 8 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 9 | 10 | # Parse the arguments 11 | args = parser.parse_args() 12 | 13 | # Use the command line arguments in your script 14 | print('Model Name:', args.model_name) 15 | print('Adapter Path: ', args.adapter) 16 | print('Seed: ', args.seed) 17 | 18 | import random 19 | import json 20 | import os 21 
| 22 | # import wandb 23 | import torch 24 | import numpy as np 25 | # import bitsandbytes as bnb 26 | from tqdm import tqdm 27 | import transformers 28 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 29 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 30 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 31 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 32 | from datasets import load_dataset 33 | 34 | from utils import * 35 | from data import * 36 | 37 | import evaluate 38 | import numpy as np 39 | from datasets import load_from_disk 40 | from tqdm import tqdm 41 | 42 | from llmtune.llms.autollm import AutoLLMForCausalLM 43 | from llmtune.engine.lora.config import FinetuneConfig 44 | from llmtune.engine.lora.peft import quant_peft 45 | from llmtune.utils import to_half_precision 46 | 47 | output_dir = args.adapter 48 | seed = args.seed 49 | train_sample_rate = 1.0 50 | val_sample_rate = 1.0 51 | local_rank = 0 52 | 53 | # model config 54 | model_name = args.model_name 55 | tokenizer_name = "facebook/opt-6.7b" 56 | DEV = 'cuda' 57 | 58 | set_random_seed(42) 59 | logging.set_verbosity_info() 60 | 61 | # with open(config_file, "r") as r: 62 | # config = json.load(r) 63 | 64 | device_map = "auto" 65 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 66 | ddp = world_size != 1 67 | 68 | transformers.logging.set_verbosity_info() 69 | 70 | # load tokenizer 71 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) 72 | tokenizer.pad_token_id = 0 73 | ## Fix Tokenizer 74 | tokenizer = fix_tokenizer_opt(tokenizer) 75 | 76 | # load model 77 | llm = AutoLLMForCausalLM.from_pretrained(model_name) 78 | ## Fix Model 79 | lllm = fix_model(llm, tokenizer, use_resize=False) 80 | llm.eval() 81 | llm = llm.to(DEV) 82 | llm = to_half_precision(llm) 83 | 84 | 85 | 86 | ## dataset 87 | dataset = load_dataset('samsum') 88 | train_records = dataset['train'] 89 | val_records = dataset['test'] 90 | #random.shuffle(train_records) 91 | print("train_record[0]: ",train_records[0]) 92 | 93 | ## Config for llama 7-b 94 | model_type = "causal" 95 | templates_path = "llama_lora_samsum.json" 96 | only_target_loss = False 97 | mode = "instruct" 98 | 99 | 100 | adapter_path = args.adapter 101 | model = quant_peft.PeftModel.from_pretrained( 102 | llm, adapter_path, 103 | device_map='auto' 104 | ) 105 | print(adapter_path, 'loaded') 106 | 107 | 108 | # Model configs 109 | model.config.num_beams = 5 110 | 111 | 112 | # Metric 113 | metric = evaluate.load("rouge") 114 | 115 | def evaluate_peft_model(sample,max_target_length=45): 116 | # Load dataset from the hub and get a sample 117 | sample_word = f"### Summarize this: {sample}\n ### Output: " 118 | input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda() 119 | # with torch.inference_mode(), torch.autocast("cuda"): 120 | print("input_ids: ",input_ids) 121 | outputs = model.generate(input_ids=input_ids, do_sample=True, max_new_tokens = 45) 122 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"") 123 | print(f"Output:\n{output}") 124 | # Some simple post-processing 125 | return output 126 | 127 | # run predictions 128 | # this can take ~45 minutes 129 | predictions = [] 130 | for sample in tqdm(dataset['test']['dialogue']): 131 | p = evaluate_peft_model(sample) 132 
| predictions.append(p) 133 | 134 | # compute metric 135 | 136 | 137 | file_name = args.file_name 138 | # with open(file_name, 'w') as f: 139 | # for item in predictions: 140 | # # write each item on a new line 141 | # f.write("%s\n" % item) 142 | # f.write(f'Seed: {seed}') 143 | 144 | 145 | # def process_file(filename): 146 | # output_list = [] 147 | # delete_lines = False 148 | # with open(filename, 'r') as file: 149 | # for line in file: 150 | # stripped_line = line.strip() 151 | # if stripped_line.startswith("### Summarize this:"): 152 | # delete_lines = True 153 | # continue 154 | # elif stripped_line.startswith("### Output: "): 155 | # output = stripped_line[len("### Output: "):] 156 | # output_list.append(output) 157 | # delete_lines = False 158 | # continue 159 | 160 | # if not delete_lines: 161 | # output_list.append(stripped_line) 162 | 163 | # return output_list 164 | 165 | # predictions = process_file(file_name) 166 | # predictions.pop() 167 | 168 | rouge = metric.compute(predictions=predictions, references=dataset['test']['summary'], use_stemmer=True) 169 | 170 | # print results 171 | print(f"Rouge1: {rouge['rouge1']* 100:.2f}%") 172 | print(f"rouge2: {rouge['rouge2']* 100:.2f}%") 173 | print(f"rougeL: {rouge['rougeL']* 100:.2f}%") 174 | print(f"rougeLsum: {rouge['rougeLsum']* 100:.2f}%") 175 | 176 | with open(file_name, 'w') as f: 177 | for item in predictions: 178 | # write each item on a new line 179 | f.write("%s\n" % item) 180 | f.write(f'Seed: {seed}\n') 181 | f.write(f"Rouge1: {rouge['rouge1']* 100:.2f}%\n") 182 | f.write(f"rouge2: {rouge['rouge2']* 100:.2f}%\n") 183 | f.write(f"rougeL: {rouge['rougeL']* 100:.2f}%\n") 184 | f.write(f"rougeLsum: {rouge['rougeLsum']* 100:.2f}%\n") 185 | 186 | -------------------------------------------------------------------------------- /finetune/mnli-llama/data_mnli_label.py: -------------------------------------------------------------------------------- 1 | import random 2 | import json 3 | from typing import Optional 4 | from dataclasses import dataclass 5 | from typing import List, Dict, Tuple, Any 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.utils.data import Dataset 11 | from transformers import AutoTokenizer, PreTrainedTokenizerBase 12 | from tqdm import tqdm 13 | 14 | 15 | class InstructDataset(Dataset): 16 | def __init__( 17 | self, 18 | original_records: List[Dict], 19 | tokenizer: AutoTokenizer, 20 | max_source_tokens_count: int, 21 | max_target_tokens_count: int, 22 | templates_path: str, 23 | sample_rate: float = 1.0, 24 | only_target_loss: bool = True, 25 | input_type: str = "causal", 26 | target_field: str = "human_reference", 27 | source_field: str = "input", 28 | use_padding: bool = False 29 | ): 30 | self.original_records = original_records 31 | self.sample_rate = sample_rate 32 | self.tokenizer = tokenizer 33 | self.max_source_tokens_count = max_source_tokens_count 34 | self.max_target_tokens_count = max_target_tokens_count 35 | self.only_target_loss = only_target_loss 36 | self.input_type = input_type 37 | self.target_field = target_field 38 | self.source_field = source_field 39 | self.use_padding = use_padding 40 | self.is_printed = False 41 | 42 | with open(templates_path) as r: 43 | self.templates = json.load(r) 44 | 45 | self.records = [] 46 | for record in tqdm(original_records): #original dataset 47 | if random.random() > self.sample_rate: 48 | continue 49 | tensors = self.convert_record(record) 50 | if tensors is None: 51 | continue 52 | self.records.append(tensors)
53 | 54 | def __len__(self): 55 | return len(self.records) 56 | 57 | def __getitem__(self, index): 58 | return self.records[index] 59 | 60 | def convert_record(self, record): 61 | instruction = record["premise"] 62 | hypothesis = record["hypothesis"] 63 | genre = record["genre"] 64 | #inp = record[self.source_field] #basically no use 65 | out = record["label"] 66 | # if inp.strip() != "" and False: 67 | # templates = self.templates["prompts_input"] 68 | # prompt_template = random.choice(templates) 69 | # source = prompt_template.format(instruction=instruction.strip(), inp=inp.strip()) 70 | # else: 71 | templates = self.templates["prompts_no_input"] ## This is what we want 72 | prompt_template = random.choice(templates) 73 | source = prompt_template.format(instruction=instruction.strip(), hypothesis=hypothesis.strip(), genre=genre.strip()) ## put the prompt inside 74 | target = str(out) 75 | if not self.is_printed: 76 | print("Source and target examples") 77 | print(source) 78 | print(target) 79 | self.is_printed = True 80 | if self.input_type == "causal": 81 | return self.convert_causal(source, target) 82 | elif self.input_type == "seq2seq": 83 | return self.convert_seq2seq(source, target) 84 | else: 85 | assert False 86 | 87 | def convert_causal(self, source, target=None): 88 | source_tokens = self.tokenizer( 89 | source, 90 | add_special_tokens=False, 91 | max_length=self.max_source_tokens_count, 92 | padding=False, 93 | truncation=True 94 | )["input_ids"] 95 | ## added the box_token id 96 | if self.tokenizer.bos_token_id: 97 | source_tokens.insert(0, self.tokenizer.bos_token_id) ## box_token_id 98 | input_ids = source_tokens[:] 99 | actual_length = len(input_ids) 100 | max_length = self.max_source_tokens_count + self.max_target_tokens_count + 2 101 | if target is not None: 102 | target_tokens = self.tokenizer( 103 | target, 104 | add_special_tokens=False, 105 | max_length=self.max_target_tokens_count, 106 | padding=False, 107 | truncation=True 108 | )["input_ids"] 109 | input_ids += target_tokens + [self.tokenizer.eos_token_id] ## eos_token_id 110 | actual_length = len(input_ids) 111 | if self.use_padding: 112 | padding = [self.tokenizer.pad_token_id for i in range(len(input_ids), max_length)] 113 | input_ids.extend(padding) 114 | 115 | input_ids = torch.LongTensor(input_ids) 116 | labels = input_ids.clone() 117 | attention_mask = input_ids.new_ones(input_ids.size()) 118 | if self.use_padding: 119 | labels[actual_length:] = -100 120 | attention_mask[actual_length:] = 0 121 | if self.only_target_loss: 122 | labels[:len(source_tokens)] = -100 123 | assert input_ids.size(0) == labels.size(0) == attention_mask.size(0) <= max_length 124 | 125 | return { 126 | "input_ids": input_ids, 127 | "labels": labels, 128 | "attention_mask": attention_mask 129 | } 130 | 131 | def convert_seq2seq(self, source, target=None): 132 | inputs = self.tokenizer( 133 | source, 134 | add_special_tokens=True, 135 | max_length=self.max_source_tokens_count, 136 | padding=False, 137 | truncation=True, 138 | return_tensors="pt" 139 | ) 140 | inputs = {k: v.squeeze(0) for k, v in inputs.items()} 141 | if target is not None: 142 | outputs = self.tokenizer( 143 | target, 144 | add_special_tokens=True, 145 | max_length=self.max_target_tokens_count, 146 | padding=False, 147 | truncation=True, 148 | return_tensors="pt" 149 | ) 150 | labels = outputs["input_ids"].squeeze(0).tolist() 151 | if labels[-1] != self.tokenizer.eos_token_id: 152 | labels.append(self.tokenizer.eos_token_id) 153 | inputs["labels"] = 
torch.LongTensor(labels) 154 | return inputs 155 | -------------------------------------------------------------------------------- /llmtune/engine/quant/gptq/algorithm.py: -------------------------------------------------------------------------------- 1 | import math 2 | import time 3 | 4 | import torch 5 | import torch.nn as nn 6 | import transformers 7 | 8 | DEBUG = False 9 | torch.backends.cuda.matmul.allow_tf32 = False 10 | torch.backends.cudnn.allow_tf32 = False 11 | 12 | def quantize(x, scale, zero, maxq): 13 | if maxq < 0: 14 | return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero 15 | q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) 16 | return scale * (q - zero) 17 | 18 | class GPTQ: 19 | def __init__(self, layer): 20 | self.layer = layer 21 | self.dev = self.layer.weight.device 22 | W = layer.weight.data.clone() 23 | if isinstance(self.layer, nn.Conv2d): 24 | W = W.flatten(1) 25 | if isinstance(self.layer, transformers.Conv1D): 26 | W = W.t() 27 | self.rows = W.shape[0] 28 | self.columns = W.shape[1] 29 | self.H = torch.zeros((self.columns, self.columns), device=self.dev) 30 | self.nsamples = 0 31 | 32 | def add_batch(self, inp, out): 33 | if DEBUG: 34 | self.inp1 = inp 35 | self.out1 = out 36 | if len(inp.shape) == 2: 37 | inp = inp.unsqueeze(0) 38 | tmp = inp.shape[0] 39 | if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D): 40 | if len(inp.shape) == 3: 41 | inp = inp.reshape((-1, inp.shape[-1])) 42 | inp = inp.t() 43 | if isinstance(self.layer, nn.Conv2d): 44 | unfold = nn.Unfold( 45 | self.layer.kernel_size, 46 | dilation=self.layer.dilation, 47 | padding=self.layer.padding, 48 | stride=self.layer.stride 49 | ) 50 | inp = unfold(inp) 51 | inp = inp.permute([1, 0, 2]) 52 | inp = inp.flatten(1) 53 | self.H *= self.nsamples / (self.nsamples + tmp) 54 | self.nsamples += tmp 55 | # inp = inp.float() 56 | inp = math.sqrt(2 / self.nsamples) * inp.float() 57 | # self.H += 2 / self.nsamples * inp.matmul(inp.t()) 58 | self.H += inp.matmul(inp.t()) 59 | 60 | def fasterquant( 61 | self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False 62 | ): 63 | W = self.layer.weight.data.clone() 64 | if isinstance(self.layer, nn.Conv2d): 65 | W = W.flatten(1) 66 | if isinstance(self.layer, transformers.Conv1D): 67 | W = W.t() 68 | W = W.float() 69 | 70 | tick = time.time() 71 | 72 | if not self.quantizer.ready(): 73 | self.quantizer.find_params(W, weight=True) 74 | 75 | H = self.H 76 | del self.H 77 | dead = torch.diag(H) == 0 78 | H[dead, dead] = 1 79 | W[:, dead] = 0 80 | 81 | if actorder: 82 | perm = torch.argsort(torch.diag(H), descending=True) 83 | W = W[:, perm] 84 | H = H[perm][:, perm] 85 | 86 | Losses = torch.zeros_like(W) 87 | Q = torch.zeros_like(W) 88 | 89 | damp = percdamp * torch.mean(torch.diag(H)) 90 | diag = torch.arange(self.columns, device=self.dev) 91 | H[diag, diag] += damp 92 | H = torch.linalg.cholesky(H) 93 | H = torch.cholesky_inverse(H) 94 | H = torch.linalg.cholesky(H, upper=True) 95 | Hinv = H 96 | 97 | g_idx = [] 98 | scale = [] 99 | zero = [] 100 | now_idx = 1 101 | 102 | for i1 in range(0, self.columns, blocksize): 103 | i2 = min(i1 + blocksize, self.columns) 104 | count = i2 - i1 105 | 106 | W1 = W[:, i1:i2].clone() 107 | Q1 = torch.zeros_like(W1) 108 | Err1 = torch.zeros_like(W1) 109 | Losses1 = torch.zeros_like(W1) 110 | Hinv1 = Hinv[i1:i2, i1:i2] 111 | 112 | for i in range(count): 113 | w = W1[:, i] 114 | d = Hinv1[i, i] 115 | 116 | if groupsize != -1: 117 | if (i1 + i) % groupsize == 0: 118 | 
self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True) 119 | 120 | if ((i1 + i) // groupsize) - now_idx == -1: 121 | scale.append(self.quantizer.scale) 122 | zero.append(self.quantizer.zero) 123 | now_idx += 1 124 | 125 | q = quantize( 126 | w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq 127 | ).flatten() 128 | Q1[:, i] = q 129 | Losses1[:, i] = (w - q) ** 2 / d ** 2 130 | 131 | err1 = (w - q) / d 132 | W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) 133 | Err1[:, i] = err1 134 | 135 | Q[:, i1:i2] = Q1 136 | Losses[:, i1:i2] = Losses1 / 2 137 | 138 | W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) 139 | 140 | if DEBUG: 141 | self.layer.weight.data[:, :i2] = Q[:, :i2] 142 | self.layer.weight.data[:, i2:] = W[:, i2:] 143 | print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) 144 | print(torch.sum(Losses)) 145 | 146 | torch.cuda.synchronize() 147 | print('time %.2f' % (time.time() - tick)) 148 | print('error', torch.sum(Losses).item()) 149 | 150 | groupsize = groupsize if groupsize != -1 else self.columns 151 | g_idx = [i // groupsize for i in range(self.columns)] 152 | g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device) 153 | if actorder: 154 | invperm = torch.argsort(perm) 155 | Q = Q[:, invperm] 156 | g_idx = g_idx[invperm] 157 | 158 | if isinstance(self.layer, transformers.Conv1D): 159 | Q = Q.t() 160 | self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) 161 | if DEBUG: 162 | print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) 163 | 164 | if scale == []: 165 | scale.append(self.quantizer.scale) 166 | zero.append(self.quantizer.zero) 167 | scale = torch.cat(scale,dim=1) 168 | zero = torch.cat(zero,dim=1) 169 | return scale,zero,g_idx 170 | 171 | def free(self): 172 | if DEBUG: 173 | self.inp1 = None 174 | self.out1 = None 175 | self.H = None 176 | self.Losses = None 177 | self.Trace = None 178 | torch.cuda.empty_cache() 179 | -------------------------------------------------------------------------------- /llmtune/llms/autollm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch import nn 4 | from typing import Dict, List, Optional, Union 5 | from transformers import AutoTokenizer 6 | from transformers.utils.hub import ( 7 | PushToHubMixin, cached_file, create_repo, 8 | create_commit, CommitOperationAdd 9 | ) 10 | from llmtune.llms.config import AutoLLMConfig, LLMType 11 | from llmtune.llms.llama.model import load_llama, load_llama_tokenizer 12 | from llmtune.llms.opt.model import load_opt, load_opt_tokenizer 13 | from llmtune.llms.bloom.model import load_bloom, load_bloom_tokenizer 14 | 15 | def get_default_tokenizer(name_or_path, model_type=None): 16 | if model_type is not None: 17 | if model_type == 'llama': 18 | return load_llama_tokenizer(name_or_path) 19 | elif model_type == 'opt': 20 | return load_opt_tokenizer(name_or_path) 21 | elif model_type == 'bloom': 22 | return load_bloom_tokenizer(name_or_path) 23 | else: 24 | raise ValueError() 25 | else: 26 | return AutoTokenizer.from_pretrained(name_or_path) 27 | 28 | class AutoLLMForCausalLM(nn.Module, PushToHubMixin): 29 | def __init__( 30 | self, 31 | base_model, 32 | llm_config 33 | ): 34 | super().__init__() 35 | self.base_model = base_model 36 | self.llm_config = llm_config 37 | 38 | @property 39 | def is_quantized(self): 40 | return self.llm_config.is_quantized 41 | 42 | def set_quant_config(self, quant_config): 43 | 
self.llm_config.set_quant_config(quant_config) 44 | 45 | @property 46 | def device(self): 47 | if not self.hf_device_map: 48 | return self.base_model.device 49 | else: 50 | device = [ 51 | d for d in self.hf_device_map.values() 52 | if d not in {'cpu', 'disk'} 53 | ][0] 54 | return torch.device(device) 55 | 56 | @property 57 | def hf_device_map(self): 58 | return getattr(self.base_model, "hf_device_map", None) 59 | 60 | @property 61 | def config(self): 62 | return self.base_model.config 63 | 64 | @property 65 | def _keys_to_ignore_on_save(self): 66 | return self.base_model._keys_to_ignore_on_save 67 | 68 | @property 69 | def _no_split_modules(self): 70 | return self.base_model._no_split_modules 71 | 72 | def to(self, device: Union[str, torch.device]): 73 | self.base_model = self.base_model.to(device) 74 | return self 75 | 76 | def forward(self, *args, **kwargs): 77 | return self.base_model(*args, **kwargs) 78 | 79 | def generate(self, **kwargs): 80 | with ( 81 | torch.inference_mode(), 82 | torch.amp.autocast(device_type=self.device.type) 83 | ): 84 | return self.base_model.generate(**kwargs) 85 | 86 | def prepare_inputs_for_generation(self, *args, **kwargs): 87 | return self.base_model.prepare_inputs_for_generation(*args, **kwargs) 88 | 89 | @classmethod 90 | def from_pretrained( 91 | cls, 92 | model_name_or_path: str, 93 | device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, 94 | device: Optional[Union[str, int]] = None, 95 | ): 96 | # load config 97 | llm_config = AutoLLMConfig.from_pretrained(model_name_or_path) 98 | load_quantized = llm_config.quant_config is not None 99 | 100 | # resolve path to checkpoint (could be None) 101 | checkpoint = None 102 | if load_quantized: 103 | if os.path.isdir(model_name_or_path): 104 | checkpoint = os.path.join( 105 | model_name_or_path, 'quantized_weights.pt' 106 | ) 107 | else: # remote 108 | checkpoint = cached_file( 109 | model_name_or_path, 'quantized_weights.pt' 110 | ) 111 | if checkpoint is None: 112 | raise FileNotFoundError( 113 | f"Couldn't find quantized weights in {model_name_or_path}" 114 | ) 115 | 116 | # load base model 117 | if llm_config.model_type == LLMType.LLAMA.value: 118 | model = load_llama(llm_config, checkpoint) 119 | elif llm_config.model_type == LLMType.OPT.value: 120 | model = load_opt(llm_config, checkpoint) 121 | elif llm_config.model_type == LLMType.BLOOM.value: 122 | model = load_bloom(llm_config, checkpoint) 123 | else: 124 | raise NotImplementedError( 125 | f'{llm_config.model_type} not supported' 126 | ) 127 | 128 | return cls(model, llm_config) 129 | 130 | def save_pretrained(self, save_dir: str): 131 | os.makedirs(save_dir, exist_ok=True) 132 | print('test') 133 | 134 | # save config 135 | self.llm_config.save_pretrained(save_dir) 136 | 137 | # save base model 138 | self.base_model.to('cpu') 139 | print(self.llm_config.quant_config) 140 | if not self.is_quantized: 141 | self.base_model.save_pretrained(save_dir) 142 | else: 143 | torch.save( 144 | self.base_model.state_dict(), 145 | os.path.join(save_dir, 'quantized_weights.pt') 146 | ) 147 | self.llm_config.base_config.model_name_or_path = save_dir 148 | 149 | def push_to_hub( 150 | self, 151 | repo_id: str, 152 | save_dir: str, 153 | commit_message: Optional[str] = "", 154 | use_auth_token: Optional[Union[bool, str]] = None, 155 | private: Optional[bool] = None, 156 | token: Optional[Union[bool, str]] = None, 157 | create_pr: Optional[bool] = False, 158 | ) -> str: 159 | 160 | if not os.path.exists(save_dir): 161 | print(f"Saving model to 
{save_dir}") 162 | self.save_pretrained(save_dir) 163 | 164 | repo_url = create_repo( 165 | repo_id=repo_id, token=token, private=private, 166 | exist_ok=True, repo_type="model" 167 | ) 168 | repo_id = repo_url.repo_id 169 | 170 | operations = [ 171 | CommitOperationAdd( 172 | path_or_fileobj=os.path.join(save_dir, f), 173 | path_in_repo=f 174 | ) 175 | for f in os.listdir(save_dir) 176 | ] 177 | print( 178 | f"Uploading the following files to {repo_id}: " 179 | f"{','.join(os.listdir(save_dir))}" 180 | ) 181 | return create_commit( 182 | repo_id=repo_id, 183 | operations=operations, 184 | commit_message=commit_message, 185 | token=use_auth_token, 186 | create_pr=create_pr, 187 | repo_type="model", 188 | ) 189 | 190 | -------------------------------------------------------------------------------- /llmtune/engine/inference/modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from llmtune.engine.inference.autograd import ( 6 | Autograd2bit, Autograd4bit, Autograd3bit 7 | ) 8 | 9 | try: 10 | import quant_cuda 11 | except: 12 | print('CUDA extension not installed. Inference will not work.') 13 | 14 | # Assumes layer is perfectly divisible into 256 * 256 blocks 15 | class QuantLinear(nn.Module): 16 | def __init__( 17 | self, bits, groupsize, in_features, out_features, bias, is_cuda=True 18 | ): 19 | super().__init__() 20 | if bits not in [2,3,4,8]: 21 | raise NotImplementedError("Only 2,3,4,8 bits are supported.") 22 | self.in_features = in_features 23 | self.out_features = out_features 24 | self.bits = bits 25 | self.groupsize = groupsize if groupsize != -1 else in_features 26 | self.maxq = 2 ** self.bits - 1 27 | 28 | self.register_buffer('qweight', torch.zeros((in_features // 32 * self.bits, out_features), dtype=torch.int32)) 29 | self.register_buffer('qzeros', torch.zeros((math.ceil(in_features / self.groupsize), out_features // 32 * self.bits), dtype=torch.int32)) 30 | self.register_buffer('scales', torch.zeros((math.ceil(in_features / self.groupsize), out_features), dtype=torch.float16)) 31 | self.register_buffer('g_idx', torch.tensor([i // self.groupsize for i in range(in_features)], dtype = torch.int32)) 32 | if bias is not None: 33 | self.register_buffer('bias', torch.zeros((out_features),dtype=torch.float16)) 34 | else: 35 | self.bias = None 36 | 37 | # is performed by unpacking the weights and using torch.matmul 38 | if self.bits in [2,4,8]: 39 | self.register_buffer('wf',torch.tensor(list(range(0,32,self.bits)), dtype=torch.int32).unsqueeze(0),persistent=False) 40 | elif self.bits == 3: 41 | self.register_buffer('wf', torch.tensor([[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0], 42 | [0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31], 43 | [0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],], dtype=torch.int32).reshape(1,3,12), persistent=False) 44 | 45 | self.is_cuda = is_cuda 46 | 47 | def pack(self, linear, scales, zeros, g_idx = None): 48 | self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx 49 | 50 | scales = scales.t().contiguous() 51 | zeros = zeros.t().contiguous() 52 | scale_zeros = zeros * scales 53 | self.scales = scales.clone().half() 54 | if linear.bias is not None: 55 | self.bias = linear.bias.clone().half() 56 | 57 | intweight = [] 58 | for idx in range(self.in_features): 59 | intweight.append( 60 | torch.round( 61 | (linear.weight.data[:,idx] + scale_zeros[self.g_idx[idx]]) 62 | / self.scales[self.g_idx[idx]]).to(torch.int)[:,None] 63 | ) 64 | 
intweight = torch.cat(intweight,dim=1) 65 | intweight = intweight.t().contiguous() 66 | intweight = intweight.numpy().astype(np.uint32) 67 | qweight = np.zeros( 68 | (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32 69 | ) 70 | i = 0 71 | row = 0 72 | while row < qweight.shape[0]: 73 | if self.bits in [2,4,8]: 74 | for j in range(i, i + (32//self.bits)): 75 | qweight[row] |= intweight[j] << (self.bits * (j - i)) 76 | i += 32//self.bits 77 | row += 1 78 | elif self.bits == 3: 79 | for j in range(i, i + 10): 80 | qweight[row] |= intweight[j] << (3 * (j - i)) 81 | i += 10 82 | qweight[row] |= intweight[i] << 30 83 | row += 1 84 | qweight[row] |= (intweight[i] >> 2) & 1 85 | i += 1 86 | for j in range(i, i + 10): 87 | qweight[row] |= intweight[j] << (3 * (j - i) + 1) 88 | i += 10 89 | qweight[row] |= intweight[i] << 31 90 | row += 1 91 | qweight[row] |= (intweight[i] >> 1) & 0x3 92 | i += 1 93 | for j in range(i, i + 10): 94 | qweight[row] |= intweight[j] << (3 * (j - i) + 2) 95 | i += 10 96 | row += 1 97 | else: 98 | raise NotImplementedError("Only 2,3,4,8 bits are supported.") 99 | 100 | qweight = qweight.astype(np.int32) 101 | self.qweight = torch.from_numpy(qweight) 102 | 103 | zeros -= 1; 104 | zeros = zeros.numpy().astype(np.uint32) 105 | qzeros = np.zeros( 106 | (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32 107 | ) 108 | i = 0 109 | col = 0 110 | while col < qzeros.shape[1]: 111 | if self.bits in [2,4,8]: 112 | for j in range(i, i + (32//self.bits)): 113 | qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) 114 | i += 32//self.bits 115 | col += 1 116 | elif self.bits == 3: 117 | for j in range(i, i + 10): 118 | qzeros[:, col] |= zeros[:, j] << (3 * (j - i)) 119 | i += 10 120 | qzeros[:, col] |= zeros[:, i] << 30 121 | col += 1 122 | qzeros[:, col] |= (zeros[:, i] >> 2) & 1 123 | i += 1 124 | for j in range(i, i + 10): 125 | qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1) 126 | i += 10 127 | qzeros[:, col] |= zeros[:, i] << 31 128 | col += 1 129 | qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3 130 | i += 1 131 | for j in range(i, i + 10): 132 | qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2) 133 | i += 10 134 | col += 1 135 | else: 136 | raise NotImplementedError("Only 2,3,4,8 bits are supported.") 137 | 138 | qzeros = qzeros.astype(np.int32) 139 | self.qzeros = torch.from_numpy(qzeros) 140 | 141 | def forward(self, x): 142 | if self.bits == 4: 143 | out = Autograd4bit.apply( 144 | x, 145 | self.qweight, 146 | self.scales, 147 | self.qzeros, 148 | self.g_idx, 149 | ) 150 | if self.bias is not None: 151 | out += self.bias 152 | elif self.bits == 2: 153 | out = Autograd2bit.apply( 154 | x, 155 | self.qweight, 156 | self.scales, 157 | self.qzeros, 158 | self.g_idx, 159 | ) 160 | if self.bias is not None: 161 | out += self.bias 162 | elif self.bits == 3: 163 | out = Autograd3bit.apply( 164 | x, 165 | self.qweight, 166 | self.scales, 167 | self.qzeros, 168 | self.g_idx, 169 | self.wf, 170 | self.out_features, 171 | ) 172 | if self.bias is not None: 173 | out += self.bias 174 | else: 175 | raise NotImplementedError() 176 | return out 177 | -------------------------------------------------------------------------------- /llmtune/engine/inference/autograd.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from . 
import matmult as mm 6 | from torch.cuda.amp import custom_bwd, custom_fwd 7 | 8 | class Autograd4bit(torch.autograd.Function): 9 | @staticmethod 10 | @custom_fwd(cast_inputs=torch.float16) 11 | def forward(ctx, x, qweight, scales, zeros, g_idx): 12 | ctx.save_for_backward(qweight, scales, zeros, g_idx) 13 | if g_idx is None: 14 | output = mm._matmul4bit_v1_recons( 15 | x, qweight, scales, zeros 16 | ) 17 | else: 18 | output = mm._matmul4bit_v2_recons( 19 | x, qweight, scales, zeros, g_idx 20 | ) 21 | output = output.clone() 22 | return output 23 | 24 | @staticmethod 25 | @custom_bwd 26 | def backward(ctx, grad_output): 27 | qweight, scales, zeros, g_idx = ctx.saved_tensors 28 | if ctx.needs_input_grad[0]: 29 | if g_idx is None: 30 | grad = mm._matmul4bit_v1_recons( 31 | grad_output, qweight, scales, zeros, transpose=True 32 | ) 33 | else: 34 | grad = mm._matmul4bit_v2_recons( 35 | grad_output, qweight, scales, zeros, g_idx, transpose=True 36 | ) 37 | return grad, None, None, None, None, None, None 38 | 39 | class Autograd2bit(torch.autograd.Function): 40 | @staticmethod 41 | @custom_fwd(cast_inputs=torch.float16) 42 | def forward(ctx, x, qweight, scales, zeros, g_idx): 43 | ctx.save_for_backward(qweight, scales, zeros, g_idx) 44 | output = mm._matmul2bit_v2_recons(x, qweight, scales, zeros, g_idx) 45 | output = output.clone() 46 | return output 47 | 48 | @staticmethod 49 | @custom_bwd 50 | def backward(ctx, grad_output): 51 | qweight, scales, zeros, g_idx = ctx.saved_tensors 52 | if ctx.needs_input_grad[0]: 53 | grad = mm._matmul2bit_v2_recons( 54 | grad_output, qweight, scales, zeros, g_idx, transpose=True 55 | ) 56 | return grad, None, None, None, None, None, None 57 | 58 | class Autograd3bit(torch.autograd.Function): 59 | @staticmethod 60 | @custom_fwd(cast_inputs=torch.float16) 61 | def forward(ctx, x, qweight, scales, qzeros, g_idx, wf, outfeatures): 62 | ctx.save_for_backward(qweight, scales, qzeros, g_idx, wf) 63 | # output = mm.matmul3bit(x, qweight, scales, qzeros, g_idx, outfeatures) 64 | # output = output.half() 65 | # below, we instead unpack weights in pytorch 66 | weight = unpack_weight_3bits(qweight, scales, qzeros, g_idx, wf) 67 | output = torch.matmul(x.half(), weight) 68 | output.reshape(x.shape[:-1] + (outfeatures,)) 69 | return output 70 | 71 | @staticmethod 72 | @custom_bwd 73 | def backward(ctx, grad_output): 74 | qweight, scales, qzeros, g_idx, wf = ctx.saved_tensors 75 | if ctx.needs_input_grad[0]: 76 | weight = unpack_weight_3bits(qweight, scales, qzeros, g_idx, wf) 77 | grad = torch.matmul(grad_output.half(), weight.T) 78 | return grad, None, None, None, None, None, None, None 79 | 80 | def classic_forward( 81 | x, qweight, bias, scales, qzeros, g_idx, outfeatures, wf=None, 82 | bits=4, is_cuda=True, kernel_switch_threshold=128 83 | ): 84 | out_shape = x.shape[:-1] + (outfeatures, ) 85 | x = x.reshape(-1,x.shape[-1]) 86 | # dtype = x.dtype 87 | # x = x.float() 88 | if is_cuda is True and (kernel_switch_threshold is False or x.shape[0] < kernel_switch_threshold): 89 | raise NotImplementedError() # code below needs some fixes 90 | out = torch.zeros((x.shape[0], outfeatures), device=x.device, dtype=torch.float32) 91 | if bits == 2: 92 | quant_cuda.vecquant2matmul(x.float(), qweight, out, scales.float(), qzeros, g_idx) 93 | elif bits == 3: 94 | quant_cuda.vecquant3matmul(x.float(), qweight, out, scales.float(), qzeros, g_idx) 95 | elif bits == 4: 96 | quant_cuda.vecquant4matmul(x.float(), qweight, out, scales.float(), qzeros, g_idx) 97 | elif bits == 8: 98 | 
quant_cuda.vecquant8matmul(x.float(), qweight, out, scales.float(), qzeros, g_idx) 99 | out = out.half() 100 | else: 101 | weight = unpack_weight(qweight, scales, qzeros, g_idx, wf, bits) 102 | out = torch.matmul(x.half(), weight) 103 | del weight 104 | 105 | out = out.reshape(out_shape) 106 | out = out + bias if bias is not None else out 107 | # out = out.to(dtype) 108 | return out 109 | 110 | def unpack_weight(qweight, scales, qzeros, g_idx, wf=None, bits=4): 111 | if bits == 3: 112 | return unpack_weight_3bits(qweight, scales, qzeros, g_idx, wf) 113 | elif bits in [2,4,8]: 114 | zeros = torch.bitwise_right_shift(torch.unsqueeze(qzeros, 2).expand(-1, -1, 32 // bits), wf.unsqueeze(0)).to(torch.int16 if bits == 8 else torch.int8) 115 | torch.bitwise_and(zeros, (2 ** bits) - 1, out=zeros) 116 | 117 | zeros = zeros + 1 118 | zeros = zeros.reshape(scales.shape) 119 | 120 | weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 1).expand(-1, 32 // bits, -1), wf.unsqueeze(-1)).to(torch.int16 if bits == 8 else torch.int8) 121 | torch.bitwise_and(weight,(2 ** bits) - 1, out=weight) 122 | 123 | weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2]) 124 | 125 | g_idx_long = g_idx.to(torch.long) 126 | weight = (scales[g_idx_long] * (weight - zeros[g_idx_long])) 127 | else: 128 | raise NotImplementedError() 129 | 130 | return weight 131 | 132 | def unpack_weight_3bits(qweight, scales, qzeros, g_idx, wf=None): 133 | zeros = qzeros.reshape(qzeros.shape[0], qzeros.shape[1]//3, 3, 1).expand(-1, -1, -1, 12) 134 | zeros = (zeros >> wf.unsqueeze(0)) 135 | zeros[:,:,0,10] = (zeros[:,:,0,10]&0x3) | ((zeros[:,:,1,0] << 2)&0x4) 136 | zeros[:,:,1,11] = (zeros[:,:,1,11]&0x1) | ((zeros[:,:,2,0] << 1)&0x6) 137 | zeros &= 0x7 138 | zeros = torch.cat([zeros[:,:,0,:11], zeros[:,:,1,1:12], zeros[:,:,2,1:11]], dim=2) 139 | 140 | zeros = zeros + 1 141 | zeros = zeros.reshape(scales.shape) 142 | 143 | weight = qweight.reshape(qweight.shape[0]//3, 3, 1, qweight.shape[1]).expand(-1, -1, 12, -1) 144 | weight = (weight >> wf.unsqueeze(-1))&0x7 145 | weight[:,0,10] = (weight[:,0,10]&0x3) | ((weight[:,1,0] << 2)&0x4) 146 | weight[:,1,11] = (weight[:,1,11]&0x1) | ((weight[:,2,0] << 1)&0x6) 147 | weight &= 0x7 148 | weight = torch.cat([weight[:,0,:11], weight[:,1,1:12], weight[:,2,1:11]], dim=1) 149 | 150 | weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2]) 151 | 152 | g_idx_long = g_idx.to(torch.long) 153 | weight = (scales[g_idx_long] * (weight - zeros[g_idx_long])) 154 | # out = torch.matmul(x.half(), weights) 155 | # weight -= zeros[g_idx_long] 156 | # weight = weight.to(torch.half) 157 | # weight *= scales[g_idx_long] 158 | return weight 159 | 160 | # ---------------------------------------------------------------------------- 161 | # helpers 162 | 163 | buffer_mat_dic = {} 164 | def get_buffer(shape_of_qweight, dtype=torch.float16, device='cuda'): 165 | if shape_of_qweight not in buffer_mat_dic.keys(): 166 | buffer_mat_dic[shape_of_qweight] = torch.zeros( 167 | (shape_of_qweight[0] * 8, shape_of_qweight[1]), 168 | dtype=dtype, device=device 169 | ) 170 | return buffer_mat_dic[shape_of_qweight] 171 | -------------------------------------------------------------------------------- /llmtune/run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from llmtune.config import LLM_MODELS 3 | 4 | # ---------------------------------------------------------------------------- 5 | 6 | def make_parser(): 7 | parser =
argparse.ArgumentParser() 8 | parser.set_defaults(func=lambda args: parser.print_help()) 9 | subparsers = parser.add_subparsers(title='Commands') 10 | 11 | # generate 12 | 13 | gen_parser = subparsers.add_parser('generate') 14 | gen_parser.set_defaults(func=generate) 15 | 16 | gen_parser.add_argument('--model', required=True, 17 | help='Path or HF hub name of model to load') 18 | gen_parser.add_argument('--tokenizer', required=False, 19 | help='Path or HF hub name of tokenizer to load (default is model)') 20 | gen_parser.add_argument('--adapter', type=str, required=False, 21 | help='Path to the folder with the Lora adapter.') 22 | gen_parser.add_argument('--groupsize', type=int, default=-1, 23 | help='Groupsize used for quantization; -1 uses full row.') 24 | gen_parser.add_argument('--prompt', type=str, default='', 25 | help='Text used to initialize generation') 26 | gen_parser.add_argument('--instruction', type=str, default='', 27 | help='Instruction for an alpaca-style model') 28 | gen_parser.add_argument('--min-length', type=int, default=10, 29 | help='Minimum length of the sequence to be generated.') 30 | gen_parser.add_argument('--max-length', type=int, default=200, 31 | help='Maximum length of the sequence to be generated.') 32 | gen_parser.add_argument('--top_p', type=float, default=.95, 33 | help='Top p sampling parameter.') 34 | gen_parser.add_argument('--top_k', type=int, default=50, 35 | help='Top p sampling parameter.') 36 | gen_parser.add_argument('--temperature', type=float, default=1.0, 37 | help='Sampling temperature.') 38 | 39 | # quantize 40 | 41 | quant_parser = subparsers.add_parser('quantize') 42 | quant_parser.set_defaults(func=quantize) 43 | 44 | quant_parser.add_argument('--model', required=True, 45 | help='Path or HF hub name of model to load') 46 | quant_parser.add_argument('--save', type=str, required=True, 47 | help='Path to the saved model weights.') 48 | quant_parser.add_argument('--bits', type=int, # required=True, 49 | choices=[2, 3, 4, 8], help='#bits to use for quantization.') 50 | quant_parser.add_argument('--dataset', type=str, default='c4', 51 | choices=['wikitext2', 'ptb', 'c4'], 52 | help='Where to extract calibration data from.') 53 | quant_parser.add_argument('--seed', type=int, default=0, 54 | help='Seed for sampling the calibration data.') 55 | quant_parser.add_argument('--nsamples', type=int, default=128, 56 | help='Number of calibration data samples.') 57 | quant_parser.add_argument('--percdamp', type=float, default=.01, 58 | help='Percent of the average Hessian diagonal to use for dampening.') 59 | quant_parser.add_argument('--groupsize', type=int, default=-1, 60 | help='Groupsize to use for quantization; -1 uses full row.') 61 | quant_parser.add_argument('--act-order', action='store_true', 62 | help='Whether to apply the activation order GPTQ heuristic.') 63 | quant_parser.add_argument('--nearest', action='store_true', 64 | help='Use basic round-to-nearest quantization.') 65 | 66 | # finetune 67 | 68 | tune_parser = subparsers.add_parser('finetune') 69 | tune_parser.set_defaults(func=finetune) 70 | 71 | # finetune model config 72 | tune_parser.add_argument('--model', required=True, 73 | help='Path or HF hub name of model to load') 74 | tune_parser.add_argument('--tokenizer', required=False, 75 | help='Path or HF hub name of tokenizer to load (default is model)') 76 | tune_parser.add_argument("--data-type", choices=["alpaca", "gpt4all"], 77 | help="Dataset format", default="alpaca") 78 | tune_parser.add_argument("--dataset", required=False, 79 
| help="Path to local dataset file.") 80 | tune_parser.add_argument('--adapter', type=str, required=False, 81 | help='Path to Lora adapter folder (also holds checkpoints)') 82 | tune_parser.add_argument('--groupsize', type=int, 83 | help='Groupsize used for quantization; -1 uses full row.') 84 | 85 | # finetune training config 86 | tune_parser.add_argument("--mbatch_size", default=1, type=int, 87 | help="Micro-batch size. ") 88 | tune_parser.add_argument("--batch_size", default=2, type=int, 89 | help="Batch size. ") 90 | tune_parser.add_argument("--epochs", default=3, type=int, 91 | help="Epochs. ") 92 | tune_parser.add_argument("--lr", default=2e-4, type=float, 93 | help="Learning rate. ") 94 | tune_parser.add_argument("--cutoff_len", default=256, type=int, 95 | help="") 96 | tune_parser.add_argument("--lora_r", default=8, type=int, 97 | help="") 98 | tune_parser.add_argument("--lora_alpha", default=16, type=int, 99 | help="") 100 | tune_parser.add_argument("--lora_dropout", default=0.05, type=float, 101 | help="") 102 | tune_parser.add_argument("--val_set_size", default=0.2, type=float, 103 | help="Validation set size. ") 104 | tune_parser.add_argument("--warmup_steps", default=50, type=int, 105 | help="") 106 | tune_parser.add_argument("--save_steps", default=50, type=int, 107 | help="") 108 | tune_parser.add_argument("--save_total_limit", default=3, type=int, 109 | help="") 110 | tune_parser.add_argument("--logging_steps", default=10, type=int, 111 | help="") 112 | 113 | return parser 114 | 115 | # ---------------------------------------------------------------------------- 116 | 117 | def main(): 118 | parser = make_parser() 119 | args = parser.parse_args() 120 | args.func(args) 121 | 122 | def generate(args): 123 | import llmtune.executor as llmtune 124 | llm = llmtune.load_llm(args.model) 125 | tk_name = args.tokenizer if args.tokenizer is not None else args.model 126 | tokenizer = llmtune.load_tokenizer(tk_name, llm.llm_config) 127 | if args.adapter is not None: 128 | llm = llmtune.load_adapter(llm, adapter_path=args.adapter) 129 | if args.prompt and args.instruction: 130 | raise Exception('Cannot specify both prompt and instruction') 131 | if args.instruction: 132 | from llmtune.data.alpaca import make_prompt 133 | prompt = make_prompt(args.instruction, input_="") 134 | else: 135 | prompt = args.prompt 136 | 137 | output = llmtune.generate( 138 | llm, 139 | tokenizer, 140 | prompt, 141 | args.min_length, 142 | args.max_length, 143 | args.temperature, 144 | args.top_k, 145 | args.top_p, 146 | ) 147 | 148 | if args.instruction: 149 | from llmtune.data.alpaca import make_output 150 | output = make_output(output) 151 | 152 | print(output) 153 | 154 | def finetune(args): 155 | import llmtune.executor as llmtune 156 | llm = llmtune.load_llm(args.model) 157 | tk_name = args.tokenizer if args.tokenizer is not None else args.model 158 | tokenizer = llmtune.load_tokenizer(tk_name, llm.llm_config) 159 | from llmtune.config import get_finetune_config 160 | finetune_config = get_finetune_config(args) 161 | from llmtune.executor import finetune 162 | finetune(llm, tokenizer, finetune_config) 163 | 164 | def quantize(args): 165 | from llmtune.config import get_quant_config 166 | quant_config = get_quant_config(args) 167 | import llmtune.executor as llmtune 168 | llm = llmtune.load_llm(args.model) 169 | output = llmtune.quantize( 170 | llm, 171 | quant_config 172 | ) 173 | 174 | if __name__ == '__main__': 175 | main() 
-------------------------------------------------------------------------------- /finetune/mnli-llama/eval_mnli_llmtune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True) 6 | parser.add_argument('--weight_path', type=str, help='Path to the weights', required=True) 7 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True) 8 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 9 | parser.add_argument('--file_name', type=str, help='file name to store predictions and acc', required=True) 10 | parser.add_argument('--checkpoint_name', type=str, help='folder name to store all the check points', required=True) 11 | parser.add_argument('--start_index', type=int, help='model seed number', required=True) 12 | parser.add_argument('--end_index', type=int, help='model seed number', required=True) 13 | 14 | # Parse the arguments 15 | args = parser.parse_args() 16 | 17 | # Use the command line arguments in your script 18 | print('Model Name:', args.model_name) 19 | print('Weight Path:', args.weight_path) 20 | print('Adapter Path: ', args.adapter) 21 | print('Seed: ', args.seed) 22 | 23 | import random 24 | import json 25 | import os 26 | import pickle 27 | 28 | # import wandb 29 | import torch 30 | import numpy as np 31 | # import bitsandbytes as bnb 32 | from tqdm import tqdm 33 | import transformers 34 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 35 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 36 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 37 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 38 | from datasets import load_dataset 39 | 40 | from utils import * 41 | from data_mnli_label import * 42 | 43 | import evaluate 44 | import numpy as np 45 | from datasets import load_from_disk 46 | from tqdm import tqdm 47 | 48 | from llmtune.executor import load_llm, load_adapter 49 | from llmtune.engine.lora.peft import quant_peft 50 | 51 | output_dir = args.adapter 52 | model_name = "huggyllama/llama-13b" 53 | seed = args.seed 54 | train_sample_rate = 1.0 55 | val_sample_rate = 1.0 56 | local_rank = 0 57 | 58 | set_random_seed(seed) 59 | logging.set_verbosity_info() 60 | 61 | # with open(config_file, "r") as r: 62 | # config = json.load(r) 63 | 64 | device_map = "auto" 65 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 66 | ddp = world_size != 1 67 | 68 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 69 | tokenizer = fix_tokenizer(tokenizer) 70 | # tokenizer.save_pretrained(output_dir) 71 | 72 | dataset = load_dataset('multi_nli') 73 | train_records = dataset['train'] 74 | val_records = dataset['validation_matched'] 75 | #random.shuffle(train_records) 76 | print("train_record[0]: ",train_records[0]) 77 | 78 | ## Config for llama 7-b 79 | model_type = "causal" 80 | templates_path = "llama_lora_mnli.json" 81 | only_target_loss = False 82 | 83 | llmtune_model_name = args.model_name 84 | llmtune_quantized_weights_path = args.weight_path ## probably want to change this using our version of the right way 85 | llmtune_groupsize 
= 64 86 | 87 | 88 | llm, _ = load_llm( 89 | llmtune_model_name, 90 | llmtune_quantized_weights_path, 91 | llmtune_groupsize 92 | ) 93 | model = fix_model(llm, tokenizer, use_resize=False) 94 | 95 | # Default model generation params 96 | model.config.num_beams = 5 97 | 98 | 99 | if not ddp and torch.cuda.device_count() > 1: 100 | model.is_parallelizable = True 101 | model.model_parallel = True 102 | 103 | 104 | model = load_adapter(model, adapter_path=output_dir) 105 | 106 | # Metric 107 | 108 | def evaluate_peft_model_mnli(sample,max_target_length=65): 109 | instruction, input, genre = sample['premise'], sample['hypothesis'], sample['genre'] 110 | sample_word = f"### Premise: {instruction}\n ### Hypothesis: {input}\n ### Genre: {genre} ### Label: " 111 | print(sample_word) 112 | input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda() 113 | outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens = 5) 114 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"") 115 | output = output.strip() 116 | print(f"Output:\n{output}") 117 | # Some simple post-processing 118 | return output 119 | 120 | 121 | 122 | def acc_compute(predictions,references): 123 | acc = 0 124 | for i in range(len(predictions)): 125 | if predictions[i].lower() == references[i].lower(): 126 | acc += 1 127 | acc /= len(predictions) 128 | 129 | print("accuracy:", acc) 130 | return acc 131 | 132 | 133 | def store_pred(file_name_pickle_pred,file_name_pickle_ref,predictions,references): 134 | with open(file_name_pickle_pred, "wb") as fp: #Pickling 135 | pickle.dump(predictions, fp) 136 | with open(file_name_pickle_ref, "wb") as fp: #Pickling 137 | pickle.dump(references, fp) 138 | 139 | 140 | 141 | 142 | ##Arguments setting 143 | start_index = args.start_index 144 | end_index = args.end_index 145 | eval_len = end_index - start_index 146 | eval_save_len = eval_len // 10 147 | print("Evaluation will start at: ", start_index) 148 | print("Evaluation will end at: ", end_index) 149 | print(f'Evaluation will save at every {eval_save_len} steps') 150 | 151 | 152 | ## Create Check point Folder 153 | checkpoint_path = f'{args.checkpoint_name}_{start_index}_{end_index}' 154 | 155 | current_directory = os.getcwd() 156 | final_directory = os.path.join(current_directory, checkpoint_path) 157 | if not os.path.exists(final_directory): 158 | os.makedirs(final_directory) 159 | 160 | 161 | 162 | 163 | 164 | predictions = [] 165 | references_orig = val_records['label'][start_index:end_index] 166 | ## convert references to list of strings 167 | references = [] 168 | for item in references_orig: 169 | references.append(str(item)) 170 | 171 | 172 | count_eval = 0 173 | for idx in tqdm(range(start_index, end_index)): 174 | sample = val_records[idx] 175 | p = evaluate_peft_model_mnli(sample) 176 | predictions.append(p) 177 | count_eval += 1 178 | ## Detecting checkpoing 179 | if (count_eval%eval_save_len == 0): 180 | print(f'=>=>Checkpointing at {count_eval} steps<=<=') 181 | 182 | predictions_step = [s.strip() for s in predictions] 183 | print("prediction_step: ", predictions_step) 184 | references_step = references[0:count_eval] 185 | print("references_step: ", references_step) 186 | acc = acc_compute(predictions_step,references_step) 187 | checkpoint_name_txt = f'{final_directory}/{count_eval}.txt' 188 | checkpoint_name_pred = f'{final_directory}/{count_eval}_pred' ## pickle file for pred list 189 | checkpoint_name_ref = 
f'{final_directory}/{count_eval}_ref' ## pickle file for ref list 190 | ## writing pickle file 191 | store_pred(checkpoint_name_pred,checkpoint_name_ref,predictions_step,references_step) 192 | with open(checkpoint_name_txt, "w") as f: 193 | for item in predictions_step: 194 | # write each item on a new line 195 | f.write("%s\n" % item) 196 | f.write("%s\n" % acc) 197 | 198 | 199 | 200 | 201 | predictions = [s.strip() for s in predictions] 202 | 203 | 204 | 205 | file_name = args.file_name 206 | 207 | with open(file_name, 'w') as f: 208 | for item in predictions: 209 | # write each item on a new line 210 | f.write("%s\n" % item) 211 | f.write("%s\n" % acc) 212 | 213 | 214 | file_name_pickle_pred = f'{final_directory}/final_pred_{start_index}_{end_index}' 215 | file_name_pickle_ref = f'{final_directory}/final_ref_{start_index}_{end_index}' 216 | 217 | store_pred(file_name_pickle_pred,file_name_pickle_ref,predictions,references) 218 | 219 | 220 | """ 221 | Loading pickle file 222 | with open("test", "rb") as fp: # Unpickling 223 | b = pickle.load(fp) 224 | """ 225 | -------------------------------------------------------------------------------- /finetune/samsum-opt/train_samsum_opt_4bit_llmtune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True) 6 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True) 7 | parser.add_argument('--mbatch_size', type=int, help='mbatch size for training', required=True) 8 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 9 | 10 | # Parse the arguments 11 | args = parser.parse_args() 12 | 13 | # Use the command line arguments in your script 14 | print('Model Name:', args.model_name) 15 | print('Adapter Path: ', args.adapter) 16 | print('Seed: ', args.seed) 17 | print('mbatch_size: ', args.mbatch_size) 18 | 19 | 20 | import os 21 | import torch 22 | import transformers 23 | from transformers import AutoTokenizer 24 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 25 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 26 | from llmtune.llms.autollm import AutoLLMForCausalLM 27 | from llmtune.engine.lora.config import FinetuneConfig 28 | from llmtune.engine.lora.peft import quant_peft 29 | from llmtune.utils import to_half_precision 30 | from datasets import load_dataset 31 | 32 | from utils import * 33 | from data import * 34 | 35 | # os env setting 36 | os.environ["WANDB_DISABLED"] = "true" 37 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 38 | 39 | # model config 40 | model_name = args.model_name 41 | tokenizer_name = 'facebook/opt-6.7b' 42 | DEV = 'cuda' 43 | 44 | transformers.logging.set_verbosity_info() 45 | 46 | # load tokenizer 47 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) 48 | tokenizer.pad_token_id = 0 49 | ## Fix Tokenizer 50 | tokenizer = fix_tokenizer_opt(tokenizer) 51 | 52 | # load model 53 | llm = AutoLLMForCausalLM.from_pretrained(model_name) 54 | ## Fix Model 55 | llm = fix_model(llm, tokenizer, use_resize=False) 56 | llm.eval() 57 | llm = llm.to(DEV) 58 | llm = to_half_precision(llm) 59 | 60 | 61 | # finetune training
config 62 | MICRO_BATCH_SIZE=args.mbatch_size 63 | BATCH_SIZE = 128 64 | GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE 65 | EPOCHS = 3 66 | LEARNING_RATE = 1e-3 # the Karpathy constant 67 | CUTOFF_LEN = 128 # 128 accounts for about 95% of the data 68 | LORA_R = 8 69 | LORA_ALPHA = 32 70 | LORA_DROPOUT = 0.1 71 | VAL_SET_SIZE= 2000 72 | 73 | # data/gpu config 74 | seed = args.seed 75 | set_random_seed(seed) 76 | train_sample_rate = 1.0 77 | val_sample_rate = 1.0 78 | 79 | device_map = "auto" 80 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 81 | ddp = world_size != 1 82 | 83 | # if ddp: 84 | # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 85 | # gradient_accumulation_steps = gradient_accumulation_steps // world_size 86 | 87 | # adapter_path = '/share/kuleshov/vk379/alpacas/opt-7b-quantized-lora' 88 | lora_out_dir = args.adapter 89 | 90 | # set up lora config 91 | lora_config = quant_peft.LoraConfig( 92 | r=LORA_R, 93 | lora_alpha=LORA_ALPHA, 94 | target_modules=["q_proj", "v_proj"], 95 | lora_dropout=LORA_DROPOUT, 96 | bias="none", 97 | task_type="CAUSAL_LM", 98 | ) 99 | 100 | 101 | if not ddp and torch.cuda.device_count() > 1: 102 | llm.is_parallelizable = True 103 | llm.model_parallel = True 104 | 105 | 106 | # create a new lora from config 107 | model = quant_peft.get_peft_model(llm, lora_config) 108 | 109 | if not ddp and torch.cuda.device_count() > 1: 110 | print("GPU parallel acctivated") 111 | model.is_parallelizable = True 112 | model.model_parallel = True 113 | 114 | # load stanford alpaca data 115 | dataset = load_dataset('samsum') 116 | train_records = dataset['train'] 117 | val_records = dataset['test'] 118 | 119 | ## Config for llama 65-b 120 | model_type = "causal" 121 | templates_path = "llama_lora_samsum.json" 122 | only_target_loss = False 123 | mode = "instruct" 124 | 125 | if mode == "instruct": 126 | max_source_tokens_count = 205 # Changed depending on the dataset 127 | max_target_tokens_count = 45 128 | target_field = "summary" 129 | source_field = "" #does not matter. (original alpaca-lora paper has additional "input" alongside instruction: instruction-input-output vs. 
instruction-response) 130 | 131 | train_dataset = InstructDataset( 132 | train_records, 133 | tokenizer, 134 | max_source_tokens_count=max_source_tokens_count, 135 | max_target_tokens_count=max_target_tokens_count, 136 | sample_rate=train_sample_rate, 137 | input_type=model_type, 138 | templates_path=templates_path, 139 | target_field=target_field, 140 | source_field=source_field, 141 | only_target_loss=only_target_loss 142 | ) 143 | 144 | val_dataset = InstructDataset( 145 | val_records, 146 | tokenizer, 147 | max_source_tokens_count=max_source_tokens_count, 148 | max_target_tokens_count=max_target_tokens_count, 149 | sample_rate=val_sample_rate, 150 | input_type=model_type, 151 | templates_path=templates_path, 152 | target_field=target_field, 153 | source_field=source_field, 154 | only_target_loss=only_target_loss 155 | ) 156 | 157 | ## Save the model 158 | dataloader_train = torch.utils.data.DataLoader(train_dataset) 159 | # torch.save(dataloader_train,'dataloader_train.pth') 160 | 161 | dataloader_val = torch.utils.data.DataLoader(val_dataset) 162 | # torch.save(dataloader_val,'dataloader_val.pth') 163 | 164 | else: 165 | assert False 166 | 167 | if "seq2seq" in model_type: 168 | data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8) 169 | else: 170 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8) 171 | 172 | print("INPUT_IDS") 173 | print(data_collator([train_dataset[0], train_dataset[1]])["input_ids"][0]) 174 | print("MASK") 175 | print(data_collator([train_dataset[0], train_dataset[1]])["attention_mask"][0]) 176 | print("LABELS") 177 | print(data_collator([train_dataset[0], train_dataset[1]])["labels"][0]) 178 | 179 | 180 | 181 | # Model configs 182 | model.config.num_beams = 5 183 | if mode == "instruct": 184 | max_tokens_count = max_target_tokens_count + max_source_tokens_count + 1 185 | model.config.max_length = max_tokens_count if model_type == "causal" else max_target_tokens_count 186 | 187 | 188 | # Training args 189 | training_arguments = transformers.TrainingArguments( 190 | per_device_train_batch_size = MICRO_BATCH_SIZE, 191 | per_device_eval_batch_size = 1, 192 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, 193 | warmup_ratio=0.06, 194 | #num_train_epochs=3, 195 | max_steps = 400, 196 | learning_rate=LEARNING_RATE, 197 | lr_scheduler_type = "cosine", ## LoRA original paper uses linear 198 | fp16=True, 199 | logging_steps=50, 200 | evaluation_strategy="steps", 201 | logging_strategy="steps", 202 | save_strategy="steps", 203 | eval_steps=50, 204 | save_steps=50, 205 | output_dir=lora_out_dir, 206 | optim = "adamw_torch", 207 | torch_compile = False, 208 | save_total_limit=2, 209 | load_best_model_at_end=True, 210 | ddp_find_unused_parameters=False if ddp else None, 211 | ) 212 | 213 | 214 | def preprocess_logits_for_metrics(logits, labels): 215 | """ 216 | Original Trainer may have a memory leak. 217 | This is a workaround to avoid storing too many tensors that are not needed. 
218 | """ 219 | pred_ids = torch.argmax(logits[0], dim=-1) 220 | return pred_ids, labels 221 | 222 | # Start trainer 223 | trainer = transformers.Trainer( 224 | model=model, 225 | args=training_arguments, 226 | train_dataset=train_dataset, 227 | eval_dataset=val_dataset, 228 | data_collator=data_collator, 229 | preprocess_logits_for_metrics = preprocess_logits_for_metrics, 230 | ) 231 | 232 | # print("Prallel Training status: ", training_arguments.parallel_mode) 233 | model.config.use_cache = False 234 | 235 | # use half precision 236 | model = to_half_precision(model) 237 | 238 | # start training 239 | checkpoint_dir = lora_out_dir 240 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir): 241 | trainer.train(resume_from_checkpoint=True) 242 | else: 243 | trainer.train() 244 | 245 | # Save Model 246 | model.save_pretrained(lora_out_dir) -------------------------------------------------------------------------------- /finetune/samsum-llama/eval_samsum_4bit_llmtune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True) 6 | parser.add_argument('--weight_path', type=str, help='Path to the weights', required=True) 7 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True) 8 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 9 | parser.add_argument('--file_name', type=str, help='file name to store predictions and acc', required=True) 10 | parser.add_argument('--checkpoint_name', type=str, help='folder name to store all the check points', required=True) 11 | parser.add_argument('--start_index', type=int, help='model seed number', required=True) 12 | parser.add_argument('--end_index', type=int, help='model seed number', required=True) 13 | 14 | # Parse the arguments 15 | args = parser.parse_args() 16 | 17 | # Use the command line arguments in your script 18 | print('Model Name:', args.model_name) 19 | print('Weight Path:', args.weight_path) 20 | print('Adapter Path: ', args.adapter) 21 | print('Seed: ', args.seed) 22 | 23 | import random 24 | import json 25 | import os 26 | 27 | #for eval 28 | import pickle 29 | 30 | # import wandb 31 | import torch 32 | import numpy as np 33 | # import bitsandbytes as bnb 34 | from tqdm import tqdm 35 | import transformers 36 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 37 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 38 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 39 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 40 | from datasets import load_dataset 41 | 42 | from utils import * 43 | from data import * 44 | 45 | import evaluate 46 | import numpy as np 47 | from datasets import load_from_disk 48 | from tqdm import tqdm 49 | 50 | from llmtune.executor import load_llm, load_adapter 51 | from llmtune.engine.lora.peft import quant_peft 52 | 53 | output_dir = args.adapter 54 | seed = args.seed 55 | train_sample_rate = 1.0 56 | val_sample_rate = 1.0 57 | local_rank = 0 58 | 59 | set_random_seed(seed) 60 | logging.set_verbosity_info() 61 | 62 | # with open(config_file, "r") as r: 63 | # config 
= json.load(r) 64 | 65 | device_map = "auto" 66 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 67 | ddp = world_size != 1 68 | 69 | tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-13b", use_fast=False,model_max_length=250) 70 | tokenizer = fix_tokenizer(tokenizer) 71 | # tokenizer.save_pretrained(output_dir) 72 | 73 | dataset = load_dataset('samsum') 74 | train_records = dataset['train'] 75 | val_records = dataset['test'] 76 | #random.shuffle(train_records) 77 | print("train_record[0]: ",train_records[0]) 78 | 79 | ## Config for llama 7-b 80 | model_type = "causal" 81 | templates_path = "llama_lora_samsum.json" 82 | only_target_loss = False 83 | mode = "instruct" 84 | 85 | llmtune_model_name = args.model_name 86 | llmtune_quantized_weights_path = args.weight_path 87 | llmtune_groupsize = 64 88 | 89 | llm, _ = load_llm( 90 | llmtune_model_name, 91 | llmtune_quantized_weights_path, 92 | llmtune_groupsize 93 | ) 94 | model = fix_model(llm, tokenizer, use_resize=False) 95 | 96 | # Default model generation params 97 | model.config.num_beams = 5 98 | 99 | 100 | if not ddp and torch.cuda.device_count() > 1: 101 | model.is_parallelizable = True 102 | model.model_parallel = True 103 | 104 | 105 | model = load_adapter(model, adapter_path=output_dir) 106 | 107 | # Metric 108 | metric = evaluate.load("rouge") 109 | 110 | def evaluate_peft_model_samsum(sample,max_target_length=45): 111 | # Load dataset from the hub and get a sample 112 | sample_word = f"### Summarize this: {sample}\n ### Output: " 113 | input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda() 114 | with torch.autocast("cuda"): 115 | outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens = 45) 116 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"") 117 | output = output.strip() 118 | print(f"Output:\n{output}") 119 | # Some simple post-processing 120 | return output 121 | 122 | 123 | def rouge_compute(predictions,references): 124 | rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True) 125 | return rogue 126 | 127 | 128 | def store_pred(file_name_pickle_pred,file_name_pickle_ref,predictions,references): 129 | with open(file_name_pickle_pred, "wb") as fp: #Pickling 130 | pickle.dump(predictions, fp) 131 | with open(file_name_pickle_ref, "wb") as fp: #Pickling 132 | pickle.dump(references, fp) 133 | 134 | 135 | 136 | ##Arguments setting 137 | start_index = args.start_index 138 | end_index = args.end_index 139 | eval_len = end_index - start_index 140 | eval_save_len = eval_len // 10 141 | print("Evaluation will start at: ", start_index) 142 | print("Evaluation will end at: ", end_index) 143 | print(f'Evaluation will save at every {eval_save_len} steps') 144 | 145 | 146 | ## Create Check point Folder 147 | checkpoint_path = f'{args.checkpoint_name}_{start_index}_{end_index}' 148 | 149 | current_directory = os.getcwd() 150 | final_directory = os.path.join(current_directory, checkpoint_path) 151 | if not os.path.exists(final_directory): 152 | os.makedirs(final_directory) 153 | 154 | 155 | 156 | predictions = [] 157 | references_origin = val_records['summary'][start_index:end_index] 158 | references = [] 159 | 160 | count_eval = 0 161 | 162 | 163 | for idx in tqdm(range(start_index, end_index)): 164 | sample = val_records['dialogue'][idx] 165 | # Load dataset from the hub and get a sample 166 | sample_word = f"### Summarize this: {sample}\n ### Output: " 167 | 
input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda() 168 | 169 | print("length of input ids:", len(input_ids[0])) 170 | # if (len(input_ids[0]) < 300): 171 | with torch.inference_mode(), torch.autocast("cuda"): 172 | outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens = 45) 173 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"") 174 | output = output.strip() 175 | print(f"Model Output: \n{output}") 176 | predictions.append(output) 177 | print(f"Reference Output: \n {references_origin[count_eval]}") 178 | references.append(references_origin[count_eval]) 179 | count_eval+=1 180 | 181 | ## Checkpointing 182 | if (count_eval%eval_save_len == 0): 183 | print(f'=>=>Checkpointing at {count_eval} steps<=<=') 184 | 185 | predictions_step = [s.strip() for s in predictions] 186 | print("prediction_step: ", predictions_step) 187 | references_step = references 188 | print("references_step: ", references_step) 189 | rouge = rouge_compute(predictions_step,references_step) 190 | checkpoint_name_txt = f'{final_directory}/{count_eval}.txt' 191 | checkpoint_name_pred = f'{final_directory}/{count_eval}_pred' ## pickle file for pred list 192 | checkpoint_name_ref = f'{final_directory}/{count_eval}_ref' ## pickle file for ref list 193 | ## writing pickle file 194 | store_pred(checkpoint_name_pred, checkpoint_name_ref, predictions_step, references_step) 195 | with open(checkpoint_name_txt, "w") as f: 196 | for item in predictions_step: 197 | # write each item on a new line 198 | f.write("%s\n" % item) 199 | f.write(f'Seed: {seed}\n') 200 | f.write(f"rouge1: {rouge['rouge1']* 100:.2f}%\n") 201 | f.write(f"rouge2: {rouge['rouge2']* 100:.2f}%\n") 202 | f.write(f"rougeL: {rouge['rougeL']* 100:.2f}%\n") 203 | f.write(f"rougeLsum: {rouge['rougeLsum']* 100:.2f}%\n") 204 | 205 | 206 | predictions = [s.strip() for s in predictions] 207 | 208 | 209 | # compute metric 210 | rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True) 211 | 212 | file_name = args.file_name 213 | with open(file_name, 'w') as f: 214 | f.write(f'Seed: {seed}\n') 215 | f.write(f"rouge1: {rouge['rouge1']* 100:.2f}%\n") 216 | f.write(f"rouge2: {rouge['rouge2']* 100:.2f}%\n") 217 | f.write(f"rougeL: {rouge['rougeL']* 100:.2f}%\n") 218 | f.write(f"rougeLsum: {rouge['rougeLsum']* 100:.2f}%\n") 219 | -------------------------------------------------------------------------------- /finetune/samsum-llama/train_samsum_4bit.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True) 6 | parser.add_argument('--weight_path', type=str, help='Path to the weights', required=True) 7 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True) 8 | parser.add_argument('--mbatch_size', type=int, help='mbatch size for training', required=True) 9 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 10 | 11 | # Parse the arguments 12 | args = parser.parse_args() 13 | 14 | # Use the command line arguments in your script 15 | print('Model Name:', args.model_name) 16 | print('Weight Path:', args.weight_path) 17 | print('Adapter Path: ', args.adapter) 18 | print('Seed: ', args.seed) 19 | print('mbatch_size: ', 
args.mbatch_size) 20 | 21 | 22 | import random 23 | import json 24 | import os 25 | 26 | # import wandb 27 | import torch 28 | import numpy as np 29 | import bitsandbytes as bnb 30 | from tqdm import tqdm 31 | import transformers 32 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 33 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 34 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 35 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 36 | from datasets import load_dataset 37 | 38 | # from src.dataset import InstructDataset, ChatDataset 39 | # from src.util.dl import set_random_seed, fix_tokenizer, fix_model 40 | # from src.util.io import read_jsonl 41 | 42 | from utils import * 43 | from data import * 44 | 45 | from llmtune.executor import load_llm, load_adapter 46 | from llmtune.engine.lora.peft import quant_peft 47 | 48 | 49 | # os.environ["WANDB_LOG_MODEL"] = "checkpoint" 50 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 51 | 52 | 53 | class SavePeftModelCallback(TrainerCallback): 54 | def on_save( 55 | self, 56 | args: TrainingArguments, 57 | state: TrainerState, 58 | control: TrainerControl, 59 | **kwargs, 60 | ): 61 | checkpoint_folder = os.path.join( 62 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}" 63 | ) 64 | 65 | peft_model_path = os.path.join(checkpoint_folder, "adapter_model") 66 | kwargs["model"].save_pretrained(peft_model_path) 67 | return control 68 | 69 | checkpoint = None 70 | seed = args.seed 71 | train_sample_rate = 1.0 72 | val_sample_rate = 1.0 73 | local_rank = 0 74 | # report_to = "wandb" 75 | output_dir = args.adapter 76 | 77 | set_random_seed(seed) 78 | logging.set_verbosity_info() 79 | 80 | # with open(config_file, "r") as r: 81 | # config = json.load(r) 82 | 83 | device_map = "auto" 84 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 85 | ddp = world_size != 1 86 | # if ddp: 87 | # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 88 | # gradient_accumulation_steps = gradient_accumulation_steps // world_size 89 | 90 | #deepspeed_config = config.get("deepspeed") 91 | 92 | 93 | 94 | ### Training Configuration 95 | #trainer_config = config["trainer"] 96 | 97 | MICRO_BATCH_SIZE = args.mbatch_size # this could actually be 5 but i like powers of 2 98 | BATCH_SIZE = 128 99 | GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE 100 | EPOCHS = 3 # we don't need 3 tbh 101 | LEARNING_RATE = 1e-3 # the Karpathy constant 102 | CUTOFF_LEN = 128 # 128 accounts for about 95% of the data 103 | LORA_R = 8 104 | LORA_ALPHA = 16 105 | LORA_DROPOUT = 0.05 106 | VAL_SET_SIZE= 2000 107 | 108 | def preprocess_logits_for_metrics(logits, labels): 109 | """ 110 | Original Trainer may have a memory leak. 111 | This is a workaround to avoid storing too many tensors that are not needed. 
112 | """ 113 | pred_ids = torch.argmax(logits[0], dim=-1) 114 | return pred_ids, labels 115 | 116 | trainer_config = transformers.TrainingArguments( 117 | per_device_train_batch_size = MICRO_BATCH_SIZE, 118 | per_device_eval_batch_size = MICRO_BATCH_SIZE, 119 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, 120 | warmup_ratio=0.06, 121 | #num_train_epochs=3, 122 | max_steps = 350, 123 | learning_rate=LEARNING_RATE, 124 | lr_scheduler_type = "cosine", ## LoRA original paper uses linear 125 | fp16=True, 126 | logging_steps=50, 127 | evaluation_strategy="steps", 128 | logging_strategy="steps", 129 | save_strategy="steps", 130 | eval_steps=50, 131 | save_steps=50, 132 | # report_to=report_to, 133 | output_dir=output_dir, 134 | optim = "adamw_torch", 135 | torch_compile = False, 136 | save_total_limit=2, 137 | load_best_model_at_end=True, 138 | ddp_find_unused_parameters=False if ddp else None, 139 | ) 140 | 141 | 142 | # ### Apply LoRA 143 | # 144 | # Here comes the magic with `peft`! Let's load a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) using `get_peft_model` utility function from `peft`. 145 | 146 | target_modules = None 147 | target_modules = ['q_proj', 'v_proj'] # edit with your desired target modules 148 | #lora_config = config.get("lora") 149 | lora_config = LoraConfig( 150 | r=8, lora_alpha=32, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM" 151 | ) 152 | 153 | callbacks = [SavePeftModelCallback] if lora_config else [] 154 | ##no need to use callbacks 155 | callbacks = [] 156 | 157 | training_args = trainer_config 158 | 159 | 160 | model_name = "huggyllama/llama-13b" 161 | 162 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 163 | tokenizer = fix_tokenizer(tokenizer) 164 | # tokenizer.save_pretrained(output_dir) 165 | 166 | dataset = load_dataset('samsum') 167 | train_records = dataset['train'] 168 | val_records = dataset['test'] 169 | #random.shuffle(train_records) 170 | print("train_record[0]: ",train_records[0]) 171 | 172 | ## Config for llama 65-b 173 | model_type = "causal" 174 | templates_path = "llama_lora_samsum.json" 175 | only_target_loss = False 176 | mode = "instruct" 177 | 178 | llmtune_model_name = args.model_name 179 | llmtune_quantized_weights_path = args.weight_path 180 | llmtune_groupsize = 64 181 | 182 | if mode == "instruct": 183 | max_source_tokens_count = 255 # Changed depending on the dataset 184 | max_target_tokens_count = 50 185 | target_field = "summary" 186 | source_field = "" #does not matter. (original alpaca-lora paper has additional "input" alongside instruction: instruction-input-output vs. 
instruction-response) 187 | 188 | train_dataset = InstructDataset( 189 | train_records, 190 | tokenizer, 191 | max_source_tokens_count=max_source_tokens_count, 192 | max_target_tokens_count=max_target_tokens_count, 193 | sample_rate=train_sample_rate, 194 | input_type=model_type, 195 | templates_path=templates_path, 196 | target_field=target_field, 197 | source_field=source_field, 198 | only_target_loss=only_target_loss 199 | ) 200 | 201 | val_dataset = InstructDataset( 202 | val_records, 203 | tokenizer, 204 | max_source_tokens_count=max_source_tokens_count, 205 | max_target_tokens_count=max_target_tokens_count, 206 | sample_rate=val_sample_rate, 207 | input_type=model_type, 208 | templates_path=templates_path, 209 | target_field=target_field, 210 | source_field=source_field, 211 | only_target_loss=only_target_loss 212 | ) 213 | 214 | ## Save the model 215 | dataloader_train = torch.utils.data.DataLoader(train_dataset) 216 | # torch.save(dataloader_train,'dataloader_train.pth') 217 | 218 | dataloader_val = torch.utils.data.DataLoader(val_dataset) 219 | # torch.save(dataloader_val,'dataloader_val.pth') 220 | 221 | else: 222 | assert False 223 | 224 | if "seq2seq" in model_type: 225 | data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8) 226 | else: 227 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8) 228 | 229 | print("INPUT_IDS") 230 | print(data_collator([train_dataset[0], train_dataset[1]])["input_ids"][0]) 231 | print("MASK") 232 | print(data_collator([train_dataset[0], train_dataset[1]])["attention_mask"][0]) 233 | print("LABELS") 234 | print(data_collator([train_dataset[0], train_dataset[1]])["labels"][0]) 235 | 236 | llm, _ = load_llm( 237 | llmtune_model_name, 238 | llmtune_quantized_weights_path, 239 | llmtune_groupsize 240 | ) 241 | model = fix_model(llm, tokenizer, use_resize=False) 242 | 243 | # Default model generation params 244 | model.config.num_beams = 5 245 | if mode == "instruct": 246 | max_tokens_count = max_target_tokens_count + max_source_tokens_count + 1 247 | model.config.max_length = max_tokens_count if model_type == "causal" else max_target_tokens_count 248 | 249 | if not ddp and torch.cuda.device_count() > 1: 250 | model.is_parallelizable = True 251 | model.model_parallel = True 252 | 253 | if lora_config: 254 | #lora_config = LoraConfig(**lora_config) 255 | # model = get_peft_model(model, lora_config) 256 | model = load_adapter(model, lora_config=lora_config) 257 | 258 | trainer_class = Trainer ##if not omit_base_model_save else TrainerNoBaseSave 259 | print("Trainer class:", trainer_class) 260 | trainer = trainer_class( 261 | model=model, 262 | args=training_args, 263 | train_dataset=train_dataset, 264 | eval_dataset=val_dataset, 265 | callbacks=callbacks, 266 | data_collator=data_collator, 267 | preprocess_logits_for_metrics = preprocess_logits_for_metrics, 268 | ) 269 | 270 | # with wandb.init(project="llama_ft_samsum", name="llama finetuning run") as run: ## changed the name don't forget 271 | checkpoint_dir = output_dir 272 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir): 273 | trainer.train(resume_from_checkpoint=True) 274 | else: 275 | trainer.train() 276 | model.save_pretrained(output_dir) -------------------------------------------------------------------------------- /finetune/mnli-llama/train_mnli_llmtune_label.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = 
argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True) 6 | parser.add_argument('--weight_path', type=str, help='Path to the weights', required=True) 7 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True) 8 | parser.add_argument('--mbatch_size', type=int, help='mbatch size for training', required=True) 9 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 10 | 11 | # Parse the arguments 12 | args = parser.parse_args() 13 | 14 | # Use the command line arguments in your script 15 | print('Model Name:', args.model_name) 16 | print('Weight Path:', args.weight_path) 17 | print('Adapter Path: ', args.adapter) 18 | print('Seed: ', args.seed) 19 | print('mbatch_size: ', args.mbatch_size) 20 | 21 | 22 | import random 23 | import json 24 | import os 25 | 26 | # import wandb 27 | import torch 28 | import numpy as np 29 | import bitsandbytes as bnb 30 | from tqdm import tqdm 31 | import transformers 32 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 33 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 34 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 35 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 36 | from datasets import load_dataset 37 | 38 | # from src.dataset import InstructDataset, ChatDataset 39 | # from src.util.dl import set_random_seed, fix_tokenizer, fix_model 40 | # from src.util.io import read_jsonl 41 | 42 | from utils import * 43 | from data_mnli_label import * 44 | 45 | from llmtune.executor import load_llm, load_adapter 46 | from llmtune.engine.lora.peft import quant_peft 47 | 48 | 49 | # os.environ["WANDB_LOG_MODEL"] = "checkpoint" 50 | os.environ["WANDB_DISABLED"] = "true" 51 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 52 | 53 | 54 | class SavePeftModelCallback(TrainerCallback): 55 | def on_save( 56 | self, 57 | args: TrainingArguments, 58 | state: TrainerState, 59 | control: TrainerControl, 60 | **kwargs, 61 | ): 62 | checkpoint_folder = os.path.join( 63 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}" 64 | ) 65 | 66 | peft_model_path = os.path.join(checkpoint_folder, "adapter_model") 67 | kwargs["model"].save_pretrained(peft_model_path) 68 | return control 69 | 70 | checkpoint = None 71 | seed = args.seed 72 | train_sample_rate = 1.0 73 | val_sample_rate = 1.0 74 | local_rank = 0 75 | # report_to = "wandb" 76 | output_dir = args.adapter 77 | 78 | set_random_seed(seed) 79 | logging.set_verbosity_info() 80 | 81 | # with open(config_file, "r") as r: 82 | # config = json.load(r) 83 | 84 | device_map = "auto" 85 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 86 | ddp = world_size != 1 87 | 88 | if ddp: 89 | device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 90 | # gradient accumulation is rescaled by world_size below, once GRADIENT_ACCUMULATION_STEPS is defined 91 | 92 | #deepspeed_config = config.get("deepspeed") 93 | 94 | 95 | 96 | ### Training Configuration 97 | #trainer_config = config["trainer"] 98 | 99 | MICRO_BATCH_SIZE = args.mbatch_size # this could actually be 5 but i like powers of 2 100 | BATCH_SIZE = 256 101 | GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE // (world_size if ddp else 1) 102 | EPOCHS = 1 # we don't need 3 tbh 103 | LEARNING_RATE = 1e-3 # the Karpathy constant 104 | 
CUTOFF_LEN = 128 # 128 accounts for about 95% of the data 105 | LORA_R = 8 106 | LORA_ALPHA = 16 107 | LORA_DROPOUT = 0.05 108 | VAL_SET_SIZE= 2000 109 | 110 | def preprocess_logits_for_metrics(logits, labels): 111 | """ 112 | Original Trainer may have a memory leak. 113 | This is a workaround to avoid storing too many tensors that are not needed. 114 | """ 115 | pred_ids = torch.argmax(logits[0], dim=-1) 116 | return pred_ids, labels 117 | 118 | trainer_config = transformers.TrainingArguments( 119 | per_device_train_batch_size = MICRO_BATCH_SIZE, 120 | per_device_eval_batch_size = MICRO_BATCH_SIZE, 121 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, 122 | warmup_ratio=0.06, 123 | num_train_epochs=EPOCHS, 124 | # max_steps = 350, 125 | learning_rate=LEARNING_RATE, 126 | lr_scheduler_type = "cosine", ## LoRA original paper uses linear 127 | fp16=True, 128 | logging_steps=150, 129 | evaluation_strategy="steps", 130 | logging_strategy="steps", 131 | save_strategy="steps", 132 | eval_steps=300, 133 | save_steps=300, 134 | # report_to=report_to, 135 | output_dir=output_dir, 136 | optim = "adamw_torch", 137 | torch_compile = False, 138 | save_total_limit=2, 139 | load_best_model_at_end=True, 140 | ddp_find_unused_parameters=False if ddp else None, 141 | ) 142 | 143 | 144 | # ### Apply LoRA 145 | # 146 | # Here comes the magic with `peft`! Let's load a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) using `get_peft_model` utility function from `peft`. 147 | 148 | target_modules = None 149 | target_modules = ['q_proj', 'v_proj'] # edit with your desired target modules 150 | #lora_config = config.get("lora") 151 | lora_config = LoraConfig( 152 | r=8, lora_alpha=32, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM" 153 | ) 154 | 155 | callbacks = [SavePeftModelCallback] if lora_config else [] 156 | ##no need to use callbacks 157 | callbacks = [] 158 | 159 | training_args = trainer_config 160 | 161 | 162 | model_name = "huggyllama/llama-13b" 163 | 164 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 165 | tokenizer = fix_tokenizer(tokenizer) 166 | # tokenizer.save_pretrained(output_dir) 167 | 168 | dataset = load_dataset('multi_nli') 169 | train_records = dataset['train'] 170 | val_records = dataset['validation_matched'] 171 | #random.shuffle(train_records) 172 | print("train_record[0]: ",train_records[0]) 173 | 174 | model_type = "causal" 175 | templates_path = "llama_lora_mnli_label.json" 176 | only_target_loss = False 177 | mode = "instruct" 178 | 179 | llmtune_model_name = args.model_name 180 | llmtune_quantized_weights_path = args.weight_path 181 | llmtune_groupsize = 64 182 | 183 | if mode == "instruct": 184 | max_source_tokens_count = 64 # Changed depending on the dataset 185 | max_target_tokens_count = 4 186 | target_field = "" 187 | source_field = "" #does not matter. (original alpaca-lora paper has additional "input" alongside instruction: instruction-input-output vs. 
instruction-response) 188 | 189 | train_dataset = InstructDataset( 190 | train_records, 191 | tokenizer, 192 | max_source_tokens_count=max_source_tokens_count, 193 | max_target_tokens_count=max_target_tokens_count, 194 | sample_rate=train_sample_rate, 195 | input_type=model_type, 196 | templates_path=templates_path, 197 | target_field=target_field, 198 | source_field=source_field, 199 | only_target_loss=only_target_loss 200 | ) 201 | 202 | val_dataset = InstructDataset( 203 | val_records, 204 | tokenizer, 205 | max_source_tokens_count=max_source_tokens_count, 206 | max_target_tokens_count=max_target_tokens_count, 207 | sample_rate=val_sample_rate, 208 | input_type=model_type, 209 | templates_path=templates_path, 210 | target_field=target_field, 211 | source_field=source_field, 212 | only_target_loss=only_target_loss 213 | ) 214 | 215 | ## Save the model 216 | dataloader_train = torch.utils.data.DataLoader(train_dataset) 217 | # torch.save(dataloader_train,'dataloader_train.pth') 218 | 219 | dataloader_val = torch.utils.data.DataLoader(val_dataset) 220 | # torch.save(dataloader_val,'dataloader_val.pth') 221 | 222 | else: 223 | assert False 224 | 225 | if "seq2seq" in model_type: 226 | data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8) 227 | else: 228 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8) 229 | 230 | print("INPUT_IDS") 231 | print(data_collator([train_dataset[0], train_dataset[1]])["input_ids"][0]) 232 | print("MASK") 233 | print(data_collator([train_dataset[0], train_dataset[1]])["attention_mask"][0]) 234 | print("LABELS") 235 | print(data_collator([train_dataset[0], train_dataset[1]])["labels"][0]) 236 | 237 | llm, _ = load_llm( 238 | llmtune_model_name, 239 | llmtune_quantized_weights_path, 240 | llmtune_groupsize 241 | ) 242 | model = fix_model(llm, tokenizer, use_resize=False) 243 | 244 | # Default model generation params 245 | model.config.num_beams = 5 246 | if mode == "instruct": 247 | max_tokens_count = max_target_tokens_count + max_source_tokens_count + 1 248 | model.config.max_length = max_tokens_count if model_type == "causal" else max_target_tokens_count 249 | 250 | if not ddp and torch.cuda.device_count() > 1: 251 | model.is_parallelizable = True 252 | model.model_parallel = True 253 | 254 | if lora_config: 255 | #lora_config = LoraConfig(**lora_config) 256 | # model = get_peft_model(model, lora_config) 257 | model = load_adapter(model, lora_config=lora_config) 258 | 259 | trainer_class = Trainer ##if not omit_base_model_save else TrainerNoBaseSave 260 | print("Trainer class:", trainer_class) 261 | trainer = trainer_class( 262 | model=model, 263 | args=training_args, 264 | train_dataset=train_dataset, 265 | eval_dataset=val_dataset, 266 | callbacks=callbacks, 267 | data_collator=data_collator, 268 | # preprocess_logits_for_metrics = preprocess_logits_for_metrics, 269 | ) 270 | 271 | # with wandb.init(project="llama_ft_samsum", name="llama finetuning run") as run: ## changed the name don't forget 272 | checkpoint_dir = output_dir 273 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir): 274 | trainer.train(resume_from_checkpoint=True) 275 | else: 276 | trainer.train() 277 | model.save_pretrained(output_dir) -------------------------------------------------------------------------------- /finetune/samsum-llama/train_samsum_4bit_bnb.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = 
argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True) 6 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True) 7 | parser.add_argument('--mbatch_size', type=int, help='mbatch size for training', required=True) 8 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 9 | parser.add_argument('--repo_name', type=str, help='HF model name', required=True) 10 | 11 | 12 | # Parse the arguments 13 | args = parser.parse_args() 14 | 15 | # Use the command line arguments in your script 16 | print('Model Name:', args.model_name) 17 | print('Adapter Path: ', args.adapter) 18 | print('Seed: ', args.seed) 19 | print('mbatch_size: ', args.mbatch_size) 20 | 21 | 22 | import random 23 | import json 24 | import os 25 | 26 | # import wandb 27 | import torch 28 | import numpy as np 29 | import bitsandbytes as bnb 30 | from tqdm import tqdm 31 | import transformers 32 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 33 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 34 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 35 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 36 | from datasets import load_dataset 37 | 38 | from utils import * 39 | from data import * 40 | 41 | 42 | 43 | 44 | os.environ["WANDB_DISABLED"] = "true" 45 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 46 | 47 | 48 | 49 | 50 | class SavePeftModelCallback(TrainerCallback): 51 | def on_save( 52 | self, 53 | args: TrainingArguments, 54 | state: TrainerState, 55 | control: TrainerControl, 56 | **kwargs, 57 | ): 58 | checkpoint_folder = os.path.join( 59 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}" 60 | ) 61 | 62 | peft_model_path = os.path.join(checkpoint_folder, "adapter_model") 63 | kwargs["model"].save_pretrained(peft_model_path) 64 | return control 65 | 66 | 67 | checkpoint = None 68 | seed = args.seed 69 | train_sample_rate = 1.0 70 | val_sample_rate = 1.0 71 | local_rank = 0 72 | output_dir = args.adapter 73 | 74 | set_random_seed(seed) 75 | logging.set_verbosity_info() 76 | 77 | # with open(config_file, "r") as r: 78 | # config = json.load(r) 79 | 80 | 81 | device_map = "auto" 82 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 83 | ddp = world_size != 1 84 | if ddp: 85 | device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 86 | # gradient accumulation is rescaled by world_size below, once GRADIENT_ACCUMULATION_STEPS is defined 87 | 88 | 89 | #deepspeed_config = config.get("deepspeed") 90 | 91 | 92 | 93 | 94 | ### Training Configuration 95 | #trainer_config = config["trainer"] 96 | 97 | MICRO_BATCH_SIZE = args.mbatch_size # this could actually be 5 but i like powers of 2 98 | BATCH_SIZE = 128 99 | GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE // (world_size if ddp else 1) 100 | EPOCHS = 3 # we don't need 3 tbh 101 | LEARNING_RATE = 1e-3 # the Karpathy constant 102 | CUTOFF_LEN = 128 # 128 accounts for about 95% of the data 103 | LORA_R = 8 104 | LORA_ALPHA = 16 105 | LORA_DROPOUT = 0.05 106 | VAL_SET_SIZE= 2000 107 | 108 | def preprocess_logits_for_metrics(logits, labels): 109 | """ 110 | Original Trainer may have a memory leak. 111 | This is a workaround to avoid storing too many tensors that are not needed. 
112 | """ 113 | pred_ids = torch.argmax(logits[0], dim=-1) 114 | return pred_ids, labels 115 | 116 | trainer_config = transformers.TrainingArguments( 117 | per_device_train_batch_size = MICRO_BATCH_SIZE, 118 | per_device_eval_batch_size = MICRO_BATCH_SIZE, 119 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, 120 | warmup_ratio=0.06, 121 | #num_train_epochs=3, 122 | max_steps = 350, 123 | learning_rate=LEARNING_RATE, 124 | lr_scheduler_type = "cosine", ## LoRA original paper uses linear 125 | fp16=True, 126 | logging_steps=50, 127 | evaluation_strategy="steps", 128 | logging_strategy="steps", 129 | save_strategy="steps", 130 | eval_steps=50, 131 | save_steps=50, 132 | # report_to=report_to, 133 | output_dir=output_dir, 134 | optim = "adamw_torch", 135 | torch_compile = False, 136 | save_total_limit=2, 137 | load_best_model_at_end=False, 138 | ddp_find_unused_parameters=False if ddp else None, 139 | ) 140 | 141 | 142 | # ### Apply LoRA 143 | # 144 | # Here comes the magic with `peft`! Let's load a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) using `get_peft_model` utility function from `peft`. 145 | 146 | target_modules = None 147 | target_modules = ['q_proj', 'v_proj'] # edit with your desired target modules 148 | #lora_config = config.get("lora") 149 | lora_config = LoraConfig( 150 | r=8, lora_alpha=32, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM" 151 | ) 152 | 153 | callbacks = [SavePeftModelCallback] if lora_config else [] 154 | ##no need to use callbacks 155 | callbacks = [] 156 | 157 | training_args = trainer_config 158 | 159 | 160 | model_name = args.model_name 161 | 162 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 163 | tokenizer = fix_tokenizer(tokenizer) 164 | # tokenizer.save_pretrained(output_dir) 165 | 166 | dataset = load_dataset('samsum') 167 | train_records = dataset['train'] 168 | val_records = dataset['test'] 169 | #random.shuffle(train_records) 170 | print("train_record[0]: ",train_records[0]) 171 | 172 | ## Config for llama 65-b 173 | model_type = "causal" 174 | templates_path = "llama_lora_samsum.json" 175 | only_target_loss = False 176 | mode = "instruct" 177 | 178 | 179 | if mode == "instruct": 180 | max_source_tokens_count = 255 # Changed depending on the dataset 181 | max_target_tokens_count = 50 182 | target_field = "summary" 183 | source_field = "" #does not matter. (original alpaca-lora paper has additional "input" alongside instruction: instruction-input-output vs. 
instruction-response) 184 | 185 | train_dataset = InstructDataset( 186 | train_records, 187 | tokenizer, 188 | max_source_tokens_count=max_source_tokens_count, 189 | max_target_tokens_count=max_target_tokens_count, 190 | sample_rate=train_sample_rate, 191 | input_type=model_type, 192 | templates_path=templates_path, 193 | target_field=target_field, 194 | source_field=source_field, 195 | only_target_loss=only_target_loss 196 | ) 197 | 198 | val_dataset = InstructDataset( 199 | val_records, 200 | tokenizer, 201 | max_source_tokens_count=max_source_tokens_count, 202 | max_target_tokens_count=max_target_tokens_count, 203 | sample_rate=val_sample_rate, 204 | input_type=model_type, 205 | templates_path=templates_path, 206 | target_field=target_field, 207 | source_field=source_field, 208 | only_target_loss=only_target_loss 209 | ) 210 | 211 | else: 212 | assert False 213 | 214 | if "seq2seq" in model_type: 215 | data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8) 216 | else: 217 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8) 218 | 219 | print("INPUT_IDS") 220 | print(data_collator([train_dataset[0], train_dataset[1]])["input_ids"][0]) 221 | print("MASK") 222 | print(data_collator([train_dataset[0], train_dataset[1]])["attention_mask"][0]) 223 | print("LABELS") 224 | print(data_collator([train_dataset[0], train_dataset[1]])["labels"][0]) 225 | 226 | 227 | model_types = { 228 | "causal": AutoModelForCausalLM, 229 | "seq2seq": AutoModelForSeq2SeqLM 230 | } 231 | ## Decide whether to laod in 8-bit 232 | load_in_8bit = False 233 | load_in_4bit = True 234 | if load_in_8bit: 235 | assert not load_in_4bit 236 | model = model_types[model_type].from_pretrained( 237 | model_name, 238 | load_in_8bit=True, 239 | device_map=device_map 240 | ) 241 | model = fix_model(model, tokenizer, use_resize=False) 242 | model = prepare_model_for_int8_training(model) 243 | elif load_in_4bit: 244 | assert not load_in_8bit 245 | # use_bf16 = trainer_config.get("bf16", False) 246 | use_bf16 = getattr(trainer_config, "bf16", False) 247 | compute_dtype = torch.bfloat16 if use_bf16 else torch.float16 248 | model = model_types[model_type].from_pretrained( 249 | model_name, 250 | load_in_4bit=True, 251 | device_map=device_map, 252 | quantization_config=BitsAndBytesConfig( 253 | load_in_4bit=True, 254 | llm_int8_threshold=6.0, 255 | llm_int8_has_fp16_weight=False, 256 | bnb_4bit_compute_dtype=compute_dtype, 257 | bnb_4bit_use_double_quant=True, 258 | bnb_4bit_quant_type="nf4" 259 | ), 260 | torch_dtype=torch.bfloat16 if use_bf16 else torch.float32 261 | ) 262 | model = fix_model(model, tokenizer, use_resize=False) 263 | model = prepare_model_for_int8_training(model) 264 | else: 265 | model = model_types[model_type].from_pretrained(model_name) 266 | model = fix_model(model, tokenizer) 267 | 268 | # Default model generation params 269 | model.config.num_beams = 5 270 | if mode == "instruct": 271 | max_tokens_count = max_target_tokens_count + max_source_tokens_count + 1 272 | model.config.max_length = max_tokens_count if model_type == "causal" else max_target_tokens_count 273 | 274 | if not ddp and torch.cuda.device_count() > 1: 275 | model.is_parallelizable = True 276 | model.model_parallel = True 277 | 278 | if lora_config: 279 | #lora_config = LoraConfig(**lora_config) 280 | model = get_peft_model(model, lora_config) 281 | 282 | trainer_class = Trainer ##if not omit_base_model_save else TrainerNoBaseSave 283 | print("Trainer class:", trainer_class) 284 | trainer = 
trainer_class( 285 | model=model, 286 | args=training_args, 287 | train_dataset=train_dataset, 288 | eval_dataset=val_dataset, 289 | callbacks=callbacks, 290 | data_collator=data_collator, 291 | preprocess_logits_for_metrics = preprocess_logits_for_metrics, 292 | ) 293 | 294 | # with wandb.init(project="llama_ft_samsum", name="llama finetuning run") as run: ## changed the name don't forget 295 | checkpoint_dir = output_dir 296 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir): 297 | trainer.train(resume_from_checkpoint=True) 298 | else: 299 | trainer.train() 300 | model.save_pretrained(output_dir) 301 | 302 | trainer.model.push_to_hub(args.repo_name) --------------------------------------------------------------------------------
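The finetuning and evaluation scripts above share one workflow: load a quantized base model with llmtune's load_llm, attach a LoRA adapter (via quant_peft or load_adapter), train with a Hugging Face Trainer, then reload the saved adapter for generation. The snippet below is a minimal inference sketch of that last step, assembled only from the calls used in eval_samsum_4bit_llmtune.py; the model id, quantized-weight path, and adapter directory are placeholders, and the positional load_llm arguments are assumed to match their use in that script.

# Minimal inference sketch (assumptions: placeholder paths/ids; load_llm and load_adapter
# behave as used in finetune/samsum-llama/eval_samsum_4bit_llmtune.py).
import torch
from transformers import AutoTokenizer
from llmtune.executor import load_llm, load_adapter

MODEL_NAME = "llama-13b-4bit"          # placeholder llmtune model name
WEIGHTS = "./llama-13b-4bit.pt"        # placeholder quantized-weights path
ADAPTER = "./samsum-lora-adapter"      # placeholder directory with the trained LoRA adapter

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-13b", use_fast=False)
# (the repo's finetune scripts additionally apply fix_tokenizer/fix_model from their local utils.py)

# load the quantized base model (groupsize 64, as in the scripts above) and attach the adapter
llm, _ = load_llm(MODEL_NAME, WEIGHTS, 64)
model = load_adapter(llm, adapter_path=ADAPTER)

prompt = "### Summarize this: Amanda: I baked cookies. Do you want some?\n ### Output: "
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
with torch.inference_mode(), torch.autocast("cuda"):
    out = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens=45)
print(tokenizer.decode(out[0], skip_special_tokens=True).replace(prompt, "").strip())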