├── .Rhistory
├── llmtune
│   ├── __init__.py
│   ├── engine
│   │   ├── __init__.py
│   │   ├── lora
│   │   │   ├── __init__.py
│   │   │   ├── peft.py
│   │   │   ├── config.py
│   │   │   └── utils.py
│   │   ├── quant
│   │   │   ├── __init__.py
│   │   │   ├── gptq
│   │   │   │   ├── __init__.py
│   │   │   │   ├── quantizer.py
│   │   │   │   ├── extras.py
│   │   │   │   └── algorithm.py
│   │   │   ├── algorithm.py
│   │   │   ├── converter.py
│   │   │   └── config.py
│   │   └── inference
│   │       ├── __init__.py
│   │       ├── matmult.py
│   │       ├── cuda
│   │       │   └── quant_cuda.cpp
│   │       ├── modules.py
│   │       └── autograd.py
│   ├── llms
│   │   ├── __init__.py
│   │   ├── opt
│   │   │   ├── __init__.py
│   │   │   ├── config.py
│   │   │   └── model.py
│   │   ├── bloom
│   │   │   ├── __init__.py
│   │   │   └── model.py
│   │   ├── llama
│   │   │   ├── __init__.py
│   │   │   ├── config.py
│   │   │   └── model.py
│   │   ├── config.py
│   │   └── autollm.py
│   ├── utils.py
│   ├── data
│   │   ├── abstract.py
│   │   ├── __init__.py
│   │   ├── alpaca.py
│   │   ├── text.py
│   │   ├── gpt4all.py
│   │   └── calibration.py
│   ├── config.py
│   ├── executor.py
│   └── run.py
├── .DS_Store
├── finetune
│   ├── samsum-llama
│   │   ├── llama_lora_samsum.json
│   │   ├── utils.py
│   │   ├── eval_samsum_4bit_bnb.py
│   │   ├── data.py
│   │   ├── eval_samsum_4bit_llmtune.py
│   │   ├── train_samsum_4bit.py
│   │   └── train_samsum_4bit_bnb.py
│   ├── samsum-opt
│   │   ├── llama_lora_samsum.json
│   │   ├── utils.py
│   │   ├── data.py
│   │   ├── eval_samsum_opt_4bit_llmtune.py
│   │   └── train_samsum_opt_4bit_llmtune.py
│   ├── mnli-llama
│   │   ├── llama_lora_mnli_label.json
│   │   ├── utils.py
│   │   ├── data_mnli_label.py
│   │   ├── eval_mnli_llmtune.py
│   │   └── train_mnli_llmtune_label.py
│   └── bbh-eval
│       ├── main_dev.py
│       └── bbh_dev.py
├── examples
│   ├── push_to_hub.py
│   ├── quantize.py
│   ├── generate.py
│   ├── generate-after-lora.py
│   └── finetune.py
├── LICENSE
└── README.md
/.Rhistory:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmtune/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmtune/engine/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmtune/llms/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmtune/llms/opt/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmtune/engine/lora/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmtune/engine/quant/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmtune/llms/bloom/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmtune/llms/llama/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmtune/engine/inference/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmtune/engine/quant/gptq/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kuleshov-group/MODULoRA-Experiment/HEAD/.DS_Store
--------------------------------------------------------------------------------
/llmtune/engine/quant/algorithm.py:
--------------------------------------------------------------------------------
1 | from llmtune.engine.quant.config import QuantConfig
2 |
3 | class QuantizationAlgorithm():
4 | """Quantization algorthim abstract class"""
5 | def __init__(self, config: QuantConfig):
6 | self.config = config
7 |
8 | def quantize(self, model, dataloader):
9 | raise NotImplementedError
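10 | 
11 | # Concrete algorithms (e.g. the GPTQ-based implementation driven by examples/quantize.py)
12 | # are expected to subclass QuantizationAlgorithm and implement quantize(model, dataloader).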
--------------------------------------------------------------------------------
/finetune/samsum-llama/llama_lora_samsum.json:
--------------------------------------------------------------------------------
1 | {
2 | "description": "Template used by LLAMA-SAMSUM.",
3 | "prompts_input": [
4 | "### Summarize this: {instruction}\n ### Output: "
5 | ],
6 | "prompts_no_input": [
7 | "### Summarize this: {instruction}\n ### Output: "
8 | ],
9 | "output_separator": "### Output: "
10 | }
--------------------------------------------------------------------------------
/finetune/samsum-opt/llama_lora_samsum.json:
--------------------------------------------------------------------------------
1 | {
2 | "description": "Template used by LLAMA-SAMSUM.",
3 | "prompts_input": [
4 | "### Summarize this: {instruction}\n ### Output: "
5 | ],
6 | "prompts_no_input": [
7 | "### Summarize this: {instruction}\n ### Output: "
8 | ],
9 | "output_separator": "### Output: "
10 | }
--------------------------------------------------------------------------------
/examples/push_to_hub.py:
--------------------------------------------------------------------------------
1 | from llmtune.llms.autollm import AutoLLMForCausalLM
2 |
3 | # load model
4 | model_dir = './llama-7b-quantized' # can generate this via quantize.py
5 | llm = AutoLLMForCausalLM.from_pretrained(model_dir)
6 |
7 | # push to hub
8 | llm.push_to_hub(
9 | repo_id='',
10 | save_dir=model_dir,
11 | commit_message='first commit'
12 | )
13 |
--------------------------------------------------------------------------------
/finetune/mnli-llama/llama_lora_mnli_label.json:
--------------------------------------------------------------------------------
1 | {
2 | "description": "Template used by LLAMA-MNLI-m output label.",
3 | "prompts_input": [
4 | "### Premise: {instruction}\n ### Hypothesis: {hypothesis}\n ### Genre: {genre} ### Label:"
5 | ],
6 | "prompts_no_input": [
7 | "### Premise: {instruction}\n ### Hypothesis: {hypothesis}\n ### Genre: {genre} ### Label:"
8 | ],
9 | "output_separator": "### Label:"
10 | }
--------------------------------------------------------------------------------
/llmtune/engine/lora/peft.py:
--------------------------------------------------------------------------------
1 | """Wraps around PEFT to use QuantLoraModel instead of regular LoraModel."""
2 |
3 | import peft as quant_peft
4 | from llmtune.engine.lora.lora import QuantLoraModel
5 |
6 | # monkey patch peft to use QuantLoraModel
7 | quant_peft.tuners.lora.LoraModel = QuantLoraModel
8 | quant_peft.peft_model.LoraModel = QuantLoraModel
9 |
10 | # the above works for PEFT at the time of writing this code;
11 | # when upgrading to a newer PEFT, use this instead:
12 | # quant_peft.peft_model.PEFT_TYPE_TO_MODEL_MAPPING[
13 | # quant_peft.utils.PeftType.LORA
14 | # ] = QuantLoraModel
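15 | 
16 | # Downstream code imports this module and then uses the standard PEFT API
17 | # through `quant_peft`, e.g. (see examples/finetune.py):
18 | #   from llmtune.engine.lora.peft import quant_peft
19 | #   lora_config = quant_peft.LoraConfig(r=8, lora_alpha=16, target_modules=["q_proj", "v_proj"])
20 | #   model = quant_peft.get_peft_model(llm, lora_config)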
--------------------------------------------------------------------------------
/finetune/bbh-eval/main_dev.py:
--------------------------------------------------------------------------------
1 | from fire import Fire
2 |
3 | import bbh_dev
4 |
5 | def main(task_name: str, **kwargs):
6 | task_map = dict(
7 | bbh=bbh_dev.main,
8 | )
9 |
10 | if task_name == "all":
11 | results = {}
12 | for name, task_fn in task_map.items():
13 | score = task_fn(**kwargs)
14 | results[name] = score
15 | else:
16 | task_fn = task_map.get(task_name)
17 | if task_fn is None:
18 | raise ValueError(f"Unknown task: {task_name}. Choose from {list(task_map.keys())}")
19 | score = task_fn(**kwargs)
20 | results = {task_name: score}
21 |
22 | results = {name: round(score * 100, 2) for name, score in results.items()}
23 | print(results)
24 | return results
25 |
26 | if __name__ == "__main__":
27 | Fire(main)
28 |
--------------------------------------------------------------------------------
/llmtune/llms/opt/config.py:
--------------------------------------------------------------------------------
1 | # from llmtune.llms.config import AutoQuantConfig, LLMType
2 |
3 | OPT_MODELS = [
4 | "opt-6.7b-4bit", "opt-13b-4bit",
5 | "opt-6.7b-3bit", "opt-13b-3bit",
6 | ]
7 |
8 | def get_opt_config(model):
9 | if '4bit' in model:
10 | bits = 4
11 | elif '3bit' in model:
12 | bits = 3
13 | elif '2bit' in model:
14 | bits = 2
15 |
16 | if '6.7b' in model:
17 | hf_config_name = "facebook/opt-6.7b"
18 | elif '13b' in model:
19 | hf_config_name = "facebook/opt-13b"
20 |
21 | raise NotImplementedError() # NOTE: disabled; the AutoQuantConfig construction below is currently unreachable (see commented import above)
22 |
23 | llm_config = AutoQuantConfig(
24 | name=model,
25 | model_type=LLMType.OPT,
26 | hf_config_name=hf_config_name,
27 | hf_tokenizer_config="",
28 | bits=bits
29 | )
30 | return llm_config
31 |
--------------------------------------------------------------------------------
/llmtune/utils.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import urllib.request
3 |
4 | def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
5 | if type(module) in layers:
6 | return {name: module}
7 | res = {}
8 | for name1, child in module.named_children():
9 | res.update(find_layers(
10 | child, layers=layers, name=name + '.' + name1 if name != '' else name1
11 | ))
12 | return res
13 |
14 | def to_half_precision(model):
15 | for n, m in model.named_modules():
16 | if '4bit' in str(type(m)) or 'QuantLinear' in str(type(m)):
17 | # m.zeros = m.zeros.half()
18 | m.scales = m.scales.half()
19 | if m.bias is not None:
20 | m.bias = m.bias.half()
21 | return model
22 |
23 | def download_file(url, path):
24 | print('Starting download')
25 | urllib.request.urlretrieve(url, path)
26 | print('Done')
--------------------------------------------------------------------------------
/llmtune/data/abstract.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Dict, Any
3 |
4 |
5 | # Abstract train data loader
6 | class AbstractTrainData(ABC):
7 | """
8 | """
9 | @abstractmethod
10 | def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len: int) -> None:
11 | """
12 | Args:
13 | dataset (str): Path to dataset
14 | val_set_size (int) : Size of validation set
15 | tokenizer (_type_): Tokenizer
16 | """
17 | self.tokenizer = tokenizer
18 | self.dataset = dataset
19 | self.val_set_size = val_set_size
20 | self.cutoff_len = cutoff_len
21 | self.train_data = None
22 | self.val_data = None
23 |
24 | @abstractmethod
25 | def tokenize(self, prompt: str) -> Dict[str, Any]:
26 | pass
27 |
28 | @abstractmethod
29 | def prepare_data(self) -> None:
30 | """Loads dataset from file and prepares train_data for trainer."""
31 | pass
32 |
--------------------------------------------------------------------------------
/llmtune/data/__init__.py:
--------------------------------------------------------------------------------
1 | from llmtune.data.text import TrainTxt
2 | from llmtune.data.alpaca import TrainSAD
3 | from llmtune.data.gpt4all import TrainGPT4All
4 |
5 | def load_finetuning_data(tune_config, tokenizer):
6 | if tune_config.ds_type == "alpaca":
7 | data = TrainSAD(
8 | tune_config.dataset,
9 | tune_config.val_set_size,
10 | tokenizer,
11 | tune_config.cutoff_len
12 | )
13 | elif tune_config.ds_type == "gpt4all":
14 | raise NotImplementedError('GPT4All dataset currently not supported')
15 | data = TrainGPT4All(
16 | tune_config.dataset,
17 | tune_config.val_set_size,
18 | tokenizer,
19 | tune_config.cutoff_len
20 | )
21 | else:
22 | raise ValueError(f"Invalid data name: {tune_config.ds_type}")
23 | # data.prepare_data(
24 | # thd=tune_config.txt_row_thd, use_eos_token=tune_config.use_eos_token
25 | # )
26 | data.prepare_data()
27 | return data
--------------------------------------------------------------------------------
/llmtune/engine/quant/converter.py:
--------------------------------------------------------------------------------
1 | from llmtune.engine.inference.modules import QuantLinear
2 |
3 | def make_quant(
4 | module, names, bits, groupsize=-1, name='', is_cuda=True
5 | ):
6 | if isinstance(module, QuantLinear):
7 | return
8 | for attr in dir(module):
9 | tmp = getattr(module, attr)
10 | name1 = name + '.' + attr if name != '' else attr
11 | if name1 in names:
12 | setattr(
13 | module, attr, QuantLinear(
14 | bits=bits,
15 | groupsize=groupsize,
16 | in_features=tmp.in_features,
17 | out_features=tmp.out_features,
18 | bias=tmp.bias,
19 | is_cuda=is_cuda,
20 | )
21 | )
22 | for name1, child in module.named_children():
23 | make_quant(
24 | child,
25 | names,
26 | bits=bits,
27 | name=name + '.' + name1 if name != '' else name1,
28 | groupsize=groupsize,
29 | is_cuda=is_cuda
30 | )
31 |
--------------------------------------------------------------------------------
/examples/quantize.py:
--------------------------------------------------------------------------------
1 | from llmtune.llms.autollm import AutoLLMForCausalLM
2 | from llmtune.engine.quant.config import QuantConfig
3 | from llmtune.engine.quant.gptq.executor import GPTQAlgorithm
4 | from llmtune.data.calibration import get_calibration_loaders
5 |
6 | # load model
7 | model_name = 'decapoda-research/llama-7b-hf'
8 | llm = AutoLLMForCausalLM.from_pretrained(model_name)
9 | llm.eval()
10 |
11 | # set up quantization config
12 | config = QuantConfig(
13 | bits=4,
14 | dataset='c4',
15 | seed=0,
16 | nsamples=128,
17 | percdamp=.01,
18 | groupsize=64,
19 | act_order=True,
20 | nearest=False,
21 | save='./llama-7b-quantized'
22 | )
23 |
24 | # load gptq calibration data
25 | dataloader, _ = get_calibration_loaders(
26 | config.dataset,
27 | nsamples=config.nsamples,
28 | seed=config.seed,
29 | model=llm.base_model.name_or_path,
30 | seqlen=llm.base_model.seqlen
31 | )
32 |
33 | # create quantization algorithm
34 | gptq = GPTQAlgorithm(config)
35 | llm = gptq.quantize(llm, dataloader)
36 |
37 | llm.save_pretrained(config.save)
38 | print(f'Model weights saved to: {config.save}')
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 oscarscaro
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/llmtune/llms/llama/config.py:
--------------------------------------------------------------------------------
1 | # from llmtune.llms.config import AutoQuantConfig, LLMType
2 |
3 | LLAMA_MODELS = [
4 | "llama-7b-4bit", "llama-13b-4bit", "llama-30b-4bit", "llama-65b-4bit",
5 | "llama-7b-3bit", "llama-13b-3bit", "llama-30b-3bit", "llama-65b-3bit",
6 | "llama-7b-2bit", "llama-65b-2bit",
7 | ]
8 |
9 | def get_llama_config(model):
10 | if '4bit' in model:
11 | bits = 4
12 | elif '3bit' in model:
13 | bits = 3
14 | elif '2bit' in model:
15 | bits = 2
16 |
17 | if '7b' in model:
18 | hf_config_name = "decapoda-research/llama-7b-hf"
19 | elif '13b' in model:
20 | hf_config_name = "decapoda-research/llama-13b-hf"
21 | elif '30b' in model:
22 | hf_config_name = "decapoda-research/llama-30b-hf"
23 | elif '65b' in model:
24 | hf_config_name = "decapoda-research/llama-65b-hf"
25 |
26 | raise NotImplementedError() # NOTE: disabled; the AutoQuantConfig construction below is currently unreachable (see commented import above)
27 |
28 | llm_config = AutoQuantConfig(
29 | name=model,
30 | model_type=LLMType.LLAMA,
31 | hf_config_name=hf_config_name,
32 | hf_tokenizer_config="huggyllama/llama-13b",
33 | bits=bits
34 | )
35 | return llm_config
36 |
--------------------------------------------------------------------------------
/examples/generate.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer
3 | from llmtune.llms.autollm import AutoLLMForCausalLM
4 | from llmtune.utils import to_half_precision
5 |
6 | # model config
7 | model_name = ''
8 | # model_name = './llama-7b-quantized' # can generate local dir via quantize.py
9 | tokenizer_name = 'huggyllama/llama-13b'
10 | DEV = 'cuda'
11 |
12 | # load model
13 | llm = AutoLLMForCausalLM.from_pretrained(model_name).to(DEV)
14 | llm.eval()
15 | llm = to_half_precision(llm)
16 |
17 | # load tokenizer
18 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
19 |
20 | # encode prompt
21 | prompt = 'The pyramids were built by'
22 | input_ids = tokenizer.encode(prompt, return_tensors="pt").to(DEV)
23 |
24 | # generation config
25 | min_length=10
26 | max_length=200
27 | top_p=.95
28 | top_k=25
29 | temperature=1.0
30 |
31 | # generate text
32 | with torch.no_grad():
33 | generated_ids = llm.generate(
34 | inputs=input_ids,
35 | do_sample=True,
36 | min_length=min_length,
37 | max_length=max_length,
38 | top_p=top_p,
39 | top_k=top_k,
40 | temperature=temperature,
41 | )
42 |
43 | # decode and print
44 | output = tokenizer.decode([el.item() for el in generated_ids[0]])
45 | print(output)
46 |
--------------------------------------------------------------------------------
/llmtune/engine/quant/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from dataclasses import dataclass
4 | from transformers.utils.hub import PushToHubMixin, cached_file
5 |
6 | @dataclass
7 | class QuantConfig(PushToHubMixin):
8 | dataset: str
9 | bits: int
10 | nsamples: int
11 | groupsize: int
12 | act_order: bool
13 | percdamp: float
14 | seed: int
15 | nearest: bool
16 | save: str
17 |
18 | def save_pretrained(self, save_dir: str, **kwargs):
19 | config_path = os.path.join(save_dir, "quant_config.json")
20 | with open(config_path, "w", encoding="utf-8") as f:
21 | json.dump(self.to_dict(), f, indent=2)
22 |
23 | @classmethod
24 | def from_pretrained(cls, save_dir: str, **kwargs):
25 | config_filename = "quant_config.json"
26 | if os.path.isdir(save_dir):
27 | config_path = os.path.join(save_dir, config_filename)
28 | else:
29 | config_path = cached_file(save_dir, config_filename)
30 | with open(config_path, "r", encoding="utf-8") as f:
31 | return cls(**json.load(f))
32 |
33 | def to_dict(self):
34 | return {
35 | 'dataset': self.dataset,
36 | 'bits': self.bits,
37 | 'nsamples': self.nsamples,
38 | 'groupsize': self.groupsize,
39 | 'act_order': self.act_order,
40 | 'percdamp': self.percdamp,
41 | 'seed': self.seed,
42 | 'nearest': self.nearest,
43 | 'save': self.save,
44 | }
--------------------------------------------------------------------------------
/examples/generate-after-lora.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer, GenerationConfig
3 | from llmtune.llms.autollm import AutoLLMForCausalLM
4 | from llmtune.utils import to_half_precision
5 | from llmtune.engine.lora.peft import quant_peft
6 |
7 | # model config
8 | model_name = ''
9 | # model_name = './llama-7b-quantized' # can generate local dir via quantize.py
10 | tokenizer_name = 'huggyllama/llama-7b'
11 | DEV = 'cuda'
12 |
13 | # load model
14 | llm = AutoLLMForCausalLM.from_pretrained(model_name).to(DEV)
15 | llm.eval()
16 | llm = to_half_precision(llm)
17 |
18 | # load tokenizer
19 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
20 |
21 | # load lora from existing checkpoint
22 | adapter_path = './llama-7b-quantized-lora' # can generate this via finetune.py
23 | model = quant_peft.PeftModel.from_pretrained(
24 | llm, adapter_path,
25 | device_map='auto'
26 | )
27 | print(adapter_path, 'loaded')
28 |
29 | # encode prompt
30 | prompt = 'Write a detailed step-by-step recipe for a blueberry lasagna dish'
31 | input_ids = tokenizer.encode(prompt, return_tensors="pt").to(DEV)
32 |
33 | # generation config
34 | min_length=10
35 | max_length=200
36 | top_p=.95
37 | top_k=25
38 | temperature=1.0
39 |
40 | # generate text
41 | with torch.no_grad():
42 | generated_ids = model.generate(
43 | inputs=input_ids,
44 | generation_config=GenerationConfig(
45 | do_sample=True,
46 | min_length=min_length,
47 | max_length=max_length,
48 | top_p=top_p,
49 | top_k=top_k,
50 | temperature=temperature,
51 | )
52 | )
53 |
54 | # decode and print
55 | output = tokenizer.decode([el.item() for el in generated_ids[0]])
56 | print(output)
57 |
--------------------------------------------------------------------------------
/llmtune/engine/lora/config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | class FinetuneConfig:
4 | """Config holder for finetuning"""
5 | def __init__(
6 | self, dataset: str, ds_type: str,
7 | lora_out_dir: str,
8 | mbatch_size: int, batch_size: int,
9 | epochs: int, lr: float,
10 | cutoff_len: int,
11 | lora_r: int, lora_alpha: int, lora_dropout: float,
12 | val_set_size: float,
13 | warmup_steps: int, save_steps: int,
14 | save_total_limit: int, logging_steps: int,
15 | ):
16 | self.dataset = dataset
17 | self.ds_type = ds_type
18 | self.lora_out_dir = lora_out_dir
19 | self.mbatch_size = mbatch_size
20 | self.batch_size = batch_size
21 | self.gradient_accumulation_steps = self.batch_size // self.mbatch_size
22 | self.epochs = epochs
23 | self.lr = lr
24 | self.cutoff_len = cutoff_len
25 | self.lora_r = lora_r
26 | self.lora_alpha = lora_alpha
27 | # self.lora_dropout = 0 if gradient_checkpointing else lora_dropout
28 | self.lora_dropout = lora_dropout
29 | self.val_set_size = int(val_set_size) if val_set_size > 1.0 else float(val_set_size)
30 | self.warmup_steps = warmup_steps
31 | self.save_steps = save_steps
32 | self.save_total_limit = save_total_limit
33 | self.logging_steps = logging_steps
34 | self.world_size = int(os.environ.get("WORLD_SIZE", 1))
35 | self.local_rank = int(os.environ.get("LOCAL_RANK", 0))
36 | self.ddp = self.world_size != 1
37 | self.device_map = "auto" if not self.ddp else {"": self.local_rank}
38 | if self.ddp:
39 | self.gradient_accumulation_steps = self.gradient_accumulation_steps // self.world_size
--------------------------------------------------------------------------------
/llmtune/config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from llmtune.llms.config import AutoConfig
3 | from llmtune.llms.opt.config import OPT_MODELS
4 | from llmtune.llms.llama.config import LLAMA_MODELS
5 | from llmtune.engine.lora.config import FinetuneConfig
6 | from llmtune.engine.quant.config import QuantConfig
7 |
8 | # ----------------------------------------------------------------------------
9 |
10 | # define some constants
11 | DEV = torch.device('cuda')
12 | LLM_MODELS = LLAMA_MODELS + OPT_MODELS
13 |
14 | # ----------------------------------------------------------------------------
15 |
16 | # helpers for loading configs
17 | def get_finetune_config(args):
18 | return FinetuneConfig(
19 | dataset=args.dataset,
20 | ds_type=args.data_type,
21 | lora_out_dir=args.adapter,
22 | mbatch_size=args.mbatch_size,
23 | batch_size=args.batch_size,
24 | epochs=args.epochs,
25 | lr=args.lr,
26 | cutoff_len=args.cutoff_len,
27 | lora_r=args.lora_r,
28 | lora_alpha=args.lora_alpha,
29 | lora_dropout=args.lora_dropout,
30 | val_set_size=args.val_set_size,
31 | warmup_steps=args.warmup_steps,
32 | save_steps=args.save_steps,
33 | save_total_limit=args.save_total_limit,
34 | logging_steps=args.logging_steps,
35 | )
36 |
37 | def get_quant_config(args):
38 | return QuantConfig(
39 | dataset=args.dataset,
40 | bits=args.bits,
41 | nsamples=args.nsamples,
42 | groupsize=args.groupsize,
43 | act_order=args.act_order,
44 | percdamp=args.percdamp,
45 | seed=args.seed,
46 | nearest=args.nearest,
47 | save=args.save,
48 | )
49 |
50 | def get_llm_config(model_name_or_path):
51 | return AutoConfig(model_name_or_path)
--------------------------------------------------------------------------------
/llmtune/llms/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from enum import Enum
4 | from transformers import PretrainedConfig, AutoConfig
5 | from transformers.utils.hub import PushToHubMixin, cached_file
6 | from llmtune.engine.quant.config import QuantConfig
7 |
8 | class LLMType(Enum):
9 | LLAMA = 'llama'
10 | OPT = 'opt'
11 | BLOOM = 'bloom'
12 |
13 | class AutoLLMConfig(PretrainedConfig, PushToHubMixin):
14 | def __init__(
15 | self,
16 | base_config: PretrainedConfig,
17 | quant_config: QuantConfig = None
18 | ):
19 | self.base_config = base_config
20 | self.quant_config = None
21 | if quant_config is not None:
22 | self.quant_config = quant_config
23 |
24 | @property
25 | def is_quantized(self):
26 | return self.quant_config is not None
27 |
28 | def set_quant_config(self, quant_config):
29 | if self.quant_config is not None:
30 | raise RuntimeError('quant_config already set')
31 | self.quant_config = quant_config
32 |
33 | @property
34 | def model_type(self):
35 | return self.base_config.model_type
36 |
37 | def save_pretrained(self, save_dir: str, **kwargs):
38 | self.base_config.save_pretrained(save_dir, **kwargs)
39 | if self.is_quantized:
40 | self.quant_config.save_pretrained(save_dir, **kwargs)
41 |
42 | @classmethod
43 | def from_pretrained(cls, save_dir: str):
44 | # load config
45 | base_config = AutoConfig.from_pretrained(save_dir)
46 |
47 | # check if quantized model and config are available
48 | try:
49 | quant_config = (
50 | QuantConfig.from_pretrained(save_dir)
51 | )
52 | except Exception:
53 | quant_config = None
54 |
55 | # check if it's a valid model
56 | if base_config.model_type not in [e.value for e in LLMType]:
57 | raise NotImplementedError(
58 | f"Model type {base_config.model_type} currently not supported"
59 | )
60 |
61 | return cls(base_config, quant_config)
62 |
--------------------------------------------------------------------------------
/llmtune/llms/bloom/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | from llmtune.utils import find_layers
5 | from llmtune.engine.quant.converter import make_quant
6 |
7 | def load_bloom_unquantized(llm_config):
8 | import torch
9 | from transformers import BloomForCausalLM
10 | def skip(*args, **kwargs):
11 | pass
12 | torch.nn.init.kaiming_uniform_ = skip
13 | torch.nn.init.uniform_ = skip
14 | torch.nn.init.normal_ = skip
15 | model = BloomForCausalLM.from_pretrained(
16 | llm_config.base_config.name_or_path, torch_dtype='auto'
17 | )
18 | return model
19 |
20 | def load_bloom_quantized(llm_config, quantized_weights_path):
21 | import transformers, accelerate
22 | from transformers import BloomConfig, BloomForCausalLM
23 |
24 | with accelerate.init_empty_weights():
25 | config = BloomConfig.from_pretrained(
26 | llm_config.base_config.name_or_path
27 | )
28 | torch.set_default_dtype(torch.half)
29 | transformers.modeling_utils._init_weights = False
30 | torch.set_default_dtype(torch.half)
31 | model = BloomForCausalLM(config)
32 | torch.set_default_dtype(torch.float)
33 | model = model.eval()
34 | layers = find_layers(model)
35 | for name in ['lm_head']:
36 | if name in layers:
37 | del layers[name]
38 | make_quant(
39 | model, layers, llm_config.quant_config.bits,
40 | groupsize=llm_config.quant_config.groupsize
41 | )
42 | model = accelerate.load_checkpoint_and_dispatch(
43 | model=model,
44 | checkpoint=quantized_weights_path,
45 | device_map="auto",
46 | # device_map={'': 0},
47 | no_split_module_classes=["LlamaDecoderLayer"]
48 | )
49 | return model
50 |
51 | def load_bloom(llm_config, quantized_weights_path):
52 | if quantized_weights_path is None:
53 | model = load_bloom_unquantized(llm_config)
54 | else:
55 | model = load_bloom_quantized(
56 | llm_config, quantized_weights_path
57 | )
58 | model.seqlen = 2048
59 | return model
60 |
61 | def load_bloom_tokenizer(name_or_path):
62 | from transformers import BloomTokenizerFast
63 | 
64 | tokenizer = BloomTokenizerFast.from_pretrained(
65 | name_or_path
66 | )
67 | tokenizer.truncation_side = 'left'
68 | return tokenizer
69 |
--------------------------------------------------------------------------------
/llmtune/llms/llama/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | from llmtune.utils import find_layers
5 | from llmtune.engine.quant.converter import make_quant
6 |
7 | def load_llama_unquantized(llm_config):
8 | import torch
9 | from transformers import LlamaForCausalLM
10 | def skip(*args, **kwargs):
11 | pass
12 | torch.nn.init.kaiming_uniform_ = skip
13 | torch.nn.init.uniform_ = skip
14 | torch.nn.init.normal_ = skip
15 | model = LlamaForCausalLM.from_pretrained(
16 | llm_config.base_config.name_or_path, torch_dtype='auto'
17 | )
18 | return model
19 |
20 | def load_llama_quantized(llm_config, quantized_weights_path):
21 | import transformers, accelerate
22 | from transformers import LlamaConfig, LlamaForCausalLM
23 |
24 | with accelerate.init_empty_weights():
25 | config = LlamaConfig.from_pretrained(
26 | llm_config.base_config.name_or_path
27 | )
28 | torch.set_default_dtype(torch.half)
29 | transformers.modeling_utils._init_weights = False
30 | torch.set_default_dtype(torch.half)
31 | model = LlamaForCausalLM(config)
32 | torch.set_default_dtype(torch.float)
33 | model = model.eval()
34 | layers = find_layers(model)
35 | for name in ['lm_head']:
36 | if name in layers:
37 | del layers[name]
38 | make_quant(
39 | model, layers, llm_config.quant_config.bits,
40 | groupsize=llm_config.quant_config.groupsize
41 | )
42 | model = accelerate.load_checkpoint_and_dispatch(
43 | model=model,
44 | checkpoint=quantized_weights_path,
45 | device_map="auto",
46 | # device_map={'': 0},
47 | no_split_module_classes=["LlamaDecoderLayer"]
48 | )
49 | return model
50 |
51 | def load_llama(llm_config, quantized_weights_path):
52 | if quantized_weights_path is None:
53 | model = load_llama_unquantized(llm_config)
54 | else:
55 | model = load_llama_quantized(
56 | llm_config, quantized_weights_path
57 | )
58 | model.seqlen = 2048
59 | return model
60 |
61 | def load_llama_tokenizer(name_or_path):
62 | from transformers import LlamaTokenizer
63 |
64 | tokenizer = LlamaTokenizer.from_pretrained(
65 | name_or_path
66 | )
67 | tokenizer.truncation_side = 'left'
68 | return tokenizer
69 |
--------------------------------------------------------------------------------
/llmtune/engine/lora/utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2023-present the HuggingFace Inc. team.
3 | # Edited by Volodymyr Kuleshov
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import torch
18 |
19 |
20 | def prepare_model_for_int4_training(
21 | model, output_embedding_layer_name="lm_head", use_gradient_checkpointing=False, layer_norm_names=["layer_norm"]
22 | ):
23 | r"""
24 | This method wraps the entire protocol for preparing a model before training. This includes:
25 | 1- casting the layer norm in fp32, 2- making the output embedding layer require grads, 3- upcasting the lm
26 | head to fp32.
27 | Args:
28 | model, (`transformers.PreTrainedModel`):
29 | The loaded model from `transformers`
30 | """
31 | # loaded_in_8bit = getattr(model, "is_loaded_in_8bit", False)
32 | loaded_in_4bit = True
33 |
34 | for name, param in model.named_parameters():
35 | # freeze base model's layers
36 | param.requires_grad = False
37 |
38 | if loaded_in_4bit:
39 | # cast layer norm in fp32 for stability for 4bit models
40 | if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names):
41 | param.data = param.data.to(torch.float32)
42 |
43 | if loaded_in_4bit and use_gradient_checkpointing:
44 | raise NotImplementedError()
45 |
46 | if hasattr(model, output_embedding_layer_name):
47 | output_embedding_layer = getattr(model, output_embedding_layer_name)
48 | input_dtype = output_embedding_layer.weight.dtype
49 |
50 | class CastOutputToFloat(torch.nn.Sequential):
51 | r"""
52 | Manually cast to the expected dtype of the lm_head as sometimes there is a final layer norm that is cast
53 | in fp32
54 | """
55 |
56 | def forward(self, x):
57 | return super().forward(x.to(input_dtype)).to(torch.float32)
58 |
59 | setattr(model, output_embedding_layer_name, CastOutputToFloat(output_embedding_layer))
60 |
61 | return model
62 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ModuLoRA
2 | Experiment code repository for the paper "ModuLoRA: Finetuning 3-Bit LLMs on Consumer GPUs by Integrating with Modular Quantizers", [ArXiv](https://arxiv.org/abs/2309.16119).
3 | 
4 | **This repo builds on [LLMtools](https://github.com/kuleshov-group/llmtools), with added support for custom dataset preparation and evaluation to reproduce our experiments.**
5 |
6 | **Abstract:** We propose a memory-efficient finetuning algorithm for large language models (LLMs) that supports
7 | finetuning LLMs with 65B parameters in 3-bit or 4-bit precision on as little as one 48GB GPU. Our
8 | method, modular low-rank adaptation (MODULORA), integrates any user-specified weight quantizer
9 | with finetuning via low-rank adapters (LoRAs). Our approach relies on a simple quantization-agnostic
10 | backward pass that adaptively materializes low-precision LLM weights from a custom black-box
11 | quantization module. This approach enables finetuning 3-bit LLMs for the first time—leveraging
12 | state-of-the-art 3-bit OPTQ quantization often outperforms finetuning that relies on less sophisticated
13 | 4-bit and 8-bit methods. In our experiments, MODULORA attains competitive performance on text
14 | classification, natural language inference, and instruction following tasks using significantly less
15 | memory than existing approaches, and we also surpass the state-of-the-art ROUGE score on a popular
16 | summarization task. We release MODULORA together with a series of low-precision models—
17 | including the first family of 3-bit instruction following Alpaca LLMs—as part of LLMTOOLS, a
18 | user-friendly library for quantizing, running, and finetuning LLMs on consumer GPUs.
19 |
20 |
21 | # Repository Overview
22 |
23 | There are several directories in this repo:
24 | * [llmtune/](llmtune) contains the source code for the package `llmtune`, which needs to be installed to run the examples we provide;
25 | * [examples/](examples/) contains example implementations of 4-bit and 3-bit quantization with OPTQ, finetuning on the Alpaca dataset, and model generation after applying finetuned LoRA adapter weights;
26 | * [finetune/samsum-llama/](finetune/samsum-llama) contains an implementation of finetuning LLaMA models with LoRA on the SAMSum benchmark using our package and bitsandbytes, which can be used to reproduce the results in our paper;
27 | * [finetune/mnli-llama/](finetune/mnli-llama) contains an implementation of finetuning LLaMA models with LoRA on the MNLI benchmark using our package and bitsandbytes, which produces results competitive with SOTA;
28 | * Other finetuning scripts can be found in the same directory: [OPT](finetune/samsum-opt), [BLOOM](finetune/mnli-bloom);
29 | * See how we train `MODULoRA` 3-bit / 4-bit models in [SAMSum-LLAMA](finetune/samsum-llama/train_samsum_4bit.py), [MNLI-LLAMA](finetune/mnli-llama/train_mnli_llmtune_label.py), and [BBH-LLAMA](finetune/mnli-llama/modeling_roberta.py)
30 | * See how we evaluate `MODULoRA` results in [SAMSum-LLAMA](finetune/samsum-llama/eval_samsum_4bit_llmtune.py), [MNLI-LLAMA](finetune/mnli-llama/eval_mnli_llmtune.py), and [BBH-LLAMA](finetune/bbh-eval/main_dev.py)
31 |
32 |
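33 | 
34 | # Quickstart (sketch)
35 | 
36 | The snippet below is a condensed sketch of the intended workflow, stitched together from the scripts in [examples/](examples/) (`quantize.py`, `finetune.py`, `generate-after-lora.py`); the base model name, output path, and hyperparameters are simply the placeholder values used in those scripts.
37 | 
38 | ```python
39 | from llmtune.llms.autollm import AutoLLMForCausalLM
40 | from llmtune.engine.quant.config import QuantConfig
41 | from llmtune.engine.quant.gptq.executor import GPTQAlgorithm
42 | from llmtune.data.calibration import get_calibration_loaders
43 | 
44 | # 1) quantize a base model with OPTQ (full script: examples/quantize.py)
45 | llm = AutoLLMForCausalLM.from_pretrained('decapoda-research/llama-7b-hf')
46 | llm.eval()
47 | config = QuantConfig(bits=4, dataset='c4', seed=0, nsamples=128, percdamp=.01,
48 |                      groupsize=64, act_order=True, nearest=False,
49 |                      save='./llama-7b-quantized')
50 | dataloader, _ = get_calibration_loaders(
51 |     config.dataset, nsamples=config.nsamples, seed=config.seed,
52 |     model=llm.base_model.name_or_path, seqlen=llm.base_model.seqlen)
53 | llm = GPTQAlgorithm(config).quantize(llm, dataloader)
54 | llm.save_pretrained(config.save)
55 | 
56 | # 2) finetune a LoRA adapter on top of the quantized weights
57 | #    (full script: examples/finetune.py)
58 | # 3) generate with the quantized model + adapter
59 | #    (full script: examples/generate-after-lora.py)
60 | ```
61 | 
62 | # How the quantization-agnostic LoRA pass works (sketch)
63 | 
64 | The core idea in the abstract, a backward pass that adaptively materializes low-precision weights from a black-box quantization module, can be pictured with the minimal, illustrative `torch.autograd.Function` below. This is only a sketch under simplified assumptions (a single `dequantize` callable, no grouped scales or zeros); the library's actual kernels and autograd glue live in `llmtune/engine/inference/` (`matmult.py`, `autograd.py`, `cuda/`) and differ in detail.
65 | 
66 | ```python
67 | import torch
68 | 
69 | class QuantMatMul(torch.autograd.Function):
70 |     """Matrix multiply against a frozen, black-box quantized weight (illustrative only)."""
71 | 
72 |     @staticmethod
73 |     def forward(ctx, x, qweight, dequantize):
74 |         W = dequantize(qweight)          # materialize low-precision weights on the fly
75 |         ctx.save_for_backward(qweight)
76 |         ctx.dequantize = dequantize
77 |         return x @ W.t()
78 | 
79 |     @staticmethod
80 |     def backward(ctx, grad_output):
81 |         (qweight,) = ctx.saved_tensors
82 |         W = ctx.dequantize(qweight)      # re-materialize instead of caching the dense weight
83 |         # gradients flow only to the activations (and hence to any LoRA adapters upstream);
84 |         # the quantized weight itself stays frozen
85 |         return grad_output @ W, None, None
86 | ```
87 | 
88 | A LoRA layer then adds its low-rank update on top of this frozen product, so only the small adapter matrices receive gradient updates while the quantized base weights are never modified.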
--------------------------------------------------------------------------------
/llmtune/data/alpaca.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Any
2 | from datasets import load_dataset
3 | from llmtune.data.abstract import AbstractTrainData
4 |
5 | DEFAULT_HF_PATH = "kuleshov/alpaca-data"
6 |
7 | class TrainSAD(AbstractTrainData):
8 | def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len) -> None:
9 | super().__init__(dataset, val_set_size, tokenizer, cutoff_len)
10 |
11 | def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]:
12 | # there's probably a way to do this with the tokenizer settings
13 | # but again, gotta move fast
14 | if use_eos_token:
15 | result = self.tokenizer(
16 | prompt + self.tokenizer.eos_token,
17 | truncation=True,
18 | max_length=self.cutoff_len,
19 | padding=False,
20 | )
21 | if (
22 | result["input_ids"][-1] != self.tokenizer.eos_token_id
23 | and len(result["input_ids"]) < self.cutoff_len
24 | ):
25 | result["input_ids"].append(self.tokenizer.eos_token_id)
26 | result["attention_mask"].append(1)
27 | return result
28 | else:
29 | result = self.tokenizer(
30 | prompt,
31 | truncation=True,
32 | max_length=self.cutoff_len + 1,
33 | padding="max_length",
34 | )
35 | return {
36 | "input_ids": result["input_ids"][:-1],
37 | "attention_mask": result["attention_mask"][:-1],
38 | }
39 |
40 | def prepare_data(self, use_eos_token=True, **kwargs) -> None:
41 | if self.dataset:
42 | data = load_dataset("json", data_files=self.dataset)
43 | else:
44 | data = load_dataset(DEFAULT_HF_PATH)
45 |
46 | if self.val_set_size > 0:
47 | train_val = data["train"].train_test_split(
48 | test_size=self.val_set_size, shuffle=True, seed=42
49 | )
50 | self.train_data = train_val["train"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token))
51 | self.val_data = train_val["test"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token))
52 | else:
53 | self.train_data = data["train"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token))
54 | self.val_data = None
55 |
56 | # Auxiliary methods
57 | def generate_prompt(self, data_point, **kwargs):
58 | return make_prompt(
59 | data_point["instruction"],
60 | data_point["input"],
61 | data_point["output"]
62 | )
63 |
64 |
65 | def generate_and_tokenize_prompt(self, data_point, **kwargs):
66 | prompt = self.generate_prompt(data_point, **kwargs)
67 | return self.tokenize(prompt, **kwargs)
68 |
69 | def make_prompt(instruction, input_, output=""):
70 | return "{0}\n\n{1}\n{2}\n\n{3}\n{4}\n\n{5}\n{6}".format(
71 | "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.",
72 | "### Instruction:",
73 | instruction,
74 | "### Input:",
75 | input_,
76 | "### Response:",
77 | output
78 | )
79 |
80 | def make_output(raw_output):
81 | return raw_output.split("### Response:")[1].strip()
--------------------------------------------------------------------------------
/llmtune/data/text.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Dict, Any
3 | from datasets import Dataset
4 | from torch.utils.data import DataLoader
5 | from llmtune.data.abstract import AbstractTrainData
6 |
7 | # LLaMA txt train data loader
8 | class TrainTxt(AbstractTrainData):
9 | def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len):
10 | super().__init__(dataset, val_set_size, tokenizer, cutoff_len) # TODO: Validation size isn't used
11 | self.cutoff_len = cutoff_len
12 | self.exceed_count = 0
13 |
14 | def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]:
15 | # there's probably a way to do this with the tokenizer settings
16 | # but again, gotta move fast
17 | if use_eos_token:
18 | result = self.tokenizer(
19 | prompt + self.tokenizer.eos_token,
20 | truncation=True,
21 | max_length=self.cutoff_len,
22 | padding=False,
23 | )
24 | d = {
25 | "input_ids": result["input_ids"],
26 | "attention_mask": result["attention_mask"],
27 | }
28 | if (
29 | d["input_ids"][-1] != self.tokenizer.eos_token_id
30 | and len(d["input_ids"]) < self.cutoff_len
31 | ):
32 | d["input_ids"].append(self.tokenizer.eos_token_id)
33 | d["attention_mask"].append(1)
34 | else:
35 | result = self.tokenizer(
36 | prompt,
37 | truncation=True,
38 | max_length=self.cutoff_len + 1,
39 | padding="max_length",
40 | )
41 | d = {
42 | "input_ids": result["input_ids"][:-1],
43 | "attention_mask": result["attention_mask"][:-1],
44 | }
45 | if sum(d['attention_mask']) >= self.cutoff_len:
46 | self.exceed_count += 1
47 | return d
48 |
49 | @classmethod
50 | def format_new_rows(cls, rows, thd=128):
51 | r_b = ''
52 | new_rows = []
53 | for row in rows:
54 | if len(r_b) == 0:
55 | r_b += row
56 | else:
57 | r_b += '\n' + row
58 | if len(r_b) > thd:
59 | new_rows.append(r_b)
60 | r_b = ''
61 | if len(r_b) > thd:
62 | new_rows.append(r_b)
63 | r_b = ''
64 | return new_rows
65 |
66 | def prepare_data(self, thd=-1, use_eos_token=True, **kwargs):
67 | if os.path.isdir(self.dataset):
68 | rows = []
69 | for filename in os.listdir(self.dataset):
70 | with open(os.path.join(self.dataset, filename), 'r', encoding='utf8') as file:
71 | txt = file.read()
72 | txt = txt.replace('\r\n', '\n').replace('\u3000', ' ')
73 | rows += [r for r in txt.split('\n') if r != '']
74 | else:
75 | with open(self.dataset, 'r', encoding='utf8') as file:
76 | txt = file.read()
77 | txt = txt.replace('\r\n', '\n')
78 | rows = [r for r in txt.split('\n') if r != '']
79 | if thd != -1:
80 | rows = self.format_new_rows(rows, thd=thd)
81 | data = Dataset.from_dict({"input": rows})
82 | data = data.shuffle().map(lambda x: self.tokenize(x["input"], use_eos_token=use_eos_token))
83 | print('Train Data: {:.2f}%'.format(self.exceed_count / len(data) * 100), 'outliers')
84 | self.train_data = data
85 |
--------------------------------------------------------------------------------
/llmtune/llms/opt/model.py:
--------------------------------------------------------------------------------
1 |
2 | import torch
3 |
4 | from llmtune.utils import find_layers
5 | from llmtune.engine.quant.converter import make_quant
6 |
7 | def load_opt_unquantized(llm_config):
8 | from transformers import OPTForCausalLM
9 | def skip(*args, **kwargs):
10 | pass
11 | torch.nn.init.kaiming_uniform_ = skip
12 | torch.nn.init.uniform_ = skip
13 | torch.nn.init.normal_ = skip
14 | model = OPTForCausalLM.from_pretrained(
15 | llm_config.base_config.name_or_path, torch_dtype='auto'
16 | )
17 | return model
18 |
19 | def load_opt_quantized(llm_config, quantized_weights_path):
20 | import transformers, accelerate
21 | from transformers import OPTConfig, OPTForCausalLM
22 |
23 | with accelerate.init_empty_weights():
24 | config = OPTConfig.from_pretrained(
25 | llm_config.base_config.name_or_path
26 | )
27 | torch.set_default_dtype(torch.half)
28 | transformers.modeling_utils._init_weights = False
29 | torch.set_default_dtype(torch.half)
30 | model = OPTForCausalLM(config)
31 | torch.set_default_dtype(torch.float)
32 | model = model.eval()
33 | layers = find_layers(model)
34 | for name in [
35 | 'model.decoder.project_out',
36 | 'model.decoder.project_in', 'lm_head'
37 | ]:
38 | if name in layers:
39 | del layers[name]
40 | make_quant(
41 | model, layers, llm_config.quant_config.bits,
42 | groupsize=llm_config.quant_config.groupsize
43 | )
44 | model = accelerate.load_checkpoint_and_dispatch(
45 | model=model,
46 | checkpoint=quantized_weights_path,
47 | device_map="auto",
48 | # device_map={'': 0},
49 | no_split_module_classes=["OPTDecoderLayer"]
50 | )
51 | return model
52 |
53 | def load_opt_quantized_old(llm_config, checkpoint):
54 | import transformers
55 | from transformers import OPTConfig, OPTForCausalLM
56 | def noop(*args, **kwargs):
57 | pass
58 |
59 | config = OPTConfig.from_pretrained(
60 | llm_config.base_config.name_or_path
61 | )
62 | torch.nn.init.kaiming_uniform_ = noop
63 | torch.nn.init.uniform_ = noop
64 | torch.nn.init.normal_ = noop
65 |
66 | torch.set_default_dtype(torch.half)
67 | transformers.modeling_utils._init_weights = False
68 | torch.set_default_dtype(torch.half)
69 | model = OPTForCausalLM(config)
70 | torch.set_default_dtype(torch.float)
71 | model = model.eval()
72 | layers = find_layers(model)
73 | for name in [
74 | 'model.decoder.project_out',
75 | 'model.decoder.project_in', 'lm_head'
76 | ]:
77 | if name in layers:
78 | del layers[name]
79 | make_quant(model, layers, llm_config.quant_config.bits)
80 |
81 | print('Loading OPT model')
82 | model.load_state_dict(torch.load(checkpoint))
83 | model.seqlen = 2048
84 | print('Done')
85 |
86 | return model
87 |
88 | def load_opt(llm_config, quantized_weights_path):
89 | if quantized_weights_path is None:
90 | model = load_opt_unquantized(llm_config)
91 | else:
92 | model = load_opt_quantized(
93 | llm_config, quantized_weights_path
94 | )
95 | model.seqlen = 2048
96 | return model
97 |
98 | def load_opt_tokenizer(name_or_path):
99 | from transformers import AutoTokenizer
100 | tokenizer = AutoTokenizer.from_pretrained(
101 | name_or_path
102 | )
103 | tokenizer.truncation_side = 'left'
104 | return tokenizer
--------------------------------------------------------------------------------
/finetune/mnli-llama/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 |
4 | import torch
5 | import numpy as np
6 |
7 |
8 | def set_random_seed(seed):
9 | random.seed(seed)
10 | np.random.seed(seed)
11 | os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2"
12 | os.environ["PL_GLOBAL_SEED"] = str(seed)
13 | os.environ["PYTHONHASHSEED"] = str(seed)
14 | torch.manual_seed(seed)
15 | torch.cuda.manual_seed_all(seed)
16 | torch.backends.cudnn.benchmark = False
17 | torch.backends.cudnn.deterministic = True
18 |
19 |
20 | def fix_tokenizer(tokenizer):
21 | # Fixing broken tokenizers
22 | special_tokens = dict()
23 | for token_id in range(1000):
24 | token = tokenizer.convert_ids_to_tokens(token_id)
25 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad" in token:
26 | special_tokens["pad_token"] = token
27 | if tokenizer.bos_token_id in (None, tokenizer.vocab_size) and "<s>" in token:
28 | special_tokens["bos_token"] = token
29 | if tokenizer.eos_token_id in (None, tokenizer.vocab_size) and "</s>" in token:
30 | special_tokens["eos_token"] = token
31 | if tokenizer.unk_token_id in (None, tokenizer.vocab_size) and "unk" in token:
32 | special_tokens["unk_token"] = token
33 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep" in token:
34 | special_tokens["sep_token"] = token
35 |
36 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "bos_token" in special_tokens:
37 | special_tokens["sep_token"] = special_tokens["bos_token"]
38 |
39 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad_token" not in special_tokens:
40 | if tokenizer.unk_token_id is not None:
41 | special_tokens["pad_token"] = tokenizer.unk_token
42 | else:
43 | special_tokens["pad_token"] = "<|pad|>"
44 |
45 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep_token" not in special_tokens:
46 | if tokenizer.bos_token_id is not None:
47 | special_tokens["sep_token"] = tokenizer.bos_token
48 | else:
49 | special_tokens["sep_token"] = "<|sep|>"
50 | print(special_tokens)
51 | tokenizer.add_special_tokens(special_tokens)
52 |
53 | print("Vocab size: ", tokenizer.vocab_size)
54 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token)
55 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token)
56 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token)
57 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token)
58 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token)
59 | return tokenizer
60 |
61 |
62 | def fix_model(model, tokenizer, use_resize=True):
63 | model.config.pad_token_id = tokenizer.pad_token_id
64 | assert model.config.pad_token_id is not None
65 |
66 | bos_candidates = (
67 | tokenizer.bos_token_id,
68 | tokenizer.cls_token_id,
69 | tokenizer.sep_token_id,
70 | tokenizer.unk_token_id
71 | )
72 | for bos_candidate in bos_candidates:
73 | model.config.bos_token_id = bos_candidate
74 | if bos_candidate is not None:
75 | break
76 | assert model.config.bos_token_id is not None
77 | model.config.decoder_start_token_id = model.config.bos_token_id
78 |
79 | eos_candidates = (tokenizer.eos_token_id, tokenizer.sep_token_id)
80 | for eos_candidate in eos_candidates:
81 | model.config.eos_token_id = eos_candidate
82 | if eos_candidate is not None:
83 | break
84 | assert model.config.eos_token_id is not None
85 |
86 | if use_resize:
87 | model.resize_token_embeddings(len(tokenizer))
88 |
89 | return model
90 |
91 |
92 | def gen_batch(records, batch_size):
93 | batch_start = 0
94 | while batch_start < len(records):
95 | batch_end = batch_start + batch_size
96 | batch = records[batch_start: batch_end]
97 | batch_start = batch_end
98 | yield batch
--------------------------------------------------------------------------------
/examples/finetune.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import transformers
4 | from transformers import AutoTokenizer
5 | from llmtune.llms.autollm import AutoLLMForCausalLM
6 | from llmtune.engine.lora.config import FinetuneConfig
7 | from llmtune.data import TrainSAD
8 | from llmtune.engine.lora.peft import quant_peft
9 | from llmtune.utils import to_half_precision
10 |
11 | # model config
12 | model_name = ''
13 | # model_name = './llama-7b-quantized' # can generate local dir via quantize.py
14 | tokenizer_name = 'huggyllama/llama-13b'
15 | DEV = 'cuda'
16 |
17 | # load model
18 | transformers.logging.set_verbosity_info()
19 | llm = AutoLLMForCausalLM.from_pretrained(model_name)
20 | llm.eval()
21 | llm = llm.to(DEV)
22 | llm = to_half_precision(llm)
23 |
24 | # load tokenizer
25 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
26 | tokenizer.pad_token_id = 0
27 |
28 | # finetune training config
29 | mbatch_size=1
30 | batch_size=2
31 | epochs=3
32 | lr=2e-4
33 | cutoff_len=256
34 | lora_r=8
35 | lora_alpha=16
36 | lora_dropout=0.05
37 | val_set_size=0.2
38 | warmup_steps=50
39 | save_steps=50
40 | save_total_limit=3
41 | logging_steps=10
42 |
43 | data_type = 'alpaca'
44 | dataset = None # will load alpaca from HF
45 | adapter_path = './llama-7b-quantized-lora'
46 |
47 | # set up finetuning config
48 | tune_config = FinetuneConfig(
49 | dataset=dataset,
50 | ds_type=data_type,
51 | lora_out_dir=adapter_path,
52 | mbatch_size=mbatch_size,
53 | batch_size=batch_size,
54 | epochs=epochs,
55 | lr=lr,
56 | cutoff_len=cutoff_len,
57 | lora_r=lora_r,
58 | lora_alpha=lora_alpha,
59 | lora_dropout=lora_dropout,
60 | val_set_size=val_set_size,
61 | warmup_steps=warmup_steps,
62 | save_steps=save_steps,
63 | save_total_limit=save_total_limit,
64 | logging_steps=logging_steps,
65 | )
66 |
67 | # set up lora config
68 | lora_config = quant_peft.LoraConfig(
69 | r=tune_config.lora_r,
70 | lora_alpha=tune_config.lora_alpha,
71 | target_modules=["q_proj", "v_proj"],
72 | lora_dropout=tune_config.lora_dropout,
73 | bias="none",
74 | task_type="CAUSAL_LM",
75 | )
76 |
77 | # create a new lora from config
78 | model = quant_peft.get_peft_model(llm, lora_config)
79 |
80 | # load stanford alpaca data
81 | data = TrainSAD(
82 | tune_config.dataset,
83 | tune_config.val_set_size,
84 | tokenizer,
85 | tune_config.cutoff_len
86 | )
87 | data.prepare_data() # this tokenizes the dataset
88 |
89 | # training args
90 | training_arguments = transformers.TrainingArguments(
91 | per_device_train_batch_size=tune_config.mbatch_size,
92 | gradient_accumulation_steps=tune_config.gradient_accumulation_steps,
93 | warmup_steps=tune_config.warmup_steps,
94 | num_train_epochs=tune_config.epochs,
95 | learning_rate=tune_config.lr,
96 | fp16=True,
97 | logging_steps=tune_config.logging_steps,
98 | evaluation_strategy="no",
99 | save_strategy="steps",
100 | eval_steps=None,
101 | save_steps=tune_config.save_steps,
102 | output_dir=tune_config.lora_out_dir,
103 | save_total_limit=tune_config.save_total_limit,
104 | load_best_model_at_end=False,
105 | ddp_find_unused_parameters=False if tune_config.ddp else None,
106 | )
107 |
108 | # start trainer
109 | trainer = transformers.Trainer(
110 | model=model,
111 | train_dataset=data.train_data,
112 | eval_dataset=data.val_data,
113 | args=training_arguments,
114 | data_collator=transformers.DataCollatorForLanguageModeling(
115 | tokenizer, mlm=False
116 | ),
117 | )
118 | print(training_arguments.parallel_mode)
119 | model.config.use_cache = False
120 |
121 | # use half precision
122 | model = to_half_precision(model)
123 |
124 | # start training
125 | checkpoint_dir = tune_config.lora_out_dir
126 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir):
127 | trainer.train(resume_from_checkpoint=True)
128 | else:
129 | trainer.train()
130 |
131 | # Save Model
132 | model.save_pretrained(tune_config.lora_out_dir)
133 |
134 |
--------------------------------------------------------------------------------
/llmtune/data/gpt4all.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from typing import Dict, Any
3 | from datasets import load_dataset
4 | from llmtune.data.abstract import AbstractTrainData
5 |
6 | # GPT4All-like Data
7 | class TrainGPT4All(AbstractTrainData):
8 | def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len) -> None:
9 | super().__init__(dataset, val_set_size, tokenizer, cutoff_len)
10 |
11 | def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]:
12 | pass
13 |
14 | def tokenize_inputs(self, examples):
15 | max_length = self.cutoff_len
16 | input_ids = torch.full((len(examples["prompt"]), max_length), self.tokenizer.pad_token_id)
17 | # ignore bos
18 | newline_tokens = self.tokenizer("\n", return_tensors="pt")["input_ids"][0, 1:]
19 |
20 | out = {"labels": [], "attention_mask": []}
21 | for i, (prompt, response) in enumerate(zip(examples["prompt"], examples["response"])):
22 | input_tokens = self.tokenizer(prompt, truncation=True, max_length=max_length // 2, return_tensors="pt")["input_ids"].squeeze()
23 | if input_tokens.dim() == 0:
24 | input_tokens = input_tokens.unsqueeze(0)
25 |
26 | input_len = len(input_tokens)
27 |
28 | # plus one since we remove bos from response
29 | # but we subtract one since we want to add eos token
30 | remaining_tokens = max_length - input_len - len(newline_tokens) + 1
31 | # remove bos
32 | target_tokens = self.tokenizer(response, truncation=True, max_length=remaining_tokens, return_tensors="pt")["input_ids"].squeeze()[1:]
33 |
34 | input_ids[i, :input_len] = input_tokens
35 | # add newline between prompt and response
36 | newline_plus_inputs = input_len + len(newline_tokens)
37 | input_ids[i, input_len: newline_plus_inputs] = newline_tokens
38 |
39 | # add target tokens, remove bos
40 | input_ids[i, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens
41 | # add eos token, enforce stopping if we don't truncate
42 | # we don't want long code to stop generating if truncated during training
43 | if newline_plus_inputs + len(target_tokens) < max_length:
44 | input_ids[i, newline_plus_inputs + len(target_tokens)] = self.tokenizer.eos_token_id
45 |
46 | labels = input_ids[i].clone()
47 | labels[: newline_plus_inputs] = -100
48 | labels[labels == self.tokenizer.pad_token_id] = -100
49 | # to debug this, can set all values == -100 to the pad token, then assert that tokenizer.decode(labels, skip_special_tokens=True).strip() == response
50 |
51 | attention_mask = input_ids[i].ne(self.tokenizer.pad_token_id).int()
52 |
53 | out["labels"].append(labels)
54 | out["attention_mask"].append(attention_mask)
55 |
56 | out["input_ids"] = input_ids
57 |
58 | out = {k: torch.stack(v) if isinstance(v, list) else v for k, v in out.items()}
59 |
60 | return out
61 |
62 | def prepare_data(self, **kwargs) -> None:
63 | dataset = load_dataset("json", data_files=self.dataset)
64 |
65 | self.val_data = None
66 | if self.val_set_size > 0:
67 | dataset = dataset["train"].train_test_split(
68 |                 test_size=self.val_set_size, shuffle=True, seed=42  # fixed seed so the train/val split is reproducible
69 | )
70 | train_dataset, val_dataset = dataset["train"], dataset["test"]
71 |
72 | # tokenize inputs and return labels and attention mask
73 | val_dataset = val_dataset.map(
74 | lambda ele: self.tokenize_inputs(ele),
75 | batched=True,
76 | remove_columns=["source", "prompt"],
77 | )
78 | self.val_data = val_dataset.with_format("torch")
79 | else:
80 | train_dataset = dataset["train"]
81 |
82 | train_dataset = train_dataset.map(
83 | lambda ele: self.tokenize_inputs(ele),
84 | batched=True,
85 | remove_columns=["source", "prompt"],
86 | )
87 | self.train_data = train_dataset.with_format("torch")
88 |
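
TrainGPT4All packs each prompt/response pair into one fixed-length sequence: the prompt fills the left of a pad-filled buffer, a newline separates it from the response, an EOS token is appended when there is room, and the labels mask the prompt and padding with -100 so only the response contributes to the loss. A minimal usage sketch (the data file and tokenizer path are placeholders; the pad token id has to be set before tokenization, as in the finetuning entry point):

    from transformers import AutoTokenizer
    from llmtune.data.gpt4all import TrainGPT4All

    tokenizer = AutoTokenizer.from_pretrained("path/to/llama-7b-hf", use_fast=False)
    tokenizer.pad_token_id = 0  # tokenize_inputs pads with pad_token_id, which LLaMA leaves unset

    data = TrainGPT4All("gpt4all_prompts.json", val_set_size=0, tokenizer=tokenizer, cutoff_len=256)
    data.prepare_data()  # fills data.train_data (and data.val_data when val_set_size > 0)

    sample = data.train_data[0]
    print(sample["input_ids"].shape, int((sample["labels"] == -100).sum()))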
--------------------------------------------------------------------------------
/finetune/bbh-eval/bbh_dev.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from argparse import Namespace
4 | from typing import List
5 |
6 | from datasets import load_dataset, get_dataset_config_names
7 | from fire import Fire
8 | from pydantic import BaseModel
9 | from tqdm import tqdm
10 |
11 | from modeling_dev import select_model, EvalModel
12 |
13 |
14 | class BBHSample(BaseModel):
15 | input: str
16 | target: str
17 |
18 | def as_prompt(self, include_answer: bool = True):
19 | prompt = self.input
20 | prompt += "\nAnswer:"
21 | if include_answer:
22 | prompt += " {}\n\n".format(self.target)
23 | return prompt
24 |
25 |
26 | class BBHData(BaseModel):
27 | samples: List[BBHSample]
28 |
29 | @classmethod
30 | def get_config_names(cls, path: str = "lukaemon/bbh") -> List[str]:
31 | return get_dataset_config_names(path)
32 |
33 | @classmethod
34 | def load_from_huggingface(
35 | cls, path: str = "lukaemon/bbh", config: str = "", split: str = "test"
36 | ):
37 | data = load_dataset(path, config, split=split)
38 | samples = [BBHSample(**raw) for raw in tqdm(data, desc=str((path, split)))]
39 | return cls(samples=samples)
40 |
41 |
42 | def gen_prompt(data: BBHData, k=-1):
43 | prompt = ""
44 | if k == -1:
45 | k = len(data.samples)
46 | for i in range(k):
47 | prompt += data.samples[i].as_prompt()
48 | return prompt
49 |
50 |
51 | def evaluate(model: EvalModel, data: BBHData, ntrain: int) -> dict:
52 | data_train = BBHData(samples=data.samples[:ntrain])
53 | data_test = BBHData(samples=data.samples[ntrain:])
54 | is_correct = []
55 |
56 | for i in range(len(data_test.samples)):
57 | # get prompt and make sure it fits
58 | k = int(ntrain)
59 | prompt_end = data_test.samples[i].as_prompt(include_answer=False)
60 | train_prompt = gen_prompt(data_train, k)
61 | prompt = train_prompt + prompt_end
62 |
63 | while not model.check_valid_length(prompt) and k > 0:
64 | k -= 1
65 | train_prompt = gen_prompt(data_train, k)
66 | prompt = train_prompt + prompt_end
67 |
68 | label = data_test.samples[i].target
69 | pred = model.run(prompt)
70 | is_correct.append(pred.strip().startswith(label))
71 | if i == 0:
72 | print(dict(prompt=prompt, label=label, pred=pred))
73 |
74 | return dict(score=sum(is_correct) / len(is_correct))
75 |
76 |
77 | def main(data_dir: str = "lukaemon/bbh", ntrain: int = 3, **kwargs):
78 | args = Namespace(**locals())
79 | model = select_model(max_input_length=2048, max_output_length=32, **kwargs)
80 | print(locals())
81 |
82 |     if kwargs.get('load_4bit'):
83 | loadin_4bit = 'true'
84 | else:
85 | loadin_4bit = 'false'
86 |
87 |     if kwargs.get('load_8bit'):
88 | loadin_8bit = 'true'
89 | else:
90 | loadin_8bit = 'false'
91 |
92 | if 'lora_path' in kwargs:
93 | file_name = f"all_results_{kwargs['model_path'].replace('/', '-')}_{kwargs['lora_path'].replace('/', '-')}_4bit_{loadin_4bit}_8bit_{loadin_8bit}.txt"
94 | else:
95 | file_name = f"all_results_{kwargs['model_path'].replace('/', '-')}_4bit_{loadin_4bit}_8bit_{loadin_8bit}.txt"
96 |
97 | all_results = []
98 | if os.path.exists(file_name):
99 | with open(file_name, "r") as f:
100 | print(f"Loading {file_name}")
101 | all_results = json.load(f)
102 | print(all_results)
103 |
104 | start = len(all_results)
105 | for name in tqdm(BBHData.get_config_names()[start:]):
106 | data = BBHData.load_from_huggingface(config=name)
107 | result = evaluate(model, data, ntrain=ntrain)
108 | all_results.append(result)
109 | print(dict(name=name, **result))
110 |
111 | # Save the state of all_results after each iteration
112 | with open(file_name, "w") as f:
113 | json.dump(all_results, f)
114 |
115 | score = sum(res["score"] for res in all_results) / len(all_results)
116 | print(dict(average=score))
117 | return score
118 |
119 |
120 | if __name__ == "__main__":
121 | Fire()
122 |
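
Because the entry point is exposed through Fire, the script is normally driven from the command line, but an equivalent direct call looks like the sketch below. The keyword arguments forwarded to select_model (model_path, lora_path, load_4bit) are assumptions inferred from how main uses kwargs; modeling_dev is not shown in this listing.

    from bbh_dev import main

    average = main(
        ntrain=3,
        model_path="path/to/llama-7b-4bit",  # placeholder; also used to name the results file
        lora_path="path/to/bbh-adapter",     # placeholder; optional
        load_4bit=True,
    )
    print(average)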
--------------------------------------------------------------------------------
/llmtune/data/calibration.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 |
5 | def set_seed(seed):
6 | np.random.seed(seed)
7 | torch.random.manual_seed(seed)
8 |
9 |
10 | def get_wikitext2(nsamples, seed, seqlen, model):
11 | from datasets import load_dataset
12 | traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
13 | testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
14 |
15 | from transformers import AutoTokenizer
16 | tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
17 | trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt')
18 | testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt')
19 |
20 | import random
21 | random.seed(seed)
22 | trainloader = []
23 | for _ in range(nsamples):
24 | i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
25 | j = i + seqlen
26 | inp = trainenc.input_ids[:, i:j]
27 | tar = inp.clone()
28 | tar[:, :-1] = -100
29 | trainloader.append((inp, tar))
30 | return trainloader, testenc
31 |
32 | def get_ptb(nsamples, seed, seqlen, model):
33 | from datasets import load_dataset
34 | traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
35 | valdata = load_dataset('ptb_text_only', 'penn_treebank', split='validation')
36 |
37 | from transformers import AutoTokenizer
38 | tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
39 | trainenc = tokenizer("\n\n".join(traindata['sentence']), return_tensors='pt')
40 | testenc = tokenizer("\n\n".join(valdata['sentence']), return_tensors='pt')
41 |
42 | import random
43 | random.seed(seed)
44 | trainloader = []
45 | for _ in range(nsamples):
46 | i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
47 | j = i + seqlen
48 | inp = trainenc.input_ids[:, i:j]
49 | tar = inp.clone()
50 | tar[:, :-1] = -100
51 | trainloader.append((inp, tar))
52 | return trainloader, testenc
53 |
54 | def get_c4(nsamples, seed, seqlen, model):
55 | from datasets import load_dataset
56 | traindata = load_dataset(
57 | 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train', use_auth_token=True
58 | )
59 | valdata = load_dataset(
60 |         'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation', use_auth_token=True
61 | )
62 |
63 | from transformers import AutoTokenizer
64 | tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
65 |
66 | import random
67 | random.seed(seed)
68 | trainloader = []
69 | for _ in range(nsamples):
70 | while True:
71 | i = random.randint(0, len(traindata) - 1)
72 | trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
73 | if trainenc.input_ids.shape[1] >= seqlen:
74 | break
75 | i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
76 | j = i + seqlen
77 | inp = trainenc.input_ids[:, i:j]
78 | tar = inp.clone()
79 | tar[:, :-1] = -100
80 | trainloader.append((inp, tar))
81 |
82 | import random
83 | random.seed(0)
84 | valenc = []
85 | for _ in range(256):
86 | while True:
87 | i = random.randint(0, len(valdata) - 1)
88 | tmp = tokenizer(valdata[i]['text'], return_tensors='pt')
89 | if tmp.input_ids.shape[1] >= seqlen:
90 | break
91 | i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1)
92 | j = i + seqlen
93 | valenc.append(tmp.input_ids[:, i:j])
94 | valenc = torch.hstack(valenc)
95 | class TokenizerWrapper:
96 | def __init__(self, input_ids):
97 | self.input_ids = input_ids
98 | valenc = TokenizerWrapper(valenc)
99 |
100 | return trainloader, valenc
101 |
102 |
103 | def get_calibration_loaders(
104 | name, nsamples=128, seed=0, seqlen=2048, model=''
105 | ):
106 | if 'wikitext2' in name:
107 | return get_wikitext2(nsamples, seed, seqlen, model)
108 | if 'ptb' in name:
109 | return get_ptb(nsamples, seed, seqlen, model)
110 | if 'c4' in name:
111 | return get_c4(nsamples, seed, seqlen, model)
112 |
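
get_calibration_loaders returns a list of (input, target) pairs of shape (1, seqlen) sampled from the chosen corpus, plus the tokenized evaluation split; the targets mask every position except the last with -100. A short sketch (the model path is a placeholder and is only used to load the matching tokenizer):

    from llmtune.data.calibration import get_calibration_loaders

    trainloader, testenc = get_calibration_loaders(
        "wikitext2", nsamples=16, seed=0, seqlen=2048, model="path/to/llama-7b-hf"
    )
    inp, tar = trainloader[0]
    print(inp.shape)            # torch.Size([1, 2048])
    print((tar == -100).sum())  # 2047: only the final position carries a target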
--------------------------------------------------------------------------------
/llmtune/engine/inference/matmult.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | try:
4 | import quant_cuda
5 | except ImportError:
6 | print('CUDA extension not installed. Inference will not work.')
7 |
8 | # Global Buffer
9 | buffer_mat_dic = {}
10 | use_new = True
11 | auto_switch = True
12 | auto_switch_thd = 8
13 | debug = False
14 | cache_buffer = True
15 |
16 | def get_buffer(shape_of_qweight, dtype=torch.float16, device='cuda', bits=4):
17 | target_shape = (shape_of_qweight[0] * (32 // bits), shape_of_qweight[1])
18 | if not cache_buffer:
19 | return torch.zeros(target_shape, dtype=dtype, device=device)
20 | if target_shape not in buffer_mat_dic.keys():
21 | buffer_mat_dic[target_shape] = torch.zeros(target_shape, dtype=dtype, device=device)
22 | else:
23 | if buffer_mat_dic[target_shape].device != device:
24 | buffer_mat_dic[target_shape] = buffer_mat_dic[target_shape].to(device)
25 | if buffer_mat_dic[target_shape].dtype != dtype:
26 | buffer_mat_dic[target_shape] = buffer_mat_dic[target_shape].to(dtype=dtype)
27 | return buffer_mat_dic[target_shape]
28 |
29 | def _matmul4bit_v1_recons(x, qweight, scales, zeros, transpose=False):
30 | if debug:
31 | print('_matmul4bit_v1_recons')
32 | if not transpose:
33 | assert qweight.shape[0] * 8 == x.shape[-1]
34 | else:
35 | assert qweight.shape[1] == x.shape[-1]
36 | buffer = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device)
37 | quant_cuda.vecquant4recons_v1(qweight, buffer, scales, zeros)
38 | # dtype = x.dtype
39 | # x = x.float()
40 | if not transpose:
41 | output = torch.matmul(x, buffer)
42 | else:
43 | output = torch.matmul(x, buffer.T)
44 | # output = output.to(dtype)
45 | return output
46 |
47 |
48 | def _matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx, transpose=False):
49 | if debug:
50 | print('_matmul4bit_v2_recons')
51 | if not transpose:
52 | assert qweight.shape[0] * 8 == x.shape[-1]
53 | else:
54 | assert qweight.shape[1] == x.shape[-1]
55 | buffer = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device)
56 | quant_cuda.vecquant4recons_v2(qweight, buffer, scales, zeros, g_idx)
57 | if not transpose:
58 | output = torch.matmul(x, buffer)
59 | else:
60 | output = torch.matmul(x, buffer.T)
61 | return output
62 |
63 |
64 | def _matmul2bit_v2_recons(x, qweight, scales, zeros, g_idx, transpose=False):
65 | if debug:
66 | print('_matmul2bit_v2_recons')
67 | if not transpose:
68 | assert qweight.shape[0] * 16 == x.shape[-1]
69 | else:
70 | assert qweight.shape[1] == x.shape[-1]
71 | buffer = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device, bits=2)
72 | quant_cuda.vecquant2recons_v2(qweight, buffer, scales, zeros, g_idx)
73 | if not transpose:
74 | output = torch.matmul(x, buffer)
75 | else:
76 | output = torch.matmul(x, buffer.T)
77 | return output
78 |
79 |
80 | def matmul4bit(x, qweight, scales, zeros, g_idx=None):
81 |     raise NotImplementedError()  # disabled: the v1/v2 dispatch below is unreachable and relies on kernels not defined in this module
82 | # detect if zeros is int32
83 | if zeros.dtype != torch.int32:
84 | # use v1
85 | if use_new:
86 | if auto_switch:
87 | if np.prod(x.shape[:-1]) > auto_switch_thd:
88 | output = _matmul4bit_v1_recons(x.half(), qweight, scales.half(), zeros.half())
89 | else:
90 | output = _matmul4bit_v1(x, qweight, scales, zeros)
91 | else:
92 | output = _matmul4bit_v1(x, qweight, scales, zeros)
93 | else:
94 | if g_idx is None:
95 | g_idx = torch.zeros(qweight.shape[0] * 8, dtype=torch.int32, device=x.device)
96 | # use v2
97 | if use_new:
98 | if auto_switch:
99 | if np.prod(x.shape[:-1]) > auto_switch_thd:
100 | output = _matmul4bit_v2_recons(x.half(), qweight, scales.half(), zeros, g_idx)
101 | else:
102 | output = _matmul4bit_v2(x, qweight, scales, zeros, g_idx)
103 | else:
104 | output = _matmul4bit_v2(x, qweight, scales, zeros, g_idx)
105 | return output
106 |
107 |
108 | def matmul3bit(x, qweight, scales, zeros, g_idx, outfeatures):
109 | out_shape = x.shape[:-1] + (outfeatures, )
110 | x = x.reshape(-1,x.shape[-1])
111 | output = torch.zeros((x.shape[0], outfeatures), device=x.device, dtype=torch.float32)
112 | quant_cuda.vecquant3matmul(x.float(), qweight, output, scales.float(), zeros, g_idx)
113 | output = output.reshape(out_shape)
114 | return output
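
The *_recons helpers do not multiply against packed weights directly: they dequantize the int32-packed matrix into a scratch buffer of shape (in_features, out_features), where in_features equals qweight.shape[0] * 32 / bits and the dtype follows the scales, and then call a regular torch.matmul. get_buffer caches one such buffer per shape so repeated calls reuse the same allocation. A small sketch of that caching behaviour (requires a CUDA device; shapes are illustrative):

    import torch
    from llmtune.engine.inference import matmult

    # 4-bit packing stores 8 weights per int32 along the input dimension.
    qweight = torch.zeros((4096 // 8, 4096), dtype=torch.int32, device="cuda")

    buf_a = matmult.get_buffer(qweight.shape, dtype=torch.float16, device=qweight.device, bits=4)
    buf_b = matmult.get_buffer(qweight.shape, dtype=torch.float16, device=qweight.device, bits=4)
    print(buf_a.shape)     # (4096, 4096): unpacked in_features x out_features
    print(buf_a is buf_b)  # True: the per-shape scratch buffer is reused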
--------------------------------------------------------------------------------
/llmtune/engine/quant/gptq/quantizer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | def quantize(x, scale, zero, maxq):
7 | if maxq < 0:
8 | return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
9 | q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
10 | return scale * (q - zero)
11 |
12 | class Quantizer(nn.Module):
13 | def __init__(self, shape=1):
14 | super(Quantizer, self).__init__()
15 | self.register_buffer('maxq', torch.tensor(0))
16 | self.register_buffer('scale', torch.zeros(shape))
17 | self.register_buffer('zero', torch.zeros(shape))
18 |
19 | def configure(
20 | self,
21 | bits, perchannel=False, sym=True,
22 | mse=False, norm=2.4, grid=100, maxshrink=.8,
23 | trits=False
24 | ):
25 |
26 | self.maxq = torch.tensor(2 ** bits - 1)
27 | self.perchannel = perchannel
28 | self.sym = sym
29 | self.mse = mse
30 | self.norm = norm
31 | self.grid = grid
32 | self.maxshrink = maxshrink
33 | if trits:
34 | self.maxq = torch.tensor(-1)
35 |
36 | def find_params(self, x, weight=False):
37 | dev = x.device
38 | self.maxq = self.maxq.to(dev)
39 |
40 | shape = x.shape
41 | if self.perchannel:
42 | if weight:
43 | x = x.flatten(1)
44 | else:
45 | if len(shape) == 4:
46 | x = x.permute([1, 0, 2, 3])
47 | x = x.flatten(1)
48 | if len(shape) == 3:
49 | x = x.reshape((-1, shape[-1])).t()
50 | if len(shape) == 2:
51 | x = x.t()
52 | else:
53 | x = x.flatten().unsqueeze(0)
54 |
55 | tmp = torch.zeros(x.shape[0], device=dev)
56 | xmin = torch.minimum(x.min(1)[0], tmp)
57 | xmax = torch.maximum(x.max(1)[0], tmp)
58 |
59 | if self.sym:
60 | xmax = torch.maximum(torch.abs(xmin), xmax)
61 | tmp = xmin < 0
62 | if torch.any(tmp):
63 | xmin[tmp] = -xmax[tmp]
64 | tmp = (xmin == 0) & (xmax == 0)
65 | xmin[tmp] = -1
66 | xmax[tmp] = +1
67 |
68 | if self.maxq < 0:
69 | self.scale = xmax
70 | self.zero = xmin
71 | else:
72 | self.scale = (xmax - xmin) / self.maxq
73 | if self.sym:
74 | self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
75 | else:
76 | self.zero = torch.round(-xmin / self.scale)
77 |
78 | if self.mse:
79 | best = torch.full([x.shape[0]], float('inf'), device=dev)
80 | for i in range(int(self.maxshrink * self.grid)):
81 | p = 1 - i / self.grid
82 | xmin1 = p * xmin
83 | xmax1 = p * xmax
84 | scale1 = (xmax1 - xmin1) / self.maxq
85 | zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero
86 | q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq)
87 | q -= x
88 | q.abs_()
89 | q.pow_(self.norm)
90 | err = torch.sum(q, 1)
91 | tmp = err < best
92 | if torch.any(tmp):
93 | best[tmp] = err[tmp]
94 | self.scale[tmp] = scale1[tmp]
95 | self.zero[tmp] = zero1[tmp]
96 | if not self.perchannel:
97 | if weight:
98 | tmp = shape[0]
99 | else:
100 | tmp = shape[1] if len(shape) != 3 else shape[2]
101 | self.scale = self.scale.repeat(tmp)
102 | self.zero = self.zero.repeat(tmp)
103 |
104 | if weight:
105 | shape = [-1] + [1] * (len(shape) - 1)
106 | self.scale = self.scale.reshape(shape)
107 | self.zero = self.zero.reshape(shape)
108 | return
109 | if len(shape) == 4:
110 | self.scale = self.scale.reshape((1, -1, 1, 1))
111 | self.zero = self.zero.reshape((1, -1, 1, 1))
112 | if len(shape) == 3:
113 | self.scale = self.scale.reshape((1, 1, -1))
114 | self.zero = self.zero.reshape((1, 1, -1))
115 | if len(shape) == 2:
116 | self.scale = self.scale.unsqueeze(0)
117 | self.zero = self.zero.unsqueeze(0)
118 |
119 | def quantize(self, x):
120 | if self.ready():
121 | return quantize(x, self.scale, self.zero, self.maxq)
122 | return x
123 |
124 | def enabled(self):
125 | return self.maxq > 0
126 |
127 | def ready(self):
128 | return torch.all(self.scale != 0)
129 |
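
Quantizer holds per-tensor or per-channel scale and zero-point buffers; find_params fits them to the data and quantize performs fake quantization, i.e. clamp(round(x / scale) + zero, 0, maxq) mapped back to floats. A round-trip sketch for 4-bit asymmetric per-row quantization of a weight matrix:

    import torch
    from llmtune.engine.quant.gptq.quantizer import Quantizer

    w = torch.randn(128, 512)              # (out_features, in_features)
    quantizer = Quantizer()
    quantizer.configure(4, perchannel=True, sym=False, mse=False)
    quantizer.find_params(w, weight=True)  # per-row scale/zero, reshaped to (128, 1)
    w_q = quantizer.quantize(w)            # fake-quantized reconstruction on the 16-level grid
    print(quantizer.scale.shape, (w - w_q).abs().max())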
--------------------------------------------------------------------------------
/llmtune/engine/inference/cuda/quant_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/all.h>
2 | #include <torch/python.h>
3 | #include <c10/cuda/CUDAGuard.h>
4 |
5 | // standard forward operations
6 |
7 | void vecquant2matmul_cuda(
8 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
9 | torch::Tensor scales, torch::Tensor zeros,
10 | torch::Tensor g_idx
11 | );
12 |
13 | void vecquant2matmul(
14 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
15 | torch::Tensor scales, torch::Tensor zeros,
16 | torch::Tensor g_idx
17 | ) {
18 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
19 | vecquant2matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
20 | }
21 |
22 | void vecquant3matmul_cuda(
23 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
24 | torch::Tensor scales, torch::Tensor zeros,
25 | torch::Tensor g_idx
26 | );
27 |
28 | void vecquant3matmul(
29 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
30 | torch::Tensor scales, torch::Tensor zeros,
31 | torch::Tensor g_idx
32 | ) {
33 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
34 | vecquant3matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
35 | }
36 |
37 | void vecquant4matmul_cuda(
38 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
39 | torch::Tensor scales, torch::Tensor zeros,
40 | torch::Tensor g_idx
41 | );
42 |
43 | void vecquant4matmul(
44 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
45 | torch::Tensor scales, torch::Tensor zeros,
46 | torch::Tensor g_idx
47 | ) {
48 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
49 | vecquant4matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
50 | }
51 |
52 | void vecquant8matmul_cuda(
53 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
54 | torch::Tensor scales, torch::Tensor zeros,
55 | torch::Tensor g_idx
56 | );
57 |
58 | void vecquant8matmul(
59 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
60 | torch::Tensor scales, torch::Tensor zeros,
61 | torch::Tensor g_idx
62 | ) {
63 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
64 | vecquant8matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
65 | }
66 |
67 | // methods based on reconstruction (unpacking)
68 |
69 | void vecquant4recons_v1_cuda(
70 | torch::Tensor mat, torch::Tensor res, torch::Tensor scales, torch::Tensor zeros
71 | );
72 |
73 | void vecquant4recons_v1(
74 | torch::Tensor mat, torch::Tensor res,
75 | torch::Tensor scales, torch::Tensor zeros
76 | ) {
77 | const at::cuda::OptionalCUDAGuard device_guard(device_of(scales));
78 | vecquant4recons_v1_cuda(mat, res, scales, zeros);
79 | }
80 |
81 | void vecquant4recons_v2_cuda(
82 | torch::Tensor mat, torch::Tensor res,
83 | torch::Tensor scales, torch::Tensor zeros,
84 | torch::Tensor g_idx
85 | );
86 |
87 | void vecquant4recons_v2(
88 | torch::Tensor mat, torch::Tensor res, torch::Tensor scales, torch::Tensor zeros, torch::Tensor g_idx
89 | ) {
90 | const at::cuda::OptionalCUDAGuard device_guard(device_of(scales));
91 | vecquant4recons_v2_cuda(mat, res, scales, zeros, g_idx);
92 | }
93 |
94 | void vecquant2recons_v2_cuda(
95 | torch::Tensor mat, torch::Tensor res,
96 | torch::Tensor scales, torch::Tensor zeros,
97 | torch::Tensor g_idx
98 | );
99 |
100 | void vecquant2recons_v2(
101 | torch::Tensor mat, torch::Tensor res, torch::Tensor scales, torch::Tensor zeros, torch::Tensor g_idx
102 | ) {
103 | const at::cuda::OptionalCUDAGuard device_guard(device_of(scales));
104 | vecquant2recons_v2_cuda(mat, res, scales, zeros, g_idx);
105 | }
106 |
107 | void vecquant4matmul_v1_faster_cuda(
108 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
109 | torch::Tensor scales, torch::Tensor zeros
110 | );
111 |
112 | void vecquant4matmul_v1_faster(
113 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
114 | torch::Tensor scales, torch::Tensor zeros
115 | ) {
116 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
117 | vecquant4matmul_v1_faster_cuda(vec, mat, mul, scales, zeros);
118 | }
119 |
120 |
121 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
122 | m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA)");
123 | m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA)");
124 | m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA)");
125 | m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA)");
126 |
127 | // Reconstruction Kernel
128 | m.def("vecquant4recons_v1", &vecquant4recons_v1, "Vector 4-bit Quantized Matrix Reconstruction (CUDA)");
129 | m.def("vecquant4recons_v2", &vecquant4recons_v2, "Vector 4-bit Quantized Matrix Reconstruction (CUDA) with group-size support");
130 | m.def("vecquant2recons_v2", &vecquant2recons_v2, "Vector 2-bit Quantized Matrix Reconstruction (CUDA) with group-size support");
131 | }
--------------------------------------------------------------------------------
/llmtune/executor.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import torch
4 |
5 | from llmtune.config import DEV
6 | from llmtune.utils import to_half_precision
7 |
8 | def load_llm(model_name_or_path):
9 | from llmtune.llms.autollm import AutoLLMForCausalLM
10 | llm = AutoLLMForCausalLM.from_pretrained(model_name_or_path)
11 | return llm
12 |
13 | def load_tokenizer(model_name_or_path, llm_config=None):
14 | from llmtune.llms.autollm import get_default_tokenizer
15 | if llm_config is not None:
16 | model_type = llm_config.model_type
17 | else:
18 | model_type = None
19 | return get_default_tokenizer(model_name_or_path, model_type)
20 |
21 | def load_adapter(llm, adapter_path=None, lora_config=None):
22 | from llmtune.engine.lora.peft import quant_peft
23 | if adapter_path is None and lora_config is not None:
24 | model = quant_peft.get_peft_model(llm, lora_config)
25 | elif adapter_path is not None and lora_config is None:
26 | model = quant_peft.PeftModel.from_pretrained(
27 | llm, adapter_path,
28 | device_map='auto',
29 | torch_dtype=torch.float32
30 | )
31 | print(adapter_path, 'loaded')
32 | else:
33 |         raise ValueError('Need to specify adapter_path or lora_config')
34 | return model
35 |
36 | def generate(
37 | llm, tokenizer, prompt, min_length, max_length, temperature, top_k, top_p
38 | ):
39 | llm.to(DEV)
40 | llm = to_half_precision(llm)
41 | input_ids = tokenizer.encode(prompt, return_tensors="pt").to(DEV)
42 |
43 | with torch.no_grad():
44 | generated_ids = llm.generate(
45 | inputs=input_ids,
46 | do_sample=True,
47 | min_length=min_length,
48 | max_length=max_length,
49 | top_p=top_p,
50 | top_k=top_k,
51 | temperature=temperature,
52 | )
53 | return tokenizer.decode([el.item() for el in generated_ids[0]])
54 |
55 | def finetune(llm, tokenizer, tune_config):
56 | import transformers
57 | from llmtune.data import load_finetuning_data
58 | from llmtune.engine.lora.peft import quant_peft
59 | transformers.logging.set_verbosity_info()
60 | tokenizer.pad_token_id = 0
61 |
62 | lora_config = quant_peft.LoraConfig(
63 | r=tune_config.lora_r,
64 | lora_alpha=tune_config.lora_alpha,
65 | target_modules=["q_proj", "v_proj"],
66 | lora_dropout=tune_config.lora_dropout,
67 | bias="none",
68 | task_type="CAUSAL_LM",
69 | )
70 | model = load_adapter(llm, lora_config=lora_config)
71 | model.print_trainable_parameters()
72 |
73 | data = load_finetuning_data(tune_config, tokenizer)
74 |
75 | training_arguments = transformers.TrainingArguments(
76 | per_device_train_batch_size=tune_config.mbatch_size,
77 | gradient_accumulation_steps=tune_config.gradient_accumulation_steps,
78 | warmup_steps=tune_config.warmup_steps,
79 | num_train_epochs=tune_config.epochs,
80 | learning_rate=tune_config.lr,
81 | fp16=True,
82 | logging_steps=tune_config.logging_steps,
83 | evaluation_strategy="no",
84 | save_strategy="steps",
85 | eval_steps=None,
86 | save_steps=tune_config.save_steps,
87 | output_dir=tune_config.lora_out_dir,
88 | save_total_limit=tune_config.save_total_limit,
89 | load_best_model_at_end=False,
90 | ddp_find_unused_parameters=False if tune_config.ddp else None,
91 | )
92 |
93 | trainer = transformers.Trainer(
94 | model=model,
95 | train_dataset=data.train_data,
96 | eval_dataset=data.val_data,
97 | args=training_arguments,
98 | data_collator=transformers.DataCollatorForLanguageModeling(
99 | tokenizer, mlm=False
100 | ),
101 | )
102 | print(training_arguments.parallel_mode)
103 | model.config.use_cache = False
104 |
105 | # use half precision
106 | model = to_half_precision(model)
107 |
108 | # start training
109 | checkpoint_dir = tune_config.lora_out_dir
110 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir):
111 | trainer.train(resume_from_checkpoint=True)
112 | else:
113 | trainer.train()
114 |
115 | # Save Model
116 | model.save_pretrained(tune_config.lora_out_dir)
117 |
118 | def quantize(llm, config):
119 | from llmtune.data.calibration import get_calibration_loaders
120 | from llmtune.engine.quant.gptq.executor import GPTQAlgorithm
121 |
122 | llm.eval()
123 | dataloader, _ = get_calibration_loaders(
124 | config.dataset,
125 | nsamples=config.nsamples,
126 | seed=config.seed,
127 | model=llm.base_model.name_or_path,
128 | seqlen=llm.base_model.seqlen
129 | )
130 |
131 | gptq = GPTQAlgorithm(config)
132 | llm = gptq.quantize(llm, dataloader)
133 |
134 | llm.save_pretrained(config.save)
135 | print(f'Model weights saved to: {config.save}')
136 |
137 |
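
The executor module is the high-level entry point: load_llm and load_tokenizer pull in a (possibly quantized) model and its tokenizer, load_adapter attaches or creates a LoRA adapter, and generate/finetune/quantize drive the three main workflows. A sketch of the generation path (the checkpoint path is a placeholder, and a CUDA device is assumed since generate moves the model to DEV):

    from llmtune.executor import load_llm, load_tokenizer, generate

    llm = load_llm("path/to/llama-7b-quantized")            # placeholder checkpoint path
    tokenizer = load_tokenizer("path/to/llama-7b-quantized")

    text = generate(
        llm, tokenizer,
        prompt="Summarize: the standup moved to Friday at 10am.",
        min_length=10, max_length=128,
        temperature=0.7, top_k=50, top_p=0.95,
    )
    print(text)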
--------------------------------------------------------------------------------
/llmtune/engine/quant/gptq/extras.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from llmtune.engine.quant.gptq.algorithm import GPTQ
4 | from llmtune.engine.quant.gptq.quantizer import Quantizer
5 | from llmtune.engine.quant.converter import make_quant
6 | from llmtune.engine.inference.modules import QuantLinear
7 | from llmtune.utils import find_layers
8 |
9 | @torch.no_grad()
10 | def quantize_opt(
11 | model, dataloader, bits, groupsize, act_order, nsamples, percdamp,
12 | sym=False, true_sequential=False, nearest=False, trits=False, dev='cuda'
13 | ):
14 | print('Starting ...')
15 | if nearest is True or true_sequential is True:
16 | raise NotImplementedError()
17 |
18 | use_cache = model.config.use_cache
19 | model.config.use_cache = False
20 | layers = model.model.decoder.layers
21 |
22 | model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
23 | model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
24 | if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
25 | model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
26 | if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
27 | model.model.decoder.project_in = model.model.decoder.project_in.to(dev)
28 | layers[0] = layers[0].to(dev)
29 |
30 | dtype = next(iter(model.parameters())).dtype
31 | inps = torch.zeros(
32 | (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
33 | )
34 | cache = {'i': 0, 'attention_mask': None}
35 |
36 | class Catcher(nn.Module):
37 | def __init__(self, module):
38 | super().__init__()
39 | self.module = module
40 | def forward(self, inp, **kwargs):
41 | inps[cache['i']] = inp
42 | cache['i'] += 1
43 | cache['attention_mask'] = kwargs['attention_mask']
44 | raise ValueError
45 | layers[0] = Catcher(layers[0])
46 | for batch in dataloader:
47 | try:
48 | model(batch[0].to(dev))
49 | except ValueError:
50 | pass
51 | layers[0] = layers[0].module
52 |
53 | layers[0] = layers[0].cpu()
54 | model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
55 | model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
56 | if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
57 | model.model.decoder.project_out = model.model.decoder.project_out.cpu()
58 | if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
59 | model.model.decoder.project_in = model.model.decoder.project_in.cpu()
60 | torch.cuda.empty_cache()
61 |
62 | outs = torch.zeros_like(inps)
63 | attention_mask = cache['attention_mask']
64 |
65 | print('Ready.')
66 |
67 | quantizers = {}
68 | for i in range(len(layers)):
69 | layer = layers[i].to(dev)
70 |
71 | subset = find_layers(layer)
72 | gptq = {}
73 | for name in subset:
74 | gptq[name] = GPTQ(subset[name])
75 | gptq[name].quantizer = Quantizer()
76 |             gptq[name].quantizer.configure(bits, perchannel=True, sym=sym, mse=False, trits=trits)
77 |
78 | def add_batch(name):
79 | def tmp(_, inp, out):
80 | gptq[name].add_batch(inp[0].data, out.data)
81 | return tmp
82 |
83 | handles = []
84 | for name in subset:
85 | handles.append(subset[name].register_forward_hook(add_batch(name)))
86 |
87 | for j in range(nsamples):
88 | outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
89 |
90 | for h in handles:
91 | h.remove()
92 |
93 | for name in subset:
94 | print(f'Quantizing {name} in layer {i+1}/{len(layers)}...')
95 |             scale, zero, g_idx = gptq[name].fasterquant(percdamp=percdamp, groupsize=groupsize, actorder=act_order)
96 |             quantizers['model.decoder.layers.%d.%s' % (i, name)] = (gptq[name].quantizer.cpu(), scale.cpu(), zero.cpu(), g_idx.cpu())
97 | gptq[name].free()
98 |
99 | for j in range(nsamples):
100 | outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
101 |
102 | layers[i] = layer.cpu()
103 | del layer
104 | del gptq
105 | torch.cuda.empty_cache()
106 |
107 | inps, outs = outs, inps
108 |
109 | model.config.use_cache = use_cache
110 |
111 | return quantizers
112 |
113 | def pack_opt(model, quantizers, wbits, groupsize):
114 | layers = find_layers(model)
115 | layers = {n: layers[n] for n in quantizers}
116 | make_quant(model, quantizers, wbits, groupsize)
117 | qlayers = find_layers(model, [QuantLinear])
118 | print('Packing ...')
119 | for name in qlayers:
120 | print(name)
121 |         quantizers[name], scale, zero, g_idx = quantizers[name]
122 | qlayers[name].pack(layers[name], scale, zero, g_idx)
123 | print('Done.')
124 | return model
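
quantize_opt runs GPTQ layer by layer on an OPT decoder, capturing each block's inputs with the Catcher module and returning a (quantizer, scale, zero, g_idx) tuple per linear layer; pack_opt then swaps those linears for QuantLinear modules and packs the weights. A hedged end-to-end sketch, assuming a CUDA device is available; the model choice, bit width and calibration settings are illustrative only:

    from transformers import OPTForCausalLM
    from llmtune.data.calibration import get_calibration_loaders
    from llmtune.engine.quant.gptq.extras import quantize_opt, pack_opt

    model = OPTForCausalLM.from_pretrained("facebook/opt-1.3b")
    model.seqlen = 2048  # quantize_opt reads model.seqlen when allocating the input cache

    dataloader, _ = get_calibration_loaders(
        "wikitext2", nsamples=128, seed=0, seqlen=2048, model="facebook/opt-1.3b"
    )

    quantizers = quantize_opt(
        model, dataloader, bits=4, groupsize=128, act_order=False,
        nsamples=128, percdamp=0.01,
    )
    model = pack_opt(model, quantizers, wbits=4, groupsize=128)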
--------------------------------------------------------------------------------
/finetune/samsum-llama/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 |
4 | import torch
5 | import numpy as np
6 |
7 |
8 | def set_random_seed(seed):
9 | random.seed(seed)
10 | np.random.seed(seed)
11 | os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2"
12 | os.environ["PL_GLOBAL_SEED"] = str(seed)
13 | os.environ["PYTHONHASHSEED"] = str(seed)
14 | torch.manual_seed(seed)
15 | torch.cuda.manual_seed_all(seed)
16 | torch.backends.cudnn.benchmark = False
17 | torch.backends.cudnn.deterministic = True
18 |
19 |
20 | def fix_tokenizer(tokenizer):
21 | # Fixing broken tokenizers
22 | special_tokens = dict()
23 | for token_id in range(1000):
24 | token = tokenizer.convert_ids_to_tokens(token_id)
25 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad" in token:
26 | special_tokens["pad_token"] = token
27 |         if tokenizer.bos_token_id in (None, tokenizer.vocab_size) and "<s>" in token:
28 |             special_tokens["bos_token"] = token
29 |         if tokenizer.eos_token_id in (None, tokenizer.vocab_size) and "</s>" in token:
30 | special_tokens["eos_token"] = token
31 | if tokenizer.unk_token_id in (None, tokenizer.vocab_size) and "unk" in token:
32 | special_tokens["unk_token"] = token
33 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep" in token:
34 | special_tokens["sep_token"] = token
35 |
36 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "bos_token" in special_tokens:
37 | special_tokens["sep_token"] = special_tokens["bos_token"]
38 |
39 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad_token" not in special_tokens:
40 | if tokenizer.unk_token_id is not None:
41 | special_tokens["pad_token"] = tokenizer.unk_token
42 | else:
43 | special_tokens["pad_token"] = "<|pad|>"
44 |
45 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep_token" not in special_tokens:
46 | if tokenizer.bos_token_id is not None:
47 | special_tokens["sep_token"] = tokenizer.bos_token
48 | else:
49 | special_tokens["sep_token"] = "<|sep|>"
50 | print(special_tokens)
51 | tokenizer.add_special_tokens(special_tokens)
52 |
53 | print("Vocab size: ", tokenizer.vocab_size)
54 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token)
55 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token)
56 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token)
57 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token)
58 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token)
59 | return tokenizer
60 |
61 |
62 | def fix_model(model, tokenizer, use_resize=True):
63 | model.config.pad_token_id = tokenizer.pad_token_id
64 | assert model.config.pad_token_id is not None
65 |
66 | bos_candidates = (
67 | tokenizer.bos_token_id,
68 | tokenizer.cls_token_id,
69 | tokenizer.sep_token_id,
70 | tokenizer.unk_token_id
71 | )
72 | for bos_candidate in bos_candidates:
73 | model.config.bos_token_id = bos_candidate
74 | if bos_candidate is not None:
75 | break
76 | assert model.config.bos_token_id is not None
77 | model.config.decoder_start_token_id = model.config.bos_token_id
78 |
79 | eos_candidates = (tokenizer.eos_token_id, tokenizer.sep_token_id)
80 | for eos_candidate in eos_candidates:
81 | model.config.eos_token_id = eos_candidate
82 | if eos_candidate is not None:
83 | break
84 | assert model.config.eos_token_id is not None
85 |
86 | if use_resize:
87 | model.resize_token_embeddings(len(tokenizer))
88 |
89 | return model
90 |
91 |
92 | def gen_batch(records, batch_size):
93 | batch_start = 0
94 | while batch_start < len(records):
95 | batch_end = batch_start + batch_size
96 | batch = records[batch_start: batch_end]
97 | batch_start = batch_end
98 | yield batch
99 |
100 |
101 | def print_special_tokens(tokenizer):
102 | print("Vocab size: ", tokenizer.vocab_size)
103 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token)
104 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token)
105 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token)
106 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token)
107 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token)
108 | return tokenizer
109 |
110 | # PAD: 0
111 | # BOS: 1
112 | # EOS: 2
113 | # UNK: 0
114 | # SEP: 1
115 |
116 | def fix_tokenizer_opt(tokenizer):
117 | # Fixing broken tokenizers
118 | special_tokens = {
119 | 'pad_token': '',
120 | 'bos_token': '',
121 | 'eos_token': '',
122 | 'unk_token': '',
123 | 'sep_token': ''
124 |
125 | }
126 |
127 | tokenizer.add_special_tokens(special_tokens)
128 |
129 | print("Vocab size: ", tokenizer.vocab_size)
130 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token)
131 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token)
132 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token)
133 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token)
134 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token)
135 | return tokenizer
--------------------------------------------------------------------------------
/finetune/samsum-opt/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 |
4 | import torch
5 | import numpy as np
6 |
7 |
8 | def set_random_seed(seed):
9 | random.seed(seed)
10 | np.random.seed(seed)
11 | os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2"
12 | os.environ["PL_GLOBAL_SEED"] = str(seed)
13 | os.environ["PYTHONHASHSEED"] = str(seed)
14 | torch.manual_seed(seed)
15 | torch.cuda.manual_seed_all(seed)
16 | torch.backends.cudnn.benchmark = False
17 | torch.backends.cudnn.deterministic = True
18 |
19 |
20 | def fix_tokenizer(tokenizer):
21 | # Fixing broken tokenizers
22 | special_tokens = dict()
23 | for token_id in range(1000):
24 | token = tokenizer.convert_ids_to_tokens(token_id)
25 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad" in token:
26 | special_tokens["pad_token"] = token
27 |         if tokenizer.bos_token_id in (None, tokenizer.vocab_size) and "<s>" in token:
28 |             special_tokens["bos_token"] = token
29 |         if tokenizer.eos_token_id in (None, tokenizer.vocab_size) and "</s>" in token:
30 | special_tokens["eos_token"] = token
31 | if tokenizer.unk_token_id in (None, tokenizer.vocab_size) and "unk" in token:
32 | special_tokens["unk_token"] = token
33 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep" in token:
34 | special_tokens["sep_token"] = token
35 |
36 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "bos_token" in special_tokens:
37 | special_tokens["sep_token"] = special_tokens["bos_token"]
38 |
39 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad_token" not in special_tokens:
40 | if tokenizer.unk_token_id is not None:
41 | special_tokens["pad_token"] = tokenizer.unk_token
42 | else:
43 | special_tokens["pad_token"] = "<|pad|>"
44 |
45 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep_token" not in special_tokens:
46 | if tokenizer.bos_token_id is not None:
47 | special_tokens["sep_token"] = tokenizer.bos_token
48 | else:
49 | special_tokens["sep_token"] = "<|sep|>"
50 | print(special_tokens)
51 | tokenizer.add_special_tokens(special_tokens)
52 |
53 | print("Vocab size: ", tokenizer.vocab_size)
54 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token)
55 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token)
56 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token)
57 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token)
58 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token)
59 | return tokenizer
60 |
61 |
62 | def fix_model(model, tokenizer, use_resize=True):
63 | model.config.pad_token_id = tokenizer.pad_token_id
64 | assert model.config.pad_token_id is not None
65 |
66 | bos_candidates = (
67 | tokenizer.bos_token_id,
68 | tokenizer.cls_token_id,
69 | tokenizer.sep_token_id,
70 | tokenizer.unk_token_id
71 | )
72 | for bos_candidate in bos_candidates:
73 | model.config.bos_token_id = bos_candidate
74 | if bos_candidate is not None:
75 | break
76 | assert model.config.bos_token_id is not None
77 | model.config.decoder_start_token_id = model.config.bos_token_id
78 |
79 | eos_candidates = (tokenizer.eos_token_id, tokenizer.sep_token_id)
80 | for eos_candidate in eos_candidates:
81 | model.config.eos_token_id = eos_candidate
82 | if eos_candidate is not None:
83 | break
84 | assert model.config.eos_token_id is not None
85 |
86 | if use_resize:
87 | model.resize_token_embeddings(len(tokenizer))
88 |
89 | return model
90 |
91 |
92 | def gen_batch(records, batch_size):
93 | batch_start = 0
94 | while batch_start < len(records):
95 | batch_end = batch_start + batch_size
96 | batch = records[batch_start: batch_end]
97 | batch_start = batch_end
98 | yield batch
99 |
100 |
101 | def print_special_tokens(tokenizer):
102 | print("Vocab size: ", tokenizer.vocab_size)
103 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token)
104 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token)
105 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token)
106 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token)
107 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token)
108 | return tokenizer
109 |
110 | # PAD: 0
111 | # BOS: 1
112 | # EOS: 2
113 | # UNK: 0
114 | # SEP: 1
115 |
116 | def fix_tokenizer_opt(tokenizer):
117 | # Fixing broken tokenizers
118 | special_tokens = {
119 | 'pad_token': '',
120 | 'bos_token': '',
121 | 'eos_token': '',
122 | 'unk_token': '',
123 | 'sep_token': ''
124 |
125 | }
126 |
127 | tokenizer.add_special_tokens(special_tokens)
128 |
129 | print("Vocab size: ", tokenizer.vocab_size)
130 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token)
131 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token)
132 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token)
133 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token)
134 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token)
135 | return tokenizer
--------------------------------------------------------------------------------
/finetune/samsum-llama/eval_samsum_4bit_bnb.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | # Set up the argument parser
4 | parser = argparse.ArgumentParser(description='Python script to work with models')
5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True)
6 | parser.add_argument('--adapter', type=str, help='adapter ID for huggingface', required=True)
7 | parser.add_argument('--file_name', type=str, help='backup file name', required=True)
8 | parser.add_argument('--seed', type=int, help='model seed number', required=True)
9 |
10 | # Parse the arguments
11 | args = parser.parse_args()
12 |
13 | # Use the command line arguments in your script
14 | print('Model Name:', args.model_name)
15 | print('Adapter Name: ', args.adapter)
16 | print('Output file:', args.file_name)
17 | print('Seed: ', args.seed)
18 |
19 | import random
20 | import json
21 | import os
22 |
23 | # import wandb
24 | import torch
25 | import numpy as np
26 | # import bitsandbytes as bnb
27 | from tqdm import tqdm
28 | import transformers
29 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq
30 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig
31 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
32 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training, PeftModel
33 | from datasets import load_dataset
34 |
35 | from utils import *
36 | from data import *
37 |
38 | import evaluate
39 | import numpy as np
40 | from datasets import load_from_disk
41 | from tqdm import tqdm
42 |
43 |
44 | output_dir = args.adapter
45 | model_name = args.model_name
46 | seed = args.seed
47 | train_sample_rate = 1.0
48 | val_sample_rate = 1.0
49 | local_rank = 0
50 |
51 | set_random_seed(seed)
52 | logging.set_verbosity_info()
53 |
54 | # with open(config_file, "r") as r:
55 | # config = json.load(r)
56 |
57 | os.environ["WANDB_DISABLED"] = "true"
58 |
59 | device_map = "auto"
60 | world_size = int(os.environ.get("WORLD_SIZE", 1))
61 | ddp = world_size != 1
62 |
63 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
64 | tokenizer = fix_tokenizer(tokenizer)
65 | # tokenizer.save_pretrained(output_dir)
66 |
67 | dataset = load_dataset('samsum')
68 | val_records = dataset['test']
69 |
70 | ## Config for llama 7-b
71 | model_type = "causal"
72 | templates_path = "llama_lora_samsum.json"
73 | only_target_loss = False
74 | mode = "instruct"
75 |
76 | model_types = {
77 | "causal": AutoModelForCausalLM,
78 | "seq2seq": AutoModelForSeq2SeqLM
79 | }
80 | load_in_8bit = False
81 | load_in_4bit = True
82 | if load_in_8bit:
83 | assert not load_in_4bit
84 | model = model_types[model_type].from_pretrained(
85 | model_name,
86 | load_in_8bit=True,
87 | device_map=device_map
88 | )
89 | elif load_in_4bit:
90 | assert not load_in_8bit
91 | #use_bf16 = trainer_config.get("bf16", False)
92 | use_bf16 = True
93 | compute_dtype = torch.bfloat16 if use_bf16 else torch.float16
94 | model = model_types[model_type].from_pretrained(
95 | model_name,
96 | load_in_4bit=True,
97 | device_map=device_map,
98 | quantization_config=BitsAndBytesConfig(
99 | load_in_4bit=True,
100 | bnb_4bit_compute_dtype=compute_dtype,
101 | bnb_4bit_use_double_quant=True,
102 | ),
103 | torch_dtype=torch.bfloat16 if use_bf16 else torch.float32
104 | )
105 | else:
106 | model = model_types[model_type].from_pretrained(model_name)
107 |
108 | # Default model generation params
109 | model = fix_model(model, tokenizer, use_resize=False)
110 | model.config.num_beams = 5
111 |
112 |
113 | peft_model_id = args.adapter
114 | model = PeftModel.from_pretrained(model, peft_model_id)
115 |
116 | # Metric
117 | metric = evaluate.load("rouge")
118 |
119 | def evaluate_peft_model(sample,max_target_length=45):
120 | # Load dataset from the hub and get a sample
121 | sample_word = f"### Summarize this: {sample}\n ### Output: "
122 | with torch.inference_mode(), torch.autocast("cuda"):
123 | input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda()
124 | outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens = 45)
125 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"")
126 | print(f"Output:\n{output}")
127 | # Some simple post-processing
128 | return output
129 |
130 | # run predictions
131 | # this can take ~45 minutes
132 | predictions = []
133 | for sample in tqdm(dataset['test']['dialogue']):
134 | p = evaluate_peft_model(sample)
135 | predictions.append(p)
136 |
137 | # compute metric
138 | rouge = metric.compute(predictions=predictions, references=dataset['test']['summary'], use_stemmer=True)
139 |
140 | # print results
141 | print(f'Seed: {seed}')
142 | print(f"Rouge1: {rouge['rouge1'] * 100:.2f}%")
143 | print(f"Rouge2: {rouge['rouge2'] * 100:.2f}%")
144 | print(f"RougeL: {rouge['rougeL'] * 100:.2f}%")
145 | print(f"RougeLsum: {rouge['rougeLsum'] * 100:.2f}%")
146 |
147 | file_name = args.file_name
148 | with open(file_name, 'w') as f:
149 | for item in predictions:
150 | # write each item on a new line
151 | f.write("%s\n" % item)
152 |     f.write(f'Seed: {seed}\n')
153 |     f.write(f"Rouge1: {rouge['rouge1'] * 100:.2f}%\n")
154 |     f.write(f"Rouge2: {rouge['rouge2'] * 100:.2f}%\n")
155 |     f.write(f"RougeL: {rouge['rougeL'] * 100:.2f}%\n")
156 |     f.write(f"RougeLsum: {rouge['rougeLsum'] * 100:.2f}%\n")
--------------------------------------------------------------------------------
/finetune/samsum-llama/data.py:
--------------------------------------------------------------------------------
1 | import random
2 | import json
3 | from typing import Optional
4 | from dataclasses import dataclass
5 | from typing import List, Dict, Tuple, Any
6 |
7 | import numpy as np
8 | import torch
9 | import torch.nn.functional as F
10 | from torch.utils.data import Dataset
11 | from transformers import AutoTokenizer, PreTrainedTokenizerBase
12 | from tqdm import tqdm
13 |
14 |
15 | class InstructDataset(Dataset):
16 | def __init__(
17 | self,
18 | original_records: List[Dict],
19 | tokenizer: AutoTokenizer,
20 | max_source_tokens_count: int,
21 | max_target_tokens_count: int,
22 | templates_path: str,
23 | sample_rate: float = 1.0,
24 | only_target_loss: bool = True,
25 | input_type: str = "causal",
26 | target_field: str = "human_reference",
27 | source_field: str = "input",
28 | use_padding: bool = False
29 | ):
30 | self.original_records = original_records
31 | self.sample_rate = sample_rate
32 | self.tokenizer = tokenizer
33 | self.max_source_tokens_count = max_source_tokens_count
34 | self.max_target_tokens_count = max_target_tokens_count
35 | self.only_target_loss = only_target_loss
36 | self.input_type = input_type
37 | self.target_field = target_field
38 | self.source_field = source_field
39 | self.use_padding = use_padding
40 | self.is_printed = False
41 |
42 | with open(templates_path) as r:
43 | self.templates = json.load(r)
44 |
45 | self.records = []
46 | for record in tqdm(original_records): #original dataset
47 | if random.random() > self.sample_rate:
48 | continue
49 | tensors = self.convert_record(record)
50 | if tensors is None:
51 | continue
52 | self.records.append(tensors)
53 |
54 | def __len__(self):
55 | return len(self.records)
56 |
57 | def __getitem__(self, index):
58 | return self.records[index]
59 |
60 | def convert_record(self, record):
61 | instruction = record["dialogue"]
62 | #inp = record[self.source_field] #basically no use
63 | out = record[self.target_field]
64 | # if inp.strip() != "" and False:
65 | # templates = self.templates["prompts_input"]
66 | # prompt_template = random.choice(templates)
67 | # source = prompt_template.format(instruction=instruction.strip(), inp=inp.strip())
68 | # else:
69 | templates = self.templates["prompts_no_input"] ## This is what we want
70 | prompt_template = random.choice(templates)
71 | source = prompt_template.format(instruction=instruction.strip()) ## put the prompt inside
72 | target = out.strip()
73 | if not self.is_printed:
74 | print("Source and target examples")
75 | print(source)
76 | print(target)
77 | self.is_printed = True
78 | if self.input_type == "causal":
79 | return self.convert_causal(source, target)
80 | elif self.input_type == "seq2seq":
81 | return self.convert_seq2seq(source, target)
82 | else:
83 | assert False
84 |
85 | def convert_causal(self, source, target=None):
86 | source_tokens = self.tokenizer(
87 | source,
88 | add_special_tokens=False,
89 | max_length=self.max_source_tokens_count,
90 | padding=False,
91 | truncation=True
92 | )["input_ids"]
93 |         ## prepend the bos token id
94 |         if self.tokenizer.bos_token_id:
95 |             source_tokens.insert(0, self.tokenizer.bos_token_id)  ## bos_token_id
96 | input_ids = source_tokens[:]
97 | actual_length = len(input_ids)
98 | max_length = self.max_source_tokens_count + self.max_target_tokens_count + 2
99 | if target is not None:
100 | target_tokens = self.tokenizer(
101 | target,
102 | add_special_tokens=False,
103 | max_length=self.max_target_tokens_count,
104 | padding=False,
105 | truncation=True
106 | )["input_ids"]
107 | input_ids += target_tokens + [self.tokenizer.eos_token_id] ## eos_token_id
108 | actual_length = len(input_ids)
109 | if self.use_padding:
110 | padding = [self.tokenizer.pad_token_id for i in range(len(input_ids), max_length)]
111 | input_ids.extend(padding)
112 |
113 | input_ids = torch.LongTensor(input_ids)
114 | labels = input_ids.clone()
115 | attention_mask = input_ids.new_ones(input_ids.size())
116 | if self.use_padding:
117 | labels[actual_length:] = -100
118 | attention_mask[actual_length:] = 0
119 | if self.only_target_loss:
120 | labels[:len(source_tokens)] = -100
121 | assert input_ids.size(0) == labels.size(0) == attention_mask.size(0) <= max_length
122 |
123 | return {
124 | "input_ids": input_ids,
125 | "labels": labels,
126 | "attention_mask": attention_mask
127 | }
128 |
129 | def convert_seq2seq(self, source, target=None):
130 | inputs = self.tokenizer(
131 | source,
132 | add_special_tokens=True,
133 | max_length=self.max_source_tokens_count,
134 | padding=False,
135 | truncation=True,
136 | return_tensors="pt"
137 | )
138 | inputs = {k: v.squeeze(0) for k, v in inputs.items()}
139 | if target is not None:
140 | outputs = self.tokenizer(
141 | target,
142 | add_special_tokens=True,
143 | max_length=self.max_target_tokens_count,
144 | padding=False,
145 | truncation=True,
146 | return_tensors="pt"
147 | )
148 | labels = outputs["input_ids"].squeeze(0).tolist()
149 | if labels[-1] != self.tokenizer.eos_token_id:
150 | labels.append(self.tokenizer.eos_token_id)
151 | inputs["labels"] = torch.LongTensor(labels)
152 | return inputs
153 |
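
InstructDataset turns raw SAMSum records into causal-LM tensors: it formats each dialogue with a randomly chosen template from the JSON prompt file, appends the reference summary plus an EOS token, and, with only_target_loss, masks the prompt portion of the labels. An illustrative construction (the tokenizer path is a placeholder; field names follow the samsum schema used by the training scripts in this folder):

    from datasets import load_dataset
    from transformers import AutoTokenizer
    from data import InstructDataset

    tokenizer = AutoTokenizer.from_pretrained("path/to/llama-7b-hf", use_fast=False)
    records = load_dataset("samsum")["train"]

    train_dataset = InstructDataset(
        records, tokenizer,
        max_source_tokens_count=512,
        max_target_tokens_count=64,
        templates_path="llama_lora_samsum.json",
        only_target_loss=True,
        input_type="causal",
        target_field="summary",
    )
    print(train_dataset[0]["input_ids"].shape)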
--------------------------------------------------------------------------------
/finetune/samsum-opt/data.py:
--------------------------------------------------------------------------------
1 | import random
2 | import json
3 | from typing import Optional
4 | from dataclasses import dataclass
5 | from typing import List, Dict, Tuple, Any
6 |
7 | import numpy as np
8 | import torch
9 | import torch.nn.functional as F
10 | from torch.utils.data import Dataset
11 | from transformers import AutoTokenizer, PreTrainedTokenizerBase
12 | from tqdm import tqdm
13 |
14 |
15 | class InstructDataset(Dataset):
16 | def __init__(
17 | self,
18 | original_records: List[Dict],
19 | tokenizer: AutoTokenizer,
20 | max_source_tokens_count: int,
21 | max_target_tokens_count: int,
22 | templates_path: str,
23 | sample_rate: float = 1.0,
24 | only_target_loss: bool = True,
25 | input_type: str = "causal",
26 | target_field: str = "human_reference",
27 | source_field: str = "input",
28 | use_padding: bool = False
29 | ):
30 | self.original_records = original_records
31 | self.sample_rate = sample_rate
32 | self.tokenizer = tokenizer
33 | self.max_source_tokens_count = max_source_tokens_count
34 | self.max_target_tokens_count = max_target_tokens_count
35 | self.only_target_loss = only_target_loss
36 | self.input_type = input_type
37 | self.target_field = target_field
38 | self.source_field = source_field
39 | self.use_padding = use_padding
40 | self.is_printed = False
41 |
42 | with open(templates_path) as r:
43 | self.templates = json.load(r)
44 |
45 | self.records = []
46 | for record in tqdm(original_records): #original dataset
47 | if random.random() > self.sample_rate:
48 | continue
49 | tensors = self.convert_record(record)
50 | if tensors is None:
51 | continue
52 | self.records.append(tensors)
53 |
54 | def __len__(self):
55 | return len(self.records)
56 |
57 | def __getitem__(self, index):
58 | return self.records[index]
59 |
60 | def convert_record(self, record):
61 | instruction = record["dialogue"]
62 | #inp = record[self.source_field] #basically no use
63 | out = record[self.target_field]
64 | # if inp.strip() != "" and False:
65 | # templates = self.templates["prompts_input"]
66 | # prompt_template = random.choice(templates)
67 | # source = prompt_template.format(instruction=instruction.strip(), inp=inp.strip())
68 | # else:
69 | templates = self.templates["prompts_no_input"] ## This is what we want
70 | prompt_template = random.choice(templates)
71 | source = prompt_template.format(instruction=instruction.strip()) ## put the prompt inside
72 | target = out.strip()
73 | if not self.is_printed:
74 | print("Source and target examples")
75 | print(source)
76 | print(target)
77 | self.is_printed = True
78 | if self.input_type == "causal":
79 | return self.convert_causal(source, target)
80 | elif self.input_type == "seq2seq":
81 | return self.convert_seq2seq(source, target)
82 | else:
83 | assert False
84 |
85 | def convert_causal(self, source, target=None):
86 | source_tokens = self.tokenizer(
87 | source,
88 | add_special_tokens=False,
89 | max_length=self.max_source_tokens_count,
90 | padding=False,
91 | truncation=True
92 | )["input_ids"]
93 |         ## prepend the bos token id
94 |         if self.tokenizer.bos_token_id:
95 |             source_tokens.insert(0, self.tokenizer.bos_token_id)  ## bos_token_id
96 | input_ids = source_tokens[:]
97 | actual_length = len(input_ids)
98 | max_length = self.max_source_tokens_count + self.max_target_tokens_count + 2
99 | if target is not None:
100 | target_tokens = self.tokenizer(
101 | target,
102 | add_special_tokens=False,
103 | max_length=self.max_target_tokens_count,
104 | padding=False,
105 | truncation=True
106 | )["input_ids"]
107 | input_ids += target_tokens + [self.tokenizer.eos_token_id] ## eos_token_id
108 | actual_length = len(input_ids)
109 | if self.use_padding:
110 | padding = [self.tokenizer.pad_token_id for i in range(len(input_ids), max_length)]
111 | input_ids.extend(padding)
112 |
113 | input_ids = torch.LongTensor(input_ids)
114 | labels = input_ids.clone()
115 | attention_mask = input_ids.new_ones(input_ids.size())
116 | if self.use_padding:
117 | labels[actual_length:] = -100
118 | attention_mask[actual_length:] = 0
119 | if self.only_target_loss:
120 | labels[:len(source_tokens)] = -100
121 | assert input_ids.size(0) == labels.size(0) == attention_mask.size(0) <= max_length
122 |
123 | return {
124 | "input_ids": input_ids,
125 | "labels": labels,
126 | "attention_mask": attention_mask
127 | }
128 |
129 | def convert_seq2seq(self, source, target=None):
130 | inputs = self.tokenizer(
131 | source,
132 | add_special_tokens=True,
133 | max_length=self.max_source_tokens_count,
134 | padding=False,
135 | truncation=True,
136 | return_tensors="pt"
137 | )
138 | inputs = {k: v.squeeze(0) for k, v in inputs.items()}
139 | if target is not None:
140 | outputs = self.tokenizer(
141 | target,
142 | add_special_tokens=True,
143 | max_length=self.max_target_tokens_count,
144 | padding=False,
145 | truncation=True,
146 | return_tensors="pt"
147 | )
148 | labels = outputs["input_ids"].squeeze(0).tolist()
149 | if labels[-1] != self.tokenizer.eos_token_id:
150 | labels.append(self.tokenizer.eos_token_id)
151 | inputs["labels"] = torch.LongTensor(labels)
152 | return inputs
153 |
--------------------------------------------------------------------------------
/finetune/samsum-opt/eval_samsum_opt_4bit_llmtune.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | # Set up the argument parser
4 | parser = argparse.ArgumentParser(description='Python script to work with models')
5 | parser.add_argument('--model_name', type=str, help='HF model name with your user', required=True)
6 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True)
7 | parser.add_argument('--file_name', type=str, help='backup file name', required=True)
8 | parser.add_argument('--seed', type=int, help='model seed number', required=True)
9 |
10 | # Parse the arguments
11 | args = parser.parse_args()
12 |
13 | # Use the command line arguments in your script
14 | print('Model Name:', args.model_name)
15 | print('Adapter Path: ', args.adapter)
16 | print('Seed: ', args.seed)
17 |
18 | import random
19 | import json
20 | import os
21 |
22 | # import wandb
23 | import torch
24 | import numpy as np
25 | # import bitsandbytes as bnb
26 | from tqdm import tqdm
27 | import transformers
28 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq
29 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig
30 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
31 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training
32 | from datasets import load_dataset
33 |
34 | from utils import *
35 | from data import *
36 |
37 | import evaluate
38 | import numpy as np
39 | from datasets import load_from_disk
40 | from tqdm import tqdm
41 |
42 | from llmtune.llms.autollm import AutoLLMForCausalLM
43 | from llmtune.engine.lora.config import FinetuneConfig
44 | from llmtune.engine.lora.peft import quant_peft
45 | from llmtune.utils import to_half_precision
46 |
47 | output_dir = args.adapter
48 | seed = args.seed
49 | train_sample_rate = 1.0
50 | val_sample_rate = 1.0
51 | local_rank = 0
52 |
53 | # model config
54 | model_name = args.model_name
55 | tokenizer_name = "facebook/opt-6.7b"
56 | DEV = 'cuda'
57 |
58 | set_random_seed(42)
59 | logging.set_verbosity_info()
60 |
61 | # with open(config_file, "r") as r:
62 | # config = json.load(r)
63 |
64 | device_map = "auto"
65 | world_size = int(os.environ.get("WORLD_SIZE", 1))
66 | ddp = world_size != 1
67 |
68 | transformers.logging.set_verbosity_info()
69 |
70 | # load tokenizer
71 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
72 | tokenizer.pad_token_id = 0
73 | ## Fix Tokenizer
74 | tokenizer = fix_tokenizer_opt(tokenizer)
75 |
76 | # load model
77 | llm = AutoLLMForCausalLM.from_pretrained(model_name)
78 | ## Fix Model
79 | llm = fix_model(llm, tokenizer, use_resize=False)
80 | llm.eval()
81 | llm = llm.to(DEV)
82 | llm = to_half_precision(llm)
83 |
84 |
85 |
86 | ## dataset
87 | dataset = load_dataset('samsum')
88 | train_records = dataset['train']
89 | val_records = dataset['test']
90 | #random.shuffle(train_records)
91 | print("train_record[0]: ",train_records[0])
92 |
93 | ## Prompt/dataset config for the OPT samsum run
94 | model_type = "causal"
95 | templates_path = "llama_lora_samsum.json"
96 | only_target_loss = False
97 | mode = "instruct"
98 |
99 |
100 | adapter_path = args.adapter
101 | model = quant_peft.PeftModel.from_pretrained(
102 | llm, adapter_path,
103 | device_map='auto'
104 | )
105 | print(adapter_path, 'loaded')
106 |
107 |
108 | # Model configs
109 | model.config.num_beams = 5
110 |
111 |
112 | # Metric
113 | metric = evaluate.load("rouge")
114 |
115 | def evaluate_peft_model(sample,max_target_length=45):
116 | # Load dataset from the hub and get a sample
117 | sample_word = f"### Summarize this: {sample}\n ### Output: "
118 | input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda()
119 | # with torch.inference_mode(), torch.autocast("cuda"):
120 | print("input_ids: ",input_ids)
121 | outputs = model.generate(input_ids=input_ids, do_sample=True, max_new_tokens = 45)
122 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"")
123 | print(f"Output:\n{output}")
124 | # Some simple post-processing
125 | return output
126 |
127 | # run predictions
128 | # this can take ~45 minutes
129 | predictions = []
130 | for sample in tqdm(dataset['test']['dialogue']):
131 | p = evaluate_peft_model(sample)
132 | predictions.append(p)
133 |
134 | # compute metric
135 |
136 |
137 | file_name = args.file_name
138 | # with open(file_name, 'w') as f:
139 | # for item in predictions:
140 | # # write each item on a new line
141 | # f.write("%s\n" % item)
142 | # f.write(f'Seed: {seed}')
143 |
144 |
145 | # def process_file(filename):
146 | # output_list = []
147 | # delete_lines = False
148 | # with open(filename, 'r') as file:
149 | # for line in file:
150 | # stripped_line = line.strip()
151 | # if stripped_line.startswith("### Summarize this:"):
152 | # delete_lines = True
153 | # continue
154 | # elif stripped_line.startswith("### Output: "):
155 | # output = stripped_line[len("### Output: "):]
156 | # output_list.append(output)
157 | # delete_lines = False
158 | # continue
159 |
160 | # if not delete_lines:
161 | # output_list.append(stripped_line)
162 |
163 | # return output_list
164 |
165 | # predictions = process_file(file_name)
166 | # predictions.pop()
167 |
168 | rouge = metric.compute(predictions=predictions, references=dataset['test']['summary'], use_stemmer=True)
169 |
170 | # print results
171 | print(f"rouge1: {rouge['rouge1']* 100:.2f}%")
172 | print(f"rouge2: {rouge['rouge2']* 100:.2f}%")
173 | print(f"rougeL: {rouge['rougeL']* 100:.2f}%")
174 | print(f"rougeLsum: {rouge['rougeLsum']* 100:.2f}%")
175 |
176 | with open(file_name, 'w') as f:
177 | for item in predictions:
178 | # write each item on a new line
179 | f.write("%s\n" % item)
180 | f.write(f'Seed: {seed}\n')
181 | f.write(f"rouge1: {rouge['rouge1']* 100:.2f}%\n")
182 | f.write(f"rouge2: {rouge['rouge2']* 100:.2f}%\n")
183 | f.write(f"rougeL: {rouge['rougeL']* 100:.2f}%\n")
184 | f.write(f"rougeLsum: {rouge['rougeLsum']* 100:.2f}%\n")
185 |
186 |
--------------------------------------------------------------------------------
/finetune/mnli-llama/data_mnli_label.py:
--------------------------------------------------------------------------------
1 | import random
2 | import json
3 | from typing import Optional
4 | from dataclasses import dataclass
5 | from typing import List, Dict, Tuple, Any
6 |
7 | import numpy as np
8 | import torch
9 | import torch.nn.functional as F
10 | from torch.utils.data import Dataset
11 | from transformers import AutoTokenizer, PreTrainedTokenizerBase
12 | from tqdm import tqdm
13 |
14 |
15 | class InstructDataset(Dataset):
16 | def __init__(
17 | self,
18 | original_records: List[Dict],
19 | tokenizer: AutoTokenizer,
20 | max_source_tokens_count: int,
21 | max_target_tokens_count: int,
22 | templates_path: str,
23 | sample_rate: float = 1.0,
24 | only_target_loss: bool = True,
25 | input_type: str = "causal",
26 | target_field: str = "human_reference",
27 | source_field: str = "input",
28 | use_padding: bool = False
29 | ):
30 | self.original_records = original_records
31 | self.sample_rate = sample_rate
32 | self.tokenizer = tokenizer
33 | self.max_source_tokens_count = max_source_tokens_count
34 | self.max_target_tokens_count = max_target_tokens_count
35 | self.only_target_loss = only_target_loss
36 | self.input_type = input_type
37 | self.target_field = target_field
38 | self.source_field = source_field
39 | self.use_padding = use_padding
40 | self.is_printed = False
41 |
42 | with open(templates_path) as r:
43 | self.templates = json.load(r)
44 |
45 | self.records = []
46 | for record in tqdm(original_records): #original dataset
47 | if random.random() > self.sample_rate:
48 | continue
49 | tensors = self.convert_record(record)
50 | if tensors is None:
51 | continue
52 | self.records.append(tensors)
53 |
54 | def __len__(self):
55 | return len(self.records)
56 |
57 | def __getitem__(self, index):
58 | return self.records[index]
59 |
60 | def convert_record(self, record):
61 | instruction = record["premise"]
62 | hypothesis = record["hypothesis"]
63 | genre = record["genre"]
64 | #inp = record[self.source_field] #basically no use
65 | out = record["label"]
66 | # if inp.strip() != "" and False:
67 | # templates = self.templates["prompts_input"]
68 | # prompt_template = random.choice(templates)
69 | # source = prompt_template.format(instruction=instruction.strip(), inp=inp.strip())
70 | # else:
71 | templates = self.templates["prompts_no_input"] ## This is what we want
72 | prompt_template = random.choice(templates)
73 | source = prompt_template.format(instruction=instruction.strip(), hypothesis=hypothesis.strip(), genre=genre.strip()) ## put the prompt inside
74 | target = str(out)
75 | if not self.is_printed:
76 | print("Source and target examples")
77 | print(source)
78 | print(target)
79 | self.is_printed = True
80 | if self.input_type == "causal":
81 | return self.convert_causal(source, target)
82 | elif self.input_type == "seq2seq":
83 | return self.convert_seq2seq(source, target)
84 | else:
85 | assert False
86 |
87 | def convert_causal(self, source, target=None):
88 | source_tokens = self.tokenizer(
89 | source,
90 | add_special_tokens=False,
91 | max_length=self.max_source_tokens_count,
92 | padding=False,
93 | truncation=True
94 | )["input_ids"]
95 | ## prepend bos_token_id when the tokenizer defines one
96 | if self.tokenizer.bos_token_id:
97 | source_tokens.insert(0, self.tokenizer.bos_token_id) ## bos_token_id
98 | input_ids = source_tokens[:]
99 | actual_length = len(input_ids)
100 | max_length = self.max_source_tokens_count + self.max_target_tokens_count + 2
101 | if target is not None:
102 | target_tokens = self.tokenizer(
103 | target,
104 | add_special_tokens=False,
105 | max_length=self.max_target_tokens_count,
106 | padding=False,
107 | truncation=True
108 | )["input_ids"]
109 | input_ids += target_tokens + [self.tokenizer.eos_token_id] ## eos_token_id
110 | actual_length = len(input_ids)
111 | if self.use_padding:
112 | padding = [self.tokenizer.pad_token_id for i in range(len(input_ids), max_length)]
113 | input_ids.extend(padding)
114 |
115 | input_ids = torch.LongTensor(input_ids)
116 | labels = input_ids.clone()
117 | attention_mask = input_ids.new_ones(input_ids.size())
118 | if self.use_padding:
119 | labels[actual_length:] = -100
120 | attention_mask[actual_length:] = 0
121 | if self.only_target_loss:
122 | labels[:len(source_tokens)] = -100
123 | assert input_ids.size(0) == labels.size(0) == attention_mask.size(0) <= max_length
124 |
125 | return {
126 | "input_ids": input_ids,
127 | "labels": labels,
128 | "attention_mask": attention_mask
129 | }
130 |
131 | def convert_seq2seq(self, source, target=None):
132 | inputs = self.tokenizer(
133 | source,
134 | add_special_tokens=True,
135 | max_length=self.max_source_tokens_count,
136 | padding=False,
137 | truncation=True,
138 | return_tensors="pt"
139 | )
140 | inputs = {k: v.squeeze(0) for k, v in inputs.items()}
141 | if target is not None:
142 | outputs = self.tokenizer(
143 | target,
144 | add_special_tokens=True,
145 | max_length=self.max_target_tokens_count,
146 | padding=False,
147 | truncation=True,
148 | return_tensors="pt"
149 | )
150 | labels = outputs["input_ids"].squeeze(0).tolist()
151 | if labels[-1] != self.tokenizer.eos_token_id:
152 | labels.append(self.tokenizer.eos_token_id)
153 | inputs["labels"] = torch.LongTensor(labels)
154 | return inputs
155 |
--------------------------------------------------------------------------------
/llmtune/engine/quant/gptq/algorithm.py:
--------------------------------------------------------------------------------
1 | import math
2 | import time
3 |
4 | import torch
5 | import torch.nn as nn
6 | import transformers
7 |
8 | DEBUG = False
9 | torch.backends.cuda.matmul.allow_tf32 = False
10 | torch.backends.cudnn.allow_tf32 = False
11 |
12 | def quantize(x, scale, zero, maxq):
13 | if maxq < 0:
14 | return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
15 | q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
16 | return scale * (q - zero)
17 |
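# The GPTQ object below implements the per-layer core of the GPTQ algorithm:
# add_batch accumulates the Hessian H ~ 2/nsamples * sum(x x^T) over calibration inputs,
# and fasterquant then quantizes the weight matrix column-by-column in blocks, using the
# Cholesky factor of the damped inverse Hessian to propagate each column's quantization
# error into the columns that have not been quantized yet.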
18 | class GPTQ:
19 | def __init__(self, layer):
20 | self.layer = layer
21 | self.dev = self.layer.weight.device
22 | W = layer.weight.data.clone()
23 | if isinstance(self.layer, nn.Conv2d):
24 | W = W.flatten(1)
25 | if isinstance(self.layer, transformers.Conv1D):
26 | W = W.t()
27 | self.rows = W.shape[0]
28 | self.columns = W.shape[1]
29 | self.H = torch.zeros((self.columns, self.columns), device=self.dev)
30 | self.nsamples = 0
31 |
32 | def add_batch(self, inp, out):
33 | if DEBUG:
34 | self.inp1 = inp
35 | self.out1 = out
36 | if len(inp.shape) == 2:
37 | inp = inp.unsqueeze(0)
38 | tmp = inp.shape[0]
39 | if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D):
40 | if len(inp.shape) == 3:
41 | inp = inp.reshape((-1, inp.shape[-1]))
42 | inp = inp.t()
43 | if isinstance(self.layer, nn.Conv2d):
44 | unfold = nn.Unfold(
45 | self.layer.kernel_size,
46 | dilation=self.layer.dilation,
47 | padding=self.layer.padding,
48 | stride=self.layer.stride
49 | )
50 | inp = unfold(inp)
51 | inp = inp.permute([1, 0, 2])
52 | inp = inp.flatten(1)
53 | self.H *= self.nsamples / (self.nsamples + tmp)
54 | self.nsamples += tmp
55 | # inp = inp.float()
56 | inp = math.sqrt(2 / self.nsamples) * inp.float()
57 | # self.H += 2 / self.nsamples * inp.matmul(inp.t())
58 | self.H += inp.matmul(inp.t())
59 |
60 | def fasterquant(
61 | self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False
62 | ):
63 | W = self.layer.weight.data.clone()
64 | if isinstance(self.layer, nn.Conv2d):
65 | W = W.flatten(1)
66 | if isinstance(self.layer, transformers.Conv1D):
67 | W = W.t()
68 | W = W.float()
69 |
70 | tick = time.time()
71 |
72 | if not self.quantizer.ready():
73 | self.quantizer.find_params(W, weight=True)
74 |
75 | H = self.H
76 | del self.H
77 | dead = torch.diag(H) == 0
78 | H[dead, dead] = 1
79 | W[:, dead] = 0
80 |
81 | if actorder:
82 | perm = torch.argsort(torch.diag(H), descending=True)
83 | W = W[:, perm]
84 | H = H[perm][:, perm]
85 |
86 | Losses = torch.zeros_like(W)
87 | Q = torch.zeros_like(W)
88 |
89 | damp = percdamp * torch.mean(torch.diag(H))
90 | diag = torch.arange(self.columns, device=self.dev)
91 | H[diag, diag] += damp
92 | H = torch.linalg.cholesky(H)
93 | H = torch.cholesky_inverse(H)
94 | H = torch.linalg.cholesky(H, upper=True)
95 | Hinv = H
96 |
97 | g_idx = []
98 | scale = []
99 | zero = []
100 | now_idx = 1
101 |
102 | for i1 in range(0, self.columns, blocksize):
103 | i2 = min(i1 + blocksize, self.columns)
104 | count = i2 - i1
105 |
106 | W1 = W[:, i1:i2].clone()
107 | Q1 = torch.zeros_like(W1)
108 | Err1 = torch.zeros_like(W1)
109 | Losses1 = torch.zeros_like(W1)
110 | Hinv1 = Hinv[i1:i2, i1:i2]
111 |
112 | for i in range(count):
113 | w = W1[:, i]
114 | d = Hinv1[i, i]
115 |
116 | if groupsize != -1:
117 | if (i1 + i) % groupsize == 0:
118 | self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True)
119 |
120 | if ((i1 + i) // groupsize) - now_idx == -1:
121 | scale.append(self.quantizer.scale)
122 | zero.append(self.quantizer.zero)
123 | now_idx += 1
124 |
125 | q = quantize(
126 | w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq
127 | ).flatten()
128 | Q1[:, i] = q
129 | Losses1[:, i] = (w - q) ** 2 / d ** 2
130 |
131 | err1 = (w - q) / d
132 | W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
133 | Err1[:, i] = err1
134 |
135 | Q[:, i1:i2] = Q1
136 | Losses[:, i1:i2] = Losses1 / 2
137 |
138 | W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])
139 |
140 | if DEBUG:
141 | self.layer.weight.data[:, :i2] = Q[:, :i2]
142 | self.layer.weight.data[:, i2:] = W[:, i2:]
143 | print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
144 | print(torch.sum(Losses))
145 |
146 | torch.cuda.synchronize()
147 | print('time %.2f' % (time.time() - tick))
148 | print('error', torch.sum(Losses).item())
149 |
150 | groupsize = groupsize if groupsize != -1 else self.columns
151 | g_idx = [i // groupsize for i in range(self.columns)]
152 | g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device)
153 | if actorder:
154 | invperm = torch.argsort(perm)
155 | Q = Q[:, invperm]
156 | g_idx = g_idx[invperm]
157 |
158 | if isinstance(self.layer, transformers.Conv1D):
159 | Q = Q.t()
160 | self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype)
161 | if DEBUG:
162 | print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
163 |
164 | if scale == []:
165 | scale.append(self.quantizer.scale)
166 | zero.append(self.quantizer.zero)
167 | scale = torch.cat(scale,dim=1)
168 | zero = torch.cat(zero,dim=1)
169 | return scale,zero,g_idx
170 |
171 | def free(self):
172 | if DEBUG:
173 | self.inp1 = None
174 | self.out1 = None
175 | self.H = None
176 | self.Losses = None
177 | self.Trace = None
178 | torch.cuda.empty_cache()
179 |
--------------------------------------------------------------------------------
/llmtune/llms/autollm.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | from torch import nn
4 | from typing import Dict, List, Optional, Union
5 | from transformers import AutoTokenizer
6 | from transformers.utils.hub import (
7 | PushToHubMixin, cached_file, create_repo,
8 | create_commit, CommitOperationAdd
9 | )
10 | from llmtune.llms.config import AutoLLMConfig, LLMType
11 | from llmtune.llms.llama.model import load_llama, load_llama_tokenizer
12 | from llmtune.llms.opt.model import load_opt, load_opt_tokenizer
13 | from llmtune.llms.bloom.model import load_bloom, load_bloom_tokenizer
14 |
15 | def get_default_tokenizer(name_or_path, model_type=None):
16 | if model_type is not None:
17 | if model_type == 'llama':
18 | return load_llama_tokenizer(name_or_path)
19 | elif model_type == 'opt':
20 | return load_opt_tokenizer(name_or_path)
21 | elif model_type == 'bloom':
22 | return load_bloom_tokenizer(name_or_path)
23 | else:
24 | raise ValueError()
25 | else:
26 | return AutoTokenizer.from_pretrained(name_or_path)
27 |
28 | class AutoLLMForCausalLM(nn.Module, PushToHubMixin):
29 | def __init__(
30 | self,
31 | base_model,
32 | llm_config
33 | ):
34 | super().__init__()
35 | self.base_model = base_model
36 | self.llm_config = llm_config
37 |
38 | @property
39 | def is_quantized(self):
40 | return self.llm_config.is_quantized
41 |
42 | def set_quant_config(self, quant_config):
43 | self.llm_config.set_quant_config(quant_config)
44 |
45 | @property
46 | def device(self):
47 | if not self.hf_device_map:
48 | return self.base_model.device
49 | else:
50 | device = [
51 | d for d in self.hf_device_map.values()
52 | if d not in {'cpu', 'disk'}
53 | ][0]
54 | return torch.device(device)
55 |
56 | @property
57 | def hf_device_map(self):
58 | return getattr(self.base_model, "hf_device_map", None)
59 |
60 | @property
61 | def config(self):
62 | return self.base_model.config
63 |
64 | @property
65 | def _keys_to_ignore_on_save(self):
66 | return self.base_model._keys_to_ignore_on_save
67 |
68 | @property
69 | def _no_split_modules(self):
70 | return self.base_model._no_split_modules
71 |
72 | def to(self, device: Union[str, torch.device]):
73 | self.base_model = self.base_model.to(device)
74 | return self
75 |
76 | def forward(self, *args, **kwargs):
77 | return self.base_model(*args, **kwargs)
78 |
79 | def generate(self, **kwargs):
80 | with (
81 | torch.inference_mode(),
82 | torch.amp.autocast(device_type=self.device.type)
83 | ):
84 | return self.base_model.generate(**kwargs)
85 |
86 | def prepare_inputs_for_generation(self, *args, **kwargs):
87 | return self.base_model.prepare_inputs_for_generation(*args, **kwargs)
88 |
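# Typical usage (illustrative sketch; paths/ids are placeholders):
#   llm = AutoLLMForCausalLM.from_pretrained("<local-dir-or-hub-id>")
#   llm = llm.to("cuda")
#   out = llm.generate(input_ids=..., max_new_tokens=32)
#   llm.save_pretrained("./saved")  # writes quantized_weights.pt when the model is quantized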
89 | @classmethod
90 | def from_pretrained(
91 | cls,
92 | model_name_or_path: str,
93 | device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None,
94 | device: Optional[Union[str, int]] = None,
95 | ):
96 | # load config
97 | llm_config = AutoLLMConfig.from_pretrained(model_name_or_path)
98 | load_quantized = llm_config.quant_config is not None
99 |
100 | # resolve path to checkpoint (could be None)
101 | checkpoint = None
102 | if load_quantized:
103 | if os.path.isdir(model_name_or_path):
104 | checkpoint = os.path.join(
105 | model_name_or_path, 'quantized_weights.pt'
106 | )
107 | else: # remote
108 | checkpoint = cached_file(
109 | model_name_or_path, 'quantized_weights.pt'
110 | )
111 | if checkpoint is None:
112 | raise FileNotFoundError(
113 | f"Couldn't find quantized weights in {model_name_or_path}"
114 | )
115 |
116 | # load base model
117 | if llm_config.model_type == LLMType.LLAMA.value:
118 | model = load_llama(llm_config, checkpoint)
119 | elif llm_config.model_type == LLMType.OPT.value:
120 | model = load_opt(llm_config, checkpoint)
121 | elif llm_config.model_type == LLMType.BLOOM.value:
122 | model = load_bloom(llm_config, checkpoint)
123 | else:
124 | raise NotImplementedError(
125 | f'{llm_config.model_type} not supported'
126 | )
127 |
128 | return cls(model, llm_config)
129 |
130 | def save_pretrained(self, save_dir: str):
131 | os.makedirs(save_dir, exist_ok=True)
132 |
133 |
134 | # save config
135 | self.llm_config.save_pretrained(save_dir)
136 |
137 | # save base model
138 | self.base_model.to('cpu')
139 | print(self.llm_config.quant_config)
140 | if not self.is_quantized:
141 | self.base_model.save_pretrained(save_dir)
142 | else:
143 | torch.save(
144 | self.base_model.state_dict(),
145 | os.path.join(save_dir, 'quantized_weights.pt')
146 | )
147 | self.llm_config.base_config.model_name_or_path = save_dir
148 |
149 | def push_to_hub(
150 | self,
151 | repo_id: str,
152 | save_dir: str,
153 | commit_message: Optional[str] = "",
154 | use_auth_token: Optional[Union[bool, str]] = None,
155 | private: Optional[bool] = None,
156 | token: Optional[Union[bool, str]] = None,
157 | create_pr: Optional[bool] = False,
158 | ) -> str:
159 |
160 | if not os.path.exists(save_dir):
161 | print(f"Saving model to {save_dir}")
162 | self.save_pretrained(save_dir)
163 |
164 | repo_url = create_repo(
165 | repo_id=repo_id, token=token, private=private,
166 | exist_ok=True, repo_type="model"
167 | )
168 | repo_id = repo_url.repo_id
169 |
170 | operations = [
171 | CommitOperationAdd(
172 | path_or_fileobj=os.path.join(save_dir, f),
173 | path_in_repo=f
174 | )
175 | for f in os.listdir(save_dir)
176 | ]
177 | print(
178 | f"Uploading the following files to {repo_id}: "
179 | f"{','.join(os.listdir(save_dir))}"
180 | )
181 | return create_commit(
182 | repo_id=repo_id,
183 | operations=operations,
184 | commit_message=commit_message,
185 | token=use_auth_token,
186 | create_pr=create_pr,
187 | repo_type="model",
188 | )
189 |
190 |
--------------------------------------------------------------------------------
/llmtune/engine/inference/modules.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 | from llmtune.engine.inference.autograd import (
6 | Autograd2bit, Autograd4bit, Autograd3bit
7 | )
8 |
9 | try:
10 | import quant_cuda
11 | except ImportError:
12 | print('CUDA extension not installed. Inference will not work.')
13 |
14 | # Assumes layer is perfectly divisible into 256 * 256 blocks
15 | class QuantLinear(nn.Module):
16 | def __init__(
17 | self, bits, groupsize, in_features, out_features, bias, is_cuda=True
18 | ):
19 | super().__init__()
20 | if bits not in [2,3,4,8]:
21 | raise NotImplementedError("Only 2,3,4,8 bits are supported.")
22 | self.in_features = in_features
23 | self.out_features = out_features
24 | self.bits = bits
25 | self.groupsize = groupsize if groupsize != -1 else in_features
26 | self.maxq = 2 ** self.bits - 1
27 |
28 | self.register_buffer('qweight', torch.zeros((in_features // 32 * self.bits, out_features), dtype=torch.int32))
29 | self.register_buffer('qzeros', torch.zeros((math.ceil(in_features / self.groupsize), out_features // 32 * self.bits), dtype=torch.int32))
30 | self.register_buffer('scales', torch.zeros((math.ceil(in_features / self.groupsize), out_features), dtype=torch.float16))
31 | self.register_buffer('g_idx', torch.tensor([i // self.groupsize for i in range(in_features)], dtype = torch.int32))
32 | if bias is not None:
33 | self.register_buffer('bias', torch.zeros((out_features),dtype=torch.float16))
34 | else:
35 | self.bias = None
36 |
37 | # shift tables used when the matmul is performed by unpacking the weights and using torch.matmul
38 | if self.bits in [2,4,8]:
39 | self.register_buffer('wf',torch.tensor(list(range(0,32,self.bits)), dtype=torch.int32).unsqueeze(0),persistent=False)
40 | elif self.bits == 3:
41 | self.register_buffer('wf', torch.tensor([[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
42 | [0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
43 | [0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],], dtype=torch.int32).reshape(1,3,12), persistent=False)
44 |
45 | self.is_cuda = is_cuda
46 |
47 | def pack(self, linear, scales, zeros, g_idx = None):
48 | self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
49 |
50 | scales = scales.t().contiguous()
51 | zeros = zeros.t().contiguous()
52 | scale_zeros = zeros * scales
53 | self.scales = scales.clone().half()
54 | if linear.bias is not None:
55 | self.bias = linear.bias.clone().half()
56 |
57 | intweight = []
58 | for idx in range(self.in_features):
59 | intweight.append(
60 | torch.round(
61 | (linear.weight.data[:,idx] + scale_zeros[self.g_idx[idx]])
62 | / self.scales[self.g_idx[idx]]).to(torch.int)[:,None]
63 | )
64 | intweight = torch.cat(intweight,dim=1)
65 | intweight = intweight.t().contiguous()
66 | intweight = intweight.numpy().astype(np.uint32)
67 | qweight = np.zeros(
68 | (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32
69 | )
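# Packing scheme: each column of intweight holds the rounded integer weights
# (weight / scale + zero); the loops below pack 32 // bits of these values into every
# int32 word of qweight (and likewise for qzeros). The 3-bit branch needs extra handling
# because 3-bit values straddle 32-bit word boundaries.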
70 | i = 0
71 | row = 0
72 | while row < qweight.shape[0]:
73 | if self.bits in [2,4,8]:
74 | for j in range(i, i + (32//self.bits)):
75 | qweight[row] |= intweight[j] << (self.bits * (j - i))
76 | i += 32//self.bits
77 | row += 1
78 | elif self.bits == 3:
79 | for j in range(i, i + 10):
80 | qweight[row] |= intweight[j] << (3 * (j - i))
81 | i += 10
82 | qweight[row] |= intweight[i] << 30
83 | row += 1
84 | qweight[row] |= (intweight[i] >> 2) & 1
85 | i += 1
86 | for j in range(i, i + 10):
87 | qweight[row] |= intweight[j] << (3 * (j - i) + 1)
88 | i += 10
89 | qweight[row] |= intweight[i] << 31
90 | row += 1
91 | qweight[row] |= (intweight[i] >> 1) & 0x3
92 | i += 1
93 | for j in range(i, i + 10):
94 | qweight[row] |= intweight[j] << (3 * (j - i) + 2)
95 | i += 10
96 | row += 1
97 | else:
98 | raise NotImplementedError("Only 2,3,4,8 bits are supported.")
99 |
100 | qweight = qweight.astype(np.int32)
101 | self.qweight = torch.from_numpy(qweight)
102 |
103 | zeros -= 1
104 | zeros = zeros.numpy().astype(np.uint32)
105 | qzeros = np.zeros(
106 | (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32
107 | )
108 | i = 0
109 | col = 0
110 | while col < qzeros.shape[1]:
111 | if self.bits in [2,4,8]:
112 | for j in range(i, i + (32//self.bits)):
113 | qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
114 | i += 32//self.bits
115 | col += 1
116 | elif self.bits == 3:
117 | for j in range(i, i + 10):
118 | qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
119 | i += 10
120 | qzeros[:, col] |= zeros[:, i] << 30
121 | col += 1
122 | qzeros[:, col] |= (zeros[:, i] >> 2) & 1
123 | i += 1
124 | for j in range(i, i + 10):
125 | qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
126 | i += 10
127 | qzeros[:, col] |= zeros[:, i] << 31
128 | col += 1
129 | qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
130 | i += 1
131 | for j in range(i, i + 10):
132 | qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
133 | i += 10
134 | col += 1
135 | else:
136 | raise NotImplementedError("Only 2,3,4,8 bits are supported.")
137 |
138 | qzeros = qzeros.astype(np.int32)
139 | self.qzeros = torch.from_numpy(qzeros)
140 |
141 | def forward(self, x):
142 | if self.bits == 4:
143 | out = Autograd4bit.apply(
144 | x,
145 | self.qweight,
146 | self.scales,
147 | self.qzeros,
148 | self.g_idx,
149 | )
150 | if self.bias is not None:
151 | out += self.bias
152 | elif self.bits == 2:
153 | out = Autograd2bit.apply(
154 | x,
155 | self.qweight,
156 | self.scales,
157 | self.qzeros,
158 | self.g_idx,
159 | )
160 | if self.bias is not None:
161 | out += self.bias
162 | elif self.bits == 3:
163 | out = Autograd3bit.apply(
164 | x,
165 | self.qweight,
166 | self.scales,
167 | self.qzeros,
168 | self.g_idx,
169 | self.wf,
170 | self.out_features,
171 | )
172 | if self.bias is not None:
173 | out += self.bias
174 | else:
175 | raise NotImplementedError()
176 | return out
177 |
--------------------------------------------------------------------------------
/llmtune/engine/inference/autograd.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 | from . import matmult as mm
6 | from torch.cuda.amp import custom_bwd, custom_fwd
7 |
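# The Function subclasses in this module wrap the packed GPTQ weights so they can be used
# inside a normal autograd graph: forward reconstructs a half-precision weight (via the
# kernels in matmult or the unpack_* helpers further down) and runs a dense matmul;
# backward only produces a gradient w.r.t. the activations, since the quantized weights
# stay frozen during LoRA finetuning.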
8 | class Autograd4bit(torch.autograd.Function):
9 | @staticmethod
10 | @custom_fwd(cast_inputs=torch.float16)
11 | def forward(ctx, x, qweight, scales, zeros, g_idx):
12 | ctx.save_for_backward(qweight, scales, zeros, g_idx)
13 | if g_idx is None:
14 | output = mm._matmul4bit_v1_recons(
15 | x, qweight, scales, zeros
16 | )
17 | else:
18 | output = mm._matmul4bit_v2_recons(
19 | x, qweight, scales, zeros, g_idx
20 | )
21 | output = output.clone()
22 | return output
23 |
24 | @staticmethod
25 | @custom_bwd
26 | def backward(ctx, grad_output):
27 | qweight, scales, zeros, g_idx = ctx.saved_tensors
28 | if ctx.needs_input_grad[0]:
29 | if g_idx is None:
30 | grad = mm._matmul4bit_v1_recons(
31 | grad_output, qweight, scales, zeros, transpose=True
32 | )
33 | else:
34 | grad = mm._matmul4bit_v2_recons(
35 | grad_output, qweight, scales, zeros, g_idx, transpose=True
36 | )
37 | return grad, None, None, None, None, None, None
38 |
39 | class Autograd2bit(torch.autograd.Function):
40 | @staticmethod
41 | @custom_fwd(cast_inputs=torch.float16)
42 | def forward(ctx, x, qweight, scales, zeros, g_idx):
43 | ctx.save_for_backward(qweight, scales, zeros, g_idx)
44 | output = mm._matmul2bit_v2_recons(x, qweight, scales, zeros, g_idx)
45 | output = output.clone()
46 | return output
47 |
48 | @staticmethod
49 | @custom_bwd
50 | def backward(ctx, grad_output):
51 | qweight, scales, zeros, g_idx = ctx.saved_tensors
52 | if ctx.needs_input_grad[0]:
53 | grad = mm._matmul2bit_v2_recons(
54 | grad_output, qweight, scales, zeros, g_idx, transpose=True
55 | )
56 | return grad, None, None, None, None, None, None
57 |
58 | class Autograd3bit(torch.autograd.Function):
59 | @staticmethod
60 | @custom_fwd(cast_inputs=torch.float16)
61 | def forward(ctx, x, qweight, scales, qzeros, g_idx, wf, outfeatures):
62 | ctx.save_for_backward(qweight, scales, qzeros, g_idx, wf)
63 | # output = mm.matmul3bit(x, qweight, scales, qzeros, g_idx, outfeatures)
64 | # output = output.half()
65 | # below, we instead unpack weights in pytorch
66 | weight = unpack_weight_3bits(qweight, scales, qzeros, g_idx, wf)
67 | output = torch.matmul(x.half(), weight)
68 | output = output.reshape(x.shape[:-1] + (outfeatures,))
69 | return output
70 |
71 | @staticmethod
72 | @custom_bwd
73 | def backward(ctx, grad_output):
74 | qweight, scales, qzeros, g_idx, wf = ctx.saved_tensors
75 | if ctx.needs_input_grad[0]:
76 | weight = unpack_weight_3bits(qweight, scales, qzeros, g_idx, wf)
77 | grad = torch.matmul(grad_output.half(), weight.T)
78 | return grad, None, None, None, None, None, None, None
79 |
80 | def classic_forward(
81 | x, qweight, bias, scales, qzeros, g_idx, outfeatures, wf=None,
82 | bits=4, is_cuda=True, kernel_switch_threshold=128
83 | ):
84 | out_shape = x.shape[:-1] + (outfeatures, )
85 | x = x.reshape(-1,x.shape[-1])
86 | # dtype = x.dtype
87 | # x = x.float()
88 | if is_cuda is True and (kernel_switch_threshold is False or x.shape[0] < kernel_switch_threshold):
89 | raise NotImplementedError() # code below needs some fixes
90 | out = torch.zeros((x.shape[0], outfeatures), device=x.device, dtype=torch.float32)
91 | if bits == 2:
92 | quant_cuda.vecquant2matmul(x.float(), qweight, out, scales.float(), qzeros, g_idx)
93 | elif bits == 3:
94 | quant_cuda.vecquant3matmul(x.float(), qweight, out, scales.float(), qzeros, g_idx)
95 | elif bits == 4:
96 | quant_cuda.vecquant4matmul(x.float(), qweight, out, scales.float(), qzeros, g_idx)
97 | elif bits == 8:
98 | quant_cuda.vecquant8matmul(x.float(), qweight, out, scales.float(), qzeros, g_idx)
99 | out = out.half()
100 | else:
101 | weight = unpack_weight(qweight, scales, qzeros, g_idx, wf, bits)
102 | out = torch.matmul(x.half(), weight)
103 | del weight
104 |
105 | out = out.reshape(out_shape)
106 | out = out + bias if bias is not None else out
107 | # out = out.to(dtype)
108 | return out
109 |
110 | def unpack_weight(qweight, scales, qzeros, g_idx, wf=None, bits=4):
111 | if bits == 3:
112 | return unpack_weight_3bits(qweight, scales, qzeros, g_idx, wf)
113 | elif bits in [2,4,8]:
114 | zeros = torch.bitwise_right_shift(torch.unsqueeze(qzeros, 2).expand(-1, -1, 32 // bits), wf.unsqueeze(0)).to(torch.int16 if bits == 8 else torch.int8)
115 | torch.bitwise_and(zeros, (2 ** bits) - 1, out=zeros)
116 |
117 | zeros = zeros + 1
118 | zeros = zeros.reshape(scales.shape)
119 |
120 | weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 1).expand(-1, 32 // bits, -1), wf.unsqueeze(-1)).to(torch.int16 if bits == 8 else torch.int8)
121 | torch.bitwise_and(weight,(2 ** bits) - 1, out=weight)
122 |
123 | weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
124 |
125 | g_idx_long = g_idx.to(torch.long)
126 | weight = (scales[g_idx_long] * (weight - zeros[g_idx_long]))
127 | else:
128 | raise NotImplementedError()
129 |
130 | return weight
131 |
132 | def unpack_weight_3bits(qweight, scales, qzeros, g_idx, wf=None):
133 | zeros = qzeros.reshape(qzeros.shape[0], qzeros.shape[1]//3, 3, 1).expand(-1, -1, -1, 12)
134 | zeros = (zeros >> wf.unsqueeze(0))
135 | zeros[:,:,0,10] = (zeros[:,:,0,10]&0x3) | ((zeros[:,:,1,0] << 2)&0x4)
136 | zeros[:,:,1,11] = (zeros[:,:,1,11]&0x1) | ((zeros[:,:,2,0] << 1)&0x6)
137 | zeros &= 0x7
138 | zeros = torch.cat([zeros[:,:,0,:11], zeros[:,:,1,1:12], zeros[:,:,2,1:11]], dim=2)
139 |
140 | zeros = zeros + 1
141 | zeros = zeros.reshape(scales.shape)
142 |
143 | weight = qweight.reshape(qweight.shape[0]//3, 3, 1, qweight.shape[1]).expand(-1, -1, 12, -1)
144 | weight = (weight >> wf.unsqueeze(-1))&0x7
145 | weight[:,0,10] = (weight[:,0,10]&0x3) | ((weight[:,1,0] << 2)&0x4)
146 | weight[:,1,11] = (weight[:,1,11]&0x1) | ((weight[:,2,0] << 1)&0x6)
147 | weight &= 0x7
148 | weight = torch.cat([weight[:,0,:11], weight[:,1,1:12], weight[:,2,1:11]], dim=1)
149 |
150 | weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
151 |
152 | g_idx_long = g_idx.to(torch.long)
153 | weight = (scales[g_idx_long] * (weight - zeros[g_idx_long]))
154 | # out = torch.matmul(x.half(), weights)
155 | # weight -= zeros[g_idx_long]
156 | # weight = weight.to(torch.half)
157 | # weight *= scales[g_idx_long]
158 | return weight
159 |
160 | # ----------------------------------------------------------------------------
161 | # helpers
162 |
163 | buffer_mat_dic = {}
164 | def get_buffer(shape_of_qweight, dtype=torch.float16, device='cuda'):
165 | if shape_of_qweight not in buffer_mat_dic.keys():
166 | buffer_mat_dic[shape_of_qweight] = torch.zeros(
167 | (shape_of_qweight[0] * 8, shape_of_qweight[1]),
168 | dtype=dtype, device=device
169 | )
170 | return buffer_mat_dic[shape_of_qweight]
171 |
--------------------------------------------------------------------------------
/llmtune/run.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from llmtune.config import LLM_MODELS
3 |
4 | # ----------------------------------------------------------------------------
5 |
6 | def make_parser():
7 | parser = argparse.ArgumentParser()
8 | parser.set_defaults(func=lambda args: parser.print_help())
9 | subparsers = parser.add_subparsers(title='Commands')
10 |
11 | # generate
12 |
13 | gen_parser = subparsers.add_parser('generate')
14 | gen_parser.set_defaults(func=generate)
15 |
16 | gen_parser.add_argument('--model', required=True,
17 | help='Path or HF hub name of model to load')
18 | gen_parser.add_argument('--tokenizer', required=False,
19 | help='Path or HF hub name of tokenizer to load (default is model)')
20 | gen_parser.add_argument('--adapter', type=str, required=False,
21 | help='Path to the folder with the Lora adapter.')
22 | gen_parser.add_argument('--groupsize', type=int, default=-1,
23 | help='Groupsize used for quantization; -1 uses full row.')
24 | gen_parser.add_argument('--prompt', type=str, default='',
25 | help='Text used to initialize generation')
26 | gen_parser.add_argument('--instruction', type=str, default='',
27 | help='Instruction for an alpaca-style model')
28 | gen_parser.add_argument('--min-length', type=int, default=10,
29 | help='Minimum length of the sequence to be generated.')
30 | gen_parser.add_argument('--max-length', type=int, default=200,
31 | help='Maximum length of the sequence to be generated.')
32 | gen_parser.add_argument('--top_p', type=float, default=.95,
33 | help='Top p sampling parameter.')
34 | gen_parser.add_argument('--top_k', type=int, default=50,
35 | help='Top k sampling parameter.')
36 | gen_parser.add_argument('--temperature', type=float, default=1.0,
37 | help='Sampling temperature.')
38 |
39 | # quantize
40 |
41 | quant_parser = subparsers.add_parser('quantize')
42 | quant_parser.set_defaults(func=quantize)
43 |
44 | quant_parser.add_argument('--model', required=True,
45 | help='Path or HF hub name of model to load')
46 | quant_parser.add_argument('--save', type=str, required=True,
47 | help='Path to the saved model weights.')
48 | quant_parser.add_argument('--bits', type=int, # required=True,
49 | choices=[2, 3, 4, 8], help='#bits to use for quantization.')
50 | quant_parser.add_argument('--dataset', type=str, default='c4',
51 | choices=['wikitext2', 'ptb', 'c4'],
52 | help='Where to extract calibration data from.')
53 | quant_parser.add_argument('--seed', type=int, default=0,
54 | help='Seed for sampling the calibration data.')
55 | quant_parser.add_argument('--nsamples', type=int, default=128,
56 | help='Number of calibration data samples.')
57 | quant_parser.add_argument('--percdamp', type=float, default=.01,
58 | help='Percent of the average Hessian diagonal to use for dampening.')
59 | quant_parser.add_argument('--groupsize', type=int, default=-1,
60 | help='Groupsize to use for quantization; -1 uses full row.')
61 | quant_parser.add_argument('--act-order', action='store_true',
62 | help='Whether to apply the activation order GPTQ heuristic.')
63 | quant_parser.add_argument('--nearest', action='store_true',
64 | help='Use basic round-to-nearest quantization.')
65 |
66 | # finetune
67 |
68 | tune_parser = subparsers.add_parser('finetune')
69 | tune_parser.set_defaults(func=finetune)
70 |
71 | # finetune model config
72 | tune_parser.add_argument('--model', required=True,
73 | help='Path or HF hub name of model to load')
74 | tune_parser.add_argument('--tokenizer', required=False,
75 | help='Path or HF hub name of tokenizer to load (default is model)')
76 | tune_parser.add_argument("--data-type", choices=["alpaca", "gpt4all"],
77 | help="Dataset format", default="alpaca")
78 | tune_parser.add_argument("--dataset", required=False,
79 | help="Path to local dataset file.")
80 | tune_parser.add_argument('--adapter', type=str, required=False,
81 | help='Path to Lora adapter folder (also holds checkpoints)')
82 | tune_parser.add_argument('--groupsize', type=int,
83 | help='Groupsize used for quantization; -1 uses full row.')
84 |
85 | # finetune training config
86 | tune_parser.add_argument("--mbatch_size", default=1, type=int,
87 | help="Micro-batch size. ")
88 | tune_parser.add_argument("--batch_size", default=2, type=int,
89 | help="Batch size. ")
90 | tune_parser.add_argument("--epochs", default=3, type=int,
91 | help="Epochs. ")
92 | tune_parser.add_argument("--lr", default=2e-4, type=float,
93 | help="Learning rate. ")
94 | tune_parser.add_argument("--cutoff_len", default=256, type=int,
95 | help="")
96 | tune_parser.add_argument("--lora_r", default=8, type=int,
97 | help="")
98 | tune_parser.add_argument("--lora_alpha", default=16, type=int,
99 | help="")
100 | tune_parser.add_argument("--lora_dropout", default=0.05, type=float,
101 | help="")
102 | tune_parser.add_argument("--val_set_size", default=0.2, type=float,
103 | help="Validation set size. ")
104 | tune_parser.add_argument("--warmup_steps", default=50, type=int,
105 | help="")
106 | tune_parser.add_argument("--save_steps", default=50, type=int,
107 | help="")
108 | tune_parser.add_argument("--save_total_limit", default=3, type=int,
109 | help="")
110 | tune_parser.add_argument("--logging_steps", default=10, type=int,
111 | help="")
112 |
113 | return parser
114 |
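# Example invocations (illustrative; model names and paths are placeholders, and the
# exact entry point may differ depending on how the package is installed):
#   python llmtune/run.py quantize --model <hf-model-or-path> --save <out-dir> --bits 4
#   python llmtune/run.py finetune --model <quantized-model> --adapter <lora-dir> --dataset <data.json>
#   python llmtune/run.py generate --model <quantized-model> --adapter <lora-dir> --instruction "Write a haiku."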
115 | # ----------------------------------------------------------------------------
116 |
117 | def main():
118 | parser = make_parser()
119 | args = parser.parse_args()
120 | args.func(args)
121 |
122 | def generate(args):
123 | import llmtune.executor as llmtune
124 | llm = llmtune.load_llm(args.model)
125 | tk_name = args.tokenizer if args.tokenizer is not None else args.model
126 | tokenizer = llmtune.load_tokenizer(tk_name, llm.llm_config)
127 | if args.adapter is not None:
128 | llm = llmtune.load_adapter(llm, adapter_path=args.adapter)
129 | if args.prompt and args.instruction:
130 | raise Exception('Cannot specify both prompt and instruction')
131 | if args.instruction:
132 | from llmtune.data.alpaca import make_prompt
133 | prompt = make_prompt(args.instruction, input_="")
134 | else:
135 | prompt = args.prompt
136 |
137 | output = llmtune.generate(
138 | llm,
139 | tokenizer,
140 | prompt,
141 | args.min_length,
142 | args.max_length,
143 | args.temperature,
144 | args.top_k,
145 | args.top_p,
146 | )
147 |
148 | if args.instruction:
149 | from llmtune.data.alpaca import make_output
150 | output = make_output(output)
151 |
152 | print(output)
153 |
154 | def finetune(args):
155 | import llmtune.executor as llmtune
156 | llm = llmtune.load_llm(args.model)
157 | tk_name = args.tokenizer if args.tokenizer is not None else args.model
158 | tokenizer = llmtune.load_tokenizer(tk_name, llm.llm_config)
159 | from llmtune.config import get_finetune_config
160 | finetune_config = get_finetune_config(args)
161 | from llmtune.executor import finetune
162 | finetune(llm, tokenizer, finetune_config)
163 |
164 | def quantize(args):
165 | from llmtune.config import get_quant_config
166 | quant_config = get_quant_config(args)
167 | import llmtune.executor as llmtune
168 | llm = llmtune.load_llm(args.model)
169 | output = llmtune.quantize(
170 | llm,
171 | quant_config
172 | )
173 |
174 | if __name__ == '__main__':
175 | main()
--------------------------------------------------------------------------------
/finetune/mnli-llama/eval_mnli_llmtune.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | # Set up the argument parser
4 | parser = argparse.ArgumentParser(description='Python script to work with models')
5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True)
6 | parser.add_argument('--weight_path', type=str, help='Path to the weights', required=True)
7 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True)
8 | parser.add_argument('--seed', type=int, help='model seed number', required=True)
9 | parser.add_argument('--file_name', type=str, help='file name to store predictions and acc', required=True)
10 | parser.add_argument('--checkpoint_name', type=str, help='folder name to store all the check points', required=True)
11 | parser.add_argument('--start_index', type=int, help='model seed number', required=True)
12 | parser.add_argument('--end_index', type=int, help='model seed number', required=True)
13 |
14 | # Parse the arguments
15 | args = parser.parse_args()
16 |
17 | # Use the command line arguments in your script
18 | print('Model Name:', args.model_name)
19 | print('Weight Path:', args.weight_path)
20 | print('Adapter Path: ', args.adapter)
21 | print('Seed: ', args.seed)
22 |
23 | import random
24 | import json
25 | import os
26 | import pickle
27 |
28 | # import wandb
29 | import torch
30 | import numpy as np
31 | # import bitsandbytes as bnb
32 | from tqdm import tqdm
33 | import transformers
34 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq
35 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig
36 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
37 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training
38 | from datasets import load_dataset
39 |
40 | from utils import *
41 | from data_mnli_label import *
42 |
43 | import evaluate
44 | import numpy as np
45 | from datasets import load_from_disk
46 | from tqdm import tqdm
47 |
48 | from llmtune.executor import load_llm, load_adapter
49 | from llmtune.engine.lora.peft import quant_peft
50 |
51 | output_dir = args.adapter
52 | model_name = "huggyllama/llama-13b"
53 | seed = args.seed
54 | train_sample_rate = 1.0
55 | val_sample_rate = 1.0
56 | local_rank = 0
57 |
58 | set_random_seed(seed)
59 | logging.set_verbosity_info()
60 |
61 | # with open(config_file, "r") as r:
62 | # config = json.load(r)
63 |
64 | device_map = "auto"
65 | world_size = int(os.environ.get("WORLD_SIZE", 1))
66 | ddp = world_size != 1
67 |
68 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
69 | tokenizer = fix_tokenizer(tokenizer)
70 | # tokenizer.save_pretrained(output_dir)
71 |
72 | dataset = load_dataset('multi_nli')
73 | train_records = dataset['train']
74 | val_records = dataset['validation_matched']
75 | #random.shuffle(train_records)
76 | print("train_record[0]: ",train_records[0])
77 |
78 | ## Config for llama 7-b
79 | model_type = "causal"
80 | templates_path = "llama_lora_mnli.json"
81 | only_target_loss = False
82 |
83 | llmtune_model_name = args.model_name
84 | llmtune_quantized_weights_path = args.weight_path ## probably want to change this using our version of the right way
85 | llmtune_groupsize = 64
86 |
87 |
88 | llm, _ = load_llm(
89 | llmtune_model_name,
90 | llmtune_quantized_weights_path,
91 | llmtune_groupsize
92 | )
93 | model = fix_model(llm, tokenizer, use_resize=False)
94 |
95 | # Default model generation params
96 | model.config.num_beams = 5
97 |
98 |
99 | if not ddp and torch.cuda.device_count() > 1:
100 | model.is_parallelizable = True
101 | model.model_parallel = True
102 |
103 |
104 | model = load_adapter(model, adapter_path=output_dir)
105 |
106 | # Metric
107 |
108 | def evaluate_peft_model_mnli(sample,max_target_length=65):
109 | instruction, input, genre = sample['premise'], sample['hypothesis'], sample['genre']
110 | sample_word = f"### Premise: {instruction}\n ### Hypothesis: {input}\n ### Genre: {genre} ### Label: "
111 | print(sample_word)
112 | input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda()
113 | outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens = 5)
114 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"")
115 | output = output.strip()
116 | print(f"Output:\n{output}")
117 | # Some simple post-processing
118 | return output
119 |
120 |
121 |
122 | def acc_compute(predictions,references):
123 | acc = 0
124 | for i in range(len(predictions)):
125 | if predictions[i].lower() == references[i].lower():
126 | acc += 1
127 | acc /= len(predictions)
128 |
129 | print("accuracy:", acc)
130 | return acc
131 |
132 |
133 | def store_pred(file_name_pickle_pred,file_name_pickle_ref,predictions,references):
134 | with open(file_name_pickle_pred, "wb") as fp: #Pickling
135 | pickle.dump(predictions, fp)
136 | with open(file_name_pickle_ref, "wb") as fp: #Pickling
137 | pickle.dump(references, fp)
138 |
139 |
140 |
141 |
142 | ##Arguments setting
143 | start_index = args.start_index
144 | end_index = args.end_index
145 | eval_len = end_index - start_index
146 | eval_save_len = eval_len // 10
147 | print("Evaluation will start at: ", start_index)
148 | print("Evaluation will end at: ", end_index)
149 | print(f'Evaluation will save every {eval_save_len} steps')
150 |
151 |
152 | ## Create Check point Folder
153 | checkpoint_path = f'{args.checkpoint_name}_{start_index}_{end_index}'
154 |
155 | current_directory = os.getcwd()
156 | final_directory = os.path.join(current_directory, checkpoint_path)
157 | if not os.path.exists(final_directory):
158 | os.makedirs(final_directory)
159 |
160 |
161 |
162 |
163 |
164 | predictions = []
165 | references_orig = val_records['label'][start_index:end_index]
166 | ## convert references to list of strings
167 | references = []
168 | for item in references_orig:
169 | references.append(str(item))
170 |
171 |
172 | count_eval = 0
173 | for idx in tqdm(range(start_index, end_index)):
174 | sample = val_records[idx]
175 | p = evaluate_peft_model_mnli(sample)
176 | predictions.append(p)
177 | count_eval += 1
178 | ## Checkpoint every eval_save_len steps
179 | if (count_eval%eval_save_len == 0):
180 | print(f'=>=>Checkpointing at {count_eval} steps<=<=')
181 |
182 | predictions_step = [s.strip() for s in predictions]
183 | print("prediction_step: ", predictions_step)
184 | references_step = references[0:count_eval]
185 | print("references_step: ", references_step)
186 | acc = acc_compute(predictions_step,references_step)
187 | checkpoint_name_txt = f'{final_directory}/{count_eval}.txt'
188 | checkpoint_name_pred = f'{final_directory}/{count_eval}_pred' ## pickle file for pred list
189 | checkpoint_name_ref = f'{final_directory}/{count_eval}_ref' ## pickle file for ref list
190 | ## writing pickle file
191 | store_pred(checkpoint_name_pred, checkpoint_name_ref, predictions_step, references_step)
192 | with open(checkpoint_name_txt, "w") as f:
193 | for item in predictions_step:
194 | # write each item on a new line
195 | f.write("%s\n" % item)
196 | f.write("%s\n" % acc)
197 |
198 |
199 |
200 |
201 | predictions = [s.strip() for s in predictions]
202 |
203 | ## compute final accuracy over the full evaluation range
204 | acc = acc_compute(predictions, references)
205 | file_name = args.file_name
206 |
207 | with open(file_name, 'w') as f:
208 | for item in predictions:
209 | # write each item on a new line
210 | f.write("%s\n" % item)
211 | f.write("%s\n" % acc)
212 |
213 |
214 | file_name_pickle_pred = f'{final_directory}/final_pred_{start_index}_{end_index}'
215 | file_name_pickle_ref = f'{final_directory}/final_ref_{start_index}_{end_index}'
216 |
217 | store_pred(file_name_pickle_pred,file_name_pickle_ref,predictions,references)
218 |
219 |
220 | """
221 | Loading pickle file
222 | with open("test", "rb") as fp: # Unpickling
223 | b = pickle.load(fp)
224 | """
225 |
--------------------------------------------------------------------------------
/finetune/samsum-opt/train_samsum_opt_4bit_llmtune.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | # Set up the argument parser
4 | parser = argparse.ArgumentParser(description='Python script to work with models')
5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True)
6 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True)
7 | parser.add_argument('--mbatch_size', type=int, help='mbatch size for training', required=True)
8 | parser.add_argument('--seed', type=int, help='model seed number', required=True)
9 |
10 | # Parse the arguments
11 | args = parser.parse_args()
12 |
13 | # Use the command line arguments in your script
14 | print('Model Name:', args.model_name)
15 | print('Adapter Path: ', args.adapter)
16 | print('Seed: ', args.seed)
17 | print('mbatch_size: ', args.mbatch_size)
18 |
19 |
20 | import os
21 | import torch
22 | import transformers
23 | from transformers import AutoTokenizer
24 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq
25 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig
26 | from llmtune.llms.autollm import AutoLLMForCausalLM
27 | from llmtune.engine.lora.config import FinetuneConfig
28 | from llmtune.engine.lora.peft import quant_peft
29 | from llmtune.utils import to_half_precision
30 | from datasets import load_dataset
31 |
32 | from utils import *
33 | from data import *
34 |
35 | # os env setting
36 | os.environ["WANDB_DISABLED"] = "true"
37 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
38 |
39 | # model config
40 | model_name = args.model_name
41 | tokenizer_name = 'facebook/opt-6.7b'
42 | DEV = 'cuda'
43 |
44 | transformers.logging.set_verbosity_info()
45 |
46 | # load tokenizer
47 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
48 | tokenizer.pad_token_id = 0
49 | ## Fix Tokenizer
50 | tokenizer = fix_tokenizer_opt(tokenizer)
51 |
52 | # load model
53 | llm = AutoLLMForCausalLM.from_pretrained(model_name)
54 | ## Fix Model
55 | llm = fix_model(llm, tokenizer, use_resize=False)
56 | llm.eval()
57 | llm = llm.to(DEV)
58 | llm = to_half_precision(llm)
59 |
60 |
61 | # finetune training config
62 | MICRO_BATCH_SIZE=args.mbatch_size
63 | BATCH_SIZE = 128
64 | GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
65 | EPOCHS = 3
66 | LEARNING_RATE = 1e-3
67 | CUTOFF_LEN = 128 # 128 accounts for about 95% of the data
68 | LORA_R = 8
69 | LORA_ALPHA = 32
70 | LORA_DROPOUT = 0.1
71 | VAL_SET_SIZE= 2000
72 |
73 | # data/gpu config
74 | seed = args.seed
75 | set_random_seed(seed)
76 | train_sample_rate = 1.0
77 | val_sample_rate = 1.0
78 |
79 | device_map = "auto"
80 | world_size = int(os.environ.get("WORLD_SIZE", 1))
81 | ddp = world_size != 1
82 |
83 | # if ddp:
84 | # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
85 | # gradient_accumulation_steps = gradient_accumulation_steps // world_size
86 |
87 | # adapter_path = '/share/kuleshov/vk379/alpacas/opt-7b-quantized-lora'
88 | lora_out_dir = args.adapter
89 |
90 | # set up lora config
91 | lora_config = quant_peft.LoraConfig(
92 | r=LORA_R,
93 | lora_alpha=LORA_ALPHA,
94 | target_modules=["q_proj", "v_proj"],
95 | lora_dropout=LORA_DROPOUT,
96 | bias="none",
97 | task_type="CAUSAL_LM",
98 | )
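# With r=8 and lora_alpha=32 the LoRA update is scaled by alpha/r = 4, and only the
# attention query/value projections (q_proj, v_proj) receive trainable adapters.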
99 |
100 |
101 | if not ddp and torch.cuda.device_count() > 1:
102 | llm.is_parallelizable = True
103 | llm.model_parallel = True
104 |
105 |
106 | # create a new lora from config
107 | model = quant_peft.get_peft_model(llm, lora_config)
108 |
109 | if not ddp and torch.cuda.device_count() > 1:
110 |     print("GPU parallelism activated")
111 | model.is_parallelizable = True
112 | model.model_parallel = True
113 |
114 | # load samsum data
115 | dataset = load_dataset('samsum')
116 | train_records = dataset['train']
117 | val_records = dataset['test']
118 |
119 | ## Dataset / prompt config (samsum)
120 | model_type = "causal"
121 | templates_path = "llama_lora_samsum.json"
122 | only_target_loss = False
123 | mode = "instruct"
124 |
125 | if mode == "instruct":
126 | max_source_tokens_count = 205 # Changed depending on the dataset
127 | max_target_tokens_count = 45
128 | target_field = "summary"
129 |     source_field = ""  # not used by this template (the original alpaca-lora format has an extra "input" field alongside the instruction)
130 |
131 | train_dataset = InstructDataset(
132 | train_records,
133 | tokenizer,
134 | max_source_tokens_count=max_source_tokens_count,
135 | max_target_tokens_count=max_target_tokens_count,
136 | sample_rate=train_sample_rate,
137 | input_type=model_type,
138 | templates_path=templates_path,
139 | target_field=target_field,
140 | source_field=source_field,
141 | only_target_loss=only_target_loss
142 | )
143 |
144 | val_dataset = InstructDataset(
145 | val_records,
146 | tokenizer,
147 | max_source_tokens_count=max_source_tokens_count,
148 | max_target_tokens_count=max_target_tokens_count,
149 | sample_rate=val_sample_rate,
150 | input_type=model_type,
151 | templates_path=templates_path,
152 | target_field=target_field,
153 | source_field=source_field,
154 | only_target_loss=only_target_loss
155 | )
156 |
157 |     ## Build DataLoaders (saving them to disk is commented out below)
158 | dataloader_train = torch.utils.data.DataLoader(train_dataset)
159 | # torch.save(dataloader_train,'dataloader_train.pth')
160 |
161 | dataloader_val = torch.utils.data.DataLoader(val_dataset)
162 | # torch.save(dataloader_val,'dataloader_val.pth')
163 |
164 | else:
165 | assert False
166 |
167 | if "seq2seq" in model_type:
168 | data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8)
169 | else:
170 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)
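# The token-classification collator is used purely for its padding behaviour: it pads
# input_ids, attention_mask and labels (labels with -100) to a multiple of 8, which
# matches the loss masking expected for causal-LM training.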
171 |
172 | print("INPUT_IDS")
173 | print(data_collator([train_dataset[0], train_dataset[1]])["input_ids"][0])
174 | print("MASK")
175 | print(data_collator([train_dataset[0], train_dataset[1]])["attention_mask"][0])
176 | print("LABELS")
177 | print(data_collator([train_dataset[0], train_dataset[1]])["labels"][0])
178 |
179 |
180 |
181 | # Model configs
182 | model.config.num_beams = 5
183 | if mode == "instruct":
184 | max_tokens_count = max_target_tokens_count + max_source_tokens_count + 1
185 | model.config.max_length = max_tokens_count if model_type == "causal" else max_target_tokens_count
186 |
187 |
188 | # Training args
189 | training_arguments = transformers.TrainingArguments(
190 | per_device_train_batch_size = MICRO_BATCH_SIZE,
191 | per_device_eval_batch_size = 1,
192 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
193 | warmup_ratio=0.06,
194 | #num_train_epochs=3,
195 | max_steps = 400,
196 | learning_rate=LEARNING_RATE,
197 | lr_scheduler_type = "cosine", ## LoRA original paper uses linear
198 | fp16=True,
199 | logging_steps=50,
200 | evaluation_strategy="steps",
201 | logging_strategy="steps",
202 | save_strategy="steps",
203 | eval_steps=50,
204 | save_steps=50,
205 | output_dir=lora_out_dir,
206 | optim = "adamw_torch",
207 | torch_compile = False,
208 | save_total_limit=2,
209 | load_best_model_at_end=True,
210 | ddp_find_unused_parameters=False if ddp else None,
211 | )
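# Effective batch size is MICRO_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS, i.e. roughly
# BATCH_SIZE (128). Evaluation/saving happens every 50 steps; save_total_limit=2 keeps
# at most two checkpoints on disk, and load_best_model_at_end restores the best one
# (by default the lowest eval loss) when training finishes.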
212 |
213 |
214 | def preprocess_logits_for_metrics(logits, labels):
215 | """
216 | Original Trainer may have a memory leak.
217 | This is a workaround to avoid storing too many tensors that are not needed.
218 | """
219 | pred_ids = torch.argmax(logits[0], dim=-1)
220 | return pred_ids, labels
221 |
222 | # Start trainer
223 | trainer = transformers.Trainer(
224 | model=model,
225 | args=training_arguments,
226 | train_dataset=train_dataset,
227 | eval_dataset=val_dataset,
228 | data_collator=data_collator,
229 | preprocess_logits_for_metrics = preprocess_logits_for_metrics,
230 | )
231 |
232 | # print("Parallel training status: ", training_arguments.parallel_mode)
233 | model.config.use_cache = False
234 |
235 | # use half precision
236 | model = to_half_precision(model)
237 |
238 | # start training
239 | checkpoint_dir = lora_out_dir
240 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir):
241 | trainer.train(resume_from_checkpoint=True)
242 | else:
243 | trainer.train()
244 |
245 | # Save Model
246 | model.save_pretrained(lora_out_dir)
--------------------------------------------------------------------------------
/finetune/samsum-llama/eval_samsum_4bit_llmtune.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | # Set up the argument parser
4 | parser = argparse.ArgumentParser(description='Python script to work with models')
5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True)
6 | parser.add_argument('--weight_path', type=str, help='Path to the weights', required=True)
7 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True)
8 | parser.add_argument('--seed', type=int, help='model seed number', required=True)
9 | parser.add_argument('--file_name', type=str, help='file name to store predictions and acc', required=True)
10 | parser.add_argument('--checkpoint_name', type=str, help='folder name to store all the check points', required=True)
11 | parser.add_argument('--start_index', type=int, help='start index of the evaluation range', required=True)
12 | parser.add_argument('--end_index', type=int, help='end index of the evaluation range', required=True)
13 |
14 | # Parse the arguments
15 | args = parser.parse_args()
16 |
17 | # Use the command line arguments in your script
18 | print('Model Name:', args.model_name)
19 | print('Weight Path:', args.weight_path)
20 | print('Adapter Path: ', args.adapter)
21 | print('Seed: ', args.seed)
22 |
23 | import random
24 | import json
25 | import os
26 |
27 | #for eval
28 | import pickle
29 |
30 | # import wandb
31 | import torch
32 | import numpy as np
33 | # import bitsandbytes as bnb
34 | from tqdm import tqdm
35 | import transformers
36 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq
37 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig
38 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
39 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training
40 | from datasets import load_dataset
41 |
42 | from utils import *
43 | from data import *
44 |
45 | import evaluate
46 | import numpy as np
47 | from datasets import load_from_disk
48 | from tqdm import tqdm
49 |
50 | from llmtune.executor import load_llm, load_adapter
51 | from llmtune.engine.lora.peft import quant_peft
52 |
53 | output_dir = args.adapter
54 | seed = args.seed
55 | train_sample_rate = 1.0
56 | val_sample_rate = 1.0
57 | local_rank = 0
58 |
59 | set_random_seed(seed)
60 | logging.set_verbosity_info()
61 |
62 | # with open(config_file, "r") as r:
63 | # config = json.load(r)
64 |
65 | device_map = "auto"
66 | world_size = int(os.environ.get("WORLD_SIZE", 1))
67 | ddp = world_size != 1
68 |
69 | tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-13b", use_fast=False,model_max_length=250)
70 | tokenizer = fix_tokenizer(tokenizer)
71 | # tokenizer.save_pretrained(output_dir)
72 |
73 | dataset = load_dataset('samsum')
74 | train_records = dataset['train']
75 | val_records = dataset['test']
76 | #random.shuffle(train_records)
77 | print("train_record[0]: ",train_records[0])
78 |
79 | ## Dataset / prompt config (samsum)
80 | model_type = "causal"
81 | templates_path = "llama_lora_samsum.json"
82 | only_target_loss = False
83 | mode = "instruct"
84 |
85 | llmtune_model_name = args.model_name
86 | llmtune_quantized_weights_path = args.weight_path
87 | llmtune_groupsize = 64
88 |
89 | llm, _ = load_llm(
90 | llmtune_model_name,
91 | llmtune_quantized_weights_path,
92 | llmtune_groupsize
93 | )
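# llmtune_groupsize should match the group size the checkpoint was quantized with
# (64 here); it is the number of weights that share one quantization scale.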
94 | model = fix_model(llm, tokenizer, use_resize=False)
95 |
96 | # Default model generation params
97 | model.config.num_beams = 5
98 |
99 |
100 | if not ddp and torch.cuda.device_count() > 1:
101 | model.is_parallelizable = True
102 | model.model_parallel = True
103 |
104 |
105 | model = load_adapter(model, adapter_path=output_dir)
106 |
107 | # Metric
108 | metric = evaluate.load("rouge")
109 |
110 | def evaluate_peft_model_samsum(sample,max_target_length=45):
111 | # Load dataset from the hub and get a sample
112 | sample_word = f"### Summarize this: {sample}\n ### Output: "
113 | input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda()
114 | with torch.autocast("cuda"):
115 | outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens = 45)
116 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"")
117 | output = output.strip()
118 | print(f"Output:\n{output}")
119 | # Some simple post-processing
120 | return output
121 |
122 |
123 | def rouge_compute(predictions,references):
124 |     rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True)
125 |     return rouge
126 |
127 |
128 | def store_pred(file_name_pickle_pred,file_name_pickle_ref,predictions,references):
129 | with open(file_name_pickle_pred, "wb") as fp: #Pickling
130 | pickle.dump(predictions, fp)
131 | with open(file_name_pickle_ref, "wb") as fp: #Pickling
132 | pickle.dump(references, fp)
133 |
134 |
135 |
136 | ## Evaluation range settings
137 | start_index = args.start_index
138 | end_index = args.end_index
139 | eval_len = end_index - start_index
140 | eval_save_len = eval_len // 10
141 | print("Evaluation will start at: ", start_index)
142 | print("Evaluation will end at: ", end_index)
143 | print(f'Evaluation will save at every {eval_save_len} steps')
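# Intermediate predictions and ROUGE scores are checkpointed roughly every 10% of the
# evaluated range; this assumes end_index - start_index >= 10, otherwise eval_save_len
# is 0 and the modulo check below would divide by zero.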
144 |
145 |
146 | ## Create checkpoint folder
147 | checkpoint_path = f'{args.checkpoint_name}_{start_index}_{end_index}'
148 |
149 | current_directory = os.getcwd()
150 | final_directory = os.path.join(current_directory, checkpoint_path)
151 | if not os.path.exists(final_directory):
152 | os.makedirs(final_directory)
153 |
154 |
155 |
156 | predictions = []
157 | references_origin = val_records['summary'][start_index:end_index]
158 | references = []
159 |
160 | count_eval = 0
161 |
162 |
163 | for idx in tqdm(range(start_index, end_index)):
164 | sample = val_records['dialogue'][idx]
165 | # Load dataset from the hub and get a sample
166 | sample_word = f"### Summarize this: {sample}\n ### Output: "
167 | input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda()
168 |
169 | print("length of input ids:", len(input_ids[0]))
170 | # if (len(input_ids[0]) < 300):
171 | with torch.inference_mode(), torch.autocast("cuda"):
172 | outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens = 45)
173 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"")
174 | output = output.strip()
175 | print(f"Model Output: \n{output}")
176 | predictions.append(output)
177 | print(f"Reference Output: \n {references_origin[count_eval]}")
178 | references.append(references_origin[count_eval])
179 | count_eval+=1
180 |
181 |     ## Periodic checkpointing
182 | if (count_eval%eval_save_len == 0):
183 | print(f'=>=>Checkpointing at {count_eval} steps<=<=')
184 |
185 | predictions_step = [s.strip() for s in predictions]
186 | print("prediction_step: ", predictions_step)
187 | references_step = references
188 | print("references_step: ", references_step)
189 | rouge = rouge_compute(predictions_step,references_step)
190 | checkpoint_name_txt = f'{final_directory}/{count_eval}.txt'
191 | checkpoint_name_pred = f'{final_directory}/{count_eval}_pred' ## pickle file for pred list
192 | checkpoint_name_ref = f'{final_directory}/{count_eval}_ref' ## pickle file for ref list
193 | ## writing pickle file
194 |         store_pred(checkpoint_name_pred, checkpoint_name_ref, predictions_step, references_step)
195 | with open(checkpoint_name_txt, "w") as f:
196 | for item in predictions_step:
197 | # write each item on a new line
198 | f.write("%s\n" % item)
199 |             f.write(f'Seed: {seed}\n')
200 |             f.write(f"Rouge1: {rouge['rouge1']* 100:.2f}%\n")
201 |             f.write(f"Rouge2: {rouge['rouge2']* 100:.2f}%\n")
202 |             f.write(f"RougeL: {rouge['rougeL']* 100:.2f}%\n")
203 |             f.write(f"RougeLsum: {rouge['rougeLsum']* 100:.2f}%\n")
204 |
205 |
206 | predictions = [s.strip() for s in predictions]
207 |
208 |
209 | # compute metric
210 | rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True)
211 |
212 | file_name = args.file_name
213 | with open(file_name, 'w') as f:
214 |     f.write(f'Seed: {seed}\n')
215 |     f.write(f"Rouge1: {rouge['rouge1']* 100:.2f}%\n")
216 |     f.write(f"Rouge2: {rouge['rouge2']* 100:.2f}%\n")
217 |     f.write(f"RougeL: {rouge['rougeL']* 100:.2f}%\n")
218 |     f.write(f"RougeLsum: {rouge['rougeLsum']* 100:.2f}%\n")
219 |
--------------------------------------------------------------------------------
/finetune/samsum-llama/train_samsum_4bit.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | # Set up the argument parser
4 | parser = argparse.ArgumentParser(description='Python script to work with models')
5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True)
6 | parser.add_argument('--weight_path', type=str, help='Path to the weights', required=True)
7 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True)
8 | parser.add_argument('--mbatch_size', type=int, help='mbatch size for training', required=True)
9 | parser.add_argument('--seed', type=int, help='model seed number', required=True)
10 |
11 | # Parse the arguments
12 | args = parser.parse_args()
13 |
14 | # Use the command line arguments in your script
15 | print('Model Name:', args.model_name)
16 | print('Weight Path:', args.weight_path)
17 | print('Adapter Path: ', args.adapter)
18 | print('Seed: ', args.seed)
19 | print('mbatch_size: ', args.mbatch_size)
20 |
21 |
22 | import random
23 | import json
24 | import os
25 |
26 | # import wandb
27 | import torch
28 | import numpy as np
29 | import bitsandbytes as bnb
30 | from tqdm import tqdm
31 | import transformers
32 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq
33 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig
34 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
35 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training
36 | from datasets import load_dataset
37 |
38 | # from src.dataset import InstructDataset, ChatDataset
39 | # from src.util.dl import set_random_seed, fix_tokenizer, fix_model
40 | # from src.util.io import read_jsonl
41 |
42 | from utils import *
43 | from data import *
44 |
45 | from llmtune.executor import load_llm, load_adapter
46 | from llmtune.engine.lora.peft import quant_peft
47 |
48 |
49 | # os.environ["WANDB_LOG_MODEL"] = "checkpoint"
50 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
51 |
52 |
53 | class SavePeftModelCallback(TrainerCallback):
54 | def on_save(
55 | self,
56 | args: TrainingArguments,
57 | state: TrainerState,
58 | control: TrainerControl,
59 | **kwargs,
60 | ):
61 | checkpoint_folder = os.path.join(
62 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
63 | )
64 |
65 | peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
66 | kwargs["model"].save_pretrained(peft_model_path)
67 | return control
68 |
69 | checkpoint = None
70 | seed = args.seed
71 | train_sample_rate = 1.0
72 | val_sample_rate = 1.0
73 | local_rank = 0
74 | # report_to = "wandb"
75 | output_dir = args.adapter
76 |
77 | set_random_seed(seed)
78 | logging.set_verbosity_info()
79 |
80 | # with open(config_file, "r") as r:
81 | # config = json.load(r)
82 |
83 | device_map = "auto"
84 | world_size = int(os.environ.get("WORLD_SIZE", 1))
85 | ddp = world_size != 1
86 | # if ddp:
87 | # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
88 | # gradient_accumulation_steps = gradient_accumulation_steps // world_size
89 |
90 | #deepspeed_config = config.get("deepspeed")
91 |
92 |
93 |
94 | ### Training Configuration
95 | #trainer_config = config["trainer"]
96 |
97 | MICRO_BATCH_SIZE = args.mbatch_size  # per-device micro-batch size (from --mbatch_size)
98 | BATCH_SIZE = 128
99 | GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
100 | EPOCHS = 3 # we don't need 3 tbh
101 | LEARNING_RATE = 1e-3  # higher than the 3e-4 "Karpathy constant"
102 | CUTOFF_LEN = 128 # 128 accounts for about 95% of the data
103 | LORA_R = 8
104 | LORA_ALPHA = 16
105 | LORA_DROPOUT = 0.05
106 | VAL_SET_SIZE= 2000
107 |
108 | def preprocess_logits_for_metrics(logits, labels):
109 | """
110 | Original Trainer may have a memory leak.
111 | This is a workaround to avoid storing too many tensors that are not needed.
112 | """
113 | pred_ids = torch.argmax(logits[0], dim=-1)
114 | return pred_ids, labels
115 |
116 | trainer_config = transformers.TrainingArguments(
117 | per_device_train_batch_size = MICRO_BATCH_SIZE,
118 | per_device_eval_batch_size = MICRO_BATCH_SIZE,
119 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
120 | warmup_ratio=0.06,
121 | #num_train_epochs=3,
122 | max_steps = 350,
123 | learning_rate=LEARNING_RATE,
124 | lr_scheduler_type = "cosine", ## LoRA original paper uses linear
125 | fp16=True,
126 | logging_steps=50,
127 | evaluation_strategy="steps",
128 | logging_strategy="steps",
129 | save_strategy="steps",
130 | eval_steps=50,
131 | save_steps=50,
132 | # report_to=report_to,
133 | output_dir=output_dir,
134 | optim = "adamw_torch",
135 | torch_compile = False,
136 | save_total_limit=2,
137 | load_best_model_at_end=True,
138 | ddp_find_unused_parameters=False if ddp else None,
139 | )
140 |
141 |
142 | # ### Apply LoRA
143 | #
144 | # Here comes the magic with `peft`: attach low-rank adapters (LoRA) to the quantized base model. In this script the adapter is attached via llmtune's `load_adapter` helper rather than by calling `get_peft_model` directly.
145 |
146 | target_modules = None
147 | target_modules = ['q_proj', 'v_proj'] # edit with your desired target modules
148 | #lora_config = config.get("lora")
149 | lora_config = LoraConfig(
150 | r=8, lora_alpha=32, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
151 | )
152 |
153 | callbacks = [SavePeftModelCallback] if lora_config else []
154 | ##no need to use callbacks
155 | callbacks = []
156 |
157 | training_args = trainer_config
158 |
159 |
160 | model_name = "huggyllama/llama-13b"
161 |
162 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
163 | tokenizer = fix_tokenizer(tokenizer)
164 | # tokenizer.save_pretrained(output_dir)
165 |
166 | dataset = load_dataset('samsum')
167 | train_records = dataset['train']
168 | val_records = dataset['test']
169 | #random.shuffle(train_records)
170 | print("train_record[0]: ",train_records[0])
171 |
172 | ## Dataset / prompt config (samsum)
173 | model_type = "causal"
174 | templates_path = "llama_lora_samsum.json"
175 | only_target_loss = False
176 | mode = "instruct"
177 |
178 | llmtune_model_name = args.model_name
179 | llmtune_quantized_weights_path = args.weight_path
180 | llmtune_groupsize = 64
181 |
182 | if mode == "instruct":
183 | max_source_tokens_count = 255 # Changed depending on the dataset
184 | max_target_tokens_count = 50
185 | target_field = "summary"
186 |     source_field = ""  # not used by this template (the original alpaca-lora format has an extra "input" field alongside the instruction)
187 |
188 | train_dataset = InstructDataset(
189 | train_records,
190 | tokenizer,
191 | max_source_tokens_count=max_source_tokens_count,
192 | max_target_tokens_count=max_target_tokens_count,
193 | sample_rate=train_sample_rate,
194 | input_type=model_type,
195 | templates_path=templates_path,
196 | target_field=target_field,
197 | source_field=source_field,
198 | only_target_loss=only_target_loss
199 | )
200 |
201 | val_dataset = InstructDataset(
202 | val_records,
203 | tokenizer,
204 | max_source_tokens_count=max_source_tokens_count,
205 | max_target_tokens_count=max_target_tokens_count,
206 | sample_rate=val_sample_rate,
207 | input_type=model_type,
208 | templates_path=templates_path,
209 | target_field=target_field,
210 | source_field=source_field,
211 | only_target_loss=only_target_loss
212 | )
213 |
214 |     ## Build DataLoaders (saving them to disk is commented out below)
215 | dataloader_train = torch.utils.data.DataLoader(train_dataset)
216 | # torch.save(dataloader_train,'dataloader_train.pth')
217 |
218 | dataloader_val = torch.utils.data.DataLoader(val_dataset)
219 | # torch.save(dataloader_val,'dataloader_val.pth')
220 |
221 | else:
222 | assert False
223 |
224 | if "seq2seq" in model_type:
225 | data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8)
226 | else:
227 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)
228 |
229 | print("INPUT_IDS")
230 | print(data_collator([train_dataset[0], train_dataset[1]])["input_ids"][0])
231 | print("MASK")
232 | print(data_collator([train_dataset[0], train_dataset[1]])["attention_mask"][0])
233 | print("LABELS")
234 | print(data_collator([train_dataset[0], train_dataset[1]])["labels"][0])
235 |
236 | llm, _ = load_llm(
237 | llmtune_model_name,
238 | llmtune_quantized_weights_path,
239 | llmtune_groupsize
240 | )
241 | model = fix_model(llm, tokenizer, use_resize=False)
242 |
243 | # Default model generation params
244 | model.config.num_beams = 5
245 | if mode == "instruct":
246 | max_tokens_count = max_target_tokens_count + max_source_tokens_count + 1
247 | model.config.max_length = max_tokens_count if model_type == "causal" else max_target_tokens_count
248 |
249 | if not ddp and torch.cuda.device_count() > 1:
250 | model.is_parallelizable = True
251 | model.model_parallel = True
252 |
253 | if lora_config:
254 | #lora_config = LoraConfig(**lora_config)
255 | # model = get_peft_model(model, lora_config)
256 | model = load_adapter(model, lora_config=lora_config)
257 |
258 | trainer_class = Trainer ##if not omit_base_model_save else TrainerNoBaseSave
259 | print("Trainer class:", trainer_class)
260 | trainer = trainer_class(
261 | model=model,
262 | args=training_args,
263 | train_dataset=train_dataset,
264 | eval_dataset=val_dataset,
265 | callbacks=callbacks,
266 | data_collator=data_collator,
267 | preprocess_logits_for_metrics = preprocess_logits_for_metrics,
268 | )
269 |
270 | # with wandb.init(project="llama_ft_samsum", name="llama finetuning run") as run: ## changed the name don't forget
271 | checkpoint_dir = output_dir
272 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir):
273 | trainer.train(resume_from_checkpoint=True)
274 | else:
275 | trainer.train()
276 | model.save_pretrained(output_dir)
--------------------------------------------------------------------------------
/finetune/mnli-llama/train_mnli_llmtune_label.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | # Set up the argument parser
4 | parser = argparse.ArgumentParser(description='Python script to work with models')
5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True)
6 | parser.add_argument('--weight_path', type=str, help='Path to the weights', required=True)
7 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True)
8 | parser.add_argument('--mbatch_size', type=int, help='mbatch size for training', required=True)
9 | parser.add_argument('--seed', type=int, help='model seed number', required=True)
10 |
11 | # Parse the arguments
12 | args = parser.parse_args()
13 |
14 | # Use the command line arguments in your script
15 | print('Model Name:', args.model_name)
16 | print('Weight Path:', args.weight_path)
17 | print('Adapter Path: ', args.adapter)
18 | print('Seed: ', args.seed)
19 | print('mbatch_size: ', args.mbatch_size)
20 |
21 |
22 | import random
23 | import json
24 | import os
25 |
26 | # import wandb
27 | import torch
28 | import numpy as np
29 | import bitsandbytes as bnb
30 | from tqdm import tqdm
31 | import transformers
32 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq
33 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig
34 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
35 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training
36 | from datasets import load_dataset
37 |
38 | # from src.dataset import InstructDataset, ChatDataset
39 | # from src.util.dl import set_random_seed, fix_tokenizer, fix_model
40 | # from src.util.io import read_jsonl
41 |
42 | from utils import *
43 | from data_mnli_label import *
44 |
45 | from llmtune.executor import load_llm, load_adapter
46 | from llmtune.engine.lora.peft import quant_peft
47 |
48 |
49 | # os.environ["WANDB_LOG_MODEL"] = "checkpoint"
50 | os.environ["WANDB_DISABLED"] = "true"
51 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
52 |
53 |
54 | class SavePeftModelCallback(TrainerCallback):
55 | def on_save(
56 | self,
57 | args: TrainingArguments,
58 | state: TrainerState,
59 | control: TrainerControl,
60 | **kwargs,
61 | ):
62 | checkpoint_folder = os.path.join(
63 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
64 | )
65 |
66 | peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
67 | kwargs["model"].save_pretrained(peft_model_path)
68 | return control
69 |
70 | checkpoint = None
71 | seed = args.seed
72 | train_sample_rate = 1.0
73 | val_sample_rate = 1.0
74 | local_rank = 0
75 | # report_to = "wandb"
76 | output_dir = args.adapter
77 |
78 | set_random_seed(seed)
79 | logging.set_verbosity_info()
80 |
81 | # with open(config_file, "r") as r:
82 | # config = json.load(r)
83 |
84 | device_map = "auto"
85 | world_size = int(os.environ.get("WORLD_SIZE", 1))
86 | ddp = world_size != 1
87 |
88 | if ddp:
89 | device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
90 |     # gradient_accumulation_steps = gradient_accumulation_steps // world_size  # undefined at this point; GRADIENT_ACCUMULATION_STEPS below is what the Trainer uses
91 |
92 | #deepspeed_config = config.get("deepspeed")
93 |
94 |
95 |
96 | ### Training Configuration
97 | #trainer_config = config["trainer"]
98 |
99 | MICRO_BATCH_SIZE = args.mbatch_size  # per-device micro-batch size (from --mbatch_size)
100 | BATCH_SIZE = 256
101 | GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
102 | EPOCHS = 1  # single pass over MNLI (used as num_train_epochs below)
103 | LEARNING_RATE = 1e-3  # higher than the 3e-4 "Karpathy constant"
104 | CUTOFF_LEN = 128 # 128 accounts for about 95% of the data
105 | LORA_R = 8
106 | LORA_ALPHA = 16
107 | LORA_DROPOUT = 0.05
108 | VAL_SET_SIZE= 2000
109 |
110 | def preprocess_logits_for_metrics(logits, labels):
111 | """
112 | Original Trainer may have a memory leak.
113 | This is a workaround to avoid storing too many tensors that are not needed.
114 | """
115 | pred_ids = torch.argmax(logits[0], dim=-1)
116 | return pred_ids, labels
117 |
118 | trainer_config = transformers.TrainingArguments(
119 | per_device_train_batch_size = MICRO_BATCH_SIZE,
120 | per_device_eval_batch_size = MICRO_BATCH_SIZE,
121 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
122 | warmup_ratio=0.06,
123 | num_train_epochs=EPOCHS,
124 | # max_steps = 350,
125 | learning_rate=LEARNING_RATE,
126 | lr_scheduler_type = "cosine", ## LoRA original paper uses linear
127 | fp16=True,
128 | logging_steps=150,
129 | evaluation_strategy="steps",
130 | logging_strategy="steps",
131 | save_strategy="steps",
132 | eval_steps=300,
133 | save_steps=300,
134 | # report_to=report_to,
135 | output_dir=output_dir,
136 | optim = "adamw_torch",
137 | torch_compile = False,
138 | save_total_limit=2,
139 | load_best_model_at_end=True,
140 | ddp_find_unused_parameters=False if ddp else None,
141 | )
142 |
143 |
144 | # ### Apply LoRA
145 | #
146 | # Here comes the magic with `peft`: attach low-rank adapters (LoRA) to the quantized base model. In this script the adapter is attached via llmtune's `load_adapter` helper rather than by calling `get_peft_model` directly.
147 |
148 | target_modules = None
149 | target_modules = ['q_proj', 'v_proj'] # edit with your desired target modules
150 | #lora_config = config.get("lora")
151 | lora_config = LoraConfig(
152 | r=8, lora_alpha=32, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
153 | )
154 |
155 | callbacks = [SavePeftModelCallback] if lora_config else []
156 | ##no need to use callbacks
157 | callbacks = []
158 |
159 | training_args = trainer_config
160 |
161 |
162 | model_name = "huggyllama/llama-13b"
163 |
164 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
165 | tokenizer = fix_tokenizer(tokenizer)
166 | # tokenizer.save_pretrained(output_dir)
167 |
168 | dataset = load_dataset('multi_nli')
169 | train_records = dataset['train']
170 | val_records = dataset['validation_matched']
171 | #random.shuffle(train_records)
172 | print("train_record[0]: ",train_records[0])
173 |
174 | model_type = "causal"
175 | templates_path = "llama_lora_mnli_label.json"
176 | only_target_loss = False
177 | mode = "instruct"
178 |
179 | llmtune_model_name = args.model_name
180 | llmtune_quantized_weights_path = args.weight_path
181 | llmtune_groupsize = 64
182 |
183 | if mode == "instruct":
184 | max_source_tokens_count = 64 # Changed depending on the dataset
185 | max_target_tokens_count = 4
186 | target_field = ""
187 |     source_field = ""  # not used by this template (the original alpaca-lora format has an extra "input" field alongside the instruction)
188 |
189 | train_dataset = InstructDataset(
190 | train_records,
191 | tokenizer,
192 | max_source_tokens_count=max_source_tokens_count,
193 | max_target_tokens_count=max_target_tokens_count,
194 | sample_rate=train_sample_rate,
195 | input_type=model_type,
196 | templates_path=templates_path,
197 | target_field=target_field,
198 | source_field=source_field,
199 | only_target_loss=only_target_loss
200 | )
201 |
202 | val_dataset = InstructDataset(
203 | val_records,
204 | tokenizer,
205 | max_source_tokens_count=max_source_tokens_count,
206 | max_target_tokens_count=max_target_tokens_count,
207 | sample_rate=val_sample_rate,
208 | input_type=model_type,
209 | templates_path=templates_path,
210 | target_field=target_field,
211 | source_field=source_field,
212 | only_target_loss=only_target_loss
213 | )
214 |
215 |     ## Build DataLoaders (saving them to disk is commented out below)
216 | dataloader_train = torch.utils.data.DataLoader(train_dataset)
217 | # torch.save(dataloader_train,'dataloader_train.pth')
218 |
219 | dataloader_val = torch.utils.data.DataLoader(val_dataset)
220 | # torch.save(dataloader_val,'dataloader_val.pth')
221 |
222 | else:
223 | assert False
224 |
225 | if "seq2seq" in model_type:
226 | data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8)
227 | else:
228 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)
229 |
230 | print("INPUT_IDS")
231 | print(data_collator([train_dataset[0], train_dataset[1]])["input_ids"][0])
232 | print("MASK")
233 | print(data_collator([train_dataset[0], train_dataset[1]])["attention_mask"][0])
234 | print("LABELS")
235 | print(data_collator([train_dataset[0], train_dataset[1]])["labels"][0])
236 |
237 | llm, _ = load_llm(
238 | llmtune_model_name,
239 | llmtune_quantized_weights_path,
240 | llmtune_groupsize
241 | )
242 | model = fix_model(llm, tokenizer, use_resize=False)
243 |
244 | # Default model generation params
245 | model.config.num_beams = 5
246 | if mode == "instruct":
247 | max_tokens_count = max_target_tokens_count + max_source_tokens_count + 1
248 | model.config.max_length = max_tokens_count if model_type == "causal" else max_target_tokens_count
249 |
250 | if not ddp and torch.cuda.device_count() > 1:
251 | model.is_parallelizable = True
252 | model.model_parallel = True
253 |
254 | if lora_config:
255 | #lora_config = LoraConfig(**lora_config)
256 | # model = get_peft_model(model, lora_config)
257 | model = load_adapter(model, lora_config=lora_config)
258 |
259 | trainer_class = Trainer ##if not omit_base_model_save else TrainerNoBaseSave
260 | print("Trainer class:", trainer_class)
261 | trainer = trainer_class(
262 | model=model,
263 | args=training_args,
264 | train_dataset=train_dataset,
265 | eval_dataset=val_dataset,
266 | callbacks=callbacks,
267 | data_collator=data_collator,
268 | # preprocess_logits_for_metrics = preprocess_logits_for_metrics,
269 | )
270 |
271 | # with wandb.init(project="llama_ft_samsum", name="llama finetuning run") as run: ## changed the name don't forget
272 | checkpoint_dir = output_dir
273 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir):
274 | trainer.train(resume_from_checkpoint=True)
275 | else:
276 | trainer.train()
277 | model.save_pretrained(output_dir)
--------------------------------------------------------------------------------
/finetune/samsum-llama/train_samsum_4bit_bnb.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | # Set up the argument parser
4 | parser = argparse.ArgumentParser(description='Python script to work with models')
5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True)
6 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True)
7 | parser.add_argument('--mbatch_size', type=int, help='mbatch size for training', required=True)
8 | parser.add_argument('--seed', type=int, help='model seed number', required=True)
9 | parser.add_argument('--repo_name', type=str, help='HF model name', required=True)
10 |
11 |
12 | # Parse the arguments
13 | args = parser.parse_args()
14 |
15 | # Use the command line arguments in your script
16 | print('Model Name:', args.model_name)
17 | print('Adapter Path: ', args.adapter)
18 | print('Seed: ', args.seed)
19 | print('mbatch_size: ', args.mbatch_size)
20 |
21 |
22 | import random
23 | import json
24 | import os
25 |
26 | # import wandb
27 | import torch
28 | import numpy as np
29 | import bitsandbytes as bnb
30 | from tqdm import tqdm
31 | import transformers
32 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq
33 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig
34 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
35 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training
36 | from datasets import load_dataset
37 |
38 | from utils import *
39 | from data import *
40 |
41 |
42 |
43 |
44 | os.environ["WANDB_DISABLED"] = "true"
45 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
46 |
47 |
48 |
49 |
50 | class SavePeftModelCallback(TrainerCallback):
51 | def on_save(
52 | self,
53 | args: TrainingArguments,
54 | state: TrainerState,
55 | control: TrainerControl,
56 | **kwargs,
57 | ):
58 | checkpoint_folder = os.path.join(
59 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
60 | )
61 |
62 | peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
63 | kwargs["model"].save_pretrained(peft_model_path)
64 | return control
65 |
66 |
67 | checkpoint = None
68 | seed = args.seed
69 | train_sample_rate = 1.0
70 | val_sample_rate = 1.0
71 | local_rank = 0
72 | output_dir = args.adapter
73 |
74 | set_random_seed(seed)
75 | logging.set_verbosity_info()
76 |
77 | # with open(config_file, "r") as r:
78 | # config = json.load(r)
79 |
80 |
81 | device_map = "auto"
82 | world_size = int(os.environ.get("WORLD_SIZE", 1))
83 | ddp = world_size != 1
84 | if ddp:
85 | device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
86 |     # gradient_accumulation_steps = gradient_accumulation_steps // world_size  # undefined at this point; GRADIENT_ACCUMULATION_STEPS below is what the Trainer uses
87 |
88 |
89 | #deepspeed_config = config.get("deepspeed")
90 |
91 |
92 |
93 |
94 | ### Training Configuration
95 | #trainer_config = config["trainer"]
96 |
97 | MICRO_BATCH_SIZE = args.mbatch_size  # per-device micro-batch size (from --mbatch_size)
98 | BATCH_SIZE = 128
99 | GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
100 | EPOCHS = 3 # we don't need 3 tbh
101 | LEARNING_RATE = 1e-3  # higher than the 3e-4 "Karpathy constant"
102 | CUTOFF_LEN = 128 # 128 accounts for about 95% of the data
103 | LORA_R = 8
104 | LORA_ALPHA = 16
105 | LORA_DROPOUT = 0.05
106 | VAL_SET_SIZE= 2000
107 |
108 | def preprocess_logits_for_metrics(logits, labels):
109 | """
110 | Original Trainer may have a memory leak.
111 | This is a workaround to avoid storing too many tensors that are not needed.
112 | """
113 | pred_ids = torch.argmax(logits[0], dim=-1)
114 | return pred_ids, labels
115 |
116 | trainer_config = transformers.TrainingArguments(
117 | per_device_train_batch_size = MICRO_BATCH_SIZE,
118 | per_device_eval_batch_size = MICRO_BATCH_SIZE,
119 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
120 | warmup_ratio=0.06,
121 | #num_train_epochs=3,
122 | max_steps = 350,
123 | learning_rate=LEARNING_RATE,
124 | lr_scheduler_type = "cosine", ## LoRA original paper uses linear
125 | fp16=True,
126 | logging_steps=50,
127 | evaluation_strategy="steps",
128 | logging_strategy="steps",
129 | save_strategy="steps",
130 | eval_steps=50,
131 | save_steps=50,
132 | # report_to=report_to,
133 | output_dir=output_dir,
134 | optim = "adamw_torch",
135 | torch_compile = False,
136 | save_total_limit=2,
137 | load_best_model_at_end=False,
138 | ddp_find_unused_parameters=False if ddp else None,
139 | )
140 |
141 |
142 | # ### Apply LoRA
143 | #
144 | # Here comes the magic with `peft`! Load a `PeftModel` and attach low-rank adapters (LoRA) using the `get_peft_model` utility function from `peft`.
145 |
146 | target_modules = None
147 | target_modules = ['q_proj', 'v_proj'] # edit with your desired target modules
148 | #lora_config = config.get("lora")
149 | lora_config = LoraConfig(
150 | r=8, lora_alpha=32, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
151 | )
152 |
153 | callbacks = [SavePeftModelCallback] if lora_config else []
154 | ##no need to use callbacks
155 | callbacks = []
156 |
157 | training_args = trainer_config
158 |
159 |
160 | model_name = args.model_name
161 |
162 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
163 | tokenizer = fix_tokenizer(tokenizer)
164 | # tokenizer.save_pretrained(output_dir)
165 |
166 | dataset = load_dataset('samsum')
167 | train_records = dataset['train']
168 | val_records = dataset['test']
169 | #random.shuffle(train_records)
170 | print("train_record[0]: ",train_records[0])
171 |
172 | ## Dataset / prompt config (samsum)
173 | model_type = "causal"
174 | templates_path = "llama_lora_samsum.json"
175 | only_target_loss = False
176 | mode = "instruct"
177 |
178 |
179 | if mode == "instruct":
180 | max_source_tokens_count = 255 # Changed depending on the dataset
181 | max_target_tokens_count = 50
182 | target_field = "summary"
183 |     source_field = ""  # not used by this template (the original alpaca-lora format has an extra "input" field alongside the instruction)
184 |
185 | train_dataset = InstructDataset(
186 | train_records,
187 | tokenizer,
188 | max_source_tokens_count=max_source_tokens_count,
189 | max_target_tokens_count=max_target_tokens_count,
190 | sample_rate=train_sample_rate,
191 | input_type=model_type,
192 | templates_path=templates_path,
193 | target_field=target_field,
194 | source_field=source_field,
195 | only_target_loss=only_target_loss
196 | )
197 |
198 | val_dataset = InstructDataset(
199 | val_records,
200 | tokenizer,
201 | max_source_tokens_count=max_source_tokens_count,
202 | max_target_tokens_count=max_target_tokens_count,
203 | sample_rate=val_sample_rate,
204 | input_type=model_type,
205 | templates_path=templates_path,
206 | target_field=target_field,
207 | source_field=source_field,
208 | only_target_loss=only_target_loss
209 | )
210 |
211 | else:
212 | assert False
213 |
214 | if "seq2seq" in model_type:
215 | data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8)
216 | else:
217 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)
218 |
219 | print("INPUT_IDS")
220 | print(data_collator([train_dataset[0], train_dataset[1]])["input_ids"][0])
221 | print("MASK")
222 | print(data_collator([train_dataset[0], train_dataset[1]])["attention_mask"][0])
223 | print("LABELS")
224 | print(data_collator([train_dataset[0], train_dataset[1]])["labels"][0])
225 |
226 |
227 | model_types = {
228 | "causal": AutoModelForCausalLM,
229 | "seq2seq": AutoModelForSeq2SeqLM
230 | }
231 | ## Decide whether to load in 8-bit or 4-bit
232 | load_in_8bit = False
233 | load_in_4bit = True
234 | if load_in_8bit:
235 | assert not load_in_4bit
236 | model = model_types[model_type].from_pretrained(
237 | model_name,
238 | load_in_8bit=True,
239 | device_map=device_map
240 | )
241 | model = fix_model(model, tokenizer, use_resize=False)
242 | model = prepare_model_for_int8_training(model)
243 | elif load_in_4bit:
244 | assert not load_in_8bit
245 | # use_bf16 = trainer_config.get("bf16", False)
246 | use_bf16 = getattr(trainer_config, "bf16", False)
247 | compute_dtype = torch.bfloat16 if use_bf16 else torch.float16
248 | model = model_types[model_type].from_pretrained(
249 | model_name,
250 | load_in_4bit=True,
251 | device_map=device_map,
252 | quantization_config=BitsAndBytesConfig(
253 | load_in_4bit=True,
254 | llm_int8_threshold=6.0,
255 | llm_int8_has_fp16_weight=False,
256 | bnb_4bit_compute_dtype=compute_dtype,
257 | bnb_4bit_use_double_quant=True,
258 | bnb_4bit_quant_type="nf4"
259 | ),
260 | torch_dtype=torch.bfloat16 if use_bf16 else torch.float32
261 | )
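    # NF4 4-bit quantization with nested ("double") quantization of the quantization
    # constants; matmuls run in bnb_4bit_compute_dtype (bf16 when the trainer config
    # enables bf16, otherwise fp16).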
262 | model = fix_model(model, tokenizer, use_resize=False)
263 | model = prepare_model_for_int8_training(model)
264 | else:
265 | model = model_types[model_type].from_pretrained(model_name)
266 | model = fix_model(model, tokenizer)
267 |
268 | # Default model generation params
269 | model.config.num_beams = 5
270 | if mode == "instruct":
271 | max_tokens_count = max_target_tokens_count + max_source_tokens_count + 1
272 | model.config.max_length = max_tokens_count if model_type == "causal" else max_target_tokens_count
273 |
274 | if not ddp and torch.cuda.device_count() > 1:
275 | model.is_parallelizable = True
276 | model.model_parallel = True
277 |
278 | if lora_config:
279 | #lora_config = LoraConfig(**lora_config)
280 | model = get_peft_model(model, lora_config)
281 |
282 | trainer_class = Trainer ##if not omit_base_model_save else TrainerNoBaseSave
283 | print("Trainer class:", trainer_class)
284 | trainer = trainer_class(
285 | model=model,
286 | args=training_args,
287 | train_dataset=train_dataset,
288 | eval_dataset=val_dataset,
289 | callbacks=callbacks,
290 | data_collator=data_collator,
291 | preprocess_logits_for_metrics = preprocess_logits_for_metrics,
292 | )
293 |
294 | # with wandb.init(project="llama_ft_samsum", name="llama finetuning run") as run: ## changed the name don't forget
295 | checkpoint_dir = output_dir
296 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir):
297 | trainer.train(resume_from_checkpoint=True)
298 | else:
299 | trainer.train()
300 | model.save_pretrained(output_dir)
301 |
302 | trainer.model.push_to_hub(args.repo_name)
--------------------------------------------------------------------------------