├── .Rhistory ├── llmtune ├── __init__.py ├── engine │ ├── __init__.py │ ├── lora │ │ ├── __init__.py │ │ ├── peft.py │ │ ├── config.py │ │ └── utils.py │ ├── quant │ │ ├── __init__.py │ │ ├── gptq │ │ │ ├── __init__.py │ │ │ ├── quantizer.py │ │ │ ├── extras.py │ │ │ └── algorithm.py │ │ ├── algorithm.py │ │ ├── converter.py │ │ └── config.py │ └── inference │ │ ├── __init__.py │ │ ├── matmult.py │ │ ├── cuda │ │ └── quant_cuda.cpp │ │ ├── modules.py │ │ └── autograd.py ├── llms │ ├── __init__.py │ ├── opt │ │ ├── __init__.py │ │ ├── config.py │ │ └── model.py │ ├── bloom │ │ ├── __init__.py │ │ └── model.py │ ├── llama │ │ ├── __init__.py │ │ ├── config.py │ │ └── model.py │ ├── config.py │ └── autollm.py ├── utils.py ├── data │ ├── abstract.py │ ├── __init__.py │ ├── alpaca.py │ ├── text.py │ ├── gpt4all.py │ └── calibration.py ├── config.py ├── executor.py └── run.py ├── .DS_Store ├── finetune ├── samsum-llama │ ├── llama_lora_samsum.json │ ├── utils.py │ ├── eval_samsum_4bit_bnb.py │ ├── data.py │ ├── eval_samsum_4bit_llmtune.py │ ├── train_samsum_4bit.py │ └── train_samsum_4bit_bnb.py ├── samsum-opt │ ├── llama_lora_samsum.json │ ├── utils.py │ ├── data.py │ ├── eval_samsum_opt_4bit_llmtune.py │ └── train_samsum_opt_4bit_llmtune.py ├── mnli-llama │ ├── llama_lora_mnli_label.json │ ├── utils.py │ ├── data_mnli_label.py │ ├── eval_mnli_llmtune.py │ └── train_mnli_llmtune_label.py └── bbh-eval │ ├── main_dev.py │ └── bbh_dev.py ├── examples ├── push_to_hub.py ├── quantize.py ├── generate.py ├── generate-after-lora.py └── finetune.py ├── LICENSE └── README.md /.Rhistory: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/engine/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/llms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/llms/opt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/engine/lora/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/engine/quant/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/llms/bloom/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/llms/llama/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llmtune/engine/inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/llmtune/engine/quant/gptq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kuleshov-group/MODULoRA-Experiment/HEAD/.DS_Store -------------------------------------------------------------------------------- /llmtune/engine/quant/algorithm.py: -------------------------------------------------------------------------------- 1 | from llmtune.engine.quant.config import QuantConfig 2 | 3 | class QuantizationAlgorithm(): 4 | """Quantization algorthim abstract class""" 5 | def __init__(self, config: QuantConfig): 6 | self.config = config 7 | 8 | def quantize(self, model, dataloader): 9 | raise NotImplementedError -------------------------------------------------------------------------------- /finetune/samsum-llama/llama_lora_samsum.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Template used by LLAMA-SAMSUM.", 3 | "prompts_input": [ 4 | "### Summarize this: {instruction}\n ### Output: " 5 | ], 6 | "prompts_no_input": [ 7 | "### Summarize this: {instruction}\n ### Output: " 8 | ], 9 | "output_separator": "### Output: " 10 | } -------------------------------------------------------------------------------- /finetune/samsum-opt/llama_lora_samsum.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Template used by LLAMA-SAMSUM.", 3 | "prompts_input": [ 4 | "### Summarize this: {instruction}\n ### Output: " 5 | ], 6 | "prompts_no_input": [ 7 | "### Summarize this: {instruction}\n ### Output: " 8 | ], 9 | "output_separator": "### Output: " 10 | } -------------------------------------------------------------------------------- /examples/push_to_hub.py: -------------------------------------------------------------------------------- 1 | from llmtune.llms.autollm import AutoLLMForCausalLM 2 | 3 | # load model 4 | model_dir = './llama-7b-quantized' # can generate this via quantize.py 5 | llm = AutoLLMForCausalLM.from_pretrained(model_dir) 6 | 7 | # push to hub 8 | llm.push_to_hub( 9 | repo_id='', 10 | save_dir=model_dir, 11 | commit_message='first commit' 12 | ) 13 | -------------------------------------------------------------------------------- /finetune/mnli-llama/llama_lora_mnli_label.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Template used by LLAMA-MNLI-m output label.", 3 | "prompts_input": [ 4 | "### Premise: {instruction}\n ### Hypothesis: {hypothesis}\n ### Genre: {genre} ### Label:" 5 | ], 6 | "prompts_no_input": [ 7 | "### Premise: {instruction}\n ### Hypothesis: {hypothesis}\n ### Genre: {genre} ### Label:" 8 | ], 9 | "output_separator": "### Label:" 10 | } -------------------------------------------------------------------------------- /llmtune/engine/lora/peft.py: -------------------------------------------------------------------------------- 1 | """Wraps around PEFT to use QuantLoraModel instead of regular LoraModel.""" 2 | 3 | import peft as quant_peft 4 | from llmtune.engine.lora.lora import QuantLoraModel 5 | 6 | # monkey patch peft to use QuantLoraModel 7 | quant_peft.tuners.lora.LoraModel = QuantLoraModel 8 | quant_peft.peft_model.LoraModel = QuantLoraModel 9 | 10 | # the above works for PEFT at the time of writing this code; 11 | # when 
upgrading to a newer PEFT, use this insted: 12 | # quant_peft.peft_model.PEFT_TYPE_TO_MODEL_MAPPING[ 13 | # quant_peft.utils.PeftType.LORA 14 | # ] = QuantLoraModel -------------------------------------------------------------------------------- /finetune/bbh-eval/main_dev.py: -------------------------------------------------------------------------------- 1 | from fire import Fire 2 | 3 | import bbh_dev 4 | 5 | def main(task_name: str, **kwargs): 6 | task_map = dict( 7 | bbh=bbh_dev.main, 8 | ) 9 | 10 | if task_name == "all": 11 | results = {} 12 | for name, task_fn in task_map.items(): 13 | score = task_fn(**kwargs) 14 | results[name] = score 15 | else: 16 | task_fn = task_map.get(task_name) 17 | if task_fn is None: 18 | raise ValueError(f"{task_name}. Choose from {list(task_map.keys())}") 19 | score = task_fn(**kwargs) 20 | results = {task_name: score} 21 | 22 | results = {name: round(score * 100, 2) for name, score in results.items()} 23 | print(results) 24 | return results 25 | 26 | if __name__ == "__main__": 27 | Fire(main) 28 | -------------------------------------------------------------------------------- /llmtune/llms/opt/config.py: -------------------------------------------------------------------------------- 1 | # from llmtune.llms.config import AutoQuantConfig, LLMType 2 | 3 | OPT_MODELS = [ 4 | "opt-6.7b-4bit", "opt-13b-4bit", 5 | "opt-6.7b-3bit", "opt-13b-3bit", 6 | ] 7 | 8 | def get_opt_config(model): 9 | if '4bit' in model: 10 | bits = 4 11 | elif '3bit' in model: 12 | bits = 3 13 | elif '2bit' in model: 14 | bits = 2 15 | 16 | if '6.7b' in model: 17 | hf_config_name = "facebook/opt-6.7b" 18 | elif '13b' in model: 19 | hf_config_name = "facebook/opt-13b" 20 | 21 | raise NotImplementedError() 22 | 23 | llm_config = AutoQuantConfig( 24 | name=model, 25 | model_type=LLMType.OPT, 26 | hf_config_name=hf_config_name, 27 | hf_tokenizer_config="", 28 | bits=bits 29 | ) 30 | return llm_config 31 | -------------------------------------------------------------------------------- /llmtune/utils.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import urllib.request 3 | 4 | def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): 5 | if type(module) in layers: 6 | return {name: module} 7 | res = {} 8 | for name1, child in module.named_children(): 9 | res.update(find_layers( 10 | child, layers=layers, name=name + '.' 
+ name1 if name != '' else name1 11 | )) 12 | return res 13 | 14 | def to_half_precision(model): 15 | for n, m in model.named_modules(): 16 | if '4bit' in str(type(m)) or 'QuantLinear' in str(type(m)): 17 | # m.zeros = m.zeros.half() 18 | m.scales = m.scales.half() 19 | if m.bias is not None: 20 | m.bias = m.bias.half() 21 | return model 22 | 23 | def download_file(url, path): 24 | print('Starting download') 25 | urllib.request.urlretrieve(url, path) 26 | print('Done') -------------------------------------------------------------------------------- /llmtune/data/abstract.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict, Any 3 | 4 | 5 | # Abstract train data loader 6 | class AbstractTrainData(ABC): 7 | """ 8 | """ 9 | @abstractmethod 10 | def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len: int) -> None: 11 | """ 12 | Args: 13 | dataset (str): Path to dataset 14 | val_set_size (int) : Size of validation set 15 | tokenizer (_type_): Tokenizer 16 | """ 17 | self.tokenizer = tokenizer 18 | self.dataset = dataset 19 | self.val_set_size = val_set_size 20 | self.cutoff_len = cutoff_len 21 | self.train_data = None 22 | self.val_data = None 23 | 24 | @abstractmethod 25 | def tokenize(self, prompt: str) -> Dict[str, Any]: 26 | pass 27 | 28 | @abstractmethod 29 | def prepare_data(self) -> None: 30 | """Loads dataset from file and prepares train_data for trainer.""" 31 | pass 32 | -------------------------------------------------------------------------------- /llmtune/data/__init__.py: -------------------------------------------------------------------------------- 1 | from llmtune.data.text import TrainTxt 2 | from llmtune.data.alpaca import TrainSAD 3 | from llmtune.data.gpt4all import TrainGPT4All 4 | 5 | def load_finetuning_data(tune_config, tokenizer): 6 | if tune_config.ds_type == "alpaca": 7 | data = TrainSAD( 8 | tune_config.dataset, 9 | tune_config.val_set_size, 10 | tokenizer, 11 | tune_config.cutoff_len 12 | ) 13 | elif tune_config.ds_type == "gpt4all": 14 | raise NotImplementedError('GPT4All dataset currently not supported') 15 | data = TrainGPT4All( 16 | tune_config.dataset, 17 | tune_config.val_set_size, 18 | tokenizer, 19 | tune_config.cutoff_len 20 | ) 21 | else: 22 | raise ValueError(f"Invalid data name: {tune_config.ds_type}") 23 | # data.prepare_data( 24 | # thd=tune_config.txt_row_thd, use_eos_token=tune_config.use_eos_token 25 | # ) 26 | data.prepare_data() 27 | return data -------------------------------------------------------------------------------- /llmtune/engine/quant/converter.py: -------------------------------------------------------------------------------- 1 | from llmtune.engine.inference.modules import QuantLinear 2 | 3 | def make_quant( 4 | module, names, bits, groupsize=-1, name='', is_cuda=True 5 | ): 6 | if isinstance(module, QuantLinear): 7 | return 8 | for attr in dir(module): 9 | tmp = getattr(module, attr) 10 | name1 = name + '.' + attr if name != '' else attr 11 | if name1 in names: 12 | setattr( 13 | module, attr, QuantLinear( 14 | bits=bits, 15 | groupsize=groupsize, 16 | in_features=tmp.in_features, 17 | out_features=tmp.out_features, 18 | bias=tmp.bias, 19 | is_cuda=is_cuda, 20 | ) 21 | ) 22 | for name1, child in module.named_children(): 23 | make_quant( 24 | child, 25 | names, 26 | bits=bits, 27 | name=name + '.' 
+ name1 if name != '' else name1, 28 | groupsize=groupsize, 29 | is_cuda=is_cuda 30 | ) 31 | -------------------------------------------------------------------------------- /examples/quantize.py: -------------------------------------------------------------------------------- 1 | from llmtune.llms.autollm import AutoLLMForCausalLM 2 | from llmtune.engine.quant.config import QuantConfig 3 | from llmtune.engine.quant.gptq.executor import GPTQAlgorithm 4 | from llmtune.data.calibration import get_calibration_loaders 5 | 6 | # load model 7 | model_name = 'decapoda-research/llama-7b-hf' 8 | llm = AutoLLMForCausalLM.from_pretrained(model_name) 9 | llm.eval() 10 | 11 | # set up quantization config 12 | config = QuantConfig( 13 | bits=4, 14 | dataset='c4', 15 | seed=0, 16 | nsamples=128, 17 | percdamp=.01, 18 | groupsize=64, 19 | act_order=True, 20 | nearest=False, 21 | save='./llama-7b-quantized' 22 | ) 23 | 24 | # load gptq calibration data 25 | dataloader, _ = get_calibration_loaders( 26 | config.dataset, 27 | nsamples=config.nsamples, 28 | seed=config.seed, 29 | model=llm.base_model.name_or_path, 30 | seqlen=llm.base_model.seqlen 31 | ) 32 | 33 | # create quantization algorithm 34 | gptq = GPTQAlgorithm(config) 35 | llm = gptq.quantize(llm, dataloader) 36 | 37 | llm.save_pretrained(config.save) 38 | print(f'Model weights saved to: {config.save}') -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 oscarscaro 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /llmtune/llms/llama/config.py: -------------------------------------------------------------------------------- 1 | # from llmtune.llms.config import AutoQuantConfig, LLMType 2 | 3 | LLAMA_MODELS = [ 4 | "llama-7b-4bit", "llama-13b-4bit", "llama-30b-4bit", "llama-65b-4bit", 5 | "llama-7b-3bit", "llama-13b-3bit", "llama-30b-3bit", "llama-65b-3bit", 6 | "llama-7b-2bit", "llama-65b-2bit", 7 | ] 8 | 9 | def get_llama_config(model): 10 | if '4bit' in model: 11 | bits = 4 12 | elif '3bit' in model: 13 | bits = 3 14 | elif '2bit' in model: 15 | bits = 2 16 | 17 | if '7b' in model: 18 | hf_config_name = "decapoda-research/llama-7b-hf" 19 | elif '13b' in model: 20 | hf_config_name = "decapoda-research/llama-13b-hf" 21 | elif '30b' in model: 22 | hf_config_name = "decapoda-research/llama-30b-hf" 23 | elif '65b' in model: 24 | hf_config_name = "decapoda-research/llama-65b-hf" 25 | 26 | raise NotImplementedError() 27 | 28 | llm_config = AutoQuantConfig( 29 | name=model, 30 | model_type=LLMType.LLAMA, 31 | hf_config_name=hf_config_name, 32 | hf_tokenizer_config="huggyllama/llama-13b", 33 | bits=bits 34 | ) 35 | return llm_config 36 | -------------------------------------------------------------------------------- /examples/generate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer 3 | from llmtune.llms.autollm import AutoLLMForCausalLM 4 | from llmtune.utils import to_half_precision 5 | 6 | # model config 7 | model_name = '' 8 | # model_name = './llama-7b-quantized' # can generate local dir via quantize.py 9 | tokenizer_name = 'huggyllama/llama-13b' 10 | DEV = 'cuda' 11 | 12 | # load model 13 | llm = AutoLLMForCausalLM.from_pretrained(model_name).to(DEV) 14 | llm.eval() 15 | llm = to_half_precision(llm) 16 | 17 | # load tokenizer 18 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) 19 | 20 | # encode prompt 21 | prompt = 'The pyramids were built by' 22 | input_ids = tokenizer.encode(prompt, return_tensors="pt").to(DEV) 23 | 24 | # generation config 25 | min_length=10 26 | max_length=200 27 | top_p=.95 28 | top_k=25 29 | temperature=1.0 30 | 31 | # generate text 32 | with torch.no_grad(): 33 | generated_ids = llm.generate( 34 | inputs=input_ids, 35 | do_sample=True, 36 | min_length=min_length, 37 | max_length=max_length, 38 | top_p=top_p, 39 | top_k=top_k, 40 | temperature=temperature, 41 | ) 42 | 43 | # decode and print 44 | output = tokenizer.decode([el.item() for el in generated_ids[0]]) 45 | print(output) 46 | -------------------------------------------------------------------------------- /llmtune/engine/quant/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from dataclasses import dataclass 4 | from transformers.utils.hub import PushToHubMixin, cached_file 5 | 6 | @dataclass 7 | class QuantConfig(PushToHubMixin): 8 | dataset: str 9 | bits: int 10 | nsamples: int 11 | groupsize: int 12 | act_order: bool 13 | percdamp: float 14 | seed: int 15 | nearest: bool 16 | save: str 17 | 18 | def save_pretrained(self, save_dir: str, **kwargs): 19 | config_path = os.path.join(save_dir, "quant_config.json") 20 | with open(config_path, "w", encoding="utf-8") as f: 21 | json.dump(self.to_dict(), f, indent=2) 22 | 23 | @classmethod 24 | def from_pretrained(cls, save_dir: str, **kwargs): 25 | config_filename = "quant_config.json" 26 | if 
os.path.isdir(save_dir): 27 | config_path = os.path.join(save_dir, config_filename) 28 | else: 29 | config_path = cached_file(save_dir, config_filename) 30 | with open(config_path, "r", encoding="utf-8") as f: 31 | return cls(**json.load(f)) 32 | 33 | def to_dict(self): 34 | return { 35 | 'dataset': self.dataset, 36 | 'bits': self.bits, 37 | 'nsamples': self.nsamples, 38 | 'groupsize': self.groupsize, 39 | 'act_order': self.act_order, 40 | 'percdamp': self.percdamp, 41 | 'seed': self.seed, 42 | 'nearest': self.nearest, 43 | 'save': self.save, 44 | } -------------------------------------------------------------------------------- /examples/generate-after-lora.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, GenerationConfig 3 | from llmtune.llms.autollm import AutoLLMForCausalLM 4 | from llmtune.utils import to_half_precision 5 | from llmtune.engine.lora.peft import quant_peft 6 | 7 | # model config 8 | model_name = '' 9 | # model_name = './llama-7b-quantized' # can generate local dir via quantize.py 10 | tokenizer_name = 'huggyllama/llama-7b' 11 | DEV = 'cuda' 12 | 13 | # load model 14 | llm = AutoLLMForCausalLM.from_pretrained(model_name).to(DEV) 15 | llm.eval() 16 | llm = to_half_precision(llm) 17 | 18 | # load tokenizer 19 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) 20 | 21 | # load lora from existing checkpoint 22 | adapter_path = './llama-7b-quantized-lora' # can generate this via finetune.py 23 | model = quant_peft.PeftModel.from_pretrained( 24 | llm, adapter_path, 25 | device_map='auto' 26 | ) 27 | print(adapter_path, 'loaded') 28 | 29 | # encode prompt 30 | prompt = 'Write a detailed step-by-step recipe for a blueberry lasagna dish' 31 | input_ids = tokenizer.encode(prompt, return_tensors="pt").to(DEV) 32 | 33 | # generation config 34 | min_length=10 35 | max_length=200 36 | top_p=.95 37 | top_k=25 38 | temperature=1.0 39 | 40 | # generate text 41 | with torch.no_grad(): 42 | generated_ids = model.generate( 43 | inputs=input_ids, 44 | generation_config=GenerationConfig( 45 | do_sample=True, 46 | min_length=min_length, 47 | max_length=max_length, 48 | top_p=top_p, 49 | top_k=top_k, 50 | temperature=temperature, 51 | ) 52 | ) 53 | 54 | # decode and print 55 | output = tokenizer.decode([el.item() for el in generated_ids[0]]) 56 | print(output) 57 | -------------------------------------------------------------------------------- /llmtune/engine/lora/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class FinetuneConfig: 4 | """Config holder for finetuning""" 5 | def __init__( 6 | self, dataset: str, ds_type: str, 7 | lora_out_dir: str, 8 | mbatch_size: int, batch_size: int, 9 | epochs: int, lr: float, 10 | cutoff_len: int, 11 | lora_r: int, lora_alpha: int, lora_dropout: float, 12 | val_set_size: float, 13 | warmup_steps: int, save_steps: int, 14 | save_total_limit: int, logging_steps: int, 15 | ): 16 | self.dataset = dataset 17 | self.ds_type = ds_type 18 | self.lora_out_dir = lora_out_dir 19 | self.mbatch_size = mbatch_size 20 | self.batch_size = batch_size 21 | self.gradient_accumulation_steps = self.batch_size // self.mbatch_size 22 | self.epochs = epochs 23 | self.lr = lr 24 | self.cutoff_len = cutoff_len 25 | self.lora_r = lora_r 26 | self.lora_alpha = lora_alpha 27 | # self.lora_dropout = 0 if gradient_checkpointing else lora_dropout 28 | self.lora_dropout = lora_dropout 29 | self.val_set_size = 
int(val_set_size) if val_set_size > 1.0 else float(val_set_size) 30 | self.warmup_steps = warmup_steps 31 | self.save_steps = save_steps 32 | self.save_total_limit = save_total_limit 33 | self.logging_steps = logging_steps 34 | self.world_size = int(os.environ.get("WORLD_SIZE", 1)) 35 | self.local_rank = int(os.environ.get("LOCAL_RANK", 0)) 36 | self.ddp = self.world_size != 1 37 | self.device_map = "auto" if not self.ddp else {"": self.local_rank} 38 | if self.ddp: 39 | self.gradient_accumulation_steps = self.gradient_accumulation_steps // self.world_size -------------------------------------------------------------------------------- /llmtune/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from llmtune.llms.config import AutoConfig 3 | from llmtune.llms.opt.config import OPT_MODELS 4 | from llmtune.llms.llama.config import LLAMA_MODELS 5 | from llmtune.engine.lora.config import FinetuneConfig 6 | from llmtune.engine.quant.config import QuantConfig 7 | 8 | # ---------------------------------------------------------------------------- 9 | 10 | # define some constants 11 | DEV = torch.device('cuda') 12 | LLM_MODELS = LLAMA_MODELS + OPT_MODELS 13 | 14 | # ---------------------------------------------------------------------------- 15 | 16 | # helpers for loading configs 17 | def get_finetune_config(args): 18 | return FinetuneConfig( 19 | dataset=args.dataset, 20 | ds_type=args.data_type, 21 | lora_out_dir=args.adapter, 22 | mbatch_size=args.mbatch_size, 23 | batch_size=args.batch_size, 24 | epochs=args.epochs, 25 | lr=args.lr, 26 | cutoff_len=args.cutoff_len, 27 | lora_r=args.lora_r, 28 | lora_alpha=args.lora_alpha, 29 | lora_dropout=args.lora_dropout, 30 | val_set_size=args.val_set_size, 31 | warmup_steps=args.warmup_steps, 32 | save_steps=args.save_steps, 33 | save_total_limit=args.save_total_limit, 34 | logging_steps=args.logging_steps, 35 | ) 36 | 37 | def get_quant_config(args): 38 | return QuantConfig( 39 | dataset=args.dataset, 40 | bits=args.bits, 41 | nsamples=args.nsamples, 42 | groupsize=args.groupsize, 43 | act_order=args.act_order, 44 | percdamp=args.percdamp, 45 | seed=args.seed, 46 | nearest=args.nearest, 47 | save=args.save, 48 | ) 49 | 50 | def get_llm_config(model_name_or_path): 51 | return AutoConfig(model_name_or_path) -------------------------------------------------------------------------------- /llmtune/llms/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from enum import Enum 4 | from transformers import PretrainedConfig, AutoConfig 5 | from transformers.utils.hub import PushToHubMixin, cached_file 6 | from llmtune.engine.quant.config import QuantConfig 7 | 8 | class LLMType(Enum): 9 | LLAMA = 'llama' 10 | OPT = 'opt' 11 | BLOOM = 'bloom' 12 | 13 | class AutoLLMConfig(PretrainedConfig,PushToHubMixin): 14 | def __init__( 15 | self, 16 | base_config: PretrainedConfig, 17 | quant_config: QuantConfig = None 18 | ): 19 | self.base_config = base_config 20 | self.quant_config = None 21 | if quant_config is not None: 22 | self.quant_config = quant_config 23 | 24 | @property 25 | def is_quantized(self): 26 | return self.quant_config is not None 27 | 28 | def set_quant_config(self, quant_config): 29 | if self.quant_config is not None: 30 | raise RuntimeError('quant_config already set') 31 | self.quant_config = quant_config 32 | 33 | @property 34 | def model_type(self): 35 | return self.base_config.model_type 36 | 37 | def 
save_pretrained(self, save_dir: str, **kwargs): 38 | self.base_config.save_pretrained(save_dir, **kwargs) 39 | if self.is_quantized: 40 | self.quant_config.save_pretrained(save_dir, **kwargs) 41 | 42 | @classmethod 43 | def from_pretrained(cls, save_dir: str): 44 | # load config 45 | base_config = AutoConfig.from_pretrained(save_dir) 46 | 47 | # check if quantized model and config are available 48 | try: 49 | quant_config = ( 50 | QuantConfig.from_pretrained(save_dir) 51 | ) 52 | except: 53 | quant_config = None 54 | 55 | # check if it's a valid model 56 | if base_config.model_type not in [e.value for e in LLMType]: 57 | raise NotImplementedError( 58 | f"Model type {base_config.model_type} currently not supported" 59 | ) 60 | 61 | return cls(base_config, quant_config) 62 | -------------------------------------------------------------------------------- /llmtune/llms/bloom/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from llmtune.utils import find_layers 5 | from llmtune.engine.quant.converter import make_quant 6 | 7 | def load_bloom_unquantized(llm_config): 8 | import torch 9 | from transformers import BloomForCausalLM 10 | def skip(*args, **kwargs): 11 | pass 12 | torch.nn.init.kaiming_uniform_ = skip 13 | torch.nn.init.uniform_ = skip 14 | torch.nn.init.normal_ = skip 15 | model = BloomForCausalLM.from_pretrained( 16 | llm_config.base_config.name_or_path, torch_dtype='auto' 17 | ) 18 | return model 19 | 20 | def load_bloom_quantized(llm_config, quantized_weights_path): 21 | import transformers, accelerate 22 | from transformers import BloomConfig, BloomForCausalLM 23 | 24 | with accelerate.init_empty_weights(): 25 | config = BloomConfig.from_pretrained( 26 | llm_config.base_config.name_or_path 27 | ) 28 | torch.set_default_dtype(torch.half) 29 | transformers.modeling_utils._init_weights = False 30 | torch.set_default_dtype(torch.half) 31 | model = BloomForCausalLM(config) 32 | torch.set_default_dtype(torch.float) 33 | model = model.eval() 34 | layers = find_layers(model) 35 | for name in ['lm_head']: 36 | if name in layers: 37 | del layers[name] 38 | make_quant( 39 | model, layers, llm_config.quant_config.bits, 40 | groupsize=llm_config.quant_config.groupsize 41 | ) 42 | model = accelerate.load_checkpoint_and_dispatch( 43 | model=model, 44 | checkpoint=quantized_weights_path, 45 | device_map="auto", 46 | # device_map={'': 0}, 47 | no_split_module_classes=["LlamaDecoderLayer"] 48 | ) 49 | return model 50 | 51 | def load_bloom(llm_config, quantized_weights_path): 52 | if quantized_weights_path is None: 53 | model = load_bloom_unquantized(llm_config) 54 | else: 55 | model = load_bloom_quantized( 56 | llm_config, quantized_weights_path 57 | ) 58 | model.seqlen = 2048 59 | return model 60 | 61 | def load_bloom_tokenizer(name_or_path): 62 | from transformers import BloomTokenizer 63 | 64 | tokenizer = BloomTokenizer.from_pretrained( 65 | name_or_path 66 | ) 67 | tokenizer.truncation_side = 'left' 68 | return tokenizer 69 | -------------------------------------------------------------------------------- /llmtune/llms/llama/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from llmtune.utils import find_layers 5 | from llmtune.engine.quant.converter import make_quant 6 | 7 | def load_llama_unquantized(llm_config): 8 | import torch 9 | from transformers import LlamaForCausalLM 10 | def skip(*args, **kwargs): 11 
| pass 12 | torch.nn.init.kaiming_uniform_ = skip 13 | torch.nn.init.uniform_ = skip 14 | torch.nn.init.normal_ = skip 15 | model = LlamaForCausalLM.from_pretrained( 16 | llm_config.base_config.name_or_path, torch_dtype='auto' 17 | ) 18 | return model 19 | 20 | def load_llama_quantized(llm_config, quantized_weights_path): 21 | import transformers, accelerate 22 | from transformers import LlamaConfig, LlamaForCausalLM 23 | 24 | with accelerate.init_empty_weights(): 25 | config = LlamaConfig.from_pretrained( 26 | llm_config.base_config.name_or_path 27 | ) 28 | torch.set_default_dtype(torch.half) 29 | transformers.modeling_utils._init_weights = False 30 | torch.set_default_dtype(torch.half) 31 | model = LlamaForCausalLM(config) 32 | torch.set_default_dtype(torch.float) 33 | model = model.eval() 34 | layers = find_layers(model) 35 | for name in ['lm_head']: 36 | if name in layers: 37 | del layers[name] 38 | make_quant( 39 | model, layers, llm_config.quant_config.bits, 40 | groupsize=llm_config.quant_config.groupsize 41 | ) 42 | model = accelerate.load_checkpoint_and_dispatch( 43 | model=model, 44 | checkpoint=quantized_weights_path, 45 | device_map="auto", 46 | # device_map={'': 0}, 47 | no_split_module_classes=["LlamaDecoderLayer"] 48 | ) 49 | return model 50 | 51 | def load_llama(llm_config, quantized_weights_path): 52 | if quantized_weights_path is None: 53 | model = load_llama_unquantized(llm_config) 54 | else: 55 | model = load_llama_quantized( 56 | llm_config, quantized_weights_path 57 | ) 58 | model.seqlen = 2048 59 | return model 60 | 61 | def load_llama_tokenizer(name_or_path): 62 | from transformers import LlamaTokenizer 63 | 64 | tokenizer = LlamaTokenizer.from_pretrained( 65 | name_or_path 66 | ) 67 | tokenizer.truncation_side = 'left' 68 | return tokenizer 69 | -------------------------------------------------------------------------------- /llmtune/engine/lora/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023-present the HuggingFace Inc. team. 3 | # Edite by Volodymyr Kuleshov 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import torch 18 | 19 | 20 | def prepare_model_for_int4_training( 21 | model, output_embedding_layer_name="lm_head", use_gradient_checkpointing=False, layer_norm_names=["layer_norm"] 22 | ): 23 | r""" 24 | This method wrapps the entire protocol for preparing a model before running a training. 
This includes: 25 | 1- casting the layernorm in fp32, 2- making the output embedding layer require grads, 3- adding the upcasting of the lm 26 | head to fp32. 27 | Args: 28 | model, (`transformers.PreTrainedModel`): 29 | The loaded model from `transformers` 30 | """ 31 | # loaded_in_8bit = getattr(model, "is_loaded_in_8bit", False) 32 | loaded_in_4bit = True 33 | 34 | for name, param in model.named_parameters(): 35 | # freeze base model's layers 36 | param.requires_grad = False 37 | 38 | if loaded_in_4bit: 39 | # cast layer norm in fp32 for stability for 4bit models 40 | if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names): 41 | param.data = param.data.to(torch.float32) 42 | 43 | if loaded_in_4bit and use_gradient_checkpointing: 44 | raise NotImplementedError() 45 | 46 | if hasattr(model, output_embedding_layer_name): 47 | output_embedding_layer = getattr(model, output_embedding_layer_name) 48 | input_dtype = output_embedding_layer.weight.dtype 49 | 50 | class CastOutputToFloat(torch.nn.Sequential): 51 | r""" 52 | Manually cast to the expected dtype of the lm_head as sometimes there is a final layer norm that is cast 53 | in fp32 54 | """ 55 | 56 | def forward(self, x): 57 | return super().forward(x.to(input_dtype)).to(torch.float32) 58 | 59 | setattr(model, output_embedding_layer_name, CastOutputToFloat(output_embedding_layer)) 60 | 61 | return model 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ModuLoRA 2 | Code repository (experiment) for the paper "ModuLoRA: Finetuning 3-Bit LLMs on Consumer GPUs by Integrating with Modular Quantizers", [ArXiv](https://arxiv.org/abs/2309.16119). 3 | 4 | **This repo builds on [LLMtools](https://github.com/kuleshov-group/llmtools), with added support for custom dataset preparation and evaluation to reproduce our experiments.** 5 | 6 | **Abstract:** We propose a memory-efficient finetuning algorithm for large language models (LLMs) that supports 7 | finetuning LLMs with 65B parameters in 3-bit or 4-bit precision on as little as one 48GB GPU. Our 8 | method, modular low-rank adaptation (MODULORA), integrates any user-specified weight quantizer 9 | with finetuning via low-rank adapters (LoRAs). Our approach relies on a simple quantization-agnostic 10 | backward pass that adaptively materializes low-precision LLM weights from a custom black-box 11 | quantization module. This approach enables finetuning 3-bit LLMs for the first time—leveraging 12 | state-of-the-art 3-bit OPTQ quantization often outperforms finetuning that relies on less sophisticated 13 | 4-bit and 8-bit methods. In our experiments, MODULORA attains competitive performance on text 14 | classification, natural language inference, and instruction following tasks using significantly less 15 | memory than existing approaches, and we also surpass the state-of-the-art ROUGE score on a popular 16 | summarization task. We release MODULORA together with a series of low-precision models— 17 | including the first family of 3-bit instruction following Alpaca LLMs—as part of LLMTOOLS, a 18 | user-friendly library for quantizing, running, and finetuning LLMs on consumer GPUs. 
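To make the core idea concrete, here is a minimal, self-contained sketch of a quantization-agnostic LoRA layer. It is an illustration only, not the library's implementation (see `llmtune/engine/inference/` and `llmtune/engine/lora/` for the real code), and the `FakeQuantizedWeight` class is a made-up stand-in for an arbitrary black-box quantizer such as OPTQ. The frozen base weight stays in low precision and is re-materialized on the fly in both the forward and the backward pass, so only the LoRA factors receive gradients:

```python
import torch
import torch.nn as nn

class FakeQuantizedWeight:
    """Toy stand-in for a black-box quantizer (e.g. OPTQ): any object works as
    long as it can hand back a dense weight matrix via dequantize()."""
    def __init__(self, weight: torch.Tensor):
        self.scale = weight.abs().max() / 127.0
        self.q = torch.round(weight / self.scale).to(torch.int8)  # low-precision storage

    def dequantize(self) -> torch.Tensor:
        return self.q.float() * self.scale

class QuantMatMul(torch.autograd.Function):
    """Quantization-agnostic matmul: the dense weight is materialized on the fly
    in both passes and immediately discarded, so only the quantized form persists."""
    @staticmethod
    def forward(ctx, x, qweight):
        ctx.qweight = qweight
        return x @ qweight.dequantize().t()

    @staticmethod
    def backward(ctx, grad_out):
        # re-materialize the dense weight just for this step; the frozen
        # base weight itself receives no gradient
        return grad_out @ ctx.qweight.dequantize(), None

class QuantLoRALinear(nn.Module):
    """Frozen quantized base layer plus a trainable low-rank update (B @ A)."""
    def __init__(self, weight: torch.Tensor, r: int = 8, alpha: int = 16):
        super().__init__()
        out_features, in_features = weight.shape
        self.qweight = FakeQuantizedWeight(weight)
        self.lora_A = nn.Parameter(torch.randn(r, in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.zeros(out_features, r))
        self.scaling = alpha / r

    def forward(self, x):
        base = QuantMatMul.apply(x, self.qweight)
        return base + (x @ self.lora_A.t() @ self.lora_B.t()) * self.scaling

# toy usage: gradients flow only into the LoRA factors
layer = QuantLoRALinear(torch.randn(32, 64))
layer(torch.randn(4, 64)).sum().backward()
print(layer.lora_A.grad.shape, layer.lora_B.grad.shape)  # (8, 64) and (32, 8)
```

Re-dequantizing in the backward pass trades a little extra compute for memory, which is what lets 65B-parameter models be finetuned on a single 48GB GPU.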
19 | 20 | 21 | # Repository Overview 22 | 23 | There are several directories in this repo: 24 | * [llmtune/](llmtune) contains the source code for the package `llmtune`, which needs to be installed to run the examples we provide; 25 | * [examples/](examples/) contains example implementations of 4-bit and 3-bit quantization using OPTQ, finetuning with the Alpaca dataset, and model generation after applying finetuned LoRA adapter weights; 26 | * [finetune/samsum-llama/](finetune/samsum-llama) contains the implementation of finetuning LLaMA models on the SAMSum benchmark with LoRA using our package and bitsandbytes, which can be used to reproduce the results in our paper; 27 | * [finetune/mnli-llama/](finetune/mnli-llama) contains the implementation of finetuning LLaMA models on the MNLI benchmark with LoRA using our package and bitsandbytes, which produces competitive results compared to SOTA; 28 | * Other finetuning scripts can also be found in the same directory: [OPT](finetune/samsum-opt), [BLOOM](finetune/mnli-bloom); 29 | * See how we train `MODULoRA` 3-bit / 4-bit models in [SAMSum-LLAMA](finetune/samsum-llama/train_samsum_4bit.py), [MNLI-LLAMA](finetune/mnli-llama/train_mnli_llmtune_label.py), and [BBH-LLAMA](finetune/mnli-llama/modeling_roberta.py) 30 | * See how we evaluate `MODULoRA` results in [SAMSum-LLAMA](finetune/samsum-llama/eval_samsum_4bit_llmtune.py), [MNLI-LLAMA](finetune/mnli-llama/eval_mnli_llmtune.py), and [BBH-LLAMA](finetune/bbh-eval/main_dev.py) 31 | 32 | -------------------------------------------------------------------------------- /llmtune/data/alpaca.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any 2 | from datasets import load_dataset 3 | from llmtune.data.abstract import AbstractTrainData 4 | 5 | DEFAULT_HF_PATH = "kuleshov/alpaca-data" 6 | 7 | class TrainSAD(AbstractTrainData): 8 | def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len) -> None: 9 | super().__init__(dataset, val_set_size, tokenizer, cutoff_len) 10 | 11 | def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]: 12 | # there's probably a way to do this with the tokenizer settings 13 | # but again, gotta move fast 14 | if use_eos_token: 15 | result = self.tokenizer( 16 | prompt + self.tokenizer.eos_token, 17 | truncation=True, 18 | max_length=self.cutoff_len, 19 | padding=False, 20 | ) 21 | if ( 22 | result["input_ids"][-1] != self.tokenizer.eos_token_id 23 | and len(result["input_ids"]) < self.cutoff_len 24 | ): 25 | result["input_ids"].append(self.tokenizer.eos_token_id) 26 | result["attention_mask"].append(1) 27 | return result 28 | else: 29 | result = self.tokenizer( 30 | prompt, 31 | truncation=True, 32 | max_length=self.cutoff_len + 1, 33 | padding="max_length", 34 | ) 35 | return { 36 | "input_ids": result["input_ids"][:-1], 37 | "attention_mask": result["attention_mask"][:-1], 38 | } 39 | 40 | def prepare_data(self, use_eos_token=True, **kwargs) -> None: 41 | if self.dataset: 42 | data = load_dataset("json", data_files=self.dataset) 43 | else: 44 | data = load_dataset(DEFAULT_HF_PATH) 45 | 46 | if self.val_set_size > 0: 47 | train_val = data["train"].train_test_split( 48 | test_size=self.val_set_size, shuffle=True, seed=42 49 | ) 50 | self.train_data = train_val["train"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token)) 51 | self.val_data = train_val["test"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token)) 52 | 
else: 53 | self.train_data = data["train"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token)) 54 | self.val_data = None 55 | 56 | # Auxiliary methods 57 | def generate_prompt(self, data_point, **kwargs): 58 | return make_prompt( 59 | data_point["instruction"], 60 | data_point["input"], 61 | data_point["output"] 62 | ) 63 | 64 | 65 | def generate_and_tokenize_prompt(self, data_point, **kwargs): 66 | prompt = self.generate_prompt(data_point, **kwargs) 67 | return self.tokenize(prompt, **kwargs) 68 | 69 | def make_prompt(instruction, input_, output=""): 70 | return "{0}\n\n{1}\n{2}\n\n{3}\n{4}\n\n{5}\n{6}".format( 71 | "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.", 72 | "### Instruction:", 73 | instruction, 74 | "### Input:", 75 | input_, 76 | "### Response:", 77 | output 78 | ) 79 | 80 | def make_output(raw_output): 81 | return raw_output.split("### Response:")[1].strip() -------------------------------------------------------------------------------- /llmtune/data/text.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, Any 3 | from datasets import Dataset 4 | from torch.utils.data import DataLoader 5 | from llmtune.data.abstract import AbstractTrainData 6 | 7 | # LLaMA txt train data loader 8 | class TrainTxt(AbstractTrainData): 9 | def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len): 10 | super().__init__(dataset, val_set_size, tokenizer, cutoff_len) # TODO: Validation size isn't used 11 | self.cutoff_len = cutoff_len 12 | self.exceed_count = 0 13 | 14 | def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]: 15 | # there's probably a way to do this with the tokenizer settings 16 | # but again, gotta move fast 17 | if use_eos_token: 18 | result = self.tokenizer( 19 | prompt + self.tokenizer.eos_token, 20 | truncation=True, 21 | max_length=self.cutoff_len, 22 | padding=False, 23 | ) 24 | d = { 25 | "input_ids": result["input_ids"], 26 | "attention_mask": result["attention_mask"], 27 | } 28 | if ( 29 | d["input_ids"][-1] != self.tokenizer.eos_token_id 30 | and len(d["input_ids"]) < self.cutoff_len 31 | ): 32 | d["input_ids"].append(self.tokenizer.eos_token_id) 33 | d["attention_mask"].append(1) 34 | else: 35 | result = self.tokenizer( 36 | prompt, 37 | truncation=True, 38 | max_length=self.cutoff_len + 1, 39 | padding="max_length", 40 | ) 41 | d = { 42 | "input_ids": result["input_ids"][:-1], 43 | "attention_mask": result["attention_mask"][:-1], 44 | } 45 | if sum(d['attention_mask']) >= self.cutoff_len: 46 | self.exceed_count += 1 47 | return d 48 | 49 | @classmethod 50 | def format_new_rows(cls, rows, thd=128): 51 | r_b = '' 52 | new_rows = [] 53 | for row in rows: 54 | if len(r_b) == 0: 55 | r_b += row 56 | else: 57 | r_b += '\n' + row 58 | if len(r_b) > thd: 59 | new_rows.append(r_b) 60 | r_b = '' 61 | if len(r_b) > thd: 62 | new_rows.append(r_b) 63 | r_b = '' 64 | return new_rows 65 | 66 | def prepare_data(self, thd=-1, use_eos_token=True, **kwargs): 67 | if os.path.isdir(self.dataset): 68 | rows = [] 69 | for filename in os.listdir(self.dataset): 70 | with open(self.dataset + filename, 'r', encoding='utf8') as file: 71 | txt = file.read() 72 | txt = txt.replace('\r\n', '\n').replace('\u3000', ' ') 73 | rows += [r for r in txt.split('\n') if r != ''] 74 | else: 75 | with open(self.dataset, 'r', encoding='utf8') 
as file: 76 | txt = file.read() 77 | txt = txt.replace('\r\n', '\n') 78 | rows = [r for r in txt.split('\n') if r != ''] 79 | if thd != -1: 80 | rows = self.format_new_rows(rows, thd=thd) 81 | data = Dataset.from_dict({"input": rows}) 82 | data = data.shuffle().map(lambda x: self.tokenize(x["input"], use_eos_token=use_eos_token)) 83 | print('Train Data: {:.2f}%'.format(self.exceed_count / len(data) * 100), 'outliers') 84 | self.train_data = data 85 | -------------------------------------------------------------------------------- /llmtune/llms/opt/model.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | from llmtune.utils import find_layers 5 | from llmtune.engine.quant.converter import make_quant 6 | 7 | def load_opt_unquantized(llm_config): 8 | from transformers import OPTForCausalLM 9 | def skip(*args, **kwargs): 10 | pass 11 | torch.nn.init.kaiming_uniform_ = skip 12 | torch.nn.init.uniform_ = skip 13 | torch.nn.init.normal_ = skip 14 | model = OPTForCausalLM.from_pretrained( 15 | llm_config.base_config.name_or_path, torch_dtype='auto' 16 | ) 17 | return model 18 | 19 | def load_opt_quantized(llm_config, quantized_weights_path): 20 | import transformers, accelerate 21 | from transformers import OPTConfig, OPTForCausalLM 22 | 23 | with accelerate.init_empty_weights(): 24 | config = OPTConfig.from_pretrained( 25 | llm_config.base_config.name_or_path 26 | ) 27 | torch.set_default_dtype(torch.half) 28 | transformers.modeling_utils._init_weights = False 29 | torch.set_default_dtype(torch.half) 30 | model = OPTForCausalLM(config) 31 | torch.set_default_dtype(torch.float) 32 | model = model.eval() 33 | layers = find_layers(model) 34 | for name in [ 35 | 'model.decoder.project_out', 36 | 'model.decoder.project_in', 'lm_head' 37 | ]: 38 | if name in layers: 39 | del layers[name] 40 | make_quant( 41 | model, layers, llm_config.quant_config.bits, 42 | groupsize=llm_config.quant_config.groupsize 43 | ) 44 | model = accelerate.load_checkpoint_and_dispatch( 45 | model=model, 46 | checkpoint=quantized_weights_path, 47 | device_map="auto", 48 | # device_map={'': 0}, 49 | no_split_module_classes=["OPTDecoderLayer"] 50 | ) 51 | return model 52 | 53 | def load_opt_quantized_old(llm_config, checkpoint): 54 | import transformers 55 | from transformers import OPTConfig, OPTForCausalLM 56 | def noop(*args, **kwargs): 57 | pass 58 | 59 | config = OPTConfig.from_pretrained( 60 | llm_config.base_config.name_or_path 61 | ) 62 | torch.nn.init.kaiming_uniform_ = noop 63 | torch.nn.init.uniform_ = noop 64 | torch.nn.init.normal_ = noop 65 | 66 | torch.set_default_dtype(torch.half) 67 | transformers.modeling_utils._init_weights = False 68 | torch.set_default_dtype(torch.half) 69 | model = OPTForCausalLM(config) 70 | torch.set_default_dtype(torch.float) 71 | model = model.eval() 72 | layers = find_layers(model) 73 | for name in [ 74 | 'model.decoder.project_out', 75 | 'model.decoder.project_in', 'lm_head' 76 | ]: 77 | if name in layers: 78 | del layers[name] 79 | make_quant(model, layers, llm_config.quant_config.bits) 80 | 81 | print('Loading OPT model') 82 | model.load_state_dict(torch.load(checkpoint)) 83 | model.seqlen = 2048 84 | print('Done') 85 | 86 | return model 87 | 88 | def load_opt(llm_config, quantized_weights_path): 89 | if quantized_weights_path is None: 90 | model = load_opt_unquantized(llm_config) 91 | else: 92 | model = load_opt_quantized( 93 | llm_config, quantized_weights_path 94 | ) 95 | model.seqlen = 2048 96 | return model 97 | 98 | 
| def load_opt_tokenizer(name_or_path): 99 | from transformers import AutoTokenizer 100 | tokenizer = AutoTokenizer.from_pretrained( 101 | name_or_path 102 | ) 103 | tokenizer.truncation_side = 'left' 104 | return tokenizer -------------------------------------------------------------------------------- /finetune/mnli-llama/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import torch 5 | import numpy as np 6 | 7 | 8 | def set_random_seed(seed): 9 | random.seed(seed) 10 | np.random.seed(seed) 11 | os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2" 12 | os.environ["PL_GLOBAL_SEED"] = str(seed) 13 | os.environ["PYTHONHASHSEED"] = str(seed) 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed_all(seed) 16 | torch.backends.cudnn.benchmark = False 17 | torch.backends.cudnn.deterministic = True 18 | 19 | 20 | def fix_tokenizer(tokenizer): 21 | # Fixing broken tokenizers 22 | special_tokens = dict() 23 | for token_id in range(1000): 24 | token = tokenizer.convert_ids_to_tokens(token_id) 25 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad" in token: 26 | special_tokens["pad_token"] = token 27 | if tokenizer.bos_token_id in (None, tokenizer.vocab_size) and "<s>" in token: 28 | special_tokens["bos_token"] = token 29 | if tokenizer.eos_token_id in (None, tokenizer.vocab_size) and "</s>" in token: 30 | special_tokens["eos_token"] = token 31 | if tokenizer.unk_token_id in (None, tokenizer.vocab_size) and "unk" in token: 32 | special_tokens["unk_token"] = token 33 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep" in token: 34 | special_tokens["sep_token"] = token 35 | 36 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "bos_token" in special_tokens: 37 | special_tokens["sep_token"] = special_tokens["bos_token"] 38 | 39 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad_token" not in special_tokens: 40 | if tokenizer.unk_token_id is not None: 41 | special_tokens["pad_token"] = tokenizer.unk_token 42 | else: 43 | special_tokens["pad_token"] = "<|pad|>" 44 | 45 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep_token" not in special_tokens: 46 | if tokenizer.bos_token_id is not None: 47 | special_tokens["sep_token"] = tokenizer.bos_token 48 | else: 49 | special_tokens["sep_token"] = "<|sep|>" 50 | print(special_tokens) 51 | tokenizer.add_special_tokens(special_tokens) 52 | 53 | print("Vocab size: ", tokenizer.vocab_size) 54 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token) 55 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token) 56 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token) 57 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token) 58 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token) 59 | return tokenizer 60 | 61 | 62 | def fix_model(model, tokenizer, use_resize=True): 63 | model.config.pad_token_id = tokenizer.pad_token_id 64 | assert model.config.pad_token_id is not None 65 | 66 | bos_candidates = ( 67 | tokenizer.bos_token_id, 68 | tokenizer.cls_token_id, 69 | tokenizer.sep_token_id, 70 | tokenizer.unk_token_id 71 | ) 72 | for bos_candidate in bos_candidates: 73 | model.config.bos_token_id = bos_candidate 74 | if bos_candidate is not None: 75 | break 76 | assert model.config.bos_token_id is not None 77 | model.config.decoder_start_token_id = model.config.bos_token_id 78 | 79 | eos_candidates = (tokenizer.eos_token_id, tokenizer.sep_token_id) 80 | for eos_candidate in eos_candidates: 
81 | model.config.eos_token_id = eos_candidate 82 | if eos_candidate is not None: 83 | break 84 | assert model.config.eos_token_id is not None 85 | 86 | if use_resize: 87 | model.resize_token_embeddings(len(tokenizer)) 88 | 89 | return model 90 | 91 | 92 | def gen_batch(records, batch_size): 93 | batch_start = 0 94 | while batch_start < len(records): 95 | batch_end = batch_start + batch_size 96 | batch = records[batch_start: batch_end] 97 | batch_start = batch_end 98 | yield batch -------------------------------------------------------------------------------- /examples/finetune.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import transformers 4 | from transformers import AutoTokenizer 5 | from llmtune.llms.autollm import AutoLLMForCausalLM 6 | from llmtune.engine.lora.config import FinetuneConfig 7 | from llmtune.data import TrainSAD 8 | from llmtune.engine.lora.peft import quant_peft 9 | from llmtune.utils import to_half_precision 10 | 11 | # model config 12 | model_name = '' 13 | # model_name = './llama-7b-quantized' # can generate local dir via quantize.py 14 | tokenizer_name = 'huggyllama/llama-13b' 15 | DEV = 'cuda' 16 | 17 | # load model 18 | transformers.logging.set_verbosity_info() 19 | llm = AutoLLMForCausalLM.from_pretrained(model_name) 20 | llm.eval() 21 | llm = llm.to(DEV) 22 | llm = to_half_precision(llm) 23 | 24 | # load tokenizer 25 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) 26 | tokenizer.pad_token_id = 0 27 | 28 | # finetune training config 29 | mbatch_size=1 30 | batch_size=2 31 | epochs=3 32 | lr=2e-4 33 | cutoff_len=256 34 | lora_r=8 35 | lora_alpha=16 36 | lora_dropout=0.05 37 | val_set_size=0.2 38 | warmup_steps=50 39 | save_steps=50 40 | save_total_limit=3 41 | logging_steps=10 42 | 43 | data_type = 'alpaca' 44 | dataset = None # will load alpaca from HF 45 | adapter_path = './llama-7b-quantized-lora' 46 | 47 | # set up finetuning config 48 | tune_config = FinetuneConfig( 49 | dataset=dataset, 50 | ds_type=data_type, 51 | lora_out_dir=adapter_path, 52 | mbatch_size=mbatch_size, 53 | batch_size=batch_size, 54 | epochs=epochs, 55 | lr=lr, 56 | cutoff_len=cutoff_len, 57 | lora_r=lora_r, 58 | lora_alpha=lora_alpha, 59 | lora_dropout=lora_dropout, 60 | val_set_size=val_set_size, 61 | warmup_steps=warmup_steps, 62 | save_steps=save_steps, 63 | save_total_limit=save_total_limit, 64 | logging_steps=logging_steps, 65 | ) 66 | 67 | # set up lora config 68 | lora_config = quant_peft.LoraConfig( 69 | r=tune_config.lora_r, 70 | lora_alpha=tune_config.lora_alpha, 71 | target_modules=["q_proj", "v_proj"], 72 | lora_dropout=tune_config.lora_dropout, 73 | bias="none", 74 | task_type="CAUSAL_LM", 75 | ) 76 | 77 | # create a new lora from config 78 | model = quant_peft.get_peft_model(llm, lora_config) 79 | 80 | # load stanford alpaca data 81 | data = TrainSAD( 82 | tune_config.dataset, 83 | tune_config.val_set_size, 84 | tokenizer, 85 | tune_config.cutoff_len 86 | ) 87 | data.prepare_data() # this tokenizes the dataset 88 | 89 | # training args 90 | training_arguments = transformers.TrainingArguments( 91 | per_device_train_batch_size=tune_config.mbatch_size, 92 | gradient_accumulation_steps=tune_config.gradient_accumulation_steps, 93 | warmup_steps=tune_config.warmup_steps, 94 | num_train_epochs=tune_config.epochs, 95 | learning_rate=tune_config.lr, 96 | fp16=True, 97 | logging_steps=tune_config.logging_steps, 98 | evaluation_strategy="no", 99 | save_strategy="steps", 100 | eval_steps=None, 101 
| save_steps=tune_config.save_steps, 102 | output_dir=tune_config.lora_out_dir, 103 | save_total_limit=tune_config.save_total_limit, 104 | load_best_model_at_end=False, 105 | ddp_find_unused_parameters=False if tune_config.ddp else None, 106 | ) 107 | 108 | # start trainer 109 | trainer = transformers.Trainer( 110 | model=model, 111 | train_dataset=data.train_data, 112 | eval_dataset=data.val_data, 113 | args=training_arguments, 114 | data_collator=transformers.DataCollatorForLanguageModeling( 115 | tokenizer, mlm=False 116 | ), 117 | ) 118 | print(training_arguments.parallel_mode) 119 | model.config.use_cache = False 120 | 121 | # use half precision 122 | model = to_half_precision(model) 123 | 124 | # start training 125 | checkpoint_dir = tune_config.lora_out_dir 126 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir): 127 | trainer.train(resume_from_checkpoint=True) 128 | else: 129 | trainer.train() 130 | 131 | # Save Model 132 | model.save_pretrained(tune_config.lora_out_dir) 133 | 134 | -------------------------------------------------------------------------------- /llmtune/data/gpt4all.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import Dict, Any 3 | from datasets import load_dataset 4 | from llmtune.data.abstract import AbstractTrainData 5 | 6 | # GPT4All-like Data 7 | class TrainGPT4All(AbstractTrainData): 8 | def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len) -> None: 9 | super().__init__(dataset, val_set_size, tokenizer, cutoff_len) 10 | 11 | def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]: 12 | pass 13 | 14 | def tokenize_inputs(self, examples): 15 | max_length = self.cutoff_len 16 | input_ids = torch.full((len(examples["prompt"]), max_length), self.tokenizer.pad_token_id) 17 | # ignore bos 18 | newline_tokens = self.tokenizer("\n", return_tensors="pt")["input_ids"][0, 1:] 19 | 20 | out = {"labels": [], "attention_mask": []} 21 | for i, (prompt, response) in enumerate(zip(examples["prompt"], examples["response"])): 22 | input_tokens = self.tokenizer(prompt, truncation=True, max_length=max_length // 2, return_tensors="pt")["input_ids"].squeeze() 23 | if input_tokens.dim() == 0: 24 | input_tokens = input_tokens.unsqueeze(0) 25 | 26 | input_len = len(input_tokens) 27 | 28 | # plus one since we remove bos from response 29 | # but we subtract one since we want to add eos token 30 | remaining_tokens = max_length - input_len - len(newline_tokens) + 1 31 | # remove bos 32 | target_tokens = self.tokenizer(response, truncation=True, max_length=remaining_tokens, return_tensors="pt")["input_ids"].squeeze()[1:] 33 | 34 | input_ids[i, :input_len] = input_tokens 35 | # add newline between prompt and response 36 | newline_plus_inputs = input_len + len(newline_tokens) 37 | input_ids[i, input_len: newline_plus_inputs] = newline_tokens 38 | 39 | # add target tokens, remove bos 40 | input_ids[i, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens 41 | # add eos token, enforce stopping if we don't truncate 42 | # we don't want long code to stop generating if truncated during training 43 | if newline_plus_inputs + len(target_tokens) < max_length: 44 | input_ids[i, newline_plus_inputs + len(target_tokens)] = self.tokenizer.eos_token_id 45 | 46 | labels = input_ids[i].clone() 47 | labels[: newline_plus_inputs] = -100 48 | labels[labels == self.tokenizer.pad_token_id] = -100 49 | # to debug this, can set all values == -100 to the pad 
token, then assert that tokenizer.decode(labels, skip_special_tokens=True).strip() == response 50 | 51 | attention_mask = input_ids[i].ne(self.tokenizer.pad_token_id).int() 52 | 53 | out["labels"].append(labels) 54 | out["attention_mask"].append(attention_mask) 55 | 56 | out["input_ids"] = input_ids 57 | 58 | out = {k: torch.stack(v) if isinstance(v, list) else v for k, v in out.items()} 59 | 60 | return out 61 | 62 | def prepare_data(self, **kwargs) -> None: 63 | dataset = load_dataset("json", data_files=self.dataset) 64 | 65 | self.val_data = None 66 | if self.val_set_size > 0: 67 | dataset = dataset["train"].train_test_split( 68 | test_size=self.val_set_size, shuffle=True, seed=42 # ! Seed = 42 (?) 69 | ) 70 | train_dataset, val_dataset = dataset["train"], dataset["test"] 71 | 72 | # tokenize inputs and return labels and attention mask 73 | val_dataset = val_dataset.map( 74 | lambda ele: self.tokenize_inputs(ele), 75 | batched=True, 76 | remove_columns=["source", "prompt"], 77 | ) 78 | self.val_data = val_dataset.with_format("torch") 79 | else: 80 | train_dataset = dataset["train"] 81 | 82 | train_dataset = train_dataset.map( 83 | lambda ele: self.tokenize_inputs(ele), 84 | batched=True, 85 | remove_columns=["source", "prompt"], 86 | ) 87 | self.train_data = train_dataset.with_format("torch") 88 | -------------------------------------------------------------------------------- /finetune/bbh-eval/bbh_dev.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from argparse import Namespace 4 | from typing import List 5 | 6 | from datasets import load_dataset, get_dataset_config_names 7 | from fire import Fire 8 | from pydantic import BaseModel 9 | from tqdm import tqdm 10 | 11 | from modeling_dev import select_model, EvalModel 12 | 13 | 14 | class BBHSample(BaseModel): 15 | input: str 16 | target: str 17 | 18 | def as_prompt(self, include_answer: bool = True): 19 | prompt = self.input 20 | prompt += "\nAnswer:" 21 | if include_answer: 22 | prompt += " {}\n\n".format(self.target) 23 | return prompt 24 | 25 | 26 | class BBHData(BaseModel): 27 | samples: List[BBHSample] 28 | 29 | @classmethod 30 | def get_config_names(cls, path: str = "lukaemon/bbh") -> List[str]: 31 | return get_dataset_config_names(path) 32 | 33 | @classmethod 34 | def load_from_huggingface( 35 | cls, path: str = "lukaemon/bbh", config: str = "", split: str = "test" 36 | ): 37 | data = load_dataset(path, config, split=split) 38 | samples = [BBHSample(**raw) for raw in tqdm(data, desc=str((path, split)))] 39 | return cls(samples=samples) 40 | 41 | 42 | def gen_prompt(data: BBHData, k=-1): 43 | prompt = "" 44 | if k == -1: 45 | k = len(data.samples) 46 | for i in range(k): 47 | prompt += data.samples[i].as_prompt() 48 | return prompt 49 | 50 | 51 | def evaluate(model: EvalModel, data: BBHData, ntrain: int) -> dict: 52 | data_train = BBHData(samples=data.samples[:ntrain]) 53 | data_test = BBHData(samples=data.samples[ntrain:]) 54 | is_correct = [] 55 | 56 | for i in range(len(data_test.samples)): 57 | # get prompt and make sure it fits 58 | k = int(ntrain) 59 | prompt_end = data_test.samples[i].as_prompt(include_answer=False) 60 | train_prompt = gen_prompt(data_train, k) 61 | prompt = train_prompt + prompt_end 62 | 63 | while not model.check_valid_length(prompt) and k > 0: 64 | k -= 1 65 | train_prompt = gen_prompt(data_train, k) 66 | prompt = train_prompt + prompt_end 67 | 68 | label = data_test.samples[i].target 69 | pred = model.run(prompt) 70 | 
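For reference, a standalone sketch of the few-shot prompt layout that `BBHSample.as_prompt()` and `gen_prompt()` above assemble; the task text here is made up and only illustrates the format:

```python
# Mirrors the prompt construction in bbh_dev.py; inputs are invented examples.
def as_prompt(input_text, target=None):
    prompt = input_text + "\nAnswer:"
    if target is not None:
        prompt += " {}\n\n".format(target)
    return prompt

train_shot = as_prompt("not ( True ) and ( True ) is", "False")   # in-context example
test_query = as_prompt("True and not not ( not False ) is")       # query, answer left blank
print(train_shot + test_query)
# not ( True ) and ( True ) is
# Answer: False
#
# True and not not ( not False ) is
# Answer:
```

The evaluation loop then scores a prediction as correct when the model's continuation starts with the gold target string.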
is_correct.append(pred.strip().startswith(label)) 71 | if i == 0: 72 | print(dict(prompt=prompt, label=label, pred=pred)) 73 | 74 | return dict(score=sum(is_correct) / len(is_correct)) 75 | 76 | 77 | def main(data_dir: str = "lukaemon/bbh", ntrain: int = 3, **kwargs): 78 | args = Namespace(**locals()) 79 | model = select_model(max_input_length=2048, max_output_length=32, **kwargs) 80 | print(locals()) 81 | 82 | if 'load_4bit' in kwargs: 83 | loadin_4bit = 'true' 84 | else: 85 | loadin_4bit = 'false' 86 | 87 | if 'load_8bit' in kwargs: 88 | loadin_8bit = 'true' 89 | else: 90 | loadin_8bit = 'false' 91 | 92 | if 'lora_path' in kwargs: 93 | file_name = f"all_results_{kwargs['model_path'].replace('/', '-')}_{kwargs['lora_path'].replace('/', '-')}_4bit_{loadin_4bit}_8bit_{loadin_8bit}.txt" 94 | else: 95 | file_name = f"all_results_{kwargs['model_path'].replace('/', '-')}_4bit_{loadin_4bit}_8bit_{loadin_8bit}.txt" 96 | 97 | all_results = [] 98 | if os.path.exists(file_name): 99 | with open(file_name, "r") as f: 100 | print(f"Loading {file_name}") 101 | all_results = json.load(f) 102 | print(all_results) 103 | 104 | start = len(all_results) 105 | for name in tqdm(BBHData.get_config_names()[start:]): 106 | data = BBHData.load_from_huggingface(config=name) 107 | result = evaluate(model, data, ntrain=ntrain) 108 | all_results.append(result) 109 | print(dict(name=name, **result)) 110 | 111 | # Save the state of all_results after each iteration 112 | with open(file_name, "w") as f: 113 | json.dump(all_results, f) 114 | 115 | score = sum(res["score"] for res in all_results) / len(all_results) 116 | print(dict(average=score)) 117 | return score 118 | 119 | 120 | if __name__ == "__main__": 121 | Fire() 122 | -------------------------------------------------------------------------------- /llmtune/data/calibration.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def set_seed(seed): 6 | np.random.seed(seed) 7 | torch.random.manual_seed(seed) 8 | 9 | 10 | def get_wikitext2(nsamples, seed, seqlen, model): 11 | from datasets import load_dataset 12 | traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train') 13 | testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') 14 | 15 | from transformers import AutoTokenizer 16 | tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) 17 | trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt') 18 | testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt') 19 | 20 | import random 21 | random.seed(seed) 22 | trainloader = [] 23 | for _ in range(nsamples): 24 | i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) 25 | j = i + seqlen 26 | inp = trainenc.input_ids[:, i:j] 27 | tar = inp.clone() 28 | tar[:, :-1] = -100 29 | trainloader.append((inp, tar)) 30 | return trainloader, testenc 31 | 32 | def get_ptb(nsamples, seed, seqlen, model): 33 | from datasets import load_dataset 34 | traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') 35 | valdata = load_dataset('ptb_text_only', 'penn_treebank', split='validation') 36 | 37 | from transformers import AutoTokenizer 38 | tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) 39 | trainenc = tokenizer("\n\n".join(traindata['sentence']), return_tensors='pt') 40 | testenc = tokenizer("\n\n".join(valdata['sentence']), return_tensors='pt') 41 | 42 | import random 43 | random.seed(seed) 44 | trainloader = [] 45 | for _ in 
range(nsamples): 46 | i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) 47 | j = i + seqlen 48 | inp = trainenc.input_ids[:, i:j] 49 | tar = inp.clone() 50 | tar[:, :-1] = -100 51 | trainloader.append((inp, tar)) 52 | return trainloader, testenc 53 | 54 | def get_c4(nsamples, seed, seqlen, model): 55 | from datasets import load_dataset 56 | traindata = load_dataset( 57 | 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train', use_auth_token=True 58 | ) 59 | valdata = load_dataset( 60 | 'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation',use_auth_token=True 61 | ) 62 | 63 | from transformers import AutoTokenizer 64 | tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) 65 | 66 | import random 67 | random.seed(seed) 68 | trainloader = [] 69 | for _ in range(nsamples): 70 | while True: 71 | i = random.randint(0, len(traindata) - 1) 72 | trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') 73 | if trainenc.input_ids.shape[1] >= seqlen: 74 | break 75 | i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) 76 | j = i + seqlen 77 | inp = trainenc.input_ids[:, i:j] 78 | tar = inp.clone() 79 | tar[:, :-1] = -100 80 | trainloader.append((inp, tar)) 81 | 82 | import random 83 | random.seed(0) 84 | valenc = [] 85 | for _ in range(256): 86 | while True: 87 | i = random.randint(0, len(valdata) - 1) 88 | tmp = tokenizer(valdata[i]['text'], return_tensors='pt') 89 | if tmp.input_ids.shape[1] >= seqlen: 90 | break 91 | i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1) 92 | j = i + seqlen 93 | valenc.append(tmp.input_ids[:, i:j]) 94 | valenc = torch.hstack(valenc) 95 | class TokenizerWrapper: 96 | def __init__(self, input_ids): 97 | self.input_ids = input_ids 98 | valenc = TokenizerWrapper(valenc) 99 | 100 | return trainloader, valenc 101 | 102 | 103 | def get_calibration_loaders( 104 | name, nsamples=128, seed=0, seqlen=2048, model='' 105 | ): 106 | if 'wikitext2' in name: 107 | return get_wikitext2(nsamples, seed, seqlen, model) 108 | if 'ptb' in name: 109 | return get_ptb(nsamples, seed, seqlen, model) 110 | if 'c4' in name: 111 | return get_c4(nsamples, seed, seqlen, model) 112 | -------------------------------------------------------------------------------- /llmtune/engine/inference/matmult.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | try: 4 | import quant_cuda 5 | except: 6 | print('CUDA extension not installed. 
Inference will not work.') 7 | 8 | # Global Buffer 9 | buffer_mat_dic = {} 10 | use_new = True 11 | auto_switch = True 12 | auto_switch_thd = 8 13 | debug = False 14 | cache_buffer = True 15 | 16 | def get_buffer(shape_of_qweight, dtype=torch.float16, device='cuda', bits=4): 17 | target_shape = (shape_of_qweight[0] * (32 // bits), shape_of_qweight[1]) 18 | if not cache_buffer: 19 | return torch.zeros(target_shape, dtype=dtype, device=device) 20 | if target_shape not in buffer_mat_dic.keys(): 21 | buffer_mat_dic[target_shape] = torch.zeros(target_shape, dtype=dtype, device=device) 22 | else: 23 | if buffer_mat_dic[target_shape].device != device: 24 | buffer_mat_dic[target_shape] = buffer_mat_dic[target_shape].to(device) 25 | if buffer_mat_dic[target_shape].dtype != dtype: 26 | buffer_mat_dic[target_shape] = buffer_mat_dic[target_shape].to(dtype=dtype) 27 | return buffer_mat_dic[target_shape] 28 | 29 | def _matmul4bit_v1_recons(x, qweight, scales, zeros, transpose=False): 30 | if debug: 31 | print('_matmul4bit_v1_recons') 32 | if not transpose: 33 | assert qweight.shape[0] * 8 == x.shape[-1] 34 | else: 35 | assert qweight.shape[1] == x.shape[-1] 36 | buffer = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device) 37 | quant_cuda.vecquant4recons_v1(qweight, buffer, scales, zeros) 38 | # dtype = x.dtype 39 | # x = x.float() 40 | if not transpose: 41 | output = torch.matmul(x, buffer) 42 | else: 43 | output = torch.matmul(x, buffer.T) 44 | # output = output.to(dtype) 45 | return output 46 | 47 | 48 | def _matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx, transpose=False): 49 | if debug: 50 | print('_matmul4bit_v2_recons') 51 | if not transpose: 52 | assert qweight.shape[0] * 8 == x.shape[-1] 53 | else: 54 | assert qweight.shape[1] == x.shape[-1] 55 | buffer = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device) 56 | quant_cuda.vecquant4recons_v2(qweight, buffer, scales, zeros, g_idx) 57 | if not transpose: 58 | output = torch.matmul(x, buffer) 59 | else: 60 | output = torch.matmul(x, buffer.T) 61 | return output 62 | 63 | 64 | def _matmul2bit_v2_recons(x, qweight, scales, zeros, g_idx, transpose=False): 65 | if debug: 66 | print('_matmul2bit_v2_recons') 67 | if not transpose: 68 | assert qweight.shape[0] * 16 == x.shape[-1] 69 | else: 70 | assert qweight.shape[1] == x.shape[-1] 71 | buffer = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device, bits=2) 72 | quant_cuda.vecquant2recons_v2(qweight, buffer, scales, zeros, g_idx) 73 | if not transpose: 74 | output = torch.matmul(x, buffer) 75 | else: 76 | output = torch.matmul(x, buffer.T) 77 | return output 78 | 79 | 80 | def matmul4bit(x, qweight, scales, zeros, g_idx=None): 81 | raise NotImplementedError() 82 | # detect if zeros is int32 83 | if zeros.dtype != torch.int32: 84 | # use v1 85 | if use_new: 86 | if auto_switch: 87 | if np.prod(x.shape[:-1]) > auto_switch_thd: 88 | output = _matmul4bit_v1_recons(x.half(), qweight, scales.half(), zeros.half()) 89 | else: 90 | output = _matmul4bit_v1(x, qweight, scales, zeros) 91 | else: 92 | output = _matmul4bit_v1(x, qweight, scales, zeros) 93 | else: 94 | if g_idx is None: 95 | g_idx = torch.zeros(qweight.shape[0] * 8, dtype=torch.int32, device=x.device) 96 | # use v2 97 | if use_new: 98 | if auto_switch: 99 | if np.prod(x.shape[:-1]) > auto_switch_thd: 100 | output = _matmul4bit_v2_recons(x.half(), qweight, scales.half(), zeros, g_idx) 101 | else: 102 | output = _matmul4bit_v2(x, qweight, scales, zeros, g_idx) 103 | else: 104 | output = 
_matmul4bit_v2(x, qweight, scales, zeros, g_idx) 105 | return output 106 | 107 | 108 | def matmul3bit(x, qweight, scales, zeros, g_idx, outfeatures): 109 | out_shape = x.shape[:-1] + (outfeatures, ) 110 | x = x.reshape(-1,x.shape[-1]) 111 | output = torch.zeros((x.shape[0], outfeatures), device=x.device, dtype=torch.float32) 112 | quant_cuda.vecquant3matmul(x.float(), qweight, output, scales.float(), zeros, g_idx) 113 | output = output.reshape(out_shape) 114 | return output -------------------------------------------------------------------------------- /llmtune/engine/quant/gptq/quantizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | def quantize(x, scale, zero, maxq): 7 | if maxq < 0: 8 | return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero 9 | q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) 10 | return scale * (q - zero) 11 | 12 | class Quantizer(nn.Module): 13 | def __init__(self, shape=1): 14 | super(Quantizer, self).__init__() 15 | self.register_buffer('maxq', torch.tensor(0)) 16 | self.register_buffer('scale', torch.zeros(shape)) 17 | self.register_buffer('zero', torch.zeros(shape)) 18 | 19 | def configure( 20 | self, 21 | bits, perchannel=False, sym=True, 22 | mse=False, norm=2.4, grid=100, maxshrink=.8, 23 | trits=False 24 | ): 25 | 26 | self.maxq = torch.tensor(2 ** bits - 1) 27 | self.perchannel = perchannel 28 | self.sym = sym 29 | self.mse = mse 30 | self.norm = norm 31 | self.grid = grid 32 | self.maxshrink = maxshrink 33 | if trits: 34 | self.maxq = torch.tensor(-1) 35 | 36 | def find_params(self, x, weight=False): 37 | dev = x.device 38 | self.maxq = self.maxq.to(dev) 39 | 40 | shape = x.shape 41 | if self.perchannel: 42 | if weight: 43 | x = x.flatten(1) 44 | else: 45 | if len(shape) == 4: 46 | x = x.permute([1, 0, 2, 3]) 47 | x = x.flatten(1) 48 | if len(shape) == 3: 49 | x = x.reshape((-1, shape[-1])).t() 50 | if len(shape) == 2: 51 | x = x.t() 52 | else: 53 | x = x.flatten().unsqueeze(0) 54 | 55 | tmp = torch.zeros(x.shape[0], device=dev) 56 | xmin = torch.minimum(x.min(1)[0], tmp) 57 | xmax = torch.maximum(x.max(1)[0], tmp) 58 | 59 | if self.sym: 60 | xmax = torch.maximum(torch.abs(xmin), xmax) 61 | tmp = xmin < 0 62 | if torch.any(tmp): 63 | xmin[tmp] = -xmax[tmp] 64 | tmp = (xmin == 0) & (xmax == 0) 65 | xmin[tmp] = -1 66 | xmax[tmp] = +1 67 | 68 | if self.maxq < 0: 69 | self.scale = xmax 70 | self.zero = xmin 71 | else: 72 | self.scale = (xmax - xmin) / self.maxq 73 | if self.sym: 74 | self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) 75 | else: 76 | self.zero = torch.round(-xmin / self.scale) 77 | 78 | if self.mse: 79 | best = torch.full([x.shape[0]], float('inf'), device=dev) 80 | for i in range(int(self.maxshrink * self.grid)): 81 | p = 1 - i / self.grid 82 | xmin1 = p * xmin 83 | xmax1 = p * xmax 84 | scale1 = (xmax1 - xmin1) / self.maxq 85 | zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero 86 | q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq) 87 | q -= x 88 | q.abs_() 89 | q.pow_(self.norm) 90 | err = torch.sum(q, 1) 91 | tmp = err < best 92 | if torch.any(tmp): 93 | best[tmp] = err[tmp] 94 | self.scale[tmp] = scale1[tmp] 95 | self.zero[tmp] = zero1[tmp] 96 | if not self.perchannel: 97 | if weight: 98 | tmp = shape[0] 99 | else: 100 | tmp = shape[1] if len(shape) != 3 else shape[2] 101 | self.scale = self.scale.repeat(tmp) 102 | self.zero = self.zero.repeat(tmp) 103 | 104 
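For intuition, the `quantize` helper at the top of `quantizer.py` is plain affine quantization, `q = clamp(round(x / scale) + zero, 0, maxq)`, dequantized as `scale * (q - zero)`. A tiny numeric illustration with arbitrary values, mirroring the asymmetric branch of `find_params` (the `maxq < 0` trits branch is ignored here):

```python
import torch

def quantize(x, scale, zero, maxq):
    # same formula as the helper above, minus the trits special case
    q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
    return scale * (q - zero)

x = torch.tensor([-0.9, -0.1, 0.0, 0.4, 1.2])
maxq = 15                                        # 4-bit grid: 2**4 - 1
scale = torch.tensor((1.2 - (-0.9)) / maxq)      # (xmax - xmin) / maxq, as in find_params
zero = torch.round(torch.tensor(0.9) / scale)    # round(-xmin / scale) -> 6
print(quantize(x, scale, zero, maxq))            # values snapped to the 16-level grid, e.g. -0.9 -> -0.84
```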
| if weight: 105 | shape = [-1] + [1] * (len(shape) - 1) 106 | self.scale = self.scale.reshape(shape) 107 | self.zero = self.zero.reshape(shape) 108 | return 109 | if len(shape) == 4: 110 | self.scale = self.scale.reshape((1, -1, 1, 1)) 111 | self.zero = self.zero.reshape((1, -1, 1, 1)) 112 | if len(shape) == 3: 113 | self.scale = self.scale.reshape((1, 1, -1)) 114 | self.zero = self.zero.reshape((1, 1, -1)) 115 | if len(shape) == 2: 116 | self.scale = self.scale.unsqueeze(0) 117 | self.zero = self.zero.unsqueeze(0) 118 | 119 | def quantize(self, x): 120 | if self.ready(): 121 | return quantize(x, self.scale, self.zero, self.maxq) 122 | return x 123 | 124 | def enabled(self): 125 | return self.maxq > 0 126 | 127 | def ready(self): 128 | return torch.all(self.scale != 0) 129 | -------------------------------------------------------------------------------- /llmtune/engine/inference/cuda/quant_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/all.h> 2 | #include <torch/python.h> 3 | #include <c10/cuda/CUDAGuard.h> 4 | 5 | // standard forward operations 6 | 7 | void vecquant2matmul_cuda( 8 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 9 | torch::Tensor scales, torch::Tensor zeros, 10 | torch::Tensor g_idx 11 | ); 12 | 13 | void vecquant2matmul( 14 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 15 | torch::Tensor scales, torch::Tensor zeros, 16 | torch::Tensor g_idx 17 | ) { 18 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 19 | vecquant2matmul_cuda(vec, mat, mul, scales, zeros, g_idx); 20 | } 21 | 22 | void vecquant3matmul_cuda( 23 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 24 | torch::Tensor scales, torch::Tensor zeros, 25 | torch::Tensor g_idx 26 | ); 27 | 28 | void vecquant3matmul( 29 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 30 | torch::Tensor scales, torch::Tensor zeros, 31 | torch::Tensor g_idx 32 | ) { 33 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 34 | vecquant3matmul_cuda(vec, mat, mul, scales, zeros, g_idx); 35 | } 36 | 37 | void vecquant4matmul_cuda( 38 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 39 | torch::Tensor scales, torch::Tensor zeros, 40 | torch::Tensor g_idx 41 | ); 42 | 43 | void vecquant4matmul( 44 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 45 | torch::Tensor scales, torch::Tensor zeros, 46 | torch::Tensor g_idx 47 | ) { 48 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 49 | vecquant4matmul_cuda(vec, mat, mul, scales, zeros, g_idx); 50 | } 51 | 52 | void vecquant8matmul_cuda( 53 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 54 | torch::Tensor scales, torch::Tensor zeros, 55 | torch::Tensor g_idx 56 | ); 57 | 58 | void vecquant8matmul( 59 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 60 | torch::Tensor scales, torch::Tensor zeros, 61 | torch::Tensor g_idx 62 | ) { 63 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 64 | vecquant8matmul_cuda(vec, mat, mul, scales, zeros, g_idx); 65 | } 66 | 67 | // methods based on reconstruction (unpacking) 68 | 69 | void vecquant4recons_v1_cuda( 70 | torch::Tensor mat, torch::Tensor res, torch::Tensor scales, torch::Tensor zeros 71 | ); 72 | 73 | void vecquant4recons_v1( 74 | torch::Tensor mat, torch::Tensor res, 75 | torch::Tensor scales, torch::Tensor zeros 76 | ) { 77 | const at::cuda::OptionalCUDAGuard device_guard(device_of(scales)); 78 | vecquant4recons_v1_cuda(mat, res, scales, zeros); 79 | } 80 | 81 | void vecquant4recons_v2_cuda( 82
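The reconstruction kernels bound here unpack the int32-packed quantized weights into a full fp16 matrix before a regular matmul; `get_buffer` in `matmult.py` sizes that scratch buffer as `(rows * 32 // bits, cols)`. A small sketch of the shape arithmetic, with made-up dimensions:

```python
# Shape bookkeeping only; the actual unpacking is done by the CUDA kernels above.
bits = 4
in_features, out_features = 4096, 4096        # hypothetical projection size
packed_rows = in_features * bits // 32        # each int32 row packs 32 // bits = 8 values
qweight_shape = (packed_rows, out_features)   # (512, 4096) int32
buffer_shape = (qweight_shape[0] * (32 // bits), qweight_shape[1])  # (4096, 4096) fp16
assert buffer_shape == (in_features, out_features)
```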
| torch::Tensor mat, torch::Tensor res, 83 | torch::Tensor scales, torch::Tensor zeros, 84 | torch::Tensor g_idx 85 | ); 86 | 87 | void vecquant4recons_v2( 88 | torch::Tensor mat, torch::Tensor res, torch::Tensor scales, torch::Tensor zeros, torch::Tensor g_idx 89 | ) { 90 | const at::cuda::OptionalCUDAGuard device_guard(device_of(scales)); 91 | vecquant4recons_v2_cuda(mat, res, scales, zeros, g_idx); 92 | } 93 | 94 | void vecquant2recons_v2_cuda( 95 | torch::Tensor mat, torch::Tensor res, 96 | torch::Tensor scales, torch::Tensor zeros, 97 | torch::Tensor g_idx 98 | ); 99 | 100 | void vecquant2recons_v2( 101 | torch::Tensor mat, torch::Tensor res, torch::Tensor scales, torch::Tensor zeros, torch::Tensor g_idx 102 | ) { 103 | const at::cuda::OptionalCUDAGuard device_guard(device_of(scales)); 104 | vecquant2recons_v2_cuda(mat, res, scales, zeros, g_idx); 105 | } 106 | 107 | void vecquant4matmul_v1_faster_cuda( 108 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 109 | torch::Tensor scales, torch::Tensor zeros 110 | ); 111 | 112 | void vecquant4matmul_v1_faster( 113 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 114 | torch::Tensor scales, torch::Tensor zeros 115 | ) { 116 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 117 | vecquant4matmul_v1_faster_cuda(vec, mat, mul, scales, zeros); 118 | } 119 | 120 | 121 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 122 | m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA)"); 123 | m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA)"); 124 | m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA)"); 125 | m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA)"); 126 | 127 | // Reconstruction Kernel 128 | m.def("vecquant4recons_v1", &vecquant4recons_v1, "Vector 4-bit Quantized Matrix Reconstruction (CUDA)"); 129 | m.def("vecquant4recons_v2", &vecquant4recons_v2, "Vector 4-bit Quantized Matrix Reconstruction (CUDA) with group-size support"); 130 | m.def("vecquant2recons_v2", &vecquant2recons_v2, "Vector 2-bit Quantized Matrix Reconstruction (CUDA) with group-size support"); 131 | } -------------------------------------------------------------------------------- /llmtune/executor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | 5 | from llmtune.config import DEV 6 | from llmtune.utils import to_half_precision 7 | 8 | def load_llm(model_name_or_path): 9 | from llmtune.llms.autollm import AutoLLMForCausalLM 10 | llm = AutoLLMForCausalLM.from_pretrained(model_name_or_path) 11 | return llm 12 | 13 | def load_tokenizer(model_name_or_path, llm_config=None): 14 | from llmtune.llms.autollm import get_default_tokenizer 15 | if llm_config is not None: 16 | model_type = llm_config.model_type 17 | else: 18 | model_type = None 19 | return get_default_tokenizer(model_name_or_path, model_type) 20 | 21 | def load_adapter(llm, adapter_path=None, lora_config=None): 22 | from llmtune.engine.lora.peft import quant_peft 23 | if adapter_path is None and lora_config is not None: 24 | model = quant_peft.get_peft_model(llm, lora_config) 25 | elif adapter_path is not None and lora_config is None: 26 | model = quant_peft.PeftModel.from_pretrained( 27 | llm, adapter_path, 28 | device_map='auto', 29 | torch_dtype=torch.float32 30 | ) 31 | print(adapter_path, 'loaded') 32 | else: 33 | 
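A minimal sketch of how the loader helpers in `executor.py` compose; the model directory and adapter path are placeholders:

```python
# Paths are hypothetical; load_llm / load_tokenizer / load_adapter are the helpers defined above.
from llmtune.executor import load_llm, load_tokenizer, load_adapter

llm = load_llm('./llama-7b-quantized')                   # quantized base model (e.g. produced by quantize.py)
tokenizer = load_tokenizer('huggyllama/llama-7b')        # default tokenizer for the base checkpoint
model = load_adapter(llm, adapter_path='./llama-7b-quantized-lora')   # attach a saved LoRA adapter
```

Note that the `else` branch of `load_adapter` that follows appears to construct the `ValueError` without raising it, so callers presumably need to pass exactly one of `adapter_path` or `lora_config`.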
ValueError('Need to specify adapter_path or lora_config') 34 | return model 35 | 36 | def generate( 37 | llm, tokenizer, prompt, min_length, max_length, temperature, top_k, top_p 38 | ): 39 | llm.to(DEV) 40 | llm = to_half_precision(llm) 41 | input_ids = tokenizer.encode(prompt, return_tensors="pt").to(DEV) 42 | 43 | with torch.no_grad(): 44 | generated_ids = llm.generate( 45 | inputs=input_ids, 46 | do_sample=True, 47 | min_length=min_length, 48 | max_length=max_length, 49 | top_p=top_p, 50 | top_k=top_k, 51 | temperature=temperature, 52 | ) 53 | return tokenizer.decode([el.item() for el in generated_ids[0]]) 54 | 55 | def finetune(llm, tokenizer, tune_config): 56 | import transformers 57 | from llmtune.data import load_finetuning_data 58 | from llmtune.engine.lora.peft import quant_peft 59 | transformers.logging.set_verbosity_info() 60 | tokenizer.pad_token_id = 0 61 | 62 | lora_config = quant_peft.LoraConfig( 63 | r=tune_config.lora_r, 64 | lora_alpha=tune_config.lora_alpha, 65 | target_modules=["q_proj", "v_proj"], 66 | lora_dropout=tune_config.lora_dropout, 67 | bias="none", 68 | task_type="CAUSAL_LM", 69 | ) 70 | model = load_adapter(llm, lora_config=lora_config) 71 | model.print_trainable_parameters() 72 | 73 | data = load_finetuning_data(tune_config, tokenizer) 74 | 75 | training_arguments = transformers.TrainingArguments( 76 | per_device_train_batch_size=tune_config.mbatch_size, 77 | gradient_accumulation_steps=tune_config.gradient_accumulation_steps, 78 | warmup_steps=tune_config.warmup_steps, 79 | num_train_epochs=tune_config.epochs, 80 | learning_rate=tune_config.lr, 81 | fp16=True, 82 | logging_steps=tune_config.logging_steps, 83 | evaluation_strategy="no", 84 | save_strategy="steps", 85 | eval_steps=None, 86 | save_steps=tune_config.save_steps, 87 | output_dir=tune_config.lora_out_dir, 88 | save_total_limit=tune_config.save_total_limit, 89 | load_best_model_at_end=False, 90 | ddp_find_unused_parameters=False if tune_config.ddp else None, 91 | ) 92 | 93 | trainer = transformers.Trainer( 94 | model=model, 95 | train_dataset=data.train_data, 96 | eval_dataset=data.val_data, 97 | args=training_arguments, 98 | data_collator=transformers.DataCollatorForLanguageModeling( 99 | tokenizer, mlm=False 100 | ), 101 | ) 102 | print(training_arguments.parallel_mode) 103 | model.config.use_cache = False 104 | 105 | # use half precision 106 | model = to_half_precision(model) 107 | 108 | # start training 109 | checkpoint_dir = tune_config.lora_out_dir 110 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir): 111 | trainer.train(resume_from_checkpoint=True) 112 | else: 113 | trainer.train() 114 | 115 | # Save Model 116 | model.save_pretrained(tune_config.lora_out_dir) 117 | 118 | def quantize(llm, config): 119 | from llmtune.data.calibration import get_calibration_loaders 120 | from llmtune.engine.quant.gptq.executor import GPTQAlgorithm 121 | 122 | llm.eval() 123 | dataloader, _ = get_calibration_loaders( 124 | config.dataset, 125 | nsamples=config.nsamples, 126 | seed=config.seed, 127 | model=llm.base_model.name_or_path, 128 | seqlen=llm.base_model.seqlen 129 | ) 130 | 131 | gptq = GPTQAlgorithm(config) 132 | llm = gptq.quantize(llm, dataloader) 133 | 134 | llm.save_pretrained(config.save) 135 | print(f'Model weights saved to: {config.save}') 136 | 137 | -------------------------------------------------------------------------------- /llmtune/engine/quant/gptq/extras.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import 
torch.nn as nn 3 | from llmtune.engine.quant.gptq.algorithm import GPTQ 4 | from llmtune.engine.quant.gptq.quantizer import Quantizer 5 | from llmtune.engine.quant.converter import make_quant 6 | from llmtune.engine.inference.modules import QuantLinear 7 | from llmtune.utils import find_layers 8 | 9 | @torch.no_grad() 10 | def quantize_opt( 11 | model, dataloader, bits, groupsize, act_order, nsamples, percdamp, 12 | sym=False, true_sequential=False, nearest=False, trits=False, dev='cuda' 13 | ): 14 | print('Starting ...') 15 | if nearest is True or true_sequential is True: 16 | raise NotImplementedError() 17 | 18 | use_cache = model.config.use_cache 19 | model.config.use_cache = False 20 | layers = model.model.decoder.layers 21 | 22 | model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) 23 | model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) 24 | if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: 25 | model.model.decoder.project_out = model.model.decoder.project_out.to(dev) 26 | if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: 27 | model.model.decoder.project_in = model.model.decoder.project_in.to(dev) 28 | layers[0] = layers[0].to(dev) 29 | 30 | dtype = next(iter(model.parameters())).dtype 31 | inps = torch.zeros( 32 | (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev 33 | ) 34 | cache = {'i': 0, 'attention_mask': None} 35 | 36 | class Catcher(nn.Module): 37 | def __init__(self, module): 38 | super().__init__() 39 | self.module = module 40 | def forward(self, inp, **kwargs): 41 | inps[cache['i']] = inp 42 | cache['i'] += 1 43 | cache['attention_mask'] = kwargs['attention_mask'] 44 | raise ValueError 45 | layers[0] = Catcher(layers[0]) 46 | for batch in dataloader: 47 | try: 48 | model(batch[0].to(dev)) 49 | except ValueError: 50 | pass 51 | layers[0] = layers[0].module 52 | 53 | layers[0] = layers[0].cpu() 54 | model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() 55 | model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() 56 | if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: 57 | model.model.decoder.project_out = model.model.decoder.project_out.cpu() 58 | if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: 59 | model.model.decoder.project_in = model.model.decoder.project_in.cpu() 60 | torch.cuda.empty_cache() 61 | 62 | outs = torch.zeros_like(inps) 63 | attention_mask = cache['attention_mask'] 64 | 65 | print('Ready.') 66 | 67 | quantizers = {} 68 | for i in range(len(layers)): 69 | layer = layers[i].to(dev) 70 | 71 | subset = find_layers(layer) 72 | gptq = {} 73 | for name in subset: 74 | gptq[name] = GPTQ(subset[name]) 75 | gptq[name].quantizer = Quantizer() 76 | gptq[name].quantizer.configure( bits, perchannel=True, sym=sym, mse=False, trits=trits ) 77 | 78 | def add_batch(name): 79 | def tmp(_, inp, out): 80 | gptq[name].add_batch(inp[0].data, out.data) 81 | return tmp 82 | 83 | handles = [] 84 | for name in subset: 85 | handles.append(subset[name].register_forward_hook(add_batch(name))) 86 | 87 | for j in range(nsamples): 88 | outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] 89 | 90 | for h in handles: 91 | h.remove() 92 | 93 | for name in subset: 94 | print(f'Quantizing {name} in layer {i+1}/{len(layers)}...') 95 | scale,zero,g_idx = gptq[name].fasterquant(percdamp=percdamp, groupsize=groupsize, 
actorder=act_order) 96 | quantizers['model.decoder.layers.%d.%s' % (i, name)] = (gptq[name].quantizer.cpu(),scale.cpu(),zero.cpu(),g_idx.cpu()) 97 | gptq[name].free() 98 | 99 | for j in range(nsamples): 100 | outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] 101 | 102 | layers[i] = layer.cpu() 103 | del layer 104 | del gptq 105 | torch.cuda.empty_cache() 106 | 107 | inps, outs = outs, inps 108 | 109 | model.config.use_cache = use_cache 110 | 111 | return quantizers 112 | 113 | def pack_opt(model, quantizers, wbits, groupsize): 114 | layers = find_layers(model) 115 | layers = {n: layers[n] for n in quantizers} 116 | make_quant(model, quantizers, wbits, groupsize) 117 | qlayers = find_layers(model, [QuantLinear]) 118 | print('Packing ...') 119 | for name in qlayers: 120 | print(name) 121 | quantizers[name],scale,zero,g_idx = quantizers[name] 122 | qlayers[name].pack(layers[name], scale, zero, g_idx) 123 | print('Done.') 124 | return model -------------------------------------------------------------------------------- /finetune/samsum-llama/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import torch 5 | import numpy as np 6 | 7 | 8 | def set_random_seed(seed): 9 | random.seed(seed) 10 | np.random.seed(seed) 11 | os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2" 12 | os.environ["PL_GLOBAL_SEED"] = str(seed) 13 | os.environ["PYTHONHASHSEED"] = str(seed) 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed_all(seed) 16 | torch.backends.cudnn.benchmark = False 17 | torch.backends.cudnn.deterministic = True 18 | 19 | 20 | def fix_tokenizer(tokenizer): 21 | # Fixing broken tokenizers 22 | special_tokens = dict() 23 | for token_id in range(1000): 24 | token = tokenizer.convert_ids_to_tokens(token_id) 25 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad" in token: 26 | special_tokens["pad_token"] = token 27 | if tokenizer.bos_token_id in (None, tokenizer.vocab_size) and "" in token: 28 | special_tokens["bos_token"] = token 29 | if tokenizer.eos_token_id in (None, tokenizer.vocab_size) and "" in token: 30 | special_tokens["eos_token"] = token 31 | if tokenizer.unk_token_id in (None, tokenizer.vocab_size) and "unk" in token: 32 | special_tokens["unk_token"] = token 33 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep" in token: 34 | special_tokens["sep_token"] = token 35 | 36 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "bos_token" in special_tokens: 37 | special_tokens["sep_token"] = special_tokens["bos_token"] 38 | 39 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad_token" not in special_tokens: 40 | if tokenizer.unk_token_id is not None: 41 | special_tokens["pad_token"] = tokenizer.unk_token 42 | else: 43 | special_tokens["pad_token"] = "<|pad|>" 44 | 45 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep_token" not in special_tokens: 46 | if tokenizer.bos_token_id is not None: 47 | special_tokens["sep_token"] = tokenizer.bos_token 48 | else: 49 | special_tokens["sep_token"] = "<|sep|>" 50 | print(special_tokens) 51 | tokenizer.add_special_tokens(special_tokens) 52 | 53 | print("Vocab size: ", tokenizer.vocab_size) 54 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token) 55 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token) 56 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token) 57 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token) 58 | print("SEP: ", 
tokenizer.sep_token_id, tokenizer.sep_token) 59 | return tokenizer 60 | 61 | 62 | def fix_model(model, tokenizer, use_resize=True): 63 | model.config.pad_token_id = tokenizer.pad_token_id 64 | assert model.config.pad_token_id is not None 65 | 66 | bos_candidates = ( 67 | tokenizer.bos_token_id, 68 | tokenizer.cls_token_id, 69 | tokenizer.sep_token_id, 70 | tokenizer.unk_token_id 71 | ) 72 | for bos_candidate in bos_candidates: 73 | model.config.bos_token_id = bos_candidate 74 | if bos_candidate is not None: 75 | break 76 | assert model.config.bos_token_id is not None 77 | model.config.decoder_start_token_id = model.config.bos_token_id 78 | 79 | eos_candidates = (tokenizer.eos_token_id, tokenizer.sep_token_id) 80 | for eos_candidate in eos_candidates: 81 | model.config.eos_token_id = eos_candidate 82 | if eos_candidate is not None: 83 | break 84 | assert model.config.eos_token_id is not None 85 | 86 | if use_resize: 87 | model.resize_token_embeddings(len(tokenizer)) 88 | 89 | return model 90 | 91 | 92 | def gen_batch(records, batch_size): 93 | batch_start = 0 94 | while batch_start < len(records): 95 | batch_end = batch_start + batch_size 96 | batch = records[batch_start: batch_end] 97 | batch_start = batch_end 98 | yield batch 99 | 100 | 101 | def print_special_tokens(tokenizer): 102 | print("Vocab size: ", tokenizer.vocab_size) 103 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token) 104 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token) 105 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token) 106 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token) 107 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token) 108 | return tokenizer 109 | 110 | # PAD: 0 111 | # BOS: 1 112 | # EOS: 2 113 | # UNK: 0 114 | # SEP: 1 115 | 116 | def fix_tokenizer_opt(tokenizer): 117 | # Fixing broken tokenizers 118 | special_tokens = { 119 | 'pad_token': '', 120 | 'bos_token': '', 121 | 'eos_token': '', 122 | 'unk_token': '', 123 | 'sep_token': '' 124 | 125 | } 126 | 127 | tokenizer.add_special_tokens(special_tokens) 128 | 129 | print("Vocab size: ", tokenizer.vocab_size) 130 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token) 131 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token) 132 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token) 133 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token) 134 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token) 135 | return tokenizer -------------------------------------------------------------------------------- /finetune/samsum-opt/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import torch 5 | import numpy as np 6 | 7 | 8 | def set_random_seed(seed): 9 | random.seed(seed) 10 | np.random.seed(seed) 11 | os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2" 12 | os.environ["PL_GLOBAL_SEED"] = str(seed) 13 | os.environ["PYTHONHASHSEED"] = str(seed) 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed_all(seed) 16 | torch.backends.cudnn.benchmark = False 17 | torch.backends.cudnn.deterministic = True 18 | 19 | 20 | def fix_tokenizer(tokenizer): 21 | # Fixing broken tokenizers 22 | special_tokens = dict() 23 | for token_id in range(1000): 24 | token = tokenizer.convert_ids_to_tokens(token_id) 25 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad" in token: 26 | special_tokens["pad_token"] = token 27 | if tokenizer.bos_token_id in (None, tokenizer.vocab_size) and "" in token: 28 | 
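The tokenizer and model fix-ups above are typically applied back to back before training or evaluation, as the eval scripts later in this directory do; a short sketch with a placeholder checkpoint name:

```python
# Placeholder checkpoint; fix_tokenizer / fix_model are the helpers defined in utils.py above.
from transformers import AutoTokenizer, AutoModelForCausalLM
from utils import fix_tokenizer, fix_model

tokenizer = fix_tokenizer(AutoTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False))
model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b")
model = fix_model(model, tokenizer, use_resize=False)   # sync pad/bos/eos ids onto the model config
```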
special_tokens["bos_token"] = token 29 | if tokenizer.eos_token_id in (None, tokenizer.vocab_size) and "" in token: 30 | special_tokens["eos_token"] = token 31 | if tokenizer.unk_token_id in (None, tokenizer.vocab_size) and "unk" in token: 32 | special_tokens["unk_token"] = token 33 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep" in token: 34 | special_tokens["sep_token"] = token 35 | 36 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "bos_token" in special_tokens: 37 | special_tokens["sep_token"] = special_tokens["bos_token"] 38 | 39 | if tokenizer.pad_token_id in (None, tokenizer.vocab_size) and "pad_token" not in special_tokens: 40 | if tokenizer.unk_token_id is not None: 41 | special_tokens["pad_token"] = tokenizer.unk_token 42 | else: 43 | special_tokens["pad_token"] = "<|pad|>" 44 | 45 | if tokenizer.sep_token_id in (None, tokenizer.vocab_size) and "sep_token" not in special_tokens: 46 | if tokenizer.bos_token_id is not None: 47 | special_tokens["sep_token"] = tokenizer.bos_token 48 | else: 49 | special_tokens["sep_token"] = "<|sep|>" 50 | print(special_tokens) 51 | tokenizer.add_special_tokens(special_tokens) 52 | 53 | print("Vocab size: ", tokenizer.vocab_size) 54 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token) 55 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token) 56 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token) 57 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token) 58 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token) 59 | return tokenizer 60 | 61 | 62 | def fix_model(model, tokenizer, use_resize=True): 63 | model.config.pad_token_id = tokenizer.pad_token_id 64 | assert model.config.pad_token_id is not None 65 | 66 | bos_candidates = ( 67 | tokenizer.bos_token_id, 68 | tokenizer.cls_token_id, 69 | tokenizer.sep_token_id, 70 | tokenizer.unk_token_id 71 | ) 72 | for bos_candidate in bos_candidates: 73 | model.config.bos_token_id = bos_candidate 74 | if bos_candidate is not None: 75 | break 76 | assert model.config.bos_token_id is not None 77 | model.config.decoder_start_token_id = model.config.bos_token_id 78 | 79 | eos_candidates = (tokenizer.eos_token_id, tokenizer.sep_token_id) 80 | for eos_candidate in eos_candidates: 81 | model.config.eos_token_id = eos_candidate 82 | if eos_candidate is not None: 83 | break 84 | assert model.config.eos_token_id is not None 85 | 86 | if use_resize: 87 | model.resize_token_embeddings(len(tokenizer)) 88 | 89 | return model 90 | 91 | 92 | def gen_batch(records, batch_size): 93 | batch_start = 0 94 | while batch_start < len(records): 95 | batch_end = batch_start + batch_size 96 | batch = records[batch_start: batch_end] 97 | batch_start = batch_end 98 | yield batch 99 | 100 | 101 | def print_special_tokens(tokenizer): 102 | print("Vocab size: ", tokenizer.vocab_size) 103 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token) 104 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token) 105 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token) 106 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token) 107 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token) 108 | return tokenizer 109 | 110 | # PAD: 0 111 | # BOS: 1 112 | # EOS: 2 113 | # UNK: 0 114 | # SEP: 1 115 | 116 | def fix_tokenizer_opt(tokenizer): 117 | # Fixing broken tokenizers 118 | special_tokens = { 119 | 'pad_token': '', 120 | 'bos_token': '', 121 | 'eos_token': '', 122 | 'unk_token': '', 123 | 'sep_token': '' 124 | 125 | } 126 | 127 | 
tokenizer.add_special_tokens(special_tokens) 128 | 129 | print("Vocab size: ", tokenizer.vocab_size) 130 | print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token) 131 | print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token) 132 | print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token) 133 | print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token) 134 | print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token) 135 | return tokenizer -------------------------------------------------------------------------------- /finetune/samsum-llama/eval_samsum_4bit_bnb.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True) 6 | parser.add_argument('--adapter', type=str, help='adapter ID for huggingface', required=True) 7 | parser.add_argument('--file_name', type=str, help='backup file name', required=True) 8 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 9 | 10 | # Parse the arguments 11 | args = parser.parse_args() 12 | 13 | # Use the command line arguments in your script 14 | print('Model Name:', args.model_name) 15 | print('Adapter Name: ', args.adapter) 16 | print('Output file:', args.file_name) 17 | print('Seed: ', args.seed) 18 | 19 | import random 20 | import json 21 | import os 22 | 23 | # import wandb 24 | import torch 25 | import numpy as np 26 | # import bitsandbytes as bnb 27 | from tqdm import tqdm 28 | import transformers 29 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 30 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 31 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 32 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training, PeftModel 33 | from datasets import load_dataset 34 | 35 | from utils import * 36 | from data import * 37 | 38 | import evaluate 39 | import numpy as np 40 | from datasets import load_from_disk 41 | from tqdm import tqdm 42 | 43 | 44 | output_dir = args.adapter 45 | model_name = args.model_name 46 | seed = args.seed 47 | train_sample_rate = 1.0 48 | val_sample_rate = 1.0 49 | local_rank = 0 50 | 51 | set_random_seed(seed) 52 | logging.set_verbosity_info() 53 | 54 | # with open(config_file, "r") as r: 55 | # config = json.load(r) 56 | 57 | os.environ["WANDB_DISABLED"] = "true" 58 | 59 | device_map = "auto" 60 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 61 | ddp = world_size != 1 62 | 63 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 64 | tokenizer = fix_tokenizer(tokenizer) 65 | # tokenizer.save_pretrained(output_dir) 66 | 67 | dataset = load_dataset('samsum') 68 | val_records = dataset['test'] 69 | 70 | ## Config for llama 7-b 71 | model_type = "causal" 72 | templates_path = "llama_lora_samsum.json" 73 | only_target_loss = False 74 | mode = "instruct" 75 | 76 | model_types = { 77 | "causal": AutoModelForCausalLM, 78 | "seq2seq": AutoModelForSeq2SeqLM 79 | } 80 | load_in_8bit = False 81 | load_in_4bit = True 82 | if load_in_8bit: 83 | assert not load_in_4bit 84 | model = model_types[model_type].from_pretrained( 85 | model_name, 86 | load_in_8bit=True, 87 | device_map=device_map 88 | ) 89 | elif load_in_4bit: 
90 | assert not load_in_8bit 91 | #use_bf16 = trainer_config.get("bf16", False) 92 | use_bf16 = True 93 | compute_dtype = torch.bfloat16 if use_bf16 else torch.float16 94 | model = model_types[model_type].from_pretrained( 95 | model_name, 96 | load_in_4bit=True, 97 | device_map=device_map, 98 | quantization_config=BitsAndBytesConfig( 99 | load_in_4bit=True, 100 | bnb_4bit_compute_dtype=compute_dtype, 101 | bnb_4bit_use_double_quant=True, 102 | ), 103 | torch_dtype=torch.bfloat16 if use_bf16 else torch.float32 104 | ) 105 | else: 106 | model = model_types[model_type].from_pretrained(model_name) 107 | 108 | # Default model generation params 109 | model = fix_model(model, tokenizer, use_resize=False) 110 | model.config.num_beams = 5 111 | 112 | 113 | peft_model_id = args.adapter 114 | model = PeftModel.from_pretrained(model, peft_model_id) 115 | 116 | # Metric 117 | metric = evaluate.load("rouge") 118 | 119 | def evaluate_peft_model(sample,max_target_length=45): 120 | # Load dataset from the hub and get a sample 121 | sample_word = f"### Summarize this: {sample}\n ### Output: " 122 | with torch.inference_mode(), torch.autocast("cuda"): 123 | input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda() 124 | outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens = 45) 125 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"") 126 | print(f"Output:\n{output}") 127 | # Some simple post-processing 128 | return output 129 | 130 | # run predictions 131 | # this can take ~45 minutes 132 | predictions = [] 133 | for sample in tqdm(dataset['test']['dialogue']): 134 | p = evaluate_peft_model(sample) 135 | predictions.append(p) 136 | 137 | # compute metric 138 | rogue = metric.compute(predictions=predictions, references=dataset['test']['summary'], use_stemmer=True) 139 | 140 | # print results 141 | print(f'Seed: {seed}') 142 | print(f"Rogue1: {rogue['rouge1']* 100:2f}%") 143 | print(f"rouge2: {rogue['rouge2']* 100:2f}%") 144 | print(f"rougeL: {rogue['rougeL']* 100:2f}%") 145 | print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%") 146 | 147 | file_name = args.file_name 148 | with open(file_name, 'w') as f: 149 | for item in predictions: 150 | # write each item on a new line 151 | f.write("%s\n" % item) 152 | f.write(f'Seed: {seed}') 153 | f.write(f"Rogue1: {rogue['rouge1']* 100:2f}%") 154 | f.write(f"rouge2: {rogue['rouge2']* 100:2f}%") 155 | f.write(f"rougeL: {rogue['rougeL']* 100:2f}%") 156 | f.write(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%") -------------------------------------------------------------------------------- /finetune/samsum-llama/data.py: -------------------------------------------------------------------------------- 1 | import random 2 | import json 3 | from typing import Optional 4 | from dataclasses import dataclass 5 | from typing import List, Dict, Tuple, Any 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.utils.data import Dataset 11 | from transformers import AutoTokenizer, PreTrainedTokenizerBase 12 | from tqdm import tqdm 13 | 14 | 15 | class InstructDataset(Dataset): 16 | def __init__( 17 | self, 18 | original_records: List[Dict], 19 | tokenizer: AutoTokenizer, 20 | max_source_tokens_count: int, 21 | max_target_tokens_count: int, 22 | templates_path: str, 23 | sample_rate: float = 1.0, 24 | only_target_loss: bool = True, 25 | input_type: str = "causal", 26 | target_field: str = "human_reference", 27 | 
source_field: str = "input", 28 | use_padding: bool = False 29 | ): 30 | self.original_records = original_records 31 | self.sample_rate = sample_rate 32 | self.tokenizer = tokenizer 33 | self.max_source_tokens_count = max_source_tokens_count 34 | self.max_target_tokens_count = max_target_tokens_count 35 | self.only_target_loss = only_target_loss 36 | self.input_type = input_type 37 | self.target_field = target_field 38 | self.source_field = source_field 39 | self.use_padding = use_padding 40 | self.is_printed = False 41 | 42 | with open(templates_path) as r: 43 | self.templates = json.load(r) 44 | 45 | self.records = [] 46 | for record in tqdm(original_records): #original dataset 47 | if random.random() > self.sample_rate: 48 | continue 49 | tensors = self.convert_record(record) 50 | if tensors is None: 51 | continue 52 | self.records.append(tensors) 53 | 54 | def __len__(self): 55 | return len(self.records) 56 | 57 | def __getitem__(self, index): 58 | return self.records[index] 59 | 60 | def convert_record(self, record): 61 | instruction = record["dialogue"] 62 | #inp = record[self.source_field] #basically no use 63 | out = record[self.target_field] 64 | # if inp.strip() != "" and False: 65 | # templates = self.templates["prompts_input"] 66 | # prompt_template = random.choice(templates) 67 | # source = prompt_template.format(instruction=instruction.strip(), inp=inp.strip()) 68 | # else: 69 | templates = self.templates["prompts_no_input"] ## This is what we want 70 | prompt_template = random.choice(templates) 71 | source = prompt_template.format(instruction=instruction.strip()) ## put the prompt inside 72 | target = out.strip() 73 | if not self.is_printed: 74 | print("Source and target examples") 75 | print(source) 76 | print(target) 77 | self.is_printed = True 78 | if self.input_type == "causal": 79 | return self.convert_causal(source, target) 80 | elif self.input_type == "seq2seq": 81 | return self.convert_seq2seq(source, target) 82 | else: 83 | assert False 84 | 85 | def convert_causal(self, source, target=None): 86 | source_tokens = self.tokenizer( 87 | source, 88 | add_special_tokens=False, 89 | max_length=self.max_source_tokens_count, 90 | padding=False, 91 | truncation=True 92 | )["input_ids"] 93 | ## added the box_token id 94 | if self.tokenizer.bos_token_id: 95 | source_tokens.insert(0, self.tokenizer.bos_token_id) ## box_token_id 96 | input_ids = source_tokens[:] 97 | actual_length = len(input_ids) 98 | max_length = self.max_source_tokens_count + self.max_target_tokens_count + 2 99 | if target is not None: 100 | target_tokens = self.tokenizer( 101 | target, 102 | add_special_tokens=False, 103 | max_length=self.max_target_tokens_count, 104 | padding=False, 105 | truncation=True 106 | )["input_ids"] 107 | input_ids += target_tokens + [self.tokenizer.eos_token_id] ## eos_token_id 108 | actual_length = len(input_ids) 109 | if self.use_padding: 110 | padding = [self.tokenizer.pad_token_id for i in range(len(input_ids), max_length)] 111 | input_ids.extend(padding) 112 | 113 | input_ids = torch.LongTensor(input_ids) 114 | labels = input_ids.clone() 115 | attention_mask = input_ids.new_ones(input_ids.size()) 116 | if self.use_padding: 117 | labels[actual_length:] = -100 118 | attention_mask[actual_length:] = 0 119 | if self.only_target_loss: 120 | labels[:len(source_tokens)] = -100 121 | assert input_ids.size(0) == labels.size(0) == attention_mask.size(0) <= max_length 122 | 123 | return { 124 | "input_ids": input_ids, 125 | "labels": labels, 126 | "attention_mask": attention_mask 127 | 
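To make the `convert_causal` bookkeeping above concrete, a sketch of the resulting tensors for a toy source/target pair; the token ids are invented and only illustrate the masking and padding scheme:

```python
# Invented token ids purely to illustrate the masking/padding bookkeeping above.
import torch

bos, eos, pad = 1, 2, 0
source_tokens = [bos, 101, 102, 103]            # prompt tokens with bos prepended
target_tokens = [201, 202]                      # reference summary tokens
input_ids = source_tokens + target_tokens + [eos]
actual_length = len(input_ids)                  # 7
max_length = 10
input_ids = input_ids + [pad] * (max_length - actual_length)    # use_padding=True branch

labels = list(input_ids)
labels[actual_length:] = [-100] * (max_length - actual_length)  # padded positions ignored by the loss
# with only_target_loss=True the prompt positions would be masked as well:
# labels[:len(source_tokens)] = [-100] * len(source_tokens)
attention_mask = [1] * actual_length + [0] * (max_length - actual_length)
print(torch.LongTensor(input_ids), torch.LongTensor(labels), torch.LongTensor(attention_mask))
```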
} 128 | 129 | def convert_seq2seq(self, source, target=None): 130 | inputs = self.tokenizer( 131 | source, 132 | add_special_tokens=True, 133 | max_length=self.max_source_tokens_count, 134 | padding=False, 135 | truncation=True, 136 | return_tensors="pt" 137 | ) 138 | inputs = {k: v.squeeze(0) for k, v in inputs.items()} 139 | if target is not None: 140 | outputs = self.tokenizer( 141 | target, 142 | add_special_tokens=True, 143 | max_length=self.max_target_tokens_count, 144 | padding=False, 145 | truncation=True, 146 | return_tensors="pt" 147 | ) 148 | labels = outputs["input_ids"].squeeze(0).tolist() 149 | if labels[-1] != self.tokenizer.eos_token_id: 150 | labels.append(self.tokenizer.eos_token_id) 151 | inputs["labels"] = torch.LongTensor(labels) 152 | return inputs 153 | -------------------------------------------------------------------------------- /finetune/samsum-opt/data.py: -------------------------------------------------------------------------------- 1 | import random 2 | import json 3 | from typing import Optional 4 | from dataclasses import dataclass 5 | from typing import List, Dict, Tuple, Any 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.utils.data import Dataset 11 | from transformers import AutoTokenizer, PreTrainedTokenizerBase 12 | from tqdm import tqdm 13 | 14 | 15 | class InstructDataset(Dataset): 16 | def __init__( 17 | self, 18 | original_records: List[Dict], 19 | tokenizer: AutoTokenizer, 20 | max_source_tokens_count: int, 21 | max_target_tokens_count: int, 22 | templates_path: str, 23 | sample_rate: float = 1.0, 24 | only_target_loss: bool = True, 25 | input_type: str = "causal", 26 | target_field: str = "human_reference", 27 | source_field: str = "input", 28 | use_padding: bool = False 29 | ): 30 | self.original_records = original_records 31 | self.sample_rate = sample_rate 32 | self.tokenizer = tokenizer 33 | self.max_source_tokens_count = max_source_tokens_count 34 | self.max_target_tokens_count = max_target_tokens_count 35 | self.only_target_loss = only_target_loss 36 | self.input_type = input_type 37 | self.target_field = target_field 38 | self.source_field = source_field 39 | self.use_padding = use_padding 40 | self.is_printed = False 41 | 42 | with open(templates_path) as r: 43 | self.templates = json.load(r) 44 | 45 | self.records = [] 46 | for record in tqdm(original_records): #original dataset 47 | if random.random() > self.sample_rate: 48 | continue 49 | tensors = self.convert_record(record) 50 | if tensors is None: 51 | continue 52 | self.records.append(tensors) 53 | 54 | def __len__(self): 55 | return len(self.records) 56 | 57 | def __getitem__(self, index): 58 | return self.records[index] 59 | 60 | def convert_record(self, record): 61 | instruction = record["dialogue"] 62 | #inp = record[self.source_field] #basically no use 63 | out = record[self.target_field] 64 | # if inp.strip() != "" and False: 65 | # templates = self.templates["prompts_input"] 66 | # prompt_template = random.choice(templates) 67 | # source = prompt_template.format(instruction=instruction.strip(), inp=inp.strip()) 68 | # else: 69 | templates = self.templates["prompts_no_input"] ## This is what we want 70 | prompt_template = random.choice(templates) 71 | source = prompt_template.format(instruction=instruction.strip()) ## put the prompt inside 72 | target = out.strip() 73 | if not self.is_printed: 74 | print("Source and target examples") 75 | print(source) 76 | print(target) 77 | self.is_printed = True 78 | if self.input_type == 
"causal": 79 | return self.convert_causal(source, target) 80 | elif self.input_type == "seq2seq": 81 | return self.convert_seq2seq(source, target) 82 | else: 83 | assert False 84 | 85 | def convert_causal(self, source, target=None): 86 | source_tokens = self.tokenizer( 87 | source, 88 | add_special_tokens=False, 89 | max_length=self.max_source_tokens_count, 90 | padding=False, 91 | truncation=True 92 | )["input_ids"] 93 | ## added the box_token id 94 | if self.tokenizer.bos_token_id: 95 | source_tokens.insert(0, self.tokenizer.bos_token_id) ## box_token_id 96 | input_ids = source_tokens[:] 97 | actual_length = len(input_ids) 98 | max_length = self.max_source_tokens_count + self.max_target_tokens_count + 2 99 | if target is not None: 100 | target_tokens = self.tokenizer( 101 | target, 102 | add_special_tokens=False, 103 | max_length=self.max_target_tokens_count, 104 | padding=False, 105 | truncation=True 106 | )["input_ids"] 107 | input_ids += target_tokens + [self.tokenizer.eos_token_id] ## eos_token_id 108 | actual_length = len(input_ids) 109 | if self.use_padding: 110 | padding = [self.tokenizer.pad_token_id for i in range(len(input_ids), max_length)] 111 | input_ids.extend(padding) 112 | 113 | input_ids = torch.LongTensor(input_ids) 114 | labels = input_ids.clone() 115 | attention_mask = input_ids.new_ones(input_ids.size()) 116 | if self.use_padding: 117 | labels[actual_length:] = -100 118 | attention_mask[actual_length:] = 0 119 | if self.only_target_loss: 120 | labels[:len(source_tokens)] = -100 121 | assert input_ids.size(0) == labels.size(0) == attention_mask.size(0) <= max_length 122 | 123 | return { 124 | "input_ids": input_ids, 125 | "labels": labels, 126 | "attention_mask": attention_mask 127 | } 128 | 129 | def convert_seq2seq(self, source, target=None): 130 | inputs = self.tokenizer( 131 | source, 132 | add_special_tokens=True, 133 | max_length=self.max_source_tokens_count, 134 | padding=False, 135 | truncation=True, 136 | return_tensors="pt" 137 | ) 138 | inputs = {k: v.squeeze(0) for k, v in inputs.items()} 139 | if target is not None: 140 | outputs = self.tokenizer( 141 | target, 142 | add_special_tokens=True, 143 | max_length=self.max_target_tokens_count, 144 | padding=False, 145 | truncation=True, 146 | return_tensors="pt" 147 | ) 148 | labels = outputs["input_ids"].squeeze(0).tolist() 149 | if labels[-1] != self.tokenizer.eos_token_id: 150 | labels.append(self.tokenizer.eos_token_id) 151 | inputs["labels"] = torch.LongTensor(labels) 152 | return inputs 153 | -------------------------------------------------------------------------------- /finetune/samsum-opt/eval_samsum_opt_4bit_llmtune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='HF model name with your user', required=True) 6 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True) 7 | parser.add_argument('--file_name', type=str, help='backup file name', required=True) 8 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 9 | 10 | # Parse the arguments 11 | args = parser.parse_args() 12 | 13 | # Use the command line arguments in your script 14 | print('Model Name:', args.model_name) 15 | print('Adapter Path: ', args.adapter) 16 | print('Seed: ', args.seed) 17 | 18 | import random 19 | import json 20 | import os 21 
| 22 | # import wandb 23 | import torch 24 | import numpy as np 25 | # import bitsandbytes as bnb 26 | from tqdm import tqdm 27 | import transformers 28 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 29 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 30 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 31 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 32 | from datasets import load_dataset 33 | 34 | from utils import * 35 | from data import * 36 | 37 | import evaluate 38 | import numpy as np 39 | from datasets import load_from_disk 40 | from tqdm import tqdm 41 | 42 | from llmtune.llms.autollm import AutoLLMForCausalLM 43 | from llmtune.engine.lora.config import FinetuneConfig 44 | from llmtune.engine.lora.peft import quant_peft 45 | from llmtune.utils import to_half_precision 46 | 47 | output_dir = args.adapter 48 | seed = args.seed 49 | train_sample_rate = 1.0 50 | val_sample_rate = 1.0 51 | local_rank = 0 52 | 53 | # model config 54 | model_name = args.model_name 55 | tokenizer_name = "facebook/opt-6.7b" 56 | DEV = 'cuda' 57 | 58 | set_random_seed(42) 59 | logging.set_verbosity_info() 60 | 61 | # with open(config_file, "r") as r: 62 | # config = json.load(r) 63 | 64 | device_map = "auto" 65 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 66 | ddp = world_size != 1 67 | 68 | transformers.logging.set_verbosity_info() 69 | 70 | # load tokenizer 71 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) 72 | tokenizer.pad_token_id = 0 73 | ## Fix Tokenizer 74 | tokenizer = fix_tokenizer_opt(tokenizer) 75 | 76 | # load model 77 | llm = AutoLLMForCausalLM.from_pretrained(model_name) 78 | ## Fix Model 79 | lllm = fix_model(llm, tokenizer, use_resize=False) 80 | llm.eval() 81 | llm = llm.to(DEV) 82 | llm = to_half_precision(llm) 83 | 84 | 85 | 86 | ## dataset 87 | dataset = load_dataset('samsum') 88 | train_records = dataset['train'] 89 | val_records = dataset['test'] 90 | #random.shuffle(train_records) 91 | print("train_record[0]: ",train_records[0]) 92 | 93 | ## Config for llama 7-b 94 | model_type = "causal" 95 | templates_path = "llama_lora_samsum.json" 96 | only_target_loss = False 97 | mode = "instruct" 98 | 99 | 100 | adapter_path = args.adapter 101 | model = quant_peft.PeftModel.from_pretrained( 102 | llm, adapter_path, 103 | device_map='auto' 104 | ) 105 | print(adapter_path, 'loaded') 106 | 107 | 108 | # Model configs 109 | model.config.num_beams = 5 110 | 111 | 112 | # Metric 113 | metric = evaluate.load("rouge") 114 | 115 | def evaluate_peft_model(sample,max_target_length=45): 116 | # Load dataset from the hub and get a sample 117 | sample_word = f"### Summarize this: {sample}\n ### Output: " 118 | input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda() 119 | # with torch.inference_mode(), torch.autocast("cuda"): 120 | print("input_ids: ",input_ids) 121 | outputs = model.generate(input_ids=input_ids, do_sample=True, max_new_tokens = 45) 122 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"") 123 | print(f"Output:\n{output}") 124 | # Some simple post-processing 125 | return output 126 | 127 | # run predictions 128 | # this can take ~45 minutes 129 | predictions = [] 130 | for sample in tqdm(dataset['test']['dialogue']): 131 | p = evaluate_peft_model(sample) 132 
| predictions.append(p) 133 | 134 | # compute metric 135 | 136 | 137 | file_name = args.file_name 138 | # with open(file_name, 'w') as f: 139 | # for item in predictions: 140 | # # write each item on a new line 141 | # f.write("%s\n" % item) 142 | # f.write(f'Seed: {seed}') 143 | 144 | 145 | # def process_file(filename): 146 | # output_list = [] 147 | # delete_lines = False 148 | # with open(filename, 'r') as file: 149 | # for line in file: 150 | # stripped_line = line.strip() 151 | # if stripped_line.startswith("### Summarize this:"): 152 | # delete_lines = True 153 | # continue 154 | # elif stripped_line.startswith("### Output: "): 155 | # output = stripped_line[len("### Output: "):] 156 | # output_list.append(output) 157 | # delete_lines = False 158 | # continue 159 | 160 | # if not delete_lines: 161 | # output_list.append(stripped_line) 162 | 163 | # return output_list 164 | 165 | # predictions = process_file(file_name) 166 | # predictions.pop() 167 | 168 | rouge = metric.compute(predictions=predictions, references=dataset['test']['summary'], use_stemmer=True) 169 | 170 | # print results 171 | print(f"Rouge1: {rouge['rouge1']* 100:.2f}%") 172 | print(f"rouge2: {rouge['rouge2']* 100:.2f}%") 173 | print(f"rougeL: {rouge['rougeL']* 100:.2f}%") 174 | print(f"rougeLsum: {rouge['rougeLsum']* 100:.2f}%") 175 | 176 | with open(file_name, 'w') as f: 177 | for item in predictions: 178 | # write each item on a new line 179 | f.write("%s\n" % item) 180 | f.write(f'Seed: {seed}\n') 181 | f.write(f"Rouge1: {rouge['rouge1']* 100:.2f}%\n") 182 | f.write(f"rouge2: {rouge['rouge2']* 100:.2f}%\n") 183 | f.write(f"rougeL: {rouge['rougeL']* 100:.2f}%\n") 184 | f.write(f"rougeLsum: {rouge['rougeLsum']* 100:.2f}%\n") 185 | 186 | -------------------------------------------------------------------------------- /finetune/mnli-llama/data_mnli_label.py: -------------------------------------------------------------------------------- 1 | import random 2 | import json 3 | from typing import Optional 4 | from dataclasses import dataclass 5 | from typing import List, Dict, Tuple, Any 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.utils.data import Dataset 11 | from transformers import AutoTokenizer, PreTrainedTokenizerBase 12 | from tqdm import tqdm 13 | 14 | 15 | class InstructDataset(Dataset): 16 | def __init__( 17 | self, 18 | original_records: List[Dict], 19 | tokenizer: AutoTokenizer, 20 | max_source_tokens_count: int, 21 | max_target_tokens_count: int, 22 | templates_path: str, 23 | sample_rate: float = 1.0, 24 | only_target_loss: bool = True, 25 | input_type: str = "causal", 26 | target_field: str = "human_reference", 27 | source_field: str = "input", 28 | use_padding: bool = False 29 | ): 30 | self.original_records = original_records 31 | self.sample_rate = sample_rate 32 | self.tokenizer = tokenizer 33 | self.max_source_tokens_count = max_source_tokens_count 34 | self.max_target_tokens_count = max_target_tokens_count 35 | self.only_target_loss = only_target_loss 36 | self.input_type = input_type 37 | self.target_field = target_field 38 | self.source_field = source_field 39 | self.use_padding = use_padding 40 | self.is_printed = False 41 | 42 | with open(templates_path) as r: 43 | self.templates = json.load(r) 44 | 45 | self.records = [] 46 | for record in tqdm(original_records): #original dataset 47 | if random.random() > self.sample_rate: 48 | continue 49 | tensors = self.convert_record(record) 50 | if tensors is None: 51 | continue 52 | self.records.append(tensors)
53 | 54 | def __len__(self): 55 | return len(self.records) 56 | 57 | def __getitem__(self, index): 58 | return self.records[index] 59 | 60 | def convert_record(self, record): 61 | instruction = record["premise"] 62 | hypothesis = record["hypothesis"] 63 | genre = record["genre"] 64 | #inp = record[self.source_field] #basically no use 65 | out = record["label"] 66 | # if inp.strip() != "" and False: 67 | # templates = self.templates["prompts_input"] 68 | # prompt_template = random.choice(templates) 69 | # source = prompt_template.format(instruction=instruction.strip(), inp=inp.strip()) 70 | # else: 71 | templates = self.templates["prompts_no_input"] ## This is what we want 72 | prompt_template = random.choice(templates) 73 | source = prompt_template.format(instruction=instruction.strip(), hypothesis=hypothesis.strip(), genre=genre.strip()) ## put the prompt inside 74 | target = str(out) 75 | if not self.is_printed: 76 | print("Source and target examples") 77 | print(source) 78 | print(target) 79 | self.is_printed = True 80 | if self.input_type == "causal": 81 | return self.convert_causal(source, target) 82 | elif self.input_type == "seq2seq": 83 | return self.convert_seq2seq(source, target) 84 | else: 85 | assert False 86 | 87 | def convert_causal(self, source, target=None): 88 | source_tokens = self.tokenizer( 89 | source, 90 | add_special_tokens=False, 91 | max_length=self.max_source_tokens_count, 92 | padding=False, 93 | truncation=True 94 | )["input_ids"] 95 | ## added the box_token id 96 | if self.tokenizer.bos_token_id: 97 | source_tokens.insert(0, self.tokenizer.bos_token_id) ## box_token_id 98 | input_ids = source_tokens[:] 99 | actual_length = len(input_ids) 100 | max_length = self.max_source_tokens_count + self.max_target_tokens_count + 2 101 | if target is not None: 102 | target_tokens = self.tokenizer( 103 | target, 104 | add_special_tokens=False, 105 | max_length=self.max_target_tokens_count, 106 | padding=False, 107 | truncation=True 108 | )["input_ids"] 109 | input_ids += target_tokens + [self.tokenizer.eos_token_id] ## eos_token_id 110 | actual_length = len(input_ids) 111 | if self.use_padding: 112 | padding = [self.tokenizer.pad_token_id for i in range(len(input_ids), max_length)] 113 | input_ids.extend(padding) 114 | 115 | input_ids = torch.LongTensor(input_ids) 116 | labels = input_ids.clone() 117 | attention_mask = input_ids.new_ones(input_ids.size()) 118 | if self.use_padding: 119 | labels[actual_length:] = -100 120 | attention_mask[actual_length:] = 0 121 | if self.only_target_loss: 122 | labels[:len(source_tokens)] = -100 123 | assert input_ids.size(0) == labels.size(0) == attention_mask.size(0) <= max_length 124 | 125 | return { 126 | "input_ids": input_ids, 127 | "labels": labels, 128 | "attention_mask": attention_mask 129 | } 130 | 131 | def convert_seq2seq(self, source, target=None): 132 | inputs = self.tokenizer( 133 | source, 134 | add_special_tokens=True, 135 | max_length=self.max_source_tokens_count, 136 | padding=False, 137 | truncation=True, 138 | return_tensors="pt" 139 | ) 140 | inputs = {k: v.squeeze(0) for k, v in inputs.items()} 141 | if target is not None: 142 | outputs = self.tokenizer( 143 | target, 144 | add_special_tokens=True, 145 | max_length=self.max_target_tokens_count, 146 | padding=False, 147 | truncation=True, 148 | return_tensors="pt" 149 | ) 150 | labels = outputs["input_ids"].squeeze(0).tolist() 151 | if labels[-1] != self.tokenizer.eos_token_id: 152 | labels.append(self.tokenizer.eos_token_id) 153 | inputs["labels"] = 
torch.LongTensor(labels) 154 | return inputs 155 | -------------------------------------------------------------------------------- /llmtune/engine/quant/gptq/algorithm.py: -------------------------------------------------------------------------------- 1 | import math 2 | import time 3 | 4 | import torch 5 | import torch.nn as nn 6 | import transformers 7 | 8 | DEBUG = False 9 | torch.backends.cuda.matmul.allow_tf32 = False 10 | torch.backends.cudnn.allow_tf32 = False 11 | 12 | def quantize(x, scale, zero, maxq): 13 | if maxq < 0: 14 | return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero 15 | q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) 16 | return scale * (q - zero) 17 | 18 | class GPTQ: 19 | def __init__(self, layer): 20 | self.layer = layer 21 | self.dev = self.layer.weight.device 22 | W = layer.weight.data.clone() 23 | if isinstance(self.layer, nn.Conv2d): 24 | W = W.flatten(1) 25 | if isinstance(self.layer, transformers.Conv1D): 26 | W = W.t() 27 | self.rows = W.shape[0] 28 | self.columns = W.shape[1] 29 | self.H = torch.zeros((self.columns, self.columns), device=self.dev) 30 | self.nsamples = 0 31 | 32 | def add_batch(self, inp, out): 33 | if DEBUG: 34 | self.inp1 = inp 35 | self.out1 = out 36 | if len(inp.shape) == 2: 37 | inp = inp.unsqueeze(0) 38 | tmp = inp.shape[0] 39 | if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D): 40 | if len(inp.shape) == 3: 41 | inp = inp.reshape((-1, inp.shape[-1])) 42 | inp = inp.t() 43 | if isinstance(self.layer, nn.Conv2d): 44 | unfold = nn.Unfold( 45 | self.layer.kernel_size, 46 | dilation=self.layer.dilation, 47 | padding=self.layer.padding, 48 | stride=self.layer.stride 49 | ) 50 | inp = unfold(inp) 51 | inp = inp.permute([1, 0, 2]) 52 | inp = inp.flatten(1) 53 | self.H *= self.nsamples / (self.nsamples + tmp) 54 | self.nsamples += tmp 55 | # inp = inp.float() 56 | inp = math.sqrt(2 / self.nsamples) * inp.float() 57 | # self.H += 2 / self.nsamples * inp.matmul(inp.t()) 58 | self.H += inp.matmul(inp.t()) 59 | 60 | def fasterquant( 61 | self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False 62 | ): 63 | W = self.layer.weight.data.clone() 64 | if isinstance(self.layer, nn.Conv2d): 65 | W = W.flatten(1) 66 | if isinstance(self.layer, transformers.Conv1D): 67 | W = W.t() 68 | W = W.float() 69 | 70 | tick = time.time() 71 | 72 | if not self.quantizer.ready(): 73 | self.quantizer.find_params(W, weight=True) 74 | 75 | H = self.H 76 | del self.H 77 | dead = torch.diag(H) == 0 78 | H[dead, dead] = 1 79 | W[:, dead] = 0 80 | 81 | if actorder: 82 | perm = torch.argsort(torch.diag(H), descending=True) 83 | W = W[:, perm] 84 | H = H[perm][:, perm] 85 | 86 | Losses = torch.zeros_like(W) 87 | Q = torch.zeros_like(W) 88 | 89 | damp = percdamp * torch.mean(torch.diag(H)) 90 | diag = torch.arange(self.columns, device=self.dev) 91 | H[diag, diag] += damp 92 | H = torch.linalg.cholesky(H) 93 | H = torch.cholesky_inverse(H) 94 | H = torch.linalg.cholesky(H, upper=True) 95 | Hinv = H 96 | 97 | g_idx = [] 98 | scale = [] 99 | zero = [] 100 | now_idx = 1 101 | 102 | for i1 in range(0, self.columns, blocksize): 103 | i2 = min(i1 + blocksize, self.columns) 104 | count = i2 - i1 105 | 106 | W1 = W[:, i1:i2].clone() 107 | Q1 = torch.zeros_like(W1) 108 | Err1 = torch.zeros_like(W1) 109 | Losses1 = torch.zeros_like(W1) 110 | Hinv1 = Hinv[i1:i2, i1:i2] 111 | 112 | for i in range(count): 113 | w = W1[:, i] 114 | d = Hinv1[i, i] 115 | 116 | if groupsize != -1: 117 | if (i1 + i) % groupsize == 0: 118 | 
self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True) 119 | 120 | if ((i1 + i) // groupsize) - now_idx == -1: 121 | scale.append(self.quantizer.scale) 122 | zero.append(self.quantizer.zero) 123 | now_idx += 1 124 | 125 | q = quantize( 126 | w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq 127 | ).flatten() 128 | Q1[:, i] = q 129 | Losses1[:, i] = (w - q) ** 2 / d ** 2 130 | 131 | err1 = (w - q) / d 132 | W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) 133 | Err1[:, i] = err1 134 | 135 | Q[:, i1:i2] = Q1 136 | Losses[:, i1:i2] = Losses1 / 2 137 | 138 | W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) 139 | 140 | if DEBUG: 141 | self.layer.weight.data[:, :i2] = Q[:, :i2] 142 | self.layer.weight.data[:, i2:] = W[:, i2:] 143 | print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) 144 | print(torch.sum(Losses)) 145 | 146 | torch.cuda.synchronize() 147 | print('time %.2f' % (time.time() - tick)) 148 | print('error', torch.sum(Losses).item()) 149 | 150 | groupsize = groupsize if groupsize != -1 else self.columns 151 | g_idx = [i // groupsize for i in range(self.columns)] 152 | g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device) 153 | if actorder: 154 | invperm = torch.argsort(perm) 155 | Q = Q[:, invperm] 156 | g_idx = g_idx[invperm] 157 | 158 | if isinstance(self.layer, transformers.Conv1D): 159 | Q = Q.t() 160 | self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) 161 | if DEBUG: 162 | print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) 163 | 164 | if scale == []: 165 | scale.append(self.quantizer.scale) 166 | zero.append(self.quantizer.zero) 167 | scale = torch.cat(scale,dim=1) 168 | zero = torch.cat(zero,dim=1) 169 | return scale,zero,g_idx 170 | 171 | def free(self): 172 | if DEBUG: 173 | self.inp1 = None 174 | self.out1 = None 175 | self.H = None 176 | self.Losses = None 177 | self.Trace = None 178 | torch.cuda.empty_cache() 179 | -------------------------------------------------------------------------------- /llmtune/llms/autollm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch import nn 4 | from typing import Dict, List, Optional, Union 5 | from transformers import AutoTokenizer 6 | from transformers.utils.hub import ( 7 | PushToHubMixin, cached_file, create_repo, 8 | create_commit, CommitOperationAdd 9 | ) 10 | from llmtune.llms.config import AutoLLMConfig, LLMType 11 | from llmtune.llms.llama.model import load_llama, load_llama_tokenizer 12 | from llmtune.llms.opt.model import load_opt, load_opt_tokenizer 13 | from llmtune.llms.bloom.model import load_bloom, load_bloom_tokenizer 14 | 15 | def get_default_tokenizer(name_or_path, model_type=None): 16 | if model_type is not None: 17 | if model_type == 'llama': 18 | return load_llama_tokenizer(name_or_path) 19 | elif model_type == 'opt': 20 | return load_opt_tokenizer(name_or_path) 21 | elif model_type == 'bloom': 22 | return load_bloom_tokenizer(name_or_path) 23 | else: 24 | raise ValueError() 25 | else: 26 | return AutoTokenizer.from_pretrained(name_or_path) 27 | 28 | class AutoLLMForCausalLM(nn.Module, PushToHubMixin): 29 | def __init__( 30 | self, 31 | base_model, 32 | llm_config 33 | ): 34 | super().__init__() 35 | self.base_model = base_model 36 | self.llm_config = llm_config 37 | 38 | @property 39 | def is_quantized(self): 40 | return self.llm_config.is_quantized 41 | 42 | def set_quant_config(self, quant_config): 43 | 
self.llm_config.set_quant_config(quant_config) 44 | 45 | @property 46 | def device(self): 47 | if not self.hf_device_map: 48 | return self.base_model.device 49 | else: 50 | device = [ 51 | d for d in self.hf_device_map.values() 52 | if d not in {'cpu', 'disk'} 53 | ][0] 54 | return torch.device(device) 55 | 56 | @property 57 | def hf_device_map(self): 58 | return getattr(self.base_model, "hf_device_map", None) 59 | 60 | @property 61 | def config(self): 62 | return self.base_model.config 63 | 64 | @property 65 | def _keys_to_ignore_on_save(self): 66 | return self.base_model._keys_to_ignore_on_save 67 | 68 | @property 69 | def _no_split_modules(self): 70 | return self.base_model._no_split_modules 71 | 72 | def to(self, device: Union[str, torch.device]): 73 | self.base_model = self.base_model.to(device) 74 | return self 75 | 76 | def forward(self, *args, **kwargs): 77 | return self.base_model(*args, **kwargs) 78 | 79 | def generate(self, **kwargs): 80 | with ( 81 | torch.inference_mode(), 82 | torch.amp.autocast(device_type=self.device.type) 83 | ): 84 | return self.base_model.generate(**kwargs) 85 | 86 | def prepare_inputs_for_generation(self, *args, **kwargs): 87 | return self.base_model.prepare_inputs_for_generation(*args, **kwargs) 88 | 89 | @classmethod 90 | def from_pretrained( 91 | cls, 92 | model_name_or_path: str, 93 | device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, 94 | device: Optional[Union[str, int]] = None, 95 | ): 96 | # load config 97 | llm_config = AutoLLMConfig.from_pretrained(model_name_or_path) 98 | load_quantized = llm_config.quant_config is not None 99 | 100 | # resolve path to checkpoint (could be None) 101 | checkpoint = None 102 | if load_quantized: 103 | if os.path.isdir(model_name_or_path): 104 | checkpoint = os.path.join( 105 | model_name_or_path, 'quantized_weights.pt' 106 | ) 107 | else: # remote 108 | checkpoint = cached_file( 109 | model_name_or_path, 'quantized_weights.pt' 110 | ) 111 | if checkpoint is None: 112 | raise FileNotFoundError( 113 | f"Couldn't find quantized weights in {model_name_or_path}" 114 | ) 115 | 116 | # load base model 117 | if llm_config.model_type == LLMType.LLAMA.value: 118 | model = load_llama(llm_config, checkpoint) 119 | elif llm_config.model_type == LLMType.OPT.value: 120 | model = load_opt(llm_config, checkpoint) 121 | elif llm_config.model_type == LLMType.BLOOM.value: 122 | model = load_bloom(llm_config, checkpoint) 123 | else: 124 | raise NotImplementedError( 125 | f'{llm_config.model_type} not supported' 126 | ) 127 | 128 | return cls(model, llm_config) 129 | 130 | def save_pretrained(self, save_dir: str): 131 | os.makedirs(save_dir, exist_ok=True) 132 | print('test') 133 | 134 | # save config 135 | self.llm_config.save_pretrained(save_dir) 136 | 137 | # save base model 138 | self.base_model.to('cpu') 139 | print(self.llm_config.quant_config) 140 | if not self.is_quantized: 141 | self.base_model.save_pretrained(save_dir) 142 | else: 143 | torch.save( 144 | self.base_model.state_dict(), 145 | os.path.join(save_dir, 'quantized_weights.pt') 146 | ) 147 | self.llm_config.base_config.model_name_or_path = save_dir 148 | 149 | def push_to_hub( 150 | self, 151 | repo_id: str, 152 | save_dir: str, 153 | commit_message: Optional[str] = "", 154 | use_auth_token: Optional[Union[bool, str]] = None, 155 | private: Optional[bool] = None, 156 | token: Optional[Union[bool, str]] = None, 157 | create_pr: Optional[bool] = False, 158 | ) -> str: 159 | 160 | if not os.path.exists(save_dir): 161 | print(f"Saving model to 
{save_dir}") 162 | self.save_pretrained(save_dir) 163 | 164 | repo_url = create_repo( 165 | repo_id=repo_id, token=token, private=private, 166 | exist_ok=True, repo_type="model" 167 | ) 168 | repo_id = repo_url.repo_id 169 | 170 | operations = [ 171 | CommitOperationAdd( 172 | path_or_fileobj=os.path.join(save_dir, f), 173 | path_in_repo=f 174 | ) 175 | for f in os.listdir(save_dir) 176 | ] 177 | print( 178 | f"Uploading the following files to {repo_id}: " 179 | f"{','.join(os.listdir(save_dir))}" 180 | ) 181 | return create_commit( 182 | repo_id=repo_id, 183 | operations=operations, 184 | commit_message=commit_message, 185 | token=use_auth_token, 186 | create_pr=create_pr, 187 | repo_type="model", 188 | ) 189 | 190 | -------------------------------------------------------------------------------- /llmtune/engine/inference/modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from llmtune.engine.inference.autograd import ( 6 | Autograd2bit, Autograd4bit, Autograd3bit 7 | ) 8 | 9 | try: 10 | import quant_cuda 11 | except: 12 | print('CUDA extension not installed. Inference will not work.') 13 | 14 | # Assumes layer is perfectly divisible into 256 * 256 blocks 15 | class QuantLinear(nn.Module): 16 | def __init__( 17 | self, bits, groupsize, in_features, out_features, bias, is_cuda=True 18 | ): 19 | super().__init__() 20 | if bits not in [2,3,4,8]: 21 | raise NotImplementedError("Only 2,3,4,8 bits are supported.") 22 | self.in_features = in_features 23 | self.out_features = out_features 24 | self.bits = bits 25 | self.groupsize = groupsize if groupsize != -1 else in_features 26 | self.maxq = 2 ** self.bits - 1 27 | 28 | self.register_buffer('qweight', torch.zeros((in_features // 32 * self.bits, out_features), dtype=torch.int32)) 29 | self.register_buffer('qzeros', torch.zeros((math.ceil(in_features / self.groupsize), out_features // 32 * self.bits), dtype=torch.int32)) 30 | self.register_buffer('scales', torch.zeros((math.ceil(in_features / self.groupsize), out_features), dtype=torch.float16)) 31 | self.register_buffer('g_idx', torch.tensor([i // self.groupsize for i in range(in_features)], dtype = torch.int32)) 32 | if bias is not None: 33 | self.register_buffer('bias', torch.zeros((out_features),dtype=torch.float16)) 34 | else: 35 | self.bias = None 36 | 37 | # is performed by unpacking the weights and using torch.matmul 38 | if self.bits in [2,4,8]: 39 | self.register_buffer('wf',torch.tensor(list(range(0,32,self.bits)), dtype=torch.int32).unsqueeze(0),persistent=False) 40 | elif self.bits == 3: 41 | self.register_buffer('wf', torch.tensor([[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0], 42 | [0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31], 43 | [0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],], dtype=torch.int32).reshape(1,3,12), persistent=False) 44 | 45 | self.is_cuda = is_cuda 46 | 47 | def pack(self, linear, scales, zeros, g_idx = None): 48 | self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx 49 | 50 | scales = scales.t().contiguous() 51 | zeros = zeros.t().contiguous() 52 | scale_zeros = zeros * scales 53 | self.scales = scales.clone().half() 54 | if linear.bias is not None: 55 | self.bias = linear.bias.clone().half() 56 | 57 | intweight = [] 58 | for idx in range(self.in_features): 59 | intweight.append( 60 | torch.round( 61 | (linear.weight.data[:,idx] + scale_zeros[self.g_idx[idx]]) 62 | / self.scales[self.g_idx[idx]]).to(torch.int)[:,None] 63 | ) 64 | 
intweight = torch.cat(intweight,dim=1) 65 | intweight = intweight.t().contiguous() 66 | intweight = intweight.numpy().astype(np.uint32) 67 | qweight = np.zeros( 68 | (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32 69 | ) 70 | i = 0 71 | row = 0 72 | while row < qweight.shape[0]: 73 | if self.bits in [2,4,8]: 74 | for j in range(i, i + (32//self.bits)): 75 | qweight[row] |= intweight[j] << (self.bits * (j - i)) 76 | i += 32//self.bits 77 | row += 1 78 | elif self.bits == 3: 79 | for j in range(i, i + 10): 80 | qweight[row] |= intweight[j] << (3 * (j - i)) 81 | i += 10 82 | qweight[row] |= intweight[i] << 30 83 | row += 1 84 | qweight[row] |= (intweight[i] >> 2) & 1 85 | i += 1 86 | for j in range(i, i + 10): 87 | qweight[row] |= intweight[j] << (3 * (j - i) + 1) 88 | i += 10 89 | qweight[row] |= intweight[i] << 31 90 | row += 1 91 | qweight[row] |= (intweight[i] >> 1) & 0x3 92 | i += 1 93 | for j in range(i, i + 10): 94 | qweight[row] |= intweight[j] << (3 * (j - i) + 2) 95 | i += 10 96 | row += 1 97 | else: 98 | raise NotImplementedError("Only 2,3,4,8 bits are supported.") 99 | 100 | qweight = qweight.astype(np.int32) 101 | self.qweight = torch.from_numpy(qweight) 102 | 103 | zeros -= 1; 104 | zeros = zeros.numpy().astype(np.uint32) 105 | qzeros = np.zeros( 106 | (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32 107 | ) 108 | i = 0 109 | col = 0 110 | while col < qzeros.shape[1]: 111 | if self.bits in [2,4,8]: 112 | for j in range(i, i + (32//self.bits)): 113 | qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) 114 | i += 32//self.bits 115 | col += 1 116 | elif self.bits == 3: 117 | for j in range(i, i + 10): 118 | qzeros[:, col] |= zeros[:, j] << (3 * (j - i)) 119 | i += 10 120 | qzeros[:, col] |= zeros[:, i] << 30 121 | col += 1 122 | qzeros[:, col] |= (zeros[:, i] >> 2) & 1 123 | i += 1 124 | for j in range(i, i + 10): 125 | qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1) 126 | i += 10 127 | qzeros[:, col] |= zeros[:, i] << 31 128 | col += 1 129 | qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3 130 | i += 1 131 | for j in range(i, i + 10): 132 | qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2) 133 | i += 10 134 | col += 1 135 | else: 136 | raise NotImplementedError("Only 2,3,4,8 bits are supported.") 137 | 138 | qzeros = qzeros.astype(np.int32) 139 | self.qzeros = torch.from_numpy(qzeros) 140 | 141 | def forward(self, x): 142 | if self.bits == 4: 143 | out = Autograd4bit.apply( 144 | x, 145 | self.qweight, 146 | self.scales, 147 | self.qzeros, 148 | self.g_idx, 149 | ) 150 | if self.bias is not None: 151 | out += self.bias 152 | elif self.bits == 2: 153 | out = Autograd2bit.apply( 154 | x, 155 | self.qweight, 156 | self.scales, 157 | self.qzeros, 158 | self.g_idx, 159 | ) 160 | if self.bias is not None: 161 | out += self.bias 162 | elif self.bits == 3: 163 | out = Autograd3bit.apply( 164 | x, 165 | self.qweight, 166 | self.scales, 167 | self.qzeros, 168 | self.g_idx, 169 | self.wf, 170 | self.out_features, 171 | ) 172 | if self.bias is not None: 173 | out += self.bias 174 | else: 175 | raise NotImplementedError() 176 | return out 177 | -------------------------------------------------------------------------------- /llmtune/engine/inference/autograd.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from . 
import matmult as mm 6 | from torch.cuda.amp import custom_bwd, custom_fwd 7 | 8 | class Autograd4bit(torch.autograd.Function): 9 | @staticmethod 10 | @custom_fwd(cast_inputs=torch.float16) 11 | def forward(ctx, x, qweight, scales, zeros, g_idx): 12 | ctx.save_for_backward(qweight, scales, zeros, g_idx) 13 | if g_idx is None: 14 | output = mm._matmul4bit_v1_recons( 15 | x, qweight, scales, zeros 16 | ) 17 | else: 18 | output = mm._matmul4bit_v2_recons( 19 | x, qweight, scales, zeros, g_idx 20 | ) 21 | output = output.clone() 22 | return output 23 | 24 | @staticmethod 25 | @custom_bwd 26 | def backward(ctx, grad_output): 27 | qweight, scales, zeros, g_idx = ctx.saved_tensors 28 | if ctx.needs_input_grad[0]: 29 | if g_idx is None: 30 | grad = mm._matmul4bit_v1_recons( 31 | grad_output, qweight, scales, zeros, transpose=True 32 | ) 33 | else: 34 | grad = mm._matmul4bit_v2_recons( 35 | grad_output, qweight, scales, zeros, g_idx, transpose=True 36 | ) 37 | return grad, None, None, None, None, None, None 38 | 39 | class Autograd2bit(torch.autograd.Function): 40 | @staticmethod 41 | @custom_fwd(cast_inputs=torch.float16) 42 | def forward(ctx, x, qweight, scales, zeros, g_idx): 43 | ctx.save_for_backward(qweight, scales, zeros, g_idx) 44 | output = mm._matmul2bit_v2_recons(x, qweight, scales, zeros, g_idx) 45 | output = output.clone() 46 | return output 47 | 48 | @staticmethod 49 | @custom_bwd 50 | def backward(ctx, grad_output): 51 | qweight, scales, zeros, g_idx = ctx.saved_tensors 52 | if ctx.needs_input_grad[0]: 53 | grad = mm._matmul2bit_v2_recons( 54 | grad_output, qweight, scales, zeros, g_idx, transpose=True 55 | ) 56 | return grad, None, None, None, None, None, None 57 | 58 | class Autograd3bit(torch.autograd.Function): 59 | @staticmethod 60 | @custom_fwd(cast_inputs=torch.float16) 61 | def forward(ctx, x, qweight, scales, qzeros, g_idx, wf, outfeatures): 62 | ctx.save_for_backward(qweight, scales, qzeros, g_idx, wf) 63 | # output = mm.matmul3bit(x, qweight, scales, qzeros, g_idx, outfeatures) 64 | # output = output.half() 65 | # below, we instead unpack weights in pytorch 66 | weight = unpack_weight_3bits(qweight, scales, qzeros, g_idx, wf) 67 | output = torch.matmul(x.half(), weight) 68 | output.reshape(x.shape[:-1] + (outfeatures,)) 69 | return output 70 | 71 | @staticmethod 72 | @custom_bwd 73 | def backward(ctx, grad_output): 74 | qweight, scales, qzeros, g_idx, wf = ctx.saved_tensors 75 | if ctx.needs_input_grad[0]: 76 | weight = unpack_weight_3bits(qweight, scales, qzeros, g_idx, wf) 77 | grad = torch.matmul(grad_output.half(), weight.T) 78 | return grad, None, None, None, None, None, None, None 79 | 80 | def classic_forward( 81 | x, qweight, bias, scales, qzeros, g_idx, outfeatures, wf=None, 82 | bits=4, is_cuda=True, kernel_switch_threshold=128 83 | ): 84 | out_shape = x.shape[:-1] + (outfeatures, ) 85 | x = x.reshape(-1,x.shape[-1]) 86 | # dtype = x.dtype 87 | # x = x.float() 88 | if is_cuda is True and (kernel_switch_threshold is False or x.shape[0] < kernel_switch_threshold): 89 | raise NotImplementedError() # code below needs some fixes 90 | out = torch.zeros((x.shape[0], outfeatures), device=x.device, dtype=torch.float32) 91 | if bits == 2: 92 | quant_cuda.vecquant2matmul(x.float(), qweight, out, scales.float(), qzeros, g_idx) 93 | elif bits == 3: 94 | quant_cuda.vecquant3matmul(x.float(), qweight, out, scales.float(), qzeros, g_idx) 95 | elif bits == 4: 96 | quant_cuda.vecquant4matmul(x.float(), qweight, out, scales.float(), qzeros, g_idx) 97 | elif bits == 8: 98 | 
quant_cuda.vecquant8matmul(x.float(), qweight, out, scales.float(), qzeros, g_idx) 99 | out = out.half() 100 | else: 101 | weight = unpack_weight(qweight, scales, qzeros, g_idx, wf, bits) 102 | out = torch.matmul(x.half(), weight) 103 | del weight 104 | 105 | out = out.reshape(out_shape) 106 | out = out + bias if bias is not None else out 107 | # out = out.to(dtype) 108 | return out 109 | 110 | def unpack_weight(qweight, scales, qzeros, g_idx, wf=None, bits=4): 111 | if bits == 3: 112 | return unpack_weight_3bits(qweight, scales, qzeros, g_idx, wf) 113 | elif bits in [2,4,8]: 114 | zeros = torch.bitwise_right_shift(torch.unsqueeze(qzeros, 2).expand(-1, -1, 32 // bits), wf.unsqueeze(0)).to(torch.int16 if bits == 8 else torch.int8) 115 | torch.bitwise_and(zeros, (2 ** bits) - 1, out=zeros) 116 | 117 | zeros = zeros + 1 118 | zeros = zeros.reshape(scales.shape) 119 | 120 | weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 1).expand(-1, 32 // bits, -1), wf.unsqueeze(-1)).to(torch.int16 if bits == 8 else torch.int8) 121 | torch.bitwise_and(weight,(2 ** bits) - 1, out=weight) 122 | 123 | weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2]) 124 | 125 | g_idx_long = g_idx.to(torch.long) 126 | weight = (scales[g_idx_long] * (weight - zeros[g_idx_long])) 127 | else: 128 | raise NotImplementedError() 129 | 130 | return weight 131 | 132 | def unpack_weight_3bits(qweight, scales, qzeros, g_idx, wf=None): 133 | zeros = qzeros.reshape(qzeros.shape[0], qzeros.shape[1]//3, 3, 1).expand(-1, -1, -1, 12) 134 | zeros = (zeros >> wf.unsqueeze(0)) 135 | zeros[:,:,0,10] = (zeros[:,:,0,10]&0x3) | ((zeros[:,:,1,0] << 2)&0x4) 136 | zeros[:,:,1,11] = (zeros[:,:,1,11]&0x1) | ((zeros[:,:,2,0] << 1)&0x6) 137 | zeros &= 0x7 138 | zeros = torch.cat([zeros[:,:,0,:11], zeros[:,:,1,1:12], zeros[:,:,2,1:11]], dim=2) 139 | 140 | zeros = zeros + 1 141 | zeros = zeros.reshape(scales.shape) 142 | 143 | weight = qweight.reshape(qweight.shape[0]//3, 3, 1, qweight.shape[1]).expand(-1, -1, 12, -1) 144 | weight = (weight >> wf.unsqueeze(-1))&0x7 145 | weight[:,0,10] = (weight[:,0,10]&0x3) | ((weight[:,1,0] << 2)&0x4) 146 | weight[:,1,11] = (weight[:,1,11]&0x1) | ((weight[:,2,0] << 1)&0x6) 147 | weight &= 0x7 148 | weight = torch.cat([weight[:,0,:11], weight[:,1,1:12], weight[:,2,1:11]], dim=1) 149 | 150 | weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2]) 151 | 152 | g_idx_long = g_idx.to(torch.long) 153 | weight = (scales[g_idx_long] * (weight - zeros[g_idx_long])) 154 | # out = torch.matmul(x.half(), weights) 155 | # weight -= zeros[g_idx_long] 156 | # weight = weight.to(torch.half) 157 | # weight *= scales[g_idx_long] 158 | return weight 159 | 160 | # ---------------------------------------------------------------------------- 161 | # helpers 162 | 163 | buffer_mat_dic = {} 164 | def get_buffer(shape_of_qweight, dtype=torch.float16, device='cuda'): 165 | if shape_of_qweight not in buffer_mat_dic.keys(): 166 | buffer_mat_dic[shape_of_qweight] = torch.zeros( 167 | (shape_of_qweight[0] * 8, shape_of_qweight[1]), 168 | dtype=dtype, device=device 169 | ) 170 | return buffer_mat_dic[shape_of_qweight] 171 | -------------------------------------------------------------------------------- /llmtune/run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from llmtune.config import LLM_MODELS 3 | 4 | # ---------------------------------------------------------------------------- 5 | 6 | def make_parser(): 7 | parser =
argparse.ArgumentParser() 8 | parser.set_defaults(func=lambda args: parser.print_help()) 9 | subparsers = parser.add_subparsers(title='Commands') 10 | 11 | # generate 12 | 13 | gen_parser = subparsers.add_parser('generate') 14 | gen_parser.set_defaults(func=generate) 15 | 16 | gen_parser.add_argument('--model', required=True, 17 | help='Path or HF hub name of model to load') 18 | gen_parser.add_argument('--tokenizer', required=False, 19 | help='Path or HF hub name of tokenizer to load (default is model)') 20 | gen_parser.add_argument('--adapter', type=str, required=False, 21 | help='Path to the folder with the Lora adapter.') 22 | gen_parser.add_argument('--groupsize', type=int, default=-1, 23 | help='Groupsize used for quantization; -1 uses full row.') 24 | gen_parser.add_argument('--prompt', type=str, default='', 25 | help='Text used to initialize generation') 26 | gen_parser.add_argument('--instruction', type=str, default='', 27 | help='Instruction for an alpaca-style model') 28 | gen_parser.add_argument('--min-length', type=int, default=10, 29 | help='Minimum length of the sequence to be generated.') 30 | gen_parser.add_argument('--max-length', type=int, default=200, 31 | help='Maximum length of the sequence to be generated.') 32 | gen_parser.add_argument('--top_p', type=float, default=.95, 33 | help='Top p sampling parameter.') 34 | gen_parser.add_argument('--top_k', type=int, default=50, 35 | help='Top p sampling parameter.') 36 | gen_parser.add_argument('--temperature', type=float, default=1.0, 37 | help='Sampling temperature.') 38 | 39 | # quantize 40 | 41 | quant_parser = subparsers.add_parser('quantize') 42 | quant_parser.set_defaults(func=quantize) 43 | 44 | quant_parser.add_argument('--model', required=True, 45 | help='Path or HF hub name of model to load') 46 | quant_parser.add_argument('--save', type=str, required=True, 47 | help='Path to the saved model weights.') 48 | quant_parser.add_argument('--bits', type=int, # required=True, 49 | choices=[2, 3, 4, 8], help='#bits to use for quantization.') 50 | quant_parser.add_argument('--dataset', type=str, default='c4', 51 | choices=['wikitext2', 'ptb', 'c4'], 52 | help='Where to extract calibration data from.') 53 | quant_parser.add_argument('--seed', type=int, default=0, 54 | help='Seed for sampling the calibration data.') 55 | quant_parser.add_argument('--nsamples', type=int, default=128, 56 | help='Number of calibration data samples.') 57 | quant_parser.add_argument('--percdamp', type=float, default=.01, 58 | help='Percent of the average Hessian diagonal to use for dampening.') 59 | quant_parser.add_argument('--groupsize', type=int, default=-1, 60 | help='Groupsize to use for quantization; -1 uses full row.') 61 | quant_parser.add_argument('--act-order', action='store_true', 62 | help='Whether to apply the activation order GPTQ heuristic.') 63 | quant_parser.add_argument('--nearest', action='store_true', 64 | help='Use basic round-to-nearest quantization.') 65 | 66 | # finetune 67 | 68 | tune_parser = subparsers.add_parser('finetune') 69 | tune_parser.set_defaults(func=finetune) 70 | 71 | # finetune model config 72 | tune_parser.add_argument('--model', required=True, 73 | help='Path or HF hub name of model to load') 74 | tune_parser.add_argument('--tokenizer', required=False, 75 | help='Path or HF hub name of tokenizer to load (default is model)') 76 | tune_parser.add_argument("--data-type", choices=["alpaca", "gpt4all"], 77 | help="Dataset format", default="alpaca") 78 | tune_parser.add_argument("--dataset", required=False, 79 
| help="Path to local dataset file.") 80 | tune_parser.add_argument('--adapter', type=str, required=False, 81 | help='Path to Lora adapter folder (also holds checkpoints)') 82 | tune_parser.add_argument('--groupsize', type=int, 83 | help='Groupsize used for quantization; -1 uses full row.') 84 | 85 | # finetune training config 86 | tune_parser.add_argument("--mbatch_size", default=1, type=int, 87 | help="Micro-batch size. ") 88 | tune_parser.add_argument("--batch_size", default=2, type=int, 89 | help="Batch size. ") 90 | tune_parser.add_argument("--epochs", default=3, type=int, 91 | help="Epochs. ") 92 | tune_parser.add_argument("--lr", default=2e-4, type=float, 93 | help="Learning rate. ") 94 | tune_parser.add_argument("--cutoff_len", default=256, type=int, 95 | help="") 96 | tune_parser.add_argument("--lora_r", default=8, type=int, 97 | help="") 98 | tune_parser.add_argument("--lora_alpha", default=16, type=int, 99 | help="") 100 | tune_parser.add_argument("--lora_dropout", default=0.05, type=float, 101 | help="") 102 | tune_parser.add_argument("--val_set_size", default=0.2, type=float, 103 | help="Validation set size. ") 104 | tune_parser.add_argument("--warmup_steps", default=50, type=int, 105 | help="") 106 | tune_parser.add_argument("--save_steps", default=50, type=int, 107 | help="") 108 | tune_parser.add_argument("--save_total_limit", default=3, type=int, 109 | help="") 110 | tune_parser.add_argument("--logging_steps", default=10, type=int, 111 | help="") 112 | 113 | return parser 114 | 115 | # ---------------------------------------------------------------------------- 116 | 117 | def main(): 118 | parser = make_parser() 119 | args = parser.parse_args() 120 | args.func(args) 121 | 122 | def generate(args): 123 | import llmtune.executor as llmtune 124 | llm = llmtune.load_llm(args.model) 125 | tk_name = args.tokenizer if args.tokenizer is not None else args.model 126 | tokenizer = llmtune.load_tokenizer(tk_name, llm.llm_config) 127 | if args.adapter is not None: 128 | llm = llmtune.load_adapter(llm, adapter_path=args.adapter) 129 | if args.prompt and args.instruction: 130 | raise Exception('Cannot specify both prompt and instruction') 131 | if args.instruction: 132 | from llmtune.data.alpaca import make_prompt 133 | prompt = make_prompt(args.instruction, input_="") 134 | else: 135 | prompt = args.prompt 136 | 137 | output = llmtune.generate( 138 | llm, 139 | tokenizer, 140 | prompt, 141 | args.min_length, 142 | args.max_length, 143 | args.temperature, 144 | args.top_k, 145 | args.top_p, 146 | ) 147 | 148 | if args.instruction: 149 | from llmtune.data.alpaca import make_output 150 | output = make_output(output) 151 | 152 | print(output) 153 | 154 | def finetune(args): 155 | import llmtune.executor as llmtune 156 | llm = llmtune.load_llm(args.model) 157 | tk_name = args.tokenizer if args.tokenizer is not None else args.model 158 | tokenizer = llmtune.load_tokenizer(tk_name, llm.llm_config) 159 | from llmtune.config import get_finetune_config 160 | finetune_config = get_finetune_config(args) 161 | from llmtune.executor import finetune 162 | finetune(llm, tokenizer, finetune_config) 163 | 164 | def quantize(args): 165 | from llmtune.config import get_quant_config 166 | quant_config = get_quant_config(args) 167 | import llmtune.executor as llmtune 168 | llm = llmtune.load_llm(args.model) 169 | output = llmtune.quantize( 170 | llm, 171 | quant_config 172 | ) 173 | 174 | if __name__ == '__main__': 175 | main() 
-------------------------------------------------------------------------------- /finetune/mnli-llama/eval_mnli_llmtune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True) 6 | parser.add_argument('--weight_path', type=str, help='Path to the weights', required=True) 7 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True) 8 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 9 | parser.add_argument('--file_name', type=str, help='file name to store predictions and acc', required=True) 10 | parser.add_argument('--checkpoint_name', type=str, help='folder name to store all the check points', required=True) 11 | parser.add_argument('--start_index', type=int, help='model seed number', required=True) 12 | parser.add_argument('--end_index', type=int, help='model seed number', required=True) 13 | 14 | # Parse the arguments 15 | args = parser.parse_args() 16 | 17 | # Use the command line arguments in your script 18 | print('Model Name:', args.model_name) 19 | print('Weight Path:', args.weight_path) 20 | print('Adapter Path: ', args.adapter) 21 | print('Seed: ', args.seed) 22 | 23 | import random 24 | import json 25 | import os 26 | import pickle 27 | 28 | # import wandb 29 | import torch 30 | import numpy as np 31 | # import bitsandbytes as bnb 32 | from tqdm import tqdm 33 | import transformers 34 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 35 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 36 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 37 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 38 | from datasets import load_dataset 39 | 40 | from utils import * 41 | from data_mnli_label import * 42 | 43 | import evaluate 44 | import numpy as np 45 | from datasets import load_from_disk 46 | from tqdm import tqdm 47 | 48 | from llmtune.executor import load_llm, load_adapter 49 | from llmtune.engine.lora.peft import quant_peft 50 | 51 | output_dir = args.adapter 52 | model_name = "huggyllama/llama-13b" 53 | seed = args.seed 54 | train_sample_rate = 1.0 55 | val_sample_rate = 1.0 56 | local_rank = 0 57 | 58 | set_random_seed(seed) 59 | logging.set_verbosity_info() 60 | 61 | # with open(config_file, "r") as r: 62 | # config = json.load(r) 63 | 64 | device_map = "auto" 65 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 66 | ddp = world_size != 1 67 | 68 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 69 | tokenizer = fix_tokenizer(tokenizer) 70 | # tokenizer.save_pretrained(output_dir) 71 | 72 | dataset = load_dataset('multi_nli') 73 | train_records = dataset['train'] 74 | val_records = dataset['validation_matched'] 75 | #random.shuffle(train_records) 76 | print("train_record[0]: ",train_records[0]) 77 | 78 | ## Config for llama 7-b 79 | model_type = "causal" 80 | templates_path = "llama_lora_mnli.json" 81 | only_target_loss = False 82 | 83 | llmtune_model_name = args.model_name 84 | llmtune_quantized_weights_path = args.weight_path ## probably want to change this using our version of the right way 85 | llmtune_groupsize 
= 64 86 | 87 | 88 | llm, _ = load_llm( 89 | llmtune_model_name, 90 | llmtune_quantized_weights_path, 91 | llmtune_groupsize 92 | ) 93 | model = fix_model(llm, tokenizer, use_resize=False) 94 | 95 | # Default model generation params 96 | model.config.num_beams = 5 97 | 98 | 99 | if not ddp and torch.cuda.device_count() > 1: 100 | model.is_parallelizable = True 101 | model.model_parallel = True 102 | 103 | 104 | model = load_adapter(model, adapter_path=output_dir) 105 | 106 | # Metric 107 | 108 | def evaluate_peft_model_mnli(sample,max_target_length=65): 109 | instruction, input, genre = sample['premise'], sample['hypothesis'], sample['genre'] 110 | sample_word = f"### Premise: {instruction}\n ### Hypothesis: {input}\n ### Genre: {genre} ### Label: " 111 | print(sample_word) 112 | input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda() 113 | outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens = 5) 114 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"") 115 | output = output.strip() 116 | print(f"Output:\n{output}") 117 | # Some simple post-processing 118 | return output 119 | 120 | 121 | 122 | def acc_compute(predictions,references): 123 | acc = 0 124 | for i in range(len(predictions)): 125 | if predictions[i].lower() == references[i].lower(): 126 | acc += 1 127 | acc /= len(predictions) 128 | 129 | print("accuracy:", acc) 130 | return acc 131 | 132 | 133 | def store_pred(file_name_pickle_pred,file_name_pickle_ref,predictions,references): 134 | with open(file_name_pickle_pred, "wb") as fp: #Pickling 135 | pickle.dump(predictions, fp) 136 | with open(file_name_pickle_ref, "wb") as fp: #Pickling 137 | pickle.dump(references, fp) 138 | 139 | 140 | 141 | 142 | ##Arguments setting 143 | start_index = args.start_index 144 | end_index = args.end_index 145 | eval_len = end_index - start_index 146 | eval_save_len = eval_len // 10 147 | print("Evaluation will start at: ", start_index) 148 | print("Evaluation will end at: ", end_index) 149 | print(f'Evaluation will save at every {eval_save_len} steps') 150 | 151 | 152 | ## Create Check point Folder 153 | checkpoint_path = f'{args.checkpoint_name}_{start_index}_{end_index}' 154 | 155 | current_directory = os.getcwd() 156 | final_directory = os.path.join(current_directory, checkpoint_path) 157 | if not os.path.exists(final_directory): 158 | os.makedirs(final_directory) 159 | 160 | 161 | 162 | 163 | 164 | predictions = [] 165 | references_orig = val_records['label'][start_index:end_index] 166 | ## convert references to list of strings 167 | references = [] 168 | for item in references_orig: 169 | references.append(str(item)) 170 | 171 | 172 | count_eval = 0 173 | for idx in tqdm(range(start_index, end_index)): 174 | sample = val_records[idx] 175 | p = evaluate_peft_model_mnli(sample) 176 | predictions.append(p) 177 | count_eval += 1 178 | ## Detecting checkpoing 179 | if (count_eval%eval_save_len == 0): 180 | print(f'=>=>Checkpointing at {count_eval} steps<=<=') 181 | 182 | predictions_step = [s.strip() for s in predictions] 183 | print("prediction_step: ", predictions_step) 184 | references_step = references[0:count_eval] 185 | print("references_step: ", references_step) 186 | acc = acc_compute(predictions_step,references_step) 187 | checkpoint_name_txt = f'{final_directory}/{count_eval}.txt' 188 | checkpoint_name_pred = f'{final_directory}/{count_eval}_pred' ## pickle file for pred list 189 | checkpoint_name_ref = 
f'{final_directory}/{count_eval}_ref' ## pickle file for ref list 190 | ## writing pickle file 191 | store_pred(checkpoint_name_pred,checkpoint_name_ref,predictions_step,references_step) 192 | with open(checkpoint_name_txt, "w") as f: 193 | for item in predictions_step: 194 | # write each item on a new line 195 | f.write("%s\n" % item) 196 | f.write("%s\n" % acc) 197 | 198 | 199 | 200 | 201 | predictions = [s.strip() for s in predictions] 202 | 203 | 204 | 205 | file_name = args.file_name 206 | 207 | with open(file_name, 'w') as f: 208 | for item in predictions: 209 | # write each item on a new line 210 | f.write("%s\n" % item) 211 | f.write("%s\n" % acc) 212 | 213 | 214 | file_name_pickle_pred = f'{final_directory}/final_pred_{start_index}_{end_index}' 215 | file_name_pickle_ref = f'{final_directory}/final_ref_{start_index}_{end_index}' 216 | 217 | store_pred(file_name_pickle_pred,file_name_pickle_ref,predictions,references) 218 | 219 | 220 | """ 221 | Loading pickle file 222 | with open("test", "rb") as fp: # Unpickling 223 | b = pickle.load(fp) 224 | """ 225 | -------------------------------------------------------------------------------- /finetune/samsum-opt/train_samsum_opt_4bit_llmtune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True) 6 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True) 7 | parser.add_argument('--mbatch_size', type=int, help='mbatch size for training', required=True) 8 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 9 | 10 | # Parse the arguments 11 | args = parser.parse_args() 12 | 13 | # Use the command line arguments in your script 14 | print('Model Name:', args.model_name) 15 | print('Adapter Path: ', args.adapter) 16 | print('Seed: ', args.seed) 17 | print('mbatch_size: ', args.mbatch_size) 18 | 19 | 20 | import os 21 | import torch 22 | import transformers 23 | from transformers import AutoTokenizer 24 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 25 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 26 | from llmtune.llms.autollm import AutoLLMForCausalLM 27 | from llmtune.engine.lora.config import FinetuneConfig 28 | from llmtune.engine.lora.peft import quant_peft 29 | from llmtune.utils import to_half_precision 30 | from datasets import load_dataset 31 | 32 | from utils import * 33 | from data import * 34 | 35 | # os env setting 36 | os.environ["WANDB_DISABLED"] = "true" 37 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 38 | 39 | # model config 40 | model_name = args.model_name 41 | tokenizer_name = 'facebook/opt-6.7b' 42 | DEV = 'cuda' 43 | 44 | transformers.logging.set_verbosity_info() 45 | 46 | # load tokenizer 47 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) 48 | tokenizer.pad_token_id = 0 49 | ## Fix Tokenizer 50 | tokenizer = fix_tokenizer_opt(tokenizer) 51 | 52 | # load model 53 | llm = AutoLLMForCausalLM.from_pretrained(model_name) 54 | ## Fix Model 55 | llm = fix_model(llm, tokenizer, use_resize=False) 56 | llm.eval() 57 | llm = llm.to(DEV) 58 | llm = to_half_precision(llm) 59 | 60 | 61 | # finetune training
config 62 | MICRO_BATCH_SIZE=args.mbatch_size 63 | BATCH_SIZE = 128 64 | GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE 65 | EPOCHS = 3 66 | LEARNING_RATE = 1e-3 # the Karpathy constant 67 | CUTOFF_LEN = 128 # 128 accounts for about 95% of the data 68 | LORA_R = 8 69 | LORA_ALPHA = 32 70 | LORA_DROPOUT = 0.1 71 | VAL_SET_SIZE= 2000 72 | 73 | # data/gpu config 74 | seed = args.seed 75 | set_random_seed(seed) 76 | train_sample_rate = 1.0 77 | val_sample_rate = 1.0 78 | 79 | device_map = "auto" 80 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 81 | ddp = world_size != 1 82 | 83 | # if ddp: 84 | # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 85 | # gradient_accumulation_steps = gradient_accumulation_steps // world_size 86 | 87 | # adapter_path = '/share/kuleshov/vk379/alpacas/opt-7b-quantized-lora' 88 | lora_out_dir = args.adapter 89 | 90 | # set up lora config 91 | lora_config = quant_peft.LoraConfig( 92 | r=LORA_R, 93 | lora_alpha=LORA_ALPHA, 94 | target_modules=["q_proj", "v_proj"], 95 | lora_dropout=LORA_DROPOUT, 96 | bias="none", 97 | task_type="CAUSAL_LM", 98 | ) 99 | 100 | 101 | if not ddp and torch.cuda.device_count() > 1: 102 | llm.is_parallelizable = True 103 | llm.model_parallel = True 104 | 105 | 106 | # create a new lora from config 107 | model = quant_peft.get_peft_model(llm, lora_config) 108 | 109 | if not ddp and torch.cuda.device_count() > 1: 110 | print("GPU parallel acctivated") 111 | model.is_parallelizable = True 112 | model.model_parallel = True 113 | 114 | # load stanford alpaca data 115 | dataset = load_dataset('samsum') 116 | train_records = dataset['train'] 117 | val_records = dataset['test'] 118 | 119 | ## Config for llama 65-b 120 | model_type = "causal" 121 | templates_path = "llama_lora_samsum.json" 122 | only_target_loss = False 123 | mode = "instruct" 124 | 125 | if mode == "instruct": 126 | max_source_tokens_count = 205 # Changed depending on the dataset 127 | max_target_tokens_count = 45 128 | target_field = "summary" 129 | source_field = "" #does not matter. (original alpaca-lora paper has additional "input" alongside instruction: instruction-input-output vs. 
instruction-response) 130 | 131 | train_dataset = InstructDataset( 132 | train_records, 133 | tokenizer, 134 | max_source_tokens_count=max_source_tokens_count, 135 | max_target_tokens_count=max_target_tokens_count, 136 | sample_rate=train_sample_rate, 137 | input_type=model_type, 138 | templates_path=templates_path, 139 | target_field=target_field, 140 | source_field=source_field, 141 | only_target_loss=only_target_loss 142 | ) 143 | 144 | val_dataset = InstructDataset( 145 | val_records, 146 | tokenizer, 147 | max_source_tokens_count=max_source_tokens_count, 148 | max_target_tokens_count=max_target_tokens_count, 149 | sample_rate=val_sample_rate, 150 | input_type=model_type, 151 | templates_path=templates_path, 152 | target_field=target_field, 153 | source_field=source_field, 154 | only_target_loss=only_target_loss 155 | ) 156 | 157 | ## Save the model 158 | dataloader_train = torch.utils.data.DataLoader(train_dataset) 159 | # torch.save(dataloader_train,'dataloader_train.pth') 160 | 161 | dataloader_val = torch.utils.data.DataLoader(val_dataset) 162 | # torch.save(dataloader_val,'dataloader_val.pth') 163 | 164 | else: 165 | assert False 166 | 167 | if "seq2seq" in model_type: 168 | data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8) 169 | else: 170 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8) 171 | 172 | print("INPUT_IDS") 173 | print(data_collator([train_dataset[0], train_dataset[1]])["input_ids"][0]) 174 | print("MASK") 175 | print(data_collator([train_dataset[0], train_dataset[1]])["attention_mask"][0]) 176 | print("LABELS") 177 | print(data_collator([train_dataset[0], train_dataset[1]])["labels"][0]) 178 | 179 | 180 | 181 | # Model configs 182 | model.config.num_beams = 5 183 | if mode == "instruct": 184 | max_tokens_count = max_target_tokens_count + max_source_tokens_count + 1 185 | model.config.max_length = max_tokens_count if model_type == "causal" else max_target_tokens_count 186 | 187 | 188 | # Training args 189 | training_arguments = transformers.TrainingArguments( 190 | per_device_train_batch_size = MICRO_BATCH_SIZE, 191 | per_device_eval_batch_size = 1, 192 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, 193 | warmup_ratio=0.06, 194 | #num_train_epochs=3, 195 | max_steps = 400, 196 | learning_rate=LEARNING_RATE, 197 | lr_scheduler_type = "cosine", ## LoRA original paper uses linear 198 | fp16=True, 199 | logging_steps=50, 200 | evaluation_strategy="steps", 201 | logging_strategy="steps", 202 | save_strategy="steps", 203 | eval_steps=50, 204 | save_steps=50, 205 | output_dir=lora_out_dir, 206 | optim = "adamw_torch", 207 | torch_compile = False, 208 | save_total_limit=2, 209 | load_best_model_at_end=True, 210 | ddp_find_unused_parameters=False if ddp else None, 211 | ) 212 | 213 | 214 | def preprocess_logits_for_metrics(logits, labels): 215 | """ 216 | Original Trainer may have a memory leak. 217 | This is a workaround to avoid storing too many tensors that are not needed. 
218 | """ 219 | pred_ids = torch.argmax(logits[0], dim=-1) 220 | return pred_ids, labels 221 | 222 | # Start trainer 223 | trainer = transformers.Trainer( 224 | model=model, 225 | args=training_arguments, 226 | train_dataset=train_dataset, 227 | eval_dataset=val_dataset, 228 | data_collator=data_collator, 229 | preprocess_logits_for_metrics = preprocess_logits_for_metrics, 230 | ) 231 | 232 | # print("Prallel Training status: ", training_arguments.parallel_mode) 233 | model.config.use_cache = False 234 | 235 | # use half precision 236 | model = to_half_precision(model) 237 | 238 | # start training 239 | checkpoint_dir = lora_out_dir 240 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir): 241 | trainer.train(resume_from_checkpoint=True) 242 | else: 243 | trainer.train() 244 | 245 | # Save Model 246 | model.save_pretrained(lora_out_dir) -------------------------------------------------------------------------------- /finetune/samsum-llama/eval_samsum_4bit_llmtune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True) 6 | parser.add_argument('--weight_path', type=str, help='Path to the weights', required=True) 7 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True) 8 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 9 | parser.add_argument('--file_name', type=str, help='file name to store predictions and acc', required=True) 10 | parser.add_argument('--checkpoint_name', type=str, help='folder name to store all the check points', required=True) 11 | parser.add_argument('--start_index', type=int, help='model seed number', required=True) 12 | parser.add_argument('--end_index', type=int, help='model seed number', required=True) 13 | 14 | # Parse the arguments 15 | args = parser.parse_args() 16 | 17 | # Use the command line arguments in your script 18 | print('Model Name:', args.model_name) 19 | print('Weight Path:', args.weight_path) 20 | print('Adapter Path: ', args.adapter) 21 | print('Seed: ', args.seed) 22 | 23 | import random 24 | import json 25 | import os 26 | 27 | #for eval 28 | import pickle 29 | 30 | # import wandb 31 | import torch 32 | import numpy as np 33 | # import bitsandbytes as bnb 34 | from tqdm import tqdm 35 | import transformers 36 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 37 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 38 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 39 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 40 | from datasets import load_dataset 41 | 42 | from utils import * 43 | from data import * 44 | 45 | import evaluate 46 | import numpy as np 47 | from datasets import load_from_disk 48 | from tqdm import tqdm 49 | 50 | from llmtune.executor import load_llm, load_adapter 51 | from llmtune.engine.lora.peft import quant_peft 52 | 53 | output_dir = args.adapter 54 | seed = args.seed 55 | train_sample_rate = 1.0 56 | val_sample_rate = 1.0 57 | local_rank = 0 58 | 59 | set_random_seed(seed) 60 | logging.set_verbosity_info() 61 | 62 | # with open(config_file, "r") as r: 63 | # config 
= json.load(r) 64 | 65 | device_map = "auto" 66 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 67 | ddp = world_size != 1 68 | 69 | tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-13b", use_fast=False,model_max_length=250) 70 | tokenizer = fix_tokenizer(tokenizer) 71 | # tokenizer.save_pretrained(output_dir) 72 | 73 | dataset = load_dataset('samsum') 74 | train_records = dataset['train'] 75 | val_records = dataset['test'] 76 | #random.shuffle(train_records) 77 | print("train_record[0]: ",train_records[0]) 78 | 79 | ## Config for llama 7-b 80 | model_type = "causal" 81 | templates_path = "llama_lora_samsum.json" 82 | only_target_loss = False 83 | mode = "instruct" 84 | 85 | llmtune_model_name = args.model_name 86 | llmtune_quantized_weights_path = args.weight_path 87 | llmtune_groupsize = 64 88 | 89 | llm, _ = load_llm( 90 | llmtune_model_name, 91 | llmtune_quantized_weights_path, 92 | llmtune_groupsize 93 | ) 94 | model = fix_model(llm, tokenizer, use_resize=False) 95 | 96 | # Default model generation params 97 | model.config.num_beams = 5 98 | 99 | 100 | if not ddp and torch.cuda.device_count() > 1: 101 | model.is_parallelizable = True 102 | model.model_parallel = True 103 | 104 | 105 | model = load_adapter(model, adapter_path=output_dir) 106 | 107 | # Metric 108 | metric = evaluate.load("rouge") 109 | 110 | def evaluate_peft_model_samsum(sample,max_target_length=45): 111 | # Load dataset from the hub and get a sample 112 | sample_word = f"### Summarize this: {sample}\n ### Output: " 113 | input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda() 114 | with torch.autocast("cuda"): 115 | outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens = 45) 116 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"") 117 | output = output.strip() 118 | print(f"Output:\n{output}") 119 | # Some simple post-processing 120 | return output 121 | 122 | 123 | def rouge_compute(predictions,references): 124 | rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True) 125 | return rogue 126 | 127 | 128 | def store_pred(file_name_pickle_pred,file_name_pickle_ref,predictions,references): 129 | with open(file_name_pickle_pred, "wb") as fp: #Pickling 130 | pickle.dump(predictions, fp) 131 | with open(file_name_pickle_ref, "wb") as fp: #Pickling 132 | pickle.dump(references, fp) 133 | 134 | 135 | 136 | ##Arguments setting 137 | start_index = args.start_index 138 | end_index = args.end_index 139 | eval_len = end_index - start_index 140 | eval_save_len = eval_len // 10 141 | print("Evaluation will start at: ", start_index) 142 | print("Evaluation will end at: ", end_index) 143 | print(f'Evaluation will save at every {eval_save_len} steps') 144 | 145 | 146 | ## Create Check point Folder 147 | checkpoint_path = f'{args.checkpoint_name}_{start_index}_{end_index}' 148 | 149 | current_directory = os.getcwd() 150 | final_directory = os.path.join(current_directory, checkpoint_path) 151 | if not os.path.exists(final_directory): 152 | os.makedirs(final_directory) 153 | 154 | 155 | 156 | predictions = [] 157 | references_origin = val_records['summary'][start_index:end_index] 158 | references = [] 159 | 160 | count_eval = 0 161 | 162 | 163 | for idx in tqdm(range(start_index, end_index)): 164 | sample = val_records['dialogue'][idx] 165 | # Load dataset from the hub and get a sample 166 | sample_word = f"### Summarize this: {sample}\n ### Output: " 167 | 
input_ids = tokenizer(sample_word, return_tensors="pt", truncation=True).input_ids.cuda() 168 | 169 | print("length of input ids:", len(input_ids[0])) 170 | # if (len(input_ids[0]) < 300): 171 | with torch.inference_mode(), torch.autocast("cuda"): 172 | outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens = 45) 173 | output = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True).replace(sample_word,"") 174 | output = output.strip() 175 | print(f"Model Output: \n{output}") 176 | predictions.append(output) 177 | print(f"Reference Output: \n {references_origin[count_eval]}") 178 | references.append(references_origin[count_eval]) 179 | count_eval+=1 180 | 181 | ## Checkpointing 182 | if (count_eval%eval_save_len == 0): 183 | print(f'=>=>Checkpointing at {count_eval} steps<=<=') 184 | 185 | predictions_step = [s.strip() for s in predictions] 186 | print("prediction_step: ", predictions_step) 187 | references_step = references 188 | print("references_step: ", references_step) 189 | rouge = rouge_compute(predictions_step,references_step) 190 | checkpoint_name_txt = f'{final_directory}/{count_eval}.txt' 191 | checkpoint_name_pred = f'{final_directory}/{count_eval}_pred' ## pickle file for pred list 192 | checkpoint_name_ref = f'{final_directory}/{count_eval}_ref' ## pickle file for ref list 193 | ## writing pickle file 194 | store_pred(checkpoint_name_pred, checkpoint_name_ref, predictions_step, references_step) 195 | with open(checkpoint_name_txt, "w") as f: 196 | for item in predictions_step: 197 | # write each item on a new line 198 | f.write("%s\n" % item) 199 | f.write(f'Seed: {seed}\n') 200 | f.write(f"rouge1: {rouge['rouge1']* 100:.2f}%\n") 201 | f.write(f"rouge2: {rouge['rouge2']* 100:.2f}%\n") 202 | f.write(f"rougeL: {rouge['rougeL']* 100:.2f}%\n") 203 | f.write(f"rougeLsum: {rouge['rougeLsum']* 100:.2f}%\n") 204 | 205 | 206 | predictions = [s.strip() for s in predictions] 207 | 208 | 209 | # compute metric 210 | rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True) 211 | 212 | file_name = args.file_name 213 | with open(file_name, 'w') as f: 214 | f.write(f'Seed: {seed}\n') 215 | f.write(f"rouge1: {rouge['rouge1']* 100:.2f}%\n") 216 | f.write(f"rouge2: {rouge['rouge2']* 100:.2f}%\n") 217 | f.write(f"rougeL: {rouge['rougeL']* 100:.2f}%\n") 218 | f.write(f"rougeLsum: {rouge['rougeLsum']* 100:.2f}%\n") 219 | -------------------------------------------------------------------------------- /finetune/samsum-llama/train_samsum_4bit.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True) 6 | parser.add_argument('--weight_path', type=str, help='Path to the weights', required=True) 7 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True) 8 | parser.add_argument('--mbatch_size', type=int, help='mbatch size for training', required=True) 9 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 10 | 11 | # Parse the arguments 12 | args = parser.parse_args() 13 | 14 | # Use the command line arguments in your script 15 | print('Model Name:', args.model_name) 16 | print('Weight Path:', args.weight_path) 17 | print('Adapter Path: ', args.adapter) 18 | print('Seed: ', args.seed) 19 | print('mbatch_size: ', 
args.mbatch_size) 20 | 21 | 22 | import random 23 | import json 24 | import os 25 | 26 | # import wandb 27 | import torch 28 | import numpy as np 29 | import bitsandbytes as bnb 30 | from tqdm import tqdm 31 | import transformers 32 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 33 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 34 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 35 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 36 | from datasets import load_dataset 37 | 38 | # from src.dataset import InstructDataset, ChatDataset 39 | # from src.util.dl import set_random_seed, fix_tokenizer, fix_model 40 | # from src.util.io import read_jsonl 41 | 42 | from utils import * 43 | from data import * 44 | 45 | from llmtune.executor import load_llm, load_adapter 46 | from llmtune.engine.lora.peft import quant_peft 47 | 48 | 49 | # os.environ["WANDB_LOG_MODEL"] = "checkpoint" 50 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 51 | 52 | 53 | class SavePeftModelCallback(TrainerCallback): 54 | def on_save( 55 | self, 56 | args: TrainingArguments, 57 | state: TrainerState, 58 | control: TrainerControl, 59 | **kwargs, 60 | ): 61 | checkpoint_folder = os.path.join( 62 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}" 63 | ) 64 | 65 | peft_model_path = os.path.join(checkpoint_folder, "adapter_model") 66 | kwargs["model"].save_pretrained(peft_model_path) 67 | return control 68 | 69 | checkpoint = None 70 | seed = args.seed 71 | train_sample_rate = 1.0 72 | val_sample_rate = 1.0 73 | local_rank = 0 74 | # report_to = "wandb" 75 | output_dir = args.adapter 76 | 77 | set_random_seed(seed) 78 | logging.set_verbosity_info() 79 | 80 | # with open(config_file, "r") as r: 81 | # config = json.load(r) 82 | 83 | device_map = "auto" 84 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 85 | ddp = world_size != 1 86 | # if ddp: 87 | # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 88 | # gradient_accumulation_steps = gradient_accumulation_steps // world_size 89 | 90 | #deepspeed_config = config.get("deepspeed") 91 | 92 | 93 | 94 | ### Training Configuration 95 | #trainer_config = config["trainer"] 96 | 97 | MICRO_BATCH_SIZE = args.mbatch_size # this could actually be 5 but i like powers of 2 98 | BATCH_SIZE = 128 99 | GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE 100 | EPOCHS = 3 # we don't need 3 tbh 101 | LEARNING_RATE = 1e-3 # the Karpathy constant 102 | CUTOFF_LEN = 128 # 128 accounts for about 95% of the data 103 | LORA_R = 8 104 | LORA_ALPHA = 16 105 | LORA_DROPOUT = 0.05 106 | VAL_SET_SIZE= 2000 107 | 108 | def preprocess_logits_for_metrics(logits, labels): 109 | """ 110 | Original Trainer may have a memory leak. 111 | This is a workaround to avoid storing too many tensors that are not needed. 
112 | """ 113 | pred_ids = torch.argmax(logits[0], dim=-1) 114 | return pred_ids, labels 115 | 116 | trainer_config = transformers.TrainingArguments( 117 | per_device_train_batch_size = MICRO_BATCH_SIZE, 118 | per_device_eval_batch_size = MICRO_BATCH_SIZE, 119 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, 120 | warmup_ratio=0.06, 121 | #num_train_epochs=3, 122 | max_steps = 350, 123 | learning_rate=LEARNING_RATE, 124 | lr_scheduler_type = "cosine", ## LoRA original paper uses linear 125 | fp16=True, 126 | logging_steps=50, 127 | evaluation_strategy="steps", 128 | logging_strategy="steps", 129 | save_strategy="steps", 130 | eval_steps=50, 131 | save_steps=50, 132 | # report_to=report_to, 133 | output_dir=output_dir, 134 | optim = "adamw_torch", 135 | torch_compile = False, 136 | save_total_limit=2, 137 | load_best_model_at_end=True, 138 | ddp_find_unused_parameters=False if ddp else None, 139 | ) 140 | 141 | 142 | # ### Apply LoRA 143 | # 144 | # Here comes the magic with `peft`! Let's load a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) using `get_peft_model` utility function from `peft`. 145 | 146 | target_modules = None 147 | target_modules = ['q_proj', 'v_proj'] # edit with your desired target modules 148 | #lora_config = config.get("lora") 149 | lora_config = LoraConfig( 150 | r=8, lora_alpha=32, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM" 151 | ) 152 | 153 | callbacks = [SavePeftModelCallback] if lora_config else [] 154 | ##no need to use callbacks 155 | callbacks = [] 156 | 157 | training_args = trainer_config 158 | 159 | 160 | model_name = "huggyllama/llama-13b" 161 | 162 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 163 | tokenizer = fix_tokenizer(tokenizer) 164 | # tokenizer.save_pretrained(output_dir) 165 | 166 | dataset = load_dataset('samsum') 167 | train_records = dataset['train'] 168 | val_records = dataset['test'] 169 | #random.shuffle(train_records) 170 | print("train_record[0]: ",train_records[0]) 171 | 172 | ## Config for llama 65-b 173 | model_type = "causal" 174 | templates_path = "llama_lora_samsum.json" 175 | only_target_loss = False 176 | mode = "instruct" 177 | 178 | llmtune_model_name = args.model_name 179 | llmtune_quantized_weights_path = args.weight_path 180 | llmtune_groupsize = 64 181 | 182 | if mode == "instruct": 183 | max_source_tokens_count = 255 # Changed depending on the dataset 184 | max_target_tokens_count = 50 185 | target_field = "summary" 186 | source_field = "" #does not matter. (original alpaca-lora paper has additional "input" alongside instruction: instruction-input-output vs. 
instruction-response) 187 | 188 | train_dataset = InstructDataset( 189 | train_records, 190 | tokenizer, 191 | max_source_tokens_count=max_source_tokens_count, 192 | max_target_tokens_count=max_target_tokens_count, 193 | sample_rate=train_sample_rate, 194 | input_type=model_type, 195 | templates_path=templates_path, 196 | target_field=target_field, 197 | source_field=source_field, 198 | only_target_loss=only_target_loss 199 | ) 200 | 201 | val_dataset = InstructDataset( 202 | val_records, 203 | tokenizer, 204 | max_source_tokens_count=max_source_tokens_count, 205 | max_target_tokens_count=max_target_tokens_count, 206 | sample_rate=val_sample_rate, 207 | input_type=model_type, 208 | templates_path=templates_path, 209 | target_field=target_field, 210 | source_field=source_field, 211 | only_target_loss=only_target_loss 212 | ) 213 | 214 | ## Save the model 215 | dataloader_train = torch.utils.data.DataLoader(train_dataset) 216 | # torch.save(dataloader_train,'dataloader_train.pth') 217 | 218 | dataloader_val = torch.utils.data.DataLoader(val_dataset) 219 | # torch.save(dataloader_val,'dataloader_val.pth') 220 | 221 | else: 222 | assert False 223 | 224 | if "seq2seq" in model_type: 225 | data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8) 226 | else: 227 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8) 228 | 229 | print("INPUT_IDS") 230 | print(data_collator([train_dataset[0], train_dataset[1]])["input_ids"][0]) 231 | print("MASK") 232 | print(data_collator([train_dataset[0], train_dataset[1]])["attention_mask"][0]) 233 | print("LABELS") 234 | print(data_collator([train_dataset[0], train_dataset[1]])["labels"][0]) 235 | 236 | llm, _ = load_llm( 237 | llmtune_model_name, 238 | llmtune_quantized_weights_path, 239 | llmtune_groupsize 240 | ) 241 | model = fix_model(llm, tokenizer, use_resize=False) 242 | 243 | # Default model generation params 244 | model.config.num_beams = 5 245 | if mode == "instruct": 246 | max_tokens_count = max_target_tokens_count + max_source_tokens_count + 1 247 | model.config.max_length = max_tokens_count if model_type == "causal" else max_target_tokens_count 248 | 249 | if not ddp and torch.cuda.device_count() > 1: 250 | model.is_parallelizable = True 251 | model.model_parallel = True 252 | 253 | if lora_config: 254 | #lora_config = LoraConfig(**lora_config) 255 | # model = get_peft_model(model, lora_config) 256 | model = load_adapter(model, lora_config=lora_config) 257 | 258 | trainer_class = Trainer ##if not omit_base_model_save else TrainerNoBaseSave 259 | print("Trainer class:", trainer_class) 260 | trainer = trainer_class( 261 | model=model, 262 | args=training_args, 263 | train_dataset=train_dataset, 264 | eval_dataset=val_dataset, 265 | callbacks=callbacks, 266 | data_collator=data_collator, 267 | preprocess_logits_for_metrics = preprocess_logits_for_metrics, 268 | ) 269 | 270 | # with wandb.init(project="llama_ft_samsum", name="llama finetuning run") as run: ## changed the name don't forget 271 | checkpoint_dir = output_dir 272 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir): 273 | trainer.train(resume_from_checkpoint=True) 274 | else: 275 | trainer.train() 276 | model.save_pretrained(output_dir) -------------------------------------------------------------------------------- /finetune/mnli-llama/train_mnli_llmtune_label.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = 
argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True) 6 | parser.add_argument('--weight_path', type=str, help='Path to the weights', required=True) 7 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True) 8 | parser.add_argument('--mbatch_size', type=int, help='mbatch size for training', required=True) 9 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 10 | 11 | # Parse the arguments 12 | args = parser.parse_args() 13 | 14 | # Use the command line arguments in your script 15 | print('Model Name:', args.model_name) 16 | print('Weight Path:', args.weight_path) 17 | print('Adapter Path: ', args.adapter) 18 | print('Seed: ', args.seed) 19 | print('mbatch_size: ', args.mbatch_size) 20 | 21 | 22 | import random 23 | import json 24 | import os 25 | 26 | # import wandb 27 | import torch 28 | import numpy as np 29 | import bitsandbytes as bnb 30 | from tqdm import tqdm 31 | import transformers 32 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 33 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 34 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 35 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 36 | from datasets import load_dataset 37 | 38 | # from src.dataset import InstructDataset, ChatDataset 39 | # from src.util.dl import set_random_seed, fix_tokenizer, fix_model 40 | # from src.util.io import read_jsonl 41 | 42 | from utils import * 43 | from data_mnli_label import * 44 | 45 | from llmtune.executor import load_llm, load_adapter 46 | from llmtune.engine.lora.peft import quant_peft 47 | 48 | 49 | # os.environ["WANDB_LOG_MODEL"] = "checkpoint" 50 | os.environ["WANDB_DISABLED"] = "true" 51 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 52 | 53 | 54 | class SavePeftModelCallback(TrainerCallback): 55 | def on_save( 56 | self, 57 | args: TrainingArguments, 58 | state: TrainerState, 59 | control: TrainerControl, 60 | **kwargs, 61 | ): 62 | checkpoint_folder = os.path.join( 63 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}" 64 | ) 65 | 66 | peft_model_path = os.path.join(checkpoint_folder, "adapter_model") 67 | kwargs["model"].save_pretrained(peft_model_path) 68 | return control 69 | 70 | checkpoint = None 71 | seed = args.seed 72 | train_sample_rate = 1.0 73 | val_sample_rate = 1.0 74 | local_rank = 0 75 | # report_to = "wandb" 76 | output_dir = args.adapter 77 | 78 | set_random_seed(seed) 79 | logging.set_verbosity_info() 80 | 81 | # with open(config_file, "r") as r: 82 | # config = json.load(r) 83 | 84 | device_map = "auto" 85 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 86 | ddp = world_size != 1 87 | 88 | if ddp: 89 | device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 90 | # gradient accumulation is rescaled by world_size below, once GRADIENT_ACCUMULATION_STEPS is defined 91 | 92 | #deepspeed_config = config.get("deepspeed") 93 | 94 | 95 | 96 | ### Training Configuration 97 | #trainer_config = config["trainer"] 98 | 99 | MICRO_BATCH_SIZE = args.mbatch_size # this could actually be 5 but i like powers of 2 100 | BATCH_SIZE = 256 101 | GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE // (world_size if ddp else 1) 102 | EPOCHS = 1 # we don't need 3 tbh 103 | LEARNING_RATE = 1e-3 # the Karpathy constant 104 | 
CUTOFF_LEN = 128 # 128 accounts for about 95% of the data 105 | LORA_R = 8 106 | LORA_ALPHA = 16 107 | LORA_DROPOUT = 0.05 108 | VAL_SET_SIZE= 2000 109 | 110 | def preprocess_logits_for_metrics(logits, labels): 111 | """ 112 | Original Trainer may have a memory leak. 113 | This is a workaround to avoid storing too many tensors that are not needed. 114 | """ 115 | pred_ids = torch.argmax(logits[0], dim=-1) 116 | return pred_ids, labels 117 | 118 | trainer_config = transformers.TrainingArguments( 119 | per_device_train_batch_size = MICRO_BATCH_SIZE, 120 | per_device_eval_batch_size = MICRO_BATCH_SIZE, 121 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, 122 | warmup_ratio=0.06, 123 | num_train_epochs=EPOCHS, 124 | # max_steps = 350, 125 | learning_rate=LEARNING_RATE, 126 | lr_scheduler_type = "cosine", ## LoRA original paper uses linear 127 | fp16=True, 128 | logging_steps=150, 129 | evaluation_strategy="steps", 130 | logging_strategy="steps", 131 | save_strategy="steps", 132 | eval_steps=300, 133 | save_steps=300, 134 | # report_to=report_to, 135 | output_dir=output_dir, 136 | optim = "adamw_torch", 137 | torch_compile = False, 138 | save_total_limit=2, 139 | load_best_model_at_end=True, 140 | ddp_find_unused_parameters=False if ddp else None, 141 | ) 142 | 143 | 144 | # ### Apply LoRA 145 | # 146 | # Here comes the magic with `peft`! Let's load a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) using `get_peft_model` utility function from `peft`. 147 | 148 | target_modules = None 149 | target_modules = ['q_proj', 'v_proj'] # edit with your desired target modules 150 | #lora_config = config.get("lora") 151 | lora_config = LoraConfig( 152 | r=8, lora_alpha=32, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM" 153 | ) 154 | 155 | callbacks = [SavePeftModelCallback] if lora_config else [] 156 | ##no need to use callbacks 157 | callbacks = [] 158 | 159 | training_args = trainer_config 160 | 161 | 162 | model_name = "huggyllama/llama-13b" 163 | 164 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 165 | tokenizer = fix_tokenizer(tokenizer) 166 | # tokenizer.save_pretrained(output_dir) 167 | 168 | dataset = load_dataset('multi_nli') 169 | train_records = dataset['train'] 170 | val_records = dataset['validation_matched'] 171 | #random.shuffle(train_records) 172 | print("train_record[0]: ",train_records[0]) 173 | 174 | model_type = "causal" 175 | templates_path = "llama_lora_mnli_label.json" 176 | only_target_loss = False 177 | mode = "instruct" 178 | 179 | llmtune_model_name = args.model_name 180 | llmtune_quantized_weights_path = args.weight_path 181 | llmtune_groupsize = 64 182 | 183 | if mode == "instruct": 184 | max_source_tokens_count = 64 # Changed depending on the dataset 185 | max_target_tokens_count = 4 186 | target_field = "" 187 | source_field = "" #does not matter. (original alpaca-lora paper has additional "input" alongside instruction: instruction-input-output vs. 
instruction-response) 188 | 189 | train_dataset = InstructDataset( 190 | train_records, 191 | tokenizer, 192 | max_source_tokens_count=max_source_tokens_count, 193 | max_target_tokens_count=max_target_tokens_count, 194 | sample_rate=train_sample_rate, 195 | input_type=model_type, 196 | templates_path=templates_path, 197 | target_field=target_field, 198 | source_field=source_field, 199 | only_target_loss=only_target_loss 200 | ) 201 | 202 | val_dataset = InstructDataset( 203 | val_records, 204 | tokenizer, 205 | max_source_tokens_count=max_source_tokens_count, 206 | max_target_tokens_count=max_target_tokens_count, 207 | sample_rate=val_sample_rate, 208 | input_type=model_type, 209 | templates_path=templates_path, 210 | target_field=target_field, 211 | source_field=source_field, 212 | only_target_loss=only_target_loss 213 | ) 214 | 215 | ## Save the model 216 | dataloader_train = torch.utils.data.DataLoader(train_dataset) 217 | # torch.save(dataloader_train,'dataloader_train.pth') 218 | 219 | dataloader_val = torch.utils.data.DataLoader(val_dataset) 220 | # torch.save(dataloader_val,'dataloader_val.pth') 221 | 222 | else: 223 | assert False 224 | 225 | if "seq2seq" in model_type: 226 | data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8) 227 | else: 228 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8) 229 | 230 | print("INPUT_IDS") 231 | print(data_collator([train_dataset[0], train_dataset[1]])["input_ids"][0]) 232 | print("MASK") 233 | print(data_collator([train_dataset[0], train_dataset[1]])["attention_mask"][0]) 234 | print("LABELS") 235 | print(data_collator([train_dataset[0], train_dataset[1]])["labels"][0]) 236 | 237 | llm, _ = load_llm( 238 | llmtune_model_name, 239 | llmtune_quantized_weights_path, 240 | llmtune_groupsize 241 | ) 242 | model = fix_model(llm, tokenizer, use_resize=False) 243 | 244 | # Default model generation params 245 | model.config.num_beams = 5 246 | if mode == "instruct": 247 | max_tokens_count = max_target_tokens_count + max_source_tokens_count + 1 248 | model.config.max_length = max_tokens_count if model_type == "causal" else max_target_tokens_count 249 | 250 | if not ddp and torch.cuda.device_count() > 1: 251 | model.is_parallelizable = True 252 | model.model_parallel = True 253 | 254 | if lora_config: 255 | #lora_config = LoraConfig(**lora_config) 256 | # model = get_peft_model(model, lora_config) 257 | model = load_adapter(model, lora_config=lora_config) 258 | 259 | trainer_class = Trainer ##if not omit_base_model_save else TrainerNoBaseSave 260 | print("Trainer class:", trainer_class) 261 | trainer = trainer_class( 262 | model=model, 263 | args=training_args, 264 | train_dataset=train_dataset, 265 | eval_dataset=val_dataset, 266 | callbacks=callbacks, 267 | data_collator=data_collator, 268 | # preprocess_logits_for_metrics = preprocess_logits_for_metrics, 269 | ) 270 | 271 | # with wandb.init(project="llama_ft_samsum", name="llama finetuning run") as run: ## changed the name don't forget 272 | checkpoint_dir = output_dir 273 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir): 274 | trainer.train(resume_from_checkpoint=True) 275 | else: 276 | trainer.train() 277 | model.save_pretrained(output_dir) -------------------------------------------------------------------------------- /finetune/samsum-llama/train_samsum_4bit_bnb.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # Set up the argument parser 4 | parser = 
argparse.ArgumentParser(description='Python script to work with models') 5 | parser.add_argument('--model_name', type=str, help='Name of the model', required=True) 6 | parser.add_argument('--adapter', type=str, help='Path to store adapter weight', required=True) 7 | parser.add_argument('--mbatch_size', type=int, help='mbatch size for training', required=True) 8 | parser.add_argument('--seed', type=int, help='model seed number', required=True) 9 | parser.add_argument('--repo_name', type=str, help='HF model name', required=True) 10 | 11 | 12 | # Parse the arguments 13 | args = parser.parse_args() 14 | 15 | # Use the command line arguments in your script 16 | print('Model Name:', args.model_name) 17 | print('Adapter Path: ', args.adapter) 18 | print('Seed: ', args.seed) 19 | print('mbatch_size: ', args.mbatch_size) 20 | 21 | 22 | import random 23 | import json 24 | import os 25 | 26 | # import wandb 27 | import torch 28 | import numpy as np 29 | import bitsandbytes as bnb 30 | from tqdm import tqdm 31 | import transformers 32 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, DataCollatorForTokenClassification, DataCollatorForSeq2Seq 33 | from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig 34 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 35 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 36 | from datasets import load_dataset 37 | 38 | from utils import * 39 | from data import * 40 | 41 | 42 | 43 | 44 | os.environ["WANDB_DISABLED"] = "true" 45 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 46 | 47 | 48 | 49 | 50 | class SavePeftModelCallback(TrainerCallback): 51 | def on_save( 52 | self, 53 | args: TrainingArguments, 54 | state: TrainerState, 55 | control: TrainerControl, 56 | **kwargs, 57 | ): 58 | checkpoint_folder = os.path.join( 59 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}" 60 | ) 61 | 62 | peft_model_path = os.path.join(checkpoint_folder, "adapter_model") 63 | kwargs["model"].save_pretrained(peft_model_path) 64 | return control 65 | 66 | 67 | checkpoint = None 68 | seed = args.seed 69 | train_sample_rate = 1.0 70 | val_sample_rate = 1.0 71 | local_rank = 0 72 | output_dir = args.adapter 73 | 74 | set_random_seed(seed) 75 | logging.set_verbosity_info() 76 | 77 | # with open(config_file, "r") as r: 78 | # config = json.load(r) 79 | 80 | 81 | device_map = "auto" 82 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 83 | ddp = world_size != 1 84 | if ddp: 85 | device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 86 | # gradient accumulation is rescaled by world_size below, once GRADIENT_ACCUMULATION_STEPS is defined 87 | 88 | 89 | #deepspeed_config = config.get("deepspeed") 90 | 91 | 92 | 93 | 94 | ### Training Configuration 95 | #trainer_config = config["trainer"] 96 | 97 | MICRO_BATCH_SIZE = args.mbatch_size # this could actually be 5 but i like powers of 2 98 | BATCH_SIZE = 128 99 | GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE // (world_size if ddp else 1) 100 | EPOCHS = 3 # we don't need 3 tbh 101 | LEARNING_RATE = 1e-3 # the Karpathy constant 102 | CUTOFF_LEN = 128 # 128 accounts for about 95% of the data 103 | LORA_R = 8 104 | LORA_ALPHA = 16 105 | LORA_DROPOUT = 0.05 106 | VAL_SET_SIZE= 2000 107 | 108 | def preprocess_logits_for_metrics(logits, labels): 109 | """ 110 | Original Trainer may have a memory leak. 111 | This is a workaround to avoid storing too many tensors that are not needed. 
112 | """ 113 | pred_ids = torch.argmax(logits[0], dim=-1) 114 | return pred_ids, labels 115 | 116 | trainer_config = transformers.TrainingArguments( 117 | per_device_train_batch_size = MICRO_BATCH_SIZE, 118 | per_device_eval_batch_size = MICRO_BATCH_SIZE, 119 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, 120 | warmup_ratio=0.06, 121 | #num_train_epochs=3, 122 | max_steps = 350, 123 | learning_rate=LEARNING_RATE, 124 | lr_scheduler_type = "cosine", ## LoRA original paper uses linear 125 | fp16=True, 126 | logging_steps=50, 127 | evaluation_strategy="steps", 128 | logging_strategy="steps", 129 | save_strategy="steps", 130 | eval_steps=50, 131 | save_steps=50, 132 | # report_to=report_to, 133 | output_dir=output_dir, 134 | optim = "adamw_torch", 135 | torch_compile = False, 136 | save_total_limit=2, 137 | load_best_model_at_end=False, 138 | ddp_find_unused_parameters=False if ddp else None, 139 | ) 140 | 141 | 142 | # ### Apply LoRA 143 | # 144 | # Here comes the magic with `peft`! Let's load a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) using `get_peft_model` utility function from `peft`. 145 | 146 | target_modules = None 147 | target_modules = ['q_proj', 'v_proj'] # edit with your desired target modules 148 | #lora_config = config.get("lora") 149 | lora_config = LoraConfig( 150 | r=8, lora_alpha=32, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM" 151 | ) 152 | 153 | callbacks = [SavePeftModelCallback] if lora_config else [] 154 | ##no need to use callbacks 155 | callbacks = [] 156 | 157 | training_args = trainer_config 158 | 159 | 160 | model_name = args.model_name 161 | 162 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 163 | tokenizer = fix_tokenizer(tokenizer) 164 | # tokenizer.save_pretrained(output_dir) 165 | 166 | dataset = load_dataset('samsum') 167 | train_records = dataset['train'] 168 | val_records = dataset['test'] 169 | #random.shuffle(train_records) 170 | print("train_record[0]: ",train_records[0]) 171 | 172 | ## Config for llama 65-b 173 | model_type = "causal" 174 | templates_path = "llama_lora_samsum.json" 175 | only_target_loss = False 176 | mode = "instruct" 177 | 178 | 179 | if mode == "instruct": 180 | max_source_tokens_count = 255 # Changed depending on the dataset 181 | max_target_tokens_count = 50 182 | target_field = "summary" 183 | source_field = "" #does not matter. (original alpaca-lora paper has additional "input" alongside instruction: instruction-input-output vs. 
instruction-response) 184 | 185 | train_dataset = InstructDataset( 186 | train_records, 187 | tokenizer, 188 | max_source_tokens_count=max_source_tokens_count, 189 | max_target_tokens_count=max_target_tokens_count, 190 | sample_rate=train_sample_rate, 191 | input_type=model_type, 192 | templates_path=templates_path, 193 | target_field=target_field, 194 | source_field=source_field, 195 | only_target_loss=only_target_loss 196 | ) 197 | 198 | val_dataset = InstructDataset( 199 | val_records, 200 | tokenizer, 201 | max_source_tokens_count=max_source_tokens_count, 202 | max_target_tokens_count=max_target_tokens_count, 203 | sample_rate=val_sample_rate, 204 | input_type=model_type, 205 | templates_path=templates_path, 206 | target_field=target_field, 207 | source_field=source_field, 208 | only_target_loss=only_target_loss 209 | ) 210 | 211 | else: 212 | assert False 213 | 214 | if "seq2seq" in model_type: 215 | data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8) 216 | else: 217 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8) 218 | 219 | print("INPUT_IDS") 220 | print(data_collator([train_dataset[0], train_dataset[1]])["input_ids"][0]) 221 | print("MASK") 222 | print(data_collator([train_dataset[0], train_dataset[1]])["attention_mask"][0]) 223 | print("LABELS") 224 | print(data_collator([train_dataset[0], train_dataset[1]])["labels"][0]) 225 | 226 | 227 | model_types = { 228 | "causal": AutoModelForCausalLM, 229 | "seq2seq": AutoModelForSeq2SeqLM 230 | } 231 | ## Decide whether to laod in 8-bit 232 | load_in_8bit = False 233 | load_in_4bit = True 234 | if load_in_8bit: 235 | assert not load_in_4bit 236 | model = model_types[model_type].from_pretrained( 237 | model_name, 238 | load_in_8bit=True, 239 | device_map=device_map 240 | ) 241 | model = fix_model(model, tokenizer, use_resize=False) 242 | model = prepare_model_for_int8_training(model) 243 | elif load_in_4bit: 244 | assert not load_in_8bit 245 | # use_bf16 = trainer_config.get("bf16", False) 246 | use_bf16 = getattr(trainer_config, "bf16", False) 247 | compute_dtype = torch.bfloat16 if use_bf16 else torch.float16 248 | model = model_types[model_type].from_pretrained( 249 | model_name, 250 | load_in_4bit=True, 251 | device_map=device_map, 252 | quantization_config=BitsAndBytesConfig( 253 | load_in_4bit=True, 254 | llm_int8_threshold=6.0, 255 | llm_int8_has_fp16_weight=False, 256 | bnb_4bit_compute_dtype=compute_dtype, 257 | bnb_4bit_use_double_quant=True, 258 | bnb_4bit_quant_type="nf4" 259 | ), 260 | torch_dtype=torch.bfloat16 if use_bf16 else torch.float32 261 | ) 262 | model = fix_model(model, tokenizer, use_resize=False) 263 | model = prepare_model_for_int8_training(model) 264 | else: 265 | model = model_types[model_type].from_pretrained(model_name) 266 | model = fix_model(model, tokenizer) 267 | 268 | # Default model generation params 269 | model.config.num_beams = 5 270 | if mode == "instruct": 271 | max_tokens_count = max_target_tokens_count + max_source_tokens_count + 1 272 | model.config.max_length = max_tokens_count if model_type == "causal" else max_target_tokens_count 273 | 274 | if not ddp and torch.cuda.device_count() > 1: 275 | model.is_parallelizable = True 276 | model.model_parallel = True 277 | 278 | if lora_config: 279 | #lora_config = LoraConfig(**lora_config) 280 | model = get_peft_model(model, lora_config) 281 | 282 | trainer_class = Trainer ##if not omit_base_model_save else TrainerNoBaseSave 283 | print("Trainer class:", trainer_class) 284 | trainer = 
trainer_class( 285 | model=model, 286 | args=training_args, 287 | train_dataset=train_dataset, 288 | eval_dataset=val_dataset, 289 | callbacks=callbacks, 290 | data_collator=data_collator, 291 | preprocess_logits_for_metrics = preprocess_logits_for_metrics, 292 | ) 293 | 294 | # with wandb.init(project="llama_ft_samsum", name="llama finetuning run") as run: ## changed the name don't forget 295 | checkpoint_dir = output_dir 296 | if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir): 297 | trainer.train(resume_from_checkpoint=True) 298 | else: 299 | trainer.train() 300 | model.save_pretrained(output_dir) 301 | 302 | trainer.model.push_to_hub(args.repo_name) --------------------------------------------------------------------------------
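The finetuning and evaluation scripts above share one workflow: load a quantized base model with llmtune's load_llm, attach a LoRA adapter (via quant_peft or load_adapter), train with a Hugging Face Trainer, then reload the saved adapter for generation. The snippet below is a minimal inference sketch of that last step, assembled only from the calls used in eval_samsum_4bit_llmtune.py; the model id, quantized-weight path, and adapter directory are placeholders, and the positional load_llm arguments are assumed to match their use in that script.

# Minimal inference sketch (assumptions: placeholder paths/ids; load_llm and load_adapter
# behave as used in finetune/samsum-llama/eval_samsum_4bit_llmtune.py).
import torch
from transformers import AutoTokenizer
from llmtune.executor import load_llm, load_adapter

MODEL_NAME = "llama-13b-4bit"          # placeholder llmtune model name
WEIGHTS = "./llama-13b-4bit.pt"        # placeholder quantized-weights path
ADAPTER = "./samsum-lora-adapter"      # placeholder directory with the trained LoRA adapter

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-13b", use_fast=False)
# (the repo's finetune scripts additionally apply fix_tokenizer/fix_model from their local utils.py)

# load the quantized base model (groupsize 64, as in the scripts above) and attach the adapter
llm, _ = load_llm(MODEL_NAME, WEIGHTS, 64)
model = load_adapter(llm, adapter_path=ADAPTER)

prompt = "### Summarize this: Amanda: I baked cookies. Do you want some?\n ### Output: "
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
with torch.inference_mode(), torch.autocast("cuda"):
    out = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens=45)
print(tokenizer.decode(out[0], skip_special_tokens=True).replace(prompt, "").strip())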