├── poetry ├── 13b_deepspeed_config.json ├── run_llama.py ├── run_llama_lora.py ├── run_rugpt13b_lora.py ├── run_rugpt_generator.py ├── finetune_fredt5_poetry_generator.py ├── finetune_rugpt13b.py ├── finetune_rugpt13b_lora.py ├── finetune_llama.py ├── finetune_rugpt_with_prompt_masking.py └── finetune_llama_lora.py ├── chitchat ├── run_chitchat_fredt5.py ├── run_chitchat_gpt.py ├── finetune_chitchat_fredt5_with_trainer.py └── finetune_chitchat_gpt_with_trainer.py ├── .gitignore ├── LICENSE └── README.md /poetry/13b_deepspeed_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { "enabled": "auto" }, 3 | "fp16": { "enabled": "auto" }, 4 | 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | 15 | "zero_optimization": { 16 | "stage": 2, 17 | "offload_optimizer": { 18 | "device": "cpu", 19 | "pin_memory": true 20 | }, 21 | "allgather_partitions": true, 22 | "allgather_bucket_size": 2e8, 23 | "overlap_comm": true, 24 | "reduce_scatter": true, 25 | "reduce_bucket_size": 2e8, 26 | "contiguous_gradients": true 27 | }, 28 | 29 | 30 | "train_batch_size": "auto", 31 | "gradient_accumulation_steps": "auto" 32 | } 33 | -------------------------------------------------------------------------------- /poetry/run_llama.py: -------------------------------------------------------------------------------- 1 | """ 2 | Инференс стихов из отфайнтюненной модели LLaMa (см. finetune_llama.py) 3 | """ 4 | 5 | import transformers 6 | import torch 7 | 8 | 9 | model_path = "/home/jovyan/polygon/text_generator/tmp/verses_model=llama7b_domain=lyrics_syllables=0" 10 | 11 | print('Loading model "{}"...'.format(model_path)) 12 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_path) 13 | model = transformers.AutoModelForCausalLM.from_pretrained(model_path, 14 | # load_in_8bit=model_args.load_in_8bit, 15 | # device_map="auto" 16 | ) 17 | 18 | device = torch.device("cuda") 19 | model.to(device) 20 | 21 | while True: 22 | seed = input(':> ') 23 | prompt = '' + seed + '#' 24 | 25 | encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt") 26 | #print('DEBUG@26 encoded_prompt=', encoded_prompt) 27 | encoded_prompt = encoded_prompt.to(device) 28 | 29 | pad_token_id = tokenizer.encode('', add_special_tokens=False)[0] 30 | # end_token_id = self.tokenizer.encode('', add_special_tokens=False)[0] 31 | 32 | output_sequences = model.generate( 33 | input_ids=encoded_prompt, 34 | pad_token_id=pad_token_id, 35 | do_sample=True, 36 | temperature=1.0, 37 | top_p=0.80, 38 | max_length=300, 39 | num_return_sequences=5, 40 | ) 41 | 42 | stop_token = '' 43 | 44 | generated_sequences = set() 45 | for generated_sequence_idx, generated_sequence in enumerate(output_sequences): 46 | generated_sequence = generated_sequence.tolist() 47 | #print('DEBUG@46 ==> ', generated_sequence) 48 | 49 | text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) 50 | if stop_token in text: 51 | text = text[: text.find(stop_token)] 52 | 53 | text = text[text.index('#') + 1:].strip() 54 | text = text.replace('\u2010', '').replace('\u0301', '') 55 | print('='*80) 56 | print(text) 57 | -------------------------------------------------------------------------------- /poetry/run_llama_lora.py: -------------------------------------------------------------------------------- 1 | """ 2 | Инференс стихов из отфайнтюненной модели LLaMa+LoRa (см. 
finetune_llama_lora.py) 3 | """ 4 | 5 | import transformers 6 | import torch 7 | from peft import PeftModel, PeftConfig 8 | 9 | 10 | model_path = "/home/jovyan/polygon/text_generator/tmp/verses_model=llama7b_lora_domain=lyrics_syllables=0" 11 | 12 | print('Loading LLaMa tokenizer "{}"...'.format(model_path)) 13 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_path) 14 | 15 | peft_model_id = model_path 16 | print('Loading peft model "{}"...'.format(peft_model_id)) 17 | config = PeftConfig.from_pretrained(peft_model_id) 18 | 19 | print('Loading backbone LLaMa "{}"...'.format(config.base_model_name_or_path)) 20 | model = transformers.AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path) 21 | model = PeftModel.from_pretrained(model, peft_model_id) 22 | 23 | device = torch.device("cuda") 24 | model.to(device) 25 | 26 | while True: 27 | seed = input(':> ') 28 | prompt = '' + seed + '#' 29 | 30 | encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt") 31 | #print('DEBUG@26 encoded_prompt=', encoded_prompt) 32 | encoded_prompt = encoded_prompt.to(device) 33 | 34 | pad_token_id = tokenizer.encode('', add_special_tokens=False)[0] 35 | # end_token_id = self.tokenizer.encode('', add_special_tokens=False)[0] 36 | 37 | output_sequences = model.generate( 38 | input_ids=encoded_prompt, 39 | pad_token_id=pad_token_id, 40 | do_sample=True, 41 | temperature=1.0, 42 | top_p=0.80, 43 | max_length=300, 44 | num_return_sequences=5, 45 | ) 46 | 47 | stop_token = '' 48 | 49 | generated_sequences = set() 50 | for generated_sequence_idx, generated_sequence in enumerate(output_sequences): 51 | generated_sequence = generated_sequence.tolist() 52 | #print('DEBUG@46 ==> ', generated_sequence) 53 | 54 | text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) 55 | if stop_token in text: 56 | text = text[: text.find(stop_token)] 57 | 58 | text = text[text.index('#') + 1:].strip() 59 | text = text.replace('\u2010', '').replace('\u0301', '') 60 | print('='*80) 61 | print(text) 62 | -------------------------------------------------------------------------------- /chitchat/run_chitchat_fredt5.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import torch 5 | import transformers 6 | from transformers import T5Config 7 | 8 | 9 | if __name__ == '__main__': 10 | proj_dir = os.path.expanduser('~/polygon/chatbot') 11 | 12 | parser = argparse.ArgumentParser(description='Консольная интерактивная проверка модели читчата') 13 | parser.add_argument('--model', type=str, default=os.path.join(proj_dir, 'tmp', 'fredt5_chitchat'), help='Путь к каталогу с файлами модели') 14 | args = parser.parse_args() 15 | 16 | use_cuda = torch.cuda.is_available() 17 | device = torch.device("cuda" if use_cuda else "cpu") 18 | 19 | model_dir = args.model 20 | print(f'Loading model "{model_dir}"...') 21 | t5_config = T5Config.from_pretrained(model_dir) 22 | 23 | if 'FRED-T5' in t5_config.name_or_path: 24 | t5_tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_dir) 25 | else: 26 | t5_tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir) 27 | 28 | t5_model = transformers.T5ForConditionalGeneration.from_pretrained(model_dir) 29 | t5_model.to(device) 30 | t5_model.eval() 31 | 32 | while True: 33 | print('-'*80) 34 | dialog = [] 35 | while True: 36 | msg = input('H:> ').strip() 37 | if len(msg) == 0: 38 | break 39 | 40 | msg = msg[0].upper() + msg[1:] 41 | 42 | 
dialog.append('человек: ' + msg) 43 | 44 | #prompt = ''+'\n'.join(dialog) 45 | prompt = '' + '\n'.join(dialog) + '\nчатбот: ' 46 | 47 | input_ids = t5_tokenizer(prompt, return_tensors='pt').input_ids 48 | out_ids = t5_model.generate(input_ids=input_ids.to(device), 49 | max_length=200, 50 | eos_token_id=t5_tokenizer.eos_token_id, 51 | early_stopping=True, 52 | do_sample=True, 53 | temperature=1.0, 54 | top_k=0, 55 | top_p=0.85) 56 | 57 | t5_output = t5_tokenizer.decode(out_ids[0][1:]) 58 | if '' in t5_output: 59 | t5_output = t5_output[:t5_output.find('')].strip() 60 | 61 | t5_output = t5_output.replace('', '').strip() 62 | 63 | print('B:> {}'.format(t5_output)) 64 | dialog.append('чатбот: ' + t5_output) 65 | -------------------------------------------------------------------------------- /chitchat/run_chitchat_gpt.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import torch 4 | import transformers 5 | 6 | 7 | class Chitchat(object): 8 | def __init__(self, device, models_dir): 9 | model_name = os.path.join(models_dir, 'rugpt_chitchat') 10 | self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) 11 | self.model = transformers.AutoModelForCausalLM.from_pretrained(model_name) 12 | self.model.to(device) 13 | self.model.eval() 14 | 15 | def reply(self, history, num_return_sequences): 16 | prompt = '' + '\n'.join(history) + '\nчатбот:' 17 | encoded_prompt = self.tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(device) 18 | output_sequences = self.model.generate(input_ids=encoded_prompt, 19 | max_length=len(prompt) + 120, 20 | temperature=0.90, 21 | typical_p=None, 22 | top_k=0, 23 | top_p=0.8, 24 | do_sample=True, 25 | num_return_sequences=num_return_sequences, 26 | pad_token_id=self.tokenizer.pad_token_id) 27 | 28 | replies = [] 29 | 30 | for o in output_sequences: 31 | reply = self.tokenizer.decode(o.tolist(), clean_up_tokenization_spaces=True) 32 | reply = reply[len(prompt):] # отсекаем затравку 33 | reply = reply[: reply.find('')] 34 | 35 | if '\nчеловек:' in reply: 36 | reply = reply[:reply.index('\nчеловек:')] 37 | 38 | reply = reply.strip() 39 | 40 | if reply not in replies: # только уникальные реплики, сохраняем порядок выдачи 41 | replies.append(reply) 42 | 43 | return replies 44 | 45 | 46 | if __name__ == '__main__': 47 | device = "cuda" if torch.cuda.is_available() else "cpu" 48 | models_dir = os.path.expanduser('~/polygon/chatbot/tmp') 49 | 50 | chitchat = Chitchat(device, models_dir) 51 | 52 | while True: 53 | dialog = [] 54 | while True: 55 | msg = input('H:> ').strip() 56 | if msg: 57 | dialog.append('человек: ' + msg) 58 | reply = chitchat.reply(dialog, num_return_sequences=1)[0] 59 | print(f'B:> {reply}') 60 | dialog.append('чатбот: ' + reply) 61 | else: 62 | dialog = [] 63 | print('-'*100) 64 | 65 | 66 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /poetry/run_rugpt13b_lora.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import sys 4 | import argparse 5 | 6 | import torch 7 | import transformers 8 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 9 | 10 | from peft import PeftModel, PeftConfig 11 | 12 | 13 | class RugptGenerator: 14 | def __init__(self, model_path, temperature, top_p): 15 | self.model_path = os.path.expanduser(model_path) 16 | self.temperature = temperature 17 | self.top_p = top_p 18 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 19 | self.tokenizer = None 20 | self.model = None 21 | 22 | def load(self): 23 | self.tokenizer = GPT2Tokenizer.from_pretrained(self.model_path) 24 | self.tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 25 | #self.model = PeftModel.from_pretrained(self.model_path) 26 | 27 | peft_model_id = self.model_path 28 | config = PeftConfig.from_pretrained(peft_model_id) 29 | model = transformers.AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path) 30 | model = PeftModel.from_pretrained(model, peft_model_id) 31 | self.model = model.to(self.device) 32 | #self.model.eval() 33 | 34 | def generate_output(self, context, num_return_sequences): 35 | length = 200 36 | 37 | encoded_prompt = self.tokenizer.encode(context, add_special_tokens=False, return_tensors="pt") 38 | encoded_prompt = encoded_prompt.to(self.device) 39 | 40 | pad_token_id = self.tokenizer.encode('', add_special_tokens=False)[0] 41 | #end_token_id = self.tokenizer.encode('', add_special_tokens=False)[0] 42 | 43 | output_sequences = self.model.generate( 44 | input_ids=encoded_prompt, 45 | max_length=length + len(encoded_prompt[0]), 46 | num_return_sequences=num_return_sequences, 47 | pad_token_id=pad_token_id, 48 | #end_token_id=end_token_id, 49 | do_sample=True, 50 | temperature=self.temperature, 51 | top_p=self.top_p 52 | ) 53 | 54 | stop_token = '' 55 | 56 | generated_sequences = set() 57 | for generated_sequence_idx, generated_sequence in enumerate(output_sequences): 58 | generated_sequence = generated_sequence.tolist() 59 | 60 | text = self.tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) 61 | if stop_token in text: 62 | text = text[: text.find(stop_token)] 63 | 64 | text = text[text.index('#')+1:].strip() 65 | text = text.replace('\u2010', '').replace('\u0301', '') 66 | 67 | generated_sequences.add(text) 68 | 69 | return list(generated_sequences) 70 | 71 | 72 | if __name__ == '__main__': 73 | proj_dir = os.path.expanduser('~/polygon/text_generator') 74 | 75 | parser = argparse.ArgumentParser(description='Отладочный консольный генератор стихов на базе rugpt13B+LoRa') 76 | parser.add_argument('--model_path', type=str, default=os.path.join(proj_dir, 'tmp', 'verses_rugpt13b_lora_domain=lyrycs_syllables=1')) 77 | parser.add_argument('--temperature', type=float, default=1.0, help='Температура сэмплинга') 78 | parser.add_argument('--top_p', type=float, default=0.8, help='top-p') 79 | parser.add_argument('--top_k', type=int, default=0, help='top-k') 80 | parser.add_argument('--typical_p', type=float, default=0.0, help='typical-p') 81 | 82 | args = parser.parse_args() 83 | 84 | use_cuda = torch.cuda.is_available() 85 | device = torch.device("cuda" if use_cuda else "cpu") 86 | 87 | poem_generator = RugptGenerator(model_path=args.model_path, temperature=args.temperature, 
top_p=args.top_p) 88 | poem_generator.load() 89 | 90 | while True: 91 | prompt = input(':> ').strip() 92 | if prompt: 93 | seed = prompt + '#' 94 | px = poem_generator.generate_output(seed, num_return_sequences=10) 95 | print('-'*80) 96 | for ipoem, p in enumerate(px, start=1): 97 | print('='*30 + ' POEM #{} '.format(ipoem) + '='*30) 98 | print(p) 99 | print('-'*80) 100 | -------------------------------------------------------------------------------- /poetry/run_rugpt_generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import sys 4 | import argparse 5 | 6 | import torch 7 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 8 | 9 | from generative_poetry.whitespace_normalization import normalize_whitespaces 10 | 11 | 12 | class RugptGenerator: 13 | def __init__(self, model_path, generation_config): 14 | self.model_path = model_path 15 | self.generation_config = generation_config 16 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 17 | self.tokenizer = None 18 | self.model = None 19 | 20 | def load(self): 21 | model_name_or_path = os.path.expanduser(self.model_path) 22 | self.tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path) 23 | self.tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 24 | self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path) 25 | self.model.to(self.device) 26 | self.model.eval() 27 | 28 | def generate_output(self, context, num_return_sequences): 29 | encoded_prompt = self.tokenizer.encode(context, add_special_tokens=False, return_tensors="pt") 30 | encoded_prompt = encoded_prompt.to(self.device) 31 | 32 | pad_token_id = self.tokenizer.encode('', add_special_tokens=False)[0] 33 | #end_token_id = self.tokenizer.encode('', add_special_tokens=False)[0] 34 | 35 | output_sequences = self.model.generate( 36 | input_ids=encoded_prompt, 37 | pad_token_id=pad_token_id, 38 | **self.generation_config 39 | ) 40 | 41 | stop_token = '' 42 | 43 | generated_sequences = set() 44 | for generated_sequence_idx, generated_sequence in enumerate(output_sequences): 45 | generated_sequence = generated_sequence.tolist() 46 | 47 | text = self.tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) 48 | if stop_token in text: 49 | text = text[: text.find(stop_token)] 50 | 51 | text = text[text.index('#')+1:].strip() 52 | text = text.replace('\u2010', '').replace('\u0301', '') 53 | text = normalize_whitespaces(text) 54 | generated_sequences.add(text) 55 | 56 | return list(generated_sequences) 57 | 58 | 59 | if __name__ == '__main__': 60 | proj_dir = os.path.expanduser('~/polygon/text_generator') 61 | 62 | parser = argparse.ArgumentParser(description='Отладочный консольный генератор пирожков') 63 | parser.add_argument('--model_path', type=str, default=os.path.join(proj_dir, 'tmp', 'verses_rugpt.all')) 64 | parser.add_argument('--max_length', type=int, default=300) 65 | parser.add_argument('--num_return_sequences', type=int, default=5) 66 | parser.add_argument('--do_sample', type=bool, default=True) 67 | parser.add_argument('--num_beams', type=int, default=1) 68 | parser.add_argument('--num_beam_groups', type=int, default=1) 69 | parser.add_argument('--penalty_alpha', type=float, default=None) 70 | parser.add_argument('--epsilon_cutoff', type=float, default=0.0) 71 | parser.add_argument('--eta_cutoff', type=float, default=0.0) 72 | parser.add_argument('--diversity_penalty', type=float, default=0.0) 73 | 
parser.add_argument('--repetition_penalty', type=float, default=None) 74 | parser.add_argument('--encoder_repetition_penalty', type=float, default=1.0) 75 | parser.add_argument('--length_penalty', type=float, default=1.0) 76 | parser.add_argument('--no_repeat_ngram_size', type=int, default=0) 77 | parser.add_argument('--renormalize_logits', type=bool, default=False) 78 | parser.add_argument('--temperature', type=float, default=0.9, help='Температура сэмплинга') 79 | parser.add_argument('--top_p', type=float, default=0.6, help='top-p') 80 | parser.add_argument('--top_k', type=int, default=0, help='top-k') 81 | parser.add_argument('--typical_p', type=float, default=None, help='typical-p') 82 | args = parser.parse_args() 83 | 84 | generation_args = {'max_length': args.max_length, 85 | 'num_return_sequences': args.num_return_sequences, 86 | 'do_sample': args.do_sample, 87 | 'num_beams': args.num_beams, 88 | 'num_beam_groups': args.num_beam_groups, 89 | 'penalty_alpha': args.penalty_alpha, 90 | 'epsilon_cutoff': args.epsilon_cutoff, 91 | 'eta_cutoff': args.eta_cutoff, 92 | 'diversity_penalty': args.diversity_penalty, 93 | 'repetition_penalty': args.repetition_penalty, 94 | 'encoder_repetition_penalty': args.encoder_repetition_penalty, 95 | 'length_penalty': args.length_penalty, 96 | 'no_repeat_ngram_size': args.no_repeat_ngram_size, 97 | 'renormalize_logits': args.renormalize_logits, 98 | 'temperature': args.temperature, 99 | 'top_p': args.top_p, 100 | 'top_k': args.top_k, 101 | 'typical_p': args.typical_p, 102 | } 103 | 104 | use_cuda = torch.cuda.is_available() 105 | device = torch.device("cuda" if use_cuda else "cpu") 106 | 107 | poem_generator = RugptGenerator(args.model_path, generation_args) 108 | poem_generator.load() 109 | 110 | while True: 111 | prompt = input(':> ').strip() 112 | if prompt: 113 | seed = prompt + '#' 114 | px = poem_generator.generate_output(seed, num_return_sequences=10) 115 | print('-'*80) 116 | for ipoem, p in enumerate(px, start=1): 117 | print('='*30 + ' POEM #{} '.format(ipoem) + '='*30) 118 | print(p) 119 | print('-'*80) 120 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 
20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). 
Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 
122 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LM-finetune
2 | 
3 | This repo collects my current working scripts for fine-tuning language models (rugpt, LLaMa, FRED T5) with the transformers library.
4 | For the large models (7B and 13B) two options are used: a) deepspeed, b) LoRa.
5 | 
6 | There is nothing new or particularly clever in the code, just a basic pipeline following the recommendations for transformers.Trainer.
7 | 
8 | ## POETRY GENERATOR
9 | 
10 | ### Poetry generator based on the LLaMa 7B and 13B models
11 | 
12 | Code: [finetune_llama.py](./poetry/finetune_llama.py)
13 | 
14 | Deepspeed is used, which makes it possible to fine-tune these models on 40 GB GPUs. Judging by deepspeed's memory report, fine-tuning
15 | should also be possible on a 32 GB V100. Note that a very large amount of ordinary RAM is required, more than 240 GB,
16 | so that deepspeed can offload tensors into it.
17 | 
18 | Launching the fine-tune on 4 GPUs:
19 | 
20 | ```
21 | python -m torch.distributed.launch --nproc_per_node=4 finetune_llama.py \
22 | --dataset_path ~/polygon/text_generator/tmp/лирика.jsonl \
23 | --max_samples 10000 \
24 | --output_syllables 0 \
25 | --model_name_or_path decapoda-research/llama-7b-hf \
26 | --output_dir ~/polygon/text_generator/tmp/verses_model=llama7b_domain=lyrics_syllables=0 \
27 | --overwrite_output_dir 1 \
28 | --per_device_train_batch_size 8 \
29 | --learning_rate 1e-5 \
30 | --num_train_epochs 1 \
31 | --bf16 1 \
32 | --fp16 0 \
33 | --gradient_checkpointing 0 \
34 | --gradient_accumulation_step 8 \
35 | --do_train 1 \
36 | --do_eval 0 \
37 | --report_to tensorboard \
38 | --evaluation_strategy no \
39 | --logging_strategy steps \
40 | --logging_steps 10 \
41 | --save_strategy no \
42 | --deepspeed 13b_deepspeed_config.json
43 | ```
44 | 
45 | Deepspeed configuration file: [13b_deepspeed_config.json](./poetry/13b_deepspeed_config.json)
46 | 
47 | Inference code: [run_llama.py](./poetry/run_llama.py). Disclaimer: this inference code only runs on an 80 GB A100.
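If an 80 GB card is not available, a lower-memory load can be sketched along the lines of the arguments that are commented out in run_llama.py (`load_in_8bit`, `device_map="auto"`). This is a hypothetical variant rather than part of the original scripts, and it assumes the bitsandbytes and accelerate packages are installed:

```
# Hypothetical low-memory variant of the model-loading step from run_llama.py.
# Assumes bitsandbytes and accelerate are installed; the checkpoint path is an example.
import os
import transformers

model_path = os.path.expanduser("~/polygon/text_generator/tmp/verses_model=llama7b_domain=lyrics_syllables=0")

tokenizer = transformers.LlamaTokenizer.from_pretrained(model_path)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_path,
    load_in_8bit=True,   # 8-bit weights via bitsandbytes
    device_map="auto",   # let accelerate place layers on the available GPUs/CPU
)
# The generation loop from run_llama.py can then be reused as-is, except that the
# model must not be moved with model.to(device) after an 8-bit load.
```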
48 | 
49 | ### Poetry generator based on LLaMa 7B and 13B using the PEFT library (LoRa method)
50 | 
51 | Fine-tuning code: [finetune_llama_lora.py](./poetry/finetune_llama_lora.py)
52 | 
53 | Launching the fine-tune on 2 GPUs:
54 | 
55 | ```
56 | python -m torch.distributed.run --nproc_per_node=2 finetune_llama_lora.py \
57 | --dataset_path ~/polygon/text_generator/tmp/лирика.jsonl \
58 | --max_samples 10000 \
59 | --output_syllables 0 \
60 | --model_name_or_path decapoda-research/llama-7b-hf \
61 | --output_dir ~/polygon/text_generator/tmp/verses_model=llama7b_lora_domain=lyrics_syllables=0 \
62 | --overwrite_output_dir 1 \
63 | --per_device_train_batch_size 1 \
64 | --learning_rate 1e-4 \
65 | --num_train_epochs 1 \
66 | --bf16 0 \
67 | --fp16 0 \
68 | --gradient_checkpointing 0 \
69 | --gradient_accumulation_step 8 \
70 | --do_train 1 \
71 | --do_eval 0 \
72 | --report_to tensorboard \
73 | --evaluation_strategy no \
74 | --logging_strategy steps \
75 | --logging_steps 200 \
76 | --save_strategy no
77 | ```
78 | 
79 | Inference code: [run_llama_lora.py](./poetry/run_llama_lora.py)
80 | 
81 | ### Poetry generator based on FRED T5 XL
82 | 
83 | Fine-tuning code: [finetune_fredt5_poetry_generator.py](./poetry/finetune_fredt5_poetry_generator.py)
84 | 
85 | Launching the fine-tune on 2 GPUs:
86 | 
87 | ```
88 | python -m torch.distributed.run --nproc_per_node=2 finetune_fredt5_poetry_generator.py \
89 | --model_name_or_path ai-forever/FRED-T5-1.7B \
90 | --dataset_path ~/polygon/text_generator/tmp/all_verses.jsonl \
91 | --prompt prompt_text \
92 | --optim "adafactor" \
93 | --learning_rate 1e-3 \
94 | --lr_scheduler_type constant \
95 | --per_device_train_batch_size 8 \
96 | --gradient_checkpointing 0 \
97 | --gradient_accumulation_steps 4 \
98 | --num_train_epochs 1 \
99 | --report_to tensorboard \
100 | --logging_strategy steps \
101 | --logging_steps 100 \
102 | --output_dir ~/polygon/text_generator/tmp/verses_fredt5 \
103 | --save_strategy no
104 | ```
105 | 
106 | Running inference: [run_fredt5_poetry_generator.py](./poetry/run_fredt5_poetry_generator.py)
107 | 
108 | 
109 | ### Poetry generator based on the rugpt models (except rugpt13B)
110 | 
111 | Fine-tuning code: [finetune_rugpt_with_prompt_masking.py](./poetry/finetune_rugpt_with_prompt_masking.py)
112 | 
113 | Launch on 2 GPUs, base model rugpt3large_based_on_gpt2:
114 | 
115 | ```
116 | python -m torch.distributed.run --nproc_per_node=2 finetune_rugpt_with_prompt_masking.py \
117 | --dataset_path ~/polygon/text_generator/tmp/лирика.jsonl \
118 | --output_syllables 1 \
119 | --model_name_or_path sberbank-ai/rugpt3large_based_on_gpt2 \
120 | --output_dir ~/polygon/text_generator/tmp/verses_model=rugpt_large_domain=lyrics_syllables=1 \
121 | --overwrite_output_dir 1 \
122 | --per_device_train_batch_size 8 \
123 | --learning_rate 5e-5 \
124 | --num_train_epochs 1 \
125 | --fp16 1 \
126 | --gradient_checkpointing 0 \
127 | --gradient_accumulation_step 8 \
128 | --do_train 1 \
129 | --do_eval 0 \
130 | --report_to tensorboard \
131 | --evaluation_strategy no \
132 | --logging_strategy steps \
133 | --logging_steps 200 \
134 | --save_strategy no
135 | ```
136 | 
137 | Inference: [run_rugpt_generator.py](./poetry/run_rugpt_generator.py)
138 | 
139 | 
140 | ## CHITCHAT
141 | 
142 | ### Chitchat fine-tune based on the FRED T5 XL 1.7B model
143 | 
144 | The distinctive feature of the approach: instead of a plain text prefix, the input sequence starts with one of FRED T5's denoiser selector tokens,
145 | and a dedicated marker token is added at the position (the end of the dialogue) where the generated reply should go.
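A minimal sketch of how one training pair is assembled for this scheme, mirroring `load_samples()` in finetune_chitchat_fredt5_with_trainer.py. The concrete selector and marker token strings are placeholders here, not the literal tokens from the original code, so substitute the ones expected by your FRED T5 checkpoint:

```
# Sketch of building one seq2seq sample for the FRED T5 chitchat fine-tune.
# DENOISER_SELECTOR and REPLY_MARKER are hypothetical placeholders; use the
# selector and marker tokens that your FRED T5 checkpoint actually expects.
DENOISER_SELECTOR = "<denoiser-selector>"   # placeholder
REPLY_MARKER = "<reply-marker>"             # placeholder

def build_sample(context_turns, reply):
    # context_turns: list of (role, text) pairs, where role is "человек" or "чатбот"
    history = "\n".join(f"{role}: {text}" for role, text in context_turns)
    seed = DENOISER_SELECTOR + history + "\nчатбот: "   # encoder input
    target = REPLY_MARKER + reply                       # decoder target
    return seed, target

seed, target = build_sample([("человек", "Привет! Как дела?")], "Привет, всё отлично!")
```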
146 | 
147 | Code: [finetune_chitchat_fredt5_with_trainer.py](./chitchat/finetune_chitchat_fredt5_with_trainer.py).
148 | 
149 | Example launch on 1 GPU:
150 | 
151 | ```
152 | python finetune_chitchat_fredt5_with_trainer.py \
153 | --dataset_path axioma_dialogues.json \
154 | --optim "adafactor" \
155 | --learning_rate 1e-4 \
156 | --lr_scheduler_type constant \
157 | --per_gpu_train_batch_size 6 \
158 | --gradient_checkpointing 0 \
159 | --gradient_accumulation_steps 8 \
160 | --num_train_epochs 1 \
161 | --report_to tensorboard \
162 | --logging_strategy steps \
163 | --logging_steps 500 \
164 | --output_dir ~/polygon/chatbot/tmp/fredt5_chitchat \
165 | --save_strategy no
166 | ```
167 | 
168 | The dataset for this model, [axioma_dialogues.json](./chitchat/axioma_dialogues.json), is built from the Russian-language part of the [OpenAssistant project dataset](https://huggingface.co/datasets/OpenAssistant/oasst1).
169 | Each reply together with its preceding context forms a separate training sample for the seq2seq model. Human and chatbot turns are marked
170 | with dedicated role tags; for fine-tuning they are converted into the prefixes `человек:` and `чатбот:`.
171 | 
172 | After fine-tuning, generation can be run with [run_chitchat_fredt5.py](./chitchat/run_chitchat_fredt5.py).
173 | 
174 | ### Chitchat fine-tune based on sberbank-ai/rugpt3medium_based_on_gpt2
175 | 
176 | Also suitable for other models of the rugpt family.
177 | 
178 | Code: [finetune_chitchat_gpt_with_trainer.py](./chitchat/finetune_chitchat_gpt_with_trainer.py).
179 | 
180 | Dataset: [axioma_dialogues.solid.json](./chitchat/axioma_dialogues.solid.json)
181 | 
182 | Launching the fine-tune on 1 GPU:
183 | 
184 | ```
185 | python finetune_chitchat_gpt_with_trainer.py \
186 | --model_name_or_path sberbank-ai/rugpt3medium_based_on_gpt2 \
187 | --learning_rate 1e-5 \
188 | --lr_scheduler_type constant \
189 | --per_gpu_train_batch_size 2 \
190 | --gradient_checkpointing 0 \
191 | --gradient_accumulation_steps 8 \
192 | --num_train_epochs 1 \
193 | --report_to tensorboard \
194 | --logging_strategy steps \
195 | --logging_steps 100 \
196 | --output_dir ~/polygon/chatbot/tmp/rugpt_chitchat \
197 | --save_strategy no
198 | ```
199 | 
200 | Inference code: [run_chitchat_gpt.py](./chitchat/run_chitchat_gpt.py).
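For programmatic use, the `Chitchat` class defined in run_chitchat_gpt.py can also be driven directly instead of through the console loop. A minimal usage sketch is below; the models_dir path is an example, and since `reply()` reads a module-level `device` name, the sketch sets it explicitly because the `__main__` block does not run on import:

```
# Minimal usage sketch of the Chitchat class from run_chitchat_gpt.py.
# The models_dir path is an example; point it at the directory that contains
# the fine-tuned "rugpt_chitchat" checkpoint produced by the training run above.
import os
import torch
import run_chitchat_gpt as rcg

device = "cuda" if torch.cuda.is_available() else "cpu"
rcg.device = device   # reply() resolves a module-level `device` name at call time

chitchat = rcg.Chitchat(device, os.path.expanduser("~/polygon/chatbot/tmp"))

history = ["человек: Привет! Чем займёмся вечером?"]
for reply in chitchat.reply(history, num_return_sequences=3):
    print(reply)
```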
201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /chitchat/finetune_chitchat_fredt5_with_trainer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Тренировка модели болталки Axioma на FRED T5 для проекта https://github.com/Koziev/chatbot 3 | Эксперимент с файнтюном: токены истории диалога не включаем в backprop, присваивая соответствующим целям (labels) значение -100 4 | Прочие хинты по тренировке: https://kelijah.livejournal.com/315826.html 5 | """ 6 | 7 | import os 8 | import json 9 | import sys 10 | import io 11 | import random 12 | import itertools 13 | from typing import Any, Dict, List, Optional, Tuple, Union 14 | import shutil 15 | import logging 16 | from dataclasses import dataclass, field 17 | 18 | import torch 19 | import torch.optim 20 | from torch.utils.data import Dataset, DataLoader 21 | import transformers 22 | from transformers import AutoTokenizer 23 | from transformers import TrainingArguments, Trainer, TrainerCallback 24 | from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config 25 | from transformers import HfArgumentParser 26 | from pynvml import * 27 | 28 | 29 | proj_dir = os.path.expanduser('~/polygon/chatbot') 30 | 31 | 32 | def print_gpu_utilization(): 33 | nvmlInit() 34 | handle = nvmlDeviceGetHandleByIndex(0) 35 | info = nvmlDeviceGetMemoryInfo(handle) 36 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") 37 | 38 | 39 | def load_samples(dataset_path, tokenizer): 40 | samples = [] 41 | with open(dataset_path, 'r') as f: 42 | for sample in json.load(f): 43 | try: 44 | # 01.05.2023 эксперимент: вместо спецтокенов и используем метки 45 | seed = '' + sample['context'].replace('', 'человек: ').replace('', 'чатбот: ') + '\nчатбот: ' 46 | reply = '' + sample['reply'] 47 | input_tokens = tokenizer.encode(seed, add_special_tokens=False, truncation=True, max_length=1024) 48 | output_tokens = tokenizer.encode(reply, add_special_tokens=False) # , truncation=True, max_length=1024) 49 | if len(input_tokens) < 512 and len(output_tokens) < 512: # пока ограничим многословность 50 | samples.append({'input_tokens': input_tokens, 51 | 'output_tokens': output_tokens, 52 | 'seed': seed, 53 | 'reply': reply}) 54 | except Exception as ex: 55 | print(ex) 56 | 57 | return samples 58 | 59 | 60 | class FinetuneDataset(Dataset): 61 | def __init__(self, samples, tokenizer): 62 | self.tokenizer = tokenizer 63 | self.max_input_len = 0 64 | self.max_output_len = 0 65 | self.samples = [] 66 | 67 | self.bos_token_id = tokenizer.encode('', add_special_tokens=False)[0] 68 | self.eos_token_id = tokenizer.encode('', add_special_tokens=False)[0] 69 | self.pad_token_id = tokenizer.encode('', add_special_tokens=False)[0] 70 | 71 | for sample in samples: 72 | input_ids = sample['input_tokens'] 73 | output_ids = sample['output_tokens'] + [self.eos_token_id] 74 | self.samples.append((input_ids, output_ids)) 75 | self.max_input_len = max(self.max_input_len, len(input_ids)) 76 | self.max_output_len = max(self.max_output_len, len(output_ids)) 77 | 78 | def __len__(self): 79 | return len(self.samples) 80 | 81 | def __getitem__(self, index: int): 82 | input_ids, output_ids = self.samples[index] 83 | 84 | input_npad = self.max_input_len - len(input_ids) 85 | attention_mask = [1]*len(input_ids) + [0]*input_npad 86 | input_ids = input_ids + input_npad * [self.pad_token_id] 87 | 88 | output_npad = self.max_output_len - len(output_ids) 89 | 
labels = output_ids + output_npad * [-100] 90 | 91 | return {'input_ids': torch.LongTensor(input_ids), 92 | 'attention_mask': attention_mask, 93 | 'labels': torch.LongTensor(labels), 94 | } 95 | 96 | 97 | @dataclass 98 | class ModelArguments: 99 | """ 100 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 101 | """ 102 | model_name_or_path: Optional[str] = field( 103 | default='ai-forever/FRED-T5-1.7B', 104 | metadata={"help": "The model checkpoint for weights initialization."}, 105 | ) 106 | 107 | 108 | @dataclass 109 | class DataTrainingArguments: 110 | """ 111 | Arguments pertaining to what data we are going to input our model for training and eval. 112 | """ 113 | dataset_path: Optional[str] = field( 114 | default=os.path.join(proj_dir, 'tmp', 'axioma_dialogues.json'), 115 | metadata={"help": "Путь к датасету с диалогами"} 116 | ) 117 | 118 | 119 | if __name__ == '__main__': 120 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 121 | 122 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 123 | 124 | if not training_args.optim: 125 | training_args.optim = "adafactor" 126 | 127 | if not training_args.output_dir: 128 | training_args.output_dir = os.path.join(proj_dir, 'tmp', 'fredt5_chitchat') 129 | 130 | verbose = training_args.local_rank in (-1, 0) 131 | 132 | # Setup logging 133 | logging.basicConfig( 134 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 135 | datefmt="%m/%d/%Y %H:%M:%S", 136 | handlers=[logging.StreamHandler(sys.stdout)], 137 | ) 138 | 139 | log_level = training_args.get_process_log_level() 140 | logger = logging.getLogger(__name__) 141 | logger.setLevel(log_level) 142 | #datasets.utils.logging.set_verbosity(log_level) 143 | transformers.utils.logging.set_verbosity(log_level) 144 | transformers.utils.logging.enable_default_handler() 145 | transformers.utils.logging.enable_explicit_format() 146 | 147 | logger.info( 148 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 149 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 150 | ) 151 | logger.info(f"Training/evaluation parameters {training_args}") 152 | 153 | rank0 = training_args.local_rank in (-1, 0) 154 | 155 | # Удаляем старые логи tensorboard 156 | if rank0: 157 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs') 158 | #if os.path.exists(tensorboard_dir): 159 | # logger.info('Removing "%s"', tensorboard_dir) 160 | # shutil.rmtree(tensorboard_dir) 161 | 162 | device = training_args.device 163 | logger.info('device={}'.format(device)) 164 | 165 | pretrained_model_name = model_args.model_name_or_path 166 | 167 | logger.info('Loading pretrained model "%s"', pretrained_model_name) 168 | tokenizer = transformers.GPT2Tokenizer.from_pretrained(pretrained_model_name) 169 | model = transformers.T5ForConditionalGeneration.from_pretrained(pretrained_model_name) 170 | model.to(device) 171 | 172 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 173 | 174 | if rank0: 175 | print_gpu_utilization() 176 | logger.info('\nTokenizer:') 177 | for token in ' '.split(): 178 | logger.info('token "%s" id=%s'.format(token, str(tokenizer.encode(token, add_special_tokens=False)))) 179 | 180 | logger.info('Loading dataset "%s"...', data_args.dataset_path) 181 | train_samples = load_samples(data_args.dataset_path, tokenizer) 182 | logger.info('Train 
samples: %d', len(train_samples)) 183 | 184 | train_dataset = FinetuneDataset(train_samples, tokenizer) 185 | # test_dataset = FinetuneDataset(test_samples, tokenizer) 186 | 187 | trainer = Trainer( 188 | model=model, 189 | args=training_args, 190 | train_dataset=train_dataset, 191 | tokenizer=tokenizer, 192 | data_collator=None, 193 | ) 194 | 195 | try: 196 | logger.info('Start training...') 197 | train_result = trainer.train() 198 | 199 | if rank0: 200 | metrics = train_result.metrics 201 | trainer.log_metrics("train", metrics) 202 | trainer.save_metrics("train", metrics) 203 | except KeyboardInterrupt: 204 | print('!!! Ctrl+C !!!') 205 | 206 | if rank0: 207 | logger.info(f'Saving the model and tokenizer') 208 | trainer.save_model(output_dir=training_args.output_dir) 209 | tokenizer.save_pretrained(training_args.output_dir) 210 | #model.save_pretrained(training_args.output_dir) 211 | 212 | logger.info('All done :)') 213 | -------------------------------------------------------------------------------- /poetry/finetune_fredt5_poetry_generator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Тренировка модели генерации стихов на претрейненной модели FRED T5 XL 3 | """ 4 | import logging 5 | import os 6 | import json 7 | import io 8 | import random 9 | import itertools 10 | import sys 11 | from dataclasses import dataclass, field 12 | from typing import Any, Dict, List, Optional, Tuple, Union 13 | import shutil 14 | import argparse 15 | 16 | import numpy as np 17 | import tqdm 18 | import sklearn.model_selection 19 | import torch 20 | import scipy 21 | import torch.nn as nn 22 | import torch.nn.functional as F 23 | import torch.optim 24 | from torch.utils.tensorboard import SummaryWriter 25 | from torch.utils.data import Dataset, DataLoader 26 | from transformers import AutoModelForCausalLM 27 | import transformers 28 | from transformers import AutoTokenizer 29 | from transformers import TrainingArguments, Trainer, TrainerCallback 30 | from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config 31 | from transformers import HfArgumentParser 32 | from pynvml import * 33 | 34 | 35 | proj_dir = os.path.expanduser('~/polygon/text_generator') 36 | 37 | 38 | def print_gpu_utilization(): 39 | nvmlInit() 40 | handle = nvmlDeviceGetHandleByIndex(0) 41 | info = nvmlDeviceGetMemoryInfo(handle) 42 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") 43 | 44 | 45 | def load_samples(dataset_path, tokenizer): 46 | samples = [] 47 | with open(dataset_path, 'r') as f: 48 | for line in f: 49 | sample = json.loads(line) 50 | try: 51 | input_text = '' + sample['prompt_text'] 52 | 53 | # Вариант с генерацией обычного текста 54 | output_text = sample['output_text'] 55 | output_text = '\n'.join(output_text.split('\n')[:4]) 56 | 57 | input_tokens = tokenizer.encode(input_text, add_special_tokens=False, truncation=True, max_length=512) 58 | output_tokens = tokenizer.encode(output_text, add_special_tokens=False) 59 | samples.append({'input_tokens': input_tokens, 'output_tokens': output_tokens, 60 | 'input_text': input_text, 'output_text': output_text}) 61 | 62 | except Exception as ex: 63 | print(ex) 64 | 65 | return samples 66 | 67 | 68 | class FinetuneDataset(Dataset): 69 | def __init__(self, samples, tokenizer): 70 | self.tokenizer = tokenizer 71 | self.max_input_len = 0 72 | self.max_output_len = 0 73 | self.samples = [] 74 | 75 | self.bos_token_id = tokenizer.encode('', add_special_tokens=False)[0] 76 | self.eos_token_id = 
tokenizer.encode('', add_special_tokens=False)[0] 77 | self.pad_token_id = tokenizer.encode('', add_special_tokens=False)[0] 78 | 79 | for sample in samples: 80 | input_ids = sample['input_tokens'] 81 | output_ids = sample['output_tokens'] + [self.eos_token_id] 82 | self.samples.append((input_ids, output_ids)) 83 | self.max_input_len = max(self.max_input_len, len(input_ids)) 84 | self.max_output_len = max(self.max_output_len, len(output_ids)) 85 | 86 | def __len__(self): 87 | return len(self.samples) 88 | 89 | def __getitem__(self, index: int): 90 | input_ids, output_ids = self.samples[index] 91 | 92 | input_npad = self.max_input_len - len(input_ids) 93 | attention_mask = [1]*len(input_ids) + [0]*input_npad 94 | input_ids = input_ids + input_npad * [self.pad_token_id] 95 | 96 | output_npad = self.max_output_len - len(output_ids) 97 | labels = output_ids + output_npad * [-100] 98 | 99 | return {'input_ids': torch.LongTensor(input_ids), 100 | 'attention_mask': attention_mask, 101 | 'labels': torch.LongTensor(labels), 102 | } 103 | 104 | 105 | @dataclass 106 | class ModelArguments: 107 | """ 108 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 109 | """ 110 | model_name_or_path: Optional[str] = field( 111 | default='ai-forever/FRED-T5-1.7B', 112 | metadata={"help": "The model checkpoint for weights initialization."}, 113 | ) 114 | 115 | 116 | @dataclass 117 | class DataTrainingArguments: 118 | """ 119 | Arguments pertaining to what data we are going to input our model for training and eval. 120 | """ 121 | dataset_path: Optional[str] = field( 122 | default=os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 'long_poems_gpt_dataset.jsonl')), 123 | metadata={"help": "Путь к датасету со стихами"} 124 | ) 125 | 126 | 127 | class MyPrinterCallback(TrainerCallback): 128 | def __init__(self, filepath): 129 | self.wrt = open(filepath, 'w') 130 | 131 | def on_log(self, args, state, control, logs=None, **kwargs): 132 | if state.is_local_process_zero: 133 | if 'epoch' in logs and 'loss' in logs: 134 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss'])) 135 | self.wrt.flush() 136 | 137 | 138 | if __name__ == '__main__': 139 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 140 | 141 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 142 | 143 | if not training_args.optim: 144 | training_args.optim = "adafactor" 145 | 146 | if not training_args.output_dir: 147 | training_args.output_dir = os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 't5_poetry_generator')) 148 | 149 | verbose = training_args.local_rank in (-1, 0) 150 | 151 | # Setup logging 152 | logging.basicConfig( 153 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 154 | datefmt="%m/%d/%Y %H:%M:%S", 155 | handlers=[logging.StreamHandler(sys.stdout)], 156 | ) 157 | 158 | log_level = training_args.get_process_log_level() 159 | logger = logging.getLogger(__name__) 160 | logger.setLevel(log_level) 161 | #datasets.utils.logging.set_verbosity(log_level) 162 | transformers.utils.logging.set_verbosity(log_level) 163 | transformers.utils.logging.enable_default_handler() 164 | transformers.utils.logging.enable_explicit_format() 165 | 166 | logger.info( 167 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 168 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 169 | ) 170 | 
logger.info(f"Training/evaluation parameters {training_args}") 171 | 172 | # Удаляем старые логи tensorboard 173 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs') 174 | if os.path.exists(tensorboard_dir): 175 | logger.info('Removing "%s"', tensorboard_dir) 176 | shutil.rmtree(tensorboard_dir) 177 | 178 | device = training_args.device 179 | logging.info('device={}'.format(device)) 180 | 181 | pretrained_model_name = model_args.model_name_or_path 182 | 183 | logger.info('Loading pretrained model "%s"', pretrained_model_name) 184 | if 'FRED-T5' in pretrained_model_name: 185 | tokenizer = transformers.GPT2Tokenizer.from_pretrained(pretrained_model_name) 186 | else: 187 | tokenizer = transformers.T5Tokenizer.from_pretrained(pretrained_model_name) 188 | 189 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 190 | 191 | model = transformers.T5ForConditionalGeneration.from_pretrained(pretrained_model_name) 192 | model.to(device) 193 | print_gpu_utilization() 194 | 195 | logger.info('\nLoading dataset "%s"...', data_args.dataset_path) 196 | train_samples = load_samples(data_args.dataset_path, tokenizer) 197 | logger.info('Train samples: %d', len(train_samples)) 198 | 199 | train_dataset = FinetuneDataset(train_samples, tokenizer) 200 | # test_dataset = FinetuneDataset(test_samples, tokenizer) 201 | 202 | printer = MyPrinterCallback(os.path.join(proj_dir, 'tmp', 'finetune_fredt5_poetry_generator.loss.log')) 203 | 204 | trainer = Trainer( 205 | model=model, 206 | args=training_args, 207 | train_dataset=train_dataset, 208 | # eval_dataset=test_dataset, 209 | tokenizer=tokenizer, 210 | data_collator=None, 211 | # compute_metrics=compute_metrics, 212 | callbacks=[printer] #[EarlyStoppingCallback(early_stopping_patience=5)] 213 | ) 214 | 215 | logger.info('Start training...') 216 | train_result = trainer.train() 217 | 218 | logger.info(f'Saving the model and tokenizer') 219 | trainer.save_model(output_dir=training_args.output_dir) 220 | 221 | metrics = train_result.metrics 222 | trainer.log_metrics("train", metrics) 223 | trainer.save_metrics("train", metrics) 224 | 225 | logger.info('All done :)') 226 | -------------------------------------------------------------------------------- /chitchat/finetune_chitchat_gpt_with_trainer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Тренировка модели болталки Axioma. 
3 | Эксперимент с файнтюном: токены истории диалога не включаем в backprop, присваивая соответствующим целям (labels) значение -100 4 | """ 5 | import logging 6 | import os 7 | import json 8 | import sys 9 | import io 10 | import random 11 | import itertools 12 | import shutil 13 | from dataclasses import dataclass, field 14 | from typing import Any, Dict, List, Optional, Tuple, Union 15 | 16 | import numpy as np 17 | import tqdm 18 | import sklearn.model_selection 19 | import torch 20 | import scipy 21 | import torch.nn as nn 22 | import torch.nn.functional as F 23 | import torch.optim as optim 24 | from torch.utils.tensorboard import SummaryWriter 25 | from torch.utils.data import Dataset, DataLoader 26 | from transformers import AutoModelForCausalLM 27 | import transformers 28 | from transformers import AutoTokenizer 29 | from transformers import TrainingArguments, Trainer, TrainerCallback 30 | from transformers import HfArgumentParser 31 | from pynvml import * 32 | 33 | 34 | proj_dir = os.path.expanduser('~/polygon/chatbot') 35 | 36 | 37 | def print_gpu_utilization(): 38 | nvmlInit() 39 | handle = nvmlDeviceGetHandleByIndex(0) 40 | info = nvmlDeviceGetMemoryInfo(handle) 41 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") 42 | 43 | 44 | def load_samples(dataset_path, tokenizer): 45 | samples = [] 46 | with open(dataset_path, 'r') as f: 47 | data = json.load(f) 48 | for sample in tqdm.tqdm(data, desc='Loading samples', total=len(data)): 49 | try: 50 | lines = [] 51 | for i, msg in enumerate(sample): 52 | if 0 == (i % 2): 53 | lines.append('человек: ' + msg) 54 | else: 55 | lines.append('чатбот: ' + msg) 56 | 57 | text = '\n'.join(lines) 58 | tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False) 59 | if len(tokens) < 512: 60 | samples.append({'tokens': tokens, 'text': text}) 61 | else: 62 | lines0 = list(lines) 63 | 64 | lines = lines[:-1] 65 | while len(lines) > 1: 66 | text = '\n'.join(lines) 67 | tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False) 68 | if len(tokens) < 512: 69 | samples.append({'tokens': tokens, 'text': text}) 70 | break 71 | else: 72 | lines = lines[:-1] 73 | 74 | 75 | except Exception as ex: 76 | print(ex) 77 | 78 | return samples 79 | 80 | 81 | class FinetuneDataset(Dataset): 82 | def __init__(self, samples, tokenizer): 83 | self.tokenizer = tokenizer 84 | self.max_len = 0 85 | self.samples = [] 86 | 87 | self.bos_token_id = tokenizer.encode('', add_special_tokens=False)[0] 88 | self.eos_token_id = tokenizer.encode('', add_special_tokens=False)[0] 89 | self.pad_token_id = tokenizer.encode('', add_special_tokens=False)[0] 90 | 91 | for sample in samples: 92 | input_ids = [self.bos_token_id] + sample['tokens'] + [self.eos_token_id] 93 | labels = input_ids 94 | attention_map = [1] * len(labels) 95 | self.samples.append((input_ids, labels, attention_map)) 96 | self.max_len = max(self.max_len, len(input_ids)) 97 | 98 | def __len__(self): 99 | return len(self.samples) 100 | 101 | def __getitem__(self, index: int): 102 | input_ids, labels, attention_map = self.samples[index] 103 | npad = self.max_len - len(input_ids) 104 | input_ids = input_ids + npad * [self.pad_token_id] 105 | labels = labels + [-100] * npad 106 | attention_mask = attention_map + [0] * npad 107 | return {'input_ids': torch.LongTensor(input_ids), 108 | 'labels': torch.LongTensor(labels), 109 | 'attention_mask': torch.LongTensor(attention_mask)} 110 | 111 | 112 | @dataclass 113 | class ModelArguments: 114 | """ 115 | Arguments pertaining to which 
model/config/tokenizer we are going to fine-tune, or train from scratch. 116 | """ 117 | model_name_or_path: Optional[str] = field( 118 | default='sberbank-ai/rugpt3medium_based_on_gpt2', 119 | metadata={"help": "The model checkpoint for weights initialization."}, 120 | ) 121 | 122 | 123 | @dataclass 124 | class DataTrainingArguments: 125 | """ 126 | Arguments pertaining to what data we are going to input our model for training and eval. 127 | """ 128 | dataset_path: Optional[str] = field( 129 | default=os.path.join(proj_dir, 'tmp', 'axioma_dialogues.solid.json'), 130 | metadata={"help": "Путь к датасету со диалогами"} 131 | ) 132 | 133 | 134 | if __name__ == '__main__': 135 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 136 | 137 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 138 | 139 | if not training_args.output_dir: 140 | training_args.output_dir = os.path.join(proj_dir, 'tmp', 'rugpt_chitchat') 141 | 142 | rank0 = training_args.local_rank in (-1, 0) 143 | 144 | # Setup logging 145 | logging.basicConfig( 146 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 147 | datefmt="%m/%d/%Y %H:%M:%S", 148 | handlers=[logging.StreamHandler(sys.stdout)], 149 | ) 150 | 151 | log_level = training_args.get_process_log_level() 152 | logger = logging.getLogger(__name__) 153 | logger.setLevel(log_level) 154 | #datasets.utils.logging.set_verbosity(log_level) 155 | transformers.utils.logging.set_verbosity(log_level) 156 | transformers.utils.logging.enable_default_handler() 157 | transformers.utils.logging.enable_explicit_format() 158 | 159 | logger.info( 160 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 161 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 162 | ) 163 | logger.info(f"Training/evaluation parameters {training_args}") 164 | 165 | # Удаляем старые логи tensorboard 166 | if rank0: 167 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs') 168 | if os.path.exists(tensorboard_dir): 169 | logger.info('Removing "%s"', tensorboard_dir) 170 | shutil.rmtree(tensorboard_dir) 171 | 172 | pretrained_model_name = model_args.model_name_or_path 173 | 174 | print('Loading pretrained model "{}"...'.format(pretrained_model_name)) 175 | if 'xglm' in pretrained_model_name.lower(): 176 | tokenizer = transformers.XGLMTokenizer.from_pretrained(pretrained_model_name) 177 | model = transformers.XGLMForCausalLM.from_pretrained(pretrained_model_name) 178 | elif 'bloom' in pretrained_model_name: 179 | tokenizer = transformers.BloomTokenizer.from_pretrained(pretrained_model_name) 180 | model = transformers.BloomForCausalLM.from_pretrained(pretrained_model_name) 181 | else: 182 | tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model_name) 183 | model = transformers.AutoModelForCausalLM.from_pretrained(pretrained_model_name) 184 | 185 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 186 | 187 | if rank0: 188 | print_gpu_utilization() 189 | 190 | print('\nTokenizer:') 191 | for token in ' '.split(): 192 | print('token "{}" id={}'.format(token, tokenizer.encode(token, add_special_tokens=False))) 193 | 194 | print('\nLoading dataset...') 195 | train_samples = load_samples(data_args.dataset_path, tokenizer) 196 | print('Train samples: {}'.format(len(train_samples))) 197 | 198 | train_dataset = FinetuneDataset(train_samples, tokenizer) 199 | 200 | trainer = 
Trainer(
201 |         model=model,
202 |         args=training_args,
203 |         train_dataset=train_dataset,
204 |         tokenizer=tokenizer,
205 |         data_collator=None,
206 |         # compute_metrics=compute_metrics,
207 |         # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
208 |     )
209 | 
210 |     logger.info('Start training...')
211 |     try:
212 |         train_result = trainer.train()
213 | 
214 |         metrics = train_result.metrics
215 |         trainer.log_metrics("train", metrics)
216 |         trainer.save_metrics("train", metrics)
217 |     except KeyboardInterrupt:
218 |         print('!!! CTRL+C !!!')
219 | 
220 |     logger.info('Saving the model and tokenizer to "%s"', training_args.output_dir)
221 |     trainer.save_model(output_dir=training_args.output_dir)
222 |     #model.save_pretrained(training_args.output_dir)
223 |     #tokenizer.save_pretrained(training_args.output_dir)
224 | 
225 |     logger.info('All done :)')
226 | 
--------------------------------------------------------------------------------
/poetry/finetune_rugpt13b.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Training a syllabo-tonic poetry generation model on top of rugpt13B using deepspeed.
  3 | """
  4 | import glob
  5 | import logging
  6 | import os
  7 | import json
  8 | from dataclasses import dataclass, field
  9 | from typing import Any, Dict, List, Optional, Tuple, Union
 10 | import shutil
 11 | import sys
 12 | from torch.utils.data import Dataset, DataLoader
 13 | from transformers import AutoModelForCausalLM
 14 | import transformers
 15 | from transformers import AutoTokenizer
 16 | from transformers import TrainingArguments, Trainer, TrainerCallback
 17 | from transformers import HfArgumentParser
 18 | import deepspeed
 19 | from pynvml import *
 20 | 
 21 | 
 22 | proj_dir = os.path.expanduser('~/polygon/text_generator')
 23 | 
 24 | 
 25 | def print_gpu_utilization():
 26 |     nvmlInit()
 27 |     handle = nvmlDeviceGetHandleByIndex(0)
 28 |     info = nvmlDeviceGetMemoryInfo(handle)
 29 |     logger.info(f"GPU memory occupied: {info.used//1024**2} MB.")
 30 | 
 31 | 
 32 | def pad_sequence(sequence, pad_id, max_len):
 33 |     l = len(sequence)
 34 |     if l < max_len:
 35 |         return sequence + [pad_id] * (max_len - l)
 36 |     else:
 37 |         return sequence
 38 | 
 39 | 
 40 | def load_samples(data_args, tokenizer):
 41 |     samples = []
 42 |     with open(data_args.dataset_path, 'r') as f:
 43 |         for sample_str in f:
 44 |             sample = json.loads(sample_str)
 45 |             prompt = sample['prompt_text']
 46 |             if prompt:
 47 |                 if data_args.output_syllables:
 48 |                     # Variant that generates a chain of syllables
 49 |                     lines = []
 50 |                     for line in sample['output'].split(''):
 51 |                         line = line.strip()
 52 |                         tokens = line.split(' ')
 53 |                         tokens = tokens[::-1]
 54 |                         line = ' '.join(tokens)
 55 |                         line = line.replace(' | ', '|')
 56 |                         line = line.replace(' ', '\u2010')
 57 |                         line = line.replace('|', ' ')
 58 |                         lines.append(line)
 59 |                     output_text = '\n'.join(lines)
 60 |                 else:
 61 |                     # Plain text is generated.
 62 |                     output_text = sample['output_text']
 63 | 
 64 |                 # 29.04.2023: limit to the first 2 quatrains
 65 |                 output_text = '\n\n'.join(output_text.split('\n\n')[:2])
 66 | 
 67 |                 input_tokens = tokenizer.encode(prompt, add_special_tokens=False)
 68 |                 output_tokens = tokenizer.encode(output_text, add_special_tokens=False)
 69 |                 samples.append((input_tokens, output_tokens, prompt, output_text))
 70 | 
 71 |                 if data_args.max_samples > 0 and len(samples) >= data_args.max_samples:
 72 |                     break
 73 | 
 74 |     return samples
 75 | 
 76 | 
 77 | class FinetuneDataset(Dataset):
 78 |     def __init__(self, samples, tokenizer):
 79 |         self.tokenizer = tokenizer
 80 |         self.max_len = 0
 81 |         self.samples = []
 82 | 
 83 |         self.bos_token_id = tokenizer.bos_token_id
 84 |         self.eos_token_id = tokenizer.eos_token_id
 85 |         assert len(tokenizer.encode('#', add_special_tokens=False)) == 1
 86 |         self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0]
 87 |         self.pad_token_id = tokenizer.pad_token_id
 88 | 
 89 |         for src_ids, output_ids, src_text, output_text in samples:
 90 |             input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id]
 91 | 
 92 |             # Prompt tokens get label=-100
 93 |             labels = [-100] + [-100]*len(src_ids) + [-100] + output_ids + [self.eos_token_id]
 94 | 
 95 |             attention_map = [1] * len(labels)
 96 | 
 97 |             self.samples.append((input_ids, labels, attention_map))
 98 |             self.max_len = max(self.max_len, len(input_ids))
 99 | 
100 |     def __len__(self):
101 |         return len(self.samples)
102 | 
103 |     def __getitem__(self, index: int):
104 |         input_ids, labels, attention_map = self.samples[index]
105 |         npad = self.max_len - len(input_ids)
106 |         input_ids = input_ids + npad*[self.pad_token_id]
107 |         labels = labels + [-100] * npad
108 |         attention_mask = attention_map + [0] * npad
109 |         return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask}
110 | 
111 | 
112 | @dataclass
113 | class ModelArguments:
114 |     """
115 |     Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
116 |     """
117 |     model_name_or_path: Optional[str] = field(
118 |         default='ai-forever/rugpt13b',
119 |         metadata={"help": "The model checkpoint for weights initialization."},
120 |     )
121 | 
122 | 
123 | @dataclass
124 | class DataSetArguments:
125 |     output_syllables: Optional[bool] = field(
126 |         metadata={"help": "Syllabo-tonic representation of the output text (true) or plain text (false)"}
127 |     )
128 | 
129 |     dataset_path: Optional[str] = field(
130 |         default=os.path.join(proj_dir, 'tmp', 'лирика.jsonl'),
131 |         metadata={"help": "Path to the dataset with poems"}
132 |     )
133 | 
134 |     max_samples: Optional[int] = field(
135 |         default=-1,
136 |         metadata={"help": "Maximum number of samples to read from the dataset"}
137 |     )
138 | 
139 | 
140 | class MyPrinterCallback(TrainerCallback):
141 |     def __init__(self, filepath):
142 |         self.wrt = open(filepath, 'w')
143 | 
144 |     def on_log(self, args, state, control, logs=None, **kwargs):
145 |         if state.is_local_process_zero:
146 |             if 'epoch' in logs and 'loss' in logs:
147 |                 self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss']))
148 |                 self.wrt.flush()
149 | 
150 | 
151 | if __name__ == '__main__':
152 |     parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments))
153 | 
154 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
155 | 
156 |     if not training_args.output_dir:
157 |         training_args.output_dir = os.path.join(proj_dir, 'tmp', 'verses_rugpt13b_lora')
158 | 
159 |     # Setup logging
160 |     logging.basicConfig(
161 |         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
162 |         datefmt="%m/%d/%Y %H:%M:%S",
163 |         handlers=[logging.StreamHandler(sys.stdout)],
164 |     )
165 | 
166 |     log_level = training_args.get_process_log_level()
167 |     logger = logging.getLogger(__name__)
168 |     logger.setLevel(log_level)
169 |     #datasets.utils.logging.set_verbosity(log_level)
170 |     transformers.utils.logging.set_verbosity(log_level)
171 |     transformers.utils.logging.enable_default_handler()
172 |     transformers.utils.logging.enable_explicit_format()
173 | 
174 |     logger.info(
175 |         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
176 |         + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
177 |     )
178 |     logger.info(f"Training/evaluation parameters {training_args}")
179 | 
180 |     # Remove old tensorboard logs
181 |     if training_args.local_rank in (-1, 0):
182 |         for f in glob.glob(training_args.output_dir+'/*'):
183 |             if os.path.isfile(f):
184 |                 os.remove(f)
185 | 
186 |         tensorboard_dir = os.path.join(training_args.output_dir, 'runs')
187 |         if os.path.exists(tensorboard_dir):
188 |             logger.info('Removing "%s"', tensorboard_dir)
189 |             shutil.rmtree(tensorboard_dir)
190 | 
191 |     #device = training_args.device
192 |     #logging.info('device={}'.format(device))
193 | 
194 |     logger.info('Loading tokenizer "%s"', model_args.model_name_or_path)
195 | 
196 |     tokenizer = transformers.AutoTokenizer.from_pretrained(model_args.model_name_or_path)
197 | 
198 |     tokenizer.add_special_tokens({'bos_token': '<s>', 'eos_token': '</s>', 'pad_token': '<pad>'})
199 |     tokenizer.save_pretrained(training_args.output_dir)
200 | 
201 |     for t in ['#', '<s>', '</s>', '<pad>']:
202 |         logger.debug('Tokenizer: token=%s ==> %s', t, str(tokenizer.encode(t, add_special_tokens=False)))
203 | 
204 |     logger.info('Loading pretrained model "%s"', model_args.model_name_or_path)
205 |     model = 
transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path) 206 | # model.half() 207 | 208 | if training_args.local_rank in (0, -1): 209 | print('=' * 80) 210 | print_gpu_utilization() 211 | 212 | print('=' * 30 + 'ZeRo 2' + '=' * 30) 213 | deepspeed.runtime.zero.stage_1_and_2.estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=1, 214 | num_nodes=1, 215 | additional_buffer_factor=1.5) 216 | print('=' * 80) 217 | 218 | 219 | logger.info('Loading dataset "%s"', data_args.dataset_path) 220 | train_samples = load_samples(data_args, tokenizer) 221 | logger.info('Training set: %d samples', len(train_samples)) 222 | 223 | train_dataset = FinetuneDataset(train_samples, tokenizer) 224 | 225 | printer = MyPrinterCallback(os.path.join(training_args.output_dir, 'finetune_rugpt13b.loss.log')) 226 | trainer = Trainer( 227 | model=model, 228 | args=training_args, 229 | train_dataset=train_dataset, 230 | tokenizer=tokenizer, 231 | data_collator=None, 232 | callbacks=[printer] 233 | ) 234 | 235 | logger.info('Start training...') 236 | train_result = trainer.train() 237 | 238 | logger.info(f'Saving the model and tokenizer') 239 | trainer.save_model(training_args.output_dir) 240 | 241 | metrics = train_result.metrics 242 | trainer.log_metrics("train", metrics) 243 | trainer.save_metrics("train", metrics) 244 | 245 | logger.info('All done :)') 246 | -------------------------------------------------------------------------------- /poetry/finetune_rugpt13b_lora.py: -------------------------------------------------------------------------------- 1 | """ 2 | Тренировка модели генерации силлабо-тонической поэзии поверх rugpt13B с использованием LoRa. 3 | """ 4 | import glob 5 | import logging 6 | import os 7 | import json 8 | import io 9 | import random 10 | import itertools 11 | import sys 12 | from typing import Any, Dict, List, Optional, Tuple, Union 13 | from dataclasses import dataclass, field 14 | from typing import Any, Dict, List, Optional, Tuple, Union 15 | import shutil 16 | from pathlib import Path 17 | 18 | import numpy as np 19 | import tqdm 20 | import sklearn.model_selection 21 | import torch 22 | import scipy 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | import torch.optim as optim 26 | from torch.utils.tensorboard import SummaryWriter 27 | from torch.utils.data import Dataset, DataLoader 28 | from transformers import AutoModelForCausalLM 29 | import transformers 30 | from transformers import AutoTokenizer 31 | from transformers import TrainingArguments, Trainer, TrainerCallback 32 | from transformers import HfArgumentParser 33 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 34 | from pynvml import * 35 | 36 | 37 | proj_dir = os.path.expanduser('~/polygon/text_generator') 38 | 39 | 40 | def print_gpu_utilization(): 41 | nvmlInit() 42 | handle = nvmlDeviceGetHandleByIndex(0) 43 | info = nvmlDeviceGetMemoryInfo(handle) 44 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") 45 | 46 | 47 | def pad_sequence(sequence, pad_id, max_len): 48 | l = len(sequence) 49 | if l < max_len: 50 | return sequence + [pad_id] * (max_len - l) 51 | else: 52 | return sequence 53 | 54 | 55 | def load_samples(data_args, tokenizer): 56 | samples = [] 57 | with open(data_args.dataset_path, 'r') as f: 58 | for sample_str in f: 59 | sample = json.loads(sample_str) 60 | prompt = sample['prompt_text'] 61 | if prompt: 62 | if data_args.output_syllables: 63 | # Вариант с генерацией цепочки слогов 64 | lines = [] 65 | for line 
in sample['output'].split(''):
 66 |                         line = line.strip()
 67 |                         tokens = line.split(' ')
 68 |                         tokens = tokens[::-1]
 69 |                         line = ' '.join(tokens)
 70 |                         line = line.replace(' | ', '|')
 71 |                         line = line.replace(' ', '\u2010')
 72 |                         line = line.replace('|', ' ')
 73 |                         lines.append(line)
 74 |                     output_text = '\n'.join(lines)
 75 |                 else:
 76 |                     # Plain text is generated.
 77 |                     output_text = sample['output_text']
 78 | 
 79 |                 # 29.04.2023: limit to the first 2 quatrains
 80 |                 output_text = '\n\n'.join(output_text.split('\n\n')[:2])
 81 | 
 82 |                 input_tokens = tokenizer.encode(prompt, add_special_tokens=False)
 83 |                 output_tokens = tokenizer.encode(output_text, add_special_tokens=False)
 84 |                 samples.append((input_tokens, output_tokens, prompt, output_text))
 85 | 
 86 |                 if data_args.max_samples > 0 and len(samples) >= data_args.max_samples:
 87 |                     break
 88 | 
 89 |     return samples
 90 | 
 91 | 
 92 | class FinetuneDataset(Dataset):
 93 |     def __init__(self, samples, tokenizer):
 94 |         self.tokenizer = tokenizer
 95 |         self.max_len = 0
 96 |         self.samples = []
 97 | 
 98 |         self.bos_token_id = tokenizer.bos_token_id
 99 |         self.eos_token_id = tokenizer.eos_token_id
100 |         assert len(tokenizer.encode('#', add_special_tokens=False)) == 1
101 |         self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0]
102 |         self.pad_token_id = tokenizer.pad_token_id
103 | 
104 |         for src_ids, output_ids, src_text, output_text in samples:
105 |             input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id]
106 | 
107 |             # Prompt tokens get label=-100
108 |             labels = [-100] + [-100]*len(src_ids) + [-100] + output_ids + [self.eos_token_id]
109 | 
110 |             attention_map = [1] * len(labels)
111 | 
112 |             self.samples.append((input_ids, labels, attention_map))
113 |             self.max_len = max(self.max_len, len(input_ids))
114 | 
115 |     def __len__(self):
116 |         return len(self.samples)
117 | 
118 |     def __getitem__(self, index: int):
119 |         input_ids, labels, attention_map = self.samples[index]
120 |         npad = self.max_len - len(input_ids)
121 |         input_ids = input_ids + npad*[self.pad_token_id]
122 |         labels = labels + [-100] * npad
123 |         attention_mask = attention_map + [0] * npad
124 |         return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask}
125 | 
126 | 
127 | @dataclass
128 | class ModelArguments:
129 |     """
130 |     Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
131 | """ 132 | model_name_or_path: Optional[str] = field( 133 | default='ai-forever/rugpt13b', 134 | metadata={"help": "The model checkpoint for weights initialization."}, 135 | ) 136 | 137 | 138 | @dataclass 139 | class DataSetArguments: 140 | output_syllables: Optional[bool] = field( 141 | metadata={"help": "Силлабо-тоническое представление выходного текста (true) или обычное (false)"} 142 | ) 143 | 144 | dataset_path: Optional[str] = field( 145 | default=os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 'лирика.jsonl')), 146 | metadata={"help": "Путь к датасету со стихами"} 147 | ) 148 | 149 | max_samples: Optional[int] = field( 150 | default=-1, 151 | metadata={"help": "Максимальное кол-во сэмплов, считываемых из датасета"} 152 | ) 153 | 154 | 155 | class MyPrinterCallback(TrainerCallback): 156 | def __init__(self, filepath): 157 | self.wrt = open(filepath, 'w') 158 | 159 | def on_log(self, args, state, control, logs=None, **kwargs): 160 | if state.is_local_process_zero: 161 | if 'epoch' in logs and 'loss' in logs: 162 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss'])) 163 | self.wrt.flush() 164 | 165 | 166 | if __name__ == '__main__': 167 | parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments)) 168 | 169 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 170 | 171 | if not training_args.output_dir: 172 | training_args.output_dir = os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 'verses_rugpt13b_lora')) 173 | 174 | # Setup logging 175 | logging.basicConfig( 176 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 177 | datefmt="%m/%d/%Y %H:%M:%S", 178 | handlers=[logging.StreamHandler(sys.stdout)], 179 | ) 180 | 181 | log_level = training_args.get_process_log_level() 182 | logger = logging.getLogger(__name__) 183 | logger.setLevel(log_level) 184 | #datasets.utils.logging.set_verbosity(log_level) 185 | transformers.utils.logging.set_verbosity(log_level) 186 | transformers.utils.logging.enable_default_handler() 187 | transformers.utils.logging.enable_explicit_format() 188 | 189 | logger.info( 190 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 191 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 192 | ) 193 | logger.info(f"Training/evaluation parameters {training_args}") 194 | 195 | # Удаляем старые логи tensorboard 196 | if training_args.local_rank in (-1, 0): 197 | for f in glob.glob(training_args.output_dir+'/*'): 198 | if os.path.isfile(f): 199 | os.remove(f) 200 | 201 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs') 202 | if os.path.exists(tensorboard_dir): 203 | logger.info('Removing "%s"', tensorboard_dir) 204 | shutil.rmtree(tensorboard_dir) 205 | 206 | device = training_args.device 207 | logging.info('device={}'.format(device)) 208 | 209 | logger.info('Loading tokenizer "%s"', model_args.model_name_or_path) 210 | 211 | tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_args.model_name_or_path) 212 | 213 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 214 | tokenizer.save_pretrained(training_args.output_dir) 215 | 216 | for t in ['#', '', '', '']: 217 | logger.debug('Tokenizer: token=%s ==> %s', t, str(tokenizer.encode(t, add_special_tokens=False))) 218 | 219 | logger.info('Loading pretrained model "%s"', model_args.model_name_or_path) 220 | model = 
transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path) 221 | 222 | logger.info('Wrapping model to peft...') 223 | lora_config = LoraConfig(**{ 224 | "r": 8, 225 | "lora_alpha": 16, 226 | "lora_dropout": 0.10, 227 | "bias": "none", 228 | #"target_modules": ["q_proj", "v_proj"], 229 | "task_type": "CAUSAL_LM" 230 | }) 231 | model = get_peft_model(model, lora_config) 232 | print(model.print_trainable_parameters()) 233 | 234 | logger.info('Loading dataset "%s"', data_args.dataset_path) 235 | train_samples = load_samples(data_args, tokenizer) 236 | logger.info('Training set: %d samples', len(train_samples)) 237 | 238 | train_dataset = FinetuneDataset(train_samples, tokenizer) 239 | 240 | printer = MyPrinterCallback(os.path.join(proj_dir, 'tmp', 'finetune_rugpt13b_lora.loss.log')) 241 | trainer = Trainer( 242 | model=model, 243 | args=training_args, 244 | train_dataset=train_dataset, 245 | tokenizer=tokenizer, 246 | data_collator=None, 247 | callbacks=[printer] 248 | ) 249 | 250 | logger.info('Start training for local_rank=%d...', training_args.local_rank) 251 | train_result = trainer.train() 252 | 253 | if training_args.local_rank in (0, -1): 254 | logger.info(f'Saving the model and tokenizer') 255 | model.save_pretrained(save_directory=training_args.output_dir) 256 | 257 | metrics = train_result.metrics 258 | trainer.log_metrics("train", metrics) 259 | trainer.save_metrics("train", metrics) 260 | 261 | logger.info('All done :)') 262 | -------------------------------------------------------------------------------- /poetry/finetune_llama.py: -------------------------------------------------------------------------------- 1 | """ 2 | Эксперимент с файнтюном модели LLaMa на стихах. 3 | """ 4 | 5 | import glob 6 | import logging 7 | import os 8 | import json 9 | import io 10 | import random 11 | import itertools 12 | import sys 13 | from typing import Any, Dict, List, Optional, Tuple, Union 14 | from dataclasses import dataclass, field 15 | from typing import Any, Dict, List, Optional, Tuple, Union 16 | import shutil 17 | from pathlib import Path 18 | 19 | import numpy as np 20 | import tqdm 21 | import sklearn.model_selection 22 | import torch 23 | import scipy 24 | import torch.nn as nn 25 | import torch.nn.functional as F 26 | import torch.optim as optim 27 | from torch.utils.tensorboard import SummaryWriter 28 | from torch.utils.data import Dataset, DataLoader 29 | from transformers import AutoModelForCausalLM 30 | import transformers 31 | from transformers import AutoTokenizer 32 | from transformers import TrainingArguments, Trainer, TrainerCallback 33 | from transformers import HfArgumentParser 34 | from pynvml import * 35 | import deepspeed 36 | 37 | proj_dir = os.path.expanduser('~/polygon/text_generator') 38 | 39 | 40 | def print_gpu_utilization(): 41 | nvmlInit() 42 | handle = nvmlDeviceGetHandleByIndex(0) 43 | info = nvmlDeviceGetMemoryInfo(handle) 44 | # logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") 45 | print(f"GPU memory occupied: {info.used // 1024 ** 2} MB.") 46 | 47 | 48 | def pad_sequence(sequence, pad_id, max_len): 49 | l = len(sequence) 50 | if l < max_len: 51 | return sequence + [pad_id] * (max_len - l) 52 | else: 53 | return sequence 54 | 55 | 56 | def load_samples(data_args, tokenizer): 57 | samples = [] 58 | with open(data_args.dataset_path, 'r') as f: 59 | for sample_str in f: 60 | sample = json.loads(sample_str) 61 | prompt = sample['prompt_text'] 62 | if prompt: 63 | if data_args.output_syllables: 64 | # Вариант с генерацией 
цепочки слогов 65 | lines = [] 66 | for line in sample['output'].split(''): 67 | line = line.strip() 68 | tokens = line.split(' ') 69 | tokens = tokens[::-1] 70 | line = ' '.join(tokens) 71 | line = line.replace(' | ', '|') 72 | line = line.replace(' ', '\u2010') 73 | line = line.replace('|', ' ') 74 | lines.append(line) 75 | output_text = '\n'.join(lines) 76 | else: 77 | output_text = sample['output_text'] 78 | 79 | # 29.04.2023 ограничим 2 первым катренами 80 | output_text = '\n\n'.join(output_text.split('\n\n')[:2]) 81 | 82 | input_tokens = tokenizer.encode(prompt, add_special_tokens=False) 83 | output_tokens = tokenizer.encode(output_text, add_special_tokens=False) 84 | samples.append((input_tokens, output_tokens, prompt, output_text)) 85 | 86 | if data_args.max_samples > 0 and len(samples) >= data_args.max_samples: 87 | break 88 | 89 | return samples 90 | 91 | 92 | class FinetuneDataset(Dataset): 93 | def __init__(self, samples, tokenizer): 94 | self.tokenizer = tokenizer 95 | self.max_len = 0 96 | self.samples = [] 97 | 98 | self.bos_token_id = tokenizer.bos_token_id 99 | self.eos_token_id = tokenizer.eos_token_id 100 | assert (len(tokenizer.encode('#', add_special_tokens=False)) == 1) 101 | self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0] 102 | self.pad_token_id = tokenizer.pad_token_id 103 | 104 | for src_ids, output_ids, src_text, output_text in samples: 105 | input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id] 106 | 107 | # Токены затравки дают label=-100 108 | labels = [-100] + [-100] * len(src_ids) + [-100] + output_ids + [self.eos_token_id] 109 | 110 | attention_map = [1] * len(labels) 111 | 112 | self.samples.append((input_ids, labels, attention_map)) 113 | self.max_len = max(self.max_len, len(input_ids)) 114 | 115 | def __len__(self): 116 | return len(self.samples) 117 | 118 | def __getitem__(self, index: int): 119 | input_ids, labels, attention_map = self.samples[index] 120 | npad = self.max_len - len(input_ids) 121 | input_ids = input_ids + npad * [self.pad_token_id] 122 | labels = labels + [-100] * npad 123 | attention_mask = attention_map + [0] * npad 124 | return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask} 125 | 126 | 127 | @dataclass 128 | class ModelArguments: 129 | """ 130 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 131 | """ 132 | model_name_or_path: Optional[str] = field( 133 | default='decapoda-research/llama-7b-hf', 134 | metadata={"help": "The model checkpoint for weights initialization."}, 135 | ) 136 | 137 | load_in_8bit: Optional[bool] = field( 138 | default=False, 139 | metadata={"help": "Load model in 8-bit"}, 140 | ) 141 | 142 | 143 | @dataclass 144 | class DataSetArguments: 145 | """ 146 | Arguments pertaining to what data we are going to input our model for training and eval. 
147 | """ 148 | dataset_path: Optional[str] = field( 149 | default=os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 'лирика.jsonl')), 150 | metadata={"help": "Путь к датасету со стихами"} 151 | ) 152 | 153 | output_syllables: Optional[bool] = field( 154 | default=False, 155 | metadata={"help": "Силлабо-тоническое представление выходного текста"} 156 | ) 157 | 158 | max_samples: Optional[int] = field( 159 | default=-1, 160 | metadata={"help": "Максимальное кол-во сэмплов, считываемых из датасета"} 161 | ) 162 | 163 | 164 | class MyPrinterCallback(TrainerCallback): 165 | def __init__(self, filepath): 166 | self.wrt = open(filepath, 'w') 167 | 168 | def on_log(self, args, state, control, logs=None, **kwargs): 169 | if state.is_local_process_zero: 170 | if 'epoch' in logs and 'loss' in logs: 171 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss'])) 172 | self.wrt.flush() 173 | 174 | 175 | if __name__ == '__main__': 176 | parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments)) 177 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 178 | 179 | # Setup logging 180 | logging.basicConfig( 181 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 182 | datefmt="%d.%m.%Y %H:%M:%S", 183 | handlers=[logging.StreamHandler(sys.stdout)], 184 | ) 185 | 186 | log_level = training_args.get_process_log_level() 187 | logger = logging.getLogger(__name__) 188 | logger.setLevel(log_level) 189 | # datasets.utils.logging.set_verbosity(log_level) 190 | transformers.utils.logging.set_verbosity(log_level) 191 | transformers.utils.logging.enable_default_handler() 192 | transformers.utils.logging.enable_explicit_format() 193 | 194 | logger.info( 195 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 196 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 197 | ) 198 | logger.info(f"Training/evaluation parameters {training_args}") 199 | 200 | # Удаляем старые логи tensorboard 201 | if training_args.local_rank in (-1, 0): 202 | for f in glob.glob(training_args.output_dir + '/*'): 203 | if os.path.isfile(f): 204 | os.remove(f) 205 | 206 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs') 207 | if os.path.exists(tensorboard_dir): 208 | logger.info('Removing "%s"', tensorboard_dir) 209 | shutil.rmtree(tensorboard_dir) 210 | 211 | logger.info('Loading pretrained model "%s"', model_args.model_name_or_path) 212 | model = transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, 213 | # load_in_8bit=model_args.load_in_8bit, 214 | # device_map="auto" 215 | ) 216 | #model.half() 217 | #model.to(device) 218 | 219 | if training_args.local_rank in (0, -1): 220 | print('=' * 80) 221 | print_gpu_utilization() 222 | 223 | if training_args.deepspeed: 224 | print('=' * 30 + 'ZeRo 2' + '=' * 30) 225 | deepspeed.runtime.zero.stage_1_and_2.estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=1, 226 | num_nodes=1, 227 | additional_buffer_factor=1.5) 228 | print('=' * 80) 229 | 230 | # ------------------------- ТОКЕНИЗАТОР ---------------------------------- 231 | logger.info('Loading tokenizer "%s"', model_args.model_name_or_path) 232 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_args.model_name_or_path) 233 | 234 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 235 | 236 | for t in ['#', '', '', '']: 237 | logger.debug('Tokenizer: token=%s 
==> %s', t, str(tokenizer.encode(t, add_special_tokens=False))) 238 | 239 | tokenizer.save_pretrained(training_args.output_dir) 240 | 241 | logger.info('Loading dataset "%s"', data_args.dataset_path) 242 | train_samples = load_samples(data_args, tokenizer) 243 | logger.info('Training set: %d samples', len(train_samples)) 244 | 245 | train_dataset = FinetuneDataset(train_samples, tokenizer) 246 | 247 | printer = MyPrinterCallback(os.path.join(proj_dir, 'tmp', 'finetune_llama.loss.log')) 248 | trainer = Trainer( 249 | model=model, 250 | args=training_args, 251 | train_dataset=train_dataset, 252 | tokenizer=tokenizer, 253 | data_collator=None, 254 | callbacks=[printer] 255 | ) 256 | 257 | logger.info('Start training...') 258 | train_result = trainer.train() 259 | 260 | #trainer.save_model(output_dir=training_args.output_dir) 261 | if training_args.local_rank in (0, -1): 262 | logger.info(f'Saving the model and tokenizer') 263 | model.save_pretrained(save_directory=training_args.output_dir) 264 | 265 | metrics = train_result.metrics 266 | trainer.log_metrics("train", metrics) 267 | trainer.save_metrics("train", metrics) 268 | 269 | logger.info('All done :)') 270 | -------------------------------------------------------------------------------- /poetry/finetune_rugpt_with_prompt_masking.py: -------------------------------------------------------------------------------- 1 | """ 2 | Тренировка модели генерации стихов поверх rugpt*** с исключением обратного распространения на токенах затравки. 3 | """ 4 | import glob 5 | import logging 6 | import os 7 | import json 8 | import io 9 | import random 10 | import itertools 11 | import sys 12 | from typing import Any, Dict, List, Optional, Tuple, Union 13 | from dataclasses import dataclass, field 14 | from typing import Any, Dict, List, Optional, Tuple, Union 15 | import shutil 16 | from pathlib import Path 17 | 18 | import numpy as np 19 | import tqdm 20 | import sklearn.model_selection 21 | import torch 22 | import scipy 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | import torch.optim as optim 26 | from torch.utils.tensorboard import SummaryWriter 27 | from torch.utils.data import Dataset, DataLoader 28 | from transformers import AutoModelForCausalLM 29 | import transformers 30 | from transformers import AutoTokenizer 31 | from transformers import TrainingArguments, Trainer, TrainerCallback 32 | from transformers import HfArgumentParser 33 | from pynvml import * 34 | 35 | 36 | proj_dir = os.path.expanduser('~/polygon/text_generator') 37 | 38 | 39 | def print_gpu_utilization(): 40 | nvmlInit() 41 | handle = nvmlDeviceGetHandleByIndex(0) 42 | info = nvmlDeviceGetMemoryInfo(handle) 43 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") 44 | 45 | 46 | def pad_sequence(sequence, pad_id, max_len): 47 | l = len(sequence) 48 | if l < max_len: 49 | return sequence + [pad_id] * (max_len - l) 50 | else: 51 | return sequence 52 | 53 | 54 | def load_samples(data_args, tokenizer, model_args): 55 | samples = [] 56 | with open(data_args.dataset_path, 'r') as f: 57 | for sample_str in f: 58 | sample = json.loads(sample_str) 59 | prompt = sample['prompt_text'] 60 | if prompt: 61 | if data_args.output_syllables: 62 | # Вариант с генерацией цепочки слогов 63 | lines = [] 64 | for line in sample['output'].split(''): 65 | line = line.strip() 66 | tokens = line.split(' ') 67 | tokens = tokens[::-1] 68 | line = ' '.join(tokens) 69 | line = line.replace(' | ', '|') 70 | line = line.replace(' ', '\u2010') 71 | line = line.replace('|', ' ') 
72 | lines.append(line) 73 | output_text = '\n'.join(lines) 74 | else: 75 | output_text = sample['output_text'] 76 | 77 | # 29.04.2023 ограничим 2 первым катренами 78 | output_text = '\n\n'.join(output_text.split('\n\n')[:2]) 79 | 80 | if 'xglm' in model_args.model_name_or_path.lower(): 81 | # 21.05.2023 почему-то токенизатор XGLM иногда теряет переводы строк. 82 | # Поэтому заменим на особое сочетание, которое при генерации будем заменять обратно на \n 83 | output_text = output_text.replace('\n', '\\n') 84 | 85 | input_tokens = tokenizer.encode(prompt, add_special_tokens=False) 86 | output_tokens = tokenizer.encode(output_text, add_special_tokens=False) 87 | samples.append((input_tokens, output_tokens, prompt, output_text)) 88 | 89 | if data_args.max_samples > 0 and len(samples) >= data_args.max_samples: 90 | break 91 | 92 | return samples 93 | 94 | 95 | class FinetuneDataset(Dataset): 96 | def __init__(self, samples, tokenizer): 97 | self.tokenizer = tokenizer 98 | self.max_len = 0 99 | self.samples = [] 100 | 101 | self.bos_token_id = tokenizer.bos_token_id 102 | self.eos_token_id = tokenizer.eos_token_id 103 | assert(len(tokenizer.encode('#', add_special_tokens=False)) == 1) 104 | self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0] 105 | self.pad_token_id = tokenizer.pad_token_id 106 | 107 | for src_ids, output_ids, src_text, output_text in samples: 108 | input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id] 109 | 110 | # Токены затравки дают label=-100 111 | labels = [-100] + [-100]*len(src_ids) + [-100] + output_ids + [self.eos_token_id] 112 | 113 | attention_map = [1] * len(labels) 114 | 115 | self.samples.append((input_ids, labels, attention_map)) 116 | self.max_len = max(self.max_len, len(input_ids)) 117 | 118 | def __len__(self): 119 | return len(self.samples) 120 | 121 | def __getitem__(self, index: int): 122 | input_ids, labels, attention_map = self.samples[index] 123 | npad = self.max_len - len(input_ids) 124 | input_ids = input_ids + npad*[self.pad_token_id] 125 | labels = labels + [-100] * npad 126 | attention_mask = attention_map + [0] * npad 127 | return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask} 128 | 129 | 130 | 131 | @dataclass 132 | class ModelArguments: 133 | """ 134 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 135 | """ 136 | model_name_or_path: Optional[str] = field( 137 | default='sberbank-ai/rugpt3large_based_on_gpt2', 138 | metadata={"help": "The model checkpoint for weights initialization."}, 139 | ) 140 | 141 | tokenizer_path: Optional[str] = field( 142 | default='sberbank-ai/rugpt3large_based_on_gpt2', 143 | metadata={"help": "Path to tokenizer."}, 144 | ) 145 | 146 | 147 | @dataclass 148 | class DataSetArguments: 149 | """ 150 | Arguments pertaining to what data we are going to input our model for training and eval. 
151 | """ 152 | dataset_path: Optional[str] = field( 153 | default=os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 'пирожки.jsonl')), 154 | metadata={"help": "Путь к датасету со стихами"} 155 | ) 156 | 157 | output_syllables: Optional[bool] = field( 158 | default=False, 159 | metadata={"help": "Силлабо-тоническое представление выходного текста"} 160 | ) 161 | 162 | max_samples: Optional[int] = field( 163 | default=-1, 164 | metadata={"help": "Максимальное кол-во сэмплов, считываемых из датасета"} 165 | ) 166 | 167 | 168 | class MyPrinterCallback(TrainerCallback): 169 | def __init__(self, filepath): 170 | self.wrt = open(filepath, 'w') 171 | 172 | def on_log(self, args, state, control, logs=None, **kwargs): 173 | if state.is_local_process_zero: 174 | if 'epoch' in logs and 'loss' in logs: 175 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss'])) 176 | self.wrt.flush() 177 | 178 | 179 | if __name__ == '__main__': 180 | parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments)) 181 | 182 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 183 | 184 | if not training_args.output_dir: 185 | training_args.output_dir = os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 'verses_pirozhki_rugpt')) 186 | 187 | verbose = training_args.local_rank in (-1, 0) 188 | 189 | # Setup logging 190 | logging.basicConfig( 191 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 192 | datefmt="%m/%d/%Y %H:%M:%S", 193 | handlers=[logging.StreamHandler(sys.stdout)], 194 | ) 195 | 196 | log_level = training_args.get_process_log_level() 197 | logger = logging.getLogger(__name__) 198 | logger.setLevel(log_level) 199 | #datasets.utils.logging.set_verbosity(log_level) 200 | transformers.utils.logging.set_verbosity(log_level) 201 | transformers.utils.logging.enable_default_handler() 202 | transformers.utils.logging.enable_explicit_format() 203 | 204 | logger.info( 205 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 206 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 207 | ) 208 | logger.info(f"Training/evaluation parameters {training_args}") 209 | 210 | # Удаляем старые логи tensorboard 211 | if training_args.local_rank in (-1, 0): 212 | for f in glob.glob(training_args.output_dir+'/*'): 213 | if os.path.isfile(f): 214 | os.remove(f) 215 | 216 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs') 217 | if os.path.exists(tensorboard_dir): 218 | logger.info('Removing "%s"', tensorboard_dir) 219 | shutil.rmtree(tensorboard_dir) 220 | 221 | device = training_args.device 222 | logging.info('device={}'.format(device)) 223 | 224 | if not model_args.tokenizer_path: 225 | model_args.tokenizer_path = model_args.model_name_or_path 226 | 227 | logger.info('Loading tokenizer "%s"', model_args.tokenizer_path) 228 | 229 | if 'llama' in model_args.tokenizer_path: 230 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_args.tokenizer_path) 231 | else: 232 | #tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_args.tokenizer_path) 233 | tokenizer = transformers.AutoTokenizer.from_pretrained(model_args.tokenizer_path) 234 | 235 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 236 | tokenizer.save_pretrained(training_args.output_dir) 237 | 238 | for t in ['#', '', '', '']: 239 | logger.debug('Tokenizer: token=%s ==> %s', t, str(tokenizer.encode(t, 
add_special_tokens=False))) 240 | 241 | logger.info('Loading pretrained model "%s"', model_args.model_name_or_path) 242 | model = transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path) 243 | model.to(device) 244 | 245 | logger.info('Loading dataset "%s"', data_args.dataset_path) 246 | train_samples = load_samples(data_args, tokenizer, model_args) 247 | logger.info('Training set: %d samples', len(train_samples)) 248 | 249 | train_dataset = FinetuneDataset(train_samples, tokenizer) 250 | 251 | printer = MyPrinterCallback(os.path.join(training_args.output_dir, 'finetune_rugpt_with_prompt_masking.loss.log')) 252 | trainer = Trainer( 253 | model=model, 254 | args=training_args, 255 | train_dataset=train_dataset, 256 | tokenizer=tokenizer, 257 | data_collator=None, 258 | callbacks=[printer] 259 | ) 260 | 261 | logger.info('Start training...') 262 | train_result = trainer.train() 263 | 264 | logger.info(f'Saving the model and tokenizer') 265 | trainer.save_model(output_dir=training_args.output_dir) 266 | 267 | metrics = train_result.metrics 268 | trainer.log_metrics("train", metrics) 269 | trainer.save_metrics("train", metrics) 270 | 271 | logger.info('All done :)') 272 | -------------------------------------------------------------------------------- /poetry/finetune_llama_lora.py: -------------------------------------------------------------------------------- 1 | """ 2 | Эксперимент с файнтюном на стихах модели LLaMa + LoRa. 3 | """ 4 | import glob 5 | import logging 6 | import os 7 | import json 8 | import io 9 | import random 10 | import itertools 11 | import sys 12 | from typing import Any, Dict, List, Optional, Tuple, Union 13 | from dataclasses import dataclass, field 14 | from typing import Any, Dict, List, Optional, Tuple, Union 15 | import shutil 16 | from pathlib import Path 17 | 18 | import numpy as np 19 | import tqdm 20 | import sklearn.model_selection 21 | import torch 22 | import scipy 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | import torch.optim as optim 26 | from torch.utils.tensorboard import SummaryWriter 27 | from torch.utils.data import Dataset, DataLoader 28 | from transformers import AutoModelForCausalLM 29 | import transformers 30 | from transformers import AutoTokenizer 31 | from transformers import TrainingArguments, Trainer, TrainerCallback 32 | from transformers import HfArgumentParser 33 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 34 | from pynvml import * 35 | 36 | 37 | proj_dir = os.path.expanduser('~/polygon/text_generator') 38 | 39 | 40 | def print_gpu_utilization(): 41 | nvmlInit() 42 | handle = nvmlDeviceGetHandleByIndex(0) 43 | info = nvmlDeviceGetMemoryInfo(handle) 44 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") 45 | 46 | 47 | def pad_sequence(sequence, pad_id, max_len): 48 | l = len(sequence) 49 | if l < max_len: 50 | return sequence + [pad_id] * (max_len - l) 51 | else: 52 | return sequence 53 | 54 | 55 | def load_samples(data_args, tokenizer): 56 | samples = [] 57 | with open(data_args.dataset_path, 'r') as f: 58 | for sample_str in f: 59 | sample = json.loads(sample_str) 60 | prompt = sample['prompt_text'] 61 | if prompt: 62 | if data_args.output_syllables: 63 | # Вариант с генерацией цепочки слогов 64 | lines = [] 65 | for line in sample['output'].split(''): 66 | line = line.strip() 67 | tokens = line.split(' ') 68 | tokens = tokens[::-1] 69 | line = ' '.join(tokens) 70 | line = line.replace(' | ', '|') 71 | line = line.replace(' ', '\u2010') 72 | 
line = line.replace('|', ' ') 73 | lines.append(line) 74 | output_text = '\n'.join(lines) 75 | else: 76 | output_text = sample['output_text'] 77 | 78 | # 29.04.2023 ограничим 2 первым катренами 79 | output_text = '\n\n'.join(output_text.split('\n\n')[:2]) 80 | 81 | input_tokens = tokenizer.encode(prompt, add_special_tokens=False) 82 | output_tokens = tokenizer.encode(output_text, add_special_tokens=False) 83 | samples.append((input_tokens, output_tokens, prompt, output_text)) 84 | 85 | if data_args.max_samples > 0 and len(samples) >= data_args.max_samples: 86 | break 87 | 88 | return samples 89 | 90 | 91 | class FinetuneDataset(Dataset): 92 | def __init__(self, samples, tokenizer): 93 | self.tokenizer = tokenizer 94 | self.max_len = 0 95 | self.samples = [] 96 | 97 | self.bos_token_id = tokenizer.bos_token_id 98 | self.eos_token_id = tokenizer.eos_token_id 99 | assert(len(tokenizer.encode('#', add_special_tokens=False)) == 1) 100 | self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0] 101 | self.pad_token_id = tokenizer.pad_token_id 102 | 103 | for src_ids, output_ids, src_text, output_text in samples: 104 | input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id] 105 | 106 | # Токены затравки дают label=-100 107 | labels = [-100] + [-100]*len(src_ids) + [-100] + output_ids + [self.eos_token_id] 108 | 109 | attention_map = [1] * len(labels) 110 | 111 | self.samples.append((input_ids, labels, attention_map)) 112 | self.max_len = max(self.max_len, len(input_ids)) 113 | 114 | def __len__(self): 115 | return len(self.samples) 116 | 117 | def __getitem__(self, index: int): 118 | input_ids, labels, attention_map = self.samples[index] 119 | npad = self.max_len - len(input_ids) 120 | input_ids = input_ids + npad*[self.pad_token_id] 121 | labels = labels + [-100] * npad 122 | attention_mask = attention_map + [0] * npad 123 | return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask} 124 | 125 | 126 | 127 | @dataclass 128 | class ModelArguments: 129 | """ 130 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 131 | """ 132 | model_name_or_path: Optional[str] = field( 133 | default='decapoda-research/llama-7b-hf', 134 | metadata={"help": "The model checkpoint for weights initialization."}, 135 | ) 136 | 137 | load_in_8bit: Optional[bool] = field( 138 | default=False, 139 | metadata={"help": "Load model in 8-bit"}, 140 | ) 141 | 142 | 143 | @dataclass 144 | class DataSetArguments: 145 | """ 146 | Arguments pertaining to what data we are going to input our model for training and eval. 
147 | """ 148 | dataset_path: Optional[str] = field( 149 | default=os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 'лирика.jsonl')), 150 | metadata={"help": "Путь к датасету со стихами"} 151 | ) 152 | 153 | output_syllables: Optional[bool] = field( 154 | default=False, 155 | metadata={"help": "Силлабо-тоническое представление выходного текста"} 156 | ) 157 | 158 | max_samples: Optional[int] = field( 159 | default=-1, 160 | metadata={"help": "Максимальное кол-во сэмплов, считываемых из датасета"} 161 | ) 162 | 163 | 164 | @dataclass 165 | class LoRaArguments: 166 | # "r": 8, 167 | # "lora_alpha": 16, 168 | # "lora_dropout": 0.10, 169 | lora_r: Optional[int] = field( 170 | default=8 171 | ) 172 | 173 | lora_alpha: Optional[int] = field( 174 | default=16 175 | ) 176 | 177 | lora_dropout: Optional[float] = field( 178 | default=0.10 179 | ) 180 | 181 | 182 | class MyPrinterCallback(TrainerCallback): 183 | def __init__(self, filepath): 184 | self.wrt = open(filepath, 'w') 185 | 186 | def on_log(self, args, state, control, logs=None, **kwargs): 187 | if state.is_local_process_zero: 188 | if 'epoch' in logs and 'loss' in logs: 189 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss'])) 190 | self.wrt.flush() 191 | 192 | 193 | def fix_model(model, tokenizer, use_resize=True): 194 | model.config.pad_token_id = tokenizer.pad_token_id 195 | assert model.config.pad_token_id is not None 196 | 197 | bos_candidates = ( 198 | tokenizer.bos_token_id, 199 | tokenizer.cls_token_id, 200 | tokenizer.sep_token_id, 201 | tokenizer.unk_token_id 202 | ) 203 | for bos_candidate in bos_candidates: 204 | model.config.bos_token_id = bos_candidate 205 | if bos_candidate is not None: 206 | break 207 | assert model.config.bos_token_id is not None 208 | model.config.decoder_start_token_id = model.config.bos_token_id 209 | 210 | eos_candidates = (tokenizer.eos_token_id, tokenizer.sep_token_id) 211 | for eos_candidate in eos_candidates: 212 | model.config.eos_token_id = eos_candidate 213 | if eos_candidate is not None: 214 | break 215 | assert model.config.eos_token_id is not None 216 | if use_resize: 217 | model.resize_token_embeddings(len(tokenizer)) 218 | 219 | return model 220 | 221 | 222 | if __name__ == '__main__': 223 | parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments, LoRaArguments)) 224 | model_args, data_args, training_args, lora_args = parser.parse_args_into_dataclasses() 225 | 226 | lora_config = LoraConfig(**{ 227 | "r": lora_args.lora_r, 228 | "lora_alpha": lora_args.lora_alpha, 229 | "lora_dropout": lora_args.lora_dropout, 230 | "bias": "none", 231 | "target_modules": ["q_proj", "v_proj"], 232 | "task_type": "CAUSAL_LM" 233 | }) 234 | 235 | # Setup logging 236 | logging.basicConfig( 237 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 238 | datefmt="%d.%m.%Y %H:%M:%S", 239 | handlers=[logging.StreamHandler(sys.stdout)], 240 | ) 241 | 242 | log_level = training_args.get_process_log_level() 243 | logger = logging.getLogger(__name__) 244 | logger.setLevel(log_level) 245 | #datasets.utils.logging.set_verbosity(log_level) 246 | transformers.utils.logging.set_verbosity(log_level) 247 | transformers.utils.logging.enable_default_handler() 248 | transformers.utils.logging.enable_explicit_format() 249 | 250 | logger.info( 251 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 252 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 253 | ) 254 
| logger.info(f"Training/evaluation parameters {training_args}") 255 | 256 | # Удаляем старые логи tensorboard 257 | if training_args.local_rank in (-1, 0): 258 | for f in glob.glob(training_args.output_dir+'/*'): 259 | if os.path.isfile(f): 260 | os.remove(f) 261 | 262 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs') 263 | if os.path.exists(tensorboard_dir): 264 | logger.info('Removing "%s"', tensorboard_dir) 265 | shutil.rmtree(tensorboard_dir) 266 | 267 | #device = training_args.device 268 | #logging.info('device={}'.format(device)) 269 | 270 | logger.info('Loading tokenizer "%s"', model_args.model_name_or_path) 271 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_args.model_name_or_path) 272 | 273 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 274 | 275 | for t in ['#', '', '', '']: 276 | logger.debug('Tokenizer: token=%s ==> %s', t, str(tokenizer.encode(t, add_special_tokens=False))) 277 | 278 | logger.info('Loading pretrained model "%s"', model_args.model_name_or_path) 279 | model = transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, 280 | load_in_8bit=model_args.load_in_8bit, 281 | #device_map="auto" 282 | ) 283 | 284 | #model.config.save_pretrained(training_args.output_dir) 285 | 286 | if model_args.load_in_8bit: 287 | model = fix_model(model, tokenizer, use_resize=False) 288 | model = prepare_model_for_int8_training(model) 289 | 290 | #model.to(device) 291 | 292 | tokenizer.save_pretrained(training_args.output_dir) 293 | 294 | logger.info('Wrapping LLaMa to peft...') 295 | model = get_peft_model(model, lora_config) 296 | 297 | #model.save_pretrained(training_args.output_dir) 298 | 299 | logger.info('Loading dataset "%s"', data_args.dataset_path) 300 | train_samples = load_samples(data_args, tokenizer) 301 | logger.info('Training set: %d samples', len(train_samples)) 302 | 303 | train_dataset = FinetuneDataset(train_samples, tokenizer) 304 | 305 | printer = MyPrinterCallback(os.path.join(proj_dir, 'tmp', 'finetune_llama.loss.log')) 306 | trainer = Trainer( 307 | model=model, 308 | args=training_args, 309 | train_dataset=train_dataset, 310 | tokenizer=tokenizer, 311 | data_collator=None, 312 | callbacks=[printer] 313 | ) 314 | 315 | logger.info('Start training...') 316 | train_result = trainer.train() 317 | 318 | # trainer.save_model(output_dir=training_args.output_dir) 319 | if training_args.local_rank in (0, -1): 320 | logger.info(f'Saving the model and tokenizer') 321 | model.save_pretrained(training_args.output_dir) 322 | 323 | metrics = train_result.metrics 324 | trainer.log_metrics("train", metrics) 325 | trainer.save_metrics("train", metrics) 326 | 327 | logger.info('All done :)') 328 | --------------------------------------------------------------------------------
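Usage note (not one of the repository files): the LoRA scripts above save only the adapter weights and the tokenizer via save_pretrained, so for generation the adapter has to be attached back onto the base checkpoint. Below is a minimal sketch of that step; the base model name and the adapter directory are assumptions taken from the defaults of finetune_rugpt13b_lora.py, and the seed phrase and sampling parameters are arbitrary.

import os
import torch
import transformers
from peft import PeftModel

base_name = 'ai-forever/rugpt13b'  # assumption: the training default
adapter_dir = os.path.expanduser('~/polygon/text_generator/tmp/verses_rugpt13b_lora')  # assumption

# The tokenizer (with the added special tokens) was saved next to the adapter.
tokenizer = transformers.AutoTokenizer.from_pretrained(adapter_dir)
base_model = transformers.AutoModelForCausalLM.from_pretrained(base_name, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base_model, adapter_dir)
model.eval()
model.to('cuda')

# Training samples were built as <bos> + prompt + '#' + output + <eos>,
# so generation is seeded with the same prompt + '#' prefix.
prompt = (tokenizer.bos_token or '') + 'первый снег' + '#'
input_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt').to('cuda')
with torch.no_grad():
    output_ids = model.generate(input_ids,
                                do_sample=True,
                                temperature=1.0,
                                top_p=0.80,
                                max_length=300,
                                pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))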