├── poetry
│   ├── 13b_deepspeed_config.json
│   ├── run_llama.py
│   ├── run_llama_lora.py
│   ├── run_rugpt13b_lora.py
│   ├── run_rugpt_generator.py
│   ├── finetune_fredt5_poetry_generator.py
│   ├── finetune_rugpt13b.py
│   ├── finetune_rugpt13b_lora.py
│   ├── finetune_llama.py
│   ├── finetune_rugpt_with_prompt_masking.py
│   └── finetune_llama_lora.py
├── chitchat
│   ├── run_chitchat_fredt5.py
│   ├── run_chitchat_gpt.py
│   ├── finetune_chitchat_fredt5_with_trainer.py
│   └── finetune_chitchat_gpt_with_trainer.py
├── .gitignore
├── LICENSE
└── README.md
/poetry/13b_deepspeed_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "bf16": { "enabled": "auto" },
3 | "fp16": { "enabled": "auto" },
4 |
5 | "optimizer": {
6 | "type": "AdamW",
7 | "params": {
8 | "lr": "auto",
9 | "betas": "auto",
10 | "eps": "auto",
11 | "weight_decay": "auto"
12 | }
13 | },
14 |
15 | "zero_optimization": {
16 | "stage": 2,
17 | "offload_optimizer": {
18 | "device": "cpu",
19 | "pin_memory": true
20 | },
21 | "allgather_partitions": true,
22 | "allgather_bucket_size": 2e8,
23 | "overlap_comm": true,
24 | "reduce_scatter": true,
25 | "reduce_bucket_size": 2e8,
26 | "contiguous_gradients": true
27 | },
28 |
29 |
30 | "train_batch_size": "auto",
31 | "gradient_accumulation_steps": "auto"
32 | }
33 |
--------------------------------------------------------------------------------
/poetry/run_llama.py:
--------------------------------------------------------------------------------
1 | """
2 | Inference of poems from the fine-tuned LLaMa model (see finetune_llama.py)
3 | """
4 |
5 | import transformers
6 | import torch
7 |
8 |
9 | model_path = "/home/jovyan/polygon/text_generator/tmp/verses_model=llama7b_domain=lyrics_syllables=0"
10 |
11 | print('Loading model "{}"...'.format(model_path))
12 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_path)
13 | model = transformers.AutoModelForCausalLM.from_pretrained(model_path,
14 | # load_in_8bit=model_args.load_in_8bit,
15 | # device_map="auto"
16 | )
17 |
18 | device = torch.device("cuda")
19 | model.to(device)
20 |
21 | while True:
22 | seed = input(':> ')
23 | prompt = '' + seed + '#'
24 |
25 | encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
26 | #print('DEBUG@26 encoded_prompt=', encoded_prompt)
27 | encoded_prompt = encoded_prompt.to(device)
28 |
29 | pad_token_id = tokenizer.encode('<pad>', add_special_tokens=False)[0]
30 | # end_token_id = tokenizer.encode('</s>', add_special_tokens=False)[0]
31 |
32 | output_sequences = model.generate(
33 | input_ids=encoded_prompt,
34 | pad_token_id=pad_token_id,
35 | do_sample=True,
36 | temperature=1.0,
37 | top_p=0.80,
38 | max_length=300,
39 | num_return_sequences=5,
40 | )
41 |
42 | stop_token = '</s>'
43 |
44 | generated_sequences = set()
45 | for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
46 | generated_sequence = generated_sequence.tolist()
47 | #print('DEBUG@46 ==> ', generated_sequence)
48 |
49 | text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
50 | if stop_token in text:
51 | text = text[: text.find(stop_token)]
52 |
53 | text = text[text.index('#') + 1:].strip()
54 | text = text.replace('\u2010', '').replace('\u0301', '')
55 | print('='*80)
56 | print(text)
57 |
--------------------------------------------------------------------------------
/poetry/run_llama_lora.py:
--------------------------------------------------------------------------------
1 | """
2 | Inference of poems from the fine-tuned LLaMa+LoRa model (see finetune_llama_lora.py)
3 | """
4 |
5 | import transformers
6 | import torch
7 | from peft import PeftModel, PeftConfig
8 |
9 |
10 | model_path = "/home/jovyan/polygon/text_generator/tmp/verses_model=llama7b_lora_domain=lyrics_syllables=0"
11 |
12 | print('Loading LLaMa tokenizer "{}"...'.format(model_path))
13 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_path)
14 |
15 | peft_model_id = model_path
16 | print('Loading peft model "{}"...'.format(peft_model_id))
17 | config = PeftConfig.from_pretrained(peft_model_id)
18 |
19 | print('Loading backbone LLaMa "{}"...'.format(config.base_model_name_or_path))
20 | model = transformers.AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
21 | model = PeftModel.from_pretrained(model, peft_model_id)
22 |
23 | device = torch.device("cuda")
24 | model.to(device)
25 |
26 | while True:
27 | seed = input(':> ')
28 | prompt = '' + seed + '#'
29 |
30 | encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
31 | #print('DEBUG@26 encoded_prompt=', encoded_prompt)
32 | encoded_prompt = encoded_prompt.to(device)
33 |
34 | pad_token_id = tokenizer.encode('<pad>', add_special_tokens=False)[0]
35 | # end_token_id = tokenizer.encode('</s>', add_special_tokens=False)[0]
36 |
37 | output_sequences = model.generate(
38 | input_ids=encoded_prompt,
39 | pad_token_id=pad_token_id,
40 | do_sample=True,
41 | temperature=1.0,
42 | top_p=0.80,
43 | max_length=300,
44 | num_return_sequences=5,
45 | )
46 |
47 | stop_token = '</s>'
48 |
49 | generated_sequences = set()
50 | for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
51 | generated_sequence = generated_sequence.tolist()
52 | #print('DEBUG@46 ==> ', generated_sequence)
53 |
54 | text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
55 | if stop_token in text:
56 | text = text[: text.find(stop_token)]
57 |
58 | text = text[text.index('#') + 1:].strip()
59 | text = text.replace('\u2010', '').replace('\u0301', '')
60 | print('='*80)
61 | print(text)
62 |
--------------------------------------------------------------------------------
/chitchat/run_chitchat_fredt5.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 |
4 | import torch
5 | import transformers
6 | from transformers import T5Config
7 |
8 |
9 | if __name__ == '__main__':
10 | proj_dir = os.path.expanduser('~/polygon/chatbot')
11 |
12 | parser = argparse.ArgumentParser(description='Interactive console check of the chitchat model')
13 | parser.add_argument('--model', type=str, default=os.path.join(proj_dir, 'tmp', 'fredt5_chitchat'), help='Path to the directory with the model files')
14 | args = parser.parse_args()
15 |
16 | use_cuda = torch.cuda.is_available()
17 | device = torch.device("cuda" if use_cuda else "cpu")
18 |
19 | model_dir = args.model
20 | print(f'Loading model "{model_dir}"...')
21 | t5_config = T5Config.from_pretrained(model_dir)
22 |
23 | if 'FRED-T5' in t5_config.name_or_path:
24 | t5_tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_dir)
25 | else:
26 | t5_tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir)
27 |
28 | t5_model = transformers.T5ForConditionalGeneration.from_pretrained(model_dir)
29 | t5_model.to(device)
30 | t5_model.eval()
31 |
32 | while True:
33 | print('-'*80)
34 | dialog = []
35 | while True:
36 | msg = input('H:> ').strip()
37 | if len(msg) == 0:
38 | break
39 |
40 | msg = msg[0].upper() + msg[1:]
41 |
42 | dialog.append('человек: ' + msg)
43 |
44 | #prompt = ''+'\n'.join(dialog)
45 | prompt = '' + '\n'.join(dialog) + '\nчатбот: '
46 |
47 | input_ids = t5_tokenizer(prompt, return_tensors='pt').input_ids
48 | out_ids = t5_model.generate(input_ids=input_ids.to(device),
49 | max_length=200,
50 | eos_token_id=t5_tokenizer.eos_token_id,
51 | early_stopping=True,
52 | do_sample=True,
53 | temperature=1.0,
54 | top_k=0,
55 | top_p=0.85)
56 |
57 | t5_output = t5_tokenizer.decode(out_ids[0][1:])
58 | if '</s>' in t5_output:
59 | t5_output = t5_output[:t5_output.find('</s>')].strip()
60 |
61 | t5_output = t5_output.replace('<extra_id_0>', '').strip()
62 |
63 | print('B:> {}'.format(t5_output))
64 | dialog.append('чатбот: ' + t5_output)
65 |
--------------------------------------------------------------------------------
/chitchat/run_chitchat_gpt.py:
--------------------------------------------------------------------------------
1 | import os.path
2 |
3 | import torch
4 | import transformers
5 |
6 |
7 | class Chitchat(object):
8 | def __init__(self, device, models_dir):
9 | model_name = os.path.join(models_dir, 'rugpt_chitchat')
10 | self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
11 | self.model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
12 | self.model.to(device)
13 | self.model.eval()
14 |
15 | def reply(self, history, num_return_sequences):
16 | prompt = '<s>' + '\n'.join(history) + '\nчатбот:'
17 | encoded_prompt = self.tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(self.model.device)
18 | output_sequences = self.model.generate(input_ids=encoded_prompt,
19 | max_length=len(encoded_prompt[0]) + 120,
20 | temperature=0.90,
21 | typical_p=None,
22 | top_k=0,
23 | top_p=0.8,
24 | do_sample=True,
25 | num_return_sequences=num_return_sequences,
26 | pad_token_id=self.tokenizer.pad_token_id)
27 |
28 | replies = []
29 |
30 | for o in output_sequences:
31 | reply = self.tokenizer.decode(o.tolist(), clean_up_tokenization_spaces=True)
32 | reply = reply[len(prompt):] # cut off the prompt
33 | reply = reply[: reply.find('</s>')]
34 |
35 | if '\nчеловек:' in reply:
36 | reply = reply[:reply.index('\nчеловек:')]
37 |
38 | reply = reply.strip()
39 |
40 | if reply not in replies: # keep only unique replies, preserving output order
41 | replies.append(reply)
42 |
43 | return replies
44 |
45 |
46 | if __name__ == '__main__':
47 | device = "cuda" if torch.cuda.is_available() else "cpu"
48 | models_dir = os.path.expanduser('~/polygon/chatbot/tmp')
49 |
50 | chitchat = Chitchat(device, models_dir)
51 |
52 | while True:
53 | dialog = []
54 | while True:
55 | msg = input('H:> ').strip()
56 | if msg:
57 | dialog.append('человек: ' + msg)
58 | reply = chitchat.reply(dialog, num_return_sequences=1)[0]
59 | print(f'B:> {reply}')
60 | dialog.append('чатбот: ' + reply)
61 | else:
62 | dialog = []
63 | print('-'*100)
64 |
65 |
66 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/poetry/run_rugpt13b_lora.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import sys
4 | import argparse
5 |
6 | import torch
7 | import transformers
8 | from transformers import GPT2LMHeadModel, GPT2Tokenizer
9 |
10 | from peft import PeftModel, PeftConfig
11 |
12 |
13 | class RugptGenerator:
14 | def __init__(self, model_path, temperature, top_p):
15 | self.model_path = os.path.expanduser(model_path)
16 | self.temperature = temperature
17 | self.top_p = top_p
18 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
19 | self.tokenizer = None
20 | self.model = None
21 |
22 | def load(self):
23 | self.tokenizer = GPT2Tokenizer.from_pretrained(self.model_path)
24 | self.tokenizer.add_special_tokens({'bos_token': '<s>', 'eos_token': '</s>', 'pad_token': '<pad>'})
25 | #self.model = PeftModel.from_pretrained(self.model_path)
26 |
27 | peft_model_id = self.model_path
28 | config = PeftConfig.from_pretrained(peft_model_id)
29 | model = transformers.AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
30 | model = PeftModel.from_pretrained(model, peft_model_id)
31 | self.model = model.to(self.device)
32 | #self.model.eval()
33 |
34 | def generate_output(self, context, num_return_sequences):
35 | length = 200
36 |
37 | encoded_prompt = self.tokenizer.encode(context, add_special_tokens=False, return_tensors="pt")
38 | encoded_prompt = encoded_prompt.to(self.device)
39 |
40 | pad_token_id = self.tokenizer.encode('<pad>', add_special_tokens=False)[0]
41 | #end_token_id = self.tokenizer.encode('</s>', add_special_tokens=False)[0]
42 |
43 | output_sequences = self.model.generate(
44 | input_ids=encoded_prompt,
45 | max_length=length + len(encoded_prompt[0]),
46 | num_return_sequences=num_return_sequences,
47 | pad_token_id=pad_token_id,
48 | #end_token_id=end_token_id,
49 | do_sample=True,
50 | temperature=self.temperature,
51 | top_p=self.top_p
52 | )
53 |
54 | stop_token = '</s>'
55 |
56 | generated_sequences = set()
57 | for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
58 | generated_sequence = generated_sequence.tolist()
59 |
60 | text = self.tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
61 | if stop_token in text:
62 | text = text[: text.find(stop_token)]
63 |
64 | text = text[text.index('#')+1:].strip()
65 | text = text.replace('\u2010', '').replace('\u0301', '')
66 |
67 | generated_sequences.add(text)
68 |
69 | return list(generated_sequences)
70 |
71 |
72 | if __name__ == '__main__':
73 | proj_dir = os.path.expanduser('~/polygon/text_generator')
74 |
75 | parser = argparse.ArgumentParser(description='Debug console poetry generator based on rugpt13B+LoRa')
76 | parser.add_argument('--model_path', type=str, default=os.path.join(proj_dir, 'tmp', 'verses_rugpt13b_lora_domain=lyrycs_syllables=1'))
77 | parser.add_argument('--temperature', type=float, default=1.0, help='Sampling temperature')
78 | parser.add_argument('--top_p', type=float, default=0.8, help='top-p')
79 | parser.add_argument('--top_k', type=int, default=0, help='top-k')
80 | parser.add_argument('--typical_p', type=float, default=0.0, help='typical-p')
81 |
82 | args = parser.parse_args()
83 |
84 | use_cuda = torch.cuda.is_available()
85 | device = torch.device("cuda" if use_cuda else "cpu")
86 |
87 | poem_generator = RugptGenerator(model_path=args.model_path, temperature=args.temperature, top_p=args.top_p)
88 | poem_generator.load()
89 |
90 | while True:
91 | prompt = input(':> ').strip()
92 | if prompt:
93 | seed = prompt + '#'
94 | px = poem_generator.generate_output(seed, num_return_sequences=10)
95 | print('-'*80)
96 | for ipoem, p in enumerate(px, start=1):
97 | print('='*30 + ' POEM #{} '.format(ipoem) + '='*30)
98 | print(p)
99 | print('-'*80)
100 |
--------------------------------------------------------------------------------
/poetry/run_rugpt_generator.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import sys
4 | import argparse
5 |
6 | import torch
7 | from transformers import GPT2LMHeadModel, GPT2Tokenizer
8 |
9 | from generative_poetry.whitespace_normalization import normalize_whitespaces
10 |
11 |
12 | class RugptGenerator:
13 | def __init__(self, model_path, generation_config):
14 | self.model_path = model_path
15 | self.generation_config = generation_config
16 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17 | self.tokenizer = None
18 | self.model = None
19 |
20 | def load(self):
21 | model_name_or_path = os.path.expanduser(self.model_path)
22 | self.tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
23 | self.tokenizer.add_special_tokens({'bos_token': '<s>', 'eos_token': '</s>', 'pad_token': '<pad>'})
24 | self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
25 | self.model.to(self.device)
26 | self.model.eval()
27 |
28 | def generate_output(self, context, num_return_sequences):
29 | encoded_prompt = self.tokenizer.encode(context, add_special_tokens=False, return_tensors="pt")
30 | encoded_prompt = encoded_prompt.to(self.device)
31 |
32 | pad_token_id = self.tokenizer.encode('<pad>', add_special_tokens=False)[0]
33 | #end_token_id = self.tokenizer.encode('</s>', add_special_tokens=False)[0]
34 |
35 | output_sequences = self.model.generate(
36 | input_ids=encoded_prompt,
37 | pad_token_id=pad_token_id,
38 | **self.generation_config
39 | )
40 |
41 | stop_token = '</s>'
42 |
43 | generated_sequences = set()
44 | for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
45 | generated_sequence = generated_sequence.tolist()
46 |
47 | text = self.tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
48 | if stop_token in text:
49 | text = text[: text.find(stop_token)]
50 |
51 | text = text[text.index('#')+1:].strip()
52 | text = text.replace('\u2010', '').replace('\u0301', '')
53 | text = normalize_whitespaces(text)
54 | generated_sequences.add(text)
55 |
56 | return list(generated_sequences)
57 |
58 |
59 | if __name__ == '__main__':
60 | proj_dir = os.path.expanduser('~/polygon/text_generator')
61 |
62 | parser = argparse.ArgumentParser(description='Debug console generator of "pirozhki" poems')
63 | parser.add_argument('--model_path', type=str, default=os.path.join(proj_dir, 'tmp', 'verses_rugpt.all'))
64 | parser.add_argument('--max_length', type=int, default=300)
65 | parser.add_argument('--num_return_sequences', type=int, default=5)
66 | parser.add_argument('--do_sample', type=bool, default=True)
67 | parser.add_argument('--num_beams', type=int, default=1)
68 | parser.add_argument('--num_beam_groups', type=int, default=1)
69 | parser.add_argument('--penalty_alpha', type=float, default=None)
70 | parser.add_argument('--epsilon_cutoff', type=float, default=0.0)
71 | parser.add_argument('--eta_cutoff', type=float, default=0.0)
72 | parser.add_argument('--diversity_penalty', type=float, default=0.0)
73 | parser.add_argument('--repetition_penalty', type=float, default=None)
74 | parser.add_argument('--encoder_repetition_penalty', type=float, default=1.0)
75 | parser.add_argument('--length_penalty', type=float, default=1.0)
76 | parser.add_argument('--no_repeat_ngram_size', type=int, default=0)
77 | parser.add_argument('--renormalize_logits', type=bool, default=False)
78 | parser.add_argument('--temperature', type=float, default=0.9, help='Sampling temperature')
79 | parser.add_argument('--top_p', type=float, default=0.6, help='top-p')
80 | parser.add_argument('--top_k', type=int, default=0, help='top-k')
81 | parser.add_argument('--typical_p', type=float, default=None, help='typical-p')
82 | args = parser.parse_args()
83 |
84 | generation_args = {'max_length': args.max_length,
85 | 'num_return_sequences': args.num_return_sequences,
86 | 'do_sample': args.do_sample,
87 | 'num_beams': args.num_beams,
88 | 'num_beam_groups': args.num_beam_groups,
89 | 'penalty_alpha': args.penalty_alpha,
90 | 'epsilon_cutoff': args.epsilon_cutoff,
91 | 'eta_cutoff': args.eta_cutoff,
92 | 'diversity_penalty': args.diversity_penalty,
93 | 'repetition_penalty': args.repetition_penalty,
94 | 'encoder_repetition_penalty': args.encoder_repetition_penalty,
95 | 'length_penalty': args.length_penalty,
96 | 'no_repeat_ngram_size': args.no_repeat_ngram_size,
97 | 'renormalize_logits': args.renormalize_logits,
98 | 'temperature': args.temperature,
99 | 'top_p': args.top_p,
100 | 'top_k': args.top_k,
101 | 'typical_p': args.typical_p,
102 | }
103 |
104 | use_cuda = torch.cuda.is_available()
105 | device = torch.device("cuda" if use_cuda else "cpu")
106 |
107 | poem_generator = RugptGenerator(args.model_path, generation_args)
108 | poem_generator.load()
109 |
110 | while True:
111 | prompt = input(':> ').strip()
112 | if prompt:
113 | seed = prompt + '#'
114 | px = poem_generator.generate_output(seed, num_return_sequences=10)
115 | print('-'*80)
116 | for ipoem, p in enumerate(px, start=1):
117 | print('='*30 + ' POEM #{} '.format(ipoem) + '='*30)
118 | print(p)
119 | print('-'*80)
120 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Creative Commons Legal Code
2 |
3 | CC0 1.0 Universal
4 |
5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
12 | HEREUNDER.
13 |
14 | Statement of Purpose
15 |
16 | The laws of most jurisdictions throughout the world automatically confer
17 | exclusive Copyright and Related Rights (defined below) upon the creator
18 | and subsequent owner(s) (each and all, an "owner") of an original work of
19 | authorship and/or a database (each, a "Work").
20 |
21 | Certain owners wish to permanently relinquish those rights to a Work for
22 | the purpose of contributing to a commons of creative, cultural and
23 | scientific works ("Commons") that the public can reliably and without fear
24 | of later claims of infringement build upon, modify, incorporate in other
25 | works, reuse and redistribute as freely as possible in any form whatsoever
26 | and for any purposes, including without limitation commercial purposes.
27 | These owners may contribute to the Commons to promote the ideal of a free
28 | culture and the further production of creative, cultural and scientific
29 | works, or to gain reputation or greater distribution for their Work in
30 | part through the use and efforts of others.
31 |
32 | For these and/or other purposes and motivations, and without any
33 | expectation of additional consideration or compensation, the person
34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she
35 | is an owner of Copyright and Related Rights in the Work, voluntarily
36 | elects to apply CC0 to the Work and publicly distribute the Work under its
37 | terms, with knowledge of his or her Copyright and Related Rights in the
38 | Work and the meaning and intended legal effect of CC0 on those rights.
39 |
40 | 1. Copyright and Related Rights. A Work made available under CC0 may be
41 | protected by copyright and related or neighboring rights ("Copyright and
42 | Related Rights"). Copyright and Related Rights include, but are not
43 | limited to, the following:
44 |
45 | i. the right to reproduce, adapt, distribute, perform, display,
46 | communicate, and translate a Work;
47 | ii. moral rights retained by the original author(s) and/or performer(s);
48 | iii. publicity and privacy rights pertaining to a person's image or
49 | likeness depicted in a Work;
50 | iv. rights protecting against unfair competition in regards to a Work,
51 | subject to the limitations in paragraph 4(a), below;
52 | v. rights protecting the extraction, dissemination, use and reuse of data
53 | in a Work;
54 | vi. database rights (such as those arising under Directive 96/9/EC of the
55 | European Parliament and of the Council of 11 March 1996 on the legal
56 | protection of databases, and under any national implementation
57 | thereof, including any amended or successor version of such
58 | directive); and
59 | vii. other similar, equivalent or corresponding rights throughout the
60 | world based on applicable law or treaty, and any national
61 | implementations thereof.
62 |
63 | 2. Waiver. To the greatest extent permitted by, but not in contravention
64 | of, applicable law, Affirmer hereby overtly, fully, permanently,
65 | irrevocably and unconditionally waives, abandons, and surrenders all of
66 | Affirmer's Copyright and Related Rights and associated claims and causes
67 | of action, whether now known or unknown (including existing as well as
68 | future claims and causes of action), in the Work (i) in all territories
69 | worldwide, (ii) for the maximum duration provided by applicable law or
70 | treaty (including future time extensions), (iii) in any current or future
71 | medium and for any number of copies, and (iv) for any purpose whatsoever,
72 | including without limitation commercial, advertising or promotional
73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
74 | member of the public at large and to the detriment of Affirmer's heirs and
75 | successors, fully intending that such Waiver shall not be subject to
76 | revocation, rescission, cancellation, termination, or any other legal or
77 | equitable action to disrupt the quiet enjoyment of the Work by the public
78 | as contemplated by Affirmer's express Statement of Purpose.
79 |
80 | 3. Public License Fallback. Should any part of the Waiver for any reason
81 | be judged legally invalid or ineffective under applicable law, then the
82 | Waiver shall be preserved to the maximum extent permitted taking into
83 | account Affirmer's express Statement of Purpose. In addition, to the
84 | extent the Waiver is so judged Affirmer hereby grants to each affected
85 | person a royalty-free, non transferable, non sublicensable, non exclusive,
86 | irrevocable and unconditional license to exercise Affirmer's Copyright and
87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the
88 | maximum duration provided by applicable law or treaty (including future
89 | time extensions), (iii) in any current or future medium and for any number
90 | of copies, and (iv) for any purpose whatsoever, including without
91 | limitation commercial, advertising or promotional purposes (the
92 | "License"). The License shall be deemed effective as of the date CC0 was
93 | applied by Affirmer to the Work. Should any part of the License for any
94 | reason be judged legally invalid or ineffective under applicable law, such
95 | partial invalidity or ineffectiveness shall not invalidate the remainder
96 | of the License, and in such case Affirmer hereby affirms that he or she
97 | will not (i) exercise any of his or her remaining Copyright and Related
98 | Rights in the Work or (ii) assert any associated claims and causes of
99 | action with respect to the Work, in either case contrary to Affirmer's
100 | express Statement of Purpose.
101 |
102 | 4. Limitations and Disclaimers.
103 |
104 | a. No trademark or patent rights held by Affirmer are waived, abandoned,
105 | surrendered, licensed or otherwise affected by this document.
106 | b. Affirmer offers the Work as-is and makes no representations or
107 | warranties of any kind concerning the Work, express, implied,
108 | statutory or otherwise, including without limitation warranties of
109 | title, merchantability, fitness for a particular purpose, non
110 | infringement, or the absence of latent or other defects, accuracy, or
111 | the present or absence of errors, whether or not discoverable, all to
112 | the greatest extent permissible under applicable law.
113 | c. Affirmer disclaims responsibility for clearing rights of other persons
114 | that may apply to the Work or any use thereof, including without
115 | limitation any person's Copyright and Related Rights in the Work.
116 | Further, Affirmer disclaims responsibility for obtaining any necessary
117 | consents, permissions or other rights required for any use of the
118 | Work.
119 | d. Affirmer understands and acknowledges that Creative Commons is not a
120 | party to this document and has no duty or obligation with respect to
121 | this CC0 or use of the Work.
122 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LM-finetune
2 |
3 | In this repo I have collected my current working scripts for fine-tuning language models (rugpt, LLaMa, FRED T5) with the transformers library.
4 | For the large models (7B and 13B) two options are used: a) deepspeed, b) LoRa.
5 |
6 | There is nothing new or particularly clever in this code, just a basic pipeline following the recommendations for transformers.Trainer.
7 |
8 | ## POETRY GENERATOR
9 |
10 | ### Poetry generator based on LLaMa 7B and 13B
11 |
12 | Code: [finetune_llama.py](./poetry/finetune_llama.py)
13 |
14 | DeepSpeed is used, which makes it possible to fine-tune these models on 40 GB GPUs. Judging by DeepSpeed's memory report,
15 | fine-tuning on a 32 GB V100 should also be feasible. Note that a very large amount of ordinary RAM is required, more than 240 GB,
16 | so that DeepSpeed can offload tensors to it.
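
The V100 estimate above comes from DeepSpeed's memory report. A minimal sketch (an illustration, not part of the repo) of how to obtain that report for the ZeRO stage 2 + CPU offload setup used here; the model name is taken from the command below, and loading the model on CPU just for the estimate already requires a lot of RAM:

```
from transformers import AutoModelForCausalLM
# Memory estimator for ZeRO stage 2 (printed with and without optimizer offload).
from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live

model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf")
estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=4, num_nodes=1)
```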
17 |
18 | Launching the fine-tuning on 4 GPUs:
19 |
20 | ```
21 | python -m torch.distributed.launch --nproc_per_node=4 finetune_llama.py \
22 | --dataset_path ~/polygon/text_generator/tmp/лирика.jsonl \
23 | --max_samples 10000 \
24 | --output_syllables 0 \
25 | --model_name_or_path decapoda-research/llama-7b-hf \
26 | --output_dir ~/polygon/text_generator/tmp/verses_model=llama7b_domain=lyrics_syllables=0 \
27 | --overwrite_output_dir 1 \
28 | --per_device_train_batch_size 8 \
29 | --learning_rate 1e-5 \
30 | --num_train_epochs 1 \
31 | --bf16 1 \
32 | --fp16 0 \
33 | --gradient_checkpointing 0 \
34 | --gradient_accumulation_steps 8 \
35 | --do_train 1 \
36 | --do_eval 0 \
37 | --report_to tensorboard \
38 | --evaluation_strategy no \
39 | --logging_strategy steps \
40 | --logging_steps 10 \
41 | --save_strategy no \
42 | --deepspeed 13b_deepspeed_config.json
43 | ```
44 |
45 | DeepSpeed configuration file: [13b_deepspeed_config.json](./poetry/13b_deepspeed_config.json)
46 |
47 | Inference code: [run_llama.py](./poetry/run_llama.py). Disclaimer: this code runs inference only on an 80 GB A100.
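
If only a smaller GPU is available, the commented-out arguments in `run_llama.py` hint at 8-bit loading. A minimal sketch (an assumption, not part of the original script; requires the `bitsandbytes` and `accelerate` packages):

```
import transformers

model_path = "..."  # path to the fine-tuned model directory

tokenizer = transformers.LlamaTokenizer.from_pretrained(model_path)
# load_in_8bit together with device_map="auto" puts the quantized weights
# on the available GPU(s); do not call model.to(device) afterwards.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_path, load_in_8bit=True, device_map="auto"
)
```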
48 |
49 | ### Poetry generator based on LLaMa 7B and 13B using the PEFT library (LoRa method)
50 |
51 | Fine-tuning code: [finetune_llama_lora.py](./poetry/finetune_llama_lora.py)
52 |
53 | Launching the fine-tuning on 2 GPUs:
54 |
55 | ```
56 | python -m torch.distributed.run --nproc_per_node=2 finetune_llama_lora.py \
57 | --dataset_path ~/polygon/text_generator/tmp/лирика.jsonl \
58 | --max_samples 10000 \
59 | --output_syllables 0 \
60 | --model_name_or_path decapoda-research/llama-7b-hf \
61 | --output_dir ~/polygon/text_generator/tmp/verses_model=llama7b_lora_domain=lyrics_syllables=0 \
62 | --overwrite_output_dir 1 \
63 | --per_device_train_batch_size 1 \
64 | --learning_rate 1e-4 \
65 | --num_train_epochs 1 \
66 | --bf16 0 \
67 | --fp16 0 \
68 | --gradient_checkpointing 0 \
69 | --gradient_accumulation_steps 8 \
70 | --do_train 1 \
71 | --do_eval 0 \
72 | --report_to tensorboard \
73 | --evaluation_strategy no \
74 | --logging_strategy steps \
75 | --logging_steps 200 \
76 | --save_strategy no
77 | ```
78 |
79 | Inference code: [run_llama_lora.py](./poetry/run_llama_lora.py)
80 |
81 | ### Poetry generator based on FRED T5 XL
82 |
83 | Fine-tuning code: [finetune_fredt5_poetry_generator.py](./poetry/finetune_fredt5_poetry_generator.py)
84 |
85 | Launching the fine-tuning on 2 GPUs:
86 |
87 | ```
88 | python -m torch.distributed.run --nproc_per_node=2 finetune_fredt5_poetry_generator.py \
89 | --model_name_or_path ai-forever/FRED-T5-1.7B \
90 | --dataset_path ~/polygon/text_generator/tmp/all_verses.jsonl \
91 | --prompt prompt_text \
92 | --optim "adafactor" \
93 | --learning_rate 1e-3 \
94 | --lr_scheduler_type constant \
95 | --per_device_train_batch_size 8 \
96 | --gradient_checkpointing 0 \
97 | --gradient_accumulation_steps 4 \
98 | --num_train_epochs 1 \
99 | --report_to tensorboard \
100 | --logging_strategy steps \
101 | --logging_steps 100 \
102 | --output_dir ~/polygon/text_generator/tmp/verses_fredt5 \
103 | --save_strategy no
104 | ```
105 |
106 | Inference: [run_fredt5_poetry_generator.py](./poetry/run_fredt5_poetry_generator.py)
107 |
108 |
109 | ### Poetry generator based on the rugpt models (except rugpt13B)
110 |
111 | Fine-tuning code: [finetune_rugpt_with_prompt_masking.py](./poetry/finetune_rugpt_with_prompt_masking.py)
112 |
113 | Launching on 2 GPUs, base model rugpt3large_based_on_gpt2:
114 |
115 | ```
116 | python -m torch.distributed.run --nproc_per_node=2 finetune_rugpt_with_prompt_masking.py \
117 | --dataset_path ~/polygon/text_generator/tmp/лирика.jsonl \
118 | --output_syllables 1 \
119 | --model_name_or_path sberbank-ai/rugpt3large_based_on_gpt2 \
120 | --output_dir ~/polygon/text_generator/tmp/verses_model=rugpt_large_domain=lyrics_syllables=1 \
121 | --overwrite_output_dir 1 \
122 | --per_device_train_batch_size 8 \
123 | --learning_rate 5e-5 \
124 | --num_train_epochs 1 \
125 | --fp16 1 \
126 | --gradient_checkpointing 0 \
127 | --gradient_accumulation_steps 8 \
128 | --do_train 1 \
129 | --do_eval 0 \
130 | --report_to tensorboard \
131 | --evaluation_strategy no \
132 | --logging_strategy steps \
133 | --logging_steps 200 \
134 | --save_strategy no
135 | ```
136 |
137 | Inference: [run_rugpt_generator.py](./poetry/run_rugpt_generator.py)
138 |
139 |
140 | ## CHITCHAT
141 |
142 | ### Chitchat fine-tuning based on the FRED T5 XL 1.7B model
143 |
144 | A peculiarity of this approach: instead of a plain prefix, the input sequence starts with a denoiser selector token,
145 | and a sentinel token is inserted at the position (the end of the dialogue) where the generated reply should appear.
146 |
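To make this concrete, below is a minimal sketch of how a single training pair could look. The specific tokens are assumptions for illustration only (a denoiser selector such as `<SC1>` and the sentinel `<extra_id_0>`); see `load_samples()` in the fine-tuning script for the actual construction.

```
# Hypothetical training pair; the exact special tokens are an assumption here.
context = "человек: Привет! Как дела?"
input_text = "<SC1>" + context + "\nчатбот: "           # encoder input
target_text = "<extra_id_0>" + "Привет! Всё хорошо."    # decoder target
```
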
147 | Code: [finetune_chitchat_fredt5_with_trainer.py](./chitchat/finetune_chitchat_fredt5_with_trainer.py).
148 |
149 | Example launch on 1 GPU:
150 |
151 | ```
152 | python finetune_chitchat_fredt5_with_trainer.py \
153 | --dataset_path axioma_dialogues.json \
154 | --optim "adafactor" \
155 | --learning_rate 1e-4 \
156 | --lr_scheduler_type constant \
157 | --per_gpu_train_batch_size 6 \
158 | --gradient_checkpointing 0 \
159 | --gradient_accumulation_steps 8 \
160 | --num_train_epochs 1 \
161 | --report_to tensorboard \
162 | --logging_strategy steps \
163 | --logging_steps 500 \
164 | --output_dir ~/polygon/chatbot/tmp/fredt5_chitchat \
165 | --save_strategy no
166 | ```
167 |
168 | The dataset for this model, [axioma_dialogues.json](./chitchat/axioma_dialogues.json), is built from the Russian-language part of the [OpenAssistant project dataset](https://huggingface.co/datasets/OpenAssistant/oasst1).
169 | Each reply together with its preceding context forms a separate sample for the seq2seq model. Human and chatbot turns are marked
170 | with special tags; for fine-tuning they are converted into the prefixes `человек:` and `чатбот:`.
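
Schematically, each element of the dataset is an object with the dialogue context and the reply to be generated, as consumed by `load_samples()` in the fine-tuning script (the texts below are invented for illustration):

```
# One hypothetical element of axioma_dialogues.json: 'context' holds the
# dialogue history with speaker markers, 'reply' is the target response.
sample = {
    "context": "Привет! Чем ты занимаешься?",
    "reply": "Читаю стихи. А у тебя как дела?",
}
```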
171 |
172 | After fine-tuning, generation can be run with [run_chitchat_fredt5.py](./chitchat/run_chitchat_fredt5.py).
173 |
174 | ### Chitchat fine-tuning based on the sberbank-ai/rugpt3medium_based_on_gpt2 model
175 |
176 | Also suitable for other models of the rugpt family.
177 |
178 | Code: [finetune_chitchat_gpt_with_trainer.py](./chitchat/finetune_chitchat_gpt_with_trainer.py).
179 |
180 | Dataset: [axioma_dialogues.solid.json](./chitchat/axioma_dialogues.solid.json)
181 |
182 | Launching the fine-tuning on 1 GPU:
183 |
184 | ```
185 | python finetune_chitchat_gpt_with_trainer.py \
186 | --model_name_or_path sberbank-ai/rugpt3medium_based_on_gpt2 \
187 | --learning_rate 1e-5 \
188 | --lr_scheduler_type constant \
189 | --per_gpu_train_batch_size 2 \
190 | --gradient_checkpointing 0 \
191 | --gradient_accumulation_steps 8 \
192 | --num_train_epochs 1 \
193 | --report_to tensorboard \
194 | --logging_strategy steps \
195 | --logging_steps 100 \
196 | --output_dir ~/polygon/chatbot/tmp/rugpt_chitchat \
197 | --save_strategy no
198 | ```
199 |
200 | Inference code: [run_chitchat_gpt.py](./chitchat/run_chitchat_gpt.py).
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
--------------------------------------------------------------------------------
/chitchat/finetune_chitchat_fredt5_with_trainer.py:
--------------------------------------------------------------------------------
1 | """
2 | Training of the Axioma chitchat model on FRED T5 for the project https://github.com/Koziev/chatbot
3 | Fine-tuning experiment: dialogue-history tokens are excluded from backprop by setting their targets (labels) to -100
4 | Other training hints: https://kelijah.livejournal.com/315826.html
5 | """
6 |
7 | import os
8 | import json
9 | import sys
10 | import io
11 | import random
12 | import itertools
13 | from typing import Any, Dict, List, Optional, Tuple, Union
14 | import shutil
15 | import logging
16 | from dataclasses import dataclass, field
17 |
18 | import torch
19 | import torch.optim
20 | from torch.utils.data import Dataset, DataLoader
21 | import transformers
22 | from transformers import AutoTokenizer
23 | from transformers import TrainingArguments, Trainer, TrainerCallback
24 | from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config
25 | from transformers import HfArgumentParser
26 | from pynvml import *
27 |
28 |
29 | proj_dir = os.path.expanduser('~/polygon/chatbot')
30 |
31 |
32 | def print_gpu_utilization():
33 | nvmlInit()
34 | handle = nvmlDeviceGetHandleByIndex(0)
35 | info = nvmlDeviceGetMemoryInfo(handle)
36 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.")
37 |
38 |
39 | def load_samples(dataset_path, tokenizer):
40 | samples = []
41 | with open(dataset_path, 'r') as f:
42 | for sample in json.load(f):
43 | try:
44 | # 01.05.2023 experiment: instead of special speaker tokens we use plain text labels
45 | seed = '' + sample['context'].replace('', 'человек: ').replace('', 'чатбот: ') + '\nчатбот: '
46 | reply = '' + sample['reply']
47 | input_tokens = tokenizer.encode(seed, add_special_tokens=False, truncation=True, max_length=1024)
48 | output_tokens = tokenizer.encode(reply, add_special_tokens=False) # , truncation=True, max_length=1024)
49 | if len(input_tokens) < 512 and len(output_tokens) < 512: # limit verbosity for now
50 | samples.append({'input_tokens': input_tokens,
51 | 'output_tokens': output_tokens,
52 | 'seed': seed,
53 | 'reply': reply})
54 | except Exception as ex:
55 | print(ex)
56 |
57 | return samples
58 |
59 |
60 | class FinetuneDataset(Dataset):
61 | def __init__(self, samples, tokenizer):
62 | self.tokenizer = tokenizer
63 | self.max_input_len = 0
64 | self.max_output_len = 0
65 | self.samples = []
66 |
67 | self.bos_token_id = tokenizer.encode('<s>', add_special_tokens=False)[0]
68 | self.eos_token_id = tokenizer.encode('</s>', add_special_tokens=False)[0]
69 | self.pad_token_id = tokenizer.encode('<pad>', add_special_tokens=False)[0]
70 |
71 | for sample in samples:
72 | input_ids = sample['input_tokens']
73 | output_ids = sample['output_tokens'] + [self.eos_token_id]
74 | self.samples.append((input_ids, output_ids))
75 | self.max_input_len = max(self.max_input_len, len(input_ids))
76 | self.max_output_len = max(self.max_output_len, len(output_ids))
77 |
78 | def __len__(self):
79 | return len(self.samples)
80 |
81 | def __getitem__(self, index: int):
82 | input_ids, output_ids = self.samples[index]
83 |
84 | input_npad = self.max_input_len - len(input_ids)
85 | attention_mask = [1]*len(input_ids) + [0]*input_npad
86 | input_ids = input_ids + input_npad * [self.pad_token_id]
87 |
88 | output_npad = self.max_output_len - len(output_ids)
89 | labels = output_ids + output_npad * [-100]
90 |
91 | return {'input_ids': torch.LongTensor(input_ids),
92 | 'attention_mask': attention_mask,
93 | 'labels': torch.LongTensor(labels),
94 | }
95 |
96 |
97 | @dataclass
98 | class ModelArguments:
99 | """
100 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
101 | """
102 | model_name_or_path: Optional[str] = field(
103 | default='ai-forever/FRED-T5-1.7B',
104 | metadata={"help": "The model checkpoint for weights initialization."},
105 | )
106 |
107 |
108 | @dataclass
109 | class DataTrainingArguments:
110 | """
111 | Arguments pertaining to what data we are going to input our model for training and eval.
112 | """
113 | dataset_path: Optional[str] = field(
114 | default=os.path.join(proj_dir, 'tmp', 'axioma_dialogues.json'),
115 | metadata={"help": "Path to the dialogue dataset"}
116 | )
117 |
118 |
119 | if __name__ == '__main__':
120 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
121 |
122 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
123 |
124 | if not training_args.optim:
125 | training_args.optim = "adafactor"
126 |
127 | if not training_args.output_dir:
128 | training_args.output_dir = os.path.join(proj_dir, 'tmp', 'fredt5_chitchat')
129 |
130 | verbose = training_args.local_rank in (-1, 0)
131 |
132 | # Setup logging
133 | logging.basicConfig(
134 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
135 | datefmt="%m/%d/%Y %H:%M:%S",
136 | handlers=[logging.StreamHandler(sys.stdout)],
137 | )
138 |
139 | log_level = training_args.get_process_log_level()
140 | logger = logging.getLogger(__name__)
141 | logger.setLevel(log_level)
142 | #datasets.utils.logging.set_verbosity(log_level)
143 | transformers.utils.logging.set_verbosity(log_level)
144 | transformers.utils.logging.enable_default_handler()
145 | transformers.utils.logging.enable_explicit_format()
146 |
147 | logger.info(
148 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
149 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
150 | )
151 | logger.info(f"Training/evaluation parameters {training_args}")
152 |
153 | rank0 = training_args.local_rank in (-1, 0)
154 |
155 | # Remove old tensorboard logs
156 | if rank0:
157 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs')
158 | #if os.path.exists(tensorboard_dir):
159 | # logger.info('Removing "%s"', tensorboard_dir)
160 | # shutil.rmtree(tensorboard_dir)
161 |
162 | device = training_args.device
163 | logger.info('device={}'.format(device))
164 |
165 | pretrained_model_name = model_args.model_name_or_path
166 |
167 | logger.info('Loading pretrained model "%s"', pretrained_model_name)
168 | tokenizer = transformers.GPT2Tokenizer.from_pretrained(pretrained_model_name)
169 | model = transformers.T5ForConditionalGeneration.from_pretrained(pretrained_model_name)
170 | model.to(device)
171 |
172 | tokenizer.add_special_tokens({'bos_token': '<s>', 'eos_token': '</s>', 'pad_token': '<pad>'})
173 |
174 | if rank0:
175 | print_gpu_utilization()
176 | logger.info('\nTokenizer:')
177 | for token in '<s> </s> <pad>'.split():
178 | logger.info('token "%s" id=%s', token, str(tokenizer.encode(token, add_special_tokens=False)))
179 |
180 | logger.info('Loading dataset "%s"...', data_args.dataset_path)
181 | train_samples = load_samples(data_args.dataset_path, tokenizer)
182 | logger.info('Train samples: %d', len(train_samples))
183 |
184 | train_dataset = FinetuneDataset(train_samples, tokenizer)
185 | # test_dataset = FinetuneDataset(test_samples, tokenizer)
186 |
187 | trainer = Trainer(
188 | model=model,
189 | args=training_args,
190 | train_dataset=train_dataset,
191 | tokenizer=tokenizer,
192 | data_collator=None,
193 | )
194 |
195 | try:
196 | logger.info('Start training...')
197 | train_result = trainer.train()
198 |
199 | if rank0:
200 | metrics = train_result.metrics
201 | trainer.log_metrics("train", metrics)
202 | trainer.save_metrics("train", metrics)
203 | except KeyboardInterrupt:
204 | print('!!! Ctrl+C !!!')
205 |
206 | if rank0:
207 | logger.info(f'Saving the model and tokenizer')
208 | trainer.save_model(output_dir=training_args.output_dir)
209 | tokenizer.save_pretrained(training_args.output_dir)
210 | #model.save_pretrained(training_args.output_dir)
211 |
212 | logger.info('All done :)')
213 |
--------------------------------------------------------------------------------
/poetry/finetune_fredt5_poetry_generator.py:
--------------------------------------------------------------------------------
1 | """
2 | Training of a poetry generation model on top of the pretrained FRED T5 XL model
3 | """
4 | import logging
5 | import os
6 | import json
7 | import io
8 | import random
9 | import itertools
10 | import sys
11 | from dataclasses import dataclass, field
12 | from typing import Any, Dict, List, Optional, Tuple, Union
13 | import shutil
14 | import argparse
15 |
16 | import numpy as np
17 | import tqdm
18 | import sklearn.model_selection
19 | import torch
20 | import scipy
21 | import torch.nn as nn
22 | import torch.nn.functional as F
23 | import torch.optim
24 | from torch.utils.tensorboard import SummaryWriter
25 | from torch.utils.data import Dataset, DataLoader
26 | from transformers import AutoModelForCausalLM
27 | import transformers
28 | from transformers import AutoTokenizer
29 | from transformers import TrainingArguments, Trainer, TrainerCallback
30 | from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config
31 | from transformers import HfArgumentParser
32 | from pynvml import *
33 |
34 |
35 | proj_dir = os.path.expanduser('~/polygon/text_generator')
36 |
37 |
38 | def print_gpu_utilization():
39 | nvmlInit()
40 | handle = nvmlDeviceGetHandleByIndex(0)
41 | info = nvmlDeviceGetMemoryInfo(handle)
42 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.")
43 |
44 |
45 | def load_samples(dataset_path, tokenizer):
46 | samples = []
47 | with open(dataset_path, 'r') as f:
48 | for line in f:
49 | sample = json.loads(line)
50 | try:
51 | input_text = '' + sample['prompt_text']
52 |
53 | # Variant that generates plain text
54 | output_text = sample['output_text']
55 | output_text = '\n'.join(output_text.split('\n')[:4])
56 |
57 | input_tokens = tokenizer.encode(input_text, add_special_tokens=False, truncation=True, max_length=512)
58 | output_tokens = tokenizer.encode(output_text, add_special_tokens=False)
59 | samples.append({'input_tokens': input_tokens, 'output_tokens': output_tokens,
60 | 'input_text': input_text, 'output_text': output_text})
61 |
62 | except Exception as ex:
63 | print(ex)
64 |
65 | return samples
66 |
67 |
68 | class FinetuneDataset(Dataset):
69 | def __init__(self, samples, tokenizer):
70 | self.tokenizer = tokenizer
71 | self.max_input_len = 0
72 | self.max_output_len = 0
73 | self.samples = []
74 |
75 | self.bos_token_id = tokenizer.encode('<s>', add_special_tokens=False)[0]
76 | self.eos_token_id = tokenizer.encode('</s>', add_special_tokens=False)[0]
77 | self.pad_token_id = tokenizer.encode('<pad>', add_special_tokens=False)[0]
78 |
79 | for sample in samples:
80 | input_ids = sample['input_tokens']
81 | output_ids = sample['output_tokens'] + [self.eos_token_id]
82 | self.samples.append((input_ids, output_ids))
83 | self.max_input_len = max(self.max_input_len, len(input_ids))
84 | self.max_output_len = max(self.max_output_len, len(output_ids))
85 |
86 | def __len__(self):
87 | return len(self.samples)
88 |
89 | def __getitem__(self, index: int):
90 | input_ids, output_ids = self.samples[index]
91 |
92 | input_npad = self.max_input_len - len(input_ids)
93 | attention_mask = [1]*len(input_ids) + [0]*input_npad
94 | input_ids = input_ids + input_npad * [self.pad_token_id]
95 |
96 | output_npad = self.max_output_len - len(output_ids)
97 | labels = output_ids + output_npad * [-100]
98 |
99 | return {'input_ids': torch.LongTensor(input_ids),
100 | 'attention_mask': attention_mask,
101 | 'labels': torch.LongTensor(labels),
102 | }
103 |
104 |
105 | @dataclass
106 | class ModelArguments:
107 | """
108 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
109 | """
110 | model_name_or_path: Optional[str] = field(
111 | default='ai-forever/FRED-T5-1.7B',
112 | metadata={"help": "The model checkpoint for weights initialization."},
113 | )
114 |
115 |
116 | @dataclass
117 | class DataTrainingArguments:
118 | """
119 | Arguments pertaining to what data we are going to input our model for training and eval.
120 | """
121 | dataset_path: Optional[str] = field(
122 | default=os.path.join(proj_dir, 'tmp', 'long_poems_gpt_dataset.jsonl'),
123 | metadata={"help": "Path to the poetry dataset"}
124 | )
125 |
126 |
127 | class MyPrinterCallback(TrainerCallback):
128 | def __init__(self, filepath):
129 | self.wrt = open(filepath, 'w')
130 |
131 | def on_log(self, args, state, control, logs=None, **kwargs):
132 | if state.is_local_process_zero:
133 | if 'epoch' in logs and 'loss' in logs:
134 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss']))
135 | self.wrt.flush()
136 |
137 |
138 | if __name__ == '__main__':
139 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
140 |
141 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
142 |
143 | if not training_args.optim:
144 | training_args.optim = "adafactor"
145 |
146 | if not training_args.output_dir:
147 | training_args.output_dir = os.path.join(proj_dir, 'tmp', 't5_poetry_generator')
148 |
149 | verbose = training_args.local_rank in (-1, 0)
150 |
151 | # Setup logging
152 | logging.basicConfig(
153 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
154 | datefmt="%m/%d/%Y %H:%M:%S",
155 | handlers=[logging.StreamHandler(sys.stdout)],
156 | )
157 |
158 | log_level = training_args.get_process_log_level()
159 | logger = logging.getLogger(__name__)
160 | logger.setLevel(log_level)
161 | #datasets.utils.logging.set_verbosity(log_level)
162 | transformers.utils.logging.set_verbosity(log_level)
163 | transformers.utils.logging.enable_default_handler()
164 | transformers.utils.logging.enable_explicit_format()
165 |
166 | logger.info(
167 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
168 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
169 | )
170 | logger.info(f"Training/evaluation parameters {training_args}")
171 |
172 | # Remove old tensorboard logs
173 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs')
174 | if os.path.exists(tensorboard_dir):
175 | logger.info('Removing "%s"', tensorboard_dir)
176 | shutil.rmtree(tensorboard_dir)
177 |
178 | device = training_args.device
179 | logging.info('device={}'.format(device))
180 |
181 | pretrained_model_name = model_args.model_name_or_path
182 |
183 | logger.info('Loading pretrained model "%s"', pretrained_model_name)
184 | if 'FRED-T5' in pretrained_model_name:
185 | tokenizer = transformers.GPT2Tokenizer.from_pretrained(pretrained_model_name)
186 | else:
187 | tokenizer = transformers.T5Tokenizer.from_pretrained(pretrained_model_name)
188 |
189 | tokenizer.add_special_tokens({'bos_token': '<s>', 'eos_token': '</s>', 'pad_token': '<pad>'})
190 |
191 | model = transformers.T5ForConditionalGeneration.from_pretrained(pretrained_model_name)
192 | model.to(device)
193 | print_gpu_utilization()
194 |
195 | logger.info('\nLoading dataset "%s"...', data_args.dataset_path)
196 | train_samples = load_samples(data_args.dataset_path, tokenizer)
197 | logger.info('Train samples: %d', len(train_samples))
198 |
199 | train_dataset = FinetuneDataset(train_samples, tokenizer)
200 | # test_dataset = FinetuneDataset(test_samples, tokenizer)
201 |
202 | printer = MyPrinterCallback(os.path.join(proj_dir, 'tmp', 'finetune_fredt5_poetry_generator.loss.log'))
203 |
204 | trainer = Trainer(
205 | model=model,
206 | args=training_args,
207 | train_dataset=train_dataset,
208 | # eval_dataset=test_dataset,
209 | tokenizer=tokenizer,
210 | data_collator=None,
211 | # compute_metrics=compute_metrics,
212 | callbacks=[printer] #[EarlyStoppingCallback(early_stopping_patience=5)]
213 | )
214 |
215 | logger.info('Start training...')
216 | train_result = trainer.train()
217 |
218 | logger.info(f'Saving the model and tokenizer')
219 | trainer.save_model(output_dir=training_args.output_dir)
220 |
221 | metrics = train_result.metrics
222 | trainer.log_metrics("train", metrics)
223 | trainer.save_metrics("train", metrics)
224 |
225 | logger.info('All done :)')
226 |
--------------------------------------------------------------------------------
/chitchat/finetune_chitchat_gpt_with_trainer.py:
--------------------------------------------------------------------------------
1 | """
2 | Training of the Axioma chitchat model.
3 | Fine-tuning experiment: dialogue-history tokens are excluded from backprop by setting their targets (labels) to -100
4 | """
5 | import logging
6 | import os
7 | import json
8 | import sys
9 | import io
10 | import random
11 | import itertools
12 | import shutil
13 | from dataclasses import dataclass, field
14 | from typing import Any, Dict, List, Optional, Tuple, Union
15 |
16 | import numpy as np
17 | import tqdm
18 | import sklearn.model_selection
19 | import torch
20 | import scipy
21 | import torch.nn as nn
22 | import torch.nn.functional as F
23 | import torch.optim as optim
24 | from torch.utils.tensorboard import SummaryWriter
25 | from torch.utils.data import Dataset, DataLoader
26 | from transformers import AutoModelForCausalLM
27 | import transformers
28 | from transformers import AutoTokenizer
29 | from transformers import TrainingArguments, Trainer, TrainerCallback
30 | from transformers import HfArgumentParser
31 | from pynvml import *
32 |
33 |
34 | proj_dir = os.path.expanduser('~/polygon/chatbot')
35 |
36 |
37 | def print_gpu_utilization():
38 | nvmlInit()
39 | handle = nvmlDeviceGetHandleByIndex(0)
40 | info = nvmlDeviceGetMemoryInfo(handle)
41 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.")
42 |
43 |
44 | def load_samples(dataset_path, tokenizer):
45 | samples = []
46 | with open(dataset_path, 'r') as f:
47 | data = json.load(f)
48 | for sample in tqdm.tqdm(data, desc='Loading samples', total=len(data)):
49 | try:
50 | lines = []
51 | for i, msg in enumerate(sample):
52 | if 0 == (i % 2):
53 | lines.append('человек: ' + msg)
54 | else:
55 | lines.append('чатбот: ' + msg)
56 |
57 | text = '\n'.join(lines)
58 | tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False)
59 | if len(tokens) < 512:
60 | samples.append({'tokens': tokens, 'text': text})
61 | else:
62 | # Dialogue is too long: drop trailing turns until the encoded text fits into 512 tokens.
63 |
64 | lines = lines[:-1]
65 | while len(lines) > 1:
66 | text = '\n'.join(lines)
67 | tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False)
68 | if len(tokens) < 512:
69 | samples.append({'tokens': tokens, 'text': text})
70 | break
71 | else:
72 | lines = lines[:-1]
73 |
74 |
75 | except Exception as ex:
76 | print(ex)
77 |
78 | return samples
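# For reference, the 'text' field of a loaded sample looks like (placeholder messages):
#   человек: <first user message>
#   чатбот: <bot reply>
#   ...
# i.e. alternating turns joined with '\n'; dialogues whose encoding reaches 512 tokens
# are trimmed from the end until they fit (or only one turn remains).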
79 |
80 |
81 | class FinetuneDataset(Dataset):
82 | def __init__(self, samples, tokenizer):
83 | self.tokenizer = tokenizer
84 | self.max_len = 0
85 | self.samples = []
86 |
87 | self.bos_token_id = tokenizer.encode('', add_special_tokens=False)[0]
88 | self.eos_token_id = tokenizer.encode('', add_special_tokens=False)[0]
89 | self.pad_token_id = tokenizer.encode('', add_special_tokens=False)[0]
90 |
91 | for sample in samples:
92 | input_ids = [self.bos_token_id] + sample['tokens'] + [self.eos_token_id]
93 | labels = input_ids
94 | attention_map = [1] * len(labels)
95 | self.samples.append((input_ids, labels, attention_map))
96 | self.max_len = max(self.max_len, len(input_ids))
97 |
98 | def __len__(self):
99 | return len(self.samples)
100 |
101 | def __getitem__(self, index: int):
102 | input_ids, labels, attention_map = self.samples[index]
103 | npad = self.max_len - len(input_ids)
104 | input_ids = input_ids + npad * [self.pad_token_id]
105 | labels = labels + [-100] * npad
106 | attention_mask = attention_map + [0] * npad
107 | return {'input_ids': torch.LongTensor(input_ids),
108 | 'labels': torch.LongTensor(labels),
109 | 'attention_mask': torch.LongTensor(attention_mask)}
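# Worked example (illustrative numbers): with max_len=6 and a 4-token sample
# [bos, t1, t2, eos], __getitem__ yields
#   input_ids      = [bos, t1, t2, eos, pad, pad]
#   labels         = [bos, t1, t2, eos, -100, -100]
#   attention_mask = [1,   1,  1,  1,   0,    0]
# so padding positions are ignored both by the loss (-100) and by attention.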
110 |
111 |
112 | @dataclass
113 | class ModelArguments:
114 | """
115 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
116 | """
117 | model_name_or_path: Optional[str] = field(
118 | default='sberbank-ai/rugpt3medium_based_on_gpt2',
119 | metadata={"help": "The model checkpoint for weights initialization."},
120 | )
121 |
122 |
123 | @dataclass
124 | class DataTrainingArguments:
125 | """
126 | Arguments pertaining to what data we are going to input our model for training and eval.
127 | """
128 | dataset_path: Optional[str] = field(
129 | default=os.path.join(proj_dir, 'tmp', 'axioma_dialogues.solid.json'),
130 | metadata={"help": "Path to the dialogue dataset"}
131 | )
132 |
133 |
134 | if __name__ == '__main__':
135 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
136 |
137 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
138 |
139 | if not training_args.output_dir:
140 | training_args.output_dir = os.path.join(proj_dir, 'tmp', 'rugpt_chitchat')
141 |
142 | rank0 = training_args.local_rank in (-1, 0)
143 |
144 | # Setup logging
145 | logging.basicConfig(
146 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
147 | datefmt="%m/%d/%Y %H:%M:%S",
148 | handlers=[logging.StreamHandler(sys.stdout)],
149 | )
150 |
151 | log_level = training_args.get_process_log_level()
152 | logger = logging.getLogger(__name__)
153 | logger.setLevel(log_level)
154 | #datasets.utils.logging.set_verbosity(log_level)
155 | transformers.utils.logging.set_verbosity(log_level)
156 | transformers.utils.logging.enable_default_handler()
157 | transformers.utils.logging.enable_explicit_format()
158 |
159 | logger.info(
160 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
161 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
162 | )
163 | logger.info(f"Training/evaluation parameters {training_args}")
164 |
165 | # Remove old tensorboard logs
166 | if rank0:
167 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs')
168 | if os.path.exists(tensorboard_dir):
169 | logger.info('Removing "%s"', tensorboard_dir)
170 | shutil.rmtree(tensorboard_dir)
171 |
172 | pretrained_model_name = model_args.model_name_or_path
173 |
174 | print('Loading pretrained model "{}"...'.format(pretrained_model_name))
175 | if 'xglm' in pretrained_model_name.lower():
176 | tokenizer = transformers.XGLMTokenizer.from_pretrained(pretrained_model_name)
177 | model = transformers.XGLMForCausalLM.from_pretrained(pretrained_model_name)
178 | elif 'bloom' in pretrained_model_name:
179 | tokenizer = transformers.BloomTokenizer.from_pretrained(pretrained_model_name)
180 | model = transformers.BloomForCausalLM.from_pretrained(pretrained_model_name)
181 | else:
182 | tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model_name)
183 | model = transformers.AutoModelForCausalLM.from_pretrained(pretrained_model_name)
184 |
185 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''})
186 |
187 | if rank0:
188 | print_gpu_utilization()
189 |
190 | print('\nTokenizer:')
191 | for token in ' '.split():
192 | print('token "{}" id={}'.format(token, tokenizer.encode(token, add_special_tokens=False)))
193 |
194 | print('\nLoading dataset...')
195 | train_samples = load_samples(data_args.dataset_path, tokenizer)
196 | print('Train samples: {}'.format(len(train_samples)))
197 |
198 | train_dataset = FinetuneDataset(train_samples, tokenizer)
199 |
200 | trainer = Trainer(
201 | model=model,
202 | args=training_args,
203 | train_dataset=train_dataset,
204 | tokenizer=tokenizer,
205 | data_collator=None,
206 | # compute_metrics=compute_metrics,
207 | # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
208 | )
209 |
210 | logger.info('Start training...')
211 | try:
212 | train_result = trainer.train()
213 |
214 | metrics = train_result.metrics
215 | trainer.log_metrics("train", metrics)
216 | trainer.save_metrics("train", metrics)
217 | except KeyboardInterrupt as ex:
218 | print('!!! CTRL+C !!!')
219 |
220 | logger.info('Saving the model and tokenizer to "%s"', training_args.output_dir)
221 | trainer.save_model(output_dir=training_args.output_dir)
222 | #model.save_pretrained(training_args.output_dir)
223 | #tokenizer.save_pretrained(training_args.output_dir)
224 |
225 | logger.info('All done :)')
226 |
--------------------------------------------------------------------------------
/poetry/finetune_rugpt13b.py:
--------------------------------------------------------------------------------
1 | """
2 | Training a syllabo-tonic poetry generation model on top of rugpt13B using deepspeed.
3 | """
4 | import glob
5 | import logging
6 | import os
7 | import json, sys  # sys is needed for logging.StreamHandler(sys.stdout) below
8 | from dataclasses import dataclass, field
9 | from typing import Any, Dict, List, Optional, Tuple, Union
10 | import shutil
11 |
12 | from torch.utils.data import Dataset, DataLoader
13 | from transformers import AutoModelForCausalLM
14 | import transformers
15 | from transformers import AutoTokenizer
16 | from transformers import TrainingArguments, Trainer, TrainerCallback
17 | from transformers import HfArgumentParser
18 | import deepspeed
19 | from pynvml import *
20 |
21 |
22 | proj_dir = os.path.expanduser('~/polygon/text_generator')
23 |
24 |
25 | def print_gpu_utilization():
26 | nvmlInit()
27 | handle = nvmlDeviceGetHandleByIndex(0)
28 | info = nvmlDeviceGetMemoryInfo(handle)
29 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.")
30 |
31 |
32 | def pad_sequence(sequence, pad_id, max_len):
33 | l = len(sequence)
34 | if l < max_len:
35 | return sequence + [pad_id] * (max_len - l)
36 | else:
37 | return sequence
38 |
39 |
40 | def load_samples(data_args, tokenizer):
41 | samples = []
42 | with open(data_args.dataset_path, 'r') as f:
43 | for sample_str in f:
44 | sample = json.loads(sample_str)
45 | prompt = sample['prompt_text']
46 | if prompt:
47 | if data_args.output_syllables:
48 | # Variant that generates a chain of syllables
49 | lines = []
50 | for line in sample['output'].split(''):
51 | line = line.strip()
52 | tokens = line.split(' ')
53 | tokens = tokens[::-1]
54 | line = ' '.join(tokens)
55 | line = line.replace(' | ', '|')
56 | line = line.replace(' ', '\u2010')
57 | line = line.replace('|', ' ')
58 | lines.append(line)
59 | output_text = '\n'.join(lines)
60 | else:
61 | # Plain text is generated.
62 | output_text = sample['output_text']
63 |
64 | # 29.04.2023: limit to the first 2 quatrains
65 | output_text = '\n\n'.join(output_text.split('\n\n')[:2])
66 |
67 | input_tokens = tokenizer.encode(prompt, add_special_tokens=False)
68 | output_tokens = tokenizer.encode(output_text, add_special_tokens=False)
69 | samples.append((input_tokens, output_tokens, prompt, output_text))
70 |
71 | if data_args.max_samples > 0 and len(samples) >= data_args.max_samples:
72 | break
73 |
74 | return samples
75 |
76 |
77 | class FinetuneDataset(Dataset):
78 | def __init__(self, samples, tokenizer):
79 | self.tokenizer = tokenizer
80 | self.max_len = 0
81 | self.samples = []
82 |
83 | self.bos_token_id = tokenizer.bos_token_id
84 | self.eos_token_id = tokenizer.eos_token_id
85 | assert(len(tokenizer.encode('#', add_special_tokens=False)) == 1)
86 | self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0]
87 | self.pad_token_id = tokenizer.pad_token_id
88 |
89 | for src_ids, output_ids, src_text, output_text in samples:
90 | input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id]
91 |
92 | # Prompt tokens get label=-100
93 | labels = [-100] + [-100]*len(src_ids) + [-100] + output_ids + [self.eos_token_id]
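# Resulting alignment (illustration):
#   input_ids: [bos]  [prompt tokens ...] [#]    [output tokens ...] [eos]
#   labels:    [-100] [-100 ...         ] [-100] [output tokens ...] [eos]
# so the loss is computed only on the generated poem and the closing eos.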
94 |
95 | attention_map = [1] * len(labels)
96 |
97 | self.samples.append((input_ids, labels, attention_map))
98 | self.max_len = max(self.max_len, len(input_ids))
99 |
100 | def __len__(self):
101 | return len(self.samples)
102 |
103 | def __getitem__(self, index: int):
104 | input_ids, labels, attention_map = self.samples[index]
105 | npad = self.max_len - len(input_ids)
106 | input_ids = input_ids + npad*[self.pad_token_id]
107 | labels = labels + [-100] * npad
108 | attention_mask = attention_map + [0] * npad
109 | return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask}
110 |
111 |
112 | @dataclass
113 | class ModelArguments:
114 | """
115 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
116 | """
117 | model_name_or_path: Optional[str] = field(
118 | default='ai-forever/rugpt13b',
119 | metadata={"help": "The model checkpoint for weights initialization."},
120 | )
121 |
122 |
123 | @dataclass
124 | class DataSetArguments:
125 | output_syllables: Optional[bool] = field(
126 | metadata={"help": "Syllabo-tonic representation of the output text (true) or plain text (false)"}
127 | )
128 |
129 | dataset_path: Optional[str] = field(
130 | default=os.path.join(proj_dir, 'tmp', 'лирика.jsonl'),
131 | metadata={"help": "Path to the dataset with poems"}
132 | )
133 |
134 | max_samples: Optional[int] = field(
135 | default=-1,
136 | metadata={"help": "Maximum number of samples to read from the dataset"}
137 | )
138 |
139 |
140 | class MyPrinterCallback(TrainerCallback):
141 | def __init__(self, filepath):
142 | self.wrt = open(filepath, 'w')
143 |
144 | def on_log(self, args, state, control, logs=None, **kwargs):
145 | if state.is_local_process_zero:
146 | if 'epoch' in logs and 'loss' in logs:
147 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss']))
148 | self.wrt.flush()
149 |
150 |
151 | if __name__ == '__main__':
152 | parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments))
153 |
154 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
155 |
156 | if not training_args.output_dir:
157 | training_args.output_dir = os.path.join(proj_dir, 'tmp', 'verses_rugpt13b_lora')
158 |
159 | # Setup logging
160 | logging.basicConfig(
161 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
162 | datefmt="%m/%d/%Y %H:%M:%S",
163 | handlers=[logging.StreamHandler(sys.stdout)],
164 | )
165 |
166 | log_level = training_args.get_process_log_level()
167 | logger = logging.getLogger(__name__)
168 | logger.setLevel(log_level)
169 | #datasets.utils.logging.set_verbosity(log_level)
170 | transformers.utils.logging.set_verbosity(log_level)
171 | transformers.utils.logging.enable_default_handler()
172 | transformers.utils.logging.enable_explicit_format()
173 |
174 | logger.info(
175 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
176 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
177 | )
178 | logger.info(f"Training/evaluation parameters {training_args}")
179 |
180 | # Remove old tensorboard logs
181 | if training_args.local_rank in (-1, 0):
182 | for f in glob.glob(training_args.output_dir+'/*'):
183 | if os.path.isfile(f):
184 | os.remove(f)
185 |
186 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs')
187 | if os.path.exists(tensorboard_dir):
188 | logger.info('Removing "%s"', tensorboard_dir)
189 | shutil.rmtree(tensorboard_dir)
190 |
191 | #device = training_args.device
192 | #logging.info('device={}'.format(device))
193 |
194 | logger.info('Loading tokenizer "%s"', model_args.model_name_or_path)
195 |
196 | tokenizer = transformers.AutoTokenizer.from_pretrained(model_args.model_name_or_path)
197 |
198 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''})
199 | tokenizer.save_pretrained(training_args.output_dir)
200 |
201 | for t in ['#', '', '', '']:
202 | logger.debug('Tokenizer: token=%s ==> %s', t, str(tokenizer.encode(t, add_special_tokens=False)))
203 |
204 | logger.info('Loading pretrained model "%s"', model_args.model_name_or_path)
205 | model = transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path)
206 | # model.half()
207 |
208 | if training_args.local_rank in (0, -1):
209 | print('=' * 80)
210 | print_gpu_utilization()
211 |
212 | print('=' * 30 + 'ZeRo 2' + '=' * 30)
213 | deepspeed.runtime.zero.stage_1_and_2.estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=1,
214 | num_nodes=1,
215 | additional_buffer_factor=1.5)
216 | print('=' * 80)
217 |
218 |
219 | logger.info('Loading dataset "%s"', data_args.dataset_path)
220 | train_samples = load_samples(data_args, tokenizer)
221 | logger.info('Training set: %d samples', len(train_samples))
222 |
223 | train_dataset = FinetuneDataset(train_samples, tokenizer)
224 |
225 | printer = MyPrinterCallback(os.path.join(training_args.output_dir, 'finetune_rugpt13b.loss.log'))
226 | trainer = Trainer(
227 | model=model,
228 | args=training_args,
229 | train_dataset=train_dataset,
230 | tokenizer=tokenizer,
231 | data_collator=None,
232 | callbacks=[printer]
233 | )
234 |
235 | logger.info('Start training...')
236 | train_result = trainer.train()
237 |
238 | logger.info(f'Saving the model and tokenizer')
239 | trainer.save_model(training_args.output_dir)
240 |
241 | metrics = train_result.metrics
242 | trainer.log_metrics("train", metrics)
243 | trainer.save_metrics("train", metrics)
244 |
245 | logger.info('All done :)')
246 |
--------------------------------------------------------------------------------
/poetry/finetune_rugpt13b_lora.py:
--------------------------------------------------------------------------------
1 | """
2 | Training a syllabo-tonic poetry generation model on top of rugpt13B using LoRa.
3 | """
4 | import glob
5 | import logging
6 | import os
7 | import json
8 | import io
9 | import random
10 | import itertools
11 | import sys
12 | from typing import Any, Dict, List, Optional, Tuple, Union
13 | from dataclasses import dataclass, field
14 | from typing import Any, Dict, List, Optional, Tuple, Union
15 | import shutil
16 | from pathlib import Path
17 |
18 | import numpy as np
19 | import tqdm
20 | import sklearn.model_selection
21 | import torch
22 | import scipy
23 | import torch.nn as nn
24 | import torch.nn.functional as F
25 | import torch.optim as optim
26 | from torch.utils.tensorboard import SummaryWriter
27 | from torch.utils.data import Dataset, DataLoader
28 | from transformers import AutoModelForCausalLM
29 | import transformers
30 | from transformers import AutoTokenizer
31 | from transformers import TrainingArguments, Trainer, TrainerCallback
32 | from transformers import HfArgumentParser
33 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training
34 | from pynvml import *
35 |
36 |
37 | proj_dir = os.path.expanduser('~/polygon/text_generator')
38 |
39 |
40 | def print_gpu_utilization():
41 | nvmlInit()
42 | handle = nvmlDeviceGetHandleByIndex(0)
43 | info = nvmlDeviceGetMemoryInfo(handle)
44 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.")
45 |
46 |
47 | def pad_sequence(sequence, pad_id, max_len):
48 | l = len(sequence)
49 | if l < max_len:
50 | return sequence + [pad_id] * (max_len - l)
51 | else:
52 | return sequence
53 |
54 |
55 | def load_samples(data_args, tokenizer):
56 | samples = []
57 | with open(data_args.dataset_path, 'r') as f:
58 | for sample_str in f:
59 | sample = json.loads(sample_str)
60 | prompt = sample['prompt_text']
61 | if prompt:
62 | if data_args.output_syllables:
63 | # Variant that generates a chain of syllables
64 | lines = []
65 | for line in sample['output'].split(''):
66 | line = line.strip()
67 | tokens = line.split(' ')
68 | tokens = tokens[::-1]
69 | line = ' '.join(tokens)
70 | line = line.replace(' | ', '|')
71 | line = line.replace(' ', '\u2010')
72 | line = line.replace('|', ' ')
73 | lines.append(line)
74 | output_text = '\n'.join(lines)
75 | else:
76 | # Plain text is generated.
77 | output_text = sample['output_text']
78 |
79 | # 29.04.2023: limit to the first 2 quatrains
80 | output_text = '\n\n'.join(output_text.split('\n\n')[:2])
81 |
82 | input_tokens = tokenizer.encode(prompt, add_special_tokens=False)
83 | output_tokens = tokenizer.encode(output_text, add_special_tokens=False)
84 | samples.append((input_tokens, output_tokens, prompt, output_text))
85 |
86 | if data_args.max_samples > 0 and len(samples) >= data_args.max_samples:
87 | break
88 |
89 | return samples
90 |
91 |
92 | class FinetuneDataset(Dataset):
93 | def __init__(self, samples, tokenizer):
94 | self.tokenizer = tokenizer
95 | self.max_len = 0
96 | self.samples = []
97 |
98 | self.bos_token_id = tokenizer.bos_token_id
99 | self.eos_token_id = tokenizer.eos_token_id
100 | assert(len(tokenizer.encode('#', add_special_tokens=False)) == 1)
101 | self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0]
102 | self.pad_token_id = tokenizer.pad_token_id
103 |
104 | for src_ids, output_ids, src_text, output_text in samples:
105 | input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id]
106 |
107 | # Prompt tokens get label=-100
108 | labels = [-100] + [-100]*len(src_ids) + [-100] + output_ids + [self.eos_token_id]
109 |
110 | attention_map = [1] * len(labels)
111 |
112 | self.samples.append((input_ids, labels, attention_map))
113 | self.max_len = max(self.max_len, len(input_ids))
114 |
115 | def __len__(self):
116 | return len(self.samples)
117 |
118 | def __getitem__(self, index: int):
119 | input_ids, labels, attention_map = self.samples[index]
120 | npad = self.max_len - len(input_ids)
121 | input_ids = input_ids + npad*[self.pad_token_id]
122 | labels = labels + [-100] * npad
123 | attention_mask = attention_map + [0] * npad
124 | return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask}
125 |
126 |
127 | @dataclass
128 | class ModelArguments:
129 | """
130 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
131 | """
132 | model_name_or_path: Optional[str] = field(
133 | default='ai-forever/rugpt13b',
134 | metadata={"help": "The model checkpoint for weights initialization."},
135 | )
136 |
137 |
138 | @dataclass
139 | class DataSetArguments:
140 | output_syllables: Optional[bool] = field(
141 | metadata={"help": "Syllabo-tonic representation of the output text (true) or plain text (false)"}
142 | )
143 |
144 | dataset_path: Optional[str] = field(
145 | default=os.path.join(proj_dir, 'tmp', 'лирика.jsonl'),
146 | metadata={"help": "Path to the dataset with poems"}
147 | )
148 |
149 | max_samples: Optional[int] = field(
150 | default=-1,
151 | metadata={"help": "Maximum number of samples to read from the dataset"}
152 | )
153 |
154 |
155 | class MyPrinterCallback(TrainerCallback):
156 | def __init__(self, filepath):
157 | self.wrt = open(filepath, 'w')
158 |
159 | def on_log(self, args, state, control, logs=None, **kwargs):
160 | if state.is_local_process_zero:
161 | if 'epoch' in logs and 'loss' in logs:
162 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss']))
163 | self.wrt.flush()
164 |
165 |
166 | if __name__ == '__main__':
167 | parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments))
168 |
169 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
170 |
171 | if not training_args.output_dir:
172 | training_args.output_dir = os.path.join(proj_dir, 'tmp', 'verses_rugpt13b_lora')
173 |
174 | # Setup logging
175 | logging.basicConfig(
176 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
177 | datefmt="%m/%d/%Y %H:%M:%S",
178 | handlers=[logging.StreamHandler(sys.stdout)],
179 | )
180 |
181 | log_level = training_args.get_process_log_level()
182 | logger = logging.getLogger(__name__)
183 | logger.setLevel(log_level)
184 | #datasets.utils.logging.set_verbosity(log_level)
185 | transformers.utils.logging.set_verbosity(log_level)
186 | transformers.utils.logging.enable_default_handler()
187 | transformers.utils.logging.enable_explicit_format()
188 |
189 | logger.info(
190 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
191 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
192 | )
193 | logger.info(f"Training/evaluation parameters {training_args}")
194 |
195 | # Remove old tensorboard logs
196 | if training_args.local_rank in (-1, 0):
197 | for f in glob.glob(training_args.output_dir+'/*'):
198 | if os.path.isfile(f):
199 | os.remove(f)
200 |
201 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs')
202 | if os.path.exists(tensorboard_dir):
203 | logger.info('Removing "%s"', tensorboard_dir)
204 | shutil.rmtree(tensorboard_dir)
205 |
206 | device = training_args.device
207 | logging.info('device={}'.format(device))
208 |
209 | logger.info('Loading tokenizer "%s"', model_args.model_name_or_path)
210 |
211 | tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_args.model_name_or_path)
212 |
213 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''})
214 | tokenizer.save_pretrained(training_args.output_dir)
215 |
216 | for t in ['#', '', '', '']:
217 | logger.debug('Tokenizer: token=%s ==> %s', t, str(tokenizer.encode(t, add_special_tokens=False)))
218 |
219 | logger.info('Loading pretrained model "%s"', model_args.model_name_or_path)
220 | model = transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path)
221 |
222 | logger.info('Wrapping the model with peft...')
223 | lora_config = LoraConfig(**{
224 | "r": 8,
225 | "lora_alpha": 16,
226 | "lora_dropout": 0.10,
227 | "bias": "none",
228 | #"target_modules": ["q_proj", "v_proj"],
229 | "task_type": "CAUSAL_LM"
230 | })
231 | model = get_peft_model(model, lora_config)
232 | model.print_trainable_parameters()
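# Note: save_pretrained() on the peft-wrapped model stores only the LoRA adapter weights.
# A minimal inference-time sketch (assumed workflow, not taken from this repository):
#   from peft import PeftModel
#   base = transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path)
#   model = PeftModel.from_pretrained(base, training_args.output_dir)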
233 |
234 | logger.info('Loading dataset "%s"', data_args.dataset_path)
235 | train_samples = load_samples(data_args, tokenizer)
236 | logger.info('Training set: %d samples', len(train_samples))
237 |
238 | train_dataset = FinetuneDataset(train_samples, tokenizer)
239 |
240 | printer = MyPrinterCallback(os.path.join(proj_dir, 'tmp', 'finetune_rugpt13b_lora.loss.log'))
241 | trainer = Trainer(
242 | model=model,
243 | args=training_args,
244 | train_dataset=train_dataset,
245 | tokenizer=tokenizer,
246 | data_collator=None,
247 | callbacks=[printer]
248 | )
249 |
250 | logger.info('Start training for local_rank=%d...', training_args.local_rank)
251 | train_result = trainer.train()
252 |
253 | if training_args.local_rank in (0, -1):
254 | logger.info(f'Saving the model and tokenizer')
255 | model.save_pretrained(save_directory=training_args.output_dir)
256 |
257 | metrics = train_result.metrics
258 | trainer.log_metrics("train", metrics)
259 | trainer.save_metrics("train", metrics)
260 |
261 | logger.info('All done :)')
262 |
--------------------------------------------------------------------------------
/poetry/finetune_llama.py:
--------------------------------------------------------------------------------
1 | """
2 | Experiment with fine-tuning a LLaMa model on poems.
3 | """
4 |
5 | import glob
6 | import logging
7 | import os
8 | import json
9 | import io
10 | import random
11 | import itertools
12 | import sys
13 | from typing import Any, Dict, List, Optional, Tuple, Union
14 | from dataclasses import dataclass, field
15 | from typing import Any, Dict, List, Optional, Tuple, Union
16 | import shutil
17 | from pathlib import Path
18 |
19 | import numpy as np
20 | import tqdm
21 | import sklearn.model_selection
22 | import torch
23 | import scipy
24 | import torch.nn as nn
25 | import torch.nn.functional as F
26 | import torch.optim as optim
27 | from torch.utils.tensorboard import SummaryWriter
28 | from torch.utils.data import Dataset, DataLoader
29 | from transformers import AutoModelForCausalLM
30 | import transformers
31 | from transformers import AutoTokenizer
32 | from transformers import TrainingArguments, Trainer, TrainerCallback
33 | from transformers import HfArgumentParser
34 | from pynvml import *
35 | import deepspeed
36 |
37 | proj_dir = os.path.expanduser('~/polygon/text_generator')
38 |
39 |
40 | def print_gpu_utilization():
41 | nvmlInit()
42 | handle = nvmlDeviceGetHandleByIndex(0)
43 | info = nvmlDeviceGetMemoryInfo(handle)
44 | # logger.info(f"GPU memory occupied: {info.used//1024**2} MB.")
45 | print(f"GPU memory occupied: {info.used // 1024 ** 2} MB.")
46 |
47 |
48 | def pad_sequence(sequence, pad_id, max_len):
49 | l = len(sequence)
50 | if l < max_len:
51 | return sequence + [pad_id] * (max_len - l)
52 | else:
53 | return sequence
54 |
55 |
56 | def load_samples(data_args, tokenizer):
57 | samples = []
58 | with open(data_args.dataset_path, 'r') as f:
59 | for sample_str in f:
60 | sample = json.loads(sample_str)
61 | prompt = sample['prompt_text']
62 | if prompt:
63 | if data_args.output_syllables:
64 | # Variant that generates a chain of syllables
65 | lines = []
66 | for line in sample['output'].split(''):
67 | line = line.strip()
68 | tokens = line.split(' ')
69 | tokens = tokens[::-1]
70 | line = ' '.join(tokens)
71 | line = line.replace(' | ', '|')
72 | line = line.replace(' ', '\u2010')
73 | line = line.replace('|', ' ')
74 | lines.append(line)
75 | output_text = '\n'.join(lines)
76 | else:
77 | output_text = sample['output_text']
78 |
79 | # 29.04.2023: limit to the first 2 quatrains
80 | output_text = '\n\n'.join(output_text.split('\n\n')[:2])
81 |
82 | input_tokens = tokenizer.encode(prompt, add_special_tokens=False)
83 | output_tokens = tokenizer.encode(output_text, add_special_tokens=False)
84 | samples.append((input_tokens, output_tokens, prompt, output_text))
85 |
86 | if data_args.max_samples > 0 and len(samples) >= data_args.max_samples:
87 | break
88 |
89 | return samples
90 |
91 |
92 | class FinetuneDataset(Dataset):
93 | def __init__(self, samples, tokenizer):
94 | self.tokenizer = tokenizer
95 | self.max_len = 0
96 | self.samples = []
97 |
98 | self.bos_token_id = tokenizer.bos_token_id
99 | self.eos_token_id = tokenizer.eos_token_id
100 | assert (len(tokenizer.encode('#', add_special_tokens=False)) == 1)
101 | self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0]
102 | self.pad_token_id = tokenizer.pad_token_id
103 |
104 | for src_ids, output_ids, src_text, output_text in samples:
105 | input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id]
106 |
107 | # Prompt tokens get label=-100
108 | labels = [-100] + [-100] * len(src_ids) + [-100] + output_ids + [self.eos_token_id]
109 |
110 | attention_map = [1] * len(labels)
111 |
112 | self.samples.append((input_ids, labels, attention_map))
113 | self.max_len = max(self.max_len, len(input_ids))
114 |
115 | def __len__(self):
116 | return len(self.samples)
117 |
118 | def __getitem__(self, index: int):
119 | input_ids, labels, attention_map = self.samples[index]
120 | npad = self.max_len - len(input_ids)
121 | input_ids = input_ids + npad * [self.pad_token_id]
122 | labels = labels + [-100] * npad
123 | attention_mask = attention_map + [0] * npad
124 | return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask}
125 |
126 |
127 | @dataclass
128 | class ModelArguments:
129 | """
130 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
131 | """
132 | model_name_or_path: Optional[str] = field(
133 | default='decapoda-research/llama-7b-hf',
134 | metadata={"help": "The model checkpoint for weights initialization."},
135 | )
136 |
137 | load_in_8bit: Optional[bool] = field(
138 | default=False,
139 | metadata={"help": "Load model in 8-bit"},
140 | )
141 |
142 |
143 | @dataclass
144 | class DataSetArguments:
145 | """
146 | Arguments pertaining to what data we are going to input our model for training and eval.
147 | """
148 | dataset_path: Optional[str] = field(
149 | default=os.path.join(proj_dir, 'tmp', 'лирика.jsonl'),
150 | metadata={"help": "Path to the dataset with poems"}
151 | )
152 |
153 | output_syllables: Optional[bool] = field(
154 | default=False,
155 | metadata={"help": "Syllabo-tonic representation of the output text"}
156 | )
157 |
158 | max_samples: Optional[int] = field(
159 | default=-1,
160 | metadata={"help": "Maximum number of samples to read from the dataset"}
161 | )
162 |
163 |
164 | class MyPrinterCallback(TrainerCallback):
165 | def __init__(self, filepath):
166 | self.wrt = open(filepath, 'w')
167 |
168 | def on_log(self, args, state, control, logs=None, **kwargs):
169 | if state.is_local_process_zero:
170 | if 'epoch' in logs and 'loss' in logs:
171 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss']))
172 | self.wrt.flush()
173 |
174 |
175 | if __name__ == '__main__':
176 | parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments))
177 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
178 |
179 | # Setup logging
180 | logging.basicConfig(
181 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
182 | datefmt="%d.%m.%Y %H:%M:%S",
183 | handlers=[logging.StreamHandler(sys.stdout)],
184 | )
185 |
186 | log_level = training_args.get_process_log_level()
187 | logger = logging.getLogger(__name__)
188 | logger.setLevel(log_level)
189 | # datasets.utils.logging.set_verbosity(log_level)
190 | transformers.utils.logging.set_verbosity(log_level)
191 | transformers.utils.logging.enable_default_handler()
192 | transformers.utils.logging.enable_explicit_format()
193 |
194 | logger.info(
195 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
196 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
197 | )
198 | logger.info(f"Training/evaluation parameters {training_args}")
199 |
200 | # Remove old tensorboard logs
201 | if training_args.local_rank in (-1, 0):
202 | for f in glob.glob(training_args.output_dir + '/*'):
203 | if os.path.isfile(f):
204 | os.remove(f)
205 |
206 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs')
207 | if os.path.exists(tensorboard_dir):
208 | logger.info('Removing "%s"', tensorboard_dir)
209 | shutil.rmtree(tensorboard_dir)
210 |
211 | logger.info('Loading pretrained model "%s"', model_args.model_name_or_path)
212 | model = transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path,
213 | # load_in_8bit=model_args.load_in_8bit,
214 | # device_map="auto"
215 | )
216 | #model.half()
217 | #model.to(device)
218 |
219 | if training_args.local_rank in (0, -1):
220 | print('=' * 80)
221 | print_gpu_utilization()
222 |
223 | if training_args.deepspeed:
224 | print('=' * 30 + 'ZeRo 2' + '=' * 30)
225 | deepspeed.runtime.zero.stage_1_and_2.estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=1,
226 | num_nodes=1,
227 | additional_buffer_factor=1.5)
228 | print('=' * 80)
229 |
230 | # ------------------------- TOKENIZER ----------------------------------
231 | logger.info('Loading tokenizer "%s"', model_args.model_name_or_path)
232 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_args.model_name_or_path)
233 |
234 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''})
235 |
236 | for t in ['#', '', '', '']:
237 | logger.debug('Tokenizer: token=%s ==> %s', t, str(tokenizer.encode(t, add_special_tokens=False)))
238 |
239 | tokenizer.save_pretrained(training_args.output_dir)
240 |
241 | logger.info('Loading dataset "%s"', data_args.dataset_path)
242 | train_samples = load_samples(data_args, tokenizer)
243 | logger.info('Training set: %d samples', len(train_samples))
244 |
245 | train_dataset = FinetuneDataset(train_samples, tokenizer)
246 |
247 | printer = MyPrinterCallback(os.path.join(proj_dir, 'tmp', 'finetune_llama.loss.log'))
248 | trainer = Trainer(
249 | model=model,
250 | args=training_args,
251 | train_dataset=train_dataset,
252 | tokenizer=tokenizer,
253 | data_collator=None,
254 | callbacks=[printer]
255 | )
256 |
257 | logger.info('Start training...')
258 | train_result = trainer.train()
259 |
260 | #trainer.save_model(output_dir=training_args.output_dir)
261 | if training_args.local_rank in (0, -1):
262 | logger.info(f'Saving the model and tokenizer')
263 | model.save_pretrained(save_directory=training_args.output_dir)
264 |
265 | metrics = train_result.metrics
266 | trainer.log_metrics("train", metrics)
267 | trainer.save_metrics("train", metrics)
268 |
269 | logger.info('All done :)')
270 |
--------------------------------------------------------------------------------
/poetry/finetune_rugpt_with_prompt_masking.py:
--------------------------------------------------------------------------------
1 | """
2 | Training a poetry generation model on top of rugpt*** with backpropagation excluded on the prompt tokens.
3 | """
4 | import glob
5 | import logging
6 | import os
7 | import json
8 | import io
9 | import random
10 | import itertools
11 | import sys
12 | from typing import Any, Dict, List, Optional, Tuple, Union
13 | from dataclasses import dataclass, field
14 | from typing import Any, Dict, List, Optional, Tuple, Union
15 | import shutil
16 | from pathlib import Path
17 |
18 | import numpy as np
19 | import tqdm
20 | import sklearn.model_selection
21 | import torch
22 | import scipy
23 | import torch.nn as nn
24 | import torch.nn.functional as F
25 | import torch.optim as optim
26 | from torch.utils.tensorboard import SummaryWriter
27 | from torch.utils.data import Dataset, DataLoader
28 | from transformers import AutoModelForCausalLM
29 | import transformers
30 | from transformers import AutoTokenizer
31 | from transformers import TrainingArguments, Trainer, TrainerCallback
32 | from transformers import HfArgumentParser
33 | from pynvml import *
34 |
35 |
36 | proj_dir = os.path.expanduser('~/polygon/text_generator')
37 |
38 |
39 | def print_gpu_utilization():
40 | nvmlInit()
41 | handle = nvmlDeviceGetHandleByIndex(0)
42 | info = nvmlDeviceGetMemoryInfo(handle)
43 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.")
44 |
45 |
46 | def pad_sequence(sequence, pad_id, max_len):
47 | l = len(sequence)
48 | if l < max_len:
49 | return sequence + [pad_id] * (max_len - l)
50 | else:
51 | return sequence
52 |
53 |
54 | def load_samples(data_args, tokenizer, model_args):
55 | samples = []
56 | with open(data_args.dataset_path, 'r') as f:
57 | for sample_str in f:
58 | sample = json.loads(sample_str)
59 | prompt = sample['prompt_text']
60 | if prompt:
61 | if data_args.output_syllables:
62 | # Variant that generates a chain of syllables
63 | lines = []
64 | for line in sample['output'].split(''):
65 | line = line.strip()
66 | tokens = line.split(' ')
67 | tokens = tokens[::-1]
68 | line = ' '.join(tokens)
69 | line = line.replace(' | ', '|')
70 | line = line.replace(' ', '\u2010')
71 | line = line.replace('|', ' ')
72 | lines.append(line)
73 | output_text = '\n'.join(lines)
74 | else:
75 | output_text = sample['output_text']
76 |
77 | # 29.04.2023: limit to the first 2 quatrains
78 | output_text = '\n\n'.join(output_text.split('\n\n')[:2])
79 |
80 | if 'xglm' in model_args.model_name_or_path.lower():
81 | # 21.05.2023: for some reason the XGLM tokenizer sometimes drops line breaks.
82 | # So we replace them with a special sequence that is converted back to \n at generation time.
83 | output_text = output_text.replace('\n', '\\n')
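# Illustration: 'первая строка\nвторая строка' becomes 'первая строка\\nвторая строка'
# in the training text; the inference code is expected to turn '\\n' back into '\n'.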
84 |
85 | input_tokens = tokenizer.encode(prompt, add_special_tokens=False)
86 | output_tokens = tokenizer.encode(output_text, add_special_tokens=False)
87 | samples.append((input_tokens, output_tokens, prompt, output_text))
88 |
89 | if data_args.max_samples > 0 and len(samples) >= data_args.max_samples:
90 | break
91 |
92 | return samples
93 |
94 |
95 | class FinetuneDataset(Dataset):
96 | def __init__(self, samples, tokenizer):
97 | self.tokenizer = tokenizer
98 | self.max_len = 0
99 | self.samples = []
100 |
101 | self.bos_token_id = tokenizer.bos_token_id
102 | self.eos_token_id = tokenizer.eos_token_id
103 | assert(len(tokenizer.encode('#', add_special_tokens=False)) == 1)
104 | self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0]
105 | self.pad_token_id = tokenizer.pad_token_id
106 |
107 | for src_ids, output_ids, src_text, output_text in samples:
108 | input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id]
109 |
110 | # Prompt tokens get label=-100
111 | labels = [-100] + [-100]*len(src_ids) + [-100] + output_ids + [self.eos_token_id]
112 |
113 | attention_map = [1] * len(labels)
114 |
115 | self.samples.append((input_ids, labels, attention_map))
116 | self.max_len = max(self.max_len, len(input_ids))
117 |
118 | def __len__(self):
119 | return len(self.samples)
120 |
121 | def __getitem__(self, index: int):
122 | input_ids, labels, attention_map = self.samples[index]
123 | npad = self.max_len - len(input_ids)
124 | input_ids = input_ids + npad*[self.pad_token_id]
125 | labels = labels + [-100] * npad
126 | attention_mask = attention_map + [0] * npad
127 | return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask}
128 |
129 |
130 |
131 | @dataclass
132 | class ModelArguments:
133 | """
134 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
135 | """
136 | model_name_or_path: Optional[str] = field(
137 | default='sberbank-ai/rugpt3large_based_on_gpt2',
138 | metadata={"help": "The model checkpoint for weights initialization."},
139 | )
140 |
141 | tokenizer_path: Optional[str] = field(
142 | default='sberbank-ai/rugpt3large_based_on_gpt2',
143 | metadata={"help": "Path to tokenizer."},
144 | )
145 |
146 |
147 | @dataclass
148 | class DataSetArguments:
149 | """
150 | Arguments pertaining to what data we are going to input our model for training and eval.
151 | """
152 | dataset_path: Optional[str] = field(
153 | default=os.path.join(proj_dir, 'tmp', 'пирожки.jsonl'),
154 | metadata={"help": "Path to the dataset with poems"}
155 | )
156 |
157 | output_syllables: Optional[bool] = field(
158 | default=False,
159 | metadata={"help": "Syllabo-tonic representation of the output text"}
160 | )
161 |
162 | max_samples: Optional[int] = field(
163 | default=-1,
164 | metadata={"help": "Maximum number of samples to read from the dataset"}
165 | )
166 |
167 |
168 | class MyPrinterCallback(TrainerCallback):
169 | def __init__(self, filepath):
170 | self.wrt = open(filepath, 'w')
171 |
172 | def on_log(self, args, state, control, logs=None, **kwargs):
173 | if state.is_local_process_zero:
174 | if 'epoch' in logs and 'loss' in logs:
175 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss']))
176 | self.wrt.flush()
177 |
178 |
179 | if __name__ == '__main__':
180 | parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments))
181 |
182 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
183 |
184 | if not training_args.output_dir:
185 | training_args.output_dir = os.path.join(proj_dir, 'tmp', 'verses_pirozhki_rugpt')
186 |
187 | verbose = training_args.local_rank in (-1, 0)
188 |
189 | # Setup logging
190 | logging.basicConfig(
191 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
192 | datefmt="%m/%d/%Y %H:%M:%S",
193 | handlers=[logging.StreamHandler(sys.stdout)],
194 | )
195 |
196 | log_level = training_args.get_process_log_level()
197 | logger = logging.getLogger(__name__)
198 | logger.setLevel(log_level)
199 | #datasets.utils.logging.set_verbosity(log_level)
200 | transformers.utils.logging.set_verbosity(log_level)
201 | transformers.utils.logging.enable_default_handler()
202 | transformers.utils.logging.enable_explicit_format()
203 |
204 | logger.info(
205 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
206 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
207 | )
208 | logger.info(f"Training/evaluation parameters {training_args}")
209 |
210 | # Remove old tensorboard logs
211 | if training_args.local_rank in (-1, 0):
212 | for f in glob.glob(training_args.output_dir+'/*'):
213 | if os.path.isfile(f):
214 | os.remove(f)
215 |
216 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs')
217 | if os.path.exists(tensorboard_dir):
218 | logger.info('Removing "%s"', tensorboard_dir)
219 | shutil.rmtree(tensorboard_dir)
220 |
221 | device = training_args.device
222 | logging.info('device={}'.format(device))
223 |
224 | if not model_args.tokenizer_path:
225 | model_args.tokenizer_path = model_args.model_name_or_path
226 |
227 | logger.info('Loading tokenizer "%s"', model_args.tokenizer_path)
228 |
229 | if 'llama' in model_args.tokenizer_path:
230 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_args.tokenizer_path)
231 | else:
232 | #tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_args.tokenizer_path)
233 | tokenizer = transformers.AutoTokenizer.from_pretrained(model_args.tokenizer_path)
234 |
235 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''})
236 | tokenizer.save_pretrained(training_args.output_dir)
237 |
238 | for t in ['#', '', '', '']:
239 | logger.debug('Tokenizer: token=%s ==> %s', t, str(tokenizer.encode(t, add_special_tokens=False)))
240 |
241 | logger.info('Loading pretrained model "%s"', model_args.model_name_or_path)
242 | model = transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path)
243 | model.to(device)
244 |
245 | logger.info('Loading dataset "%s"', data_args.dataset_path)
246 | train_samples = load_samples(data_args, tokenizer, model_args)
247 | logger.info('Training set: %d samples', len(train_samples))
248 |
249 | train_dataset = FinetuneDataset(train_samples, tokenizer)
250 |
251 | printer = MyPrinterCallback(os.path.join(training_args.output_dir, 'finetune_rugpt_with_prompt_masking.loss.log'))
252 | trainer = Trainer(
253 | model=model,
254 | args=training_args,
255 | train_dataset=train_dataset,
256 | tokenizer=tokenizer,
257 | data_collator=None,
258 | callbacks=[printer]
259 | )
260 |
261 | logger.info('Start training...')
262 | train_result = trainer.train()
263 |
264 | logger.info(f'Saving the model and tokenizer')
265 | trainer.save_model(output_dir=training_args.output_dir)
266 |
267 | metrics = train_result.metrics
268 | trainer.log_metrics("train", metrics)
269 | trainer.save_metrics("train", metrics)
270 |
271 | logger.info('All done :)')
272 |
--------------------------------------------------------------------------------
/poetry/finetune_llama_lora.py:
--------------------------------------------------------------------------------
1 | """
2 | Experiment with fine-tuning LLaMa + LoRa on poems.
3 | """
4 | import glob
5 | import logging
6 | import os
7 | import json
8 | import io
9 | import random
10 | import itertools
11 | import sys
12 | from typing import Any, Dict, List, Optional, Tuple, Union
13 | from dataclasses import dataclass, field
14 | from typing import Any, Dict, List, Optional, Tuple, Union
15 | import shutil
16 | from pathlib import Path
17 |
18 | import numpy as np
19 | import tqdm
20 | import sklearn.model_selection
21 | import torch
22 | import scipy
23 | import torch.nn as nn
24 | import torch.nn.functional as F
25 | import torch.optim as optim
26 | from torch.utils.tensorboard import SummaryWriter
27 | from torch.utils.data import Dataset, DataLoader
28 | from transformers import AutoModelForCausalLM
29 | import transformers
30 | from transformers import AutoTokenizer
31 | from transformers import TrainingArguments, Trainer, TrainerCallback
32 | from transformers import HfArgumentParser
33 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training
34 | from pynvml import *
35 |
36 |
37 | proj_dir = os.path.expanduser('~/polygon/text_generator')
38 |
39 |
40 | def print_gpu_utilization():
41 | nvmlInit()
42 | handle = nvmlDeviceGetHandleByIndex(0)
43 | info = nvmlDeviceGetMemoryInfo(handle)
44 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.")
45 |
46 |
47 | def pad_sequence(sequence, pad_id, max_len):
48 | l = len(sequence)
49 | if l < max_len:
50 | return sequence + [pad_id] * (max_len - l)
51 | else:
52 | return sequence
53 |
54 |
55 | def load_samples(data_args, tokenizer):
56 | samples = []
57 | with open(data_args.dataset_path, 'r') as f:
58 | for sample_str in f:
59 | sample = json.loads(sample_str)
60 | prompt = sample['prompt_text']
61 | if prompt:
62 | if data_args.output_syllables:
63 | # Variant that generates a chain of syllables
64 | lines = []
65 | for line in sample['output'].split(''):
66 | line = line.strip()
67 | tokens = line.split(' ')
68 | tokens = tokens[::-1]
69 | line = ' '.join(tokens)
70 | line = line.replace(' | ', '|')
71 | line = line.replace(' ', '\u2010')
72 | line = line.replace('|', ' ')
73 | lines.append(line)
74 | output_text = '\n'.join(lines)
75 | else:
76 | output_text = sample['output_text']
77 |
78 | # 29.04.2023: limit to the first 2 quatrains
79 | output_text = '\n\n'.join(output_text.split('\n\n')[:2])
80 |
81 | input_tokens = tokenizer.encode(prompt, add_special_tokens=False)
82 | output_tokens = tokenizer.encode(output_text, add_special_tokens=False)
83 | samples.append((input_tokens, output_tokens, prompt, output_text))
84 |
85 | if data_args.max_samples > 0 and len(samples) >= data_args.max_samples:
86 | break
87 |
88 | return samples
89 |
90 |
91 | class FinetuneDataset(Dataset):
92 | def __init__(self, samples, tokenizer):
93 | self.tokenizer = tokenizer
94 | self.max_len = 0
95 | self.samples = []
96 |
97 | self.bos_token_id = tokenizer.bos_token_id
98 | self.eos_token_id = tokenizer.eos_token_id
99 | assert(len(tokenizer.encode('#', add_special_tokens=False)) == 1)
100 | self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0]
101 | self.pad_token_id = tokenizer.pad_token_id
102 |
103 | for src_ids, output_ids, src_text, output_text in samples:
104 | input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id]
105 |
106 | # Prompt tokens get label=-100
107 | labels = [-100] + [-100]*len(src_ids) + [-100] + output_ids + [self.eos_token_id]
108 |
109 | attention_map = [1] * len(labels)
110 |
111 | self.samples.append((input_ids, labels, attention_map))
112 | self.max_len = max(self.max_len, len(input_ids))
113 |
114 | def __len__(self):
115 | return len(self.samples)
116 |
117 | def __getitem__(self, index: int):
118 | input_ids, labels, attention_map = self.samples[index]
119 | npad = self.max_len - len(input_ids)
120 | input_ids = input_ids + npad*[self.pad_token_id]
121 | labels = labels + [-100] * npad
122 | attention_mask = attention_map + [0] * npad
123 | return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask}
124 |
125 |
126 |
127 | @dataclass
128 | class ModelArguments:
129 | """
130 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
131 | """
132 | model_name_or_path: Optional[str] = field(
133 | default='decapoda-research/llama-7b-hf',
134 | metadata={"help": "The model checkpoint for weights initialization."},
135 | )
136 |
137 | load_in_8bit: Optional[bool] = field(
138 | default=False,
139 | metadata={"help": "Load model in 8-bit"},
140 | )
141 |
142 |
143 | @dataclass
144 | class DataSetArguments:
145 | """
146 | Arguments pertaining to what data we are going to input our model for training and eval.
147 | """
148 | dataset_path: Optional[str] = field(
149 | default=os.path.join(proj_dir, 'tmp', 'лирика.jsonl'),
150 | metadata={"help": "Path to the dataset with poems"}
151 | )
152 |
153 | output_syllables: Optional[bool] = field(
154 | default=False,
155 | metadata={"help": "Syllabo-tonic representation of the output text"}
156 | )
157 |
158 | max_samples: Optional[int] = field(
159 | default=-1,
160 | metadata={"help": "Maximum number of samples to read from the dataset"}
161 | )
162 |
163 |
164 | @dataclass
165 | class LoRaArguments:
166 | # "r": 8,
167 | # "lora_alpha": 16,
168 | # "lora_dropout": 0.10,
169 | lora_r: Optional[int] = field(
170 | default=8
171 | )
172 |
173 | lora_alpha: Optional[int] = field(
174 | default=16
175 | )
176 |
177 | lora_dropout: Optional[float] = field(
178 | default=0.10
179 | )
180 |
181 |
182 | class MyPrinterCallback(TrainerCallback):
183 | def __init__(self, filepath):
184 | self.wrt = open(filepath, 'w')
185 |
186 | def on_log(self, args, state, control, logs=None, **kwargs):
187 | if state.is_local_process_zero:
188 | if 'epoch' in logs and 'loss' in logs:
189 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss']))
190 | self.wrt.flush()
191 |
192 |
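# fix_model() below patches the model config so that its pad/bos/eos token ids stay
# consistent with the tokenizer (useful when the checkpoint does not define them): it
# picks the first available candidate for bos and eos, mirrors bos into
# decoder_start_token_id, and with use_resize=True also resizes the token embeddings
# after new special tokens have been added to the tokenizer.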
193 | def fix_model(model, tokenizer, use_resize=True):
194 | model.config.pad_token_id = tokenizer.pad_token_id
195 | assert model.config.pad_token_id is not None
196 |
197 | bos_candidates = (
198 | tokenizer.bos_token_id,
199 | tokenizer.cls_token_id,
200 | tokenizer.sep_token_id,
201 | tokenizer.unk_token_id
202 | )
203 | for bos_candidate in bos_candidates:
204 | model.config.bos_token_id = bos_candidate
205 | if bos_candidate is not None:
206 | break
207 | assert model.config.bos_token_id is not None
208 | model.config.decoder_start_token_id = model.config.bos_token_id
209 |
210 | eos_candidates = (tokenizer.eos_token_id, tokenizer.sep_token_id)
211 | for eos_candidate in eos_candidates:
212 | model.config.eos_token_id = eos_candidate
213 | if eos_candidate is not None:
214 | break
215 | assert model.config.eos_token_id is not None
216 | if use_resize:
217 | model.resize_token_embeddings(len(tokenizer))
218 |
219 | return model
220 |
221 |
222 | if __name__ == '__main__':
223 | parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments, LoRaArguments))
224 | model_args, data_args, training_args, lora_args = parser.parse_args_into_dataclasses()
225 |
226 | lora_config = LoraConfig(**{
227 | "r": lora_args.lora_r,
228 | "lora_alpha": lora_args.lora_alpha,
229 | "lora_dropout": lora_args.lora_dropout,
230 | "bias": "none",
231 | "target_modules": ["q_proj", "v_proj"],
232 | "task_type": "CAUSAL_LM"
233 | })
234 |
235 | # Setup logging
236 | logging.basicConfig(
237 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
238 | datefmt="%d.%m.%Y %H:%M:%S",
239 | handlers=[logging.StreamHandler(sys.stdout)],
240 | )
241 |
242 | log_level = training_args.get_process_log_level()
243 | logger = logging.getLogger(__name__)
244 | logger.setLevel(log_level)
245 | #datasets.utils.logging.set_verbosity(log_level)
246 | transformers.utils.logging.set_verbosity(log_level)
247 | transformers.utils.logging.enable_default_handler()
248 | transformers.utils.logging.enable_explicit_format()
249 |
250 | logger.info(
251 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
252 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
253 | )
254 | logger.info(f"Training/evaluation parameters {training_args}")
255 |
256 | # Remove old tensorboard logs
257 | if training_args.local_rank in (-1, 0):
258 | for f in glob.glob(training_args.output_dir+'/*'):
259 | if os.path.isfile(f):
260 | os.remove(f)
261 |
262 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs')
263 | if os.path.exists(tensorboard_dir):
264 | logger.info('Removing "%s"', tensorboard_dir)
265 | shutil.rmtree(tensorboard_dir)
266 |
267 | #device = training_args.device
268 | #logging.info('device={}'.format(device))
269 |
270 | logger.info('Loading tokenizer "%s"', model_args.model_name_or_path)
271 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_args.model_name_or_path)
272 |
273 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''})
274 |
275 | for t in ['#', '', '', '']:
276 | logger.debug('Tokenizer: token=%s ==> %s', t, str(tokenizer.encode(t, add_special_tokens=False)))
277 |
278 | logger.info('Loading pretrained model "%s"', model_args.model_name_or_path)
279 | model = transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path,
280 | load_in_8bit=model_args.load_in_8bit,
281 | #device_map="auto"
282 | )
283 |
284 | #model.config.save_pretrained(training_args.output_dir)
285 |
286 | if model_args.load_in_8bit:
287 | model = fix_model(model, tokenizer, use_resize=False)
288 | model = prepare_model_for_int8_training(model)
289 |
290 | #model.to(device)
291 |
292 | tokenizer.save_pretrained(training_args.output_dir)
293 |
294 | logger.info('Wrapping LLaMa with peft...')
295 | model = get_peft_model(model, lora_config)
296 |
297 | #model.save_pretrained(training_args.output_dir)
298 |
299 | logger.info('Loading dataset "%s"', data_args.dataset_path)
300 | train_samples = load_samples(data_args, tokenizer)
301 | logger.info('Training set: %d samples', len(train_samples))
302 |
303 | train_dataset = FinetuneDataset(train_samples, tokenizer)
304 |
305 | printer = MyPrinterCallback(os.path.join(proj_dir, 'tmp', 'finetune_llama.loss.log'))
306 | trainer = Trainer(
307 | model=model,
308 | args=training_args,
309 | train_dataset=train_dataset,
310 | tokenizer=tokenizer,
311 | data_collator=None,
312 | callbacks=[printer]
313 | )
314 |
315 | logger.info('Start training...')
316 | train_result = trainer.train()
317 |
318 | # trainer.save_model(output_dir=training_args.output_dir)
319 | if training_args.local_rank in (0, -1):
320 | logger.info(f'Saving the model and tokenizer')
321 | model.save_pretrained(training_args.output_dir)
322 |
323 | metrics = train_result.metrics
324 | trainer.log_metrics("train", metrics)
325 | trainer.save_metrics("train", metrics)
326 |
327 | logger.info('All done :)')
328 |
--------------------------------------------------------------------------------