├── poetry ├── 13b_deepspeed_config.json ├── run_llama.py ├── run_llama_lora.py ├── run_rugpt13b_lora.py ├── run_rugpt_generator.py ├── finetune_fredt5_poetry_generator.py ├── finetune_rugpt13b.py ├── finetune_rugpt13b_lora.py ├── finetune_llama.py ├── finetune_rugpt_with_prompt_masking.py └── finetune_llama_lora.py ├── chitchat ├── run_chitchat_fredt5.py ├── run_chitchat_gpt.py ├── finetune_chitchat_fredt5_with_trainer.py └── finetune_chitchat_gpt_with_trainer.py ├── .gitignore ├── LICENSE └── README.md /poetry/13b_deepspeed_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { "enabled": "auto" }, 3 | "fp16": { "enabled": "auto" }, 4 | 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | 15 | "zero_optimization": { 16 | "stage": 2, 17 | "offload_optimizer": { 18 | "device": "cpu", 19 | "pin_memory": true 20 | }, 21 | "allgather_partitions": true, 22 | "allgather_bucket_size": 2e8, 23 | "overlap_comm": true, 24 | "reduce_scatter": true, 25 | "reduce_bucket_size": 2e8, 26 | "contiguous_gradients": true 27 | }, 28 | 29 | 30 | "train_batch_size": "auto", 31 | "gradient_accumulation_steps": "auto" 32 | } 33 | -------------------------------------------------------------------------------- /poetry/run_llama.py: -------------------------------------------------------------------------------- 1 | """ 2 | Инференс стихов из отфайнтюненной модели LLaMa (см. finetune_llama.py) 3 | """ 4 | 5 | import transformers 6 | import torch 7 | 8 | 9 | model_path = "/home/jovyan/polygon/text_generator/tmp/verses_model=llama7b_domain=lyrics_syllables=0" 10 | 11 | print('Loading model "{}"...'.format(model_path)) 12 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_path) 13 | model = transformers.AutoModelForCausalLM.from_pretrained(model_path, 14 | # load_in_8bit=model_args.load_in_8bit, 15 | # device_map="auto" 16 | ) 17 | 18 | device = torch.device("cuda") 19 | model.to(device) 20 | 21 | while True: 22 | seed = input(':> ') 23 | prompt = '' + seed + '#' 24 | 25 | encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt") 26 | #print('DEBUG@26 encoded_prompt=', encoded_prompt) 27 | encoded_prompt = encoded_prompt.to(device) 28 | 29 | pad_token_id = tokenizer.encode('', add_special_tokens=False)[0] 30 | # end_token_id = self.tokenizer.encode('', add_special_tokens=False)[0] 31 | 32 | output_sequences = model.generate( 33 | input_ids=encoded_prompt, 34 | pad_token_id=pad_token_id, 35 | do_sample=True, 36 | temperature=1.0, 37 | top_p=0.80, 38 | max_length=300, 39 | num_return_sequences=5, 40 | ) 41 | 42 | stop_token = '' 43 | 44 | generated_sequences = set() 45 | for generated_sequence_idx, generated_sequence in enumerate(output_sequences): 46 | generated_sequence = generated_sequence.tolist() 47 | #print('DEBUG@46 ==> ', generated_sequence) 48 | 49 | text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) 50 | if stop_token in text: 51 | text = text[: text.find(stop_token)] 52 | 53 | text = text[text.index('#') + 1:].strip() 54 | text = text.replace('\u2010', '').replace('\u0301', '') 55 | print('='*80) 56 | print(text) 57 | -------------------------------------------------------------------------------- /poetry/run_llama_lora.py: -------------------------------------------------------------------------------- 1 | """ 2 | Инференс стихов из отфайнтюненной модели LLaMa+LoRa (см. 
finetune_llama_lora.py) 3 | """ 4 | 5 | import transformers 6 | import torch 7 | from peft import PeftModel, PeftConfig 8 | 9 | 10 | model_path = "/home/jovyan/polygon/text_generator/tmp/verses_model=llama7b_lora_domain=lyrics_syllables=0" 11 | 12 | print('Loading LLaMa tokenizer "{}"...'.format(model_path)) 13 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_path) 14 | 15 | peft_model_id = model_path 16 | print('Loading peft model "{}"...'.format(peft_model_id)) 17 | config = PeftConfig.from_pretrained(peft_model_id) 18 | 19 | print('Loading backbone LLaMa "{}"...'.format(config.base_model_name_or_path)) 20 | model = transformers.AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path) 21 | model = PeftModel.from_pretrained(model, peft_model_id) 22 | 23 | device = torch.device("cuda") 24 | model.to(device) 25 | 26 | while True: 27 | seed = input(':> ') 28 | prompt = '' + seed + '#' 29 | 30 | encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt") 31 | #print('DEBUG@26 encoded_prompt=', encoded_prompt) 32 | encoded_prompt = encoded_prompt.to(device) 33 | 34 | pad_token_id = tokenizer.encode('', add_special_tokens=False)[0] 35 | # end_token_id = self.tokenizer.encode('', add_special_tokens=False)[0] 36 | 37 | output_sequences = model.generate( 38 | input_ids=encoded_prompt, 39 | pad_token_id=pad_token_id, 40 | do_sample=True, 41 | temperature=1.0, 42 | top_p=0.80, 43 | max_length=300, 44 | num_return_sequences=5, 45 | ) 46 | 47 | stop_token = '' 48 | 49 | generated_sequences = set() 50 | for generated_sequence_idx, generated_sequence in enumerate(output_sequences): 51 | generated_sequence = generated_sequence.tolist() 52 | #print('DEBUG@46 ==> ', generated_sequence) 53 | 54 | text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) 55 | if stop_token in text: 56 | text = text[: text.find(stop_token)] 57 | 58 | text = text[text.index('#') + 1:].strip() 59 | text = text.replace('\u2010', '').replace('\u0301', '') 60 | print('='*80) 61 | print(text) 62 | -------------------------------------------------------------------------------- /chitchat/run_chitchat_fredt5.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import torch 5 | import transformers 6 | from transformers import T5Config 7 | 8 | 9 | if __name__ == '__main__': 10 | proj_dir = os.path.expanduser('~/polygon/chatbot') 11 | 12 | parser = argparse.ArgumentParser(description='Консольная интерактивная проверка модели читчата') 13 | parser.add_argument('--model', type=str, default=os.path.join(proj_dir, 'tmp', 'fredt5_chitchat'), help='Путь к каталогу с файлами модели') 14 | args = parser.parse_args() 15 | 16 | use_cuda = torch.cuda.is_available() 17 | device = torch.device("cuda" if use_cuda else "cpu") 18 | 19 | model_dir = args.model 20 | print(f'Loading model "{model_dir}"...') 21 | t5_config = T5Config.from_pretrained(model_dir) 22 | 23 | if 'FRED-T5' in t5_config.name_or_path: 24 | t5_tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_dir) 25 | else: 26 | t5_tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir) 27 | 28 | t5_model = transformers.T5ForConditionalGeneration.from_pretrained(model_dir) 29 | t5_model.to(device) 30 | t5_model.eval() 31 | 32 | while True: 33 | print('-'*80) 34 | dialog = [] 35 | while True: 36 | msg = input('H:> ').strip() 37 | if len(msg) == 0: 38 | break 39 | 40 | msg = msg[0].upper() + msg[1:] 41 | 42 | 
dialog.append('человек: ' + msg) 43 | 44 | #prompt = ''+'\n'.join(dialog) 45 | prompt = '' + '\n'.join(dialog) + '\nчатбот: ' 46 | 47 | input_ids = t5_tokenizer(prompt, return_tensors='pt').input_ids 48 | out_ids = t5_model.generate(input_ids=input_ids.to(device), 49 | max_length=200, 50 | eos_token_id=t5_tokenizer.eos_token_id, 51 | early_stopping=True, 52 | do_sample=True, 53 | temperature=1.0, 54 | top_k=0, 55 | top_p=0.85) 56 | 57 | t5_output = t5_tokenizer.decode(out_ids[0][1:]) 58 | if '' in t5_output: 59 | t5_output = t5_output[:t5_output.find('')].strip() 60 | 61 | t5_output = t5_output.replace('', '').strip() 62 | 63 | print('B:> {}'.format(t5_output)) 64 | dialog.append('чатбот: ' + t5_output) 65 | -------------------------------------------------------------------------------- /chitchat/run_chitchat_gpt.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import torch 4 | import transformers 5 | 6 | 7 | class Chitchat(object): 8 | def __init__(self, device, models_dir): 9 | model_name = os.path.join(models_dir, 'rugpt_chitchat') 10 | self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) 11 | self.model = transformers.AutoModelForCausalLM.from_pretrained(model_name) 12 | self.model.to(device) 13 | self.model.eval() 14 | 15 | def reply(self, history, num_return_sequences): 16 | prompt = '' + '\n'.join(history) + '\nчатбот:' 17 | encoded_prompt = self.tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(device) 18 | output_sequences = self.model.generate(input_ids=encoded_prompt, 19 | max_length=len(prompt) + 120, 20 | temperature=0.90, 21 | typical_p=None, 22 | top_k=0, 23 | top_p=0.8, 24 | do_sample=True, 25 | num_return_sequences=num_return_sequences, 26 | pad_token_id=self.tokenizer.pad_token_id) 27 | 28 | replies = [] 29 | 30 | for o in output_sequences: 31 | reply = self.tokenizer.decode(o.tolist(), clean_up_tokenization_spaces=True) 32 | reply = reply[len(prompt):] # отсекаем затравку 33 | reply = reply[: reply.find('')] 34 | 35 | if '\nчеловек:' in reply: 36 | reply = reply[:reply.index('\nчеловек:')] 37 | 38 | reply = reply.strip() 39 | 40 | if reply not in replies: # только уникальные реплики, сохраняем порядок выдачи 41 | replies.append(reply) 42 | 43 | return replies 44 | 45 | 46 | if __name__ == '__main__': 47 | device = "cuda" if torch.cuda.is_available() else "cpu" 48 | models_dir = os.path.expanduser('~/polygon/chatbot/tmp') 49 | 50 | chitchat = Chitchat(device, models_dir) 51 | 52 | while True: 53 | dialog = [] 54 | while True: 55 | msg = input('H:> ').strip() 56 | if msg: 57 | dialog.append('человек: ' + msg) 58 | reply = chitchat.reply(dialog, num_return_sequences=1)[0] 59 | print(f'B:> {reply}') 60 | dialog.append('чатбот: ' + reply) 61 | else: 62 | dialog = [] 63 | print('-'*100) 64 | 65 | 66 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /poetry/run_rugpt13b_lora.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import sys 4 | import argparse 5 | 6 | import torch 7 | import transformers 8 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 9 | 10 | from peft import PeftModel, PeftConfig 11 | 12 | 13 | class RugptGenerator: 14 | def __init__(self, model_path, temperature, top_p): 15 | self.model_path = os.path.expanduser(model_path) 16 | self.temperature = temperature 17 | self.top_p = top_p 18 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 19 | self.tokenizer = None 20 | self.model = None 21 | 22 | def load(self): 23 | self.tokenizer = GPT2Tokenizer.from_pretrained(self.model_path) 24 | self.tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 25 | #self.model = PeftModel.from_pretrained(self.model_path) 26 | 27 | peft_model_id = self.model_path 28 | config = PeftConfig.from_pretrained(peft_model_id) 29 | model = transformers.AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path) 30 | model = PeftModel.from_pretrained(model, peft_model_id) 31 | self.model = model.to(self.device) 32 | #self.model.eval() 33 | 34 | def generate_output(self, context, num_return_sequences): 35 | length = 200 36 | 37 | encoded_prompt = self.tokenizer.encode(context, add_special_tokens=False, return_tensors="pt") 38 | encoded_prompt = encoded_prompt.to(self.device) 39 | 40 | pad_token_id = self.tokenizer.encode('', add_special_tokens=False)[0] 41 | #end_token_id = self.tokenizer.encode('', add_special_tokens=False)[0] 42 | 43 | output_sequences = self.model.generate( 44 | input_ids=encoded_prompt, 45 | max_length=length + len(encoded_prompt[0]), 46 | num_return_sequences=num_return_sequences, 47 | pad_token_id=pad_token_id, 48 | #end_token_id=end_token_id, 49 | do_sample=True, 50 | temperature=self.temperature, 51 | top_p=self.top_p 52 | ) 53 | 54 | stop_token = '' 55 | 56 | generated_sequences = set() 57 | for generated_sequence_idx, generated_sequence in enumerate(output_sequences): 58 | generated_sequence = generated_sequence.tolist() 59 | 60 | text = self.tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) 61 | if stop_token in text: 62 | text = text[: text.find(stop_token)] 63 | 64 | text = text[text.index('#')+1:].strip() 65 | text = text.replace('\u2010', '').replace('\u0301', '') 66 | 67 | generated_sequences.add(text) 68 | 69 | return list(generated_sequences) 70 | 71 | 72 | if __name__ == '__main__': 73 | proj_dir = os.path.expanduser('~/polygon/text_generator') 74 | 75 | parser = argparse.ArgumentParser(description='Отладочный консольный генератор стихов на базе rugpt13B+LoRa') 76 | parser.add_argument('--model_path', type=str, default=os.path.join(proj_dir, 'tmp', 'verses_rugpt13b_lora_domain=lyrycs_syllables=1')) 77 | parser.add_argument('--temperature', type=float, default=1.0, help='Температура сэмплинга') 78 | parser.add_argument('--top_p', type=float, default=0.8, help='top-p') 79 | parser.add_argument('--top_k', type=int, default=0, help='top-k') 80 | parser.add_argument('--typical_p', type=float, default=0.0, help='typical-p') 81 | 82 | args = parser.parse_args() 83 | 84 | use_cuda = torch.cuda.is_available() 85 | device = torch.device("cuda" if use_cuda else "cpu") 86 | 87 | poem_generator = RugptGenerator(model_path=args.model_path, temperature=args.temperature, 
top_p=args.top_p) 88 | poem_generator.load() 89 | 90 | while True: 91 | prompt = input(':> ').strip() 92 | if prompt: 93 | seed = prompt + '#' 94 | px = poem_generator.generate_output(seed, num_return_sequences=10) 95 | print('-'*80) 96 | for ipoem, p in enumerate(px, start=1): 97 | print('='*30 + ' POEM #{} '.format(ipoem) + '='*30) 98 | print(p) 99 | print('-'*80) 100 | -------------------------------------------------------------------------------- /poetry/run_rugpt_generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import sys 4 | import argparse 5 | 6 | import torch 7 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 8 | 9 | from generative_poetry.whitespace_normalization import normalize_whitespaces 10 | 11 | 12 | class RugptGenerator: 13 | def __init__(self, model_path, generation_config): 14 | self.model_path = model_path 15 | self.generation_config = generation_config 16 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 17 | self.tokenizer = None 18 | self.model = None 19 | 20 | def load(self): 21 | model_name_or_path = os.path.expanduser(self.model_path) 22 | self.tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path) 23 | self.tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 24 | self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path) 25 | self.model.to(self.device) 26 | self.model.eval() 27 | 28 | def generate_output(self, context, num_return_sequences): 29 | encoded_prompt = self.tokenizer.encode(context, add_special_tokens=False, return_tensors="pt") 30 | encoded_prompt = encoded_prompt.to(self.device) 31 | 32 | pad_token_id = self.tokenizer.encode('', add_special_tokens=False)[0] 33 | #end_token_id = self.tokenizer.encode('', add_special_tokens=False)[0] 34 | 35 | output_sequences = self.model.generate( 36 | input_ids=encoded_prompt, 37 | pad_token_id=pad_token_id, 38 | **self.generation_config 39 | ) 40 | 41 | stop_token = '' 42 | 43 | generated_sequences = set() 44 | for generated_sequence_idx, generated_sequence in enumerate(output_sequences): 45 | generated_sequence = generated_sequence.tolist() 46 | 47 | text = self.tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) 48 | if stop_token in text: 49 | text = text[: text.find(stop_token)] 50 | 51 | text = text[text.index('#')+1:].strip() 52 | text = text.replace('\u2010', '').replace('\u0301', '') 53 | text = normalize_whitespaces(text) 54 | generated_sequences.add(text) 55 | 56 | return list(generated_sequences) 57 | 58 | 59 | if __name__ == '__main__': 60 | proj_dir = os.path.expanduser('~/polygon/text_generator') 61 | 62 | parser = argparse.ArgumentParser(description='Отладочный консольный генератор пирожков') 63 | parser.add_argument('--model_path', type=str, default=os.path.join(proj_dir, 'tmp', 'verses_rugpt.all')) 64 | parser.add_argument('--max_length', type=int, default=300) 65 | parser.add_argument('--num_return_sequences', type=int, default=5) 66 | parser.add_argument('--do_sample', type=bool, default=True) 67 | parser.add_argument('--num_beams', type=int, default=1) 68 | parser.add_argument('--num_beam_groups', type=int, default=1) 69 | parser.add_argument('--penalty_alpha', type=float, default=None) 70 | parser.add_argument('--epsilon_cutoff', type=float, default=0.0) 71 | parser.add_argument('--eta_cutoff', type=float, default=0.0) 72 | parser.add_argument('--diversity_penalty', type=float, default=0.0) 73 | 
parser.add_argument('--repetition_penalty', type=float, default=None) 74 | parser.add_argument('--encoder_repetition_penalty', type=float, default=1.0) 75 | parser.add_argument('--length_penalty', type=float, default=1.0) 76 | parser.add_argument('--no_repeat_ngram_size', type=int, default=0) 77 | parser.add_argument('--renormalize_logits', type=bool, default=False) 78 | parser.add_argument('--temperature', type=float, default=0.9, help='Температура сэмплинга') 79 | parser.add_argument('--top_p', type=float, default=0.6, help='top-p') 80 | parser.add_argument('--top_k', type=int, default=0, help='top-k') 81 | parser.add_argument('--typical_p', type=float, default=None, help='typical-p') 82 | args = parser.parse_args() 83 | 84 | generation_args = {'max_length': args.max_length, 85 | 'num_return_sequences': args.num_return_sequences, 86 | 'do_sample': args.do_sample, 87 | 'num_beams': args.num_beams, 88 | 'num_beam_groups': args.num_beam_groups, 89 | 'penalty_alpha': args.penalty_alpha, 90 | 'epsilon_cutoff': args.epsilon_cutoff, 91 | 'eta_cutoff': args.eta_cutoff, 92 | 'diversity_penalty': args.diversity_penalty, 93 | 'repetition_penalty': args.repetition_penalty, 94 | 'encoder_repetition_penalty': args.encoder_repetition_penalty, 95 | 'length_penalty': args.length_penalty, 96 | 'no_repeat_ngram_size': args.no_repeat_ngram_size, 97 | 'renormalize_logits': args.renormalize_logits, 98 | 'temperature': args.temperature, 99 | 'top_p': args.top_p, 100 | 'top_k': args.top_k, 101 | 'typical_p': args.typical_p, 102 | } 103 | 104 | use_cuda = torch.cuda.is_available() 105 | device = torch.device("cuda" if use_cuda else "cpu") 106 | 107 | poem_generator = RugptGenerator(args.model_path, generation_args) 108 | poem_generator.load() 109 | 110 | while True: 111 | prompt = input(':> ').strip() 112 | if prompt: 113 | seed = prompt + '#' 114 | px = poem_generator.generate_output(seed, num_return_sequences=10) 115 | print('-'*80) 116 | for ipoem, p in enumerate(px, start=1): 117 | print('='*30 + ' POEM #{} '.format(ipoem) + '='*30) 118 | print(p) 119 | print('-'*80) 120 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 
20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). 
Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 
122 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LM-finetune
2 | 
3 | This repo collects my current working scripts for fine-tuning language models (rugpt, LLaMa, FRED T5) with the transformers library.
4 | For the large models (7B and 13B) two options are used: a) deepspeed, b) LoRa.
5 | 
6 | There is nothing new or particularly clever in the code, just a basic pipeline following the recommendations for transformers.Trainer.
7 | 
8 | ## POETRY GENERATOR
9 | 
10 | ### Poetry generator based on the LLaMa 7B and 13B models
11 | 
12 | Code: [finetune_llama.py](./poetry/finetune_llama.py)
13 | 
14 | Deepspeed is used, which makes it possible to fine-tune these models on 40 GB GPUs. Judging by deepspeed's memory report, fine-tuning
15 | should also be possible on a 32 GB V100. Note that a very large amount of ordinary RAM is required, more than 240 GB,
16 | so that deepspeed can offload tensors into it.
17 | 
18 | Launching the fine-tune on 4 GPUs:
19 | 
20 | ```
21 | python -m torch.distributed.launch --nproc_per_node=4 finetune_llama.py \
22 | --dataset_path ~/polygon/text_generator/tmp/лирика.jsonl \
23 | --max_samples 10000 \
24 | --output_syllables 0 \
25 | --model_name_or_path decapoda-research/llama-7b-hf \
26 | --output_dir ~/polygon/text_generator/tmp/verses_model=llama7b_domain=lyrics_syllables=0 \
27 | --overwrite_output_dir 1 \
28 | --per_device_train_batch_size 8 \
29 | --learning_rate 1e-5 \
30 | --num_train_epochs 1 \
31 | --bf16 1 \
32 | --fp16 0 \
33 | --gradient_checkpointing 0 \
34 | --gradient_accumulation_step 8 \
35 | --do_train 1 \
36 | --do_eval 0 \
37 | --report_to tensorboard \
38 | --evaluation_strategy no \
39 | --logging_strategy steps \
40 | --logging_steps 10 \
41 | --save_strategy no \
42 | --deepspeed 13b_deepspeed_config.json
43 | ```
44 | 
45 | Deepspeed configuration file: [13b_deepspeed_config.json](./poetry/13b_deepspeed_config.json)
46 | 
47 | Inference code: [run_llama.py](./poetry/run_llama.py). Disclaimer: this inference code only runs on an 80 GB A100.
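If an 80 GB card is not available, a lower-memory load can be sketched along the lines of the arguments that are commented out in run_llama.py (`load_in_8bit`, `device_map="auto"`). This is a hypothetical variant rather than part of the original scripts, and it assumes the bitsandbytes and accelerate packages are installed:

```
# Hypothetical low-memory variant of the model-loading step from run_llama.py.
# Assumes bitsandbytes and accelerate are installed; the checkpoint path is an example.
import os
import transformers

model_path = os.path.expanduser("~/polygon/text_generator/tmp/verses_model=llama7b_domain=lyrics_syllables=0")

tokenizer = transformers.LlamaTokenizer.from_pretrained(model_path)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_path,
    load_in_8bit=True,   # 8-bit weights via bitsandbytes
    device_map="auto",   # let accelerate place layers on the available GPUs/CPU
)
# The generation loop from run_llama.py can then be reused as-is, except that the
# model must not be moved with model.to(device) after an 8-bit load.
```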
48 | 
49 | ### Poetry generator based on LLaMa 7B and 13B using the PEFT library (LoRa method)
50 | 
51 | Fine-tuning code: [finetune_llama_lora.py](./poetry/finetune_llama_lora.py)
52 | 
53 | Launching the fine-tune on 2 GPUs:
54 | 
55 | ```
56 | python -m torch.distributed.run --nproc_per_node=2 finetune_llama_lora.py \
57 | --dataset_path ~/polygon/text_generator/tmp/лирика.jsonl \
58 | --max_samples 10000 \
59 | --output_syllables 0 \
60 | --model_name_or_path decapoda-research/llama-7b-hf \
61 | --output_dir ~/polygon/text_generator/tmp/verses_model=llama7b_lora_domain=lyrics_syllables=0 \
62 | --overwrite_output_dir 1 \
63 | --per_device_train_batch_size 1 \
64 | --learning_rate 1e-4 \
65 | --num_train_epochs 1 \
66 | --bf16 0 \
67 | --fp16 0 \
68 | --gradient_checkpointing 0 \
69 | --gradient_accumulation_step 8 \
70 | --do_train 1 \
71 | --do_eval 0 \
72 | --report_to tensorboard \
73 | --evaluation_strategy no \
74 | --logging_strategy steps \
75 | --logging_steps 200 \
76 | --save_strategy no
77 | ```
78 | 
79 | Inference code: [run_llama_lora.py](./poetry/run_llama_lora.py)
80 | 
81 | ### Poetry generator based on FRED T5 XL
82 | 
83 | Fine-tuning code: [finetune_fredt5_poetry_generator.py](./poetry/finetune_fredt5_poetry_generator.py)
84 | 
85 | Launching the fine-tune on 2 GPUs:
86 | 
87 | ```
88 | python -m torch.distributed.run --nproc_per_node=2 finetune_fredt5_poetry_generator.py \
89 | --model_name_or_path ai-forever/FRED-T5-1.7B \
90 | --dataset_path ~/polygon/text_generator/tmp/all_verses.jsonl \
91 | --prompt prompt_text \
92 | --optim "adafactor" \
93 | --learning_rate 1e-3 \
94 | --lr_scheduler_type constant \
95 | --per_device_train_batch_size 8 \
96 | --gradient_checkpointing 0 \
97 | --gradient_accumulation_steps 4 \
98 | --num_train_epochs 1 \
99 | --report_to tensorboard \
100 | --logging_strategy steps \
101 | --logging_steps 100 \
102 | --output_dir ~/polygon/text_generator/tmp/verses_fredt5 \
103 | --save_strategy no
104 | ```
105 | 
106 | Running inference: [run_fredt5_poetry_generator.py](./poetry/run_fredt5_poetry_generator.py)
107 | 
108 | 
109 | ### Poetry generator based on the rugpt models (except rugpt13B)
110 | 
111 | Fine-tuning code: [finetune_rugpt_with_prompt_masking.py](./poetry/finetune_rugpt_with_prompt_masking.py)
112 | 
113 | Launch on 2 GPUs, base model rugpt3large_based_on_gpt2:
114 | 
115 | ```
116 | python -m torch.distributed.run --nproc_per_node=2 finetune_rugpt_with_prompt_masking.py \
117 | --dataset_path ~/polygon/text_generator/tmp/лирика.jsonl \
118 | --output_syllables 1 \
119 | --model_name_or_path sberbank-ai/rugpt3large_based_on_gpt2 \
120 | --output_dir ~/polygon/text_generator/tmp/verses_model=rugpt_large_domain=lyrics_syllables=1 \
121 | --overwrite_output_dir 1 \
122 | --per_device_train_batch_size 8 \
123 | --learning_rate 5e-5 \
124 | --num_train_epochs 1 \
125 | --fp16 1 \
126 | --gradient_checkpointing 0 \
127 | --gradient_accumulation_step 8 \
128 | --do_train 1 \
129 | --do_eval 0 \
130 | --report_to tensorboard \
131 | --evaluation_strategy no \
132 | --logging_strategy steps \
133 | --logging_steps 200 \
134 | --save_strategy no
135 | ```
136 | 
137 | Inference: [run_rugpt_generator.py](./poetry/run_rugpt_generator.py)
138 | 
139 | 
140 | ## CHITCHAT
141 | 
142 | ### Chitchat fine-tune based on the FRED T5 XL 1.7B model
143 | 
144 | The distinctive feature of the approach: instead of a plain text prefix, the input sequence starts with one of FRED T5's denoiser selector tokens,
145 | and a dedicated marker token is added at the position (the end of the dialogue) where the generated reply should go.
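A minimal sketch of how one training pair is assembled for this scheme, mirroring `load_samples()` in finetune_chitchat_fredt5_with_trainer.py. The concrete selector and marker token strings are placeholders here, not the literal tokens from the original code, so substitute the ones expected by your FRED T5 checkpoint:

```
# Sketch of building one seq2seq sample for the FRED T5 chitchat fine-tune.
# DENOISER_SELECTOR and REPLY_MARKER are hypothetical placeholders; use the
# selector and marker tokens that your FRED T5 checkpoint actually expects.
DENOISER_SELECTOR = "<denoiser-selector>"   # placeholder
REPLY_MARKER = "<reply-marker>"             # placeholder

def build_sample(context_turns, reply):
    # context_turns: list of (role, text) pairs, where role is "человек" or "чатбот"
    history = "\n".join(f"{role}: {text}" for role, text in context_turns)
    seed = DENOISER_SELECTOR + history + "\nчатбот: "   # encoder input
    target = REPLY_MARKER + reply                       # decoder target
    return seed, target

seed, target = build_sample([("человек", "Привет! Как дела?")], "Привет, всё отлично!")
```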
146 | 
147 | Code: [finetune_chitchat_fredt5_with_trainer.py](./chitchat/finetune_chitchat_fredt5_with_trainer.py).
148 | 
149 | Example launch on 1 GPU:
150 | 
151 | ```
152 | python finetune_chitchat_fredt5_with_trainer.py \
153 | --dataset_path axioma_dialogues.json \
154 | --optim "adafactor" \
155 | --learning_rate 1e-4 \
156 | --lr_scheduler_type constant \
157 | --per_gpu_train_batch_size 6 \
158 | --gradient_checkpointing 0 \
159 | --gradient_accumulation_steps 8 \
160 | --num_train_epochs 1 \
161 | --report_to tensorboard \
162 | --logging_strategy steps \
163 | --logging_steps 500 \
164 | --output_dir ~/polygon/chatbot/tmp/fredt5_chitchat \
165 | --save_strategy no
166 | ```
167 | 
168 | The dataset for this model, [axioma_dialogues.json](./chitchat/axioma_dialogues.json), is built from the Russian-language part of the [OpenAssistant project dataset](https://huggingface.co/datasets/OpenAssistant/oasst1).
169 | Each reply together with its preceding context forms a separate training sample for the seq2seq model. Human and chatbot turns are marked
170 | with dedicated role tags; for fine-tuning they are converted into the prefixes `человек:` and `чатбот:`.
171 | 
172 | After fine-tuning, generation can be run with [run_chitchat_fredt5.py](./chitchat/run_chitchat_fredt5.py).
173 | 
174 | ### Chitchat fine-tune based on sberbank-ai/rugpt3medium_based_on_gpt2
175 | 
176 | Also suitable for other models of the rugpt family.
177 | 
178 | Code: [finetune_chitchat_gpt_with_trainer.py](./chitchat/finetune_chitchat_gpt_with_trainer.py).
179 | 
180 | Dataset: [axioma_dialogues.solid.json](./chitchat/axioma_dialogues.solid.json)
181 | 
182 | Launching the fine-tune on 1 GPU:
183 | 
184 | ```
185 | python finetune_chitchat_gpt_with_trainer.py \
186 | --model_name_or_path sberbank-ai/rugpt3medium_based_on_gpt2 \
187 | --learning_rate 1e-5 \
188 | --lr_scheduler_type constant \
189 | --per_gpu_train_batch_size 2 \
190 | --gradient_checkpointing 0 \
191 | --gradient_accumulation_steps 8 \
192 | --num_train_epochs 1 \
193 | --report_to tensorboard \
194 | --logging_strategy steps \
195 | --logging_steps 100 \
196 | --output_dir ~/polygon/chatbot/tmp/rugpt_chitchat \
197 | --save_strategy no
198 | ```
199 | 
200 | Inference code: [run_chitchat_gpt.py](./chitchat/run_chitchat_gpt.py).
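For programmatic use, the `Chitchat` class defined in run_chitchat_gpt.py can also be driven directly instead of through the console loop. A minimal usage sketch is below; the models_dir path is an example, and since `reply()` reads a module-level `device` name, the sketch sets it explicitly because the `__main__` block does not run on import:

```
# Minimal usage sketch of the Chitchat class from run_chitchat_gpt.py.
# The models_dir path is an example; point it at the directory that contains
# the fine-tuned "rugpt_chitchat" checkpoint produced by the training run above.
import os
import torch
import run_chitchat_gpt as rcg

device = "cuda" if torch.cuda.is_available() else "cpu"
rcg.device = device   # reply() resolves a module-level `device` name at call time

chitchat = rcg.Chitchat(device, os.path.expanduser("~/polygon/chatbot/tmp"))

history = ["человек: Привет! Чем займёмся вечером?"]
for reply in chitchat.reply(history, num_return_sequences=3):
    print(reply)
```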
201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /chitchat/finetune_chitchat_fredt5_with_trainer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Тренировка модели болталки Axioma на FRED T5 для проекта https://github.com/Koziev/chatbot 3 | Эксперимент с файнтюном: токены истории диалога не включаем в backprop, присваивая соответствующим целям (labels) значение -100 4 | Прочие хинты по тренировке: https://kelijah.livejournal.com/315826.html 5 | """ 6 | 7 | import os 8 | import json 9 | import sys 10 | import io 11 | import random 12 | import itertools 13 | from typing import Any, Dict, List, Optional, Tuple, Union 14 | import shutil 15 | import logging 16 | from dataclasses import dataclass, field 17 | 18 | import torch 19 | import torch.optim 20 | from torch.utils.data import Dataset, DataLoader 21 | import transformers 22 | from transformers import AutoTokenizer 23 | from transformers import TrainingArguments, Trainer, TrainerCallback 24 | from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config 25 | from transformers import HfArgumentParser 26 | from pynvml import * 27 | 28 | 29 | proj_dir = os.path.expanduser('~/polygon/chatbot') 30 | 31 | 32 | def print_gpu_utilization(): 33 | nvmlInit() 34 | handle = nvmlDeviceGetHandleByIndex(0) 35 | info = nvmlDeviceGetMemoryInfo(handle) 36 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") 37 | 38 | 39 | def load_samples(dataset_path, tokenizer): 40 | samples = [] 41 | with open(dataset_path, 'r') as f: 42 | for sample in json.load(f): 43 | try: 44 | # 01.05.2023 эксперимент: вместо спецтокенов и используем метки 45 | seed = '' + sample['context'].replace('', 'человек: ').replace('', 'чатбот: ') + '\nчатбот: ' 46 | reply = '' + sample['reply'] 47 | input_tokens = tokenizer.encode(seed, add_special_tokens=False, truncation=True, max_length=1024) 48 | output_tokens = tokenizer.encode(reply, add_special_tokens=False) # , truncation=True, max_length=1024) 49 | if len(input_tokens) < 512 and len(output_tokens) < 512: # пока ограничим многословность 50 | samples.append({'input_tokens': input_tokens, 51 | 'output_tokens': output_tokens, 52 | 'seed': seed, 53 | 'reply': reply}) 54 | except Exception as ex: 55 | print(ex) 56 | 57 | return samples 58 | 59 | 60 | class FinetuneDataset(Dataset): 61 | def __init__(self, samples, tokenizer): 62 | self.tokenizer = tokenizer 63 | self.max_input_len = 0 64 | self.max_output_len = 0 65 | self.samples = [] 66 | 67 | self.bos_token_id = tokenizer.encode('', add_special_tokens=False)[0] 68 | self.eos_token_id = tokenizer.encode('', add_special_tokens=False)[0] 69 | self.pad_token_id = tokenizer.encode('', add_special_tokens=False)[0] 70 | 71 | for sample in samples: 72 | input_ids = sample['input_tokens'] 73 | output_ids = sample['output_tokens'] + [self.eos_token_id] 74 | self.samples.append((input_ids, output_ids)) 75 | self.max_input_len = max(self.max_input_len, len(input_ids)) 76 | self.max_output_len = max(self.max_output_len, len(output_ids)) 77 | 78 | def __len__(self): 79 | return len(self.samples) 80 | 81 | def __getitem__(self, index: int): 82 | input_ids, output_ids = self.samples[index] 83 | 84 | input_npad = self.max_input_len - len(input_ids) 85 | attention_mask = [1]*len(input_ids) + [0]*input_npad 86 | input_ids = input_ids + input_npad * [self.pad_token_id] 87 | 88 | output_npad = self.max_output_len - len(output_ids) 89 | 
labels = output_ids + output_npad * [-100] 90 | 91 | return {'input_ids': torch.LongTensor(input_ids), 92 | 'attention_mask': attention_mask, 93 | 'labels': torch.LongTensor(labels), 94 | } 95 | 96 | 97 | @dataclass 98 | class ModelArguments: 99 | """ 100 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 101 | """ 102 | model_name_or_path: Optional[str] = field( 103 | default='ai-forever/FRED-T5-1.7B', 104 | metadata={"help": "The model checkpoint for weights initialization."}, 105 | ) 106 | 107 | 108 | @dataclass 109 | class DataTrainingArguments: 110 | """ 111 | Arguments pertaining to what data we are going to input our model for training and eval. 112 | """ 113 | dataset_path: Optional[str] = field( 114 | default=os.path.join(proj_dir, 'tmp', 'axioma_dialogues.json'), 115 | metadata={"help": "Путь к датасету с диалогами"} 116 | ) 117 | 118 | 119 | if __name__ == '__main__': 120 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 121 | 122 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 123 | 124 | if not training_args.optim: 125 | training_args.optim = "adafactor" 126 | 127 | if not training_args.output_dir: 128 | training_args.output_dir = os.path.join(proj_dir, 'tmp', 'fredt5_chitchat') 129 | 130 | verbose = training_args.local_rank in (-1, 0) 131 | 132 | # Setup logging 133 | logging.basicConfig( 134 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 135 | datefmt="%m/%d/%Y %H:%M:%S", 136 | handlers=[logging.StreamHandler(sys.stdout)], 137 | ) 138 | 139 | log_level = training_args.get_process_log_level() 140 | logger = logging.getLogger(__name__) 141 | logger.setLevel(log_level) 142 | #datasets.utils.logging.set_verbosity(log_level) 143 | transformers.utils.logging.set_verbosity(log_level) 144 | transformers.utils.logging.enable_default_handler() 145 | transformers.utils.logging.enable_explicit_format() 146 | 147 | logger.info( 148 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 149 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 150 | ) 151 | logger.info(f"Training/evaluation parameters {training_args}") 152 | 153 | rank0 = training_args.local_rank in (-1, 0) 154 | 155 | # Удаляем старые логи tensorboard 156 | if rank0: 157 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs') 158 | #if os.path.exists(tensorboard_dir): 159 | # logger.info('Removing "%s"', tensorboard_dir) 160 | # shutil.rmtree(tensorboard_dir) 161 | 162 | device = training_args.device 163 | logger.info('device={}'.format(device)) 164 | 165 | pretrained_model_name = model_args.model_name_or_path 166 | 167 | logger.info('Loading pretrained model "%s"', pretrained_model_name) 168 | tokenizer = transformers.GPT2Tokenizer.from_pretrained(pretrained_model_name) 169 | model = transformers.T5ForConditionalGeneration.from_pretrained(pretrained_model_name) 170 | model.to(device) 171 | 172 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 173 | 174 | if rank0: 175 | print_gpu_utilization() 176 | logger.info('\nTokenizer:') 177 | for token in ' '.split(): 178 | logger.info('token "%s" id=%s'.format(token, str(tokenizer.encode(token, add_special_tokens=False)))) 179 | 180 | logger.info('Loading dataset "%s"...', data_args.dataset_path) 181 | train_samples = load_samples(data_args.dataset_path, tokenizer) 182 | logger.info('Train 
samples: %d', len(train_samples)) 183 | 184 | train_dataset = FinetuneDataset(train_samples, tokenizer) 185 | # test_dataset = FinetuneDataset(test_samples, tokenizer) 186 | 187 | trainer = Trainer( 188 | model=model, 189 | args=training_args, 190 | train_dataset=train_dataset, 191 | tokenizer=tokenizer, 192 | data_collator=None, 193 | ) 194 | 195 | try: 196 | logger.info('Start training...') 197 | train_result = trainer.train() 198 | 199 | if rank0: 200 | metrics = train_result.metrics 201 | trainer.log_metrics("train", metrics) 202 | trainer.save_metrics("train", metrics) 203 | except KeyboardInterrupt: 204 | print('!!! Ctrl+C !!!') 205 | 206 | if rank0: 207 | logger.info(f'Saving the model and tokenizer') 208 | trainer.save_model(output_dir=training_args.output_dir) 209 | tokenizer.save_pretrained(training_args.output_dir) 210 | #model.save_pretrained(training_args.output_dir) 211 | 212 | logger.info('All done :)') 213 | -------------------------------------------------------------------------------- /poetry/finetune_fredt5_poetry_generator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Тренировка модели генерации стихов на претрейненной модели FRED T5 XL 3 | """ 4 | import logging 5 | import os 6 | import json 7 | import io 8 | import random 9 | import itertools 10 | import sys 11 | from dataclasses import dataclass, field 12 | from typing import Any, Dict, List, Optional, Tuple, Union 13 | import shutil 14 | import argparse 15 | 16 | import numpy as np 17 | import tqdm 18 | import sklearn.model_selection 19 | import torch 20 | import scipy 21 | import torch.nn as nn 22 | import torch.nn.functional as F 23 | import torch.optim 24 | from torch.utils.tensorboard import SummaryWriter 25 | from torch.utils.data import Dataset, DataLoader 26 | from transformers import AutoModelForCausalLM 27 | import transformers 28 | from transformers import AutoTokenizer 29 | from transformers import TrainingArguments, Trainer, TrainerCallback 30 | from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config 31 | from transformers import HfArgumentParser 32 | from pynvml import * 33 | 34 | 35 | proj_dir = os.path.expanduser('~/polygon/text_generator') 36 | 37 | 38 | def print_gpu_utilization(): 39 | nvmlInit() 40 | handle = nvmlDeviceGetHandleByIndex(0) 41 | info = nvmlDeviceGetMemoryInfo(handle) 42 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") 43 | 44 | 45 | def load_samples(dataset_path, tokenizer): 46 | samples = [] 47 | with open(dataset_path, 'r') as f: 48 | for line in f: 49 | sample = json.loads(line) 50 | try: 51 | input_text = '' + sample['prompt_text'] 52 | 53 | # Вариант с генерацией обычного текста 54 | output_text = sample['output_text'] 55 | output_text = '\n'.join(output_text.split('\n')[:4]) 56 | 57 | input_tokens = tokenizer.encode(input_text, add_special_tokens=False, truncation=True, max_length=512) 58 | output_tokens = tokenizer.encode(output_text, add_special_tokens=False) 59 | samples.append({'input_tokens': input_tokens, 'output_tokens': output_tokens, 60 | 'input_text': input_text, 'output_text': output_text}) 61 | 62 | except Exception as ex: 63 | print(ex) 64 | 65 | return samples 66 | 67 | 68 | class FinetuneDataset(Dataset): 69 | def __init__(self, samples, tokenizer): 70 | self.tokenizer = tokenizer 71 | self.max_input_len = 0 72 | self.max_output_len = 0 73 | self.samples = [] 74 | 75 | self.bos_token_id = tokenizer.encode('', add_special_tokens=False)[0] 76 | self.eos_token_id = 
tokenizer.encode('', add_special_tokens=False)[0] 77 | self.pad_token_id = tokenizer.encode('', add_special_tokens=False)[0] 78 | 79 | for sample in samples: 80 | input_ids = sample['input_tokens'] 81 | output_ids = sample['output_tokens'] + [self.eos_token_id] 82 | self.samples.append((input_ids, output_ids)) 83 | self.max_input_len = max(self.max_input_len, len(input_ids)) 84 | self.max_output_len = max(self.max_output_len, len(output_ids)) 85 | 86 | def __len__(self): 87 | return len(self.samples) 88 | 89 | def __getitem__(self, index: int): 90 | input_ids, output_ids = self.samples[index] 91 | 92 | input_npad = self.max_input_len - len(input_ids) 93 | attention_mask = [1]*len(input_ids) + [0]*input_npad 94 | input_ids = input_ids + input_npad * [self.pad_token_id] 95 | 96 | output_npad = self.max_output_len - len(output_ids) 97 | labels = output_ids + output_npad * [-100] 98 | 99 | return {'input_ids': torch.LongTensor(input_ids), 100 | 'attention_mask': attention_mask, 101 | 'labels': torch.LongTensor(labels), 102 | } 103 | 104 | 105 | @dataclass 106 | class ModelArguments: 107 | """ 108 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 109 | """ 110 | model_name_or_path: Optional[str] = field( 111 | default='ai-forever/FRED-T5-1.7B', 112 | metadata={"help": "The model checkpoint for weights initialization."}, 113 | ) 114 | 115 | 116 | @dataclass 117 | class DataTrainingArguments: 118 | """ 119 | Arguments pertaining to what data we are going to input our model for training and eval. 120 | """ 121 | dataset_path: Optional[str] = field( 122 | default=os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 'long_poems_gpt_dataset.jsonl')), 123 | metadata={"help": "Путь к датасету со стихами"} 124 | ) 125 | 126 | 127 | class MyPrinterCallback(TrainerCallback): 128 | def __init__(self, filepath): 129 | self.wrt = open(filepath, 'w') 130 | 131 | def on_log(self, args, state, control, logs=None, **kwargs): 132 | if state.is_local_process_zero: 133 | if 'epoch' in logs and 'loss' in logs: 134 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss'])) 135 | self.wrt.flush() 136 | 137 | 138 | if __name__ == '__main__': 139 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 140 | 141 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 142 | 143 | if not training_args.optim: 144 | training_args.optim = "adafactor" 145 | 146 | if not training_args.output_dir: 147 | training_args.output_dir = os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 't5_poetry_generator')) 148 | 149 | verbose = training_args.local_rank in (-1, 0) 150 | 151 | # Setup logging 152 | logging.basicConfig( 153 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 154 | datefmt="%m/%d/%Y %H:%M:%S", 155 | handlers=[logging.StreamHandler(sys.stdout)], 156 | ) 157 | 158 | log_level = training_args.get_process_log_level() 159 | logger = logging.getLogger(__name__) 160 | logger.setLevel(log_level) 161 | #datasets.utils.logging.set_verbosity(log_level) 162 | transformers.utils.logging.set_verbosity(log_level) 163 | transformers.utils.logging.enable_default_handler() 164 | transformers.utils.logging.enable_explicit_format() 165 | 166 | logger.info( 167 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 168 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 169 | ) 170 | 
logger.info(f"Training/evaluation parameters {training_args}") 171 | 172 | # Удаляем старые логи tensorboard 173 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs') 174 | if os.path.exists(tensorboard_dir): 175 | logger.info('Removing "%s"', tensorboard_dir) 176 | shutil.rmtree(tensorboard_dir) 177 | 178 | device = training_args.device 179 | logging.info('device={}'.format(device)) 180 | 181 | pretrained_model_name = model_args.model_name_or_path 182 | 183 | logger.info('Loading pretrained model "%s"', pretrained_model_name) 184 | if 'FRED-T5' in pretrained_model_name: 185 | tokenizer = transformers.GPT2Tokenizer.from_pretrained(pretrained_model_name) 186 | else: 187 | tokenizer = transformers.T5Tokenizer.from_pretrained(pretrained_model_name) 188 | 189 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 190 | 191 | model = transformers.T5ForConditionalGeneration.from_pretrained(pretrained_model_name) 192 | model.to(device) 193 | print_gpu_utilization() 194 | 195 | logger.info('\nLoading dataset "%s"...', data_args.dataset_path) 196 | train_samples = load_samples(data_args.dataset_path, tokenizer) 197 | logger.info('Train samples: %d', len(train_samples)) 198 | 199 | train_dataset = FinetuneDataset(train_samples, tokenizer) 200 | # test_dataset = FinetuneDataset(test_samples, tokenizer) 201 | 202 | printer = MyPrinterCallback(os.path.join(proj_dir, 'tmp', 'finetune_fredt5_poetry_generator.loss.log')) 203 | 204 | trainer = Trainer( 205 | model=model, 206 | args=training_args, 207 | train_dataset=train_dataset, 208 | # eval_dataset=test_dataset, 209 | tokenizer=tokenizer, 210 | data_collator=None, 211 | # compute_metrics=compute_metrics, 212 | callbacks=[printer] #[EarlyStoppingCallback(early_stopping_patience=5)] 213 | ) 214 | 215 | logger.info('Start training...') 216 | train_result = trainer.train() 217 | 218 | logger.info(f'Saving the model and tokenizer') 219 | trainer.save_model(output_dir=training_args.output_dir) 220 | 221 | metrics = train_result.metrics 222 | trainer.log_metrics("train", metrics) 223 | trainer.save_metrics("train", metrics) 224 | 225 | logger.info('All done :)') 226 | -------------------------------------------------------------------------------- /chitchat/finetune_chitchat_gpt_with_trainer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Тренировка модели болталки Axioma. 
3 | Эксперимент с файнтюном: токены истории диалога не включаем в backprop, присваивая соответствующим целям (labels) значение -100 4 | """ 5 | import logging 6 | import os 7 | import json 8 | import sys 9 | import io 10 | import random 11 | import itertools 12 | import shutil 13 | from dataclasses import dataclass, field 14 | from typing import Any, Dict, List, Optional, Tuple, Union 15 | 16 | import numpy as np 17 | import tqdm 18 | import sklearn.model_selection 19 | import torch 20 | import scipy 21 | import torch.nn as nn 22 | import torch.nn.functional as F 23 | import torch.optim as optim 24 | from torch.utils.tensorboard import SummaryWriter 25 | from torch.utils.data import Dataset, DataLoader 26 | from transformers import AutoModelForCausalLM 27 | import transformers 28 | from transformers import AutoTokenizer 29 | from transformers import TrainingArguments, Trainer, TrainerCallback 30 | from transformers import HfArgumentParser 31 | from pynvml import * 32 | 33 | 34 | proj_dir = os.path.expanduser('~/polygon/chatbot') 35 | 36 | 37 | def print_gpu_utilization(): 38 | nvmlInit() 39 | handle = nvmlDeviceGetHandleByIndex(0) 40 | info = nvmlDeviceGetMemoryInfo(handle) 41 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") 42 | 43 | 44 | def load_samples(dataset_path, tokenizer): 45 | samples = [] 46 | with open(dataset_path, 'r') as f: 47 | data = json.load(f) 48 | for sample in tqdm.tqdm(data, desc='Loading samples', total=len(data)): 49 | try: 50 | lines = [] 51 | for i, msg in enumerate(sample): 52 | if 0 == (i % 2): 53 | lines.append('человек: ' + msg) 54 | else: 55 | lines.append('чатбот: ' + msg) 56 | 57 | text = '\n'.join(lines) 58 | tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False) 59 | if len(tokens) < 512: 60 | samples.append({'tokens': tokens, 'text': text}) 61 | else: 62 | lines0 = list(lines) 63 | 64 | lines = lines[:-1] 65 | while len(lines) > 1: 66 | text = '\n'.join(lines) 67 | tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False) 68 | if len(tokens) < 512: 69 | samples.append({'tokens': tokens, 'text': text}) 70 | break 71 | else: 72 | lines = lines[:-1] 73 | 74 | 75 | except Exception as ex: 76 | print(ex) 77 | 78 | return samples 79 | 80 | 81 | class FinetuneDataset(Dataset): 82 | def __init__(self, samples, tokenizer): 83 | self.tokenizer = tokenizer 84 | self.max_len = 0 85 | self.samples = [] 86 | 87 | self.bos_token_id = tokenizer.encode('', add_special_tokens=False)[0] 88 | self.eos_token_id = tokenizer.encode('', add_special_tokens=False)[0] 89 | self.pad_token_id = tokenizer.encode('', add_special_tokens=False)[0] 90 | 91 | for sample in samples: 92 | input_ids = [self.bos_token_id] + sample['tokens'] + [self.eos_token_id] 93 | labels = input_ids 94 | attention_map = [1] * len(labels) 95 | self.samples.append((input_ids, labels, attention_map)) 96 | self.max_len = max(self.max_len, len(input_ids)) 97 | 98 | def __len__(self): 99 | return len(self.samples) 100 | 101 | def __getitem__(self, index: int): 102 | input_ids, labels, attention_map = self.samples[index] 103 | npad = self.max_len - len(input_ids) 104 | input_ids = input_ids + npad * [self.pad_token_id] 105 | labels = labels + [-100] * npad 106 | attention_mask = attention_map + [0] * npad 107 | return {'input_ids': torch.LongTensor(input_ids), 108 | 'labels': torch.LongTensor(labels), 109 | 'attention_mask': torch.LongTensor(attention_mask)} 110 | 111 | 112 | @dataclass 113 | class ModelArguments: 114 | """ 115 | Arguments pertaining to which 
model/config/tokenizer we are going to fine-tune, or train from scratch. 116 | """ 117 | model_name_or_path: Optional[str] = field( 118 | default='sberbank-ai/rugpt3medium_based_on_gpt2', 119 | metadata={"help": "The model checkpoint for weights initialization."}, 120 | ) 121 | 122 | 123 | @dataclass 124 | class DataTrainingArguments: 125 | """ 126 | Arguments pertaining to what data we are going to input our model for training and eval. 127 | """ 128 | dataset_path: Optional[str] = field( 129 | default=os.path.join(proj_dir, 'tmp', 'axioma_dialogues.solid.json'), 130 | metadata={"help": "Путь к датасету со диалогами"} 131 | ) 132 | 133 | 134 | if __name__ == '__main__': 135 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 136 | 137 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 138 | 139 | if not training_args.output_dir: 140 | training_args.output_dir = os.path.join(proj_dir, 'tmp', 'rugpt_chitchat') 141 | 142 | rank0 = training_args.local_rank in (-1, 0) 143 | 144 | # Setup logging 145 | logging.basicConfig( 146 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 147 | datefmt="%m/%d/%Y %H:%M:%S", 148 | handlers=[logging.StreamHandler(sys.stdout)], 149 | ) 150 | 151 | log_level = training_args.get_process_log_level() 152 | logger = logging.getLogger(__name__) 153 | logger.setLevel(log_level) 154 | #datasets.utils.logging.set_verbosity(log_level) 155 | transformers.utils.logging.set_verbosity(log_level) 156 | transformers.utils.logging.enable_default_handler() 157 | transformers.utils.logging.enable_explicit_format() 158 | 159 | logger.info( 160 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 161 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 162 | ) 163 | logger.info(f"Training/evaluation parameters {training_args}") 164 | 165 | # Удаляем старые логи tensorboard 166 | if rank0: 167 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs') 168 | if os.path.exists(tensorboard_dir): 169 | logger.info('Removing "%s"', tensorboard_dir) 170 | shutil.rmtree(tensorboard_dir) 171 | 172 | pretrained_model_name = model_args.model_name_or_path 173 | 174 | print('Loading pretrained model "{}"...'.format(pretrained_model_name)) 175 | if 'xglm' in pretrained_model_name.lower(): 176 | tokenizer = transformers.XGLMTokenizer.from_pretrained(pretrained_model_name) 177 | model = transformers.XGLMForCausalLM.from_pretrained(pretrained_model_name) 178 | elif 'bloom' in pretrained_model_name: 179 | tokenizer = transformers.BloomTokenizer.from_pretrained(pretrained_model_name) 180 | model = transformers.BloomForCausalLM.from_pretrained(pretrained_model_name) 181 | else: 182 | tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model_name) 183 | model = transformers.AutoModelForCausalLM.from_pretrained(pretrained_model_name) 184 | 185 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 186 | 187 | if rank0: 188 | print_gpu_utilization() 189 | 190 | print('\nTokenizer:') 191 | for token in ' '.split(): 192 | print('token "{}" id={}'.format(token, tokenizer.encode(token, add_special_tokens=False))) 193 | 194 | print('\nLoading dataset...') 195 | train_samples = load_samples(data_args.dataset_path, tokenizer) 196 | print('Train samples: {}'.format(len(train_samples))) 197 | 198 | train_dataset = FinetuneDataset(train_samples, tokenizer) 199 | 200 | trainer = 
Trainer(
201 |         model=model,
202 |         args=training_args,
203 |         train_dataset=train_dataset,
204 |         tokenizer=tokenizer,
205 |         data_collator=None,
206 |         # compute_metrics=compute_metrics,
207 |         # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
208 |     )
209 | 
210 |     logger.info('Start training...')
211 |     try:
212 |         train_result = trainer.train()
213 | 
214 |         metrics = train_result.metrics
215 |         trainer.log_metrics("train", metrics)
216 |         trainer.save_metrics("train", metrics)
217 |     except KeyboardInterrupt:
218 |         print('!!! CTRL+C !!!')
219 | 
220 |     logger.info('Saving the model and tokenizer to "%s"', training_args.output_dir)
221 |     trainer.save_model(output_dir=training_args.output_dir)
222 |     #model.save_pretrained(training_args.output_dir)
223 |     #tokenizer.save_pretrained(training_args.output_dir)
224 | 
225 |     logger.info('All done :)')
226 | 
--------------------------------------------------------------------------------
/poetry/finetune_rugpt13b.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Training a syllabo-tonic poetry generation model on top of rugpt13B using deepspeed.
  3 | """
  4 | import glob
  5 | import logging
  6 | import os
  7 | import json
  8 | from dataclasses import dataclass, field
  9 | from typing import Any, Dict, List, Optional, Tuple, Union
 10 | import shutil
 11 | import sys
 12 | from torch.utils.data import Dataset, DataLoader
 13 | from transformers import AutoModelForCausalLM
 14 | import transformers
 15 | from transformers import AutoTokenizer
 16 | from transformers import TrainingArguments, Trainer, TrainerCallback
 17 | from transformers import HfArgumentParser
 18 | import deepspeed
 19 | from pynvml import *
 20 | 
 21 | 
 22 | proj_dir = os.path.expanduser('~/polygon/text_generator')
 23 | 
 24 | 
 25 | def print_gpu_utilization():
 26 |     nvmlInit()
 27 |     handle = nvmlDeviceGetHandleByIndex(0)
 28 |     info = nvmlDeviceGetMemoryInfo(handle)
 29 |     logger.info(f"GPU memory occupied: {info.used//1024**2} MB.")
 30 | 
 31 | 
 32 | def pad_sequence(sequence, pad_id, max_len):
 33 |     l = len(sequence)
 34 |     if l < max_len:
 35 |         return sequence + [pad_id] * (max_len - l)
 36 |     else:
 37 |         return sequence
 38 | 
 39 | 
 40 | def load_samples(data_args, tokenizer):
 41 |     samples = []
 42 |     with open(data_args.dataset_path, 'r') as f:
 43 |         for sample_str in f:
 44 |             sample = json.loads(sample_str)
 45 |             prompt = sample['prompt_text']
 46 |             if prompt:
 47 |                 if data_args.output_syllables:
 48 |                     # Variant that generates a chain of syllables
 49 |                     lines = []
 50 |                     for line in sample['output'].split(''):
 51 |                         line = line.strip()
 52 |                         tokens = line.split(' ')
 53 |                         tokens = tokens[::-1]
 54 |                         line = ' '.join(tokens)
 55 |                         line = line.replace(' | ', '|')
 56 |                         line = line.replace(' ', '\u2010')
 57 |                         line = line.replace('|', ' ')
 58 |                         lines.append(line)
 59 |                     output_text = '\n'.join(lines)
 60 |                 else:
 61 |                     # Plain text is generated.
 62 |                     output_text = sample['output_text']
 63 | 
 64 |                 # 29.04.2023: limit to the first 2 quatrains
 65 |                 output_text = '\n\n'.join(output_text.split('\n\n')[:2])
 66 | 
 67 |                 input_tokens = tokenizer.encode(prompt, add_special_tokens=False)
 68 |                 output_tokens = tokenizer.encode(output_text, add_special_tokens=False)
 69 |                 samples.append((input_tokens, output_tokens, prompt, output_text))
 70 | 
 71 |                 if data_args.max_samples > 0 and len(samples) >= data_args.max_samples:
 72 |                     break
 73 | 
 74 |     return samples
 75 | 
 76 | 
 77 | class FinetuneDataset(Dataset):
 78 |     def __init__(self, samples, tokenizer):
 79 |         self.tokenizer = tokenizer
 80 |         self.max_len = 0
 81 |         self.samples = []
 82 | 
 83 |         self.bos_token_id = tokenizer.bos_token_id
 84 |         self.eos_token_id = tokenizer.eos_token_id
 85 |         assert len(tokenizer.encode('#', add_special_tokens=False)) == 1
 86 |         self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0]
 87 |         self.pad_token_id = tokenizer.pad_token_id
 88 | 
 89 |         for src_ids, output_ids, src_text, output_text in samples:
 90 |             input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id]
 91 | 
 92 |             # Prompt tokens get label=-100
 93 |             labels = [-100] + [-100]*len(src_ids) + [-100] + output_ids + [self.eos_token_id]
 94 | 
 95 |             attention_map = [1] * len(labels)
 96 | 
 97 |             self.samples.append((input_ids, labels, attention_map))
 98 |             self.max_len = max(self.max_len, len(input_ids))
 99 | 
100 |     def __len__(self):
101 |         return len(self.samples)
102 | 
103 |     def __getitem__(self, index: int):
104 |         input_ids, labels, attention_map = self.samples[index]
105 |         npad = self.max_len - len(input_ids)
106 |         input_ids = input_ids + npad*[self.pad_token_id]
107 |         labels = labels + [-100] * npad
108 |         attention_mask = attention_map + [0] * npad
109 |         return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask}
110 | 
111 | 
112 | @dataclass
113 | class ModelArguments:
114 |     """
115 |     Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
116 |     """
117 |     model_name_or_path: Optional[str] = field(
118 |         default='ai-forever/rugpt13b',
119 |         metadata={"help": "The model checkpoint for weights initialization."},
120 |     )
121 | 
122 | 
123 | @dataclass
124 | class DataSetArguments:
125 |     output_syllables: Optional[bool] = field(
126 |         metadata={"help": "Syllabo-tonic representation of the output text (true) or plain text (false)"}
127 |     )
128 | 
129 |     dataset_path: Optional[str] = field(
130 |         default=os.path.join(proj_dir, 'tmp', 'лирика.jsonl'),
131 |         metadata={"help": "Path to the dataset with poems"}
132 |     )
133 | 
134 |     max_samples: Optional[int] = field(
135 |         default=-1,
136 |         metadata={"help": "Maximum number of samples to read from the dataset"}
137 |     )
138 | 
139 | 
140 | class MyPrinterCallback(TrainerCallback):
141 |     def __init__(self, filepath):
142 |         self.wrt = open(filepath, 'w')
143 | 
144 |     def on_log(self, args, state, control, logs=None, **kwargs):
145 |         if state.is_local_process_zero:
146 |             if 'epoch' in logs and 'loss' in logs:
147 |                 self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss']))
148 |                 self.wrt.flush()
149 | 
150 | 
151 | if __name__ == '__main__':
152 |     parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments))
153 | 
154 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
155 | 
156 |     if not training_args.output_dir:
157 |         training_args.output_dir = os.path.join(proj_dir, 'tmp', 'verses_rugpt13b_lora')
158 | 
159 |     # Setup logging
160 |     logging.basicConfig(
161 |         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
162 |         datefmt="%m/%d/%Y %H:%M:%S",
163 |         handlers=[logging.StreamHandler(sys.stdout)],
164 |     )
165 | 
166 |     log_level = training_args.get_process_log_level()
167 |     logger = logging.getLogger(__name__)
168 |     logger.setLevel(log_level)
169 |     #datasets.utils.logging.set_verbosity(log_level)
170 |     transformers.utils.logging.set_verbosity(log_level)
171 |     transformers.utils.logging.enable_default_handler()
172 |     transformers.utils.logging.enable_explicit_format()
173 | 
174 |     logger.info(
175 |         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
176 |         + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
177 |     )
178 |     logger.info(f"Training/evaluation parameters {training_args}")
179 | 
180 |     # Remove old tensorboard logs
181 |     if training_args.local_rank in (-1, 0):
182 |         for f in glob.glob(training_args.output_dir+'/*'):
183 |             if os.path.isfile(f):
184 |                 os.remove(f)
185 | 
186 |         tensorboard_dir = os.path.join(training_args.output_dir, 'runs')
187 |         if os.path.exists(tensorboard_dir):
188 |             logger.info('Removing "%s"', tensorboard_dir)
189 |             shutil.rmtree(tensorboard_dir)
190 | 
191 |     #device = training_args.device
192 |     #logging.info('device={}'.format(device))
193 | 
194 |     logger.info('Loading tokenizer "%s"', model_args.model_name_or_path)
195 | 
196 |     tokenizer = transformers.AutoTokenizer.from_pretrained(model_args.model_name_or_path)
197 | 
198 |     tokenizer.add_special_tokens({'bos_token': '<s>', 'eos_token': '</s>', 'pad_token': '<pad>'})
199 |     tokenizer.save_pretrained(training_args.output_dir)
200 | 
201 |     for t in ['#', '<s>', '</s>', '<pad>']:
202 |         logger.debug('Tokenizer: token=%s ==> %s', t, str(tokenizer.encode(t, add_special_tokens=False)))
203 | 
204 |     logger.info('Loading pretrained model "%s"', model_args.model_name_or_path)
205 |     model = 
transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path) 206 | # model.half() 207 | 208 | if training_args.local_rank in (0, -1): 209 | print('=' * 80) 210 | print_gpu_utilization() 211 | 212 | print('=' * 30 + 'ZeRo 2' + '=' * 30) 213 | deepspeed.runtime.zero.stage_1_and_2.estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=1, 214 | num_nodes=1, 215 | additional_buffer_factor=1.5) 216 | print('=' * 80) 217 | 218 | 219 | logger.info('Loading dataset "%s"', data_args.dataset_path) 220 | train_samples = load_samples(data_args, tokenizer) 221 | logger.info('Training set: %d samples', len(train_samples)) 222 | 223 | train_dataset = FinetuneDataset(train_samples, tokenizer) 224 | 225 | printer = MyPrinterCallback(os.path.join(training_args.output_dir, 'finetune_rugpt13b.loss.log')) 226 | trainer = Trainer( 227 | model=model, 228 | args=training_args, 229 | train_dataset=train_dataset, 230 | tokenizer=tokenizer, 231 | data_collator=None, 232 | callbacks=[printer] 233 | ) 234 | 235 | logger.info('Start training...') 236 | train_result = trainer.train() 237 | 238 | logger.info(f'Saving the model and tokenizer') 239 | trainer.save_model(training_args.output_dir) 240 | 241 | metrics = train_result.metrics 242 | trainer.log_metrics("train", metrics) 243 | trainer.save_metrics("train", metrics) 244 | 245 | logger.info('All done :)') 246 | -------------------------------------------------------------------------------- /poetry/finetune_rugpt13b_lora.py: -------------------------------------------------------------------------------- 1 | """ 2 | Тренировка модели генерации силлабо-тонической поэзии поверх rugpt13B с использованием LoRa. 3 | """ 4 | import glob 5 | import logging 6 | import os 7 | import json 8 | import io 9 | import random 10 | import itertools 11 | import sys 12 | from typing import Any, Dict, List, Optional, Tuple, Union 13 | from dataclasses import dataclass, field 14 | from typing import Any, Dict, List, Optional, Tuple, Union 15 | import shutil 16 | from pathlib import Path 17 | 18 | import numpy as np 19 | import tqdm 20 | import sklearn.model_selection 21 | import torch 22 | import scipy 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | import torch.optim as optim 26 | from torch.utils.tensorboard import SummaryWriter 27 | from torch.utils.data import Dataset, DataLoader 28 | from transformers import AutoModelForCausalLM 29 | import transformers 30 | from transformers import AutoTokenizer 31 | from transformers import TrainingArguments, Trainer, TrainerCallback 32 | from transformers import HfArgumentParser 33 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 34 | from pynvml import * 35 | 36 | 37 | proj_dir = os.path.expanduser('~/polygon/text_generator') 38 | 39 | 40 | def print_gpu_utilization(): 41 | nvmlInit() 42 | handle = nvmlDeviceGetHandleByIndex(0) 43 | info = nvmlDeviceGetMemoryInfo(handle) 44 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") 45 | 46 | 47 | def pad_sequence(sequence, pad_id, max_len): 48 | l = len(sequence) 49 | if l < max_len: 50 | return sequence + [pad_id] * (max_len - l) 51 | else: 52 | return sequence 53 | 54 | 55 | def load_samples(data_args, tokenizer): 56 | samples = [] 57 | with open(data_args.dataset_path, 'r') as f: 58 | for sample_str in f: 59 | sample = json.loads(sample_str) 60 | prompt = sample['prompt_text'] 61 | if prompt: 62 | if data_args.output_syllables: 63 | # Вариант с генерацией цепочки слогов 64 | lines = [] 65 | for line 
in sample['output'].split(''):
 66 |                         line = line.strip()
 67 |                         tokens = line.split(' ')
 68 |                         tokens = tokens[::-1]
 69 |                         line = ' '.join(tokens)
 70 |                         line = line.replace(' | ', '|')
 71 |                         line = line.replace(' ', '\u2010')
 72 |                         line = line.replace('|', ' ')
 73 |                         lines.append(line)
 74 |                     output_text = '\n'.join(lines)
 75 |                 else:
 76 |                     # Plain text is generated.
 77 |                     output_text = sample['output_text']
 78 | 
 79 |                 # 29.04.2023: limit to the first 2 quatrains
 80 |                 output_text = '\n\n'.join(output_text.split('\n\n')[:2])
 81 | 
 82 |                 input_tokens = tokenizer.encode(prompt, add_special_tokens=False)
 83 |                 output_tokens = tokenizer.encode(output_text, add_special_tokens=False)
 84 |                 samples.append((input_tokens, output_tokens, prompt, output_text))
 85 | 
 86 |                 if data_args.max_samples > 0 and len(samples) >= data_args.max_samples:
 87 |                     break
 88 | 
 89 |     return samples
 90 | 
 91 | 
 92 | class FinetuneDataset(Dataset):
 93 |     def __init__(self, samples, tokenizer):
 94 |         self.tokenizer = tokenizer
 95 |         self.max_len = 0
 96 |         self.samples = []
 97 | 
 98 |         self.bos_token_id = tokenizer.bos_token_id
 99 |         self.eos_token_id = tokenizer.eos_token_id
100 |         assert len(tokenizer.encode('#', add_special_tokens=False)) == 1
101 |         self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0]
102 |         self.pad_token_id = tokenizer.pad_token_id
103 | 
104 |         for src_ids, output_ids, src_text, output_text in samples:
105 |             input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id]
106 | 
107 |             # Prompt tokens get label=-100
108 |             labels = [-100] + [-100]*len(src_ids) + [-100] + output_ids + [self.eos_token_id]
109 | 
110 |             attention_map = [1] * len(labels)
111 | 
112 |             self.samples.append((input_ids, labels, attention_map))
113 |             self.max_len = max(self.max_len, len(input_ids))
114 | 
115 |     def __len__(self):
116 |         return len(self.samples)
117 | 
118 |     def __getitem__(self, index: int):
119 |         input_ids, labels, attention_map = self.samples[index]
120 |         npad = self.max_len - len(input_ids)
121 |         input_ids = input_ids + npad*[self.pad_token_id]
122 |         labels = labels + [-100] * npad
123 |         attention_mask = attention_map + [0] * npad
124 |         return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask}
125 | 
126 | 
127 | @dataclass
128 | class ModelArguments:
129 |     """
130 |     Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
131 | """ 132 | model_name_or_path: Optional[str] = field( 133 | default='ai-forever/rugpt13b', 134 | metadata={"help": "The model checkpoint for weights initialization."}, 135 | ) 136 | 137 | 138 | @dataclass 139 | class DataSetArguments: 140 | output_syllables: Optional[bool] = field( 141 | metadata={"help": "Силлабо-тоническое представление выходного текста (true) или обычное (false)"} 142 | ) 143 | 144 | dataset_path: Optional[str] = field( 145 | default=os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 'лирика.jsonl')), 146 | metadata={"help": "Путь к датасету со стихами"} 147 | ) 148 | 149 | max_samples: Optional[int] = field( 150 | default=-1, 151 | metadata={"help": "Максимальное кол-во сэмплов, считываемых из датасета"} 152 | ) 153 | 154 | 155 | class MyPrinterCallback(TrainerCallback): 156 | def __init__(self, filepath): 157 | self.wrt = open(filepath, 'w') 158 | 159 | def on_log(self, args, state, control, logs=None, **kwargs): 160 | if state.is_local_process_zero: 161 | if 'epoch' in logs and 'loss' in logs: 162 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss'])) 163 | self.wrt.flush() 164 | 165 | 166 | if __name__ == '__main__': 167 | parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments)) 168 | 169 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 170 | 171 | if not training_args.output_dir: 172 | training_args.output_dir = os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 'verses_rugpt13b_lora')) 173 | 174 | # Setup logging 175 | logging.basicConfig( 176 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 177 | datefmt="%m/%d/%Y %H:%M:%S", 178 | handlers=[logging.StreamHandler(sys.stdout)], 179 | ) 180 | 181 | log_level = training_args.get_process_log_level() 182 | logger = logging.getLogger(__name__) 183 | logger.setLevel(log_level) 184 | #datasets.utils.logging.set_verbosity(log_level) 185 | transformers.utils.logging.set_verbosity(log_level) 186 | transformers.utils.logging.enable_default_handler() 187 | transformers.utils.logging.enable_explicit_format() 188 | 189 | logger.info( 190 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 191 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 192 | ) 193 | logger.info(f"Training/evaluation parameters {training_args}") 194 | 195 | # Удаляем старые логи tensorboard 196 | if training_args.local_rank in (-1, 0): 197 | for f in glob.glob(training_args.output_dir+'/*'): 198 | if os.path.isfile(f): 199 | os.remove(f) 200 | 201 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs') 202 | if os.path.exists(tensorboard_dir): 203 | logger.info('Removing "%s"', tensorboard_dir) 204 | shutil.rmtree(tensorboard_dir) 205 | 206 | device = training_args.device 207 | logging.info('device={}'.format(device)) 208 | 209 | logger.info('Loading tokenizer "%s"', model_args.model_name_or_path) 210 | 211 | tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_args.model_name_or_path) 212 | 213 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 214 | tokenizer.save_pretrained(training_args.output_dir) 215 | 216 | for t in ['#', '', '', '']: 217 | logger.debug('Tokenizer: token=%s ==> %s', t, str(tokenizer.encode(t, add_special_tokens=False))) 218 | 219 | logger.info('Loading pretrained model "%s"', model_args.model_name_or_path) 220 | model = 
transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path) 221 | 222 | logger.info('Wrapping model to peft...') 223 | lora_config = LoraConfig(**{ 224 | "r": 8, 225 | "lora_alpha": 16, 226 | "lora_dropout": 0.10, 227 | "bias": "none", 228 | #"target_modules": ["q_proj", "v_proj"], 229 | "task_type": "CAUSAL_LM" 230 | }) 231 | model = get_peft_model(model, lora_config) 232 | print(model.print_trainable_parameters()) 233 | 234 | logger.info('Loading dataset "%s"', data_args.dataset_path) 235 | train_samples = load_samples(data_args, tokenizer) 236 | logger.info('Training set: %d samples', len(train_samples)) 237 | 238 | train_dataset = FinetuneDataset(train_samples, tokenizer) 239 | 240 | printer = MyPrinterCallback(os.path.join(proj_dir, 'tmp', 'finetune_rugpt13b_lora.loss.log')) 241 | trainer = Trainer( 242 | model=model, 243 | args=training_args, 244 | train_dataset=train_dataset, 245 | tokenizer=tokenizer, 246 | data_collator=None, 247 | callbacks=[printer] 248 | ) 249 | 250 | logger.info('Start training for local_rank=%d...', training_args.local_rank) 251 | train_result = trainer.train() 252 | 253 | if training_args.local_rank in (0, -1): 254 | logger.info(f'Saving the model and tokenizer') 255 | model.save_pretrained(save_directory=training_args.output_dir) 256 | 257 | metrics = train_result.metrics 258 | trainer.log_metrics("train", metrics) 259 | trainer.save_metrics("train", metrics) 260 | 261 | logger.info('All done :)') 262 | -------------------------------------------------------------------------------- /poetry/finetune_llama.py: -------------------------------------------------------------------------------- 1 | """ 2 | Эксперимент с файнтюном модели LLaMa на стихах. 3 | """ 4 | 5 | import glob 6 | import logging 7 | import os 8 | import json 9 | import io 10 | import random 11 | import itertools 12 | import sys 13 | from typing import Any, Dict, List, Optional, Tuple, Union 14 | from dataclasses import dataclass, field 15 | from typing import Any, Dict, List, Optional, Tuple, Union 16 | import shutil 17 | from pathlib import Path 18 | 19 | import numpy as np 20 | import tqdm 21 | import sklearn.model_selection 22 | import torch 23 | import scipy 24 | import torch.nn as nn 25 | import torch.nn.functional as F 26 | import torch.optim as optim 27 | from torch.utils.tensorboard import SummaryWriter 28 | from torch.utils.data import Dataset, DataLoader 29 | from transformers import AutoModelForCausalLM 30 | import transformers 31 | from transformers import AutoTokenizer 32 | from transformers import TrainingArguments, Trainer, TrainerCallback 33 | from transformers import HfArgumentParser 34 | from pynvml import * 35 | import deepspeed 36 | 37 | proj_dir = os.path.expanduser('~/polygon/text_generator') 38 | 39 | 40 | def print_gpu_utilization(): 41 | nvmlInit() 42 | handle = nvmlDeviceGetHandleByIndex(0) 43 | info = nvmlDeviceGetMemoryInfo(handle) 44 | # logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") 45 | print(f"GPU memory occupied: {info.used // 1024 ** 2} MB.") 46 | 47 | 48 | def pad_sequence(sequence, pad_id, max_len): 49 | l = len(sequence) 50 | if l < max_len: 51 | return sequence + [pad_id] * (max_len - l) 52 | else: 53 | return sequence 54 | 55 | 56 | def load_samples(data_args, tokenizer): 57 | samples = [] 58 | with open(data_args.dataset_path, 'r') as f: 59 | for sample_str in f: 60 | sample = json.loads(sample_str) 61 | prompt = sample['prompt_text'] 62 | if prompt: 63 | if data_args.output_syllables: 64 | # Вариант с генерацией 
цепочки слогов 65 | lines = [] 66 | for line in sample['output'].split(''): 67 | line = line.strip() 68 | tokens = line.split(' ') 69 | tokens = tokens[::-1] 70 | line = ' '.join(tokens) 71 | line = line.replace(' | ', '|') 72 | line = line.replace(' ', '\u2010') 73 | line = line.replace('|', ' ') 74 | lines.append(line) 75 | output_text = '\n'.join(lines) 76 | else: 77 | output_text = sample['output_text'] 78 | 79 | # 29.04.2023 ограничим 2 первым катренами 80 | output_text = '\n\n'.join(output_text.split('\n\n')[:2]) 81 | 82 | input_tokens = tokenizer.encode(prompt, add_special_tokens=False) 83 | output_tokens = tokenizer.encode(output_text, add_special_tokens=False) 84 | samples.append((input_tokens, output_tokens, prompt, output_text)) 85 | 86 | if data_args.max_samples > 0 and len(samples) >= data_args.max_samples: 87 | break 88 | 89 | return samples 90 | 91 | 92 | class FinetuneDataset(Dataset): 93 | def __init__(self, samples, tokenizer): 94 | self.tokenizer = tokenizer 95 | self.max_len = 0 96 | self.samples = [] 97 | 98 | self.bos_token_id = tokenizer.bos_token_id 99 | self.eos_token_id = tokenizer.eos_token_id 100 | assert (len(tokenizer.encode('#', add_special_tokens=False)) == 1) 101 | self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0] 102 | self.pad_token_id = tokenizer.pad_token_id 103 | 104 | for src_ids, output_ids, src_text, output_text in samples: 105 | input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id] 106 | 107 | # Токены затравки дают label=-100 108 | labels = [-100] + [-100] * len(src_ids) + [-100] + output_ids + [self.eos_token_id] 109 | 110 | attention_map = [1] * len(labels) 111 | 112 | self.samples.append((input_ids, labels, attention_map)) 113 | self.max_len = max(self.max_len, len(input_ids)) 114 | 115 | def __len__(self): 116 | return len(self.samples) 117 | 118 | def __getitem__(self, index: int): 119 | input_ids, labels, attention_map = self.samples[index] 120 | npad = self.max_len - len(input_ids) 121 | input_ids = input_ids + npad * [self.pad_token_id] 122 | labels = labels + [-100] * npad 123 | attention_mask = attention_map + [0] * npad 124 | return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask} 125 | 126 | 127 | @dataclass 128 | class ModelArguments: 129 | """ 130 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 131 | """ 132 | model_name_or_path: Optional[str] = field( 133 | default='decapoda-research/llama-7b-hf', 134 | metadata={"help": "The model checkpoint for weights initialization."}, 135 | ) 136 | 137 | load_in_8bit: Optional[bool] = field( 138 | default=False, 139 | metadata={"help": "Load model in 8-bit"}, 140 | ) 141 | 142 | 143 | @dataclass 144 | class DataSetArguments: 145 | """ 146 | Arguments pertaining to what data we are going to input our model for training and eval. 
147 | """ 148 | dataset_path: Optional[str] = field( 149 | default=os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 'лирика.jsonl')), 150 | metadata={"help": "Путь к датасету со стихами"} 151 | ) 152 | 153 | output_syllables: Optional[bool] = field( 154 | default=False, 155 | metadata={"help": "Силлабо-тоническое представление выходного текста"} 156 | ) 157 | 158 | max_samples: Optional[int] = field( 159 | default=-1, 160 | metadata={"help": "Максимальное кол-во сэмплов, считываемых из датасета"} 161 | ) 162 | 163 | 164 | class MyPrinterCallback(TrainerCallback): 165 | def __init__(self, filepath): 166 | self.wrt = open(filepath, 'w') 167 | 168 | def on_log(self, args, state, control, logs=None, **kwargs): 169 | if state.is_local_process_zero: 170 | if 'epoch' in logs and 'loss' in logs: 171 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss'])) 172 | self.wrt.flush() 173 | 174 | 175 | if __name__ == '__main__': 176 | parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments)) 177 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 178 | 179 | # Setup logging 180 | logging.basicConfig( 181 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 182 | datefmt="%d.%m.%Y %H:%M:%S", 183 | handlers=[logging.StreamHandler(sys.stdout)], 184 | ) 185 | 186 | log_level = training_args.get_process_log_level() 187 | logger = logging.getLogger(__name__) 188 | logger.setLevel(log_level) 189 | # datasets.utils.logging.set_verbosity(log_level) 190 | transformers.utils.logging.set_verbosity(log_level) 191 | transformers.utils.logging.enable_default_handler() 192 | transformers.utils.logging.enable_explicit_format() 193 | 194 | logger.info( 195 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 196 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 197 | ) 198 | logger.info(f"Training/evaluation parameters {training_args}") 199 | 200 | # Удаляем старые логи tensorboard 201 | if training_args.local_rank in (-1, 0): 202 | for f in glob.glob(training_args.output_dir + '/*'): 203 | if os.path.isfile(f): 204 | os.remove(f) 205 | 206 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs') 207 | if os.path.exists(tensorboard_dir): 208 | logger.info('Removing "%s"', tensorboard_dir) 209 | shutil.rmtree(tensorboard_dir) 210 | 211 | logger.info('Loading pretrained model "%s"', model_args.model_name_or_path) 212 | model = transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, 213 | # load_in_8bit=model_args.load_in_8bit, 214 | # device_map="auto" 215 | ) 216 | #model.half() 217 | #model.to(device) 218 | 219 | if training_args.local_rank in (0, -1): 220 | print('=' * 80) 221 | print_gpu_utilization() 222 | 223 | if training_args.deepspeed: 224 | print('=' * 30 + 'ZeRo 2' + '=' * 30) 225 | deepspeed.runtime.zero.stage_1_and_2.estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=1, 226 | num_nodes=1, 227 | additional_buffer_factor=1.5) 228 | print('=' * 80) 229 | 230 | # ------------------------- ТОКЕНИЗАТОР ---------------------------------- 231 | logger.info('Loading tokenizer "%s"', model_args.model_name_or_path) 232 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_args.model_name_or_path) 233 | 234 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 235 | 236 | for t in ['#', '', '', '']: 237 | logger.debug('Tokenizer: token=%s 
==> %s', t, str(tokenizer.encode(t, add_special_tokens=False))) 238 | 239 | tokenizer.save_pretrained(training_args.output_dir) 240 | 241 | logger.info('Loading dataset "%s"', data_args.dataset_path) 242 | train_samples = load_samples(data_args, tokenizer) 243 | logger.info('Training set: %d samples', len(train_samples)) 244 | 245 | train_dataset = FinetuneDataset(train_samples, tokenizer) 246 | 247 | printer = MyPrinterCallback(os.path.join(proj_dir, 'tmp', 'finetune_llama.loss.log')) 248 | trainer = Trainer( 249 | model=model, 250 | args=training_args, 251 | train_dataset=train_dataset, 252 | tokenizer=tokenizer, 253 | data_collator=None, 254 | callbacks=[printer] 255 | ) 256 | 257 | logger.info('Start training...') 258 | train_result = trainer.train() 259 | 260 | #trainer.save_model(output_dir=training_args.output_dir) 261 | if training_args.local_rank in (0, -1): 262 | logger.info(f'Saving the model and tokenizer') 263 | model.save_pretrained(save_directory=training_args.output_dir) 264 | 265 | metrics = train_result.metrics 266 | trainer.log_metrics("train", metrics) 267 | trainer.save_metrics("train", metrics) 268 | 269 | logger.info('All done :)') 270 | -------------------------------------------------------------------------------- /poetry/finetune_rugpt_with_prompt_masking.py: -------------------------------------------------------------------------------- 1 | """ 2 | Тренировка модели генерации стихов поверх rugpt*** с исключением обратного распространения на токенах затравки. 3 | """ 4 | import glob 5 | import logging 6 | import os 7 | import json 8 | import io 9 | import random 10 | import itertools 11 | import sys 12 | from typing import Any, Dict, List, Optional, Tuple, Union 13 | from dataclasses import dataclass, field 14 | from typing import Any, Dict, List, Optional, Tuple, Union 15 | import shutil 16 | from pathlib import Path 17 | 18 | import numpy as np 19 | import tqdm 20 | import sklearn.model_selection 21 | import torch 22 | import scipy 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | import torch.optim as optim 26 | from torch.utils.tensorboard import SummaryWriter 27 | from torch.utils.data import Dataset, DataLoader 28 | from transformers import AutoModelForCausalLM 29 | import transformers 30 | from transformers import AutoTokenizer 31 | from transformers import TrainingArguments, Trainer, TrainerCallback 32 | from transformers import HfArgumentParser 33 | from pynvml import * 34 | 35 | 36 | proj_dir = os.path.expanduser('~/polygon/text_generator') 37 | 38 | 39 | def print_gpu_utilization(): 40 | nvmlInit() 41 | handle = nvmlDeviceGetHandleByIndex(0) 42 | info = nvmlDeviceGetMemoryInfo(handle) 43 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") 44 | 45 | 46 | def pad_sequence(sequence, pad_id, max_len): 47 | l = len(sequence) 48 | if l < max_len: 49 | return sequence + [pad_id] * (max_len - l) 50 | else: 51 | return sequence 52 | 53 | 54 | def load_samples(data_args, tokenizer, model_args): 55 | samples = [] 56 | with open(data_args.dataset_path, 'r') as f: 57 | for sample_str in f: 58 | sample = json.loads(sample_str) 59 | prompt = sample['prompt_text'] 60 | if prompt: 61 | if data_args.output_syllables: 62 | # Вариант с генерацией цепочки слогов 63 | lines = [] 64 | for line in sample['output'].split(''): 65 | line = line.strip() 66 | tokens = line.split(' ') 67 | tokens = tokens[::-1] 68 | line = ' '.join(tokens) 69 | line = line.replace(' | ', '|') 70 | line = line.replace(' ', '\u2010') 71 | line = line.replace('|', ' ') 
72 | lines.append(line) 73 | output_text = '\n'.join(lines) 74 | else: 75 | output_text = sample['output_text'] 76 | 77 | # 29.04.2023 ограничим 2 первым катренами 78 | output_text = '\n\n'.join(output_text.split('\n\n')[:2]) 79 | 80 | if 'xglm' in model_args.model_name_or_path.lower(): 81 | # 21.05.2023 почему-то токенизатор XGLM иногда теряет переводы строк. 82 | # Поэтому заменим на особое сочетание, которое при генерации будем заменять обратно на \n 83 | output_text = output_text.replace('\n', '\\n') 84 | 85 | input_tokens = tokenizer.encode(prompt, add_special_tokens=False) 86 | output_tokens = tokenizer.encode(output_text, add_special_tokens=False) 87 | samples.append((input_tokens, output_tokens, prompt, output_text)) 88 | 89 | if data_args.max_samples > 0 and len(samples) >= data_args.max_samples: 90 | break 91 | 92 | return samples 93 | 94 | 95 | class FinetuneDataset(Dataset): 96 | def __init__(self, samples, tokenizer): 97 | self.tokenizer = tokenizer 98 | self.max_len = 0 99 | self.samples = [] 100 | 101 | self.bos_token_id = tokenizer.bos_token_id 102 | self.eos_token_id = tokenizer.eos_token_id 103 | assert(len(tokenizer.encode('#', add_special_tokens=False)) == 1) 104 | self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0] 105 | self.pad_token_id = tokenizer.pad_token_id 106 | 107 | for src_ids, output_ids, src_text, output_text in samples: 108 | input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id] 109 | 110 | # Токены затравки дают label=-100 111 | labels = [-100] + [-100]*len(src_ids) + [-100] + output_ids + [self.eos_token_id] 112 | 113 | attention_map = [1] * len(labels) 114 | 115 | self.samples.append((input_ids, labels, attention_map)) 116 | self.max_len = max(self.max_len, len(input_ids)) 117 | 118 | def __len__(self): 119 | return len(self.samples) 120 | 121 | def __getitem__(self, index: int): 122 | input_ids, labels, attention_map = self.samples[index] 123 | npad = self.max_len - len(input_ids) 124 | input_ids = input_ids + npad*[self.pad_token_id] 125 | labels = labels + [-100] * npad 126 | attention_mask = attention_map + [0] * npad 127 | return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask} 128 | 129 | 130 | 131 | @dataclass 132 | class ModelArguments: 133 | """ 134 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 135 | """ 136 | model_name_or_path: Optional[str] = field( 137 | default='sberbank-ai/rugpt3large_based_on_gpt2', 138 | metadata={"help": "The model checkpoint for weights initialization."}, 139 | ) 140 | 141 | tokenizer_path: Optional[str] = field( 142 | default='sberbank-ai/rugpt3large_based_on_gpt2', 143 | metadata={"help": "Path to tokenizer."}, 144 | ) 145 | 146 | 147 | @dataclass 148 | class DataSetArguments: 149 | """ 150 | Arguments pertaining to what data we are going to input our model for training and eval. 
151 | """ 152 | dataset_path: Optional[str] = field( 153 | default=os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 'пирожки.jsonl')), 154 | metadata={"help": "Путь к датасету со стихами"} 155 | ) 156 | 157 | output_syllables: Optional[bool] = field( 158 | default=False, 159 | metadata={"help": "Силлабо-тоническое представление выходного текста"} 160 | ) 161 | 162 | max_samples: Optional[int] = field( 163 | default=-1, 164 | metadata={"help": "Максимальное кол-во сэмплов, считываемых из датасета"} 165 | ) 166 | 167 | 168 | class MyPrinterCallback(TrainerCallback): 169 | def __init__(self, filepath): 170 | self.wrt = open(filepath, 'w') 171 | 172 | def on_log(self, args, state, control, logs=None, **kwargs): 173 | if state.is_local_process_zero: 174 | if 'epoch' in logs and 'loss' in logs: 175 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss'])) 176 | self.wrt.flush() 177 | 178 | 179 | if __name__ == '__main__': 180 | parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments)) 181 | 182 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 183 | 184 | if not training_args.output_dir: 185 | training_args.output_dir = os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 'verses_pirozhki_rugpt')) 186 | 187 | verbose = training_args.local_rank in (-1, 0) 188 | 189 | # Setup logging 190 | logging.basicConfig( 191 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 192 | datefmt="%m/%d/%Y %H:%M:%S", 193 | handlers=[logging.StreamHandler(sys.stdout)], 194 | ) 195 | 196 | log_level = training_args.get_process_log_level() 197 | logger = logging.getLogger(__name__) 198 | logger.setLevel(log_level) 199 | #datasets.utils.logging.set_verbosity(log_level) 200 | transformers.utils.logging.set_verbosity(log_level) 201 | transformers.utils.logging.enable_default_handler() 202 | transformers.utils.logging.enable_explicit_format() 203 | 204 | logger.info( 205 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 206 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 207 | ) 208 | logger.info(f"Training/evaluation parameters {training_args}") 209 | 210 | # Удаляем старые логи tensorboard 211 | if training_args.local_rank in (-1, 0): 212 | for f in glob.glob(training_args.output_dir+'/*'): 213 | if os.path.isfile(f): 214 | os.remove(f) 215 | 216 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs') 217 | if os.path.exists(tensorboard_dir): 218 | logger.info('Removing "%s"', tensorboard_dir) 219 | shutil.rmtree(tensorboard_dir) 220 | 221 | device = training_args.device 222 | logging.info('device={}'.format(device)) 223 | 224 | if not model_args.tokenizer_path: 225 | model_args.tokenizer_path = model_args.model_name_or_path 226 | 227 | logger.info('Loading tokenizer "%s"', model_args.tokenizer_path) 228 | 229 | if 'llama' in model_args.tokenizer_path: 230 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_args.tokenizer_path) 231 | else: 232 | #tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_args.tokenizer_path) 233 | tokenizer = transformers.AutoTokenizer.from_pretrained(model_args.tokenizer_path) 234 | 235 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 236 | tokenizer.save_pretrained(training_args.output_dir) 237 | 238 | for t in ['#', '', '', '']: 239 | logger.debug('Tokenizer: token=%s ==> %s', t, str(tokenizer.encode(t, 
add_special_tokens=False))) 240 | 241 | logger.info('Loading pretrained model "%s"', model_args.model_name_or_path) 242 | model = transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path) 243 | model.to(device) 244 | 245 | logger.info('Loading dataset "%s"', data_args.dataset_path) 246 | train_samples = load_samples(data_args, tokenizer, model_args) 247 | logger.info('Training set: %d samples', len(train_samples)) 248 | 249 | train_dataset = FinetuneDataset(train_samples, tokenizer) 250 | 251 | printer = MyPrinterCallback(os.path.join(training_args.output_dir, 'finetune_rugpt_with_prompt_masking.loss.log')) 252 | trainer = Trainer( 253 | model=model, 254 | args=training_args, 255 | train_dataset=train_dataset, 256 | tokenizer=tokenizer, 257 | data_collator=None, 258 | callbacks=[printer] 259 | ) 260 | 261 | logger.info('Start training...') 262 | train_result = trainer.train() 263 | 264 | logger.info(f'Saving the model and tokenizer') 265 | trainer.save_model(output_dir=training_args.output_dir) 266 | 267 | metrics = train_result.metrics 268 | trainer.log_metrics("train", metrics) 269 | trainer.save_metrics("train", metrics) 270 | 271 | logger.info('All done :)') 272 | -------------------------------------------------------------------------------- /poetry/finetune_llama_lora.py: -------------------------------------------------------------------------------- 1 | """ 2 | Эксперимент с файнтюном на стихах модели LLaMa + LoRa. 3 | """ 4 | import glob 5 | import logging 6 | import os 7 | import json 8 | import io 9 | import random 10 | import itertools 11 | import sys 12 | from typing import Any, Dict, List, Optional, Tuple, Union 13 | from dataclasses import dataclass, field 14 | from typing import Any, Dict, List, Optional, Tuple, Union 15 | import shutil 16 | from pathlib import Path 17 | 18 | import numpy as np 19 | import tqdm 20 | import sklearn.model_selection 21 | import torch 22 | import scipy 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | import torch.optim as optim 26 | from torch.utils.tensorboard import SummaryWriter 27 | from torch.utils.data import Dataset, DataLoader 28 | from transformers import AutoModelForCausalLM 29 | import transformers 30 | from transformers import AutoTokenizer 31 | from transformers import TrainingArguments, Trainer, TrainerCallback 32 | from transformers import HfArgumentParser 33 | from peft import get_peft_model, LoraConfig, prepare_model_for_int8_training 34 | from pynvml import * 35 | 36 | 37 | proj_dir = os.path.expanduser('~/polygon/text_generator') 38 | 39 | 40 | def print_gpu_utilization(): 41 | nvmlInit() 42 | handle = nvmlDeviceGetHandleByIndex(0) 43 | info = nvmlDeviceGetMemoryInfo(handle) 44 | logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") 45 | 46 | 47 | def pad_sequence(sequence, pad_id, max_len): 48 | l = len(sequence) 49 | if l < max_len: 50 | return sequence + [pad_id] * (max_len - l) 51 | else: 52 | return sequence 53 | 54 | 55 | def load_samples(data_args, tokenizer): 56 | samples = [] 57 | with open(data_args.dataset_path, 'r') as f: 58 | for sample_str in f: 59 | sample = json.loads(sample_str) 60 | prompt = sample['prompt_text'] 61 | if prompt: 62 | if data_args.output_syllables: 63 | # Вариант с генерацией цепочки слогов 64 | lines = [] 65 | for line in sample['output'].split(''): 66 | line = line.strip() 67 | tokens = line.split(' ') 68 | tokens = tokens[::-1] 69 | line = ' '.join(tokens) 70 | line = line.replace(' | ', '|') 71 | line = line.replace(' ', '\u2010') 72 | 
line = line.replace('|', ' ') 73 | lines.append(line) 74 | output_text = '\n'.join(lines) 75 | else: 76 | output_text = sample['output_text'] 77 | 78 | # 29.04.2023 ограничим 2 первым катренами 79 | output_text = '\n\n'.join(output_text.split('\n\n')[:2]) 80 | 81 | input_tokens = tokenizer.encode(prompt, add_special_tokens=False) 82 | output_tokens = tokenizer.encode(output_text, add_special_tokens=False) 83 | samples.append((input_tokens, output_tokens, prompt, output_text)) 84 | 85 | if data_args.max_samples > 0 and len(samples) >= data_args.max_samples: 86 | break 87 | 88 | return samples 89 | 90 | 91 | class FinetuneDataset(Dataset): 92 | def __init__(self, samples, tokenizer): 93 | self.tokenizer = tokenizer 94 | self.max_len = 0 95 | self.samples = [] 96 | 97 | self.bos_token_id = tokenizer.bos_token_id 98 | self.eos_token_id = tokenizer.eos_token_id 99 | assert(len(tokenizer.encode('#', add_special_tokens=False)) == 1) 100 | self.sep_token_id = tokenizer.encode('#', add_special_tokens=False)[0] 101 | self.pad_token_id = tokenizer.pad_token_id 102 | 103 | for src_ids, output_ids, src_text, output_text in samples: 104 | input_ids = [self.bos_token_id] + src_ids + [self.sep_token_id] + output_ids + [self.eos_token_id] 105 | 106 | # Токены затравки дают label=-100 107 | labels = [-100] + [-100]*len(src_ids) + [-100] + output_ids + [self.eos_token_id] 108 | 109 | attention_map = [1] * len(labels) 110 | 111 | self.samples.append((input_ids, labels, attention_map)) 112 | self.max_len = max(self.max_len, len(input_ids)) 113 | 114 | def __len__(self): 115 | return len(self.samples) 116 | 117 | def __getitem__(self, index: int): 118 | input_ids, labels, attention_map = self.samples[index] 119 | npad = self.max_len - len(input_ids) 120 | input_ids = input_ids + npad*[self.pad_token_id] 121 | labels = labels + [-100] * npad 122 | attention_mask = attention_map + [0] * npad 123 | return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask} 124 | 125 | 126 | 127 | @dataclass 128 | class ModelArguments: 129 | """ 130 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 131 | """ 132 | model_name_or_path: Optional[str] = field( 133 | default='decapoda-research/llama-7b-hf', 134 | metadata={"help": "The model checkpoint for weights initialization."}, 135 | ) 136 | 137 | load_in_8bit: Optional[bool] = field( 138 | default=False, 139 | metadata={"help": "Load model in 8-bit"}, 140 | ) 141 | 142 | 143 | @dataclass 144 | class DataSetArguments: 145 | """ 146 | Arguments pertaining to what data we are going to input our model for training and eval. 
147 | """ 148 | dataset_path: Optional[str] = field( 149 | default=os.path.join(proj_dir, 'tmp', os.path.join(proj_dir, 'tmp', 'лирика.jsonl')), 150 | metadata={"help": "Путь к датасету со стихами"} 151 | ) 152 | 153 | output_syllables: Optional[bool] = field( 154 | default=False, 155 | metadata={"help": "Силлабо-тоническое представление выходного текста"} 156 | ) 157 | 158 | max_samples: Optional[int] = field( 159 | default=-1, 160 | metadata={"help": "Максимальное кол-во сэмплов, считываемых из датасета"} 161 | ) 162 | 163 | 164 | @dataclass 165 | class LoRaArguments: 166 | # "r": 8, 167 | # "lora_alpha": 16, 168 | # "lora_dropout": 0.10, 169 | lora_r: Optional[int] = field( 170 | default=8 171 | ) 172 | 173 | lora_alpha: Optional[int] = field( 174 | default=16 175 | ) 176 | 177 | lora_dropout: Optional[float] = field( 178 | default=0.10 179 | ) 180 | 181 | 182 | class MyPrinterCallback(TrainerCallback): 183 | def __init__(self, filepath): 184 | self.wrt = open(filepath, 'w') 185 | 186 | def on_log(self, args, state, control, logs=None, **kwargs): 187 | if state.is_local_process_zero: 188 | if 'epoch' in logs and 'loss' in logs: 189 | self.wrt.write('{}\t{}\n'.format(logs['epoch'], logs['loss'])) 190 | self.wrt.flush() 191 | 192 | 193 | def fix_model(model, tokenizer, use_resize=True): 194 | model.config.pad_token_id = tokenizer.pad_token_id 195 | assert model.config.pad_token_id is not None 196 | 197 | bos_candidates = ( 198 | tokenizer.bos_token_id, 199 | tokenizer.cls_token_id, 200 | tokenizer.sep_token_id, 201 | tokenizer.unk_token_id 202 | ) 203 | for bos_candidate in bos_candidates: 204 | model.config.bos_token_id = bos_candidate 205 | if bos_candidate is not None: 206 | break 207 | assert model.config.bos_token_id is not None 208 | model.config.decoder_start_token_id = model.config.bos_token_id 209 | 210 | eos_candidates = (tokenizer.eos_token_id, tokenizer.sep_token_id) 211 | for eos_candidate in eos_candidates: 212 | model.config.eos_token_id = eos_candidate 213 | if eos_candidate is not None: 214 | break 215 | assert model.config.eos_token_id is not None 216 | if use_resize: 217 | model.resize_token_embeddings(len(tokenizer)) 218 | 219 | return model 220 | 221 | 222 | if __name__ == '__main__': 223 | parser = HfArgumentParser((ModelArguments, DataSetArguments, TrainingArguments, LoRaArguments)) 224 | model_args, data_args, training_args, lora_args = parser.parse_args_into_dataclasses() 225 | 226 | lora_config = LoraConfig(**{ 227 | "r": lora_args.lora_r, 228 | "lora_alpha": lora_args.lora_alpha, 229 | "lora_dropout": lora_args.lora_dropout, 230 | "bias": "none", 231 | "target_modules": ["q_proj", "v_proj"], 232 | "task_type": "CAUSAL_LM" 233 | }) 234 | 235 | # Setup logging 236 | logging.basicConfig( 237 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 238 | datefmt="%d.%m.%Y %H:%M:%S", 239 | handlers=[logging.StreamHandler(sys.stdout)], 240 | ) 241 | 242 | log_level = training_args.get_process_log_level() 243 | logger = logging.getLogger(__name__) 244 | logger.setLevel(log_level) 245 | #datasets.utils.logging.set_verbosity(log_level) 246 | transformers.utils.logging.set_verbosity(log_level) 247 | transformers.utils.logging.enable_default_handler() 248 | transformers.utils.logging.enable_explicit_format() 249 | 250 | logger.info( 251 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 252 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 253 | ) 254 
| logger.info(f"Training/evaluation parameters {training_args}") 255 | 256 | # Удаляем старые логи tensorboard 257 | if training_args.local_rank in (-1, 0): 258 | for f in glob.glob(training_args.output_dir+'/*'): 259 | if os.path.isfile(f): 260 | os.remove(f) 261 | 262 | tensorboard_dir = os.path.join(training_args.output_dir, 'runs') 263 | if os.path.exists(tensorboard_dir): 264 | logger.info('Removing "%s"', tensorboard_dir) 265 | shutil.rmtree(tensorboard_dir) 266 | 267 | #device = training_args.device 268 | #logging.info('device={}'.format(device)) 269 | 270 | logger.info('Loading tokenizer "%s"', model_args.model_name_or_path) 271 | tokenizer = transformers.LlamaTokenizer.from_pretrained(model_args.model_name_or_path) 272 | 273 | tokenizer.add_special_tokens({'bos_token': '', 'eos_token': '', 'pad_token': ''}) 274 | 275 | for t in ['#', '', '', '']: 276 | logger.debug('Tokenizer: token=%s ==> %s', t, str(tokenizer.encode(t, add_special_tokens=False))) 277 | 278 | logger.info('Loading pretrained model "%s"', model_args.model_name_or_path) 279 | model = transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, 280 | load_in_8bit=model_args.load_in_8bit, 281 | #device_map="auto" 282 | ) 283 | 284 | #model.config.save_pretrained(training_args.output_dir) 285 | 286 | if model_args.load_in_8bit: 287 | model = fix_model(model, tokenizer, use_resize=False) 288 | model = prepare_model_for_int8_training(model) 289 | 290 | #model.to(device) 291 | 292 | tokenizer.save_pretrained(training_args.output_dir) 293 | 294 | logger.info('Wrapping LLaMa to peft...') 295 | model = get_peft_model(model, lora_config) 296 | 297 | #model.save_pretrained(training_args.output_dir) 298 | 299 | logger.info('Loading dataset "%s"', data_args.dataset_path) 300 | train_samples = load_samples(data_args, tokenizer) 301 | logger.info('Training set: %d samples', len(train_samples)) 302 | 303 | train_dataset = FinetuneDataset(train_samples, tokenizer) 304 | 305 | printer = MyPrinterCallback(os.path.join(proj_dir, 'tmp', 'finetune_llama.loss.log')) 306 | trainer = Trainer( 307 | model=model, 308 | args=training_args, 309 | train_dataset=train_dataset, 310 | tokenizer=tokenizer, 311 | data_collator=None, 312 | callbacks=[printer] 313 | ) 314 | 315 | logger.info('Start training...') 316 | train_result = trainer.train() 317 | 318 | # trainer.save_model(output_dir=training_args.output_dir) 319 | if training_args.local_rank in (0, -1): 320 | logger.info(f'Saving the model and tokenizer') 321 | model.save_pretrained(training_args.output_dir) 322 | 323 | metrics = train_result.metrics 324 | trainer.log_metrics("train", metrics) 325 | trainer.save_metrics("train", metrics) 326 | 327 | logger.info('All done :)') 328 | --------------------------------------------------------------------------------
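Usage note (not one of the repository files): the LoRA scripts above save only the adapter weights and the tokenizer via save_pretrained, so for generation the adapter has to be attached back onto the base checkpoint. Below is a minimal sketch of that step; the base model name and the adapter directory are assumptions taken from the defaults of finetune_rugpt13b_lora.py, and the seed phrase and sampling parameters are arbitrary.

import os
import torch
import transformers
from peft import PeftModel

base_name = 'ai-forever/rugpt13b'  # assumption: the training default
adapter_dir = os.path.expanduser('~/polygon/text_generator/tmp/verses_rugpt13b_lora')  # assumption

# The tokenizer (with the added special tokens) was saved next to the adapter.
tokenizer = transformers.AutoTokenizer.from_pretrained(adapter_dir)
base_model = transformers.AutoModelForCausalLM.from_pretrained(base_name, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base_model, adapter_dir)
model.eval()
model.to('cuda')

# Training samples were built as <bos> + prompt + '#' + output + <eos>,
# so generation is seeded with the same prompt + '#' prefix.
prompt = (tokenizer.bos_token or '') + 'первый снег' + '#'
input_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt').to('cuda')
with torch.no_grad():
    output_ids = model.generate(input_ids,
                                do_sample=True,
                                temperature=1.0,
                                top_p=0.80,
                                max_length=300,
                                pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))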