├── LICENSE
├── matplotgraph.py
├── train_utils.py
├── custom_scheduler.py
├── README.md
└── script.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 FartyPants
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/matplotgraph.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 |
4 | def create_graph(lora_path, lora_name):
5 | try:
6 | import matplotlib.pyplot as plt
7 | from matplotlib.ticker import ScalarFormatter
8 |
9 | peft_model_path = f'{lora_path}/training_graph.json'
10 | image_model_path = f'{lora_path}/training_graph.png'
11 | # Check if the JSON file exists
12 | if os.path.exists(peft_model_path):
13 | # Load data from JSON file
14 | with open(peft_model_path, 'r') as file:
15 | data = json.load(file)
16 | # Extract x, y1, and y2 values
17 | x = [item['epoch'] for item in data]
18 | y1 = [item['learning_rate'] for item in data]
19 | y2 = [item['loss'] for item in data]
20 |
21 | # Create the line chart
22 | fig, ax1 = plt.subplots(figsize=(10, 6))
23 |
24 |
25 | # Plot y1 (learning rate) on the first y-axis
26 | ax1.plot(x, y1, 'b-', label='Learning Rate')
27 | ax1.set_xlabel('Epoch')
28 | ax1.set_ylabel('Learning Rate', color='b')
29 | ax1.tick_params('y', colors='b')
30 |
31 | # Create a second y-axis
32 | ax2 = ax1.twinx()
33 |
34 | # Plot y2 (loss) on the second y-axis
35 | ax2.plot(x, y2, 'r-', label='Loss')
36 | ax2.set_ylabel('Loss', color='r')
37 | ax2.tick_params('y', colors='r')
38 |
39 | # Set the y-axis formatter to display numbers in scientific notation
40 | ax1.yaxis.set_major_formatter(ScalarFormatter(useMathText=True))
41 | ax1.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
42 |
43 | # Add grid
44 | ax1.grid(True)
45 |
46 | # Combine the legends for both plots
47 | lines, labels = ax1.get_legend_handles_labels()
48 | lines2, labels2 = ax2.get_legend_handles_labels()
49 | ax2.legend(lines + lines2, labels + labels2, loc='best')
50 |
51 | # Set the title
52 | plt.title(f'{lora_name} LR and Loss vs Epoch')
53 |
54 | # Save the chart as an image
55 |             plt.savefig(image_model_path)
56 |             plt.close(fig)  # release the figure so repeated calls don't accumulate open figures
57 |             print(f"Graph saved in {image_model_path}")
58 | else:
59 | print(f"File 'training_graph.json' does not exist in the {lora_path}")
60 |
61 | except ImportError:
62 | print("matplotlib is not installed. Please install matplotlib to create PNG graphs")
--------------------------------------------------------------------------------
/train_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from modules import shared, utils
3 | from pathlib import Path
4 | import requests
5 | import tqdm
6 | import json
7 |
8 | '''
9 | def get_gpu_memory_usage(rank):
10 | return {
11 | 'total': round(torch.cuda.get_device_properties(rank).total_memory / (1024**3), 2),
12 | 'max': round(torch.cuda.max_memory_allocated(rank) / (1024**3), 2),
13 | 'reserved': round(torch.cuda.memory_reserved(rank) / (1024**3), 2),
14 | 'allocated': round(torch.cuda.memory_allocated(rank) / (1024**3), 2)
15 | }
16 | '''
17 |
18 | def list_subfoldersByTime(directory):
19 |
20 | if not directory.endswith('/'):
21 | directory += '/'
22 | subfolders = []
23 | subfolders.append('None')
24 | path = directory
25 | name_list = os.listdir(path)
26 | full_list = [os.path.join(path,i) for i in name_list]
27 | time_sorted_list = sorted(full_list, key=os.path.getmtime,reverse=True)
28 |
29 | for entry in time_sorted_list:
30 | if os.path.isdir(entry):
31 | entry_str = f"{entry}" # Convert entry to a string
32 | full_path = entry_str
33 | entry_str = entry_str.replace('\\','/')
34 | entry_str = entry_str.replace(f"{directory}", "") # Remove directory part
35 | subfolders.append(entry_str)
36 |
37 | return subfolders
38 |
39 | def get_available_loras_local(_sortedByTime):
40 |
41 | model_dir = shared.args.lora_dir # Update with the appropriate directory path
42 | subfolders = []
43 | if _sortedByTime:
44 | subfolders = list_subfoldersByTime(model_dir)
45 | else:
46 | subfolders = utils.get_available_loras()
47 |
48 | return subfolders
49 |
50 |
51 | # FPHAM SPLIT BY SENTENCE BLOCK ===============
52 |
53 | def split_sentences(text: str, cutoff_len: int):
54 | sentences = []
55 | sentence = ''
56 |     delimiters = ['. ', '? ', '! ', '... ', '.\n', '?\n', '!\n', '...\n', '</s>', '<//>']
57 | abbreviations = ['Mr. ', 'Mrs. ', 'Dr. ', 'Ms. ', 'St. ', 'Prof. ', 'Jr. ', 'Ltd. ', 'Capt. ', 'Col. ', 'Gen. ', 'Ave. ', 'Blvd. ', 'Co. ', 'Corp. ', 'Dept. ', 'Est. ', 'Gov. ', 'Inc. ', 'Ph.D. ', 'Univ. ']
58 | errors = 0
59 | max_cut = cutoff_len-1
60 | prev_char = ''
61 |
62 | for char in text:
63 | sentence += char
64 |
65 |
66 | if (any(sentence.endswith(delimiter) for delimiter in delimiters) and
67 | not (prev_char.isupper() and len(sentence) >= 3 and sentence[-3] != ' ') and
68 | not any(sentence.endswith(abbreviation) for abbreviation in abbreviations)):
69 | tokens = shared.tokenizer.encode(sentence)
70 |
71 | if len(tokens) > max_cut:
72 | tokens = tokens[:max_cut]
73 | sentence = shared.tokenizer.decode(tokens, skip_special_tokens=True)
74 | errors = errors + 1
75 |
76 | sentences.append({'text': sentence, 'size': len(tokens)})
77 |
78 | sentence = ''
79 |
80 | prev_char = char
81 |
82 | if sentence:
83 | tokens = shared.tokenizer.encode(sentence)
84 | if len(tokens) > max_cut:
85 | tokens = tokens[:max_cut]
86 | sentence = shared.tokenizer.decode(tokens, skip_special_tokens=True)
87 | errors = errors + 1
88 |
89 | sentences.append({'text': sentence, 'size': len(tokens)})
90 |
91 | if errors > 0:
92 | print(f"Trimmed sentences beyond Cutoff Length: {errors}")
93 |
94 | return sentences
95 |
96 | # The goal of the following code is to create blocks of text plus overlapping blocks while it:
97 | #  respects sentence boundaries
98 | #  always uses all of the text
99 | #  ends blocks at the hard cut defined by hard_cut_string, or at the end of the data
100 | #  never creates overlapping blocks across a hard cut or across the EOS token
101 |
102 | def precise_cut(text: str, overlap: bool, min_chars_cut: int, eos_to_hc: bool, cutoff_len: int, hard_cut_string: str, debug_slicer:bool,EOS_token_str:str,BOS_token_str:str):
103 |
104 |     EOSX_str = '<//>'  # hardcut placeholder (must also appear in the split_sentences delimiters)
105 | EOS_str = EOS_token_str
106 | print("Precise raw text slicer: ON")
107 |
108 | cut_string = hard_cut_string.replace('\\n', '\n')
109 | text = text.replace(cut_string, EOSX_str)
110 | sentences = split_sentences(text, cutoff_len)
111 |
112 | print(f"Sentences: {len(sentences)}")
113 | sentencelist = []
114 | currentSentence = ''
115 | totalLength = 0
116 | max_cut = cutoff_len-1
117 | half_cut = cutoff_len//2
118 | halfcut_length = 0
119 |
120 | edgeindex = []
121 | half_index = 0
122 |
123 | for index, item in enumerate(sentences):
124 |
125 | if halfcut_length+ item['size'] < half_cut:
126 | halfcut_length += item['size']
127 | half_index = index
128 | else:
129 | edgeindex.append(half_index)
130 | halfcut_length = -2 * max_cut
131 |
132 |
133 | if totalLength + item['size'] < max_cut and not currentSentence.endswith(EOSX_str):
134 | currentSentence += item['text']
135 | totalLength += item['size']
136 | else:
137 |
138 | if len(currentSentence.strip()) > min_chars_cut:
139 | sentencelist.append(currentSentence.strip())
140 |
141 | currentSentence = item['text']
142 | totalLength = item['size']
143 | halfcut_length = item['size']
144 |
145 | if len(currentSentence.strip()) > min_chars_cut:
146 | sentencelist.append(currentSentence.strip())
147 |
148 | unique_blocks = len(sentencelist)
149 | print(f"Text Blocks: {unique_blocks}")
150 |
151 | #overlap strategies:
152 | # don't overlap across HARD CUT (EOSX)
153 | if overlap:
154 | for edge_idx in edgeindex:
155 | currentSentence = ''
156 | totalLength = 0
157 |
158 | for item in sentences[edge_idx:]:
159 | if totalLength + item['size'] < max_cut:
160 | currentSentence += item['text']
161 | totalLength += item['size']
162 | else:
163 | #if by chance EOSX is at the end then it's acceptable
164 | if currentSentence.endswith(EOSX_str) and len(currentSentence.strip()) > min_chars_cut:
165 | sentencelist.append(currentSentence.strip())
166 | # otherwise don't cross hard cut
167 | elif EOSX_str not in currentSentence and len(currentSentence.strip()) > min_chars_cut:
168 | sentencelist.append(currentSentence.strip())
169 |
170 | currentSentence = ''
171 | totalLength = 0
172 | break
173 |
174 | print(f"+ Overlapping blocks: {len(sentencelist)-unique_blocks}")
175 |
176 | num_EOS = 0
177 | for i in range(len(sentencelist)):
178 | if eos_to_hc:
179 | sentencelist[i] = sentencelist[i].replace(EOSX_str, EOS_str)
180 | else:
181 | sentencelist[i] = sentencelist[i].replace(EOSX_str, '')
182 |
183 | #someone may have had stop strings in the raw text...
184 | replace_str = f"{EOS_str}{EOS_str}"
185 | sentencelist[i] = sentencelist[i].replace(replace_str, EOS_str)
186 | num_EOS += sentencelist[i].count(EOS_str)
187 |
188 | if num_EOS > 0:
189 | print(f"+ EOS count: {num_EOS}")
190 |
191 | #final check for useless lines
192 | sentencelist = [item for item in sentencelist if item.strip() != EOS_str]
193 | sentencelist = [item for item in sentencelist if item.strip() != ""]
194 |
195 |
196 | if debug_slicer:
197 | # Write the log file
198 | Path('logs').mkdir(exist_ok=True)
199 | sentencelist_dict = {index: sentence for index, sentence in enumerate(sentencelist)}
200 | output_file = "logs/sentencelist.json"
201 | with open(output_file, 'w') as f:
202 | json.dump(sentencelist_dict, f,indent=2)
203 |
204 | print("Saved sentencelist.json in logs folder")
205 |
206 | return sentencelist
207 |
208 |
209 | def sliding_block_cut(text: str, min_chars_cut: int, eos_to_hc: bool, cutoff_len: int, hard_cut_string: str, debug_slicer:bool,EOS_token_str:str,BOS_token_str:str):
210 |
211 |     EOSX_str = '<//>'  # hardcut placeholder (must also appear in the split_sentences delimiters)
212 | EOS_str = EOS_token_str
213 | print("Mega Block Overlap: ON")
214 |
215 | cut_string = hard_cut_string.replace('\\n', '\n')
216 | text = text.replace(cut_string, EOSX_str)
217 | sentences = split_sentences(text, cutoff_len)
218 |
219 | print(f"Sentences: {len(sentences)}")
220 | sentencelist = []
221 |
222 | max_cut = cutoff_len-1
223 |
224 | #print(f"max_cut: {max_cut}")
225 | advancing_to = 0
226 |
227 | prev_block_lastsentence = ""
228 |
229 |
230 | for i in range(len(sentences)):
231 | totalLength = 0
232 | currentSentence = ''
233 | lastsentence = ""
234 |
235 | if i >= advancing_to:
236 | for k in range(i, len(sentences)):
237 |
238 | current_length = sentences[k]['size']
239 |
240 | if totalLength + current_length <= max_cut and not currentSentence.endswith(EOSX_str):
241 | currentSentence += sentences[k]['text']
242 | totalLength += current_length
243 | lastsentence = sentences[k]['text']
244 | else:
245 | if len(currentSentence.strip()) > min_chars_cut:
246 | if prev_block_lastsentence!=lastsentence:
247 | sentencelist.append(currentSentence.strip())
248 | prev_block_lastsentence = lastsentence
249 |
250 | advancing_to = 0
251 | if currentSentence.endswith(EOSX_str):
252 | advancing_to = k
253 |
254 | currentSentence = ""
255 | totalLength = 0
256 | break
257 |
258 | if currentSentence != "":
259 | if len(currentSentence.strip()) > min_chars_cut:
260 | sentencelist.append(currentSentence.strip())
261 |
262 | unique_blocks = len(sentencelist)
263 | print(f"Text Blocks: {unique_blocks}")
264 | num_EOS = 0
265 | for i in range(len(sentencelist)):
266 | if eos_to_hc:
267 | sentencelist[i] = sentencelist[i].replace(EOSX_str, EOS_str)
268 | else:
269 | sentencelist[i] = sentencelist[i].replace(EOSX_str, '')
270 |
271 | #someone may have had stop strings in the raw text...
272 | replace_str = f"{EOS_str}{EOS_str}"
273 | sentencelist[i] = sentencelist[i].replace(replace_str, EOS_str)
274 | num_EOS += sentencelist[i].count(EOS_str)
275 |
276 | if num_EOS > 0:
277 | print(f"+ EOS count: {num_EOS}")
278 |
279 | #final check for useless lines
280 | sentencelist = [item for item in sentencelist if item.strip() != EOS_str]
281 | sentencelist = [item for item in sentencelist if item.strip() != ""]
282 |
283 |
284 | if debug_slicer:
285 | # Write the log file
286 | Path('logs').mkdir(exist_ok=True)
287 | sentencelist_dict = {index: sentence for index, sentence in enumerate(sentencelist)}
288 | output_file = "logs/sentencelist.json"
289 | with open(output_file, 'w') as f:
290 | json.dump(sentencelist_dict, f,indent=2)
291 |
292 | print("Saved sentencelist.json in logs folder")
293 |
294 | return sentencelist
295 |
296 | # Example usage:
297 | # for status in download_file_from_url('https://example.com/path/to/your/file.ext', False, '/output/directory'): print(status)
298 |
299 | def download_file_from_url(url, overwrite, output_dir_in, valid_extensions = {'.txt', '.json'}):
300 | try:
301 | # Validate and sanitize the URL
302 | #parsed_url = urllib.parse.urlparse(url)
303 | #if not parsed_url.netloc:
304 | # raise ValueError("Invalid URL")
305 | #filename = os.path.basename(parsed_url.path)
306 |
307 | # Get the filename from the URL
308 |
309 | session = requests.Session()
310 | headers = {}
311 | mode = 'wb'
312 | filename = url.split('/')[-1]
313 |
314 | output_dir = str(output_dir_in)
315 | # Construct the full path to the output file
316 | local_filename = os.path.join(output_dir, filename)
317 |
318 | # Check if the local file already exists
319 | overw = ''
320 | if os.path.exists(local_filename):
321 | if not overwrite:
322 | yield f"File '{local_filename}' already exists. Aborting."
323 | return
324 | else:
325 | overw = ' [Overwrite existing]'
326 |
327 | filename_lower = filename.lower()
328 |
329 | # Send an HTTP GET request to the URL with a timeout
330 | file_extension = os.path.splitext(filename_lower)[-1]
331 |
332 | if file_extension not in valid_extensions:
333 | yield f"Invalid file extension: {file_extension}. Only {valid_extensions} files are supported."
334 | return
335 |
336 | with session.get(url, stream=True, headers=headers, timeout=10) as r:
337 | r.raise_for_status()
338 | # total size can be wildly inaccurate
339 | #total_size = int(r.headers.get('content-length', 0))
340 |
341 | block_size = 1024 * 4
342 | with open(local_filename, mode) as f:
343 | count = 0
344 | for data in r.iter_content(block_size):
345 | f.write(data)
346 | count += len(data)
347 |
348 | yield f"Downloaded: {count} " + overw
349 |
350 | # Verify file size if possible
351 | if os.path.exists(local_filename):
352 | downloaded_size = os.path.getsize(local_filename)
353 | if downloaded_size > 0:
354 | yield f"File '{filename}' downloaded to '{output_dir}' ({downloaded_size} bytes)."
355 | print("File Downloaded")
356 |                 else:
357 |                     print("Downloaded file size is zero")
358 |                     yield "Failed. Downloaded file size is zero."
359 | else:
360 | print(f"Error: {local_filename} failed to download.")
361 | yield f"Error: {local_filename} failed to download"
362 |
363 | except Exception as e:
364 | print(f"An error occurred: {e}")
365 | yield f"An error occurred: {e}"
366 |
367 | finally:
368 | # Close the session to release resources
369 | session.close()
370 |
371 | def string_to_dict(input_string: str):
372 | # Your input string
373 |     #input_string = 'bos_token: "<s>", eos_token: "</s>", unk_token: "<unk>"'
374 |
375 | # Split the string into key-value pairs
376 | key_value_pairs = input_string.split(',')
377 |
378 | # Initialize an empty dictionary
379 | config_dict = {}
380 |
381 | # Iterate through the key-value pairs and populate the dictionary
382 | for pair in key_value_pairs:
383 | key, value = pair.split(':')
384 | key = key.strip()
385 | value = value.strip()
386 | value = value.strip('"')
387 | config_dict[key] = value
388 |
389 | return config_dict
390 |
391 | # from transformers import AddedToken
392 | # for k, val in special_tokens.items():
393 | # tokenizer.add_special_tokens({k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)})
394 |
--------------------------------------------------------------------------------
/custom_scheduler.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | import torch
3 | import transformers
4 | import math
5 | from torch.optim.lr_scheduler import LambdaLR
6 |
7 | from peft import (
8 | PeftModel,
9 | )
10 |
11 | RED = "\033[91m"
12 | YELLOW = "\033[93m"
13 | GREEN = "\033[92m"
14 | RESET = "\033[0m"
15 |
16 | last_print_label = ''
17 |
18 | custom_scheduler_params = {'trigger_loss': 0.0, 'ramp_down_ratio':1.0, 'current_loss': 0.0,'dynamic_scheduler_stop': False, 'calc_ramp_down_at_step': 0, 'calc_num_training_steps': 0}
19 |
20 |
21 | def custom_scheduler_global_update(current_loss: float):
22 | custom_scheduler_params.update({'current_loss': current_loss})
23 |
24 | def custom_scheduler_global_setup(trigger_loss: float, ramp_down_ratio: float):
25 | custom_scheduler_params.update({'trigger_loss': trigger_loss})
26 | custom_scheduler_params.update({'ramp_down_ratio': ramp_down_ratio})
27 |
28 | # calculates the total num steps after trigger
29 | custom_scheduler_params.update({'calc_num_training_steps': 0})
30 | #calculates steps when the ramp_down trigger occured
31 | custom_scheduler_params.update({'calc_ramp_down_at_step': 0})
32 | # triggers scheduler stopping after it reached calc_num_training_steps
33 | custom_scheduler_params.update({'dynamic_scheduler_stop': False})
34 |
35 |
36 | # hold constant to the half of epochs then cosine down to 0
37 | def _get_fp_half_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_firstepoch_steps: int):
38 |
39 | global last_print_label
40 | print_label = ''
41 |
42 | half_steps = num_training_steps//2
43 |
44 | num_warmup_steps = min(num_warmup_steps,half_steps)
45 |
46 | if current_step < num_warmup_steps:
47 | print_label = 'Scheduler: Warmup'
48 | elif current_step < half_steps:
49 | print_label = 'Scheduler: Hold'
50 | else:
51 | print_label = 'Scheduler: Annealing'
52 |
53 | if print_label != last_print_label:
54 | print(print_label)
55 |
56 | last_print_label = print_label
57 |
58 | if current_step < num_warmup_steps:
59 | return float(current_step) / float(max(1, num_warmup_steps))
60 |
61 | if current_step < half_steps:
62 | return 1.0
63 |
64 | progress = float(current_step - half_steps) / float(max(1, num_training_steps - half_steps))
65 | num_cycles = 0.5
66 | return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
67 |
68 |
69 | # raise up in cosine, then fall back in cosine
70 | def _get_fp_cosine_raise_and_fall_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_firstepoch_steps: int):
71 |
72 | global last_print_label
73 | print_label = ''
74 |
75 | half_steps = num_training_steps//2
76 |
77 | #num_warmup_steps = min(num_warmup_steps,half_steps)
78 |
79 | if current_step < half_steps:
80 | print_label = 'Scheduler: Raise'
81 | else:
82 | print_label = 'Scheduler: Fall'
83 |
84 | if print_label != last_print_label:
85 | print(print_label)
86 |
87 | last_print_label = print_label
88 |
89 |
90 | # linear
91 | # return float(current_step) / float(max(1, num_warmup_steps))
92 |
93 | progress = float(current_step - half_steps) / float(max(1, num_training_steps - half_steps))
94 | num_cycles = 0.5
95 | return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
96 |
97 | # raise up in sine for 1 epoch, hold for 1 epoch, then fall back in cosine for the rest
98 | def _get_fp_cosine_raise_hold_fall_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_firstepoch_steps: int):
99 |
100 | """
101 | Calculates a learning rate multiplier using a raise-hold-fall pattern:
102 | 1. Raises using a sine wave from 0 to 1 for num_firstepoch_steps.
103 | 2. Holds the value at 1 for num_firstepoch_steps.
104 | 3. Falls using a cosine wave from 1 to 0 for the remaining steps.
105 |
106 | Args:
107 | current_step: The current training step.
108 | num_training_steps: The total number of training steps.
109 | num_firstepoch_steps: The number of steps in the first epoch (used for raise/hold length).
110 | last_print_label: Used to avoid printing multiple times.
111 | Returns:
112 | The learning rate multiplier for the current step.
113 | """
114 | global last_print_label
115 |
116 | print_label = ''
117 | lr_mult = 0.0
118 |
119 | raise_steps = num_firstepoch_steps
120 | hold_steps = num_firstepoch_steps
121 | fall_steps = num_training_steps - (raise_steps + hold_steps)
122 |
123 | if current_step < raise_steps:
124 | print_label = 'Scheduler: Raise'
125 | progress = current_step / float(raise_steps)
126 | lr_mult = math.sin(math.pi / 2.0 * progress)
127 | elif current_step < raise_steps + hold_steps:
128 | print_label = 'Scheduler: Hold'
129 | lr_mult = 1.0
130 | else:
131 | print_label = 'Scheduler: Fall'
132 | progress = (current_step - (raise_steps + hold_steps)) / float(fall_steps)
133 | lr_mult = math.cos(math.pi / 2.0 * progress)
134 |
135 | if last_print_label != print_label:
136 | print(print_label)
137 |
138 | last_print_label = print_label
139 |
140 | return max(0.0, lr_mult)
141 |
142 | # constant to the first epoch then cosine down to 0 over the rest epochs
143 | def _get_fp_cosine_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_firstepoch_steps: int):
144 |
145 | global last_print_label
146 | print_label = ''
147 |
148 | num_warmup_steps = min(num_warmup_steps,num_firstepoch_steps)
149 |
150 | if current_step < num_warmup_steps:
151 | print_label = 'Scheduler: Warmup'
152 | elif current_step < num_firstepoch_steps:
153 | print_label = 'Scheduler: Hold'
154 | else:
155 | print_label = 'Scheduler: Annealing'
156 |
157 | if print_label != last_print_label:
158 | print(print_label)
159 |
160 | last_print_label = print_label
161 |
162 | if current_step < num_warmup_steps:
163 | return float(current_step) / float(max(1, num_warmup_steps))
164 |
165 | if current_step < num_firstepoch_steps:
166 | return 1.0
167 |
168 | progress = float(current_step - num_firstepoch_steps) / float(max(1, num_training_steps - num_firstepoch_steps))
169 | num_cycles = 0.5
170 | return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
171 |
172 | # halve lr each epoch
173 | def _get_fp_step_decay_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_firstepoch_steps: int):
174 |
175 | global last_print_label
176 |
177 | print_label = ''
178 | lr_mult = 0.0
179 |
180 | num_warmup_steps = min(num_warmup_steps, num_firstepoch_steps)
181 |
182 | current_epoch = (current_step // num_firstepoch_steps) + 1
183 |
184 | if current_step < num_warmup_steps:
185 | print_label = 'Scheduler: Warmup'
186 | lr_mult = float(current_step) / float(max(1, num_warmup_steps))
187 | elif current_step < num_firstepoch_steps:
188 | print_label = 'Scheduler: Hold'
189 | lr_mult = 1.0
190 | else:
191 | print_label = 'Scheduler: Step Decay'
192 | lr_mult = 1.0 / float(2 ** (current_epoch - 1))
193 |
194 | if last_print_label != print_label:
195 | print(print_label)
196 |
197 | last_print_label = print_label
198 |
199 | return lr_mult
200 |
201 | # epoch decay: 1/(1 + decay * epoch)
202 |
203 | def custom_cosine_scheduler_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_firstepoch_steps, last_epoch=-1):
204 | """
205 | Args:
206 | optimizer ([`~torch.optim.Optimizer`]):
207 | The optimizer for which to schedule the learning rate.
208 | num_warmup_steps (`int`):
209 | The number of steps for the warmup phase.
210 | num_training_steps (`int`):
211 | The total number of training steps.
212 | last_epoch (`int`, *optional*, defaults to -1):
213 | The index of the last epoch when resuming training.
214 |
215 | Return:
216 | `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
217 | """
218 |
219 | lr_lambda = partial(
220 | _get_fp_cosine_schedule_with_warmup_lr_lambda,
221 | num_warmup_steps=num_warmup_steps,
222 | num_training_steps=num_training_steps,
223 | num_firstepoch_steps = num_firstepoch_steps,
224 | )
225 | return LambdaLR(optimizer, lr_lambda, last_epoch)
226 |
227 | def custom_half_scheduler_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_firstepoch_steps, last_epoch=-1):
228 | """
229 | Args:
230 | optimizer ([`~torch.optim.Optimizer`]):
231 | The optimizer for which to schedule the learning rate.
232 | num_warmup_steps (`int`):
233 | The number of steps for the warmup phase.
234 | num_training_steps (`int`):
235 | The total number of training steps.
236 | last_epoch (`int`, *optional*, defaults to -1):
237 | The index of the last epoch when resuming training.
238 |
239 | Return:
240 | `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
241 | """
242 |
243 | lr_lambda = partial(
244 | _get_fp_half_schedule_with_warmup_lr_lambda,
245 | num_warmup_steps=num_warmup_steps,
246 | num_training_steps=num_training_steps,
247 | num_firstepoch_steps = num_firstepoch_steps,
248 | )
249 | return LambdaLR(optimizer, lr_lambda, last_epoch)
250 |
251 | def custom_raise_fall_scheduler_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_firstepoch_steps, last_epoch=-1):
252 | """
253 | Args:
254 | optimizer ([`~torch.optim.Optimizer`]):
255 | The optimizer for which to schedule the learning rate.
256 | num_warmup_steps (`int`):
257 | The number of steps for the warmup phase.
258 | num_training_steps (`int`):
259 | The total number of training steps.
260 | last_epoch (`int`, *optional*, defaults to -1):
261 | The index of the last epoch when resuming training.
262 |
263 | Return:
264 | `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
265 | """
266 |
267 | lr_lambda = partial(
268 | _get_fp_cosine_raise_and_fall_lr_lambda,
269 | num_warmup_steps=num_warmup_steps,
270 | num_training_steps=num_training_steps,
271 | num_firstepoch_steps = num_firstepoch_steps,
272 | )
273 | return LambdaLR(optimizer, lr_lambda, last_epoch)
274 |
275 | def custom_raise_hold_fall_scheduler(optimizer, num_warmup_steps, num_training_steps, num_firstepoch_steps, last_epoch=-1):
276 | """
277 | Args:
278 | optimizer ([`~torch.optim.Optimizer`]):
279 | The optimizer for which to schedule the learning rate.
280 | num_warmup_steps (`int`):
281 | The number of steps for the warmup phase.
282 | num_training_steps (`int`):
283 | The total number of training steps.
284 | last_epoch (`int`, *optional*, defaults to -1):
285 | The index of the last epoch when resuming training.
286 |
287 | Return:
288 | `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
289 | """
290 |
291 | lr_lambda = partial(
292 | _get_fp_cosine_raise_hold_fall_lr_lambda,
293 | num_warmup_steps=num_warmup_steps,
294 | num_training_steps=num_training_steps,
295 | num_firstepoch_steps = num_firstepoch_steps,
296 | )
297 | return LambdaLR(optimizer, lr_lambda, last_epoch)
298 |
299 | def custom_step_decay_scheduler(optimizer, num_warmup_steps, num_training_steps, num_firstepoch_steps, last_epoch=-1):
300 | """
301 | Args:
302 | optimizer ([`~torch.optim.Optimizer`]):
303 | The optimizer for which to schedule the learning rate.
304 | num_warmup_steps (`int`):
305 | The number of steps for the warmup phase.
306 | num_training_steps (`int`):
307 | The total number of training steps.
308 | last_epoch (`int`, *optional*, defaults to -1):
309 | The index of the last epoch when resuming training.
310 |
311 | Return:
312 | `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
313 | """
314 |
315 | lr_lambda = partial(
316 | _get_fp_step_decay_schedule_with_warmup_lr_lambda,
317 | num_warmup_steps=num_warmup_steps,
318 | num_training_steps=num_training_steps,
319 | num_firstepoch_steps = num_firstepoch_steps,
320 | )
321 | return LambdaLR(optimizer, lr_lambda, last_epoch)
322 |
323 | def neftune_forward(self, input: torch.Tensor):
324 | """
325 | Implements the NEFTune forward pass for the model. Note this works only for
326 | torch.nn.Embedding layers. This method is slightly adapted from the original source code
327 | that can be found here: https://github.com/neelsjain/NEFTune
328 |
329 | Args:
330 | input (`torch.Tensor`):
331 | The input tensor to the model.
332 | 
333 |     The noise scale is taken from `self.neftune_noise_alpha`, which is set on the embedding layer when NEFTune is activated.
334 | """
335 | embeddings = torch.nn.functional.embedding(
336 | input, self.weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse
337 | )
338 |
339 | if self.training:
340 | # Add noise to the embeddings
341 | dims = torch.tensor(embeddings.size(1) * embeddings.size(2))
342 | mag_norm = self.neftune_noise_alpha / torch.sqrt(dims)
343 | embeddings = embeddings + torch.zeros_like(embeddings).uniform_(-mag_norm, mag_norm)
344 |
345 | return embeddings
346 |
347 |
348 | class FPNEFtuneTrainer(transformers.Trainer):
349 | def __init__(self,neftune_noise_alpha:float = 0.0, model = None, *args, **kwargs):
350 | self.neftune_noise_alpha = neftune_noise_alpha
351 | if self.neftune_noise_alpha > 0.0:
352 | model = self._activate_neftune(model)
353 | super().__init__(model = model, *args, **kwargs)
354 |
355 |
356 | def _activate_neftune(self, model):
357 | r"""
358 | Activates the neftune as presented in this code: https://github.com/neelsjain/NEFTune and paper: https://arxiv.org/abs/2310.05914
359 | """
360 | print(f"Activating {RED}NEFtune{RESET} with scale: {self.neftune_noise_alpha}")
361 | if isinstance(model, transformers.PreTrainedModel):
362 | embeddings = model.get_input_embeddings()
363 | elif isinstance(model, PeftModel):
364 | embeddings = model.base_model.get_input_embeddings()
365 |
366 | embeddings.neftune_noise_alpha = self.neftune_noise_alpha
367 | old_forward = embeddings.forward
368 |
369 | # This hack seems to be needed to properly use a custom forward pass
370 | # all credits to: https://discuss.pytorch.org/t/how-can-i-replace-the-forward-method-of-a-predefined-torchvision-model-with-my-customized-forward-function/54224/11
371 | bound_method = neftune_forward.__get__(embeddings, embeddings.__class__)
372 | setattr(embeddings, "forward", bound_method)
373 |
374 | # embeddings.forward = neftune_forward
375 | embeddings._trl_old_forward = old_forward
376 |
377 | return model
378 |
379 | def train(self, *args, **kwargs):
380 | output = super().train(*args, **kwargs)
381 |
382 | # After training we make sure to retrieve back the original forward pass method
383 | # for the embedding layer
384 | if self.neftune_noise_alpha is not None:
385 |
386 | if isinstance(self.model, transformers.PreTrainedModel):
387 | embeddings = self.model.get_input_embeddings()
388 | elif isinstance(self.model, PeftModel):
389 | embeddings = self.model.base_model.get_input_embeddings()
390 |
391 | if hasattr(embeddings, "_trl_old_forward"):
392 | embeddings.forward = embeddings._trl_old_forward
393 | del embeddings._trl_old_forward
394 | del embeddings.neftune_noise_alpha
395 |
396 | return output
397 |
398 |
399 | class FPSchedulerTrainer(transformers.Trainer):
400 | def __init__(self,neftune_noise_alpha:float = 0.0, model = None, *args, **kwargs):
401 | self.neftune_noise_alpha = neftune_noise_alpha
402 | if self.neftune_noise_alpha > 0.0:
403 | model = self._activate_neftune(model)
404 | super().__init__(model = model, *args, **kwargs)
405 |
406 |
407 | def _activate_neftune(self, model):
408 | r"""
409 | Activates the neftune as presented in this code: https://github.com/neelsjain/NEFTune and paper: https://arxiv.org/abs/2310.05914
410 | """
411 | print(f"Activating {RED}NEFtune{RESET} with scale: {self.neftune_noise_alpha}")
412 | if isinstance(model, transformers.PreTrainedModel):
413 | embeddings = model.get_input_embeddings()
414 | elif isinstance(model, PeftModel):
415 | embeddings = model.base_model.get_input_embeddings()
416 |
417 | embeddings.neftune_noise_alpha = self.neftune_noise_alpha
418 | old_forward = embeddings.forward
419 |
420 | # This hack seems to be needed to properly use a custom forward pass
421 | # all credits to: https://discuss.pytorch.org/t/how-can-i-replace-the-forward-method-of-a-predefined-torchvision-model-with-my-customized-forward-function/54224/11
422 | bound_method = neftune_forward.__get__(embeddings, embeddings.__class__)
423 | setattr(embeddings, "forward", bound_method)
424 |
425 | # embeddings.forward = neftune_forward
426 | embeddings._trl_old_forward = old_forward
427 |
428 | return model
429 |
430 | def train(self, *args, **kwargs):
431 | output = super().train(*args, **kwargs)
432 |
433 | # After training we make sure to retrieve back the original forward pass method
434 | # for the embedding layer
435 | if self.neftune_noise_alpha is not None:
436 |
437 | if isinstance(self.model, transformers.PreTrainedModel):
438 | embeddings = self.model.get_input_embeddings()
439 | elif isinstance(self.model, PeftModel):
440 | embeddings = self.model.base_model.get_input_embeddings()
441 |
442 | if hasattr(embeddings, "_trl_old_forward"):
443 | embeddings.forward = embeddings._trl_old_forward
444 | del embeddings._trl_old_forward
445 | del embeddings.neftune_noise_alpha
446 |
447 | return output
448 |
449 |
450 | def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optimizer = None):
451 | #Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or passed as an argument.
452 |
453 | num_train_epochs = self.args.num_train_epochs
454 | num_warmup_steps=self.args.get_warmup_steps(num_training_steps)
455 | num_firstepoch_steps = math.ceil(num_training_steps/num_train_epochs)
456 | num_warmup_acc = num_warmup_steps*self.args.gradient_accumulation_steps
457 | num_firstepoch_steps_acc = num_firstepoch_steps*self.args.gradient_accumulation_steps
458 | num_training_steps_acc = num_training_steps*self.args.gradient_accumulation_steps
459 |
460 | custom_scheduler_params.update({'dynamic_scheduler_stop': False})
461 |
462 | print (f"Warm-up steps aligned to Gradient accumulation ({self.args.gradient_accumulation_steps}) = {num_warmup_acc} actual warmup steps")
463 |
464 | if self.args.lr_scheduler_type == 'cosine':
465 |
466 | num_warmup_acc_min = min(num_warmup_acc, num_firstepoch_steps_acc)
467 |
468 | print ("FP Sheduler FP_low_epoch_annealing: Warmup, Hold for 1 epoch, Annealing for the rest")
469 | if num_warmup_acc>num_firstepoch_steps_acc:
470 | print(f"\033[1;31;1mWARNING: The number of warmup steps is set too high! It will be clamped to 1 epoch, essentially going from warmup to annealing.\033[0;37;0m")
471 |
472 | print (f"FP Scheduler Warmup: 0-{num_warmup_acc_min}, Hold: {num_warmup_acc_min}-{num_firstepoch_steps_acc}, Annealing: {num_firstepoch_steps_acc}-{num_training_steps_acc}")
473 |
474 | self.lr_scheduler = custom_cosine_scheduler_with_warmup(
475 | optimizer=self.optimizer if optimizer is None else optimizer,
476 | num_warmup_steps=num_warmup_steps,
477 | num_training_steps=num_training_steps,
478 | num_firstepoch_steps = num_firstepoch_steps,
479 | )
480 | self._created_lr_scheduler = True
481 | return self.lr_scheduler
482 | elif self.args.lr_scheduler_type == 'constant':
483 |
484 | half_step_acc = num_training_steps_acc//2
485 | num_warmup_acc_min = min(num_warmup_acc, half_step_acc)
486 |
487 | print ("FP Sheduler FP_half_time_annealing: Warmup, Hold for half training steps, Annealing ")
488 | if num_warmup_acc>half_step_acc:
489 | print(f"\033[1;31;1mWARNING: The number of warmup steps is set too high! It will be clamped to half of all epochs, essentially going from warmup to annealing in the middle.\033[0;37;0m")
490 |
491 | print (f"FP Scheduler Warmup: 0-{num_warmup_acc_min}, Hold: {num_warmup_acc_min}-{half_step_acc}, Annealing: {half_step_acc}-{num_training_steps_acc}")
492 |
493 | self.lr_scheduler = custom_half_scheduler_with_warmup(
494 | optimizer=self.optimizer if optimizer is None else optimizer,
495 | num_warmup_steps=num_warmup_steps,
496 | num_training_steps=num_training_steps,
497 | num_firstepoch_steps = num_firstepoch_steps,
498 | )
499 | self._created_lr_scheduler = True
500 | return self.lr_scheduler
501 | elif self.args.lr_scheduler_type == 'constant_with_warmup':
502 |
503 | half_step_acc = num_training_steps_acc//2
504 |
505 | print ("FP scheduler FP_raise_fall_creative (sine curve with the max peak in half training step)")
506 | if num_warmup_steps>0:
507 | print(f"Warmup doesn't apply to this scheduler [Raise-Fall]")
508 |
509 | print (f"Raise: 0-{half_step_acc}, Fall: {half_step_acc}-{num_training_steps_acc}")
510 |
511 | self.lr_scheduler = custom_raise_fall_scheduler_with_warmup(
512 | optimizer=self.optimizer if optimizer is None else optimizer,
513 | num_warmup_steps=num_warmup_steps,
514 | num_training_steps=num_training_steps,
515 | num_firstepoch_steps = num_firstepoch_steps,
516 | )
517 | self._created_lr_scheduler = True
518 | return self.lr_scheduler
519 | elif self.args.lr_scheduler_type == 'linear':
520 |
521 | print ("FP scheduler FP_3epoch_raise_hold_fall: Sine Raise for 1 epoch - Constant Hold for 1 epoch - Cosine Fall for the rest")
522 | if num_warmup_steps>0:
523 | print(f"Warmup doesn't apply to this scheduler")
524 |
525 | if num_train_epochs<3:
526 |                 print(f"\033[1;31;1mWARNING: The number of epochs is set to less than 3. This scheduler is designed for 3 or more epochs.\033[0;37;0m")
527 |
528 | print (f"Raise (1st epoch): 0-{num_firstepoch_steps_acc}, Hold (2nd epoch) {num_firstepoch_steps_acc}-{num_firstepoch_steps_acc*2}, Fall (3rd-> epoch) {num_firstepoch_steps_acc*2}-{num_training_steps_acc}")
529 |
530 | self.lr_scheduler = custom_raise_hold_fall_scheduler(
531 | optimizer=self.optimizer if optimizer is None else optimizer,
532 | num_warmup_steps=num_warmup_steps,
533 | num_training_steps=num_training_steps,
534 | num_firstepoch_steps = num_firstepoch_steps,
535 | )
536 | self._created_lr_scheduler = True
537 | return self.lr_scheduler
538 | elif self.args.lr_scheduler_type == 'cosine_with_restarts':
539 |
540 | half_step_acc = num_training_steps_acc//2
541 | num_warmup_acc_min = min(num_warmup_acc, num_firstepoch_steps_acc)
542 |
543 | print ("FP Scheduler FP_step_decay_with_warmup: Warmup, Hold for 1 epoch, Step Decay for the rest epochs")
544 | if num_warmup_acc>num_firstepoch_steps_acc:
545 | print(f"\033[1;31;1mWARNING: The number of warmup steps is set too high! It will be clamped to 1 epoch, essentially going from warmup to step decay.\033[0;37;0m")
546 |
547 | print (f"Warmup: 0-{num_warmup_acc_min}, Hold: {num_warmup_acc_min}-{num_firstepoch_steps_acc}, Step Decay: {num_firstepoch_steps_acc}-{num_training_steps_acc}")
548 |
549 | self.lr_scheduler = custom_step_decay_scheduler(
550 | optimizer=self.optimizer if optimizer is None else optimizer,
551 | num_warmup_steps=num_warmup_steps,
552 | num_training_steps=num_training_steps,
553 | num_firstepoch_steps = num_firstepoch_steps,
554 | )
555 | self._created_lr_scheduler = True
556 | return self.lr_scheduler
557 | else:
558 | return super().create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Training_PRO WIP
2 |
3 | This is an expanded and reworked Training tab - the latest version.
4 | Maintained by FP
5 |
6 | [Support on Ko-fi](https://ko-fi.com/Q5Q5MOB4M)
7 |
8 | Repo home:
9 |
10 | https://github.com/FartyPants/Training_PRO
11 |
12 | In general the repo is WAY ahead (could be a year by now) of the Training PRO extension included in text-generation-webui. The idea is to keep the extension supplied with the WebUI well tested and stable, while the repo version adds many experimental features that could change shape in the coming weeks.
13 |
14 |
15 | ## Training PRO is featured in my huge book "The Cranky Man's Guide to LoRA & QLoRA"
16 |
17 |
18 |
19 | Find it on [Amazon](https://www.amazon.com/dp/B0FLBTR2FS), [Apple Books](https://books.apple.com/us/book/the-cranky-mans-guide-to-lora-and-qlora/id6749593842), [Kobo](https://www.kobo.com/ca/en/ebook/the-cranky-man-s-guide-to-lora-and-qlora), [Barnes & Noble](https://www.barnesandnoble.com/w/the-cranky-mans-guide-to-lora-and-qlora-f-p-ham/1148001179)
20 |
21 |
22 | ## Installation:
23 |
24 | Since a stable version of Training PRO is included in WebUI, to avoid issues with WebUI updates, put this repo in a Training_PRO_wip folder and use Training_PRO_wip in Session instead of the supplied Training PRO that comes with WebUI.
25 |
26 | Clone repo to your extensions folder in Training_PRO_wip
27 | ```
28 | cd text-generation-webui\extensions\
29 | git clone https://github.com/FartyPants/Training_PRO Training_PRO_wip
30 | ```
31 | Now use Training_PRO_wip in Session instead of the supplied Training PRO.
32 |
33 | 
34 |
35 | ## Blog post
36 | How to convert JSON to JSONL to be used with jinja embedded templates
37 | https://ko-fi.com/post/Convert-JSON-to-JSONL-in-Training-PRO-W7W2100SMQ
38 |
39 | ## News
40 | May 2025
41 | - changed to the user_data folder WebUI seems to be using now
42 |
43 | January 2025
44 | - fix for gradient_checkpoint error when used without BnB
45 | - added new custom schedulers:
46 | FP_3Epoch_Raise_Hold_Fall: (min 3 epochs) 1st epoch sine, 2nd epoch Hold, rest of the epochs cosine
47 | FP_step_decay_with_Warmup - every next epoch will halve the LR
48 | - Continued Pretraining - adding lm_head and embed_tokens to the training
49 | 
50 |
51 | This will do basically a sort-of full finetune if used with All Linear Targets, and the LoRA files will be huge (the size of the quantized model).
52 | YOU have to use an 8-bit optimizer with this, otherwise it won't fit into your 24GB - so you need to use 4-bit BnB to load the model, then select an 8-bit Adam, like paged_adamw_8bit. You can tune LLAMA-3 8B if you are careful, with rank 16 or 32, a batch size of 2-4, and 256 context length.
53 | - paged_adamw_8bit and adamw_8bit added
54 |
55 |
56 | July 2024
57 | - patch for llama 3 padding
58 | - changed how blocks are eliminated
59 | - shuffled code to make sure the model is reloaded first
60 | - Hybrid training parameters
61 | 
62 | - tools to convert from JSON to JSONL
63 | 
64 | - dump entire dataset to log
65 |
66 | February 2024
67 | - Hybrid Training (experimental) allows you to use an instruct dataset AND a raw text file at the same time, creating a hybrid finetune.
68 | - Ability to use JSONL (OpenAI) datasets. The format will be chosen automatically from the template embedded in the tokenizer
69 |
70 | 
71 | - perplexity eval max_length from webui truncation_length_max
72 | - stop at epoch (can be changed during training)
73 | - force bin instead of safetensors (for now)
74 | - remove torch detour and instead set warning ignore
75 | - epoch log is now in 3 decimal numbers instead of 2
76 | - fix for some confusion in raw text over what is EOS token
77 | - Suggestions for Maximum Context length (Measures the longest block in tokens)
78 | - Eliminate cutoff blocks - instead of trimming a block that is above the cutoff, it will eliminate the block altogether.
79 | - fixes, Group samples by length - makes learning more efficient
80 | - NEFtune: add noise to help with generalization
81 | - Loss Graph in interface.
82 | - Supports Mistral training
83 | - some workaround for pytorch and transformers version desync
84 |
85 | 
86 |
87 | ## Features/Changes from Main Training in WebUI
88 |
89 | - Chunking: precise raw text slicer (PRTS) uses sentence slicing and makes sure things are clean on all ends
90 | - overlap chunking - this special overlapping will make additional overlap blocks based on logical rules (i.e., no overlap block across a hard cut)
91 | - custom scheduler (follow the code to make your own): in LR Scheduler select FP_low_epoch_annealing - this scheduler will keep the LR constant for the first epoch then use cosine for the rest - this part would be best to spawn into a new py file
92 | - saves graph png file at the end with learning rate and loss per epoch
93 | - adding EOS to each block or to hard cut only
94 | - automatically lowers gradient accumulation if you go overboard and set a gradient accumulation higher than the actual data - transformers would then throw an error (or they used to, not sure if that is still true), but in any case it will fix the bad value
95 | - turn BOS on and OFF
96 | - target selector
97 | - DEMENTOR LEARNING (experimental) Deep Memorization Enforcement Through Overlapping and Repetition. This is an experiment for long-text learning using low epochs (basically use 1 epoch with constant LR or 2 epochs with FP_low_epoch_annealing LR scheduler)
98 | - Getting rid of micro batch size/batch size confusion. Now there are True Batch Size and Gradient Accumulation sliders, consistent with all the other training tools out there
99 | - Ability to save Checkpoint during training with a button
100 | - Ability to change Stop Loss during training
101 | - different modes of checkpoint auto saving
102 | - Function to Check Dataset and suggest parameters such as warmup and checkpoint save frequency before training
103 | - Graph Training Loss in interface
104 | - more custom schedulers
105 |
106 | ### Notes:
107 |
108 | This uses its own chunking code for raw text based on sentence splitting. This avoids weird cuts in the chunks, and each chunk should now start with a sentence and end with one. It works hand in hand with Hard Cut. A proper use is to structure your text into logical blocks (ideas) separated by three \n, then use three \n as the hard cut. This way each chunk will contain only one flow of ideas and not derail in its thoughts. The overlapping code will create overlapped blocks on a sentence basis too, but never across a hard cut, thus never across different ideas either. Does it make any sense? No? Hmmmm...
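
A minimal sketch of how the raw-text slicer in `train_utils.py` could be driven, assuming the module is importable and `shared.tokenizer` has already been loaded by the WebUI (the file path, hard cut string, and token strings below are illustrative values, not fixed defaults):

```python
from train_utils import precise_cut

raw_text = open("my_dataset.txt", encoding="utf-8").read()  # hypothetical raw text file

blocks = precise_cut(
    text=raw_text,
    overlap=True,                 # also create overlapping blocks (never across a hard cut)
    min_chars_cut=50,             # drop blocks shorter than this many characters
    eos_to_hc=True,               # turn the hard cut placeholder into the EOS token
    cutoff_len=256,               # maximum block length in tokens
    hard_cut_string="\\n\\n\\n",  # escaped form of three newlines
    debug_slicer=False,           # True writes logs/sentencelist.json
    EOS_token_str="</s>",
    BOS_token_str="<s>",
)
print(f"Got {len(blocks)} text blocks")
```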
109 |
110 | ### Custom schedulers
111 |
112 | A bunch of custom (combination) schedulers are added to the LR schedule. These are based on my own experiments
113 |
114 | **FP_low_epoch_annealing**
115 |
116 | Uses a constant LR (with warmup) for the 1st epoch only. The rest of the epochs is cosine annealing. So with 10 epochs, 1 will be constant and 9 will be a nosedive down. However, a typical usage would be 2 epochs (hence "low epoch" in the name): the 1st is constant, the second is annealing. Simple. I use it 90% of the time.
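
The shape follows the corresponding lambda in `custom_scheduler.py`; a minimal standalone sketch of the multiplier (linear warmup, hold at 1.0 for the first epoch, cosine down to 0) could look like this:

```python
import math

def low_epoch_annealing_mult(step, warmup_steps, total_steps, first_epoch_steps):
    """LR multiplier: warmup, hold for the first epoch, then cosine annealing to zero."""
    warmup_steps = min(warmup_steps, first_epoch_steps)
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    if step < first_epoch_steps:
        return 1.0
    progress = (step - first_epoch_steps) / max(1, total_steps - first_epoch_steps)
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

# 2 epochs of 100 steps each, 20 warmup steps:
print([round(low_epoch_annealing_mult(s, 20, 200, 100), 2) for s in (0, 10, 50, 100, 150, 199)])
# -> [0.0, 0.5, 1.0, 1.0, 0.5, 0.0]
```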
117 |
118 | **FP_half_time_annealing**
119 |
120 | Like the low epoch, but now the total number of steps is divided by 2. The first half is constant, the second half is annealing. So with 10 epochs, 5 will be constant and 5 will be a cosine nosedive.
121 |
122 | **FP_raise_fall_creative**
123 |
124 | This is a sine raise until half of the total steps, then a cosine fall for the rest (or you may think of the curve as one sine hump in its entirety). The most learning is done in the hump, in the middle. The warmup entry has no effect, since the sine rise is automatically a warmup.
125 | The idea is to start very mildly so as not to overfit on the first blocks of the dataset. It seems to broaden the scope of the model, making it less strict for a tight dataset.
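
In `custom_scheduler.py` this corresponds to a single cosine hump centred at the halfway point; sketched on its own:

```python
import math

def raise_fall_mult(step, total_steps):
    """One smooth hump: 0 at the start, 1 at the halfway point, back to 0 at the end."""
    half = total_steps // 2
    progress = (step - half) / max(1, total_steps - half)
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

print([round(raise_fall_mult(s, 200), 2) for s in (0, 50, 100, 150, 199)])
# -> [0.0, 0.5, 1.0, 0.5, 0.0]
```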
126 |
127 | ### Targets
128 |
129 | Normal LoRA targets q and v, and that's what you should use. You can use (q k v o) or (q k v), which will give you a lot more trainable parameters. The benefit is that you can keep the rank lower and still attain the same coherency as q v with a high rank. Guanaco was trained with QLoRA and q k v o, for example, and they swear by it.
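
In peft terms these presets map to `target_modules` in a `LoraConfig`; a minimal sketch (the module names are the LLaMA-style projection names, other architectures differ):

```python
from peft import LoraConfig

# "q v" - the normal choice
qv_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

# "q k v o" - more trainable parameters, so a lower rank can still stay coherent
qkvo_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
```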
130 |
131 | ### DEMENTOR LEARNING (experimental) Deep Memorization Enforcement Through Overlapping and Repetition
132 |
133 | This is an experimental chunking method to train long-form text in a low number of epochs (basically 1) with sliding repetition. The depth of learning directly depends on the cutoff_length. Increasing the cutoff length will also increase the number of blocks created from long-form text (which is contrary to normal training). It is based on my own wild experiments.
134 |
135 | ### Getting rid of batch size and micro batch size
136 |
137 | Keeping consistency with everyone else.
138 |
139 | Listen, there is only ONE batch size - the True Batch Size (previously called micro-batch size in WebUI) - this is how many blocks are processed at once (during a single step). It eats GPU, but it really helps with the quality of training (in fact the ideal batch size would be the same as the number of blocks - which is unrealistic) - so the idea is to cram in as much True Batch Size as you can before your GPU blows up with OOM. On 24GB this is about 10 for a 13B model (loaded with 4-bit).
140 |
141 | So no micro batch size - it is now called True Batch Size, because that's what it is.
142 |
143 | The other thing is Gradient Accumulation - this is an emulation of the above batch size - a virtual batch size, if you will. If your GPU can't handle the real batch size, then you may fake it using Gradient Accumulation. This will accumulate the gradients over however many steps are defined here and only then update the weights, without any increase in GPU memory.
144 | Gradient accumulation is like a virtual Batch size multiplier without the GPU penalty.
145 |
146 | If your batch size is 4 and your gradient accumulation is 2, then it sort of behaves as if we had a batch size of 8. *Sort of*, because a batch size of 4 with GA of 2 is NOT the same as a batch size of 2 with GA of 4 (it produces different weights - hence it's not an equivalent). The idea is that if you don't have the GPU, using GA to extend the batch size is the next best thing (good enough), since you have no other choice.
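
As a quick arithmetic sanity check, the effective batch size is simply the true batch size times the gradient accumulation, and the optimizer updates once per that many blocks:

```python
def effective_batch(true_batch_size: int, grad_accumulation: int) -> int:
    """Number of blocks seen between two weight updates."""
    return true_batch_size * grad_accumulation

# Both configurations see 8 blocks per update, but they do NOT train identically:
print(effective_batch(4, 2))  # 8 -> 4 blocks per forward pass, 2 accumulation steps
print(effective_batch(2, 4))  # 8 -> 2 blocks per forward pass, 4 accumulation steps
```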
147 |
148 | If all you can afford is 1 batch size, then increasing GA will likely make the learning better in some range of GA (it's not always more is better).
149 |
150 | However - GA is not some golden goose. As said, it isn't the same as batch size. In fact GA may worsen your learning as well.
151 |
152 | I would suggest a series of experiments where you put the batch size as high as possible without OOM, set GA to 1, then repeat the training while increasing the GA (2, 4...), and see how the model changes. It's likely to follow some sort of curve where GA will seem to help before it makes things worse. Some people believe that if you can squeeze in a batch size of 6, then you should not bother with GA at all... YMMV
153 |
154 | High Batch Size vs High GA would also likely produce different results in terms of learning words vs style. How? Hmmmm... good question.
155 |
156 | One optical "benefit" of GA is that the loss will fluctuate less (because of all the gradient accumulation, which works as a form of noise smoothing as well).
157 |
158 | ### Eliminating bad blocks
159 |
160 | If you use JSON and a block is longer than the Maximum context length (Cutoff), by default it will be trimmed to the Maximum context length (Cutoff). That's the default behavior. While this may work in some cases where the end of the block is not much more important than the beginning, in other cases it may create a really bad situation. Imagine you are training a text labeling system on something like this:
161 |
162 | USER: Determine the type of text: ... some long text...
163 |
164 | ASSISTANT: Poetry
165 |
166 | In such a case, trimming the block will probably cut off the entire answer, making the block useless. Not only that, it also skews the whole functionality: the model may learn that entering long text means no answer will be given.
167 | 
168 | The Eliminate cutoff blocks option will simply not use such a block at all. No block at all is much preferable to a bad block.
169 | 
170 | This option applies only to JSON datasets, for obvious reasons.
171 | 
172 | Also watch the terminal window to see how many blocks were eliminated - if there are too many, then you need to increase the Maximum context length (Cutoff).
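
A minimal sketch of the difference between the two behaviors (an illustrative helper, not the extension's exact code):

```python
def keep_or_trim(token_ids, cutoff_len, eliminate_cutoff_blocks):
    """Return the token block to train on, or None to drop it entirely."""
    if len(token_ids) <= cutoff_len:
        return token_ids                   # fits: keep as-is
    if eliminate_cutoff_blocks:
        return None                        # too long: drop the whole block
    return token_ids[:cutoff_len]          # default: trim (may cut off the answer)
```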
173 |
174 | ### Group Samples by Length
175 |
176 | Makes the training more efficient as it groups blocks of similar size into one batch. The effect can be observed on the loss graph, because the loss will become much more oscillating. This is because different block lengths have different loss, and when they are grouped the effect is amplified.
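
In Hugging Face terms, grouping by length is the `group_by_length` option of `TrainingArguments`; a minimal sketch with illustrative values:

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="loras/my_lora",        # hypothetical output path
    per_device_train_batch_size=4,     # "True Batch Size"
    gradient_accumulation_steps=2,
    num_train_epochs=2,
    group_by_length=True,              # batch together samples of similar token length
)
```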
177 |
178 | 
179 |
180 | ### JSONL datasets
181 | These are a new type of dataset with roles defined. They expect a jinja template. The format is:
182 |
183 | >[
184 | > {
185 | > "messages": [
186 | > {"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."},
187 | > {"role": "user", "content": "What's the capital of France?"},
188 | > {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}
189 | > ]
190 | > },
191 |
192 | The format will be chosen automatically from the chat template embedded in the tokenizer metadata. If no template is embedded (legacy), then the jinja instruction template specified in WebUI will be used.
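
What "chosen from the tokenizer" means in practice can be sketched with the standard Hugging Face chat-template call (the model name is illustrative):

```python
from transformers import AutoTokenizer

# Any chat model whose tokenizer ships a chat template works; this name is just an example.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

messages = [
    {"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."},
    {"role": "user", "content": "What's the capital of France?"},
    {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."},
]

# Renders the messages with the jinja template embedded in the tokenizer metadata.
text = tokenizer.apply_chat_template(messages, tokenize=False)
print(text)
```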
193 |
194 | ## Hybrid training
195 |
196 | Did you wonder what would happen if you train partially on raw text and also on an instruct dataset? Now you can, with Hybrid training. Simply select both - a dataset (and format) and a raw text file. And go!
197 | What this will give you - IDK. Experiment. But in general it can stylize the instruct responses with the writing style of the raw text file. (Of course the correct ratio matters!) Or do some other damage. Now go and test it.
198 |
199 |
200 |
201 | ---
202 |
203 | ### **Training PRO: User Guide**
204 |
205 | This Gradio extension provides comprehensive tools for training LoRA (Low-Rank Adaptation) adapters for your LLMs. It covers everything from dataset preparation to advanced training settings and evaluation.
206 |
207 | ---
208 |
209 | ### **Overview**
210 |
211 | The extension is divided into two main tabs:
212 |
213 | 1. **Train LoRA:** Where you configure and start your LoRA training runs.
214 | 2. **Perplexity evaluation:** Where you can evaluate the performance of your models (and LoRAs) on various text datasets.
215 |
216 | ---
217 |
218 | ### **1. Train LoRA Tab: Your Fine-Tuning Control Center**
219 |
220 | This is where you'll spend most of your time configuring your training.
221 |
222 | #### **A. General LoRA Settings**
223 |
224 | * **Ver: 25.05.20:** This just indicates the version of the extension you're using.
225 | * **Copy parameters from (`copy_from`):**
226 | * **How to Use:** Select a previously saved LoRA training run from this dropdown list.
227 | * **What it Does:** Fills in *all* the settings on the page with the parameters used for that past LoRA. This is incredibly useful for replicating successful runs or starting new ones based on existing configurations.
228 | * **Sort list by Date (`sort_byTime`):**
229 | * **How to Use:** Check this box.
230 | * **What it Does:** Changes the "Copy parameters from" dropdown to sort your saved LoRAs by the date they were created (most recent first) instead of alphabetically.
231 | * **Name (`lora_name`):**
232 | * **How to Use:** Type the name for your new LoRA adapter here. This will be the name of the folder where your LoRA files are saved.
233 | * **Override Existing Files (`always_override`):**
234 | * **How to Use:**
235 | * **Checked:** If a LoRA with the same name already exists, checking this box will delete the old one and start a new training from scratch.
236 | * **Unchecked:** If a LoRA with the same name exists, the training will *resume* from where it left off, attempting to continue the previous training. (Note: The "LoRA Rank" must be the same as the original for this to work).
237 | * **What it Does:** Controls whether you start fresh or continue existing training.
238 |
239 | #### **B. LoRA Core Parameters**
240 |
241 | These are fundamental settings that define the LoRA adapter itself and the learning process.
242 |
243 | * **LoRA Rank (`lora_rank`):**
244 | * **How to Use:** Adjust the slider to set the LoRA rank.
245 | * **What it Does:** This is also known as "dimension count." It's the primary factor determining the LoRA's capacity to learn.
246 | * **Higher values (e.g., 128, 256, 1024+):** Create a larger LoRA file, allowing it to learn more complex patterns and fine details. Requires more VRAM. Use for teaching new factual information or very specific behaviors.
247 | * **Smaller values (e.g., 4, 8, 16):** Create a smaller LoRA file, generally used for stylistic changes, tone, or minor concept adjustments. Requires less VRAM.
248 | * **LoRA Alpha (`lora_alpha`):**
249 | * **How to Use:** Adjust the slider.
250 |     * **What it Does:** This value scales the LoRA's influence. A common rule of thumb is to set Alpha to 1x or 2x the Rank. The actual scaling is `LoRA_alpha / rank`, so a higher alpha means more influence (see the arithmetic sketch at the end of this section).
251 | * **Rank Stabilised LoRA (rsLoRA) Accordion:**
252 | * **Use rsLoRA (`lora_RS`):**
253 | * **How to Use:** Check this box to enable rsLoRA.
254 | * **What it Does:** Activates an alternative scaling method where the scale is `rsLoRA_Alpha / sqrt(rank)`. This can sometimes lead to more stable training.
255 | * **rsLoRA Alpha (`lora_RS_alpha`):**
256 | * **How to Use:** Enter a numerical value.
257 | * **What it Does:** Sets the alpha for rsLoRA. A common starting point is 16.
258 | * **True Batch Size (`micro_batch_size`):**
259 | * **How to Use:** Adjust the slider.
260 | * **What it Does:** This determines how many "text blocks" (or individual data points) are processed by your GPU in a single training step. Higher values generally lead to more stable training (the model sees more examples at once), but require significantly more VRAM.
261 | * **Gradient Accumulation Steps (`grad_accumulation`):**
262 | * **How to Use:** Adjust the slider.
263 |     * **What it Does:** This is a clever way to simulate a larger "effective batch size" without requiring more VRAM. If you set `True Batch Size` to 4 and `Gradient Accumulation` to 8, the model will process 4 blocks, store the gradients, then process another 4 blocks, and so on, 8 times. Only after these 8 steps (32 "virtual" blocks) does it update its weights (see the arithmetic sketch at the end of this section). This evens out learning but can sometimes reduce the fidelity of training.
264 | * **Epochs (`epochs`):**
265 | * **How to Use:** Enter a number.
266 | * **What it Does:** The number of times the entire dataset will be fed through the model for training. If you have 1000 items and set Epochs to 3, the model will see all 1000 items three times. More epochs usually mean more learning but also higher risk of overfitting (where the model memorizes the training data instead of generalizing).
267 | * **Learning Rate (`learning_rate`):**
268 | * **How to Use:** Enter a value in scientific notation (e.g., `3e-4`).
269 | * **What it Does:** This controls how much the model's weights are adjusted with each training step.
270 | * `3e-4` (0.0003) is a common starting point.
271 | * `1e-2` (0.01) is very high, risking unstable training.
272 | * `1e-6` (0.000001) is very low, making training slow.
273 | * **LR Scheduler (`lr_scheduler_type`):**
274 | * **How to Use:** Select from the dropdown.
275 | * **What it Does:** Defines how the learning rate changes over the course of training.
276 | * **`linear`:** Learning rate gradually decreases from its initial value to 0.
277 | * **`constant`:** Learning rate stays the same throughout training.
278 | * **`constant_with_warmup`:** Stays constant after an initial "warmup" phase.
279 | * **`cosine`, `cosine_with_restarts`, `polynomial`, `inverse_sqrt`:** More complex patterns for learning rate decay.
280 | * **`FP_low_epoch_annealing` (FP custom):** Starts with a warmup, holds constant for the first epoch, then gradually reduces the learning rate (anneals) for the rest.
281 | * **`FP_half_time_annealing` (FP custom):** Warmup, then holds constant for the first half of total steps, then anneals.
282 | * **`FP_raise_fall_creative` (FP custom):** Learning rate increases to a peak in the middle of training, then falls.
283 | * **`FP_3epoch_raise_hold_fall` (FP custom):** Learning rate "raises" during the 1st epoch, "holds" during the 2nd, and "falls" during the 3rd and subsequent epochs. Designed for at least 3 epochs.
284 | * **`FP_step_decay_with_warmup` (FP custom):** Warmup, holds constant for the first epoch, then halves the learning rate with each subsequent epoch.
285 |
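To make the numbers above concrete, here is a small arithmetic sketch (illustration only, not the extension's code) of the LoRA scaling factor and of the effective batch size produced by gradient accumulation; the values are arbitrary examples.

```python
import math

# LoRA scaling: classic LoRA vs. rank-stabilised LoRA (rsLoRA).
lora_rank, lora_alpha, rs_alpha = 32, 64, 16
classic_scale = lora_alpha / lora_rank           # 64 / 32       = 2.0
rslora_scale = rs_alpha / math.sqrt(lora_rank)   # 16 / sqrt(32) ~ 2.83

# Effective batch size: True Batch Size x Gradient Accumulation Steps.
micro_batch_size, grad_accumulation = 4, 8
effective_batch = micro_batch_size * grad_accumulation  # 32 "virtual" blocks per weight update

print(classic_scale, round(rslora_scale, 2), effective_batch)
```
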
286 | #### **C. Checkpoints (Saving Your Progress)**
287 |
288 | * **Save every n steps (`save_steps`):**
289 | * **How to Use:** Enter a number (0 to disable).
290 | * **What it Does:** Your LoRA will be saved as a checkpoint every `n` training steps, and also at the end of each full epoch. This is a basic periodic backup.
291 | * **Save at 10% Loss change (`save_steps_under_loss`):**
292 | * **How to Use:** Adjust the slider to a loss value (e.g., 1.8).
293 |     * **What it Does:** This is a smart saving feature. Once the training loss falls *below* this value, the system saves a checkpoint. It then saves *again* every time the loss drops by at least 10% from the previously saved checkpoint's loss. This helps capture good states (the rule is sketched just after this section).
294 | * **Queue Checkpoint Now (`save_chackpoint_now`):**
295 | * **How to Use:** Click this button.
296 | * **What it Does:** Immediately queues a save operation. The current training step will complete, and then a checkpoint will be saved. Useful for manual backups during a long run.
297 |
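The checkpoint-on-loss rule described above boils down to a few lines. This is a rough sketch of the logic (illustration only, the names are made up), not the extension's actual implementation.

```python
def should_save_checkpoint(current_loss, threshold, last_saved_loss=None):
    """Save once loss reaches the threshold, then again on every further ~10% drop."""
    if last_saved_loss is None:
        return current_loss <= threshold           # first save: loss reached the threshold
    return current_loss <= last_saved_loss * 0.9   # later saves: at least a 10% drop

# Example with the default threshold of 1.8:
last = None
for loss in [2.3, 1.9, 1.75, 1.70, 1.55, 1.50]:
    if should_save_checkpoint(loss, 1.8, last):
        print(f"checkpoint at loss {loss}")
        last = loss   # saves fire at 1.75 and 1.55
```
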
298 | #### **D. Stops (Automatic Training Termination)**
299 |
300 | These settings can be changed *during* training to fine-tune when the process ends.
301 |
302 | * **Stop at loss (`stop_at_loss`):**
303 | * **How to Use:** Adjust the slider (0 to disable).
304 | * **What it Does:** If the training loss reaches or falls below this specified value, the training will automatically stop. Prevents over-training if you hit a good performance level.
305 | * **Stop at Epoch (`stop_at_epoch`):**
306 | * **How to Use:** Adjust the slider (0 to disable).
307 | * **What it Does:** The training will stop at the end of the specified epoch. Useful for limiting training time or comparing specific epoch outputs.
308 |
309 | #### **E. Advanced Options (Fine-Grained Control)**
310 |
311 | * **Warmup Steps (`warmup_steps`):**
312 | * **How to Use:** Enter a number (0 to use Warmup Ratio instead).
313 | * **What it Does:** During these initial steps, the learning rate gradually increases. This "warms up" the model, reducing sudden large updates and preventing early instability.
314 | * **Warmup Ratio (`warmup_ratio`):**
315 | * **How to Use:** Adjust the slider (0.0 to disable, or if Warmup Steps is used).
316 | * **What it Does:** If "Warmup Steps" is 0, this ratio determines the portion of total training steps that will be used for a linear warmup. For example, 0.1 means 10% of total steps are for warmup.
317 | * **NEFtune noise scale (`neft_noise_alpha`):**
318 | * **How to Use:** Adjust the slider (0.0 to disable). Recommended starting value: 5.
319 | * **What it Does:** Adds a small amount of random noise to the model's embeddings during training. This can help the model generalize better and prevent it from memorizing the training data too strictly.
320 | * **LLaMA Target Projections (`training_projection`):**
321 | * **How to Use:** Select which parts of the model (projections) the LoRA should modify.
322 | * **What it Does:** LoRA works by modifying specific internal layers (projections) of the model.
323 | * `q-v` (default): Focuses on query and value projections (common for QLoRA).
324 | * `all`: Modifies all common projection types.
325 | * `q-k-v-o`: Modifies query, key, value, and output projections.
326 | * `q-k-v`: Modifies query, key, and value projections.
327 | * `k-v-down`: Modifies key, value, and "down" projections.
328 | * **Use Case:** Experimenting with these can sometimes yield different results, as it changes which parts of the model's "thinking" are adjusted by the LoRA.
329 | * **Continued Pretraining Accordion:**
330 | * **Train Head (`lora_modulessave`):**
331 | * **How to Use:** Check this box.
332 | * **What it Does:** Also trains the `lm_head` (the part of the model that predicts the next token) and `embed_tokens` (the part that turns words into numerical representations). This is like "full fine-tuning" for specific parts of the model, which is much more memory intensive.
333 | * **Warning:** This requires significantly more VRAM. If enabled, it's *highly recommended* to use an 8-bit AdamW optimizer (like `paged_adamw_8bit`) to manage VRAM.
334 | * **Use Gradient Checkpoint (`use_grad_checkpoint`):**
335 | * **How to Use:** Check this box.
336 | * **What it Does:** Reduces VRAM usage during training but makes the training process slower. It achieves this by re-calculating some intermediate values during the backward pass instead of storing them.
337 | * **LoRA Dropout (`lora_dropout`):**
338 | * **How to Use:** Adjust the slider (e.g., 0.05 for 5%).
339 | * **What it Does:** Introduces a small chance that some LoRA layers will be temporarily "dropped out" during training. This can prevent overfitting by forcing the model to learn more robust features. Most users can leave this at default.
340 | * **Optimizer (`optimizer`):**
341 | * **How to Use:** Select from the dropdown.
342 | * **What it Does:** The algorithm used to update the model's weights during training. `adamw_torch` is a good general choice. `paged_adamw_8bit` is excellent for saving VRAM, especially when "Train Head" is enabled. Different optimizers can affect training speed and final quality.
343 | * **Train Only After (`train_only_after`):**
344 | * **How to Use:** Type a specific string (e.g., `### Response:`)
345 |     * **What it Does:** If you're using instruction-response datasets (like Alpaca), you often only want the model to learn from the "response" part, not the "instruction" part. This setting tells the trainer to ignore the loss for any text *before* this string in your data, focusing only on the part after it (a token-level sketch of this masking follows this list).
346 | * **Train on Inputs (Full sequence loss) (`train_on_inputs`):**
347 | * **How to Use:** Check or uncheck this box.
348 | * **What it Does:**
349 | * **Checked (default):** The model calculates loss and trains on the *entire* sequence of text (the input/instruction *plus* the response).
350 | * **Unchecked (recommended for instruction tuning with `Train Only After`):** If `Train Only After` is specified, the model *only* calculates loss and trains on the portion of text *after* that string. This is ideal for teaching a model to generate good responses to prompts without altering its understanding of the prompts themselves.
351 | * **Add BOS token (`add_bos_token`):**
352 | * **How to Use:** Check this box.
353 |     * **What it Does:** Adds a "Beginning Of Sequence" token (such as `<s>` in LLaMA-style tokenizers) to the start of each text item. This helps the model understand that a new sequence is beginning. Generally, leave this ON.
354 | * **Add EOS token (`add_eos_token`):**
355 | * **How to Use:** Check this box.
356 |     * **What it Does:** Adds an "End Of Sequence" token (such as `</s>` in LLaMA-style tokenizers) to the end of each text item. This helps the model understand when to stop generating text. For JSONL datasets, this is typically controlled by the chat template.
357 | * **EOS placement (Text file) (`add_eos_token_type`):**
358 | * **How to Use:** Select from the dropdown.
359 | * **What it Does:**
360 | * **`Every Block`:** Adds an EOS token at the end of every processed text block.
361 | * **`Hard Cut Blocks Only`:** Only adds an EOS token at the end of text blocks that were explicitly separated by the "Hard Cut String."
362 | * **Group Samples by Length (`group_by_length`):**
363 | * **How to Use:** Check this box.
364 | * **What it Does:** Groups training examples of similar lengths together. This can make training more efficient by reducing the amount of padding needed for shorter sequences within a batch.
365 | * **Eliminate cutoff blocks (`eliminate_long_blocks`):**
366 | * **How to Use:** Check this box.
367 | * **What it Does:** If a text block (or a JSON entry after formatting) exceeds the "Maximum context length (Cutoff)" even after trimming, this option will *remove* that block entirely from the dataset instead of just truncating it. Useful for ensuring all training data fits perfectly.
368 | * **Enable higher ranks (`higher_rank_limit`):**
369 | * **How to Use:** Check this box.
370 | * **What it Does:** Increases the maximum values available on the "LoRA Rank" and "LoRA Alpha" sliders.
371 | * **Warning:** Only use this if you have a datacenter-class GPU with extremely high VRAM (e.g., 80GB+) as higher ranks consume much more memory.
372 | * **Save detailed logs with (`report_to`):**
373 | * **How to Use:** Select `wandb` (Weights & Biases) or `tensorboard` to integrate with these logging platforms.
374 | * **What it Does:** If selected, detailed training metrics, losses, and other information will be sent to the chosen platform for more advanced visualization and tracking.
375 |
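To illustrate the `Train Only After` / `Train on Inputs` combination, here is a simplified, token-level sketch of response-only training (an illustration with a hypothetical helper, not the extension's exact code, which handles more edge cases). Labels set to `-100` are ignored by the loss.

```python
def mask_labels_before(tokenizer, text, marker="### Response:"):
    """Return (input_ids, labels) where everything up to and including the marker is masked."""
    input_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    cut = text.find(marker)
    if cut == -1:
        return input_ids, list(input_ids)      # no marker found: train on the full sequence
    # Approximate the number of tokens covering the prompt part (boundaries may shift by a token).
    prefix_len = len(tokenizer(text[:cut + len(marker)], add_special_tokens=False)["input_ids"])
    labels = [-100] * prefix_len + input_ids[prefix_len:]   # -100 = ignored by the loss
    return input_ids, labels
```
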
376 | #### **F. Dataset & Data Preparation Tabs**
377 |
378 | This section is crucial for feeding your model the right data. You must select one primary data source (JSON, JSONL, or Text File).
379 |
380 | * **JSON Dataset Tab:** For structured JSON files (list of dictionaries, e.g., Alpaca format).
381 | * **Dataset (`dataset`):**
382 | * **How to Use:** Select your JSON training file (e.g., `alpaca_data.json`) from `user_data/training/datasets/`.
383 | * **Evaluation Dataset (`eval_dataset`):**
384 | * **How to Use:** Select an optional separate JSON file for evaluation.
385 | * **What it Does:** The model will periodically be tested on this data to monitor its progress and prevent overfitting.
386 | * **Data Format (`format`):**
387 | * **How to Use:** Select a JSON format file (e.g., `alpaca.json`) from `user_data/training/formats/`.
388 | * **What it Does:** This file tells the trainer how to combine different fields (like `instruction`, `input`, `output`) from your JSON dataset into a single text sequence that the model can understand.
389 | * **Evaluate every n steps (`eval_steps`):**
390 | * **How to Use:** Enter a number.
391 | * **What it Does:** If an evaluation dataset is provided, the model's performance will be checked every `n` training steps.
392 |
393 | * **JSONL Dataset Tab:** For JSON Lines files, typically used with chat-based models.
394 | * **JSONL Dataset (`datasetJSONL`):**
395 | * **How to Use:** Select your JSONL training file (e.g., `chat_data.jsonl`) from `user_data/training/datasets/`.
396 | * **JSONL Evaluation Dataset (`eval_datasetJSONL`):**
397 | * **How to Use:** Select an optional JSONL evaluation file.
398 | * **Evaluate every n steps (`eval_stepsJSONL`):**
399 | * **How to Use:** Enter a number.
400 | * **What it Does:** Evaluation frequency for JSONL datasets.
401 | * **Note:** The format for JSONL files is automatically derived from the model's tokenizer chat template. If your model's tokenizer doesn't have one, it will fall back to the "Instruction template" set in `text-generation-webui`'s "Parameters" tab.
402 | * **Important:** This JSONL processing automatically appends `<|EOFUSER|>` to user messages, which is then removed after applying the chat template.
403 |
404 | * **Text file Tab:** For raw `.txt` files where the entire text is used for training.
405 | * **Text file (`raw_text_file`):**
406 | * **How to Use:** Select your raw text file (e.g., `my_novel.txt`) or a folder containing multiple `.txt` files from `user_data/training/datasets/`.
407 | * **Add Overlapping blocks (`precize_slicing_overlap`):**
408 | * **How to Use:** Check this box.
409 | * **What it Does:** When splitting your raw text into manageable blocks, this creates additional "overlapping" blocks. For example, if block 1 is sentences A-B-C, an overlapping block might be sentences B-C-D. This helps the model learn context across block boundaries.
410 | * **DEMENTOR Long-form Learning by FP (Highly Experimental, use low epochs) (`sliding_window`):**
411 | * **How to Use:** Check this box.
412 | * **What it Does:** Activates a special "Deep Memorization Enforcement Through Overlapping and Repetition" slicing strategy. It uses a "sliding window" approach to generate text blocks. This is intended for teaching the model very long-form text patterns, potentially with fewer epochs. It's experimental, so use with caution.
413 | * **Hard Cut String (`hard_cut_string`):**
414 | * **How to Use:** Enter a string (e.g., `\\n\\n\\n` for three newlines).
415 | * **What it Does:** This string indicates a definitive logical break in your text (like a chapter break or a new topic). The slicer will ensure that no text blocks or overlapping blocks cross this string, preserving logical boundaries.
416 | * **Ignore small blocks (`min_chars`):**
417 | * **How to Use:** Enter a number.
418 | * **What it Does:** Any text block generated during slicing that has fewer characters than this number will be discarded from the training dataset. Useful for removing noise or very short, uninformative segments.
419 |
420 | * **Hybrid Tab (Experimental):** For training with both structured (JSON/JSONL) and unstructured (raw text) data simultaneously.
421 | * **Hybrid Training (`hybrid_training`):**
422 | * **How to Use:** Check this box.
423 | * **What it Does:** Enables the hybrid training mode. You must also select a `Raw Text file` *and* a `JSON` or `JSONL` dataset.
424 | * **Percentage of Dataset used (`hybrid_data_ratio`):**
425 | * **How to Use:** Adjust the slider (0-100%).
426 | * **What it Does:** Controls how much of the selected JSON or JSONL dataset will be included in the hybrid training.
427 | * **Percentage of Text file used (`hybrid_text_ratio`):**
428 | * **How to Use:** Adjust the slider (0-100%).
429 | * **What it Does:** Controls how much of the selected raw text file will be included in the hybrid training.
430 | * **Use Case:** This is experimental but can be powerful for blending instruction-following (from JSON/JSONL) with general knowledge or style (from raw text).
431 |
432 | * **URL Tab:** For downloading datasets directly from a URL.
433 | * **Download JSON or txt file to datasets (or formats) folder (`download_file_url`):**
434 | * **How to Use:** Paste the URL of your `.json` or `.txt` file. For GitHub, use the "Raw" URL. For Hugging Face, ensure the URL contains `/resolve/` not `/blob/`.
435 | * **Overwrite (`download_check_overwrite`):**
436 | * **How to Use:** Check this if you want to replace an existing file with the same name.
437 | * **Destination (`download_folder`):**
438 | * **How to Use:** Select whether to save the downloaded file to the `datasets` folder or the `formats` folder.
439 | * **Download (`download_button`):**
440 | * **How to Use:** Click to start the download.
441 | * **Download Status (`download_status`):** Displays progress and messages.
442 |
443 | * **Tools Tab:** Utility functions for preparing your datasets.
444 | * **Evaluation dataset split (percentage) (`split_dataset_perc`) & Split dataset (`split_dataset_do`):**
445 | * **How to Use:** Select a JSON dataset in the "JSON Dataset" tab, set the percentage (e.g., 10 for a 90/10 split), then click "Split dataset."
446 | * **What it Does:** Takes your selected JSON dataset and splits it into two new files: `your_dataset_name_train.json` and `your_dataset_name_eval.json`, according to the specified percentage. It then automatically selects these new files for your training and evaluation datasets.
447 | * **Convert JSON to JSONL (`convert_system`) & Convert JSON to JSONL (`convert_do`):**
448 | * **How to Use:** Select a JSON dataset, add a "System Message" (if desired), then click "Convert JSON to JSONL."
449 |         * **What it Does:** Converts your standard JSON dataset (with `instruction` and `output` fields) into the JSONL format, suitable for chat models. The "System Message" will be added to each entry (a small standalone sketch of this conversion follows this section).
450 | * **Simple TXT to JSONL conversion (`convert_system2`, `convert_prompt2`) & Convert TXT to JSONL (`convert_do2`):**
451 | * **How to Use:** Select a raw text file (where each logical item is separated by at least three empty lines), add a "System Message," add a "Prompt" that will be applied to every item (e.g., "Write me a limerick."), then click "Convert TXT to JSONL."
452 | * **What it Does:** Converts your raw text file into a JSONL format where each block of text becomes an assistant's response to your specified prompt and system message.
453 | * **Dump Training Dataset (`dump_dataset`) & Clean up dump dataset (`dump_dataset_remove_s`):**
454 | * **How to Use:** Check "Dump Training Dataset" to enable. Optionally check "Clean up dump dataset" to remove BOS/EOS tokens.
455 | * **What it Does:** Just before training begins, the tool will decode your entire prepared training dataset (after all formatting and slicing) and save it as a JSON file in the `logs/` folder. This is invaluable for debugging and verifying how your data is actually being fed to the model.
456 |
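As an illustration of what the JSON-to-JSONL conversion produces, here is a small standalone sketch (not the extension's code) assuming an alpaca-style dataset with `instruction`, optional `input`, and `output` fields; the file names are placeholders.

```python
import json

SYSTEM_MESSAGE = "You are a helpful AI assistant."

# Placeholder paths - Training PRO works inside user_data/training/datasets/.
with open("alpaca_data.json", "r", encoding="utf-8") as f:
    items = json.load(f)

with open("alpaca_data.jsonl", "w", encoding="utf-8") as out:
    for item in items:
        # Fold the optional "input" field into the user turn.
        user = item["instruction"] + ("\n" + item["input"] if item.get("input") else "")
        entry = {
            "messages": [
                {"role": "system", "content": SYSTEM_MESSAGE},
                {"role": "user", "content": user},
                {"role": "assistant", "content": item["output"]},
            ]
        }
        out.write(json.dumps(entry, ensure_ascii=False) + "\n")
```
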
457 | #### **G. Final Configuration and Execution**
458 |
459 | * **Maximum context length (Cutoff) (`cutoff_len`):**
460 | * **How to Use:** Adjust the slider.
461 | * **What it Does:** This is the *maximum number of tokens* that any single block of text (from JSON, JSONL, or raw text) will have. If a block is longer, it will be truncated. Higher values require significantly more VRAM and can slow down training. This is a critical setting for managing GPU memory.
462 | * **Verify Dataset/Text File and suggest data entries (`check_dataset_btn`):**
463 | * **How to Use:** Click this button *before* starting training.
464 | * **What it Does:** Analyzes your selected dataset(s) and current settings. It provides a summary in the "Dataset info" textbox, including:
465 | * Number of blocks.
466 | * Length of the longest block (in tokens), and suggestions for optimal `Cutoff` length.
467 | * Total number of tokens.
468 |         * Calculated total training steps and steps per epoch (the step math is sketched after this list).
469 | * Suggestions for `Save every n steps` and `Warmup steps`.
470 | * **Crucial Warning:** It will tell you if your "Gradient Accumulation" setting is too high for your dataset size, which can cause Accelerate/Transformers to crash.
471 | * **Recommendation:** Always run this before a new training session!
472 | * **Start LoRA Training (`start_button`):**
473 | * **How to Use:** After configuring everything, click this to begin.
474 | * **What it Does:** Initializes the training process. The "Output" area will show real-time status updates, and the "Graph" will plot loss.
475 | * **Interrupt (`stop_button`):**
476 | * **How to Use:** Click this button at any time during training.
477 | * **What it Does:** Politely tells the training process to stop after the current training step is finished. It will then save the current state of your LoRA.
478 | * **Graph (`plot_graph`):**
479 | * **What it Does:** Displays a live plot of your training loss over epochs/steps. This is an essential visual aid for monitoring training progress and detecting issues like overfitting (loss goes up) or non-convergence.
480 | * **Output (`output`):**
481 | * **What it Does:** Shows textual status updates, progress indicators (steps, time), and final messages (e.g., "Done! LoRA saved to...").
482 |
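The step math reported by the verify button can be reproduced by hand. A back-of-the-envelope sketch (the extension's exact rounding may differ slightly):

```python
import math

blocks = 1000             # data entries / text blocks after slicing
micro_batch_size = 4      # True Batch Size
grad_accumulation = 8
epochs = 3

steps_per_epoch = math.ceil(blocks / (micro_batch_size * grad_accumulation))  # ceil(1000/32) = 32
total_steps = steps_per_epoch * epochs                                        # 96
warmup_steps = int(0.1 * total_steps)                                         # ~10% warmup is a common start
print(steps_per_epoch, total_steps, warmup_steps)
```
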
483 | ---
484 |
485 | ### **2. Perplexity Evaluation Tab: Assessing Your Model's Performance**
486 |
487 | This tab allows you to measure how well your loaded model (or selected other models) can predict text in various datasets. Lower perplexity generally means a better model.
488 |
489 | * **Models (`models`):**
490 | * **How to Use:** Select one or more base models (and their loaded LoRAs) from your `text-generation-webui` model directory.
491 | * **What it Does:** These are the models that will be evaluated.
492 | * **Input dataset (`evaluate_text_file`):**
493 | * **How to Use:** Select a text file for evaluation. Options include:
494 | * `wikitext`, `ptb`, `ptb_new`: Standard public datasets (automatically downloaded).
495 | * Your local `.txt` files from `user_data/training/datasets/`.
496 | * **Stride (`stride_length`):**
497 | * **How to Use:** Adjust the slider (e.g., 512).
498 |     * **What it Does:** This setting balances evaluation speed and accuracy. A stride of 1 means the model evaluates every single token, which is very slow but most accurate. A higher stride (e.g., 512) means the model "jumps" ahead by that many tokens for each new evaluation, making it much faster but less precise (a simplified sketch follows this section).
499 | * **max_length (`max_length`):**
500 | * **How to Use:** Enter a number (0 to use the model's full context length).
501 | * **What it Does:** Sets the maximum context (number of tokens) the model will use for each evaluation chunk. If 0, it uses the maximum context length your model was trained for.
502 | * **Evaluate loaded model (`start_current_evaluation`):**
503 | * **How to Use:** Click this button.
504 | * **What it Does:** Evaluates the *currently loaded* model in your `text-generation-webui` session on the selected input dataset.
505 | * **Evaluate selected models (`start_evaluation`):**
506 | * **How to Use:** Select models from the "Models" dropdown, then click this button.
507 | * **What it Does:** Evaluates all the models you've selected from the dropdown on the input dataset.
508 | * **Interrupt (`stop_evaluation`):**
509 | * **How to Use:** Click this button during an evaluation.
510 | * **What it Does:** Stops the evaluation process.
511 | * **Evaluation Log (`evaluation_log`):**
512 | * **What it Does:** Displays real-time messages and results of the current evaluation.
513 | * **Evaluation Table (`evaluation_table`):**
514 | * **What it Does:** Shows a historical table of all your past perplexity evaluations, including model names, datasets, perplexity scores, and any comments you've added. It's interactive, so you can edit comments directly.
515 | * **Save comments (`save_comments`):**
516 | * **How to Use:** Click this after editing comments in the "Evaluation Table."
517 | * **What it Does:** Saves any changes you've made to the comments column in the table.
518 | * **Refresh the table (`refresh_table`):**
519 | * **How to Use:** Click this button.
520 | * **What it Does:** Updates the "Evaluation Table" to ensure it reflects the latest evaluation results and saved comments.
521 |
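For reference, strided perplexity can be computed along the lines of the Hugging Face perplexity guide; the function below is a simplified sketch (not the extension's evaluation code) for a causal LM already loaded with `transformers`.

```python
import torch

def strided_perplexity(model, tokenizer, text, max_length=1024, stride=512):
    """Approximate perplexity over `text`, scoring only the new tokens of each window."""
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
    seq_len = input_ids.size(1)

    nlls, prev_end = [], 0
    for begin in range(0, seq_len, stride):
        end = min(begin + max_length, seq_len)
        target_len = end - prev_end            # tokens not scored in a previous window
        ids = input_ids[:, begin:end]
        labels = ids.clone()
        labels[:, :-target_len] = -100         # ignore the overlapping prefix
        with torch.no_grad():
            loss = model(ids, labels=labels).loss   # mean NLL over the scored tokens
        nlls.append(loss * target_len)
        prev_end = end
        if end == seq_len:
            break
    return torch.exp(torch.stack(nlls).sum() / seq_len)
```
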
522 | ---
523 |
524 | ### **General Tips and Warnings for Training PRO:**
525 |
526 | * **Model Reload After Training:** After any training run (even if interrupted or failed), your base model becomes "dirty" in memory. You *must* go to the `text-generation-webui`'s "Model" tab and **reload your base model** before using it for inference or starting another training run.
527 | * **VRAM Management:** LoRA training, especially with higher ranks or `Train Head` enabled, can be very VRAM-intensive. Monitor your GPU usage. Reduce `micro_batch_size`, increase `Gradient Accumulation Steps`, decrease `cutoff_len`, or lower `LoRA Rank` if you hit VRAM limits.
528 | * **Disk Space:** LoRA checkpoints can accumulate quickly, and a full LoRA can be several GBs. Ensure you have enough disk space.
529 | * **Tokenizer Issues:** The tool attempts to handle common tokenizer problems (like missing PAD or EOS tokens), but a good, well-behaved tokenizer from your base model is always best. Pay attention to warnings in the console about tokenizers.
530 | * **Experimentation:** LoRA training often involves experimentation. Start with smaller datasets or fewer epochs to test your settings before committing to long, resource-intensive runs.
531 | * **The "Instruct" Tab / Console:** Keep an eye on the command line console where `text-generation-webui` is running. Training PRO outputs a lot of useful information, warnings, and debug messages there.
532 |
533 | This guide should give you a solid foundation for navigating and utilizing the powerful features of the "Training PRO" extension!
534 |
--------------------------------------------------------------------------------
/script.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | os.environ["WANDB_MODE"] = "offline"
4 | # os.environ["WANDB_DISABLED"] = "true"
5 |
6 | import warnings
7 |
8 | warnings.filterwarnings(action = "ignore", message="torch.utils.checkpoint:")
9 | warnings.filterwarnings(action = "ignore", message="`do_sample` is set to `False`")
10 | warnings.simplefilter(action='ignore', category=FutureWarning)
11 |
12 |
13 | import json
14 | import math
15 | import random
16 | import shutil
17 | import sys
18 | import threading
19 | import time
20 | import traceback
21 | from datetime import datetime
22 | from pathlib import Path
23 |
24 | import gradio as gr
25 | import pandas as pd
26 | import torch
27 | import transformers
28 |
29 | from functools import partial
30 |
31 | from .custom_scheduler import FPSchedulerTrainer, FPNEFtuneTrainer
32 |
33 | from .matplotgraph import create_graph
34 | from .train_utils import get_available_loras_local, precise_cut, sliding_block_cut, download_file_from_url
35 |
36 | # this keeps changing lately so it is now a variable
37 | TRAINING_DATASET_FOLDER = 'user_data/training/datasets'
38 | TRAINING_FORMATS_FOLDER = 'user_data/training/formats'
39 |
40 |
41 | import bitsandbytes as bnb
42 |
43 | from datasets import Dataset, load_dataset, DatasetDict
44 | from peft import (
45 | LoraConfig,
46 | get_peft_model,
47 | prepare_model_for_kbit_training,
48 | set_peft_model_state_dict
49 | )
50 | from peft.utils.other import \
51 | TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as model_to_lora_modules
52 | from transformers.models.auto.modeling_auto import (
53 | MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
54 | )
55 |
56 |
57 | from modules import shared, utils
58 | from modules.ui import create_refresh_button
59 |
60 | from modules.evaluate import (
61 | calculate_perplexity,
62 | generate_markdown_table,
63 | save_past_evaluations
64 | )
65 | from modules.logging_colors import logger
66 | from modules.models import reload_model, unload_model, load_model
67 | from modules.utils import natural_keys
68 |
69 | params = {
70 | "display_name": "Training PRO",
71 | "is_tab": True
72 | }
73 |
74 | non_serialized_params = {
75 | "debug_slicer": False,
76 | "Lora_sortedByTime": False,
77 | "stop_at_loss": 0,
78 | "stop_at_epoch": 0,
79 | "save_steps_under_loss": 0.0,
80 | "save_checkpoint_now": False,
81 | "training_loop": False,
82 | "current_stability": 0,
83 | "save_epochs": 0,
84 | "checkpoint_offset": 0,
85 | "epoch_offset":0,
86 | "safe_serialization": False,
87 | "dump_dataset": False,
88 | "dump_dataset_remove_s": True,
89 | }
90 |
91 | mapped_prompts = 0
92 |
93 | MODEL_CLASSES = {v[1]: v[0] for v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.items()}
94 |
95 | PARAMETERS = ["lora_name", "always_override", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after", "stop_at_loss", "add_eos_token", "min_chars", "report_to", "precize_slicing_overlap", "add_eos_token_type", "save_steps_under_loss", "add_bos_token", "training_projection","sliding_window","warmup_ratio","grad_accumulation","neft_noise_alpha","group_by_length","eliminate_long_blocks","stop_at_epoch","datasetJSONL", "eval_datasetJSONL", "eval_stepsJSONL","hybrid_training", "hybrid_data_ratio","hybrid_text_ratio","lora_RS","lora_RS_alpha","lora_modulessave","use_grad_checkpoint"]
96 | WANT_INTERRUPT = False
97 |
98 | train_log = {}
99 | train_template = {}
100 | train_log_graph = []
101 |
102 | train_choices = ["all","q-k-v-o","q-k-v","k-v-down","q-v"]
103 |
104 | statistics = {
105 | 'loss': [],
106 | 'lr': [],
107 | }
108 |
109 | RED = "\033[91m"
110 | YELLOW = "\033[93m"
111 | GREEN = "\033[92m"
112 | RESET = "\033[0m"
113 |
114 | def ui():
115 |
116 | with gr.Tab('Train LoRA', elem_id='lora-train-tab'):
117 | tmp = gr.State('')
118 | with gr.Row():
119 | with gr.Column():
120 | # YY.MM.DD
121 |                 gr.Markdown("`Ver: 25.05.20` This is an enhanced version of QLoRA training. [Maintained by FP](https://github.com/FartyPants/Training_PRO/tree/main)")
122 |
123 | with gr.Row():
124 | with gr.Column(scale=5):
125 | with gr.Row():
126 | copy_from = gr.Dropdown(label='Copy parameters from', value='None', choices=get_available_loras_local(non_serialized_params['Lora_sortedByTime']), elem_classes=['slim-dropdown'],allow_custom_value = True)
127 | create_refresh_button(copy_from, lambda: None, lambda: {'choices': get_available_loras_local(non_serialized_params['Lora_sortedByTime'])}, 'refresh-button')
128 | with gr.Column():
129 | sort_byTime = gr.Checkbox(label='Sort list by Date', value=False, info='Sorts Loras by date created.', elem_classes=['no-background'])
130 |
131 | with gr.Row():
132 | with gr.Column(scale=5):
133 | lora_name = gr.Textbox(label='Name', info='The name of your new LoRA file')
134 |
135 | with gr.Column():
136 | always_override = gr.Checkbox(label='Override Existing Files', value=False, info='If the name is the same, checking will replace the existing file, and unchecking will load and continue from it (the rank must be the same).', elem_classes=['no-background'])
137 |
138 | with gr.Row():
139 | with gr.Column():
140 | lora_rank = gr.Slider(label='LoRA Rank', value=32, minimum=0, maximum=1024, step=4, info='Also called dimension count. Higher values = larger file, more content control. Smaller values = smaller file, less control. Use 4 or 8 for style, 128 or 256 to teach, 1024+ for fine-detail on big data. More VRAM is needed for higher ranks.')
141 | lora_alpha = gr.Slider(label='LoRA Alpha', value=64, minimum=0, maximum=2048, step=4, info='Alpha determines Scaling of the LoRA. A good standard value is 1x-2x of Rank. scale = LORA_alpha/rank')
142 | with gr.Accordion(label='Rank Stabilised LoRA', open=False):
143 | with gr.Row():
144 | lora_RS = gr.Checkbox(label='Use rsLoRA', value=False, info='scale = rsLoRA_Alpha/sqrt(rank)')
145 | lora_RS_alpha = gr.Number(label='rsLoRA Alpha', value=16)
146 |
147 | batch_size = gr.Slider(visible= False, label='Batch Size', value=0, minimum=0, maximum=1024, step=4, info='Now Replaced with Gradient accumulation. Keeping it for sake of old saved data')
148 |                 micro_batch_size = gr.Slider(label='True Batch Size', value=4, minimum=1, maximum=128, step=1, info='Specifies how many text blocks per step will be trained. The higher the value, the more stable the training, but it requires more GPU memory and reduces speed.')
149 | grad_accumulation = gr.Slider(label='Gradient Accumulation Steps', value=1, minimum=1, maximum=256, step=1, info="Virtually multiplies the Batch Size by averaging the learning over more than one step. VRAM friendly. Evens out loss fluctuations but can also degrade training fidelity.")
150 |
151 | with gr.Column():
152 | epochs = gr.Number(label='Epochs', value=3, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.')
153 | learning_rate = gr.Textbox(label='Learning Rate', value='3e-4', info='In scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.')
154 | lr_scheduler_type = gr.Dropdown(label='LR Scheduler', value='linear', choices=['linear', 'constant', 'constant_with_warmup', 'cosine', 'cosine_with_restarts', 'polynomial', 'inverse_sqrt', 'FP_low_epoch_annealing', 'FP_half_time_annealing','FP_raise_fall_creative','FP_3epoch_raise_hold_fall','FP_step_decay_with_warmup'], info='Learning rate scheduler - defines how the learning rate changes over time. (FP_ = my Own Custom schedulers)', elem_classes=['slim-dropdown'])
155 |
156 | with gr.Accordion(label='Checkpoints', open=True):
157 | with gr.Row():
158 | with gr.Column():
159 | save_steps = gr.Number(label='Save every n steps', value=0, info='A checkpoint will be saved every n steps and at each Epoch boundary. (0 = OFF)')
160 | with gr.Column():
161 |                             save_steps_under_loss = gr.Slider(label='Save at 10% Loss change', value=1.8, minimum=0.0, maximum=3.0, step=0.1, info="Saves checkpoints at (or below) this loss and then each time the loss falls by at least 10%. This works independently from 'Save every n steps'")
162 | with gr.Row():
163 | save_chackpoint_now = gr.Button('Queue Checkpoint Now')
164 | with gr.Accordion(label ='Stops (can be changed during training)',open = True):
165 | with gr.Row():
166 | with gr.Column():
167 | stop_at_loss = gr.Slider(label='Stop at loss', minimum=0.0, maximum=3.0, step=0.1, value=0.00, info='If non 0 the process will automatically stop once the desired loss value is reached.')
168 | with gr.Column():
169 | stop_at_epoch = gr.Slider(label='Stop at Epoch', minimum=0, maximum=20, step=1, value=0, info='If non 0 the process will stop early once the set epoch is reached.')
170 |
171 | with gr.Accordion(label='Advanced Options', open=True):
172 | with gr.Row():
173 | with gr.Column():
174 |                             warmup_steps = gr.Number(label='Warmup Steps', value=100, info='Number of max steps used for a linear warmup. Reduces early over-fitting caused by the first training blocks. Value has precedence over Warmup Ratio. Aligns to the closest multiple of gradient accumulation')
175 |                             warmup_ratio = gr.Slider(label='Warmup Ratio', minimum=0.0, maximum=0.2, step=0.025, value=0.0, info='Ratio of total training steps that will be used for a linear warmup. It applies only if Warmup Steps is 0.')
176 | neft_noise_alpha = gr.Slider(label='NEFtune noise scale', minimum=0.0, maximum=15, step=1, value=0.0, info='Add noise to the training to improve generalization. [0 - OFF, Starting value to experiment: 5]')
177 | training_projection = gr.Radio(value = train_choices[4], label='LLaMA Target Projections', info='Change the targets (LORA is typically q-v)', choices=train_choices)
178 | with gr.Accordion(label ='Continued Pretraining',open = False):
179 | with gr.Row():
180 | lora_modulessave = gr.Checkbox(label='Train Head', value=False, info='Train lm_head and embed_tokens')
181 | gr.Markdown('If you use Train Head, you should use 8-bit AdamW optimizer (paged_adamw_8bit), or your puny VRAM will explode. With 4-bit BnB and Rank 16 you COULD pretrain 8B model on 24GB VRAM.')
182 |                             use_grad_checkpoint = gr.Checkbox(label='Use Gradient Checkpoint', value=False, info='Reduces memory usage but increases computation time')
183 | lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers. This can help reduce overfitting. Most users should leave at default.')
184 | optimizer = gr.Dropdown(label='Optimizer', value='adamw_torch', choices=['adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_torch_xla', 'adamw_apex_fused', 'adafactor', 'adamw_bnb_8bit', 'adamw_anyprecision', 'sgd', 'adagrad','adamw_8bit','paged_adamw_8bit'], info='Different optimizer implementation options, for advanced users. Effects of different options are not well documented yet.', elem_classes=['slim-dropdown'])
185 |
186 | with gr.Column():
187 | train_only_after = gr.Textbox(label='Train Only After', value='', info='Only consider text *after* this string in any given chunk for training. For Alpaca datasets, use "### Response:" to only train the response and ignore the input.')
188 | add_bos_token = gr.Checkbox(label='Add BOS token', value=True, info="Adds BOS token to each item. (Should be always ON)")
189 | add_eos_token = gr.Checkbox(label='Add EOS token', value=True, info="Adds EOS token for each JSON/Text item. JSONL is controlled by Instruct Template")
190 | add_eos_token_type = gr.Dropdown(label='EOS placement (Text file)', choices=['Every Block', 'Hard Cut Blocks Only'], value='Every Block', info='', allow_custom_value = False)
191 | group_by_length = gr.Checkbox(label='Group Samples by Length', value=False, info='Group together samples of roughly the same length in the training dataset.')
192 |                             eliminate_long_blocks = gr.Checkbox(label='Eliminate cutoff blocks', value=False, info='Instead of just trimming blocks at cutoff, eliminate them from the dataset altogether if they are too long.')
193 | higher_rank_limit = gr.Checkbox(label='Enable higher ranks', value=False, info='If checked, changes Rank/Alpha slider above to go much higher. This will not work without a datacenter-class GPU.')
194 | report_to = gr.Radio(label="Save detailed logs with", value="None", choices=["None", "wandb", "tensorboard"], interactive=True)
195 | # for future
196 | #with gr.Accordion(label='Dynamic Scheduler', open = False):
197 | # ds_min_epochs = gr.Number(label='Minimum Epochs', value='1', info='Minimum epochs that will be always performed before ramp down can be triggered')
198 | # ds_max_epochs = gr.Number(label='Maximum Epochs (fallback)', value='50', info='Maximum Epochs before the training will bail out completely (should be a large number)')
199 | # ds_loss_trigger = gr.Slider(label='Trigger Loss', minimum=0.0, maximum=2.8, step=0.1, value=1.6, info='Loss at which the ramp down schedule will be triggered')
200 | # ds_loss_rolling_window = gr.Number(label='Loss rolling average', value='4', info='Calculate loss by averaging last x numbers to avoid jumps and noise')
201 |                             # ds_epochs_to_ramp = gr.Slider(label='Ramp down ratio', minimum=0.0, maximum=2.0, step=0.1, value=1.00, info='How long the ramp down will last relative to elapsed steps (before trigger)')
202 |                             # gr.Markdown('These are settings for FP_dynamic_loss_trigger scheduler. The scheduler will do warm up, then hold constant until the loss falls under Trigger Loss, then it will commence a linear ramp down schedule and stop. The length of the ramp down is set by Ramp down ratio where (ramp down steps) = ratio * (elapsed steps). (The time to completion shown will be very high until ramp down is triggered.)')
203 |
204 |
205 | with gr.Column():
206 | with gr.Tab(label='JSON Dataset'):
207 | with gr.Row():
208 | with gr.Column():
209 | with gr.Row():
210 | dataset = gr.Dropdown(choices=get_datasets(TRAINING_DATASET_FOLDER, 'json'), value='None', label='Dataset', info='The flexible dataset JSON file to use for training.', allow_custom_value=True, elem_classes=['slim-dropdown'])
211 | create_refresh_button(dataset, lambda: None, lambda: {'choices': get_datasets(TRAINING_DATASET_FOLDER, 'json')}, 'refresh-button')
212 | with gr.Row():
213 | eval_dataset = gr.Dropdown(choices=get_datasets(TRAINING_DATASET_FOLDER, 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', allow_custom_value=True, elem_classes=['slim-dropdown'])
214 | create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': get_datasets(TRAINING_DATASET_FOLDER, 'json')}, 'refresh-button')
215 | with gr.Column():
216 | with gr.Row():
217 | format = gr.Dropdown(choices=get_datasets(TRAINING_FORMATS_FOLDER, 'json'), value='None', label='Data Format', info='The format file used to decide how to format the JSON dataset input.', elem_classes=['slim-dropdown'])
218 | create_refresh_button(format, lambda: None, lambda: {'choices': get_datasets(TRAINING_FORMATS_FOLDER, 'json')}, 'refresh-button')
219 | with gr.Row():
220 | eval_steps = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation dataset is given, test it every time this many steps pass.')
221 | with gr.Tab(label='JSONL Dataset'):
222 | with gr.Row():
223 | with gr.Column():
224 | with gr.Row():
225 | datasetJSONL = gr.Dropdown(choices=get_datasets(TRAINING_DATASET_FOLDER, 'jsonl'), value='None', label='JSONL Dataset', info='JSONL dataset file to use for training. See OpenAI documentation.', allow_custom_value=True, elem_classes=['slim-dropdown'])
226 | create_refresh_button(datasetJSONL, lambda: None, lambda: {'choices': get_datasets(TRAINING_DATASET_FOLDER, 'jsonl')}, 'refresh-button')
227 | with gr.Row():
228 | eval_datasetJSONL = gr.Dropdown(choices=get_datasets(TRAINING_DATASET_FOLDER, 'jsonl'), value='None', label='JSONL Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', allow_custom_value=True, elem_classes=['slim-dropdown'])
229 | create_refresh_button(eval_datasetJSONL, lambda: None, lambda: {'choices': get_datasets(TRAINING_DATASET_FOLDER, 'jsonl')}, 'refresh-button')
230 | with gr.Column():
231 | with gr.Row():
232 | gr.Markdown('The format will be chosen automatically from the chat template in tokenizer. If the tokenizer doesn\'t have chat template defined (legacy), select the correct template in the WebUI [Parameters - Instruction template]')
233 | with gr.Row():
234 | eval_stepsJSONL = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation JSONL dataset is given, test it every time this many steps pass.')
235 |
236 | with gr.Tab(label="Text file"):
237 | with gr.Row():
238 | raw_text_file = gr.Dropdown(choices=get_datasets(TRAINING_DATASET_FOLDER, 'txt'), value='None', label='Text file', info='The text file to use for training.', allow_custom_value=True, elem_classes=['slim-dropdown'])
239 | create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': get_datasets(TRAINING_DATASET_FOLDER, 'txt')}, 'refresh-button')
240 |
241 | with gr.Row():
242 | with gr.Column():
243 | precize_slicing_overlap = gr.Checkbox(label='Add Overlapping blocks', value = True)
244 | sliding_window = gr.Checkbox(label='DEMENTOR Long-form Learning by FP (Highly Experimental, use low epochs)', value = False, info='Deep Memorization Enforcement Through Overlapping and Repetition. (I named it, so shush). Special process for learning long-form text using low amount of epochs.')
245 | #debug_slicer = gr.Checkbox(label='Dump sentencelist.json to logs', value = non_serialized_params['debug_slicer'], info='Debug Slicer')
246 |
247 | with gr.Column():
248 | hard_cut_string = gr.Textbox(label='Hard Cut String', value='\\n\\n\\n', info='String that indicates a cut between logical blocks of text (ex. Ideas or Chapters). Helps prevent unwanted overlap between unrelated ideas.')
249 | min_chars = gr.Number(label='Ignore small blocks', value=0, info='Ignore Text blocks that have less or equal characters than this number.')
250 | with gr.Tab(label="Hybrid"):
251 | hybrid_training = gr.Checkbox(label='Hybrid Training (Experimental)', value = False, info = 'Train using Raw text file AND JSON or JSONL dataset at the same time.')
252 | with gr.Row():
253 | hybrid_data_ratio = gr.Slider(value = 100, minimum=0, maximum=100,label='Percentage of Dataset used')
254 | hybrid_text_ratio = gr.Slider(value = 100, minimum=0, maximum=100,label='Percentage of Text file used')
255 |                 gr.Markdown('This is an experimental hybrid training using both instruct and non-instruct data at once. You need to select a Raw Text file AND a JSON or JSONL dataset.\n\nOptionally you can set the percentage of dataset / text used to dial in the desired model response.')
256 | with gr.Tab(label="URL"):
257 | with gr.Row():
258 | with gr.Column():
259 |                         download_file_url = gr.Textbox(label='Download JSON or txt file to datasets (or formats) folder', value='',info='The URL of a file to download. If on GitHub, make sure you get the URL of the raw file (https://raw.githubusercontent.com/...). If on Hugging Face, make sure the URL has /resolve/ in it, not /blob/')
260 | with gr.Row():
261 |                             download_check_overwrite = gr.Checkbox(label='Overwrite', value=False, info='Overwrite if the file exists')
262 | download_folder = gr.Radio(label="Destination", value=TRAINING_DATASET_FOLDER, choices=[TRAINING_DATASET_FOLDER, TRAINING_FORMATS_FOLDER], interactive=True)
263 | download_button = gr.Button('Download')
264 | download_status = gr.Textbox(label='Download Status', value='', interactive=False)
265 | with gr.Tab(label="Tools"):
266 | with gr.Row():
267 | with gr.Column():
268 |                         split_dataset_perc = gr.Number(label='Evaluation dataset split (percentage)', value=10, info='Splits JSON dataset into _train and _eval files by the split percentage. Make sure the JSON is selected in the JSON Dataset tab first.')
269 | split_dataset_do = gr.Button('Split dataset')
270 | with gr.Column():
271 | convert_system = gr.Textbox(label = 'Convert JSON to JSONL', info = 'Select JSON in JSON Dataset tab and add System Message:', value='You are a helpful AI assistant.', lines=2)
272 | convert_do = gr.Button('Convert JSON to JSONL')
273 | with gr.Row():
274 | with gr.Column():
275 | convert_system2 = gr.Textbox(label = 'Simple TXT to JSONL conversion', info = 'Select TXT in Text File tab. Each item in txt should be separated by at least 3 empty lines. Enter system message:', value='You are a helpful AI assistant.', lines=1)
276 | convert_prompt2 = gr.Textbox(label = 'Prompt', info = 'Prompt that will be inserted for every item', value='Write me a limerick.', lines=1)
277 | convert_do2 = gr.Button('Convert TXT to JSONL')
278 | with gr.Column():
279 | dump_dataset = gr.Checkbox(label='Dump Training Dataset', value=False, info='Just before training begins, decode and dump the entire dataset into JSON file in /logs/')
280 |                         dump_dataset_remove_s = gr.Checkbox(label='Clean up dump dataset', value=True, info='Removes BOS and EOS from the dump dataset')
281 | with gr.Row():
282 | with gr.Column():
283 | with gr.Row():
284 | cutoff_len = gr.Slider(label='Maximum context length (Cutoff)', minimum=32, maximum=4096, value=256, step=32, info='The maximum length of a chunk (in tokens). Applies to both JSON dataset and text files. Higher values require much more VRAM.')
285 | with gr.Row():
286 | with gr.Column():
287 | check_dataset_btn = gr.Button('Verify Dataset/Text File and suggest data entries')
288 | check_dataset_txt = gr.Textbox(label='Dataset info', value='')
289 |
290 | with gr.Row():
291 | start_button = gr.Button("Start LoRA Training", variant='primary')
292 | stop_button = gr.Button("Interrupt")
293 |
294 | with gr.Accordion(label="Graph", open=True):
295 | with gr.Row():
296 | # show_actions_button = False - we use old gradio
297 | plot_graph = gr.LinePlot(x="epoch", y="value", title="Loss Metrics", overlay_point=True, tooltip=["epoch", "value"], x_lim=[0, 1], y_lim=[0, 3.5], width=500, height=250)
298 |
299 | output = gr.Markdown(value="Ready")
300 |
301 | with gr.Tab('Perplexity evaluation', elem_id='evaluate-tab'):
302 | with gr.Row():
303 | with gr.Column():
304 | models = gr.Dropdown(utils.get_available_models(), label='Models', multiselect=True)
305 | evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + get_datasets(TRAINING_DATASET_FOLDER, 'txt')[1:], value='wikitext', label='Input dataset', info='The text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under dataset folder.')
306 | with gr.Row():
307 | with gr.Column():
308 | stride_length = gr.Slider(label='Stride', minimum=1, maximum=2048, value=512, step=1, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.')
309 |
310 | with gr.Column():
311 | max_length = gr.Number(label='max_length', precision=0, step=256, value=0, info='The context for each evaluation. If set to 0, the maximum context length for the model will be used.')
312 |
313 | with gr.Row():
314 | start_current_evaluation = gr.Button("Evaluate loaded model")
315 | start_evaluation = gr.Button("Evaluate selected models")
316 | stop_evaluation = gr.Button("Interrupt")
317 |
318 | with gr.Column():
319 | evaluation_log = gr.Markdown(value='')
320 |
321 | evaluation_table = gr.Dataframe(value=generate_markdown_table(), interactive=True)
322 | with gr.Row():
323 | save_comments = gr.Button('Save comments', elem_classes="small-button")
324 | refresh_table = gr.Button('Refresh the table', elem_classes="small-button")
325 |
326 | # Training events
327 | all_params = [lora_name, always_override, save_steps, micro_batch_size, batch_size, epochs, learning_rate, lr_scheduler_type, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format, eval_steps, raw_text_file, higher_rank_limit, warmup_steps, optimizer, hard_cut_string, train_only_after, stop_at_loss, add_eos_token, min_chars, report_to, precize_slicing_overlap, add_eos_token_type, save_steps_under_loss, add_bos_token, training_projection,sliding_window,warmup_ratio,grad_accumulation, neft_noise_alpha,group_by_length,eliminate_long_blocks,stop_at_epoch, datasetJSONL, eval_datasetJSONL, eval_stepsJSONL, hybrid_training, hybrid_data_ratio, hybrid_text_ratio,lora_RS,lora_RS_alpha,lora_modulessave,use_grad_checkpoint]
328 |
329 | def fix_old_version(batch_size_val,micro_batch_size_val, grad_accumulation_val):
330 | if batch_size_val>0:
331 | gradient_acc = batch_size_val // micro_batch_size_val
332 | print(f"Using Old version of Batch Size ({batch_size_val}) to set Gradient Accumulation: {gradient_acc}")
333 | return gradient_acc
334 |
335 | return grad_accumulation_val
336 |
337 |
338 | copy_from.change(partial(do_copy_params, all_params= all_params), copy_from, all_params).then(fix_old_version,[batch_size,micro_batch_size, grad_accumulation],grad_accumulation)
339 | start_button.click(do_train, all_params, [output,plot_graph])
340 | stop_button.click(do_interrupt, None, None, queue=False)
341 | higher_rank_limit.change(change_rank_limit, [higher_rank_limit], [lora_rank, lora_alpha])
342 |
343 | def trigger_stop_at_loss(stop_at_loss_value):
344 | non_serialized_params.update({"stop_at_loss": stop_at_loss_value})
345 | if non_serialized_params['training_loop']:
346 | print(f"Queue: [Stop at loss Change] to {stop_at_loss_value}")
347 |
348 | def trigger_stop_at_epoch(stop_at_epoch_value):
349 | non_serialized_params.update({"stop_at_epoch": stop_at_epoch_value})
350 | if non_serialized_params['training_loop']:
351 | print(f"Queue: [Stop at Epoch Change] to {stop_at_epoch_value}")
352 |
353 | stop_at_loss.change(trigger_stop_at_loss, stop_at_loss, None)
354 | stop_at_epoch.change(trigger_stop_at_epoch, stop_at_epoch, None)
355 |
356 | def trigger_save_checkpoint():
357 | non_serialized_params.update({"save_checkpoint_now": True})
358 | if non_serialized_params['training_loop']:
359 | print("Queue: [Save checkpoint] Checkpoint will be saved after the current step is finished.")
360 | else:
361 | print("Use during the training to save the checkpoint at any time.")
362 |
363 |
364 | def update_button():
365 | return gr.Button.update('[Checkpoint in Queue]', variant='stop', interactive=True)
366 |
367 | def update_button2():
368 | time.sleep(1.0)
369 | return gr.Button.update('Queue Checkpoint Now', variant='secondary',interactive = True)
370 |
371 | save_chackpoint_now.click(trigger_save_checkpoint, None, None).then(update_button, None,save_chackpoint_now).then(update_button2, None,save_chackpoint_now)
372 |
373 | dataset_calc_params = [save_steps,micro_batch_size, epochs, cutoff_len, dataset, format, raw_text_file, warmup_steps, hard_cut_string, min_chars, precize_slicing_overlap,sliding_window,warmup_ratio,grad_accumulation, datasetJSONL, hybrid_training, hybrid_data_ratio, hybrid_text_ratio]
374 |
375 | def check_dataset(save_steps:int, micro_batch_size: int, epochs: int, cutoff_len: int, dataset:str, format:str, raw_text_file:str, warmup_steps:int, hard_cut_string:str, min_chars:int, precize_slicing_overlap:bool,sliding_window:bool,warmup_ratio:float,grad_accumulation:int, datasetJSONL:str, hybrid_training:bool, hybrid_data_ratio:int, hybrid_text_ratio:int):
376 |         result = "Specify JSON dataset or Text file"
377 | total_blocks = 0
378 | if shared.tokenizer is None:
379 | yield "Tokenizer is not available. Please Load some Model first."
380 | return
381 |
382 | # hybrid training hybrid_training
383 | raw_text_used = False
384 | hybrid_text_train_data = None
385 | max_length_tokens = 0
386 | hybrid_total_text_blocks = 0
387 | totl_size_in_tokens = 0
388 |
389 | if hybrid_training == True:
390 | print(f" === {RED}Hybrid Training{RESET} ===")
391 | if raw_text_file not in ['None', '']:
392 | if datasetJSONL not in ['None', '']:
393 | print(f" - Raw text + JSONL")
394 | elif dataset not in ['None', '']:
395 | print(f" - Raw text + JSON")
396 | else:
397 | print(f" - {RED}Error:{RESET} for Hybrid training you need Raw text AND JSONL or JSON dataset")
398 | yield "Missing dataset and raw file for hybrid training, cannot continue."
399 | return
400 |
401 | else:
402 | print(f" - {RED}Error:{RESET} for Hybrid training you need JSONL or JSON dataset AND Raw text file.")
403 | yield "Missing dataset and raw file for hybrid training, cannot continue."
404 | return
405 |
406 | if raw_text_file not in ['None', '']:
407 | logger.info("Loading Text file...")
408 | fullpath = clean_path(TRAINING_DATASET_FOLDER, f'{raw_text_file}')
409 | fullpath = Path(fullpath)
410 | if fullpath.is_dir():
411 | logger.info('Training path directory {}'.format(raw_text_file))
412 | raw_text = ""
413 | file_paths = sorted(fullpath.glob('*.txt'), key=lambda path: natural_keys(path.name))
414 | for file_path in file_paths:
415 | if file_path.is_file():
416 | with file_path.open('r', encoding='utf-8') as file:
417 | raw_text += file.read().replace('\r', '')
418 |
419 | logger.info(f"Loaded training file: {file_path.name}")
420 | else:
421 | try:
422 | with open(clean_path(TRAINING_DATASET_FOLDER, f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file:
423 | raw_text = file.read().replace('\r', '')
424 | except:
425 |                 yield f"{raw_text_file}.txt doesn't seem to exist anymore... check your {TRAINING_DATASET_FOLDER} folder"
426 | return
427 |
428 |
429 | if min_chars<0:
430 | min_chars = 0
431 |
432 | EOS_token_str = ''
433 | BOS_token_str = ''
434 |
435 | if hasattr(shared.tokenizer, 'bos_token'):
436 | BOS_token_str = shared.tokenizer.bos_token
437 | else:
438 | print(f" - No {RED}BOS{RESET} token defined in tokenizer, using default")
439 |
440 | if hasattr(shared.tokenizer, 'eos_token'):
441 | EOS_token_str = shared.tokenizer.eos_token
442 | else:
443 | print(f" - No {RED}EOS{RESET} token defined in tokenizer, using default")
444 |
445 |
446 | print(f"Tokenizer BOS token: {GREEN}{BOS_token_str}{RESET}, EOS token: {RED}{EOS_token_str}{RESET}")
447 | # == New more precise slicing on sentence boundary ==
448 | if sliding_window:
449 | text_chunks = sliding_block_cut(raw_text, min_chars, False, cutoff_len, hard_cut_string,non_serialized_params['debug_slicer'],EOS_token_str,BOS_token_str)
450 | else:
451 | text_chunks = precise_cut(raw_text, precize_slicing_overlap, min_chars, False, cutoff_len, hard_cut_string,non_serialized_params['debug_slicer'],EOS_token_str,BOS_token_str)
452 |
453 |
454 | total_blocks = len(text_chunks)
455 |
456 | hybrid_total_text_blocks = total_blocks
457 |
458 | if hybrid_training==False:
459 | raw_text_used = True
460 |
461 | max_length = 0
462 | max_text = ''
463 | # calculate total size
464 | total_size = 0
465 | for example in text_chunks:
466 | if len(example) > max_length:
467 | max_length = len(example)
468 | max_text = example
469 | total_size += len(example)
470 |
471 | input_ids = shared.tokenizer.encode(max_text, truncation=True, max_length=8192)
472 |
473 |             # rough token-count estimate for English text
474 | totl_size_in_tokens = total_size*1.53
475 |
476 | result = f"Text: ({raw_text_file}.txt) has {total_blocks} blocks (Block Size {cutoff_len} tokens)"
477 | result += f"\nLongest Plain Text Block: {len(input_ids)+1}"
478 |
479 | if hybrid_training == True:
480 | num_text_to_keep = int(total_blocks * float(hybrid_text_ratio) / 100.0)
481 | result += f"\nUsing {hybrid_text_ratio}% of text: ({num_text_to_keep}/{total_blocks}) blocks"
482 | hybrid_total_text_blocks = num_text_to_keep
483 |
484 | #no suggestion for plaintext as it is set by cutoff_len anyway
485 | max_length_tokens = 0
486 |
487 | del text_chunks
488 |
489 | # datasets
490 | if raw_text_used == False:
491 | data = None
492 | format_data: dict[str, str] = {}
493 | format_text = ''
494 |
495 | if datasetJSONL not in ['None', '']:
496 |
497 | logger.info("Loading JSONL datasets...")
498 |
499 | with open(clean_path(TRAINING_DATASET_FOLDER, f'{datasetJSONL}.jsonl'), 'r', encoding='utf-8-sig') as dataFile:
500 | loaded_JSONLdata = json.load(dataFile)
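    |                 # note: the whole .jsonl file is read with json.load, so it is expected to
    |                 # contain a single JSON array of {"messages": [...]} entries rather than
    |                 # line-delimited JSON objects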
501 |
502 |
503 | chat_template = shared.tokenizer.chat_template
504 | format_text = "Template: [Embedded]"
505 | if shared.tokenizer.chat_template is None or shared.tokenizer.chat_template =='':
506 | print(f"{RED}Missing chat template in tokenizer. Using instruction_template instead{RESET}")
507 | shared.tokenizer.chat_template = shared.persistent_interface_state['instruction_template_str']
508 | format_text = "Template: [Missing] << using instruction template instead"
509 |
510 | logger.info("Applying chat template")
511 | data_list = [{"jsonl": shared.tokenizer.apply_chat_template(entry["messages"], tokenize=False, add_generation_prompt=False)} for entry in loaded_JSONLdata]
512 |
513 | shared.tokenizer.chat_template = chat_template
514 | data = DatasetDict()
515 | data['train'] = Dataset.from_list(data_list)
516 | format_data = {"jsonl": "%jsonl%"}
517 |
518 | else:
519 | if dataset in ['None', '']:
520 | yield "Select dataset or text file."
521 | return
522 |
523 | if format in ['None', '']:
524 | yield "Select format choice for dataset."
525 | return
526 |
527 | if shared.tokenizer.pad_token_id is None:
528 | print("Missing pad ID - setting to 0")
529 | shared.tokenizer.pad_token_id = 0
530 |
531 | with open(clean_path(TRAINING_FORMATS_FOLDER, f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile:
532 | format_data: dict[str, str] = json.load(formatFile)
533 |
534 | format_text = f'Format: [JSON] {format}'
535 |
536 | logger.info("Loading JSON datasets...")
537 |
538 | data = load_dataset("json", data_files=clean_path(TRAINING_DATASET_FOLDER, f'{dataset}.json'))
539 |
540 | def generate_prompt(data_point: dict[str, str]):
541 | for options, data in format_data.items():
542 | if set(options.split(',')) == set(x[0] for x in data_point.items() if (type(x[1]) is str and len(x[1].strip()) > 0)):
543 | for key, val in data_point.items():
544 | if type(val) is str:
545 | data = data.replace(f'%{key}%', val)
546 | return data
547 | raise RuntimeError(f'Data-point "{data_point}" has no keyset match within format "{list(format_data.keys())}"')
548 |
549 | def tokenize_dummy(prompt):
550 |
551 | input_ids = shared.tokenizer.encode(prompt, truncation=True, max_length=8192)
552 | labels = [1] * len(input_ids)
553 | input_ids = torch.tensor(input_ids)
554 | pad_token_id = shared.tokenizer.pad_token_id
555 | return {
556 | "input_ids": input_ids,
557 | "labels": labels,
558 | "attention_mask": input_ids.ne(pad_token_id),
559 | }
560 |
561 | def generate_and_tokenize_prompt(data_point):
562 | prompt = generate_prompt(data_point)
563 | return tokenize_dummy(prompt)
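    |             # tokenize_dummy only measures block lengths for the statistics below;
    |             # the real tokenization (padding, labels, EOS/BOS handling) happens in do_train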
564 |
565 |
566 | data_keys = []
567 |
568 | if data:
569 | if 'train' in data: # Check if the 'train' split exists in the dataset
570 | data_keys = list(data['train'][0].keys())
571 | print("Data Keys:", data_keys)
572 | else:
573 |                     print("The dataset has no 'train' split.")
574 |
575 | if shared.tokenizer.pad_token_id is None:
576 | print("Missing pad ID - setting to 0")
577 | shared.tokenizer.pad_token_id = 0
578 |
579 | train_data = data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30))
580 | total_blocks = train_data.num_rows
581 |
582 | max_length = 0
583 | second_max_length = 0
584 | total_size_tk = 0
585 |
586 | for example in train_data:
587 | length = len(example['input_ids'])
588 | total_size_tk += length
589 | if length > max_length:
590 | second_max_length = max_length
591 | max_length = length
592 | elif length > second_max_length:
593 | second_max_length = length
594 |
595 | max_length_tokens = max_length
596 | totl_size_in_tokens = totl_size_in_tokens + total_size_tk
597 |
598 | if hybrid_training:
599 | result = result+'\n'
600 | else:
601 | result = ''
602 |
603 |
604 | result += f"Dataset: ({dataset}.json) has {total_blocks} blocks @ length = {cutoff_len} tokens\nKeys: {data_keys} {format_text}"
605 | result += f"\nLongest Data Block: {max_length_tokens} tokens. Second Longest Block: {second_max_length} tokens."
606 |
607 | if hybrid_training == True:
608 | num_data_to_keep = int(total_blocks * float(hybrid_data_ratio) / 100.0)
609 | result += f"\nUsing {hybrid_data_ratio}% of dataset: ({num_data_to_keep}/{total_blocks}) blocks"
610 | total_blocks = num_data_to_keep
611 |
612 | #for options, data in format_data.items():
613 | # format_keys = options.split(',')
614 | # result += f"{format_keys}, "
615 | #result = result.rstrip()
616 | #result = result.rstrip(',')
617 |
618 | if total_blocks>0:
619 |
620 | if hybrid_training == True:
621 | total_blocks = hybrid_total_text_blocks + total_blocks
622 | result += f"\n[Total number of Hybrid blocks: {total_blocks}]"
623 |
624 |
625 | result += f"\n[Total Number of Tokens (sum): {totl_size_in_tokens}]"
626 |
627 | number_ofSteps = int(math.ceil(total_blocks / micro_batch_size) * epochs)
628 | num_stepsPer_epoch = int(math.ceil(number_ofSteps/epochs))
629 | min_warm = math.ceil(100 / grad_accumulation)
630 |
631 | warmup_steps_suggest = min(int(min_warm*grad_accumulation), int(math.ceil(number_ofSteps * 0.1)))
632 | warmup_steps_suggest = min(warmup_steps_suggest,num_stepsPer_epoch)
633 |
634 | save_each_n_min = int(math.ceil(number_ofSteps/10))
635 | save_each_n_max = int(math.ceil(number_ofSteps/5))
636 | gradient_accumulation_max = int(total_blocks)//micro_batch_size
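    |             # Worked example with assumed values: 1000 blocks, micro_batch_size=4, epochs=2,
    |             # grad_accumulation=5 -> 500 total steps, 250 steps per epoch, suggested warmup
    |             # min(min(20*5, ceil(500*0.1)), 250) = 50, checkpoints every 50-100 steps, and
    |             # gradient accumulation should stay below 1000//4 = 250.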
637 |
638 | result += f"\n[Batch Size: {micro_batch_size}, Epochs: {epochs}, Gradient Accumulation: {grad_accumulation}]\n"
639 | result += f"Total number of steps: {number_ofSteps}\n"
640 | result += f"Steps per each Epoch: {num_stepsPer_epoch}\n"
641 | result += f"Suggestions:\n"
642 |
643 | if max_length_tokens>0:
644 | next_max_multiple = ((max_length_tokens + 31) // 32) * 32
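    |                     # e.g. a longest block of 301 tokens suggests a context length of (301+31)//32*32 = 320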
645 | result += f"Maximum context length: {next_max_multiple} (Current: {cutoff_len})\n"
646 |
647 | result += f"Checkpoints: Save every {save_each_n_min} - {save_each_n_max} steps (Current: {int(save_steps)})\n"
648 | result += f"Warmup steps: {warmup_steps_suggest} (Current: {int(warmup_steps)})"
649 |
650 |
651 |
652 | if gradient_accumulation_max < grad_accumulation:
653 | result += f"\n\nWARNING: Gradient Accumulation {grad_accumulation} is too high: It should be below {gradient_accumulation_max}"
654 |
655 |
656 | result = result.strip()
657 |
658 | yield result
659 | return
660 |
661 | check_dataset_btn.click(check_dataset, dataset_calc_params ,check_dataset_txt)
662 |
663 | # Evaluation events. For some reason, the interrupt event
664 | # doesn't work with the .then() syntax, so I write them one
665 | # by one in this ugly but functional way.
666 | ev = start_evaluation.click(calculate_perplexity, [models, evaluate_text_file, stride_length, max_length], evaluation_log, show_progress=False)
667 | start_evaluation.click(generate_markdown_table, None, evaluation_table, show_progress=False)
668 |
669 | start_current_evaluation.click(lambda: ['current model'], None, tmp)
670 | ev_cur = start_current_evaluation.click(calculate_perplexity, [tmp, evaluate_text_file, stride_length, max_length], evaluation_log, show_progress=False)
671 | start_current_evaluation.click(generate_markdown_table, None, evaluation_table, show_progress=False)
672 |
673 | stop_evaluation.click(None, None, None, cancels=[ev, ev_cur], queue=False)
674 | refresh_table.click(generate_markdown_table, None, evaluation_table, show_progress=True)
675 | save_comments.click(
676 | save_past_evaluations, evaluation_table, None).then(
677 | lambda: "Comments saved.", None, evaluation_log, show_progress=False)
678 |
679 | def reload_lora():
680 | return gr.Dropdown.update(choices=get_available_loras_local(non_serialized_params['Lora_sortedByTime']))
681 |
682 | # nonserialized items
683 |
684 | sort_byTime.change(lambda x: non_serialized_params.update({"Lora_sortedByTime": x}), sort_byTime, None).then(reload_lora,None,copy_from)
685 | #debug_slicer.change(lambda x: non_serialized_params.update({"debug_slicer": x}), debug_slicer, None)
686 |
687 | def update_dataset():
688 | return gr.update(choices=get_datasets(TRAINING_DATASET_FOLDER, 'json')), gr.update(choices=get_datasets(TRAINING_DATASET_FOLDER, 'txt'))
689 |
690 | download_button.click(download_file_from_url, [download_file_url,download_check_overwrite,download_folder] , download_status).then(update_dataset,None,[dataset , raw_text_file])
691 |
692 | def update_datasetJSON():
693 | return gr.update(choices=get_datasets(TRAINING_DATASET_FOLDER, 'json')), gr.update(choices=get_datasets(TRAINING_DATASET_FOLDER, 'json'))
694 |
695 |
696 | def split_dataset(dataset, split_dataset_perc):
697 |
698 | if dataset == 'None' or dataset == '':
699 | print("No dataset selected in Formatted Datasets")
700 | return
701 |
702 | # Load the original JSON data
703 |         logger.info("Splitting JSON dataset into train/eval...")
704 |
705 | dataset_json_new = f'{dataset}_train.json'
706 | eval_json_new = f'{dataset}_eval.json'
707 |
708 | dataset_json = f'{dataset}.json'
709 |
710 |
711 | with open(clean_path(TRAINING_DATASET_FOLDER, dataset_json), 'r', encoding='utf-8-sig') as f:
712 | data = json.load(f)
713 |
714 | # Define the split ratio (e.g., 80% for training, 20% for evaluation)
715 | split_ratio = 1.0 - float(split_dataset_perc)/100.0
716 | total_samples = len(data)
717 | split_index = int(total_samples * split_ratio)
718 | print(f" + training: {split_index} blocks")
719 | print(f" + eval: {total_samples - split_index} blocks")
720 | # Shuffle the data to ensure randomness
721 | random.shuffle(data)
722 |
723 | # Split the data into training and evaluation sets
724 |
725 | train_data = data[:split_index]
726 | eval_data = data[split_index:]
727 |
728 | # Save the training data to a new JSON file
729 | with open(clean_path(TRAINING_DATASET_FOLDER, dataset_json_new), 'w', encoding='utf-8') as f:
730 | json.dump(train_data, f, indent=2)
731 |
732 | # Save the evaluation data to a new JSON file
733 | with open(clean_path(TRAINING_DATASET_FOLDER, eval_json_new), 'w', encoding='utf-8') as f:
734 | json.dump(eval_data, f, indent=2)
735 |
736 |
737 | def select_dataset(dataset):
738 | dataset_json_new = f'{dataset}_train.json'
739 | eval_json_new = f'{dataset}_eval.json'
740 | path1 = clean_path(TRAINING_DATASET_FOLDER, dataset_json_new)
741 | path2 = clean_path(TRAINING_DATASET_FOLDER, eval_json_new)
742 | returnA = 'None'
743 | returnB = 'None'
744 |
745 | if Path(path1).is_file():
746 | print(f"{dataset_json_new} file selected for training")
747 | returnA = dataset_json_new.replace('.json', '')
748 |
749 | if Path(path2).is_file():
750 | print(f"{eval_json_new} file selected for evaluation")
751 | returnB = eval_json_new.replace('.json', '')
752 |
753 |
754 | return returnA, returnB
755 |
756 |
757 |
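    |     # Chain: write <dataset>_train.json / <dataset>_eval.json, refresh both dropdowns,
    |     # then auto-select the newly created files for training and evaluation.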
758 | split_dataset_do.click(split_dataset,[dataset,split_dataset_perc],None).then(update_datasetJSON, None,[dataset, eval_dataset]).then(select_dataset, dataset,[dataset,eval_dataset])
759 |
760 | def update_datasetJSONL():
761 | return gr.update(choices=get_datasets(TRAINING_DATASET_FOLDER, 'jsonl')),gr.update(choices=get_datasets(TRAINING_DATASET_FOLDER, 'jsonl'))
762 |
763 | def update_datasetJSON():
764 | return gr.update(choices=get_datasets(TRAINING_DATASET_FOLDER, 'json')),gr.update(choices=get_datasets(TRAINING_DATASET_FOLDER, 'json'))
765 |
766 | def convert_json_to_jsonl(dataset, system_text):
767 | if dataset == 'None' or dataset == '':
768 | print("No dataset selected in Formatted Datasets")
769 | return
770 |
771 | dataset_json_new = f'{dataset}.jsonl'
772 | dataset_json = f'{dataset}.json'
773 |
774 |
775 | with open(clean_path(TRAINING_DATASET_FOLDER, dataset_json), 'r', encoding='utf-8-sig') as f:
776 | data = json.load(f)
777 |
778 | print(f"Converting {dataset_json}...")
779 | converted_data = []
780 |
781 | for entry in data:
782 | if system_text == '':
783 | converted_entry = {
784 | "messages": [
785 | {"role": "user", "content": entry["instruction"]},
786 | {"role": "assistant", "content": entry["output"]}
787 | ]
788 | }
789 | else:
790 | converted_entry = {
791 | "messages": [
792 | {"role": "system", "content": system_text},
793 | {"role": "user", "content": entry["instruction"]},
794 | {"role": "assistant", "content": entry["output"]}
795 | ]
796 | }
797 | converted_data.append(converted_entry)
798 |
799 | print(f"Saving {dataset_json_new}")
800 | with open(clean_path(TRAINING_DATASET_FOLDER, dataset_json_new), 'w') as outfile:
801 | json.dump(converted_data, outfile, indent=2)
802 |
803 | def convert_text_to_jsonl(textfile, system_text, prompt):
804 | if textfile == 'None' or textfile == '':
805 | print("No plain text selected in tab Text file")
806 | return
807 |
808 | dataset_json_new = f'{textfile}.jsonl'
809 | dataset_txt = f'{textfile}.txt'
810 |
811 |
812 | with open(clean_path(TRAINING_DATASET_FOLDER, dataset_txt), 'r', encoding='utf-8-sig') as f:
813 | text = f.read().replace('\r', '')
814 |
815 | text_list = text.split("\n\n\n")
816 |
817 | print(f"Converting {dataset_txt}...")
818 | converted_data = []
819 |
820 | for entry in text_list:
821 | entry = entry.strip()
822 | if entry!='':
823 | converted_entry = {
824 | "messages": [
825 | {"role": "system", "content": system_text},
826 | {"role": "user", "content": prompt},
827 | {"role": "assistant", "content": entry}
828 | ]
829 | }
830 | converted_data.append(converted_entry)
831 |
832 | print(f"Saving {dataset_json_new}")
833 | with open(clean_path(TRAINING_DATASET_FOLDER, dataset_json_new), 'w') as outfile:
834 | json.dump(converted_data, outfile, indent=2)
835 |
836 | def select_datasetJSONL(dataset):
837 | dataset_json_new = f'{dataset}.jsonl'
838 | pathJSONL = clean_path(TRAINING_DATASET_FOLDER, dataset_json_new)
839 | returnA = 'None'
840 | returnB = 'None'
841 |
842 | if Path(pathJSONL).is_file():
843 | print(f"{dataset_json_new} file selected for training")
844 | returnB = dataset_json_new.replace('.jsonl', '')
845 |
846 | return returnA, returnB
847 |
848 | def select_datasetJSON(dataset):
849 | dataset_json_new = f'{dataset}.json'
850 | pathJSON = clean_path(TRAINING_DATASET_FOLDER, dataset_json_new)
851 | return_to_clear = 'None'
852 | return_to_set = 'None'
853 |
854 | if Path(pathJSON).is_file():
855 | print(f"{dataset_json_new} file selected for training")
856 | return_to_set = dataset_json_new.replace('.json', '')
857 |
858 | return return_to_clear, return_to_set
859 |
860 |
861 | convert_do.click(convert_json_to_jsonl,[dataset,convert_system],None).then(update_datasetJSONL, None,[datasetJSONL,eval_datasetJSONL]).then(select_datasetJSONL, dataset,[dataset,datasetJSONL])
862 | convert_do2.click(convert_text_to_jsonl,[raw_text_file,convert_system2,convert_prompt2],None).then(update_datasetJSONL, None,[datasetJSONL,eval_datasetJSONL]).then(select_datasetJSONL, raw_text_file,[raw_text_file,datasetJSONL])
863 |
864 | dump_dataset.change(lambda x: non_serialized_params.update({"dump_dataset": x}), dump_dataset, None)
865 | dump_dataset_remove_s.change(lambda x: non_serialized_params.update({"dump_dataset_remove_s": x}), dump_dataset_remove_s, None)
866 |
867 | def get_datasets(path: str, ext: str):
868 | # include subdirectories for raw txt files to allow training from a subdirectory of txt files
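    |         # returns 'None' plus the sorted stems of every matching file (natural sort),
    |         # so the dropdowns always offer an explicit empty choice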
869 | if ext == "txt":
870 | return ['None'] + sorted(set([k.stem for k in list(Path(path).glob('*.txt')) + list(Path(path).glob('*/')) if k.stem != 'put-trainer-datasets-here']), key=natural_keys)
871 |
872 | return ['None'] + sorted(set([k.stem for k in Path(path).glob(f'*.{ext}') if k.stem != 'put-trainer-datasets-here']), key=natural_keys)
873 |
874 | def do_interrupt():
875 | global WANT_INTERRUPT
876 | WANT_INTERRUPT = True
877 |
878 | def reload_model_local():
879 | try:
880 | modelname = shared.model_name
881 | unload_model()
882 | shared.model_name = modelname
883 |
884 | if shared.model_name != '':
885 | shared.model, shared.tokenizer = load_model(shared.model_name, shared.args.loader)
886 |
887 | if shared.model is not None:
888 | print(f"Successfully reloaded `{shared.model_name}`.")
889 | else:
890 | print(f"Failed to reload `{shared.model_name}`.")
891 | except:
892 | exc = traceback.format_exc()
893 | logger.error('Failed to load the model.')
894 | print(exc)
895 |
896 |
897 | def do_copy_params(lora_name: str, all_params):
898 |
899 | if lora_name:
900 | f_name = f"{shared.args.lora_dir}/{clean_path(None, lora_name)}/training_parameters.json"
901 | if Path(f_name).is_file():
902 | with open(f_name, 'r', encoding='utf-8') as format_file:
903 | params: dict[str, str] = json.load(format_file)
904 | else:
905 | params = {}
906 | else:
907 | params = {}
908 |
909 | result = list()
910 | for i in range(0, len(PARAMETERS)):
911 | key = PARAMETERS[i]
912 | if key in params:
913 | result.append(params[key])
914 | else:
915 | result.append(all_params[i])
916 |
917 | return result
918 |
919 |
920 | def change_rank_limit(use_higher_ranks: bool):
921 | mult = 2 if use_higher_ranks else 1
922 | return {"maximum": 1024 * mult, "__type__": "update"}, {"maximum": 2048 * mult, "__type__": "update"}
923 |
924 |
925 | def clean_path(base_path: str, path: str):
926 | """Strips unusual symbols and forcibly builds a path as relative to the intended directory."""
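    |     # e.g. (hypothetical values) clean_path('training/datasets', '../../etc/passwd')
    |     # -> '<abs>/training/datasets/_/_/etc/passwd': each '..' becomes '_', so the
    |     # result cannot escape the intended base directory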
927 | path = path.replace('\\', '/').replace('..', '_')
928 | if base_path is None:
929 | return path
930 |
931 | return f'{Path(base_path).absolute()}/{path}'
932 |
933 |
934 | def backup_adapter(input_folder):
935 | # Get the creation date of the file adapter_model.bin
936 | try:
937 | adapter_file = Path(f"{input_folder}/adapter_model.bin")
938 | if adapter_file.is_file():
939 |
940 | logger.info("Backing up existing LoRA adapter...")
941 | creation_date = datetime.fromtimestamp(adapter_file.stat().st_ctime)
942 | creation_date_str = creation_date.strftime("Backup-%Y-%m-%d")
943 |
944 | # Create the new subfolder
945 | subfolder_path = Path(f"{input_folder}/{creation_date_str}")
946 | subfolder_path.mkdir(parents=True, exist_ok=True)
947 |
948 | # Check if the file already exists in the subfolder
949 | backup_adapter_file = Path(f"{input_folder}/{creation_date_str}/adapter_model.bin")
950 | if backup_adapter_file.is_file():
951 | print(" - Backup already exists. Skipping backup process.")
952 | return
953 |
954 | # Copy existing files to the new subfolder
955 | existing_files = Path(input_folder).iterdir()
956 | for file in existing_files:
957 | if file.is_file():
958 | shutil.copy2(file, subfolder_path)
959 | except Exception as e:
960 | print("An error occurred in backup_adapter:", str(e))
961 |
962 |
963 | def calc_trainable_parameters(model):
964 | trainable_params = 0
965 | all_param = 0
966 | for _, param in model.named_parameters():
967 | num_params = param.numel()
968 | # if using DS Zero 3 and the weights are initialized empty
969 | if num_params == 0 and hasattr(param, "ds_numel"):
970 | num_params = param.ds_numel
971 |
972 | all_param += num_params
973 | if param.requires_grad:
974 | trainable_params += num_params
975 |
976 | return trainable_params, all_param
977 |
978 | def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str, precize_slicing_overlap: bool, add_eos_token_type: str, save_steps_under_loss: float, add_bos_token: bool, training_projection: str,sliding_window:bool,warmup_ratio:float, grad_accumulation: int,neft_noise_alpha:float, group_by_length:bool,eliminate_long_blocks:bool, stop_at_epoch: float, datasetJSONL:str, eval_datasetJSONL:str, eval_stepsJSONL:int, hybrid_training:bool, hybrid_data_ratio:int, hybrid_text_ratio:int,lora_RS:bool,lora_RS_alpha:int,lora_modulessave:bool,use_grad_checkpoint:bool):
979 |
980 |
981 | # if shared.args.monkey_patch:
982 | # from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import (
983 | # replace_peft_model_with_int4_lora_model
984 | # )
985 | # replace_peft_model_with_int4_lora_model()
986 |
987 | global train_log_graph
988 | global WANT_INTERRUPT
989 | global mapped_prompts
990 |
991 | mapped_prompts = 0
992 | WANT_INTERRUPT = False
993 |
994 | statistics['loss'] = []
995 |
996 | statistics['loss'].append({'epoch': 0, 'value': 0})
997 | zero_pd = pd.DataFrame(statistics['loss'])
998 |
999 | # == Input validation / processing ==
1000 | yield "Preparing the input...", zero_pd
1001 | lora_file_path = clean_path(None, lora_name)
1002 | if lora_file_path.strip() == '':
1003 | yield "Missing or invalid LoRA file name input.", zero_pd
1004 | return
1005 |
1006 | lora_file_path = f"{Path(shared.args.lora_dir)}/{lora_file_path}"
1007 | actual_lr = float(learning_rate)
1008 | model_type = type(shared.model).__name__
1009 |
1010 | if model_type in MODEL_CLASSES:
1011 | model_id = MODEL_CLASSES[model_type]
1012 | else:
1013 | model_id = "llama"
1014 | if model_type == "PeftModelForCausalLM":
1015 | if len(shared.lora_names) > 0:
1016 | yield "You are trying to train a LoRA while you already have another LoRA loaded. This will work, but may have unexpected effects. *(Will continue anyway in 5 seconds, press `Interrupt` to stop.)*", zero_pd
1017 | logger.warning("Training LoRA over top of another LoRA. May have unexpected effects.")
1018 | else:
1019 | yield "Model ID not matched due to LoRA loading. Consider reloading base model. *(Will continue anyway in 5 seconds, press `Interrupt` to stop.)*", zero_pd
1020 | logger.warning("Model ID not matched due to LoRA loading. Consider reloading base model.")
1021 | else:
1022 | yield "LoRA training has only currently been validated for LLaMA, OPT, GPT-J, and GPT-NeoX models. Unexpected errors may follow. *(Will continue anyway in 5 seconds, press `Interrupt` to stop.)*", zero_pd
1023 | logger.warning(f"LoRA training has only currently been validated for LLaMA, OPT, GPT-J, and GPT-NeoX models. (Found model type: {model_type})")
1024 |
1025 | time.sleep(5)
1026 |
1027 | # if shared.args.loader == 'GPTQ-for-LLaMa' and not shared.args.monkey_patch:
1028 | # yield "LoRA training with GPTQ-for-LLaMa requires loading with `--monkey-patch`", zero_pd
1029 | # return
1030 |
1031 | if cutoff_len <= 0 or micro_batch_size <= 0 or actual_lr <= 0 or lora_rank <= 0 or lora_alpha <= 0:
1032 | yield "Cannot input zeroes.", zero_pd
1033 | return
1034 |
1035 |     # in the new version this was dropped in favor of grad_accumulation
1036 |     # set it to zero for the new save
1037 | batch_size = 0
1038 |
1039 | # change: reload earlier
1040 |
1041 | # == We MUST reload model if it went through any previous training, even failed one ==
1042 | if shared.model_dirty_from_training:
1043 | selected_model = shared.model_name
1044 | if selected_model:
1045 | print("\033[1;31;1m(Model has been modified by previous training, it needs to be reloaded...)\033[0;37;0m")
1046 | try:
1047 | yield f"Reloading {selected_model}...", zero_pd
1048 | reload_model_local()
1049 |
1050 | if shared.tokenizer.pad_token_id is None:
1051 | print("Missing pad_token_id ID - setting to 0")
1052 | shared.tokenizer.pad_token_id = 0
1053 |
1054 | shared.tokenizer.padding_side = "left"
1055 |
1056 | if shared.model is not None:
1057 | print("Model reloaded OK, continue with training.")
1058 | else:
1059 | return f"Failed to load {selected_model}."
1060 | except:
1061 | exc = traceback.format_exc()
1062 | logger.error('Failed to reload the model.')
1063 | print(exc)
1064 | return exc.replace('\n', '\n\n')
1065 |
1066 | # == check tokenizer ==
1067 | pad_token_id = None
1068 | pad_token = None
1069 | eos_token_id = None
1070 | eos_token = None
1071 |
1072 | print (f"{YELLOW} Tokenizer safety check {RESET}")
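    |     # Fallback chain: use the tokenizer's own pad token if set; if it resolves to '!'
    |     # (id 0 on Llama-family tokenizers) try <|finetune_right_pad_id|> (Llama 3),
    |     # <|vision_pad|> (Qwen), <|end_of_text|>, then the EOS token; id 0 is the last resort.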
1073 |
1074 |
1075 | if hasattr(shared.tokenizer, 'pad_token_id'):
1076 |         if shared.tokenizer.pad_token_id is None:
1077 | print(f"{RED} Missing pad_token_id - setting to 0 {RESET}")
1078 | shared.tokenizer.pad_token_id = 0
1079 |
1080 | pad_token_id = shared.tokenizer.pad_token_id
1081 | pad_token = shared.tokenizer.convert_ids_to_tokens(pad_token_id)
1082 | print(f" Pad Token id from tokenizer: {pad_token_id} {GREEN}{pad_token}{RESET} ")
1083 |
1084 | if hasattr(shared.tokenizer, 'eos_token_id'):
1085 | eos_token_id = shared.tokenizer.eos_token_id
1086 |
1087 | if hasattr(shared.tokenizer, 'eos_token'):
1088 | eos_token = shared.tokenizer.eos_token
1089 |
1090 | if pad_token == '!':
1091 | print(f"{RED} Patching PAD token from 0 to <|finetune_right_pad_id|> {RESET} (LLama 3)")
1092 | pad_token_id = shared.tokenizer.convert_tokens_to_ids("<|finetune_right_pad_id|>")
1093 | pad_token = "<|finetune_right_pad_id|>"
1094 |
1095 | if pad_token_id is None:
1096 | print(f"{RED} (failed) Patching PAD token to <|vision_pad|> {RESET} (Qwen)")
1097 | pad_token_id = shared.tokenizer.convert_tokens_to_ids("<|vision_pad|>")
1098 | pad_token = "<|vision_pad|>"
1099 |
1100 | if pad_token_id is None:
1101 | print(f"{RED} (failed) Patching PAD token to <|end_of_text|> {RESET} (Llama)")
1102 | pad_token_id = shared.tokenizer.convert_tokens_to_ids("<|end_of_text|>")
1103 | pad_token = "<|end_of_text|>"
1104 |
1105 | if pad_token_id is None:
1106 |             print(f"{RED} (failed) Patching PAD token to EOS token {eos_token} {RESET} (fallback)")
1107 | pad_token_id = eos_token_id
1108 | pad_token = eos_token
1109 |
1110 | # save it to shared
1111 | if hasattr(shared.tokenizer, 'pad_token_id'):
1112 | shared.tokenizer.pad_token_id = pad_token_id
1113 |
1114 | if hasattr(shared.tokenizer, 'pad_token'):
1115 | shared.tokenizer.pad_token = pad_token
1116 |
1117 | # I give up!
1118 | if pad_token_id is None:
1119 | print(f"{RED} Giving up on PAD token - setting it as 0 {RESET}")
1120 | pad_token_id = 0
1121 | pad_token = shared.tokenizer.convert_ids_to_tokens(pad_token_id)
1122 |
1123 | if eos_token_id is None:
1124 | print(f"{RED} EOS token is missing - that's not good {RESET}")
1125 | eos_token_id = shared.tokenizer.convert_tokens_to_ids("<|end_of_text|>")
1126 | eos_token = "<|end_of_text|>"
1127 |
1128 | if eos_token_id is None:
1129 | print(f"{RED} Tokenizer is seriously broken!{RESET}")
1130 |             print(f"{RED} Last chance to make it run: setting EOS to the PAD token {RESET}")
1131 | eos_token_id = pad_token_id
1132 | eos_token = pad_token
1133 |
1134 |
1135 |
1136 | print(f" Pad Token id: {pad_token_id} {GREEN}{pad_token}{RESET} ")
1137 | print(f" EOS Token id: {eos_token_id} {GREEN}{eos_token}{RESET} ")
1138 |
1139 | #LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
1140 | #LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
1141 | #LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
1142 | #LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
1143 |
1144 | if pad_token_id == eos_token_id:
1145 |         print(f"{RED}Pad Token is the same as EOS Token. The fine-tune might have issues generating EOS{RESET} ")
1146 |
1147 |
1148 | #shared.tokenizer.add_special_tokens({"pad_token": "<|reserved_special_token_0|>"})
1149 |
1150 | gradient_accumulation_steps = grad_accumulation #batch_size // micro_batch_size
1151 |
1152 | # llama 3 padding should be "<|end_of_text|>" or <|reserved_special_token_0|>
1153 | shared.tokenizer.padding_side = "left"
1154 |
1155 | def encode(text, prepend_bos_token):
1156 |
1157 | mx_len = cutoff_len
1158 |
1159 | # If eliminate_long_blocks is enabled, override the max length to 8192
1160 | if eliminate_long_blocks:
1161 | mx_len = 8192
1162 |
1163 | # Encode the text using the tokenizer with truncation applied
1164 | # The tokenizer may automatically add a BOS token at the beginning
1165 | result = shared.tokenizer.encode(text, truncation=True, max_length=mx_len)
1166 |
1167 | # Check if the tokenizer added two BOS tokens at the beginning
1168 | # This happens if the tokenizer is configured to always prepend a BOS token
1169 | if len(result) >= 2 and result[:2] == [shared.tokenizer.bos_token_id, shared.tokenizer.bos_token_id]:
1170 | result = result[1:] # Remove the duplicate BOS token
1171 |
1172 | # If prepend_bos_token is False and the first token is a BOS token, remove it
1173 | if not prepend_bos_token and result[0] == shared.tokenizer.bos_token_id:
1174 | result = result[1:]
1175 |
1176 | return result
1177 |
1178 | def tokenize(prompt, append_eos_token=False, prepend_bos_token = False):
1179 |
1180 | if train_only_after == '' or train_only_after not in prompt:
1181 | input_ids = encode(prompt, prepend_bos_token)
1182 |
1183 | if append_eos_token and input_ids[-1] != shared.tokenizer.eos_token_id and len(input_ids) < cutoff_len:
1184 | input_ids.append(shared.tokenizer.eos_token_id)
1185 |
1186 | len_before = len(input_ids)
1187 | # padding
1188 | if (cutoff_len - len(input_ids))> 0:
1189 | input_ids = [shared.tokenizer.pad_token_id] * (cutoff_len - len(input_ids)) + input_ids
1190 |
1191 | #print(f"{len_before} -> {len(input_ids)}")
1192 |
1193 | labels = [1] * len(input_ids)
1194 | else:
1195 | ind = prompt.index(train_only_after) + len(train_only_after)
1196 | before_tokens = encode(prompt[:ind], prepend_bos_token)
1197 | after_tokens = encode(prompt[ind:], False)
1198 |
1199 | if append_eos_token and after_tokens[-1] != shared.tokenizer.eos_token_id:
1200 | after_tokens.append(shared.tokenizer.eos_token_id)
1201 |
1202 | full_length = len(after_tokens) + len(before_tokens)
1203 | if full_length > cutoff_len:
1204 | after_tokens = after_tokens[:cutoff_len - len(before_tokens)]
1205 | else:
1206 | before_tokens = [shared.tokenizer.pad_token_id] * (cutoff_len - full_length) + before_tokens
1207 |
1208 | input_ids = before_tokens + after_tokens
1209 | labels = [-100] * len(before_tokens) + [1] * len(after_tokens)
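    |             # -100 labels are ignored by the loss, so only the tokens after the
    |             # train_only_after marker contribute to training; like the plain branch
    |             # above, the block is left-padded with pad tokens up to cutoff_len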
1210 |
1211 | #print(f"{len(input_ids)}")
1212 |
1213 | input_ids = torch.tensor(input_ids)
1214 | return {
1215 | "input_ids": input_ids,
1216 | "labels": labels,
1217 | "attention_mask": input_ids.ne(shared.tokenizer.pad_token_id),
1218 | }
1219 |
1220 | train_template.clear()
1221 |
1222 |
1223 | #reset stuff
1224 | print(f"*** LoRA: {lora_name} ***")
1225 | non_serialized_params.update({"stop_at_loss": stop_at_loss})
1226 | non_serialized_params.update({"stop_at_epoch": stop_at_epoch})
1227 | non_serialized_params.update({"save_steps_under_loss": save_steps_under_loss+0.01})
1228 | non_serialized_params.update({"save_checkpoint_now": False})
1229 | non_serialized_params.update({"training_loop": False})
1230 | non_serialized_params.update({"current_stability": 0})
1231 | non_serialized_params.update({"save_epochs": 0})
1232 | non_serialized_params.update({"checkpoint_offset": 0})
1233 | non_serialized_params.update({"epoch_offset": 0})
1234 | train_log_graph.clear()
1235 |
1236 | # END OF FPHAM SENTENCE SPLIT functions ===================
1237 |
1238 | # hybrid training hybrid_training
1239 | raw_text_used = False
1240 | hybrid_text_train_data = None
1241 |
1242 | if hybrid_training == True:
1243 | print(f" === {RED}Hybrid Training{RESET} ===")
1244 | if raw_text_file not in ['None', '']:
1245 | if datasetJSONL not in ['None', '']:
1246 | print(f" - Raw text + JSONL")
1247 | elif dataset not in ['None', '']:
1248 | print(f" - Raw text + JSON")
1249 | else:
1250 | print(f" - {RED}Error:{RESET} for Hybrid training you need Raw text AND JSONL or JSON dataset")
1251 | yield "Missing dataset and raw file for hybrid training, cannot continue.", zero_pd
1252 | return
1253 |
1254 | else:
1255 | print(f" - {RED}Error:{RESET} for Hybrid training you need JSONL or JSON dataset AND Raw text file.")
1256 | yield "Missing dataset and raw file for hybrid training, cannot continue.", zero_pd
1257 | return
1258 |
1259 |
1260 | # == Prep the dataset, format, etc ==
1261 | if raw_text_file not in ['None', '']:
1262 | train_template["template_type"] = "raw_text"
1263 | logger.info("Loading text file...")
1264 | fullpath = clean_path(TRAINING_DATASET_FOLDER, f'{raw_text_file}')
1265 | fullpath = Path(fullpath)
1266 | if fullpath.is_dir():
1267 | logger.info('Training path directory {}'.format(raw_text_file))
1268 | raw_text = ""
1269 | file_paths = sorted(fullpath.glob('*.txt'), key=lambda path: natural_keys(path.name))
1270 | for file_path in file_paths:
1271 | if file_path.is_file():
1272 | with file_path.open('r', encoding='utf-8') as file:
1273 | raw_text += file.read().replace('\r', '')
1274 |
1275 | logger.info(f"Loaded training file: {file_path.name}")
1276 | else:
1277 | with open(clean_path(TRAINING_DATASET_FOLDER, f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file:
1278 | raw_text = file.read().replace('\r', '')
1279 |
1280 | # FPHAM PRECISE SLICING
1281 | if min_chars<0:
1282 | min_chars = 0
1283 |
1284 | EOS_token_str = ''
1285 | BOS_token_str = ''
1286 |
1287 | if hasattr(shared.tokenizer, 'bos_token'):
1288 | BOS_token_str = shared.tokenizer.bos_token
1289 | else:
1290 | print(f" - No {RED}BOS{RESET} token defined in tokenizer, using default")
1291 |
1292 | if hasattr(shared.tokenizer, 'eos_token'):
1293 | EOS_token_str = shared.tokenizer.eos_token
1294 | else:
1295 | print(f" - No {RED}EOS{RESET} token defined in tokenizer, using default")
1296 |
1297 |
1298 | print(f"Tokenizer BOS token: {GREEN}{BOS_token_str}{RESET}, EOS token: {RED}{EOS_token_str}{RESET}")
1299 |
1300 | add_EOS_to_all = add_eos_token and add_eos_token_type == 'Every Block'
1301 | add_EOS_to_HC = add_eos_token and add_eos_token_type != 'Every Block'
1302 |
1303 | #print (f"add_eos_token {add_eos_token}, add_EOS_to_all {add_EOS_to_all}, add_EOS_to_HC {add_EOS_to_HC}")
1304 |
1305 | # == New more precise slicing on sentence boundary ==
1306 | if sliding_window:
1307 | text_chunks = sliding_block_cut(raw_text, min_chars, add_EOS_to_HC, cutoff_len, hard_cut_string,non_serialized_params['debug_slicer'],EOS_token_str,BOS_token_str)
1308 | else:
1309 | text_chunks = precise_cut(raw_text, precize_slicing_overlap, min_chars, add_EOS_to_HC, cutoff_len, hard_cut_string,non_serialized_params['debug_slicer'],EOS_token_str,BOS_token_str)
1310 |
1311 | if hybrid_training==True:
1312 | hybrid_text_train_data = Dataset.from_list([tokenize(x, add_EOS_to_all, add_bos_token) for x in text_chunks])
1313 | else:
1314 | train_data = Dataset.from_list([tokenize(x, add_EOS_to_all, add_bos_token) for x in text_chunks])
1315 | raw_text_used = True
1316 |
1317 | if add_EOS_to_all:
1318 | print(f"Added EOS to {len(text_chunks)} blocks")
1319 |
1320 | print(f"All Data Blocks: {len(text_chunks)}")
1321 |
1322 | del text_chunks
1323 | eval_data = None
1324 |
1325 | if raw_text_used == False:
1326 | data = None
1327 | eval_data = None
1328 | format_data: dict[str, str] = {}
1329 | train_template["template_type"] = "dataset"
1330 | #=== JSONL ====
1331 | if datasetJSONL not in ['None', '']:
1332 |
1333 | logger.info("Loading JSONL datasets...")
1334 |
1335 | with open(clean_path(TRAINING_DATASET_FOLDER, f'{datasetJSONL}.jsonl'), 'r', encoding='utf-8-sig') as dataFile:
1336 | loaded_JSONLdata = json.load(dataFile)
1337 |
1338 | chat_template = shared.tokenizer.chat_template
1339 |
1340 | if shared.tokenizer.chat_template is None or shared.tokenizer.chat_template =='':
1341 | print(f"{RED}No chat template defined in tokenizer. Using instruction_template{RESET}")
1342 | shared.tokenizer.chat_template = shared.persistent_interface_state['instruction_template_str']
1343 |
1344 | # The chat template is responsible for EOS and BOS
1345 | add_eos_token = False
1346 | add_bos_token = False
1347 |
1348 | logger.info("Applying chat template")
1349 | data_list = [{"jsonl": shared.tokenizer.apply_chat_template(entry["messages"], tokenize=False, add_generation_prompt=False)} for entry in loaded_JSONLdata]
1350 |
1351 | # another way would be to save data_list as JSON and then load it using load_dataset
1352 | data = DatasetDict()
1353 | data['train'] = Dataset.from_list(data_list)
1354 |
1355 | if eval_datasetJSONL not in ['None', '']:
1356 | logger.info("Loading JSONL eval dataset...")
1357 | with open(clean_path(TRAINING_DATASET_FOLDER, f'{eval_datasetJSONL}.jsonl'), 'r', encoding='utf-8-sig') as dataFileeval:
1358 | loaded_JSONLevaldata = json.load(dataFileeval)
1359 | logger.info("Applying chat template to eval dataset")
1360 | data_list_eval = [{"jsonl": shared.tokenizer.apply_chat_template(entry["messages"], tokenize=False, add_generation_prompt=False)} for entry in loaded_JSONLevaldata]
1361 |
1362 | eval_data = DatasetDict()
1363 | eval_data['train'] = Dataset.from_list(data_list_eval)
1364 |
1365 | format_data = {"jsonl": "%jsonl%"}
1366 | shared.tokenizer.chat_template = chat_template
1367 | eval_steps = eval_stepsJSONL
1368 |
1369 | else:
1370 | #=== JSON ====
1371 | if dataset in ['None', '']:
1372 | yield "Missing dataset choice input, cannot continue.", zero_pd
1373 | return
1374 |
1375 | if format in ['None', '']:
1376 | yield "Missing format choice input, cannot continue.", zero_pd
1377 | return
1378 |
1379 | with open(clean_path(TRAINING_FORMATS_FOLDER, f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile:
1380 | format_data: dict[str, str] = json.load(formatFile)
1381 |
1382 | dataset_json = f'{dataset}.json'
1383 | eval_json = f'{eval_dataset}.json'
1384 |
1385 | logger.info("Loading JSON training dataset...")
1386 | data = load_dataset("json", data_files=clean_path(TRAINING_DATASET_FOLDER, dataset_json))
1387 |
1388 | if eval_dataset not in ['None', '']:
1389 | logger.info("Loading JSON eval dataset...")
1390 | eval_data = load_dataset("json", data_files=clean_path(TRAINING_DATASET_FOLDER, eval_json))
1391 |
1392 |
1393 | # == store training prompt ==
1394 | for _, value in format_data.items():
1395 | prompt_key = f"template_{len(train_template)}"
1396 | train_template[prompt_key] = value
1397 |
1398 | def generate_prompt(data_point: dict[str, str]):
1399 |
1400 | for options, data in format_data.items():
1401 | if set(options.split(',')) == set(x[0] for x in data_point.items() if (type(x[1]) is str and len(x[1].strip()) > 0)):
1402 | for key, val in data_point.items():
1403 | if type(val) is str:
1404 | data = data.replace(f'%{key}%', val)
1405 | return data
1406 | raise RuntimeError(f'Data-point "{data_point}" has no keyset match within format "{list(format_data.keys())}"')
1407 |
1408 | def generate_and_tokenize_prompt(data_point):
1409 | global mapped_prompts
1410 | mapped_prompts = mapped_prompts + 1
1411 | prompt = generate_prompt(data_point)
1412 | return tokenize(prompt, add_eos_token, add_bos_token)
1413 |
1414 | train_data = data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30))
1415 | print(f"Rows: {train_data.num_rows}")
1416 | print(f"Tokenized Prompts: {mapped_prompts}")
1417 |
1418 | if hybrid_training==True and hybrid_text_train_data:
1419 | print(f"Merging Raw text ({len(hybrid_text_train_data)}) and dataset ({len(train_data)})")
1420 | merged_train_data = []
1421 | num_data_to_keep = int(len(train_data) * float(hybrid_data_ratio) / 100.0)
1422 | num_text_to_keep = int(len(hybrid_text_train_data) * float(hybrid_text_ratio) / 100.0)
1423 |
1424 | print(f" - Using {hybrid_data_ratio}% of dataset ({num_data_to_keep}/{len(train_data)}) blocks")
1425 | print(f" - Using {hybrid_text_ratio}% of text ({num_text_to_keep}/{len(hybrid_text_train_data)}) blocks")
1426 | count = 0
1427 | if hybrid_data_ratio > 0:
1428 | for example in train_data:
1429 | merged_train_data.append(example)
1430 | count += 1
1431 | if count >= num_data_to_keep and hybrid_data_ratio < 100:
1432 | break
1433 | count = 0
1434 | if hybrid_text_ratio > 0:
1435 | for example in hybrid_text_train_data:
1436 | merged_train_data.append(example)
1437 | count += 1
1438 | if count >= num_text_to_keep and hybrid_text_ratio < 100:
1439 | break
1440 |
1441 | train_data = Dataset.from_list(merged_train_data)
1442 | num_items_after = len(train_data)
1443 | print(f"- Total after merge: {num_items_after} blocks")
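    |         # The merge is a plain concatenation: ratios below 100% simply keep the first
    |         # N blocks of each source (no shuffling is done at this point).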
1444 |
1445 |
1446 | #if eliminate_long_blocks:
1447 | # always filter
1448 | num_items_before = len(train_data)
1449 | print(f"Filtering {num_items_before} blocks...")
1450 | filtered_train_data = []
1451 | for example in train_data:
1452 |
1453 | #if len(example['input_ids']) > 0:
1454 | #if example['input_ids'][0] == shared.tokenizer.pad_token_id:
1455 | #filtered_train_data.append(example)
1456 | if len(example['input_ids']) == cutoff_len:
1457 | filtered_train_data.append(example)
1458 |
1459 | train_data = Dataset.from_list(filtered_train_data)
1460 | num_items_after = len(train_data)
1461 | if eliminate_long_blocks:
1462 | print(f" - Eliminated {RED}{num_items_before - num_items_after} blocks{RESET} that were above {cutoff_len} tokens cutoff")
1463 | else:
1464 | print(f" - Eliminated {RED}{num_items_before - num_items_after} blocks{RESET} that were invalid")
1465 |
1466 |
1467 | if eval_data is not None:
1468 | eval_data = eval_data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30))
1469 |
1470 | print(f"BOS: {add_bos_token} EOS: {add_eos_token}")
1471 | print(f"Final Data Blocks: {len(train_data)}")
1472 |
1473 |
1474 | # == Start prepping the model itself ==
1475 | if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'):
1476 | logger.info("Getting model ready...")
1477 | # here we can disable gradient checkpoint, by default = true, use_gradient_checkpointing=True
1478 | # if bnb
1479 | if 'quantization_config' in shared.model.config.to_dict():
1480 | print(f"Method: {RED}QLORA{RESET}")
1481 | prepare_model_for_kbit_training(shared.model)
1482 | else:
1483 | print(f"Method: {RED}LoRA{RESET}")
1484 |
1485 | # base model is now frozen and should not be reused for any other LoRA training than this one
1486 | shared.model_dirty_from_training = True
1487 | print(f"Transformers Model Type: {YELLOW}{model_type}{RESET}")
1488 |
1489 |
1490 | model_to_lora_modules[model_id] = ["q_proj", "v_proj"]
1491 |
1492 | if training_projection==train_choices[0]:
1493 | model_to_lora_modules[model_id] = ["gate_proj","down_proj","up_proj","q_proj","k_proj","v_proj","o_proj"]
1494 | elif training_projection==train_choices[1]:
1495 | model_to_lora_modules[model_id] = ["q_proj","k_proj", "v_proj", "o_proj"]
1496 | elif training_projection==train_choices[2]:
1497 | model_to_lora_modules[model_id] = ["q_proj","k_proj", "v_proj"]
1498 | elif training_projection==train_choices[3]:
1499 | model_to_lora_modules[model_id] = ["k_proj", "v_proj", "down_proj"]
1500 | else:
1501 | model_to_lora_modules[model_id] = ["q_proj", "v_proj"]
1502 |
1503 |
1504 | logger.info("Preparing for training...")
1505 | # == Create LoRA config ==
1506 |
1507 |
1508 | modules_save = None
1509 | real_alpha = lora_alpha
1510 |
1511 |
1512 | # modules_to_save = ["lm_head", "embed_tokens"]
1513 | # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
1514 | # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
1515 | # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
1516 |
1517 | if lora_modulessave:
1518 |
1519 | print (f"{YELLOW}Trying Full Finetune in lm_head and embed_tokens{RESET}")
1520 |
1521 | if not hasattr(shared.model, 'lm_head'):
1522 |             print(f"{RED}Model error: this model doesn't have lm_head {RESET} You need a foundation base Mistral or Llama model")
1523 | else:
1524 | print(f"Model has lm_head:{GREEN} OK {RESET}")
1525 |
1526 | modules_save=["lm_head","embed_tokens"]
1527 | #check if optimizer has "_8bit" substring
1528 | if optimizer.find("_8bit") == -1:
1529 |                 print(f"{RED}VRAM Warning: Using lm_head and embed_tokens for training. It's recommended to use an 8-bit Adam optimizer. Current optimizer: {optimizer}{RESET}")
1530 |
1531 | scalling = real_alpha/lora_rank
1532 |
1533 | if lora_RS:
1534 |
1535 | print(f"{RED}Using RS LoRA{RESET} with alpha: {lora_RS_alpha}")
1536 | real_alpha = lora_RS_alpha
1537 | if real_alpha < 1:
1538 | real_alpha = 1
1539 | scalling = real_alpha / math.sqrt(lora_rank)
1540 |
1541 | print(f"Training Scaling: {scalling}")
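    |     # Standard LoRA scales the update by alpha/r; rsLoRA uses alpha/sqrt(r), which keeps
    |     # the effective scale from shrinking at high ranks (hence the separate RS alpha knob).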
1542 |
1543 | config = LoraConfig(
1544 | r=lora_rank,
1545 | lora_alpha=real_alpha,
1546 | target_modules=model_to_lora_modules[model_id],
1547 | lora_dropout=lora_dropout,
1548 | bias="none",
1549 | task_type="CAUSAL_LM",
1550 | modules_to_save=modules_save,
1551 | use_rslora=lora_RS,
1552 | )
1553 |
1554 | # == Backup the existing adapter ==
1555 | if not always_override:
1556 | backup_adapter(lora_file_path)
1557 |
1558 | # == get model trainable params
1559 | model_trainable_params, model_all_params = calc_trainable_parameters(shared.model)
1560 |
1561 | try:
1562 | logger.info("Creating LoRA model...")
1563 |
1564 | if use_grad_checkpoint:
1565 | shared.model.enable_input_require_grads()
1566 |
1567 |
1568 | torch.cuda.empty_cache()
1569 |
1570 | lora_model = get_peft_model(shared.model, config)
1571 |
1572 |
1573 | if not always_override and Path(f"{lora_file_path}/adapter_model.bin").is_file():
1574 | logger.info("Loading existing LoRA data...")
1575 | state_dict_peft = torch.load(f"{lora_file_path}/adapter_model.bin")
1576 | set_peft_model_state_dict(lora_model, state_dict_peft)
1577 |
1578 | print(f" + Continue Training on {RED}{lora_file_path}/adapter_model.bin{RESET}")
1579 |
1580 | #load training_log.json if exist
1581 |
1582 | if Path(f"{lora_file_path}/training_log.json").is_file():
1583 | with open(f"{lora_file_path}/training_log.json", 'r') as json_file:
1584 | json_ilog = json.load(json_file)
1585 | for key, value in json_ilog.items():
1586 | if key=='current_steps':
1587 | non_serialized_params.update({"checkpoint_offset": int(value+1)})
1588 | print(f" + Checkpoints will be saved with offset: {RED}{non_serialized_params['checkpoint_offset']}{RESET}")
1589 | if key=='epoch':
1590 | non_serialized_params.update({"epoch_offset": value})
1591 | print(f" + Epoch offset: {RED}{non_serialized_params['epoch_offset']}{RESET}")
1592 |
1593 |
1594 | if Path(f"{lora_file_path}/training_graph.json").is_file():
1595 | try:
1596 | with open(f"{lora_file_path}/training_graph.json", 'r') as json_file:
1597 | train_log_graph = json.load(json_file)
1598 | print(" + Training Graph loaded")
1599 | except:
1600 |                     print("Can't read training_graph.json")
1601 |
1602 |
1603 | except:
1604 | yield traceback.format_exc().replace('\n', '\n\n'), zero_pd
1605 | return
1606 |
1607 | # if shared.args.monkey_patch:
1608 | # from alpaca_lora_4bit.autograd_4bit import Autograd4bitQuantLinear
1609 | # from alpaca_lora_4bit.models import Linear4bitLt
1610 | # for _, m in lora_model.named_modules():
1611 | # if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt):
1612 | # if m.is_v1_model:
1613 | # m.zeros = m.zeros.half()
1614 | # m.scales = m.scales.half()
1615 |
1616 | class Tracked():
1617 | def __init__(self):
1618 | self.current_steps = 0
1619 | self.max_steps = 0
1620 | self.did_save = False
1621 |
1622 | tracked = Tracked()
1623 | actual_save_steps = math.ceil(save_steps / gradient_accumulation_steps)
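    |     # save_steps from the UI counts "micro" steps; dividing by gradient accumulation
    |     # converts it to the Trainer's global_step scale (the callbacks below multiply
    |     # global_step back up by the same factor for display)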
1624 |
1625 | class Callbacks(transformers.TrainerCallback):
1626 | def on_step_begin(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs):
1627 | tracked.current_steps = state.global_step * gradient_accumulation_steps
1628 | tracked.max_steps = state.max_steps * gradient_accumulation_steps
1629 | ssteps10 = int(max(2,(state.max_steps/epochs)*0.1))
1630 |
1631 | if WANT_INTERRUPT:
1632 | control.should_epoch_stop = True
1633 | control.should_training_stop = True
1634 | else:
1635 | current_loss = float(train_log.get('loss', 0.0))
1636 | current_epoch_int = int(float(train_log.get('epoch', 0.0)))
1637 |
1638 | force_save = False
1639 |
1640 | current_steps_offset = tracked.current_steps + non_serialized_params['checkpoint_offset']
1641 |
1642 | folder_save = f"checkpoint-{current_steps_offset}"
1643 |
1644 | # save if triggered by user
1645 | if non_serialized_params['save_checkpoint_now']:
1646 | force_save = True
1647 | non_serialized_params.update({"save_checkpoint_now": False})
1648 |                         print(f"\033[1;31;1mSave Checkpoint manually triggered.\033[0;37;0m")
1649 | folder_save = f"checkpoint-{current_steps_offset}-user"
1650 |
1651 | patience = 3 # Set the number of consecutive steps for tracking stability
1652 |
1653 | if gradient_accumulation_steps==1:
1654 | patience = 4
1655 |
1656 | min_steps = ssteps10
1657 |
1658 | # Save each time the loss is below the threshold
1659 | if current_loss < non_serialized_params['save_steps_under_loss'] and current_loss > 0 and state.global_step > min_steps:
1660 | current_stability = non_serialized_params['current_stability']
1661 | current_stability += 1
1662 | non_serialized_params.update({"current_stability": current_stability})
1663 |
1664 | if current_stability >= patience:
1665 | current_stability = 0
1666 | non_serialized_params.update({"current_stability": current_stability})
1667 | current_loss_dec = round(current_loss, 2)
1668 | loss_str = f"{current_loss_dec:.2f}"
1669 | loss_str = loss_str.replace('.', '_')
1670 | new_save = (current_loss_dec-0.1) + 0.01
1671 | non_serialized_params.update({"save_steps_under_loss": new_save})
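    |                             # the threshold drops to ~0.09 below the loss just saved, so the
    |                             # next loss-triggered checkpoint only fires after further improvement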
1672 |
1673 | folder_save = f"checkpoint-{current_steps_offset}-loss-{loss_str}"
1674 | force_save = True
1675 |
1676 |
1677 | else:
1678 | # Reset stability if the loss goes above the threshold
1679 | non_serialized_params.update({"current_stability": 0})
1680 |
1681 | # Save full epochs
1682 | if actual_save_steps>0 and current_epoch_int > non_serialized_params['save_epochs'] and state.global_step > min_steps:
1683 |
1684 |
1685 | current_epoch_offset = current_epoch_int
1686 |
1687 | if non_serialized_params['epoch_offset'] > 0:
1688 | current_epoch_offset = current_epoch_int + round(non_serialized_params['epoch_offset'], 2)
1689 |
1690 | ep_off_str = f"{current_epoch_offset}"
1691 | ep_off_str = ep_off_str.replace('.', '_')
1692 | folder_save = f"checkpoint-{current_steps_offset}-epoch-{ep_off_str}"
1693 |
1694 | non_serialized_params.update({"save_epochs": current_epoch_int})
1695 | force_save = True
1696 |
1697 | # save each actual_save_steps
1698 | if state.global_step > 0 and actual_save_steps > 0 and state.global_step % actual_save_steps == 0:
1699 | folder_save = f"checkpoint-{current_steps_offset}"
1700 | force_save = True
1701 |
1702 | if force_save:
1703 | lora_model.save_pretrained(f"{lora_file_path}/{folder_save}/", safe_serialization = non_serialized_params['safe_serialization'])
1704 | print(f"\033[1;30;40mStep: {tracked.current_steps:6} \033[0;37;0m Saved: [{folder_save}]")
1705 | # Save log
1706 | with open(f"{lora_file_path}/{folder_save}/training_log.json", 'w', encoding='utf-8') as file:
1707 | json.dump(train_log, file, indent=2)
1708 | # == Save training prompt ==
1709 | with open(f"{lora_file_path}/{folder_save}/training_prompt.json", 'w', encoding='utf-8') as file:
1710 | json.dump(train_template, file, indent=2)
1711 |
1712 | epoch_int = int(state.epoch)
1713 | if epoch_int > (stop_at_epoch - 1) and stop_at_epoch > 0:
1714 | control.should_epoch_stop = True
1715 | control.should_training_stop = True
1716 | print(f"{RED}Stop at Epoch {stop_at_epoch} reached.{RESET}")
1717 |
1718 | def on_substep_end(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs):
1719 | tracked.current_steps += 1
1720 | if WANT_INTERRUPT:
1721 | control.should_epoch_stop = True
1722 | control.should_training_stop = True
1723 |
1724 | def on_log(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, logs, **kwargs):
1725 |
1726 | logs["epoch"] = round(state.epoch, 3)
1727 | train_log.update(logs)
1728 |
1729 | current_steps_offset = tracked.current_steps + non_serialized_params['checkpoint_offset']
1730 | current_epoch_offset = train_log.get('epoch', 0.000) + non_serialized_params['epoch_offset']
1731 |
1732 | train_log.update({"current_steps": tracked.current_steps})
1733 | train_log.update({"current_steps_adjusted": current_steps_offset})
1734 | train_log.update({"epoch_adjusted": current_epoch_offset})
1735 |
1736 | if WANT_INTERRUPT:
1737 | print("\033[1;31;1mInterrupted by user\033[0;37;0m")
1738 |
1739 | if non_serialized_params['checkpoint_offset']>0:
1740 | print(f"\033[1;30;40mStep: {tracked.current_steps:6} [+{non_serialized_params['checkpoint_offset']}] \033[0;37;0m", end='')
1741 | else:
1742 | print(f"\033[1;30;40mStep: {tracked.current_steps:6} \033[0;37;0m", end='')
1743 |
1744 | graphentry = {
1745 | 'current_steps': int(train_log.get('current_steps_adjusted',0)),
1746 | 'loss': float(train_log.get('loss', 0.0)),
1747 | 'learning_rate': float(train_log.get('learning_rate', 0.0)),
1748 | 'epoch': float(train_log.get('epoch_adjusted', 0.000))
1749 | }
1750 |
1751 | cur_loss = float(train_log.get('loss', 0.0))
1752 | cur_lr = float(train_log.get('learning_rate', 0.0))
1753 | cur_epoch = float(train_log.get('epoch', 0.000))
1754 |
1755 | if len(statistics['loss']) == 1:
1756 |             # drop the initial zero-loss placeholder entry so the plot starts at the first real value
1757 |             first_value = statistics['loss'][0]['value']
1758 |             if first_value == 0:
1759 | statistics['loss'] = []
1760 |
1761 |
1762 | statistics['loss'].append({'epoch': cur_epoch, 'value': cur_loss})
1763 | statistics['lr'].append({'epoch': cur_epoch, 'value': cur_lr})
1764 |
1765 | # Add the entry to the continuous log
1766 | train_log_graph.append(graphentry)
1767 |
1768 |             # Save the graph log for now; the full graph can be generated from it later
1769 | with open(f"{lora_file_path}/training_graph.json", 'w') as file:
1770 | json.dump(train_log_graph, file, indent=4)
1771 |
1772 | if 'loss' in logs:
1773 | loss = float(logs['loss'])
1774 | if loss <= stop_at_loss and stop_at_loss > 0:
1775 | control.should_epoch_stop = True
1776 | control.should_training_stop = True
1777 | print(f"{RED}Stop Loss {stop_at_loss} reached.{RESET}")
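     |             # stop_at_loss is re-read from non_serialized_params in the monitor loop below,
     |             # so the threshold can be adjusted while training is running.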
1778 |
1779 |
1780 | # FPHAM SAMPLE REQ Transformers error handling
1781 | gradient_accumulation_max = int(train_data.num_rows)//micro_batch_size
1782 |
1783 | if gradient_accumulation_max < gradient_accumulation_steps:
1784 | print(f"{RED}WARNING:{RESET} Current gradient accumulation is {RED}too high{RESET} for the amount of training data.")
1785 | print(f"Gradient accumulation: {gradient_accumulation_steps} should be less than: {gradient_accumulation_max}. {RED}This could crash Accelerate/Transformers{RESET}")
1786 | #min_batchSize = sample_req*micro_batch_size
1787 |         print(f"Preferred fix: {RED}Increase the size of the dataset{RESET}")
1788 |         print(f"... or Decrease Gradient Accumulation {RED}{gradient_accumulation_steps}{RESET} to below {GREEN}{gradient_accumulation_max}{RESET}")
1789 | gradient_accumulation_steps = max(1,gradient_accumulation_max-1)
1790 | print(f"Last resort fix for this run: Lowering Gradient accumulation to {GREEN}{gradient_accumulation_steps}{RESET} [Good luck]")
1791 |
1792 | else:
1793 | print(f"Data Size Check: Gradient accumulation: {YELLOW}{gradient_accumulation_steps}{RESET} <= Blocks/Batch {gradient_accumulation_max} ... {GREEN}[OK]{RESET}")
1794 |
1795 | #END OF FPHAM SAMPLE REQ
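     |     # Illustrative example (made-up numbers): with 1,000 training rows and micro_batch_size = 4,
     |     # gradient_accumulation_max = 1000 // 4 = 250, so gradient_accumulation_steps must stay below 250.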
1796 |
1797 |     # FPHAM Custom Scheduler ==
1798 |     custom_scheduler = False
1799 |     lr_scheduler_type_arg = lr_scheduler_type
1800 |
1801 |     if lr_scheduler_type == 'FP_low_epoch_annealing':
1802 |         custom_scheduler = True
1803 |         lr_scheduler_type_arg = 'cosine'
1804 |     elif lr_scheduler_type == 'FP_half_time_annealing':
1805 |         custom_scheduler = True
1806 |         lr_scheduler_type_arg = 'constant'
1807 |     elif lr_scheduler_type == 'FP_raise_fall_creative':
1808 |         custom_scheduler = True
1809 |         lr_scheduler_type_arg = 'constant_with_warmup'
1810 |     elif lr_scheduler_type == 'FP_3epoch_raise_hold_fall':
1811 |         custom_scheduler = True
1812 |         lr_scheduler_type_arg = 'linear'
1813 |     elif lr_scheduler_type == 'FP_step_decay_with_warmup':
1814 |         custom_scheduler = True
1815 |         lr_scheduler_type_arg = 'cosine_with_restarts'
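     |     # The FP_* schedules themselves are implemented by FPSchedulerTrainer below; the remapped
     |     # lr_scheduler_type_arg is only the underlying type handed to TrainingArguments.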
1816 |
1817 | #gradient_checkpointing=True
1818 | #group_by_length
1819 |
1820 | # Fix training for mixed precision models
1821 | for param in shared.model.parameters():
1822 | if param.requires_grad:
1823 | param.data = param.data.float()
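     |     # Only parameters with requires_grad (typically the LoRA adapter weights) are upcast to
     |     # float32; frozen base-model weights keep their original dtype.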
1824 |
1825 | #lora_model.gradient_checkpointing_enable()
1826 |
1827 | args=transformers.TrainingArguments(
1828 | report_to=report_to if report_to != "None" else None,
1829 | per_device_train_batch_size=micro_batch_size,
1830 | gradient_accumulation_steps=gradient_accumulation_steps,
1831 | warmup_steps=math.ceil(warmup_steps / gradient_accumulation_steps),
1832 |         warmup_ratio=warmup_ratio,
1833 | num_train_epochs=epochs,
1834 | learning_rate=actual_lr,
1835 |         fp16=not shared.args.cpu,
1836 | optim=optimizer,
1837 | logging_steps=1,
1838 | evaluation_strategy="steps" if eval_data is not None else "no",
1839 | eval_steps=math.ceil(eval_steps / gradient_accumulation_steps) if eval_data is not None else None,
1840 | save_strategy="steps" if eval_data is not None else "no",
1841 | output_dir=lora_file_path,
1842 | lr_scheduler_type=lr_scheduler_type_arg,
1843 | load_best_model_at_end=False,
1844 | ddp_find_unused_parameters=None,
1845 | no_cuda=shared.args.cpu,
1846 |         group_by_length=group_by_length,
1847 | gradient_checkpointing=use_grad_checkpoint,
1848 | )
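     |     # Note: warmup_steps and eval_steps above are divided by gradient_accumulation_steps,
     |     # presumably to convert UI values counted in raw batches into the Trainer's optimizer steps.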
1849 |
1850 |     if custom_scheduler:
1851 | trainer = FPSchedulerTrainer(
1852 | neftune_noise_alpha=neft_noise_alpha,
1853 | model=lora_model,
1854 | train_dataset=train_data,
1855 | eval_dataset=eval_data,
1856 | args=args,
1857 | data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
1858 |             callbacks=[Callbacks()]
1859 | )
1860 | elif neft_noise_alpha > 0:
1861 | trainer = FPNEFtuneTrainer(
1862 | neftune_noise_alpha=neft_noise_alpha,
1863 | model=lora_model,
1864 | train_dataset=train_data,
1865 | eval_dataset=eval_data,
1866 | args=args,
1867 | data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
1868 |             callbacks=[Callbacks()]
1869 | )
1870 | else:
1871 | trainer = transformers.Trainer(
1872 | model=lora_model,
1873 | train_dataset=train_data,
1874 | eval_dataset=eval_data,
1875 | args=args,
1876 | data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
1877 |             callbacks=[Callbacks()]
1878 | )
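     |     # Trainer selection: FPSchedulerTrainer for the custom FP_* schedules, FPNEFtuneTrainer when
     |     # only NEFTune noise (neft_noise_alpha > 0) is requested, plain transformers.Trainer otherwise.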
1879 |
1880 | # END OF FPHAM CUSTOM SCHEDULER
1881 |
1882 | lora_model.config.use_cache = False
1883 |
1884 | if torch.__version__ >= "2" and sys.platform != "win32":
1885 | lora_model = torch.compile(lora_model)
1886 |
1887 | # == Save parameters for reuse ==
1888 | with open(f"{lora_file_path}/training_parameters.json", 'w', encoding='utf-8') as file:
1889 |         local_vars = locals()
1890 |         json.dump({x: local_vars[x] for x in PARAMETERS}, file, indent=2)
1891 |
1892 | # == Save training prompt ==
1893 | with open(f"{lora_file_path}/training_prompt.json", 'w', encoding='utf-8') as file:
1894 | json.dump(train_template, file, indent=2)
1895 |
1896 | # == Main run and monitor loop ==
1897 | logger.info("Starting training...")
1898 | yield "Starting...", zero_pd
1899 |
1900 | lora_trainable_param, lora_all_param = calc_trainable_parameters(lora_model)
1901 |
1902 | projections_string = ", ".join([projection.replace("_proj", "") for projection in model_to_lora_modules[model_id]])
1903 |
1904 | print(f"Training '{model_id}' model using {YELLOW}({projections_string}){RESET} projections")
1905 |
1906 | if lora_all_param > 0:
1907 | print(f"Trainable params: {lora_trainable_param:,d} ({RED}{100 * lora_trainable_param / lora_all_param:.4f} %{RESET}), All params: {lora_all_param:,d} (Model: {model_all_params:,d})")
1908 |
1909 |
1910 | train_log.update({"base_model_name": shared.model_name})
1911 | train_log.update({"base_model_class": shared.model.__class__.__name__})
1912 | train_log.update({"base_loaded_in_4bit": getattr(lora_model, "is_loaded_in_4bit", False)})
1913 | train_log.update({"base_loaded_in_8bit": getattr(lora_model, "is_loaded_in_8bit", False)})
1914 | train_log.update({"projections": projections_string})
1915 | if non_serialized_params['checkpoint_offset'] > 0:
1916 | train_log.update({"last_run_steps_offset": non_serialized_params['checkpoint_offset']})
1917 | train_log.update({"last_run_epoch_offset": non_serialized_params['epoch_offset']})
1918 |
1919 |
1920 | if non_serialized_params['checkpoint_offset'] > 0:
1921 |         print(f"Continuing training on {RED}previous adapter{RESET} from epoch: {RED}{non_serialized_params['epoch_offset']}{RESET}")
1922 |
1923 | if stop_at_loss > 0:
1924 | print(f"Monitoring loss {RED}(Auto-Stop at: {stop_at_loss}){RESET}")
1925 |
1926 | if stop_at_epoch > 0:
1927 | print(f"Monitoring Epoch {RED}(Auto-Stop at the end of: {stop_at_epoch}){RESET}")
1928 |
1929 | if WANT_INTERRUPT:
1930 | yield "Interrupted before start.", zero_pd
1931 | return
1932 |
1933 | def log_train_dataset(trainer):
1934 | decoded_entries = []
1935 | # Try to decode the entries and write the log file
1936 | try:
1937 |             # Iterate over the first 10 elements in the dataset (or fewer, if the dataset has fewer than 10)
1938 | for i in range(min(10, len(trainer.train_dataset))):
1939 | decoded_text = shared.tokenizer.decode(trainer.train_dataset[i]['input_ids'])
1940 | decoded_entries.append({"value": decoded_text})
1941 |
1942 | # Write the log file
1943 | Path('logs').mkdir(exist_ok=True)
1944 | with open(Path('logs/train_dataset_sample.json'), 'w') as json_file:
1945 | json.dump(decoded_entries, json_file, indent=4)
1946 |
1947 | logger.info("Log file 'train_dataset_sample.json' created in the 'logs' directory.")
1948 | except Exception as e:
1949 | logger.error(f"Failed to create log file due to error: {e}")
1950 |
1951 | def dump_train_dataset(trainer, remove_SYS):
1952 | decoded_entries = []
1953 | # Try to decode the entries and write the log file
1954 | # Get the current date and time as a string in 'YYYYMMDD_HHMM' format
1955 | mydate = datetime.now().strftime('%Y%m%d_%H%M')
1956 | dfname = f"{mydate}_dataset_dump.json"
1957 | try:
1958 | logger.info("Dumping the current dataset before training starts... Wait ...")
1959 | for i in range(len(trainer.train_dataset)):
1960 | decoded_text = shared.tokenizer.decode(trainer.train_dataset[i]['input_ids'])
1961 |                 decoded_text = decoded_text.replace('<s>', '')  # assumed BOS marker; the literal token strings were lost in extraction
1962 |                 if remove_SYS:
1963 |                     decoded_text = decoded_text.replace('<<SYS>>', '')  # assumed Llama-2 system-prompt opener
1964 |                     decoded_text = decoded_text.replace('<</SYS>>', '')  # assumed Llama-2 system-prompt closer
1965 |                 decoded_text = decoded_text.replace('</s>', '')  # assumed EOS marker
1966 |
1967 | decoded_entries.append({"text": decoded_text})
1968 |
1969 | # Write the log file
1970 | Path('logs').mkdir(exist_ok=True)
1971 | with open(Path(f'logs/{dfname}'), 'w') as json_file:
1972 | json.dump(decoded_entries, json_file, indent=4)
1973 |
1974 |             logger.info(f"The dataset was dumped to '{dfname}' in the 'logs' directory.")
1975 | except Exception as e:
1976 | logger.error(f"Failed to create dump file due to error: {e}")
1977 |
1978 | def threaded_run():
1979 | log_train_dataset(trainer)
1980 |         if non_serialized_params['dump_dataset']:
1981 | dump_train_dataset(trainer, non_serialized_params['dump_dataset_remove_s'])
1982 |
1983 | trainer.train()
1984 |         # Note: save in the thread in case the gradio thread breaks (e.g. browser closed)
1985 | lora_model.save_pretrained(lora_file_path, safe_serialization = non_serialized_params['safe_serialization'])
1986 |         logger.info("LoRA training run completed and the adapter was saved.")
1987 | # Save log
1988 | with open(f"{lora_file_path}/training_log.json", 'w', encoding='utf-8') as file:
1989 | json.dump(train_log, file, indent=2)
1990 |
1991 | thread = threading.Thread(target=threaded_run)
1992 | thread.start()
1993 | last_step = 0
1994 | start_time = time.perf_counter()
1995 |
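     |     # Monitor loop: poll the training thread twice a second, stream the status text and an updated
     |     # loss plot to the UI, and pick up any mid-run changes to the stop conditions.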
1996 | while thread.is_alive():
1997 | time.sleep(0.5)
1998 |
1999 | if statistics['loss']:
2000 | max_value_dict = max(statistics['loss'], key=lambda x: x['value'])
2001 | max_value = max_value_dict['value']+0.4
2002 | first_epoch = statistics['loss'][0]['epoch']
2003 | last_epoch = statistics['loss'][-1]['epoch']
2004 | else:
2005 | max_value = 3.5
2006 | last_epoch = 0
2007 | first_epoch = 0
2008 |
2009 | if WANT_INTERRUPT:
2010 |
2011 | losses = gr.LinePlot.update(
2012 | value = pd.DataFrame(statistics['loss']),
2013 | x="epoch", y="value",
2014 | title="Loss Metrics",
2015 | overlay_point=True, tooltip=["epoch", "value"],
2016 | x_lim=[first_epoch,last_epoch], y_lim=[0,max_value],
2017 | width=500, height=250 )
2018 |
2019 | yield "Interrupting, please wait... *(Run will stop after the current training step completes.)*", losses
2020 |
2021 | elif tracked.current_steps != last_step:
2022 | last_step = tracked.current_steps
2023 | time_elapsed = time.perf_counter() - start_time
2024 | lastloss = float(train_log.get('loss', 0.0))
2025 |
2026 | non_serialized_params.update({"training_loop": True})
2027 |
2028 | if lastloss > 0:
2029 | lastloss_str = f", ... Current Loss: `{lastloss:.2f}`"
2030 | else:
2031 | lastloss_str = ""
2032 |
2033 | if time_elapsed <= 0:
2034 | timer_info = ""
2035 | total_time_estimate = 999
2036 | else:
2037 | its = tracked.current_steps / time_elapsed
2038 | if its > 1:
2039 | timer_info = f"`{its:.2f}` it/s"
2040 | else:
2041 | timer_info = f"`{1.0/its:.2f}` s/it"
2042 |
2043 | total_time_estimate = (1.0 / its) * (tracked.max_steps)
2044 |
2045 | if stop_at_loss != non_serialized_params['stop_at_loss']:
2046 | stop_at_loss = non_serialized_params['stop_at_loss']
2047 | print(f"Stop at loss changed {RED}(Auto-Stop at: {stop_at_loss}){RESET}")
2048 |
2049 | if stop_at_epoch != non_serialized_params['stop_at_epoch']:
2050 | stop_at_epoch = non_serialized_params['stop_at_epoch']
2051 | print(f"Stop at epoch changed {RED}(Auto-Stop at the end of: {stop_at_epoch}){RESET}")
2052 |
2053 | losses = gr.LinePlot.update(
2054 | value = pd.DataFrame(statistics['loss']),
2055 | x="epoch", y="value",
2056 | title="Loss Metrics",
2057 | overlay_point=True, tooltip=["epoch", "value"],
2058 | x_lim=[first_epoch,last_epoch], y_lim=[0,max_value],
2059 | width=500, height=250 )
2060 |
2061 |
2062 |             yield f"Running... **{tracked.current_steps}** / **{tracked.max_steps}** ... {timer_info}, {format_time(time_elapsed)} / {format_time(total_time_estimate)} ... {format_time(total_time_estimate - time_elapsed)} remaining{lastloss_str}", losses
2063 |
2064 |     # Saving in the training thread may have failed if an error occurred, so save again here as a fallback.
2065 |
2066 | #return_pd = pd.DataFrame(statistics['loss'])
2067 |
2068 | if statistics['loss']:
2069 | max_value_dict = max(statistics['loss'], key=lambda x: x['value'])
2070 | max_value = max_value_dict['value']+0.4
2071 | first_epoch = statistics['loss'][0]['epoch']
2072 | last_epoch = statistics['loss'][-1]['epoch']
2073 | else:
2074 | max_value = 3.5
2075 | last_epoch = 0
2076 | first_epoch = 0
2077 |
2078 | return_pd = gr.LinePlot.update(
2079 | value = pd.DataFrame(statistics['loss']),
2080 | x="epoch", y="value",
2081 | title="Loss Metrics",
2082 | overlay_point=True, tooltip=["epoch", "value"],
2083 | x_lim=[first_epoch,last_epoch], y_lim=[0,max_value],
2084 | width=500, height=250)
2085 |
2086 | non_serialized_params.update({"training_loop": False})
2087 |
2088 | if not tracked.did_save:
2089 | logger.info("Training complete, saving...")
2090 | lora_model.save_pretrained(lora_file_path, safe_serialization = non_serialized_params['safe_serialization'])
2091 |
2092 | if WANT_INTERRUPT:
2093 | logger.info("Training interrupted.")
2094 | yield f"Interrupted by user. LoRA saved to `{lora_file_path}`.", return_pd
2095 | else:
2096 | logger.info("Training complete!")
2097 | yield f"Done! LoRA saved to `{lora_file_path}`.\n\nBefore testing your new LoRA, make sure to first reload the model, as it is currently dirty from training.", return_pd
2098 |
2099 | create_graph(lora_file_path, lora_name)
2100 |
2101 | def format_time(seconds: float):
2102 | if seconds < 120:
2103 | return f"`{seconds:.0f}` seconds"
2104 |
2105 | minutes = seconds / 60
2106 | if minutes < 120:
2107 | return f"`{minutes:.0f}` minutes"
2108 |
2109 | hours = minutes / 60
2110 | return f"`{hours:.0f}` hours"
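     | # e.g. format_time(90) -> "`90` seconds", format_time(5400) -> "`90` minutes", format_time(10800) -> "`3` hours"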
2111 |
--------------------------------------------------------------------------------