├── .gitattributes
├── requirements.txt
├── starjob130k.json
├── README.md
└── train_llama_3.py

/.gitattributes:
--------------------------------------------------------------------------------
*.json filter=lfs diff=lfs merge=lfs -text

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
transformers==4.44.2
wandb==0.18.0
torch==2.4.1
trl==0.10.1
datasets==2.21.0
unsloth==2024.8
numpy==1.26.4

--------------------------------------------------------------------------------
/starjob130k.json:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:c9ec14ef703a5aa4cc360fe82d8f806fbe03cbee5846a8ff34c503baf6290dd1
size 1907297770

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# Starjob: A Dataset Designed to Train LLMs on JSSP

[![Hugging Face](https://img.shields.io/badge/HuggingFace-Dataset-yellow?logo=huggingface&logoColor=white)](https://huggingface.co/datasets/henri24/Starjob)

The dataset is available on [Hugging Face](https://huggingface.co/datasets/henri24/Starjob).

## Dataset Overview

**Dataset Name:** starjob130k.json
**Number of Entries:** 130,000
**Number of Fields:** 6

## Fields Description

1. **num_jobs**
   - **Type:** int64
   - **Number of Unique Values:** 16

2. **num_machines**
   - **Type:** int64
   - **Number of Unique Values:** 16

3. **instruction**
   - **Type:** object
   - **Number of Unique Values:** 130,000
   - **Initial description of the problem, detailing the number of jobs and machines involved.**

4. **input**
   - **Type:** object
   - **Number of Unique Values:** 130,000
   - **Description of the problem in natural-language (LLM) format**

5. **output**
   - **Type:** object
   - **Number of Unique Values:** 130,000
   - **Solution in natural-language (LLM) format**

6. **matrix**
   - **Type:** object
   - **Number of Unique Values:** 130,000
   - **Input problem, OR-Tools makespan, and solution in matrix format**

## Usage

This dataset can be used to train LLMs on the job-shop scheduling problem (JSSP). Each entry provides the number of jobs, the number of machines, a natural-language description of the problem, and its solution.
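
For a quick look at the data, the JSON file can be loaded with the `datasets` library. This is a minimal sketch, assuming the LFS-tracked `starjob130k.json` has been pulled into the repository root:

```python
from datasets import load_dataset

# Load the local JSON file (adjust the path if the file lives elsewhere).
dataset = load_dataset("json", data_files="starjob130k.json", split="train")

print(dataset)  # column names and number of rows

example = dataset[0]
print(example["num_jobs"], example["num_machines"])
print(example["instruction"])    # natural-language problem statement
print(example["output"][:500])   # beginning of the natural-language solution
```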

# Setting Up Your Python Environment

Follow these instructions to create a virtual environment and install the required libraries.

## Step 1: Create a Virtual Environment

```bash
python3 -m venv llm_env
```

## Step 2: Activate the Virtual Environment

After creating the virtual environment, activate it with the command for your platform.

On Windows:
```bash
.\llm_env\Scripts\activate
```

On macOS and Linux:
```bash
source llm_env/bin/activate
```

## Step 3: Install the Required Libraries

```bash
pip install -r requirements.txt
```

# Training

Make sure to put the dataset JSON file (e.g., `starjob130k.json`) in the `data/` directory, then run:

```bash
python train_llama_3.py
```
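
Every hyperparameter defined in `train_llama_3.py` (LoRA rank and alpha, batch size, sequence length, learning rate, and so on) can also be overridden from the command line. The flags below come from the script's argument parser; the values are only illustrative:

```bash
python train_llama_3.py \
    --max_seq_length 20000 \
    --lora_r 64 \
    --lora_alpha 64 \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 8 \
    --num_train_epochs 2 \
    --learning_rate 2e-4 \
    --output_dir output_starjob_run
```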

## License

This dataset is licensed under the Creative Commons Attribution-ShareAlike 4.0 International license (CC BY-SA 4.0). For more details, see the [license description](https://creativecommons.org/licenses/by-sa/4.0/). The dataset will remain accessible for an extended period.

--------------------------------------------------------------------------------
/train_llama_3.py:
--------------------------------------------------------------------------------
import argparse

import torch
import wandb
from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments


def main():
    # Set up the argument parser
    parser = argparse.ArgumentParser(description="Train a FastLanguageModel with specified parameters.")

    # Model and data parameters
    parser.add_argument('--max_seq_length', type=int, default=50000, help='Maximum sequence length')
    parser.add_argument('--dtype', type=str, default='bfloat16', choices=['bfloat16', 'float16'], help='Data type (bfloat16 or float16)')
    parser.add_argument('--load_in_4bit', action='store_true', default=True, help='Use 4-bit quantization to reduce memory usage')

    # LoRA hyperparameters
    parser.add_argument('--lora_r', type=int, default=64, help='Rank of the LoRA decomposition')
    parser.add_argument('--lora_alpha', type=int, default=64, help='Scaling factor for LoRA updates')
    parser.add_argument('--lora_dropout', type=float, default=0.0, help='Dropout rate for LoRA layers')
    parser.add_argument('--bias', type=str, default='none', choices=['none', 'all', 'lora_only'], help='Bias type')

    # Additional configurations
    parser.add_argument('--use_gradient_checkpointing', type=str, default='unsloth', help='Use gradient checkpointing')
    parser.add_argument('--random_state', type=int, default=42, help='Random state for reproducibility')
    parser.add_argument('--use_rslora', action='store_true', default=False, help='Use rank-stabilized LoRA (RSLoRA)')
    parser.add_argument('--loftq_config', type=str, default=None, help='LoftQ configuration')

    # Training hyperparameters
    parser.add_argument('--per_device_train_batch_size', type=int, default=4, help='Batch size per device during training')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=4, help='Number of gradient accumulation steps')
    parser.add_argument('--warmup_steps', type=int, default=5, help='Number of warmup steps')
    parser.add_argument('--num_train_epochs', type=int, default=2, help='Number of training epochs')
    parser.add_argument('--learning_rate', type=float, default=2e-4, help='Learning rate')
    parser.add_argument('--logging_steps', type=int, default=1, help='Logging steps')
    parser.add_argument('--optim', type=str, default='adamw_8bit', help='Optimizer')
    parser.add_argument('--weight_decay', type=float, default=0.01, help='Weight decay')
    parser.add_argument('--lr_scheduler_type', type=str, default='linear', help='Learning rate scheduler type')
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    parser.add_argument('--save_total_limit', type=int, default=50, help='Total save limit for model checkpoints')
    parser.add_argument('--save_step', type=int, default=200, help='Step interval at which to save model checkpoints')
    parser.add_argument('--per_device_eval_batch_size', type=int, default=2, help='Batch size per device during evaluation')
    parser.add_argument('--train_lm_head', action='store_true', default=False, help='Whether to train the language model head')
    parser.add_argument('--train_embed_tokens', action='store_true', default=False, help='Whether to train the embedding tokens')

    # Output directory
    parser.add_argument('--output_dir', type=str, default=None, help='Output directory name')

    args = parser.parse_args()
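
    # With the defaults above, each optimizer step processes
    # per_device_train_batch_size * gradient_accumulation_steps = 4 * 4 = 16
    # samples per GPU; adjust the two flags together to trade memory for throughput.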

    # =========================
    # Generate Output Directory Name
    # =========================

    # Create an output directory name based on the hyperparameters
    if args.output_dir is None:
        dir_out = f"output_alpha{args.lora_alpha}_r{args.lora_r}_train_lm_head{args.train_lm_head}_train_embed_tok_{args.train_embed_tokens}_seq{args.max_seq_length}_b{args.per_device_train_batch_size}_ep{args.num_train_epochs}"
    else:
        dir_out = args.output_dir

    # =========================
    # Initialize WandB
    # =========================

    # Initialize Weights & Biases for experiment tracking
    wandb.init(
        project="llama3-jssp-clean",  # Change the project name if needed
        name=dir_out,
    )

    # =========================
    # Load Model and Tokenizer
    # =========================

    # Set dtype
    dtype = torch.bfloat16 if args.dtype == 'bfloat16' else torch.float16

    # Load the pre-trained model and tokenizer
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
        max_seq_length=args.max_seq_length,
        dtype=dtype,
        load_in_4bit=args.load_in_4bit,
    )

    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ]
    if args.train_lm_head:
        target_modules.append('lm_head')
    if args.train_embed_tokens:
        target_modules.append('embed_tokens')

    # Configure the model with PEFT (Parameter-Efficient Fine-Tuning)
    model = FastLanguageModel.get_peft_model(
        model,
        r=args.lora_r,
        target_modules=target_modules,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        bias=args.bias,
        use_gradient_checkpointing=args.use_gradient_checkpointing,
        random_state=args.random_state,
        use_rslora=args.use_rslora,
        loftq_config=args.loftq_config,
    )

    # Define the Alpaca-style prompt template
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
    EOS_TOKEN = tokenizer.eos_token

    def formatting_prompts_func(examples):
        instructions = examples["instruction"]
        inputs = examples["input"]
        outputs = examples["output"]
        texts = []
        for instruction, input_text, output in zip(instructions, inputs, outputs):
            text = alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN
            texts.append(text)
        return {"text": texts}

    # =========================
    # Load and Prepare Dataset
    # =========================

    # The dataset JSON file is expected in the ./data/ directory
    dataset = load_dataset('./data/', split="train")
    split_dataset = dataset.train_test_split(test_size=0.02, seed=args.seed)
    train_dataset = split_dataset['train'].map(formatting_prompts_func, batched=True)
    eval_dataset = split_dataset['test'].map(formatting_prompts_func, batched=True)

    # =========================
    # Initialize the Trainer
    # =========================

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field="text",
        max_seq_length=args.max_seq_length,
        dataset_num_proc=20,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=args.per_device_train_batch_size,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            warmup_steps=args.warmup_steps,
            num_train_epochs=args.num_train_epochs,
            learning_rate=args.learning_rate,
            # fp16=True,
            bf16=is_bfloat16_supported(),
            logging_steps=args.logging_steps,
            optim=args.optim,
            weight_decay=args.weight_decay,
            lr_scheduler_type=args.lr_scheduler_type,
            seed=args.seed,
            output_dir=dir_out,
            report_to="wandb",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            save_total_limit=args.save_total_limit,
            save_steps=args.save_step,
            eval_strategy="steps",
            eval_steps=args.save_step,
            per_device_eval_batch_size=args.per_device_eval_batch_size,
        ),
    )

    # =========================
    # Monitor GPU Memory Usage
    # =========================

    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    # =========================
    # Start Training
    # =========================

    trainer_stats = trainer.train()


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------