├── llm_tutorial
│   ├── requirement.txt
│   ├── run_train_gen.sh
│   ├── utils.py
│   ├── ds_config_zero2.json
│   ├── ds_config_zero3.json
│   ├── CustomizedTrainer.py
│   ├── data_utils.py
│   └── demo_trainer.py
├── README.md
├── LICENSE
└── run.sh

/llm_tutorial/requirement.txt:
--------------------------------------------------------------------------------
# recommended under Python 3.10
torch
pytorch_lightning
scikit-learn
transformers
mpi4py  # if you cannot install it with pip, use conda install
deepspeed
accelerate
peft
vllm
sentencepiece
datasets
nltk
metrics
wandb
loguru
protobuf==3.20
--------------------------------------------------------------------------------
/llm_tutorial/run_train_gen.sh:
--------------------------------------------------------------------------------
deepspeed demo_trainer.py \
    --model_name_or_path "/data/LLAMA2_hf/llama_13B" \
    --deepspeed ./ds_config_zero2.json \
    --bf16 \
    --do_train \
    --do_eval \
    --do_predict \
    --mode "generation" \
    --dataset_name cnn_dailymail \
    --dataset_config "3.0.0" \
    --text_column "article" \
    --reference_column "highlights" \
    --source_prefix "summarize: " \
    --output_dir ./output/cnn_dm \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --overwrite_output_dir
--------------------------------------------------------------------------------
/llm_tutorial/utils.py:
--------------------------------------------------------------------------------
import logging
import sys

import datasets
import transformers


def setup_logger(logger_file_path, training_args):
    # Log to stdout by default and mirror everything to a file.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.FileHandler(logger_file_path))
    # Align our logger, datasets, and transformers with the per-process log level
    # chosen by the training arguments.
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
    return logger
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# NLPResearchScaffolding
Scaffolding for NLP researchers to quickly set up a codebase.

Author: Chenghao Yang (chenghao at uchicago dot edu)

## Usage
First install Anaconda corresponding to your system architecture.

Then just run the `run.sh` shell script and you will get the environment set up (see the sketch at the end of this README).

## Current Environment Packages
1. Basic NLP toolkits (NLTK & spaCy), with their package data downloaded.
1. Basic ML and scientific computing toolkits (scipy, scikit-learn).
1. PyTorch (we may add TensorFlow later if there are enough requests and good PRs :-) ).
1. Hugging Face suite (Transformers, Datasets, Evaluate, Accelerate).
1. Large language model utilities (Protobuf, SentencePiece).
1. Logging library (loguru) and Vim setup from [Ultimate Vimrc](https://github.com/amix/vimrc).
1. Metrics (rouge-score).

## TO-DOs
1. Provide an interface to easily select which toolkits to install.
2. Suppress all user confirmation prompts ([y/n]) and make the setup fully automatic.
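
A minimal sketch of the Usage steps above (assuming conda is already installed and on your `PATH`; the project path is a placeholder, matching the placeholder used inside `run.sh`):

```bash
# 1) edit the first line of run.sh so that project_dir points at your project checkout
# 2) run the setup script
bash run.sh
# 3) in later shells, activate the locally created environment
conda activate path/to/your/project/env
```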
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Chenghao (Alan) Yang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/llm_tutorial/ds_config_zero2.json:
--------------------------------------------------------------------------------
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
project_dir=path/to/your/project

# create local environment
cd $project_dir
conda create -p ./env python=3.7
conda activate ./env

# fancy logging package
pip install loguru

# install pytorch with cuda
conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia

# install huggingface
conda install transformers datasets evaluate
# large language model related utils; sentencepiece is useful for tokenizer loading
conda install sentencepiece
# protobuf cannot be installed via conda here; it is pinned to 3.20 because that version is more compatible with released LLMs so far
pip install protobuf==3.20
pip install accelerate
# setup huggingface cache dir
echo "export HF_HOME=${project_dir}/transformers" >> ~/.bashrc
echo "export TRANSFORMERS_CACHE=${project_dir}/transformers" >> ~/.bashrc

# install nltk, spacy, ..., often useful for pre-processing and post-processing
conda install nltk
python -m nltk.downloader all

pip install -U spacy
python -m spacy download en_core_web_sm

# scipy, sklearn..
conda install scipy scikit-learn

# install rouge for summary evaluation
pip install rouge-score

# setup vim (optional)
cd ~/
wget https://raw.githubusercontent.com/amix/vimrc/master/vimrcs/basic.vim
mv basic.vim .vimrc

# let git use vim (optional)
git config --global core.editor "vim"


# setup openai interface for using GPT-3/3.5/...
pip install openai

cd -

--------------------------------------------------------------------------------
/llm_tutorial/ds_config_zero3.json:
--------------------------------------------------------------------------------
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
--------------------------------------------------------------------------------
/llm_tutorial/CustomizedTrainer.py:
--------------------------------------------------------------------------------
from transformers import Trainer
from transformers.utils import (
    is_peft_available
)
from peft import PeftModel
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from transformers.modeling_utils import unwrap_model


class CustomizedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.

        Subclass and override for custom behavior.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
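        # `args.past_index` defaults to -1; it is only set to a non-negative value for models
        # (e.g., Transformer-XL/XLNet) that return a "past" state in their outputs, which is
        # cached here and fed back to the model on the next training step.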
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            if is_peft_available() and isinstance(model, PeftModel):
                model_name = unwrap_model(model.base_model)._get_name()
            else:
                model_name = unwrap_model(model)._get_name()
            if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
                loss = self.label_smoother(outputs, labels, shift_labels=True)
            else:
                loss = self.label_smoother(outputs, labels)
        else:
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss
--------------------------------------------------------------------------------
/llm_tutorial/data_utils.py:
--------------------------------------------------------------------------------
from datasets import load_dataset


def create_dataset(data_args, script_args, training_args, logger):
    if data_args.task_name is not None and data_args.task_family_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.task_family_name,
            data_args.task_name,
            cache_dir=script_args.cache_dir,
        )
    elif data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=script_args.cache_dir,
        )
    else:
        # Loading a dataset from your local files.
        # CSV/JSON training and evaluation files are needed.
        data_files = {"train": data_args.train_file, "validation": data_args.validation_file}

        # Get the test dataset: you can provide your own CSV/JSON test file (see below)
        # when you use `do_predict` without specifying a GLUE benchmark task.
        if training_args.do_predict:
            if data_args.test_file is not None:
                train_extension = data_args.train_file.split(".")[-1]
                test_extension = data_args.test_file.split(".")[-1]
                assert (
                    test_extension == train_extension
                ), "`test_file` should have the same extension (csv or json) as `train_file`."
                data_files["test"] = data_args.test_file
                print(f"set up test files : {data_args.test_file}")
            else:
                raise ValueError("Need either a GLUE task or a test file for `do_predict`.")

        for key in data_files.keys():
            logger.info(f"load a local file for {key}: {data_files[key]}")

        if data_args.train_file.endswith(".csv"):
            # Loading a dataset from local csv files
            raw_datasets = load_dataset(
                "csv",
                data_files=data_files,
                cache_dir=script_args.cache_dir,
            )
        else:
            # Loading a dataset from local json files
            raw_datasets = load_dataset(
                "json",
                data_files=data_files,
                cache_dir=script_args.cache_dir,
            )
    return raw_datasets


def example_input_fn_clf(examples, data_args):
    return examples[data_args.text_column]


def preprocess_function_clf(examples, tokenizer, data_args, label_to_id=None, padding="max_length", compose_input_fn=example_input_fn_clf):
    # extract the input from batched samples
    # used with the normal Trainer
    input_examples = compose_input_fn(examples, data_args)
    # Tokenize the texts
    result = tokenizer(input_examples, padding=padding, max_length=data_args.max_seq_length, truncation=True)

    # Map labels to IDs (not necessary for GLUE tasks)
    if label_to_id is not None and data_args.label_column in examples:
        result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples[data_args.label_column]]
    if data_args.label_column != "label" and "label" in examples:
        result["label"] = examples[data_args.label_column]
        result['original_label'] = examples['label']
    return result


def example_input_fn_gen(examples, data_args, prefix="Please Generate a Response: "):
    inputs, targets = [], []
    for i in range(len(examples[data_args.text_column])):
        if examples[data_args.text_column][i] and examples[data_args.reference_column][i]:
            inputs.append(examples[data_args.text_column][i])
            targets.append(examples[data_args.reference_column][i])

    inputs = [prefix + inp for inp in inputs]
    return inputs, targets


def preprocess_function_gen(examples, tokenizer, data_args, padding='max_length', compose_input_fn=example_input_fn_gen):
    # remove pairs where at least one record is None
    # has to be used with the Seq2SeqTrainer
    inputs, targets = compose_input_fn(examples, data_args)

    model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=targets, max_length=data_args.max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
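    # (-100 is the default ignore_index of PyTorch's cross-entropy loss, so these label
    # positions are skipped when the loss is computed.)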
    if padding == "max_length" and data_args.ignore_pad_token_for_loss:
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
--------------------------------------------------------------------------------
/llm_tutorial/demo_trainer.py:
--------------------------------------------------------------------------------
# Author: Chenghao Yang
# Based on https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization.py
# and https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_classification.py
import logging
import os
from dataclasses import dataclass, field
from typing import Optional

from transformers import AutoModelForCausalLM, HfArgumentParser, Trainer, Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, AutoConfig
# seq2seq training_args is a subclass of training_args
from transformers import AutoTokenizer, default_data_collator, DataCollatorForSeq2Seq

from data_utils import create_dataset, preprocess_function_clf, preprocess_function_gen
from utils import setup_logger


@dataclass
class DataArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """
    preprocessing_num_workers: Optional[int] = field(
        default=10,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    task_family_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the task_family (e.g., glue)"},
    )
    task_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the task to train on under task_family (e.g., mnli)"},
    )
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the training data."}
    )
    validation_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the validation data."}
    )
    test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."})
    max_seq_length: int = field(
        default=768,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    text_column: Optional[str] = field(default='text', metadata={
        "help": "The name of the column in the datasets containing the full texts."})
    label_column: Optional[str] = field(default='label', metadata={
        "help": "The name of the column in the datasets containing the labels."})
    # for generation tasks only
    reference_column: Optional[str] = field(
        default=None,
        metadata={
            "help": "The name of the column in the datasets containing the reference (e.g., summary, translation)."},
    )
    max_source_length: Optional[int] = field(
        default=1024,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    max_target_length: Optional[int] = field(
        default=300,
        metadata={
            "help": (
                "The maximum total sequence length for target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
                "which is used during ``evaluate`` and ``predict``."
            )
        },
    )
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata={
            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
        },
    )
    source_prefix: Optional[str] = field(
        default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
    )
    temperature: Optional[float] = field(
        default=1.0,
        metadata={"help": "temperature for decoding"},
    )
    top_k: Optional[int] = field(
        default=50,
        metadata={"help": "top_k for decoding"},
    )
    top_p: Optional[float] = field(
        default=0.95,
        metadata={"help": "top_p for decoding"},
    )
    min_length: Optional[int] = field(
        default=250,
        metadata={"help": "min_length for decoding"},
    )
    val_max_target_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "The maximum total sequence length for validation target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. "
                "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
                "during ``evaluate`` and ``predict``."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=100,
        metadata={"help": "max number of training samples being used"},
    )
    max_eval_samples: Optional[int] = field(
        default=100,
        metadata={"help": "max number of evaluation samples being used"},
    )
    # referenced in the `do_predict` branch below; None keeps the full test set
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={"help": "max number of prediction samples being used"},
    )

    def __post_init__(self):
        if self.train_file is None or self.validation_file is None:
            # raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.")
            assert self.dataset_name is not None or (
                self.task_name is not None and self.task_family_name is not None), "Need a dataset to train the model"
        else:
            train_extension = self.train_file.split(".")[-1]
            assert train_extension in ["csv",
                                       "json"], f"`train_file` ({train_extension}) should be a csv or a json file."
            validation_extension = self.validation_file.split(".")[-1]
            assert (
                validation_extension == train_extension
            ), f"`validation_file` should have the same extension (csv or json, now {validation_extension} != {train_extension}) as `train_file`."


@dataclass
class ScriptArguments:
    """
    Script-related Arguments
    """
    model_name_or_path: Optional[str] = field(
        default="/net/projects/veitch/LLMs/llama2-based-models/llama2-hf/Llama-2-13b-hf",
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
    )
    exp_name: Optional[str] = field(
        default="llama2-7b-peft-768",
        metadata={"help": "The name of the experiment, used to name the output directory."},
    )
    logger_file_path: Optional[str] = field(
        default="./log/train.log",
        metadata={"help": "The name of the logger file."},
    )
    mode: Optional[str] = field(
        default="classification",
        metadata={"help": "The mode of the experiment, either classification or generation."},
    )
    cache_dir: Optional[str] = field(
        default="./cache",
        metadata={"help": "The name of the cache directory."},
    )
    prediction_file_path: Optional[str] = field(
        default="./prediction.txt",
        metadata={"help": "The name of the prediction output file."},
    )

    def __post_init__(self):
        os.makedirs(self.cache_dir, exist_ok=True)
        os.makedirs(os.path.dirname(self.logger_file_path), exist_ok=True)
        assert self.mode in ["classification",
                             "generation"], f"mode should be either classification or generation, now {self.mode} is not supported."
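
# All fields of DataArguments and ScriptArguments above, together with the stock
# Seq2SeqTrainingArguments, are exposed as command-line flags by the HfArgumentParser call
# below (see run_train_gen.sh for an example invocation).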


if __name__ == '__main__':
    # see https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments for a complete
    # list of arguments
    parser = HfArgumentParser((ScriptArguments, DataArguments, Seq2SeqTrainingArguments))
    script_args, data_args, training_args = parser.parse_args_into_dataclasses()

    logger = setup_logger(script_args.logger_file_path, training_args)
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.warning(f"Training/evaluation parameters {training_args}")
    logger.warning(f"Script parameters {script_args}")
    logger.warning(f"Data parameters {data_args}")

    # Step-0: create the config and tokenizer (strings -> token ids)
    model_config = AutoConfig.from_pretrained(script_args.model_name_or_path)
    padding_side = "left" if model_config.is_decoder else "right"

    tokenizer = AutoTokenizer.from_pretrained(script_args.model_name_or_path, padding_side=padding_side)
    # for llama specifically: llama has no pad token by default
    if "llama" in script_args.model_name_or_path:
        tokenizer.pad_token = tokenizer.eos_token
    # step-1: load dataset
    dataset = create_dataset(data_args=data_args, script_args=script_args, training_args=training_args, logger=logger)
    # step-2: preprocessing
    # get metadata information
    column_names = dataset["train"].column_names
    if script_args.mode == "classification":
        label_list = dataset["train"].unique(data_args.label_column)
        label_list.sort()  # Let's sort it for determinism
        num_labels = len(label_list)
    if script_args.mode == "generation":
        dataset = dataset.map(
            lambda examples: preprocess_function_gen(examples, data_args=data_args, tokenizer=tokenizer),
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )
    elif script_args.mode == "classification":
        dataset = dataset.map(
            lambda examples: preprocess_function_clf(examples, data_args=data_args, tokenizer=tokenizer),
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )
    else:
        raise NotImplementedError(f"mode {script_args.mode} is not supported")
    # step-3: create the model, first on CPU
    if script_args.mode == "classification":
        model = AutoModelForSequenceClassification.from_pretrained(
            script_args.model_name_or_path,
            num_labels=num_labels,
        )
        # logging.warning(f"model name: {script_args.model_name_or_path} loaded as AutoModelForSequenceClassification")
    elif script_args.mode == "generation":
        try:
            # decoder-only
            model = AutoModelForCausalLM.from_pretrained(script_args.model_name_or_path)
        except Exception:
            # seq2seq
            model = AutoModelForSeq2SeqLM.from_pretrained(script_args.model_name_or_path)
    # step-4: create the trainer
    if script_args.mode == "classification":
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"] if "validation" in dataset else dataset["test"],
            tokenizer=tokenizer,
            data_collator=default_data_collator,
        )
    else:
        label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
            label_pad_token_id=label_pad_token_id,
            pad_to_multiple_of=8 if training_args.fp16 else None,
        )
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=dataset['train'] if training_args.do_train else None,
            eval_dataset=dataset['validation'] if training_args.do_eval else None,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )

    if training_args.do_train:
        train_result = trainer.train()
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics
        train_dataset = dataset['train']
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    results = {}
    max_length = (
        training_args.generation_max_length
        if training_args.generation_max_length is not None
        else data_args.val_max_target_length
    )
    num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval")
        eval_dataset = dataset['validation'] if "validation" in dataset else dataset['test']
        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")
        predict_dataset = dataset['test']
        # the decoding hyper-parameters below are forwarded to model.generate by Seq2SeqTrainer.predict
        predict_results = trainer.predict(
            predict_dataset, metric_key_prefix="predict", max_length=max_length, num_beams=num_beams,
            temperature=data_args.temperature,
            top_k=data_args.top_k,
            top_p=data_args.top_p,
            min_new_tokens=data_args.min_length
        )
        metrics = predict_results.metrics
        max_predict_samples = (
            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
        )
        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)
        reshaped_predictions = predict_results.predictions.reshape(-1, predict_results.predictions.shape[-1])
        if trainer.is_world_process_zero():
            predictions = tokenizer.batch_decode(
                reshaped_predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
            predictions = [pred.strip() for pred in predictions]
            with open(script_args.prediction_file_path, "w") as writer:
                writer.write("\n\n\n".join(predictions))

    kwargs = {"finetuned_from": script_args.model_name_or_path, "tasks": data_args.task_name}
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset_args"] = data_args.dataset_config_name
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name


    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
--------------------------------------------------------------------------------
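
The repository only ships a launch script for the generation path (run_train_gen.sh). A hypothetical launch for the classification path of demo_trainer.py, assuming local CSV files with `text`/`label` columns (the model name and file paths below are placeholders), could look like the following sketch; note that the evaluation and prediction branches of the script pass generation-specific arguments (max_length, num_beams, temperature, ...), so they are geared toward the generation mode:

    deepspeed demo_trainer.py \
        --model_name_or_path roberta-base \
        --deepspeed ./ds_config_zero2.json \
        --mode "classification" \
        --train_file ./data/train.csv \
        --validation_file ./data/dev.csv \
        --text_column "text" \
        --label_column "label" \
        --max_seq_length 256 \
        --do_train \
        --output_dir ./output/clf \
        --per_device_train_batch_size 8 \
        --overwrite_output_dir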