├── llm_tutorial
│   ├── requirement.txt
│   ├── run_train_gen.sh
│   ├── utils.py
│   ├── ds_config_zero2.json
│   ├── ds_config_zero3.json
│   ├── CustomizedTrainer.py
│   ├── data_utils.py
│   └── demo_trainer.py
├── README.md
├── LICENSE
└── run.sh

/llm_tutorial/requirement.txt:
--------------------------------------------------------------------------------
# recommended under Python 3.10
torch
pytorch_lightning
scikit-learn
transformers
mpi4py  # if you cannot install it with pip, use conda install
deepspeed
accelerate
peft
vllm
sentencepiece
datasets
nltk
metrics
wandb
loguru
protobuf==3.20
--------------------------------------------------------------------------------
/llm_tutorial/run_train_gen.sh:
--------------------------------------------------------------------------------
deepspeed demo_trainer.py \
    --model_name_or_path "/data/LLAMA2_hf/llama_13B" \
    --deepspeed ./ds_config_zero2.json \
    --bf16 \
    --do_train \
    --do_eval \
    --do_predict \
    --mode "generation" \
    --dataset_name cnn_dailymail \
    --dataset_config "3.0.0" \
    --text_column "article" \
    --reference_column "highlights" \
    --source_prefix "summarize: " \
    --output_dir ./output/cnn_dm \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --overwrite_output_dir
--------------------------------------------------------------------------------
/llm_tutorial/utils.py:
--------------------------------------------------------------------------------
import logging
import sys

import datasets
import transformers


def setup_logger(logger_file_path, training_args):
    # Log to stdout by default and mirror everything to a file.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.FileHandler(logger_file_path))
    # Align our logger, datasets, and transformers with the per-process log level
    # chosen by the training arguments.
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
    return logger
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# NLPResearchScaffolding
Scaffolding for NLP researchers to quickly set up a codebase.

Author: Chenghao Yang (chenghao at uchicago dot edu)

## Usage
First install Anaconda corresponding to your system architecture.

Then just run the `run.sh` shell script and you will get the environment set up (see the sketch at the end of this README).

## Current Environment Packages
1. Basic NLP toolkits (NLTK & spaCy), with their package data downloaded.
1. Basic ML and scientific computing toolkits (scipy, scikit-learn).
1. PyTorch (we may add TensorFlow later if there are enough requests and good PRs :-) ).
1. Hugging Face suite (Transformers, Datasets, Evaluate, Accelerate).
1. Large language model utilities (Protobuf, SentencePiece).
1. Logging library (loguru) and Vim setup from [Ultimate Vimrc](https://github.com/amix/vimrc).
1. Metrics (rouge-score).

## TO-DOs
1. Provide an interface to easily select which toolkits to install.
2. Suppress all user confirmation prompts ([y/n]) and make the setup fully automatic.
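
A minimal sketch of the Usage steps above (assuming conda is already installed and on your `PATH`; the project path is a placeholder, matching the placeholder used inside `run.sh`):

```bash
# 1) edit the first line of run.sh so that project_dir points at your project checkout
# 2) run the setup script
bash run.sh
# 3) in later shells, activate the locally created environment
conda activate path/to/your/project/env
```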
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Chenghao (Alan) Yang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/llm_tutorial/ds_config_zero2.json:
--------------------------------------------------------------------------------
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
project_dir=path/to/your/project

# create local environment
cd $project_dir
conda create -p ./env python=3.7
conda activate ./env

# fancy logging package
pip install loguru

# install pytorch with cuda
conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia

# install huggingface
conda install transformers datasets evaluate
# large language model related utils; sentencepiece is useful for tokenizer loading
conda install sentencepiece
# protobuf cannot be installed via conda here; it is pinned to 3.20 because that version is more compatible with released LLMs so far
pip install protobuf==3.20
pip install accelerate
# setup huggingface cache dir
echo "export HF_HOME=${project_dir}/transformers" >> ~/.bashrc
echo "export TRANSFORMERS_CACHE=${project_dir}/transformers" >> ~/.bashrc

# install nltk, spacy, ..., often useful for pre-processing and post-processing
conda install nltk
python -m nltk.downloader all

pip install -U spacy
python -m spacy download en_core_web_sm

# scipy, sklearn..
conda install scipy scikit-learn

# install rouge for summary evaluation
pip install rouge-score

# setup vim (optional)
cd ~/
wget https://raw.githubusercontent.com/amix/vimrc/master/vimrcs/basic.vim
mv basic.vim .vimrc

# let git use vim (optional)
git config --global core.editor "vim"


# setup openai interface for using GPT-3/3.5/...
pip install openai

cd -

--------------------------------------------------------------------------------
/llm_tutorial/ds_config_zero3.json:
--------------------------------------------------------------------------------
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
--------------------------------------------------------------------------------
/llm_tutorial/CustomizedTrainer.py:
--------------------------------------------------------------------------------
from transformers import Trainer
from transformers.utils import (
    is_peft_available
)
from peft import PeftModel
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from transformers.modeling_utils import unwrap_model


class CustomizedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.

        Subclass and override for custom behavior.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
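        # `args.past_index` defaults to -1; it is only set to a non-negative value for models
        # (e.g., Transformer-XL/XLNet) that return a "past" state in their outputs, which is
        # cached here and fed back to the model on the next training step.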
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            if is_peft_available() and isinstance(model, PeftModel):
                model_name = unwrap_model(model.base_model)._get_name()
            else:
                model_name = unwrap_model(model)._get_name()
            if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
                loss = self.label_smoother(outputs, labels, shift_labels=True)
            else:
                loss = self.label_smoother(outputs, labels)
        else:
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss
--------------------------------------------------------------------------------
/llm_tutorial/data_utils.py:
--------------------------------------------------------------------------------
from datasets import load_dataset


def create_dataset(data_args, script_args, training_args, logger):
    if data_args.task_name is not None and data_args.task_family_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.task_family_name,
            data_args.task_name,
            cache_dir=script_args.cache_dir,
        )
    elif data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=script_args.cache_dir,
        )
    else:
        # Loading a dataset from your local files.
        # CSV/JSON training and evaluation files are needed.
        data_files = {"train": data_args.train_file, "validation": data_args.validation_file}

        # Get the test dataset: you can provide your own CSV/JSON test file (see below)
        # when you use `do_predict` without specifying a GLUE benchmark task.
        if training_args.do_predict:
            if data_args.test_file is not None:
                train_extension = data_args.train_file.split(".")[-1]
                test_extension = data_args.test_file.split(".")[-1]
                assert (
                    test_extension == train_extension
                ), "`test_file` should have the same extension (csv or json) as `train_file`."
                data_files["test"] = data_args.test_file
                print(f"set up test files : {data_args.test_file}")
            else:
                raise ValueError("Need either a GLUE task or a test file for `do_predict`.")

        for key in data_files.keys():
            logger.info(f"load a local file for {key}: {data_files[key]}")

        if data_args.train_file.endswith(".csv"):
            # Loading a dataset from local csv files
            raw_datasets = load_dataset(
                "csv",
                data_files=data_files,
                cache_dir=script_args.cache_dir,
            )
        else:
            # Loading a dataset from local json files
            raw_datasets = load_dataset(
                "json",
                data_files=data_files,
                cache_dir=script_args.cache_dir,
            )
    return raw_datasets


def example_input_fn_clf(examples, data_args):
    return examples[data_args.text_column]


def preprocess_function_clf(examples, tokenizer, data_args, label_to_id=None, padding="max_length", compose_input_fn=example_input_fn_clf):
    # extract the input from batched samples
    # used with the normal Trainer
    input_examples = compose_input_fn(examples, data_args)
    # Tokenize the texts
    result = tokenizer(input_examples, padding=padding, max_length=data_args.max_seq_length, truncation=True)

    # Map labels to IDs (not necessary for GLUE tasks)
    if label_to_id is not None and data_args.label_column in examples:
        result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples[data_args.label_column]]
    if data_args.label_column != "label" and "label" in examples:
        result["label"] = examples[data_args.label_column]
        result['original_label'] = examples['label']
    return result


def example_input_fn_gen(examples, data_args, prefix="Please Generate a Response: "):
    inputs, targets = [], []
    for i in range(len(examples[data_args.text_column])):
        if examples[data_args.text_column][i] and examples[data_args.reference_column][i]:
            inputs.append(examples[data_args.text_column][i])
            targets.append(examples[data_args.reference_column][i])

    inputs = [prefix + inp for inp in inputs]
    return inputs, targets


def preprocess_function_gen(examples, tokenizer, data_args, padding='max_length', compose_input_fn=example_input_fn_gen):
    # remove pairs where at least one record is None
    # has to be used with the Seq2SeqTrainer
    inputs, targets = compose_input_fn(examples, data_args)

    model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=targets, max_length=data_args.max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
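    # (-100 is the default ignore_index of PyTorch's cross-entropy loss, so these label
    # positions are skipped when the loss is computed.)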
    if padding == "max_length" and data_args.ignore_pad_token_for_loss:
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
--------------------------------------------------------------------------------
/llm_tutorial/demo_trainer.py:
--------------------------------------------------------------------------------
# Author: Chenghao Yang
# Based on https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization.py
# and https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_classification.py
import logging
import os
from dataclasses import dataclass, field
from typing import Optional

from transformers import AutoModelForCausalLM, HfArgumentParser, Trainer, Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, AutoConfig
# seq2seq training_args is a subclass of training_args
from transformers import AutoTokenizer, default_data_collator, DataCollatorForSeq2Seq

from data_utils import create_dataset, preprocess_function_clf, preprocess_function_gen
from utils import setup_logger


@dataclass
class DataArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """
    preprocessing_num_workers: Optional[int] = field(
        default=10,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    task_family_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the task_family (e.g., glue)"},
    )
    task_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the task to train on under task_family (e.g., mnli)"},
    )
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the training data."}
    )
    validation_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the validation data."}
    )
    test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."})
    max_seq_length: int = field(
        default=768,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    text_column: Optional[str] = field(default='text', metadata={
        "help": "The name of the column in the datasets containing the full texts."})
    label_column: Optional[str] = field(default='label', metadata={
        "help": "The name of the column in the datasets containing the labels."})
    # for generation tasks only
    reference_column: Optional[str] = field(
        default=None,
        metadata={
            "help": "The name of the column in the datasets containing the reference (e.g., summary, translation)."},
    )
    max_source_length: Optional[int] = field(
        default=1024,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    max_target_length: Optional[int] = field(
        default=300,
        metadata={
            "help": (
                "The maximum total sequence length for target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
                "which is used during ``evaluate`` and ``predict``."
            )
        },
    )
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata={
            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
        },
    )
    source_prefix: Optional[str] = field(
        default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
    )
    temperature: Optional[float] = field(
        default=1.0,
        metadata={"help": "temperature for decoding"},
    )
    top_k: Optional[int] = field(
        default=50,
        metadata={"help": "top_k for decoding"},
    )
    top_p: Optional[float] = field(
        default=0.95,
        metadata={"help": "top_p for decoding"},
    )
    min_length: Optional[int] = field(
        default=250,
        metadata={"help": "min_length for decoding"},
    )
    val_max_target_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "The maximum total sequence length for validation target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. "
                "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
                "during ``evaluate`` and ``predict``."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=100,
        metadata={"help": "max number of training samples being used"},
    )
    max_eval_samples: Optional[int] = field(
        default=100,
        metadata={"help": "max number of evaluation samples being used"},
    )
    # referenced in the `do_predict` branch below; None keeps the full test set
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={"help": "max number of prediction samples being used"},
    )

    def __post_init__(self):
        if self.train_file is None or self.validation_file is None:
            # raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.")
            assert self.dataset_name is not None or (
                self.task_name is not None and self.task_family_name is not None), "Need a dataset to train the model"
        else:
            train_extension = self.train_file.split(".")[-1]
            assert train_extension in ["csv",
                                       "json"], f"`train_file` ({train_extension}) should be a csv or a json file."
            validation_extension = self.validation_file.split(".")[-1]
            assert (
                validation_extension == train_extension
            ), f"`validation_file` should have the same extension (csv or json, now {validation_extension} != {train_extension}) as `train_file`."


@dataclass
class ScriptArguments:
    """
    Script-related Arguments
    """
    model_name_or_path: Optional[str] = field(
        default="/net/projects/veitch/LLMs/llama2-based-models/llama2-hf/Llama-2-13b-hf",
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
    )
    exp_name: Optional[str] = field(
        default="llama2-7b-peft-768",
        metadata={"help": "The name of the experiment, used to name the output directory."},
    )
    logger_file_path: Optional[str] = field(
        default="./log/train.log",
        metadata={"help": "The name of the logger file."},
    )
    mode: Optional[str] = field(
        default="classification",
        metadata={"help": "The mode of the experiment, either classification or generation."},
    )
    cache_dir: Optional[str] = field(
        default="./cache",
        metadata={"help": "The name of the cache directory."},
    )
    prediction_file_path: Optional[str] = field(
        default="./prediction.txt",
        metadata={"help": "The name of the prediction output file."},
    )

    def __post_init__(self):
        os.makedirs(self.cache_dir, exist_ok=True)
        os.makedirs(os.path.dirname(self.logger_file_path), exist_ok=True)
        assert self.mode in ["classification",
                             "generation"], f"mode should be either classification or generation, now {self.mode} is not supported."
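
# All fields of DataArguments and ScriptArguments above, together with the stock
# Seq2SeqTrainingArguments, are exposed as command-line flags by the HfArgumentParser call
# below (see run_train_gen.sh for an example invocation).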


if __name__ == '__main__':
    # see https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments for a complete
    # list of arguments
    parser = HfArgumentParser((ScriptArguments, DataArguments, Seq2SeqTrainingArguments))
    script_args, data_args, training_args = parser.parse_args_into_dataclasses()

    logger = setup_logger(script_args.logger_file_path, training_args)
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.warning(f"Training/evaluation parameters {training_args}")
    logger.warning(f"Script parameters {script_args}")
    logger.warning(f"Data parameters {data_args}")

    # Step-0: create the config and tokenizer (strings -> token ids)
    model_config = AutoConfig.from_pretrained(script_args.model_name_or_path)
    padding_side = "left" if model_config.is_decoder else "right"

    tokenizer = AutoTokenizer.from_pretrained(script_args.model_name_or_path, padding_side=padding_side)
    # for llama specifically: llama has no pad token by default
    if "llama" in script_args.model_name_or_path:
        tokenizer.pad_token = tokenizer.eos_token
    # step-1: load dataset
    dataset = create_dataset(data_args=data_args, script_args=script_args, training_args=training_args, logger=logger)
    # step-2: preprocessing
    # get metadata information
    column_names = dataset["train"].column_names
    if script_args.mode == "classification":
        label_list = dataset["train"].unique(data_args.label_column)
        label_list.sort()  # Let's sort it for determinism
        num_labels = len(label_list)
    if script_args.mode == "generation":
        dataset = dataset.map(
            lambda examples: preprocess_function_gen(examples, data_args=data_args, tokenizer=tokenizer),
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )
    elif script_args.mode == "classification":
        dataset = dataset.map(
            lambda examples: preprocess_function_clf(examples, data_args=data_args, tokenizer=tokenizer),
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )
    else:
        raise NotImplementedError(f"mode {script_args.mode} is not supported")
    # step-3: create the model, first on CPU
    if script_args.mode == "classification":
        model = AutoModelForSequenceClassification.from_pretrained(
            script_args.model_name_or_path,
            num_labels=num_labels,
        )
        # logging.warning(f"model name: {script_args.model_name_or_path} loaded as AutoModelForSequenceClassification")
    elif script_args.mode == "generation":
        try:
            # decoder-only
            model = AutoModelForCausalLM.from_pretrained(script_args.model_name_or_path)
        except Exception:
            # seq2seq
            model = AutoModelForSeq2SeqLM.from_pretrained(script_args.model_name_or_path)
    # step-4: create the trainer
    if script_args.mode == "classification":
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"] if "validation" in dataset else dataset["test"],
            tokenizer=tokenizer,
            data_collator=default_data_collator,
        )
    else:
        label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
            label_pad_token_id=label_pad_token_id,
            pad_to_multiple_of=8 if training_args.fp16 else None,
        )
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=dataset['train'] if training_args.do_train else None,
            eval_dataset=dataset['validation'] if training_args.do_eval else None,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )

    if training_args.do_train:
        train_result = trainer.train()
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics
        train_dataset = dataset['train']
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    results = {}
    max_length = (
        training_args.generation_max_length
        if training_args.generation_max_length is not None
        else data_args.val_max_target_length
    )
    num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval")
        eval_dataset = dataset['validation'] if "validation" in dataset else dataset['test']
        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")
        predict_dataset = dataset['test']
        # the decoding hyper-parameters below are forwarded to model.generate by Seq2SeqTrainer.predict
        predict_results = trainer.predict(
            predict_dataset, metric_key_prefix="predict", max_length=max_length, num_beams=num_beams,
            temperature=data_args.temperature,
            top_k=data_args.top_k,
            top_p=data_args.top_p,
            min_new_tokens=data_args.min_length
        )
        metrics = predict_results.metrics
        max_predict_samples = (
            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
        )
        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)
        reshaped_predictions = predict_results.predictions.reshape(-1, predict_results.predictions.shape[-1])
        if trainer.is_world_process_zero():
            predictions = tokenizer.batch_decode(
                reshaped_predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
            predictions = [pred.strip() for pred in predictions]
            with open(script_args.prediction_file_path, "w") as writer:
                writer.write("\n\n\n".join(predictions))

    kwargs = {"finetuned_from": script_args.model_name_or_path, "tasks": data_args.task_name}
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset_args"] = data_args.dataset_config_name
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name


    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
--------------------------------------------------------------------------------
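
The repository only ships a launch script for the generation path (run_train_gen.sh). A hypothetical launch for the classification path of demo_trainer.py, assuming local CSV files with `text`/`label` columns (the model name and file paths below are placeholders), could look like the following sketch; note that the evaluation and prediction branches of the script pass generation-specific arguments (max_length, num_beams, temperature, ...), so they are geared toward the generation mode:

    deepspeed demo_trainer.py \
        --model_name_or_path roberta-base \
        --deepspeed ./ds_config_zero2.json \
        --mode "classification" \
        --train_file ./data/train.csv \
        --validation_file ./data/dev.csv \
        --text_column "text" \
        --label_column "label" \
        --max_seq_length 256 \
        --do_train \
        --output_dir ./output/clf \
        --per_device_train_batch_size 8 \
        --overwrite_output_dir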