├── src ├── exp │ ├── __init__.py │ ├── cal_ttest.py │ ├── analysis_sgm.py │ ├── run_exp_api.py │ ├── analysis_mmlt.py │ ├── run_exp.py │ ├── cal_mlt_scores.py │ ├── cal_elm_rmse.py │ └── cal_level_scores.py ├── data_process │ ├── __init__.py │ ├── raw_openhermes_process.py │ ├── build_arena_dataset.py │ ├── build_tlg_dataset.py │ └── build_training_dataset.py ├── finetuning │ ├── callback.py │ ├── dataset.py │ └── finetune.py └── utils │ ├── __init__.py │ ├── count.py │ ├── json_file.py │ ├── config.py │ └── templates.py ├── images ├── TLG.png ├── mmlt.png ├── sgm.png ├── method.png ├── TLG_ruler.png └── overall_performance.png ├── requirements.txt ├── scripts ├── Yi-1.5-6B │ ├── run_mmlt.sh │ ├── run_self_generated_mlt.sh │ ├── run_tlg.sh │ ├── ruler.sh │ ├── vanilla.sh │ ├── ruler_lm_eval.sh │ └── vanilla_lm_eval.sh ├── gemma-7b │ ├── run_mmlt.sh │ ├── run_self_generated_mlt.sh │ ├── run_tlg.sh │ ├── ruler.sh │ ├── vanilla.sh │ ├── ruler_lm_eval.sh │ └── vanilla_lm_eval.sh ├── Qwen1.5-7B │ ├── run_mmlt.sh │ ├── run_self_generated_mlt.sh │ ├── run_tlg.sh │ ├── ruler.sh │ ├── vanilla.sh │ ├── ruler_lm_eval.sh │ └── vanilla_lm_eval.sh ├── Meta-Llama-3-8B │ ├── run_mmlt.sh │ ├── run_self_generated_mlt.sh │ ├── run_tlg.sh │ ├── ruler.sh │ ├── vanilla.sh │ ├── ruler_lm_eval.sh │ └── vanilla_lm_eval.sh ├── Mistral-7B-v0.3 │ ├── run_mmlt.sh │ ├── run_self_generated_mlt.sh │ ├── run_tlg.sh │ ├── ruler.sh │ ├── vanilla.sh │ ├── ruler_lm_eval.sh │ └── vanilla_lm_eval.sh ├── deepseek-llm-7b-base │ ├── run_mmlt.sh │ ├── run_self_generated_mlt.sh │ ├── run_tlg.sh │ ├── ruler.sh │ ├── vanilla.sh │ ├── ruler_lm_eval.sh │ └── vanilla_lm_eval.sh └── download.sh ├── configs ├── ds_config_zero2.json ├── ds_config_zero3.json └── ds_config_zero3_cpu_offload.json ├── LICENSE ├── .gitignore └── README.md /src/exp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data_process/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/finetuning/callback.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .count import * 2 | from .json_file import * -------------------------------------------------------------------------------- /images/TLG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Geaming2002/Ruler/HEAD/images/TLG.png -------------------------------------------------------------------------------- /images/mmlt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Geaming2002/Ruler/HEAD/images/mmlt.png -------------------------------------------------------------------------------- /images/sgm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Geaming2002/Ruler/HEAD/images/sgm.png -------------------------------------------------------------------------------- /images/method.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Geaming2002/Ruler/HEAD/images/method.png -------------------------------------------------------------------------------- /images/TLG_ruler.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Geaming2002/Ruler/HEAD/images/TLG_ruler.png -------------------------------------------------------------------------------- /images/overall_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Geaming2002/Ruler/HEAD/images/overall_performance.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed==0.13.1 2 | nlp==0.4.0 3 | nltk==3.8.1 4 | openai==1.50.0 5 | pandas==2.2.3 6 | python-dotenv==1.0.1 7 | rich==13.8.1 8 | scikit_learn==1.5.2 9 | scipy==1.14.1 10 | shortuuid==1.0.13 11 | tiktoken==0.7.0 12 | torch==2.4.0 13 | tqdm==4.66.4 14 | transformers==4.44.2 15 | vllm==0.5.5 16 | -------------------------------------------------------------------------------- /scripts/Yi-1.5-6B/run_mmlt.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/multi_mlt.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_Yi-1.5-6B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/multi_mlt/mmlt_ruler_Yi-1.5-6B.jsonl 7 | -------------------------------------------------------------------------------- /scripts/gemma-7b/run_mmlt.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/multi_mlt.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_gemma-7b_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/multi_mlt/mmlt_ruler_gemma-7b.jsonl 7 | 8 | -------------------------------------------------------------------------------- /scripts/Qwen1.5-7B/run_mmlt.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/multi_mlt.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_Qwen1.5-7B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/multi_mlt/mmlt_ruler_Qwen1.5-7B.jsonl 7 | 8 | -------------------------------------------------------------------------------- /scripts/Meta-Llama-3-8B/run_mmlt.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/multi_mlt.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_Meta-Llama-3-8B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/multi_mlt/mmlt_ruler_Meta-Llama-3-8B.jsonl 7 | -------------------------------------------------------------------------------- /scripts/Mistral-7B-v0.3/run_mmlt.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/multi_mlt.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_Mistral-7B-v0.3_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/multi_mlt/mmlt_ruler_Mistral-7B-v0.3.jsonl 7 | 
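The per-model run_mmlt.sh scripts differ only in the checkpoint directory and the output file name. A minimal parameterized sketch of the same call, assuming the model name is passed as the first argument (this generalized script is illustrative and is not a file in the repository):

#!/bin/bash
# Illustrative generalization of the per-model run_mmlt.sh scripts.
# MODEL is an assumed command-line argument, e.g. "Yi-1.5-6B" or "Mistral-7B-v0.3".
MODEL=$1
python exp/run_exp.py \
    --dataset_path ../datasets/multi_mlt.jsonl \
    --gpus 1 \
    --template custom \
    --model_name_or_path ../outputs/checkpoints/ruler_${MODEL}_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841 \
    --output_path ../outputs/multi_mlt/mmlt_ruler_${MODEL}.jsonl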
-------------------------------------------------------------------------------- /scripts/Yi-1.5-6B/run_self_generated_mlt.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/self_generated_mlt.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_Yi-1.5-6B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/self_generated_mlt/sgm_ruler_Yi-1.5-6B.jsonl 7 | -------------------------------------------------------------------------------- /scripts/deepseek-llm-7b-base/run_mmlt.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/multi_mlt.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_deepseek-llm-7b-base_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/multi_mlt/mmlt_ruler_deepseek-llm-7b-base.jsonl 7 | -------------------------------------------------------------------------------- /scripts/gemma-7b/run_self_generated_mlt.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/self_generated_mlt.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_gemma-7b_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/self_generated_mlt/sgm_ruler_gemma-7b.jsonl 7 | 8 | -------------------------------------------------------------------------------- /scripts/Qwen1.5-7B/run_self_generated_mlt.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/self_generated_mlt.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_Qwen1.5-7B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/self_generated_mlt/sgm_ruler_Qwen1.5-7B.jsonl 7 | 8 | -------------------------------------------------------------------------------- /scripts/Meta-Llama-3-8B/run_self_generated_mlt.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/self_generated_mlt.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_Meta-Llama-3-8B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/self_generated_mlt/sgm_ruler_Meta-Llama-3-8B.jsonl 7 | -------------------------------------------------------------------------------- /scripts/Mistral-7B-v0.3/run_self_generated_mlt.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/self_generated_mlt.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_Mistral-7B-v0.3_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/self_generated_mlt/sgm_ruler_Mistral-7B-v0.3.jsonl 7 | -------------------------------------------------------------------------------- /scripts/deepseek-llm-7b-base/run_self_generated_mlt.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/self_generated_mlt.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path 
../outputs/checkpoints/ruler_deepseek-llm-7b-base_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/self_generated_mlt/sgm_ruler_deepseek-llm-7b-base.jsonl 7 | -------------------------------------------------------------------------------- /src/utils/count.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | def count_words(text): 4 | """Counts the number of words.""" 5 | tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+") 6 | tokens = tokenizer.tokenize(text) 7 | num_words = len(tokens) 8 | # print(tokens) 9 | return num_words 10 | 11 | 12 | def count_tokens(tokenizer, text): 13 | inputs = tokenizer.encode(text, return_tensors="pt") 14 | return inputs.shape[1] 15 | 16 | if __name__ == "__main__": 17 | pass -------------------------------------------------------------------------------- /src/utils/json_file.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def load_json(path): 5 | with open(path, "r") as file: 6 | return json.load(file) 7 | 8 | 9 | def load_jsonl(path): 10 | data = [] 11 | with open(path, "r") as file: 12 | for line in file: 13 | json_data = json.loads(line) 14 | data.append(json_data) 15 | return data 16 | 17 | 18 | def save_jsonl(path, data): 19 | with open(path, "w", encoding="utf-8") as file: 20 | for item in data: 21 | json_string = json.dumps(item, ensure_ascii=False) 22 | file.write(json_string + "\n") 23 | -------------------------------------------------------------------------------- /scripts/gemma-7b/run_tlg.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/tlg_dataset.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_gemma-7b_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/tlg/tlg_ot_ruler_gemma-7b.jsonl 7 | 8 | python exp/run_exp.py\ 9 | --dataset_path ../datasets/tlg_dataset.jsonl\ 10 | --gpus 1\ 11 | --template custom\ 12 | --model_name_or_path ../outputs/checkpoints/vanilla_gemma-7b_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 13 | --output_path ../outputs/tlg/tlg_ot_vanilla_gemma-7b.jsonl 14 | -------------------------------------------------------------------------------- /scripts/Yi-1.5-6B/run_tlg.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/tlg_dataset.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_Yi-1.5-6B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/tlg/tlg_ot_ruler_Yi-1.5-6B.jsonl 7 | 8 | python exp/run_exp.py\ 9 | --dataset_path ../datasets/tlg_dataset.jsonl\ 10 | --gpus 1\ 11 | --template custom\ 12 | --model_name_or_path ../outputs/checkpoints/vanilla_Yi-1.5-6B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 13 | --output_path ../outputs/tlg/tlg_ot_vanilla_Yi-1.5-6B.jsonl 14 | -------------------------------------------------------------------------------- /scripts/Qwen1.5-7B/run_tlg.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/tlg_dataset.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_Qwen1.5-7B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/tlg/tlg_ot_ruler_Qwen1.5-7B.jsonl 7 | 8 | python exp/run_exp.py\ 9 | 
--dataset_path ../datasets/tlg_dataset.jsonl\ 10 | --gpus 1\ 11 | --template custom\ 12 | --model_name_or_path ../outputs/checkpoints/vanilla_Qwen1.5-7B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 13 | --output_path ../outputs/tlg/tlg_ot_vanilla_Qwen1.5-7B.jsonl 14 | -------------------------------------------------------------------------------- /scripts/download.sh: -------------------------------------------------------------------------------- 1 | # datasets 2 | mkdir -p datasets 3 | mkdir -p datasets/LongForm 4 | mkdir -p datasets/OpenHermes 5 | # logs 6 | mkdir -p logs 7 | # outputs 8 | mkdir -p outputs 9 | mkdir -p outputs/checkpoints 10 | mkdir -p outputs/multi_mlt 11 | mkdir -p outputs/other_tasks 12 | mkdir -p outputs/self_generated_mlt 13 | mkdir -p outputs/tlg 14 | 15 | # download longform 16 | huggingface-cli download --repo-type dataset --resume-download akoksal/LongForm --local-dir ../datasets/LongForm 17 | # download openhermes 18 | huggingface-cli download --repo-type dataset --resume-download teknium/OpenHermes-2.5 --local-dir ../datasets/OpenHermes 19 | -------------------------------------------------------------------------------- /scripts/Meta-Llama-3-8B/run_tlg.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/tlg_dataset.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_Meta-Llama-3-8B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/tlg/tlg_ot_ruler_Meta-Llama-3-8B.jsonl 7 | 8 | python exp/run_exp.py\ 9 | --dataset_path ../datasets/tlg_dataset.jsonl\ 10 | --gpus 1\ 11 | --template custom\ 12 | --model_name_or_path ../outputs/checkpoints/vanilla_Meta-Llama-3-8B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 13 | --output_path ../outputs/tlg/tlg_ot_vanilla_Meta-Llama-3-8B.jsonl 14 | -------------------------------------------------------------------------------- /scripts/Mistral-7B-v0.3/run_tlg.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/tlg_dataset.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_Mistral-7B-v0.3_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/tlg/tlg_ot_ruler_Mistral-7B-v0.3.jsonl 7 | 8 | python exp/run_exp.py\ 9 | --dataset_path ../datasets/tlg_dataset.jsonl\ 10 | --gpus 1\ 11 | --template custom\ 12 | --model_name_or_path ../outputs/checkpoints/vanilla_Mistral-7B-v0.3_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 13 | --output_path ../outputs/tlg/tlg_ot_vanilla_Mistral-7B-v0.3.jsonl 14 | -------------------------------------------------------------------------------- /scripts/deepseek-llm-7b-base/run_tlg.sh: -------------------------------------------------------------------------------- 1 | python exp/run_exp.py\ 2 | --dataset_path ../datasets/tlg_dataset.jsonl\ 3 | --gpus 1\ 4 | --template custom\ 5 | --model_name_or_path ../outputs/checkpoints/ruler_deepseek-llm-7b-base_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 6 | --output_path ../outputs/tlg/tlg_ot_ruler_deepseek-llm-7b-base.jsonl 7 | 8 | python exp/run_exp.py\ 9 | --dataset_path ../datasets/tlg_dataset.jsonl\ 10 | --gpus 1\ 11 | --template custom\ 12 | --model_name_or_path ../outputs/checkpoints/vanilla_deepseek-llm-7b-base_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841\ 13 | --output_path ../outputs/tlg/tlg_ot_vanilla_deepseek-llm-7b-base.jsonl 14 | 
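The run_*.sh evaluation scripts above invoke exp/run_exp.py and resolve datasets, checkpoints, and outputs through ../datasets and ../outputs, so they appear to assume src/ as the working directory; the huggingface-cli commands in download.sh likewise place the LongForm and OpenHermes data under ../datasets. A usage sketch under that assumption (the exact invocation is illustrative):

# assuming the repository root as the starting point
cd src
bash ../scripts/Qwen1.5-7B/run_tlg.sh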
-------------------------------------------------------------------------------- /configs/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /src/data_process/raw_openhermes_process.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from utils import load_json, save_jsonl 3 | 4 | 5 | def main(args): 6 | # raw data load 7 | df = load_json(args.dataset_path) 8 | data = [] 9 | for d in df: 10 | if len(d["conversations"]) == 2: 11 | data.append(d) 12 | # save to output_path 13 | save_jsonl(args.output_path, data) 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--dataset_path", type=str, default=None) 19 | parser.add_argument("--model_name_or_path", type=str, default=None) 20 | parser.add_argument("--output_path", type=str, default=None) 21 | args = parser.parse_args() 22 | main(args) 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Geaming 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/data_process/build_arena_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import nlp 3 | import random 4 | import argparse 5 | import pandas as pd 6 | from utils import load_jsonl, save_jsonl 7 | from utils.config import TARGET_LENGTH 8 | 9 | 10 | def main(args): 11 | df = load_jsonl(args.dataset_path) 12 | data = [] 13 | id = 0 14 | for d in df: 15 | data.append({"id": id, "Instruction": d["turns"][0]["content"]}) 16 | id += 1 17 | if args.num is not None: 18 | random.seed(args.random_seed) 19 | random.shuffle(data) 20 | data = data[: args.num] 21 | data = [ 22 | {**d, "TargetLength": tl} for d in data for tl in TARGET_LENGTH 23 | ] 24 | # save to output_path 25 | save_jsonl(args.output_path, data) 26 | 27 | 28 | if __name__ == "__main__": 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--dataset_path", type=str, default=None) 31 | parser.add_argument("--num", type=int, default=None) 32 | parser.add_argument("--random_seed", type=int, default=10) 33 | parser.add_argument("--output_path", type=str, default=None) 34 | args = parser.parse_args() 35 | main(args) 36 | -------------------------------------------------------------------------------- /src/data_process/build_tlg_dataset.py: -------------------------------------------------------------------------------- 1 | import random 2 | import argparse 3 | from utils import load_jsonl, save_jsonl 4 | from utils.config import TARGET_LENGTH 5 | 6 | 7 | def main(args): 8 | # random seed 9 | random.seed(args.random_seed) 10 | # raw data load 11 | df = load_jsonl(args.dataset_path) 12 | # random sample 13 | random.shuffle(df) 14 | df = df[: args.num] 15 | # add target length 16 | data = [] 17 | target_lengths = [random.choice(TARGET_LENGTH) for _ in range(args.num)] 18 | for idx in range(len(df)): 19 | d = {} 20 | d['id'] = idx 21 | d["Instruction"] = df[idx]["conversations"][0]["value"] 22 | d["TargetLength"] = target_lengths[idx] 23 | data.append(d) 24 | # save to output_path 25 | save_jsonl(args.output_path, data) 26 | 27 | 28 | if __name__ == "__main__": 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--dataset_path", type=str, default=None) 31 | parser.add_argument("--num", type=int, default=None) 32 | parser.add_argument("--random_seed", type=int, default=10) 33 | parser.add_argument("--output_path", type=str, default=None) 34 | args = parser.parse_args() 35 | main(args) 36 | -------------------------------------------------------------------------------- /src/exp/cal_ttest.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from utils import load_jsonl 3 | from rich.table import Table 4 | from rich.console import Console 5 | from utils.count import count_words 6 | import scipy.stats as stats 7 | 8 | 9 | def main(args): 10 | # raw data load 11 | vanilla_df = load_jsonl(args.vanilla_dataset_path) 12 | ruler_dataset_path = args.vanilla_dataset_path.replace("tlg_", "tlg_Ruler_") 13 | ruler_df = load_jsonl(ruler_dataset_path) 14 | print(ruler_dataset_path) 15 | vanilla_lengths, ruler_lengths = [], [] 16 | for idx in range(len(vanilla_df)): 17 | vanilla_lengths.append(count_words(vanilla_df[idx]["output"])) 18 | ruler_lengths.append(count_words(ruler_df[idx]["output"])) 19 | table = Table(show_header=True, header_style="bold magenta") 20 | table.add_column("Model", style="dim", width=12) 21 | 
table.add_column("t", justify="right") 22 | table.add_column("p", justify="right") 23 | t_statistic, p_value = stats.ttest_ind(ruler_lengths,vanilla_lengths) 24 | table.add_row( 25 | args.vanilla_dataset_path.split("/")[-1][4:], 26 | f"{t_statistic:.4f}", 27 | f"{p_value:.4f}", 28 | ) 29 | console = Console() 30 | console.print(table) 31 | # print(f"{t_statistic:.4f}|{p_value:.4f}|") 32 | 33 | if __name__ == "__main__": 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("--vanilla_dataset_path", type=str, default=None) 36 | args = parser.parse_args() 37 | main(args) 38 | -------------------------------------------------------------------------------- /configs/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupLR", 17 | "params": { 18 | "warmup_min_lr": "auto", 19 | "warmup_max_lr": "auto", 20 | "warmup_num_steps": "auto" 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 3, 26 | "offload_optimizer": { 27 | "device": "none", 28 | "pin_memory": true 29 | }, 30 | "offload_param": { 31 | "device": "none", 32 | "pin_memory": true 33 | }, 34 | "overlap_comm": true, 35 | "contiguous_gradients": true, 36 | "sub_group_size": 1e9, 37 | "reduce_bucket_size": "auto", 38 | "stage3_prefetch_bucket_size": "auto", 39 | "stage3_param_persistence_threshold": "auto", 40 | "stage3_max_live_parameters": 1e9, 41 | "stage3_max_reuse_distance": 1e9, 42 | "stage3_gather_16bit_weights_on_model_save": true 43 | }, 44 | 45 | "gradient_accumulation_steps": "auto", 46 | "gradient_clipping": "auto", 47 | "steps_per_print": 20, 48 | "train_batch_size": "auto", 49 | "train_micro_batch_size_per_gpu": "auto", 50 | "wall_clock_breakdown": false 51 | } -------------------------------------------------------------------------------- /configs/ds_config_zero3_cpu_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupLR", 17 | "params": { 18 | "warmup_min_lr": "auto", 19 | "warmup_max_lr": "auto", 20 | "warmup_num_steps": "auto" 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 3, 26 | "offload_optimizer": { 27 | "device": "cpu", 28 | "pin_memory": true 29 | }, 30 | "offload_param": { 31 | "device": "cpu", 32 | "pin_memory": true 33 | }, 34 | "overlap_comm": true, 35 | "contiguous_gradients": true, 36 | "sub_group_size": 1e9, 37 | "reduce_bucket_size": "auto", 38 | "stage3_prefetch_bucket_size": "auto", 39 | "stage3_param_persistence_threshold": "auto", 40 | "stage3_max_live_parameters": 1e9, 41 | "stage3_max_reuse_distance": 1e9, 42 | "stage3_gather_16bit_weights_on_model_save": true 43 | }, 44 | 45 | "gradient_accumulation_steps": "auto", 46 | "gradient_clipping": "auto", 47 | "steps_per_print": 20, 48 | "train_batch_size": "auto", 49 | "train_micro_batch_size_per_gpu": "auto", 50 | "wall_clock_breakdown": false 51 | } -------------------------------------------------------------------------------- /scripts/Meta-Llama-3-8B/ruler.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=4,5,6,7 3 | 4 | export MASTER_PORT=$(echo $METIS_WORKER_0_PORT | cut -d',' -f1) 5 | 6 | LEARNING_RATE=2e-5 7 | NUM_TRAIN_EPOCHS=3 8 | VANILLA=False 9 | 10 | MODEL_NAME_OR_PATH=/data1/HF-Models/meta-llama/Meta-Llama-3-8B 11 | echo "Finetune from: ${MODEL_NAME_OR_PATH}" 12 | MODEL=${MODEL_NAME_OR_PATH##*/} 13 | 14 | TEMPLATE=custom 15 | echo "Finetune data template: ${TEMPLATE}" 16 | 17 | DATA_PATH=../datasets/ruler_training_dataset.jsonl 18 | echo "Finetune data path: ${DATA_PATH}" 19 | 20 | MODEL_MAX_LENGTH=2048 21 | echo "Model max length: ${MODEL_MAX_LENGTH}" 22 | 23 | BATCH_SIZE=4 24 | echo "Per device train batch size: ${BATCH_SIZE}" 25 | 26 | GRAD_ACCUM=8 27 | echo "Gradient accumulation steps: ${GRAD_ACCUM}" 28 | 29 | OUTPUT_DIR="../outputs/checkpoints/ruler_${MODEL}_bs_${BATCH_SIZE}_ga_${GRAD_ACCUM}_lr_${LEARNING_RATE}_eps_${NUM_TRAIN_EPOCHS}" 30 | LOG_DIR=../logs 31 | 32 | deepspeed finetuning/finetune.py \ 33 | --vanilla $VANILLA \ 34 | --deepspeed ../configs/ds_config_zero3.json \ 35 | --model_name_or_path $MODEL_NAME_OR_PATH \ 36 | --template $TEMPLATE\ 37 | --model_max_length $MODEL_MAX_LENGTH \ 38 | --data_path $DATA_PATH \ 39 | --output_dir $OUTPUT_DIR \ 40 | --bf16 True \ 41 | --tf32 True \ 42 | --per_device_train_batch_size ${BATCH_SIZE} \ 43 | --gradient_accumulation_steps ${GRAD_ACCUM} \ 44 | --gradient_checkpointing True \ 45 | --lr_scheduler_type cosine \ 46 | --learning_rate ${LEARNING_RATE} \ 47 | --warmup_ratio 0.05 \ 48 | --num_train_epochs ${NUM_TRAIN_EPOCHS} \ 49 | --evaluation_strategy no \ 50 | --save_strategy epoch \ 51 | --save_total_limit 1 \ 52 | --logging_steps 5 \ 53 | 2>&1 | tee ${LOG_DIR}/output_ruler_${MODEL}.log -------------------------------------------------------------------------------- /scripts/Mistral-7B-v0.3/ruler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=4,5,6,7 3 | 4 | export MASTER_PORT=$(echo $METIS_WORKER_0_PORT | cut -d',' -f1) 5 | 6 | LEARNING_RATE=2e-5 7 | NUM_TRAIN_EPOCHS=3 8 | VANILLA=False 9 | 10 | MODEL_NAME_OR_PATH=/data1/HF-Models/mistralai/Mistral-7B-v0.3 11 | echo "Finetune from: ${MODEL_NAME_OR_PATH}" 12 | MODEL=${MODEL_NAME_OR_PATH##*/} 13 | 14 | TEMPLATE=custom 15 | echo "Finetune data template: ${TEMPLATE}" 16 | 17 | DATA_PATH=../datasets/ruler_training_dataset.jsonl 18 | echo "Finetune data path: ${DATA_PATH}" 19 | 20 | MODEL_MAX_LENGTH=2048 21 | echo "Model max length: ${MODEL_MAX_LENGTH}" 22 | 23 | BATCH_SIZE=4 24 | echo "Per device train batch size: ${BATCH_SIZE}" 25 | 26 | GRAD_ACCUM=8 27 | echo "Gradient accumulation steps: ${GRAD_ACCUM}" 28 | 29 | OUTPUT_DIR="../outputs/checkpoints/ruler_${MODEL}_bs_${BATCH_SIZE}_ga_${GRAD_ACCUM}_lr_${LEARNING_RATE}_eps_${NUM_TRAIN_EPOCHS}" 30 | LOG_DIR=../logs 31 | 32 | deepspeed finetuning/finetune.py \ 33 | --vanilla $VANILLA \ 34 | --deepspeed ../configs/ds_config_zero3.json \ 35 | --model_name_or_path $MODEL_NAME_OR_PATH \ 36 | --template $TEMPLATE\ 37 | --model_max_length $MODEL_MAX_LENGTH \ 38 | --data_path $DATA_PATH \ 39 | --output_dir $OUTPUT_DIR \ 40 | --bf16 True \ 41 | --tf32 True \ 42 | --per_device_train_batch_size ${BATCH_SIZE} \ 43 | --gradient_accumulation_steps ${GRAD_ACCUM} \ 44 | --gradient_checkpointing True \ 45 | --lr_scheduler_type cosine \ 46 | --learning_rate ${LEARNING_RATE} \ 47 | --warmup_ratio 0.05 \ 48 | --num_train_epochs 
${NUM_TRAIN_EPOCHS} \ 49 | --evaluation_strategy no \ 50 | --save_strategy epoch \ 51 | --save_total_limit 1 \ 52 | --logging_steps 5 \ 53 | 2>&1 | tee ${LOG_DIR}/output_ruler_${MODEL}.log -------------------------------------------------------------------------------- /scripts/Mistral-7B-v0.3/vanilla.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=4,5,6,7 3 | 4 | export MASTER_PORT=$(echo $METIS_WORKER_0_PORT | cut -d',' -f1) 5 | 6 | LEARNING_RATE=2e-5 7 | NUM_TRAIN_EPOCHS=3 8 | VANILLA=True 9 | 10 | MODEL_NAME_OR_PATH=/data1/HF-Models/mistralai/Mistral-7B-v0.3 11 | echo "Finetune from: ${MODEL_NAME_OR_PATH}" 12 | MODEL=${MODEL_NAME_OR_PATH##*/} 13 | 14 | TEMPLATE=custom 15 | echo "Finetune data template: ${TEMPLATE}" 16 | 17 | DATA_PATH=../datasets/ruler_training_dataset.jsonl 18 | echo "Finetune data path: ${DATA_PATH}" 19 | 20 | MODEL_MAX_LENGTH=2048 21 | echo "Model max length: ${MODEL_MAX_LENGTH}" 22 | 23 | BATCH_SIZE=4 24 | echo "Per device train batch size: ${BATCH_SIZE}" 25 | 26 | GRAD_ACCUM=8 27 | echo "Gradient accumulation steps: ${GRAD_ACCUM}" 28 | 29 | OUTPUT_DIR="../outputs/checkpoints/vanilla_${MODEL}_bs_${BATCH_SIZE}_ga_${GRAD_ACCUM}_lr_${LEARNING_RATE}_eps_${NUM_TRAIN_EPOCHS}" 30 | LOG_DIR=../logs 31 | 32 | deepspeed finetuning/finetune.py \ 33 | --vanilla $VANILLA \ 34 | --deepspeed ../configs/ds_config_zero3.json \ 35 | --model_name_or_path $MODEL_NAME_OR_PATH \ 36 | --template $TEMPLATE\ 37 | --model_max_length $MODEL_MAX_LENGTH \ 38 | --data_path $DATA_PATH \ 39 | --output_dir $OUTPUT_DIR \ 40 | --bf16 True \ 41 | --tf32 True \ 42 | --per_device_train_batch_size ${BATCH_SIZE} \ 43 | --gradient_accumulation_steps ${GRAD_ACCUM} \ 44 | --gradient_checkpointing True \ 45 | --lr_scheduler_type cosine \ 46 | --learning_rate ${LEARNING_RATE} \ 47 | --warmup_ratio 0.05 \ 48 | --num_train_epochs ${NUM_TRAIN_EPOCHS} \ 49 | --evaluation_strategy no \ 50 | --save_strategy epoch \ 51 | --save_total_limit 1 \ 52 | --logging_steps 5 \ 53 | 2>&1 | tee ${LOG_DIR}/output_vanilla_${MODEL}.log -------------------------------------------------------------------------------- /scripts/Meta-Llama-3-8B/vanilla.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=4,5,6,7 3 | 4 | export MASTER_PORT=$(echo $METIS_WORKER_0_PORT | cut -d',' -f1) 5 | 6 | LEARNING_RATE=2e-5 7 | NUM_TRAIN_EPOCHS=3 8 | VANILLA=True 9 | 10 | MODEL_NAME_OR_PATH=/data1/HF-Models/meta-llama/Meta-Llama-3-8B 11 | echo "Finetune from: ${MODEL_NAME_OR_PATH}" 12 | MODEL=${MODEL_NAME_OR_PATH##*/} 13 | 14 | TEMPLATE=custom 15 | echo "Finetune data template: ${TEMPLATE}" 16 | 17 | DATA_PATH=../datasets/ruler_training_dataset.jsonl 18 | echo "Finetune data path: ${DATA_PATH}" 19 | 20 | MODEL_MAX_LENGTH=2048 21 | echo "Model max length: ${MODEL_MAX_LENGTH}" 22 | 23 | BATCH_SIZE=4 24 | echo "Per device train batch size: ${BATCH_SIZE}" 25 | 26 | GRAD_ACCUM=8 27 | echo "Gradient accumulation steps: ${GRAD_ACCUM}" 28 | 29 | OUTPUT_DIR="../outputs/checkpoints/vanilla_${MODEL}_bs_${BATCH_SIZE}_ga_${GRAD_ACCUM}_lr_${LEARNING_RATE}_eps_${NUM_TRAIN_EPOCHS}" 30 | LOG_DIR=../logs 31 | 32 | deepspeed finetuning/finetune.py \ 33 | --vanilla $VANILLA \ 34 | --deepspeed ../configs/ds_config_zero3.json \ 35 | --model_name_or_path $MODEL_NAME_OR_PATH \ 36 | --template $TEMPLATE\ 37 | --model_max_length $MODEL_MAX_LENGTH \ 38 | --data_path $DATA_PATH \ 39 | --output_dir $OUTPUT_DIR \ 40 
| --bf16 True \ 41 | --tf32 True \ 42 | --per_device_train_batch_size ${BATCH_SIZE} \ 43 | --gradient_accumulation_steps ${GRAD_ACCUM} \ 44 | --gradient_checkpointing True \ 45 | --lr_scheduler_type cosine \ 46 | --learning_rate ${LEARNING_RATE} \ 47 | --warmup_ratio 0.05 \ 48 | --num_train_epochs ${NUM_TRAIN_EPOCHS} \ 49 | --evaluation_strategy no \ 50 | --save_strategy epoch \ 51 | --save_total_limit 1 \ 52 | --logging_steps 5 \ 53 | 2>&1 | tee ${LOG_DIR}/output_vanilla_${MODEL}.log 54 | -------------------------------------------------------------------------------- /src/utils/config.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | 4 | class Inf: 5 | def __gt__(self, other): 6 | return True 7 | 8 | def __ge__(self, other): 9 | return True 10 | 11 | def __lt__(self, other): 12 | return False 13 | 14 | def __eq__(self, other): 15 | return isinstance(other, Inf) 16 | 17 | def __repr__(self): 18 | return "Inf" 19 | 20 | 21 | inf = Inf() 22 | 23 | 24 | # FLCG EXP 25 | LEVEL0 = ["10", "30", "50", "80"] 26 | LEVEL1 = ["150", "300", "500"] 27 | LEVEL2 = ["700", ">800"] 28 | RANGE = OrderedDict( 29 | { 30 | # level:0 31 | "10": {"PM": [0, 20], "FM": [0, 20]}, 32 | "30": {"PM": [20, 40], "FM": [20, 40]}, 33 | "50": {"PM": [40, 60], "FM": [40, 60]}, 34 | "80": {"PM": [70, 90], "FM": [60, 100]}, 35 | # level:1 36 | "150": {"PM": [130, 170], "FM": [100, 200]}, 37 | "300": {"PM": [280, 320], "FM": [200, 400]}, 38 | "500": {"PM": [450, 550], "FM": [400, 600]}, 39 | # level:2 40 | "700": {"PM": [630, 770], "FM": [600, 800]}, 41 | ">800": {"PM": [800, inf], "FM": [800, inf]}, 42 | } 43 | ) 44 | 45 | TARGET_LENGTH = list(RANGE.keys()) 46 | 47 | MetaLengthToken = [ 48 | ["[MLT:10]", [5, 15]], 49 | ["[MLT:30]", [25, 35]], 50 | ["[MLT:50]", [45, 55]], 51 | ["[MLT:80]", [75, 85]], 52 | ["[MLT:150]", [135, 155]], 53 | ["[MLT:300]", [295, 305]], 54 | ["[MLT:500]", [495, 505]], 55 | ["[MLT:700]", [695, 705]], 56 | ["[MLT:>800]", [800, inf]], 57 | ] 58 | 59 | # MLT training dataset 60 | SAMPLE = { 61 | "[MLT:10]": 10000 * 2, 62 | "[MLT:30]": 10000 * 2, 63 | "[MLT:50]": 10000 * 2, 64 | "[MLT:80]": 10000 * 2, 65 | "[MLT:150]": 10000 * 2, 66 | "[MLT:300]": 10000 * 2, 67 | "[MLT:500]": 10000 * 2, 68 | "[MLT:700]": 10000 * 2, 69 | "[MLT:>800]": 10000 * 2, 70 | } 71 | -------------------------------------------------------------------------------- /scripts/Qwen1.5-7B/ruler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | find_free_port() { 5 | while : 6 | do 7 | PORT=$(( ( RANDOM % 64512 ) + 1024 )) 8 | (echo >/dev/tcp/localhost/$PORT) >/dev/null 2>&1 9 | if [ $? 
-ne 0 ]; then 10 | echo $PORT 11 | return 12 | fi 13 | done 14 | } 15 | 16 | export MASTER_PORT=$(find_free_port) 17 | 18 | LEARNING_RATE=2e-5 19 | NUM_TRAIN_EPOCHS=3 20 | VANILLA=False 21 | 22 | MODEL_NAME_OR_PATH=/data1/HF-Models/Qwen/Qwen1.5-7B 23 | echo "Finetune from: ${MODEL_NAME_OR_PATH}" 24 | MODEL=${MODEL_NAME_OR_PATH##*/} 25 | 26 | TEMPLATE=custom 27 | echo "Finetune data template: ${TEMPLATE}" 28 | 29 | DATA_PATH=../datasets/ruler_training_dataset.jsonl 30 | echo "Finetune data path: ${DATA_PATH}" 31 | 32 | MODEL_MAX_LENGTH=2048 33 | echo "Model max length: ${MODEL_MAX_LENGTH}" 34 | 35 | BATCH_SIZE=4 36 | echo "Per device train batch size: ${BATCH_SIZE}" 37 | 38 | GRAD_ACCUM=8 39 | echo "Gradient accumulation steps: ${GRAD_ACCUM}" 40 | 41 | OUTPUT_DIR="../outputs/checkpoints/ruler_${MODEL}_bs_${BATCH_SIZE}_ga_${GRAD_ACCUM}_lr_${LEARNING_RATE}_eps_${NUM_TRAIN_EPOCHS}" 42 | LOG_DIR=../logs 43 | 44 | deepspeed --master_port=$MASTER_PORT finetuning/finetune.py \ 45 | --vanilla $VANILLA \ 46 | --deepspeed ../configs/ds_config_zero3.json \ 47 | --model_name_or_path $MODEL_NAME_OR_PATH \ 48 | --template $TEMPLATE\ 49 | --model_max_length $MODEL_MAX_LENGTH \ 50 | --data_path $DATA_PATH \ 51 | --output_dir $OUTPUT_DIR \ 52 | --bf16 True \ 53 | --tf32 True \ 54 | --per_device_train_batch_size ${BATCH_SIZE} \ 55 | --gradient_accumulation_steps ${GRAD_ACCUM} \ 56 | --gradient_checkpointing True \ 57 | --lr_scheduler_type cosine \ 58 | --learning_rate ${LEARNING_RATE} \ 59 | --warmup_ratio 0.05 \ 60 | --num_train_epochs ${NUM_TRAIN_EPOCHS} \ 61 | --evaluation_strategy no \ 62 | --save_strategy epoch \ 63 | --save_total_limit 1 \ 64 | --logging_steps 5 \ 65 | 2>&1 | tee ${LOG_DIR}/output_ruler_${MODEL}.log -------------------------------------------------------------------------------- /scripts/Yi-1.5-6B/ruler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | find_free_port() { 5 | while : 6 | do 7 | PORT=$(( ( RANDOM % 64512 ) + 1024 )) 8 | (echo >/dev/tcp/localhost/$PORT) >/dev/null 2>&1 9 | if [ $? 
-ne 0 ]; then 10 | echo $PORT 11 | return 12 | fi 13 | done 14 | } 15 | 16 | export MASTER_PORT=$(find_free_port) 17 | 18 | LEARNING_RATE=2e-5 19 | NUM_TRAIN_EPOCHS=3 20 | VANILLA=False 21 | 22 | MODEL_NAME_OR_PATH=/data1/HF-Models/01-ai/Yi-1.5-6B 23 | echo "Finetune from: ${MODEL_NAME_OR_PATH}" 24 | MODEL=${MODEL_NAME_OR_PATH##*/} 25 | 26 | TEMPLATE=custom 27 | echo "Finetune data template: ${TEMPLATE}" 28 | 29 | DATA_PATH=../datasets/ruler_training_dataset.jsonl 30 | echo "Finetune data path: ${DATA_PATH}" 31 | 32 | MODEL_MAX_LENGTH=2048 33 | echo "Model max length: ${MODEL_MAX_LENGTH}" 34 | 35 | BATCH_SIZE=4 36 | echo "Per device train batch size: ${BATCH_SIZE}" 37 | 38 | GRAD_ACCUM=8 39 | echo "Gradient accumulation steps: ${GRAD_ACCUM}" 40 | 41 | OUTPUT_DIR="../outputs/checkpoints/ruler_${MODEL}_bs_${BATCH_SIZE}_ga_${GRAD_ACCUM}_lr_${LEARNING_RATE}_eps_${NUM_TRAIN_EPOCHS}" 42 | LOG_DIR=../logs 43 | 44 | deepspeed --master_port=$MASTER_PORT finetuning/finetune.py \ 45 | --vanilla $VANILLA \ 46 | --deepspeed ../configs/ds_config_zero3.json \ 47 | --model_name_or_path $MODEL_NAME_OR_PATH \ 48 | --template $TEMPLATE\ 49 | --model_max_length $MODEL_MAX_LENGTH \ 50 | --data_path $DATA_PATH \ 51 | --output_dir $OUTPUT_DIR \ 52 | --bf16 True \ 53 | --tf32 True \ 54 | --per_device_train_batch_size ${BATCH_SIZE} \ 55 | --gradient_accumulation_steps ${GRAD_ACCUM} \ 56 | --gradient_checkpointing True \ 57 | --lr_scheduler_type cosine \ 58 | --learning_rate ${LEARNING_RATE} \ 59 | --warmup_ratio 0.05 \ 60 | --num_train_epochs ${NUM_TRAIN_EPOCHS} \ 61 | --evaluation_strategy no \ 62 | --save_strategy epoch \ 63 | --save_total_limit 1 \ 64 | --logging_steps 5 \ 65 | 2>&1 | tee ${LOG_DIR}/output_ruler_${MODEL}.log -------------------------------------------------------------------------------- /scripts/gemma-7b/ruler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=4,5,6,7 3 | 4 | find_free_port() { 5 | while : 6 | do 7 | PORT=$(( ( RANDOM % 64512 ) + 1024 )) 8 | (echo >/dev/tcp/localhost/$PORT) >/dev/null 2>&1 9 | if [ $? 
-ne 0 ]; then 10 | echo $PORT 11 | return 12 | fi 13 | done 14 | } 15 | 16 | export MASTER_PORT=$(find_free_port) 17 | 18 | LEARNING_RATE=2e-5 19 | NUM_TRAIN_EPOCHS=3 20 | VANILLA=False 21 | 22 | MODEL_NAME_OR_PATH=/data1/HF-Models/google/gemma-7b 23 | echo "Finetune from: ${MODEL_NAME_OR_PATH}" 24 | MODEL=${MODEL_NAME_OR_PATH##*/} 25 | 26 | TEMPLATE=custom 27 | echo "Finetune data template: ${TEMPLATE}" 28 | 29 | DATA_PATH=../datasets/ruler_training_dataset.jsonl 30 | echo "Finetune data path: ${DATA_PATH}" 31 | 32 | MODEL_MAX_LENGTH=2048 33 | echo "Model max length: ${MODEL_MAX_LENGTH}" 34 | 35 | BATCH_SIZE=4 36 | echo "Per device train batch size: ${BATCH_SIZE}" 37 | 38 | GRAD_ACCUM=8 39 | echo "Gradient accumulation steps: ${GRAD_ACCUM}" 40 | 41 | OUTPUT_DIR="../outputs/checkpoints/ruler_${MODEL}_bs_${BATCH_SIZE}_ga_${GRAD_ACCUM}_lr_${LEARNING_RATE}_eps_${NUM_TRAIN_EPOCHS}" 42 | LOG_DIR=../logs 43 | 44 | deepspeed --master_port=$MASTER_PORT finetuning/finetune.py \ 45 | --vanilla $VANILLA \ 46 | --deepspeed ../configs/ds_config_zero3.json \ 47 | --model_name_or_path $MODEL_NAME_OR_PATH \ 48 | --template $TEMPLATE\ 49 | --model_max_length $MODEL_MAX_LENGTH \ 50 | --data_path $DATA_PATH \ 51 | --output_dir $OUTPUT_DIR \ 52 | --bf16 True \ 53 | --tf32 True \ 54 | --per_device_train_batch_size ${BATCH_SIZE} \ 55 | --gradient_accumulation_steps ${GRAD_ACCUM} \ 56 | --gradient_checkpointing True \ 57 | --lr_scheduler_type cosine \ 58 | --learning_rate ${LEARNING_RATE} \ 59 | --warmup_ratio 0.05 \ 60 | --num_train_epochs ${NUM_TRAIN_EPOCHS} \ 61 | --evaluation_strategy no \ 62 | --save_strategy epoch \ 63 | --save_total_limit 1 \ 64 | --logging_steps 5 \ 65 | 2>&1 | tee ${LOG_DIR}/output_ruler_${MODEL}.log -------------------------------------------------------------------------------- /scripts/Qwen1.5-7B/vanilla.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | find_free_port() { 5 | while : 6 | do 7 | PORT=$(( ( RANDOM % 64512 ) + 1024 )) 8 | (echo >/dev/tcp/localhost/$PORT) >/dev/null 2>&1 9 | if [ $? 
-ne 0 ]; then 10 | echo $PORT 11 | return 12 | fi 13 | done 14 | } 15 | 16 | export MASTER_PORT=$(find_free_port) 17 | 18 | LEARNING_RATE=2e-5 19 | NUM_TRAIN_EPOCHS=3 20 | VANILLA=True 21 | 22 | MODEL_NAME_OR_PATH=/data1/HF-Models/Qwen/Qwen1.5-7B 23 | echo "Finetune from: ${MODEL_NAME_OR_PATH}" 24 | MODEL=${MODEL_NAME_OR_PATH##*/} 25 | 26 | TEMPLATE=custom 27 | echo "Finetune data template: ${TEMPLATE}" 28 | 29 | DATA_PATH=../datasets/ruler_training_dataset.jsonl 30 | echo "Finetune data path: ${DATA_PATH}" 31 | 32 | MODEL_MAX_LENGTH=2048 33 | echo "Model max length: ${MODEL_MAX_LENGTH}" 34 | 35 | BATCH_SIZE=4 36 | echo "Per device train batch size: ${BATCH_SIZE}" 37 | 38 | GRAD_ACCUM=8 39 | echo "Gradient accumulation steps: ${GRAD_ACCUM}" 40 | 41 | OUTPUT_DIR="../outputs/checkpoints/vanilla_${MODEL}_bs_${BATCH_SIZE}_ga_${GRAD_ACCUM}_lr_${LEARNING_RATE}_eps_${NUM_TRAIN_EPOCHS}" 42 | LOG_DIR=../logs 43 | 44 | deepspeed --master_port=$MASTER_PORT finetuning/finetune.py \ 45 | --vanilla $VANILLA \ 46 | --deepspeed ../configs/ds_config_zero3.json \ 47 | --model_name_or_path $MODEL_NAME_OR_PATH \ 48 | --template $TEMPLATE\ 49 | --model_max_length $MODEL_MAX_LENGTH \ 50 | --data_path $DATA_PATH \ 51 | --output_dir $OUTPUT_DIR \ 52 | --bf16 True \ 53 | --tf32 True \ 54 | --per_device_train_batch_size ${BATCH_SIZE} \ 55 | --gradient_accumulation_steps ${GRAD_ACCUM} \ 56 | --gradient_checkpointing True \ 57 | --lr_scheduler_type cosine \ 58 | --learning_rate ${LEARNING_RATE} \ 59 | --warmup_ratio 0.05 \ 60 | --num_train_epochs ${NUM_TRAIN_EPOCHS} \ 61 | --evaluation_strategy no \ 62 | --save_strategy epoch \ 63 | --save_total_limit 1 \ 64 | --logging_steps 5 \ 65 | 2>&1 | tee ${LOG_DIR}/output_vanilla_${MODEL}.log 66 | -------------------------------------------------------------------------------- /scripts/Yi-1.5-6B/vanilla.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=4,5,6,7 3 | 4 | find_free_port() { 5 | while : 6 | do 7 | PORT=$(( ( RANDOM % 64512 ) + 1024 )) 8 | (echo >/dev/tcp/localhost/$PORT) >/dev/null 2>&1 9 | if [ $? 
-ne 0 ]; then 10 | echo $PORT 11 | return 12 | fi 13 | done 14 | } 15 | 16 | export MASTER_PORT=$(find_free_port) 17 | 18 | LEARNING_RATE=2e-5 19 | NUM_TRAIN_EPOCHS=3 20 | VANILLA=True 21 | 22 | MODEL_NAME_OR_PATH=/data1/HF-Models/01-ai/Yi-1.5-6B 23 | echo "Finetune from: ${MODEL_NAME_OR_PATH}" 24 | MODEL=${MODEL_NAME_OR_PATH##*/} 25 | 26 | TEMPLATE=custom 27 | echo "Finetune data template: ${TEMPLATE}" 28 | 29 | DATA_PATH=../datasets/ruler_training_dataset.jsonl 30 | echo "Finetune data path: ${DATA_PATH}" 31 | 32 | MODEL_MAX_LENGTH=2048 33 | echo "Model max length: ${MODEL_MAX_LENGTH}" 34 | 35 | BATCH_SIZE=4 36 | echo "Per device train batch size: ${BATCH_SIZE}" 37 | 38 | GRAD_ACCUM=8 39 | echo "Gradient accumulation steps: ${GRAD_ACCUM}" 40 | 41 | OUTPUT_DIR="../outputs/checkpoints/vanilla_${MODEL}_bs_${BATCH_SIZE}_ga_${GRAD_ACCUM}_lr_${LEARNING_RATE}_eps_${NUM_TRAIN_EPOCHS}" 42 | LOG_DIR=../logs 43 | 44 | deepspeed --master_port=$MASTER_PORT finetuning/finetune.py \ 45 | --vanilla $VANILLA \ 46 | --deepspeed ../configs/ds_config_zero3.json \ 47 | --model_name_or_path $MODEL_NAME_OR_PATH \ 48 | --template $TEMPLATE\ 49 | --model_max_length $MODEL_MAX_LENGTH \ 50 | --data_path $DATA_PATH \ 51 | --output_dir $OUTPUT_DIR \ 52 | --bf16 True \ 53 | --tf32 True \ 54 | --per_device_train_batch_size ${BATCH_SIZE} \ 55 | --gradient_accumulation_steps ${GRAD_ACCUM} \ 56 | --gradient_checkpointing True \ 57 | --lr_scheduler_type cosine \ 58 | --learning_rate ${LEARNING_RATE} \ 59 | --warmup_ratio 0.05 \ 60 | --num_train_epochs ${NUM_TRAIN_EPOCHS} \ 61 | --evaluation_strategy no \ 62 | --save_strategy epoch \ 63 | --save_total_limit 1 \ 64 | --logging_steps 5 \ 65 | 2>&1 | tee ${LOG_DIR}/output_vanilla_${MODEL}.log 66 | -------------------------------------------------------------------------------- /scripts/gemma-7b/vanilla.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=4,5,6,7 3 | 4 | find_free_port() { 5 | while : 6 | do 7 | PORT=$(( ( RANDOM % 64512 ) + 1024 )) 8 | (echo >/dev/tcp/localhost/$PORT) >/dev/null 2>&1 9 | if [ $? 
-ne 0 ]; then 10 | echo $PORT 11 | return 12 | fi 13 | done 14 | } 15 | 16 | export MASTER_PORT=$(find_free_port) 17 | 18 | LEARNING_RATE=2e-5 19 | NUM_TRAIN_EPOCHS=3 20 | VANILLA=True 21 | 22 | MODEL_NAME_OR_PATH=/data1/HF-Models/google/gemma-7b 23 | echo "Finetune from: ${MODEL_NAME_OR_PATH}" 24 | MODEL=${MODEL_NAME_OR_PATH##*/} 25 | 26 | TEMPLATE=custom 27 | echo "Finetune data template: ${TEMPLATE}" 28 | 29 | DATA_PATH=../datasets/ruler_training_dataset.jsonl 30 | echo "Finetune data path: ${DATA_PATH}" 31 | 32 | MODEL_MAX_LENGTH=2048 33 | echo "Model max length: ${MODEL_MAX_LENGTH}" 34 | 35 | BATCH_SIZE=4 36 | echo "Per device train batch size: ${BATCH_SIZE}" 37 | 38 | GRAD_ACCUM=8 39 | echo "Gradient accumulation steps: ${GRAD_ACCUM}" 40 | 41 | OUTPUT_DIR="../outputs/checkpoints/vanilla_${MODEL}_bs_${BATCH_SIZE}_ga_${GRAD_ACCUM}_lr_${LEARNING_RATE}_eps_${NUM_TRAIN_EPOCHS}" 42 | LOG_DIR=../logs 43 | 44 | deepspeed --master_port=$MASTER_PORT finetuning/finetune.py \ 45 | --vanilla $VANILLA \ 46 | --deepspeed ../configs/ds_config_zero3.json \ 47 | --model_name_or_path $MODEL_NAME_OR_PATH \ 48 | --template $TEMPLATE\ 49 | --model_max_length $MODEL_MAX_LENGTH \ 50 | --data_path $DATA_PATH \ 51 | --output_dir $OUTPUT_DIR \ 52 | --bf16 True \ 53 | --tf32 True \ 54 | --per_device_train_batch_size ${BATCH_SIZE} \ 55 | --gradient_accumulation_steps ${GRAD_ACCUM} \ 56 | --gradient_checkpointing True \ 57 | --lr_scheduler_type cosine \ 58 | --learning_rate ${LEARNING_RATE} \ 59 | --warmup_ratio 0.05 \ 60 | --num_train_epochs ${NUM_TRAIN_EPOCHS} \ 61 | --evaluation_strategy no \ 62 | --save_strategy epoch \ 63 | --save_total_limit 1 \ 64 | --logging_steps 5 \ 65 | 2>&1 | tee ${LOG_DIR}/output_vanilla_${MODEL}.log 66 | -------------------------------------------------------------------------------- /scripts/deepseek-llm-7b-base/ruler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=4,5,6,7 3 | 4 | find_free_port() { 5 | while : 6 | do 7 | PORT=$(( ( RANDOM % 64512 ) + 1024 )) 8 | (echo >/dev/tcp/localhost/$PORT) >/dev/null 2>&1 9 | if [ $? 
-ne 0 ]; then 10 | echo $PORT 11 | return 12 | fi 13 | done 14 | } 15 | 16 | export MASTER_PORT=$(find_free_port) 17 | 18 | LEARNING_RATE=2e-5 19 | NUM_TRAIN_EPOCHS=3 20 | VANILLA=False 21 | 22 | MODEL_NAME_OR_PATH=/data1/HF-Models/deepseek-ai/deepseek-llm-7b-base 23 | echo "Finetune from: ${MODEL_NAME_OR_PATH}" 24 | MODEL=${MODEL_NAME_OR_PATH##*/} 25 | 26 | TEMPLATE=custom 27 | echo "Finetune data template: ${TEMPLATE}" 28 | 29 | DATA_PATH=../datasets/ruler_training_dataset.jsonl 30 | echo "Finetune data path: ${DATA_PATH}" 31 | 32 | MODEL_MAX_LENGTH=2048 33 | echo "Model max length: ${MODEL_MAX_LENGTH}" 34 | 35 | BATCH_SIZE=4 36 | echo "Per device train batch size: ${BATCH_SIZE}" 37 | 38 | GRAD_ACCUM=8 39 | echo "Gradient accumulation steps: ${GRAD_ACCUM}" 40 | 41 | OUTPUT_DIR="../outputs/checkpoints/ruler_${MODEL}_bs_${BATCH_SIZE}_ga_${GRAD_ACCUM}_lr_${LEARNING_RATE}_eps_${NUM_TRAIN_EPOCHS}" 42 | LOG_DIR=../logs 43 | 44 | deepspeed --master_port=$MASTER_PORT finetuning/finetune.py \ 45 | --vanilla $VANILLA \ 46 | --deepspeed ../configs/ds_config_zero3.json \ 47 | --model_name_or_path $MODEL_NAME_OR_PATH \ 48 | --template $TEMPLATE\ 49 | --model_max_length $MODEL_MAX_LENGTH \ 50 | --data_path $DATA_PATH \ 51 | --output_dir $OUTPUT_DIR \ 52 | --bf16 True \ 53 | --tf32 True \ 54 | --per_device_train_batch_size ${BATCH_SIZE} \ 55 | --gradient_accumulation_steps ${GRAD_ACCUM} \ 56 | --gradient_checkpointing True \ 57 | --lr_scheduler_type cosine \ 58 | --learning_rate ${LEARNING_RATE} \ 59 | --warmup_ratio 0.05 \ 60 | --num_train_epochs ${NUM_TRAIN_EPOCHS} \ 61 | --evaluation_strategy no \ 62 | --save_strategy epoch \ 63 | --save_total_limit 1 \ 64 | --logging_steps 5 \ 65 | 2>&1 | tee ${LOG_DIR}/output_ruler_${MODEL}.log -------------------------------------------------------------------------------- /scripts/deepseek-llm-7b-base/vanilla.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=4,5,6,7 3 | 4 | find_free_port() { 5 | while : 6 | do 7 | PORT=$(( ( RANDOM % 64512 ) + 1024 )) 8 | (echo >/dev/tcp/localhost/$PORT) >/dev/null 2>&1 9 | if [ $? 
-ne 0 ]; then 10 | echo $PORT 11 | return 12 | fi 13 | done 14 | } 15 | 16 | export MASTER_PORT=$(find_free_port) 17 | 18 | LEARNING_RATE=2e-5 19 | NUM_TRAIN_EPOCHS=3 20 | VANILLA=True 21 | 22 | MODEL_NAME_OR_PATH=/data1/HF-Models/deepseek-ai/deepseek-llm-7b-base 23 | echo "Finetune from: ${MODEL_NAME_OR_PATH}" 24 | MODEL=${MODEL_NAME_OR_PATH##*/} 25 | 26 | TEMPLATE=custom 27 | echo "Finetune data template: ${TEMPLATE}" 28 | 29 | DATA_PATH=../datasets/ruler_training_dataset.jsonl 30 | echo "Finetune data path: ${DATA_PATH}" 31 | 32 | MODEL_MAX_LENGTH=2048 33 | echo "Model max length: ${MODEL_MAX_LENGTH}" 34 | 35 | BATCH_SIZE=4 36 | echo "Per device train batch size: ${BATCH_SIZE}" 37 | 38 | GRAD_ACCUM=8 39 | echo "Gradient accumulation steps: ${GRAD_ACCUM}" 40 | 41 | OUTPUT_DIR="../outputs/checkpoints/vanilla_${MODEL}_bs_${BATCH_SIZE}_ga_${GRAD_ACCUM}_lr_${LEARNING_RATE}_eps_${NUM_TRAIN_EPOCHS}" 42 | LOG_DIR=../logs 43 | 44 | deepspeed --master_port=$MASTER_PORT finetuning/finetune.py \ 45 | --vanilla $VANILLA \ 46 | --deepspeed ../configs/ds_config_zero3.json \ 47 | --model_name_or_path $MODEL_NAME_OR_PATH \ 48 | --template $TEMPLATE\ 49 | --model_max_length $MODEL_MAX_LENGTH \ 50 | --data_path $DATA_PATH \ 51 | --output_dir $OUTPUT_DIR \ 52 | --bf16 True \ 53 | --tf32 True \ 54 | --per_device_train_batch_size ${BATCH_SIZE} \ 55 | --gradient_accumulation_steps ${GRAD_ACCUM} \ 56 | --gradient_checkpointing True \ 57 | --lr_scheduler_type cosine \ 58 | --learning_rate ${LEARNING_RATE} \ 59 | --warmup_ratio 0.05 \ 60 | --num_train_epochs ${NUM_TRAIN_EPOCHS} \ 61 | --evaluation_strategy no \ 62 | --save_strategy epoch \ 63 | --save_total_limit 1 \ 64 | --logging_steps 5 \ 65 | 2>&1 | tee ${LOG_DIR}/output_vanilla_${MODEL}.log 66 | -------------------------------------------------------------------------------- /src/exp/analysis_sgm.py: -------------------------------------------------------------------------------- 1 | import re 2 | import argparse 3 | from utils.config import MetaLengthToken, RANGE 4 | from utils import load_jsonl 5 | from rich.table import Table 6 | from rich.console import Console 7 | from utils.count import count_words 8 | 9 | 10 | def main(args): 11 | # raw data load 12 | df = load_jsonl(args.dataset_path) 13 | # draw table 14 | table = Table(show_header=True, header_style="bold magenta") 15 | table.add_column("Model", style="dim", width=15) 16 | for mlt in MetaLengthToken: 17 | table.add_column(mlt[0].split(":")[-1][:-1], justify="right") 18 | table.add_column("FM", justify="right") 19 | table.add_column("Avg", justify="right") 20 | count = {mlt[0]: 0 for mlt in MetaLengthToken} 21 | for d in df: 22 | if d["output"].count("MLT") != 1: 23 | d["output"] = "[MLT:" + d["output"].split("[MLT:")[1] 24 | for mlt in MetaLengthToken: 25 | if mlt[0] in d["output"]: 26 | count[mlt[0]] += 1 27 | hit = 0 28 | all_wc = 0 29 | for d in df: 30 | cleaned_text = re.sub(r"\[MLT:\d+\]", "", d["output"]) # clean MLT 31 | wc = count_words(cleaned_text) 32 | mlt = d["output"].split("]")[0] + "]" 33 | if (wc > RANGE[mlt.split(":")[-1][:-1]]["FM"][0]) and ( 34 | wc <= RANGE[mlt.split(":")[-1][:-1]]["FM"][1] 35 | ): 36 | hit += 1 37 | all_wc += wc 38 | table.add_row( 39 | args.dataset_path.split("/")[-1].split("tl_")[-1][:15], 40 | f"{count['[MLT:10]']}", 41 | f"{count['[MLT:30]']}", 42 | f"{count['[MLT:50]']}", 43 | f"{count['[MLT:80]']}", 44 | f"{count['[MLT:150]']}", 45 | f"{count['[MLT:300]']}", 46 | f"{count['[MLT:500]']}", 47 | f"{count['[MLT:700]']}", 48 | f"{count['[MLT:>800]']}", 49 | 
f"{hit/len(df)*100:.2f}", 50 | f"{all_wc/len(df):.0f}", 51 | ) 52 | console = Console() 53 | console.print(table) 54 | 55 | 56 | if __name__ == "__main__": 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument("--dataset_path", type=str, default=None) 59 | args = parser.parse_args() 60 | main(args) 61 | -------------------------------------------------------------------------------- /src/exp/run_exp_api.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import openai 3 | import os 4 | from tqdm import tqdm 5 | from utils import load_jsonl, save_jsonl 6 | from dotenv import load_dotenv 7 | 8 | 9 | load_dotenv() 10 | 11 | 12 | def main(args): 13 | client = openai.OpenAI( 14 | api_key=args.key, base_url=os.getenv("OPENAI_BASE_URL") 15 | ) 16 | def get_completion_openai( 17 | prompt: str, 18 | model: str, 19 | ) -> str: 20 | """ 21 | Generate a completion using the OpenAI API. 22 | 23 | Args: 24 | prompt (str): The user's prompt or query. 25 | model (str, optional): The name of the OpenAI model to use for generating the completion. 26 | Defaults to "gpt-4-turbo". 27 | """ 28 | response = client.chat.completions.create( 29 | model=model, 30 | top_p=1, 31 | max_tokens=2048, 32 | messages=[ 33 | {"role": "user", "content": prompt}, 34 | ], 35 | ) 36 | return response.choices[0].message.content 37 | 38 | # raw data load 39 | df = load_jsonl(args.dataset_path) 40 | # load tokenizer and llm 41 | for idx in tqdm(range(len(df))): 42 | instruction = df[idx]["Instruction"] 43 | targetlength = df[idx]["TargetLength"] if "TargetLength" in df[idx] else "" 44 | if targetlength != "": 45 | targetlength = targetlength.replace(">", "more than ") 46 | question = f"{instruction}\nThe response should have a word count of {targetlength} words." 
47 | df[idx]["prompt"] = question 48 | flag = False 49 | while not flag: 50 | try: 51 | output = get_completion_openai(question, args.model) 52 | flag = True 53 | except Exception as e: 54 | print(e) 55 | df[idx]["output"] = output 56 | if idx % 10 == 0: 57 | save_jsonl(args.output_path, df) 58 | # save to output_path 59 | save_jsonl(args.output_path, df) 60 | 61 | 62 | if __name__ == "__main__": 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument("--dataset_path", type=str, default=None) 65 | parser.add_argument("--model", type=str, default=None) 66 | parser.add_argument("--output_path", type=str, default=None) 67 | parser.add_argument("--key", type=str, default=None) 68 | args = parser.parse_args() 69 | main(args) 70 | -------------------------------------------------------------------------------- /src/exp/analysis_mmlt.py: -------------------------------------------------------------------------------- 1 | import re 2 | import argparse 3 | from utils.config import MetaLengthToken, RANGE 4 | from utils import load_jsonl 5 | from rich.table import Table 6 | from rich.console import Console 7 | from utils.count import count_words 8 | 9 | 10 | def main(args): 11 | # raw data load 12 | df = load_jsonl(args.dataset_path) 13 | # draw table 14 | table = Table(show_header=True, header_style="bold magenta") 15 | table.add_column("Model", style="dim", width=15) 16 | for mlt in MetaLengthToken: 17 | table.add_column(mlt[0].split(':')[-1][:-1], justify="right") 18 | table.add_column('Acc', justify="right") 19 | count = {mlt[0]:0 for mlt in MetaLengthToken} 20 | hit = {mlt[0]:0 for mlt in MetaLengthToken} 21 | for d in df: 22 | count[f"[MLT:{d['TargetLength']}]"] += 1 23 | for d in df: 24 | wc = count_words(d['output']) 25 | mlt = f"[MLT:{d['TargetLength']}]" 26 | if (wc > RANGE[mlt.split(':')[-1][:-1]]['FM'][0]) and (wc <= RANGE[mlt.split(':')[-1][:-1]]['FM'][1]): 27 | hit[mlt] += 1 28 | # print(hit) 29 | # print(count) 30 | table.add_row( 31 | args.dataset_path.split('/')[-1].split('tl_')[-1][:15], 32 | f"{hit['[MLT:10]']/count['[MLT:10]']*100:.2f}", 33 | f"{hit['[MLT:30]']/count['[MLT:30]']*100:.2f}", 34 | f"{hit['[MLT:50]']/count['[MLT:50]']*100:.2f}", 35 | f"{hit['[MLT:80]']/count['[MLT:80]']*100:.2f}", 36 | f"{hit['[MLT:150]']/count['[MLT:150]']*100:.2f}", 37 | f"{hit['[MLT:300]']/count['[MLT:300]']*100:.2f}", 38 | f"{hit['[MLT:500]']/count['[MLT:500]']*100:.2f}", 39 | f"{hit['[MLT:700]']/count['[MLT:700]']*100:.2f}", 40 | f"{hit['[MLT:>800]']/count['[MLT:>800]']*100:.2f}", 41 | f"{sum(hit.values())/sum(count.values())*100:.2f}", 42 | ) 43 | console = Console() 44 | console.print(table) 45 | latex = [ 46 | f"{hit['[MLT:10]']/count['[MLT:10]']*100:.1f}", 47 | f"{hit['[MLT:30]']/count['[MLT:30]']*100:.1f}", 48 | f"{hit['[MLT:50]']/count['[MLT:50]']*100:.1f}", 49 | f"{hit['[MLT:80]']/count['[MLT:80]']*100:.1f}", 50 | f"{hit['[MLT:150]']/count['[MLT:150]']*100:.1f}", 51 | f"{hit['[MLT:300]']/count['[MLT:300]']*100:.1f}", 52 | f"{hit['[MLT:500]']/count['[MLT:500]']*100:.1f}", 53 | f"{hit['[MLT:700]']/count['[MLT:700]']*100:.1f}", 54 | f"{hit['[MLT:>800]']/count['[MLT:>800]']*100:.1f}", 55 | f"{sum(hit.values())/sum(count.values())*100:.2f}", 56 | ] 57 | print('&'.join(latex) + '\\\\') 58 | 59 | 60 | if __name__ == "__main__": 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument("--dataset_path", type=str, default=None) 63 | args = parser.parse_args() 64 | main(args) 65 | -------------------------------------------------------------------------------- /src/exp/run_exp.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | from tqdm import tqdm 3 | from vllm import LLM, SamplingParams 4 | from transformers import AutoTokenizer 5 | from utils import load_jsonl, save_jsonl 6 | from utils.templates import TemplatesMapping 7 | 8 | 9 | def main(args): 10 | # raw data load 11 | df = load_jsonl(args.dataset_path) 12 | # load tokenizer and llm 13 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) 14 | llm = LLM( 15 | model=args.model_name_or_path, 16 | trust_remote_code=True, 17 | tensor_parallel_size=args.gpus, 18 | ) 19 | template = TemplatesMapping[args.template] 20 | if args.template == "default": 21 | terminators = TemplatesMapping["default"].get_stop_tokens( 22 | args.model_name_or_path 23 | ) 24 | elif "self_generated_mlt.jsonl" in args.dataset_path: 25 | terminators = ["<|end_of_text|>", "<|eot_id|>"] 26 | else: 27 | terminators = template.STOP_TOKENS 28 | print(f"> STOP_TOKENS:{terminators}") 29 | terminators = tokenizer.convert_tokens_to_ids(terminators) 30 | skip_special_tokens = False if "self_generated_mlt.jsonl" in args.dataset_path else True 31 | sampling_params = SamplingParams( 32 | temperature=0, 33 | max_tokens=2048, 34 | stop_token_ids=terminators, 35 | skip_special_tokens=skip_special_tokens, 36 | ) 37 | for idx in tqdm(range(len(df))): 38 | instruction = df[idx]["Instruction"] 39 | targetlength = df[idx]["TargetLength"] if "TargetLength" in df[idx] else "" 40 | if args.template == "default": 41 | prompts = [ 42 | template.apply_template_for_generation( 43 | instruction, targetlength, tokenizer 44 | ) 45 | ] 46 | elif args.template == "custom": 47 | if "vanilla" in args.model_name_or_path: 48 | prompts = [ 49 | template.apply_template_for_generation_vanilla(instruction, targetlength) 50 | ] 51 | else: 52 | prompts = [ 53 | template.apply_template_for_generation(instruction, targetlength) 54 | ] 55 | else: 56 | prompts = [ 57 | template.apply_template_for_generation(instruction, targetlength) 58 | ] 59 | df[idx]["prompt"] = prompts[0] 60 | outputs = llm.generate(prompts, sampling_params) 61 | for output in outputs: 62 | generated_text = output.outputs[0].text 63 | df[idx]["output"] = generated_text 64 | if idx % 100 == 0: 65 | save_jsonl(args.output_path, df) 66 | # save to output_path 67 | save_jsonl(args.output_path, df) 68 | 69 | 70 | if __name__ == "__main__": 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument("--dataset_path", type=str, default=None) 73 | parser.add_argument("--model_name_or_path", type=str, default=None) 74 | parser.add_argument("--gpus", type=int, default=1) 75 | parser.add_argument("--template", type=str, default="default") 76 | parser.add_argument("--output_path", type=str, default=None) 77 | args = parser.parse_args() 78 | main(args) 79 | -------------------------------------------------------------------------------- /scripts/Yi-1.5-6B/ruler_lm_eval.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | export CUDA_VISIBLE_DEVICES=7 3 | export NUMEXPR_MAX_THREADS=128 4 | 5 | MODEL=vllm 6 | MODEL_NAME=ruler_Yi-1.5-6B 7 | MODEL_NAME_OR_PATH=/data1/lijiaming/Ruler/checkpoints/ruler_Yi-1.5-6B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841 8 | OUTPUT_PATH=../outputs/other_tasks/${MODEL_NAME} 9 | TOKENIZER_MODE=auto 10 | NUM_GPUS=1 11 | GPU_MEMORY_UTILIZATION=0.8 12 | 13 | mkdir -p $OUTPUT_PATH 14 | 15 | # lm_eval --model $MODEL \ 16 | # --model_args
pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 17 | # --tasks leaderboard \ 18 | # --device cuda \ 19 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_leaderboard \ 20 | # --batch_size 1 \ 21 | # --write_out \ 22 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_leaderboard.log 23 | 24 | lm_eval --model $MODEL \ 25 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 26 | --tasks ai2_arc \ 27 | --device cuda \ 28 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc \ 29 | --batch_size 1 \ 30 | --num_fewshot 25 \ 31 | --write_out \ 32 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc.log 33 | 34 | lm_eval --model $MODEL \ 35 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 36 | --tasks hellaswag \ 37 | --device cuda \ 38 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_hellaswag \ 39 | --batch_size 1 \ 40 | --num_fewshot 10 \ 41 | --write_out \ 42 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_hellaswag.log 43 | 44 | lm_eval --model $MODEL \ 45 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 46 | --tasks truthfulqa \ 47 | --device cuda \ 48 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa \ 49 | --batch_size 1 \ 50 | --num_fewshot 0 \ 51 | --write_out \ 52 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa.log 53 | 54 | lm_eval --model $MODEL \ 55 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 56 | --tasks mmlu \ 57 | --device cuda \ 58 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_mmlu \ 59 | --batch_size 1 \ 60 | --num_fewshot 5 \ 61 | --write_out \ 62 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_mmlu.log 63 | 64 | lm_eval --model $MODEL \ 65 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 66 | --tasks winogrande \ 67 | --device cuda \ 68 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_winogrande \ 69 | --batch_size 1 \ 70 | --num_fewshot 5 \ 71 | --write_out \ 72 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_winogrande.log 73 | 74 | lm_eval --model $MODEL \ 75 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 76 | --tasks gsm8k \ 77 | --device cuda \ 78 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_gsm8k \ 79 | --batch_size 1 \ 80 | --num_fewshot 5 \ 81 | --write_out \ 82 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_gsm8k.log -------------------------------------------------------------------------------- /scripts/Yi-1.5-6B/vanilla_lm_eval.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | export CUDA_VISIBLE_DEVICES=6 3 | export NUMEXPR_MAX_THREADS=128 4 | 5 | MODEL=vllm 6 | MODEL_NAME=vanilla_Yi-1.5-6B 7 | 
MODEL_NAME_OR_PATH=/data1/lijiaming/Ruler/checkpoints/vanilla_Yi-1.5-6B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841 8 | OUTPUT_PATH=../outputs/other_tasks/${MODEL_NAME} 9 | TOKENIZER_MODE=auto 10 | NUM_GPUS=1 11 | GPU_MEMORY_UTILIZATION=0.8 12 | 13 | mkdir -p $OUTPUT_PATH 14 | 15 | # lm_eval --model $MODEL \ 16 | # --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 17 | # --tasks leaderboard \ 18 | # --device cuda \ 19 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_leaderboard \ 20 | # --batch_size 1 \ 21 | # --write_out \ 22 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_leaderboard.log 23 | 24 | lm_eval --model $MODEL \ 25 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 26 | --tasks ai2_arc \ 27 | --device cuda \ 28 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc \ 29 | --batch_size 1 \ 30 | --num_fewshot 25 \ 31 | --write_out \ 32 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc.log 33 | 34 | lm_eval --model $MODEL \ 35 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 36 | --tasks hellaswag \ 37 | --device cuda \ 38 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_hellaswag \ 39 | --batch_size 1 \ 40 | --num_fewshot 10 \ 41 | --write_out \ 42 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_hellaswag.log 43 | 44 | lm_eval --model $MODEL \ 45 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 46 | --tasks truthfulqa \ 47 | --device cuda \ 48 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa \ 49 | --batch_size 1 \ 50 | --num_fewshot 0 \ 51 | --write_out \ 52 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa.log 53 | 54 | lm_eval --model $MODEL \ 55 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 56 | --tasks mmlu \ 57 | --device cuda \ 58 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_mmlu \ 59 | --batch_size 1 \ 60 | --num_fewshot 5 \ 61 | --write_out \ 62 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_mmlu.log 63 | 64 | lm_eval --model $MODEL \ 65 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 66 | --tasks winogrande \ 67 | --device cuda \ 68 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_winogrande \ 69 | --batch_size 1 \ 70 | --num_fewshot 5 \ 71 | --write_out \ 72 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_winogrande.log 73 | 74 | lm_eval --model $MODEL \ 75 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 76 | --tasks gsm8k \ 77 | --device cuda \ 78 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_gsm8k \ 79 | --batch_size 1 \ 80 | --num_fewshot 5 \ 81 | --write_out \ 82 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_gsm8k.log 
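Note: the ruler_lm_eval.sh and vanilla_lm_eval.sh scripts repeat the same lm_eval invocation once per benchmark, changing only the task name and its few-shot count (25 for ai2_arc, 10 for hellaswag, 0 for truthfulqa, 5 for mmlu, winogrande, and gsm8k). The following is a minimal sketch, not a file from this repository, of how those calls could be driven from a single loop; MODEL_NAME and MODEL_NAME_OR_PATH are placeholders to be filled in per checkpoint.

# Hypothetical consolidation of the per-task lm_eval calls above (a sketch, not part of the repo).
set -ex
export CUDA_VISIBLE_DEVICES=0
export NUMEXPR_MAX_THREADS=128

MODEL=vllm
MODEL_NAME=ruler_Yi-1.5-6B                  # placeholder: name of the evaluated checkpoint
MODEL_NAME_OR_PATH=/path/to/checkpoint-2841 # placeholder: path to that checkpoint
OUTPUT_PATH=../outputs/other_tasks/${MODEL_NAME}
TOKENIZER_MODE=auto
NUM_GPUS=1
GPU_MEMORY_UTILIZATION=0.8

mkdir -p $OUTPUT_PATH

# task:few-shot pairs taken from the individual commands in the scripts above
for PAIR in ai2_arc:25 hellaswag:10 truthfulqa:0 mmlu:5 winogrande:5 gsm8k:5; do
    TASK=${PAIR%%:*}
    NUM_FEWSHOT=${PAIR##*:}
    lm_eval --model $MODEL \
        --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \
        --tasks $TASK \
        --device cuda \
        --output_path ${OUTPUT_PATH}/${MODEL}_eval_${TASK} \
        --batch_size 1 \
        --num_fewshot ${NUM_FEWSHOT} \
        --write_out \
        2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_${TASK}.log
done

Because the task name is reused in --output_path and in the tee target, results would land in the same ${MODEL}_eval_<task> files that the per-task scripts produce.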
-------------------------------------------------------------------------------- /scripts/gemma-7b/ruler_lm_eval.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | export CUDA_VISIBLE_DEVICES=2 3 | export NUMEXPR_MAX_THREADS=128 4 | 5 | MODEL=vllm 6 | MODEL_NAME=ruler_gemma-7b 7 | MODEL_NAME_OR_PATH=/home/lijiaming/workspace/Seed/Seed-Ruler/outputs/checkpoints/ruler_gemma-7b_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841 8 | OUTPUT_PATH=../outputs/other_tasks/${MODEL_NAME} 9 | TOKENIZER_MODE=auto 10 | NUM_GPUS=1 11 | GPU_MEMORY_UTILIZATION=0.8 12 | 13 | mkdir -p $OUTPUT_PATH 14 | 15 | # lm_eval --model $MODEL \ 16 | # --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 17 | # --tasks leaderboard \ 18 | # --device cuda \ 19 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_leaderboard \ 20 | # --batch_size 1 \ 21 | # --write_out \ 22 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_leaderboard.log 23 | 24 | lm_eval --model $MODEL \ 25 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 26 | --tasks ai2_arc \ 27 | --device cuda \ 28 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc \ 29 | --batch_size 1 \ 30 | --num_fewshot 25 \ 31 | --write_out \ 32 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc.log 33 | 34 | lm_eval --model $MODEL \ 35 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 36 | --tasks hellaswag \ 37 | --device cuda \ 38 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_hellaswag \ 39 | --batch_size 1 \ 40 | --num_fewshot 10 \ 41 | --write_out \ 42 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_hellaswag.log 43 | 44 | lm_eval --model $MODEL \ 45 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 46 | --tasks truthfulqa \ 47 | --device cuda \ 48 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa \ 49 | --batch_size 1 \ 50 | --num_fewshot 0 \ 51 | --write_out \ 52 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa.log 53 | 54 | lm_eval --model $MODEL \ 55 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 56 | --tasks mmlu \ 57 | --device cuda \ 58 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_mmlu \ 59 | --batch_size 1 \ 60 | --num_fewshot 5 \ 61 | --write_out \ 62 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_mmlu.log 63 | 64 | lm_eval --model $MODEL \ 65 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 66 | --tasks winogrande \ 67 | --device cuda \ 68 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_winogrande \ 69 | --batch_size 1 \ 70 | --num_fewshot 5 \ 71 | --write_out \ 72 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_winogrande.log 73 | 74 | lm_eval --model $MODEL \ 75 | --model_args 
pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 76 | --tasks gsm8k \ 77 | --device cuda \ 78 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_gsm8k \ 79 | --batch_size 1 \ 80 | --num_fewshot 5 \ 81 | --write_out \ 82 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_gsm8k.log -------------------------------------------------------------------------------- /scripts/Qwen1.5-7B/ruler_lm_eval.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | export CUDA_VISIBLE_DEVICES=4 3 | export NUMEXPR_MAX_THREADS=128 4 | 5 | MODEL=vllm 6 | MODEL_NAME=ruler_Qwen1.5-7B 7 | MODEL_NAME_OR_PATH=/home/lijiaming/workspace/Seed/Seed-Ruler/outputs/checkpoints/ruler_Qwen1.5-7B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841 8 | OUTPUT_PATH=../outputs/other_tasks/${MODEL_NAME} 9 | TOKENIZER_MODE=auto 10 | NUM_GPUS=1 11 | GPU_MEMORY_UTILIZATION=0.8 12 | 13 | mkdir -p $OUTPUT_PATH 14 | 15 | # lm_eval --model $MODEL \ 16 | # --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 17 | # --tasks leaderboard \ 18 | # --device cuda \ 19 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_leaderboard \ 20 | # --batch_size 1 \ 21 | # --write_out \ 22 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_leaderboard.log 23 | 24 | lm_eval --model $MODEL \ 25 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 26 | --tasks ai2_arc \ 27 | --device cuda \ 28 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc \ 29 | --batch_size 1 \ 30 | --num_fewshot 25 \ 31 | --write_out \ 32 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc.log 33 | 34 | lm_eval --model $MODEL \ 35 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 36 | --tasks hellaswag \ 37 | --device cuda \ 38 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_hellaswag \ 39 | --batch_size 1 \ 40 | --num_fewshot 10 \ 41 | --write_out \ 42 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_hellaswag.log 43 | 44 | lm_eval --model $MODEL \ 45 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 46 | --tasks truthfulqa \ 47 | --device cuda \ 48 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa \ 49 | --batch_size 1 \ 50 | --num_fewshot 0 \ 51 | --write_out \ 52 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa.log 53 | 54 | lm_eval --model $MODEL \ 55 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 56 | --tasks mmlu \ 57 | --device cuda \ 58 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_mmlu \ 59 | --batch_size 1 \ 60 | --num_fewshot 5 \ 61 | --write_out \ 62 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_mmlu.log 63 | 64 | lm_eval --model $MODEL \ 65 | --model_args 
pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 66 | --tasks winogrande \ 67 | --device cuda \ 68 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_winogrande \ 69 | --batch_size 1 \ 70 | --num_fewshot 5 \ 71 | --write_out \ 72 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_winogrande.log 73 | 74 | lm_eval --model $MODEL \ 75 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 76 | --tasks gsm8k \ 77 | --device cuda \ 78 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_gsm8k \ 79 | --batch_size 1 \ 80 | --num_fewshot 5 \ 81 | --write_out \ 82 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_gsm8k.log -------------------------------------------------------------------------------- /scripts/gemma-7b/vanilla_lm_eval.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | export CUDA_VISIBLE_DEVICES=3 3 | export NUMEXPR_MAX_THREADS=128 4 | 5 | MODEL=vllm 6 | MODEL_NAME=vanilla_gemma-7b 7 | MODEL_NAME_OR_PATH=/home/lijiaming/workspace/Seed/Seed-Ruler/outputs/checkpoints/vanilla_gemma-7b_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841 8 | OUTPUT_PATH=../outputs/other_tasks/${MODEL_NAME} 9 | TOKENIZER_MODE=auto 10 | NUM_GPUS=1 11 | GPU_MEMORY_UTILIZATION=0.8 12 | 13 | mkdir -p $OUTPUT_PATH 14 | 15 | # lm_eval --model $MODEL \ 16 | # --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 17 | # --tasks leaderboard \ 18 | # --device cuda \ 19 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_leaderboard \ 20 | # --batch_size 1 \ 21 | # --write_out \ 22 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_leaderboard.log 23 | 24 | lm_eval --model $MODEL \ 25 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 26 | --tasks ai2_arc \ 27 | --device cuda \ 28 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc \ 29 | --batch_size 1 \ 30 | --num_fewshot 25 \ 31 | --write_out \ 32 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc.log 33 | 34 | lm_eval --model $MODEL \ 35 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 36 | --tasks hellaswag \ 37 | --device cuda \ 38 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_hellaswag \ 39 | --batch_size 1 \ 40 | --num_fewshot 10 \ 41 | --write_out \ 42 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_hellaswag.log 43 | 44 | lm_eval --model $MODEL \ 45 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 46 | --tasks truthfulqa \ 47 | --device cuda \ 48 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa \ 49 | --batch_size 1 \ 50 | --num_fewshot 0 \ 51 | --write_out \ 52 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa.log 53 | 54 | lm_eval --model $MODEL \ 55 | --model_args 
pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 56 | --tasks mmlu \ 57 | --device cuda \ 58 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_mmlu \ 59 | --batch_size 1 \ 60 | --num_fewshot 5 \ 61 | --write_out \ 62 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_mmlu.log 63 | 64 | lm_eval --model $MODEL \ 65 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 66 | --tasks winogrande \ 67 | --device cuda \ 68 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_winogrande \ 69 | --batch_size 1 \ 70 | --num_fewshot 5 \ 71 | --write_out \ 72 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_winogrande.log 73 | 74 | lm_eval --model $MODEL \ 75 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 76 | --tasks gsm8k \ 77 | --device cuda \ 78 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_gsm8k \ 79 | --batch_size 1 \ 80 | --num_fewshot 5 \ 81 | --write_out \ 82 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_gsm8k.log -------------------------------------------------------------------------------- /src/exp/cal_mlt_scores.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from utils.config import RANGE, LEVEL0, LEVEL1, LEVEL2 3 | from utils import load_jsonl 4 | from rich.table import Table 5 | from rich.console import Console 6 | from utils.count import count_words 7 | 8 | 9 | def metric_targetlength(df, LEVEL): 10 | result = { 11 | targetlength: {"PM": {"in": 0, "out": 0}, "FM": {"in": 0, "out": 0}} 12 | for targetlength in LEVEL 13 | } 14 | for d in df: 15 | length = count_words(d["output"]) 16 | if d["TargetLength"] in result: 17 | # PM 18 | if ( 19 | length > RANGE[d["TargetLength"]]["PM"][0] 20 | and length <= RANGE[d["TargetLength"]]["PM"][1] 21 | ): 22 | result[d["TargetLength"]]["PM"]["in"] += 1 23 | else: 24 | result[d["TargetLength"]]["PM"]["out"] += 1 25 | # FM 26 | if ( 27 | length > RANGE[d["TargetLength"]]["FM"][0] 28 | and length <= RANGE[d["TargetLength"]]["FM"][1] 29 | ): 30 | result[d["TargetLength"]]["FM"]["in"] += 1 31 | else: 32 | result[d["TargetLength"]]["FM"]["out"] += 1 33 | # draw table 34 | table = Table(show_header=True, header_style="bold magenta") 35 | table.add_column("TargetLength", style="dim", width=12) 36 | table.add_column("PM_in", justify="right") 37 | table.add_column("PM_out", justify="right") 38 | table.add_column("PM", justify="right") 39 | table.add_column("FM_in", justify="right") 40 | table.add_column("FM_out", justify="right") 41 | table.add_column("FM", justify="right") 42 | # latex_str = "" 43 | for key in result: 44 | table.add_row( 45 | key, 46 | f"{result[key]['PM']['in']}", 47 | f"{result[key]['PM']['out']}", 48 | f"{result[key]['PM']['in'] / (result[key]['PM']['in'] + result[key]['PM']['out'])*100:.2f}", 49 | f"{result[key]['FM']['in']}", 50 | f"{result[key]['FM']['out']}", 51 | f"{result[key]['FM']['in'] / (result[key]['FM']['in'] + result[key]['FM']['out'])*100:.2f}", 52 | ) 53 | # latex_str = ( 54 | # latex_str 55 | # + "&" 56 | # + f"{result[key]['PM']['in'] / (result[key]['PM']['in'] + result[key]['PM']['out'])*100:.2f}" 57 | # + "&" 58 | # + f"{result[key]['FM']['in'] / (result[key]['FM']['in'] 
+ result[key]['FM']['out'])*100:.2f}" 59 | # ) 60 | table.add_row( 61 | "Total", 62 | f"{sum([result[key]['PM']['in']for key in result])}", 63 | f"{sum([result[key]['PM']['out']for key in result])}", 64 | f"{sum([result[key]['PM']['in']for key in result]) / (sum([result[key]['PM']['in']for key in result]) + sum([result[key]['PM']['out']for key in result]))*100:.2f}", 65 | f"{sum([result[key]['FM']['in']for key in result])}", 66 | f"{sum([result[key]['FM']['out']for key in result])}", 67 | f"{sum([result[key]['FM']['in']for key in result]) / (sum([result[key]['FM']['in']for key in result]) + sum([result[key]['FM']['out']for key in result]))*100:.2f}", 68 | ) 69 | console = Console() 70 | console.print(table) 71 | # print(latex_str) 72 | 73 | 74 | def main(args): 75 | # raw data load 76 | df = load_jsonl(args.dataset_path) 77 | print(f"> LEVEL0{'='*20}") 78 | metric_targetlength(df, LEVEL0) 79 | print(f"> LEVEL1{'='*20}") 80 | metric_targetlength(df, LEVEL1) 81 | print(f"> LEVEL2{'='*20}") 82 | metric_targetlength(df, LEVEL2) 83 | 84 | 85 | if __name__ == "__main__": 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument("--dataset_path", type=str, default=None) 88 | args = parser.parse_args() 89 | main(args) 90 | -------------------------------------------------------------------------------- /scripts/Qwen1.5-7B/vanilla_lm_eval.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | export CUDA_VISIBLE_DEVICES=5 3 | export NUMEXPR_MAX_THREADS=128 4 | 5 | MODEL=vllm 6 | MODEL_NAME=vanilla_Qwen1.5-7B 7 | MODEL_NAME_OR_PATH=/home/lijiaming/workspace/Seed/Seed-Ruler/outputs/checkpoints/vanilla_Qwen1.5-7B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841 8 | OUTPUT_PATH=../outputs/other_tasks/${MODEL_NAME} 9 | TOKENIZER_MODE=auto 10 | NUM_GPUS=1 11 | GPU_MEMORY_UTILIZATION=0.8 12 | 13 | mkdir -p $OUTPUT_PATH 14 | 15 | # lm_eval --model $MODEL \ 16 | # --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 17 | # --tasks leaderboard \ 18 | # --device cuda \ 19 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_leaderboard \ 20 | # --batch_size 1 \ 21 | # --write_out \ 22 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_leaderboard.log 23 | 24 | lm_eval --model $MODEL \ 25 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 26 | --tasks ai2_arc \ 27 | --device cuda \ 28 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc \ 29 | --batch_size 1 \ 30 | --num_fewshot 25 \ 31 | --write_out \ 32 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc.log 33 | 34 | lm_eval --model $MODEL \ 35 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 36 | --tasks hellaswag \ 37 | --device cuda \ 38 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_hellaswag \ 39 | --batch_size 1 \ 40 | --num_fewshot 10 \ 41 | --write_out \ 42 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_hellaswag.log 43 | 44 | lm_eval --model $MODEL \ 45 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 46 | --tasks truthfulqa \ 47 | --device cuda 
\ 48 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa \ 49 | --batch_size 1 \ 50 | --num_fewshot 0 \ 51 | --write_out \ 52 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa.log 53 | 54 | lm_eval --model $MODEL \ 55 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 56 | --tasks mmlu \ 57 | --device cuda \ 58 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_mmlu \ 59 | --batch_size 1 \ 60 | --num_fewshot 5 \ 61 | --write_out \ 62 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_mmlu.log 63 | 64 | lm_eval --model $MODEL \ 65 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 66 | --tasks winogrande \ 67 | --device cuda \ 68 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_winogrande \ 69 | --batch_size 1 \ 70 | --num_fewshot 5 \ 71 | --write_out \ 72 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_winogrande.log 73 | 74 | lm_eval --model $MODEL \ 75 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 76 | --tasks gsm8k \ 77 | --device cuda \ 78 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_gsm8k \ 79 | --batch_size 1 \ 80 | --num_fewshot 5 \ 81 | --write_out \ 82 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_gsm8k.log -------------------------------------------------------------------------------- /scripts/Meta-Llama-3-8B/ruler_lm_eval.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | export CUDA_VISIBLE_DEVICES=0 3 | export NUMEXPR_MAX_THREADS=128 4 | 5 | MODEL=vllm 6 | MODEL_NAME=ruler_Meta-Llama-3-8B 7 | MODEL_NAME_OR_PATH=/home/lijiaming/workspace/Seed/Seed-Ruler/outputs/checkpoints/ruler_Meta-Llama-3-8B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841 8 | OUTPUT_PATH=../outputs/other_tasks/${MODEL_NAME} 9 | TOKENIZER_MODE=auto 10 | NUM_GPUS=1 11 | GPU_MEMORY_UTILIZATION=0.8 12 | 13 | mkdir -p $OUTPUT_PATH 14 | 15 | # lm_eval --model $MODEL \ 16 | # --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 17 | # --tasks leaderboard \ 18 | # --device cuda \ 19 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_leaderboard \ 20 | # --batch_size 1 \ 21 | # --write_out \ 22 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_leaderboard.log 23 | 24 | lm_eval --model $MODEL \ 25 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 26 | --tasks ai2_arc \ 27 | --device cuda \ 28 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc \ 29 | --batch_size 1 \ 30 | --num_fewshot 25 \ 31 | --write_out \ 32 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc.log 33 | 34 | lm_eval --model $MODEL \ 35 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 36 | --tasks hellaswag \ 37 | --device cuda \ 38 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_hellaswag \ 39 | --batch_size 1 \ 40 | --num_fewshot 10 \ 41 | --write_out \ 42 | 2>&1 | 
tee ${OUTPUT_PATH}/${MODEL}_eval_hellaswag.log 43 | 44 | lm_eval --model $MODEL \ 45 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 46 | --tasks truthfulqa \ 47 | --device cuda \ 48 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa \ 49 | --batch_size 1 \ 50 | --num_fewshot 0 \ 51 | --write_out \ 52 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa.log 53 | 54 | lm_eval --model $MODEL \ 55 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 56 | --tasks mmlu \ 57 | --device cuda \ 58 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_mmlu \ 59 | --batch_size 1 \ 60 | --num_fewshot 5 \ 61 | --write_out \ 62 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_mmlu.log 63 | 64 | lm_eval --model $MODEL \ 65 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 66 | --tasks winogrande \ 67 | --device cuda \ 68 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_winogrande \ 69 | --batch_size 1 \ 70 | --num_fewshot 5 \ 71 | --write_out \ 72 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_winogrande.log 73 | 74 | lm_eval --model $MODEL \ 75 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 76 | --tasks gsm8k \ 77 | --device cuda \ 78 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_gsm8k \ 79 | --batch_size 1 \ 80 | --num_fewshot 5 \ 81 | --write_out \ 82 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_gsm8k.log -------------------------------------------------------------------------------- /scripts/Mistral-7B-v0.3/ruler_lm_eval.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | export CUDA_VISIBLE_DEVICES=0 3 | export NUMEXPR_MAX_THREADS=128 4 | 5 | MODEL=vllm 6 | MODEL_NAME=ruler_Mistral-7B-v0.3 7 | MODEL_NAME_OR_PATH=/home/lijiaming/workspace/Seed/Seed-Ruler/outputs/checkpoints/ruler_Mistral-7B-v0.3_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841 8 | OUTPUT_PATH=../outputs/other_tasks/${MODEL_NAME} 9 | TOKENIZER_MODE=auto 10 | NUM_GPUS=1 11 | GPU_MEMORY_UTILIZATION=0.8 12 | 13 | mkdir -p $OUTPUT_PATH 14 | 15 | # lm_eval --model $MODEL \ 16 | # --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 17 | # --tasks leaderboard \ 18 | # --device cuda \ 19 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_leaderboard \ 20 | # --batch_size 1 \ 21 | # --write_out \ 22 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_leaderboard.log 23 | 24 | lm_eval --model $MODEL \ 25 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 26 | --tasks ai2_arc \ 27 | --device cuda \ 28 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc \ 29 | --batch_size 1 \ 30 | --num_fewshot 25 \ 31 | --write_out \ 32 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc.log 33 | 34 | lm_eval --model $MODEL \ 35 | --model_args 
pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 36 | --tasks hellaswag \ 37 | --device cuda \ 38 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_hellaswag \ 39 | --batch_size 1 \ 40 | --num_fewshot 10 \ 41 | --write_out \ 42 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_hellaswag.log 43 | 44 | lm_eval --model $MODEL \ 45 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 46 | --tasks truthfulqa \ 47 | --device cuda \ 48 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa \ 49 | --batch_size 1 \ 50 | --num_fewshot 0 \ 51 | --write_out \ 52 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa.log 53 | 54 | lm_eval --model $MODEL \ 55 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 56 | --tasks mmlu \ 57 | --device cuda \ 58 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_mmlu \ 59 | --batch_size 1 \ 60 | --num_fewshot 5 \ 61 | --write_out \ 62 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_mmlu.log 63 | 64 | lm_eval --model $MODEL \ 65 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 66 | --tasks winogrande \ 67 | --device cuda \ 68 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_winogrande \ 69 | --batch_size 1 \ 70 | --num_fewshot 5 \ 71 | --write_out \ 72 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_winogrande.log 73 | 74 | lm_eval --model $MODEL \ 75 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 76 | --tasks gsm8k \ 77 | --device cuda \ 78 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_gsm8k \ 79 | --batch_size 1 \ 80 | --num_fewshot 5 \ 81 | --write_out \ 82 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_gsm8k.log -------------------------------------------------------------------------------- /scripts/Meta-Llama-3-8B/vanilla_lm_eval.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | export CUDA_VISIBLE_DEVICES=0 3 | export NUMEXPR_MAX_THREADS=128 4 | 5 | MODEL=vllm 6 | MODEL_NAME=vanilla_Meta-Llama-3-8B 7 | MODEL_NAME_OR_PATH=/home/lijiaming/workspace/Seed/Seed-Ruler/outputs/checkpoints/vanilla_Meta-Llama-3-8B_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841 8 | OUTPUT_PATH=../outputs/other_tasks/${MODEL_NAME} 9 | TOKENIZER_MODE=auto 10 | NUM_GPUS=1 11 | GPU_MEMORY_UTILIZATION=0.8 12 | 13 | mkdir -p $OUTPUT_PATH 14 | 15 | # lm_eval --model $MODEL \ 16 | # --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 17 | # --tasks leaderboard \ 18 | # --device cuda \ 19 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_leaderboard \ 20 | # --batch_size 1 \ 21 | # --write_out \ 22 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_leaderboard.log 23 | 24 | lm_eval --model $MODEL \ 25 | --model_args 
pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 26 | --tasks ai2_arc \ 27 | --device cuda \ 28 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc \ 29 | --batch_size 1 \ 30 | --num_fewshot 25 \ 31 | --write_out \ 32 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc.log 33 | 34 | lm_eval --model $MODEL \ 35 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 36 | --tasks hellaswag \ 37 | --device cuda \ 38 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_hellaswag \ 39 | --batch_size 1 \ 40 | --num_fewshot 10 \ 41 | --write_out \ 42 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_hellaswag.log 43 | 44 | lm_eval --model $MODEL \ 45 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 46 | --tasks truthfulqa \ 47 | --device cuda \ 48 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa \ 49 | --batch_size 1 \ 50 | --num_fewshot 0 \ 51 | --write_out \ 52 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa.log 53 | 54 | lm_eval --model $MODEL \ 55 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 56 | --tasks mmlu \ 57 | --device cuda \ 58 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_mmlu \ 59 | --batch_size 1 \ 60 | --num_fewshot 5 \ 61 | --write_out \ 62 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_mmlu.log 63 | 64 | lm_eval --model $MODEL \ 65 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 66 | --tasks winogrande \ 67 | --device cuda \ 68 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_winogrande \ 69 | --batch_size 1 \ 70 | --num_fewshot 5 \ 71 | --write_out \ 72 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_winogrande.log 73 | 74 | lm_eval --model $MODEL \ 75 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 76 | --tasks gsm8k \ 77 | --device cuda \ 78 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_gsm8k \ 79 | --batch_size 1 \ 80 | --num_fewshot 5 \ 81 | --write_out \ 82 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_gsm8k.log -------------------------------------------------------------------------------- /scripts/deepseek-llm-7b-base/ruler_lm_eval.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | export CUDA_VISIBLE_DEVICES=1 3 | export NUMEXPR_MAX_THREADS=128 4 | 5 | MODEL=vllm 6 | MODEL_NAME=ruler_deepseek-llm-7b-base 7 | MODEL_NAME_OR_PATH=/home/lijiaming/workspace/Seed/Seed-Ruler/outputs/checkpoints/ruler_deepseek-llm-7b-base_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841 8 | OUTPUT_PATH=../outputs/overall_performance/${MODEL_NAME} 9 | TOKENIZER_MODE=auto 10 | NUM_GPUS=1 11 | GPU_MEMORY_UTILIZATION=0.8 12 | 13 | mkdir -p $OUTPUT_PATH 14 | 15 | # lm_eval --model $MODEL \ 16 | # --model_args 
pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 17 | # --tasks leaderboard \ 18 | # --device cuda \ 19 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_leaderboard \ 20 | # --batch_size 1 \ 21 | # --write_out \ 22 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_leaderboard.log 23 | 24 | lm_eval --model $MODEL \ 25 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 26 | --tasks ai2_arc \ 27 | --device cuda \ 28 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc \ 29 | --batch_size 1 \ 30 | --num_fewshot 25 \ 31 | --write_out \ 32 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc.log 33 | 34 | lm_eval --model $MODEL \ 35 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 36 | --tasks hellaswag \ 37 | --device cuda \ 38 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_hellaswag \ 39 | --batch_size 1 \ 40 | --num_fewshot 10 \ 41 | --write_out \ 42 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_hellaswag.log 43 | 44 | lm_eval --model $MODEL \ 45 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 46 | --tasks truthfulqa \ 47 | --device cuda \ 48 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa \ 49 | --batch_size 1 \ 50 | --num_fewshot 0 \ 51 | --write_out \ 52 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa.log 53 | 54 | lm_eval --model $MODEL \ 55 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 56 | --tasks mmlu \ 57 | --device cuda \ 58 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_mmlu \ 59 | --batch_size 1 \ 60 | --num_fewshot 5 \ 61 | --write_out \ 62 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_mmlu.log 63 | 64 | lm_eval --model $MODEL \ 65 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 66 | --tasks winogrande \ 67 | --device cuda \ 68 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_winogrande \ 69 | --batch_size 1 \ 70 | --num_fewshot 5 \ 71 | --write_out \ 72 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_winogrande.log 73 | 74 | lm_eval --model $MODEL \ 75 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 76 | --tasks gsm8k \ 77 | --device cuda \ 78 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_gsm8k \ 79 | --batch_size 1 \ 80 | --num_fewshot 5 \ 81 | --write_out \ 82 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_gsm8k.log -------------------------------------------------------------------------------- /scripts/deepseek-llm-7b-base/vanilla_lm_eval.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | export CUDA_VISIBLE_DEVICES=1 3 | export NUMEXPR_MAX_THREADS=128 4 | 5 | MODEL=vllm 6 | MODEL_NAME=vanilla_deepseek-llm-7b-base 7 | 
MODEL_NAME_OR_PATH=/home/lijiaming/workspace/Seed/Seed-Ruler/outputs/checkpoints/vanilla_deepseek-llm-7b-base_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841 8 | OUTPUT_PATH=../outputs/other_tasks/${MODEL_NAME} 9 | TOKENIZER_MODE=auto 10 | NUM_GPUS=1 11 | GPU_MEMORY_UTILIZATION=0.8 12 | 13 | mkdir -p $OUTPUT_PATH 14 | 15 | # lm_eval --model $MODEL \ 16 | # --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 17 | # --tasks leaderboard \ 18 | # --device cuda \ 19 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_leaderboard \ 20 | # --batch_size 1 \ 21 | # --write_out \ 22 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_leaderboard.log 23 | 24 | lm_eval --model $MODEL \ 25 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 26 | --tasks ai2_arc \ 27 | --device cuda \ 28 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc \ 29 | --batch_size 1 \ 30 | --num_fewshot 25 \ 31 | --write_out \ 32 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc.log 33 | 34 | lm_eval --model $MODEL \ 35 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 36 | --tasks hellaswag \ 37 | --device cuda \ 38 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_hellaswag \ 39 | --batch_size 1 \ 40 | --num_fewshot 10 \ 41 | --write_out \ 42 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_hellaswag.log 43 | 44 | lm_eval --model $MODEL \ 45 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 46 | --tasks truthfulqa \ 47 | --device cuda \ 48 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa \ 49 | --batch_size 1 \ 50 | --num_fewshot 0 \ 51 | --write_out \ 52 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa.log 53 | 54 | lm_eval --model $MODEL \ 55 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 56 | --tasks mmlu \ 57 | --device cuda \ 58 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_mmlu \ 59 | --batch_size 1 \ 60 | --num_fewshot 5 \ 61 | --write_out \ 62 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_mmlu.log 63 | 64 | lm_eval --model $MODEL \ 65 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 66 | --tasks winogrande \ 67 | --device cuda \ 68 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_winogrande \ 69 | --batch_size 1 \ 70 | --num_fewshot 5 \ 71 | --write_out \ 72 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_winogrande.log 73 | 74 | lm_eval --model $MODEL \ 75 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 76 | --tasks gsm8k \ 77 | --device cuda \ 78 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_gsm8k \ 79 | --batch_size 1 \ 80 | --num_fewshot 5 \ 81 | --write_out \ 82 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_gsm8k.log 
-------------------------------------------------------------------------------- /scripts/Mistral-7B-v0.3/vanilla_lm_eval.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | export CUDA_VISIBLE_DEVICES=0 3 | export NUMEXPR_MAX_THREADS=128 4 | 5 | MODEL=vllm 6 | MODEL_NAME=vanilla_Mistral-7B-v0.3 7 | MODEL_NAME_OR_PATH=/home/lijiaming/workspace/Seed/Seed-Ruler/outputs/checkpoints/vanilla_Mistral-7B-v0.3_bs_4_ga_8_lr_2e-5_eps_3/checkpoint-2841 8 | OUTPUT_PATH=../outputs/other_tasks/${MODEL_NAME} 9 | TOKENIZER_MODE=auto 10 | NUM_GPUS=1 11 | GPU_MEMORY_UTILIZATION=0.8 12 | 13 | mkdir -p $OUTPUT_PATH 14 | 15 | lm_eval --model $MODEL \ 16 | --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 17 | --tasks leaderboard \ 18 | --device cuda \ 19 | --output_path ${OUTPUT_PATH}/${MODEL}_eval_leaderboard \ 20 | --batch_size 1 \ 21 | --write_out \ 22 | 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_leaderboard.log 23 | 24 | # lm_eval --model $MODEL \ 25 | # --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 26 | # --tasks ai2_arc \ 27 | # --device cuda \ 28 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc \ 29 | # --batch_size 1 \ 30 | # --num_fewshot 25 \ 31 | # --write_out \ 32 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_ai2_arc.log 33 | 34 | # lm_eval --model $MODEL \ 35 | # --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 36 | # --tasks hellaswag \ 37 | # --device cuda \ 38 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_hellaswag \ 39 | # --batch_size 1 \ 40 | # --num_fewshot 10 \ 41 | # --write_out \ 42 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_hellaswag.log 43 | 44 | # lm_eval --model $MODEL \ 45 | # --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 46 | # --tasks truthfulqa \ 47 | # --device cuda \ 48 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa \ 49 | # --batch_size 1 \ 50 | # --num_fewshot 0 \ 51 | # --write_out \ 52 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_truthfulqa.log 53 | 54 | # lm_eval --model $MODEL \ 55 | # --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 56 | # --tasks mmlu \ 57 | # --device cuda \ 58 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_mmlu \ 59 | # --batch_size 1 \ 60 | # --num_fewshot 5 \ 61 | # --write_out \ 62 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_mmlu.log 63 | 64 | # lm_eval --model $MODEL \ 65 | # --model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 66 | # --tasks winogrande \ 67 | # --device cuda \ 68 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_winogrande \ 69 | # --batch_size 1 \ 70 | # --num_fewshot 5 \ 71 | # --write_out \ 72 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_winogrande.log 73 | 74 | # lm_eval --model $MODEL \ 75 | # 
--model_args pretrained=${MODEL_NAME_OR_PATH},trust_remote_code=True,tokenizer_mode=${TOKENIZER_MODE},tensor_parallel_size=${NUM_GPUS},dtype=auto,gpu_memory_utilization=${GPU_MEMORY_UTILIZATION} \ 76 | # --tasks gsm8k \ 77 | # --device cuda \ 78 | # --output_path ${OUTPUT_PATH}/${MODEL}_eval_gsm8k \ 79 | # --batch_size 1 \ 80 | # --num_fewshot 5 \ 81 | # --write_out \ 82 | # 2>&1 | tee ${OUTPUT_PATH}/${MODEL}_eval_gsm8k.log -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | outputs/ 163 | datasets/ 164 | logs/ 165 | test/ 166 | !datasets/download.sh -------------------------------------------------------------------------------- /src/finetuning/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import PreTrainedTokenizer 3 | from datasets import load_dataset 4 | from utils.config import MetaLengthToken 5 | from typing import Dict, Sequence 6 | 7 | IGNORE_INDEX = -100 8 | 9 | 10 | class DataCollatorForSupervisedDataset: 11 | """Collate examples for supervised fine-tuning.""" 12 | 13 | def __init__(self, tokenizer: PreTrainedTokenizer): 14 | self.tokenizer = tokenizer 15 | 16 | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: 17 | input_ids, labels = tuple( 18 | [instance[key] for instance in instances] for key in ("input_ids", "labels") 19 | ) 20 | input_ids = [torch.tensor(x) for x in input_ids] 21 | input_ids = torch.nn.utils.rnn.pad_sequence( 22 | input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id 23 | ) 24 | labels = [torch.tensor(x) for x in labels] 25 | labels = torch.nn.utils.rnn.pad_sequence( 26 | labels, batch_first=True, padding_value=IGNORE_INDEX 27 | ) 28 | 29 | return dict( 30 | input_ids=input_ids, 31 | labels=labels, 32 | attention_mask=input_ids.ne(self.tokenizer.pad_token_id), 33 | ) 34 | 35 | 36 | def preprocess_template(instruction, mlt, output, tokenizer, template,vanilla): 37 | if vanilla: 38 | mlt = '' 39 | prompts = template.apply_template(instruction, mlt, output) 40 | input_ids = tokenizer.encode( 41 | prompts, truncation=True, max_length=tokenizer.model_max_length 42 | ) # truncation 43 | split_token_idx = None 44 | for i in MetaLengthToken: 45 | i_id = tokenizer.convert_tokens_to_ids(i[0]) 46 | if i_id in input_ids: 47 | split_token_idx = input_ids.index(i_id) 48 | if split_token_idx is None: 49 | labels = [IGNORE_INDEX for _ in range(len(input_ids))] 50 | else: 51 | labels = [ 52 | input_ids[i] if i >= split_token_idx else IGNORE_INDEX 53 | for i in range(len(input_ids)) 54 | ] 55 | # vanilla 56 | if vanilla: 57 | instruction_prompts = template.apply_template_for_instruction(instruction) 58 | instruction_ids = tokenizer.encode( 59 | instruction_prompts, truncation=True, max_length=tokenizer.model_max_length 60 | ) 61 | labels = [ 62 | input_ids[i] if i >= 
len(instruction_ids) else IGNORE_INDEX 63 | for i in range(len(input_ids)) 64 | ] 65 | 66 | return input_ids, labels 67 | 68 | 69 | def preprocess(examples, tokenizer, template, vanilla): 70 | processed_input_ids, processed_labels = [], [] 71 | 72 | instructions, mlts, outputs = ( 73 | examples["Instruction"], 74 | examples["mlt"], 75 | examples["output"], 76 | ) 77 | for instruction, mlt, output in zip(instructions, mlts, outputs): 78 | input_ids, labels = preprocess_template( 79 | instruction, mlt, output, tokenizer, template, vanilla 80 | ) 81 | 82 | processed_input_ids.append(input_ids) 83 | processed_labels.append(labels) 84 | 85 | return {"input_ids": processed_input_ids, "labels": processed_labels} 86 | 87 | 88 | def load_custom_dataset(tokenizer: PreTrainedTokenizer, data_path: str, template, vanilla): 89 | train_datasets = load_dataset("json", data_files=data_path, split="train") 90 | 91 | train_dataset = train_datasets.map( 92 | preprocess, 93 | batched=True, 94 | batch_size=3000, 95 | num_proc=32, 96 | remove_columns=train_datasets.column_names, 97 | keep_in_memory=True, 98 | load_from_cache_file=False, 99 | desc="Running Encoding", 100 | fn_kwargs={"tokenizer": tokenizer, "template": template, "vanilla":vanilla}, 101 | ) 102 | 103 | torch.distributed.barrier() 104 | 105 | return train_dataset 106 | -------------------------------------------------------------------------------- /src/exp/cal_elm_rmse.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from utils.config import LEVEL0, LEVEL1, LEVEL2 3 | from utils import load_jsonl 4 | from utils.count import count_words 5 | from rich.table import Table 6 | from rich.console import Console 7 | from sklearn.metrics import root_mean_squared_error 8 | 9 | 10 | def calculate_rmse(actual, predicted): 11 | """ 12 | Calculate the Root Mean Square Error between two arrays using scikit-learn. 13 | 14 | Parameters: 15 | actual (array-like): The array of actual values. 16 | predicted (array-like): The array of predicted values. 17 | 18 | Returns: 19 | float: The calculated RMSE value. 20 | """ 21 | # Calculate the RMSE 22 | rmse = root_mean_squared_error(actual, predicted) 23 | 24 | return rmse 25 | 26 | 27 | def elm(list1, list2): 28 | """ 29 | Count the number of elements that are the same in both lists at the same positions. 30 | 31 | Parameters: 32 | list1 (list): The first list. 33 | list2 (list): The second list. 34 | 35 | Returns: 36 | int: The count of elements that are the same at the same positions. 
37 | """ 38 | # Use zip to pair the elements and then check for equality 39 | same_position_count = sum(1 for a, b in zip(list1, list2) if a == b) 40 | 41 | return same_position_count 42 | 43 | 44 | def main(args): 45 | # raw data load 46 | df = load_jsonl(args.dataset_path) 47 | # calculate metric 48 | predicted_lengths = [] 49 | target_lengths = [] 50 | predicted_lengths_0, predicted_lengths_1, predicted_lengths_2 = [], [], [] 51 | target_lengths_0, target_lengths_1, target_lengths_2 = [], [], [] 52 | for d in df: 53 | length = count_words(d["output"]) 54 | if d["TargetLength"] != ">800": 55 | predicted_lengths.append(length) 56 | target_lengths.append(int(d["TargetLength"])) 57 | if d["TargetLength"] in LEVEL0: 58 | predicted_lengths_0.append(length) 59 | target_lengths_0.append(int(d["TargetLength"])) 60 | elif d["TargetLength"] in LEVEL1: 61 | predicted_lengths_1.append(length) 62 | target_lengths_1.append(int(d["TargetLength"])) 63 | elif d["TargetLength"] in LEVEL2 and d["TargetLength"] != ">800": 64 | predicted_lengths_2.append(length) 65 | target_lengths_2.append(int(d["TargetLength"])) 66 | else: 67 | if d["TargetLength"] != ">800": 68 | raise KeyError 69 | table = Table(show_header=True, header_style="bold magenta") 70 | table.add_column("Model", style="dim", width=12) 71 | table.add_column("Level 0_elm", justify="right") 72 | table.add_column("Level 0_rmse", justify="right") 73 | table.add_column("Level 1_elm", justify="right") 74 | table.add_column("Level 1_rmse", justify="right") 75 | table.add_column("Level 2_elm", justify="right") 76 | table.add_column("Level 2_rmse", justify="right") 77 | table.add_column("All Level_elm", justify="right") 78 | table.add_column("All Level 0_rmse", justify="right") 79 | table.add_row( 80 | args.dataset_path.split("/")[-1][4:], 81 | f"{elm(target_lengths_0,predicted_lengths_0)/len(predicted_lengths_0)*100:.2f}", 82 | f"{calculate_rmse(predicted_lengths_0,target_lengths_0):.2f}", 83 | f"{elm(target_lengths_1,predicted_lengths_1)/len(predicted_lengths_1)*100:.2f}", 84 | f"{calculate_rmse(predicted_lengths_1,target_lengths_1):.2f}", 85 | f"{elm(target_lengths_2,predicted_lengths_2)/len(predicted_lengths_2)*100:.2f}", 86 | f"{calculate_rmse(predicted_lengths_2,target_lengths_2):.2f}", 87 | f"{elm(target_lengths,predicted_lengths)/len(predicted_lengths)*100:.2f}", 88 | f"{calculate_rmse(predicted_lengths,target_lengths):.2f}", 89 | ) 90 | console = Console() 91 | console.print(table) 92 | print(f"{elm(target_lengths_0,predicted_lengths_0)/len(predicted_lengths_0)*100:.2f}/{calculate_rmse(predicted_lengths_0,target_lengths_0):.2f}|{elm(target_lengths_1,predicted_lengths_1)/len(predicted_lengths_1)*100:.2f}/{calculate_rmse(predicted_lengths_1,target_lengths_1):.2f}|{elm(target_lengths_2,predicted_lengths_2)/len(predicted_lengths_2)*100:.2f}/{calculate_rmse(predicted_lengths_2,target_lengths_2):.2f}|{elm(target_lengths,predicted_lengths)/len(predicted_lengths)*100:.2f}/{calculate_rmse(predicted_lengths,target_lengths):.2f}|") 93 | 94 | 95 | if __name__ == "__main__": 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument("--dataset_path", type=str, default=None) 98 | args = parser.parse_args() 99 | main(args) 100 | -------------------------------------------------------------------------------- /src/exp/cal_level_scores.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from utils.config import RANGE, LEVEL0, LEVEL1, LEVEL2 3 | from utils import load_jsonl 4 | from rich.table import 
Table 5 | from rich.console import Console 6 | from utils.count import count_words 7 | 8 | 9 | def main(args): 10 | result = RANGE 11 | for key in result: 12 | ( 13 | result[key]["PM_in"], 14 | result[key]["PM_out"], 15 | result[key]["FM_in"], 16 | result[key]["FM_out"], 17 | ) = 0, 0, 0, 0 18 | # raw data load 19 | df = load_jsonl(args.dataset_path) 20 | # calculate metric 21 | for d in df: 22 | length = count_words(d["output"]) 23 | # PM 24 | if ( 25 | length > result[d["TargetLength"]]["PM"][0] 26 | and length <= result[d["TargetLength"]]["PM"][1] 27 | ): 28 | result[d["TargetLength"]]["PM_in"] += 1 29 | else: 30 | result[d["TargetLength"]]["PM_out"] += 1 31 | # FM 32 | if ( 33 | length > result[d["TargetLength"]]["FM"][0] 34 | and length <= result[d["TargetLength"]]["FM"][1] 35 | ): 36 | result[d["TargetLength"]]["FM_in"] += 1 37 | else: 38 | result[d["TargetLength"]]["FM_out"] += 1 39 | # level 0 40 | levle0_pm_in, levle0_pm_out, levle0_fm_in, levle0_fm_out = 0, 0, 0, 0 41 | # level 1 42 | levle1_pm_in, levle1_pm_out, levle1_fm_in, levle1_fm_out = 0, 0, 0, 0 43 | # level 2 44 | levle2_pm_in, levle2_pm_out, levle2_fm_in, levle2_fm_out = 0, 0, 0, 0 45 | for key in result: 46 | if key in LEVEL0: 47 | levle0_pm_in += result[key]["PM_in"] 48 | levle0_pm_out += result[key]["PM_out"] 49 | levle0_fm_in += result[key]["FM_in"] 50 | levle0_fm_out += result[key]["FM_out"] 51 | elif key in LEVEL1: 52 | levle1_pm_in += result[key]["PM_in"] 53 | levle1_pm_out += result[key]["PM_out"] 54 | levle1_fm_in += result[key]["FM_in"] 55 | levle1_fm_out += result[key]["FM_out"] 56 | elif key in LEVEL2: 57 | levle2_pm_in += result[key]["PM_in"] 58 | levle2_pm_out += result[key]["PM_out"] 59 | levle2_fm_in += result[key]["FM_in"] 60 | levle2_fm_out += result[key]["FM_out"] 61 | # draw table 62 | table = Table(show_header=True, header_style="bold magenta") 63 | table.add_column("Level", style="dim", width=12) 64 | table.add_column("PM_in", justify="right") 65 | table.add_column("PM_out", justify="right") 66 | table.add_column("PM", justify="right") 67 | table.add_column("FM_in", justify="right") 68 | table.add_column("FM_out", justify="right") 69 | table.add_column("FM", justify="right") 70 | table.add_row( 71 | "Level:0", 72 | f"{levle0_pm_in}", 73 | f"{levle0_pm_out}", 74 | f"{levle0_pm_in/(levle0_pm_in + levle0_pm_out)*100:.2f}", 75 | f"{levle0_fm_in}", 76 | f"{levle0_fm_out}", 77 | f"{levle0_fm_in/(levle0_fm_in + levle0_fm_out)*100:.2f}", 78 | ) 79 | table.add_row( 80 | "Level:1", 81 | f"{levle1_pm_in}", 82 | f"{levle1_pm_out}", 83 | f"{levle1_pm_in/(levle1_pm_in + levle1_pm_out)*100:.2f}", 84 | f"{levle1_fm_in}", 85 | f"{levle1_fm_out}", 86 | f"{levle1_fm_in/(levle1_fm_in + levle1_fm_out)*100:.2f}", 87 | ) 88 | table.add_row( 89 | "Level:2", 90 | f"{levle2_pm_in}", 91 | f"{levle2_pm_out}", 92 | f"{levle2_pm_in/(levle2_pm_in + levle2_pm_out)*100:.2f}", 93 | f"{levle2_fm_in}", 94 | f"{levle2_fm_out}", 95 | f"{levle2_fm_in/(levle2_fm_in + levle2_fm_out)*100:.2f}", 96 | ) 97 | table.add_row( 98 | "All Level", 99 | f"{levle0_pm_in +levle1_pm_in + levle2_pm_in}", 100 | f"{levle0_pm_out+ levle1_pm_out + levle2_pm_out}", 101 | f"{(levle0_pm_in +levle1_pm_in + levle2_pm_in)/(levle0_pm_in +levle1_pm_in + levle2_pm_in + levle0_pm_out+ levle1_pm_out + levle2_pm_out)*100:.2f}", 102 | f"{levle0_fm_in +levle1_fm_in + levle2_fm_in}", 103 | f"{levle0_fm_out+ levle1_fm_out + levle2_fm_out}", 104 | f"{(levle0_fm_in +levle1_fm_in + levle2_fm_in)/(levle0_fm_in +levle1_fm_in + levle2_fm_in + levle0_fm_out+ levle1_fm_out + 
levle2_fm_out)*100:.2f}", 105 | ) 106 | console = Console() 107 | console.print(table) 108 | 109 | 110 | if __name__ == "__main__": 111 | parser = argparse.ArgumentParser() 112 | parser.add_argument("--dataset_path", type=str, default=None) 113 | args = parser.parse_args() 114 | main(args) 115 | -------------------------------------------------------------------------------- /src/finetuning/finetune.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import transformers 4 | 5 | from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, Trainer 6 | from dataclasses import dataclass, field 7 | 8 | from dataset import load_custom_dataset, DataCollatorForSupervisedDataset 9 | from utils.config import MetaLengthToken 10 | from utils.templates import TemplatesMapping 11 | 12 | 13 | @dataclass 14 | class ModelArguments: 15 | model_name_or_path: str = field( 16 | default="", 17 | metadata={"help": "The model checkpoint for weights initialization."}, 18 | ) 19 | template: str = field(default="", metadata={"help": "The template used to train"}) 20 | 21 | 22 | @dataclass 23 | class DataArguments: 24 | data_path: str = field( 25 | default=None, metadata={"help": "Path to the training data."} 26 | ) 27 | 28 | 29 | @dataclass 30 | class TrainingArguments(transformers.TrainingArguments): 31 | vanilla: bool = field( 32 | default=False, 33 | metadata={"help": "Vanilla finetuning or Ruler finetuning, defaulty is False."}, 34 | ) 35 | model_max_length: int = field( 36 | default=2048, 37 | metadata={ 38 | "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 39 | }, 40 | ) 41 | gradient_checkpointing_kwargs: dict = field( 42 | default_factory=lambda: {"use_reentrant": False}, 43 | metadata={"help": "gradient checkpointing kwargs"}, 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) 49 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 50 | print(training_args.vanilla) 51 | if training_args.local_rank == 0: 52 | print("=" * 100) 53 | print(training_args) 54 | 55 | if training_args.local_rank == 0: 56 | print("> Loading tokenizer from {}".format(model_args.model_name_or_path)) 57 | 58 | tokenizer = AutoTokenizer.from_pretrained( 59 | model_args.model_name_or_path, 60 | model_max_length=training_args.model_max_length, 61 | padding_side="right", 62 | truncation_side="right", 63 | use_fast=True, 64 | trust_remote_code=True, 65 | ) 66 | template = TemplatesMapping[model_args.template] 67 | # add special tokens 68 | if training_args.vanilla: 69 | special_tokens = {"additional_special_tokens": [t for t in template.SPECIAL_TOKENS]} 70 | elif model_args.template == 'custom': 71 | special_tokens = {"additional_special_tokens": [t for t in template.SPECIAL_TOKENS + [m[0]for m in MetaLengthToken]]} 72 | else: 73 | special_tokens = {"additional_special_tokens": [t[0] for t in MetaLengthToken]} 74 | print(f"> New special tokens: {special_tokens}") 75 | tokenizer.add_special_tokens(special_tokens) 76 | for st in special_tokens["additional_special_tokens"]: 77 | print(f"{st}:{tokenizer.convert_tokens_to_ids(st)}") 78 | 79 | tokenizer.pad_token = ( 80 | tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token 81 | ) 82 | if training_args.local_rank == 0: 83 | print("> PAD Token:", tokenizer.pad_token, tokenizer.pad_token_id) 84 | print("> BOS Token", tokenizer.bos_token, 
tokenizer.bos_token_id) 85 | print("> EOS Token", tokenizer.eos_token, tokenizer.eos_token_id) 86 | 87 | if training_args.local_rank == 0: 88 | print("> Loading model from {}".format(model_args.model_name_or_path)) 89 | 90 | if "glm-4" in model_args.model_name_or_path: # glm-4 not support flash attention 2s 91 | model = AutoModelForCausalLM.from_pretrained( 92 | model_args.model_name_or_path, 93 | torch_dtype=torch.bfloat16, 94 | trust_remote_code=True, 95 | ) 96 | else: 97 | model = AutoModelForCausalLM.from_pretrained( 98 | model_args.model_name_or_path, 99 | attn_implementation="flash_attention_2", 100 | torch_dtype=torch.bfloat16, 101 | trust_remote_code=True, 102 | ) 103 | model.resize_token_embeddings(len(tokenizer)) 104 | train_dataset = load_custom_dataset( 105 | tokenizer=tokenizer, 106 | data_path=data_args.data_path, 107 | template=template, 108 | vanilla=training_args.vanilla, 109 | ) 110 | 111 | if training_args.local_rank == 0: 112 | print("> Training dataset samples:", len(train_dataset)) 113 | for index in random.sample(range(len(train_dataset)), 3): 114 | print("=" * 100) 115 | print( 116 | f"Sample {index} of the training set:\n{tokenizer.decode(list(train_dataset[index]['input_ids']))}" 117 | ) 118 | print(f"{train_dataset[index]['input_ids']}") 119 | print(f"{train_dataset[index]['labels']}") 120 | 121 | data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) 122 | 123 | trainer = Trainer( 124 | model=model, 125 | tokenizer=tokenizer, 126 | args=training_args, 127 | train_dataset=train_dataset, 128 | data_collator=data_collator, 129 | ) 130 | 131 | trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) 132 | -------------------------------------------------------------------------------- /src/data_process/build_training_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import nlp 3 | import random 4 | import argparse 5 | import pandas as pd 6 | from utils import load_jsonl, save_jsonl 7 | from utils.config import MetaLengthToken, SAMPLE 8 | from utils.count import count_words 9 | 10 | 11 | 12 | 13 | def list_files(directory): 14 | return [ 15 | f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f)) 16 | ] 17 | 18 | 19 | def add_MLT(instruction: str): 20 | result = None 21 | word_count = count_words(instruction) 22 | for mlt in MetaLengthToken: 23 | if word_count > mlt[1][0] and word_count <= mlt[1][1]: 24 | result = mlt[0] 25 | return result 26 | 27 | 28 | def process_OpenHermes(dataset_path, random_seed, num): 29 | # set random seed 30 | random.seed(random_seed) 31 | df = load_jsonl(dataset_path) 32 | random.shuffle(df) 33 | print(f"{'='*10}First data in TLG Dataset{'='*10}") 34 | print(df[0]["conversations"][0]["value"]) 35 | print(f"{'='*10}Last data in TLG Dataset{'='*10}") 36 | print(df[num - 1]["conversations"][0]["value"]) 37 | print("=" * 20) 38 | df = df[num:] # cut off the FLCG exp dataset 39 | # sampled data 40 | sampled_data = {key[0]: [] for key in MetaLengthToken} 41 | for idx in range(len(df)): 42 | d = {} 43 | d["Instruction"] = df[idx]["conversations"][0]["value"] 44 | d["word_count"] = len(df[idx]["conversations"][1]["value"].split()) 45 | d["output"] = df[idx]["conversations"][1]["value"] 46 | d["mlt"] = add_MLT(df[idx]["conversations"][1]["value"]) 47 | if d["mlt"] is not None: 48 | sampled_data[d["mlt"]].append(d) 49 | return sampled_data 50 | 51 | 52 | def process_longform(dir_path): 53 | # sampled data 54 | sampled_data = {key[0]: 
[] for key in MetaLengthToken} 55 | longform_files = list_files(dir_path) 56 | for file in longform_files: 57 | df = pd.read_parquet(f"{dir_path}/{file}") 58 | for idx in range(df.shape[0]): 59 | d = {} 60 | d["Instruction"] = df.iloc[idx]["input"] 61 | d["word_count"] = len(df.iloc[idx]["output"].split()) 62 | d["output"] = df.iloc[idx]["output"] 63 | d["mlt"] = add_MLT(df.iloc[idx]["output"]) 64 | if d["mlt"] is not None: 65 | sampled_data[d["mlt"]].append(d) 66 | return sampled_data 67 | 68 | 69 | def process_eli5(): 70 | # sampled data 71 | sampled_data = {key[0]: [] for key in MetaLengthToken} 72 | eli5 = nlp.load_dataset("eli5") 73 | files = ["train_eli5", "test_eli5", "validation_eli5"] 74 | for file in files: 75 | for data in eli5[file]: 76 | d = {} 77 | d["Instruction"] = data["title"] 78 | answer = "" 79 | for i in data["answers"]["text"]: 80 | if len(i.split()) > len(answer.split()): 81 | answer = i 82 | d["word_count"] = len(answer.split()) 83 | d["output"] = answer 84 | d["mlt"] = add_MLT(answer) 85 | if d["mlt"] is not None: 86 | sampled_data[d["mlt"]].append(d) 87 | return sampled_data 88 | 89 | 90 | def main(args): 91 | sampled_data = {key[0]: [] for key in MetaLengthToken} 92 | # OpenHermes2.5 93 | openhermes_data = process_OpenHermes(args.dataset_path, args.random_seed, args.num) 94 | print(f"{'='*10}OpenHermes2.5 dataset{'='*10}") 95 | for key in openhermes_data: 96 | random.shuffle(openhermes_data[key]) 97 | data_num = min(len(openhermes_data[key]), SAMPLE[key] - len(sampled_data[key])) 98 | sampled_data[key] += openhermes_data[key][:data_num] 99 | print(f"{key}-{len(openhermes_data[key])}-take {data_num}.") 100 | # Long Form 101 | longform_data = process_longform(args.longform_dir) 102 | print(f"{'='*10}LongForm dataset{'='*10}") 103 | for key in longform_data: 104 | random.shuffle(longform_data[key]) 105 | data_num = min(len(longform_data[key]), SAMPLE[key] - len(sampled_data[key])) 106 | sampled_data[key] += longform_data[key][:data_num] 107 | print(f"{key}-{len(longform_data[key])}-take {data_num}") 108 | # ELI5 109 | eli5_data = process_eli5() 110 | print(f"{'='*10}ELI5 dataset{'='*10}") 111 | for key in eli5_data: 112 | random.shuffle(eli5_data[key]) 113 | data_num = min(len(eli5_data[key]), SAMPLE[key] - len(sampled_data[key])) 114 | sampled_data[key] += eli5_data[key][:data_num] 115 | print(f"{key}-{len(eli5_data[key])}-take {data_num}") 116 | print(f"{'='*10}FINAL{'='*10}") 117 | data = [] 118 | for key in sampled_data: 119 | data += sampled_data[key] 120 | print(f"{key}-{len(sampled_data[key])}") 121 | random.shuffle(data) 122 | global_id = 0 123 | for d in data: 124 | d["id"] = global_id 125 | global_id += 1 126 | print(f"Total:{global_id}") 127 | # save to output_path 128 | save_jsonl(args.output_path, data) 129 | 130 | 131 | if __name__ == "__main__": 132 | parser = argparse.ArgumentParser() 133 | parser.add_argument("--dataset_path", type=str, default=None) 134 | parser.add_argument("--longform_dir", type=str, default=None) 135 | parser.add_argument("--num", type=int, default=None) 136 | parser.add_argument("--random_seed", type=int, default=10) 137 | parser.add_argument("--output_path", type=str, default=None) 138 | args = parser.parse_args() 139 | main(args) 140 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ruler: A Model-Agnostic Method to Control Generated Length for Large Language Models 2 | 3 | 4 | 5 | 
![Method](images/method.png) 6 | 7 | ## 🤩 Release 8 | - [2024/09/20] 🥳 [Ruler](https://arxiv.org/abs/2409.18943) is accepted by EMNLP 2024 Findings. 9 | 10 | ## 😎 Overview 11 | 12 | **Ruler** is a novel, model-agnostic approach that employs Meta Length Tokens (*MLTs*) to enhance the instruction-following ability of LLMs under length-constrained instructions. 13 | 14 | **Ruler** equips LLMs with the ability to generate responses of a target length. Moreover, it can automatically generate an appropriate *MLT* when no target length is provided. Comprehensive experiments show the effectiveness of **Ruler** across different LLMs. 15 | 16 | ## 🧐 Quickstart 17 | 18 | We also provide a more [detailed experiments document](./experiments.md) (specific to each experiment and including all the results!). 19 | 20 | ### Prepare Environment 21 | 22 | First, set up a Python environment. This codebase has been tested under Python 3.x, and we officially support Python 3.10. 23 | ```bash 24 | conda create -n ruler python=3.10 25 | cd Ruler # the directory containing 'requirements.txt' 26 | pip install -r requirements.txt 27 | 28 | export PYTHONPATH=xxxx/Ruler/src 29 | cd src 30 | 31 | # create folders and download datasets 32 | bash ../scripts/download.sh 33 | ``` 34 | ### Target Length Generation Task 35 | 36 | **Closed-source Model** 37 | 38 | ```shell 39 | python exp/run_exp_api.py\ 40 | --dataset_path ../datasets/tlg_dataset.jsonl\ 41 | --model <model_name>\ 42 | --output_path ../outputs/tlg/tlg_<model_name>.jsonl\ 43 | --key <api_key> 44 | ``` 45 | 46 | **Open-source Model** 47 | 48 | ```shell 49 | python exp/run_exp.py\ 50 | --dataset_path ../datasets/tlg_dataset.jsonl\ 51 | --model_name_or_path <model_name_or_path>\ 52 | --output_path ../outputs/tlg/tlg_<model_name>.jsonl 53 | ``` 54 | 55 | **Calculate scores** 56 | 57 | Different `Levels`: 58 | 59 | ```shell 60 | python exp/cal_level_scores.py\ 61 | --dataset_path <output_path> 62 | ``` 63 | 64 | Different `MLT`: 65 | 66 | ```shell 67 | python exp/cal_mlt_scores.py\ 68 | --dataset_path <output_path> 69 | ``` 70 | 71 | ![TLG](images/TLG.png) 72 | 73 | ### Ruler 74 | 75 | Finetuning scripts: 76 | ```shell 77 | export CUDA_VISIBLE_DEVICES=0,1,2,3 78 | 79 | find_free_port() { 80 | while : 81 | do 82 | PORT=$(( ( RANDOM % 64512 ) + 1024 )) 83 | (echo >/dev/tcp/localhost/$PORT) >/dev/null 2>&1 84 | if [ $?
-ne 0 ]; then 85 | echo $PORT 86 | return 87 | fi 88 | done 89 | } 90 | 91 | export MASTER_PORT=$(find_free_port) 92 | 93 | LEARNING_RATE=2e-5 94 | NUM_TRAIN_EPOCHS=3 95 | VANILLA=False 96 | 97 | MODEL_NAME_OR_PATH=<model_name_or_path> 98 | echo "Finetune from: ${MODEL_NAME_OR_PATH}" 99 | MODEL=${MODEL_NAME_OR_PATH##*/} 100 | 101 | TEMPLATE=custom 102 | echo "Finetune data template: ${TEMPLATE}" 103 | 104 | DATA_PATH=../datasets/ruler_training_dataset.jsonl 105 | echo "Finetune data path: ${DATA_PATH}" 106 | 107 | MODEL_MAX_LENGTH=2048 108 | echo "Model max length: ${MODEL_MAX_LENGTH}" 109 | 110 | BATCH_SIZE=4 111 | echo "Per device train batch size: ${BATCH_SIZE}" 112 | 113 | GRAD_ACCUM=8 114 | echo "Gradient accumulation steps: ${GRAD_ACCUM}" 115 | 116 | OUTPUT_DIR="../outputs/checkpoints/ruler_${MODEL}_bs_${BATCH_SIZE}_ga_${GRAD_ACCUM}_lr_${LEARNING_RATE}_eps_${NUM_TRAIN_EPOCHS}" 117 | LOG_DIR=../logs 118 | 119 | deepspeed --master_port=$MASTER_PORT finetuning/finetune.py \ 120 | --vanilla $VANILLA \ 121 | --deepspeed ../configs/ds_config_zero3.json \ 122 | --model_name_or_path $MODEL_NAME_OR_PATH \ 123 | --template $TEMPLATE\ 124 | --model_max_length $MODEL_MAX_LENGTH \ 125 | --data_path $DATA_PATH \ 126 | --output_dir $OUTPUT_DIR \ 127 | --bf16 True \ 128 | --tf32 True \ 129 | --per_device_train_batch_size ${BATCH_SIZE} \ 130 | --gradient_accumulation_steps ${GRAD_ACCUM} \ 131 | --gradient_checkpointing True \ 132 | --lr_scheduler_type cosine \ 133 | --learning_rate ${LEARNING_RATE} \ 134 | --warmup_ratio 0.05 \ 135 | --num_train_epochs ${NUM_TRAIN_EPOCHS} \ 136 | --evaluation_strategy no \ 137 | --save_strategy epoch \ 138 | --save_total_limit 1 \ 139 | --logging_steps 5 \ 140 | 2>&1 | tee ${LOG_DIR}/output_ruler_${MODEL}.log 141 | ``` 142 | 143 | ![TLG_ruler](images/TLG_ruler.png) 144 | 145 | ### Multi MLT Generation Experiment 146 | 147 | **Run exp:** 148 | 149 | ```shell 150 | python exp/run_exp.py\ 151 | --dataset_path ../datasets/multi_mlt.jsonl\ 152 | --model_name_or_path <model_name_or_path>\ 153 | --gpus 1\ 154 | --template