├── .DS_Store
├── MergeBench.png
├── merging
    ├── requirements.txt
    ├── merging_methods
    │   ├── __init__.py
    │   ├── merger.py
    │   ├── task_arithmetic.py
    │   ├── TIES.py
    │   ├── DARE.py
    │   ├── utils.py
    │   ├── consensus.py
    │   ├── localize_and_stitch.py
    │   ├── fisher_utils.py
    │   ├── regmean_utils.py
    │   ├── regmean.py
    │   ├── ties_merging_utils.py
    │   ├── fisher.py
    │   ├── localize_utils.py
    │   └── regmean_plusplus.py
    ├── README.md
    ├── main.py
    ├── prepare_args.py
    └── taskloader.py
├── merged_models
    ├── Llama-3.1-8B_merged
    │   └── RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1
    │   │   ├── code_eval.json
    │   │   ├── safety_eval.json
    │   │   └── lm_eval.json
    └── Llama-3.2-3B_merged
    │   └── RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1
    │       ├── code_eval.json
    │       ├── safety_eval.json
    │       └── lm_eval.json
├── scripts
    ├── evaluate.sh
    └── merge.sh
└── README.md


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uiuctml/MergeBench/HEAD/.DS_Store


--------------------------------------------------------------------------------
/MergeBench.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uiuctml/MergeBench/HEAD/MergeBench.png


--------------------------------------------------------------------------------
/merging/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==2.1.2
2 | transformers
3 | datasets
4 | trl
5 | protobuf==3.20.3
6 | flash-attn==2.5.7
7 | tqdm
8 | deepspeed==0.14.5


--------------------------------------------------------------------------------
/merging/merging_methods/__init__.py:
--------------------------------------------------------------------------------
1 | from merging_methods.merger import Merger
2 | from merging_methods.task_arithmetic import TaskArithmetic
3 | from merging_methods.TIES import TIES
4 | from merging_methods.DARE import DARE
5 | from merging_methods.localize_and_stitch import LocalizeAndStitch
6 | from merging_methods.consensus import Consensus
7 | from merging_methods.regmean import RegMean
8 | from merging_methods.regmean_plusplus import RegMeanPlusPlus


--------------------------------------------------------------------------------
/merging/merging_methods/merger.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | from transformers import AutoModelForCausalLM, AutoTokenizer
 3 | 
 4 | class Merger(nn.Module):
 5 |     def __init__(self, base_model, ft_models, save_path):
 6 |         super().__init__()
 7 |         
 8 |         self.base_model_name = base_model
 9 |         self.base_model = AutoModelForCausalLM.from_pretrained(self.base_model_name, torch_dtype="bfloat16")
10 |         self.tokenizer = AutoTokenizer.from_pretrained(self.base_model_name)
11 |         self.ft_ckpts = [AutoModelForCausalLM.from_pretrained(ft_model, torch_dtype="bfloat16") for ft_model in ft_models]
12 |         self.save_path = save_path
13 |     
14 |     def merge(self, **kwargs):
15 |         pass
16 | 


--------------------------------------------------------------------------------
/merging/merging_methods/task_arithmetic.py:
--------------------------------------------------------------------------------
 1 | from merging_methods.utils import *
 2 | from merging_methods.merger import Merger
 3 | 
 4 | class TaskArithmetic(Merger):
 5 |     def __init__(self, base_model, ft_models, save_path):
 6 |         super().__init__(base_model, ft_models, save_path)
 7 |     
 8 |     def merge(self, **kwargs):
 9 |         scaling_coef = kwargs['scaling_coef']
10 |         task_vectors = [get_task_vector(ft_model, self.base_model) for ft_model in self.ft_ckpts]
11 |         merged_tv = scaling_coef * sum(task_vectors)
12 |         merged_model = vector_to_state_dict(merged_tv, self.base_model)
13 | 
14 |         merged_model.save_pretrained(self.save_path)
15 |         self.tokenizer.save_pretrained(self.save_path)
16 | 


--------------------------------------------------------------------------------
/merging/merging_methods/TIES.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from merging_methods.utils import *
 3 | from merging_methods.ties_merging_utils import *
 4 | from merging_methods.merger import Merger
 5 | import time
 6 | 
 7 | class TIES(Merger):
 8 |     def __init__(self, base_model, ft_models, save_path):
 9 |         super().__init__(base_model, ft_models, save_path)
10 |     
11 |     def merge(self, **kwargs):
12 |         scaling_coef = kwargs['scaling_coef']
13 |         task_vectors = [get_task_vector(ft_model, self.base_model) for ft_model in self.ft_ckpts]
14 |         
15 |         start = time.time()
16 | 
17 |         merged_tv = scaling_coef * ties_merging(torch.stack(task_vectors), reset_thresh=kwargs['K'], merge_func=kwargs['merge_func'])
18 |         merged_model = vector_to_state_dict(merged_tv, self.base_model)
19 |         print("Time taken for ties: ", time.time() - start)
20 | 
21 |         merged_model.save_pretrained(self.save_path)
22 |         self.tokenizer.save_pretrained(self.save_path)
23 | 


--------------------------------------------------------------------------------
/merging/README.md:
--------------------------------------------------------------------------------
 1 | # Implementation of model merging methods
 2 | 
 3 | ## Merging
 4 | Each merging algorithm has its own parameters and configurations, all of which are prepared in `prepare_args.py`. Please refer to the original papers for more detailed definitions. We provide examples for running each merging algorithm in `scripts/merge.sh`.
 5 | 
 6 | To install the packages required for merging, run the following commands:
 7 | ```
 8 | conda create -n merging
 9 | conda activate merging
10 | pip install -r requirements.txt
11 | ```
12 | 
13 | ## Adding new merging methods
14 | 1. Create a new python file under the `merging_methods` directory, e.g., `task_arithmetic.py`. 
15 | 2. Within the file, define the class for the merging algorithm, e.g., `TaskArithmetic`, which inherits from the base class `Merger` in `merger.py`. 
16 | 3. Remember to add the class in `__init__.py`, e.g., 
17 | ```
18 | from merging_methods.task_arithmetic import TaskArithmetic
19 | ```
20 | 4. Add any method dependent hyperparameters in `prepare_args.py`, which will be passed into the class via `kwargs`. The argument parsing in `main.py` only handles generic arguments.
21 | 5. Override the `merge` method to implement the details of merging; it should end by saving the merged model and tokenizer. 
22 | 


--------------------------------------------------------------------------------
/merging/merging_methods/DARE.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from merging_methods.utils import *
 3 | from merging_methods.merger import Merger
 4 | import time
 5 | 
 6 | class DARE(Merger):
 7 |     def __init__(self, base_model, ft_models, save_path):
 8 |         super().__init__(base_model, ft_models, save_path)
 9 |     
10 | 
11 |     def random_drop_and_rescale(self, task_vector, p=0.8):
12 |         if not 0 <= p < 1:
13 |             raise ValueError("p must be in the range [0, 1).")
14 |         
15 |         # Generate a binary mask: 1 with probability (1-p) and 0 with probability p.
16 |         mask = torch.bernoulli(torch.full(task_vector.shape, 1 - p, device=task_vector.device))
17 |         
18 |         # Apply the mask and rescale the kept values by 1/(1-p)
19 |         return task_vector * mask / (1 - p)
20 |     
21 |     def merge(self, **kwargs):
22 |         p = kwargs['p']
23 |         coeff = kwargs['scaling_coef']
24 |         
25 |         task_vectors = [get_task_vector(ft_model, self.base_model) for ft_model in self.ft_ckpts]
26 |         start = time.time()
27 |         task_vectors = [self.random_drop_and_rescale(task_vector, p) for task_vector in task_vectors]
28 |         merged_tv = sum(task_vectors) * coeff
29 |         print("Time taken for random drop and rescale: ", time.time() - start)
30 |         merged_model = vector_to_state_dict(merged_tv, self.base_model)
31 | 
32 |         merged_model.save_pretrained(self.save_path)
33 |         self.tokenizer.save_pretrained(self.save_path)
34 | 


--------------------------------------------------------------------------------
/merging/merging_methods/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | def flatten_ckpt_into_vec(ckpt):
 4 |     vec = []
 5 |     for param in ckpt.values():
 6 |         vec.append(param.flatten())
 7 |     return torch.cat(vec)
 8 | 
 9 | def select_trainable_params(model):
10 |     params = {}
11 | 
12 |     for n, p in model.named_parameters():
13 |         if 'embed' not in n and 'Embedding' not in n:
14 |             params[n] = p
15 |                     
16 |     return params
17 | 
def get_task_vector(ft_model, base_model):
    """Return the flattened parameter difference ``ft_model - base_model``.

    Both models are moved to the CPU first. Only the parameters kept by
    ``select_trainable_params`` contribute to the vector.
    """
    for model in (ft_model, base_model):
        model.to('cpu')

    finetuned_flat = flatten_ckpt_into_vec(select_trainable_params(ft_model))
    base_flat = flatten_ckpt_into_vec(select_trainable_params(base_model))

    delta = finetuned_flat - base_flat
    return delta
29 | 
def vector_to_state_dict(vec, pretrained_model, return_dict=False):
    """Add a flat vector onto a model's weights, in place.

    Walks ``pretrained_model.state_dict()`` in order and, for every entry
    whose name contains neither ``'embed'`` (case-insensitive) nor
    ``'lm_head'``, adds the next ``numel()`` values of ``vec`` (reshaped to
    the entry's shape) to that entry. ``pretrained_model`` itself is
    modified and moved to the CPU.

    Args:
        vec: 1-D tensor holding the concatenated per-parameter deltas.
        pretrained_model: Module to update.
        return_dict: When true, return the updated state dict instead of the
            model.

    Returns:
        ``pretrained_model`` (the same, mutated object) or its state dict.
    """
    # Tensor.to is not in-place, so the result has to be kept.
    vec = vec.to('cpu')
    pretrained_model.to('cpu')

    # Build the state dict once; its tensors share storage with the model's
    # parameters, so the in-place adds below update the model.
    state = pretrained_model.state_dict()

    offset = 0
    for name, tensor in state.items():
        if 'embed' in name.lower() or 'lm_head' in name:
            continue
        if torch.count_nonzero(tensor).item() == 0:
            # All-zero entries are skipped WITHOUT consuming any of `vec`.
            # NOTE(review): if such an entry also contributed to `vec`, the
            # remaining slices would be misaligned -- confirm this skip
            # matches how the vector was built.
            continue
        count = tensor.numel()
        tensor += vec[offset:offset + count].reshape(tensor.shape)
        offset += count

    return state if return_dict else pretrained_model
46 | 


--------------------------------------------------------------------------------
/merging/main.py:
--------------------------------------------------------------------------------
 1 | from prepare_args import prepare_args, create_parser
 2 | import importlib
 3 | 
 4 | def get_ft_ckpts(base_model):
 5 |     model_name = base_model.split('/')[-1]
 6 |     task_names = ['instruction', 'math', 'coding', 'safety', 'multilingual']
 7 |     return [f'MergeBench/{model_name}_{task_name}' for task_name in task_names]
 8 | 
 9 | def parse_args():
10 |     parser = create_parser()
11 | 
12 |     parser.add_argument('--base-model', default='meta-llama/Llama-3.2-3B', type=str)
13 |     parser.add_argument('--algo', default='TaskArithmetic', type=str, choices=['TaskArithmetic', 'TIES', 'DARE', 'LocalizeAndStitch', 'Consensus', 'RegMean', 'RegMeanPlusPlus', 'Fisher'])
14 |     parser.add_argument('--save-path', default='./merged_models/', type=str)
15 | 
16 |     return parser.parse_args()
17 | 
def main(args):
    """Build the save path for the requested merge, then run the merger.

    The save path is
    ``<save_path>/<model>_merged/<algo>[_task_names_<group>][_<k>_<v>...]``,
    where the trailing key/value pairs come from the method kwargs, minus a
    few bookkeeping keys.

    Args:
        args: Parsed command-line namespace (see ``parse_args``).
    """
    # Function-scope import keeps this block self-contained.
    import os

    kwargs = prepare_args(args)
    merger_module = importlib.import_module("merging_methods")
    ft_ckpts = get_ft_ckpts(args.base_model)

    # Keys that configure the run but should not appear in the directory name.
    hidden_keys = {'fisher_only', 'merge_only', 'save_group', 'task_names', 'keep_checkpoints'}
    kwargs_str = "_".join(f"{key}_{value}" for key, value in kwargs.items() if key not in hidden_keys)

    # --save_group takes precedence over --task_names when both are given.
    task_group = args.save_group or args.task_names or None

    # Use the LAST path component, as get_ft_ckpts does; indexing with [1]
    # failed for names without a slash and picked the wrong component for
    # deeper local paths.
    model_name = args.base_model.rstrip('/').split('/')[-1]

    # os.path.join copes with a --save-path that lacks a trailing slash.
    save_path = os.path.join(args.save_path, model_name + '_merged', args.algo)
    if task_group:
        save_path += '_task_names_' + task_group
    if kwargs_str != '':
        save_path += '_' + kwargs_str

    print('merged model save to:', save_path)
    merger = getattr(merger_module, args.algo)(args.base_model, ft_ckpts, save_path)
    print(args)
    print(kwargs)
    merger.merge(**kwargs)
42 | 
43 | if __name__ == "__main__":
44 |     args = parse_args()
45 |     main(args)


--------------------------------------------------------------------------------
/merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1/code_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "humanevalplus": {
 3 |     "pass@1": 0.47317073170731705,
 4 |     "pass@10": 0.6524390243902439
 5 |   },
 6 |   "mbppplus": {
 7 |     "pass@1": 0.5732804232804233,
 8 |     "pass@10": 0.656084656084656
 9 |   },
10 |   "config": {
11 |     "prefix": "",
12 |     "do_sample": true,
13 |     "temperature": 0.2,
14 |     "top_k": 0,
15 |     "top_p": 0.95,
16 |     "n_samples": 10,
17 |     "eos": "<|endoftext|>",
18 |     "seed": 0,
19 |     "model": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1",
20 |     "modeltype": "causal",
21 |     "peft_model": null,
22 |     "revision": null,
23 |     "use_auth_token": true,
24 |     "trust_remote_code": false,
25 |     "tasks": "humanevalplus,mbppplus",
26 |     "instruction_tokens": null,
27 |     "batch_size": 10,
28 |     "max_length_generation": 512,
29 |     "precision": "bf16",
30 |     "load_in_8bit": false,
31 |     "load_in_4bit": false,
32 |     "left_padding": false,
33 |     "limit": null,
34 |     "limit_start": 0,
35 |     "save_every_k_tasks": -1,
36 |     "postprocess": true,
37 |     "allow_code_execution": true,
38 |     "generation_only": false,
39 |     "load_generations_path": null,
40 |     "load_data_path": null,
41 |     "metric_output_path": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1/results/code_eval.json",
42 |     "save_generations": false,
43 |     "load_generations_intermediate_paths": null,
44 |     "save_generations_path": "generations.json",
45 |     "save_references": false,
46 |     "save_references_path": "references.json",
47 |     "prompt": "prompt",
48 |     "max_memory_per_gpu": null,
49 |     "check_references": false
50 |   }
51 | }


--------------------------------------------------------------------------------
/merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1/code_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "humanevalplus": {
 3 |     "pass@1": 0.31036585365853653,
 4 |     "pass@10": 0.4634146341463415
 5 |   },
 6 |   "mbppplus": {
 7 |     "pass@1": 0.4304232804232804,
 8 |     "pass@10": 0.5582010582010583
 9 |   },
10 |   "config": {
11 |     "prefix": "",
12 |     "do_sample": true,
13 |     "temperature": 0.2,
14 |     "top_k": 0,
15 |     "top_p": 0.95,
16 |     "n_samples": 10,
17 |     "eos": "<|endoftext|>",
18 |     "seed": 0,
19 |     "model": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1",
20 |     "modeltype": "causal",
21 |     "peft_model": null,
22 |     "revision": null,
23 |     "use_auth_token": true,
24 |     "trust_remote_code": false,
25 |     "tasks": "humanevalplus,mbppplus",
26 |     "instruction_tokens": null,
27 |     "batch_size": 10,
28 |     "max_length_generation": 512,
29 |     "precision": "bf16",
30 |     "load_in_8bit": false,
31 |     "load_in_4bit": false,
32 |     "left_padding": false,
33 |     "limit": null,
34 |     "limit_start": 0,
35 |     "save_every_k_tasks": -1,
36 |     "postprocess": true,
37 |     "allow_code_execution": true,
38 |     "generation_only": false,
39 |     "load_generations_path": null,
40 |     "load_data_path": null,
41 |     "metric_output_path": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1/results/code_eval.json",
42 |     "save_generations": false,
43 |     "load_generations_intermediate_paths": null,
44 |     "save_generations_path": "generations.json",
45 |     "save_references": false,
46 |     "save_references_path": "references.json",
47 |     "prompt": "prompt",
48 |     "max_memory_per_gpu": null,
49 |     "check_references": false
50 |   }
51 | }


--------------------------------------------------------------------------------
/scripts/evaluate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | MODEL=$1
 4 | GPU_ID=$2
 5 | OUTPUT_PATH=$3
 6 | 
 7 | echo $MODEL
 8 | echo $GPU_ID
 9 | echo $OUTPUT_PATH
10 | 
11 | export CUDA_VISIBLE_DEVICES=$GPU_ID
12 | 
13 | source $(conda info --base)/etc/profile.d/conda.sh
14 | mkdir -p $OUTPUT_PATH
15 | 
16 | conda activate lmeval
17 | 
18 | lm_eval --model hf \
19 |     --model_args pretrained=$MODEL \
20 |     --tasks gsm8k_cot \
21 |     --device cuda:$GPU_ID \
22 |     --batch_size 16 \
23 |     --output_path $OUTPUT_PATH
24 | 
25 | lm_eval --model hf \
26 |     --model_args pretrained=$MODEL \
27 |     --tasks m_mmlu_fr,arc_fr,hellaswag_fr,m_mmlu_es,arc_es,hellaswag_es,m_mmlu_de,arc_de,hellaswag_de,m_mmlu_ru,arc_ru,hellaswag_ru \
28 |     --device cuda:$GPU_ID \
29 |     --batch_size 8 \
30 |     --output_path $OUTPUT_PATH
31 | 
32 | lm_eval --model hf \
33 |     --model_args pretrained=$MODEL \
34 |     --tasks ifeval \
35 |     --device cuda:$GPU_ID \
36 |     --batch_size 8 \
37 |     --output_path $OUTPUT_PATH
38 | 
39 | 
40 | conda deactivate
41 | conda activate bigcode
42 | cd bigcode-evaluation-harness
43 | 
44 | accelerate launch  main.py \
45 |   --model $MODEL \
46 |   --max_length_generation 512 \
47 |   --precision bf16 \
48 |   --tasks humanevalplus,mbppplus \
49 |   --temperature 0.2 \
50 |   --n_samples 10 \
51 |   --batch_size 10 \
52 |   --allow_code_execution \
53 |   --metric_output_path $OUTPUT_PATH/code_eval.json \
54 |   --use_auth_token
55 | 
56 | cd ..
57 | conda deactivate
58 | conda activate safety-eval
59 | cd safety-eval-fork
60 | 
61 | export OPENAI_API_KEY=''
62 | 
63 | python evaluation/eval.py generators \
64 |   --model_name_or_path $MODEL \
65 |   --use_vllm \
66 |   --model_input_template_path_or_name llama3 \
67 |   --tasks wildguardtest,harmbench,xstest,do_anything_now \
68 |   --report_output_path $OUTPUT_PATH/safety_eval.json \
69 |   --save_individual_results_path $OUTPUT_PATH/safety_generation.json \
70 |   --batch_size 8
71 | 


--------------------------------------------------------------------------------
/scripts/merge.sh:
--------------------------------------------------------------------------------
 1 | # Model soup
 2 | python ./merging/main.py --algo TaskArithmetic --scaling-coef 0.2 --base-model meta-llama/Llama-3.2-3B
 3 | 
 4 | # Task arithmetic
 5 | python ./merging/main.py --algo TaskArithmetic --scaling-coef 0.4 --base-model meta-llama/Llama-3.2-3B
 6 | 
 7 | # Fisher Merging
 8 | ALL_TASKS=DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF
 9 | 
10 | for TASK in DartMath WildguardMix MagiCoder Aya Tulu3IF; do
11 |   deepspeed --master_port=61001 --include=localhost:0,1,2,3 ./merging/main.py \
12 |     --algo Fisher \
13 |     --base-model meta-llama/Llama-3.2-3B \
14 |     --task_names $TASK \
15 |     --save_group $ALL_TASKS \
16 |     --fisher_only \
17 |     --model_coeff 1
18 | done
19 | 
20 | python ./merging/main.py \
21 |     --algo Fisher \
22 |     --base-model meta-llama/Llama-3.2-3B \
23 |     --task_names $ALL_TASKS \
24 |     --save_group $ALL_TASKS \
25 |     --merge_only \
26 |     --keep_checkpoints \
27 |     --model_coeff 1
28 | 
29 | # RegMean
30 | python ./merging/main.py --algo RegMean --base-model meta-llama/Llama-3.2-3B --task_names DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF --reduction 0.5
31 | 
32 | # RegMeanPlusPlus
33 | python ./merging/main.py --algo RegMeanPlusPlus --base-model meta-llama/Llama-3.2-3B --task_names DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF --reduction 0.1
34 | 
35 | # TIES Merging
36 | python ./merging/main.py --algo TIES --base-model meta-llama/Llama-3.2-3B --K 0.3 --scaling-coef 0.4
37 | 
38 | # DARE
39 | python ./merging/main.py --algo RegMean --base-model meta-llama/Llama-3.2-3B --p 0.9 --scaling-coef 0.4
40 | 
41 | # Consensus TA
42 | python ./merging/main.py --algo RegMean --base-model meta-llama/Llama-3.2-3B --scaling-coef 0.4
43 | 
44 | # Dataless Localize-and-Stitch
45 | python ./merging/main.py --algo LocalizeAndStitch --base-model meta-llama/Llama-3.2-3B --sparsity 0.1 --dataless
46 | 
47 | # Localize-and-Stitch
48 | python ./merging/main.py --algo LocalizeAndStitch --base-model meta-llama/Llama-3.2-3B  --lr 1e8 --sparsity 0.1 --n_epochs 1
49 | 


--------------------------------------------------------------------------------
/merging/merging_methods/consensus.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from merging_methods.utils import *
 3 | from merging_methods.merger import Merger
 4 | 
 5 | 
 6 | class Consensus(Merger):
 7 |     def __init__(self, base_model, ft_models, save_path):
 8 |         super().__init__(base_model, ft_models, save_path)\
 9 |     
10 |     def tune_lamda(self, mtl_tv, tv, i):
11 |         for lamda in [0.2,0.6]:    
12 |             print(f'Tuning lamda: {lamda} for model {i}')
13 |             tall_mask = (torch.abs(tv) > torch.abs(mtl_tv - tv) * lamda)
14 |         
15 |             masked_model = vector_to_state_dict(tv * tall_mask, self.base_model)
16 |             
17 |             save_dir = './tmp/' + self.base_model_name.split('/')[1] + '/' + f'Consensus_{i}_lamda_' + str(lamda)
18 |             masked_model.save_pretrained(save_dir)
19 |             self.tokenizer.save_pretrained(save_dir)
20 |     
21 |     def merge(self, **kwargs):
22 |         k = kwargs['k']
23 |         scaling_coef = kwargs['scaling_coef']
24 | 
25 |         task_vectors = [get_task_vector(ft_model, self.base_model) for ft_model in self.ft_ckpts]
26 |         mtl_tv = sum(task_vectors)
27 | 
28 |         tall_masks = []
29 |         # replace this with results from the tune_lamda function
30 |         lamdas = [0.2, 0.2, 0.2, 0.2, 0.2]
31 |         for i in range(len(task_vectors)):
32 |             tv = task_vectors[i]
33 |             tall_mask = (torch.abs(tv) > torch.abs(mtl_tv - tv) * lamdas[i])
34 |             tall_masks.append(tall_mask)
35 |         
36 |         consensus_mask = torch.zeros_like(tall_masks[0], dtype=torch.int16)
37 |         for mask in tall_masks:
38 |             consensus_mask += mask.to(torch.int16)
39 |         consensus_mask = consensus_mask >= k
40 |         
41 |         merged_tv = mtl_tv * consensus_mask * scaling_coef
42 |             
43 |         merged_model = vector_to_state_dict(merged_tv, self.base_model)
44 | 
45 |         merged_model.save_pretrained(self.save_path)
46 |         self.tokenizer.save_pretrained(self.save_path)


--------------------------------------------------------------------------------
/merging/merging_methods/localize_and_stitch.py:
--------------------------------------------------------------------------------
 1 | from merging_methods.utils import *
 2 | from merging_methods.merger import Merger
 3 | from merging_methods.localize_utils import *
 4 | from transformers import AutoModelForCausalLM
 5 | from datasets import load_dataset
 6 | 
 7 | 
 8 | class LocalizeAndStitch(Merger):
 9 |     def __init__(self, base_model, ft_models, save_path):
10 |         super().__init__(base_model, ft_models, save_path)
11 | 
12 |         self.task_names = ['instruction', 'math', 'coding', 'safety', 'multilingual']
13 |     
14 |     def extract_format_keys(self, task):
15 |         dataset = load_dataset(f'MergeBench/{task}_val', split='train')
16 | 
17 |         if task == 'safety':
18 |             format_keys = {"instruction_key": "prompt", "output_key": "response"}
19 |         elif task == 'multilingual':
20 |             format_keys = {"instruction_key": "inputs", "output_key": "targets"}
21 |         elif task == 'math': 
22 |             format_keys = {"instruction_key": "query", "output_key": "response"}
23 |         elif task == 'instruction':
24 |             format_keys = {"instruction_key": "instruction", "output_key": "output"}
25 |         elif task == 'coding': 
26 |             format_keys = {"output_key": "response"}
27 |         
28 |         return dataset, format_keys
29 |         
30 |     def merge(self, **kwargs):
31 |         graft_args = {}
32 |         dataless = kwargs['dataless']
33 |         graft_args['sparsity'] = kwargs['sparsity']
34 |         graft_args['sigmoid_bias'] = kwargs['sigmoid_bias']
35 |         if not dataless:
36 |             graft_args['lr'] = kwargs['learning_rate']
37 |             graft_args['num_train_epochs'] = kwargs['num_train_epochs']
38 |             graft_args['l1_strength'] = kwargs['l1_strength']
39 | 
40 |         # Localize
41 |         masks = []
42 |         for i in range(len(self.ft_ckpts)):
43 |             current_task = self.task_names[i]
44 |             print(f'Localizing {current_task} model')
45 |             ft_model = self.ft_ckpts[i]
46 |             trainable_params = select_trainable_params(ft_model)
47 | 
48 |             localizer = Localizer(trainable_params, self.base_model, ft_model, graft_args, self.base_model_name)
49 |             
50 |             if not dataless:
51 |                 print(f'Training mask {current_task} model')
52 |                 dataset, format_keys = self.extract_format_keys(self.task_names[i])
53 | 
54 |                 localizer.train_mask(dataset, format_keys) 
55 |             
56 |             mask, _ = localizer.interpolate_model(round_=True, return_mask=True, train=False)
57 |             masks.append(mask)
58 |         
59 |         # Stitch
60 |         final_model = AutoModelForCausalLM.from_pretrained(self.base_model_name)
61 |         stitcher = Stitcher(trainable_params, final_model, self.base_model, self.ft_ckpts, masks)
62 |         merged_model = stitcher.interpolate_models()
63 | 
64 |         merged_model.save_pretrained(self.save_path)
65 |         self.tokenizer.save_pretrained(self.save_path)


--------------------------------------------------------------------------------
/merging/prepare_args.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | def create_parser():
 4 |     _parser = argparse.ArgumentParser(description='Configuration for MergeBench')
 5 | 
 6 |     # DDP
 7 |     _parser.add_argument("--local_rank", type=int, default=0)
 8 | 
 9 |     # Task arithmetic
10 |     _parser.add_argument('--scaling-coef', default=1, type=float)
11 | 
12 |     # TIES
13 |     _parser.add_argument('--K', default=0.2, type=float)
14 |     _parser.add_argument('--merge_func', default="sum", type=str)
15 | 
16 |     # DARE
17 |     _parser.add_argument('--p', default=0.8, type=float)
18 | 
19 |     # LocalizeAndStitch
20 |     _parser.add_argument('--sigmoid_bias', default=3, type=float)
21 |     _parser.add_argument('--sparsity', default=0.1, type=float)
22 |     _parser.add_argument('--lr', default=1e8, type=float)
23 |     _parser.add_argument('--n_epochs', default=1, type=int)
24 |     _parser.add_argument('--l1_strength', default=0.0, type=float)
25 |     _parser.add_argument('--dataless', action='store_true')
26 | 
27 |     # Consensus
28 |     _parser.add_argument('--k', default=2, type=int)
29 |     _parser.add_argument('--lamda', default=0.5, type=float)
30 | 
31 |     # RegMean and RegMeanPlusPlus
32 |     _parser.add_argument('--task_names', type=str)
33 |     _parser.add_argument('--reduction', type=float)
34 | 
35 |     # Fisher
36 |     _parser.add_argument("--fisher_only", action="store_true", help='fisher-stage1')
37 |     _parser.add_argument("--merge_only", action="store_true", help='fisher-stage2')
38 |     _parser.add_argument("--save_group", type=str, default=None)
39 |     _parser.add_argument("--model_coeff_value", type=float, default=0.3)
40 |     _parser.add_argument("--keep_checkpoints", action="store_true", help='whether delete intermediate files')
41 |     
42 |     
43 |     return _parser
44 | 
def prepare_args(params):
    """Collect the keyword arguments required by the selected algorithm.

    Only the attributes relevant to ``params.algo`` are read from ``params``.
    The insertion order of the returned dict is fixed per algorithm.

    Raises:
        ValueError: If ``params.algo`` is not a supported method.
    """
    regmean_fields = ('task_names', 'reduction')
    # Attributes copied verbatim (kwarg name == attribute name), in order.
    fields_by_algo = {
        'TaskArithmetic': ('scaling_coef',),
        'TIES': ('scaling_coef', 'merge_func', 'K'),
        'DARE': ('scaling_coef', 'p'),
        'LocalizeAndStitch': ('sparsity', 'dataless', 'sigmoid_bias'),
        'Consensus': ('scaling_coef', 'k', 'lamda'),
        'RegMean': regmean_fields,
        'RegMeanPlusPlus': regmean_fields,
        'Fisher': (
            'task_names', 'merge_only', 'fisher_only',
            'save_group', 'model_coeff_value', 'keep_checkpoints',
        ),
    }

    algo = params.algo
    if algo not in fields_by_algo:
        raise ValueError(f'No support merging method {algo}')

    kwargs = {field: getattr(params, field) for field in fields_by_algo[algo]}

    # Mask-training options only apply when data is used; these are renamed
    # on the way through.
    if algo == 'LocalizeAndStitch' and not params.dataless:
        kwargs['learning_rate'] = params.lr
        kwargs['num_train_epochs'] = params.n_epochs
        kwargs['l1_strength'] = params.l1_strength

    return kwargs


--------------------------------------------------------------------------------
/merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1/safety_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "wildguardtest": {
 3 |         "micro harm (lower)": 0.4753004005340454,
 4 |         "prompt_type harm (lower)": {
 5 |             "private_information_individual": 0.19753086419753085,
 6 |             "others": 0.45918367346938777,
 7 |             "cyberattack": 0.7083333333333334,
 8 |             "sexual_content": 0.44680851063829785,
 9 |             "social_stereotypes_and_unfair_discrimination": 0.34210526315789475,
10 |             "toxic_language_hate_speech": 0.7804878048780488,
11 |             "violence_and_physical_harm": 0.7,
12 |             "defamation_encouraging_unethical_or_unsafe_actions": 0.717391304347826,
13 |             "fraud_assisting_illegal_activities": 0.55,
14 |             "mental_health_over-reliance_crisis": 0.4666666666666667,
15 |             "causing_material_harm_by_disseminating_misinformation": 0.4444444444444444,
16 |             "disseminating_false_or_misleading_information_encouraging_disinformation_campaigns": 0.7674418604651163,
17 |             "copyright_violations": 0.3,
18 |             "sensitive_information_organization_government": 0.10204081632653061
19 |         },
20 |         "adversarial harm (lower)": 0.47181008902077154,
21 |         "vanilla harm (lower)": 0.47815533980582525
22 |     },
23 |     "harmbench": {
24 |         "micro ASR (lower)": 0.65625,
25 |         "FunctionalCategory ASR (lower)": {
26 |             "standard": 0.7735849056603774,
27 |             "contextual": 0.7160493827160493,
28 |             "copyright": 0.3625
29 |         },
30 |         "SemanticCategory ASR (lower)": {
31 |             "misinformation_disinformation": 0.5740740740740741,
32 |             "chemical_biological": 0.9047619047619048,
33 |             "illegal": 0.7924528301886793,
34 |             "harmful": 0.6111111111111112,
35 |             "copyright": 0.3625,
36 |             "cybercrime_intrusion": 0.9423076923076923,
37 |             "harassment_bullying": 0.47619047619047616
38 |         }
39 |     },
40 |     "xstest": {
41 |         "inverted_rta_all_safe": 0.884,
42 |         "inverted_rta_all_contrast": 0.775,
43 |         "inverted_rta_homonyms": 0.92,
44 |         "inverted_rta_contrast_homonyms": 0.8,
45 |         "inverted_rta_figurative_language": 0.8,
46 |         "inverted_rta_contrast_figurative_language": 0.96,
47 |         "inverted_rta_safe_targets": 0.84,
48 |         "inverted_rta_contrast_safe_targets": 0.8,
49 |         "inverted_rta_safe_contexts": 0.92,
50 |         "inverted_rta_contrast_safe_contexts": 0.92,
51 |         "inverted_rta_definitions": 0.92,
52 |         "inverted_rta_contrast_definitions": 0.84,
53 |         "inverted_rta_nons_group_real_discr": 0.8,
54 |         "inverted_rta_real_group_nons_discr": 0.84,
55 |         "inverted_rta_contrast_discr": 0.6799999999999999,
56 |         "inverted_rta_historical_events": 0.96,
57 |         "inverted_rta_contrast_historical_events": 0.6,
58 |         "inverted_rta_privacy_public": 0.92,
59 |         "inverted_rta_privacy_fictional": 0.92,
60 |         "inverted_rta_contrast_privacy": 0.6,
61 |         "refusal_prf": {
62 |             "precision": 0.6081081081081081,
63 |             "recall": 0.225,
64 |             "f1": 0.3284671532846716
65 |         },
66 |         "overall_accuracy": 0.5911111111111111
67 |     },
68 |     "do_anything_now": {
69 |         "macro ASR": 0.6066666666666667,
70 |         "jailbreak_chat ASR": 0.6206896551724138,
71 |         "LLM Promptwriting ASR": 0.5,
72 |         "BreakGPT ASR": 0.375,
73 |         "ChatGPTJailbreak ASR": 0.5,
74 |         "ChatGPT ASR": 0.8,
75 |         "AI Prompt Sharing ASR": 0.3333333333333333
76 |     }
77 | }


--------------------------------------------------------------------------------
/merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1/safety_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "wildguardtest": {
 3 |         "micro harm (lower)": 0.650200267022697,
 4 |         "prompt_type harm (lower)": {
 5 |             "private_information_individual": 0.5432098765432098,
 6 |             "others": 0.6428571428571429,
 7 |             "cyberattack": 0.6875,
 8 |             "sexual_content": 0.6595744680851063,
 9 |             "social_stereotypes_and_unfair_discrimination": 0.5263157894736842,
10 |             "toxic_language_hate_speech": 0.8292682926829268,
11 |             "violence_and_physical_harm": 0.875,
12 |             "defamation_encouraging_unethical_or_unsafe_actions": 0.8260869565217391,
13 |             "fraud_assisting_illegal_activities": 0.5333333333333333,
14 |             "mental_health_over-reliance_crisis": 0.6,
15 |             "causing_material_harm_by_disseminating_misinformation": 0.7333333333333333,
16 |             "disseminating_false_or_misleading_information_encouraging_disinformation_campaigns": 0.8372093023255814,
17 |             "copyright_violations": 0.5666666666666667,
18 |             "sensitive_information_organization_government": 0.4897959183673469
19 |         },
20 |         "adversarial harm (lower)": 0.5964391691394659,
21 |         "vanilla harm (lower)": 0.6941747572815534
22 |     },
23 |     "harmbench": {
24 |         "micro ASR (lower)": 0.646875,
25 |         "FunctionalCategory ASR (lower)": {
26 |             "standard": 0.8113207547169812,
27 |             "contextual": 0.7037037037037037,
28 |             "copyright": 0.2625
29 |         },
30 |         "SemanticCategory ASR (lower)": {
31 |             "misinformation_disinformation": 0.7222222222222222,
32 |             "chemical_biological": 0.8809523809523809,
33 |             "illegal": 0.8113207547169812,
34 |             "harmful": 0.6111111111111112,
35 |             "copyright": 0.2625,
36 |             "cybercrime_intrusion": 0.8461538461538461,
37 |             "harassment_bullying": 0.5714285714285714
38 |         }
39 |     },
40 |     "xstest": {
41 |         "inverted_rta_all_safe": 0.376,
42 |         "inverted_rta_all_contrast": 0.565,
43 |         "inverted_rta_homonyms": 0.43999999999999995,
44 |         "inverted_rta_contrast_homonyms": 0.6,
45 |         "inverted_rta_figurative_language": 0.43999999999999995,
46 |         "inverted_rta_contrast_figurative_language": 0.6799999999999999,
47 |         "inverted_rta_safe_targets": 0.24,
48 |         "inverted_rta_contrast_safe_targets": 0.72,
49 |         "inverted_rta_safe_contexts": 0.52,
50 |         "inverted_rta_contrast_safe_contexts": 0.56,
51 |         "inverted_rta_definitions": 0.64,
52 |         "inverted_rta_contrast_definitions": 0.52,
53 |         "inverted_rta_nons_group_real_discr": 0.28,
54 |         "inverted_rta_real_group_nons_discr": 0.31999999999999995,
55 |         "inverted_rta_contrast_discr": 0.36,
56 |         "inverted_rta_historical_events": 0.4,
57 |         "inverted_rta_contrast_historical_events": 0.64,
58 |         "inverted_rta_privacy_public": 0.28,
59 |         "inverted_rta_privacy_fictional": 0.19999999999999996,
60 |         "inverted_rta_contrast_privacy": 0.43999999999999995,
61 |         "refusal_prf": {
62 |             "precision": 0.35802469135802467,
63 |             "recall": 0.435,
64 |             "f1": 0.3927765237020316
65 |         },
66 |         "overall_accuracy": 0.4022222222222222
67 |     },
68 |     "do_anything_now": {
69 |         "macro ASR": 0.5633333333333334,
70 |         "jailbreak_chat ASR": 0.5593869731800766,
71 |         "LLM Promptwriting ASR": 0.5,
72 |         "BreakGPT ASR": 0.4375,
73 |         "ChatGPTJailbreak ASR": 0.5,
74 |         "ChatGPT ASR": 1.0,
75 |         "AI Prompt Sharing ASR": 0.3333333333333333
76 |     }
77 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # MergeBench: A Benchmark for Merging Domain-Specialized LLMs
 2 | 
 3 | This is the official repo of MergeBench in the paper ["MergeBench: A Benchmark for Merging Domain-Specialized LLMs"](https://arxiv.org/abs/2505.10833) at NeurIPS 2025 Datasets and Benchmarks Track.
 4 | 
 5 | 
 6 | ![alt text](MergeBench.png "MergeBench")
 7 | 
 8 | ## Abstract
 9 | Model merging provides a scalable alternative to multi-task training by combining specialized finetuned models through parameter arithmetic, enabling efficient deployment without the need for joint training or access to all task data. While recent methods have shown promise, existing evaluations are limited in both model scale and task diversity, leaving open questions about their applicability to large, domain-specialized LLMs. To tackle the challenges, we introduce MergeBench, a comprehensive evaluation suite designed to assess model merging at scale. MergeBench builds on state-of-the-art open-source language models, including Llama and Gemma families at 2B to 9B scales, and covers five key domains: instruction following, mathematics, multilingual understanding, coding and safety. We standardize finetuning and evaluation protocols, and assess eight representative merging methods across multi-task performance, forgetting and runtime efficiency. Based on extensive experiments, we provide practical guidelines for algorithm selection and share insights showing that model merging tends to perform better on stronger base models, with techniques such as merging coefficient tuning and sparsification improving knowledge retention. However, several challenges remain, including the computational cost on large models, the gap for in-domain performance compared to multi-task models, and the underexplored role of model merging in standard LLM training pipelines. We hope MergeBench provides a foundation for future research to advance the understanding and practical application of model merging.
10 | 
11 | ## Merging Algorithms
12 | All of the constituent model checkpoints are available at https://huggingface.co/MergeBench. We provide further details in the readme file of the `merging` folder.
13 | 
14 | ## Evaluation
15 | We utilize three existing evaluation packages, and we recommend creating separate environments for each evaluation.
16 | ### lm-eval
17 | ```
18 | conda create -n lmeval python=3.10.9
19 | conda activate lmeval
20 | 
21 | git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness
22 | cd lm-evaluation-harness
23 | pip install -e .
24 | 
25 | pip3 install torch==2.1.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
26 | 
27 | pip install langdetect
28 | pip install immutabledict
29 | ```
30 | 
31 | ### bigcode-eval
32 | ```
33 | conda create -n bigcode python=3.10.9
34 | conda activate bigcode
35 | 
36 | git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git
37 | cd bigcode-evaluation-harness
38 | 
39 | pip install -e .
40 | pip3 install torch==2.1.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
41 | pip install numpy==1.24.1
42 | ```
43 | 
44 | ### safety-eval
 45 | To install the evaluation package, run:
46 | ```
47 | git clone https://github.com/nouhadziri/safety-eval-fork
48 | conda create -n safety-eval python=3.10 && conda activate safety-eval
49 | pip install -e .
50 | pip install -r requirements.txt
51 | pip install vllm==0.4.2
52 | ```
 53 | Running the evaluation requires the `OPENAI_API_KEY` environment variable to be set, because some tasks in the benchmark suite call the OpenAI API. The tasks we evaluate do not use it, so an empty placeholder value is sufficient:
54 | ```
55 | export OPENAI_API_KEY=''
56 | ```
57 | 
58 | To perform the full evaluation on all five task categories on the base `Llama-3.2-3B` model with GPU 0 and save the results in the folder `results/llama-3.2-3b`, run the following command:
59 | ```
60 | bash scripts/evaluate.sh meta-llama/Llama-3.2-3B 0 results/llama-3.2-3b
61 | ```
62 | 
63 | ## Citation
64 | ```
65 | @inproceedings{
66 |     he2025mergebench,
67 |     title={MergeBench: A Benchmark for Merging Domain-Specialized {LLM}s},
68 |     author={Yifei He and Siqi Zeng and Yuzheng Hu and Rui Yang and Tong Zhang and Han Zhao},
69 |     booktitle={The Thirty-ninth Annual Conference on Neural Information Processing Systems Datasets and Benchmarks Track},
70 |     year={2025},
71 |     url={https://openreview.net/forum?id=rw50iUoyLu}
72 | }
73 | ```


--------------------------------------------------------------------------------
/merging/merging_methods/fisher_utils.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import gc
  3 | 
  4 | import torch
  5 | from torch.nn import functional as F
  6 | from trl import SFTTrainer
  7 | 
  8 | 
  9 | 
 10 | class FisherTrainer(SFTTrainer):
 11 |     
 12 |     def __init__(
 13 |         self,
 14 |         fisher_variant="hard", 
 15 |         **kwargs
 16 |     ):
 17 |         super().__init__(**kwargs)
 18 |         self.fisher_variant = fisher_variant  
 19 | 
 20 |     def compute_loss(self, model, inputs, num_items_in_batch, return_outputs=False):
 21 |         outputs = model(
 22 |             input_ids=inputs["input_ids"],
 23 |             attention_mask=inputs["attention_mask"],
 24 |             return_dict=True
 25 |         )
 26 | 
 27 |         logits = outputs.logits[:, -1, :]  # (batch_size, vocab_size), check last token
 28 | 
 29 |         if self.fisher_variant == "hard":
 30 |             log_probs = F.log_softmax(logits, dim=-1)
 31 |             _, target_labels = logits.max(dim=-1)
 32 |             loss = F.nll_loss(log_probs, target_labels)
 33 | 
 34 |         elif self.fisher_variant == "soft":
 35 |             probs = torch.softmax(logits, dim=-1).detach() 
 36 |             log_probs = torch.log_softmax(logits, dim=-1)
 37 | 
 38 |             vocab_size = probs.size(-1)
 39 |             nll_losses = []
 40 |             for label_id in range(vocab_size):
 41 |                 targets = torch.full(
 42 |                     (probs.size(0),), label_id,
 43 |                     dtype=torch.long, device=probs.device
 44 |                 )
 45 |                 nll_loss_per_label = F.nll_loss(
 46 |                     log_probs, targets, reduction="none"
 47 |                 )
 48 |                 nll_losses.append(nll_loss_per_label)
 49 | 
 50 |             nll_losses = torch.stack(nll_losses, dim=-1)
 51 |             weighted_nll_losses = probs * nll_losses
 52 |             loss = weighted_nll_losses.sum(dim=-1).mean()
 53 | 
 54 |         else:
 55 |             loss = outputs.loss  
 56 |         return (loss, outputs) if return_outputs else loss
 57 |     
 58 | 
 59 | def save_tensor_dict(tensor_dict, path):
 60 |     os.makedirs(path, exist_ok=True)
 61 |     for key, tensor in tensor_dict.items():
 62 |         filename = os.path.join(path, key + ".pt")
 63 |         torch.save(tensor, filename)
 64 |     return
 65 | 
 66 | 
 67 | def cleanup_task_loader(task_loader):
 68 |     """
 69 |     Safely clean up task_loader and its nested trainer to reduce CPU and GPU memory usage.
 70 |     """
 71 |     trainer = getattr(task_loader, 'trainer', None)
 72 | 
 73 |     if trainer is not None:
 74 |         for attr in [
 75 |             'model', 'processing_class', 'train_dataset', 'eval_dataset',
 76 |             'callback_handler', 'args', 'data_collator',
 77 |             'train_dataloader', 'eval_dataloader',
 78 |             'optimizer', 'lr_scheduler',
 79 |         ]:
 80 |             if hasattr(trainer, attr):
 81 |                 try:
 82 |                     setattr(trainer, attr, None)
 83 |                 except Exception as e:
 84 |                     print(f"Warning: couldn't clear trainer.{attr}: {e}")
 85 |         try:
 86 |             del trainer
 87 |         except Exception as e:
 88 |             print(f"Warning: couldn't delete trainer: {e}")
 89 | 
 90 |     for attr in ['training_dataset', 'training_args']:
 91 |         if hasattr(task_loader, attr):
 92 |             try:
 93 |                 setattr(task_loader, attr, None)
 94 |             except Exception as e:
 95 |                 print(f"Warning: couldn't clear task_loader.{attr}: {e}")
 96 | 
 97 |     try:
 98 |         del task_loader
 99 |     except Exception as e:
100 |         print(f"Warning: couldn't delete task_loader: {e}")
101 | 
102 |     gc.collect()
103 |     torch.cuda.empty_cache()
104 | 
105 | 
def get_expected_fisher_keys(model):
    """Return the set of names of trainable parameters outside ``lm_head``.

    A parameter is included when ``requires_grad`` is true and its name does
    not contain the substring ``"lm_head"``.
    """
    expected = set()
    for name, param in model.named_parameters():
        if "lm_head" in name or not param.requires_grad:
            continue
        expected.add(name)
    return expected
112 | 
def is_tensor_dict_complete(path, keys):
    """Check that ``<path>/<key>.pt`` exists and loads for every key.

    Returns ``False`` (after printing the reason) as soon as the directory or
    one file is missing, or a file cannot be read by ``torch.load``; returns
    ``True`` when every key passes.
    """
    if not os.path.exists(path):
        print(f'{path} doesn\'t exist')
        return False

    for file_path in (os.path.join(path, k + ".pt") for k in keys):
        if not os.path.exists(file_path):
            print(f'{file_path} doesn\'t exist')
            return False
        try:
            # The loaded value is discarded; loading is only an integrity check.
            torch.load(file_path, map_location="cpu")
        except Exception:
            print(f'{file_path} is corrupted')
            return False  # File exists but is corrupted or unreadable
    return True


--------------------------------------------------------------------------------
/merging/merging_methods/regmean_utils.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import re
  3 | import os
  4 | from tqdm import tqdm
  5 | import torch
  6 | from torch import nn
  7 | import gc
  8 | 
  9 | # https://github.com/bloomberg/dataless-model-merging/blob/main/regmean_demo.ipynb
 10 | 
 11 | def filter_modules_by_regex(base_module, include_patterns, include_type):
 12 |     modules = {}
 13 |     for name, module in base_module.named_modules():
 14 |         valid_name = not include_patterns or any(
 15 |             [re.match(patt, name) for patt in include_patterns]
 16 |         )
 17 |         valid_type = not include_type or any(
 18 |             [isinstance(module, md_cls) for md_cls in include_type]
 19 |         )
 20 |         if valid_type and valid_name:
 21 |             modules[name] = module
 22 |     return modules
 23 | 
 24 | 
 25 | def send_inputs_to_device(inputs, device):
 26 |     for k, v in inputs.items():
 27 |         if isinstance(v, torch.Tensor): inputs[k] = v.to(device)  # for 'hidden_states', 'attention_mask', 'position_ids', 'cache_position', 'past_key_values'
 28 |         elif isinstance(v, tuple): inputs[k] = tuple([vv.to(device) for vv in v])  # for 'position_embeddings'
 29 | 
 30 |     return inputs
 31 | 
 32 | 
 33 | def compute_grams(trainer, finetuned_model, train_dataloader):
 34 |     covs = {}
 35 |     xn = {}
 36 | 
 37 |     def get_grams(name):
 38 |         def hook(module, input, output):
 39 |             """
 40 |             Note: adhere to signature of hook functions
 41 |             """
 42 |             x = input[0].detach()  # $[b,t,h]
 43 |             x = x.view(-1, x.size(-1))
 44 |             xtx = torch.matmul(x.transpose(0, 1), x)  # [h,h]
 45 |             if name not in covs:
 46 |                 covs[name] = xtx / x.size(0)
 47 |                 xn[name] = x.size(0)
 48 |             else:
 49 |                 covs[name] = (covs[name] * xn[name] + xtx) / (x.size(0) + xn[name])
 50 |                 xn[name] += x.size(0)
 51 | 
 52 |         return hook
 53 | 
 54 |     device = "cpu"
 55 |     if trainer is not None:
 56 |         device = trainer.args.device
 57 |     elif torch.cuda.is_available():
 58 |         device = "cuda:0"
 59 |     model = finetuned_model.to(device)
 60 |     linear_modules = filter_modules_by_regex(
 61 |         model, None, [nn.Linear]
 62 |     )
 63 |     handles = []
 64 |     for name, module in linear_modules.items():
 65 |         handle = module.register_forward_hook(get_grams(name))
 66 |         handles.append(handle)
 67 | 
 68 |     total = len(train_dataloader)
 69 |     for inputs in tqdm(
 70 |         train_dataloader, total=total, desc="Computing gram matrix",
 71 |         disable = type(train_dataloader) == list
 72 |     ):
 73 |         if type(train_dataloader) == list:
 74 |             # For RegMeanPlusPlus
 75 |             inputs = send_inputs_to_device(inputs, device)
 76 |             _ = model(**inputs)
 77 |             inputs = send_inputs_to_device(inputs, "cpu")
 78 |         else:
 79 |             # For RegMean
 80 |             inputs = trainer._prepare_inputs(inputs)
 81 |             _ = model(**inputs)
 82 | 
 83 |     for handle in handles:
 84 |         handle.remove()
 85 | 
 86 |     return covs
 87 | 
 88 | def reduce_non_diag(cov_mat, a):
 89 |     diag_weight = torch.diag(torch.ones(cov_mat.size(0), dtype=cov_mat.dtype) - a).to(cov_mat.device)
 90 |     non_diag_weight = torch.zeros_like(diag_weight).fill_(a)
 91 |     weight = diag_weight + non_diag_weight
 92 |     return cov_mat * weight
 93 | 
 94 | def save_tensor_dict(tensor_dict, path):
 95 |     os.makedirs(path, exist_ok=True)
 96 |     for key, tensor in tensor_dict.items():
 97 |         torch.save(tensor, os.path.join(path, key + ".pt"))
 98 | 
 99 | def cleanup_task_loader(task_loader):
100 |     """
101 |     Safely clean up task_loader and its nested trainer to reduce CPU memory usage.
102 |     """
103 |     trainer = getattr(task_loader, 'trainer', None)
104 | 
105 |     if trainer is not None:
106 |         for attr in [
107 |             'model', 'processing_class', 'train_dataset', 'eval_dataset',
108 |             'callback_handler', 'args', 'data_collator',
109 |             'train_dataloader', 'eval_dataloader',
110 |             'optimizer', 'lr_scheduler',
111 |         ]:
112 |             if hasattr(trainer, attr):
113 |                 try:
114 |                     setattr(trainer, attr, None)
115 |                 except Exception as e:
116 |                     print(f"Warning: couldn't clear trainer.{attr}: {e}")
117 |         del trainer
118 | 
119 |     for attr in ['training_dataset', 'training_args']:
120 |         if hasattr(task_loader, attr):
121 |             try:
122 |                 setattr(task_loader, attr, None)
123 |             except Exception as e:
124 |                 print(f"Warning: couldn't clear task_loader.{attr}: {e}")
125 |     del task_loader
126 | 
127 |     gc.collect()


--------------------------------------------------------------------------------
/merging/merging_methods/regmean.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import gc
  3 | import shutil
  4 | 
  5 | from tqdm import tqdm
  6 | 
  7 | import torch
  8 | 
  9 | from merging_methods.utils import *
 10 | from merging_methods.merger import Merger
 11 | 
 12 | import sys
 13 | sys.path.append('<YOUR PATH HERE>/MergeBench/merging')
 14 | from .regmean_utils import compute_grams, save_tensor_dict, reduce_non_diag, cleanup_task_loader
 15 | from taskloader import *
 16 | 
 17 | # https://github.com/bloomberg/dataless-model-merging/blob/main/regmean_demo.ipynb
 18 | 
 19 | class RegMean(Merger):
 20 |     def __init__(self, base_model, ft_models, save_path):
 21 |         super().__init__(base_model, ft_models, save_path)
 22 |     
 23 |     def merge(self, **kwargs):
 24 |         reduction = kwargs["reduction"]
 25 | 
 26 |         exam_datasets = kwargs["task_names"].split("-")
 27 | 
 28 |         save_dir = self.save_path
 29 |         gram_dir = os.path.join(save_dir, "regmean")
 30 |         param_dir = os.path.join(save_dir, "params")
 31 | 
 32 |         all_param_names = set()
 33 |         model_params = self.base_model.state_dict()
 34 |         all_param_names.update(model_params.keys())
 35 |         
 36 |         gram_dirs = [os.path.join(gram_dir, dataset_name) for dataset_name in exam_datasets]
 37 |         param_dirs = [os.path.join(param_dir, dataset_name) for dataset_name in exam_datasets]
 38 | 
 39 |         for idx, dataset_name in enumerate(exam_datasets):
 40 |             finetuned_model = self.ft_ckpts[idx]
 41 |             task_loader = TaskLoader(dataset_name, self.base_model, self.tokenizer, sample_size=1000)
 42 |             trainer = task_loader.trainer
 43 |             dataloader = trainer.get_train_dataloader()
 44 |             with torch.no_grad():
 45 |                 grams = compute_grams(trainer, finetuned_model, dataloader)
 46 |             save_tensor_dict(grams, os.path.join(gram_dir, dataset_name)) # contains most (linear) params grams
 47 |             save_tensor_dict(finetuned_model.state_dict(), os.path.join(param_dir, dataset_name)) # contains all params
 48 | 
 49 |             finetuned_model.to("cpu")
 50 |             cleanup_task_loader(task_loader)
 51 |             del finetuned_model, grams #, trainer, dataloader, task_loader
 52 |             torch.cuda.empty_cache()
 53 |             gc.collect()
 54 |         self.ft_ckpts = []
 55 |         gc.collect()
 56 | 
 57 | 
 58 |         with torch.no_grad():
 59 |             gram_module_names = {f[:-3] for f in os.listdir(gram_dirs[0]) if f.endswith(".pt")}
 60 |             avg_params = {}
 61 |             for name in tqdm(all_param_names, desc='Merging'):
 62 |                 h_avged = False
 63 |                 if name.endswith('.weight') and not name.startswith('lm_head'):
 64 |                     module_name = name[:-len('.weight')]
 65 |                     if module_name in gram_module_names:
 66 |                         sum_gram, grams = None, None
 67 |                         for model_id in range(len(gram_dirs)):
 68 |                             param_grams = torch.load(os.path.join(gram_dirs[model_id], module_name + ".pt"), map_location='cpu').detach()
 69 |                             param_grams = reduce_non_diag(param_grams, a=reduction) # avoid degeneration
 70 |                             param = torch.load(os.path.join(param_dirs[model_id], name + ".pt"), map_location='cpu').detach()
 71 |                             gram_m_w = torch.matmul(param_grams, param.transpose(0, 1))
 72 |                             if sum_gram is None:
 73 |                                 sum_gram = param_grams.clone()
 74 |                                 sum_gram_m_ws = gram_m_w.clone()
 75 |                             else:
 76 |                                 sum_gram.add_(param_grams)
 77 |                                 sum_gram_m_ws.add_(gram_m_w)
 78 |                             del param_grams, param, gram_m_w
 79 |                             gc.collect()
 80 |                         sum_gram_f32 = sum_gram.to(dtype=torch.float32)
 81 |                         cond_number = torch.linalg.cond(sum_gram_f32)
 82 |                         threshold = 1e8 
 83 |                         if cond_number > threshold or torch.any(torch.diag(sum_gram_f32) == 0):
 84 |                             sum_gram_inv = torch.linalg.pinv(sum_gram_f32).to(dtype=sum_gram_m_ws.dtype)
 85 |                         else:
 86 |                             sum_gram_inv = torch.inverse(sum_gram_f32).to(dtype=sum_gram_m_ws.dtype)
 87 |                         wt = torch.matmul(sum_gram_inv, sum_gram_m_ws)
 88 |                         avg_params[name] = wt.transpose(0, 1)
 89 |                         h_avged = True
 90 |                 
 91 |                 if not h_avged: # if not averaged with regmean, then do simple avg
 92 |                     filtered_model_params = None
 93 |                     for model_id in range(len(gram_dirs)):
 94 |                         if not name.startswith('model.embed') and not name.startswith('lm_head'): # embed_tokens.weight have incompatible dimensions due to vocab size difference
 95 |                             filtered_model_param = torch.load(os.path.join(param_dirs[model_id], name + ".pt"), map_location='cpu').detach()
 96 |                             if filtered_model_params is None:
 97 |                                 filtered_model_params = filtered_model_param.clone()
 98 |                             else:
 99 |                                 filtered_model_params.add_(filtered_model_param)
100 |                             del filtered_model_param
101 |                             gc.collect()
102 |                             avg_params[name] = filtered_model_params.div(len(gram_dirs))
103 |             
104 |         shutil.rmtree(gram_dir)
105 |         shutil.rmtree(param_dir)
106 | 
107 |         incompatible_params = self.base_model.load_state_dict(avg_params, strict=False)
108 |         self.base_model.save_pretrained(save_dir)
109 |         self.tokenizer.save_pretrained(save_dir)
110 | 


--------------------------------------------------------------------------------
/merging/merging_methods/ties_merging_utils.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import os, copy
  3 | import torch
  4 | import matplotlib.pyplot as plt
  5 | import numpy as np
  6 | import re
  7 | from collections import OrderedDict
  8 | import torch.nn.functional as F
  9 | # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 10 | 
 11 | ## Model conversion utils
 12 | def state_dict_to_vector(state_dict, remove_keys=[]):
 13 |     shared_state_dict = copy.deepcopy(state_dict)
 14 |     for key in remove_keys:
 15 |         if key in shared_state_dict:
 16 |             del shared_state_dict[key]
 17 |     sorted_shared_state_dict = OrderedDict(sorted(shared_state_dict.items()))
 18 |     return torch.nn.utils.parameters_to_vector(
 19 |         [value.reshape(-1) for key, value in sorted_shared_state_dict.items()]
 20 |     )
 21 | 
 22 | 
 23 | def add_ptm_to_tv(tv_dict, ptm_dict):
 24 |     assert set(tv_dict.keys()) == set(
 25 |         ptm_dict.keys()
 26 |     ), "Differing parameter names in models."
 27 |     final_dict = copy.deepcopy(tv_dict)
 28 |     for k, v in ptm_dict.items():
 29 |         final_dict[k] = tv_dict[k] + v
 30 |     return final_dict
 31 | 
 32 | 
 33 | def check_parameterNamesMatch(checkpoints):
 34 |     parameter_names = set(checkpoints[0].keys())
 35 | 
 36 |     if len(checkpoints) >= 2:
 37 |         # raise ValueError("Number of models is less than 2.")
 38 |         for checkpoint in checkpoints[1:]:
 39 |             current_parameterNames = set(checkpoint.keys())
 40 |             if current_parameterNames != parameter_names:
 41 |                 raise ValueError(
 42 |                     "Differing parameter names in models. "
 43 |                     f"The different parameters are {parameter_names.symmetric_difference(current_parameterNames)}"
 44 |                 )
 45 | 
 46 | def check_state_dicts_equal(state_dict1, state_dict2):
 47 |     if set(state_dict1.keys()) != set(state_dict2.keys()):
 48 |         return False
 49 | 
 50 |     for key in state_dict1.keys():
 51 |         if not torch.equal(state_dict1[key], state_dict2[key]):
 52 |             return False
 53 | 
 54 |     return True
 55 | 
 56 | 
 57 | 
 58 | ## TIES MERGING UTILS
 59 | 
 60 | def topk_values_mask(M, K=0.7, return_mask=False):
 61 |     if K > 1:
 62 |         K /= 100
 63 | 
 64 |     original_shape = M.shape
 65 |     if M.dim() == 1:
 66 |         M = M.unsqueeze(0)
 67 | 
 68 |     n, d = M.shape
 69 |     k = int(d * K)
 70 |     k = d - k  # Keep top k elements instead of bottom k elements
 71 | 
 72 |     # Find the k-th smallest element by magnitude for each row
 73 |     kth_values, _ = M.abs().kthvalue(k, dim=1, keepdim=True)
 74 |     # Create a mask tensor with True for the top k elements in each row
 75 |     mask = M.abs() >= kth_values
 76 |     final_mask = mask.squeeze() if original_shape == M.squeeze().shape else mask
 77 | 
 78 |     if return_mask:
 79 |         return M * final_mask, final_mask.float().mean(dim=1), final_mask
 80 |     return M * final_mask, final_mask.float().mean(dim=1)
 81 | 
 82 | 
 83 | def resolve_zero_signs(sign_to_mult, method="majority"):
 84 |     majority_sign = torch.sign(sign_to_mult.sum())
 85 | 
 86 |     if method == "majority":
 87 |         sign_to_mult[sign_to_mult == 0] = majority_sign
 88 |     elif method == "minority":
 89 |         sign_to_mult[sign_to_mult == 0] = -1 * majority_sign
 90 |     return sign_to_mult
 91 | 
 92 | 
 93 | def resolve_sign(Tensor):
 94 |     sign_to_mult = torch.sign(Tensor.sum(dim=0))
 95 |     sign_to_mult = resolve_zero_signs(sign_to_mult, "majority")
 96 |     return sign_to_mult
 97 | 
 98 | 
 99 | def disjoint_merge(Tensor, merge_func, sign_to_mult):
100 |     merge_func = merge_func.split("-")[-1]
101 | 
102 |     # If sign is provided then we select the corresponding entries and aggregate.
103 |     if sign_to_mult is not None:
104 |         rows_to_keep = torch.where(
105 |             sign_to_mult.unsqueeze(0) > 0, Tensor > 0, Tensor < 0
106 |         )
107 |         selected_entries = Tensor * rows_to_keep
108 |     # Else we select all non-zero entries and aggregate.
109 |     else:
110 |         rows_to_keep = Tensor != 0
111 |         selected_entries = Tensor * rows_to_keep
112 | 
113 |     if merge_func == "mean":
114 |         non_zero_counts = (selected_entries != 0).sum(dim=0).float()
115 |         disjoint_aggs = torch.sum(selected_entries, dim=0) / torch.clamp(non_zero_counts, min=1)
116 |     elif merge_func == "sum":
117 |         disjoint_aggs = torch.sum(selected_entries, dim=0)
118 |     elif merge_func == "max":
119 |         disjoint_aggs = selected_entries.abs().max(dim=0)[0]
120 |         disjoint_aggs *= sign_to_mult
121 |     else:
122 |         raise ValueError(f"Merge method {merge_func} is not defined.")
123 | 
124 |     return disjoint_aggs
125 | 
126 | 
def ties_merging(
        flat_task_checks,
        reset_thresh=None,
        merge_func="",
):
    """Trim, elect signs and merge a stack of flattened task vectors.

    A copy of ``flat_task_checks`` is trimmed with ``topk_values_mask`` (using
    ``reset_thresh`` as ``K``), one sign per column is chosen with
    ``resolve_sign``, and the trimmed rows are reduced by ``disjoint_merge``
    using ``merge_func``.

    NOTE(review): ``reset_thresh`` is forwarded unchanged, so the ``None``
    default is presumably never used by callers -- confirm.
    """
    trimmed = topk_values_mask(
        flat_task_checks.clone(), K=reset_thresh, return_mask=False
    )[0]

    print("RESOLVING SIGN")
    elected_signs = resolve_sign(trimmed)
    assert elected_signs is not None

    print(f"Disjoint AGGREGATION: {merge_func}")
    return disjoint_merge(trimmed, merge_func, elected_signs)
144 | 
def disjoint_merge_split(Tensor, merge_func, sign_to_mult):
    """Like a sum-only disjoint merge that also returns the kept entries.

    Args:
        Tensor: Stacked values, one row per source, reduced over dim 0.
        merge_func: Name whose last ``-``-separated piece must be ``"sum"``.
        sign_to_mult: Per-column sign. Where it is positive only positive
            entries are kept, otherwise only negative ones. ``None`` keeps
            every non-zero entry.

    Returns:
        ``(selected_entries, disjoint_aggs)``: ``Tensor`` with the rejected
        entries zeroed, and its sum over dim 0.

    Raises:
        ValueError: If the reduction name is anything other than ``"sum"``.
    """
    reducer = merge_func.split("-")[-1]

    if sign_to_mult is None:
        # No sign election: every non-zero entry participates.
        keep = Tensor != 0
    else:
        # Keep entries whose sign matches the elected sign of their column.
        keep = torch.where(
            sign_to_mult.unsqueeze(0) > 0, Tensor > 0, Tensor < 0
        )
    kept = Tensor * keep

    if reducer != "sum":
        raise ValueError(f"Merge method {reducer} is not defined.")

    return kept, torch.sum(kept, dim=0)
165 | 
166 | 
def ties_merging_split(
        flat_task_checks,
        reset_thresh=None,
        merge_func="",
):
    """Run the trim / sign-election / merge pipeline and keep the per-task parts.

    Same steps as the plain pipeline (``topk_values_mask`` on a copy with
    ``reset_thresh`` as ``K``, then ``resolve_sign``), but the final reduction
    goes through ``disjoint_merge_split`` so the sign-filtered per-row entries
    are returned next to the merged vector.

    Returns:
        ``(selected_entries, merged_tv)``.
    """
    trimmed = topk_values_mask(
        flat_task_checks.clone(), K=reset_thresh, return_mask=False
    )[0]

    print("RESOLVING SIGN")
    elected_signs = resolve_sign(trimmed)
    assert elected_signs is not None

    print(f"Disjoint AGGREGATION: {merge_func}")
    kept_entries, merged = disjoint_merge_split(trimmed, merge_func, elected_signs)
    return kept_entries, merged


--------------------------------------------------------------------------------
/merging/merging_methods/fisher.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import gc
  3 | import shutil
  4 | 
  5 | from tqdm import tqdm
  6 | 
  7 | import torch
  8 | import torch.distributed as dist
  9 | 
 10 | from accelerate.state import AcceleratorState, GradientState
 11 | 
 12 | from merging_methods.utils import *
 13 | from merging_methods.merger import Merger
 14 | 
 15 | import sys
 16 | sys.path.append('<YOUR PATH HERE>/MergeBench/merging')
 17 | from .fisher_utils import FisherTrainer, save_tensor_dict, cleanup_task_loader, get_expected_fisher_keys, is_tensor_dict_complete
 18 | from taskloader import *
 19 | 
 20 | from accelerate.utils.deepspeed import DeepSpeedEngineWrapper
 21 | from deepspeed.utils import safe_get_full_grad
 22 | import os
 23 | import gc
 24 | import shutil
 25 | 
 26 | from tqdm import tqdm
 27 | 
 28 | import torch
 29 | import torch.distributed as dist
 30 | 
 31 | from accelerate.state import AcceleratorState, GradientState
 32 | 
 33 | from merging_methods.utils import *
 34 | from merging_methods.merger import Merger
 35 | 
 36 | import sys
 37 | sys.path.append('/home/cindy2000_sh/MergeBench/merging')
 38 | from .fisher_utils import FisherTrainer, save_tensor_dict, cleanup_task_loader, get_expected_fisher_keys, is_tensor_dict_complete
 39 | from taskloader import *
 40 | 
 41 | from accelerate.utils.deepspeed import DeepSpeedEngineWrapper
 42 | from deepspeed.utils import safe_get_full_grad
 43 | 
 44 | # https://github.com/mmatena/model_merging/blob/master/model_merging/fisher.py
 45 | 
class Fisher(Merger):
    """Fisher-weighted merging of fine-tuned checkpoints.

    ``merge`` has two phases, each enabled by its own keyword argument:

    * ``fisher_only``: for every task, accumulate squared gradients per
      parameter while a ``FisherTrainer`` runs, and write the averaged result
      plus the task's parameters to disk.
    * ``merge_only``: read those files back and average each parameter across
      tasks, weighted by the stored squared-gradient statistics.
    """

    def __init__(self, base_model, ft_models, save_path):
        """Initialise via ``Merger`` and move every model to the CPU.

        ``self.base_model`` and ``self.ft_ckpts`` are read right after the
        parent constructor, so they are presumably set there (not shown here).
        """
        super().__init__(base_model, ft_models, save_path)
        self.base_model = self.base_model.to('cpu')
        self.ft_ckpts = [ft_model.to('cpu') for ft_model in self.ft_ckpts]
    
    def merge(self, **kwargs):
        """Compute Fisher statistics and/or perform the Fisher-weighted merge.

        Keyword arguments read (all required):
            fisher_only: Run the statistics-collection phase.
            merge_only: Run the merge phase.
            model_coeff_value: Scalar coefficient applied to every task.
            keep_checkpoints: When false, the intermediate ``fisher`` and
                ``params`` directories are deleted during the merge phase.
            save_group: ``-``-separated task names, zipped with the list of
                fine-tuned checkpoints to build a name -> model mapping.
            task_names: ``-``-separated names of the tasks to process.

        Side effects: writes tensors under ``<save_path>/fisher`` and
        ``<save_path>/params``; replaces ``DeepSpeedEngineWrapper.backward`` at
        class level (it is not restored here); the merge phase loads the
        averaged parameters into ``self.base_model`` and saves model and
        tokenizer to ``save_path``.
        """
        fisher_only = kwargs["fisher_only"]
        merge_only = kwargs["merge_only"]
        model_coeff_value = kwargs["model_coeff_value"]
        keep_checkpoints = kwargs['keep_checkpoints']
        save_group = kwargs['save_group']

        exam_datasets = kwargs["task_names"].split("-")
        all_tasks = save_group.split("-")
        # From here on ``self.ft_ckpts`` is a dict keyed by task name; pairing
        # relies on ``save_group`` listing tasks in checkpoint order.
        self.ft_ckpts = {
            task: model.to('cpu')
            for task, model in zip(all_tasks, self.ft_ckpts)
        }

        save_dir = self.save_path
        fisher_dir = os.path.join(save_dir, "fisher")
        param_dir = os.path.join(save_dir, "params")

        # Parameter names of the base model drive both the completeness check
        # and the merge loop.
        all_param_names = set()
        model_params = self.base_model.state_dict()
        all_param_names.update(model_params.keys())

        # One sub-directory per task, in ``exam_datasets`` order.
        fisher_dirs = [os.path.join(fisher_dir, dataset_name) for dataset_name in exam_datasets]
        param_dirs = [os.path.join(param_dir, dataset_name) for dataset_name in exam_datasets]

        if fisher_only:
            for idx, dataset_name in enumerate(exam_datasets):
                
                # Fresh accumulators for this task; both are captured by the
                # patched backward below.
                fisher = {}
                n_steps_ref = [0] # Mutable n_steps (so it updates across steps)

                def make_patched_backward(fisher_dict, n_steps_ref):
                    """Build a backward replacement that records squared gradients."""
                    def patched_backward(self, loss, **kwargs):
                        """Run backward, add grad**2 per parameter to ``fisher_dict``, then step."""
                        self.engine.backward(loss, **kwargs)
                        with torch.no_grad():
                            for name, param in self.engine.module.named_parameters():
                                if not param.requires_grad:
                                    continue
                                # DeepSpeed helper; presumably returns the full
                                # (un-partitioned) gradient or None -- confirm.
                                grad_ds = safe_get_full_grad(param)
                                if grad_ds is not None:
                                    # Accumulate on the CPU so the running sums
                                    # do not occupy GPU memory.
                                    grad_cpu = grad_ds.detach().cpu()
                                    grad_sq_cpu = grad_cpu ** 2
                                    if name not in fisher_dict:
                                        fisher_dict[name] = grad_sq_cpu
                                    else:
                                        fisher_dict[name] += grad_sq_cpu
                                    del grad_ds, grad_cpu, grad_sq_cpu
                            torch.cuda.empty_cache()
                            # One count per backward call; used later as the
                            # divisor for the accumulated sums.
                            n_steps_ref[0] += 1
                        self.engine.step()
                    return patched_backward

                
                # Class-level monkey patch: affects every DeepSpeedEngineWrapper
                # and is re-applied (with new accumulators) for each task.
                DeepSpeedEngineWrapper.backward = make_patched_backward(fisher, n_steps_ref)
                
                finetuned_model = self.ft_ckpts[dataset_name]
                expected_fisher_keys = get_expected_fisher_keys(finetuned_model)
                param_path = param_dirs[idx]
                fisher_path = fisher_dirs[idx]

                # Resume support: presumably true when every expected key
                # already has a saved tensor under the path -- confirm in
                # fisher_utils.
                fisher_complete = is_tensor_dict_complete(fisher_path, expected_fisher_keys)
                params_complete = is_tensor_dict_complete(param_path, all_param_names)

                # NOTE(review): prints the whole task list, not the current
                # ``dataset_name``.
                print(exam_datasets,"fisher_complete:",fisher_complete,"params_complete:",params_complete)

                if fisher_complete and params_complete:
                    print(f"Skipping {dataset_name} — already processed.")
                    self.ft_ckpts[dataset_name] = finetuned_model.to("cpu")
                    continue

                elif fisher_complete and not params_complete:
                    # Statistics exist already; only the parameters are
                    # (re)written, by rank 0.
                    print(f"Processing {dataset_name}")
                    if dist.get_rank() == 0:
                        save_tensor_dict(finetuned_model.state_dict(), os.path.join(param_dir, dataset_name))
                    continue

                print(f"Processing {dataset_name}")
                if not params_complete:
                    if dist.get_rank() == 0:
                        save_tensor_dict(finetuned_model.state_dict(), os.path.join(param_dir, dataset_name))
                        
                finetuned_model = self.ft_ckpts[dataset_name].to("cuda")

                # TaskLoader builds a trainer for the task; its pieces are
                # reused to construct the FisherTrainer below.
                task_loader = TaskLoader(dataset_name, finetuned_model, self.tokenizer, sample_size=1000)
                sft_trainer = task_loader.trainer

                sft_args = sft_trainer.args
                sft_model = sft_trainer.model
                sft_train_dataset = sft_trainer.train_dataset
                sft_formatting_func = getattr(sft_trainer, "formatting_func", None)

                sft_model.gradient_checkpointing_enable()

                # Reset Accelerate's shared state before creating a second
                # trainer in the same process.
                AcceleratorState._reset_state(True)
                GradientState._reset_state()

                fisher_trainer = FisherTrainer(
                                model=sft_model,
                                args=sft_args,
                                train_dataset=sft_train_dataset,
                                formatting_func=sft_formatting_func,
                            )

                # The patched backward fills ``fisher`` while this runs.
                fisher_trainer.train()
                
                # Turn the accumulated sums into per-step averages.
                for k in fisher:
                    fisher[k] /= n_steps_ref[0]  
                
                if dist.get_rank() == 0:       
                    save_tensor_dict(fisher, os.path.join(fisher_dir, dataset_name))

                # Release GPU memory before moving on to the next task.
                self.ft_ckpts[dataset_name] = self.ft_ckpts[dataset_name].to("cpu")
                cleanup_task_loader(task_loader)
                del fisher_trainer
                gc.collect()
                torch.cuda.empty_cache()

            # The in-memory checkpoints are dropped here; the merge phase
            # below reads parameters from disk only.
            self.ft_ckpts = []
            torch.cuda.empty_cache()
            gc.collect()

        if merge_only:
            # Only one process merges: rank 0, or the sole process when
            # torch.distributed is not initialised.
            if not dist.is_initialized() or dist.get_rank() == 0:
                # https://github.com/uiuctml/MergeBench/blob/main/merging/clip_merging_code/src/main_fisher.py
                model_coeffs = torch.ones(len(exam_datasets)) * model_coeff_value
                avg_params = {}
                # Names with saved statistics are taken from the first task's
                # directory only ("<name>.pt" -> "<name>").
                fisher_module_names = {f[:-3] for f in os.listdir(fisher_dirs[0]) if f.endswith(".pt")}

                for n in tqdm(all_param_names, desc='Merging'):
                    # Parameters without statistics, and anything under
                    # 'model.embed', are skipped and so keep the base model's
                    # values (see the non-strict load below).
                    if n in fisher_module_names and not n.startswith('model.embed'):
                        param_list = []
                        fisher_list = []

                        fisher_list = [torch.load(os.path.join(fisher_dirs[model_id], n + ".pt"), map_location="cpu") for model_id in range(len(fisher_dirs))]
                        param_list = [torch.load(os.path.join(param_dirs[model_id], n + ".pt"), map_location="cpu") for model_id in range(len(fisher_dirs))]

                        params = torch.stack(param_list)  # [N, *]
                        # Small constant keeps the denominator below non-zero.
                        fisher = torch.stack(fisher_list) + 1.0e-10  # [N, *]

                        # Reshape the per-task coefficients so they broadcast
                        # over the parameter dimensions.
                        coeff = model_coeffs.view(-1, *[1 for _ in range(params.dim() - 1)]).to(params.device)
                        fisher = fisher.to(params.device)
                        # Weighted average: sum(p * F * c) / sum(F * c) over tasks.
                        sum_p = (params * fisher * coeff).sum(0)
                        denom = (fisher * coeff).sum(0)
                        avg_p = sum_p / denom

                        avg_params[n] = avg_p.cpu()  

                        del param_list, fisher_list, params, fisher, sum_p, denom, avg_p
                        torch.cuda.empty_cache()
                
                # remove intermediate checkpoints
                if not keep_checkpoints:
                    shutil.rmtree(fisher_dir)
                    shutil.rmtree(param_dir)

                # strict=False: keys absent from ``avg_params`` are left as
                # they are in the base model. The returned report is assigned
                # but not inspected afterwards.
                incompatible_params = self.base_model.load_state_dict(avg_params, strict=False)
                self.base_model.save_pretrained(save_dir)
                self.tokenizer.save_pretrained(save_dir)


--------------------------------------------------------------------------------
/merging/merging_methods/localize_utils.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from tqdm import tqdm
  4 | from merging_methods.utils import get_task_vector, vector_to_state_dict
  5 | from taskloader import formatting_prompts_func
  6 | from trl import SFTTrainer, SFTConfig
  7 | from transformers import AutoModelForCausalLM, TrainerCallback
  8 | from accelerate import dispatch_model
  9 | 
 10 | class Localizer():
 11 |     def __init__(self, trainable_params, pretrained_model, finetuned_model, graft_args, base_model_name):
 12 |         super().__init__()
 13 |         
 14 |         self.params = trainable_params
 15 |         self.pretrained_model = pretrained_model
 16 |         self.finetuned_model = finetuned_model
 17 |         self.graft_args = graft_args
 18 |         self.base_model_name = base_model_name
 19 | 
 20 |         self.pretrained_model.to("cpu")
 21 |         self.finetuned_model.to("cpu")
 22 |         self.finetuned_model.eval()
 23 |         self.pretrained_model.eval()
 24 |         for param in self.pretrained_model.parameters():
 25 |             param.requires_grad = False   
 26 |         for param in self.finetuned_model.parameters():
 27 |             param.requires_grad = False
 28 | 
 29 |         self.task_vector = get_task_vector(self.finetuned_model, self.pretrained_model)
 30 |         self.num_params = len(self.task_vector)
 31 | 
 32 |         # self.create_binary_masks()
 33 |         self.mask = self.create_topk_mask()
 34 | 
 35 | 
 36 |     def reset_model(self):
 37 |         self.model = AutoModelForCausalLM.from_pretrained(self.base_model_name, 
 38 |                                                     torch_dtype="bfloat16", 
 39 |                                                     attn_implementation="flash_attention_2", 
 40 |                                                     device_map='auto')
 41 |         self.device_map = self.model.hf_device_map
 42 | 
 43 | 
 44 |     def create_topk_mask(self):
 45 | 
 46 |         abs_tv = torch.abs(self.task_vector)
 47 |         k = int(self.graft_args['sparsity'] * abs_tv.numel())  # 1% of the total number of elements
 48 | 
 49 |         # Get the k largest values; returns values and their indices
 50 |         values, indices = torch.topk(abs_tv.view(-1), k)
 51 |         threshold = values.min()
 52 | 
 53 |         mask = torch.zeros_like(self.task_vector, requires_grad=False)
 54 |         mask[torch.abs(self.task_vector) >= threshold] = self.graft_args['sigmoid_bias']
 55 |         # print non-zero count in mask
 56 |         print('Initial topk sparsity in my mask: ', torch.nonzero(mask).numel() / self.num_params)
 57 | 
 58 |         mask[torch.abs(self.task_vector) < threshold] = -self.graft_args['sigmoid_bias']
 59 |         # mask[torch.abs(self.task_vector) > threshold] = 1
 60 | 
 61 |         return mask
 62 | 
 63 | 
 64 |     def interpolate_model(self, round_, return_mask=False, train=True):  
 65 | 
 66 |         sigmoid = torch.nn.Sigmoid()
 67 |         frac = sigmoid(self.mask)
 68 |         
 69 |         if round_:
 70 |             frac = torch.round(frac)
 71 |         
 72 |         final_tv = self.task_vector.clone()
 73 |         final_tv = final_tv * frac 
 74 |         self.model = vector_to_state_dict(final_tv, self.pretrained_model, return_dict=False)
 75 |         self.pretrained_model = AutoModelForCausalLM.from_pretrained(self.base_model_name, 
 76 |                                                     torch_dtype="bfloat16", 
 77 |                                                     attn_implementation="flash_attention_2")
 78 |         
 79 |         if train:
 80 |             self.model = dispatch_model(self.model, device_map=self.device_map)
 81 | 
 82 |         if round_:
 83 |             proportion = len(torch.nonzero(frac.bool())) / self.num_params
 84 |             print('Proportion in my mask: ', proportion)
 85 |         
 86 |         if return_mask:
 87 |             return frac, proportion
 88 | 
 89 | 
 90 |     def train_mask(self, dataset, format_keys):
 91 |         
 92 |         sigmoid = torch.nn.Sigmoid()
 93 |         
 94 |         # Create the interpolated model with the current mask
 95 |         self.reset_model()
 96 | 
 97 |         for i in range(self.graft_args['num_train_epochs']):
 98 |             print(f"Training epoch {i+1}")
 99 | 
100 |             self.interpolate_model(round_=False)
101 |             self.model.train()
102 |             for param in self.model.parameters():
103 |                 param.requires_grad = True
104 |             
105 |             training_args = SFTConfig(
106 |                             per_device_train_batch_size=2,  # Minimum batch size
107 |                             packing=True,
108 |                             gradient_checkpointing=True,
109 |                             save_strategy="no",
110 |                             optim="adamw_torch_fused",
111 |                             bf16=True,
112 |                             report_to=None,
113 |                             do_eval=False,
114 |                             num_train_epochs=self.graft_args['num_train_epochs'],
115 |                             output_dir="output",
116 |                             max_seq_length=3072,
117 |                         )
118 |             
119 |             # Create SFTTrainer
120 |             trainer = SFTTrainer(
121 |                 model=self.model,
122 |                 args=training_args,
123 |                 train_dataset=dataset,
124 |                 formatting_func=lambda examples: formatting_prompts_func(
125 |                                     examples, **format_keys
126 |                                 ),
127 |             )
128 |             
129 |             # Define a callback to track gradients during training
130 |             class GradientTrackingCallback(TrainerCallback):
131 |                 def __init__(self):
132 |                     self.accumulated_grads = {}
133 |                     self.num_backward_calls = 0
134 |                 
135 |                 def on_optimizer_step(self, args, state, control, model, **kwargs):
136 |                     self.num_backward_calls += 1
137 |                     for name, param in model.named_parameters():
138 |                         # if 'embed' not in name.lower() and 'lm_head' not in name.lower():
139 |                         if 'embed' not in name.lower():
140 |                             if name not in self.accumulated_grads:
141 |                                 self.accumulated_grads[name] = param.grad.to('cpu').detach().clone()
142 |                             else:
143 |                                 self.accumulated_grads[name] += param.grad.to('cpu').detach().clone()
144 |                     return control
145 |                 
146 |                 def get_total_grads(self):
147 |                     grad_vector = torch.cat([grad.flatten() for k, grad in self.accumulated_grads.items()])
148 |                     return grad_vector
149 | 
150 |             # Convert accumulated gradients dict to a single tensor
151 |             gradient_callback = GradientTrackingCallback()
152 |             trainer.add_callback(gradient_callback)
153 |             
154 |             # Train for one epoch
155 |             trainer.train()
156 | 
157 |             # gradient of the loss with respect to the model
158 |             grad = gradient_callback.get_total_grads()
159 |             grad = grad * self.task_vector
160 | 
161 |             # Reset model for next epoch
162 |             self.reset_model()
163 |             
164 |             # Take the gradient step to update the mask
165 |             with torch.no_grad():
166 |                 # gradient of the model with respect to the mask
167 |                 derivative = sigmoid(self.mask) * (1 - sigmoid(self.mask))
168 |                 reg_term = self.graft_args['l1_strength'] * torch.where(self.mask > 0, derivative, -derivative)
169 |                 grad.to(self.mask.device)
170 |                 # print("total_grad: ", (total_grad * derivative).mean())
171 |                 print(self.graft_args['lr'] * grad * derivative - reg_term)
172 |                 self.mask -= self.graft_args['lr'] * grad * derivative - reg_term
173 |                 print("Gradient step on mask complete")
174 | 
175 |                 cur_mask = self.mask.clone()
176 |                 cur_mask = torch.round(sigmoid(cur_mask))
177 |                 print('Proportion in my mask: ', len(torch.nonzero(cur_mask.bool())) / self.num_params)
178 | 
179 | 
class Stitcher(nn.Module):
    """Combine several masked task vectors into one model.

    With more than one mask, each mask is turned into averaging weights by
    ``get_average_masks`` so that positions selected by several tasks are
    averaged rather than summed.
    """

    def __init__(self, trainable_params, model, pretrained_model, finetuned_models, masks):
        """Store the models and masks and allocate a zero task vector."""
        super().__init__()
        self.params = trainable_params
        self.pretrained_model = pretrained_model
        self.finetuned_models = finetuned_models
        self.model = model

        self.masks = masks
        if len(self.masks) > 1:
            self.masks = self.get_average_masks()
        # Zero tensor shaped like one task vector; interpolate_models adds the
        # weighted task vectors into it.
        self.task_vector = torch.zeros_like(get_task_vector(self.finetuned_models[0], self.pretrained_model))


    def get_average_masks(self):
        """Return, for each mask, weights of ``1 / (number of masks covering the position)``.

        Positions a mask does not select get weight 0. For every position
        selected by at least one mask, the returned weights of the selecting
        masks sum to 1.
        """
        def reciprocal_with_zero(tensor):
            # 1/x with 1/0 mapped to 0 instead of inf.
            mask = tensor == 0
            reciprocal = torch.reciprocal(tensor)
            reciprocal = reciprocal.masked_fill(mask, 0)
            return reciprocal

        output_masks = []
        for i, own_mask in enumerate(self.masks):
            # Start from the mask itself (counts this task once) ...
            coverage = own_mask.clone().detach()
            for j, other_mask in enumerate(self.masks):
                if i == j:
                    continue
                # ... and add one for EVERY other mask overlapping at the same
                # position. The accumulation must happen inside this loop;
                # doing it after the loop counts only the last other mask.
                coverage = coverage + torch.logical_and(own_mask, other_mask)
            output_masks.append(reciprocal_with_zero(coverage))

        return output_masks


    def interpolate_models(self):
        """Add every weighted task vector into ``self.task_vector`` and build the merged model.

        ``self.task_vector`` is accumulated in place, so calling this twice
        adds the contributions twice.
        """
        for finetuned_model, mask in zip(self.finetuned_models, self.masks):
            with torch.no_grad():
                self.task_vector += mask * get_task_vector(finetuned_model, self.pretrained_model)
    
        self.model = vector_to_state_dict(self.task_vector, self.pretrained_model, return_dict=False)
        
        return self.model
224 |                         


--------------------------------------------------------------------------------
/merging/merging_methods/regmean_plusplus.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import gc
  3 | import shutil
  4 | from collections import defaultdict
  5 | 
  6 | from tqdm import tqdm
  7 | 
  8 | import torch
  9 | from torch import nn
 10 | 
 11 | from merging_methods.utils import *
 12 | from merging_methods.merger import Merger
 13 | 
 14 | import sys
 15 | sys.path.append('<YOUR PATH HERE>/MergeBench/merging')
 16 | from .regmean_utils import *
 17 | from taskloader import *
 18 | 
 19 | 
 20 | ARCHITECTURE_MODULE_MAP = {
 21 |     "LlamaForCausalLM": {
 22 |         "transformer_layers_string": "model.layers",
 23 |         "embedding_layer_string": "model.embed_tokens",
 24 |         "lm_head_string": "lm_head",
 25 |     },
 26 |     "Gemma2ForCausalLM": {
 27 |         "transformer_layers_string": "model.layers",
 28 |         "embedding_layer_string": "model.embed_tokens",
 29 |         "lm_head_string": "lm_head",
 30 |     }
 31 | }
 32 | 
 33 | 
 34 | class RegMeanPlusPlus(Merger):
 35 |     def __init__(self, base_model, ft_models, save_path):
 36 |         super().__init__(base_model, ft_models, save_path)
 37 |         self.merged_model = self.base_model
 38 |         self.merged_model.config.use_cache = False  # Don't need to store past key values as we are not generating text
 39 |         self.number_of_layers = self.merged_model.config.num_hidden_layers
 40 | 
 41 |         self.num_finetuned_models = len(self.ft_ckpts)
 42 | 
 43 |         module_map = ARCHITECTURE_MODULE_MAP[self.base_model.config.architectures[0]]
 44 |         self.transformer_layers_string = module_map["transformer_layers_string"]
 45 |         self.embedding_layer_string = module_map["embedding_layer_string"]
 46 |         self.lm_head_string = module_map["lm_head_string"]
 47 | 
 48 |         self.post_init()
 49 |         
 50 |     def post_init(self):
 51 |         # Init the merged model by weight averaging for weights other than the transformer layers, embedding layers, and LM head
 52 |         merged_model_param_names = self.merged_model.state_dict().keys()
 53 |         merged_model_param_names = [name for name in merged_model_param_names if not (name.startswith(self.transformer_layers_string) or name.startswith(self.embedding_layer_string) or name.startswith(self.lm_head_string))]
 54 |         
 55 |         for name in tqdm(merged_model_param_names, desc='Init the merged model by weight averaging'):
 56 |             merged_param = torch.mean(torch.stack([self.ft_ckpts[i].state_dict()[name] for i in range(self.num_finetuned_models)]), dim=0)
 57 |             self.merged_model.state_dict()[name].copy_(merged_param)
 58 |         
 59 |     def get_first_layer_input(self, model, trainer, dataloader):
 60 |         first_layer_input_batch = []
 61 | 
 62 |         # https://docs.pytorch.org/docs/stable/generated/torch.nn.Module.html
 63 |         def hook(module, args, kwargs, output):
 64 |             for k, v in kwargs.items():
 65 |                 if isinstance(v, torch.Tensor): kwargs[k] = v.detach().cpu()  # for 'hidden_states' (if it's explicitly passed with keyword arguments), 'attention_mask', 'position_ids', 'cache_position'
 66 |                 elif isinstance(v, tuple): kwargs[k] = tuple([vv.detach().cpu() for vv in v])  # for 'position_embeddings'
 67 |                 else: kwargs[k] = v  # for 'past_key_values'
 68 |             
 69 |             if len(args) > 0:
 70 |                 first_layer_input_batch.append({"hidden_states": args[0].detach().cpu(), **kwargs})
 71 |             else:
 72 |                 first_layer_input_batch.append(kwargs)
 73 | 
 74 |         model.to(trainer.args.device)
 75 |         model.config.num_hidden_layers = 1
 76 | 
 77 |         handle = eval(f"model.{self.transformer_layers_string}[0]").register_forward_hook(hook, with_kwargs=True)
 78 | 
 79 |         total = len(dataloader)
 80 |         for inputs in tqdm(dataloader, total=total, desc="Get (merged) model's 1st layer input"):
 81 |             inputs = trainer._prepare_inputs(inputs)
 82 |             _ = model(**inputs)
 83 | 
 84 |         handle.remove()
 85 |         
 86 |         model.to("cpu")
 87 |         model.config.num_hidden_layers = self.number_of_layers
 88 | 
 89 |         return first_layer_input_batch
 90 | 
 91 |     def forward_layer(self, layer, task_input):
 92 |         device = "cuda:0" if torch.cuda.is_available() else "cpu"
 93 |         layer.to(device)
 94 | 
 95 |         for batch_idx, inputs in enumerate(task_input):
 96 |             inputs = send_inputs_to_device(inputs, device)
 97 |             hidden_states = layer(**inputs)
 98 |             if type(hidden_states) == tuple:
 99 |                 hidden_states = hidden_states[0]
100 |             task_input[batch_idx]['hidden_states'] = hidden_states.detach().cpu()
101 | 
102 |             inputs = send_inputs_to_device(inputs, "cpu")
103 | 
104 |         layer.to("cpu")
105 | 
106 |         return task_input
107 |     
108 |     def merge(self, **kwargs):
109 |         reduction = kwargs["reduction"]
110 | 
111 |         exam_datasets = kwargs["task_names"].split("-")
112 | 
113 |         save_dir = self.save_path
114 |         gram_dir = os.path.join(save_dir, "regmeanplusplus")
115 |         param_dir = os.path.join(save_dir, "params")
116 | 
117 |         gram_dirs = [os.path.join(gram_dir, dataset_name) for dataset_name in exam_datasets]
118 |         param_dirs = [os.path.join(param_dir, dataset_name) for dataset_name in exam_datasets]
119 | 
120 |         # 1. Compute inputs for 1st layer
121 |         task_inputs = defaultdict(list)
122 |         for idx, dataset_name in enumerate(exam_datasets):
123 |             task_loader = TaskLoader(dataset_name, self.merged_model, self.tokenizer)
124 |             trainer = task_loader.trainer
125 |             dataloader = trainer.get_train_dataloader()
126 |             with torch.no_grad():
127 |                 task_inputs[dataset_name] = self.get_first_layer_input(
128 |                     self.merged_model, 
129 |                     trainer, 
130 |                     dataloader
131 |                 )
132 | 
133 |             cleanup_task_loader(task_loader)
134 |             torch.cuda.empty_cache()
135 |             gc.collect()
136 |         gc.collect()
137 | 
138 |         # 2. Merge each layer
139 |         for layer_idx in tqdm(range(self.number_of_layers), desc="Merging layers"):
140 |             # 2.1. Compute grams for each finetuned model
141 |             for idx, dataset_name in enumerate(exam_datasets):
142 |                 finetuned_layer = eval(f"self.ft_ckpts[{idx}].{self.transformer_layers_string}")[layer_idx]
143 |                 with torch.no_grad():
144 |                     grams = compute_grams(None, finetuned_layer, task_inputs[dataset_name])
145 |                 save_tensor_dict(grams, os.path.join(gram_dir, dataset_name)) # contains most (linear) params grams
146 |                 save_tensor_dict(finetuned_layer.state_dict(), os.path.join(param_dir, dataset_name)) # contains all params
147 | 
148 |                 finetuned_layer.to("cpu")
149 |                 del finetuned_layer, grams
150 |                 torch.cuda.empty_cache()
151 |                 gc.collect()
152 |             gc.collect()
153 | 
154 |             layer_param_names = eval(f"self.merged_model.{self.transformer_layers_string}")[layer_idx].state_dict().keys()
155 | 
156 |             # 2.2. Merge parameters for this layer
157 |             with torch.no_grad():
158 |                 gram_module_names = {f[:-3] for f in os.listdir(gram_dirs[0]) if f.endswith(".pt")}
159 |                 avg_params = {}
160 |                 for name in layer_param_names:
161 |                     h_avged = False
162 |                     if name.endswith('.weight') and not name.startswith('lm_head'):
163 |                         module_name = name[:-len('.weight')]
164 |                         if module_name in gram_module_names:
165 |                             sum_gram, grams = None, None
166 |                             for model_id in range(len(gram_dirs)):
167 |                                 param_grams = torch.load(os.path.join(gram_dirs[model_id], module_name + ".pt"), map_location='cpu').detach()
168 |                                 param_grams = reduce_non_diag(param_grams, a=reduction) # avoid degeneration
169 |                                 param = torch.load(os.path.join(param_dirs[model_id], name + ".pt"), map_location='cpu').detach()
170 |                                 gram_m_w = torch.matmul(param_grams, param.transpose(0, 1))
171 |                                 if sum_gram is None:
172 |                                     sum_gram = param_grams.clone()
173 |                                     sum_gram_m_ws = gram_m_w.clone()
174 |                                 else:
175 |                                     sum_gram.add_(param_grams)
176 |                                     sum_gram_m_ws.add_(gram_m_w)
177 |                                 del param_grams, param, gram_m_w
178 |                                 gc.collect()
179 |                             sum_gram_f32 = sum_gram.to(dtype=torch.float32)
180 |                             cond_number = torch.linalg.cond(sum_gram_f32)
181 |                             threshold = 1e8 
182 |                             if cond_number > threshold or torch.any(torch.diag(sum_gram_f32) == 0):
183 |                                 sum_gram_inv = torch.linalg.pinv(sum_gram_f32).to(dtype=sum_gram_m_ws.dtype)
184 |                             else:
185 |                                 sum_gram_inv = torch.inverse(sum_gram_f32).to(dtype=sum_gram_m_ws.dtype)
186 |                             wt = torch.matmul(sum_gram_inv, sum_gram_m_ws)
187 |                             avg_params[name] = wt.transpose(0, 1)
188 |                             h_avged = True
189 |                     
190 |                     if not h_avged: # if not averaged with regmean, then do simple avg
191 |                         filtered_model_params = None
192 |                         for model_id in range(len(gram_dirs)):
193 |                             if not name.startswith('model.embed') and not name.startswith('lm_head'): # embed_tokens.weight have incompatible dimensions due to vocab size difference
194 |                                 filtered_model_param = torch.load(os.path.join(param_dirs[model_id], name + ".pt"), map_location='cpu').detach()
195 |                                 if filtered_model_params is None:
196 |                                     filtered_model_params = filtered_model_param.clone()
197 |                                 else:
198 |                                     filtered_model_params.add_(filtered_model_param)
199 |                                 del filtered_model_param
200 |                                 gc.collect()
201 |                                 avg_params[name] = filtered_model_params.div(len(gram_dirs))
202 | 
203 |                 eval(f"self.merged_model.{self.transformer_layers_string}")[layer_idx].load_state_dict(avg_params, strict=False)
204 |                 avg_params = {}
205 |                 del avg_params
206 |                 
207 |             shutil.rmtree(gram_dir)
208 |             shutil.rmtree(param_dir)
209 | 
210 |             # 2.3. Compute inputs for next layer
211 |             if layer_idx == self.number_of_layers - 1:
212 |                 task_inputs = {}
213 |                 del task_inputs
214 |                 continue
215 |             
216 |             # # May be just need to update 'hidden_states' for task_inputs[dataset_name]
217 |             # # Check 'past_key_values'
218 |             for idx, dataset_name in enumerate(exam_datasets):
219 |                 with torch.no_grad():
220 |                     task_inputs[dataset_name] = self.forward_layer(
221 |                         eval(f"self.merged_model.{self.transformer_layers_string}")[layer_idx], 
222 |                         task_inputs[dataset_name]
223 |                     )
224 |                 torch.cuda.empty_cache()
225 |                 gc.collect()
226 |             
227 |             gc.collect()
228 | 
229 |         self.merged_model.save_pretrained(save_dir)
230 |         self.tokenizer.save_pretrained(save_dir)
231 | 


--------------------------------------------------------------------------------
/merging/taskloader.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | from datasets import load_dataset
  4 | from trl import SFTConfig, SFTTrainer
  5 | 
  6 | cache_dir = os.getenv('HF_HOME', '/data/huggingface')
  7 | 
  8 | def formatting_prompts_func(examples, instruction_key='instruction', input_key='input', output_key='output'):
  9 |     # alpaca style prompts
 10 |     # also works for gpteacher because gpteacher inherits alpaca prompt
 11 |     # https://github.com/huggingface/trl/pull/444#issue-1760952763
 12 |     instruction = examples[instruction_key]
 13 |     if 'input' in examples:
 14 |         input_text = examples[input_key]
 15 |     else:
 16 |         input_text = ''
 17 |     response = examples[output_key]
 18 | 
 19 |     if len(input_text) > 0:
 20 |         text = f'''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 21 |         
 22 |         ### Instruction:
 23 |         {instruction}
 24 |         
 25 |         ### Input:
 26 |         {input_text}
 27 |         
 28 |         ### Response:
 29 |         {response}
 30 |         '''
 31 |     else:
 32 |         text = f'''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 33 |         
 34 |         ### Instruction:
 35 |         {instruction}
 36 |         
 37 |         ### Response:
 38 |         {response}
 39 |         '''
 40 | 
 41 |     return text
 42 | 
 43 | 
 44 | class TaskLoader:
 45 |     def __new__(cls, task_name, *args, **kwargs):
 46 |         if task_name in globals() and issubclass(globals()[task_name], cls):
 47 |             subclass = globals()[task_name]  
 48 |             return super().__new__(subclass)  
 49 |         else:
 50 |             raise ValueError(f"Invalid task name: {task_name}")
 51 |         
 52 |     def __init__(self, task_name, *args, **kwargs):
 53 |         self.task_name = task_name
 54 | 
 55 | 
 56 | class WildguardMix(TaskLoader):
 57 |     def __init__(self, task_name, model, tokenizer, sample_size=None):
 58 |         super().__init__(task_name, model, tokenizer, sample_size=sample_size)
 59 | 
 60 |         self.training_args = SFTConfig(
 61 |                             learning_rate=1e-5,
 62 |                             num_train_epochs=1,
 63 |                             lr_scheduler_type='cosine',
 64 |                             optim="adamw_torch",
 65 |                             bf16=True,
 66 |                             dataset_num_proc=48,
 67 |                             packing=False,
 68 |                             max_length=2048, # 4096
 69 |                             gradient_checkpointing=True,
 70 |                             per_device_train_batch_size=1,
 71 |                             # deepspeed='/home/cindy2000_sh/MergeBench/deepspeed_configs/zero3.json',
 72 |                             output_dir="./tmp",
 73 |                             save_strategy='no',
 74 |                         ) 
 75 | 
 76 |         self.training_dataset = load_dataset('MergeBench/safety_val',cache_dir=cache_dir)
 77 |         self.training_dataset = self.training_dataset.rename_column("prompt", "query")
 78 |         
 79 |         if sample_size is None:
 80 |             self.training_dataset = self.training_dataset["train"]
 81 |         else:
 82 |             self.training_dataset = self.training_dataset["train"].shuffle(seed=42).select(range(sample_size))                     
 83 |         self.trainer = SFTTrainer(model=model,
 84 |                                   args=self.training_args,
 85 |                                   train_dataset=self.training_dataset,  
 86 |                                   formatting_func=lambda examples: formatting_prompts_func(
 87 |                                     examples, instruction_key="query", output_key="response"
 88 |                                 ),
 89 |                             )
 90 | 
 91 | 
 92 | class MagiCoder(TaskLoader):
 93 |     def __init__(self, task_name, model, tokenizer, sample_size=None):
 94 |         super().__init__(task_name, model, tokenizer, sample_size=sample_size)
 95 | 
 96 |         self.training_args = SFTConfig(
 97 |                             learning_rate=1e-5,
 98 |                             num_train_epochs=1,
 99 |                             lr_scheduler_type='cosine',
100 |                             optim="adamw_torch",
101 |                             bf16=True,
102 |                             dataset_num_proc=48,
103 |                             packing=False,
104 |                             max_length=2048, # 4096
105 |                             gradient_checkpointing=True,
106 |                             per_device_train_batch_size=1,
107 |                             # deepspeed='/home/cindy2000_sh/MergeBench/deepspeed_configs/zero3.json',
108 |                             output_dir="./tmp",
109 |                             save_strategy='no',
110 |                         ) 
111 | 
112 |         self.training_dataset = load_dataset('MergeBench/coding_val',cache_dir=cache_dir)
113 | 
114 |         if sample_size is None:
115 |             self.training_dataset = self.training_dataset["train"]
116 |         else:
117 |             self.training_dataset = self.training_dataset["train"].shuffle(seed=42).select(range(sample_size))                
118 |         self.trainer = SFTTrainer(model=model,
119 |                                   args=self.training_args,
120 |                                   train_dataset=self.training_dataset,  
121 |                                   formatting_func=lambda examples: formatting_prompts_func(
122 |                                     examples, output_key="response"
123 |                                 ),
124 |                             )
125 | 
126 | 
class Aya(TaskLoader):
    """Task loader backed by the ``MergeBench/multilingual_val`` dataset."""

    # TODO: match with Yuzheng's config
    def __init__(self, task_name, model, tokenizer, sample_size=None):
        """Build ``training_args``, ``training_dataset`` and ``trainer``.

        Args:
            task_name: Forwarded to the base initializer.
            model: Model given to ``SFTTrainer``.
            tokenizer: Only forwarded to the base initializer; this class does
                not hand it to ``SFTTrainer``.
            sample_size: ``None`` keeps the whole train split; otherwise the
                split is shuffled with seed 42 and the first ``sample_size``
                rows are kept.
        """
        super().__init__(task_name, model, tokenizer, sample_size=sample_size)

        sft_options = {
            "learning_rate": 2e-5,
            "num_train_epochs": 1,
            "lr_scheduler_type": 'cosine',
            "optim": "adamw_torch",
            "bf16": True,
            "dataset_num_proc": 48,
            "packing": False,
            "max_length": 2048,
            "gradient_checkpointing": True,
            "per_device_train_batch_size": 1,
            "output_dir": "./tmp",
            "save_strategy": 'no',
        }
        self.training_args = SFTConfig(**sft_options)

        train_split = load_dataset('MergeBench/multilingual_val', cache_dir=cache_dir)["train"]
        if sample_size is not None:
            train_split = train_split.shuffle(seed=42).select(range(sample_size))
        self.training_dataset = train_split

        def _to_prompt(example):
            return formatting_prompts_func(
                example, instruction_key="inputs", output_key="targets"
            )

        self.trainer = SFTTrainer(
            model=model,
            args=self.training_args,
            train_dataset=self.training_dataset,
            formatting_func=_to_prompt,
        )
160 |         
161 | 
class DartMath(TaskLoader):
    """Task loader backed by the ``MergeBench/math_val`` dataset."""

    def __init__(self, task_name, model, tokenizer, sample_size=None):
        """Build ``training_args``, ``training_dataset`` and ``trainer``.

        Args:
            task_name: Forwarded to the base initializer.
            model: Model given to ``SFTTrainer``.
            tokenizer: Only forwarded to the base initializer; this class does
                not hand it to ``SFTTrainer``.
            sample_size: ``None`` keeps the whole train split; otherwise the
                split is shuffled with seed 42 and the first ``sample_size``
                rows are kept.
        """
        super().__init__(task_name, model, tokenizer, sample_size=sample_size)

        sft_options = {
            "learning_rate": 1e-5,
            "num_train_epochs": 1,
            "lr_scheduler_type": 'cosine',
            "optim": "adamw_torch",
            "bf16": True,
            "dataset_num_proc": 48,
            "packing": False,
            "max_length": 2048,
            "gradient_checkpointing": True,
            "per_device_train_batch_size": 1,
            "output_dir": "./tmp",
            "save_strategy": 'no',
        }
        self.training_args = SFTConfig(**sft_options)

        train_split = load_dataset('MergeBench/math_val', cache_dir=cache_dir)["train"]
        if sample_size is not None:
            train_split = train_split.shuffle(seed=42).select(range(sample_size))
        self.training_dataset = train_split

        def _to_prompt(example):
            return formatting_prompts_func(
                example, instruction_key="query", output_key="response"
            )

        self.trainer = SFTTrainer(
            model=model,
            args=self.training_args,
            train_dataset=self.training_dataset,
            formatting_func=_to_prompt,
        )
195 |                             
class Tulu3IF(TaskLoader):
    """Task loader backed by the ``MergeBench/instruction_val`` dataset."""

    def __init__(self, task_name, model, tokenizer, sample_size=None):
        """Build ``training_args``, ``training_dataset`` and ``trainer``.

        Args:
            task_name: Forwarded to the base initializer.
            model: Model given to ``SFTTrainer``.
            tokenizer: Only forwarded to the base initializer; this class does
                not hand it to ``SFTTrainer``.
            sample_size: ``None`` keeps the whole train split; otherwise the
                split is shuffled with seed 42 and the first ``sample_size``
                rows are kept.
        """
        super().__init__(task_name, model, tokenizer, sample_size=sample_size)

        sft_options = {
            "learning_rate": 1e-5,
            "num_train_epochs": 1,
            "lr_scheduler_type": 'cosine',
            "optim": "adamw_torch",
            "bf16": True,
            "dataset_num_proc": 48,
            "packing": False,
            "max_length": 2048,
            "gradient_checkpointing": True,
            "per_device_train_batch_size": 1,
            "output_dir": "./tmp",
            "save_strategy": 'no',
        }
        self.training_args = SFTConfig(**sft_options)

        train_split = load_dataset('MergeBench/instruction_val', cache_dir=cache_dir)['train']
        if sample_size is not None:
            train_split = train_split.shuffle(seed=42).select(range(sample_size))
        self.training_dataset = train_split

        def _to_prompt(example):
            # All three field keys use the formatter's defaults.
            return formatting_prompts_func(example)

        self.trainer = SFTTrainer(
            model=model,
            args=self.training_args,
            train_dataset=self.training_dataset,
            formatting_func=_to_prompt,
        )
231 | 
if __name__ == "__main__":
    # Manual smoke test: load one base checkpoint and build a task loader.
    from transformers import AutoTokenizer, AutoModelForCausalLM

    checkpoint = "meta-llama/Llama-3.2-3B"
    # The tokenizer and the model both use the module-level cache directory,
    # so the two downloads for this checkpoint land in the same place.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=cache_dir)
    model = AutoModelForCausalLM.from_pretrained(checkpoint, cache_dir=cache_dir)

    # Swap the task name to try another loader defined in this module:
    # 'MagiCoder', 'Aya', 'DartMath' or 'Tulu3IF'.
    task_preprocessor = TaskLoader('WildguardMix', model, tokenizer, sample_size=None)
243 | 


--------------------------------------------------------------------------------
/merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1/lm_eval.json:
--------------------------------------------------------------------------------
   1 | {
   2 |   "results": {
   3 |     "arc_de": {
   4 |       "alias": "arc_de",
   5 |       "acc,none": 0.3746792130025663,
   6 |       "acc_stderr,none": 0.01416314857981058,
   7 |       "acc_norm,none": 0.41317365269461076,
   8 |       "acc_norm_stderr,none": 0.01440786699505746
   9 |     },
  10 |     "arc_es": {
  11 |       "alias": "arc_es",
  12 |       "acc,none": 0.43504273504273505,
  13 |       "acc_stderr,none": 0.014499949963905008,
  14 |       "acc_norm,none": 0.4478632478632479,
  15 |       "acc_norm_stderr,none": 0.01454416474185364
  16 |     },
  17 |     "arc_fr": {
  18 |       "alias": "arc_fr",
  19 |       "acc,none": 0.43199315654405473,
  20 |       "acc_stderr,none": 0.014494184864971343,
  21 |       "acc_norm,none": 0.4473909324208725,
  22 |       "acc_norm_stderr,none": 0.014548933904137591
  23 |     },
  24 |     "arc_ru": {
  25 |       "alias": "arc_ru",
  26 |       "acc,none": 0.3652694610778443,
  27 |       "acc_stderr,none": 0.014088993137853638,
  28 |       "acc_norm,none": 0.41146278870829767,
  29 |       "acc_norm_stderr,none": 0.014398950037131911
  30 |     },
  31 |     "gsm8k_cot": {
  32 |       "alias": "gsm8k_cot",
  33 |       "exact_match,strict-match": 0.6580742987111448,
  34 |       "exact_match_stderr,strict-match": 0.013066089625182818,
  35 |       "exact_match,flexible-extract": 0.733131159969674,
  36 |       "exact_match_stderr,flexible-extract": 0.012183780551887959
  37 |     },
  38 |     "hellaswag_de": {
  39 |       "alias": "hellaswag_de",
  40 |       "acc,none": 0.48484201537147736,
  41 |       "acc_stderr,none": 0.005163807946876662,
  42 |       "acc_norm,none": 0.6253202391118702,
  43 |       "acc_norm_stderr,none": 0.005001279196555146
  44 |     },
  45 |     "hellaswag_es": {
  46 |       "alias": "hellaswag_es",
  47 |       "acc,none": 0.5306165991039045,
  48 |       "acc_stderr,none": 0.005154837402482832,
  49 |       "acc_norm,none": 0.6901002773629187,
  50 |       "acc_norm_stderr,none": 0.004776693619654075
  51 |     },
  52 |     "hellaswag_fr": {
  53 |       "alias": "hellaswag_fr",
  54 |       "acc,none": 0.5197044334975369,
  55 |       "acc_stderr,none": 0.005170455686499735,
  56 |       "acc_norm,none": 0.6726279717284215,
  57 |       "acc_norm_stderr,none": 0.0048562894826775825
  58 |     },
  59 |     "hellaswag_ru": {
  60 |       "alias": "hellaswag_ru",
  61 |       "acc,none": 0.47249784296807595,
  62 |       "acc_stderr,none": 0.005184999806361907,
  63 |       "acc_norm,none": 0.6024590163934426,
  64 |       "acc_norm_stderr,none": 0.005082664197928633
  65 |     },
  66 |     "hendrycks_math": {
  67 |       "exact_match,none": 0.0022,
  68 |       "exact_match_stderr,none": 0.0006628553260613068,
  69 |       "alias": "hendrycks_math"
  70 |     },
  71 |     "hendrycks_math_algebra": {
  72 |       "alias": " - hendrycks_math_algebra",
  73 |       "exact_match,none": 0.003369839932603201,
  74 |       "exact_match_stderr,none": 0.0016827876052283514
  75 |     },
  76 |     "hendrycks_math_counting_and_prob": {
  77 |       "alias": " - hendrycks_math_counting_and_prob",
  78 |       "exact_match,none": 0.002109704641350211,
  79 |       "exact_match_stderr,none": 0.0021097046413502104
  80 |     },
  81 |     "hendrycks_math_geometry": {
  82 |       "alias": " - hendrycks_math_geometry",
  83 |       "exact_match,none": 0.0020876826722338203,
  84 |       "exact_match_stderr,none": 0.0020876826722338216
  85 |     },
  86 |     "hendrycks_math_intermediate_algebra": {
  87 |       "alias": " - hendrycks_math_intermediate_algebra",
  88 |       "exact_match,none": 0.0011074197120708748,
  89 |       "exact_match_stderr,none": 0.0011074197120708852
  90 |     },
  91 |     "hendrycks_math_num_theory": {
  92 |       "alias": " - hendrycks_math_num_theory",
  93 |       "exact_match,none": 0.0,
  94 |       "exact_match_stderr,none": 0.0
  95 |     },
  96 |     "hendrycks_math_prealgebra": {
  97 |       "alias": " - hendrycks_math_prealgebra",
  98 |       "exact_match,none": 0.003444316877152698,
  99 |       "exact_match_stderr,none": 0.0019862902400464752
 100 |     },
 101 |     "hendrycks_math_precalc": {
 102 |       "alias": " - hendrycks_math_precalc",
 103 |       "exact_match,none": 0.0018315018315018315,
 104 |       "exact_match_stderr,none": 0.0018315018315018376
 105 |     },
 106 |     "ifeval": {
 107 |       "alias": "ifeval",
 108 |       "prompt_level_strict_acc,none": 0.11090573012939002,
 109 |       "prompt_level_strict_acc_stderr,none": 0.013513069747049506,
 110 |       "inst_level_strict_acc,none": 0.19184652278177458,
 111 |       "inst_level_strict_acc_stderr,none": "N/A",
 112 |       "prompt_level_loose_acc,none": 0.133086876155268,
 113 |       "prompt_level_loose_acc_stderr,none": 0.014617009342904507,
 114 |       "inst_level_loose_acc,none": 0.22182254196642687,
 115 |       "inst_level_loose_acc_stderr,none": "N/A"
 116 |     },
 117 |     "m_mmlu_de": {
 118 |       "alias": "m_mmlu_de",
 119 |       "acc,none": 0.5123698898778096,
 120 |       "acc_stderr,none": 0.0043412463290572224
 121 |     },
 122 |     "m_mmlu_es": {
 123 |       "alias": "m_mmlu_es",
 124 |       "acc,none": 0.5345732713364332,
 125 |       "acc_stderr,none": 0.00431981691610148
 126 |     },
 127 |     "m_mmlu_fr": {
 128 |       "alias": "m_mmlu_fr",
 129 |       "acc,none": 0.5292949354518371,
 130 |       "acc_stderr,none": 0.00436268123167117
 131 |     },
 132 |     "m_mmlu_ru": {
 133 |       "alias": "m_mmlu_ru",
 134 |       "acc,none": 0.4891212424079342,
 135 |       "acc_stderr,none": 0.00438324059821865
 136 |     }
 137 |   },
 138 |   "groups": {
 139 |     "hendrycks_math": {
 140 |       "exact_match,none": 0.0022,
 141 |       "exact_match_stderr,none": 0.0006628553260613068,
 142 |       "alias": "hendrycks_math"
 143 |     }
 144 |   },
 145 |   "group_subtasks": {
 146 |     "arc_de": [],
 147 |     "arc_es": [],
 148 |     "arc_fr": [],
 149 |     "arc_ru": [],
 150 |     "gsm8k_cot": [],
 151 |     "hellaswag_de": [],
 152 |     "hellaswag_es": [],
 153 |     "hellaswag_fr": [],
 154 |     "hellaswag_ru": [],
 155 |     "hendrycks_math": [
 156 |       "hendrycks_math_algebra",
 157 |       "hendrycks_math_counting_and_prob",
 158 |       "hendrycks_math_geometry",
 159 |       "hendrycks_math_intermediate_algebra",
 160 |       "hendrycks_math_num_theory",
 161 |       "hendrycks_math_prealgebra",
 162 |       "hendrycks_math_precalc"
 163 |     ],
 164 |     "ifeval": [],
 165 |     "m_mmlu_de": [],
 166 |     "m_mmlu_es": [],
 167 |     "m_mmlu_fr": [],
 168 |     "m_mmlu_ru": []
 169 |   },
 170 |   "configs": {
 171 |     "arc_de": {
 172 |       "task": "arc_de",
 173 |       "tag": [
 174 |         "arc_multilingual"
 175 |       ],
 176 |       "dataset_path": "alexandrainst/m_arc",
 177 |       "dataset_name": "de",
 178 |       "training_split": "train",
 179 |       "validation_split": "validation",
 180 |       "test_split": "test",
 181 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        # breakpoint()\n        out_doc = {\n            \"id\": doc[\"id\"],\n            \"query\": \"Question: \" + preprocess(doc[\"instruction\"]) + \"\\nAnswer:\",\n            \"choices\": [\n                preprocess(option)\n                for option in [\n                    doc[\"option_a\"],\n                    doc[\"option_b\"],\n                    doc[\"option_c\"],\n                    doc[\"option_d\"],\n                    doc[\"option_e\"],\n                ]\n                if option\n            ],\n            \"gold\": [\"A\", \"B\", \"C\", \"D\", \"E\"].index(doc[\"answer\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 182 |       "doc_to_text": "query",
 183 |       "doc_to_target": "gold",
 184 |       "unsafe_code": false,
 185 |       "doc_to_choice": "choices",
 186 |       "description": "",
 187 |       "target_delimiter": " ",
 188 |       "fewshot_delimiter": "\n\n",
 189 |       "num_fewshot": 0,
 190 |       "metric_list": [
 191 |         {
 192 |           "metric": "acc",
 193 |           "aggregation": "mean",
 194 |           "higher_is_better": true
 195 |         },
 196 |         {
 197 |           "metric": "acc_norm",
 198 |           "aggregation": "mean",
 199 |           "higher_is_better": true
 200 |         }
 201 |       ],
 202 |       "output_type": "multiple_choice",
 203 |       "repeats": 1,
 204 |       "should_decontaminate": true,
 205 |       "doc_to_decontamination_query": "query",
 206 |       "metadata": {
 207 |         "version": 2.0,
 208 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 209 |       }
 210 |     },
 211 |     "arc_es": {
 212 |       "task": "arc_es",
 213 |       "tag": [
 214 |         "arc_multilingual"
 215 |       ],
 216 |       "dataset_path": "alexandrainst/m_arc",
 217 |       "dataset_name": "es",
 218 |       "training_split": "train",
 219 |       "validation_split": "validation",
 220 |       "test_split": "test",
 221 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        # breakpoint()\n        out_doc = {\n            \"id\": doc[\"id\"],\n            \"query\": \"Question: \" + preprocess(doc[\"instruction\"]) + \"\\nAnswer:\",\n            \"choices\": [\n                preprocess(option)\n                for option in [\n                    doc[\"option_a\"],\n                    doc[\"option_b\"],\n                    doc[\"option_c\"],\n                    doc[\"option_d\"],\n                    doc[\"option_e\"],\n                ]\n                if option\n            ],\n            \"gold\": [\"A\", \"B\", \"C\", \"D\", \"E\"].index(doc[\"answer\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 222 |       "doc_to_text": "query",
 223 |       "doc_to_target": "gold",
 224 |       "unsafe_code": false,
 225 |       "doc_to_choice": "choices",
 226 |       "description": "",
 227 |       "target_delimiter": " ",
 228 |       "fewshot_delimiter": "\n\n",
 229 |       "num_fewshot": 0,
 230 |       "metric_list": [
 231 |         {
 232 |           "metric": "acc",
 233 |           "aggregation": "mean",
 234 |           "higher_is_better": true
 235 |         },
 236 |         {
 237 |           "metric": "acc_norm",
 238 |           "aggregation": "mean",
 239 |           "higher_is_better": true
 240 |         }
 241 |       ],
 242 |       "output_type": "multiple_choice",
 243 |       "repeats": 1,
 244 |       "should_decontaminate": true,
 245 |       "doc_to_decontamination_query": "query",
 246 |       "metadata": {
 247 |         "version": 2.0,
 248 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 249 |       }
 250 |     },
 251 |     "arc_fr": {
 252 |       "task": "arc_fr",
 253 |       "tag": [
 254 |         "arc_multilingual"
 255 |       ],
 256 |       "dataset_path": "alexandrainst/m_arc",
 257 |       "dataset_name": "fr",
 258 |       "training_split": "train",
 259 |       "validation_split": "validation",
 260 |       "test_split": "test",
 261 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        # breakpoint()\n        out_doc = {\n            \"id\": doc[\"id\"],\n            \"query\": \"Question: \" + preprocess(doc[\"instruction\"]) + \"\\nAnswer:\",\n            \"choices\": [\n                preprocess(option)\n                for option in [\n                    doc[\"option_a\"],\n                    doc[\"option_b\"],\n                    doc[\"option_c\"],\n                    doc[\"option_d\"],\n                    doc[\"option_e\"],\n                ]\n                if option\n            ],\n            \"gold\": [\"A\", \"B\", \"C\", \"D\", \"E\"].index(doc[\"answer\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 262 |       "doc_to_text": "query",
 263 |       "doc_to_target": "gold",
 264 |       "unsafe_code": false,
 265 |       "doc_to_choice": "choices",
 266 |       "description": "",
 267 |       "target_delimiter": " ",
 268 |       "fewshot_delimiter": "\n\n",
 269 |       "num_fewshot": 0,
 270 |       "metric_list": [
 271 |         {
 272 |           "metric": "acc",
 273 |           "aggregation": "mean",
 274 |           "higher_is_better": true
 275 |         },
 276 |         {
 277 |           "metric": "acc_norm",
 278 |           "aggregation": "mean",
 279 |           "higher_is_better": true
 280 |         }
 281 |       ],
 282 |       "output_type": "multiple_choice",
 283 |       "repeats": 1,
 284 |       "should_decontaminate": true,
 285 |       "doc_to_decontamination_query": "query",
 286 |       "metadata": {
 287 |         "version": 2.0,
 288 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 289 |       }
 290 |     },
 291 |     "arc_ru": {
 292 |       "task": "arc_ru",
 293 |       "tag": [
 294 |         "arc_multilingual"
 295 |       ],
 296 |       "dataset_path": "alexandrainst/m_arc",
 297 |       "dataset_name": "ru",
 298 |       "training_split": "train",
 299 |       "validation_split": "validation",
 300 |       "test_split": "test",
 301 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        # breakpoint()\n        out_doc = {\n            \"id\": doc[\"id\"],\n            \"query\": \"Question: \" + preprocess(doc[\"instruction\"]) + \"\\nAnswer:\",\n            \"choices\": [\n                preprocess(option)\n                for option in [\n                    doc[\"option_a\"],\n                    doc[\"option_b\"],\n                    doc[\"option_c\"],\n                    doc[\"option_d\"],\n                    doc[\"option_e\"],\n                ]\n                if option\n            ],\n            \"gold\": [\"A\", \"B\", \"C\", \"D\", \"E\"].index(doc[\"answer\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 302 |       "doc_to_text": "query",
 303 |       "doc_to_target": "gold",
 304 |       "unsafe_code": false,
 305 |       "doc_to_choice": "choices",
 306 |       "description": "",
 307 |       "target_delimiter": " ",
 308 |       "fewshot_delimiter": "\n\n",
 309 |       "num_fewshot": 0,
 310 |       "metric_list": [
 311 |         {
 312 |           "metric": "acc",
 313 |           "aggregation": "mean",
 314 |           "higher_is_better": true
 315 |         },
 316 |         {
 317 |           "metric": "acc_norm",
 318 |           "aggregation": "mean",
 319 |           "higher_is_better": true
 320 |         }
 321 |       ],
 322 |       "output_type": "multiple_choice",
 323 |       "repeats": 1,
 324 |       "should_decontaminate": true,
 325 |       "doc_to_decontamination_query": "query",
 326 |       "metadata": {
 327 |         "version": 2.0,
 328 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 329 |       }
 330 |     },
 331 |     "gsm8k_cot": {
 332 |       "task": "gsm8k_cot",
 333 |       "tag": [
 334 |         "chain_of_thought"
 335 |       ],
 336 |       "dataset_path": "gsm8k",
 337 |       "dataset_name": "main",
 338 |       "test_split": "test",
 339 |       "doc_to_text": "Q: {{question}}\nA:",
 340 |       "doc_to_target": "{{answer.split('####')[-1].strip() if answer is defined else target}}",
 341 |       "unsafe_code": false,
 342 |       "description": "",
 343 |       "target_delimiter": " ",
 344 |       "fewshot_delimiter": "\n\n",
 345 |       "fewshot_config": {
 346 |         "sampler": "first_n",
 347 |         "samples": [
 348 |           {
 349 |             "question": "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?",
 350 |             "target": "There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6."
 351 |           },
 352 |           {
 353 |             "question": "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?",
 354 |             "target": "There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5."
 355 |           },
 356 |           {
 357 |             "question": "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?",
 358 |             "target": "Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The answer is 39."
 359 |           },
 360 |           {
 361 |             "question": "Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?",
 362 |             "target": "Jason started with 20 lollipops. Then he had 12 after giving some to Denny. So he gave Denny 20 - 12 = 8. The answer is 8."
 363 |           },
 364 |           {
 365 |             "question": "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?",
 366 |             "target": "Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. 5 + 4 = 9. The answer is 9."
 367 |           },
 368 |           {
 369 |             "question": "There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?",
 370 |             "target": "There were originally 9 computers. For each of 4 days, 5 more computers were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The answer is 29."
 371 |           },
 372 |           {
 373 |             "question": "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?",
 374 |             "target": "Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33."
 375 |           },
 376 |           {
 377 |             "question": "Olivia has $23. She bought five bagels for $3 each. How much money does she have left?",
 378 |             "target": "Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The answer is 8."
 379 |           }
 380 |         ]
 381 |       },
 382 |       "num_fewshot": 8,
 383 |       "metric_list": [
 384 |         {
 385 |           "aggregation": "mean",
 386 |           "higher_is_better": true,
 387 |           "ignore_case": true,
 388 |           "ignore_punctuation": false,
 389 |           "metric": "exact_match",
 390 |           "regexes_to_ignore": [
 391 |             ",",
 392 |             "\\$",
 393 |             "(?s).*#### ",
 394 |             "\\.$"
 395 |           ]
 396 |         }
 397 |       ],
 398 |       "output_type": "generate_until",
 399 |       "generation_kwargs": {
 400 |         "do_sample": false,
 401 |         "until": [
 402 |           "Q:",
 403 |           "</s>",
 404 |           "<|im_end|>"
 405 |         ]
 406 |       },
 407 |       "repeats": 1,
 408 |       "filter_list": [
 409 |         {
 410 |           "filter": [
 411 |             {
 412 |               "function": "regex",
 413 |               "regex_pattern": "The answer is (\\-?[0-9\\.\\,]+)."
 414 |             },
 415 |             {
 416 |               "function": "take_first"
 417 |             }
 418 |           ],
 419 |           "name": "strict-match"
 420 |         },
 421 |         {
 422 |           "filter": [
 423 |             {
 424 |               "function": "regex",
 425 |               "group_select": -1,
 426 |               "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
 427 |             },
 428 |             {
 429 |               "function": "take_first"
 430 |             }
 431 |           ],
 432 |           "name": "flexible-extract"
 433 |         }
 434 |       ],
 435 |       "should_decontaminate": false,
 436 |       "metadata": {
 437 |         "version": 3.0,
 438 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 439 |       }
 440 |     },
 441 |     "hellaswag_de": {
 442 |       "task": "hellaswag_de",
 443 |       "tag": [
 444 |         "hellaswag_multilingual"
 445 |       ],
 446 |       "dataset_path": "alexandrainst/m_hellaswag",
 447 |       "dataset_name": "de",
 448 |       "validation_split": "val",
 449 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 450 |       "doc_to_text": "query",
 451 |       "doc_to_target": "{{label.lstrip()}}",
 452 |       "unsafe_code": false,
 453 |       "doc_to_choice": "choices",
 454 |       "description": "",
 455 |       "target_delimiter": " ",
 456 |       "fewshot_delimiter": "\n\n",
 457 |       "num_fewshot": 0,
 458 |       "metric_list": [
 459 |         {
 460 |           "metric": "acc",
 461 |           "aggregation": "mean",
 462 |           "higher_is_better": true
 463 |         },
 464 |         {
 465 |           "metric": "acc_norm",
 466 |           "aggregation": "mean",
 467 |           "higher_is_better": true
 468 |         }
 469 |       ],
 470 |       "output_type": "multiple_choice",
 471 |       "repeats": 1,
 472 |       "should_decontaminate": false,
 473 |       "metadata": {
 474 |         "version": 1.0,
 475 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 476 |       }
 477 |     },
 478 |     "hellaswag_es": {
 479 |       "task": "hellaswag_es",
 480 |       "tag": [
 481 |         "hellaswag_multilingual"
 482 |       ],
 483 |       "dataset_path": "alexandrainst/m_hellaswag",
 484 |       "dataset_name": "es",
 485 |       "validation_split": "val",
 486 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 487 |       "doc_to_text": "query",
 488 |       "doc_to_target": "{{label.lstrip()}}",
 489 |       "unsafe_code": false,
 490 |       "doc_to_choice": "choices",
 491 |       "description": "",
 492 |       "target_delimiter": " ",
 493 |       "fewshot_delimiter": "\n\n",
 494 |       "num_fewshot": 0,
 495 |       "metric_list": [
 496 |         {
 497 |           "metric": "acc",
 498 |           "aggregation": "mean",
 499 |           "higher_is_better": true
 500 |         },
 501 |         {
 502 |           "metric": "acc_norm",
 503 |           "aggregation": "mean",
 504 |           "higher_is_better": true
 505 |         }
 506 |       ],
 507 |       "output_type": "multiple_choice",
 508 |       "repeats": 1,
 509 |       "should_decontaminate": false,
 510 |       "metadata": {
 511 |         "version": 1.0,
 512 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 513 |       }
 514 |     },
 515 |     "hellaswag_fr": {
 516 |       "task": "hellaswag_fr",
 517 |       "tag": [
 518 |         "hellaswag_multilingual"
 519 |       ],
 520 |       "dataset_path": "alexandrainst/m_hellaswag",
 521 |       "dataset_name": "fr",
 522 |       "validation_split": "val",
 523 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 524 |       "doc_to_text": "query",
 525 |       "doc_to_target": "{{label.lstrip()}}",
 526 |       "unsafe_code": false,
 527 |       "doc_to_choice": "choices",
 528 |       "description": "",
 529 |       "target_delimiter": " ",
 530 |       "fewshot_delimiter": "\n\n",
 531 |       "num_fewshot": 0,
 532 |       "metric_list": [
 533 |         {
 534 |           "metric": "acc",
 535 |           "aggregation": "mean",
 536 |           "higher_is_better": true
 537 |         },
 538 |         {
 539 |           "metric": "acc_norm",
 540 |           "aggregation": "mean",
 541 |           "higher_is_better": true
 542 |         }
 543 |       ],
 544 |       "output_type": "multiple_choice",
 545 |       "repeats": 1,
 546 |       "should_decontaminate": false,
 547 |       "metadata": {
 548 |         "version": 1.0,
 549 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 550 |       }
 551 |     },
 552 |     "hellaswag_ru": {
 553 |       "task": "hellaswag_ru",
 554 |       "tag": [
 555 |         "hellaswag_multilingual"
 556 |       ],
 557 |       "dataset_path": "alexandrainst/m_hellaswag",
 558 |       "dataset_name": "ru",
 559 |       "validation_split": "val",
 560 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 561 |       "doc_to_text": "query",
 562 |       "doc_to_target": "{{label.lstrip()}}",
 563 |       "unsafe_code": false,
 564 |       "doc_to_choice": "choices",
 565 |       "description": "",
 566 |       "target_delimiter": " ",
 567 |       "fewshot_delimiter": "\n\n",
 568 |       "num_fewshot": 0,
 569 |       "metric_list": [
 570 |         {
 571 |           "metric": "acc",
 572 |           "aggregation": "mean",
 573 |           "higher_is_better": true
 574 |         },
 575 |         {
 576 |           "metric": "acc_norm",
 577 |           "aggregation": "mean",
 578 |           "higher_is_better": true
 579 |         }
 580 |       ],
 581 |       "output_type": "multiple_choice",
 582 |       "repeats": 1,
 583 |       "should_decontaminate": false,
 584 |       "metadata": {
 585 |         "version": 1.0,
 586 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 587 |       }
 588 |     },
 589 |     "hendrycks_math_algebra": {
 590 |       "task": "hendrycks_math_algebra",
 591 |       "tag": [
 592 |         "math_word_problems"
 593 |       ],
 594 |       "dataset_path": "EleutherAI/hendrycks_math",
 595 |       "dataset_name": "algebra",
 596 |       "training_split": "train",
 597 |       "test_split": "test",
 598 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc: dict) -> dict:\n        out_doc = {\n            \"problem\": doc[\"problem\"],\n            \"solution\": doc[\"solution\"],\n            \"answer\": remove_boxed(last_boxed_only_string(doc[\"solution\"])),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 599 |       "doc_to_text": "Problem: {{problem}}\nAnswer:",
 600 |       "doc_to_target": "{{answer}}",
 601 |       "unsafe_code": false,
 602 |       "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n    retval = 0\n    indices = [pos for pos, char in enumerate(results[0]) if char == \"$\"]\n    if len(indices) <= 1:\n        answer = results[0]\n    else:\n        answer = results[0][indices[0] + 1 : indices[-1]]\n\n    if is_equiv(answer, remove_boxed(last_boxed_only_string(doc[\"solution\"]))):\n        retval = 1\n\n    results = {\n        \"exact_match\": retval,\n    }\n    return results\n",
 603 |       "description": "",
 604 |       "target_delimiter": " ",
 605 |       "fewshot_delimiter": "\n\n",
 606 |       "num_fewshot": 0,
 607 |       "metric_list": [
 608 |         {
 609 |           "metric": "exact_match",
 610 |           "aggregation": "mean",
 611 |           "higher_is_better": true
 612 |         }
 613 |       ],
 614 |       "output_type": "generate_until",
 615 |       "generation_kwargs": {
 616 |         "until": [
 617 |           "Problem:"
 618 |         ],
 619 |         "do_sample": false,
 620 |         "temperature": 0.0
 621 |       },
 622 |       "repeats": 1,
 623 |       "should_decontaminate": false,
 624 |       "metadata": {
 625 |         "version": 1.0,
 626 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 627 |       }
 628 |     },
 629 |     "hendrycks_math_counting_and_prob": {
 630 |       "task": "hendrycks_math_counting_and_prob",
 631 |       "tag": [
 632 |         "math_word_problems"
 633 |       ],
 634 |       "dataset_path": "EleutherAI/hendrycks_math",
 635 |       "dataset_name": "counting_and_probability",
 636 |       "training_split": "train",
 637 |       "test_split": "test",
 638 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc: dict) -> dict:\n        out_doc = {\n            \"problem\": doc[\"problem\"],\n            \"solution\": doc[\"solution\"],\n            \"answer\": remove_boxed(last_boxed_only_string(doc[\"solution\"])),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 639 |       "doc_to_text": "Problem: {{problem}}\nAnswer:",
 640 |       "doc_to_target": "{{answer}}",
 641 |       "unsafe_code": false,
 642 |       "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n    retval = 0\n    indices = [pos for pos, char in enumerate(results[0]) if char == \"$\"]\n    if len(indices) <= 1:\n        answer = results[0]\n    else:\n        answer = results[0][indices[0] + 1 : indices[-1]]\n\n    if is_equiv(answer, remove_boxed(last_boxed_only_string(doc[\"solution\"]))):\n        retval = 1\n\n    results = {\n        \"exact_match\": retval,\n    }\n    return results\n",
 643 |       "description": "",
 644 |       "target_delimiter": " ",
 645 |       "fewshot_delimiter": "\n\n",
 646 |       "num_fewshot": 0,
 647 |       "metric_list": [
 648 |         {
 649 |           "metric": "exact_match",
 650 |           "aggregation": "mean",
 651 |           "higher_is_better": true
 652 |         }
 653 |       ],
 654 |       "output_type": "generate_until",
 655 |       "generation_kwargs": {
 656 |         "until": [
 657 |           "Problem:"
 658 |         ],
 659 |         "do_sample": false,
 660 |         "temperature": 0.0
 661 |       },
 662 |       "repeats": 1,
 663 |       "should_decontaminate": false,
 664 |       "metadata": {
 665 |         "version": 1.0,
 666 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 667 |       }
 668 |     },
 669 |     "hendrycks_math_geometry": {
 670 |       "task": "hendrycks_math_geometry",
 671 |       "tag": [
 672 |         "math_word_problems"
 673 |       ],
 674 |       "dataset_path": "EleutherAI/hendrycks_math",
 675 |       "dataset_name": "geometry",
 676 |       "training_split": "train",
 677 |       "test_split": "test",
 678 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc: dict) -> dict:\n        out_doc = {\n            \"problem\": doc[\"problem\"],\n            \"solution\": doc[\"solution\"],\n            \"answer\": remove_boxed(last_boxed_only_string(doc[\"solution\"])),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 679 |       "doc_to_text": "Problem: {{problem}}\nAnswer:",
 680 |       "doc_to_target": "{{answer}}",
 681 |       "unsafe_code": false,
 682 |       "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n    retval = 0\n    indices = [pos for pos, char in enumerate(results[0]) if char == \"$\"]\n    if len(indices) <= 1:\n        answer = results[0]\n    else:\n        answer = results[0][indices[0] + 1 : indices[-1]]\n\n    if is_equiv(answer, remove_boxed(last_boxed_only_string(doc[\"solution\"]))):\n        retval = 1\n\n    results = {\n        \"exact_match\": retval,\n    }\n    return results\n",
 683 |       "description": "",
 684 |       "target_delimiter": " ",
 685 |       "fewshot_delimiter": "\n\n",
 686 |       "num_fewshot": 0,
 687 |       "metric_list": [
 688 |         {
 689 |           "metric": "exact_match",
 690 |           "aggregation": "mean",
 691 |           "higher_is_better": true
 692 |         }
 693 |       ],
 694 |       "output_type": "generate_until",
 695 |       "generation_kwargs": {
 696 |         "until": [
 697 |           "Problem:"
 698 |         ],
 699 |         "do_sample": false,
 700 |         "temperature": 0.0
 701 |       },
 702 |       "repeats": 1,
 703 |       "should_decontaminate": false,
 704 |       "metadata": {
 705 |         "version": 1.0,
 706 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 707 |       }
 708 |     },
 709 |     "hendrycks_math_intermediate_algebra": {
 710 |       "task": "hendrycks_math_intermediate_algebra",
 711 |       "tag": [
 712 |         "math_word_problems"
 713 |       ],
 714 |       "dataset_path": "EleutherAI/hendrycks_math",
 715 |       "dataset_name": "intermediate_algebra",
 716 |       "training_split": "train",
 717 |       "test_split": "test",
 718 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc: dict) -> dict:\n        out_doc = {\n            \"problem\": doc[\"problem\"],\n            \"solution\": doc[\"solution\"],\n            \"answer\": remove_boxed(last_boxed_only_string(doc[\"solution\"])),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 719 |       "doc_to_text": "Problem: {{problem}}\nAnswer:",
 720 |       "doc_to_target": "{{answer}}",
 721 |       "unsafe_code": false,
 722 |       "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n    retval = 0\n    indices = [pos for pos, char in enumerate(results[0]) if char == \"$\"]\n    if len(indices) <= 1:\n        answer = results[0]\n    else:\n        answer = results[0][indices[0] + 1 : indices[-1]]\n\n    if is_equiv(answer, remove_boxed(last_boxed_only_string(doc[\"solution\"]))):\n        retval = 1\n\n    results = {\n        \"exact_match\": retval,\n    }\n    return results\n",
 723 |       "description": "",
 724 |       "target_delimiter": " ",
 725 |       "fewshot_delimiter": "\n\n",
 726 |       "num_fewshot": 0,
 727 |       "metric_list": [
 728 |         {
 729 |           "metric": "exact_match",
 730 |           "aggregation": "mean",
 731 |           "higher_is_better": true
 732 |         }
 733 |       ],
 734 |       "output_type": "generate_until",
 735 |       "generation_kwargs": {
 736 |         "until": [
 737 |           "Problem:"
 738 |         ],
 739 |         "do_sample": false,
 740 |         "temperature": 0.0
 741 |       },
 742 |       "repeats": 1,
 743 |       "should_decontaminate": false,
 744 |       "metadata": {
 745 |         "version": 1.0,
 746 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 747 |       }
 748 |     },
 749 |     "hendrycks_math_num_theory": {
 750 |       "task": "hendrycks_math_num_theory",
 751 |       "tag": [
 752 |         "math_word_problems"
 753 |       ],
 754 |       "dataset_path": "EleutherAI/hendrycks_math",
 755 |       "dataset_name": "number_theory",
 756 |       "training_split": "train",
 757 |       "test_split": "test",
 758 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc: dict) -> dict:\n        out_doc = {\n            \"problem\": doc[\"problem\"],\n            \"solution\": doc[\"solution\"],\n            \"answer\": remove_boxed(last_boxed_only_string(doc[\"solution\"])),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 759 |       "doc_to_text": "Problem: {{problem}}\nAnswer:",
 760 |       "doc_to_target": "{{answer}}",
 761 |       "unsafe_code": false,
 762 |       "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n    retval = 0\n    indices = [pos for pos, char in enumerate(results[0]) if char == \"$\"]\n    if len(indices) <= 1:\n        answer = results[0]\n    else:\n        answer = results[0][indices[0] + 1 : indices[-1]]\n\n    if is_equiv(answer, remove_boxed(last_boxed_only_string(doc[\"solution\"]))):\n        retval = 1\n\n    results = {\n        \"exact_match\": retval,\n    }\n    return results\n",
 763 |       "description": "",
 764 |       "target_delimiter": " ",
 765 |       "fewshot_delimiter": "\n\n",
 766 |       "num_fewshot": 0,
 767 |       "metric_list": [
 768 |         {
 769 |           "metric": "exact_match",
 770 |           "aggregation": "mean",
 771 |           "higher_is_better": true
 772 |         }
 773 |       ],
 774 |       "output_type": "generate_until",
 775 |       "generation_kwargs": {
 776 |         "until": [
 777 |           "Problem:"
 778 |         ],
 779 |         "do_sample": false,
 780 |         "temperature": 0.0
 781 |       },
 782 |       "repeats": 1,
 783 |       "should_decontaminate": false,
 784 |       "metadata": {
 785 |         "version": 1.0,
 786 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 787 |       }
 788 |     },
 789 |     "hendrycks_math_prealgebra": {
 790 |       "task": "hendrycks_math_prealgebra",
 791 |       "tag": [
 792 |         "math_word_problems"
 793 |       ],
 794 |       "dataset_path": "EleutherAI/hendrycks_math",
 795 |       "dataset_name": "prealgebra",
 796 |       "training_split": "train",
 797 |       "test_split": "test",
 798 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc: dict) -> dict:\n        out_doc = {\n            \"problem\": doc[\"problem\"],\n            \"solution\": doc[\"solution\"],\n            \"answer\": remove_boxed(last_boxed_only_string(doc[\"solution\"])),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 799 |       "doc_to_text": "Problem: {{problem}}\nAnswer:",
 800 |       "doc_to_target": "{{answer}}",
 801 |       "unsafe_code": false,
 802 |       "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n    retval = 0\n    indices = [pos for pos, char in enumerate(results[0]) if char == \"$\"]\n    if len(indices) <= 1:\n        answer = results[0]\n    else:\n        answer = results[0][indices[0] + 1 : indices[-1]]\n\n    if is_equiv(answer, remove_boxed(last_boxed_only_string(doc[\"solution\"]))):\n        retval = 1\n\n    results = {\n        \"exact_match\": retval,\n    }\n    return results\n",
 803 |       "description": "",
 804 |       "target_delimiter": " ",
 805 |       "fewshot_delimiter": "\n\n",
 806 |       "num_fewshot": 0,
 807 |       "metric_list": [
 808 |         {
 809 |           "metric": "exact_match",
 810 |           "aggregation": "mean",
 811 |           "higher_is_better": true
 812 |         }
 813 |       ],
 814 |       "output_type": "generate_until",
 815 |       "generation_kwargs": {
 816 |         "until": [
 817 |           "Problem:"
 818 |         ],
 819 |         "do_sample": false,
 820 |         "temperature": 0.0
 821 |       },
 822 |       "repeats": 1,
 823 |       "should_decontaminate": false,
 824 |       "metadata": {
 825 |         "version": 1.0,
 826 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 827 |       }
 828 |     },
 829 |     "hendrycks_math_precalc": {
 830 |       "task": "hendrycks_math_precalc",
 831 |       "tag": [
 832 |         "math_word_problems"
 833 |       ],
 834 |       "dataset_path": "EleutherAI/hendrycks_math",
 835 |       "dataset_name": "precalculus",
 836 |       "training_split": "train",
 837 |       "test_split": "test",
 838 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc: dict) -> dict:\n        out_doc = {\n            \"problem\": doc[\"problem\"],\n            \"solution\": doc[\"solution\"],\n            \"answer\": remove_boxed(last_boxed_only_string(doc[\"solution\"])),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 839 |       "doc_to_text": "Problem: {{problem}}\nAnswer:",
 840 |       "doc_to_target": "{{answer}}",
 841 |       "unsafe_code": false,
 842 |       "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n    retval = 0\n    indices = [pos for pos, char in enumerate(results[0]) if char == \"$\"]\n    if len(indices) <= 1:\n        answer = results[0]\n    else:\n        answer = results[0][indices[0] + 1 : indices[-1]]\n\n    if is_equiv(answer, remove_boxed(last_boxed_only_string(doc[\"solution\"]))):\n        retval = 1\n\n    results = {\n        \"exact_match\": retval,\n    }\n    return results\n",
 843 |       "description": "",
 844 |       "target_delimiter": " ",
 845 |       "fewshot_delimiter": "\n\n",
 846 |       "num_fewshot": 0,
 847 |       "metric_list": [
 848 |         {
 849 |           "metric": "exact_match",
 850 |           "aggregation": "mean",
 851 |           "higher_is_better": true
 852 |         }
 853 |       ],
 854 |       "output_type": "generate_until",
 855 |       "generation_kwargs": {
 856 |         "until": [
 857 |           "Problem:"
 858 |         ],
 859 |         "do_sample": false,
 860 |         "temperature": 0.0
 861 |       },
 862 |       "repeats": 1,
 863 |       "should_decontaminate": false,
 864 |       "metadata": {
 865 |         "version": 1.0,
 866 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 867 |       }
 868 |     },
 869 |     "ifeval": {
 870 |       "task": "ifeval",
 871 |       "dataset_path": "google/IFEval",
 872 |       "test_split": "train",
 873 |       "doc_to_text": "prompt",
 874 |       "doc_to_target": 0,
 875 |       "unsafe_code": false,
 876 |       "process_results": "def process_results(doc, results):\n    inp = InputExample(\n        key=doc[\"key\"],\n        instruction_id_list=doc[\"instruction_id_list\"],\n        prompt=doc[\"prompt\"],\n        kwargs=doc[\"kwargs\"],\n    )\n    response = results[0]\n\n    out_strict = test_instruction_following_strict(inp, response)\n    out_loose = test_instruction_following_loose(inp, response)\n\n    return {\n        \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n        \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n        \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n        \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n    }\n",
 877 |       "description": "",
 878 |       "target_delimiter": " ",
 879 |       "fewshot_delimiter": "\n\n",
 880 |       "num_fewshot": 0,
 881 |       "metric_list": [
 882 |         {
 883 |           "metric": "prompt_level_strict_acc",
 884 |           "aggregation": "mean",
 885 |           "higher_is_better": true
 886 |         },
 887 |         {
 888 |           "metric": "inst_level_strict_acc",
 889 |           "aggregation": "def agg_inst_level_acc(items):\n    flat_items = [item for sublist in items for item in sublist]\n    inst_level_acc = sum(flat_items) / len(flat_items)\n    return inst_level_acc\n",
 890 |           "higher_is_better": true
 891 |         },
 892 |         {
 893 |           "metric": "prompt_level_loose_acc",
 894 |           "aggregation": "mean",
 895 |           "higher_is_better": true
 896 |         },
 897 |         {
 898 |           "metric": "inst_level_loose_acc",
 899 |           "aggregation": "def agg_inst_level_acc(items):\n    flat_items = [item for sublist in items for item in sublist]\n    inst_level_acc = sum(flat_items) / len(flat_items)\n    return inst_level_acc\n",
 900 |           "higher_is_better": true
 901 |         }
 902 |       ],
 903 |       "output_type": "generate_until",
 904 |       "generation_kwargs": {
 905 |         "until": [],
 906 |         "do_sample": false,
 907 |         "temperature": 0.0,
 908 |         "max_gen_toks": 1280
 909 |       },
 910 |       "repeats": 1,
 911 |       "should_decontaminate": false,
 912 |       "metadata": {
 913 |         "version": 4.0,
 914 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 915 |       }
 916 |     },
 917 |     "m_mmlu_de": {
 918 |       "task": "m_mmlu_de",
 919 |       "tag": [
 920 |         "m_mmlu"
 921 |       ],
 922 |       "dataset_path": "alexandrainst/m_mmlu",
 923 |       "dataset_name": "de",
 924 |       "test_split": "test",
 925 |       "fewshot_split": "train",
 926 |       "doc_to_text": "{{instruction.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:",
 927 |       "doc_to_target": "answer",
 928 |       "unsafe_code": false,
 929 |       "doc_to_choice": [
 930 |         "A",
 931 |         "B",
 932 |         "C",
 933 |         "D"
 934 |       ],
 935 |       "description": "",
 936 |       "target_delimiter": " ",
 937 |       "fewshot_delimiter": "\n\n",
 938 |       "fewshot_config": {
 939 |         "sampler": "first_n"
 940 |       },
 941 |       "num_fewshot": 0,
 942 |       "metric_list": [
 943 |         {
 944 |           "metric": "acc",
 945 |           "aggregation": "mean",
 946 |           "higher_is_better": true
 947 |         }
 948 |       ],
 949 |       "output_type": "multiple_choice",
 950 |       "repeats": 1,
 951 |       "should_decontaminate": false,
 952 |       "metadata": {
 953 |         "version": 0.0,
 954 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 955 |       }
 956 |     },
 957 |     "m_mmlu_es": {
 958 |       "task": "m_mmlu_es",
 959 |       "tag": [
 960 |         "m_mmlu"
 961 |       ],
 962 |       "dataset_path": "alexandrainst/m_mmlu",
 963 |       "dataset_name": "es",
 964 |       "test_split": "test",
 965 |       "fewshot_split": "train",
 966 |       "doc_to_text": "{{instruction.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:",
 967 |       "doc_to_target": "answer",
 968 |       "unsafe_code": false,
 969 |       "doc_to_choice": [
 970 |         "A",
 971 |         "B",
 972 |         "C",
 973 |         "D"
 974 |       ],
 975 |       "description": "",
 976 |       "target_delimiter": " ",
 977 |       "fewshot_delimiter": "\n\n",
 978 |       "fewshot_config": {
 979 |         "sampler": "first_n"
 980 |       },
 981 |       "num_fewshot": 0,
 982 |       "metric_list": [
 983 |         {
 984 |           "metric": "acc",
 985 |           "aggregation": "mean",
 986 |           "higher_is_better": true
 987 |         }
 988 |       ],
 989 |       "output_type": "multiple_choice",
 990 |       "repeats": 1,
 991 |       "should_decontaminate": false,
 992 |       "metadata": {
 993 |         "version": 0.0,
 994 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 995 |       }
 996 |     },
 997 |     "m_mmlu_fr": {
 998 |       "task": "m_mmlu_fr",
 999 |       "tag": [
1000 |         "m_mmlu"
1001 |       ],
1002 |       "dataset_path": "alexandrainst/m_mmlu",
1003 |       "dataset_name": "fr",
1004 |       "test_split": "test",
1005 |       "fewshot_split": "train",
1006 |       "doc_to_text": "{{instruction.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:",
1007 |       "doc_to_target": "answer",
1008 |       "unsafe_code": false,
1009 |       "doc_to_choice": [
1010 |         "A",
1011 |         "B",
1012 |         "C",
1013 |         "D"
1014 |       ],
1015 |       "description": "",
1016 |       "target_delimiter": " ",
1017 |       "fewshot_delimiter": "\n\n",
1018 |       "fewshot_config": {
1019 |         "sampler": "first_n"
1020 |       },
1021 |       "num_fewshot": 0,
1022 |       "metric_list": [
1023 |         {
1024 |           "metric": "acc",
1025 |           "aggregation": "mean",
1026 |           "higher_is_better": true
1027 |         }
1028 |       ],
1029 |       "output_type": "multiple_choice",
1030 |       "repeats": 1,
1031 |       "should_decontaminate": false,
1032 |       "metadata": {
1033 |         "version": 0.0,
1034 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
1035 |       }
1036 |     },
1037 |     "m_mmlu_ru": {
1038 |       "task": "m_mmlu_ru",
1039 |       "tag": [
1040 |         "m_mmlu"
1041 |       ],
1042 |       "dataset_path": "alexandrainst/m_mmlu",
1043 |       "dataset_name": "ru",
1044 |       "test_split": "test",
1045 |       "fewshot_split": "train",
1046 |       "doc_to_text": "{{instruction.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:",
1047 |       "doc_to_target": "answer",
1048 |       "unsafe_code": false,
1049 |       "doc_to_choice": [
1050 |         "A",
1051 |         "B",
1052 |         "C",
1053 |         "D"
1054 |       ],
1055 |       "description": "",
1056 |       "target_delimiter": " ",
1057 |       "fewshot_delimiter": "\n\n",
1058 |       "fewshot_config": {
1059 |         "sampler": "first_n"
1060 |       },
1061 |       "num_fewshot": 0,
1062 |       "metric_list": [
1063 |         {
1064 |           "metric": "acc",
1065 |           "aggregation": "mean",
1066 |           "higher_is_better": true
1067 |         }
1068 |       ],
1069 |       "output_type": "multiple_choice",
1070 |       "repeats": 1,
1071 |       "should_decontaminate": false,
1072 |       "metadata": {
1073 |         "version": 0.0,
1074 |         "pretrained": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
1075 |       }
1076 |     }
1077 |   },
1078 |   "versions": {
1079 |     "arc_de": 2.0,
1080 |     "arc_es": 2.0,
1081 |     "arc_fr": 2.0,
1082 |     "arc_ru": 2.0,
1083 |     "gsm8k_cot": 3.0,
1084 |     "hellaswag_de": 1.0,
1085 |     "hellaswag_es": 1.0,
1086 |     "hellaswag_fr": 1.0,
1087 |     "hellaswag_ru": 1.0,
1088 |     "hendrycks_math": 1.0,
1089 |     "hendrycks_math_algebra": 1.0,
1090 |     "hendrycks_math_counting_and_prob": 1.0,
1091 |     "hendrycks_math_geometry": 1.0,
1092 |     "hendrycks_math_intermediate_algebra": 1.0,
1093 |     "hendrycks_math_num_theory": 1.0,
1094 |     "hendrycks_math_prealgebra": 1.0,
1095 |     "hendrycks_math_precalc": 1.0,
1096 |     "ifeval": 4.0,
1097 |     "m_mmlu_de": 0.0,
1098 |     "m_mmlu_es": 0.0,
1099 |     "m_mmlu_fr": 0.0,
1100 |     "m_mmlu_ru": 0.0
1101 |   },
1102 |   "n-shot": {
1103 |     "arc_de": 0,
1104 |     "arc_es": 0,
1105 |     "arc_fr": 0,
1106 |     "arc_ru": 0,
1107 |     "gsm8k_cot": 8,
1108 |     "hellaswag_de": 0,
1109 |     "hellaswag_es": 0,
1110 |     "hellaswag_fr": 0,
1111 |     "hellaswag_ru": 0,
1112 |     "hendrycks_math_algebra": 0,
1113 |     "hendrycks_math_counting_and_prob": 0,
1114 |     "hendrycks_math_geometry": 0,
1115 |     "hendrycks_math_intermediate_algebra": 0,
1116 |     "hendrycks_math_num_theory": 0,
1117 |     "hendrycks_math_prealgebra": 0,
1118 |     "hendrycks_math_precalc": 0,
1119 |     "ifeval": 0,
1120 |     "m_mmlu_de": 0,
1121 |     "m_mmlu_es": 0,
1122 |     "m_mmlu_fr": 0,
1123 |     "m_mmlu_ru": 0
1124 |   },
1125 |   "higher_is_better": {
1126 |     "arc_de": {
1127 |       "acc": true,
1128 |       "acc_norm": true
1129 |     },
1130 |     "arc_es": {
1131 |       "acc": true,
1132 |       "acc_norm": true
1133 |     },
1134 |     "arc_fr": {
1135 |       "acc": true,
1136 |       "acc_norm": true
1137 |     },
1138 |     "arc_ru": {
1139 |       "acc": true,
1140 |       "acc_norm": true
1141 |     },
1142 |     "gsm8k_cot": {
1143 |       "exact_match": true
1144 |     },
1145 |     "hellaswag_de": {
1146 |       "acc": true,
1147 |       "acc_norm": true
1148 |     },
1149 |     "hellaswag_es": {
1150 |       "acc": true,
1151 |       "acc_norm": true
1152 |     },
1153 |     "hellaswag_fr": {
1154 |       "acc": true,
1155 |       "acc_norm": true
1156 |     },
1157 |     "hellaswag_ru": {
1158 |       "acc": true,
1159 |       "acc_norm": true
1160 |     },
1161 |     "hendrycks_math": {
1162 |       "exact_match": true
1163 |     },
1164 |     "hendrycks_math_algebra": {
1165 |       "exact_match": true
1166 |     },
1167 |     "hendrycks_math_counting_and_prob": {
1168 |       "exact_match": true
1169 |     },
1170 |     "hendrycks_math_geometry": {
1171 |       "exact_match": true
1172 |     },
1173 |     "hendrycks_math_intermediate_algebra": {
1174 |       "exact_match": true
1175 |     },
1176 |     "hendrycks_math_num_theory": {
1177 |       "exact_match": true
1178 |     },
1179 |     "hendrycks_math_prealgebra": {
1180 |       "exact_match": true
1181 |     },
1182 |     "hendrycks_math_precalc": {
1183 |       "exact_match": true
1184 |     },
1185 |     "ifeval": {
1186 |       "prompt_level_strict_acc": true,
1187 |       "inst_level_strict_acc": true,
1188 |       "prompt_level_loose_acc": true,
1189 |       "inst_level_loose_acc": true
1190 |     },
1191 |     "m_mmlu_de": {
1192 |       "acc": true
1193 |     },
1194 |     "m_mmlu_es": {
1195 |       "acc": true
1196 |     },
1197 |     "m_mmlu_fr": {
1198 |       "acc": true
1199 |     },
1200 |     "m_mmlu_ru": {
1201 |       "acc": true
1202 |     }
1203 |   },
1204 |   "n-samples": {
1205 |     "m_mmlu_ru": {
1206 |       "original": 13007,
1207 |       "effective": 13007
1208 |     },
1209 |     "m_mmlu_fr": {
1210 |       "original": 13091,
1211 |       "effective": 13091
1212 |     },
1213 |     "m_mmlu_es": {
1214 |       "original": 13334,
1215 |       "effective": 13334
1216 |     },
1217 |     "m_mmlu_de": {
1218 |       "original": 13258,
1219 |       "effective": 13258
1220 |     },
1221 |     "ifeval": {
1222 |       "original": 541,
1223 |       "effective": 541
1224 |     },
1225 |     "hendrycks_math_algebra": {
1226 |       "original": 1187,
1227 |       "effective": 1187
1228 |     },
1229 |     "hendrycks_math_counting_and_prob": {
1230 |       "original": 474,
1231 |       "effective": 474
1232 |     },
1233 |     "hendrycks_math_geometry": {
1234 |       "original": 479,
1235 |       "effective": 479
1236 |     },
1237 |     "hendrycks_math_intermediate_algebra": {
1238 |       "original": 903,
1239 |       "effective": 903
1240 |     },
1241 |     "hendrycks_math_num_theory": {
1242 |       "original": 540,
1243 |       "effective": 540
1244 |     },
1245 |     "hendrycks_math_prealgebra": {
1246 |       "original": 871,
1247 |       "effective": 871
1248 |     },
1249 |     "hendrycks_math_precalc": {
1250 |       "original": 546,
1251 |       "effective": 546
1252 |     },
1253 |     "hellaswag_ru": {
1254 |       "original": 9272,
1255 |       "effective": 9272
1256 |     },
1257 |     "hellaswag_fr": {
1258 |       "original": 9338,
1259 |       "effective": 9338
1260 |     },
1261 |     "hellaswag_es": {
1262 |       "original": 9374,
1263 |       "effective": 9374
1264 |     },
1265 |     "hellaswag_de": {
1266 |       "original": 9368,
1267 |       "effective": 9368
1268 |     },
1269 |     "gsm8k_cot": {
1270 |       "original": 1319,
1271 |       "effective": 1319
1272 |     },
1273 |     "arc_ru": {
1274 |       "original": 1169,
1275 |       "effective": 1169
1276 |     },
1277 |     "arc_fr": {
1278 |       "original": 1169,
1279 |       "effective": 1169
1280 |     },
1281 |     "arc_es": {
1282 |       "original": 1170,
1283 |       "effective": 1170
1284 |     },
1285 |     "arc_de": {
1286 |       "original": 1169,
1287 |       "effective": 1169
1288 |     }
1289 |   },
1290 |   "config": {
1291 |     "model": "vllm",
1292 |     "model_args": "pretrained=merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1",
1293 |     "batch_size": "auto",
1294 |     "batch_sizes": [],
1295 |     "device": "cuda:0",
1296 |     "use_cache": null,
1297 |     "limit": null,
1298 |     "bootstrap_iters": 100000,
1299 |     "gen_kwargs": null,
1300 |     "random_seed": 0,
1301 |     "numpy_seed": 1234,
1302 |     "torch_seed": 1234,
1303 |     "fewshot_seed": 1234
1304 |   },
1305 |   "git_hash": "f91dd3c",
1306 |   "date": 1763373562.252434,
1307 |   "pretty_env_info": "PyTorch version: 2.8.0+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.3 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.16.3\nLibc version: glibc-2.31\n\nPython version: 3.10.9 (main, Mar  8 2023, 10:47:38) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-69-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A40\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                    x86_64\nCPU op-mode(s):                  32-bit, 64-bit\nByte Order:                      Little Endian\nAddress sizes:                   46 bits physical, 57 bits virtual\nCPU(s):                          52\nOn-line CPU(s) list:             0-51\nThread(s) per core:              1\nCore(s) per socket:              26\nSocket(s):                       2\nNUMA node(s):                    4\nVendor ID:                       GenuineIntel\nCPU family:                      6\nModel:                           106\nModel name:                      Intel(R) Xeon(R) Gold 5320 CPU @ 2.20GHz\nStepping:                        6\nCPU MHz:                         814.783\nCPU max MHz:                     3400.0000\nCPU min MHz:                     800.0000\nBogoMIPS:                        4400.00\nL1d cache:                       2.4 MiB\nL1i cache:                       1.6 MiB\nL2 cache:                        65 MiB\nL3 cache:                        78 MiB\nNUMA node0 CPU(s):               0,4,8,12,16,20,24,28,32,36,40,44,48\nNUMA node1 CPU(s):               2,6,10,14,18,22,26,30,34,38,42,46,50\nNUMA node2 CPU(s):               1,5,9,13,17,21,25,29,33,37,41,45,49\nNUMA 
node3 CPU(s):               3,7,11,15,19,23,27,31,35,39,43,47,51\nVulnerability Itlb multihit:     Not affected\nVulnerability L1tf:              Not affected\nVulnerability Mds:               Not affected\nVulnerability Meltdown:          Not affected\nVulnerability Mmio stale data:   Mitigation; Clear CPU buffers; SMT disabled\nVulnerability Retbleed:          Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:        Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:        Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence\nVulnerability Srbds:             Not affected\nVulnerability Tsx async abort:   Not affected\nFlags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 invpcid_single intel_ppin ssbd mba ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local split_lock_detect wbnoinvd dtherm ida arat pln pts avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq la57 rdpid fsrm md_clear pconfig flush_l1d arch_capabilities\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] 
nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.3\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.8.0\n[pip3] torchaudio==2.8.0\n[pip3] torchvision==0.23.0\n[pip3] triton==3.4.0\n[conda] numpy                                2.2.6            pypi_0           pypi\n[conda] nvidia-cublas-cu12                   12.8.4.1         pypi_0           pypi\n[conda] nvidia-cuda-cupti-cu12               12.8.90          pypi_0           pypi\n[conda] nvidia-cuda-nvrtc-cu12               12.8.93          pypi_0           pypi\n[conda] nvidia-cuda-runtime-cu12             12.8.90          pypi_0           pypi\n[conda] nvidia-cudnn-cu12                    9.10.2.21        pypi_0           pypi\n[conda] nvidia-cufft-cu12                    11.3.3.83        pypi_0           pypi\n[conda] nvidia-curand-cu12                   10.3.9.90        pypi_0           pypi\n[conda] nvidia-cusolver-cu12                 11.7.3.90        pypi_0           pypi\n[conda] nvidia-cusparse-cu12                 12.5.8.93        pypi_0           pypi\n[conda] nvidia-cusparselt-cu12               0.7.1            pypi_0           pypi\n[conda] nvidia-nccl-cu12                     2.27.3           pypi_0           pypi\n[conda] nvidia-nvjitlink-cu12                12.8.93          pypi_0           pypi\n[conda] nvidia-nvtx-cu12                     12.8.90          pypi_0           pypi\n[conda] torch                                2.8.0            pypi_0           pypi\n[conda] torchaudio                           2.8.0            pypi_0           pypi\n[conda] torchvision                          0.23.0           pypi_0           pypi\n[conda] triton                               3.4.0            pypi_0           pypi",
1308 |   "transformers_version": "4.57.1",
1309 |   "lm_eval_version": "0.4.9.1",
1310 |   "upper_git_hash": null,
1311 |   "tokenizer_pad_token": [
1312 |     "<|end_of_text|>",
1313 |     "128001"
1314 |   ],
1315 |   "tokenizer_eos_token": [
1316 |     "<|end_of_text|>",
1317 |     "128001"
1318 |   ],
1319 |   "tokenizer_bos_token": [
1320 |     "<|begin_of_text|>",
1321 |     "128000"
1322 |   ],
1323 |   "eot_token_id": 128001,
1324 |   "max_length": 131072,
1325 |   "task_hashes": {},
1326 |   "model_source": "vllm",
1327 |   "model_name": "merged_models/Llama-3.1-8B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1",
1328 |   "model_name_sanitized": "merged_models__Llama-3.1-8B_merged__RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1",
1329 |   "system_instruction": null,
1330 |   "system_instruction_sha": null,
1331 |   "fewshot_as_multiturn": false,
1332 |   "chat_template": null,
1333 |   "chat_template_sha": null,
1334 |   "start_time": 15748933.200144514,
1335 |   "end_time": 15757932.640946288,
1336 |   "total_evaluation_time_seconds": "8999.44080177322"
1337 | }


--------------------------------------------------------------------------------
/merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1/lm_eval.json:
--------------------------------------------------------------------------------
   1 | {
   2 |   "results": {
   3 |     "arc_de": {
   4 |       "alias": "arc_de",
   5 |       "acc,none": 0.31137724550898205,
   6 |       "acc_stderr,none": 0.01354917023720016,
   7 |       "acc_norm,none": 0.358426005132592,
   8 |       "acc_norm_stderr,none": 0.014031422783275219
   9 |     },
  10 |     "arc_es": {
  11 |       "alias": "arc_es",
  12 |       "acc,none": 0.3700854700854701,
  13 |       "acc_stderr,none": 0.014121621753736043,
  14 |       "acc_norm,none": 0.4008547008547009,
  15 |       "acc_norm_stderr,none": 0.014333502054419352
  16 |     },
  17 |     "arc_fr": {
  18 |       "alias": "arc_fr",
  19 |       "acc,none": 0.35243798118049613,
  20 |       "acc_stderr,none": 0.013978501429969674,
  21 |       "acc_norm,none": 0.40461933276304535,
  22 |       "acc_norm_stderr,none": 0.014361481979772708
  23 |     },
  24 |     "arc_ru": {
  25 |       "alias": "arc_ru",
  26 |       "acc,none": 0.34473909324208724,
  27 |       "acc_stderr,none": 0.013906920607432557,
  28 |       "acc_norm,none": 0.369546621043627,
  29 |       "acc_norm_stderr,none": 0.014123413837443284
  30 |     },
  31 |     "gsm8k_cot": {
  32 |       "alias": "gsm8k_cot",
  33 |       "exact_match,strict-match": 0.3510235026535254,
  34 |       "exact_match_stderr,strict-match": 0.01314694594139722,
  35 |       "exact_match,flexible-extract": 0.400303260045489,
  36 |       "exact_match_stderr,flexible-extract": 0.013495926436566438
  37 |     },
  38 |     "hellaswag_de": {
  39 |       "alias": "hellaswag_de",
  40 |       "acc,none": 0.42410333048676346,
  41 |       "acc_stderr,none": 0.005106318088351463,
  42 |       "acc_norm,none": 0.5470751494449189,
  43 |       "acc_norm_stderr,none": 0.0051432342942479385
  44 |     },
  45 |     "hellaswag_es": {
  46 |       "alias": "hellaswag_es",
  47 |       "acc,none": 0.46607638148069125,
  48 |       "acc_stderr,none": 0.005152628194218952,
  49 |       "acc_norm,none": 0.6067847237038617,
  50 |       "acc_norm_stderr,none": 0.005045372103545006
  51 |     },
  52 |     "hellaswag_fr": {
  53 |       "alias": "hellaswag_fr",
  54 |       "acc,none": 0.44688370100663954,
  55 |       "acc_stderr,none": 0.005145194613284539,
  56 |       "acc_norm,none": 0.5938102377382737,
  57 |       "acc_norm_stderr,none": 0.0050825849670259144
  58 |     },
  59 |     "hellaswag_ru": {
  60 |       "alias": "hellaswag_ru",
  61 |       "acc,none": 0.4074633304572908,
  62 |       "acc_stderr,none": 0.005103153017972229,
  63 |       "acc_norm,none": 0.5282571182053495,
  64 |       "acc_norm_stderr,none": 0.005184561926719888
  65 |     },
  66 |     "hendrycks_math": {
  67 |       "exact_match,none": 0.0172,
  68 |       "exact_match_stderr,none": 0.0018375943530154295,
  69 |       "alias": "hendrycks_math"
  70 |     },
  71 |     "hendrycks_math_algebra": {
  72 |       "alias": " - hendrycks_math_algebra",
  73 |       "exact_match,none": 0.017691659646166806,
  74 |       "exact_match_stderr,none": 0.0038279464976423414
  75 |     },
  76 |     "hendrycks_math_counting_and_prob": {
  77 |       "alias": " - hendrycks_math_counting_and_prob",
  78 |       "exact_match,none": 0.014767932489451477,
  79 |       "exact_match_stderr,none": 0.005546238589668472
  80 |     },
  81 |     "hendrycks_math_geometry": {
  82 |       "alias": " - hendrycks_math_geometry",
  83 |       "exact_match,none": 0.025052192066805846,
  84 |       "exact_match_stderr,none": 0.007148247838013836
  85 |     },
  86 |     "hendrycks_math_intermediate_algebra": {
  87 |       "alias": " - hendrycks_math_intermediate_algebra",
  88 |       "exact_match,none": 0.007751937984496124,
  89 |       "exact_match_stderr,none": 0.0029201960269643937
  90 |     },
  91 |     "hendrycks_math_num_theory": {
  92 |       "alias": " - hendrycks_math_num_theory",
  93 |       "exact_match,none": 0.014814814814814815,
  94 |       "exact_match_stderr,none": 0.005203704987512652
  95 |     },
  96 |     "hendrycks_math_prealgebra": {
  97 |       "alias": " - hendrycks_math_prealgebra",
  98 |       "exact_match,none": 0.027554535017221583,
  99 |       "exact_match_stderr,none": 0.005549700480393211
 100 |     },
 101 |     "hendrycks_math_precalc": {
 102 |       "alias": " - hendrycks_math_precalc",
 103 |       "exact_match,none": 0.01282051282051282,
 104 |       "exact_match_stderr,none": 0.004818950982487616
 105 |     },
 106 |     "ifeval": {
 107 |       "alias": "ifeval",
 108 |       "prompt_level_strict_acc,none": 0.06839186691312385,
 109 |       "prompt_level_strict_acc_stderr,none": 0.010862304803962516,
 110 |       "inst_level_strict_acc,none": 0.09712230215827339,
 111 |       "inst_level_strict_acc_stderr,none": "N/A",
 112 |       "prompt_level_loose_acc,none": 0.08687615526802218,
 113 |       "prompt_level_loose_acc_stderr,none": 0.012120436438929415,
 114 |       "inst_level_loose_acc,none": 0.11151079136690648,
 115 |       "inst_level_loose_acc_stderr,none": "N/A"
 116 |     },
 117 |     "m_mmlu_de": {
 118 |       "alias": "m_mmlu_de",
 119 |       "acc,none": 0.4712626338814301,
 120 |       "acc_stderr,none": 0.004335397038388265
 121 |     },
 122 |     "m_mmlu_es": {
 123 |       "alias": "m_mmlu_es",
 124 |       "acc,none": 0.4887505624718764,
 125 |       "acc_stderr,none": 0.0043290850402920265
 126 |     },
 127 |     "m_mmlu_fr": {
 128 |       "alias": "m_mmlu_fr",
 129 |       "acc,none": 0.47979527919945003,
 130 |       "acc_stderr,none": 0.0043666190641897815
 131 |     },
 132 |     "m_mmlu_ru": {
 133 |       "alias": "m_mmlu_ru",
 134 |       "acc,none": 0.4423771815176443,
 135 |       "acc_stderr,none": 0.0043550661143529155
 136 |     }
 137 |   },
 138 |   "groups": {
 139 |     "hendrycks_math": {
 140 |       "exact_match,none": 0.0172,
 141 |       "exact_match_stderr,none": 0.0018375943530154295,
 142 |       "alias": "hendrycks_math"
 143 |     }
 144 |   },
 145 |   "group_subtasks": {
 146 |     "arc_de": [],
 147 |     "arc_es": [],
 148 |     "arc_fr": [],
 149 |     "arc_ru": [],
 150 |     "gsm8k_cot": [],
 151 |     "hellaswag_de": [],
 152 |     "hellaswag_es": [],
 153 |     "hellaswag_fr": [],
 154 |     "hellaswag_ru": [],
 155 |     "hendrycks_math": [
 156 |       "hendrycks_math_algebra",
 157 |       "hendrycks_math_counting_and_prob",
 158 |       "hendrycks_math_geometry",
 159 |       "hendrycks_math_intermediate_algebra",
 160 |       "hendrycks_math_num_theory",
 161 |       "hendrycks_math_prealgebra",
 162 |       "hendrycks_math_precalc"
 163 |     ],
 164 |     "ifeval": [],
 165 |     "m_mmlu_de": [],
 166 |     "m_mmlu_es": [],
 167 |     "m_mmlu_fr": [],
 168 |     "m_mmlu_ru": []
 169 |   },
 170 |   "configs": {
 171 |     "arc_de": {
 172 |       "task": "arc_de",
 173 |       "tag": [
 174 |         "arc_multilingual"
 175 |       ],
 176 |       "dataset_path": "alexandrainst/m_arc",
 177 |       "dataset_name": "de",
 178 |       "training_split": "train",
 179 |       "validation_split": "validation",
 180 |       "test_split": "test",
 181 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        # breakpoint()\n        out_doc = {\n            \"id\": doc[\"id\"],\n            \"query\": \"Question: \" + preprocess(doc[\"instruction\"]) + \"\\nAnswer:\",\n            \"choices\": [\n                preprocess(option)\n                for option in [\n                    doc[\"option_a\"],\n                    doc[\"option_b\"],\n                    doc[\"option_c\"],\n                    doc[\"option_d\"],\n                    doc[\"option_e\"],\n                ]\n                if option\n            ],\n            \"gold\": [\"A\", \"B\", \"C\", \"D\", \"E\"].index(doc[\"answer\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 182 |       "doc_to_text": "query",
 183 |       "doc_to_target": "gold",
 184 |       "unsafe_code": false,
 185 |       "doc_to_choice": "choices",
 186 |       "description": "",
 187 |       "target_delimiter": " ",
 188 |       "fewshot_delimiter": "\n\n",
 189 |       "num_fewshot": 0,
 190 |       "metric_list": [
 191 |         {
 192 |           "metric": "acc",
 193 |           "aggregation": "mean",
 194 |           "higher_is_better": true
 195 |         },
 196 |         {
 197 |           "metric": "acc_norm",
 198 |           "aggregation": "mean",
 199 |           "higher_is_better": true
 200 |         }
 201 |       ],
 202 |       "output_type": "multiple_choice",
 203 |       "repeats": 1,
 204 |       "should_decontaminate": true,
 205 |       "doc_to_decontamination_query": "query",
 206 |       "metadata": {
 207 |         "version": 2.0,
 208 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 209 |       }
 210 |     },
 211 |     "arc_es": {
 212 |       "task": "arc_es",
 213 |       "tag": [
 214 |         "arc_multilingual"
 215 |       ],
 216 |       "dataset_path": "alexandrainst/m_arc",
 217 |       "dataset_name": "es",
 218 |       "training_split": "train",
 219 |       "validation_split": "validation",
 220 |       "test_split": "test",
 221 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        # breakpoint()\n        out_doc = {\n            \"id\": doc[\"id\"],\n            \"query\": \"Question: \" + preprocess(doc[\"instruction\"]) + \"\\nAnswer:\",\n            \"choices\": [\n                preprocess(option)\n                for option in [\n                    doc[\"option_a\"],\n                    doc[\"option_b\"],\n                    doc[\"option_c\"],\n                    doc[\"option_d\"],\n                    doc[\"option_e\"],\n                ]\n                if option\n            ],\n            \"gold\": [\"A\", \"B\", \"C\", \"D\", \"E\"].index(doc[\"answer\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 222 |       "doc_to_text": "query",
 223 |       "doc_to_target": "gold",
 224 |       "unsafe_code": false,
 225 |       "doc_to_choice": "choices",
 226 |       "description": "",
 227 |       "target_delimiter": " ",
 228 |       "fewshot_delimiter": "\n\n",
 229 |       "num_fewshot": 0,
 230 |       "metric_list": [
 231 |         {
 232 |           "metric": "acc",
 233 |           "aggregation": "mean",
 234 |           "higher_is_better": true
 235 |         },
 236 |         {
 237 |           "metric": "acc_norm",
 238 |           "aggregation": "mean",
 239 |           "higher_is_better": true
 240 |         }
 241 |       ],
 242 |       "output_type": "multiple_choice",
 243 |       "repeats": 1,
 244 |       "should_decontaminate": true,
 245 |       "doc_to_decontamination_query": "query",
 246 |       "metadata": {
 247 |         "version": 2.0,
 248 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 249 |       }
 250 |     },
 251 |     "arc_fr": {
 252 |       "task": "arc_fr",
 253 |       "tag": [
 254 |         "arc_multilingual"
 255 |       ],
 256 |       "dataset_path": "alexandrainst/m_arc",
 257 |       "dataset_name": "fr",
 258 |       "training_split": "train",
 259 |       "validation_split": "validation",
 260 |       "test_split": "test",
 261 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        # breakpoint()\n        out_doc = {\n            \"id\": doc[\"id\"],\n            \"query\": \"Question: \" + preprocess(doc[\"instruction\"]) + \"\\nAnswer:\",\n            \"choices\": [\n                preprocess(option)\n                for option in [\n                    doc[\"option_a\"],\n                    doc[\"option_b\"],\n                    doc[\"option_c\"],\n                    doc[\"option_d\"],\n                    doc[\"option_e\"],\n                ]\n                if option\n            ],\n            \"gold\": [\"A\", \"B\", \"C\", \"D\", \"E\"].index(doc[\"answer\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 262 |       "doc_to_text": "query",
 263 |       "doc_to_target": "gold",
 264 |       "unsafe_code": false,
 265 |       "doc_to_choice": "choices",
 266 |       "description": "",
 267 |       "target_delimiter": " ",
 268 |       "fewshot_delimiter": "\n\n",
 269 |       "num_fewshot": 0,
 270 |       "metric_list": [
 271 |         {
 272 |           "metric": "acc",
 273 |           "aggregation": "mean",
 274 |           "higher_is_better": true
 275 |         },
 276 |         {
 277 |           "metric": "acc_norm",
 278 |           "aggregation": "mean",
 279 |           "higher_is_better": true
 280 |         }
 281 |       ],
 282 |       "output_type": "multiple_choice",
 283 |       "repeats": 1,
 284 |       "should_decontaminate": true,
 285 |       "doc_to_decontamination_query": "query",
 286 |       "metadata": {
 287 |         "version": 2.0,
 288 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 289 |       }
 290 |     },
 291 |     "arc_ru": {
 292 |       "task": "arc_ru",
 293 |       "tag": [
 294 |         "arc_multilingual"
 295 |       ],
 296 |       "dataset_path": "alexandrainst/m_arc",
 297 |       "dataset_name": "ru",
 298 |       "training_split": "train",
 299 |       "validation_split": "validation",
 300 |       "test_split": "test",
 301 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        # breakpoint()\n        out_doc = {\n            \"id\": doc[\"id\"],\n            \"query\": \"Question: \" + preprocess(doc[\"instruction\"]) + \"\\nAnswer:\",\n            \"choices\": [\n                preprocess(option)\n                for option in [\n                    doc[\"option_a\"],\n                    doc[\"option_b\"],\n                    doc[\"option_c\"],\n                    doc[\"option_d\"],\n                    doc[\"option_e\"],\n                ]\n                if option\n            ],\n            \"gold\": [\"A\", \"B\", \"C\", \"D\", \"E\"].index(doc[\"answer\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 302 |       "doc_to_text": "query",
 303 |       "doc_to_target": "gold",
 304 |       "unsafe_code": false,
 305 |       "doc_to_choice": "choices",
 306 |       "description": "",
 307 |       "target_delimiter": " ",
 308 |       "fewshot_delimiter": "\n\n",
 309 |       "num_fewshot": 0,
 310 |       "metric_list": [
 311 |         {
 312 |           "metric": "acc",
 313 |           "aggregation": "mean",
 314 |           "higher_is_better": true
 315 |         },
 316 |         {
 317 |           "metric": "acc_norm",
 318 |           "aggregation": "mean",
 319 |           "higher_is_better": true
 320 |         }
 321 |       ],
 322 |       "output_type": "multiple_choice",
 323 |       "repeats": 1,
 324 |       "should_decontaminate": true,
 325 |       "doc_to_decontamination_query": "query",
 326 |       "metadata": {
 327 |         "version": 2.0,
 328 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 329 |       }
 330 |     },
 331 |     "gsm8k_cot": {
 332 |       "task": "gsm8k_cot",
 333 |       "tag": [
 334 |         "chain_of_thought"
 335 |       ],
 336 |       "dataset_path": "gsm8k",
 337 |       "dataset_name": "main",
 338 |       "test_split": "test",
 339 |       "doc_to_text": "Q: {{question}}\nA:",
 340 |       "doc_to_target": "{{answer.split('####')[-1].strip() if answer is defined else target}}",
 341 |       "unsafe_code": false,
 342 |       "description": "",
 343 |       "target_delimiter": " ",
 344 |       "fewshot_delimiter": "\n\n",
 345 |       "fewshot_config": {
 346 |         "sampler": "first_n",
 347 |         "samples": [
 348 |           {
 349 |             "question": "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?",
 350 |             "target": "There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6."
 351 |           },
 352 |           {
 353 |             "question": "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?",
 354 |             "target": "There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5."
 355 |           },
 356 |           {
 357 |             "question": "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?",
 358 |             "target": "Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The answer is 39."
 359 |           },
 360 |           {
 361 |             "question": "Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?",
 362 |             "target": "Jason started with 20 lollipops. Then he had 12 after giving some to Denny. So he gave Denny 20 - 12 = 8. The answer is 8."
 363 |           },
 364 |           {
 365 |             "question": "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?",
 366 |             "target": "Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. 5 + 4 = 9. The answer is 9."
 367 |           },
 368 |           {
 369 |             "question": "There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?",
 370 |             "target": "There were originally 9 computers. For each of 4 days, 5 more computers were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The answer is 29."
 371 |           },
 372 |           {
 373 |             "question": "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?",
 374 |             "target": "Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33."
 375 |           },
 376 |           {
 377 |             "question": "Olivia has $23. She bought five bagels for $3 each. How much money does she have left?",
 378 |             "target": "Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The answer is 8."
 379 |           }
 380 |         ]
 381 |       },
 382 |       "num_fewshot": 8,
 383 |       "metric_list": [
 384 |         {
 385 |           "aggregation": "mean",
 386 |           "higher_is_better": true,
 387 |           "ignore_case": true,
 388 |           "ignore_punctuation": false,
 389 |           "metric": "exact_match",
 390 |           "regexes_to_ignore": [
 391 |             ",",
 392 |             "\\$",
 393 |             "(?s).*#### ",
 394 |             "\\.$"
 395 |           ]
 396 |         }
 397 |       ],
 398 |       "output_type": "generate_until",
 399 |       "generation_kwargs": {
 400 |         "do_sample": false,
 401 |         "until": [
 402 |           "Q:",
 403 |           "</s>",
 404 |           "<|im_end|>"
 405 |         ]
 406 |       },
 407 |       "repeats": 1,
 408 |       "filter_list": [
 409 |         {
 410 |           "filter": [
 411 |             {
 412 |               "function": "regex",
 413 |               "regex_pattern": "The answer is (\\-?[0-9\\.\\,]+)."
 414 |             },
 415 |             {
 416 |               "function": "take_first"
 417 |             }
 418 |           ],
 419 |           "name": "strict-match"
 420 |         },
 421 |         {
 422 |           "filter": [
 423 |             {
 424 |               "function": "regex",
 425 |               "group_select": -1,
 426 |               "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
 427 |             },
 428 |             {
 429 |               "function": "take_first"
 430 |             }
 431 |           ],
 432 |           "name": "flexible-extract"
 433 |         }
 434 |       ],
 435 |       "should_decontaminate": false,
 436 |       "metadata": {
 437 |         "version": 3.0,
 438 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 439 |       }
 440 |     },
 441 |     "hellaswag_de": {
 442 |       "task": "hellaswag_de",
 443 |       "tag": [
 444 |         "hellaswag_multilingual"
 445 |       ],
 446 |       "dataset_path": "alexandrainst/m_hellaswag",
 447 |       "dataset_name": "de",
 448 |       "validation_split": "val",
 449 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 450 |       "doc_to_text": "query",
 451 |       "doc_to_target": "{{label.lstrip()}}",
 452 |       "unsafe_code": false,
 453 |       "doc_to_choice": "choices",
 454 |       "description": "",
 455 |       "target_delimiter": " ",
 456 |       "fewshot_delimiter": "\n\n",
 457 |       "num_fewshot": 0,
 458 |       "metric_list": [
 459 |         {
 460 |           "metric": "acc",
 461 |           "aggregation": "mean",
 462 |           "higher_is_better": true
 463 |         },
 464 |         {
 465 |           "metric": "acc_norm",
 466 |           "aggregation": "mean",
 467 |           "higher_is_better": true
 468 |         }
 469 |       ],
 470 |       "output_type": "multiple_choice",
 471 |       "repeats": 1,
 472 |       "should_decontaminate": false,
 473 |       "metadata": {
 474 |         "version": 1.0,
 475 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 476 |       }
 477 |     },
 478 |     "hellaswag_es": {
 479 |       "task": "hellaswag_es",
 480 |       "tag": [
 481 |         "hellaswag_multilingual"
 482 |       ],
 483 |       "dataset_path": "alexandrainst/m_hellaswag",
 484 |       "dataset_name": "es",
 485 |       "validation_split": "val",
 486 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 487 |       "doc_to_text": "query",
 488 |       "doc_to_target": "{{label.lstrip()}}",
 489 |       "unsafe_code": false,
 490 |       "doc_to_choice": "choices",
 491 |       "description": "",
 492 |       "target_delimiter": " ",
 493 |       "fewshot_delimiter": "\n\n",
 494 |       "num_fewshot": 0,
 495 |       "metric_list": [
 496 |         {
 497 |           "metric": "acc",
 498 |           "aggregation": "mean",
 499 |           "higher_is_better": true
 500 |         },
 501 |         {
 502 |           "metric": "acc_norm",
 503 |           "aggregation": "mean",
 504 |           "higher_is_better": true
 505 |         }
 506 |       ],
 507 |       "output_type": "multiple_choice",
 508 |       "repeats": 1,
 509 |       "should_decontaminate": false,
 510 |       "metadata": {
 511 |         "version": 1.0,
 512 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 513 |       }
 514 |     },
 515 |     "hellaswag_fr": {
 516 |       "task": "hellaswag_fr",
 517 |       "tag": [
 518 |         "hellaswag_multilingual"
 519 |       ],
 520 |       "dataset_path": "alexandrainst/m_hellaswag",
 521 |       "dataset_name": "fr",
 522 |       "validation_split": "val",
 523 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 524 |       "doc_to_text": "query",
 525 |       "doc_to_target": "{{label.lstrip()}}",
 526 |       "unsafe_code": false,
 527 |       "doc_to_choice": "choices",
 528 |       "description": "",
 529 |       "target_delimiter": " ",
 530 |       "fewshot_delimiter": "\n\n",
 531 |       "num_fewshot": 0,
 532 |       "metric_list": [
 533 |         {
 534 |           "metric": "acc",
 535 |           "aggregation": "mean",
 536 |           "higher_is_better": true
 537 |         },
 538 |         {
 539 |           "metric": "acc_norm",
 540 |           "aggregation": "mean",
 541 |           "higher_is_better": true
 542 |         }
 543 |       ],
 544 |       "output_type": "multiple_choice",
 545 |       "repeats": 1,
 546 |       "should_decontaminate": false,
 547 |       "metadata": {
 548 |         "version": 1.0,
 549 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 550 |       }
 551 |     },
 552 |     "hellaswag_ru": {
 553 |       "task": "hellaswag_ru",
 554 |       "tag": [
 555 |         "hellaswag_multilingual"
 556 |       ],
 557 |       "dataset_path": "alexandrainst/m_hellaswag",
 558 |       "dataset_name": "ru",
 559 |       "validation_split": "val",
 560 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 561 |       "doc_to_text": "query",
 562 |       "doc_to_target": "{{label.lstrip()}}",
 563 |       "unsafe_code": false,
 564 |       "doc_to_choice": "choices",
 565 |       "description": "",
 566 |       "target_delimiter": " ",
 567 |       "fewshot_delimiter": "\n\n",
 568 |       "num_fewshot": 0,
 569 |       "metric_list": [
 570 |         {
 571 |           "metric": "acc",
 572 |           "aggregation": "mean",
 573 |           "higher_is_better": true
 574 |         },
 575 |         {
 576 |           "metric": "acc_norm",
 577 |           "aggregation": "mean",
 578 |           "higher_is_better": true
 579 |         }
 580 |       ],
 581 |       "output_type": "multiple_choice",
 582 |       "repeats": 1,
 583 |       "should_decontaminate": false,
 584 |       "metadata": {
 585 |         "version": 1.0,
 586 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 587 |       }
 588 |     },
 589 |     "hendrycks_math_algebra": {
 590 |       "task": "hendrycks_math_algebra",
 591 |       "tag": [
 592 |         "math_word_problems"
 593 |       ],
 594 |       "dataset_path": "EleutherAI/hendrycks_math",
 595 |       "dataset_name": "algebra",
 596 |       "training_split": "train",
 597 |       "test_split": "test",
 598 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc: dict) -> dict:\n        out_doc = {\n            \"problem\": doc[\"problem\"],\n            \"solution\": doc[\"solution\"],\n            \"answer\": remove_boxed(last_boxed_only_string(doc[\"solution\"])),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 599 |       "doc_to_text": "Problem: {{problem}}\nAnswer:",
 600 |       "doc_to_target": "{{answer}}",
 601 |       "unsafe_code": false,
 602 |       "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n    retval = 0\n    indices = [pos for pos, char in enumerate(results[0]) if char == \"$\"]\n    if len(indices) <= 1:\n        answer = results[0]\n    else:\n        answer = results[0][indices[0] + 1 : indices[-1]]\n\n    if is_equiv(answer, remove_boxed(last_boxed_only_string(doc[\"solution\"]))):\n        retval = 1\n\n    results = {\n        \"exact_match\": retval,\n    }\n    return results\n",
 603 |       "description": "",
 604 |       "target_delimiter": " ",
 605 |       "fewshot_delimiter": "\n\n",
 606 |       "num_fewshot": 0,
 607 |       "metric_list": [
 608 |         {
 609 |           "metric": "exact_match",
 610 |           "aggregation": "mean",
 611 |           "higher_is_better": true
 612 |         }
 613 |       ],
 614 |       "output_type": "generate_until",
 615 |       "generation_kwargs": {
 616 |         "until": [
 617 |           "Problem:"
 618 |         ],
 619 |         "do_sample": false,
 620 |         "temperature": 0.0
 621 |       },
 622 |       "repeats": 1,
 623 |       "should_decontaminate": false,
 624 |       "metadata": {
 625 |         "version": 1.0,
 626 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 627 |       }
 628 |     },
 629 |     "hendrycks_math_counting_and_prob": {
 630 |       "task": "hendrycks_math_counting_and_prob",
 631 |       "tag": [
 632 |         "math_word_problems"
 633 |       ],
 634 |       "dataset_path": "EleutherAI/hendrycks_math",
 635 |       "dataset_name": "counting_and_probability",
 636 |       "training_split": "train",
 637 |       "test_split": "test",
 638 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc: dict) -> dict:\n        out_doc = {\n            \"problem\": doc[\"problem\"],\n            \"solution\": doc[\"solution\"],\n            \"answer\": remove_boxed(last_boxed_only_string(doc[\"solution\"])),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 639 |       "doc_to_text": "Problem: {{problem}}\nAnswer:",
 640 |       "doc_to_target": "{{answer}}",
 641 |       "unsafe_code": false,
 642 |       "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n    retval = 0\n    indices = [pos for pos, char in enumerate(results[0]) if char == \"$\"]\n    if len(indices) <= 1:\n        answer = results[0]\n    else:\n        answer = results[0][indices[0] + 1 : indices[-1]]\n\n    if is_equiv(answer, remove_boxed(last_boxed_only_string(doc[\"solution\"]))):\n        retval = 1\n\n    results = {\n        \"exact_match\": retval,\n    }\n    return results\n",
 643 |       "description": "",
 644 |       "target_delimiter": " ",
 645 |       "fewshot_delimiter": "\n\n",
 646 |       "num_fewshot": 0,
 647 |       "metric_list": [
 648 |         {
 649 |           "metric": "exact_match",
 650 |           "aggregation": "mean",
 651 |           "higher_is_better": true
 652 |         }
 653 |       ],
 654 |       "output_type": "generate_until",
 655 |       "generation_kwargs": {
 656 |         "until": [
 657 |           "Problem:"
 658 |         ],
 659 |         "do_sample": false,
 660 |         "temperature": 0.0
 661 |       },
 662 |       "repeats": 1,
 663 |       "should_decontaminate": false,
 664 |       "metadata": {
 665 |         "version": 1.0,
 666 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 667 |       }
 668 |     },
 669 |     "hendrycks_math_geometry": {
 670 |       "task": "hendrycks_math_geometry",
 671 |       "tag": [
 672 |         "math_word_problems"
 673 |       ],
 674 |       "dataset_path": "EleutherAI/hendrycks_math",
 675 |       "dataset_name": "geometry",
 676 |       "training_split": "train",
 677 |       "test_split": "test",
 678 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc: dict) -> dict:\n        out_doc = {\n            \"problem\": doc[\"problem\"],\n            \"solution\": doc[\"solution\"],\n            \"answer\": remove_boxed(last_boxed_only_string(doc[\"solution\"])),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 679 |       "doc_to_text": "Problem: {{problem}}\nAnswer:",
 680 |       "doc_to_target": "{{answer}}",
 681 |       "unsafe_code": false,
 682 |       "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n    retval = 0\n    indices = [pos for pos, char in enumerate(results[0]) if char == \"$\"]\n    if len(indices) <= 1:\n        answer = results[0]\n    else:\n        answer = results[0][indices[0] + 1 : indices[-1]]\n\n    if is_equiv(answer, remove_boxed(last_boxed_only_string(doc[\"solution\"]))):\n        retval = 1\n\n    results = {\n        \"exact_match\": retval,\n    }\n    return results\n",
 683 |       "description": "",
 684 |       "target_delimiter": " ",
 685 |       "fewshot_delimiter": "\n\n",
 686 |       "num_fewshot": 0,
 687 |       "metric_list": [
 688 |         {
 689 |           "metric": "exact_match",
 690 |           "aggregation": "mean",
 691 |           "higher_is_better": true
 692 |         }
 693 |       ],
 694 |       "output_type": "generate_until",
 695 |       "generation_kwargs": {
 696 |         "until": [
 697 |           "Problem:"
 698 |         ],
 699 |         "do_sample": false,
 700 |         "temperature": 0.0
 701 |       },
 702 |       "repeats": 1,
 703 |       "should_decontaminate": false,
 704 |       "metadata": {
 705 |         "version": 1.0,
 706 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 707 |       }
 708 |     },
 709 |     "hendrycks_math_intermediate_algebra": {
 710 |       "task": "hendrycks_math_intermediate_algebra",
 711 |       "tag": [
 712 |         "math_word_problems"
 713 |       ],
 714 |       "dataset_path": "EleutherAI/hendrycks_math",
 715 |       "dataset_name": "intermediate_algebra",
 716 |       "training_split": "train",
 717 |       "test_split": "test",
 718 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc: dict) -> dict:\n        out_doc = {\n            \"problem\": doc[\"problem\"],\n            \"solution\": doc[\"solution\"],\n            \"answer\": remove_boxed(last_boxed_only_string(doc[\"solution\"])),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 719 |       "doc_to_text": "Problem: {{problem}}\nAnswer:",
 720 |       "doc_to_target": "{{answer}}",
 721 |       "unsafe_code": false,
 722 |       "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n    retval = 0\n    indices = [pos for pos, char in enumerate(results[0]) if char == \"$\"]\n    if len(indices) <= 1:\n        answer = results[0]\n    else:\n        answer = results[0][indices[0] + 1 : indices[-1]]\n\n    if is_equiv(answer, remove_boxed(last_boxed_only_string(doc[\"solution\"]))):\n        retval = 1\n\n    results = {\n        \"exact_match\": retval,\n    }\n    return results\n",
 723 |       "description": "",
 724 |       "target_delimiter": " ",
 725 |       "fewshot_delimiter": "\n\n",
 726 |       "num_fewshot": 0,
 727 |       "metric_list": [
 728 |         {
 729 |           "metric": "exact_match",
 730 |           "aggregation": "mean",
 731 |           "higher_is_better": true
 732 |         }
 733 |       ],
 734 |       "output_type": "generate_until",
 735 |       "generation_kwargs": {
 736 |         "until": [
 737 |           "Problem:"
 738 |         ],
 739 |         "do_sample": false,
 740 |         "temperature": 0.0
 741 |       },
 742 |       "repeats": 1,
 743 |       "should_decontaminate": false,
 744 |       "metadata": {
 745 |         "version": 1.0,
 746 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 747 |       }
 748 |     },
 749 |     "hendrycks_math_num_theory": {
 750 |       "task": "hendrycks_math_num_theory",
 751 |       "tag": [
 752 |         "math_word_problems"
 753 |       ],
 754 |       "dataset_path": "EleutherAI/hendrycks_math",
 755 |       "dataset_name": "number_theory",
 756 |       "training_split": "train",
 757 |       "test_split": "test",
 758 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc: dict) -> dict:\n        out_doc = {\n            \"problem\": doc[\"problem\"],\n            \"solution\": doc[\"solution\"],\n            \"answer\": remove_boxed(last_boxed_only_string(doc[\"solution\"])),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 759 |       "doc_to_text": "Problem: {{problem}}\nAnswer:",
 760 |       "doc_to_target": "{{answer}}",
 761 |       "unsafe_code": false,
 762 |       "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n    retval = 0\n    indices = [pos for pos, char in enumerate(results[0]) if char == \"$\"]\n    if len(indices) <= 1:\n        answer = results[0]\n    else:\n        answer = results[0][indices[0] + 1 : indices[-1]]\n\n    if is_equiv(answer, remove_boxed(last_boxed_only_string(doc[\"solution\"]))):\n        retval = 1\n\n    results = {\n        \"exact_match\": retval,\n    }\n    return results\n",
 763 |       "description": "",
 764 |       "target_delimiter": " ",
 765 |       "fewshot_delimiter": "\n\n",
 766 |       "num_fewshot": 0,
 767 |       "metric_list": [
 768 |         {
 769 |           "metric": "exact_match",
 770 |           "aggregation": "mean",
 771 |           "higher_is_better": true
 772 |         }
 773 |       ],
 774 |       "output_type": "generate_until",
 775 |       "generation_kwargs": {
 776 |         "until": [
 777 |           "Problem:"
 778 |         ],
 779 |         "do_sample": false,
 780 |         "temperature": 0.0
 781 |       },
 782 |       "repeats": 1,
 783 |       "should_decontaminate": false,
 784 |       "metadata": {
 785 |         "version": 1.0,
 786 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 787 |       }
 788 |     },
 789 |     "hendrycks_math_prealgebra": {
 790 |       "task": "hendrycks_math_prealgebra",
 791 |       "tag": [
 792 |         "math_word_problems"
 793 |       ],
 794 |       "dataset_path": "EleutherAI/hendrycks_math",
 795 |       "dataset_name": "prealgebra",
 796 |       "training_split": "train",
 797 |       "test_split": "test",
 798 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc: dict) -> dict:\n        out_doc = {\n            \"problem\": doc[\"problem\"],\n            \"solution\": doc[\"solution\"],\n            \"answer\": remove_boxed(last_boxed_only_string(doc[\"solution\"])),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 799 |       "doc_to_text": "Problem: {{problem}}\nAnswer:",
 800 |       "doc_to_target": "{{answer}}",
 801 |       "unsafe_code": false,
 802 |       "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n    retval = 0\n    indices = [pos for pos, char in enumerate(results[0]) if char == \"$\"]\n    if len(indices) <= 1:\n        answer = results[0]\n    else:\n        answer = results[0][indices[0] + 1 : indices[-1]]\n\n    if is_equiv(answer, remove_boxed(last_boxed_only_string(doc[\"solution\"]))):\n        retval = 1\n\n    results = {\n        \"exact_match\": retval,\n    }\n    return results\n",
 803 |       "description": "",
 804 |       "target_delimiter": " ",
 805 |       "fewshot_delimiter": "\n\n",
 806 |       "num_fewshot": 0,
 807 |       "metric_list": [
 808 |         {
 809 |           "metric": "exact_match",
 810 |           "aggregation": "mean",
 811 |           "higher_is_better": true
 812 |         }
 813 |       ],
 814 |       "output_type": "generate_until",
 815 |       "generation_kwargs": {
 816 |         "until": [
 817 |           "Problem:"
 818 |         ],
 819 |         "do_sample": false,
 820 |         "temperature": 0.0
 821 |       },
 822 |       "repeats": 1,
 823 |       "should_decontaminate": false,
 824 |       "metadata": {
 825 |         "version": 1.0,
 826 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 827 |       }
 828 |     },
 829 |     "hendrycks_math_precalc": {
 830 |       "task": "hendrycks_math_precalc",
 831 |       "tag": [
 832 |         "math_word_problems"
 833 |       ],
 834 |       "dataset_path": "EleutherAI/hendrycks_math",
 835 |       "dataset_name": "precalculus",
 836 |       "training_split": "train",
 837 |       "test_split": "test",
 838 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc: dict) -> dict:\n        out_doc = {\n            \"problem\": doc[\"problem\"],\n            \"solution\": doc[\"solution\"],\n            \"answer\": remove_boxed(last_boxed_only_string(doc[\"solution\"])),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
 839 |       "doc_to_text": "Problem: {{problem}}\nAnswer:",
 840 |       "doc_to_target": "{{answer}}",
 841 |       "unsafe_code": false,
 842 |       "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n    retval = 0\n    indices = [pos for pos, char in enumerate(results[0]) if char == \"$\"]\n    if len(indices) <= 1:\n        answer = results[0]\n    else:\n        answer = results[0][indices[0] + 1 : indices[-1]]\n\n    if is_equiv(answer, remove_boxed(last_boxed_only_string(doc[\"solution\"]))):\n        retval = 1\n\n    results = {\n        \"exact_match\": retval,\n    }\n    return results\n",
 843 |       "description": "",
 844 |       "target_delimiter": " ",
 845 |       "fewshot_delimiter": "\n\n",
 846 |       "num_fewshot": 0,
 847 |       "metric_list": [
 848 |         {
 849 |           "metric": "exact_match",
 850 |           "aggregation": "mean",
 851 |           "higher_is_better": true
 852 |         }
 853 |       ],
 854 |       "output_type": "generate_until",
 855 |       "generation_kwargs": {
 856 |         "until": [
 857 |           "Problem:"
 858 |         ],
 859 |         "do_sample": false,
 860 |         "temperature": 0.0
 861 |       },
 862 |       "repeats": 1,
 863 |       "should_decontaminate": false,
 864 |       "metadata": {
 865 |         "version": 1.0,
 866 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 867 |       }
 868 |     },
 869 |     "ifeval": {
 870 |       "task": "ifeval",
 871 |       "dataset_path": "google/IFEval",
 872 |       "test_split": "train",
 873 |       "doc_to_text": "prompt",
 874 |       "doc_to_target": 0,
 875 |       "unsafe_code": false,
 876 |       "process_results": "def process_results(doc, results):\n    inp = InputExample(\n        key=doc[\"key\"],\n        instruction_id_list=doc[\"instruction_id_list\"],\n        prompt=doc[\"prompt\"],\n        kwargs=doc[\"kwargs\"],\n    )\n    response = results[0]\n\n    out_strict = test_instruction_following_strict(inp, response)\n    out_loose = test_instruction_following_loose(inp, response)\n\n    return {\n        \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n        \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n        \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n        \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n    }\n",
 877 |       "description": "",
 878 |       "target_delimiter": " ",
 879 |       "fewshot_delimiter": "\n\n",
 880 |       "num_fewshot": 0,
 881 |       "metric_list": [
 882 |         {
 883 |           "metric": "prompt_level_strict_acc",
 884 |           "aggregation": "mean",
 885 |           "higher_is_better": true
 886 |         },
 887 |         {
 888 |           "metric": "inst_level_strict_acc",
 889 |           "aggregation": "def agg_inst_level_acc(items):\n    flat_items = [item for sublist in items for item in sublist]\n    inst_level_acc = sum(flat_items) / len(flat_items)\n    return inst_level_acc\n",
 890 |           "higher_is_better": true
 891 |         },
 892 |         {
 893 |           "metric": "prompt_level_loose_acc",
 894 |           "aggregation": "mean",
 895 |           "higher_is_better": true
 896 |         },
 897 |         {
 898 |           "metric": "inst_level_loose_acc",
 899 |           "aggregation": "def agg_inst_level_acc(items):\n    flat_items = [item for sublist in items for item in sublist]\n    inst_level_acc = sum(flat_items) / len(flat_items)\n    return inst_level_acc\n",
 900 |           "higher_is_better": true
 901 |         }
 902 |       ],
 903 |       "output_type": "generate_until",
 904 |       "generation_kwargs": {
 905 |         "until": [],
 906 |         "do_sample": false,
 907 |         "temperature": 0.0,
 908 |         "max_gen_toks": 1280
 909 |       },
 910 |       "repeats": 1,
 911 |       "should_decontaminate": false,
 912 |       "metadata": {
 913 |         "version": 4.0,
 914 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 915 |       }
 916 |     },
 917 |     "m_mmlu_de": {
 918 |       "task": "m_mmlu_de",
 919 |       "tag": [
 920 |         "m_mmlu"
 921 |       ],
 922 |       "dataset_path": "alexandrainst/m_mmlu",
 923 |       "dataset_name": "de",
 924 |       "test_split": "test",
 925 |       "fewshot_split": "train",
 926 |       "doc_to_text": "{{instruction.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:",
 927 |       "doc_to_target": "answer",
 928 |       "unsafe_code": false,
 929 |       "doc_to_choice": [
 930 |         "A",
 931 |         "B",
 932 |         "C",
 933 |         "D"
 934 |       ],
 935 |       "description": "",
 936 |       "target_delimiter": " ",
 937 |       "fewshot_delimiter": "\n\n",
 938 |       "fewshot_config": {
 939 |         "sampler": "first_n"
 940 |       },
 941 |       "num_fewshot": 0,
 942 |       "metric_list": [
 943 |         {
 944 |           "metric": "acc",
 945 |           "aggregation": "mean",
 946 |           "higher_is_better": true
 947 |         }
 948 |       ],
 949 |       "output_type": "multiple_choice",
 950 |       "repeats": 1,
 951 |       "should_decontaminate": false,
 952 |       "metadata": {
 953 |         "version": 0.0,
 954 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 955 |       }
 956 |     },
 957 |     "m_mmlu_es": {
 958 |       "task": "m_mmlu_es",
 959 |       "tag": [
 960 |         "m_mmlu"
 961 |       ],
 962 |       "dataset_path": "alexandrainst/m_mmlu",
 963 |       "dataset_name": "es",
 964 |       "test_split": "test",
 965 |       "fewshot_split": "train",
 966 |       "doc_to_text": "{{instruction.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:",
 967 |       "doc_to_target": "answer",
 968 |       "unsafe_code": false,
 969 |       "doc_to_choice": [
 970 |         "A",
 971 |         "B",
 972 |         "C",
 973 |         "D"
 974 |       ],
 975 |       "description": "",
 976 |       "target_delimiter": " ",
 977 |       "fewshot_delimiter": "\n\n",
 978 |       "fewshot_config": {
 979 |         "sampler": "first_n"
 980 |       },
 981 |       "num_fewshot": 0,
 982 |       "metric_list": [
 983 |         {
 984 |           "metric": "acc",
 985 |           "aggregation": "mean",
 986 |           "higher_is_better": true
 987 |         }
 988 |       ],
 989 |       "output_type": "multiple_choice",
 990 |       "repeats": 1,
 991 |       "should_decontaminate": false,
 992 |       "metadata": {
 993 |         "version": 0.0,
 994 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
 995 |       }
 996 |     },
 997 |     "m_mmlu_fr": {
 998 |       "task": "m_mmlu_fr",
 999 |       "tag": [
1000 |         "m_mmlu"
1001 |       ],
1002 |       "dataset_path": "alexandrainst/m_mmlu",
1003 |       "dataset_name": "fr",
1004 |       "test_split": "test",
1005 |       "fewshot_split": "train",
1006 |       "doc_to_text": "{{instruction.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:",
1007 |       "doc_to_target": "answer",
1008 |       "unsafe_code": false,
1009 |       "doc_to_choice": [
1010 |         "A",
1011 |         "B",
1012 |         "C",
1013 |         "D"
1014 |       ],
1015 |       "description": "",
1016 |       "target_delimiter": " ",
1017 |       "fewshot_delimiter": "\n\n",
1018 |       "fewshot_config": {
1019 |         "sampler": "first_n"
1020 |       },
1021 |       "num_fewshot": 0,
1022 |       "metric_list": [
1023 |         {
1024 |           "metric": "acc",
1025 |           "aggregation": "mean",
1026 |           "higher_is_better": true
1027 |         }
1028 |       ],
1029 |       "output_type": "multiple_choice",
1030 |       "repeats": 1,
1031 |       "should_decontaminate": false,
1032 |       "metadata": {
1033 |         "version": 0.0,
1034 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
1035 |       }
1036 |     },
1037 |     "m_mmlu_ru": {
1038 |       "task": "m_mmlu_ru",
1039 |       "tag": [
1040 |         "m_mmlu"
1041 |       ],
1042 |       "dataset_path": "alexandrainst/m_mmlu",
1043 |       "dataset_name": "ru",
1044 |       "test_split": "test",
1045 |       "fewshot_split": "train",
1046 |       "doc_to_text": "{{instruction.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:",
1047 |       "doc_to_target": "answer",
1048 |       "unsafe_code": false,
1049 |       "doc_to_choice": [
1050 |         "A",
1051 |         "B",
1052 |         "C",
1053 |         "D"
1054 |       ],
1055 |       "description": "",
1056 |       "target_delimiter": " ",
1057 |       "fewshot_delimiter": "\n\n",
1058 |       "fewshot_config": {
1059 |         "sampler": "first_n"
1060 |       },
1061 |       "num_fewshot": 0,
1062 |       "metric_list": [
1063 |         {
1064 |           "metric": "acc",
1065 |           "aggregation": "mean",
1066 |           "higher_is_better": true
1067 |         }
1068 |       ],
1069 |       "output_type": "multiple_choice",
1070 |       "repeats": 1,
1071 |       "should_decontaminate": false,
1072 |       "metadata": {
1073 |         "version": 0.0,
1074 |         "pretrained": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1"
1075 |       }
1076 |     }
1077 |   },
1078 |   "versions": {
1079 |     "arc_de": 2.0,
1080 |     "arc_es": 2.0,
1081 |     "arc_fr": 2.0,
1082 |     "arc_ru": 2.0,
1083 |     "gsm8k_cot": 3.0,
1084 |     "hellaswag_de": 1.0,
1085 |     "hellaswag_es": 1.0,
1086 |     "hellaswag_fr": 1.0,
1087 |     "hellaswag_ru": 1.0,
1088 |     "hendrycks_math": 1.0,
1089 |     "hendrycks_math_algebra": 1.0,
1090 |     "hendrycks_math_counting_and_prob": 1.0,
1091 |     "hendrycks_math_geometry": 1.0,
1092 |     "hendrycks_math_intermediate_algebra": 1.0,
1093 |     "hendrycks_math_num_theory": 1.0,
1094 |     "hendrycks_math_prealgebra": 1.0,
1095 |     "hendrycks_math_precalc": 1.0,
1096 |     "ifeval": 4.0,
1097 |     "m_mmlu_de": 0.0,
1098 |     "m_mmlu_es": 0.0,
1099 |     "m_mmlu_fr": 0.0,
1100 |     "m_mmlu_ru": 0.0
1101 |   },
1102 |   "n-shot": {
1103 |     "arc_de": 0,
1104 |     "arc_es": 0,
1105 |     "arc_fr": 0,
1106 |     "arc_ru": 0,
1107 |     "gsm8k_cot": 8,
1108 |     "hellaswag_de": 0,
1109 |     "hellaswag_es": 0,
1110 |     "hellaswag_fr": 0,
1111 |     "hellaswag_ru": 0,
1112 |     "hendrycks_math_algebra": 0,
1113 |     "hendrycks_math_counting_and_prob": 0,
1114 |     "hendrycks_math_geometry": 0,
1115 |     "hendrycks_math_intermediate_algebra": 0,
1116 |     "hendrycks_math_num_theory": 0,
1117 |     "hendrycks_math_prealgebra": 0,
1118 |     "hendrycks_math_precalc": 0,
1119 |     "ifeval": 0,
1120 |     "m_mmlu_de": 0,
1121 |     "m_mmlu_es": 0,
1122 |     "m_mmlu_fr": 0,
1123 |     "m_mmlu_ru": 0
1124 |   },
1125 |   "higher_is_better": {
1126 |     "arc_de": {
1127 |       "acc": true,
1128 |       "acc_norm": true
1129 |     },
1130 |     "arc_es": {
1131 |       "acc": true,
1132 |       "acc_norm": true
1133 |     },
1134 |     "arc_fr": {
1135 |       "acc": true,
1136 |       "acc_norm": true
1137 |     },
1138 |     "arc_ru": {
1139 |       "acc": true,
1140 |       "acc_norm": true
1141 |     },
1142 |     "gsm8k_cot": {
1143 |       "exact_match": true
1144 |     },
1145 |     "hellaswag_de": {
1146 |       "acc": true,
1147 |       "acc_norm": true
1148 |     },
1149 |     "hellaswag_es": {
1150 |       "acc": true,
1151 |       "acc_norm": true
1152 |     },
1153 |     "hellaswag_fr": {
1154 |       "acc": true,
1155 |       "acc_norm": true
1156 |     },
1157 |     "hellaswag_ru": {
1158 |       "acc": true,
1159 |       "acc_norm": true
1160 |     },
1161 |     "hendrycks_math": {
1162 |       "exact_match": true
1163 |     },
1164 |     "hendrycks_math_algebra": {
1165 |       "exact_match": true
1166 |     },
1167 |     "hendrycks_math_counting_and_prob": {
1168 |       "exact_match": true
1169 |     },
1170 |     "hendrycks_math_geometry": {
1171 |       "exact_match": true
1172 |     },
1173 |     "hendrycks_math_intermediate_algebra": {
1174 |       "exact_match": true
1175 |     },
1176 |     "hendrycks_math_num_theory": {
1177 |       "exact_match": true
1178 |     },
1179 |     "hendrycks_math_prealgebra": {
1180 |       "exact_match": true
1181 |     },
1182 |     "hendrycks_math_precalc": {
1183 |       "exact_match": true
1184 |     },
1185 |     "ifeval": {
1186 |       "prompt_level_strict_acc": true,
1187 |       "inst_level_strict_acc": true,
1188 |       "prompt_level_loose_acc": true,
1189 |       "inst_level_loose_acc": true
1190 |     },
1191 |     "m_mmlu_de": {
1192 |       "acc": true
1193 |     },
1194 |     "m_mmlu_es": {
1195 |       "acc": true
1196 |     },
1197 |     "m_mmlu_fr": {
1198 |       "acc": true
1199 |     },
1200 |     "m_mmlu_ru": {
1201 |       "acc": true
1202 |     }
1203 |   },
1204 |   "n-samples": {
1205 |     "m_mmlu_ru": {
1206 |       "original": 13007,
1207 |       "effective": 13007
1208 |     },
1209 |     "m_mmlu_fr": {
1210 |       "original": 13091,
1211 |       "effective": 13091
1212 |     },
1213 |     "m_mmlu_es": {
1214 |       "original": 13334,
1215 |       "effective": 13334
1216 |     },
1217 |     "m_mmlu_de": {
1218 |       "original": 13258,
1219 |       "effective": 13258
1220 |     },
1221 |     "ifeval": {
1222 |       "original": 541,
1223 |       "effective": 541
1224 |     },
1225 |     "hendrycks_math_algebra": {
1226 |       "original": 1187,
1227 |       "effective": 1187
1228 |     },
1229 |     "hendrycks_math_counting_and_prob": {
1230 |       "original": 474,
1231 |       "effective": 474
1232 |     },
1233 |     "hendrycks_math_geometry": {
1234 |       "original": 479,
1235 |       "effective": 479
1236 |     },
1237 |     "hendrycks_math_intermediate_algebra": {
1238 |       "original": 903,
1239 |       "effective": 903
1240 |     },
1241 |     "hendrycks_math_num_theory": {
1242 |       "original": 540,
1243 |       "effective": 540
1244 |     },
1245 |     "hendrycks_math_prealgebra": {
1246 |       "original": 871,
1247 |       "effective": 871
1248 |     },
1249 |     "hendrycks_math_precalc": {
1250 |       "original": 546,
1251 |       "effective": 546
1252 |     },
1253 |     "hellaswag_ru": {
1254 |       "original": 9272,
1255 |       "effective": 9272
1256 |     },
1257 |     "hellaswag_fr": {
1258 |       "original": 9338,
1259 |       "effective": 9338
1260 |     },
1261 |     "hellaswag_es": {
1262 |       "original": 9374,
1263 |       "effective": 9374
1264 |     },
1265 |     "hellaswag_de": {
1266 |       "original": 9368,
1267 |       "effective": 9368
1268 |     },
1269 |     "gsm8k_cot": {
1270 |       "original": 1319,
1271 |       "effective": 1319
1272 |     },
1273 |     "arc_ru": {
1274 |       "original": 1169,
1275 |       "effective": 1169
1276 |     },
1277 |     "arc_fr": {
1278 |       "original": 1169,
1279 |       "effective": 1169
1280 |     },
1281 |     "arc_es": {
1282 |       "original": 1170,
1283 |       "effective": 1170
1284 |     },
1285 |     "arc_de": {
1286 |       "original": 1169,
1287 |       "effective": 1169
1288 |     }
1289 |   },
1290 |   "config": {
1291 |     "model": "vllm",
1292 |     "model_args": "pretrained=merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1",
1293 |     "batch_size": "auto",
1294 |     "batch_sizes": [],
1295 |     "device": "cuda:0",
1296 |     "use_cache": null,
1297 |     "limit": null,
1298 |     "bootstrap_iters": 100000,
1299 |     "gen_kwargs": null,
1300 |     "random_seed": 0,
1301 |     "numpy_seed": 1234,
1302 |     "torch_seed": 1234,
1303 |     "fewshot_seed": 1234
1304 |   },
1305 |   "git_hash": "f91dd3c",
1306 |   "date": 1763359638.5471172,
1307 |   "pretty_env_info": "PyTorch version: 2.8.0+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.3 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.16.3\nLibc version: glibc-2.31\n\nPython version: 3.10.9 (main, Mar  8 2023, 10:47:38) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-69-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-PCIE-40GB\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                    x86_64\nCPU op-mode(s):                  32-bit, 64-bit\nByte Order:                      Little Endian\nAddress sizes:                   46 bits physical, 57 bits virtual\nCPU(s):                          52\nOn-line CPU(s) list:             0-51\nThread(s) per core:              1\nCore(s) per socket:              26\nSocket(s):                       2\nNUMA node(s):                    4\nVendor ID:                       GenuineIntel\nCPU family:                      6\nModel:                           106\nModel name:                      Intel(R) Xeon(R) Gold 5320 CPU @ 2.20GHz\nStepping:                        6\nCPU MHz:                         800.000\nCPU max MHz:                     3400.0000\nCPU min MHz:                     800.0000\nBogoMIPS:                        4400.00\nL1d cache:                       2.4 MiB\nL1i cache:                       1.6 MiB\nL2 cache:                        65 MiB\nL3 cache:                        78 MiB\nNUMA node0 CPU(s):               0,4,8,12,16,20,24,28,32,36,40,44,48\nNUMA node1 CPU(s):               2,6,10,14,18,22,26,30,34,38,42,46,50\nNUMA node2 CPU(s):               
1,5,9,13,17,21,25,29,33,37,41,45,49\nNUMA node3 CPU(s):               3,7,11,15,19,23,27,31,35,39,43,47,51\nVulnerability Itlb multihit:     Not affected\nVulnerability L1tf:              Not affected\nVulnerability Mds:               Not affected\nVulnerability Meltdown:          Not affected\nVulnerability Mmio stale data:   Mitigation; Clear CPU buffers; SMT disabled\nVulnerability Retbleed:          Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:        Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:        Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence\nVulnerability Srbds:             Not affected\nVulnerability Tsx async abort:   Not affected\nFlags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 invpcid_single intel_ppin ssbd mba ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local split_lock_detect wbnoinvd dtherm ida arat pln pts avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq la57 rdpid fsrm md_clear pconfig flush_l1d arch_capabilities\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] 
nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.3\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.8.0\n[pip3] torchaudio==2.8.0\n[pip3] torchvision==0.23.0\n[pip3] triton==3.4.0\n[conda] numpy                                2.2.6            pypi_0           pypi\n[conda] nvidia-cublas-cu12                   12.8.4.1         pypi_0           pypi\n[conda] nvidia-cuda-cupti-cu12               12.8.90          pypi_0           pypi\n[conda] nvidia-cuda-nvrtc-cu12               12.8.93          pypi_0           pypi\n[conda] nvidia-cuda-runtime-cu12             12.8.90          pypi_0           pypi\n[conda] nvidia-cudnn-cu12                    9.10.2.21        pypi_0           pypi\n[conda] nvidia-cufft-cu12                    11.3.3.83        pypi_0           pypi\n[conda] nvidia-curand-cu12                   10.3.9.90        pypi_0           pypi\n[conda] nvidia-cusolver-cu12                 11.7.3.90        pypi_0           pypi\n[conda] nvidia-cusparse-cu12                 12.5.8.93        pypi_0           pypi\n[conda] nvidia-cusparselt-cu12               0.7.1            pypi_0           pypi\n[conda] nvidia-nccl-cu12                     2.27.3           pypi_0           pypi\n[conda] nvidia-nvjitlink-cu12                12.8.93          pypi_0           pypi\n[conda] nvidia-nvtx-cu12                     12.8.90          pypi_0           pypi\n[conda] torch                                2.8.0            pypi_0           pypi\n[conda] torchaudio                           2.8.0            pypi_0           pypi\n[conda] torchvision                          0.23.0           pypi_0           pypi\n[conda] triton                               
3.4.0            pypi_0           pypi",
1308 |   "transformers_version": "4.57.1",
1309 |   "lm_eval_version": "0.4.9.1",
1310 |   "upper_git_hash": null,
1311 |   "tokenizer_pad_token": [
1312 |     "<|end_of_text|>",
1313 |     "128001"
1314 |   ],
1315 |   "tokenizer_eos_token": [
1316 |     "<|end_of_text|>",
1317 |     "128001"
1318 |   ],
1319 |   "tokenizer_bos_token": [
1320 |     "<|begin_of_text|>",
1321 |     "128000"
1322 |   ],
1323 |   "eot_token_id": 128001,
1324 |   "max_length": 131072,
1325 |   "task_hashes": {},
1326 |   "model_source": "vllm",
1327 |   "model_name": "merged_models/Llama-3.2-3B_merged/RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1",
1328 |   "model_name_sanitized": "merged_models__Llama-3.2-3B_merged__RegMeanPlusPlus_task_names_DartMath-WildguardMix-MagiCoder-Aya-Tulu3IF_reduction_0.1",
1329 |   "system_instruction": null,
1330 |   "system_instruction_sha": null,
1331 |   "fewshot_as_multiturn": false,
1332 |   "chat_template": null,
1333 |   "chat_template_sha": null,
1334 |   "start_time": 15734951.01637468,
1335 |   "end_time": 15737821.727023767,
1336 |   "total_evaluation_time_seconds": "2870.710649088025"
1337 | }


--------------------------------------------------------------------------------