├── .gitignore ├── README.md ├── eval_layercast.py ├── eval_main.py ├── eval_passk.py ├── evals ├── README.md ├── __init__.py ├── base_instruct_evals.md ├── batch │ ├── __init__.py │ ├── engines │ │ ├── __init__.py │ │ ├── base.py │ │ ├── initializer.py │ │ └── vllm_engine.py │ ├── env_config.py │ ├── logging │ │ └── __init__.py │ ├── pipeline.py │ ├── tokenizer.py │ ├── utils.py │ └── workload.py ├── cli.py ├── common │ ├── __init__.py │ └── entities.py ├── inference_and_check.py ├── labeled_numina_difficulty │ └── README.md ├── models │ ├── __init__.py │ ├── base.py │ ├── model_configs.yaml │ └── system_prompts │ │ └── prime.txt ├── ray_configs │ └── ray_config.yaml ├── scoring │ ├── __init__.py │ ├── apps │ │ ├── __init__.py │ │ ├── apps_scorer.py │ │ └── apps_util.py │ ├── base.py │ ├── gsm8k │ │ ├── __init__.py │ │ └── gsm8k_scorer.py │ ├── ifeval │ │ ├── __init__.py │ │ ├── ifeval_scorer.py │ │ ├── instructions.py │ │ ├── instructions_main.py │ │ ├── instructions_registry.py │ │ └── instructions_util.py │ ├── livecodebench │ │ ├── __init__.py │ │ ├── livecodebench_scorer.py │ │ └── livecodebench_util.py │ ├── math │ │ ├── __init__.py │ │ └── math_scorer.py │ ├── taco │ │ ├── __init__.py │ │ ├── taco_scorer.py │ │ └── taco_util.py │ └── utils │ │ ├── __init__.py │ │ └── pyext2.py ├── tasks │ ├── __init__.py │ ├── aime │ │ ├── aime24.yaml │ │ ├── aime24_sky.yaml │ │ ├── aime25_1.yaml │ │ ├── aime25_2.yaml │ │ └── aime_handler.py │ ├── amc23 │ │ ├── amc23.yaml │ │ └── amc23_handler.py │ ├── apps │ │ ├── apps.yaml │ │ ├── apps_handler.py │ │ └── apps_util.py │ ├── arc │ │ ├── arc_c.yaml │ │ └── arc_handler.py │ ├── base.py │ ├── gpqa_diamond │ │ ├── gpqa_diamond.yaml │ │ └── gpqa_diamond_handler.py │ ├── gsm8k │ │ ├── gsm8k.yaml │ │ └── gsm8k_handler.py │ ├── liveaops │ │ ├── liveaops.yaml │ │ └── liveaops_handler.py │ ├── livecodebench │ │ ├── livecodebench.yaml │ │ ├── livecodebench_easy.yaml │ │ ├── livecodebench_handler.py │ │ ├── livecodebench_hard.yaml │ │ ├── livecodebench_medium.yaml │ │ └── livecodebench_util.py │ ├── math │ │ ├── math500.yaml │ │ └── math_handler.py │ ├── minervamath │ │ ├── minervamath.yaml │ │ └── minervamath_handler.py │ ├── mmlu │ │ ├── mmlu.yaml │ │ ├── mmlu_handler.py │ │ └── mmlu_pro.yaml │ ├── numina │ │ ├── numina.yaml │ │ ├── numina_amc_aime.yaml │ │ ├── numina_handler.py │ │ ├── numina_math.yaml │ │ └── numina_olympiads.yaml │ ├── olympiadbench │ │ ├── olympiadbench_handler.py │ │ └── olympiadbench_math_en.yaml │ ├── omni_math │ │ ├── omni_handler.py │ │ └── omni_math.yaml │ ├── taco │ │ ├── pyext2.py │ │ ├── taco.yaml │ │ ├── taco_handler.py │ │ └── taco_util.py │ └── task_util.py └── util │ ├── __init__.py │ ├── cli_util.py │ ├── common.py │ ├── math_parsing_util.py │ ├── metrics.py │ ├── response.py │ └── results.py ├── figures └── reproduciblellm_fig1.png ├── patch_vllm.py └── prompt_util └── prompt_template.py /.gitignore: -------------------------------------------------------------------------------- 1 | *pytest* 2 | *.egg-info 3 | *output* 4 | *.log 5 | **/__pycache__/ 6 | outputs/ 7 | scoring_results/ 8 | sh/my* 9 | sh/test* 10 | vllm_version_test/test* 11 | acc_folder*/ 12 | scoring*.py 13 | arxiv_exp/test* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Challenges and Solutions of LLM Reproducibility 2 | 3 | Codebase of [Give Me FP32 or Give Me Death? 
Challenges and Solutions for Reproducible Reasoning](https://arxiv.org/abs/2506.09501) 4 | 5 | ## News 6 | - [2025.06.18]: Our paper has been released on [arXiv](https://arxiv.org/abs/2506.09501). Feel free to ⭐ upvote it on [Hugging Face](https://huggingface.co/papers/2506.09501). 7 | 8 |
9 | ![Figure 1](figures/reproduciblellm_fig1.png) 15 | Figure 1. Left: Under BF16 precision and greedy decoding, the model's output can vary significantly depending on factors such as GPU count, evaluation batch size, and GPU hardware version. Right: For example, changes in evaluation batch size alone can lead to noticeable differences in responses, which is often ignored and not standardized by evaluation benchmarks. 16 |
17 | 18 | ## Overview 19 | This repository contains the official implementation of **"Give Me FP32 or Give Me Death? Challenges and Solutions for Reproducible Reasoning"**. We present the first systematic study on the fragility of LLM reproducibility under different system configurations. Our work identifies reduced numerical precision as a key source of divergence, and introduces LayerCast, a hybrid-precision inference pipeline that balances memory efficiency with numerical stability. 20 | 21 | ## Environment Setup 22 | 23 | ```bash 24 | conda create -n reproducible_llm python=3.12 -y 25 | conda activate reproducible_llm 26 | pip install vllm==0.8.2 27 | pip install datasets latex2sympy2 word2number immutabledict nltk langdetect 28 | ``` 29 | #### Impact of Serving System Version 30 | We consistently used vLLM 0.8.2 for our experiments. Please make sure to use the same vLLM version, since different versions of serving frameworks may employ different GPU kernels, which may have varying numerical stability. 31 | 32 | 33 | ## Getting Started 34 | ### To download this repository: 35 | ```bash 36 | git clone https://github.com/nanomaoli/llm_reproducibility.git 37 | cd llm_reproducibility 38 | ``` 39 | ### To reproduce the main experiments: 40 | Set CUDA_VISIBLE_DEVICES to control the number of GPUs used, and specify a descriptive exp_name to help track different configurations. 41 | #### Run inference with greedy decoding: 42 | ```python 43 | [CUDA_VISIBLE_DEVICES] python eval_main.py --model [MODEL] \ 44 | --task [TASK] \ 45 | --dtype [dtype] \ 46 | --seed [RANDOM_SEED] \ 47 | --batch_size [BS] \ 48 | --max_tokens [MAX_TOKENS] \ 49 | --exp_name [NAME_OF_THE_RUN] 50 | ``` 51 | Model responses and logprobs will be saved to `outputs/vllm_main/{exp_name}/{model}`. We save logprobs of the 5 most likely tokens for analysis in our paper. 52 | Scoring results will appear in `scoring_results/greedy`. 53 | 54 | *Example:* 55 | ```python 56 | CUDA_VISIBLE_DEVICES=0,1 python eval_main.py --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ 57 | --task math500 \ 58 | --dtype bfloat16 \ 59 | --seed 42 \ 60 | --batch_size 32 \ 61 | --max_tokens 32768 \ 62 | --exp_name 2a100_math500_bf16_bs32 63 | ``` 64 | 65 | #### Run inference with greedy decoding using LayerCast: 66 | LayerCast uses `float32` for computation, so `--dtype` should be set accordingly. 67 | ```python 68 | [CUDA_VISIBLE_DEVICES] python eval_layercast.py --model [MODEL] \ 69 | --task [TASK] \ 70 | --dtype float32 \ 71 | --seed [RANDOM_SEED] \ 72 | --batch_size [BS] \ 73 | --max_tokens [MAX_TOKENS] \ 74 | --exp_name [NAME_OF_THE_RUN] 75 | ``` 76 | Model responses and logprobs will be saved to `outputs/vllm_layercast/{exp_name}/{model}`. 77 | Scoring results will appear in `scoring_results/greedy_layercast`. 
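For intuition, below is a minimal sketch of the hybrid-precision idea behind LayerCast: weights are kept in 16-bit storage to save memory and upcast to FP32 just before each computation. This is an illustration only (the `LayerCastLinear` class and its structure are our own simplified assumption), not the actual pipeline implemented in `eval_layercast.py`.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class LayerCastLinear(nn.Module):
    """Toy linear layer: weights are stored in bfloat16 (memory-efficient),
    but upcast to float32 right before the matmul so the computation and
    accumulation happen in full precision."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        # Keep the weight in 16-bit to reduce the memory footprint.
        self.weight = nn.Parameter(torch.randn(out_features, in_features).to(torch.bfloat16))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Cast activations and weights to float32 only for the computation.
        return F.linear(x.float(), self.weight.float())


layer = LayerCastLinear(16, 8)
out = layer(torch.randn(2, 16).to(torch.bfloat16))
print(out.dtype)  # torch.float32
```

In practice, LayerCast runs are launched through `eval_layercast.py` with `--dtype float32`, as in the example below.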
78 | 79 | *Example:* 80 | ```python 81 | CUDA_VISIBLE_DEVICES=0,1 python eval_layercast.py --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ 82 | --task math500 \ 83 | --dtype float32 \ 84 | --seed 42 \ 85 | --batch_size 32 \ 86 | --max_tokens 32768 \ 87 | --exp_name 2a100_math500_layercast_bs32 88 | ``` 89 | 90 | 91 | #### Run inference with random sampling (`n` independent samples per problem): 92 | 93 | ```python 94 | [CUDA_VISIBLE_DEVICES] python eval_passk.py --model [MODEL] \ 95 | --task [TASK] \ 96 | --dtype [dtype] \ 97 | --seed [RANDOM_SEED] \ 98 | --batch_size [BS] \ 99 | --max_tokens [MAX_TOKENS] \ 100 | --passk [n] \ 101 | --exp_name [NAME_OF_THE_RUN] 102 | ``` 103 | Model responses will be saved to `outputs/vllm_passk/{exp_name}/{model}`. 104 | Scoring results will appear in `scoring_results/random_passk`. 105 | 106 | *Example:* 107 | ```python 108 | CUDA_VISIBLE_DEVICES=0,1 python eval_passk.py --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ 109 | --task math500 \ 110 | --dtype bfloat16 \ 111 | --seed 42 \ 112 | --batch_size 32 \ 113 | --max_tokens 32768 \ 114 | --passk 4 \ 115 | --exp_name 2a100_pass4_math500_bf16_bs32 116 | ``` 117 | 118 | ## Citation 119 | 120 | If you find our work interesting or helpful, please kindly cite our paper. 121 | 122 | ```bibtex 123 | @misc{yuan2025fp32deathchallengessolutions, 124 | title={Give Me FP32 or Give Me Death? Challenges and Solutions for Reproducible Reasoning}, 125 | author={Jiayi Yuan and Hao Li and Xinheng Ding and Wenya Xie and Yu-Jhe Li and Wentian Zhao and Kun Wan and Jing Shi and Xia Hu and Zirui Liu}, 126 | year={2025}, 127 | eprint={2506.09501}, 128 | archivePrefix={arXiv}, 129 | primaryClass={cs.CL}, 130 | url={https://arxiv.org/abs/2506.09501}, 131 | } 132 | ``` 133 | 134 | ## References 135 | Our evaluation implementation is adapted from [SkyThought](https://github.com/NovaSky-AI/SkyThought) repository. 
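For downstream analysis, the saved top-5 logprob tensors can be loaded with plain `torch.load`. A small sketch follows; the path below is hypothetical and simply mirrors the naming pattern used by `eval_main.py`, so adjust it to your own `--exp_name`, model, task, and settings:

```python
import torch

# Hypothetical example path for exp_name=2a100_math500_bf16_bs32 on math500.
run_dir = "outputs/vllm_main/2a100_math500_bf16_bs32/deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
token_ids = torch.load(f"{run_dir}/problem_0_math500_token_ids_bs_32_bfloat16_max_tokens_32768.pt")
logprobs = torch.load(f"{run_dir}/problem_0_math500_logprobs_bs_32_bfloat16_max_tokens_32768.pt")

# Both tensors have shape (num_generated_tokens, 5): one row per generated
# token, with the top-5 candidate token ids / logprobs ordered by rank.
print(token_ids.shape, logprobs.shape)
```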
136 | -------------------------------------------------------------------------------- /eval_main.py: -------------------------------------------------------------------------------- 1 | import vllm 2 | import torch 3 | import logging 4 | import datasets 5 | from vllm import SamplingParams 6 | import os 7 | import argparse 8 | import json 9 | import glob 10 | from pathlib import Path 11 | from typing import Any, Dict, List, Tuple, Optional 12 | from tqdm import tqdm 13 | from evals.tasks import TASK_HANDLER_MAP, TASK_NAMES_TO_YAML, TaskConfig, TaskHandler 14 | from evals.util.results import SummaryResults, save_summary 15 | 16 | from vllm.entrypoints.chat_utils import (apply_hf_chat_template, 17 | parse_chat_messages,) 18 | from vllm.utils import is_list_of 19 | from vllm.inputs import TextPrompt, TokensPrompt 20 | from prompt_util.prompt_template import make_conversation_from_contents 21 | from evals.tasks import TASK_HANDLER_MAP, TASK_NAMES_TO_YAML, TaskConfig 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | # Add argument parser 26 | def parse_args(): 27 | parser = argparse.ArgumentParser(description='Run model inference with configurable parameters') 28 | parser.add_argument('--model', type=str, default='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', 29 | help='Model name or path') 30 | parser.add_argument('--task', type=str, default='math500', 31 | help='Task name') 32 | parser.add_argument('--dtype', type=str, default='bfloat16', 33 | help='Data type for model (e.g., bfloat16, float16)') 34 | parser.add_argument('--seed', type=int, default=42, 35 | help='Random seed') 36 | parser.add_argument('--batch_size', type=int, default=1, 37 | help='Batch size for inference') 38 | parser.add_argument('--max_tokens', type=int, default=32768, 39 | help='Maximum number of tokens to generate') 40 | parser.add_argument('--exp_name', type=str, default='baseline', 41 | help='Experiment name') 42 | return parser.parse_args() 43 | 44 | 45 | 46 | def set_seed(seed): 47 | torch.manual_seed(seed) 48 | torch.cuda.manual_seed(seed) 49 | torch.cuda.manual_seed_all(seed) 50 | torch.backends.cudnn.deterministic = True 51 | torch.backends.cudnn.benchmark = False 52 | 53 | 54 | # Determine the starting point based on existing .pt files 55 | def get_resume_point(output_path, batch_size): 56 | # Find all .pt files in the output directory 57 | pt_files = glob.glob(f'{output_path}/problem_*_token_ids_*.pt') 58 | if not pt_files: 59 | return 0 # No files exist, start from the beginning 60 | 61 | # Extract global_idx from filenames 62 | global_indices = [] 63 | for pt_file in pt_files: 64 | # Filename format: problem__token_ids_*.pt 65 | parts = os.path.basename(pt_file).split('_') 66 | try: 67 | global_idx = int(parts[1]) # Extract the global_idx 68 | global_indices.append(global_idx) 69 | except (IndexError, ValueError): 70 | continue 71 | 72 | if not global_indices: 73 | return 0 # No valid indices found, start from the beginning 74 | 75 | # Find the largest global_idx and calculate the starting batch 76 | max_global_idx = max(global_indices) 77 | resume_point = ((max_global_idx + 1) // batch_size) * batch_size 78 | print(f"Resuming from batch starting at index {resume_point} (max_global_idx={max_global_idx})") 79 | return resume_point 80 | 81 | def score_responses( 82 | handler: TaskHandler, 83 | list_of_results: List[Dict[str, Any]], 84 | eval_data: List[Dict[str, Any]], 85 | ) -> Tuple[float, Dict[str, List[int]], int]: 86 | 87 | if not list_of_results: 88 | return 0.0, {}, 0 89 | 90 | total_correct = 0 91 | 
total_finish = 0 92 | id_to_scores = {} 93 | 94 | for result in tqdm(list_of_results, desc="Scoring responses"): 95 | # Get content from the result 96 | model_response = result['model_answer'] 97 | problem_id = result['problem_id'] 98 | problem = eval_data[problem_id] 99 | 100 | new_response_entry = handler.update_results( 101 | problem=problem, 102 | response=model_response, 103 | ) 104 | 105 | if problem_id not in id_to_scores: 106 | id_to_scores[problem_id] = [0] 107 | id_to_scores[problem_id][0] = new_response_entry["correctness"] 108 | 109 | total_correct += new_response_entry["correctness"] 110 | total_finish += 1 111 | 112 | accuracy = round(total_correct / total_finish, 4) if total_finish else 0 113 | return accuracy, id_to_scores, total_finish 114 | 115 | if __name__ == '__main__': 116 | args = parse_args() 117 | set_seed(args.seed) 118 | # Create outputs directory if it doesn't exist 119 | output_path = f'./outputs/vllm_main/{args.exp_name}/{args.model}' 120 | os.makedirs(output_path, exist_ok=True) 121 | 122 | task_config = TaskConfig.from_yaml(TASK_NAMES_TO_YAML[args.task]) 123 | handler_name = task_config.handler 124 | handler_cls = TASK_HANDLER_MAP[handler_name] 125 | handler = handler_cls(task_config) 126 | eval_data = handler.load_and_filter_dataset(0, -1) # start from 0, load all 127 | remaining_data = handler.process_remaining_data(eval_data, {}) 128 | conversations = handler.make_conversations( 129 | remaining_data, 130 | None, # str(model_config.system_prompt), 131 | None, # model_config.user_template, 132 | None, # model_config.assistant_prefill, 133 | ) 134 | total_samples = len(conversations) 135 | print(f"Total samples in the dataset: {total_samples}") 136 | 137 | # Get number of available GPUs 138 | num_gpus = torch.cuda.device_count() 139 | print(f"Using {num_gpus} GPUs for tensor parallelism") 140 | 141 | model = vllm.LLM(model=args.model, 142 | tensor_parallel_size=num_gpus, 143 | # max_model_len=length_used, 144 | dtype=args.dtype, 145 | enforce_eager=True) 146 | # Configure sampling parameters to return logits 147 | sampling_params = SamplingParams(temperature=0.0, logprobs=5, max_tokens=args.max_tokens, seed=args.seed) 148 | 149 | # Process in batches 150 | qa_pairs = [] 151 | jsonl_path = f'{output_path}/qa_pairs_{args.dtype}_bs_{args.batch_size}.jsonl' 152 | 153 | start_point = get_resume_point(output_path, args.batch_size) 154 | 155 | for batch_start in range(start_point, total_samples, args.batch_size): 156 | batch_end = min(batch_start + args.batch_size, total_samples) 157 | current_batch = conversations[batch_start:batch_end] 158 | print(f"Processing batch {batch_start//args.batch_size + 1}/{(total_samples + args.batch_size - 1)//args.batch_size}") 159 | 160 | tokenizer = model.get_tokenizer() 161 | model_config = model.llm_engine.get_model_config() 162 | prompts = [] 163 | 164 | for msgs in current_batch: 165 | # NOTE: _parse_chat_message_content_parts() currently doesn't 166 | # handle mm_processor_kwargs, since there is no implementation in 167 | # the chat message parsing for it. 
168 | conversation, mm_data = parse_chat_messages( 169 | msgs, 170 | model_config, 171 | tokenizer, 172 | content_format='string', 173 | ) 174 | 175 | prompt_data = apply_hf_chat_template( 176 | tokenizer, 177 | conversation=conversation, 178 | chat_template=None, 179 | add_generation_prompt=True, 180 | continue_final_message=False, 181 | tools=None, 182 | ) 183 | 184 | if is_list_of(prompt_data, int): 185 | prompt = TokensPrompt(prompt_token_ids=prompt_data) 186 | else: 187 | prompt = TextPrompt(prompt=prompt_data) 188 | 189 | if mm_data is not None: 190 | prompt["multi_modal_data"] = mm_data 191 | 192 | prompts.append(prompt) 193 | 194 | # Generate with logits for current batch 195 | response = model.generate(prompts, sampling_params=sampling_params) 196 | # Extract output text and logits for each sample in the batch 197 | qa_pairs = [] 198 | for idx, output in enumerate(response): 199 | global_idx = batch_start + idx 200 | generated_text = output.outputs[0].text 201 | token_logprobs = output.outputs[0].logprobs 202 | # Create tensors from token_logprobs 203 | num_tokens = len(token_logprobs) 204 | token_ids = torch.zeros((num_tokens, 5), dtype=torch.long) 205 | logprobs = torch.zeros((num_tokens, 5), dtype=torch.float32) 206 | # Save QA pair to JSONL file 207 | qa_pair = { 208 | "problem_id": global_idx, 209 | "question": current_batch[idx], 210 | "model_answer": generated_text, 211 | } 212 | 213 | for i, logprobs_dict in enumerate(token_logprobs): 214 | # Extract token IDs in order of rank 215 | sorted_items = sorted(logprobs_dict.items(), key=lambda x: x[1].rank) 216 | for j, (token_id, L) in enumerate(sorted_items): 217 | token_ids[i, j] = token_id 218 | logprobs[i, j] = L.logprob 219 | 220 | torch.save(token_ids, f'{output_path}/problem_{global_idx}_{args.task}_token_ids_bs_{args.batch_size}_{args.dtype}_max_tokens_{args.max_tokens}.pt') 221 | torch.save(logprobs, f'{output_path}/problem_{global_idx}_{args.task}_logprobs_bs_{args.batch_size}_{args.dtype}_max_tokens_{args.max_tokens}.pt') 222 | print(f"Saved tensors for problem {global_idx}") 223 | 224 | qa_pairs.append(qa_pair) 225 | 226 | with open(jsonl_path, 'a') as f: 227 | for qa_pair in qa_pairs: 228 | f.write(json.dumps(qa_pair) + '\n') 229 | print(f"Saved QA pairs to for batch {batch_start//args.batch_size + 1}") 230 | 231 | responses_path = Path(jsonl_path) 232 | 233 | if responses_path.stat().st_size == 0: 234 | raise ValueError(f"Response file is empty: {responses_path}") 235 | 236 | print(f"Valid response file: {responses_path}") 237 | 238 | # Read the .jsonl file line by line and parse each line as a JSON object 239 | with open(responses_path, "r") as f: 240 | list_of_results = [json.loads(line) for line in f] 241 | 242 | # Check if the response file is a list of dictionaries 243 | if not all(isinstance(result, dict) for result in list_of_results): 244 | raise ValueError(f"Response file does not contain valid dictionaries on each line: {responses_path}") 245 | 246 | # Check if the response file is a list of dictionaries 247 | if not isinstance(list_of_results, list): 248 | raise ValueError(f"Response file is not a list of dictionaries: {responses_path}") 249 | 250 | # Obtain the correct task handler 251 | task = args.task 252 | if task not in TASK_NAMES_TO_YAML: 253 | raise ValueError( 254 | f"Task {task} not found. 
Should be one of {TASK_NAMES_TO_YAML.keys()}" 255 | ) 256 | task_config = TaskConfig.from_yaml(TASK_NAMES_TO_YAML[task]) 257 | handler_name = task_config.handler 258 | handler_cls = TASK_HANDLER_MAP[handler_name] 259 | handler = handler_cls(task_config) 260 | 261 | raw_dataset = handler.load_and_filter_dataset(0, -1) # start from 0, load all 262 | eval_data = [ 263 | row.to_dict() 264 | for _, row in raw_dataset.iterrows() 265 | ] 266 | 267 | accuracy, id_to_scores, total_finish = score_responses(handler, list_of_results, eval_data) 268 | logger.info(f"Accuracy: {accuracy}") 269 | 270 | num_responses_total = len(id_to_scores) 271 | 272 | summary_data = SummaryResults( 273 | accuracy=accuracy, 274 | ) 275 | 276 | # Create outputs directory if it doesn't exist 277 | acc_path = f'./scoring_results/greedy' 278 | os.makedirs(acc_path, exist_ok=True) 279 | 280 | sanitized_model_name = args.model.replace("/", "_") 281 | summary_file = Path(acc_path) / f"{sanitized_model_name}_{args.exp_name}_summary.jsonl" 282 | save_summary(summary_file, summary_data) 283 | logger.info(f"Summary saved to {summary_file}") 284 | -------------------------------------------------------------------------------- /evals/README.md: -------------------------------------------------------------------------------- 1 | # Skythought-evals: Data Generation and Evaluation Tools 2 | 3 | 4 | ## Requirements 5 | 6 | Make sure you have installed the `skythought` package as outlined in the [README.md](/README.md#usage). 7 | 8 | For running OpenAI model, export the OpenAI key. 9 | ```shell 10 | export OPENAI_API_KEY={openai_api_key} 11 | ``` 12 | 13 | ## Usage 14 | 15 | We provide three commands in the CLI: 16 | 17 | - `skythought evaluate` : Evaluate a model on a given task. 18 | - `skythought generate`: Generate model outputs for a pre-configured task. 19 | - `skythought score`: Score saved generations for a given task. 20 | 21 | For a walkthrough on the basics, please refer to the [example](../../examples/evaluate.ipynb). 22 | 23 | ## Generation and Evaluation 24 | 25 | ### Benchmark Evaluation 26 | 27 | Given below are two examples for evaluation. 28 | 29 | ```shell 30 | skythought evaluate --model NovaSky-AI/Sky-T1-32B-Preview --task aime --backend vllm --backend-args tensor_parallel_size=8 --sampling-params temperature=0.6,top_p=0.95 --n 8 --result-dir ./ 31 | 32 | skythought evaluate --model NovaSky-AI/Sky-T1-32B-Preview --task gpqa_diamond --backend vllm --backend-args tensor_parallel_size=8 --sampling-params temperature=0.6,top_p=0.95 --n 8 33 | ``` 34 | 35 | **Note**: The `GPQADiamond` dataset is gated and requires first receiving access at this Huggingface [link](https://huggingface.co/datasets/Idavidrein/gpqa) (which is granted immediately), then logging into your Huggingface account in your terminal session with `huggingface-cli login`. 
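For instance, one possible way to authenticate beforehand (a sketch; `<your_huggingface_token>` is a placeholder for your own access token with read permission):

```shell
# Interactive login; the token is stored locally for subsequent runs.
huggingface-cli login

# Alternatively, for non-interactive sessions, export the token directly.
export HF_TOKEN=<your_huggingface_token>
```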
36 | 37 | 38 | The results will be saved in a folder in `result-dir`: 39 | 40 | ```bash 41 | result-dir/ 42 | ├── Qwen_QwQ-32B-Preview_aime_myHash 43 | │ ├── results.json # contains the full results for the benchmark 44 | │ └── summary.json # contains summary of the run with configuration and metrics 45 | ``` 46 | 47 | ### Scaling evaluation with Ray 48 | 49 | You can scale evaluations across multiple model replicas (and across multiple nodes) using [ray](https://docs.ray.io) backend: 50 | 51 | ```shell 52 | skythought evaluate --model Qwen/QwQ-32B-Preview --task aime --backend ray --backend-args tensor_parallel_size=4,num_replicas=4 --result-dir ./ 53 | ``` 54 | 55 | By default, we make use of the configuration in [ray_configs/ray_config.yaml](./ray_configs/ray_config.yaml). You can also customize the following parameters for ray: 56 | 57 | - `tensor_parallel_size`: Tensor parallel size per replica. Defaults to 4. 58 | - `accelerator_type`: GPU accelerator type. See [the list of available types](https://docs.ray.io/en/latest/ray-core/accelerator-types.html) for more information. Defaults to None, which means any available GPUs in the Ray cluster will be used. 59 | - `num_replicas`: Number of model replicas to use for inference. Defaults to 2. 60 | - `batch_size`: Batch size per model replica for inference. 61 | - `gpu_memory_utilization`: Fraction of GPU memory allocated to the model executor in vLLM. Defaults to 0.9. 62 | - `dtype`: Data type used for inference. Defaults to "auto". 63 | 64 | 65 | ### Optimized settings for 32B and 7B models 66 | 67 | The following are optimized settings on a 8xH100 or a 8xA100 node. We recommend using `ray` backend for best performance. 68 | 69 | For 32B models, we recommend using the default backend configuration for best performance. 70 | 71 | ```shell 72 | skythought evaluate --model Qwen/QwQ-32B-Preview --task aime24 --backend ray --result-dir ./ 73 | ``` 74 | 75 | For 7B models, we recommend using `tensor_parallel_size=1` and `num_replicas=8` for best performance. For example, the previous command will change to: 76 | 77 | ```shell 78 | skythought evaluate --model Qwen/Qwen2-7B-Instruct --task math500 --backend ray --backend-args tensor_parallel_size=1,num_replicas=8 --result-dir ./ 79 | ``` 80 | 81 | #### Multi-node inference 82 | 83 | Note that if you have a ray cluster setup, you can scale the number of replicas as needed with `num_replicas` argument in `backend-args` to make full use of your cluster. Make sure to execute the script on the head node and ensure that `--result-dir` is a valid directory that the head node can write to. 84 | 85 | ### Best-of-N Evaluation 86 | 87 | You can use the `--n` parameter to specify the number of generations per problem. For `n>1` , we calculate pass 88 | 89 | ```bash 90 | skythought evaluate --model Qwen/Qwen2-7B-Instruct --task math500 --backend ray --backend-args tensor_parallel_size=1,num_replicas=8 --sampling-params temperature=0.7,max_tokens=4096 --n 64 --result-dir ./ 91 | ``` 92 | 93 | ### Distill and Reject Sampling 94 | Currently we support distill and reject sampling for NUMINA, APPS, and TACO datasets. For NUMINA, the source can be one from `[amc_aime, math, olympiads]`. 
95 | 96 | #### Example Usage 97 | 98 | ```shell 99 | skythought generate --model Qwen/QwQ-32B-Preview --task numina_amc_aime --backend ray --backend-args tensor_parallel_size=8 --sampling-params max_tokens=16384 --result-dir $SKYT_HOME/data 100 | ``` 101 | 102 | Once the generations are saved, you can then apply any postprocessing on the results (saved in a `results.json` file in separate run folder) and then run: 103 | 104 | ```shell 105 | skythought score --task numina_amc_aime --run-dir 106 | ``` 107 | 108 | ### Reproducibility Issues 109 | 110 | 111 | We've noticed that it can be hard to reproduce results in reasoning benchmarks. Beyond the lack of agreed sampling parameters and metrics in the field at the moment, there can be significant differences in results across different evaluation codebases, and even for the same codebase with a different set of dependencies. In half-precision (bfloat16 or float16), numerical error accumulation will change outputs ever so slightly, which can dramatically alter final performance. There are three factors we've noticed that affect results: 112 | 113 | - Long context generations: Errors can accumulate so that the output changes at 1k+ tokens, which compound as you keep generating. Since we typically set max tokens to be 16k or 32k tokens, the final solution will change significantly 114 | - vLLM settings: With vLLM, we’ve also noticed that at half-precision, different batch sizes can affect downstream evaluation results by a few percentage points. Further, different tensor parallelism settings can also change results in half-precision. 115 | - vLLM version: Different versions of vLLM will use different CUDA-Toolkit or Flash attention versions. Even for the same settings, these differences in the underlying kernels used can change results. 116 | 117 | We recommend to run evaluation benchmarks at full precision, i.e float32 to avoid this. In full-precision, evaluation results should be robust to changes in batch size, tensor parallel size, version differences, etc. 118 | 119 | 120 | ## Key Concepts 121 | 122 | ### Tasks 123 | 124 | A Task consists of task-specific configuration and implements 125 | - Dataset loading and preprocessing 126 | - Creating of input conversation to the model 127 | - Scoring of model responses 128 | 129 | The configuration (`TaskConfig`) contains dataset loading related details such as Hugging Face dataset ID, the particular subset for this benchmark (e.g., ”Challenge” subset for ARC), and a task template, which contains task-specific instructions to be used (Eg: `Return your answer in \boxed{}`). Each configuration is stored in a YAML. For example, you can see the YAML in this [aime24.yaml file](./tasks/aime/aime24.yaml) 130 | 131 | Internally, a Task implementation is termed a "TaskHandler", you can see one such implementation [here](./tasks/aime/aime_handler.py). 132 | 133 | 134 | To add a new task `mytask`: 135 | - First, see if the task can be simply specified as a configuration (One example is [`aime25`](./tasks/aime/aime25.yaml)). If so, you can add a YAML file in the appropriate folder and re-use an existing handler. (All available handlers are specified [here](./tasks/__init__.py)). 136 | - If not, you should create a new `TaskHandler` subclass for this task along with a task configuration YAML (`mytask.yaml`). 137 | 138 | ### Models 139 | 140 | A Model consists of the model ID and templating configuration. This configuration optionally contains the system prompt and an assistant prefill message. 
Different reasoning models use their own system prompt, and some perform best when the response is prefilled with special tokens. 141 | 142 | We store our pre-configured models as well as a list of system prompt templates [here](./models/model_configs.yaml). 143 | 144 | ### Backend 145 | 146 | The Backend is concerned with how the LLM instance is created and queried. For flexibility, we support 147 | - Local inference with vLLM (basic single node) or Ray+vLLM (more scalable single and multi-node inference) 148 | - Remote inference behind an OpenAI-compatible endpoint. 149 | 150 | The Backend also consists of configuration at instantiation (ex; the data type for the model), along with sampling parameters during generation (temperature, max tokens, etc). 151 | 152 | During evaluation, the above tie in together and the flow is as follows: 153 | 1. Load dataset and create conversations based on the Task and Model specified by the user 154 | 2. Generate model responses from the Backend based on the provided sampling parameters 155 | 3. Score model responses based on the Task 156 | 4. Output final results 157 | -------------------------------------------------------------------------------- /evals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanomaoli/llm_reproducibility/8a373c5a159a27e59783394827cecadd6255484e/evals/__init__.py -------------------------------------------------------------------------------- /evals/base_instruct_evals.md: -------------------------------------------------------------------------------- 1 | # Reproducing results on non-reasoning benchmarks 2 | 3 | For the full set of results, see [here](./README.md#results-on-qa-and-instruction-following-benchmarks). 4 | 5 | ## Installation instructions 6 | 7 | 1. For `lm_eval`, install the package by executing the following : 8 | 9 | ```bash 10 | git clone https://github.com/EleutherAI/lm-evaluation-harness 11 | cd lm-evaluation-harness 12 | git checkout 703fbff 13 | pip install -e ".[ifeval]" 14 | ``` 15 | 16 | For more details, you can refer to the official instructions [here](https://github.com/EleutherAI/lm-evaluation-harness/tree/703fbffd6fe5e136bbb9d884cb40844e5503ae5d?tab=readme-ov-file#install). We report results with commit https://github.com/EleutherAI/lm-evaluation-harness/commit/703fbffd6fe5e136bbb9d884cb40844e5503ae5d 17 | 18 | 2. For `fastchat`, follow the instructions [here](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge#install). The current implementation of Fastchat is based on OpenAI version <= 0.28.0. For making use of the latest vllm backend, it is recommended to migrate the `llm_judge` folder to use openai>=1.0.0. You can run `openai migrate` for the fastchat codebase or follow the PR [here](https://github.com/lm-sys/FastChat/pull/2915/files) 19 | 3. For `BFCL`, you can follow the official instructions [here](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard#basic-installation). We further evaulate on all test categories, which requires [setting up environment variables](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard#setting-up-environment-variables), and [obtaining API keys for executable test categories](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard#api-keys-for-executable-test-categories). 
Make sure to use changes from [this PR](https://github.com/ShishirPatil/gorilla/pull/888) for QwQ and Sky-T1 model support. 20 | 4. For `Arena-Hard` results, you can follow the instructions [here](https://github.com/lmarena/arena-hard-auto). We use `gpt-4-1106-preview` as the judge. 21 | 22 | ## Commands for reproducing results 23 | 24 | All the benchmarks were run on a 8xH100 machine with the `vllm` backend. If you're running on a different device, make sure to tweak `tensor_parallel_size` and if needed the `batch_size` arguments. Expect some variance in scores (+/- 1%) for different evaluation settings (ex: `tensor_parallel_size`) 25 | 26 | All the commands below are given for `NovaSky-AI/Sky-T1-32B-Preview`. Simply substitute the model name for `Qwen/Qwen-2.5-32B-Instruct`. For `Qwen/QwQ-32B-Preview`, we further make use of two arguments `revision=refs/pr/58,tokenizer_revision=refs/pr/58` to use a corrected revision of QwQ. For more details on this, see https://github.com/NovaSky-AI/SkyThought/pull/26#issuecomment-2606435601. 27 | 28 | ### MMLU (0 shot; no CoT) 29 | 30 | ```bash 31 | lm_eval --model vllm --model_args pretrained=NovaSky-AI/Sky-T1-32B-Preview,tensor_parallel_size=8,dtype=auto,gpu_memory_utilization=0.8,data_parallel_size=1,max_model_len=2048 --tasks mmlu --trust_remote_code --batch_size 8 --apply_chat_template --fewshot_as_multiturn 32 | ``` 33 | 34 | For QwQ, you would do 35 | 36 | ```bash 37 | lm_eval --model vllm --model_args pretrained=Qwen/QwQ-32B-Preview,tensor_parallel_size=8,dtype=auto,gpu_memory_utilization=0.8,data_parallel_size=1,max_model_len=2048revision=refs/pr/58,tokenizer_revision=refs/pr/58 --tasks mmlu --trust_remote_code --batch_size 8 --apply_chat_template --fewshot_as_multiturn 38 | ``` 39 | 40 | ### MMLU (5 shot; no CoT) 41 | 42 | ```bash 43 | lm_eval --model vllm --model_args pretrained=NovaSky-AI/Sky-T1-32B-Preview,tensor_parallel_size=8,dtype=auto,gpu_memory_utilization=0.8,data_parallel_size=1,max_model_len=2048 --tasks mmlu --trust_remote_code --batch_size 8 --apply_chat_template --fewshot_as_multiturn --num_fewshot 5 44 | ``` 45 | 46 | ### ARC-C (0 shot; no CoT) 47 | 48 | ```bash 49 | lm_eval --model vllm --model_args pretrained=NovaSky-AI/Sky-T1-32B-Preview,tensor_parallel_size=8,dtype=auto,gpu_memory_utilization=0.8,data_parallel_size=1,max_model_len=2048 --tasks arc_challenge --trust_remote_code --batch_size 8 --apply_chat_template --fewshot_as_multiturn 50 | ``` 51 | 52 | ### IFEval 53 | 54 | ```bash 55 | lm_eval --model vllm --model_args pretrained=NovaSky-AI/Sky-T1-32B-Preview,tensor_parallel_size=8,dtype=auto,gpu_memory_utilization=0.9,data_parallel_size=1 --tasks leaderboard_ifeval --trust_remote_code --batch_size auto --apply_chat_template --fewshot_as_multiturn 56 | ``` 57 | 58 | We use the `prompt_level_strict_acc` metric following Qwen-2.5. 59 | 60 | ### MGSM (native CoT) 61 | 62 | ```bash 63 | lm_eval --model vllm --model_args pretrained=NovaSky-AI/Sky-T1-32B-Preview,tensor_parallel_size=8,dtype=auto,gpu_memory_utilization=0.8,data_parallel_size=1,max_model_len=2048 --tasks mgsm_direct --trust_remote_code --batch_size 8 --apply_chat_template --fewshot_as_multiturn 64 | ``` 65 | 66 | We report the average value of `flexible-extract` filter. 
67 | 68 | ### MGSM (8-shot; native CoT) 69 | 70 | ```bash 71 | lm_eval --model vllm --model_args pretrained=NovaSky-AI/Sky-T1-32B-Preview,tensor_parallel_size=8,dtype=auto,gpu_memory_utilization=0.8,data_parallel_size=1,max_model_len=2048 --tasks mgsm_direct --trust_remote_code --batch_size 8 --apply_chat_template --fewshot_as_multiturn --num_fewshot 8 72 | ``` 73 | 74 | ### LLM-as-a-Judge 75 | 76 | We use the default settings - with `max_tokens` 1024 and the `gpt-4` judge. We observe that some reasoning models like `Qwen/QwQ-32B-Preview` are unable to provide brief responses sometimes and thus get truncated responses at the used `max_tokens`. While this will effect the final rating, given the context length limitations of the commonly used `gpt-4` judge (8K tokens), we stick to the 1024 `max_tokens` budget for consistency. 77 | 78 | 1. First, serve the model with vLLM 79 | 80 | 81 | ```bash 82 | vllm serve NovaSky-AI/Sky-T1-32B-Preview --dtype auto --tensor-parallel-size 8 --gpu-memory-utilization 0.9 83 | ``` 84 | 85 | For `Qwen/QwQ-32B-Preview`, use 86 | 87 | ```bash 88 | vllm serve Qwen/QwQ-32B-Preview --dtype auto --tensor-parallel-size 8 --gpu-memory-utilization 0.9 --revision refs/pr/58 --tokenizer-revision refs/pr/58 89 | ``` 90 | 91 | 2. Next, generate model response 92 | 93 | ```bash 94 | python gen_api_answer.py --model NovaSky-AI/Sky-T1-32B-Preview --openai-api-base http://localhost:8000/v1 --parallel 50 95 | ``` 96 | 97 | Note: The generated results will be in `data/model_answer//.jsonl` . Move them to the root folder `data/model_answer/` 98 | 99 | 3. After generating responses for all the models, evaluate with the default settings 100 | 101 | ```bash 102 | export OPENAI_API_KEY=XXXXXX # set the OpenAI API key 103 | python gen_judgment.py --model-list Sky-T1-32B-Preview QwQ-32B-Preview Qwen2.5-32B-Instruct --parallel 2 104 | ``` 105 | 4. Get MTBench scores (we use the average score of both turns) 106 | 107 | ```bash 108 | python show_result.py 109 | ``` 110 | 111 | ### BFCL-v3 112 | 113 | Our results are reported on `test-category` `all` . Make sure to get the API keys for the executable test categories by following the instructions [here](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard#api-keys-for-executable-test-categories) 114 | 115 | Run 116 | 117 | ```bash 118 | bfcl generate --model NovaSky-AI/Sky-T1-32B-Preview --test-category all --backend vllm --num-gpus 8 --gpu-memory-utilization 0.9 119 | ``` 120 | 121 | For evaluation, you can simply run 122 | 123 | ```bash 124 | bfcl evaluate --model Qwen/QwQ-32B-Preview,NovaSky-AI/Sky-T1-32B-Preview,Qwen/Qwen2.5-32B-Instruct --test-category all --api-sanity-check 125 | ``` 126 | ### Arena Hard 127 | For `Arena-Hard`, we use the following script to start a `TGI` service for generating answers 128 | ```bash 129 | hf_pat= 130 | model=NovaSky-AI/Sky-T1-32B-Preview 131 | volume=/mnt/local_storage/data/cache 132 | port=1996 133 | 134 | huggingface-cli download $model 135 | sudo docker run --gpus 8 -e HUGGING_FACE_HUB_TOKEN=$hf_pat --shm-size 2000g -p $port:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.2.0 --model-id $model --max-input-length 8192 --max-batch-total-tokens 8193 --max-batch-prefill-tokens 8193 --max-total-tokens 8193 --sharded true 136 | ``` 137 | For running the `gen_answer.py` script, we use the following `config_api` yaml setting. For `qwq-32b-preview`, we explicitly specify the system prompt as `You are a helpful and harmless assistant. 
You are Qwen developed by Alibaba.` to avoid the CoT prompt. 138 | ```yaml 139 | ... 140 | sky-T1-32B-Preview: 141 | model_name: sky-T1-32B-Preview 142 | endpoints: 143 | - api_base: http://localhost:1996/v1 144 | api_key: empty 145 | api_type: openai 146 | parallel: 8 147 | ... 148 | ``` 149 | and finally for `gen_judgment.py`, we use `gpt-4-1106-preview` as the judge. 150 | 151 | #### Supplementary results for Arena-Hard 152 | 153 | Here are some supplementary results for Arena-Hard, compared with o1-mini which is the best performing model on this benchmark (as of Jan 2025). 154 | 155 | | model | score | rating_q025 | rating_q975 | CI | avg_tokens | date | 156 | |-------|--------|------------|-------------|-------|------------|-------| 157 | | o1-mini-2024-09-12 | 91.98 | 90.88 | 93.12 | (-1.10, +1.14) | 1399.0 | 2025-01-18 | 158 | | sky-T1-32B-Preview | 74.79 | 72.28 | 76.8 | (-2.51, +2.01) | 847.0 | 2025-01-18 | 159 | | qwen2.5-32b-instruct | 66.51 | 64.55 | 68.4 | (-1.96, +1.89) | 611.0 | 2025-01-18 | 160 | | qwq-32b-preview | 52.6 | 50.86 | 54.91 | (-1.74, +2.31) | 1005.0 | 2025-01-23 | 161 | 162 | For more details, see: https://github.com/NovaSky-AI/SkyThought/pull/26#issuecomment-2599525551 163 | -------------------------------------------------------------------------------- /evals/batch/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [] 2 | 3 | from .engines import init_engine_from_config 4 | from .pipeline import Pipeline 5 | from .workload import ( 6 | EvalWorkload, 7 | ) 8 | 9 | __all__ = [ 10 | "Pipeline", 11 | "init_engine_from_config", 12 | "EvalWorkload", 13 | ] 14 | -------------------------------------------------------------------------------- /evals/batch/engines/__init__.py: -------------------------------------------------------------------------------- 1 | """LLM Engines.""" 2 | 3 | __all__ = [] 4 | 5 | from .initializer import EngineInitializerBase, init_engine_from_config 6 | 7 | __all__ = [ 8 | "EngineInitializerBase", 9 | "init_engine_from_config", 10 | ] 11 | -------------------------------------------------------------------------------- /evals/batch/engines/base.py: -------------------------------------------------------------------------------- 1 | """Engine base.""" 2 | 3 | from typing import Any, AsyncGenerator, Dict 4 | 5 | import numpy as np 6 | 7 | 8 | class EngineBase: 9 | """Base class for engines.""" 10 | 11 | async def __call__( 12 | self, batch: Dict[str, np.ndarray] 13 | ) -> AsyncGenerator[Dict[str, Any], None]: 14 | """Call the LLM engine asynchronously to process a Ray Data batch. 15 | 16 | Args: 17 | batch: The batch. 18 | 19 | Yields: 20 | The output. 21 | """ 22 | raise NotImplementedError 23 | -------------------------------------------------------------------------------- /evals/batch/engines/initializer.py: -------------------------------------------------------------------------------- 1 | """Engine initializers. 2 | Note that this file should not import any engine dependent modeules, such as 3 | vLLM, because the engine initializer is used in the driver node which may 4 | not have GPUs. 5 | """ 6 | 7 | import os 8 | from pathlib import Path 9 | from typing import Any, Dict, Optional, Union 10 | 11 | import yaml 12 | 13 | from ..utils import ( 14 | download_model_from_hf, 15 | update_dict_recursive, 16 | ) 17 | from ..workload import EvalWorkload 18 | from .base import EngineBase 19 | 20 | 21 | class EngineInitializerBase: 22 | """Base class for engine initializer. 
23 | 24 | Args: 25 | model_id: The model id. 26 | accelerator_type: The accelerator type. 27 | engine_kwargs: The engine specific configurations. 28 | ray_env_vars: The Ray runtime environment 29 | """ 30 | 31 | use_ray_placement_group: bool = False 32 | 33 | def __init__( 34 | self, 35 | model_id: str, 36 | accelerator_type: str, 37 | engine_kwargs: Dict[str, Any], 38 | lora_adapter: Optional[str] = None, 39 | ray_env_vars: Dict[str, Any] = None, 40 | ): 41 | self._model = model_id 42 | self._accelerator_type = accelerator_type 43 | self._ray_env_vars = ray_env_vars or {} 44 | self.lora_adapter = lora_adapter 45 | self.engine_kwargs = engine_kwargs 46 | 47 | @property 48 | def model(self) -> str: 49 | return self._model 50 | 51 | @property 52 | def accelerator_type(self) -> str: 53 | return self._accelerator_type 54 | 55 | @property 56 | def ray_env_vars(self) -> Dict[str, str]: 57 | return self._ray_env_vars 58 | 59 | @property 60 | def num_gpus(self) -> int: 61 | """The number of GPUs used per engine.""" 62 | raise NotImplementedError 63 | 64 | @property 65 | def max_model_len(self) -> Optional[int]: 66 | """The maximum model length set by the engine.""" 67 | return None 68 | 69 | def get_engine_cls(self) -> EngineBase: 70 | """Get the engine class. 71 | 72 | Returns: 73 | The engine class. 74 | """ 75 | raise NotImplementedError 76 | 77 | def get_engine_constructor_args(self, workload: EvalWorkload) -> Dict[str, Any]: 78 | """Get the engine constructor arguments. 79 | 80 | Args: 81 | workload: The workload that the engine will process. 82 | 83 | Returns: 84 | The engine constructor keyword arguments. 85 | """ 86 | raise NotImplementedError 87 | 88 | 89 | class vLLMEngineInitializer(EngineInitializerBase): 90 | use_ray_placement_group: bool = False 91 | 92 | def __init__( 93 | self, 94 | model_id: str, 95 | accelerator_type: str, 96 | engine_kwargs: Dict[str, Any], 97 | lora_adapter: Optional[str] = None, 98 | ray_env_vars: Dict[str, Any] = None, 99 | ): 100 | super().__init__( 101 | model_id, accelerator_type, engine_kwargs, lora_adapter, ray_env_vars 102 | ) 103 | 104 | # Override vLLM default configs. Note that this is only effective 105 | # when the config is not set by users. 106 | self.engine_kwargs.setdefault("gpu_memory_utilization", 0.95) 107 | self.engine_kwargs.setdefault("use_v2_block_manager", True) 108 | self.engine_kwargs.setdefault("enable_prefix_caching", False) 109 | self.engine_kwargs.setdefault("enforce_eager", False) 110 | self.engine_kwargs.setdefault("pipeline_parallel_size", 1) 111 | self.engine_kwargs.setdefault("max_num_seqs", 256) 112 | self.engine_kwargs.setdefault("tensor_parallel_size", 1) 113 | self.engine_kwargs.setdefault("max_logprobs", 0) 114 | self.engine_kwargs.setdefault("distributed_executor_backend", "mp") 115 | 116 | # Set engine environment variables. 117 | self._ray_env_vars.setdefault("VLLM_ATTENTION_BACKEND", "FLASH_ATTN") 118 | self._ray_env_vars.setdefault("ENABLE_ANYSCALE_PREFIX_OPTIMIZATIONS", "0") 119 | # FIXME: This should already be deprecated and can be removed. 
120 | self._ray_env_vars.setdefault("VLLM_DISABLE_LOGPROBS", "1") 121 | for key, value in self._ray_env_vars.items(): 122 | os.environ[key] = str(value) 123 | 124 | def get_engine_cls(self): 125 | from .vllm_engine import AsyncLLMPredictor 126 | 127 | return AsyncLLMPredictor 128 | 129 | @property 130 | def num_gpus(self) -> int: 131 | assert "tensor_parallel_size" in self.engine_kwargs 132 | assert "pipeline_parallel_size" in self.engine_kwargs 133 | tp_size = self.engine_kwargs["tensor_parallel_size"] 134 | pp_size = self.engine_kwargs["pipeline_parallel_size"] 135 | return tp_size * pp_size 136 | 137 | @property 138 | def max_model_len(self) -> Optional[int]: 139 | """The maximum model length set by the engine.""" 140 | return self.engine_kwargs.get("max_model_len", None) 141 | 142 | def get_engine_constructor_args(self, workload: EvalWorkload): 143 | from vllm import PoolingParams, SamplingParams 144 | from vllm.config import PoolerConfig 145 | 146 | constructor_kwargs = { 147 | "model": self.model, 148 | "lora_adapter": self.lora_adapter, 149 | } 150 | 151 | if sampling_params := workload.sampling_params: 152 | # Sampling params is given: Auto-regressive generation. 153 | # In this case, we need to set max_tokens and max_model_len. 154 | 155 | max_tokens = sampling_params.get("max_tokens", None) 156 | if max_tokens is None: 157 | raise ValueError("max_tokens is required for vLLM engine.") 158 | 159 | vllm_sampling_params = SamplingParams(**workload.sampling_params) 160 | vllm_sampling_params.max_tokens = max_tokens 161 | vllm_sampling_params.detokenize = False 162 | constructor_kwargs["params"] = vllm_sampling_params 163 | 164 | if ( 165 | "max_model_len" not in self.engine_kwargs 166 | and workload.max_tokens_in_prompt < 0 167 | ): 168 | raise ValueError( 169 | "Neither max_tokens_in_prompt nor max_model_len is set. If you " 170 | "intend to let the pipeline infer max_tokens_in_prompt but got this error, " 171 | "it is either because the workload has not been tokenized, or the " 172 | "workload bypass the tokenizer but does not set max_tokens_in_prompt by itself." 173 | ) 174 | 175 | # Use max_tokens_in_prompt + max_tokens as the max_model_len. max_tokens_in_prompt 176 | # is either inferred by materializing tokenized dataset, set by the workload, or 177 | # set by the engine. 178 | self.engine_kwargs["max_model_len"] = ( 179 | workload.max_tokens_in_prompt + max_tokens 180 | ) 181 | else: 182 | # Sampling params is not given: Embedding workload. 183 | # In this case, we need to set pooling_params and task. 184 | 185 | if workload.pooling_params is None: 186 | raise ValueError( 187 | "pooling_params is required for vLLM engine for embedding workload." 188 | ) 189 | constructor_kwargs["params"] = PoolingParams(**workload.pooling_params) 190 | constructor_kwargs["task"] = "embed" 191 | 192 | # Construct PoolerConfig if override_pooler_config is specified. 193 | if pooler_config := self.engine_kwargs.get("override_pooler_config", None): 194 | self.engine_kwargs["override_pooler_config"] = PoolerConfig( 195 | **pooler_config 196 | ) 197 | 198 | constructor_kwargs.update(self.engine_kwargs) 199 | return constructor_kwargs 200 | 201 | 202 | def init_engine_from_config( 203 | config: Union[Dict[str, Any], str], override: Optional[Dict[str, Any]] = None 204 | ) -> EngineInitializerBase: 205 | """Initialize an engine initializer from a config file or a config dict. 206 | 207 | Args: 208 | config: A config file (in YAML) or a config dict. 
It should include 209 | the following keys: "engine", backend engine to use; "model", 210 | model to use; "accelerator_type", the GPU type; "configs", 211 | the engine specific configurations. 212 | override: Override values in config["configs"]. 213 | 214 | Returns: 215 | An engine initializer. 216 | """ 217 | if isinstance(config, str): 218 | config_path = Path(config) 219 | if not config_path.exists(): 220 | raise FileNotFoundError(f"Engine config file {config} not found.") 221 | with open(config_path, "r") as filep: 222 | config = yaml.safe_load(filep) 223 | 224 | assert isinstance(config, dict) 225 | 226 | # Override configs 227 | if override is not None: 228 | update_dict_recursive(config, override) 229 | 230 | # Ray runtime environments. 231 | runtime_env: Dict[str, Any] = config.get("runtime_env", {}) 232 | ray_env_vars: Dict[str, Any] = runtime_env.get("env_vars", {}) 233 | 234 | # Download model and save to local path in advance, in case 235 | # too many worker downloads the model in parallel and hit huggingface rate limit. 236 | assert "model_id" in config and isinstance(config["model_id"], str) 237 | if ray_env_vars.pop("PREDOWNLOAD_MODEL_FROM_HF", "0") == "1": 238 | config["model_id"] = download_model_from_hf( 239 | config["model_id"], "/mnt/cluster_storage" 240 | ) 241 | 242 | # Do not download LoRA adapter here because it is not used in the driver node. 243 | lora_adapter = None 244 | if "lora_config" in config: 245 | lora_adapter = config["lora_config"].get("dynamic_lora_loading_path", None) 246 | 247 | # Sanity check for engine kwargs. 248 | for key in ("llm_engine", "model_id", "accelerator_type"): 249 | if key not in config: 250 | raise KeyError(f"Required {key} not found in config.") 251 | if "engine_kwargs" not in config: 252 | config["engine_kwargs"] = {} 253 | 254 | name = config["llm_engine"] 255 | if name == "vllm": 256 | return vLLMEngineInitializer( 257 | model_id=config["model_id"], 258 | accelerator_type=config["accelerator_type"], 259 | engine_kwargs=config["engine_kwargs"], 260 | lora_adapter=lora_adapter, 261 | ray_env_vars=ray_env_vars, 262 | ) 263 | 264 | raise ValueError(f"Unknown engine: {name}") 265 | -------------------------------------------------------------------------------- /evals/batch/env_config.py: -------------------------------------------------------------------------------- 1 | """Environment configurations for Ray.""" 2 | 3 | from dataclasses import dataclass 4 | from typing import Dict, Optional 5 | 6 | from .logging import get_logger 7 | 8 | logger = get_logger(__name__) 9 | 10 | 11 | @dataclass 12 | class EnvConfig: 13 | """Environment configurations for Ray.""" 14 | 15 | # General configurations. 16 | hf_token: Optional[str] = None 17 | ray_override_job_runtime_env: str = "1" 18 | 19 | # Ray Data configurations. 20 | ray_data_default_wait_for_min_actors_s: int = 600 21 | 22 | # The number of LLM engine replicas to use. 23 | num_replicas: int = 1 24 | # The batch size. This represents the unit of fault tolerance. 25 | # Smaller batch size implies more fault tolerance but may 26 | # introduce more overhead. Batch size should at least be 16 to 27 | # avoid hanging. 
28 | batch_size: int = 256 29 | 30 | def gen_ray_runtime_envs(self, engine_envs: Dict[str, str]) -> Dict[str, str]: 31 | """Generate Ray runtime environment variables.""" 32 | envs = {k.upper(): str(v) for k, v in engine_envs.items()} 33 | 34 | for key in ( 35 | "hf_token", 36 | "ray_override_job_runtime_env", 37 | "ray_data_default_wait_for_min_actors_s", 38 | ): 39 | if getattr(self, key) is not None: 40 | envs[key.upper()] = str(getattr(self, key)) 41 | return envs 42 | -------------------------------------------------------------------------------- /evals/batch/logging/__init__.py: -------------------------------------------------------------------------------- 1 | """Logging.""" 2 | 3 | import logging 4 | from typing import Optional 5 | 6 | from ray._private.ray_logging.filters import CoreContextFilter 7 | from ray._private.ray_logging.formatters import JSONFormatter 8 | 9 | 10 | def _add_ray_logging(handler: logging.Handler): 11 | """Add Ray logging to the handler. 12 | 13 | This is not used for now and will be enabled after the Ray Job is supported. 14 | 15 | Args: 16 | handler: The handler to add Ray logging to. 17 | """ 18 | handler.addFilter(CoreContextFilter()) 19 | handler.setFormatter(JSONFormatter()) 20 | 21 | 22 | def _setup_logger(logger_name: str): 23 | """Setup logger given the logger name. 24 | 25 | This function is idempotent and won't set up the same logger multiple times. 26 | 27 | Args: 28 | logger_name: The name of the logger. 29 | """ 30 | logger = logging.getLogger(logger_name) 31 | 32 | # Skip setup if the logger already has handlers setup. 33 | if logger.handlers: 34 | return 35 | 36 | handler = logging.StreamHandler() 37 | logger.addHandler(handler) 38 | logger.setLevel(logging.INFO) 39 | logger.propagate = False 40 | 41 | 42 | def get_logger(name: Optional[str] = None) -> logging.Logger: 43 | """Get a structured logger. 44 | 45 | Loggers by default are logging to stdout, and are expected to be scraped by an 46 | external process. 47 | 48 | Args: 49 | name: The name of the logger. 50 | 51 | Returns: 52 | A logger instance. 53 | """ 54 | _setup_logger(name) 55 | return logging.getLogger(name) 56 | -------------------------------------------------------------------------------- /evals/batch/pipeline.py: -------------------------------------------------------------------------------- 1 | """Pipeline for batch processing large-scale LLM workloads.""" 2 | 3 | import os 4 | from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union 5 | 6 | import ray 7 | from ray.data._internal.stats import DatasetStats 8 | from ray.data.dataset import Dataset 9 | from ray.util import remove_placement_group 10 | from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy 11 | 12 | from .engines import EngineInitializerBase, init_engine_from_config 13 | from .env_config import EnvConfig 14 | from .logging import get_logger 15 | from .tokenizer import Detokenizer 16 | from .workload import EvalWorkload 17 | 18 | if TYPE_CHECKING: 19 | from ray.util.placement_group import PlacementGroup 20 | 21 | logger = get_logger(__name__) 22 | 23 | 24 | class Pipeline: 25 | """Pipeline for batch processing large-scale LLM workloads. 26 | 27 | Args: 28 | engine_initializer: An engine initializer to create and initialize an engine. 29 | workload: Workload instance. 30 | env_config: EnvConfig to provide environment configurations of Ray. 
31 | """ 32 | 33 | def __init__( 34 | self, 35 | engine_initializer: EngineInitializerBase, 36 | env_config: EnvConfig, 37 | ): 38 | self.engine_initializer = engine_initializer 39 | self.env_config = env_config 40 | self.num_replicas: int = self.env_config.num_replicas 41 | self.ds: Optional[Dataset] = None 42 | self.stats: Optional[DatasetStats] = None 43 | 44 | self.pgs: List["PlacementGroup"] = [] 45 | 46 | if not ray.is_initialized(): 47 | ray.init(runtime_env={"env_vars": self.env_vars}) 48 | 49 | @classmethod 50 | def from_config( 51 | cls, engine_cfg: Union[Dict[str, Any], str], workload: EvalWorkload, **kwargs 52 | ): 53 | """Initialize the pipeline from a configuration file or dictionary. 54 | 55 | Args: 56 | engine_cfg: A config file (in YAML) or a config dict. It should include 57 | the following keys: "engine", backend engine to use; "model", 58 | model to use; "accelerator_type", the GPU type; "configs", 59 | the engine specific configurations. 60 | workload: Workload instance. 61 | **kwargs: environment configuration parameters. See `EnvConfig` for more details. 62 | """ 63 | engine_initializer = init_engine_from_config(engine_cfg) 64 | env_config = EnvConfig(**kwargs) 65 | return cls(engine_initializer, workload, env_config) 66 | 67 | @property 68 | def env_vars(self) -> Dict[str, Any]: 69 | return self.env_config.gen_ray_runtime_envs( 70 | self.engine_initializer.ray_env_vars 71 | ) 72 | 73 | def load( 74 | self, 75 | repartition_by_batch_size: bool = False, 76 | ) -> Dataset: 77 | """Use the given workload to load and process the dataset, 78 | and then tokenize the prompts if needed. The processed dataset 79 | will be repartitioned based on the number of replicas and batch size. 80 | 81 | Args: 82 | repartition_by_batch_size: Whether to repartition the dataset by the 83 | batch size for fault tolerance granularity. You should enable 84 | this when the dataset is not from parquet and checkpointing is 85 | disabled. 86 | 87 | Returns: 88 | The processed dataset. 89 | """ 90 | ds, num_blocks = self.workload.get_preprocessed_dataset( 91 | self.env_config.batch_size, 92 | repartition_by_batch_size, 93 | ) 94 | if num_blocks is not None and num_blocks < self.num_replicas: 95 | logger.warning( 96 | "The number of blocks (%d) is less than the number of replicas (%d). " 97 | "This may result in suboptimal performance.", 98 | num_blocks, 99 | self.num_replicas, 100 | ) 101 | 102 | if self.workload.need_tokenize: 103 | # TODO: Figure out a better concurrency. 104 | # Now we simply assume each LLM replica could have 4 tokenizers. 105 | # This is a heuristic and may not be optimal. 106 | tokenizer_concurrency = self.num_replicas * 4 107 | ds = ds.map_batches( 108 | self.workload.tokenizer_cls, 109 | fn_constructor_kwargs=self.workload.tokenizer_constructor_kwargs( 110 | self.engine_initializer.model 111 | ), 112 | zero_copy_batch=True, 113 | concurrency=(1, tokenizer_concurrency), 114 | batch_size=self.env_config.batch_size, 115 | ) 116 | 117 | # If max tokens in prompt is not set in the workload and max_model_len is not set 118 | # in the engine, we need to materialize the dataset to get the maximum tokens in prompt. 119 | # This may hurt the overall throughput but may be memory efficient. 
120 | if self.workload.max_tokens_in_prompt == -1: 121 | if self.engine_initializer.max_model_len is not None: 122 | max_tokens = self.workload.sampling_params.get("max_tokens", 0) 123 | max_tokens_in_prompt = ( 124 | self.engine_initializer.max_model_len - max_tokens 125 | ) 126 | msg = f"Max Prompt Tokens (max_model_len - max_tokens): {max_tokens_in_prompt}" 127 | else: 128 | logger.info( 129 | "Materializing dataset after tokenization to get max prompt tokens" 130 | ) 131 | ds = ds.materialize() 132 | 133 | max_tokens_in_prompt = int(ds.max("num_text_tokens")) 134 | msg = f"Max Prompt Tokens (inferred): {max_tokens_in_prompt}" 135 | self.workload.max_tokens_in_prompt = max_tokens_in_prompt 136 | else: 137 | msg = f"Max Prompt Tokens (specified in workload): {self.workload.max_tokens_in_prompt}" 138 | 139 | logger.info(msg) 140 | self.ds = ds 141 | return ds 142 | 143 | def __call__(self, workload: EvalWorkload): 144 | self.workload: EvalWorkload = workload 145 | # Set the task to "embed" if sampling params are not given. 146 | self.task_type_str: str = ( 147 | "auto" if self.workload.sampling_params is not None else "embed" 148 | ) 149 | return self.run(eager=False) 150 | 151 | def run( 152 | self, 153 | dataset: Optional[Dataset] = None, 154 | output_path: Optional[str] = None, 155 | detokenize: bool = True, 156 | eager: bool = True, 157 | repartition_by_batch_size: bool = False, 158 | ) -> Optional[Dataset]: 159 | """Perform batch processing on the dataset with LLM engines. 160 | 161 | Args: 162 | dataset: The dataset to process. If None, we directly use the given workload 163 | to load and process the dataset. 164 | output_path: The output path to write the processed dataset to parquet. It can be 165 | a path to an S3 bucket, or a path to local disk (with local:// as the prefix). If None, 166 | the processed dataset will be materialized but not written. 167 | detokenize: Whether to detokenize the generated text. Default is True. 168 | eager: Whether to run the pipeline eagerly. If True, the dataset will be materialized. 169 | If False, we skip the materialization step and return the dataset. If output_path is specified, 170 | the dataset will be written to files and therefore will be materialized 171 | regardless of the eager flag. 172 | repartition_by_batch_size: Whether to repartition the dataset by the 173 | batch size for fault tolerance granularity. You should enable 174 | this when the dataset is not from parquet and checkpointing is 175 | disabled. 176 | 177 | Returns: 178 | The processed dataset. If output_path is not None, the dataset will be None after writing. 179 | """ 180 | if not eager and output_path is not None: 181 | logger.warning("Eager mode is enforced because output path is specified") 182 | eager = True 183 | 184 | # Expand output_path in case environment variable is used. 185 | if output_path is not None: 186 | output_path = os.path.expanduser(output_path) 187 | 188 | # Force skipping detokenizer if task is "embed".
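# Embedding workloads return pooled vectors rather than generated token ids, so there is
# nothing for the Detokenizer stage to decode.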
189 | if self.task_type_str == "embed" and detokenize: 190 | logger.info("Detokenization is skipped because of embedding workload") 191 | detokenize = False 192 | 193 | ray_remote_args = {} 194 | if self.engine_initializer.accelerator_type: 195 | ray_remote_args["accelerator_type"] = ( 196 | self.engine_initializer.accelerator_type 197 | ) 198 | ray_remote_args.update({"runtime_env": {"env_vars": self.env_vars}}) 199 | 200 | if dataset is not None: 201 | self.ds = dataset 202 | elif self.ds is None: 203 | self.load(repartition_by_batch_size) 204 | assert self.ds is not None 205 | 206 | num_gpus = self.engine_initializer.num_gpus 207 | if self.engine_initializer.use_ray_placement_group: 208 | # Specify the number of GPUs required per LLM instance. 209 | # Note: for TP>1, num_gpus has to be 0 - instead, we specify a placement group 210 | if self.engine_initializer.num_gpus > 1: 211 | 212 | def _scheduling_strategy_fn( 213 | num_gpus_per_instance: int, accelerator_type: str 214 | ): 215 | def _get_bundle() -> Dict[str, float]: 216 | bundle: Dict[str, float] = {"GPU": 1, "CPU": 1} 217 | if accelerator_type: 218 | bundle[f"accelerator_type:{accelerator_type}"] = 0.001 219 | return bundle 220 | 221 | pg = ray.util.placement_group( 222 | [_get_bundle()] * num_gpus_per_instance, 223 | strategy="STRICT_PACK", 224 | ) 225 | self.pgs.append(pg) 226 | return dict( 227 | scheduling_strategy=PlacementGroupSchedulingStrategy( 228 | pg, placement_group_capture_child_tasks=True 229 | ) 230 | ) 231 | 232 | ray_remote_args.update( 233 | _scheduling_strategy_fn( 234 | self.engine_initializer.num_gpus, 235 | self.engine_initializer.accelerator_type, 236 | ) 237 | ) 238 | 239 | self.ds = self.ds.map_batches( 240 | self.engine_initializer.get_engine_cls(), 241 | fn_constructor_kwargs=self.engine_initializer.get_engine_constructor_args( 242 | self.workload 243 | ), 244 | zero_copy_batch=True, 245 | # The number of running actors. 246 | concurrency=self.env_config.num_replicas, 247 | # The number of running batches for an actor in Ray Core level. 248 | # The value may not be optimal when the batch size is too small, 249 | # but it should be good enough for batch size >= 64. 250 | max_concurrency=4, 251 | batch_size=self.env_config.batch_size, 252 | num_gpus=num_gpus, 253 | **ray_remote_args, 254 | ) 255 | 256 | # Skip detokenization. Usually used for tuning, profiling, and embedding. 257 | if detokenize: 258 | self.ds = self.ds.map_batches( 259 | Detokenizer, 260 | fn_constructor_kwargs={"model": self.engine_initializer.model}, 261 | zero_copy_batch=True, 262 | concurrency=(1, self.num_replicas), 263 | batch_size=self.env_config.batch_size, 264 | ) 265 | 266 | if output_path is not None: 267 | # Dataset will become None after writing to parquet. 268 | self.ds = self.ds.write_parquet(output_path) 269 | elif eager: 270 | self.ds = self.ds.materialize() 271 | 272 | # If the dataset pipeline is executed due to eager mode, we can cleanup. 
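# Note: in lazy (non-eager) mode the returned dataset has not executed yet, so the
# placement groups created for the engine actors are kept until cleanup() is called explicitly.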
273 | if eager: 274 | self.cleanup() 275 | 276 | return self.ds 277 | 278 | def cleanup(self): 279 | for pg in self.pgs: 280 | remove_placement_group(pg) 281 | self.pgs.clear() 282 | -------------------------------------------------------------------------------- /evals/batch/tokenizer.py: -------------------------------------------------------------------------------- 1 | """Tokenizer and detokenizer for LLMs.""" 2 | 3 | import time 4 | from typing import Any, AsyncGenerator, Dict, Union 5 | 6 | import numpy as np 7 | from transformers import ( 8 | AutoProcessor, 9 | AutoTokenizer, 10 | PreTrainedTokenizer, # type: ignore 11 | PreTrainedTokenizerFast, 12 | ) 13 | 14 | from .logging import get_logger 15 | from .utils import async_caller_empty_batch_handler, maybe_download_model_from_s3 16 | 17 | AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast, Any] 18 | 19 | logger = get_logger(__name__) 20 | 21 | 22 | def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: 23 | """Get tokenizer with cached properties. 24 | 25 | This will patch the tokenizer object in place. 26 | By default, transformers will recompute multiple tokenizer properties 27 | each time they are called, leading to a significant slowdown. This 28 | function caches these properties for faster access. 29 | 30 | Args: 31 | tokenizer: The tokenizer object. 32 | 33 | Returns: 34 | The patched tokenizer object. 35 | """ 36 | chat_template = getattr(tokenizer, "chat_template", None) 37 | # For VLM, the text tokenizer is wrapped by a processor. 38 | if hasattr(tokenizer, "tokenizer"): 39 | tokenizer = tokenizer.tokenizer 40 | # Some VLM's tokenizer has chat_template attribute (e.g. Qwen/Qwen2-VL-7B-Instruct), 41 | # however some other VLM's tokenizer does not have chat_template attribute (e.g. 42 | # mistral-community/pixtral-12b). Therefore, we cache the processor's chat_template. 43 | if chat_template is None: 44 | chat_template = getattr(tokenizer, "chat_template", None) 45 | 46 | tokenizer_all_special_ids = set(tokenizer.all_special_ids) 47 | tokenizer_all_special_tokens_extended = tokenizer.all_special_tokens_extended 48 | tokenizer_all_special_tokens = set(tokenizer.all_special_tokens) 49 | tokenizer_len = len(tokenizer) 50 | 51 | class CachedTokenizer(tokenizer.__class__): # type: ignore 52 | @property 53 | def all_special_ids(self): 54 | return tokenizer_all_special_ids 55 | 56 | @property 57 | def all_special_tokens(self): 58 | return tokenizer_all_special_tokens 59 | 60 | @property 61 | def all_special_tokens_extended(self): 62 | return tokenizer_all_special_tokens_extended 63 | 64 | @property 65 | def chat_template(self): 66 | return chat_template 67 | 68 | def __len__(self): 69 | return tokenizer_len 70 | 71 | CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}" 72 | 73 | tokenizer.__class__ = CachedTokenizer 74 | return tokenizer 75 | 76 | 77 | class ChatTemplateTokenizer: 78 | """Tokenizer with chat template applied. 79 | 80 | Args: 81 | model: The model name. 82 | """ 83 | 84 | def __init__(self, model: str) -> None: 85 | self.model = maybe_download_model_from_s3(model) 86 | self.tokenizer = get_cached_tokenizer(AutoProcessor.from_pretrained(self.model)) 87 | 88 | @async_caller_empty_batch_handler 89 | async def __call__( 90 | self, batch: Dict[str, np.ndarray] 91 | ) -> AsyncGenerator[Dict[str, Any], None]: 92 | """Call the tokenizer to process a batch. 
93 | This function first process inputs in the batch asynchronously to apply 94 | chat template because this step cannot be batched. Then it tokenizes all inputs at once. 95 | 96 | Args: 97 | batch: The batch. 98 | 99 | Yields: 100 | The output. 101 | """ 102 | if "messages" not in batch: 103 | raise KeyError(f'"messages" not found in {batch.keys()=}') 104 | 105 | start_t = time.perf_counter() 106 | messages = batch["messages"].tolist() 107 | 108 | # Tokenize text prompts. 109 | full_prompts = [] 110 | for conversation in messages: 111 | # add generation prompt only if the last message is from the user 112 | add_generation_prompt = conversation[-1]["role"] == "user" 113 | full_prompts.append( 114 | self.tokenizer.apply_chat_template( 115 | conversation, 116 | tokenize=False, 117 | add_generation_prompt=add_generation_prompt, 118 | continue_final_message=not add_generation_prompt, 119 | ) 120 | ) 121 | tokens = self.tokenizer(full_prompts)["input_ids"] 122 | time_taken_tokenizer = time.perf_counter() - start_t 123 | 124 | ret = { 125 | **batch, 126 | "prompt": full_prompts, 127 | "tokenized_prompt": tokens, 128 | "num_text_tokens": [len(t) for t in tokens], 129 | "time_taken_tokenizer": [time_taken_tokenizer] * len(tokens), 130 | } 131 | 132 | yield ret 133 | 134 | 135 | class Detokenizer: 136 | """Detokenizer for LLMs. 137 | 138 | Args: 139 | model: The model name. 140 | """ 141 | 142 | def __init__(self, model: str) -> None: 143 | self.model = maybe_download_model_from_s3(model) 144 | self.tokenizer = get_cached_tokenizer(AutoTokenizer.from_pretrained(self.model)) 145 | 146 | async def __call__( 147 | self, batch: Dict[str, np.ndarray] 148 | ) -> AsyncGenerator[Dict[str, Any], None]: 149 | """Detokenize the batch. 150 | 151 | Args: 152 | batch: The batch data. 153 | 154 | Returns: 155 | The detokenized batch. 
156 | """ 157 | start_t = time.perf_counter() 158 | generated_tokens = batch["generated_tokens"] 159 | flattened = False 160 | # if the generated tokens are nested lists, flatten them 161 | if isinstance(generated_tokens[0][0], np.ndarray): 162 | # flatten the lists of lists for detokenization 163 | flattened = True 164 | generated_tokens = [ 165 | token for tokens in generated_tokens for token in tokens 166 | ] # flattens list 167 | generated_text = self.tokenizer.batch_decode( 168 | generated_tokens, skip_special_tokens=True 169 | ) 170 | if flattened: 171 | # unflatten the list back to original structure 172 | curr_idx = 0 173 | generated_text_unflattened = [] 174 | for sublist in batch["generated_tokens"]: 175 | sublist_len = len(sublist) 176 | generated_text_unflattened.append( 177 | generated_text[curr_idx : curr_idx + sublist_len] 178 | ) 179 | curr_idx += sublist_len 180 | generated_text = generated_text_unflattened 181 | time_taken_detokenizer = time.perf_counter() - start_t 182 | yield { 183 | **batch, 184 | "generated_text": generated_text, 185 | "time_taken_detokenizer": [time_taken_detokenizer] * len(generated_text), 186 | } 187 | -------------------------------------------------------------------------------- /evals/batch/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions""" 2 | 3 | import os 4 | import subprocess 5 | import time 6 | from functools import wraps 7 | from pathlib import Path 8 | from typing import Any, Callable, Dict, List, Optional 9 | 10 | import pyarrow 11 | import ray 12 | from filelock import FileLock 13 | from huggingface_hub import snapshot_download 14 | from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit # type: ignore 15 | from ray.data import Dataset 16 | 17 | from .logging import get_logger 18 | 19 | logger = get_logger(__name__) 20 | 21 | 22 | # The default local root directory to store models downloaded from S3. 23 | # This path should always available on Anyscale platform. If not, then 24 | # we will fallback to FALLBACK_LOCAL_MODEL_ROOT. 25 | DEFAULT_LOCAL_MODEL_ROOT = "/mnt/local_storage/cache" 26 | FALLBACK_LOCAL_MODEL_ROOT = "/tmp/cache" 27 | 28 | 29 | def update_dict_recursive( 30 | orig: Dict[str, Any], update_dict: Dict[str, Any] 31 | ) -> Dict[str, Any]: 32 | """Update a dictionary (in-place) recursively. 33 | 34 | Args: 35 | orig: The original dictionary. 36 | update_dict: The dictionary to update. 37 | 38 | Returns: 39 | The updated dictionary. 40 | """ 41 | for key, value in update_dict.items(): 42 | if isinstance(value, dict): 43 | orig[key] = update_dict_recursive(orig.get(key, {}), value) 44 | else: 45 | orig[key] = value 46 | return orig 47 | 48 | 49 | def wait_for_gpu_memory_to_clear(threshold_bytes: int, timeout_s: float = 120) -> None: 50 | """Wait for GPU memory to be below a threshold. 51 | Use nvml instead of pytorch to reduce measurement error from torch cuda context. 52 | 53 | Args: 54 | threshold_bytes: The threshold in bytes. 55 | timeout_s: The timeout in seconds. 56 | 57 | Raises: 58 | ValueError: If the memory is not free after the timeout. 
59 | """ 60 | devices = [int(x) for x in ray.get_gpu_ids()] 61 | nvmlInit() 62 | start_time = time.monotonic() 63 | while True: 64 | output = {} 65 | output_raw = {} 66 | for device in devices: 67 | dev_handle = nvmlDeviceGetHandleByIndex(device) 68 | mem_info = nvmlDeviceGetMemoryInfo(dev_handle) 69 | gb_used = mem_info.used / 2**30 70 | output_raw[device] = gb_used 71 | output[device] = f"{gb_used:.02f}" 72 | 73 | logger.info( 74 | "GPU memory used (GB): " + "; ".join(f"{k}={v}" for k, v in output.items()) 75 | ) 76 | 77 | dur_s = time.monotonic() - start_time 78 | if all(v <= (threshold_bytes / 2**30) for v in output_raw.values()): 79 | logger.info( 80 | "Done waiting for free GPU memory on devices %s (%.2f GB) %.02f s", 81 | devices, 82 | threshold_bytes / 2**30, 83 | dur_s, 84 | ) 85 | break 86 | 87 | if dur_s >= timeout_s: 88 | raise ValueError( 89 | f"Memory of devices {devices=} not free after " 90 | f"{dur_s=:.02f} ({threshold_bytes/2**30=})" 91 | ) 92 | 93 | time.sleep(5) 94 | 95 | 96 | def run_s3_command(command: List[str], error_msg: Optional[str] = None) -> Any: 97 | """Run a S3 command and raise an exception if it fails. 98 | 99 | Args: 100 | command: The command to run. 101 | error_msg: The error message to raise if the command fails. 102 | 103 | Returns: 104 | The result of the command. 105 | """ 106 | try: 107 | return subprocess.run(command, check=True, capture_output=True) 108 | except Exception as err: 109 | # Not using logger.exception since we raise anyway. 110 | if isinstance(err, (subprocess.TimeoutExpired, subprocess.CalledProcessError)): 111 | stdout_txt = f"\nSTDOUT: {err.stdout.decode()}" if err.stdout else "" 112 | stderr_txt = f"\nSTDERR: {err.stderr.decode()}" if err.stderr else "" 113 | else: 114 | stdout_txt = "" 115 | stderr_txt = "" 116 | 117 | if error_msg is not None: 118 | logger.error( 119 | "(%s) %s. Command %s.%s%s", 120 | str(err), 121 | error_msg, 122 | command, 123 | stdout_txt, 124 | stderr_txt, 125 | ) 126 | raise 127 | 128 | 129 | def download_hf_model_from_s3(s3_path: str, local_path_root: str) -> str: 130 | """Download model files from s3 to the local path. The model path prefix 131 | will be added to the local path. 132 | 133 | Args: 134 | s3_path: The s3 path to download from. 135 | local_path_root: The local path root to download to. 136 | 137 | Returns: 138 | The local path where the files are downloaded. 139 | """ 140 | if not s3_path.startswith("s3://"): 141 | raise ValueError(f"Invalid s3 path: {s3_path}") 142 | 143 | prefix = "/".join(s3_path.split("/")[3:]) 144 | local_path = Path(local_path_root) / prefix 145 | 146 | # Use aws s3 sync to make sure we don't download the same files again. 147 | command = ["aws", "s3", "sync", s3_path, local_path] 148 | 149 | logger.info( 150 | "Downloading %s to %s using %s", 151 | s3_path, 152 | local_path, 153 | command, 154 | ) 155 | with FileLock(local_path / ".lock", timeout=-1): 156 | run_s3_command(command, f"Failed to sync model from {s3_path} to {local_path}") 157 | return str(local_path) 158 | 159 | 160 | def maybe_download_model_from_s3( 161 | model_path: str, local_path_root: Optional[str] = None 162 | ) -> str: 163 | """Download model from s3 to the local path, and return the local model path. 164 | 165 | Args: 166 | model_path: The maybe s3 path to download from. 167 | lora_path_root: The local path root to download to. If not provided, 168 | will use the default path (/mnt/local_storage/cache or /tmp/cache). 169 | 170 | Returns: 171 | The local path where the model is downloaded. 
172 | """ 173 | s3_path = os.path.expandvars(model_path) 174 | if not s3_path.startswith("s3://"): 175 | return model_path 176 | 177 | local_root = Path(local_path_root or DEFAULT_LOCAL_MODEL_ROOT) 178 | try: 179 | local_root.mkdir(parents=True, exist_ok=True) 180 | # Check if the directory is writable. 181 | with open(local_root / ".test", "w") as fp: 182 | fp.write("test") 183 | except PermissionError: 184 | logger.warning( 185 | "Failed to create local root directory at %s (Permission denied). " 186 | "Reset local root to %s", 187 | local_root, 188 | FALLBACK_LOCAL_MODEL_ROOT, 189 | ) 190 | local_root = Path(FALLBACK_LOCAL_MODEL_ROOT) 191 | local_root.mkdir(parents=True, exist_ok=True) 192 | 193 | return download_hf_model_from_s3(s3_path, local_root) 194 | 195 | 196 | def download_model_from_hf( 197 | model_name: str, local_path_root: Optional[str] = None 198 | ) -> str: 199 | """Download model files from Hugging Face to the local path. 200 | If the local path has permission issues, return the original model name, but warn the user. 201 | 202 | Args: 203 | model_name: The model name to download. 204 | local_path_root: The local path root to download to. If not provided, 205 | will use the default path (/mnt/local_storage/cache or /tmp/cache 206 | 207 | Returns: 208 | The local path where the files are downloaded. 209 | """ 210 | # If the model_name is already a local path, skip downloading 211 | if model_name.startswith("/"): 212 | return model_name 213 | 214 | local_model_path = Path(local_path_root or DEFAULT_LOCAL_MODEL_ROOT) / model_name 215 | try: 216 | local_model_path.mkdir(parents=True, exist_ok=True) 217 | 218 | # Check directory is writable by trying to list files (avoiding .test file creation) 219 | if not os.access(local_model_path, os.W_OK): 220 | raise PermissionError 221 | except PermissionError: 222 | logger.warning( 223 | "Failed to create or write to the model directory at %s (Permission denied). " 224 | "Please grant permission, or each worker may download the model, hitting rate limits.", 225 | local_model_path, 226 | ) 227 | return model_name # Return the original model name 228 | 229 | snapshot_download(repo_id=model_name, local_dir=str(local_model_path)) 230 | 231 | return str(local_model_path) 232 | 233 | 234 | def async_caller_empty_batch_handler(func) -> Callable: 235 | """A decorator to handle the case where all rows are checkpointed. 236 | When all rows are checkpointed, we will still get a batch 237 | in pyarrow.Table format with empty rows. This is a bug and 238 | is being tracked here: 239 | https://github.com/anyscale/rayturbo/issues/1292 240 | 241 | Args: 242 | func: The function to wrap. 243 | 244 | Returns: 245 | The wrapped function. 246 | """ 247 | 248 | @wraps(func) 249 | async def wrapper(self, batch): 250 | if not isinstance(batch, pyarrow.lib.Table) or batch.num_rows > 0: 251 | async for x in func(self, batch): 252 | yield x 253 | else: 254 | yield {} 255 | 256 | return wrapper 257 | 258 | 259 | def has_materialized(ds: Dataset) -> bool: 260 | """Check if the dataset has been materialized. 261 | TODO: This API should be moved to Ray Data. 262 | 263 | Args: 264 | ds: The dataset to check. 265 | 266 | Returns: 267 | True if the dataset is materialized, False otherwise. 
268 | """ 269 | return bool(ds.stats()) 270 | -------------------------------------------------------------------------------- /evals/batch/workload.py: -------------------------------------------------------------------------------- 1 | """The workload.""" 2 | 3 | import math 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | from typing import Any, Dict, Optional, Tuple 7 | 8 | import yaml 9 | from ray.data.dataset import Dataset 10 | 11 | from .logging import get_logger 12 | from .tokenizer import ChatTemplateTokenizer 13 | 14 | logger = get_logger(__name__) 15 | 16 | 17 | def load_config_from_path(config_path: str) -> Dict[str, Any]: 18 | if isinstance(config_path, str): 19 | config_path = Path(config_path) 20 | if not config_path.exists(): 21 | raise FileNotFoundError(f"Engine config file {config_path} not found.") 22 | with open(config_path, "r") as filep: 23 | config = yaml.safe_load(filep) 24 | 25 | assert isinstance(config, dict) 26 | return config 27 | 28 | 29 | @dataclass 30 | class EvalWorkload: 31 | # The ray.data.Dataset. If None, the Worklod must initialize the dataset 32 | # in __post_init__(). 33 | dataset: Optional[Dataset] 34 | # Sampling a fraction of dataset for benchmarking and testing. If the value 35 | # is greater than one, it means to take the first N rows from the dataset. 36 | dataset_fraction: float = 1.0 37 | # Tokenizer class for the workload. 38 | tokenizer_cls: Any = ChatTemplateTokenizer 39 | 40 | # Sampling parameters for the workload, such as max_tokens, temperature, etc. 41 | # It can only be None when the workload is used for embedding. 42 | sampling_params: Dict[str, Any] = field( 43 | default_factory=lambda: {"max_tokens": 4096} 44 | ) 45 | # Pooling parameters for the workload, such as pooling_type, etc. 46 | # It can only be None when the workload is used for auto-regressive generation. 47 | pooling_params: Optional[Dict[str, Any]] = None 48 | 49 | need_tokenize: bool = True 50 | # When specified, the tokenization will be async because we don't need to 51 | # materialize an entire tokenized dataset to get the maximum tokens in prompt. 52 | # With the default value of -1, the actual value will be set after tokenization. 53 | max_tokens_in_prompt: int = -1 54 | 55 | # Do we want to carry over input keys that are not in the output? 56 | carryover_inputs: bool = True 57 | 58 | def validate(self): 59 | if not ((self.sampling_params is None) ^ (self.pooling_params is None)): 60 | raise ValueError( 61 | "Either sampling_params or pooling_params must be specified." 62 | ) 63 | 64 | def get_preprocessed_dataset( 65 | self, 66 | max_batch_size: int = 256, 67 | repartition_by_batch_size: bool = False, 68 | ) -> Tuple[Dataset, Optional[int]]: 69 | """Load the dataset and process it. 70 | 71 | Args: 72 | max_batch_size: The batch size. This determines the number of rows per 73 | block. Note that if some rows have already processed (checkpointed), 74 | the actual batch size may be smaller than this value. 75 | repartition_by_batch_size: Whether to repartition the dataset by the 76 | batch size for fault tolerance granularity. You should enable 77 | this when the dataset is not from parquet and checkpointing is 78 | disabled. 79 | 80 | Returns: 81 | The processed dataset and the number of blocks. If checkpointing is 82 | enabled, then the number of blocks is unknown. 
83 | """ 84 | self.validate() 85 | if self.dataset is None: 86 | raise ValueError( 87 | "dataset must be specified or initialized before calling " 88 | "get_preprocessed_dataset()." 89 | ) 90 | 91 | self.max_batch_size = max_batch_size 92 | 93 | ds = self.dataset 94 | if self.dataset_fraction < 1.0: 95 | logger.info("Sampling %f dataset", self.dataset_fraction) 96 | ds = ds.random_sample(self.dataset_fraction, seed=0) 97 | elif self.dataset_fraction > 1.0: 98 | n_rows = int(self.dataset_fraction) 99 | logger.info("Taking the first %d rows from dataset", n_rows) 100 | ds = ds.limit(n_rows) 101 | 102 | if repartition_by_batch_size: 103 | num_requests = ds.count() 104 | num_blocks = math.ceil(num_requests / max_batch_size) 105 | ds = ds.repartition(num_blocks) 106 | 107 | logger.info("#Requests: %d (%d blocks)", num_requests, num_blocks) 108 | else: 109 | # When checkpointing is enabled, the number of blocks is unknown 110 | # at this point. 111 | num_blocks = None 112 | 113 | mapper_fn = ( 114 | self.parse_row_with_carryover_input 115 | if self.carryover_inputs 116 | else self.parse_row 117 | ) 118 | return ds.map(mapper_fn), num_blocks 119 | 120 | def tokenizer_constructor_kwargs(self, model: str): 121 | """Return the keyword arguments for tokenizer constructor. 122 | 123 | Args: 124 | model: The model name. 125 | 126 | Returns: 127 | The keyword arguments for tokenizer constructor. 128 | """ 129 | return {"model": model} 130 | 131 | def parse_row_with_carryover_input(self, row: dict[str, Any]) -> dict[str, Any]: 132 | """Same as parse_row but carries over the input keys that are not in the output row. 133 | 134 | This is useful when we want to keep the input keys in the output. 135 | This method assumes if user returns the same output keys as 136 | input keys they have already copied input over and there is 137 | no need to do it again for those keys. We will just copy the input_keys that 138 | are not in the output row. 139 | 140 | Args: 141 | row: The row to be parsed. 142 | 143 | Returns: 144 | The parsed row. 145 | """ 146 | input_row_keys = set(row.keys()) 147 | output_row = self.parse_row(row) 148 | output_row_keys = set(output_row.keys()) 149 | return { 150 | **{k: row[k] for k in input_row_keys if k not in output_row_keys}, 151 | **output_row, 152 | } 153 | 154 | def parse_row(self, row: Dict[str, Any]) -> Dict[str, Any]: 155 | """Parse each row in the dataset to make them compatible with 156 | OpenAI chat API messages. Specifically, the output row should only 157 | include a single key "messages" with type Dict[str, Union[str, List[Dict]]]. 
158 | """ 159 | return {"messages": row["item"][1], "index": row["item"][0]} 160 | -------------------------------------------------------------------------------- /evals/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanomaoli/llm_reproducibility/8a373c5a159a27e59783394827cecadd6255484e/evals/common/__init__.py -------------------------------------------------------------------------------- /evals/common/entities.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from dataclasses import dataclass 3 | from enum import Enum 4 | from importlib import resources 5 | from pathlib import Path 6 | from typing import Literal, Optional, Union 7 | 8 | import yaml 9 | from openai import NOT_GIVEN, NotGiven 10 | from openai.types.chat import ChatCompletionReasoningEffort 11 | from pydantic import BaseModel, ConfigDict, Field 12 | from vllm import SamplingParams as VLLMSamplingParams 13 | 14 | TEMPERATURE_DEFAULT = 0 15 | TOP_P_DEFAULT = 1 16 | MAX_TOKENS_DEFAULT = 32768 17 | 18 | 19 | class Backend(str, Enum): 20 | VLLM = "vllm" 21 | OPENAI = "openai" 22 | RAY = "ray" 23 | 24 | 25 | class OpenAISamplingParams(BaseModel): 26 | model_config = ConfigDict(arbitrary_types_allowed=True) 27 | 28 | temperature: float = TEMPERATURE_DEFAULT 29 | top_p: float = TOP_P_DEFAULT 30 | n: int = 1 31 | max_tokens: int = MAX_TOKENS_DEFAULT 32 | reasoning_effort: Union[ChatCompletionReasoningEffort, NotGiven] = NOT_GIVEN 33 | frequency_penalty: Optional[float] = None 34 | 35 | 36 | class SamplingParameters(BaseModel): 37 | model_config = ConfigDict(arbitrary_types_allowed=True) 38 | 39 | params: Union[OpenAISamplingParams, VLLMSamplingParams] 40 | 41 | @classmethod 42 | def from_dict(cls, backend: Backend, params: dict): 43 | params = copy.deepcopy(params) 44 | if backend == Backend.OPENAI: 45 | return cls(params=OpenAISamplingParams(**params)) 46 | # Currently, ray-data based processor only supports vllm as the inference engine 47 | elif backend in [Backend.VLLM, Backend.RAY]: 48 | return cls(params=VLLMSamplingParams(**params)) 49 | else: 50 | raise ValueError(f"Invalid backend type: {backend}") 51 | 52 | def __repr__(self): 53 | return f"SamplingParameters(params={self.params})" 54 | 55 | def to_dict(self): 56 | if isinstance(self.params, OpenAISamplingParams): 57 | return self.params.model_dump() 58 | elif isinstance(self.params, VLLMSamplingParams): 59 | return {k: getattr(self.params, k) for k in self.params.__annotations__} 60 | else: 61 | raise ValueError(f"Invalid sampling parameters type: {type(self.params)}") 62 | 63 | 64 | class OpenAIClientArgs(BaseModel): 65 | api_key: Optional[str] = Field(None, description="OpenAI API key") 66 | base_url: Optional[str] = Field(None, description="OpenAI base URL") 67 | project: Optional[str] = Field(None, description="OpenAI project") 68 | organization: Optional[str] = Field(None, description="OpenAI organization") 69 | 70 | 71 | class RayLLMEngineArgs(BaseModel): 72 | 73 | tensor_parallel_size: Optional[int] = Field( 74 | default=None, description="Tensor parallelism size" 75 | ) 76 | num_replicas: Optional[int] = Field( 77 | default=None, description="Number of replicas to use for Ray" 78 | ) 79 | batch_size: Optional[int] = Field(default=None, description="Batch size for Ray") 80 | accelerator_type: Optional[str] = Field( 81 | default=None, description="Accelerator type for the inference engine" 82 | ) 83 | gpu_memory_utilization: 
Optional[float] = Field( 84 | default=None, description="GPU memory utilization for the inference engine" 85 | ) 86 | dtype: Optional[Literal["float32", "float16", "bfloat16", "float8", "auto"]] = ( 87 | Field(default=None, description="Data type for inference engine.") 88 | ) 89 | 90 | def get_ray_llm_config(self): 91 | config_path = Path( 92 | resources.files("skythought.evals").joinpath("ray_configs/ray_config.yaml") 93 | ) 94 | with open(config_path) as f: 95 | default_config = yaml.safe_load(f) 96 | 97 | if self.tensor_parallel_size is not None: 98 | default_config["engine_kwargs"][ 99 | "tensor_parallel_size" 100 | ] = self.tensor_parallel_size 101 | 102 | if self.num_replicas is not None: 103 | default_config["env_config"]["num_replicas"] = self.num_replicas 104 | 105 | if self.batch_size is not None: 106 | default_config["env_config"]["batch_size"] = self.batch_size 107 | 108 | if self.accelerator_type is not None: 109 | default_config["accelerator_type"] = self.accelerator_type 110 | 111 | if self.gpu_memory_utilization is not None: 112 | default_config["engine_kwargs"][ 113 | "gpu_memory_utilization" 114 | ] = self.gpu_memory_utilization 115 | 116 | # FIXME (sumanthrh): there can be a corner case when we support providing a config yaml directly, and this will override the dtype 117 | if self.dtype is not None: 118 | default_config["engine_kwargs"]["dtype"] = self.dtype 119 | 120 | return default_config 121 | 122 | 123 | @dataclass 124 | class BackendParameters: 125 | model_config = ConfigDict(arbitrary_types_allowed=True) 126 | 127 | params: Union[dict, OpenAIClientArgs, RayLLMEngineArgs] 128 | 129 | @classmethod 130 | def from_dict(cls, backend_type: Backend, params: dict): 131 | if backend_type == Backend.RAY: 132 | return cls(params=RayLLMEngineArgs(**params)) 133 | elif backend_type == Backend.VLLM: 134 | # passed directly to LLM(..) instantiation 135 | return cls(params=params) 136 | elif backend_type == Backend.OPENAI: 137 | return cls(params=OpenAIClientArgs(**params)) 138 | else: 139 | raise ValueError(f"Invalid backend type: {backend_type}") 140 | 141 | def to_dict(self): 142 | if isinstance(self.params, RayLLMEngineArgs): 143 | return self.params.model_dump() 144 | elif isinstance(self.params, dict): 145 | return self.params 146 | elif isinstance(self.params, OpenAIClientArgs): 147 | return self.params.model_dump() 148 | else: 149 | raise ValueError(f"Invalid backend parameters type: {type(self.params)}") 150 | -------------------------------------------------------------------------------- /evals/labeled_numina_difficulty/README.md: -------------------------------------------------------------------------------- 1 | # Labeled NUMINA Difficulty Data 2 | 3 | We also include data of labeled difficulty from NUMINA, in the following files: `labeled_amc_aime_0_-1.json`, `labeled_math_0_-1.json`, `labeled_olympiads_0_-1.json`. These files can be found and downloaded from [HuggingFace](https://huggingface.co/datasets/NovaSky-AI/labeled_numina_difficulty). 
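A minimal sketch of how one might fetch and inspect one of these files (repository and file names as listed above; `huggingface_hub` is assumed to be installed):

```python
import json

from huggingface_hub import hf_hub_download

# Download one labeled-difficulty split from the dataset repository.
path = hf_hub_download(
    repo_id="NovaSky-AI/labeled_numina_difficulty",
    filename="labeled_math_0_-1.json",
    repo_type="dataset",
)

# Load the labeled records and report how many there are.
with open(path) as f:
    labeled = json.load(f)
print(f"Loaded {len(labeled)} labeled records")
```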
-------------------------------------------------------------------------------- /evals/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import ModelConfig, get_system_prompt_keys 2 | 3 | __all__ = ["ModelConfig", "get_system_prompt_keys"] 4 | -------------------------------------------------------------------------------- /evals/models/base.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from pathlib import Path 3 | from typing import Optional, Union 4 | 5 | import yaml 6 | from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_validator 7 | 8 | MODEL_CONFIG_FILE_PATH = Path(__file__).parent / "model_configs.yaml" 9 | # cache the configs in a global var 10 | ALL_MODEL_CONFIGS = None 11 | 12 | 13 | class StringInFile(BaseModel): 14 | path: str 15 | _string: str = PrivateAttr(default=None) 16 | 17 | @model_validator(mode="after") 18 | def validate_and_extract_string(self): 19 | full_path = Path(MODEL_CONFIG_FILE_PATH).parent / self.path 20 | if full_path.exists(): 21 | with open(full_path, "r") as f: 22 | self._string = f.read() 23 | else: 24 | raise ValueError("Invalid path") 25 | return self 26 | 27 | @property 28 | def string(self): 29 | return self._string 30 | 31 | def __str__(self) -> str: 32 | return self._string 33 | 34 | 35 | def read_yaml(path: str): 36 | with open(path, "r") as f: 37 | return yaml.safe_load(f) 38 | 39 | 40 | class ModelConfig(BaseModel): 41 | model_config = ConfigDict(protected_namespaces=()) 42 | 43 | model_id: str 44 | name: Optional[str] = Field(default=None) 45 | # can be a string or a path to a file with the string 46 | system_prompt: Optional[Union[str, StringInFile]] = None 47 | user_template: Optional[Union[str, StringInFile]] = None 48 | assistant_prefill: Optional[str] = None 49 | 50 | @model_validator(mode="after") 51 | def validate_name(self): 52 | if self.name is None: 53 | self.name = self.model_id.split("/")[-1] 54 | return self 55 | 56 | @classmethod 57 | def from_model_id( 58 | cls, 59 | model_id: str, 60 | system_prompt_name: Optional[str] = None, 61 | system_prompt: Optional[str] = None, 62 | assistant_prefill: Optional[str] = None, 63 | ): 64 | global ALL_MODEL_CONFIGS 65 | # only one of the two can be provided 66 | assert ( 67 | system_prompt_name is None or system_prompt is None 68 | ), "Only one of `system_prompt_name` or `system_prompt` can be provided" 69 | init_kwargs = {} 70 | if ALL_MODEL_CONFIGS is None: 71 | ALL_MODEL_CONFIGS = read_yaml(MODEL_CONFIG_FILE_PATH) 72 | if model_id in ALL_MODEL_CONFIGS["models"]: 73 | init_kwargs = ALL_MODEL_CONFIGS["models"][model_id] 74 | 75 | if system_prompt_name: 76 | if system_prompt_name not in ALL_MODEL_CONFIGS["system_prompts"]: 77 | raise ValueError( 78 | f"Invalid system prompt template {system_prompt_name} provided." 79 | ) 80 | init_kwargs["system_prompt"] = ALL_MODEL_CONFIGS["system_prompts"][ 81 | system_prompt_name 82 | ] 83 | elif system_prompt: 84 | init_kwargs["system_prompt"] = system_prompt 85 | # if none was provided, and the model is not in the config file 86 | elif model_id not in ALL_MODEL_CONFIGS["models"]: 87 | init_kwargs = {} 88 | warnings.warn( 89 | f"Model {model_id} not found in {MODEL_CONFIG_FILE_PATH}. 
Initializing without any system prompt.", 90 | stacklevel=2, 91 | ) 92 | 93 | if assistant_prefill: 94 | init_kwargs["assistant_prefill"] = assistant_prefill 95 | 96 | init_kwargs["model_id"] = model_id 97 | return cls(**init_kwargs) 98 | 99 | 100 | def get_system_prompt_keys(): 101 | global ALL_MODEL_CONFIGS 102 | if ALL_MODEL_CONFIGS is None: 103 | ALL_MODEL_CONFIGS = read_yaml(MODEL_CONFIG_FILE_PATH) 104 | return list(ALL_MODEL_CONFIGS["system_prompts"].keys()) 105 | -------------------------------------------------------------------------------- /evals/models/model_configs.yaml: -------------------------------------------------------------------------------- 1 | system_prompts: 2 | qwen_cot: &qwen_cot_system_prompt "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step." 3 | prime_rl: &prime_rl_system_prompt 4 | # system prompt can also point to a text file. the path to the file should be relative to the parent dir of model_configs.yaml 5 | path: system_prompts/prime.txt 6 | skythought: &sky_t1_system_prompt "Your role as an assistant involves thoroughly exploring questions through a systematic long \ 7 | thinking process before providing the final precise and accurate solutions. This requires \ 8 | engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, \ 9 | backtracing, and iteration to develop well-considered thinking process. \ 10 | Please structure your response into two main sections: Thought and Solution. \ 11 | In the Thought section, detail your reasoning process using the specified format: \ 12 | <|begin_of_thought|> {thought with steps separated with '\n\n'} \ 13 | <|end_of_thought|> \ 14 | Each step should include detailed considerations such as analisying questions, summarizing \ 15 | relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining \ 16 | any errors, and revisiting previous steps. \ 17 | In the Solution section, based on various attempts, explorations, and reflections from the Thought \ 18 | section, systematically present the final solution that you deem correct. 
The solution should \ 19 | remain a logical, accurate, concise expression style and detail necessary step needed to reach the \ 20 | conclusion, formatted as follows: \ 21 | <|begin_of_solution|> \ 22 | {final formatted, precise, and clear solution} \ 23 | <|end_of_solution|> \ 24 | Now, try to solve the following question through the above guidelines:" 25 | 26 | user_templates: null 27 | # Example: 28 | # o1_mini: &o1_mini "Question: {input}\nAnswer: " 29 | 30 | models: 31 | o1-mini: 32 | # 'name' is by default in the huggingface format /, but can be customized here 33 | name: o1-mini 34 | system_prompt: null 35 | # user template's use positional argument for formatting 36 | user_template: "Question: {}\nAnswer: " 37 | 38 | o1-preview: 39 | system_prompt: null 40 | user_template: "Question: {}\nAnswer: " 41 | 42 | gpt-4o-mini: 43 | system_prompt: null 44 | user_template: "User: {}\nAssistant: " 45 | 46 | Qwen/Qwen2-7B-Instruct: 47 | system_prompt: *qwen_cot_system_prompt 48 | 49 | Qwen/QwQ-32B-Preview: 50 | system_prompt: *qwen_cot_system_prompt 51 | 52 | Qwen/Qwen2.5-72B-Instruct: 53 | system_prompt: *qwen_cot_system_prompt 54 | 55 | Qwen/Qwen2.5-32B-Instruct: 56 | system_prompt: *qwen_cot_system_prompt 57 | 58 | Qwen/Qwen2.5-7B-Instruct: 59 | system_prompt: *qwen_cot_system_prompt 60 | 61 | Qwen/Qwen2.5-1.5B-Instruct: 62 | system_prompt: *qwen_cot_system_prompt 63 | 64 | Qwen/Qwen2.5-Math-7B-Instruct: 65 | system_prompt: *qwen_cot_system_prompt 66 | 67 | Qwen/Qwen2.5-Math-72B-Instruct: 68 | system_prompt: *qwen_cot_system_prompt 69 | 70 | PRIME-RL/Eurus-2-7B-PRIME: 71 | system_prompt: *prime_rl_system_prompt 72 | 73 | NovaSky-AI/Sky-T1-32B-Preview: 74 | system_prompt: *sky_t1_system_prompt 75 | 76 | NovaSky-AI/Sky-T1-32B-Flash: 77 | system_prompt: *sky_t1_system_prompt -------------------------------------------------------------------------------- /evals/models/system_prompts/prime.txt: -------------------------------------------------------------------------------- 1 | When tackling complex reasoning tasks, you have access to the following actions. Use them as needed to progress through your thought process. After each action, determine and state the next most appropriate action to take. 2 | 3 | Actions: 4 | 5 | {actions} 6 | 7 | Your action should contain multiple steps, and each step starts with #. After each action (except OUTPUT), state which action you will take next with ''Next action: [Your action]'' and finish this turn. Continue this process until you reach a satisfactory conclusion or solution to the problem at hand, at which point you should use the [OUTPUT] action. The thought process is completely invisible to user, so [OUTPUT] should be a complete response. You should strictly follow the format below: 8 | 9 | [ACTION NAME] 10 | 11 | # Your action step 1 12 | 13 | # Your action step 2 14 | 15 | # Your action step 3 16 | 17 | ... 
18 | 19 | Next action: [NEXT ACTION NAME] 20 | 21 | 22 | Now, begin with the [ASSESS] action for the following task: -------------------------------------------------------------------------------- /evals/ray_configs/ray_config.yaml: -------------------------------------------------------------------------------- 1 | llm_engine: vllm # currently only vllm supported 2 | accelerator_type: null # accelerator name as specified here: https://docs.ray.io/en/master/ray-core/accelerator-types.html#accelerator-types 3 | engine_kwargs: # vllm engine kwargs 4 | tensor_parallel_size: 4 5 | gpu_memory_utilization: 0.9 6 | dtype: auto 7 | # other optional vllm engine kwargs to tune performance! 8 | # pipeline_parallel_size: 1 9 | # max_num_seqs: 448 10 | # use_v2_block_manager: True 11 | # enable_prefix_caching: False 12 | # preemption_mode: "recompute" 13 | # block_size: 16 14 | # kv_cache_dtype: "auto" 15 | # enforce_eager: False 16 | # enable_chunked_prefill: True 17 | # max_num_batched_tokens: 8192 18 | # max_seq_len_to_capture: 32768 19 | runtime_env: 20 | env_vars: 21 | VLLM_ATTENTION_BACKEND: "FLASH_ATTN" 22 | env_config: 23 | num_replicas: 2 # number of vllm replicas 24 | batch_size: 128 # ray pipeline internal batch size (used for map_batches call internally). Should usually be set to a value in [64, 128, 256] for best performance. 25 | -------------------------------------------------------------------------------- /evals/scoring/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Scorer 2 | from .gsm8k import GSM8KScorer 3 | from .math import MathEqualScorer, MathVerifyScorer 4 | 5 | __all__ = ["Scorer", "MathEqualScorer", "MathVerifyScorer", "GSM8KScorer"] 6 | -------------------------------------------------------------------------------- /evals/scoring/apps/__init__.py: -------------------------------------------------------------------------------- 1 | from .apps_scorer import APPSScorer 2 | 3 | __all__ = ["APPSScorer"] 4 | -------------------------------------------------------------------------------- /evals/scoring/apps/apps_scorer.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import multiprocessing 4 | from multiprocessing import Manager 5 | from typing import Any, Dict, List, Literal 6 | 7 | import numpy as np 8 | import ray 9 | from ray.exceptions import GetTimeoutError 10 | 11 | from ..base import Scorer 12 | from ...util.common import has_code 13 | 14 | from .apps_util import run_test as apps_run_test 15 | 16 | 17 | class APPSScorer(Scorer): 18 | """Scorer for the APPS dataset 19 | 20 | For the APPS dataset format, see https://huggingface.co/datasets/codeparrot/apps 21 | 22 | Args: 23 | response_column: The column name for the response (str). 24 | solutions_column: The column name with solutions (str). 25 | input_output_column: The column name with the test inputs and outputs (str). 26 | keyword_args_column: The column name for the keyword arguments to the instruction builder (str). 27 | key_column: The column name for the unique identifier (str). 28 | backend: The backend to use for scoring. Supports "ray" or "mp" (str). 
29 | """ 30 | 31 | SCORE_COLUMN = "apps_score" 32 | # timeout per sample 33 | TIMEOUT = 10 34 | 35 | def __init__( 36 | self, 37 | response_column="response", 38 | solutions_column="solutions", 39 | input_output_column="input_output", 40 | backend: Literal["mp", "ray"] = "ray", 41 | ) -> None: 42 | super().__init__() 43 | self.response_column = response_column 44 | self.solutions_column = solutions_column 45 | self.input_output_column = input_output_column 46 | self.backend = backend 47 | if self.backend not in ["mp", "ray"]: 48 | raise ValueError(f"Invalid backend for `APPSScorer`: {self.backend}") 49 | 50 | def score(self, row: Dict[str, Any]): 51 | 52 | code_filter_result = has_code(row[self.response_column]) 53 | if len(code_filter_result) == 0: 54 | return {self.SCORE_COLUMN: False} 55 | else: 56 | last_code = code_filter_result[-1] 57 | problem_to_check = copy.deepcopy(row) 58 | problem_to_check[self.input_output_column] = json.loads( 59 | row[self.input_output_column] 60 | ) 61 | try: 62 | problem_to_check[self.solutions_column] = json.loads( 63 | row[self.solutions_column] 64 | ) 65 | except Exception: 66 | problem_to_check[self.solutions_column] = "" 67 | 68 | if self.backend == "ray": 69 | score = _run_test_ray( 70 | problem_to_check[self.input_output_column], 71 | last_code, 72 | self.TIMEOUT, 73 | False, 74 | ) 75 | else: 76 | score = _run_test_mp( 77 | problem_to_check[self.input_output_column], 78 | last_code, 79 | self.TIMEOUT, 80 | False, 81 | ) 82 | return {self.SCORE_COLUMN: score} 83 | 84 | 85 | # NOTE (sumanthrh): We make sure that scoring for code generation is run on a separate process for isolation 86 | # We need to run scoring for each data sample in a separate process. Since ray doesn't play well with 87 | # multiprocessing, we launch scoring as a standalone ray task. Further, to make sure that resource requests 88 | # don't blow up for batched processing- for example, in a ray data pipeline, we reduce `num_cpus` to 0.01 from the default 89 | # value of 1. That way, scoring for different samples can timeshare on the same set of cpus. 
90 | @ray.remote(num_cpus=0.001) 91 | def _temp_run_ray(input_outputs, generation, debug) -> List[bool]: 92 | try: 93 | result: List[bool] = apps_run_test(input_outputs, test=generation, debug=debug) 94 | return result 95 | except Exception: 96 | pass 97 | return [] 98 | 99 | 100 | def _run_test_ray(input_outputs, generation, timeout, debug): 101 | try: 102 | result = ray.get( 103 | _temp_run_ray.remote(input_outputs, generation, debug), 104 | timeout=timeout + 1, 105 | ) 106 | except GetTimeoutError: 107 | result = [] 108 | return bool(result and np.all(result)) 109 | 110 | 111 | def _run_test_mp(input_outputs, generation, timeout, debug): 112 | def _temp_run(input_outputs, generation, debug, result) -> List[List[bool]]: 113 | try: 114 | result.append( 115 | apps_run_test(input_outputs=input_outputs, test=generation, debug=debug) 116 | ) 117 | except Exception: 118 | pass 119 | 120 | manager = Manager() 121 | result = manager.list() 122 | p = multiprocessing.Process( 123 | target=_temp_run, args=(input_outputs, generation, False, result) 124 | ) 125 | p.start() 126 | p.join(timeout=timeout + 1) 127 | if p.is_alive(): 128 | p.kill() 129 | return bool(result and np.all(result[0])) 130 | -------------------------------------------------------------------------------- /evals/scoring/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, AsyncIterator, Dict, List 3 | 4 | 5 | class Scorer(ABC): 6 | """Abstract base class for scorers.""" 7 | 8 | SCORE_COLUMN = "score" 9 | 10 | @abstractmethod 11 | def score(self, row: dict) -> Dict[str, Any]: 12 | """Scores a single row of data 13 | 14 | Args: 15 | row: A dictionary containing the data to score. (dict) 16 | 17 | Returns: 18 | A dictionary containing the score and any other relevant information. 19 | """ 20 | pass 21 | 22 | def __call__(self, row: dict): 23 | return {**row, **self.score(row)} 24 | 25 | 26 | class BatchScorer(ABC): 27 | """ 28 | Abstract base class for batch scorers. 29 | """ 30 | 31 | SCORE_COLUMN = "score" 32 | 33 | INTERNAL_IDX_KEY = "__internal_idx__" 34 | 35 | @abstractmethod 36 | async def score(self, rows: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]]: 37 | """Scores a batch of data 38 | 39 | Args: 40 | rows: list of input dictionaries. (list) 41 | 42 | Returns: 43 | An async iterator of dictionaries containing the score and any other relevant information. 44 | """ 45 | pass 46 | 47 | async def __call__(self, batch: Dict[str, Any]) -> AsyncIterator[Dict[str, Any]]: 48 | """Scores a batch of data 49 | 50 | Yields results for each row in the batch as they finish. 51 | 52 | Args: 53 | batch: A dictionary containing the data to score. (dict) 54 | 55 | Returns: 56 | An async iterator of dictionaries containing the score and any other relevant information. 
57 | """ 58 | key = next(iter(batch.keys())) 59 | value = batch[key] 60 | num_rows = len(value) 61 | if hasattr(value, "tolist"): 62 | batch = {k: v.tolist() for k, v in batch.items()} 63 | else: 64 | batch = {k: list(v) for k, v in batch.items()} 65 | batch[self.INTERNAL_IDX_KEY] = list(range(num_rows)) 66 | rows = [{k: batch[k][i] for k in batch.keys()} for i in range(num_rows)] 67 | async for result in self.score(rows): 68 | if self.INTERNAL_IDX_KEY not in result: 69 | raise ValueError( 70 | f"`score` function must yield dictionaries with the key {self.INTERNAL_IDX_KEY}" 71 | ) 72 | idx = result[self.INTERNAL_IDX_KEY] 73 | row = rows[idx] 74 | yield {**row, **result} 75 | -------------------------------------------------------------------------------- /evals/scoring/gsm8k/__init__.py: -------------------------------------------------------------------------------- 1 | from .gsm8k_scorer import GSM8KScorer 2 | 3 | __all__ = ["GSM8KScorer"] 4 | -------------------------------------------------------------------------------- /evals/scoring/gsm8k/gsm8k_scorer.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Any, Dict, List 3 | 4 | from ...util.math_parsing_util import extract_answer, math_equal 5 | 6 | from ..base import Scorer 7 | 8 | 9 | class GSM8KScorer(Scorer): 10 | """Scorer for GSM8K based on the `math_equal` function from Qwen Math 11 | 12 | Args: 13 | response_column: The column name for the model generated response. 14 | answer_column: The column name for the ground truth answer. 15 | """ 16 | 17 | SCORE_COLUMN = "gsm8k_score" 18 | INVALID_ANS = "[invalid]" 19 | GT_RE = re.compile(r"#### (\-?[0-9\.\,]+)") 20 | ANS_RE = re.compile(r"((-?[$0-9.,]{2,})|(-?[0-9]+))") 21 | 22 | def __init__(self, response_column: str, answer_column: str): 23 | 24 | self.response_column = response_column 25 | self.answer_column = answer_column 26 | 27 | def score(self, row: dict) -> Dict[str, Any]: 28 | try: 29 | pred = self.extract_pred_from_response(row[self.response_key]) 30 | ref = self.extract_gt_answer(row[self.answer_key]) 31 | except Exception: 32 | return False 33 | return { 34 | self.SCORE_COLUMN: math_equal(pred, ref), 35 | } 36 | 37 | def extract_gt_answer(self, completion): 38 | match = self.GT_RE.search(completion) 39 | if match: 40 | match_str = match.group(1).strip() 41 | match_str = match_str.replace(",", "") 42 | return match_str 43 | else: 44 | return self.INVALID_ANS 45 | 46 | def extract_pred_from_response(self, response): 47 | answer = extract_answer(response) 48 | answer = self.sanitize_answer(response) 49 | return answer 50 | 51 | def sanitize_answer(self, answer): 52 | patterns_to_remove = [ 53 | ",", # Remove commas 54 | r"\$", # Remove dollar signs 55 | r"\.$" r"\*", # Remove trailing period # Remove asterisks 56 | ] 57 | for pattern in patterns_to_remove: 58 | answer = re.sub(pattern, "", answer) 59 | 60 | matches = self.ANS_RE.findall(answer) 61 | if matches: 62 | # get the last match (i.e final response) and the first / outer capturing group 63 | match_str = matches[-1][0].strip() 64 | return match_str 65 | else: 66 | return self.INVALID_ANS 67 | 68 | @property 69 | def expected_keys(self) -> List[str]: 70 | return [self.response_column, self.answer_column] 71 | -------------------------------------------------------------------------------- /evals/scoring/ifeval/__init__.py: -------------------------------------------------------------------------------- 1 | from .ifeval_scorer import IfEvalScorer 2 | 
3 | __all__ = ["IfEvalScorer"] 4 | -------------------------------------------------------------------------------- /evals/scoring/ifeval/ifeval_scorer.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List 2 | 3 | from .instructions_main import ( 4 | InputExample, 5 | test_instruction_following_loose, 6 | test_instruction_following_strict, 7 | ) 8 | 9 | from ..base import Scorer 10 | 11 | 12 | def process_results(doc, response): 13 | inp = InputExample( 14 | key=doc["key"], 15 | instruction_id_list=doc["instruction_id_list"], 16 | prompt=doc["prompt"], 17 | kwargs=doc["kwargs"], 18 | ) 19 | 20 | out_strict = test_instruction_following_strict(inp, response) 21 | out_loose = test_instruction_following_loose(inp, response) 22 | 23 | return { 24 | "prompt_level_strict_acc": out_strict.follow_all_instructions, 25 | "inst_level_strict_acc": out_strict.follow_instruction_list, 26 | "prompt_level_loose_acc": out_loose.follow_all_instructions, 27 | "inst_level_loose_acc": out_loose.follow_instruction_list, 28 | } 29 | 30 | 31 | class IfEvalScorer(Scorer): 32 | """Scorer for the IF-Eval task 33 | 34 | For the IFEval dataset format, see https://huggingface.co/datasets/google/IFEval 35 | 36 | Args: 37 | instruction_ids_column: The column name for the list of instruction ids (str). 38 | prompt_column: The column name for the prompt (str). 39 | keyword_args_column: The column name for the keyword arguments to the instruction builder (str). 40 | key_column: The column name for the unique identifier (str). 41 | response_column: The column name for the response (str). 42 | """ 43 | 44 | SCORE_COLUMN = "ifeval_score" 45 | 46 | def __init__( 47 | self, 48 | instruction_ids_column: str = "instruction_id_list", 49 | prompt_column: str = "prompt", 50 | keyword_args_column: str = "kwargs", 51 | key_column: str = "key", 52 | response_column: str = "response", 53 | ): 54 | self.instruction_ids_column = instruction_ids_column 55 | self.prompt_column = prompt_column; self.keyword_args_column = keyword_args_column; self.key_column = key_column; self.response_column = response_column 56 | 57 | def score(self, row: dict) -> Dict[str, Any]: 58 | return {self.SCORE_COLUMN: process_results(row, row[self.response_column])} 59 | 60 | @property 61 | def expected_keys(self) -> List[str]: 62 | return [ 63 | self.instruction_ids_column, 64 | self.prompt_column, 65 | self.keyword_args_column, 66 | self.key_column, 67 | self.response_column, 68 | ] 69 | -------------------------------------------------------------------------------- /evals/scoring/ifeval/instructions_main.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from typing import Dict, Optional, Union 3 | 4 | from .
import instructions_registry 5 | 6 | 7 | @dataclasses.dataclass 8 | class InputExample: 9 | key: int 10 | instruction_id_list: list[str] 11 | prompt: str 12 | kwargs: list[Dict[str, Optional[Union[str, int]]]] 13 | 14 | 15 | @dataclasses.dataclass 16 | class OutputExample: 17 | instruction_id_list: list[str] 18 | prompt: str 19 | response: str 20 | follow_all_instructions: bool 21 | follow_instruction_list: list[bool] 22 | 23 | 24 | def test_instruction_following_strict( 25 | inp, 26 | response, 27 | ): 28 | """Tests response to see if instructions are followed.""" 29 | instruction_list = inp.instruction_id_list 30 | is_following_list = [] 31 | 32 | for index, instruction_id in enumerate(instruction_list): 33 | instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] 34 | instruction = instruction_cls(instruction_id) 35 | 36 | # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. 37 | kwargs = {k: v for k, v in inp.kwargs[index].items() if v} 38 | instruction.build_description(**kwargs) 39 | args = instruction.get_instruction_args() 40 | if args and "prompt" in args: 41 | instruction.build_description(prompt=inp.prompt) 42 | 43 | if response.strip() and instruction.check_following(response): 44 | is_following_list.append(True) 45 | else: 46 | is_following_list.append(False) 47 | 48 | return OutputExample( 49 | instruction_id_list=inp.instruction_id_list, 50 | prompt=inp.prompt, 51 | response=response, 52 | follow_all_instructions=all(is_following_list), 53 | follow_instruction_list=is_following_list, 54 | ) 55 | 56 | 57 | def test_instruction_following_loose( 58 | inp, 59 | response, 60 | ): 61 | """Tests response for an upper bound for following instructions.""" 62 | r = response.split("\n") 63 | response_remove_first = "\n".join(r[1:]).strip() 64 | response_remove_last = "\n".join(r[:-1]).strip() 65 | response_remove_both = "\n".join(r[1:-1]).strip() 66 | revised_response = response.replace("*", "") 67 | revised_response_remove_first = response_remove_first.replace("*", "") 68 | revised_response_remove_last = response_remove_last.replace("*", "") 69 | revised_response_remove_both = response_remove_both.replace("*", "") 70 | all_responses = [ 71 | response, 72 | revised_response, 73 | response_remove_first, 74 | response_remove_last, 75 | response_remove_both, 76 | revised_response_remove_first, 77 | revised_response_remove_last, 78 | revised_response_remove_both, 79 | ] 80 | instruction_list = inp.instruction_id_list 81 | is_following_list = [] 82 | 83 | for index, instruction_id in enumerate(instruction_list): 84 | instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] 85 | instruction = instruction_cls(instruction_id) 86 | 87 | # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. 
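# For example, a hypothetical {"num_words": 120, "relation": None} becomes {"num_words": 120}.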
88 | kwargs = {k: v for k, v in inp.kwargs[index].items() if v} 89 | instruction.build_description(**kwargs) 90 | args = instruction.get_instruction_args() 91 | if args and "prompt" in args: 92 | instruction.build_description(prompt=inp.prompt) 93 | 94 | is_following = False 95 | for r in all_responses: 96 | if r.strip() and instruction.check_following(r): 97 | is_following = True 98 | break 99 | 100 | is_following_list.append(is_following) 101 | 102 | return OutputExample( 103 | instruction_id_list=inp.instruction_id_list, 104 | prompt=inp.prompt, 105 | response=response, 106 | follow_all_instructions=all(is_following_list), 107 | follow_instruction_list=is_following_list, 108 | ) 109 | 110 | 111 | def agg_inst_level_acc(items): 112 | flat_items = [item for sublist in items for item in sublist] 113 | inst_level_acc = sum(flat_items) / len(flat_items) 114 | return inst_level_acc 115 | -------------------------------------------------------------------------------- /evals/scoring/ifeval/instructions_registry.py: -------------------------------------------------------------------------------- 1 | """ 2 | IFEval scoring functions from Google's source code: https://github.com/google-research/google-research/blob/master/instruction_following_eval/instruction_following_eval.py 3 | """ 4 | 5 | # coding=utf-8 6 | # Copyright 2024 The Google Research Authors. 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 19 | 20 | from . 
import instructions 21 | 22 | _KEYWORD = "keywords:" 23 | 24 | _LANGUAGE = "language:" 25 | 26 | _LENGTH = "length_constraints:" 27 | 28 | _CONTENT = "detectable_content:" 29 | 30 | _FORMAT = "detectable_format:" 31 | 32 | _MULTITURN = "multi-turn:" 33 | 34 | _COMBINATION = "combination:" 35 | 36 | _STARTEND = "startend:" 37 | 38 | _CHANGE_CASES = "change_case:" 39 | 40 | _PUNCTUATION = "punctuation:" 41 | 42 | INSTRUCTION_DICT = { 43 | _KEYWORD + "existence": instructions.KeywordChecker, 44 | _KEYWORD + "frequency": instructions.KeywordFrequencyChecker, 45 | # TODO(jeffreyzhou): make a proper set of sentences to choose from 46 | # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, 47 | _KEYWORD + "forbidden_words": instructions.ForbiddenWords, 48 | _KEYWORD + "letter_frequency": instructions.LetterFrequencyChecker, 49 | _LANGUAGE + "response_language": instructions.ResponseLanguageChecker, 50 | _LENGTH + "number_sentences": instructions.NumberOfSentences, 51 | _LENGTH + "number_paragraphs": instructions.ParagraphChecker, 52 | _LENGTH + "number_words": instructions.NumberOfWords, 53 | _LENGTH + "nth_paragraph_first_word": instructions.ParagraphFirstWordCheck, 54 | _CONTENT + "number_placeholders": instructions.PlaceholderChecker, 55 | _CONTENT + "postscript": instructions.PostscriptChecker, 56 | _FORMAT + "number_bullet_lists": instructions.BulletListChecker, 57 | # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace 58 | # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, 59 | _FORMAT + "constrained_response": instructions.ConstrainedResponseChecker, 60 | _FORMAT + "number_highlighted_sections": (instructions.HighlightSectionChecker), 61 | _FORMAT + "multiple_sections": instructions.SectionChecker, 62 | # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. 63 | # _FORMAT + "rephrase": instructions.RephraseChecker, 64 | _FORMAT + "json_format": instructions.JsonFormat, 65 | _FORMAT + "title": instructions.TitleChecker, 66 | # TODO(tianjianlu): Re-enable with specific prompts. 
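    # Example lookup (illustrative): INSTRUCTION_DICT["detectable_format:json_format"] resolves to
    # instructions.JsonFormat; instructions_main.py then instantiates it and calls
    # build_description(...) / check_following(response) on that instance.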
67 | # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, 68 | _COMBINATION + "two_responses": instructions.TwoResponsesChecker, 69 | _COMBINATION + "repeat_prompt": instructions.RepeatPromptThenAnswer, 70 | _STARTEND + "end_checker": instructions.EndChecker, 71 | _CHANGE_CASES + "capital_word_frequency": instructions.CapitalWordFrequencyChecker, 72 | _CHANGE_CASES + "english_capital": instructions.CapitalLettersEnglishChecker, 73 | _CHANGE_CASES + "english_lowercase": instructions.LowercaseLettersEnglishChecker, 74 | _PUNCTUATION + "no_comma": instructions.CommaChecker, 75 | _STARTEND + "quotation": instructions.QuotationChecker, 76 | } 77 | -------------------------------------------------------------------------------- /evals/scoring/livecodebench/__init__.py: -------------------------------------------------------------------------------- 1 | from .livecodebench_scorer import LiveCodeBenchScorer 2 | 3 | __all__ = ["LiveCodeBenchScorer"] 4 | -------------------------------------------------------------------------------- /evals/scoring/livecodebench/livecodebench_scorer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import copy 3 | from typing import Any, AsyncIterator, Dict, List, Literal, Tuple 4 | 5 | from ...util.common import has_code 6 | 7 | from ..base import BatchScorer, Scorer 8 | from .livecodebench_util import ( 9 | _ray_wrapper, 10 | has_test_type, 11 | post_process_code, 12 | unsafe_lcb_runTests_mp, 13 | unsafe_lcb_runTests_ray, 14 | ) 15 | 16 | 17 | class LiveCodeBenchScorer(Scorer): 18 | """Scorer for LiveCodeBench 19 | 20 | For the LiveCodeBench dataset format, see https://huggingface.co/datasets/livecodebench/code_generation_lite 21 | 22 | Args: 23 | question_content_column: The column name for the question (str). 24 | private_test_cases_column: The column name for the private test cases (str). 25 | public_test_cases_column: The column name for the public test cases (str). 26 | starter_code_column: The column name for the starter code (str). 27 | difficulty_column: The column name for the difficulty level (str). 28 | question_id_column: The column name for the question id (str). 29 | response_column: The column name for the response (str). 30 | backend: The backend to use for scoring. Supports "ray" or "mp" (str). 
31 | """ 32 | 33 | TIMEOUT = 6 34 | SCORE_COLUMN = "livecodebench_score" 35 | 36 | def __init__( 37 | self, 38 | question_content_column: str = "question_content", 39 | private_test_cases_column: str = "private_test_cases", 40 | public_test_cases_column: str = "public_test_cases", 41 | starter_code_column: str = "starter_code", 42 | difficulty_column: str = "difficulty", 43 | question_id_column: str = "question_id", 44 | response_column: str = "response", 45 | backend: Literal["ray", "mp"] = "ray", 46 | ): 47 | 48 | self.question_content_column = question_content_column 49 | self.private_test_cases_column = private_test_cases_column 50 | self.public_test_cases_column = public_test_cases_column 51 | self.starter_code_column = starter_code_column 52 | self.difficulty_column = difficulty_column 53 | self.question_id_column = question_id_column 54 | self.response_column = response_column 55 | self.backend = backend 56 | 57 | def score(self, row: dict) -> Dict[str, Any]: 58 | row = self.map_to_example(row) 59 | 60 | code_filter_result = has_code(row[self.response_column]) 61 | last_code = None 62 | if len(code_filter_result) == 0: 63 | return {self.SCORE_COLUMN: False} 64 | else: 65 | last_code = code_filter_result[-1] 66 | problem_to_check = copy.deepcopy(row) 67 | 68 | if self.backend == "ray": 69 | result_list = unsafe_lcb_runTests_ray( 70 | problem_to_check, 71 | post_process_code(last_code), 72 | self.TIMEOUT, 73 | runtime_debug=False, 74 | is_extracted=not row["is_stdin"], 75 | ) 76 | else: 77 | result_list = unsafe_lcb_runTests_mp( 78 | problem_to_check, 79 | post_process_code(last_code), 80 | self.TIMEOUT, 81 | runtime_debug=False, 82 | is_extracted=not row["is_stdin"], 83 | ) 84 | details = [r[0] for r in result_list] 85 | all_passed = all(details) 86 | 87 | result = "" 88 | if result_list and all_passed: 89 | result = "passed" 90 | 91 | return {self.SCORE_COLUMN: result == "passed"} 92 | 93 | @property 94 | def expected_keys(self) -> List[str]: 95 | return [ 96 | self.question_content_column, 97 | self.private_test_cases_column, 98 | self.public_test_cases_column, 99 | self.difficulty_column, 100 | self.question_id_column, 101 | self.starter_code_column, 102 | self.response_column, 103 | ] 104 | 105 | def map_to_example(self, row): 106 | return { 107 | "prompt": row[self.question_content_column], 108 | "test": row[self.private_test_cases_column], 109 | "entry_point": row[self.starter_code_column], 110 | "canonical_solution": "", # seems like live code bench lite does not have this field 111 | "task_id": row[self.question_id_column], 112 | "is_stdin": has_test_type(row[self.public_test_cases_column], "stdin"), 113 | "public_test_cases": row[self.public_test_cases_column], 114 | "difficulty": row[self.difficulty_column], 115 | self.response_column: row[self.response_column], 116 | } 117 | 118 | 119 | class LiveCodeBenchBatchScorer(BatchScorer): 120 | """Batch scorer for LiveCodeBench 121 | 122 | For the LiveCodeBench dataset format, see https://huggingface.co/datasets/livecodebench/code_generation_lite 123 | 124 | Args: 125 | question_content_column: The column name for the question (str). 126 | private_test_cases_column: The column name for the private test cases (str). 127 | public_test_cases_column: The column name for the public test cases (str). 128 | starter_code_column: The column name for the starter code (str). 129 | difficulty_column: The column name for the difficulty level (str). 130 | question_id_column: The column name for the question id (str). 
131 | response_column: The column name for the response (str). 132 | """ 133 | 134 | TIMEOUT = 6 135 | SCORE_COLUMN = "livecodebench_score" 136 | 137 | def __init__( 138 | self, 139 | question_content_column: str = "question_content", 140 | private_test_cases_column: str = "private_test_cases", 141 | public_test_cases_column: str = "public_test_cases", 142 | starter_code_column: str = "starter_code", 143 | difficulty_column: str = "difficulty", 144 | question_id_column: str = "question_id", 145 | response_column: str = "response", 146 | ): 147 | self.question_content_column = question_content_column 148 | self.private_test_cases_column = private_test_cases_column 149 | self.public_test_cases_column = public_test_cases_column 150 | self.starter_code_column = starter_code_column 151 | self.difficulty_column = difficulty_column 152 | self.question_id_column = question_id_column 153 | self.response_column = response_column 154 | 155 | async def score(self, rows: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]]: 156 | 157 | inputs = [] 158 | ids = [] 159 | for row in rows: 160 | row = self.map_to_example(row) 161 | code_filter_result = has_code(row[self.response_column]) 162 | last_code = None 163 | if len(code_filter_result) == 0: 164 | yield { 165 | self.INTERNAL_IDX_KEY: row[self.INTERNAL_IDX_KEY], 166 | self.SCORE_COLUMN: False, 167 | } 168 | else: 169 | last_code = code_filter_result[-1] 170 | problem_to_check = copy.deepcopy(row) 171 | 172 | inputs.append( 173 | { 174 | "problem": problem_to_check, 175 | "completion": post_process_code(last_code), 176 | "timeout": self.TIMEOUT, 177 | "runtime_debug": False, 178 | "is_extracted": row["is_stdin"], 179 | } 180 | ) 181 | ids.append(row[self.INTERNAL_IDX_KEY]) 182 | 183 | async for output in _unsafe_lcb_runTests_ray_batch(ids, inputs): 184 | idx, result_list = output 185 | details = [r[0] for r in result_list] 186 | all_passed = all(details) 187 | 188 | result = "" 189 | if result_list and all_passed: 190 | result = "passed" 191 | 192 | yield { 193 | self.INTERNAL_IDX_KEY: idx, 194 | self.SCORE_COLUMN: result == "passed", 195 | } 196 | 197 | def map_to_example(self, row): 198 | return { 199 | "prompt": row[self.question_content_column], 200 | "test": row[self.private_test_cases_column], 201 | "entry_point": row[self.starter_code_column], 202 | "canonical_solution": "", # seems like live code bench lite does not have this field 203 | "task_id": row[self.question_id_column], 204 | "is_stdin": has_test_type(row[self.public_test_cases_column], "stdin"), 205 | "public_test_cases": row[self.public_test_cases_column], 206 | "difficulty": row[self.difficulty_column], 207 | self.response_column: row[self.response_column], 208 | self.INTERNAL_IDX_KEY: row[self.INTERNAL_IDX_KEY], 209 | } 210 | 211 | 212 | async def _unsafe_lcb_runTests_ray_batch( 213 | ids, inputs 214 | ) -> AsyncIterator[Tuple[int, List[Tuple[bool, str, str, float]]]]: 215 | refs = [] 216 | for idx, _input in zip(ids, inputs): 217 | problem = _input["problem"] 218 | completion = _input["completion"] 219 | timeout = _input["timeout"] 220 | runtime_debug = _input["runtime_debug"] 221 | is_extracted = _input["is_extracted"] 222 | test_cases = problem["test"] 223 | 224 | result_ref = _ray_wrapper.remote( 225 | test_cases, completion, timeout, runtime_debug, is_extracted, idx 226 | ) 227 | refs.append(result_ref) 228 | 229 | futs = [asyncio.wrap_future(ref.future()) for ref in refs] 230 | for fut in asyncio.as_completed(futs): 231 | idx, result = await fut 232 | _input = 
inputs[ids.index(idx)] 233 | ## Handle the case where not all tests passed within the given timeout 234 | for _i in range(len(_input["problem"]["test"]) - len(result)): 235 | result.append((False, "Time out!.", "Error: Time out!", float("inf"))) 236 | yield idx, result 237 | -------------------------------------------------------------------------------- /evals/scoring/math/__init__.py: -------------------------------------------------------------------------------- 1 | from .math_scorer import MathEqualScorer, MathVerifyScorer 2 | 3 | __all__ = ["MathVerifyScorer", "MathEqualScorer"] 4 | -------------------------------------------------------------------------------- /evals/scoring/math/math_scorer.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List 2 | 3 | from ...util.math_parsing_util import extract_answer, math_equal 4 | 5 | from ..base import Scorer 6 | 7 | try: 8 | from math_verify import parse as mv_parse 9 | from math_verify import verify as mv_verify 10 | except ImportError: 11 | mv_parse = None 12 | mv_verify = None 13 | 14 | 15 | class MathEqualScorer(Scorer): 16 | """Scorer for math based on the `math_equal` function from Qwen Math 17 | 18 | Args: 19 | response_column: The column name for the model generated response. (str) 20 | answer_column: The column name for the ground truth answer. (str) 21 | """ 22 | 23 | SCORE_COLUMN = "math_equal_score" 24 | 25 | def __init__(self, response_column: str, answer_column: str): 26 | self.response_column = response_column 27 | self.answer_column = answer_column 28 | 29 | def score(self, row: dict) -> Dict[str, Any]: 30 | try: 31 | pred = extract_answer(row[self.response_column]) 32 | ref = extract_answer(row[self.answer_column]) 33 | except Exception: 34 | return {self.SCORE_COLUMN: False} 35 | return {self.SCORE_COLUMN: math_equal(pred, ref)} 36 | 37 | @property 38 | def expected_keys(self) -> List[str]: 39 | return [self.response_column, self.answer_column] 40 | 41 | 42 | class MathVerifyScorer(Scorer): 43 | """Scorer for math based on the `math_verify` function from HuggingFace 44 | 45 | Args: 46 | response_column: The column name for the model generated response. (str) 47 | answer_column: The column name for the ground truth answer. (str) 48 | """ 49 | 50 | SCORE_COLUMN = "math_verify_score" 51 | 52 | def __init__(self, response_column: str, answer_column: str): 53 | self.response_column = response_column 54 | self.answer_column = answer_column 55 | if mv_parse is None or mv_verify is None: 56 | raise ImportError( 57 | "`math_verify` is not installed. Please install it with `pip install math_verify`." 
58 | ) 59 | 60 | def score(self, row: dict) -> Dict[str, Any]: 61 | try: 62 | pred = mv_parse(row[self.response_column]) 63 | ref = mv_parse(row[self.answer_column]) 64 | except Exception: 65 | return {self.SCORE_COLUMN: False} 66 | return {self.SCORE_COLUMN: mv_verify(pred, ref)} 67 | 68 | @property 69 | def expected_keys(self) -> List[str]: 70 | return [self.response_column, self.answer_column] 71 | -------------------------------------------------------------------------------- /evals/scoring/taco/__init__.py: -------------------------------------------------------------------------------- 1 | from .taco_scorer import TACOScorer 2 | 3 | __all__ = ["TACOScorer"] 4 | -------------------------------------------------------------------------------- /evals/scoring/taco/taco_scorer.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from multiprocessing import Manager 3 | from typing import Any, Dict, Literal 4 | 5 | import ray 6 | 7 | from ...util.common import has_code 8 | 9 | from ..base import Scorer 10 | from .taco_util import run_test as taco_run_test 11 | 12 | 13 | class TACOScorer(Scorer): 14 | SCORE_COLUMN = "taco_score" 15 | 16 | def __init__( 17 | self, 18 | response_column="response", 19 | input_output_column="input_output", 20 | backend: Literal["ray", "mp"] = "ray", 21 | ) -> None: 22 | super().__init__() 23 | self.response_column = response_column 24 | self.input_output_column = input_output_column 25 | self.backend = backend 26 | if backend not in ["ray", "mp"]: 27 | raise ValueError(f"Unsupported backend for launching tests: {backend}") 28 | 29 | def score(self, row: Dict[str, Any]): 30 | # Extract the model response and its test cases from the row 31 | response = row[self.response_column] 32 | input_outputs = row[self.input_output_column] 33 | 34 | code_filter_result = has_code(response) 35 | if len(code_filter_result) == 0: 36 | return {self.SCORE_COLUMN: False} 37 | else: 38 | last_code = code_filter_result[-1] 39 | if self.backend == "mp": 40 | curr_res, _ = _taco_run_tests_mp(input_outputs, generation=last_code) 41 | else: 42 | curr_res, _ = _taco_run_tests_ray(input_outputs, generation=last_code) 43 | 44 | if curr_res: 45 | return {self.SCORE_COLUMN: True} 46 | else: 47 | return {self.SCORE_COLUMN: False} 48 | 49 | 50 | def _taco_run_tests_mp(input_outputs, generation): 51 | 52 | def _temp_run(input_outputs, generation, debug, result): 53 | try: 54 | result.append(taco_run_test(input_outputs, test=generation, debug=debug)) 55 | except Exception as e: 56 | print(f"Error in _temp_run: {e}") 57 | 58 | # run the test in a separate process for safety 59 | manager = Manager() 60 | result = manager.list() 61 | p = multiprocessing.Process( 62 | target=_temp_run, args=(input_outputs, generation, False, result) 63 | ) 64 | p.start() 65 | p.join() 66 | if p.is_alive(): 67 | p.kill() 68 | # get the first element in ListProxy - this is the result 69 | result = result[0] 70 | return bool(result and all(result)), result 71 | 72 | 73 | # NOTE (sumanthrh): We make sure that scoring for code generation is run on a separate process for isolation 74 | # We need to run scoring for each data sample in a separate process. Since ray doesn't play well with 75 | # multiprocessing, we launch scoring as a standalone ray task. Further, to make sure that resource requests 76 | # don't blow up for batched processing- for example, in a ray data pipeline, we reduce `num_cpus` to 0.001 from the default 77 | # value of 1. 
That way, scoring for different samples can timeshare on the same set of cpus. 78 | @ray.remote(num_cpus=0.001) 79 | def _temp_run_ray(input_outputs, generation, debug): 80 | result = [] 81 | try: 82 | result = taco_run_test(input_outputs, test=generation, debug=debug) 83 | except Exception as e: 84 | print(f"Error in _temp_run: {e}") 85 | return result 86 | 87 | 88 | def _taco_run_tests_ray(input_outputs, generation): 89 | # run the test in a separate process for safety 90 | obj_ref = _temp_run_ray.remote(input_outputs, generation, False) 91 | result = ray.get(obj_ref) 92 | return bool(result and all(result)), result 93 | -------------------------------------------------------------------------------- /evals/scoring/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanomaoli/llm_reproducibility/8a373c5a159a27e59783394827cecadd6255484e/evals/scoring/utils/__init__.py -------------------------------------------------------------------------------- /evals/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from .aime.aime_handler import AIMETaskHandler 4 | from .amc23.amc23_handler import AMC23TaskHandler 5 | from .apps.apps_handler import APPSTaskHandler 6 | from .arc.arc_handler import ARCChallengeTaskHandler 7 | from .base import ConversationType, TaskConfig, TaskHandler 8 | from .gpqa_diamond.gpqa_diamond_handler import GPQADiamondTaskHandler 9 | from .gsm8k.gsm8k_handler import GSM8KTaskHandler 10 | from .liveaops.liveaops_handler import LiveAOPSTaskHandler 11 | from .livecodebench.livecodebench_handler import LiveCodeBenchTaskHandler 12 | from .math.math_handler import MathTaskHandler 13 | from .minervamath.minervamath_handler import MinervaMathTaskHandler 14 | from .mmlu.mmlu_handler import MMLUProTaskHandler, MMLUTaskHandler 15 | from .numina.numina_handler import NUMINATaskHandler 16 | from .olympiadbench.olympiadbench_handler import OlympiadBenchMathTaskHandler 17 | from .omni_math.omni_handler import OMNIMathTaskHandler 18 | from .taco.taco_handler import TACOTaskHandler 19 | from .task_util import get_tasks 20 | 21 | TASK_HANDLER_MAP = { 22 | "numina": NUMINATaskHandler, 23 | "apps": APPSTaskHandler, 24 | "taco": TACOTaskHandler, 25 | "math": MathTaskHandler, 26 | "aime": AIMETaskHandler, 27 | "gpqa_diamond": GPQADiamondTaskHandler, 28 | "mmlu": MMLUTaskHandler, 29 | "mmlu_pro": MMLUProTaskHandler, 30 | "livecodebench": LiveCodeBenchTaskHandler, 31 | "gsm8k": GSM8KTaskHandler, 32 | "arc_c": ARCChallengeTaskHandler, 33 | "amc23": AMC23TaskHandler, 34 | "minervamath": MinervaMathTaskHandler, 35 | "olympiadbench_math": OlympiadBenchMathTaskHandler, 36 | "omni_math": OMNIMathTaskHandler, 37 | "liveaops": LiveAOPSTaskHandler, 38 | } 39 | TASK_NAMES_TO_YAML = get_tasks(os.path.dirname(__file__)) 40 | 41 | __all__ = [ 42 | "AIMETaskHandler", 43 | "APPSTaskHandler", 44 | "TACOTaskHandler", 45 | "MathTaskHandler", 46 | "AMC23TaskHandler", 47 | "NUMINATaskHandler", 48 | "GPQADiamondTaskHandler", 49 | "MMLUTaskHandler", 50 | "MMLUProTaskHandler", 51 | "LiveCodeBenchTaskHandler", 52 | "GSM8KTaskHandler", 53 | "ARCChallengeTaskHandler", 54 | "TaskHandler", 55 | "MathTaskHandler", 56 | "OlympiadBenchMathTaskHandler", 57 | "MinervaMathTaskHandler", 58 | "TaskConfig", 59 | "TASK_HANDLER_MAP", 60 | "TASK_NAMES_TO_YAML", 61 | "ConversationType", 62 | ] 63 | -------------------------------------------------------------------------------- 
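As a rough sketch of how the task registry above is typically consumed (the repository's actual wiring lives in `cli.py` and the batch pipeline, which are not shown here), a handler can be built from its YAML config by looking up the config's `handler` field in `TASK_HANDLER_MAP`. The import path `evals.tasks`, the `"aime24"` key, and the assumption that `TASK_NAMES_TO_YAML` maps task names to YAML paths are illustrative guesses, not confirmed usage:

```python
# Hypothetical usage sketch -- assumes the package imports as `evals` and that
# TASK_NAMES_TO_YAML maps task names (e.g. "aime24") to their YAML config paths.
from evals.tasks import TASK_HANDLER_MAP, TASK_NAMES_TO_YAML, TaskConfig

config = TaskConfig.from_yaml(TASK_NAMES_TO_YAML["aime24"])  # aime24.yaml is shown below
handler_cls = TASK_HANDLER_MAP[config.handler]               # "aime" -> AIMETaskHandler
handler = handler_cls(config)

# Load a few rows and turn them into chat-style conversations for inference.
df = handler.load_and_filter_dataset(start=0, end=4)
conversations = handler.make_conversations(
    [row.to_dict() for _, row in df.iterrows()],
    system_prompt="You are a helpful assistant.",  # placeholder system prompt
)
```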
/evals/tasks/aime/aime24.yaml: -------------------------------------------------------------------------------- 1 | handler: aime 2 | dataset_path: AI-MO/aimo-validation-aime 3 | dataset_split: train 4 | question_key: problem 5 | answer_key: answer 6 | templating_parameters: 7 | template: "Return your final response within \\boxed{{}}. {prompt}" 8 | preprocess_config: 9 | url: "2024" 10 | -------------------------------------------------------------------------------- /evals/tasks/aime/aime24_sky.yaml: -------------------------------------------------------------------------------- 1 | handler: aime 2 | dataset_path: AI-MO/aimo-validation-aime 3 | dataset_split: train 4 | question_key: problem 5 | answer_key: answer 6 | templating_parameters: 7 | template: "{prompt}\nReturn your final response within \\boxed{{}}" 8 | preprocess_config: 9 | url: "2024" -------------------------------------------------------------------------------- /evals/tasks/aime/aime25_1.yaml: -------------------------------------------------------------------------------- 1 | handler: aime 2 | dataset_path: opencompass/AIME2025 3 | dataset_subset: AIME2025-I 4 | dataset_split: test 5 | question_key: question 6 | answer_key: answer 7 | templating_parameters: 8 | template: "{prompt}\nReturn your final response within \\boxed{{}}" 9 | 10 | -------------------------------------------------------------------------------- /evals/tasks/aime/aime25_2.yaml: -------------------------------------------------------------------------------- 1 | handler: aime 2 | dataset_path: opencompass/AIME2025 3 | dataset_subset: AIME2025-II 4 | dataset_split: test 5 | question_key: question 6 | answer_key: answer 7 | templating_parameters: 8 | template: "{prompt}\nReturn your final response within \\boxed{{}}" 9 | 10 | -------------------------------------------------------------------------------- /evals/tasks/aime/aime_handler.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from ..math.math_handler import MathTaskHandler 4 | 5 | 6 | class AIMETaskHandler(MathTaskHandler): 7 | def generate_prompt(self, problem: Dict): 8 | return self.task_config.templating_parameters["template"].format( 9 | prompt=problem[self.question_key] 10 | ) 11 | 12 | def load_and_filter_dataset( 13 | self, start, end, split=None, subset=None, difficulty=None 14 | ): 15 | train_data = self.load_dataset(subset=subset, split=split).to_pandas() 16 | if self.task_config.preprocess_config: 17 | if "url" in self.task_config.preprocess_config: 18 | train_data = train_data[ 19 | train_data["url"].str.contains( 20 | self.task_config.preprocess_config["url"], na=False 21 | ) 22 | ] 23 | return train_data.iloc[start:end] if end > 0 else train_data.iloc[start:] 24 | -------------------------------------------------------------------------------- /evals/tasks/amc23/amc23.yaml: -------------------------------------------------------------------------------- 1 | handler: amc23 2 | dataset_path: AI-MO/aimo-validation-amc 3 | dataset_kwargs: 4 | trust_remote_code: true 5 | dataset_split: train 6 | question_key: problem 7 | answer_key: answer 8 | # Optionally, you can filter the dataset by difficulty 9 | # preprocess_config: 10 | # difficulty: easy 11 | templating_parameters: 12 | template: "Return your final response within \\boxed{{}}. 
{problem}" 13 | -------------------------------------------------------------------------------- /evals/tasks/amc23/amc23_handler.py: -------------------------------------------------------------------------------- 1 | from ..math.math_handler import MathTaskHandler 2 | 3 | 4 | class AMC23TaskHandler(MathTaskHandler): 5 | def load_and_filter_dataset( 6 | self, start, end, split=None, subset=None, difficulty=None 7 | ): 8 | train_data = self.load_dataset(subset=subset, split=split).to_pandas() 9 | filtered_data = train_data[train_data["url"].str.contains("2023", na=False)] 10 | return filtered_data.iloc[start:end] if end > 0 else filtered_data.iloc[start:] 11 | -------------------------------------------------------------------------------- /evals/tasks/apps/apps.yaml: -------------------------------------------------------------------------------- 1 | handler: apps 2 | dataset_path: codeparrot/apps 3 | dataset_subset: all 4 | dataset_kwargs: 5 | trust_remote_code: true 6 | dataset_split: test 7 | question_key: question 8 | answer_key: null 9 | # preprocess_config: 10 | # difficulty: null 11 | templating_parameters: 12 | with_fn_name_template: "Generate an executable Python function generated from the given prompt. The function should take stdin as input and print the output. Simply call the function after the definition. {prompt}" 13 | without_fn_name_template: "Generate an executable Python function generated from the given prompt. Return the function body without invoking it at the final solution. {prompt}" 14 | # Add starter code on top of the initial template 15 | with_starter_code_template: "{input}\n{starter_code}" 16 | # Optionally, you can filter the dataset by difficulty 17 | # preprocess_config: 18 | # difficulty: easy 19 | -------------------------------------------------------------------------------- /evals/tasks/apps/apps_handler.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import multiprocessing 4 | from multiprocessing import Manager 5 | 6 | import numpy as np 7 | 8 | from ...util.common import has_code 9 | 10 | from .apps_util import run_test as apps_run_test 11 | from ..base import TaskHandler 12 | 13 | 14 | class APPSTaskHandler(TaskHandler): 15 | 16 | def generate_prompt(self, problem): 17 | # test_case, prompt, starter_code=None 18 | test_case = json.loads(problem["input_output"]) 19 | starter_code = problem["starter_code"] 20 | prompt = problem["question"] 21 | if not test_case.get("fn_name"): 22 | _input = self.task_config.templating_parameters[ 23 | "with_fn_name_template" 24 | ].format(prompt=prompt) 25 | else: 26 | _input = self.task_config.templating_parameters[ 27 | "without_fn_name_template" 28 | ].format(prompt=prompt) 29 | 30 | if starter_code is not None: 31 | _input = self.task_config.templating_parameters[ 32 | "with_starter_code_template" 33 | ].format(input=_input, starter_code=starter_code) 34 | return _input 35 | 36 | def check_correctness(self, problem, generation): 37 | TIMEOUT = 10 38 | 39 | def _temp_run(problem, generation, debug, result): 40 | try: 41 | result.append( 42 | apps_run_test(problem=problem, test=generation, debug=debug) 43 | ) 44 | except Exception: 45 | pass 46 | 47 | manager = Manager() 48 | result = manager.list() 49 | p = multiprocessing.Process( 50 | target=_temp_run, args=(problem, generation, False, result) 51 | ) 52 | p.start() 53 | p.join(timeout=TIMEOUT + 1) 54 | if p.is_alive(): 55 | p.kill() 56 | return bool(result and np.all(result[0])) 57 | 58 | 
def update_results(self, problem, response): 59 | # Initialize the response structure 60 | response_entry = { 61 | "content": response, 62 | "correctness": None, 63 | "reason": None, 64 | } 65 | code_filter_result = has_code(response) 66 | if len(code_filter_result) == 0: 67 | response_entry["correctness"] = False 68 | response_entry["reason"] = "Does not contain code component." 69 | else: 70 | last_code = code_filter_result[-1] 71 | problem_to_check = copy.deepcopy(problem) 72 | problem_to_check["input_output"] = json.loads(problem["input_output"]) 73 | try: 74 | problem_to_check["solutions"] = json.loads(problem["solutions"]) 75 | except Exception: 76 | problem_to_check["solutions"] = "" 77 | print("Empty solution from the dataset") 78 | curr_res = self.check_correctness(problem_to_check, generation=last_code) 79 | if curr_res: 80 | response_entry["correctness"] = True 81 | response_entry["reason"] = "" 82 | else: 83 | response_entry["correctness"] = False 84 | response_entry["reason"] = "Code is incorrect." 85 | 86 | return response_entry 87 | 88 | def load_and_filter_dataset( 89 | self, start, end, split=None, subset=None, difficulty=None 90 | ): 91 | train_data = self.load_dataset(subset=subset, split=split) 92 | if difficulty or "difficulty" in self.task_config.preprocess_config: 93 | difficulty = ( 94 | self.task_config.preprocess_config["difficulty"] 95 | if not difficulty 96 | else difficulty 97 | ) 98 | train_data = train_data.filter(lambda x: x["difficulty"] == difficulty) 99 | 100 | train_data = train_data.to_pandas() 101 | 102 | return train_data.iloc[start:end] if end > 0 else train_data.iloc[start:] 103 | -------------------------------------------------------------------------------- /evals/tasks/arc/arc_c.yaml: -------------------------------------------------------------------------------- 1 | handler: arc_c 2 | dataset_path: allenai/ai2_arc 3 | dataset_subset: ARC-Challenge 4 | dataset_split: train 5 | question_key: question 6 | answer_key: answerKey 7 | templating_parameters: 8 | # We combine choices for a question into choices_text entry in the dataset 9 | template: "Given the following question and four candidate answers (A, B, C and D), choose the best answer. 
Your response should end with \"The best answer is [the_answer_letter]\" where [the_answer_letter] is one of the four letter choice (A, B, C, or D).\n{question}\n{choices_text}" -------------------------------------------------------------------------------- /evals/tasks/arc/arc_handler.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Any, Dict 3 | 4 | from ...util.math_parsing_util import extract_answer 5 | 6 | from ..base import TaskConfig, TaskHandler 7 | 8 | 9 | class ARCChallengeTaskHandler(TaskHandler): 10 | def __init__(self, task_config: TaskConfig) -> None: 11 | super().__init__(task_config) 12 | self.ans_re = re.compile(r"[Tt]he best answer is ([A-D])[\.\,]*", re.IGNORECASE) 13 | self.letter_re = re.compile(r"([A-D])[\.\,]*") 14 | self.canonical_options = ["A", "B", "C", "D"] 15 | self.invalid_ans = "[invalid]" 16 | 17 | def generate_prompt(self, problem): 18 | choices = problem["choices"] 19 | choices_text = "\n".join( 20 | [ 21 | f"{label}.{choice}" 22 | for label, choice in zip(self.canonical_options, choices["text"]) 23 | ] 24 | ) 25 | problem["choices_text"] = choices_text 26 | full_prompt = self.task_config.templating_parameters["template"].format( 27 | **problem 28 | ) 29 | return full_prompt 30 | 31 | def check_correctness(self, problem: Dict[str, Any], generation: str) -> bool: 32 | gt_answer = problem[self.task_config.answer_key] 33 | if gt_answer not in self.canonical_options: 34 | gt_answer = self.canonical_options[ 35 | int(problem[self.task_config.answer_key]) - 1 36 | ] 37 | model_answer = self.get_answer(generation) 38 | return model_answer == gt_answer 39 | 40 | def update_results(self, problem, response): 41 | # Initialize the response structure 42 | response_entry = { 43 | "content": response, 44 | "correctness": None, 45 | "reason": None, 46 | } 47 | curr_res = self.check_correctness(problem, generation=response) 48 | if curr_res: 49 | response_entry["correctness"] = True 50 | response_entry["reason"] = "" 51 | else: 52 | response_entry["correctness"] = False 53 | response_entry["reason"] = "Solution is incorrect." 54 | 55 | return response_entry 56 | 57 | def load_and_filter_dataset( 58 | self, start, end, split=None, subset=None, difficulty=None 59 | ): 60 | train_data = self.load_dataset(subset=subset, split=split).to_pandas() 61 | return train_data.iloc[start:end] if end > 0 else train_data.iloc[start:] 62 | 63 | def get_answer(self, completion): 64 | # First, we try to extract similar to MATH answers 65 | answer = extract_answer(completion) 66 | match = None 67 | if answer: 68 | # match for the letter answer needed. 
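        # Illustrative example (hypothetical response): if extract_answer returns "B." for a
        # completion ending in "The best answer is \boxed{B.}", the letter regex below yields "B".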
69 | match = self.letter_re.search(answer) 70 | if match: 71 | return match.group(1).strip() 72 | 73 | if not answer or not match: 74 | # try basic-regex based search 75 | patterns_to_remove = [ 76 | ",", # Remove commas 77 | r"\$", # Remove dollar signs 78 | r"\.$" r"\\", # Remove trailing period # Remove stray backslashes 79 | r"\*", # Remove asterisks 80 | ] 81 | answer = completion 82 | for pattern in patterns_to_remove: 83 | answer = re.sub(pattern, "", answer) 84 | matches = self.ans_re.findall(answer) 85 | if not matches: 86 | return self.invalid_ans 87 | return matches[-1].strip() 88 | -------------------------------------------------------------------------------- /evals/tasks/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, List, Optional 3 | from urllib.parse import urlparse 4 | 5 | import pandas as pd 6 | import yaml 7 | from datasets import Dataset as HFDataset 8 | from datasets import load_dataset 9 | from pydantic import BaseModel, Field 10 | 11 | ConversationType = List[Dict[str, Any]] 12 | 13 | 14 | class TaskConfig(BaseModel): 15 | handler: str 16 | dataset_path: str 17 | dataset_subset: Optional[str] = None 18 | dataset_split: Optional[str] = None 19 | dataset_kwargs: Dict[str, Any] = Field(default_factory=dict) 20 | question_key: str 21 | # Optional answer key for datasets with a single correct answer 22 | answer_key: Optional[str] = None 23 | templating_parameters: Dict[str, str] = Field(default_factory=dict) 24 | # Example fields 25 | # fewshot_config: List[Dict[str, Any]] = Field(default_factory=list) 26 | # num_fewshot: int = 0 27 | 28 | preprocess_config: Dict[str, Any] = Field(default_factory=dict) 29 | 30 | @classmethod 31 | def from_yaml(cls, yaml_file_path) -> "TaskConfig": 32 | with open(yaml_file_path, "r", encoding="utf-8") as f: 33 | config_dict = yaml.safe_load(f) 34 | return cls(**config_dict) 35 | 36 | def update(self, **kwargs): 37 | for key, value in kwargs.items(): 38 | setattr(self, key, value) 39 | 40 | 41 | class TaskHandler(ABC): 42 | 43 | def __init__(self, task_config: TaskConfig): 44 | self.task_config = task_config 45 | 46 | @classmethod 47 | def from_config_path(cls, config_path: str) -> "TaskHandler": 48 | task_config = TaskConfig.from_yaml(config_path) 49 | return cls(task_config) 50 | 51 | @property 52 | def question_key(self): 53 | return self.task_config.question_key 54 | 55 | @abstractmethod 56 | def check_correctness( 57 | self, problem: Dict[str, Any], generation: Dict[str, Any] 58 | ) -> bool: 59 | pass 60 | 61 | @abstractmethod 62 | def update_results(self, problem: Dict[str, Any], response: str) -> Dict[str, Any]: 63 | pass 64 | 65 | def make_conversations( 66 | self, 67 | data: List[Dict[str, Any]], 68 | system_prompt: Optional[str] = None, 69 | user_template: Optional[str] = None, 70 | assistant_prefill: Optional[str] = None, 71 | ) -> List[ConversationType]: 72 | conversations = [] 73 | for _, problem in enumerate(data): 74 | prompt_text = self.generate_prompt(problem) 75 | conversations.append( 76 | make_conversation_from_contents( 77 | [prompt_text], 78 | system_prompt=system_prompt, 79 | user_template=user_template, 80 | assistant_prefill=assistant_prefill, 81 | ) 82 | ) 83 | return conversations 84 | 85 | def load_dataset(self, subset=None, split=None, **kwargs) -> HFDataset: 86 | # check if the path provided is a valid URL 87 | parsed = urlparse(self.task_config.dataset_path) 88 | if not parsed.scheme: 89 | # 
HF dataset 90 | dataset = load_dataset( 91 | path=self.task_config.dataset_path, 92 | name=subset if subset else self.task_config.dataset_subset, 93 | split=split if split else self.task_config.dataset_split, 94 | **self.task_config.dataset_kwargs, 95 | ) 96 | else: 97 | # Try to load URL 98 | # Only JSON supported for now 99 | if split is not None or subset is not None: 100 | raise ValueError( 101 | "URL-based dataset does not support loading arguments like `split`, `subset`" 102 | ) 103 | # By default, Huggingface will create a DatasetDict object with "train" split 104 | dataset = load_dataset("json", data_files=[self.task_config.dataset_path])[ 105 | "train" 106 | ] 107 | 108 | # add an index column efficiently with map 109 | dataset = dataset.map(add_idx_map, with_indices=True) 110 | return dataset 111 | 112 | @abstractmethod 113 | def load_and_filter_dataset( 114 | self, start, end, split=None, subset=None, difficulty=None 115 | ) -> pd.DataFrame: 116 | pass 117 | 118 | def process_remaining_data(self, train_data, id_to_results: dict): 119 | return [ 120 | row.to_dict() 121 | for _, row in train_data.iterrows() 122 | if str(row["_index"]) not in id_to_results 123 | ] 124 | 125 | 126 | def add_idx_map(x: dict, idx: int) -> dict: 127 | # We convert to string for consistency 128 | x["_index"] = str(idx) 129 | return x 130 | 131 | 132 | def make_conversation_from_contents( 133 | contents: List[str], 134 | system_prompt: Optional[str] = None, 135 | user_template: Optional[str] = None, 136 | assistant_prefill: Optional[str] = None, 137 | ) -> ConversationType: 138 | """Makes a conversation given a list of user/assistant message strings. 139 | 140 | If system_prompt is provided, it will be added as the first message. 141 | If user_template is provided, it will be used to format the user messages. This is useful for model-specific formatting. 142 | 143 | Args: 144 | content: A list of user/assistant message strings. 145 | system_prompt: An optional string for the system prompt. 146 | user_template: An optional string for the user template. 147 | 148 | Returns: 149 | A list of dictionaries representing the conversation. 150 | """ 151 | 152 | conversation = [] 153 | if system_prompt: 154 | conversation.append({"role": "system", "content": system_prompt}) 155 | 156 | for i, content in enumerate(contents): 157 | if i % 2 == 0: 158 | content = user_template.format(content) if user_template else content 159 | conversation.append({"role": "user", "content": content}) 160 | else: 161 | conversation.append({"role": "assistant", "content": content}) 162 | 163 | if assistant_prefill and conversation[-1]["role"] == "user": 164 | conversation.append({"role": "assistant", "content": assistant_prefill}) 165 | 166 | return conversation 167 | -------------------------------------------------------------------------------- /evals/tasks/gpqa_diamond/gpqa_diamond.yaml: -------------------------------------------------------------------------------- 1 | handler: gpqa_diamond 2 | dataset_path: Idavidrein/gpqa 3 | dataset_subset: gpqa_diamond 4 | dataset_split: train 5 | question_key: Question 6 | answer_key: Answer 7 | templating_parameters: 8 | # For GPQA, we combine the Question key and the multiple choice answers into a single `prompt` entry 9 | template: "Return your final response within \\boxed{{}} and only include the letter choice (A, B, C, or D) as your final response. 
{prompt}" -------------------------------------------------------------------------------- /evals/tasks/gpqa_diamond/gpqa_diamond_handler.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from ...util.math_parsing_util import get_multiple_choice_answer 4 | 5 | from ..base import TaskHandler 6 | 7 | 8 | class GPQADiamondTaskHandler(TaskHandler): 9 | 10 | def generate_prompt(self, problem): 11 | multiple_choice_string, correct_answer_letter = ( 12 | self.get_multiple_choice_answers(problem) 13 | ) 14 | problem["Answer"] = correct_answer_letter 15 | problem["prompt"] = problem["Question"] + "\n" + multiple_choice_string 16 | return self.task_config.templating_parameters["template"].format( 17 | prompt=problem["prompt"] 18 | ) 19 | 20 | def update_results(self, problem, response): 21 | # Initialize the response structure 22 | response_entry = { 23 | "content": response, 24 | "correctness": None, 25 | "reason": None, 26 | } 27 | curr_res = self.check_correctness(problem, generation=response) 28 | if curr_res: 29 | response_entry["correctness"] = True 30 | response_entry["reason"] = "" 31 | else: 32 | response_entry["correctness"] = False 33 | response_entry["reason"] = "Solution is incorrect." 34 | 35 | return response_entry 36 | 37 | def check_correctness(self, problem, generation): 38 | pred = get_multiple_choice_answer(generation) 39 | answer = problem[self.task_config.answer_key] 40 | return answer == pred 41 | 42 | def get_multiple_choice_answers(self, data): 43 | answers = [ 44 | data["Correct Answer"], 45 | data["Incorrect Answer 1"], 46 | data["Incorrect Answer 2"], 47 | data["Incorrect Answer 3"], 48 | ] 49 | random.shuffle(answers) 50 | 51 | # Map options to letters 52 | options = ["A", "B", "C", "D"] 53 | options_to_answers = { 54 | letter: answer for letter, answer in zip(options, answers) 55 | } 56 | 57 | # Format the options into the string 58 | multiple_choice_string = ", ".join( 59 | f"{letter}) {options_to_answers[letter]}" for letter in options 60 | ) 61 | 62 | # Save the letter corresponding to the correct answer 63 | correct_answer_letter = next( 64 | letter 65 | for letter, answer in options_to_answers.items() 66 | if answer == data["Correct Answer"] 67 | ) 68 | 69 | return multiple_choice_string, correct_answer_letter 70 | 71 | def load_and_filter_dataset( 72 | self, start, end, split=None, subset=None, difficulty=None 73 | ): 74 | train_data = self.load_dataset(subset=subset, split=split).to_pandas() 75 | return train_data.iloc[start:end] if end > 0 else train_data.iloc[start:] 76 | -------------------------------------------------------------------------------- /evals/tasks/gsm8k/gsm8k.yaml: -------------------------------------------------------------------------------- 1 | handler: gsm8k 2 | dataset_path: "openai/gsm8k" 3 | dataset_subset: main 4 | dataset_split: test 5 | question_key: question 6 | answer_key: answer 7 | templating_parameters: 8 | template: "Given the following problem, reason and give a final answer to the problem.\nProblem: {question}\nYour response should end with \"The final answer is [answer]\" where [answer] is the response to the problem." 
9 | 10 | -------------------------------------------------------------------------------- /evals/tasks/gsm8k/gsm8k_handler.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Any, Dict 3 | 4 | from ...util.math_parsing_util import extract_answer 5 | 6 | from ..base import TaskConfig, TaskHandler 7 | 8 | 9 | class GSM8KTaskHandler(TaskHandler): 10 | def __init__(self, task_config: TaskConfig) -> None: 11 | super().__init__(task_config) 12 | self.ans_re = re.compile(r"((-?[$0-9.,]{2,})|(-?[0-9]+))") 13 | self.gt_re = re.compile(r"#### (\-?[0-9\.\,]+)") 14 | self.invalid_ans = "[invalid]" 15 | 16 | def generate_prompt(self, problem): 17 | return self.task_config.templating_parameters["template"].format(**problem) 18 | 19 | def check_correctness(self, problem: Dict[str, Any], generation: str) -> bool: 20 | gt_answer = self.extract_gt_answer(problem[self.task_config.answer_key]) 21 | model_answer = extract_answer(generation) 22 | model_answer = self.sanitize_answer(model_answer) 23 | return model_answer == gt_answer 24 | 25 | def update_results(self, problem, response): 26 | # Initialize the response structure 27 | response_entry = { 28 | "content": response, 29 | "correctness": None, 30 | "reason": None, 31 | } 32 | curr_res = self.check_correctness(problem, generation=response) 33 | if curr_res: 34 | response_entry["correctness"] = True 35 | response_entry["reason"] = "" 36 | else: 37 | response_entry["correctness"] = False 38 | response_entry["reason"] = "Solution is incorrect." 39 | 40 | return response_entry 41 | 42 | def load_and_filter_dataset( 43 | self, start, end, split=None, subset=None, difficulty=None 44 | ): 45 | train_data = self.load_dataset(subset=subset, split=split).to_pandas() 46 | return train_data.iloc[start:end] if end > 0 else train_data.iloc[start:] 47 | 48 | def extract_gt_answer(self, completion): 49 | match = self.gt_re.search(completion) 50 | if match: 51 | match_str = match.group(1).strip() 52 | match_str = match_str.replace(",", "") 53 | return match_str 54 | else: 55 | return self.invalid_ans 56 | 57 | def sanitize_answer(self, answer): 58 | patterns_to_remove = [ 59 | ",", # Remove commas 60 | r"\$", # Remove dollar signs 61 | r"\.$" r"\*", # Remove trailing period # Remove asterisks 62 | ] 63 | for pattern in patterns_to_remove: 64 | answer = re.sub(pattern, "", answer) 65 | 66 | matches = self.ans_re.findall(answer) 67 | if matches: 68 | # get the last match (i.e final response) and the first / outer capturing group 69 | match_str = matches[-1][0].strip() 70 | return match_str 71 | else: 72 | return self.invalid_ans 73 | -------------------------------------------------------------------------------- /evals/tasks/liveaops/liveaops.yaml: -------------------------------------------------------------------------------- 1 | handler: liveaops 2 | dataset_path: https://livemathbench.github.io/data/LiveAoPSBench-2024.jsonl 3 | dataset_subset: null # which subset on huggingface. Not applicable for a URL dataset 4 | dataset_split: null # Rule based evaluation 5 | question_key: question 6 | answer_key: answer 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. 
{question}" 9 | -------------------------------------------------------------------------------- /evals/tasks/liveaops/liveaops_handler.py: -------------------------------------------------------------------------------- 1 | from ...util.math_parsing_util import ( 2 | extract_answer, 3 | math_equal, 4 | strip_answer_string, 5 | ) 6 | 7 | from ..math.math_handler import MathTaskHandler 8 | 9 | 10 | class LiveAOPSTaskHandler(MathTaskHandler): 11 | def generate_prompt(self, problem): 12 | return self.task_config.templating_parameters["template"].format(**problem) 13 | 14 | def check_correctness(self, problem, generation): 15 | # no preprocessing needed 16 | answer = problem[self.task_config.answer_key] 17 | pred = extract_answer(generation) 18 | pred = strip_answer_string(pred) 19 | return math_equal(pred, answer) 20 | 21 | def load_and_filter_dataset( 22 | self, start, end, split=None, subset=None, difficulty=None 23 | ): 24 | assert difficulty is None, "LiveAOPS does not support `difficulty` argument" 25 | dataset = self.load_dataset(subset=subset, split=split).to_pandas() 26 | return dataset.iloc[start:end] if end > 0 else dataset.iloc[start:] 27 | -------------------------------------------------------------------------------- /evals/tasks/livecodebench/livecodebench.yaml: -------------------------------------------------------------------------------- 1 | handler: livecodebench 2 | dataset_path: "livecodebench/code_generation_lite" # repo ID in huggingface 3 | dataset_subset: null 4 | dataset_split: test 5 | dataset_kwargs: 6 | version_tag: release_v2 7 | trust_remote_code: true 8 | question_key: task_id 9 | answer_key: null 10 | templating_parameters: 11 | stdin_template: "Generate an executable Python function generated from the given prompt. The function should take stdin as input and print the output. Simply call the function after the definition. {prompt}" 12 | non_stdin_template: "Generate an executable Python function generated from the given prompt. Return the function body without invoking it at the final solution. {prompt}" 13 | # Optionally, you can filter the dataset by difficulty 14 | # preprocess_config: 15 | # difficulty: easy 16 | -------------------------------------------------------------------------------- /evals/tasks/livecodebench/livecodebench_easy.yaml: -------------------------------------------------------------------------------- 1 | handler: livecodebench 2 | dataset_path: "livecodebench/code_generation_lite" # repo ID in huggingface 3 | dataset_subset: null 4 | dataset_split: test 5 | dataset_kwargs: 6 | version_tag: release_v2 7 | trust_remote_code: true 8 | question_key: task_id 9 | answer_key: null 10 | templating_parameters: 11 | stdin_template: "Generate an executable Python function generated from the given prompt. The function should take stdin as input and print the output. Simply call the function after the definition. {prompt}" 12 | non_stdin_template: "Generate an executable Python function generated from the given prompt. Return the function body without invoking it at the final solution. 
{prompt}" 13 | preprocess_config: 14 | difficulty: easy 15 | -------------------------------------------------------------------------------- /evals/tasks/livecodebench/livecodebench_handler.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Dict 3 | 4 | from datasets import Dataset as HFDataset 5 | 6 | from ...util.common import has_code 7 | 8 | from ..base import TaskHandler 9 | from .livecodebench_util import ( 10 | map_to_example, 11 | post_process_code, 12 | translate_private_test_cases, 13 | unsafe_lcb_runTests, 14 | ) 15 | 16 | 17 | class LiveCodeBenchTaskHandler(TaskHandler): 18 | 19 | def generate_prompt(self, problem): 20 | if problem["is_stdin"]: 21 | return self.task_config.templating_parameters["stdin_template"].format( 22 | **problem 23 | ) 24 | else: 25 | return self.task_config.templating_parameters["non_stdin_template"].format( 26 | **problem 27 | ) 28 | 29 | def check_correctness( 30 | self, 31 | problem: Dict, 32 | completion: str, 33 | timeout: float, 34 | runtime_debug=False, 35 | is_extracted=False, 36 | ) -> Dict: 37 | """ 38 | Evaluates the functional correctness of a completion by running the test 39 | suite provided in the problem. 40 | 41 | :param completion_id: an optional completion ID so we can match 42 | the results later even if execution finishes asynchronously. 43 | """ 44 | result_list = unsafe_lcb_runTests( 45 | problem, completion, timeout, runtime_debug, is_extracted 46 | ) 47 | details = [r[0] for r in result_list] 48 | all_passed = all(details) 49 | 50 | result = "" 51 | if result_list and all_passed: 52 | result = "passed" 53 | 54 | return result == "passed" 55 | 56 | def update_results(self, problem, response): 57 | # Initialize the response structure 58 | response_entry = { 59 | "content": response, 60 | "correctness": None, 61 | "reason": None, 62 | } 63 | code_filter_result = has_code(response) 64 | # print(response) 65 | if len(code_filter_result) == 0: 66 | response_entry["correctness"] = False 67 | response_entry["reason"] = "Does not contain code component." 68 | else: 69 | last_code = code_filter_result[-1] 70 | problem_to_check = copy.deepcopy(problem) 71 | 72 | curr_res = self.check_correctness( 73 | problem=problem_to_check, 74 | completion=post_process_code(last_code), 75 | timeout=6, 76 | is_extracted=not problem_to_check["is_stdin"], 77 | ) 78 | if curr_res: 79 | response_entry["correctness"] = True 80 | response_entry["reason"] = "" 81 | else: 82 | response_entry["correctness"] = False 83 | response_entry["reason"] = "Code is incorrect." 84 | 85 | return response_entry 86 | 87 | def load_and_filter_dataset( 88 | self, start, end, split=None, subset=None, difficulty=None 89 | ): 90 | dataset: HFDataset = self.load_dataset(subset=subset, split=split) 91 | # Filter by CLI or config 92 | if difficulty or "difficulty" in self.task_config.preprocess_config: 93 | difficulty = ( 94 | difficulty 95 | if difficulty 96 | else self.task_config.preprocess_config["difficulty"] 97 | ) 98 | dataset = dataset.filter( 99 | lambda example: example["difficulty"] == difficulty 100 | ) 101 | # We use a lower writer_batch_size to avoid pyarrow issues. JSON entries with LiveCodeBench are large. 102 | # See: https://github.com/NovaSky-AI/SkyThought/pull/45 for details. 
103 | dataset = dataset.map( 104 | lambda example: { 105 | "private_test_cases": translate_private_test_cases( 106 | example["private_test_cases"] 107 | ) 108 | }, 109 | writer_batch_size=100, 110 | ) 111 | # Apply the mapping function 112 | # TODO (sumanthrh): See if the appropriate livecodebench columns can be renamed instead and let other columns pass-through 113 | dataset = dataset.map( 114 | map_to_example, 115 | remove_columns=dataset.column_names.remove("_index"), 116 | writer_batch_size=100, 117 | ).to_pandas() 118 | return dataset.iloc[start:end] if end > 0 else dataset.iloc[start:] 119 | -------------------------------------------------------------------------------- /evals/tasks/livecodebench/livecodebench_hard.yaml: -------------------------------------------------------------------------------- 1 | handler: livecodebench 2 | dataset_path: "livecodebench/code_generation_lite" # repo ID in huggingface 3 | dataset_subset: null 4 | dataset_split: test 5 | dataset_kwargs: 6 | version_tag: release_v2 7 | trust_remote_code: true 8 | question_key: task_id 9 | answer_key: null 10 | templating_parameters: 11 | stdin_template: "Generate an executable Python function generated from the given prompt. The function should take stdin as input and print the output. Simply call the function after the definition. {prompt}" 12 | non_stdin_template: "Generate an executable Python function generated from the given prompt. Return the function body without invoking it at the final solution. {prompt}" 13 | preprocess_config: 14 | difficulty: hard 15 | -------------------------------------------------------------------------------- /evals/tasks/livecodebench/livecodebench_medium.yaml: -------------------------------------------------------------------------------- 1 | handler: livecodebench 2 | dataset_path: "livecodebench/code_generation_lite" # repo ID in huggingface 3 | dataset_subset: null 4 | dataset_split: test 5 | dataset_kwargs: 6 | version_tag: release_v2 7 | trust_remote_code: true 8 | question_key: task_id 9 | answer_key: null 10 | templating_parameters: 11 | stdin_template: "Generate an executable Python function generated from the given prompt. The function should take stdin as input and print the output. Simply call the function after the definition. {prompt}" 12 | non_stdin_template: "Generate an executable Python function generated from the given prompt. Return the function body without invoking it at the final solution. {prompt}" 13 | preprocess_config: 14 | difficulty: medium 15 | -------------------------------------------------------------------------------- /evals/tasks/math/math500.yaml: -------------------------------------------------------------------------------- 1 | handler: math 2 | dataset_path: "HuggingFaceH4/MATH-500" # repo ID in huggingface 3 | dataset_subset: null # which subset on huggingface 4 | question_key: problem 5 | answer_key: answer 6 | dataset_split: test 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. {problem}" 9 | # optional. Not supported yet. 10 | # fewshot_config: 11 | # - question: ... 12 | # - target: ... 
13 | # num_fewshot: 0 14 | -------------------------------------------------------------------------------- /evals/tasks/math/math_handler.py: -------------------------------------------------------------------------------- 1 | from ...util.math_parsing_util import ( 2 | extract_answer, 3 | math_equal, 4 | strip_answer_string, 5 | ) 6 | 7 | from ..base import TaskHandler 8 | 9 | 10 | class MathTaskHandler(TaskHandler): 11 | def generate_prompt(self, problem): 12 | return self.task_config.templating_parameters["template"].format(**problem) 13 | 14 | def check_correctness(self, problem, generation): 15 | answer = strip_answer_string(problem[self.task_config.answer_key]) 16 | pred = extract_answer(generation) 17 | pred = strip_answer_string(pred) 18 | return math_equal(pred, answer) 19 | 20 | def update_results(self, problem, response): 21 | # Initialize the response structure 22 | response_entry = { 23 | "content": response, 24 | "correctness": None, 25 | "reason": None, 26 | } 27 | curr_res = self.check_correctness(problem, generation=response) 28 | if curr_res: 29 | response_entry["correctness"] = True 30 | response_entry["reason"] = "" 31 | else: 32 | response_entry["correctness"] = False 33 | response_entry["reason"] = "Solution is incorrect." 34 | 35 | return response_entry 36 | 37 | def load_and_filter_dataset( 38 | self, start, end, split=None, subset=None, difficulty=None 39 | ): 40 | dataset = self.load_dataset(subset=subset, split=split).to_pandas() 41 | return dataset.iloc[start:end] if end > 0 else dataset.iloc[start:] 42 | -------------------------------------------------------------------------------- /evals/tasks/minervamath/minervamath.yaml: -------------------------------------------------------------------------------- 1 | handler: math 2 | dataset_path: "svc-huggingface/minerva-math" # repo ID in huggingface 3 | dataset_subset: null # which subset on huggingface 4 | question_key: problem 5 | answer_key: solution 6 | dataset_split: test 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. {problem}" -------------------------------------------------------------------------------- /evals/tasks/minervamath/minervamath_handler.py: -------------------------------------------------------------------------------- 1 | from ...util.math_parsing_util import ( 2 | extract_answer, 3 | math_equal, 4 | strip_answer_string, 5 | ) 6 | 7 | from ..math.math_handler import MathTaskHandler 8 | 9 | 10 | class MinervaMathTaskHandler(MathTaskHandler): 11 | 12 | def check_correctness(self, problem, generation): 13 | answer = extract_answer(problem[self.task_config.answer_key]) 14 | answer = strip_answer_string(answer) 15 | 16 | pred = extract_answer(generation) 17 | pred = strip_answer_string(pred) 18 | return math_equal(pred, answer) 19 | -------------------------------------------------------------------------------- /evals/tasks/mmlu/mmlu.yaml: -------------------------------------------------------------------------------- 1 | handler: mmlu 2 | dataset_path: cais/mmlu 3 | dataset_subset: all 4 | dataset_split: test 5 | question_key: question 6 | answer_key: answer 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. 
{prompt}" 9 | -------------------------------------------------------------------------------- /evals/tasks/mmlu/mmlu_handler.py: -------------------------------------------------------------------------------- 1 | from ...util.math_parsing_util import ( 2 | get_multiple_choice_answer, 3 | mmlu_pro_extract_answer, 4 | ) 5 | 6 | from ..base import TaskConfig, TaskHandler 7 | 8 | 9 | class MMLUTaskHandler(TaskHandler): 10 | def generate_prompt(self, problem): 11 | multiple_choice_string = self.get_multiple_choice_answers(problem) 12 | prompt = problem["question"] + "\n" + multiple_choice_string 13 | return self.task_config.templating_parameters["template"].format(prompt=prompt) 14 | 15 | def check_correctness(self, problem, generation): 16 | pred = get_multiple_choice_answer(generation) 17 | abcd = "ABCD" 18 | answer = abcd[problem[self.task_config.answer_key]] 19 | return answer == pred 20 | 21 | def update_results(self, problem, response): 22 | # Initialize the response structure 23 | response_entry = { 24 | "content": response, 25 | "correctness": None, 26 | "reason": None, 27 | } 28 | curr_res = self.check_correctness(problem, generation=response) 29 | if curr_res: 30 | response_entry["correctness"] = True 31 | response_entry["reason"] = "" 32 | else: 33 | response_entry["correctness"] = False 34 | response_entry["reason"] = "Solution is incorrect." 35 | return response_entry 36 | 37 | def get_multiple_choice_answers(self, problem): 38 | options = problem["choices"] 39 | options_str = "" 40 | for _, (label, option) in enumerate(zip("ABCD", options)): 41 | options_str += f"({label}) {str(option).strip()} " 42 | options_str = options_str[:-1] # remove the last space 43 | return f"Answer Choices: {options_str}" 44 | 45 | def load_and_filter_dataset( 46 | self, start, end, split=None, subset=None, difficulty=None 47 | ): 48 | dataset = self.load_dataset(subset=subset, split=split).to_pandas() 49 | return dataset.iloc[start:end] if end > 0 else dataset.iloc[start:] 50 | 51 | 52 | class MMLUProTaskHandler(MMLUTaskHandler): 53 | def __init__(self, task_config: TaskConfig): 54 | super().__init__(task_config) 55 | self.choices = [ 56 | "A", 57 | "B", 58 | "C", 59 | "D", 60 | "E", 61 | "F", 62 | "G", 63 | "H", 64 | "I", 65 | "J", 66 | "K", 67 | "L", 68 | "M", 69 | "N", 70 | "O", 71 | "P", 72 | ] 73 | 74 | def generate_prompt(self, prompt): 75 | return self.task_config.templating_parameters["template"].format(prompt=prompt) 76 | 77 | def check_correctness(self, problem, generation): 78 | pred = mmlu_pro_extract_answer(generation) 79 | answer = self.choices[problem["answer_index"]] 80 | return answer == pred 81 | 82 | def get_multiple_choice_answers(self, problem): 83 | options = problem["options"] 84 | for i, (label, option) in enumerate(zip(self.choices[: len(options)], options)): 85 | options[i] = f"({label}) {str(option).strip()}" 86 | options = " ".join(options) 87 | return f"Answer Choices: {options}" 88 | 89 | def load_and_filter_dataset( 90 | self, start, end, split=None, subset=None, difficulty=None 91 | ): 92 | dataset = self.load_dataset(subset=subset, split=split).to_pandas() 93 | return dataset.iloc[start:end] if end > 0 else dataset.iloc[start:] 94 | -------------------------------------------------------------------------------- /evals/tasks/mmlu/mmlu_pro.yaml: -------------------------------------------------------------------------------- 1 | handler: mmlu_pro 2 | dataset_path: TIGER-Lab/MMLU-Pro 3 | dataset_subset: default 4 | dataset_split: test 5 | question_key: question 6 | 
answer_key: answer 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. {prompt}" 9 | -------------------------------------------------------------------------------- /evals/tasks/numina/numina.yaml: -------------------------------------------------------------------------------- 1 | handler: numina 2 | dataset_path: "AI-MO/NuminaMath-CoT" 3 | dataset_subset: null 4 | dataset_split: train 5 | question_key: problem 6 | answer_key: solution 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. {prompt}" 9 | # Optionally, you can filter the dataset by difficulty 10 | # preprocess_config: 11 | # filter_difficulty: true 12 | # math_difficulty_lower_bound: 4 13 | # math_difficulty_upper_bound: 9 14 | # source: math 15 | -------------------------------------------------------------------------------- /evals/tasks/numina/numina_amc_aime.yaml: -------------------------------------------------------------------------------- 1 | handler: numina 2 | dataset_path: "AI-MO/NuminaMath-CoT" 3 | dataset_subset: null 4 | dataset_split: train 5 | question_key: problem 6 | answer_key: solution 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. {prompt}" 9 | preprocess_config: 10 | filter_difficulty: true 11 | math_difficulty_lower_bound: 1 12 | math_difficulty_upper_bound: 9 13 | source: amc_aime 14 | -------------------------------------------------------------------------------- /evals/tasks/numina/numina_handler.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | from datasets import load_dataset 4 | 5 | from ...util.common import TimeoutException, timeout 6 | from ...util.math_parsing_util import ( 7 | extract_answer, 8 | math_equal, 9 | ) 10 | 11 | from ..base import TaskHandler 12 | 13 | 14 | class NUMINATaskHandler(TaskHandler): 15 | 16 | def generate_prompt(self, problem: Dict[str, Any]): 17 | prompt = problem["problem"] 18 | return self.task_config.templating_parameters["template"].format(prompt=prompt) 19 | 20 | @timeout(5) # Add timeout of 5 seconds 21 | def check_correctness(self, problem, generation): 22 | solution = extract_answer(problem[self.task_config.answer_key]) 23 | pred = extract_answer(generation) 24 | return math_equal(pred, solution) 25 | 26 | def update_results(self, problem, response): 27 | # Initialize the response structure 28 | response_entry = { 29 | "content": response, 30 | "correctness": None, 31 | "reason": None, 32 | } 33 | 34 | try: 35 | curr_res = self.check_correctness(problem, generation=response) 36 | if curr_res: 37 | response_entry["correctness"] = True 38 | response_entry["reason"] = "" 39 | else: 40 | response_entry["correctness"] = False 41 | response_entry["reason"] = "Solution is incorrect." 
42 | except TimeoutException as e: 43 | response_entry["correctness"] = False 44 | response_entry["reason"] = str(e) 45 | 46 | return response_entry 47 | 48 | @staticmethod 49 | def get_difficulty_dict(subset, start, end): 50 | diff_dict = {} 51 | dataset = load_dataset( 52 | "NovaSky-AI/labeled_numina_difficulty_859K", 53 | trust_remote_code=True, 54 | split="train", 55 | ) 56 | for example in dataset: 57 | # print(example) 58 | diff_dict[example["problem"]] = example["gpt_difficulty_parsed"] 59 | return diff_dict 60 | 61 | def load_and_filter_dataset( 62 | self, start, end, split=None, subset=None, difficulty=None 63 | ): 64 | dataset = self.load_dataset(subset=subset, split=split) 65 | 66 | if "source" in self.task_config.preprocess_config: 67 | source = self.task_config.preprocess_config["source"] 68 | dataset = dataset.filter(lambda x: x["source"] == source) 69 | 70 | dataset = dataset.to_pandas() 71 | # TODO (sumanthrh): this is hacky for numina. the start and end filter should be applied at the very end 72 | # it is kept here for consistency with the original code. 73 | dataset = dataset.iloc[start:end] if end > 0 else dataset.iloc[start:] 74 | dataset = dataset[dataset["solution"].str.contains("boxed", na=False)] 75 | 76 | if "filter_difficulty" in self.task_config.preprocess_config: 77 | lower_bound = self.task_config.preprocess_config[ 78 | "math_difficulty_lower_bound" 79 | ] 80 | upper_bound = self.task_config.preprocess_config[ 81 | "math_difficulty_upper_bound" 82 | ] 83 | diff_dict = self.get_difficulty_dict( 84 | self.task_config.dataset_subset, start, end 85 | ) 86 | dataset = dataset[ 87 | dataset["problem"] 88 | .map(diff_dict) 89 | .apply(lambda x: x >= lower_bound and x <= upper_bound) 90 | ] 91 | 92 | return dataset 93 | -------------------------------------------------------------------------------- /evals/tasks/numina/numina_math.yaml: -------------------------------------------------------------------------------- 1 | handler: numina 2 | dataset_path: "AI-MO/NuminaMath-CoT" 3 | dataset_subset: null 4 | dataset_split: train 5 | question_key: problem 6 | answer_key: solution 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. {prompt}" 9 | preprocess_config: 10 | filter_difficulty: true 11 | math_difficulty_lower_bound: 4 12 | math_difficulty_upper_bound: 9 13 | source: math 14 | -------------------------------------------------------------------------------- /evals/tasks/numina/numina_olympiads.yaml: -------------------------------------------------------------------------------- 1 | handler: numina 2 | dataset_path: "AI-MO/NuminaMath-CoT" 3 | dataset_subset: null 4 | dataset_split: train 5 | question_key: problem 6 | answer_key: solution 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. 
{prompt}" 9 | preprocess_config: 10 | filter_difficulty: true 11 | math_difficulty_lower_bound: 9 12 | math_difficulty_upper_bound: 9 13 | source: olympiads 14 | -------------------------------------------------------------------------------- /evals/tasks/olympiadbench/olympiadbench_handler.py: -------------------------------------------------------------------------------- 1 | from ...util.math_parsing_util import ( 2 | extract_answer, 3 | math_equal, 4 | strip_answer_string, 5 | ) 6 | 7 | from ..math.math_handler import MathTaskHandler 8 | 9 | 10 | class OlympiadBenchMathTaskHandler(MathTaskHandler): 11 | def check_correctness(self, problem, generation): 12 | # all problems have final answer in a list 13 | answer = strip_answer_string(problem[self.task_config.answer_key][0]) 14 | pred = extract_answer(generation) 15 | pred = strip_answer_string(pred) 16 | return math_equal(pred, answer) 17 | -------------------------------------------------------------------------------- /evals/tasks/olympiadbench/olympiadbench_math_en.yaml: -------------------------------------------------------------------------------- 1 | handler: olympiadbench_math 2 | dataset_path: Hothan/OlympiadBench 3 | dataset_subset: OE_TO_maths_en_COMP 4 | dataset_split: train 5 | question_key: question 6 | answer_key: final_answer 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. {question}" 9 | -------------------------------------------------------------------------------- /evals/tasks/omni_math/omni_handler.py: -------------------------------------------------------------------------------- 1 | from ...util.math_parsing_util import ( 2 | extract_answer, 3 | math_equal, 4 | strip_answer_string, 5 | ) 6 | 7 | from ..math.math_handler import MathTaskHandler 8 | 9 | 10 | class OMNIMathTaskHandler(MathTaskHandler): 11 | def generate_prompt(self, problem): 12 | return self.task_config.templating_parameters["template"].format(**problem) 13 | 14 | def check_correctness(self, problem, generation): 15 | # no preprocessing needed 16 | answer = problem[self.task_config.answer_key] 17 | pred = extract_answer(generation) 18 | pred = strip_answer_string(pred) 19 | return math_equal(pred, answer) 20 | -------------------------------------------------------------------------------- /evals/tasks/omni_math/omni_math.yaml: -------------------------------------------------------------------------------- 1 | handler: omni_math 2 | dataset_path: "KbsdJames/Omni-MATH" # repo ID in huggingface 3 | dataset_subset: null # which subset on huggingface 4 | dataset_split: test_rule_based # Rule based evaluation 5 | dataset_kwargs: 6 | # NOTE: This is using the subset for rule-based evaluation in the below PR 7 | revision: refs/pr/2 8 | question_key: problem 9 | answer_key: answer 10 | templating_parameters: 11 | template: "Return your final response within \\boxed{{}}. 
{problem}" -------------------------------------------------------------------------------- /evals/tasks/taco/taco.yaml: -------------------------------------------------------------------------------- 1 | handler: taco 2 | dataset_path: "BAAI/TACO" 3 | dataset_subset: MEDIUM 4 | dataset_split: train 5 | dataset_kwargs: 6 | trust_remote_code: true 7 | question_key: question 8 | answer_key: null 9 | templating_parameters: 10 | initial_template: "\nQUESTION:\n{prompt}" 11 | # Add starter code to initial template 12 | starter_code_template: "{input}\n{starter_code}" 13 | # stdin template is used when there is no starter code or fn_name 14 | stdin_template: "{input}\nUse Standard Input format\nANSWER:\n" 15 | # call template is used when there is starter code or fn_name 16 | call_template: "{input}\nUse Call-Based format\nANSWER:\n" 17 | # Optionally, you can filter the dataset by difficulty 18 | # preprocess_config: 19 | # difficulty: easy 20 | 21 | -------------------------------------------------------------------------------- /evals/tasks/taco/taco_handler.py: -------------------------------------------------------------------------------- 1 | import json 2 | import multiprocessing 3 | from multiprocessing import Manager 4 | 5 | import numpy as np 6 | 7 | from ...util.common import has_code 8 | 9 | from ..base import TaskHandler 10 | from .taco_util import run_test as taco_run_test 11 | 12 | 13 | class TACOTaskHandler(TaskHandler): 14 | 15 | def generate_prompt(self, problem): 16 | prompt = problem["question"] 17 | starter_code = ( 18 | None if len(problem["starter_code"]) == 0 else problem["starter_code"] 19 | ) 20 | try: 21 | input_outpout = json.loads(problem["input_output"]) 22 | fn_name = ( 23 | None if not input_outpout.get("fn_name") else input_outpout["fn_name"] 24 | ) 25 | except ValueError: 26 | fn_name = None 27 | 28 | _input = self.task_config.templating_parameters["initial_template"].format( 29 | prompt=prompt 30 | ) 31 | 32 | if starter_code: 33 | _input = self.task_config.templating_parameters[ 34 | "starter_code_template" 35 | ].format(input=_input, starter_code=starter_code) 36 | else: 37 | _input = self.task_config.templating_parameters["initial_template"].format( 38 | prompt=prompt 39 | ) 40 | if (not fn_name) and (not starter_code): 41 | _input = self.task_config.templating_parameters["stdin_template"].format( 42 | input=_input 43 | ) 44 | else: 45 | _input = self.task_config.templating_parameters["call_template"].format( 46 | input=_input 47 | ) 48 | 49 | return _input 50 | 51 | def check_correctness(self, problem, generation): 52 | TIME_OUT = 300 53 | 54 | manager = Manager() 55 | result = manager.list() 56 | p = multiprocessing.Process( 57 | target=_temp_run, args=(problem, generation, False, result) 58 | ) 59 | p.start() 60 | p.join(timeout=TIME_OUT + 1) 61 | if p.is_alive(): 62 | p.kill() 63 | return bool(result and np.all(result[0])) 64 | 65 | def update_results(self, problem, response): 66 | # Initialize the response structure 67 | response_entry = { 68 | "content": response, 69 | "correctness": None, 70 | "reason": None, 71 | } 72 | code_filter_result = has_code(response) 73 | if len(code_filter_result) == 0: 74 | response_entry["correctness"] = False 75 | response_entry["reason"] = "Does not contain code component." 
76 | else: 77 | last_code = code_filter_result[-1] 78 | curr_res = self.check_correctness(problem, generation=last_code) 79 | if curr_res: 80 | response_entry["correctness"] = True 81 | response_entry["reason"] = "" 82 | else: 83 | response_entry["correctness"] = False 84 | response_entry["reason"] = "Code is incorrect." 85 | 86 | return response_entry 87 | 88 | def load_and_filter_dataset( 89 | self, start, end, split=None, subset=None, difficulty=None 90 | ): 91 | dataset = self.load_dataset(subset=subset, split=split).to_pandas() 92 | if difficulty or "difficulty" in self.task_config.preprocess_config: 93 | difficulty = ( 94 | difficulty 95 | if difficulty 96 | else self.task_config.preprocess_config["difficulty"] 97 | ) 98 | dataset = dataset[ 99 | dataset["difficulty"] == difficulty 100 | ] 101 | 102 | return dataset.iloc[start:end] if end > 0 else dataset.iloc[start:] 103 | 104 | 105 | def _temp_run(problem, generation, debug, result): 106 | try: 107 | result.append( 108 | taco_run_test(problem["input_output"], test=generation, debug=debug) 109 | ) 110 | except Exception as e: 111 | print(f"Error in _temp_run: {e}") 112 | -------------------------------------------------------------------------------- /evals/tasks/task_util.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | from typing import Dict 4 | 5 | 6 | def get_tasks(task_root_dir: str) -> Dict[str, str]: 7 | """Returns a dictionary of task names and their corresponding yaml file paths""" 8 | # list all yamls in subdirectories 9 | name_to_yaml = {} 10 | for yaml_file in glob.glob( 11 | os.path.join(task_root_dir, "**", "*.yaml"), recursive=True 12 | ): 13 | # arc.yaml -> arc 14 | name = os.path.basename(yaml_file).split(".")[0] 15 | 16 | name_to_yaml[name] = yaml_file 17 | 18 | return name_to_yaml 19 | -------------------------------------------------------------------------------- /evals/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanomaoli/llm_reproducibility/8a373c5a159a27e59783394827cecadd6255484e/evals/util/__init__.py -------------------------------------------------------------------------------- /evals/util/cli_util.py: -------------------------------------------------------------------------------- 1 | from ast import literal_eval 2 | from typing import Any, List 3 | 4 | import msgpack 5 | import xxhash 6 | 7 | 8 | def _parse_multi_args(vals: str) -> dict: 9 | """Parse a multi-value argument into a dictionary. 10 | 11 | The argument can either be a comma separated list of key=value pairs, or a dictionary. 12 | """ 13 | try: 14 | # try to parse as a dictionary first 15 | my_dict = literal_eval(vals) 16 | assert isinstance(my_dict, dict) 17 | return my_dict 18 | except Exception: 19 | # try to parse as a comma separated list of key=value pairs 20 | vals = vals.replace(" ", "") 21 | if not len(vals): 22 | return {} 23 | ret = {} 24 | for val in vals.split(","): 25 | k, v = val.split("=") 26 | try: 27 | ret[k] = literal_eval(v) 28 | except (ValueError, SyntaxError): 29 | # if literal eval fails, propagate as a string 30 | ret[k] = v 31 | return ret 32 | 33 | 34 | def parse_multi_args(vals: str) -> dict: 35 | try: 36 | return _parse_multi_args(vals) 37 | except Exception as err: 38 | raise ValueError( 39 | f"Expected comma separated list of parameters arg1=val1,arg2=val2 or a dictionary, got invalid argument {vals}. 
" 40 | ) from err 41 | 42 | 43 | def comma_separated_to_list(vals: str) -> List[str]: 44 | vals = vals.replace(" ", "") 45 | return vals.split(",") 46 | 47 | 48 | def to_tuple(d) -> tuple: 49 | if isinstance(d, dict): 50 | return tuple(map(to_tuple, d.items())) 51 | elif isinstance(d, (set, list, tuple)): 52 | return tuple(map(to_tuple, d)) 53 | else: 54 | return d 55 | 56 | 57 | def get_deterministic_hash(d: Any, num_digits: int = 6) -> str: 58 | """Get deterministic hash""" 59 | tuple_form = to_tuple(d) 60 | serialized = msgpack.packb(tuple_form, use_bin_type=True) 61 | return xxhash.xxh32(serialized).hexdigest()[:num_digits] 62 | -------------------------------------------------------------------------------- /evals/util/common.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import random 4 | import re 5 | 6 | import numpy as np 7 | import torch 8 | 9 | 10 | def set_seed(seed: int): 11 | os.environ["PYTHONHASHSEED"] = str(seed) 12 | random.seed(seed) 13 | np.random.seed(seed) 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed_all(seed) 16 | 17 | 18 | class TimeoutException(Exception): 19 | """Custom exception for function timeout.""" 20 | 21 | pass 22 | 23 | 24 | def timeout(seconds): 25 | """Decorator to enforce a timeout on a function using multiprocessing.""" 26 | 27 | def decorator(func): 28 | def wrapper(*args, **kwargs): 29 | # A queue to store the result or exception 30 | queue = multiprocessing.Queue() 31 | 32 | def target(queue, *args, **kwargs): 33 | try: 34 | result = func(*args, **kwargs) 35 | queue.put((True, result)) 36 | except Exception as e: 37 | queue.put((False, e)) 38 | 39 | process = multiprocessing.Process( 40 | target=target, args=(queue, *args), kwargs=kwargs 41 | ) 42 | process.start() 43 | process.join(seconds) 44 | 45 | if process.is_alive(): 46 | process.terminate() 47 | process.join() 48 | raise TimeoutException( 49 | f"Function '{func.__name__}' timed out after {seconds} seconds!" 
50 | ) 51 | 52 | success, value = queue.get() 53 | if success: 54 | return value 55 | else: 56 | raise value 57 | 58 | return wrapper 59 | 60 | return decorator 61 | 62 | 63 | def has_code(response): 64 | pattern = r"```(?:[a-zA-Z]*)\n(.*?)```" 65 | # Use re.DOTALL to match multiline content inside backticks 66 | matches = re.findall(pattern, response, re.DOTALL) 67 | # print(matches) 68 | return matches 69 | -------------------------------------------------------------------------------- /evals/util/metrics.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | from collections import defaultdict 4 | from typing import Dict, List 5 | 6 | import numpy as np 7 | 8 | 9 | def _pass_at_k(n, c, k): 10 | """ 11 | :param n: total number of samples 12 | :param c: number of correct samples 13 | :param k: k in pass@$k$ 14 | """ 15 | if n - c < k: 16 | return 1.0 17 | return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))) 18 | 19 | 20 | def pass_at_k(N: int, id_to_scores: Dict[str, List[int]]): 21 | final_passk_scores = {} 22 | k_to_passk_scores = defaultdict(list) # k -> list of scores 23 | for _, sample_scores in id_to_scores.items(): 24 | # Start at N 25 | k = N 26 | is_power_of_2 = N == 2 ** (int(math.log2(N))) 27 | while k > 0: 28 | # calculate pass @ k 29 | num_correct = np.sum(sample_scores) 30 | pass_k = _pass_at_k(N, num_correct, k) 31 | k_to_passk_scores[k].append(pass_k) 32 | # corner case: when N is not a power of 2 33 | if not is_power_of_2 and k == N: 34 | k = 2 ** (int(math.log2(N))) 35 | else: 36 | # otherwise, just divide by 2 37 | k = k // 2 38 | 39 | for k in k_to_passk_scores: 40 | final_passk_scores[f"{k=}"] = round(np.mean(k_to_passk_scores[k]) * 100, 3) 41 | 42 | # print("Final pass @ k:") 43 | for k, s in final_passk_scores.items(): 44 | logging.info(f"k: {k}, pass @ k: {s}") 45 | return final_passk_scores 46 | -------------------------------------------------------------------------------- /evals/util/response.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Optional 3 | 4 | 5 | @dataclass 6 | class Response: 7 | response: List[str] 8 | num_completion_tokens: List[int] 9 | num_input_tokens: int 10 | index: Optional[int] = None 11 | 12 | @classmethod 13 | def from_ray_response(cls, response) -> "Response": 14 | """ 15 | Factory method to create a Response instance from a rayllm response. 16 | 17 | Args: 18 | response: Ray response object containing generated text and token information 19 | 20 | Returns: 21 | Responses: New instance initialized with Ray response data 22 | """ 23 | 24 | if isinstance(response["generated_text"], list): 25 | # n > 1 samples 26 | response_texts = response["generated_text"] 27 | num_completion_tokens = [ 28 | int(response["num_generated_tokens"][i]) 29 | for i in range(len(response["num_generated_tokens"])) 30 | ] 31 | else: 32 | response_texts = [response["generated_text"]] 33 | num_completion_tokens = [int(response["num_generated_tokens"])] 34 | return cls( 35 | response=response_texts, 36 | num_completion_tokens=num_completion_tokens, 37 | num_input_tokens=int(response["num_input_tokens"]), 38 | index=response["index"], 39 | ) 40 | 41 | @classmethod 42 | def from_openai_response(cls, response) -> "Response": 43 | """ 44 | Factory method to create a Response instance from an OpenAI response. 
45 | 46 | Args: 47 | response: OpenAI response object containing message content and token information 48 | 49 | Returns: 50 | Responses: New instance initialized with OpenAI response data 51 | """ 52 | return cls( 53 | response=[ 54 | response.choices[i].message.content 55 | for i in range(len(response.choices)) 56 | ], 57 | num_completion_tokens=[ 58 | response.usage.completion_tokens if i == 0 else 0 59 | for i in range(len(response.choices)) 60 | ], 61 | num_input_tokens=response.usage.prompt_tokens, 62 | ) 63 | 64 | @classmethod 65 | def from_vllm_response(cls, response) -> "Response": 66 | """ 67 | Factory method to create a Response instance from a vLLM response. 68 | 69 | Args: 70 | response: vLLM response object containing output text and token information 71 | 72 | Returns: 73 | Responses: New instance initialized with vLLM response data 74 | """ 75 | response_texts = [ 76 | response.outputs[i].text for i in range(len(response.outputs)) 77 | ] 78 | num_completion_tokens = [ 79 | len(response.outputs[i].token_ids) for i in range(len(response.outputs)) 80 | ] 81 | return cls( 82 | response=response_texts, 83 | num_completion_tokens=num_completion_tokens, 84 | num_input_tokens=len(response.prompt_token_ids), 85 | ) 86 | 87 | 88 | @dataclass 89 | class SingleParsedResponse: 90 | content: str 91 | correctness: Optional[bool] = None 92 | reason: Optional[str] = None 93 | 94 | def to_dict(self): 95 | return { 96 | "content": self.content, 97 | "correctness": self.correctness, 98 | "reason": self.reason, 99 | } 100 | -------------------------------------------------------------------------------- /evals/util/results.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import asdict, dataclass 3 | from pathlib import Path 4 | from typing import Any, Dict, Optional 5 | 6 | 7 | @dataclass 8 | class SummaryResults: 9 | # configuration: Dict[str, Any] 10 | # total_completion_tokens: int = 0 11 | # avg_completion_tokens: float = 0 12 | # total_prompt_tokens: int = 0 13 | # avg_prompt_tokens: float = 0 14 | accuracy: float = 0.0 15 | pass_at_k: Optional[Dict[str, float]] = None 16 | # mean_of_stdevs: float = None 17 | # run_level_stdev: float = None 18 | 19 | def to_json_dict(self) -> Dict[str, Any]: 20 | """Convert to a JSON-compatible dictionary.""" 21 | return asdict(self) 22 | 23 | 24 | def save_summary(summary_path: Path, summary: SummaryResults) -> None: 25 | with open(summary_path, "w", encoding="utf-8") as f: 26 | json.dump(summary.to_json_dict(), f, indent=4) 27 | -------------------------------------------------------------------------------- /figures/reproduciblellm_fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanomaoli/llm_reproducibility/8a373c5a159a27e59783394827cecadd6255484e/figures/reproduciblellm_fig1.png -------------------------------------------------------------------------------- /patch_vllm.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional, Tuple 2 | import torch 3 | from typing import Optional, Tuple 4 | from vllm.model_executor.layers.rotary_embedding import get_rope, RotaryEmbedding 5 | from vllm.model_executor.layers.linear import UnquantizedLinearMethod 6 | from vllm.model_executor.layers.vocab_parallel_embedding import UnquantizedEmbeddingMethod 7 | from vllm.attention import get_attn_backend 8 | import torch.nn.functional as F 9 | from 
vllm.utils import get_dtype_size 10 | from vllm.model_executor.layers.linear import UnquantizedLinearMethod 11 | from vllm.worker.cache_engine import CacheEngine 12 | from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, 13 | get_dtype_size, is_pin_memory_available) 14 | from vllm.distributed import get_pp_group 15 | from vllm.model_executor.models.qwen2 import Qwen2Model, LogitsProcessor, get_sampler, ParallelLMHead, PPMissingLayer, maybe_prefix 16 | from vllm.attention import Attention 17 | # from vllm.v1.worker.gpu_model_runner 18 | import sys 19 | import pdb 20 | # from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE 21 | # STR_DTYPE_TO_TORCH_DTYPE["float32"] = torch.float32 22 | # from vllm.config import CacheConfig 23 | 24 | 25 | class ForkedPdb(pdb.Pdb): 26 | """ 27 | PDB Subclass for debugging multi-processed code 28 | Suggested in: https://stackoverflow.com/questions/4716533/how-to-attach-debugger-to-a-python-subproccess 29 | """ 30 | def interaction(self, *args, **kwargs): 31 | _stdin = sys.stdin 32 | try: 33 | sys.stdin = open('/dev/stdin') 34 | pdb.Pdb.interaction(self, *args, **kwargs) 35 | finally: 36 | sys.stdin = _stdin 37 | 38 | def convert_linear_weights_to_fp16(model: torch.nn.Module): 39 | """Convert weights of linear layers to fp16 for storage.""" 40 | for name, module in model.named_modules(): 41 | if 'proj' in name: 42 | module.weight.data = module.weight.data.to(torch.float16) 43 | if module.bias is not None: 44 | module.bias.data = module.bias.data.to(torch.float16) 45 | 46 | def convert_linear_weights_to_bfloat16(model: torch.nn.Module): 47 | """Convert weights of linear layers to bfloat16 for storage.""" 48 | for name, module in model.named_modules(): 49 | if 'proj' in name: 50 | module.weight.data = module.weight.data.to(torch.bfloat16) 51 | if module.bias is not None: 52 | module.bias.data = module.bias.data.to(torch.bfloat16) 53 | 54 | def our_attn_forward( 55 | self, 56 | positions: torch.Tensor, 57 | hidden_states: torch.Tensor, 58 | ) -> torch.Tensor: 59 | # Input is already in fp32 from previous layer 60 | qkv, _ = self.qkv_proj(hidden_states) 61 | q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) 62 | q, k = self.rotary_emb(positions, q, k) 63 | attn_output = self.attn(q, k, v) 64 | output, _ = self.o_proj(attn_output) 65 | return output # Keep in fp32 66 | 67 | def our_fp32_rope_forward_cuda( 68 | self, 69 | positions: torch.Tensor, 70 | query: torch.Tensor, 71 | key: torch.Tensor, 72 | offsets: Optional[torch.Tensor] = None, 73 | ) -> Tuple[torch.Tensor, torch.Tensor]: 74 | from vllm import _custom_ops as ops 75 | # Everything is already in fp32, no need for conversion 76 | if self.cos_sin_cache.device != query.device: 77 | self.cos_sin_cache = self.cos_sin_cache.to(query.device) 78 | 79 | if offsets is not None: 80 | ops.batched_rotary_embedding(positions, query, key, self.head_size, 81 | self.cos_sin_cache, 82 | self.is_neox_style, self.rotary_dim, 83 | offsets) 84 | else: 85 | ops.rotary_embedding(positions, query, key, self.head_size, 86 | self.cos_sin_cache, self.is_neox_style) 87 | return query, key 88 | 89 | 90 | def our_linear_apply(self, 91 | layer: torch.nn.Module, 92 | x: torch.Tensor, 93 | bias: Optional[torch.Tensor] = None) -> torch.Tensor: 94 | # x is already in fp32 95 | assert x.dtype == torch.float32 96 | # Upcast weights to fp32 for computation 97 | weight = layer.weight.to(torch.float32) 98 | if bias is not None: 99 | bias = bias.to(torch.float32) 100 | return F.linear(x, weight, bias) # Result stays in 
fp32 101 | 102 | def patch_cache_engine(): 103 | original_init = CacheEngine.__init__ 104 | def custom_cache_engine_init( 105 | self, 106 | cache_config, 107 | model_config, 108 | parallel_config, 109 | device_config, 110 | ) -> None: 111 | self.cache_config = cache_config 112 | self.model_config = model_config 113 | self.parallel_config = parallel_config 114 | self.device_config = device_config 115 | 116 | self.head_size = model_config.get_head_size() 117 | # Models like Jamba, have mixed typed layers, E.g Mamba 118 | self.num_attention_layers = model_config.get_num_layers_by_block_type( 119 | parallel_config, LayerBlockType.attention) 120 | self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) 121 | 122 | self.block_size = cache_config.block_size 123 | self.num_gpu_blocks = cache_config.num_gpu_blocks 124 | if self.num_gpu_blocks: 125 | self.num_gpu_blocks //= parallel_config.pipeline_parallel_size 126 | self.num_cpu_blocks = cache_config.num_cpu_blocks 127 | if self.num_cpu_blocks: 128 | self.num_cpu_blocks //= parallel_config.pipeline_parallel_size 129 | 130 | self.dtype = torch.float32 # Force fp32 for cache 131 | 132 | # Get attention backend. 133 | self.attn_backend = get_attn_backend(self.head_size, 134 | model_config.dtype, 135 | cache_config.cache_dtype, 136 | self.block_size, 137 | model_config.is_attention_free, 138 | use_mla=model_config.use_mla) 139 | 140 | # Initialize the cache. 141 | self.gpu_cache = self._allocate_kv_cache( 142 | self.num_gpu_blocks, self.device_config.device_type) 143 | self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu") 144 | 145 | @staticmethod 146 | def our_get_cache_block_size( 147 | cache_config, 148 | model_config, 149 | parallel_config, 150 | ) -> int: 151 | head_size = model_config.get_head_size() 152 | num_heads = model_config.get_num_kv_heads(parallel_config) 153 | num_attention_layers = model_config.get_num_layers_by_block_type( 154 | parallel_config, LayerBlockType.attention) 155 | 156 | dtype = torch.float32 # Force fp32 for cache 157 | key_cache_entry = num_heads * head_size 158 | 159 | # For MLA there is no value cache, since the latent vector 160 | # is joint keys and values. 
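# Size accounting below: every attention layer stores num_heads * head_size key elements per token (and the same again for values unless MLA is used), for cache_config.block_size tokens per block; the final byte count multiplies this element count by the fp32 element size.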
161 | value_cache_entry = key_cache_entry if not model_config.use_mla else 0 162 | total = num_attention_layers * cache_config.block_size * \ 163 | (key_cache_entry + value_cache_entry) 164 | 165 | dtype_size = get_dtype_size(dtype) 166 | return dtype_size * total 167 | 168 | CacheEngine.__init__ = custom_cache_engine_init 169 | CacheEngine.get_cache_block_size = our_get_cache_block_size 170 | 171 | 172 | def patch_qwen2_vllm(): 173 | # from vllm.platforms import _Backend 174 | # from vllm.attention.selector import global_force_attn_backend 175 | # global_force_attn_backend(_Backend.XFORMERS) 176 | import os 177 | os.environ["VLLM_ATTENTION_BACKEND"] = "XFORMERS" # FLASHINFER 178 | from vllm.model_executor.models.qwen2 import Qwen2Attention, Qwen2ForCausalLM 179 | 180 | patch_cache_engine() 181 | 182 | def new_qwen2_lm_init(self, *, vllm_config, prefix: str = ""): 183 | torch.nn.Module.__init__(self) 184 | config = vllm_config.model_config.hf_config 185 | quant_config = vllm_config.quant_config 186 | lora_config = vllm_config.lora_config 187 | 188 | self.config = config 189 | self.lora_config = lora_config 190 | self.quant_config = quant_config 191 | 192 | self.model = Qwen2Model(vllm_config=vllm_config, 193 | prefix=maybe_prefix(prefix, "model")) 194 | 195 | if get_pp_group().is_last_rank: 196 | if config.tie_word_embeddings: 197 | self.lm_head = self.model.embed_tokens 198 | else: 199 | self.lm_head = ParallelLMHead(config.vocab_size, 200 | config.hidden_size, 201 | quant_config=quant_config, 202 | prefix=maybe_prefix( 203 | prefix, "lm_head")) 204 | else: 205 | self.lm_head = PPMissingLayer() 206 | 207 | # Convert linear weights to bfloat16 for storage 208 | convert_linear_weights_to_bfloat16(self.model) 209 | if not isinstance(self.lm_head, PPMissingLayer): 210 | convert_linear_weights_to_bfloat16(self.lm_head) 211 | 212 | self.logits_processor = LogitsProcessor(config.vocab_size, scale=1.2) 213 | self.sampler = get_sampler() 214 | self.make_empty_intermediate_tensors = ( 215 | self.model.make_empty_intermediate_tensors) 216 | 217 | Qwen2ForCausalLM.__init__ = new_qwen2_lm_init 218 | 219 | # Store the original __init__ 220 | original_init = Qwen2Attention.__init__ 221 | def new_qwen2_init(self, *args, **kwargs): 222 | # Call the original init first 223 | original_init(self, *args, **kwargs) 224 | self.rotary_emb = get_rope( 225 | self.head_dim, 226 | rotary_dim=self.head_dim, 227 | max_position=kwargs['max_position'], 228 | base=self.rope_theta, 229 | rope_scaling=kwargs['rope_scaling'], 230 | dtype=torch.float32 # RoPE computation in fp32 231 | ) 232 | 233 | Qwen2Attention.__init__ = new_qwen2_init 234 | # Replace the apply method 235 | UnquantizedLinearMethod.apply = our_linear_apply 236 | UnquantizedEmbeddingMethod.apply = our_linear_apply 237 | RotaryEmbedding.forward_cuda = our_fp32_rope_forward_cuda 238 | Qwen2Attention.forward = our_attn_forward 239 | print("Patched vLLM: Model loaded in fp32, linear weights stored in bfloat16, all computations in fp32") -------------------------------------------------------------------------------- /prompt_util/prompt_template.py: -------------------------------------------------------------------------------- 1 | 2 | def make_conversation_from_contents( 3 | contents, 4 | system_prompt=None, 5 | user_template=None, 6 | assistant_prefill=None, 7 | ): 8 | """Makes a conversation given a list of user/assistant message strings. 9 | 10 | If system_prompt is provided, it will be added as the first message. 
11 | If user_template is provided, it will be used to format the user messages. This is useful for model-specific formatting. 12 | 13 | Args: 14 | contents: A list of user/assistant message strings. 15 | system_prompt: An optional string for the system prompt. 16 | user_template: An optional string for the user template. 17 | assistant_prefill: An optional string used to prefill the assistant's next turn. 18 | 19 | Returns: 20 | A list of dictionaries representing the conversation. 21 | """ 22 | 23 | conversation = [] 24 | if system_prompt: 25 | conversation.append({"role": "system", "content": system_prompt}) 26 | 27 | for i, content in enumerate(contents): 28 | if i % 2 == 0: 29 | content = user_template.format(content) if user_template else content 30 | conversation.append({"role": "user", "content": content}) 31 | else: 32 | conversation.append({"role": "assistant", "content": content}) 33 | 34 | if assistant_prefill and conversation[-1]["role"] == "user": 35 | conversation.append({"role": "assistant", "content": assistant_prefill}) 36 | 37 | return conversation --------------------------------------------------------------------------------
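A minimal usage sketch of make_conversation_from_contents, for reference; the message text, template, and prefill below are illustrative assumptions, not values taken from the repository:

from prompt_util.prompt_template import make_conversation_from_contents

# Even-indexed entries in `contents` become user turns; odd-indexed entries become assistant turns.
conversation = make_conversation_from_contents(
    ["What is 2 + 2?"],
    system_prompt="You are a helpful assistant.",
    # The template is applied with str.format(content), so "{}" receives the user string.
    user_template="Return your final response within \\boxed{{}}. {}",
    # Appended as a trailing assistant message because the last turn above is a user turn.
    assistant_prefill="<think>",
)
# conversation now holds three messages: the system prompt, the formatted user turn,
# and an assistant turn containing only the prefill string.

And a sketch of how the monkey patch in patch_vllm.py might be applied; calling patch_qwen2_vllm() before the vLLM engine is constructed lets the patched Qwen2 and CacheEngine classes take effect. The model name and sampling settings here are assumptions for illustration only:

from vllm import LLM, SamplingParams

from patch_vllm import patch_qwen2_vllm

# Patch first: this swaps in the fp32 attention/RoPE/linear paths and the fp32 KV cache.
patch_qwen2_vllm()

llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", dtype="float32")  # illustrative checkpoint; fp32 load matches the patch
outputs = llm.generate(
    ["What is 2 + 2?"],
    SamplingParams(temperature=0.0, max_tokens=64),  # greedy decoding for reproducibility checks
)
print(outputs[0].outputs[0].text)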