├── ioi-evals.png ├── generate ├── .env.template ├── requirements.txt ├── .gitignore ├── get_context_length.py ├── utils │ ├── get_context_length.py │ ├── open_router_usage.py │ └── check_failures.py ├── TODO.md ├── slurm_standalone │ ├── serve_router.slurm │ ├── debug.slurm │ └── serve_r1.slurm ├── README.md ├── run_ioi_slurm.py └── evaluate.py ├── run_tests ├── .env.template ├── requirements.txt ├── piston │ ├── launch_piston_workers.sh │ ├── launch_single_piston.sh │ └── README.md ├── utils.py ├── custom_setup │ ├── compile │ └── run ├── selection_simulator.py ├── README.md ├── scoring.py ├── piston_client.py └── tests_runner.py ├── .gitignore └── README.md /ioi-evals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/ioi/HEAD/ioi-evals.png -------------------------------------------------------------------------------- /generate/.env.template: -------------------------------------------------------------------------------- 1 | OPENROUTER_API_KEY= 2 | OPENAI_API_KEY= 3 | ANTHROPIC_API_KEY= -------------------------------------------------------------------------------- /run_tests/.env.template: -------------------------------------------------------------------------------- 1 | PISTON_ENDPOINTS=slurm 2 | PISTON_MAX_REQUESTS_PER_ENDPOINT=1 -------------------------------------------------------------------------------- /run_tests/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets>=3.3.2 2 | tqdm 3 | python-dotenv 4 | loguru 5 | aiohttp 6 | huggingface_hub 7 | aiofiles 8 | uvloop 9 | tabulate -------------------------------------------------------------------------------- /generate/requirements.txt: -------------------------------------------------------------------------------- 1 | litellm 2 | datasets>=3.3.2 3 | tqdm 4 | python-dotenv 5 | loguru 6 | aiohttp 7 | huggingface_hub 8 | setuptools 9 | transformers>=4.48.3 10 | aiofiles 11 | polars 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Environment variables 2 | 3 | # logs 4 | logs/* 5 | 6 | # env 7 | .env 8 | 9 | # Python 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | *.so 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # Virtual Environment 32 | venv/ 33 | env/ 34 | ENV/ 35 | .venv/ 36 | .env/ 37 | 38 | # IDE 39 | .idea/ 40 | .vscode/ 41 | *.swp 42 | *.swo 43 | .DS_Store 44 | 45 | # Project specific 46 | results/ 47 | -------------------------------------------------------------------------------- /generate/.gitignore: -------------------------------------------------------------------------------- 1 | # Environment variables 2 | 3 | # logs 4 | logs/* 5 | 6 | # env 7 | .env 8 | 9 | # Python 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | *.so 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # Virtual Environment 32 | venv/ 33 | env/ 34 | ENV/ 35 | .venv/ 36 | .env/ 37 | 38 | # IDE 39 | .idea/ 40 | .vscode/ 41 | *.swp 42 | *.swo 43 | .DS_Store 44 | 45 | # Project specific 46 | results/ 47 | 
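The two `.env.template` files above are meant to be copied to `.env` (which both `.gitignore` files exclude). As a minimal, illustrative sketch of how such values are typically read with `python-dotenv` (listed in both `requirements.txt` files); the variable names come from the templates, everything else here is an assumption:

```python
import os

from dotenv import load_dotenv  # python-dotenv, listed in both requirements.txt files

# Read key=value pairs from a local .env file into the process environment
load_dotenv()

# Variable names taken from the .env.template files above; the default is illustrative
piston_endpoints = os.getenv("PISTON_ENDPOINTS", "slurm")
openrouter_key = os.getenv("OPENROUTER_API_KEY")
print(piston_endpoints, "OpenRouter key set" if openrouter_key else "no OpenRouter key")
```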
--------------------------------------------------------------------------------
/run_tests/piston/launch_piston_workers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # this simple script will launch a bunch of piston workers on the HF science cluster
4 | 
5 | N_INSTANCES=${1:-5} # Default to 5 instances
6 | 
7 | for i in $(seq 1 $N_INSTANCES); do
8 |     # Find a random (hopefully) available port
9 |     PORT=$(comm -23 <(seq 2000 10000 | sort) <(ss -tan | awk '{print $4}' | cut -d':' -f2 | sort -u) | shuf | head -n1)
10 | 
11 |     # the job name format (`piston-worker-<PORT>`) is important: the evaluation code uses it to discover the list of workers
12 |     sbatch \
13 |         --job-name="piston-worker-$PORT" \
14 |         --export=ALL,PORT=$PORT \
15 |         /fsx/guilherme/piston/launch_single_piston.sh
16 | done
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # IOI
2 | 
3 | ![IOI Evals](ioi-evals.png)
4 | 
5 | ## Dataset links
6 | - [Problem statements dataset](https://huggingface.co/datasets/open-r1/ioi) (IOI’2020 - IOI’2024): `open-r1/ioi`
7 | - [Test cases](https://huggingface.co/datasets/open-r1/ioi-test-cases): `open-r1/ioi-test-cases`
8 | - [Official (ground truth) solutions](https://huggingface.co/datasets/open-r1/ioi-sample-solutions): `open-r1/ioi-sample-solutions`
9 | - [Evaluation data for 40+ leading models on IOI’2024](https://huggingface.co/datasets/open-r1/ioi-2024-model-solutions): `open-r1/ioi-2024-model-solutions`
10 | 
11 | ## Generating solutions
12 | To have models generate solutions to IOI problems, follow the instructions in the [generate](generate/README.md) directory.
13 | 
14 | ## Running tests
15 | To run tests on generated solutions, follow the instructions in the [run_tests](run_tests/README.md) directory.
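The datasets listed above can be pulled with the `datasets` library already used elsewhere in this repo. A minimal sketch: the `train` split name for `open-r1/ioi` is an assumption, while the per-year config for the test cases mirrors `run_tests/utils.py`:

```python
from datasets import load_dataset

# Problem statements (split name assumed to be "train")
problems = load_dataset("open-r1/ioi", split="train")
print(len(problems), problems.column_names)

# Test cases are stored per year, as loaded in run_tests/utils.py
tests_2024 = load_dataset("open-r1/ioi-test-cases", name="2024", split="train")
print(len(tests_2024))
```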
16 | -------------------------------------------------------------------------------- /generate/get_context_length.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | from typing import Dict, Any 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | def get_context_length(model_name: str) -> int: 8 | """Get maximum context length from model config.""" 9 | try: 10 | config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) 11 | # Check various possible context length attributes 12 | context_length = ( 13 | getattr(config, 'max_position_embeddings', None) or 14 | getattr(config, 'sliding_window', None) or 15 | getattr(config, 'max_sequence_length', None) or 16 | getattr(config, 'max_seq_len', None) or 17 | 4096 # Default fallback 18 | ) 19 | 20 | # Some models (like Qwen) might have sliding_window disabled 21 | if hasattr(config, 'use_sliding_window') and not config.use_sliding_window: 22 | # If sliding window is disabled, use max_position_embeddings instead 23 | context_length = getattr(config, 'max_position_embeddings', context_length) 24 | 25 | 26 | # Cap to 32k 27 | return min(context_length, 32768) 28 | except Exception as e: 29 | logger.warning(f"Could not get context length from config for {model_name}: {e}") 30 | return 4096 # Default fallback 31 | 32 | 33 | if __name__ == "__main__": 34 | import argparse 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument("--model_name", type=str, required=True) 37 | args = parser.parse_args() 38 | print(get_context_length(args.model_name)) 39 | -------------------------------------------------------------------------------- /generate/utils/get_context_length.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | from typing import Dict, Any 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | def get_context_length(model_name: str) -> int: 8 | """Get maximum context length from model config.""" 9 | try: 10 | config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) 11 | # Check various possible context length attributes 12 | context_length = ( 13 | getattr(config, 'max_position_embeddings', None) or 14 | getattr(config, 'sliding_window', None) or 15 | getattr(config, 'max_sequence_length', None) or 16 | getattr(config, 'max_seq_len', None) or 17 | 4096 # Default fallback 18 | ) 19 | 20 | # Some models (like Qwen) might have sliding_window disabled 21 | if hasattr(config, 'use_sliding_window') and not config.use_sliding_window: 22 | # If sliding window is disabled, use max_position_embeddings instead 23 | context_length = getattr(config, 'max_position_embeddings', context_length) 24 | 25 | 26 | # Cap to 32k 27 | return min(context_length, 32768) 28 | except Exception as e: 29 | logger.warning(f"Could not get context length from config for {model_name}: {e}") 30 | return 4096 # Default fallback 31 | 32 | 33 | if __name__ == "__main__": 34 | import argparse 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument("--model_name", type=str, required=True) 37 | args = parser.parse_args() 38 | print(get_context_length(args.model_name)) 39 | -------------------------------------------------------------------------------- /generate/TODO.md: -------------------------------------------------------------------------------- 1 | The tasks is to implement a simple repository for evaluation LLMs on IOI problems. 
2 | 
3 | 
4 | ## Used frameworks
5 | - You should use LiteLLM to call the LLM providers.
6 | - You should use asyncio to run the LLM calls asynchronously.
7 | 
8 | 
9 | ## Steps for evaluation:
10 | - We have a dataset of IOI 2024 problems here: https://huggingface.co/datasets/open-r1/ioi-2024 with the following format:
11 | ```
12 | {
13 |     'name': str,
14 |     'id': str,
15 |     'day': str,
16 |     'subtask': str,
17 |     'statement': str,
18 |     'score': str,
19 |     'time_limit': str,
20 | }
21 | ```
22 | - Each problem is split into multiple subtasks (they have the same id, but a different subtask column). You will iterate over the problems and get their subtasks.
23 | 
24 | - You will then take a problem and the subtasks and call a subtask sample function. The subtask sample function takes a problem and the last integer i, and returns the next subtask to solve.
25 | 
26 | - You will then create a prompt based on the subtasks and call the LLM with a random seed. Then generate the next subtask and repeat until you have 50 generations.
27 | 
28 | - This way you will get 50 generations for each problem, and save them as a dataset into org_id(arg)/model_id(arg). The resulting dataset will have the following format:
29 | ```
30 | {
31 |     'problem_id': str,
32 |     'subtask': str,
33 |     'prompt': str,
34 |     'generation': str,
35 |     'code': str,
36 |     'language': str,
37 |     'model_kwargs': dict,
38 |     'metadata': dict,
39 | }
40 | ```
41 | 
42 | Therefore, if you have 90 problems and 50 generations for each problem, the resulting dataset will have 4500 samples.
--------------------------------------------------------------------------------
/run_tests/piston/launch_single_piston.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=piston_worker
3 | #SBATCH --output=/fsx/guilherme/piston/worker-logs/%x-%j.out
4 | #SBATCH --error=/fsx/guilherme/piston/worker-logs/%x-%j.out # Redirect error logs to .out
5 | #SBATCH --cpus-per-task=2
6 | #SBATCH --mem-per-cpu=1950M
7 | #SBATCH --partition=hopper-cpu
8 | #SBATCH --time=48:00:00
9 | 
10 | # sometimes if a bunch of workers start at the same time pyxis dies
11 | sleep $(( RANDOM % 20 ))
12 | 
13 | # mounting the packages folder lets us not have to manually install the package on each instance
14 | # we use 63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a as the latest image requires isolate, which does not work on the HF science cluster (cgroups incompatibility)
15 | # feel free to try the latest image
16 | # the code you see below increases the very constrained piston default limits, and sets the repo url to the one hosting our IOI package
17 | srun --container-mounts=/fsx/guilherme/ioi2024/piston_files/packages:/piston/packages --container-image "ghcr.io#engineer-man/piston:sha256:63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a" \
18 |     bash -c "
19 | export PISTON_COMPILE_TIMEOUT=60000
20 | export PISTON_RUN_TIMEOUT=60000
21 | export PISTON_OUTPUT_MAX_SIZE=1000000000
22 | export PISTON_MAX_FILE_SIZE=1000000000
23 | export PISTON_DISABLE_NETWORKING=true
24 | export PISTON_REPO_URL=https://github.com/guipenedo/piston/releases/download/pkgs/index
25 | 
26 | sed -i '/app.use(body_parser.urlencoded/c\ app.use(body_parser.urlencoded({ extended: true, limit: \"512mb\" }));' src/index.js
27 | sed -i '/app.use(body_parser.json/c\ app.use(body_parser.json({ limit: \"512mb\" }));' src/index.js
28 | 
29 | # Start server in background
30 | node src
31 | "
32 | 
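Once a worker job launched by the script above is running, it can be sanity-checked over HTTP before pointing the evaluation at it. A hedged sketch using `requests` (already used in `generate/utils/open_router_usage.py`); the worker address is illustrative and the endpoints assume the standard Piston v2 API:

```python
import requests

# Illustrative address; real workers expose the port encoded in their piston-worker-<PORT> job name
worker = "http://ip-10-53-86-146:1234"

# List packages known to the worker; after first-time setup this should include cms_ioi 1.0.0
print(requests.get(f"{worker}/api/v2/packages", timeout=10).json())

# Install the IOI package on a fresh worker (same request as the curl example in run_tests/piston/README.md)
resp = requests.post(
    f"{worker}/api/v2/packages",
    json={"language": "cms_ioi", "version": "1.0.0"},
    timeout=600,
)
print(resp.status_code, resp.text)
```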
--------------------------------------------------------------------------------
/run_tests/utils.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from functools import lru_cache
3 | from itertools import islice
4 | 
5 | from datasets import load_dataset
6 | 
7 | 
8 | def add_includes(code: str, problem_id: str) -> str:
9 |     """
10 |     Fix common compilation errors for IOI problems.
11 |     """
12 |     if not code:
13 |         return code
14 |     # has most of the useful functions
15 |     code_header = '#include <bits/stdc++.h>\n'
16 |     # include the problem header
17 |     problem_header_include = f'#include "{problem_id}.h"'
18 |     if problem_header_include not in code:
19 |         code_header += problem_header_include + '\n'
20 |     # use namespace std since models forget std:: often
21 |     if "using namespace std;" not in code and "std::" not in code:
22 |         code_header += "\nusing namespace std;\n\n"
23 |     return code_header + code
24 | 
25 | @lru_cache
26 | def load_ioi_tests_for_year(year: int) -> dict[str, dict[str, tuple[str, str]]]:
27 |     """
28 |     Load IOI tests for a given year.
29 |     """
30 |     tests_dataset = load_dataset("open-r1/ioi-test-cases", name=f"{year}", split="train")
31 |     test_cases = defaultdict(dict)
32 |     for test_case in tests_dataset:
33 |         test_cases[test_case['problem_id']][test_case['test_name']] = test_case['test_input'], test_case['test_output']
34 |     return test_cases
35 | 
36 | def load_ioi_tests(year: int, problem_id: str) -> dict[str, tuple[str, str]]:
37 |     """
38 |     Load IOI tests for a given year and problem id.
39 |     """
40 |     return load_ioi_tests_for_year(year)[problem_id]
41 | 
42 | def batched(iterable, n):
43 |     "Batch data into lists of length n. The last batch may be shorter."
44 |     # batched('ABCDEFG', 3) --> ABC DEF G
45 |     if n < 1:
46 |         return iterable
47 |     it = iter(iterable)
48 |     while (batch := list(islice(it, n))):
49 |         yield batch
--------------------------------------------------------------------------------
/run_tests/custom_setup/compile:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | manager_files=() # Array to store manager filenames
4 | current_dir="$(pwd)"
5 | 
6 | # Checker compilation path
7 | checker_dir="$current_dir/checker"
8 | checker_src="$checker_dir/checker.cpp"
9 | 
10 | if [ -e "$checker_src" ]; then
11 |     echo "Compiling checker"
12 |     checker_exe="$checker_dir/checker"
13 |     g++ -x c++ -std=gnu++17 -O2 -o "$checker_exe" "$checker_src"
14 |     chmod +x "$checker_exe"
15 |     if [ $? -ne 0 ]; then
16 |         echo "Could not compile checker" >&2
17 |         exit 1
18 |     fi
19 |     echo "Compiled checker"
20 | else
21 |     echo "No checker found at $checker_src"
22 | fi
23 | 
24 | # Graders path
25 | graders_dir="$current_dir/graders"
26 | if [ ! -e "$graders_dir" ]; then
27 |     echo "Grader folder was not found" >&2
28 |     exit 1
29 | fi
30 | 
31 | # Find and compile manager if it exists
32 | manager_src="$graders_dir/manager.cpp"
33 | if [ -e "$manager_src" ]; then
34 |     echo "Compiling manager"
35 |     manager_exe="$graders_dir/manager"
36 |     g++ -x c++ -std=gnu++17 -O2 -o "$manager_exe" "$manager_src"
37 |     chmod +x "$manager_exe"
38 |     if [ $?
-ne 0 ]; then 39 | echo "Could not compile manager" >&2 40 | exit 1 41 | fi 42 | manager_files+=("manager") 43 | fi 44 | 45 | # Process other graders 46 | graders_list=($(ls "$graders_dir" | grep -v 'manager.cpp')) 47 | for grader_name in "${graders_list[@]}"; do 48 | manager_files+=("$grader_name") 49 | done 50 | 51 | # Extract problem name and compile necessary files 52 | problem_name='?' 53 | for file in "${manager_files[@]}"; do 54 | if [[ "$file" == *.h && "$file" != "testlib.h" ]]; then 55 | problem_name="${file%.h}" 56 | echo "Problem name: $problem_name" 57 | break 58 | fi 59 | done 60 | 61 | files_to_compile=("graders/$problem_name.cpp") 62 | [ -e graders/grader.cpp ] && files_to_compile+=("graders/grader.cpp") 63 | [ -e graders/stub.cpp ] && files_to_compile+=("graders/stub.cpp") 64 | 65 | g++ -DEVAL -std=gnu++17 -O2 -pipe -s -o graders/"$problem_name" "${files_to_compile[@]}" 66 | if [ $? -ne 0 ]; then 67 | echo "Failed to compile $problem_name" >&2 68 | exit 1 69 | fi 70 | chmod +x graders/"$problem_name" 71 | echo "Compiled $problem_name from ${files_to_compile[@]} successfully" 72 | 73 | echo "Manager files: ${manager_files[@]}" -------------------------------------------------------------------------------- /generate/utils/open_router_usage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | from dotenv import load_dotenv 4 | 5 | # Load environment variables from .env file 6 | load_dotenv() 7 | 8 | def get_openrouter_usage_and_credits(): 9 | # Get API key from environment variables 10 | api_key = os.getenv('OPENROUTER_API_KEY') 11 | 12 | if not api_key: 13 | raise ValueError("OPENROUTER_API_KEY not found in environment variables") 14 | 15 | # API endpoints 16 | usage_url = "https://openrouter.ai/api/v1/auth/key" 17 | credits_url = "https://openrouter.ai/api/v1/credits" 18 | 19 | # Headers required for OpenRouter API 20 | headers = { 21 | "Authorization": f"Bearer {api_key}", 22 | } 23 | 24 | try: 25 | # Fetch usage data 26 | usage_response = requests.get(usage_url, headers=headers) 27 | usage_response.raise_for_status() # Raise an exception for bad status codes 28 | usage_data = usage_response.json() 29 | 30 | # Fetch credits data 31 | credits_response = requests.get(credits_url, headers=headers) 32 | print(credits_response.json()) 33 | credits_response.raise_for_status() # Raise an exception for bad status codes 34 | credits_data = credits_response.json() 35 | 36 | return usage_data, credits_data 37 | except requests.exceptions.RequestException as e: 38 | print(f"Error fetching data: {e}") 39 | return None, None 40 | 41 | if __name__ == "__main__": 42 | usage_data, credits_data = get_openrouter_usage_and_credits() 43 | if usage_data: 44 | print("OpenRouter Usage Information:") 45 | data = usage_data.get('data', {}) 46 | print(f"Label: {data.get('label', 'N/A')}") 47 | print(f"Limit: {data.get('limit', 'N/A')}") 48 | print(f"Usage: {data.get('usage', 'N/A')}") 49 | print(f"Limit Remaining: {data.get('limit_remaining', 'N/A')}") 50 | print(f"Is Free Tier: {data.get('is_free_tier', 'N/A')}") 51 | rate_limit = data.get('rate_limit', {}) 52 | print(f"Rate Limit Requests: {rate_limit.get('requests', 'N/A')}") 53 | print(f"Rate Limit Interval: {rate_limit.get('interval', 'N/A')}") 54 | 55 | if credits_data: 56 | print("OpenRouter Credits Information:") 57 | data = credits_data.get('data', {}) 58 | print(f"Total Credits: {data.get('total_credits', 'N/A')}") 59 | print(f"Total Usage: {data.get('total_usage', 
'N/A')}") 60 | # Print any other relevant information from the response -------------------------------------------------------------------------------- /generate/slurm_standalone/serve_router.slurm: -------------------------------------------------------------------------------- 1 | # Credits to Anton Lozhkov 2 | #!/bin/bash 3 | #SBATCH --job-name=r1-router 4 | #SBATCH --partition=hopper-cpu 5 | #SBATCH --qos=high 6 | #SBATCH --nodes=1 7 | #SBATCH --cpus-per-task=8 8 | #SBATCH --mem-per-cpu=1875m 9 | #SBATCH --output=./logs/%x_%j_%n.out 10 | #SBATCH --error=./logs/%x_%j_%n.err 11 | #SBATCH --time=30-00:00:00 12 | #SBATCH --requeue 13 | 14 | set -exuo pipefail 15 | 16 | # Configuration variables 17 | ROUTER_PORT=39876 18 | SERVER_PORT=39877 # Must match the server script 19 | HEALTH_CHECK_TIMEOUT=10 # Timeout for health checks (seconds) 20 | UV_ENV=/fsx/hynek_kydlicek/projects/ioi-leaderboard/ioi-eval 21 | 22 | trap 'scontrol requeue ${SLURM_JOB_ID}; exit 15' SIGUSR1 23 | 24 | # Environment setup 25 | source ~/.bashrc 26 | source $UV_ENV/bin/activate 27 | 28 | # Start the router 29 | python -m sglang_router.launch_router \ 30 | --port "$ROUTER_PORT" \ 31 | --host 0.0.0.0 \ 32 | --worker-startup-timeout-secs 300 & 33 | 34 | ROUTER_PID=$! 35 | 36 | # Wait for router to start 37 | sleep 10 38 | if ! curl -s -o /dev/null "http://localhost:${ROUTER_PORT}/health"; then 39 | echo "Router failed to start" 40 | kill $ROUTER_PID 41 | exit 1 42 | fi 43 | 44 | echo "Router started successfully on port $ROUTER_PORT" 45 | echo "Scanning for running r1-server instances..." 46 | 47 | # Get a list of r1-server job IDs and register their servers 48 | while IFS= read -r jobid; do 49 | [[ -z "$jobid" ]] && continue 50 | 51 | # Use scontrol to get the nodelist for this job 52 | nodelist=$(scontrol show job "$jobid" | grep NodeList | tail -n1 | grep -oP 'NodeList=ip[^ ]+') 53 | [[ -z "$nodelist" ]] && continue 54 | nodelist=${nodelist#NodeList=} 55 | 56 | # Get first node from the nodelist 57 | first_node=$(scontrol show hostnames "$nodelist" | head -n1) 58 | [[ -z "$first_node" ]] && continue 59 | [[ "$first_node" == "(null)" ]] && continue 60 | 61 | # Convert hostname to IP format 62 | server_ip=$(echo "$first_node" | sed -E 's/ip-([0-9]+)-([0-9]+)-([0-9]+)-([0-9]+)/\1.\2.\3.\4/') 63 | server_url="http://${server_ip}:${SERVER_PORT}" 64 | 65 | echo "Found server node: $first_node (${server_ip})" 66 | 67 | # Check if server is responding and register it 68 | if timeout "$HEALTH_CHECK_TIMEOUT" curl -s -o /dev/null "http://${server_ip}:${SERVER_PORT}/health"; then 69 | if curl -s -X POST "http://localhost:${ROUTER_PORT}/add_worker?url=${server_url}"; then 70 | echo "Successfully registered $server_url" 71 | else 72 | echo "Failed to register $server_url" 73 | fi 74 | else 75 | echo "Server at $server_url not healthy yet, skipping registration" 76 | fi 77 | done < <(squeue -h -u "$USER" -n r1-server -t RUNNING -o "%i") 78 | 79 | # Just keep router running and healthy 80 | while true; do 81 | if ! 
curl -s -o /dev/null "http://localhost:${ROUTER_PORT}/health"; then 82 | echo "Error: Router health check failed" 83 | exit 1 84 | fi 85 | sleep 300 86 | done -------------------------------------------------------------------------------- /generate/slurm_standalone/debug.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ioi-eval-Qwen-Qwen2.5-7B-Instruct 3 | #SBATCH --partition=hopper-prod 4 | #SBATCH --qos=normal 5 | #SBATCH --nodes=1 6 | #SBATCH --gpus-per-node=8 7 | #SBATCH --exclusive 8 | #SBATCH --output=/fsx/hynek_kydlicek/logs/ioi-eval/ioi-eval-Qwen-Qwen2.5-7B-Instruct/%j-%x.out 9 | #SBATCH --error=/fsx/hynek_kydlicek/logs/ioi-eval/ioi-eval-Qwen-Qwen2.5-7B-Instruct/%j-%x.out 10 | #SBATCH --time=7-00:00:00 11 | #SBATCH --ntasks-per-node=1 12 | 13 | set -exuo pipefail 14 | 15 | SERVER_PORT=39877 16 | DIST_PORT=45000 17 | UV_ENV=/fsx/hynek_kydlicek/projects/ioi-leaderboard/test 18 | 19 | # random sleep (0-100) to prevent ddosing server 20 | sleep $((RANDOM % 100 + 1)) 21 | 22 | # Environment configuration 23 | export OUTLINES_CACHE_DIR=/scratch/serve_r1/ocache/ 24 | export TRITON_HOME=/scratch/serve_r1/triton/ 25 | export GLOO_SOCKET_IFNAME="enp71s0" 26 | export NCCL_SOCKET_IFNAME="enp71s0" 27 | 28 | # Evaluation script path 29 | EVAL_SCRIPT_PATH="/fsx/hynek_kydlicek/projects/ioi-leaderboard/evaluate.py" 30 | 31 | module load cuda/12.4 32 | source ~/.bashrc 33 | 34 | # Activate uv 35 | source $UV_ENV/bin/activate 36 | 37 | # FIRST_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1) 38 | # FIRST_NODE_IP=$(srun --nodes=1 -w "$FIRST_NODE" hostname --ip-address) 39 | FIRST_NODE_IP="$(hostname --ip-address)" 40 | 41 | # Launch servers synchronously across all nodes 42 | bash -c "python -m sglang.launch_server \ 43 | --model-path 'Qwen/Qwen2-0.5B' \ 44 | --tp 2 \ 45 | --dist-init-addr '$FIRST_NODE_IP:$DIST_PORT' \ 46 | --nnodes 1 \ 47 | --node-rank \$SLURM_PROCID \ 48 | --port '$SERVER_PORT' \ 49 | --host 0.0.0.0 \ 50 | --trust-remote-code \ 51 | --max-running-requests 100 \ 52 | --context-length 4096" & 53 | 54 | # Wait for server with timeout 55 | TIMEOUT=3600 # 1h, but model loading should take ~30min 56 | START_TIME=$(date +%s) 57 | echo "Waiting for SGLang server (http://$FIRST_NODE_IP:$SERVER_PORT)..." 58 | 59 | while true; do 60 | if curl -s -o /dev/null -w "%{http_code}" "http://$FIRST_NODE_IP:$SERVER_PORT/health" >/dev/null 2>&1; then 61 | echo "Server is ready at http://$FIRST_NODE_IP:$SERVER_PORT" 62 | break 63 | fi 64 | 65 | CURRENT_TIME=$(date +%s) 66 | if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then 67 | echo "Error: Server failed to start within $TIMEOUT seconds" 68 | exit 1 69 | fi 70 | 71 | echo "Still waiting... ($(($CURRENT_TIME - $START_TIME)) seconds elapsed)" 72 | sleep 60 73 | done 74 | 75 | echo "Checking available models..." 76 | curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/models" 77 | sleep 10 78 | 79 | echo "Executing sanity check..." 
80 | curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/completions" \
81 |     -H "Content-Type: application/json" \
82 |     -d '{
83 |         "model": "default",
84 |         "prompt": "hi, how are you?",
85 |         "max_tokens": 2048,
86 |         "temperature": 0.6
87 |     }'
88 | 
89 | python "$EVAL_SCRIPT_PATH" \
90 |     --model_id "sglang/Qwen/Qwen2-0.5B" \
91 |     --api_base "http://localhost:$SERVER_PORT/v1" \
92 |     --concurrency 100 \
93 |     --org_id=ioi-leaderboard --num_problems=6 --num_generations=1 --model_postfix=test --num_subtasks=2 --override
94 | 
95 | # Kill the server and exit
96 | pkill -f "python -m sglang.launch_server"
97 | exit 0
98 | 
--------------------------------------------------------------------------------
/run_tests/piston/README.md:
--------------------------------------------------------------------------------
1 | # Piston workers (Slurm)
2 | 
3 | We have built a [piston](https://github.com/engineer-man/piston) package to run IOI problems.
4 | 
5 | To launch a fleet of piston workers on a Slurm cluster, you can adapt the `/fsx` paths in `launch_piston_workers.sh` and `launch_single_piston.sh` and run:
6 | ```bash
7 | ./launch_piston_workers.sh (number of workers to launch)
8 | ```
9 | 
10 | This command will launch a Slurm job for each worker, which will be called `piston-worker-<port>`, where `<port>` is the port where the worker will be listening.
11 | 
12 | > [!TIP]
13 | > To accelerate evaluation, we recommend spinning up as many Piston workers as possible. For example, our evaluations are typically run with 1,500 workers.
14 | 
15 | ## First time setup
16 | 
17 | You will need to install the [IOI package](https://github.com/guipenedo/piston/tree/master/packages/cms_ioi/1.0.0) in the workers. To do so, run the following steps:
18 | 
19 | 1. Launch a single worker:
20 | ```bash
21 | ./launch_piston_workers.sh 1
22 | ```
23 | 
24 | 2. Assuming it's running on `ip-10-53-86-146:1234`, send the package install request:
25 | ```bash
26 | curl -X POST http://ip-10-53-86-146:1234/api/v2/packages -H "Content-Type: application/json" -d '{"language": "cms_ioi", "version": "1.0.0"}'
27 | ```
28 | 
29 | 3. You can now launch more workers; thanks to the shared mounted packages directory, they should already have the package installed.
30 | 
31 | To have the main script find the workers automatically, you can export the following environment variable:
32 | ```bash
33 | export PISTON_ENDPOINTS=slurm
34 | ```
35 | 
36 | You can also change `PISTON_MAX_REQUESTS_PER_ENDPOINT`, which limits how many simultaneous requests each worker will handle (1 by default). Keep in mind that this is a local, per-process limit: in distributed setups there is no global limit, so workers might sometimes be overwhelmed when several processes hit the same worker.
37 | 
38 | # Piston workers (local docker)
39 | This will launch a single worker in a docker container. Consider launching multiple workers for better scalability. Replace 2000 with the port you want to use.
40 | Make sure to change `/path/to/local/packages` to the path you want to persist for package installs.
41 | 42 | ```bash 43 | docker run -d \ 44 | --name piston_worker \ 45 | -v /path/to/local/packages:/piston/packages \ 46 | -e PORT=2000 \ 47 | -e PISTON_COMPILE_TIMEOUT=60000 \ 48 | -e PISTON_RUN_TIMEOUT=60000 \ 49 | -e PISTON_OUTPUT_MAX_SIZE=1000000000 \ 50 | -e PISTON_MAX_FILE_SIZE=1000000000 \ 51 | -e PISTON_DISABLE_NETWORKING=true \ 52 | -e PISTON_REPO_URL=https://github.com/guipenedo/piston/releases/download/pkgs/index \ 53 | -p 2000:2000 \ 54 | --entrypoint /bin/bash \ 55 | ghcr.io/engineer-man/piston@sha256:63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a \ 56 | -c "sed -i '/app.use(body_parser.urlencoded/c\ app.use(body_parser.urlencoded({ extended: true, limit: \"512mb\" }));' src/index.js && \ 57 | sed -i '/app.use(body_parser.json/c\ app.use(body_parser.json({ limit: \"512mb\" }));' src/index.js && \ 58 | node src" 59 | ``` 60 | 61 | Install the package: 62 | ```bash 63 | curl -X POST http://localhost:2000/api/v2/packages -H "Content-Type: application/json" -d '{"language": "cms_ioi", "version": "1.0.0"}' 64 | ``` 65 | 66 | Remember to set `PISTON_ENDPOINTS`: 67 | ```bash 68 | export PISTON_ENDPOINTS=http://localhost:2000/api/v2,http://localhost:2001/api/v2,http://localhost:2002/api/v2 69 | ``` 70 | -------------------------------------------------------------------------------- /generate/README.md: -------------------------------------------------------------------------------- 1 | # IOI Problem Evaluation 2 | 3 | This repository contains code for evaluating Language Models on IOI 2024 problems using LiteLLM. 4 | 5 | ## Installation 6 | 7 | 1. Clone the repository 8 | 2. Create a virtual environment with `uv` (to install `uv`, follow the [UV Installation Guide](https://docs.astral.sh/uv/getting-started/installation/)): 9 | ```bash 10 | uv venv ioi --python 3.11 && source ioi/bin/activate && uv pip install --upgrade pip 11 | ``` 12 | 3. Install dependencies: 13 | ```bash 14 | 15 | uv pip install torch~=2.5.1 --index-url https://download.pytorch.org/whl/cu124 16 | uv pip install sgl-kernel --force-reinstall --no-deps 17 | uv pip install "sglang[all]>=0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/ 18 | uv pip install -r requirements.txt 19 | ``` 20 | 21 | ## Environment Setup (In case you want to use remote models) 22 | 23 | 1. Copy the environment template: 24 | ```bash 25 | cp .env.template .env 26 | ``` 27 | 28 | 2. 
Edit `.env` and: 29 | - Uncomment the variables for the LLM providers you plan to use 30 | - Replace the placeholder values with your actual API keys 31 | - Optional: Configure proxy settings if needed 32 | 33 | Example `.env` for using OpenAI's GPT-4: 34 | ```bash 35 | OPENAI_API_KEY=your_actual_key_here 36 | OPENAI_ORGANIZATION=your_org_id # Optional 37 | ``` 38 | 39 | ## Usage 40 | 41 | ### Running with Remote Models 42 | 43 | Run the evaluation with remote models: 44 | ```bash 45 | python evaluate.py --org_id YOUR_ORG_ID --model_id YOUR_MODEL_ID [--num_generations 50] [--concurrency 5] 46 | ``` 47 | 48 | Command line arguments: 49 | - `--org_id`: Organization ID (required) 50 | - `--model_id`: Model ID in LiteLLM format (required) 51 | - `--api_base`: API base URL for the model (optional) 52 | - `--num_generations`: Number of generations per problem (default: 50) 53 | - `--num_retries`: Number of retries for failed API calls (default: 10) 54 | - `--concurrency`: Number of concurrent generations (default: 20) 55 | - `--num_problems`: Number of problems to evaluate (default: all) 56 | - `--num_subtasks`: Number of subtasks to evaluate per problem (default: 1, use -1 for all) 57 | - `--dry_run`: Run without making actual LLM calls 58 | - `--override`: Override existing results and start fresh 59 | - `--model_postfix`: Postfix for the model name 60 | - `--revision`: Revision to use for the model 61 | - `--timeout`: Timeout for the LLM call in seconds (default: 600) 62 | - `--use_requests`: Use requests instead of litellm 63 | - `--max_tokens`: Maximum number of tokens for generation 64 | 65 | ### Running with Locally Deployed Models (SGLang) 66 | 67 | For locally deployed models using SGLang, you can use the provided scripts: 68 | 69 | #### Using SLURM for Distributed Deployment 70 | 71 | For HPC environments with SLURM, use `run_ioi_slurm.py` to evaluate open models: 72 | 73 | ```bash 74 | python run_ioi_slurm.py --model "MODEL_PATH" --concurrency 30 --startup_delay 7200 --logs_dir "DIR_FOR_OUTPUT_LOGS" --slurm_dir "DIR_FOR_SLUR_SCRIPT" --uv_env "PATH_TO_UV_ENV" --eval_args "--org_id YOUR_ORG_ID" 75 | ``` 76 | 77 | ## Output 78 | 79 | The results will be saved in directory specified by `--logs_dir` with structure: 80 | 81 | ``` 82 | {org_id}/{revision}-{model_id}-{postfix}/ 83 | ``` 84 | 85 | The output includes: 86 | - Generated code solutions for each problem and subtask 87 | - Metrics on generation performance 88 | - Token usage statistics 89 | 90 | You can analyze the results using the saved data to evaluate the model's performance on competitive programming tasks. 
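For a quick look at what was produced, a results dataset can be inspected the same way `utils/check_failures.py` does. A minimal sketch; the dataset id below is illustrative and follows the `{org_id}/ioi-eval-sglang_<model>-<postfix>` naming used by that script:

```python
from datasets import load_dataset

# Illustrative id; real ids look like ioi-eval-sglang_<model with / replaced by _>[-postfix]
ds = load_dataset("ioi-leaderboard/ioi-eval-sglang_Qwen_Qwen2.5-7B-Instruct-test", split="train")

empty = sum(1 for gen in ds["generation"] if gen is None or gen == "")
print(f"{empty} empty generations out of {len(ds)} rows")
```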
-------------------------------------------------------------------------------- /run_tests/selection_simulator.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | 3 | 4 | def get_problem_scores(selected_dataset_samples) -> float: 5 | if not selected_dataset_samples: 6 | return 0.0 7 | 8 | subtask_scores = { 9 | subtask['subtask']: 0 for subtask in selected_dataset_samples[0]['all_subtasks_results'] 10 | } 11 | 12 | for submission in selected_dataset_samples: 13 | for subtask_result in submission['all_subtasks_results']: 14 | subtask_scores[subtask_result['subtask']] = max(subtask_scores[subtask_result['subtask']], subtask_result['weighted_score']) 15 | 16 | return sum(subtask_scores.values()) 17 | 18 | def get_submission_cot_length(submission) -> int: 19 | if "metadata" in submission: 20 | if 'output_tokens' in submission['metadata']['usage']: 21 | return submission['metadata']['usage']['output_tokens'] 22 | return submission['metadata']['usage']['completion_tokens'] 23 | # no token info. use pure length 24 | if 'generation' in submission: 25 | return len(submission['generation']) 26 | # crap... 27 | return 0 28 | 29 | def simulate_round_robin(all_submissions) -> float: 30 | if not all_submissions: 31 | return 0 32 | 33 | subtasks = [x['subtask'] for x in all_submissions[0]['all_subtasks_results']] 34 | submissions_by_target_subtask = {subtask: [] for subtask in subtasks} 35 | 36 | for submission in all_submissions: 37 | # if it failed to compile, skip 38 | if not submission["code"] or submission['all_subtasks_results'][0]['status'] == 'CE': 39 | continue 40 | submissions_by_target_subtask[submission['target_subtask']].append(submission) 41 | 42 | for target_subtask in submissions_by_target_subtask: 43 | # we only have access to the first subtask (examples/public test) 44 | submissions_by_target_subtask[target_subtask] = deque( 45 | sorted(submissions_by_target_subtask[target_subtask], 46 | key=lambda x: (x['all_subtasks_results'][0]['score'], get_submission_cot_length(x)), 47 | reverse=True) 48 | ) 49 | 50 | exhausted_subtasks = set([subtask for subtask in submissions_by_target_subtask if len(submissions_by_target_subtask[subtask]) == 0]) 51 | solved_subtasks = set([subtasks[0]]) # we don't explicitly care about solving the examples 52 | 53 | # only up to 50 submissions 54 | selected_submissions = [] 55 | 56 | subtask_i = len(subtasks) - 1 57 | 58 | while len(selected_submissions) < 50 and len(exhausted_subtasks.union(solved_subtasks)) < len(subtasks): 59 | subtask = subtasks[subtask_i] 60 | if subtask not in solved_subtasks and subtask not in exhausted_subtasks: 61 | sol = submissions_by_target_subtask[subtask].popleft() 62 | selected_submissions.append(sol) 63 | for subtask_to_check in range(len(sol['all_subtasks_results'])): 64 | if sol['all_subtasks_results'][subtask_to_check]['score'] == 1.0: 65 | solved_subtasks.add(subtask_to_check) 66 | if len(submissions_by_target_subtask[subtask]) == 0: 67 | exhausted_subtasks.add(subtask) 68 | subtask_i = (subtask_i - 1) % len(subtasks) 69 | 70 | remaining_submissions = deque(sorted( 71 | [submission for subtask_submissions in submissions_by_target_subtask.values() for submission in subtask_submissions], 72 | key=lambda x: (x['all_subtasks_results'][0]['score'], get_submission_cot_length(x), subtasks.index(x['target_subtask']) if x['target_subtask'] in subtasks else 0), reverse=True) 73 | ) 74 | while len(selected_submissions) < 50 and remaining_submissions: 75 | 
selected_submissions.append(remaining_submissions.popleft()) 76 | 77 | return selected_submissions 78 | -------------------------------------------------------------------------------- /generate/slurm_standalone/serve_r1.slurm: -------------------------------------------------------------------------------- 1 | # Credits to Anton Lozhkov 2 | #!/bin/bash 3 | #SBATCH --job-name=r1-server 4 | #SBATCH --partition=hopper-prod 5 | #SBATCH --qos=normal 6 | #SBATCH --nodes=2 7 | #SBATCH --gpus-per-node=8 8 | #SBATCH --exclusive 9 | #SBATCH --output=./logs/%x_%j_%n.out 10 | #SBATCH --error=./logs/%x_%j_%n.err 11 | #SBATCH --time=7-00:00:00 12 | #SBATCH --ntasks-per-node=1 13 | 14 | set -exuo pipefail 15 | 16 | MODEL_PATH="deepseek-ai/DeepSeek-R1" 17 | UV_ENV=/fsx/hynek_kydlicek/projects/ioi-leaderboard/ioi-eval 18 | ROUTER_ADDRESS="" 19 | SERVER_PORT=39877 20 | DIST_PORT=45000 21 | 22 | # TODO: Adjust these variables to your cluster configuration 23 | export OUTLINES_CACHE_DIR=/scratch/serve_r1/ocache/ 24 | export TRITON_HOME=/scratch/serve_r1/triton/ 25 | export GLOO_SOCKET_IFNAME="enp71s0" 26 | export NCCL_SOCKET_IFNAME="enp71s0" 27 | 28 | while getopts "m:e:r:h" opt; do 29 | case $opt in 30 | m) MODEL_PATH="$OPTARG" ;; 31 | r) ROUTER_ADDRESS="$OPTARG" ;; 32 | h|?) echo "Usage: sbatch $0 [-m MODEL_PATH] [-r ROUTER_ADDRESS]"; exit 1 ;; 33 | esac 34 | done 35 | 36 | # TODO: Environment setup, adjust to your cluster configuration 37 | module load cuda/12.4 38 | source ~/.bashrc 39 | source $UV_ENV/bin/activate 40 | 41 | FIRST_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1) 42 | FIRST_NODE_IP=$(srun --nodes=1 --ntasks=1 -w "$FIRST_NODE" hostname --ip-address) 43 | 44 | # Launch servers synchronously across all nodes 45 | # (--max-running-requests=56 is rough estimate to avoid too many evicted/preempted 16k-long requests) 46 | srun --nodes=2 --ntasks=2 --ntasks-per-node=1 \ 47 | bash -c "python -m sglang.launch_server \ 48 | --model-path '$MODEL_PATH' \ 49 | --tp 16 \ 50 | --dist-init-addr '$FIRST_NODE_IP:$DIST_PORT' \ 51 | --nnodes 2 \ 52 | --node-rank \$SLURM_PROCID \ 53 | --port '$SERVER_PORT' \ 54 | --host 0.0.0.0 \ 55 | --trust-remote-code \ 56 | --max-running-requests 24 \ 57 | --context-length 65536" & 58 | 59 | # Wait for server with timeout 60 | TIMEOUT=3600 # 1h, but model loading should take ~30min 61 | START_TIME=$(date +%s) 62 | echo "Waiting for SGLang server (http://$FIRST_NODE_IP:$SERVER_PORT)..." 63 | 64 | while true; do 65 | if curl -s -o /dev/null -w "%{http_code}" "http://$FIRST_NODE_IP:$SERVER_PORT/health" >/dev/null 2>&1; then 66 | echo "Server is ready at http://$FIRST_NODE_IP:$SERVER_PORT" 67 | break 68 | fi 69 | 70 | CURRENT_TIME=$(date +%s) 71 | if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then 72 | echo "Error: Server failed to start within $TIMEOUT seconds" 73 | exit 1 74 | fi 75 | 76 | echo "Still waiting... ($(($CURRENT_TIME - $START_TIME)) seconds elapsed)" 77 | sleep 60 78 | done 79 | 80 | # Register with router only if address was provided 81 | if [ -n "$ROUTER_ADDRESS" ]; then 82 | echo "Registering with router at $ROUTER_ADDRESS..." 83 | curl -X POST "http://$ROUTER_ADDRESS/add_worker?url=http://$FIRST_NODE_IP:$SERVER_PORT" || true 84 | sleep 10 85 | fi 86 | 87 | echo "Checking available models..." 88 | curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/models" 89 | sleep 10 90 | 91 | echo "Executing sanity check..." 
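# NOTE: the raw /v1/completions request below hand-builds DeepSeek-R1's chat template tokens
# (<|begin▁of▁sentence|><|User|>...<|Assistant|>) because the plain completions endpoint does not
# apply a chat template, unlike /v1/chat/completions.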
92 | curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/completions" \ 93 | -H "Content-Type: application/json" \ 94 | -d "{ 95 | \"model\": \"default\", 96 | \"prompt\": \"<|begin▁of▁sentence|><|User|>hi, how are you?<|Assistant|>\", 97 | \"max_tokens\": 2048, 98 | \"temperature\": 0.6 99 | }" 100 | 101 | # Keep the job running with health checks 102 | while true; do 103 | if ! curl -s -o /dev/null "http://$FIRST_NODE_IP:$SERVER_PORT/health"; then 104 | echo "Error: Server health check failed" 105 | exit 1 106 | fi 107 | sleep 300 108 | done -------------------------------------------------------------------------------- /generate/utils/check_failures.py: -------------------------------------------------------------------------------- 1 | from huggingface_hub import HfApi 2 | import argparse 3 | from datasets import load_dataset 4 | 5 | # Parse command line arguments 6 | parser = argparse.ArgumentParser(description="Check which models failed to create datasets on HuggingFace Hub") 7 | parser.add_argument("--model_postfix", default="new-prompt", help="Postfix for the model name") 8 | parser.add_argument("--org_id", default="ioi-leaderboard", help="Organization ID") 9 | parser.add_argument("--check_generations", action="store_true", help="Check that all generations are not null or empty") 10 | args = parser.parse_args() 11 | 12 | # Initialize the Hugging Face API 13 | api = HfApi() 14 | 15 | # Organization ID where datasets are stored 16 | org_id = args.org_id 17 | 18 | # Read models from the file 19 | with open("models_to_run.txt", "r") as f: 20 | models = [line.strip() for line in f if line.strip()] 21 | 22 | # Get all datasets in the organization 23 | try: 24 | all_datasets = api.list_datasets(author=org_id) 25 | dataset_names = [dataset.id for dataset in all_datasets] 26 | except Exception as e: 27 | print(f"Error fetching datasets: {e}") 28 | dataset_names = [] 29 | 30 | # Check which models have datasets 31 | successful_models = [] 32 | failed_models = [] 33 | incomplete_models = [] 34 | 35 | for model in models: 36 | # Format the model name the same way as in the evaluator 37 | model_name = f"ioi-eval-sglang_{model.replace('/', '_')}" 38 | 39 | # Check if there's a model with the specified postfix 40 | if args.model_postfix: 41 | model_name_with_postfix = f"{model_name}-{args.model_postfix}" 42 | else: 43 | model_name_with_postfix = model_name 44 | 45 | # Full dataset path 46 | full_dataset_path = f"{org_id}/{model_name_with_postfix}" 47 | 48 | if full_dataset_path in dataset_names: 49 | # Dataset exists 50 | if args.check_generations: 51 | try: 52 | # Load the dataset to check generations 53 | dataset = load_dataset(full_dataset_path, split="train") 54 | 55 | # Check if any generations are null or empty 56 | null_or_empty = sum(1 for gen in dataset["generation"] if gen is None or gen == "") 57 | 58 | if null_or_empty > 0: 59 | print(f"Model {model} has {null_or_empty} null or empty generations out of {len(dataset)}") 60 | incomplete_models.append(model) 61 | else: 62 | successful_models.append(model) 63 | except Exception as e: 64 | print(f"Error checking generations for {model}: {e}") 65 | failed_models.append(model) 66 | else: 67 | successful_models.append(model) 68 | else: 69 | failed_models.append(model) 70 | 71 | # Print results 72 | print(f"Total models: {len(models)}") 73 | print(f"Successful models: {len(successful_models)}") 74 | print(f"Failed models: {len(failed_models)}") 75 | if args.check_generations: 76 | print(f"Models with incomplete generations: {len(incomplete_models)}") 
77 | 78 | print("\nSuccessful models:") 79 | for model in successful_models: 80 | print(f" - {model}") 81 | 82 | print("\nFailed models:") 83 | for model in failed_models: 84 | print(f" - {model}") 85 | 86 | if args.check_generations and incomplete_models: 87 | print("\nModels with incomplete generations:") 88 | for model in incomplete_models: 89 | print(f" - {model}") 90 | 91 | # Create a new file with failed models 92 | if failed_models: 93 | failed_file = f"failed_models{'-' + args.model_postfix if args.model_postfix else ''}.txt" 94 | with open(failed_file, "w") as f: 95 | for model in failed_models: 96 | f.write(f"{model}\n") 97 | print(f"\nFailed models have been written to {failed_file}") 98 | 99 | # Create a new file with incomplete models 100 | if args.check_generations and incomplete_models: 101 | incomplete_file = f"incomplete_models{'-' + args.model_postfix if args.model_postfix else ''}.txt" 102 | with open(incomplete_file, "w") as f: 103 | for model in incomplete_models: 104 | f.write(f"{model}\n") 105 | print(f"\nIncomplete models have been written to {incomplete_file}") -------------------------------------------------------------------------------- /run_tests/README.md: -------------------------------------------------------------------------------- 1 | # IOI: Running tests 2 | 3 | ## Piston 4 | To evaluate, we rely on Piston (https://github.com/engineer-man/piston) to compile and run the code in a secure and fast sandbox environment. See the [piston](piston/README.md) directory for more details. 5 | To run the evaluation code below, spin up Piston workers and copy the `.env.template` file to `.env` and set the piston variables. 6 | 7 | ## Running the pipeline 8 | Install dependencies: 9 | ```bash 10 | pip install -r requirements.txt 11 | ``` 12 | 13 | Once you have piston setup and running, or have made the necessary changes to the `run_submission` function in [scoring.py](scoring.py) (see below for more info), you can run the pipeline with the following command: 14 | 15 | ```bash 16 | python tests_runner.py [-h] [--local_results_path LOCAL_RESULTS_PATH] [--id_column ID_COLUMN] [--max_concurrent_requests MAX_CONCURRENT_REQUESTS] [--test_batch_size TEST_BATCH_SIZE] [--dry_run] [--override] 17 | [--timeout TIMEOUT] [--add_includes] [--add_messages_column] 18 | datasets_to_evaluate results_dataset_name 19 | ``` 20 | ### Arguments 21 | - `datasets_to_evaluate`: The datasets to evaluate (HF hub ids), separated by commas. Also accepts wildcards on the dataset part, such as `open-r1/models-*-fix` 22 | - `results_dataset_name`: The name of the dataset to save the results to. 23 | - `local_results_path`: Path for local results cache, so that you can restart if the script dies. 24 | - `id_column`: The column name of the unique identifier for each submission. `uuid` by default 25 | - `max_concurrent_requests`: The maximum number of concurrent requests to make to the Piston API. Should be roughly the number of piston workers you have. 26 | - `test_batch_size`: Batch size for testing submissions. Will test this number at a time and then check if any scored 0.0. If so, the remaining tests are skipped. Increase if you have many more workers than submissions. 27 | - `dry_run`: If true, the script will not make any actual API calls to Piston. 28 | - `override`: If true, the script will override existing results in the results dataset. 29 | - `timeout`: Timeout for the Piston API calls. 
30 | - `add_includes`: If true, the script will attempt to fix some basic missing #include directives in the code. 31 | - `add_messages_column`: If true, the script will add the `messages` column to the results dataset formatted for SFT. 32 | 33 | ### Examples 34 | 35 | Running the pipeline on the official contest solutions with 1500 workers: 36 | 37 | ```bash 38 | python tests_runner.py open-r1/ioi-sample-solutions my_org/ioi-sample-solutions-results --id_column label --max_concurrent_requests 1500 39 | ``` 40 | Make sure to compare your results (look at the reports for each problem) to the official contest solutions in the [open-r1/ioi-sample-solutions](https://huggingface.co/datasets/open-r1/ioi-sample-solutions) dataset. 41 | 42 | 43 | Running on a dataset produced by evaluate.py: 44 | 45 | ```bash 46 | python tests_runner.py my_org/my-dataset my_org/my-dataset-results --max_concurrent_requests 1500 47 | ``` 48 | 49 | Besides the actual results dataset, the script will also generate and upload markdown reports to the dataset's repo under `reports/my-dataset/README.md` 50 | 51 | 52 | 53 | ## Evaluating without piston 54 | To evaluate in a different sandbox environment, you should change the `run_submission` function in [scoring.py](scoring.py). It should mount/create the following files inside the sandbox: 55 | - `graders/.cpp`: The submission code. 56 | - `input.txt`: The input for the problem. 57 | - `correct_output.txt`: The expected output for the problem. 58 | - all the files in `grader_files` 59 | Plus the following 2 very important files: 60 | - [`compile`](custom_setup/compile), the command to compile the submission code with all the grader/checker/manager files. 61 | - [`run`](custom_setup/run), the command to orchestrate the execution of the submission code, managers, time limits, output checking, etc. 62 | 63 | As `run` handles time limits, if you require a time limit for a sandbox, you can set a hard limit to 2 or 3 additional seconds from the problem's time limit. 64 | 65 | You should return a tuple of `(score, feedback)` from the function, where `score` is the execution's stdout, and `feedback` its stderr, and need to handle some special failure scenarios such as (piston example): 66 | 67 | ```python 68 | 69 | if 'compile' in response and response['compile']['code'] != 0: 70 | return "0", "Compilation error exit code " + str(response['compile']['code']) + "\n" + response['compile']['stderr'] 71 | 72 | if response['run']['code'] == 1 and "MemoryError" in response['run']['stderr']: 73 | return "0", "Memory limit exceeded" 74 | 75 | # successful result 76 | if response['run']['stdout']: 77 | return response['run']['stdout'], response['run']['stderr'] 78 | 79 | # hard time limit exceeded 80 | if response['run']['signal'] == 'SIGKILL': 81 | return "0", "Time limit exceeded" 82 | 83 | return '0', 'Unknown error' 84 | ``` 85 | -------------------------------------------------------------------------------- /run_tests/custom_setup/run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # disable stack limit so you don't get RE with recursion 3 | ulimit -s unlimited 4 | # some problems have 10MB+ input/output files in their test cases and you might get RE. uncomment if needed 5 | # ulimit -f 2097152 6 | 7 | # Check if grader_config.json exists 8 | if [ ! -f "graders/grader_config.json" ]; then 9 | echo "Error: graders/grader_config.json not found" >&2 10 | echo "Current directory contents:" >&2 11 | find . 
-type f -o -type d | sed -e 's/[^-][^\/]*\// |/g' -e 's/|\([^ ]\)/|-\1/' >&2 12 | exit 1 13 | fi 14 | 15 | # Read task type, code, and time limit from grader_config.json using grep and sed 16 | TASK_TYPE=$(grep -o '"task_type":[^,}]*' graders/grader_config.json | sed 's/"task_type":\s*"\([^"]*\)"/\1/') 17 | TASK_NAME=$(grep -o '"code":[^,}]*' graders/grader_config.json | sed 's/"code":\s*"\([^"]*\)"/\1/') 18 | TIME_LIMIT=$(grep -o '"time_limit":[^,}]*' graders/grader_config.json | sed 's/"time_limit":\s*\([^,}]*\)/\1/') 19 | MEMORY_LIMIT=$(grep -o '"memory_limit":[^,}]*' graders/grader_config.json | sed 's/"memory_limit":\s*\([^,}]*\)/\1/') 20 | TASK_EXECUTABLE="graders/$TASK_NAME" 21 | 22 | # Set memory limit in KB (convert from bytes) 23 | MEMORY_LIMIT_KB=0 24 | if [ -n "$MEMORY_LIMIT" ]; then 25 | MEMORY_LIMIT_KB=$((MEMORY_LIMIT / 1024)) 26 | # Set the memory limit for the entire script and all child processes 27 | ulimit -v $MEMORY_LIMIT_KB 28 | fi 29 | 30 | # "Securely" handle the correct output file 31 | CORRECT_OUTPUT="" 32 | if [ -f "correct_output.txt" ]; then 33 | # Read the content and immediately remove the file 34 | CORRECT_OUTPUT=$(cat correct_output.txt) 35 | rm -f correct_output.txt 36 | fi 37 | 38 | # Create a temporary file for solution output 39 | SOLUTION_OUTPUT=$(mktemp) 40 | 41 | # Global variables for process tracking 42 | declare -a ALL_PIDS 43 | declare -a FIFO_DIRS 44 | 45 | # Define cleanup function - simplified assuming timeout exists 46 | function cleanup { 47 | # Kill all tracked processes silently 48 | exec 2>/dev/null 49 | for pid in "${ALL_PIDS[@]:-}"; do 50 | kill -9 "$pid" 2>/dev/null || true 51 | done 52 | 53 | # Clean up FIFO directories 54 | for dir in "${FIFO_DIRS[@]:-}"; do 55 | [ -d "$dir" ] && rm -rf "$dir" 56 | done 57 | 58 | # Clean up temporary files 59 | rm -f "$SOLUTION_OUTPUT" || true 60 | exec 2>&2 61 | } 62 | 63 | # Set up signal handling 64 | trap cleanup EXIT INT TERM 65 | 66 | # Function to handle exit codes consistently across task types 67 | function handle_exit_code { 68 | local exit_code=$1 69 | 70 | # Check for known timeout exit codes: 71 | # - 124: standard timeout exit code 72 | # - 137: SIGKILL (128+9), used for hard timeouts 73 | # - 143: SIGTERM (128+15), can also be used for timeouts 74 | if [ $exit_code -eq 124 ] || [ $exit_code -eq 137 ] || [ $exit_code -eq 143 ]; then 75 | echo "0" 76 | echo "Time limit exceeded (${TIME_LIMIT}s)" >&2 77 | return 124 78 | # All other non-zero exit codes should be treated as runtime errors 79 | elif [ $exit_code -ne 0 ]; then 80 | echo "0" 81 | echo "Runtime error with exit code $exit_code" >&2 82 | return $exit_code 83 | fi 84 | 85 | # Success case - return 0 86 | return 0 87 | } 88 | 89 | # Function to run a command with timeout (simplified assuming timeout exists) 90 | function run_with_timeout { 91 | local soft_limit=$1; shift 92 | local command_to_run="$@" 93 | 94 | timeout --preserve-status "$soft_limit" "$@" 95 | return $? 96 | } 97 | 98 | case "$TASK_TYPE" in 99 | "Batch") 100 | # Simple batch execution with timeout 101 | run_with_timeout "$TIME_LIMIT" ./$TASK_EXECUTABLE < input.txt > "$SOLUTION_OUTPUT" 102 | exit_code=$? 103 | 104 | # Handle non-zero exit codes 105 | handle_exit_code $exit_code 106 | if [ $? -ne 0 ]; then 107 | exit $? 
108 | fi 109 | 110 | # Check the output if we have a correct output 111 | if [ -n "$CORRECT_OUTPUT" ]; then 112 | # Restore the correct output file 113 | echo "$CORRECT_OUTPUT" > correct_output.txt 114 | 115 | # Check if there's a custom checker 116 | if [ -f "checker/checker" ]; then 117 | # Let the checker handle everything 118 | ./checker/checker input.txt correct_output.txt "$SOLUTION_OUTPUT" 119 | exit $? 120 | else 121 | # Simple diff-based checking 122 | if diff -bq <(echo "$CORRECT_OUTPUT") "$SOLUTION_OUTPUT" >/dev/null; then 123 | echo "1" 124 | echo "Output is correct (diff)" >&2 125 | else 126 | echo "0" 127 | echo "Output isn't correct (diff)" >&2 128 | exit 0 129 | fi 130 | fi 131 | else 132 | # If no correct output was provided, just output the solution's output 133 | cat "$SOLUTION_OUTPUT" 134 | fi 135 | ;; 136 | 137 | "Communication") 138 | # Read Communication-specific parameters 139 | NUM_PROCESSES=$(grep -o '"task_type_parameters_Communication_num_processes":[^,}]*' graders/grader_config.json | sed 's/.*:\s*\([0-9]*\)/\1/' || true) 140 | if [ -z "$NUM_PROCESSES" ]; then 141 | NUM_PROCESSES=1 142 | fi 143 | USER_IO=$(grep -o '"task_type_parameters_Communication_user_io":[^,}]*' graders/grader_config.json | sed 's/.*:\s*"\([^"]*\)"/\1/' || echo "std_io") 144 | 145 | # Read custom manager arguments if they exist 146 | MANAGER_CUSTOM_ARGS="" 147 | if grep -q '"task_type_parameters_Communication_manager_args"' graders/grader_config.json; then 148 | MANAGER_CUSTOM_ARGS=$(grep -o '"task_type_parameters_Communication_manager_args":[^,}]*' graders/grader_config.json | sed 's/.*:\s*"\([^"]*\)"/\1/') 149 | fi 150 | 151 | # Create temporary directories for FIFOs 152 | for i in $(seq 0 $((NUM_PROCESSES-1))); do 153 | FIFO_DIRS[$i]=$(mktemp -d) 154 | 155 | # Create FIFOs for this process 156 | mkfifo "${FIFO_DIRS[$i]}/u${i}_to_m" 157 | mkfifo "${FIFO_DIRS[$i]}/m_to_u${i}" 158 | chmod 755 "${FIFO_DIRS[$i]}" 159 | chmod 666 "${FIFO_DIRS[$i]}/u${i}_to_m" "${FIFO_DIRS[$i]}/m_to_u${i}" 160 | done 161 | 162 | # Prepare manager arguments 163 | MANAGER_ARGS="" 164 | for i in $(seq 0 $((NUM_PROCESSES-1))); do 165 | MANAGER_ARGS="$MANAGER_ARGS ${FIFO_DIRS[$i]}/u${i}_to_m ${FIFO_DIRS[$i]}/m_to_u${i}" 166 | done 167 | 168 | # Add custom manager arguments if specified 169 | if [ -n "$MANAGER_CUSTOM_ARGS" ]; then 170 | MANAGER_ARGS="$MANAGER_ARGS $MANAGER_CUSTOM_ARGS" 171 | fi 172 | 173 | # Start all user processes first 174 | for i in $(seq 0 $((NUM_PROCESSES-1))); do 175 | if [ "$USER_IO" = "fifo_io" ]; then 176 | # Pass FIFOs as arguments 177 | ARGS="${FIFO_DIRS[$i]}/m_to_u${i} ${FIFO_DIRS[$i]}/u${i}_to_m" 178 | if [ "$NUM_PROCESSES" -ne 1 ]; then 179 | ARGS="$ARGS $i" 180 | fi 181 | ./$TASK_EXECUTABLE $ARGS & 182 | ALL_PIDS+=($!) 183 | else 184 | # Use stdin/stdout redirection 185 | if [ "$NUM_PROCESSES" -ne 1 ]; then 186 | ./$TASK_EXECUTABLE "$i" < "${FIFO_DIRS[$i]}/m_to_u${i}" > "${FIFO_DIRS[$i]}/u${i}_to_m" 2>/dev/null & 187 | ALL_PIDS+=($!) 188 | else 189 | ./$TASK_EXECUTABLE < "${FIFO_DIRS[$i]}/m_to_u${i}" > "${FIFO_DIRS[$i]}/u${i}_to_m" 2>/dev/null & 190 | ALL_PIDS+=($!) 191 | fi 192 | fi 193 | done 194 | 195 | # Run the manager with timeout using direct pipe from input.txt 196 | run_with_timeout "$TIME_LIMIT" ./graders/manager $MANAGER_ARGS < input.txt > "$SOLUTION_OUTPUT" 197 | 198 | exit_code=$? 199 | 200 | # Handle non-zero exit codes 201 | handle_exit_code $exit_code 202 | if [ $? -ne 0 ]; then 203 | exit $? 
204 | fi 205 | 206 | # Check the output if we have a correct output AND there's a checker (otherwise we assume the manager handles everything) 207 | if [ -n "$CORRECT_OUTPUT" ] && [ -f "checker/checker" ]; then 208 | # Restore the correct output file 209 | echo "$CORRECT_OUTPUT" > correct_output.txt 210 | 211 | # Let the checker handle it 212 | ./checker/checker input.txt correct_output.txt "$SOLUTION_OUTPUT" 213 | exit $? 214 | else 215 | # we assume the manager handles it 216 | cat "$SOLUTION_OUTPUT" 217 | fi 218 | ;; 219 | 220 | *) 221 | echo "0" 222 | echo "Unsupported task type \"$TASK_TYPE\"" >&2 223 | exit 1 224 | ;; 225 | esac -------------------------------------------------------------------------------- /generate/run_ioi_slurm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from math import ceil, gcd 3 | import os 4 | import argparse 5 | import subprocess 6 | from pathlib import Path 7 | from transformers import AutoConfig 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | DEFAULT_TP = 16 13 | MAX_CTX_LENGTH = None 14 | 15 | MODEL_CONFIGS = {} 16 | 17 | LOGS_DIR = "/fsx/hynek_kydlicek/logs/ioi-eval" 18 | SLURM_SCRIPT_DIR = "/fsx/hynek_kydlicek/slurm/ioi-eval/output" 19 | UV_ENV = "/fsx/hynek_kydlicek/projects/ioi-leaderboard/ioi-eval" 20 | 21 | 22 | def get_concurrency(model_name: str, concurrency: int) -> int: 23 | """Get concurrency from model config.""" 24 | return MODEL_CONFIGS.get(model_name, {}).get("concurrency", concurrency) 25 | 26 | 27 | def get_tp(model_name: str, revision: str) -> int: 28 | default_tp = MODEL_CONFIGS.get(model_name, {}).get("tp", DEFAULT_TP) 29 | try: 30 | config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True) 31 | 32 | # Check num_attention_heads and num_key_value_heads, and ensure that both are divisable by tp 33 | if hasattr(config, 'num_attention_heads'): 34 | if config.num_attention_heads % default_tp != 0: 35 | # Adjust tp to be the highest number that divides both num_attention_heads 36 | new_tp = gcd(config.num_attention_heads, default_tp) 37 | print(f"Adjusted tp for {model_name} from {default_tp} to {new_tp}") 38 | return new_tp 39 | return default_tp 40 | except Exception as e: 41 | print(f"Could not get tp from config for {model_name}: {e}") 42 | return default_tp 43 | 44 | def get_context_length(model_name: str, revision: str) -> int: 45 | """Get maximum context length from model config.""" 46 | try: 47 | config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True) 48 | # Check various possible context length attributes 49 | context_length = ( 50 | getattr(config, 'max_position_embeddings', None) or 51 | getattr(config, 'sliding_window', None) or 52 | getattr(config, 'max_sequence_length', None) or 53 | getattr(config, 'max_seq_len', None) or 54 | 4096 # Default fallback 55 | ) 56 | 57 | # Some models (like Qwen) might have sliding_window disabled 58 | if hasattr(config, 'use_sliding_window') and not config.use_sliding_window: 59 | # If sliding window is disabled, use max_position_embeddings instead 60 | context_length = getattr(config, 'max_position_embeddings', context_length) 61 | 62 | 63 | # cap to 64k 64 | if MAX_CTX_LENGTH is not None: 65 | context_length = min(context_length, MAX_CTX_LENGTH) 66 | return context_length 67 | except Exception as e: 68 | logger.warning(f"Could not get context length from config for {model_name}: {e}") 69 | return 4096 # Default fallback 70 | 71 | 
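# Illustrative note (not part of the original script): the helpers above choose the
# tensor-parallel degree and context length heuristically. For a hypothetical model
# whose attention-head count does not divide DEFAULT_TP, the gcd fallback shrinks tp:
#
#   from math import gcd
#   num_attention_heads = 28          # hypothetical value read from AutoConfig
#   tp = gcd(num_attention_heads, 16) # DEFAULT_TP = 16  ->  tp = 4
#
# The context length falls back through max_position_embeddings, sliding_window,
# max_sequence_length and max_seq_len before defaulting to 4096, and is optionally
# capped by MAX_CTX_LENGTH.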
def parse_args(): 72 | parser = argparse.ArgumentParser(description="Run IOI evaluation on a model using Slurm") 73 | parser.add_argument("--model", type=str, required=True, 74 | help="Model to evaluate (predefined model name)") 75 | parser.add_argument("--eval_args", type=str, required=True, 76 | help="Arguments to pass to the evaluation script") 77 | parser.add_argument("--time", type=str, default="7-00:00:00", 78 | help="Job time limit (default: 7 days)") 79 | parser.add_argument("--partition", type=str, default="hopper-prod", 80 | help="Slurm partition") 81 | parser.add_argument("--qos", type=str, default="normal", 82 | help="Slurm QOS") 83 | parser.add_argument("--startup_delay", type=int, default=3600, 84 | help="Delay in seconds before starting the server") 85 | parser.add_argument("--dry_run", action="store_true", 86 | help="Generate script but don't submit job") 87 | 88 | parser.add_argument("--revision", type=str, default=None, help="Revision to use for the model") 89 | parser.add_argument("--concurrency", type=int, default=100, 90 | help="Number of concurrent requests to the server") 91 | 92 | parser.add_argument("--uv_env", type=str, default=None, help="Path to the uv env") 93 | parser.add_argument("--logs_dir", type=str, default=None) 94 | parser.add_argument("--slurm_dir", type=str, default=None) 95 | 96 | return parser.parse_args() 97 | 98 | def create_slurm_script(args, logs_dir): 99 | # Override with custom values if provided 100 | concurrency = get_concurrency(args.model, args.concurrency) 101 | tp = get_tp(args.model, args.revision) 102 | context_length = get_context_length(args.model, args.revision) 103 | 104 | # Create a sanitized model name for the job name 105 | job_name = f"ioi-eval-{args.model.replace('/', '-')}" 106 | 107 | log_dir = logs_dir / job_name 108 | log_dir.mkdir(parents=True, exist_ok=True) 109 | 110 | n_nodes = ceil(tp / 8) 111 | tasks = n_nodes 112 | 113 | revision_arg = f"--revision {args.revision}" if args.revision else "" 114 | 115 | slurm_script = f"""#!/bin/bash 116 | #SBATCH --job-name={job_name} 117 | #SBATCH --partition={args.partition} 118 | #SBATCH --qos={args.qos} 119 | #SBATCH --nodes={n_nodes} 120 | #SBATCH --gpus-per-node=8 121 | #SBATCH --exclusive 122 | #SBATCH --output={log_dir}/%j-%x.out 123 | #SBATCH --error={log_dir}/%j-%x.out 124 | #SBATCH --time={args.time} 125 | #SBATCH --ntasks-per-node=1 126 | 127 | set -exuo pipefail 128 | 129 | SERVER_PORT=39877 130 | DIST_PORT=45000 131 | 132 | # random sleep (0-100) to prevent ddosing server 133 | sleep $((RANDOM % 100 + 1)) 134 | 135 | # Environment configuration 136 | export OUTLINES_CACHE_DIR=/scratch/serve_r1/ocache/ 137 | export TRITON_HOME=/scratch/serve_r1/triton/ 138 | export GLOO_SOCKET_IFNAME="enp71s0" 139 | export NCCL_SOCKET_IFNAME="enp71s0" 140 | 141 | # Evaluation script path 142 | EVAL_SCRIPT_PATH="/fsx/hynek_kydlicek/projects/ioi/generate/evaluate.py" 143 | 144 | module load cuda/12.4 145 | source ~/.bashrc 146 | 147 | # Activate uv 148 | source {args.uv_env or UV_ENV}/bin/activate 149 | 150 | FIRST_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1) 151 | FIRST_NODE_IP=$(srun --nodes=1 --ntasks=1 -w "$FIRST_NODE" hostname --ip-address) 152 | 153 | # Launch servers synchronously across all nodes 154 | srun --nodes={n_nodes} --ntasks={tasks} --ntasks-per-node=1 \\ 155 | bash -c "python -m sglang.launch_server \\ 156 | --model-path '{args.model}' \\ 157 | --tp {tp} \\ 158 | --dist-init-addr '$FIRST_NODE_IP:$DIST_PORT' \\ 159 | {revision_arg} \\ 160 | --nnodes 
{n_nodes} \\ 161 | --node-rank \\$SLURM_PROCID \\ 162 | --port '$SERVER_PORT' \\ 163 | --host 0.0.0.0 \\ 164 | --trust-remote-code \\ 165 | --max-running-requests {concurrency} \\ 166 | --context-length {context_length}" & 167 | 168 | # Wait for server with timeout 169 | TIMEOUT={args.startup_delay} # 1h, but model loading should take ~30min 170 | START_TIME=$(date +%s) 171 | echo "Waiting for SGLang server (http://$FIRST_NODE_IP:$SERVER_PORT)..." 172 | 173 | while true; do 174 | if curl -s -o /dev/null -w "%{{http_code}}" "http://$FIRST_NODE_IP:$SERVER_PORT/health" >/dev/null 2>&1; then 175 | echo "Server is ready at http://$FIRST_NODE_IP:$SERVER_PORT" 176 | break 177 | fi 178 | 179 | CURRENT_TIME=$(date +%s) 180 | if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then 181 | echo "Error: Server failed to start within $TIMEOUT seconds" 182 | exit 1 183 | fi 184 | 185 | echo "Still waiting... ($(($CURRENT_TIME - $START_TIME)) seconds elapsed)" 186 | sleep 60 187 | done 188 | 189 | echo "Checking available models..." 190 | curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/models" 191 | sleep 10 192 | 193 | echo "Executing sanity check..." 194 | curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/completions" \\ 195 | -H "Content-Type: application/json" \\ 196 | -d '{{ 197 | "model": "default", 198 | "prompt": "hi, how are you?", 199 | "max_tokens": 2048, 200 | "temperature": 0.6 201 | }}' 202 | 203 | python "$EVAL_SCRIPT_PATH" \\ 204 | --model_id "sglang/{args.model}" \\ 205 | {revision_arg} \\ 206 | --api_base "http://localhost:$SERVER_PORT/v1" \\ 207 | --concurrency {concurrency} \\ 208 | {args.eval_args} 209 | 210 | # Kill the server and exit 211 | pkill -f "python -m sglang.launch_server" 212 | exit 0 213 | """ 214 | 215 | return slurm_script, job_name 216 | 217 | def main(): 218 | args = parse_args() 219 | 220 | # Create output directory if it doesn't exist 221 | output_dir = Path(args.slurm_dir or SLURM_SCRIPT_DIR) 222 | output_dir.mkdir(parents=True, exist_ok=True) 223 | 224 | # Create logs directory if it doesn't exist 225 | logs_dir = Path(args.logs_dir or LOGS_DIR) 226 | logs_dir.mkdir(parents=True, exist_ok=True) 227 | 228 | # Generate the Slurm script 229 | slurm_script, job_name = create_slurm_script(args, logs_dir) 230 | 231 | # Create a timestamp for the filename 232 | from datetime import datetime 233 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 234 | 235 | # Save the script to a file 236 | script_path = output_dir / f"{job_name}_{timestamp}.slurm" 237 | with open(script_path, "w") as f: 238 | f.write(slurm_script) 239 | 240 | logger.info(f"Slurm script saved to: {script_path}") 241 | # Make the script executable 242 | os.chmod(script_path, 0o755) 243 | 244 | # Submit the job if not a dry run 245 | if not args.dry_run: 246 | try: 247 | result = subprocess.run( 248 | ["sbatch", str(script_path)], 249 | check=True, 250 | capture_output=True, 251 | text=True 252 | ) 253 | print(f"Job submitted: {result.stdout.strip()} find logs at {LOGS_DIR}/{job_name}") 254 | except subprocess.CalledProcessError as e: 255 | print(f"Error submitting job: {e}") 256 | print(f"Error output: {e.stderr}") 257 | else: 258 | print("Dry run - job not submitted") 259 | 260 | if __name__ == "__main__": 261 | main() -------------------------------------------------------------------------------- /run_tests/scoring.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from dataclasses import asdict, dataclass, field 3 | from typing import Union 4 | 5 | 
from piston_client import PistonClient 6 | from utils import batched, load_ioi_tests 7 | 8 | 9 | @dataclass 10 | class TestResult: 11 | """ 12 | Represents the result of a single test case execution. 13 | 14 | Attributes: 15 | test_name: Name of the test case 16 | score: Score achieved for this test (0.0 to 1.0) 17 | status: Status code of the test result (e.g., 'AC', 'WA', 'TLE') 18 | feedback: Detailed feedback message from the judge or an error message 19 | """ 20 | test_name: str 21 | score: float = 0.0 22 | status: str = 'SKIPPED' 23 | feedback: str = None 24 | 25 | @dataclass 26 | class SubtaskResult: 27 | """ 28 | Represents the result of a subtask containing multiple test cases. 29 | 30 | Attributes: 31 | problem: Problem identifier 32 | subtask: Subtask identifier 33 | points: Maximum points available for this subtask 34 | score_precision: Number of decimal places for score rounding 35 | test_results: List of individual test case results 36 | """ 37 | problem: str = None 38 | subtask: str = None 39 | 40 | points: float = 0.0 41 | score_precision: int = 2 42 | 43 | test_results: list[TestResult] = field(default_factory=list) 44 | 45 | @property 46 | def status(self): 47 | """ 48 | Determines the overall status of the subtask based on the worst status among test results. 49 | Status priorities are ordered from worst to best. 50 | 51 | Returns: 52 | str: The status with the highest priority (lowest value) 53 | """ 54 | status_prios = {'CE': -1, 'RE': 0, 'WA': 1, 'MLE': 2, 'TLE': 3, 'PA': 4, 'AC': 5, 'SKIPPED': 999} 55 | return min([x.status for x in self.test_results], key=lambda x: status_prios[x]) 56 | 57 | @property 58 | def score(self): 59 | """ 60 | Calculates the raw score for the subtask as the minimum score across all test results. 61 | 62 | Returns: 63 | float: The rounded minimum score 64 | """ 65 | return 0 if not self.test_results else round(min([test_result.score for test_result in self.test_results]), self.score_precision) 66 | 67 | @property 68 | def weighted_score(self): 69 | """ 70 | Calculates the weighted score by multiplying the raw score by the available points. 71 | 72 | Returns: 73 | float: The rounded weighted score 74 | """ 75 | return 0 if not self.test_results else round(min([test_result.score for test_result in self.test_results]) * self.points, self.score_precision) 76 | 77 | def to_dict(self): 78 | """ 79 | Converts the SubtaskResult to a dictionary representation. 80 | 81 | Returns: 82 | dict: Dictionary containing all subtask result data 83 | """ 84 | return { 85 | 'problem': self.problem, 86 | 'subtask': self.subtask, 87 | 'score': self.score, 88 | 'weighted_score': self.weighted_score, 89 | 'points': self.points, 90 | 'score_precision': self.score_precision, 91 | 'status': self.status, 92 | 'test_results': [asdict(test_result) for test_result in self.test_results] 93 | } 94 | 95 | def _extract_single_status(score: float, feedback: str) -> str: 96 | """ 97 | Determines the status code based on the score and feedback message. 
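    Example (illustrative only; the feedback strings mirror those emitted by the run script):
        _extract_single_status(0.0, "Time limit exceeded (2.0s)")   -> 'TLE'
        _extract_single_status(0.0, "Output isn't correct (diff)")  -> 'WA'
        _extract_single_status(0.5, "partially correct output")     -> 'PA'
        _extract_single_status(1.0, "Output is correct (diff)")     -> 'AC'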
98 | 99 | Args: 100 | score: The numeric score (0.0 to 1.0) 101 | feedback: The feedback message from the execution 102 | 103 | Returns: 104 | str: Status code ('CE', 'MLE', 'TLE', 'WA', 'RE', 'AC', or 'PA') 105 | """ 106 | if score == 0.0: 107 | if "Compilation error" in feedback: 108 | return 'CE' 109 | elif "Memory limit exceeded" in feedback: 110 | return 'MLE' 111 | elif "Time limit exceeded" in feedback: 112 | return 'TLE' 113 | elif "Output isn't correct" in feedback: 114 | return 'WA' 115 | else: 116 | return 'RE' 117 | elif score == 1.0: 118 | return 'AC' 119 | else: 120 | return 'PA' 121 | 122 | 123 | async def score_single_test_case(client: PistonClient, subtask: dict, test_name: str, test_input: str, test_output: str, submission: str) -> TestResult: 124 | """ 125 | Scores a single test case by running the submission against the provided input and output. 126 | 127 | Args: 128 | client: PistonClient instance for executing code 129 | subtask: Dictionary containing subtask configuration 130 | test_name: Name of the test case 131 | test_input: Input data for the test case 132 | test_output: Expected output for the test case 133 | submission: Source code of the submission 134 | 135 | Returns: 136 | TestResult: Result of the test case execution 137 | """ 138 | # Run submission for this test case 139 | score, feedback = await run_submission(client, subtask, test_input, submission, test_output) 140 | score = float(score) 141 | 142 | return TestResult(test_name=test_name, score=score, status=_extract_single_status(score, feedback), feedback=feedback) 143 | 144 | async def score_subtask(client: PistonClient, subtask: dict, submission: str, test_case_run_cache: Union[dict, None] = None, test_batch_size: int = 1) -> SubtaskResult: 145 | """ 146 | Scores all test cases in a subtask. 147 | 148 | Args: 149 | client: PistonClient instance for executing code 150 | subtask: Dictionary containing subtask configuration 151 | test_cases: Dictionary mapping test names to (input, output) tuples 152 | submission: Source code of the submission 153 | test_case_run_cache: Optional cache of previously run test cases 154 | test_batch_size: evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. 
155 | -1 to evaluate all test cases in parallel 156 | Returns: 157 | SubtaskResult: Result of the subtask evaluation 158 | """ 159 | subtask_result = SubtaskResult(problem=subtask['id'], subtask=subtask['subtask'], points=subtask['score'], score_precision=subtask['score_precision'], test_results=[]) 160 | 161 | # tests that are not cached 162 | tests_to_run = [ 163 | (ti, test_name) 164 | for ti, test_name in enumerate(subtask['test_names']) 165 | if test_case_run_cache is None or test_name not in test_case_run_cache 166 | ] 167 | 168 | # initialize test results with cached results or empty (SKIPPED) TestResult objects 169 | subtask_result.test_results = [ 170 | test_case_run_cache[test_name] if test_case_run_cache is not None and test_name in test_case_run_cache else 171 | TestResult(test_name=test_name) 172 | for test_name in subtask['test_names'] 173 | ] 174 | 175 | # we skip submissions where no code was extracted 176 | # no need to do anything, as we have a failed cached result 177 | if not submission or any(test_result.status != 'SKIPPED' and test_result.score == 0.0 for test_result in subtask_result.test_results): 178 | return subtask_result 179 | 180 | if "test_cases" in subtask: 181 | test_cases = subtask["test_cases"] 182 | if isinstance(subtask["test_cases"], list): 183 | test_cases = { 184 | test_name: test for test_name, test in zip(subtask["test_names"], subtask["test_cases"]) 185 | } 186 | else: 187 | test_cases = load_ioi_tests(subtask["year"], subtask["id"]) 188 | 189 | # run one batch, check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. 190 | for test_batch_to_run in batched(tests_to_run, test_batch_size): 191 | results = await asyncio.gather(*[ 192 | asyncio.create_task(score_single_test_case(client, subtask, test_name, test_cases[test_name][0], test_cases[test_name][1], submission)) 193 | for _, test_name in test_batch_to_run 194 | ]) 195 | for (ti, test_name), test_result in zip(test_batch_to_run, results): 196 | if test_case_run_cache is not None: 197 | test_case_run_cache[test_name] = test_result 198 | subtask_result.test_results[ti] = test_result 199 | 200 | # Stop early if it failed 201 | if any(test_result.score == 0.0 for test_result in results): 202 | break 203 | 204 | return subtask_result 205 | 206 | 207 | async def score_subtasks(client: PistonClient, subtasks: list[dict], submission: str, test_batch_size: int = 1) -> list[SubtaskResult]: 208 | """ 209 | Scores multiple subtasks for a submission. 210 | 211 | Args: 212 | client: PistonClient instance for executing code 213 | subtasks: List of dictionaries containing subtask configurations 214 | submission: Source code of the submission 215 | test_batch_size: evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. 216 | -1 to evaluate all test cases in parallel 217 | 218 | Returns: 219 | list[SubtaskResult]: Results for all subtasks 220 | """ 221 | # avoid rerunning tests present in multiple subtasks 222 | test_case_run_cache = {} 223 | 224 | return [ 225 | await score_subtask(client, subtask, submission, test_case_run_cache, test_batch_size) 226 | for subtask in subtasks 227 | ] 228 | 229 | async def run_submission(client: PistonClient, problem: dict, test_input: str, submission: str, test_output: str | None = None) -> tuple[str, str]: 230 | """ 231 | Executes a submission against a test case using the Piston execution environment. 
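    Example (illustrative sketch; the field values are hypothetical and only the keys
    actually read by this function are shown):
        problem = {
            "id": "example_problem",
            "time_limit": 1.0,            # seconds; +3s is added below as a hard Piston limit
            "memory_limit": 2147483648,   # forwarded unchanged as Piston's run_memory_limit
            "grader_files": [("graders/grader_config.json", '{"task_type": "Batch", ...}')],
        }
        score, feedback = await run_submission(client, problem, test_input="1 2\n", submission=source_code)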
232 | 233 | Args: 234 | client: PistonClient instance for executing code 235 | problem: Dictionary containing problem configuration 236 | test_input: Input data for the test case 237 | submission: Source code of the submission 238 | test_output: Optional expected output for the test case 239 | 240 | Returns: 241 | tuple[str, str]: A tuple containing (score, feedback) 242 | """ 243 | data = { 244 | "files": [ 245 | # the actual submission 246 | { 247 | "name": f"graders/{problem['id'].lower()}.cpp", 248 | "content": submission 249 | }, 250 | # pass the input 251 | { 252 | "name": "input.txt", 253 | "content": test_input 254 | }, 255 | # pass the expected output 256 | *([{ 257 | "name": "correct_output.txt", 258 | "content": test_output 259 | }] if test_output else []), 260 | # grader files 261 | *({ 262 | "name": name, 263 | "content": content 264 | } for name, content in problem['grader_files'] if content) 265 | ], 266 | 'run_timeout': round((problem['time_limit'] + 3) * 1000), # +3 seconds hard limit. time limits are handled by the ioi script 267 | 'run_memory_limit': problem['memory_limit'] 268 | } 269 | return await client.execute(data) 270 | -------------------------------------------------------------------------------- /run_tests/piston_client.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import random 4 | import re 5 | import subprocess 6 | from collections import Counter 7 | from functools import lru_cache 8 | 9 | import aiohttp 10 | 11 | 12 | class PistonError(Exception): 13 | pass 14 | 15 | @lru_cache(maxsize=1) 16 | def get_piston_client_from_env(session=None): 17 | piston_endpoints = os.getenv("PISTON_ENDPOINTS") 18 | if piston_endpoints is None: 19 | raise ValueError("For IOI problems Piston endpoints running our IOI package are required. Please add a list of valid Piston endpoints to a PISTON_ENDPOINTS varialbe in a `.env` file.") 20 | piston_endpoints = piston_endpoints.split(",") if piston_endpoints != "slurm" else get_slurm_piston_endpoints() 21 | random.shuffle(piston_endpoints) 22 | max_requests_per_endpoint = os.getenv("PISTON_MAX_REQUESTS_PER_ENDPOINT", "1") 23 | return PistonClient(piston_endpoints, session, max_requests_per_endpoint=int(max_requests_per_endpoint)) 24 | 25 | class PistonClient: 26 | """ 27 | A client that will automatically load balance across multiple Piston (https://github.com/engineer-man/piston) workers. 
28 | This assumes piston is running our custom cms_ioi package: https://github.com/guipenedo/piston/releases/ 29 | We recommend starting the instances with the following script as otherwise some IOI problems will hit default limits: 30 | ``` 31 | export PISTON_COMPILE_TIMEOUT=60000 32 | export PISTON_RUN_TIMEOUT=60000 33 | export PISTON_OUTPUT_MAX_SIZE=1000000000 34 | export PISTON_MAX_FILE_SIZE=1000000000 35 | export PISTON_DISABLE_NETWORKING=true 36 | export PISTON_REPO_URL=https://github.com/guipenedo/piston/releases/download/pkgs/index 37 | mkdir /piston 38 | 39 | sed -i '/app.use(body_parser.urlencoded/c\ app.use(body_parser.urlencoded({ extended: true, limit: \"512mb\" }));' src/index.js 40 | sed -i '/app.use(body_parser.json/c\ app.use(body_parser.json({ limit: \"512mb\" }));' src/index.js 41 | 42 | # Start server in background 43 | node src``` 44 | 45 | Piston docs for API usage: https://piston.readthedocs.io/en/latest/api-v2/ 46 | """ 47 | def __init__(self, base_endpoint: str | list[str] = "http://ip-10-53-80-65:3223/api/v2", session=None, max_requests_per_endpoint=1): 48 | self.max_requests_per_endpoint = max_requests_per_endpoint 49 | self.base_endpoints = [base_endpoint] if isinstance(base_endpoint, str) else base_endpoint 50 | if len(self.base_endpoints) == 0: 51 | raise ValueError("No Piston endpoints provided. Please check your PISTON_ENDPOINTS environment variable.") 52 | self.endpoint_ids = {endpoint: i for i, endpoint in enumerate(self.base_endpoints)} 53 | 54 | self._session = session 55 | self.endpoint_tokens = asyncio.Queue(maxsize=max_requests_per_endpoint * len(self.base_endpoints)) 56 | 57 | for _ in range(max_requests_per_endpoint): 58 | for base_endpoint in self.base_endpoints: 59 | self.endpoint_tokens.put_nowait(base_endpoint) 60 | self._endpoint_failures = Counter() 61 | self._unhealthy_endpoints = set() 62 | self._endpoint_failures_lock = asyncio.Lock() 63 | 64 | @property 65 | def session(self): 66 | if self._session is None: 67 | self._session = aiohttp.ClientSession( 68 | timeout=aiohttp.ClientTimeout(sock_read=10), 69 | connector=aiohttp.TCPConnector( 70 | limit=self.max_requests_per_endpoint * len(self.base_endpoints), 71 | ttl_dns_cache=300, 72 | keepalive_timeout=5 * 60 73 | ) 74 | ) 75 | return self._session 76 | 77 | async def _wait_for_endpoint(self): 78 | endpoint = await self.endpoint_tokens.get() 79 | return endpoint 80 | 81 | async def _release_endpoint(self, endpoint): 82 | await self.endpoint_tokens.put(endpoint) 83 | 84 | async def _send_request(self, endpoint, route, data=None, method="post"): 85 | async with self.session.request(method, f"{endpoint.rstrip('/')}/{route}", json=data, headers={"Content-Type": "application/json"}) as response: 86 | return await response.json(content_type=None) 87 | 88 | async def _send_to_all(self, route, data=None, method="post"): 89 | return await asyncio.gather(*[self._send_request(endpoint, route, data, method) for endpoint in self.base_endpoints]) 90 | 91 | async def _send_to_one(self, endpoint, route, data=None, method="post"): 92 | return await self._send_request(endpoint, route, data, method) 93 | 94 | async def install_package(self, language, version): 95 | return await self._send_to_all("packages", { 96 | "language": language, 97 | "version": version 98 | }, method="post") 99 | 100 | async def uninstall_package(self, language, version): 101 | return await self._send_to_all("packages", { 102 | "language": language, 103 | "version": version 104 | }, method="delete") 105 | 106 | async def 
get_supported_runtimes(self): 107 | return await self._send_to_all("runtimes", method="get") 108 | 109 | async def execute(self, data) -> tuple[str, str]: 110 | """ 111 | Requests to the IOI package return the score as a float in the stdout, as well as optional feedback/errors in stderr. 112 | Returns a tuple of (score, feedback). 113 | """ 114 | response = await self._send_execute(data) 115 | 116 | if 'message' in response: 117 | raise PistonError(response['message']) 118 | 119 | if 'compile' in response and response['compile']['code'] != 0: 120 | return "0", "Compilation error exit code " + str(response['compile']['code']) + "\n" + response['compile']['stderr'] 121 | 122 | if 'run' not in response: 123 | raise PistonError(response) 124 | 125 | if response['run']['code'] == 1 and "MemoryError" in response['run']['stderr']: 126 | return "0", "Memory limit exceeded" 127 | 128 | # successful result 129 | if response['run']['stdout']: 130 | return response['run']['stdout'], response['run']['stderr'] 131 | 132 | if response['run']['signal'] == 'SIGKILL': 133 | return "0", "Time limit exceeded" 134 | 135 | # other issues 136 | if response['run']['code'] != 0: 137 | raise PistonError(f"language={response['language']}, version={response['version']}, exit code={response['run']['code']}, stderr={response['run']['stderr']}, signal={response['run']['signal']}") 138 | return '0', 'Unknown error' 139 | 140 | async def _check_failed_endpoint(self, endpoint): 141 | async with self._endpoint_failures_lock: 142 | if endpoint in self._unhealthy_endpoints: 143 | return 144 | try: 145 | await asyncio.sleep(5) 146 | await self.get_supported_runtimes() 147 | except Exception as e: 148 | print(f"Error checking endpoint {endpoint}, dropping it ({e})") 149 | self._unhealthy_endpoints.add(endpoint) 150 | if len(self._unhealthy_endpoints) >= len(self.base_endpoints): 151 | raise PistonError("All endpoints are unhealthy. Please check your Piston workers.") 152 | 153 | async def _send_execute(self, data): 154 | data = data | { 155 | "language": "cms_ioi", 156 | "version": "*", 157 | } 158 | 159 | max_retries = 5 160 | base_delay = 1.0 161 | 162 | status = None 163 | endpoint = None 164 | 165 | for attempt in range(max_retries + 1): 166 | try: 167 | endpoint = await self._wait_for_endpoint() 168 | if attempt > 0: 169 | await asyncio.sleep(1) 170 | async with self.session.post(f"{endpoint.rstrip('/')}/execute", json=data, headers={"Content-Type": "application/json"}) as response: 171 | status = response.status 172 | res_json = await response.json(content_type=None) 173 | 174 | if status != 200: 175 | raise PistonError(f"Server error. status={status}") 176 | if res_json is None: 177 | raise PistonError(f"Empty response. 
status={status}") 178 | # piston overloaded 179 | if 'run' in res_json and "Resource temporarily unavailable" in res_json['run'].get('stderr', ''): 180 | raise PistonError(f"Piston overloaded: {res_json['run']['stderr']}") 181 | return res_json 182 | 183 | except (PistonError, asyncio.TimeoutError, aiohttp.ClientConnectionError, RuntimeError) as e: 184 | # Only retry if we haven't reached max retries yet 185 | if attempt < max_retries: 186 | # Calculate backoff with jitter 187 | delay = min(base_delay * (2 ** attempt), 10) # Exponential backoff, capped at 10 seconds 188 | jitter = delay * 0.2 * (2 * asyncio.get_event_loop().time() % 1 - 0.5) # Add ±10% jitter 189 | retry_delay = delay + jitter 190 | print(f"Retrying in {retry_delay:.2f} seconds [{self.endpoint_ids[endpoint]}] {endpoint}") 191 | 192 | # special case: worker died 193 | if isinstance(e, aiohttp.ClientConnectionError) and "Connect call failed" in str(e): 194 | await self._check_failed_endpoint(endpoint) 195 | else: 196 | # hopefully we won't get this one again 197 | await self._release_endpoint(endpoint) 198 | endpoint = None 199 | 200 | await asyncio.sleep(retry_delay) 201 | else: 202 | print(f"Giving up on retries. {e}") 203 | raise e 204 | except Exception as e: 205 | print(f"Propagating exception {type(e)}: {e}") 206 | raise e 207 | finally: 208 | # Ensure endpoint is always released, even if an exception occurs 209 | if endpoint is not None: 210 | try: 211 | await self._release_endpoint(endpoint) 212 | except Exception as e: 213 | print(f"Error releasing endpoint {endpoint}: {e}") 214 | endpoint = None 215 | 216 | 217 | def get_slurm_piston_endpoints(): 218 | """Get list of active piston worker endpoints from squeue output""" 219 | # Run squeue command to get job name, hostname and status, filtering for RUNNING state 220 | result = subprocess.run(['squeue', '--format="%j %N %T"', '--noheader', '--states=RUNNING'], capture_output=True, text=True) 221 | 222 | # Split output into lines and skip header 223 | lines = result.stdout.strip().split('\n') 224 | 225 | endpoints = [] 226 | for line in lines: 227 | # Parse job name from squeue output 228 | fields = line.split() 229 | job_name = fields[0].strip('"') # Remove quotes 230 | hostname = fields[1] 231 | 232 | # Extract port if job name matches pattern 233 | match = re.match(r'piston-worker-(\d+)', job_name) 234 | if match: 235 | port = match.group(1) 236 | endpoints.append(f"http://{hostname}:{port}/api/v2") 237 | 238 | return endpoints -------------------------------------------------------------------------------- /run_tests/tests_runner.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from fnmatch import fnmatch 3 | import json 4 | import os 5 | from typing import Set 6 | import asyncio 7 | import aiofiles 8 | import aiohttp 9 | from datasets import load_dataset, Dataset 10 | import pandas as pd 11 | from tqdm.asyncio import tqdm 12 | from selection_simulator import get_problem_scores, simulate_round_robin 13 | from utils import add_includes 14 | import uvloop 15 | from piston_client import get_piston_client_from_env 16 | from scoring import score_subtasks 17 | from dotenv import load_dotenv 18 | from loguru import logger 19 | from huggingface_hub import HfApi 20 | 21 | class TestsRunner: 22 | def __init__( 23 | self, 24 | datasets_to_evaluate: list[str] | str, 25 | results_dataset_name: str, 26 | local_results_cache: str = "results", 27 | max_concurrent_requests: int = 100, 28 | 
test_batch_size: int = 1, 29 | dry_run: bool = False, 30 | override: bool = False, 31 | timeout: int = 60 * 10, 32 | id_column: str = "uuid", 33 | add_messages_column: bool = False, 34 | add_includes: bool = True, 35 | always_extract_code: bool = False 36 | ): 37 | self.datasets_to_evaluate = datasets_to_evaluate if isinstance( 38 | datasets_to_evaluate, list) else [datasets_to_evaluate] 39 | self.datasets_to_evaluate_names = [dataset.split( 40 | '/')[-1].removeprefix('ioi-eval-') for dataset in self.datasets_to_evaluate] 41 | self.results_dataset_name = results_dataset_name 42 | self.local_results_cache = local_results_cache 43 | self.test_batch_size = test_batch_size 44 | self.dry_run = dry_run 45 | self.override = override 46 | self.timeout = timeout 47 | self.id_column = id_column 48 | self.add_messages_column = add_messages_column 49 | self.add_includes = add_includes 50 | self.max_concurrent_requests = max_concurrent_requests 51 | self.always_extract_code = always_extract_code 52 | os.makedirs(self.local_results_cache, exist_ok=True) 53 | 54 | if dry_run: 55 | logger.warning("Running in dry-run mode - no actual Piston calls will be made") 56 | 57 | # Lock for local file access 58 | self._file_lock = asyncio.Lock() 59 | 60 | async def run_tests_pipeline(self): 61 | # fetch completed submissions 62 | completed_ids, evaluated_submissions = self.fetch_completed_submissions() 63 | 64 | # fetch submissions to evaluate 65 | submissions_to_evaluate = self.fetch_submissions_to_evaluate(completed_ids) 66 | 67 | # load problem data 68 | problem_subtasks = self.load_problem_data(set(submissions_to_evaluate.keys())) 69 | 70 | # evaluate submissions 71 | new_evaluated_submissions = await self.evaluate_submissions(problem_subtasks, submissions_to_evaluate) 72 | 73 | # merge 74 | for key in set(evaluated_submissions.keys()).union(new_evaluated_submissions.keys()): 75 | evaluated_submissions[key].extend(new_evaluated_submissions[key]) 76 | 77 | # save all results 78 | self.save_to_hub(evaluated_submissions) 79 | 80 | # generate reports with the results for each dataset 81 | self.publish_reports(evaluated_submissions) 82 | 83 | def fetch_submissions_to_evaluate(self, completed_ids: dict[str, set]) -> dict[tuple[int, str], list[dict]]: 84 | submissions_to_evaluate = defaultdict(list) 85 | for dataset, dsname in zip(self.datasets_to_evaluate, self.datasets_to_evaluate_names): 86 | ds = load_dataset(dataset, split="train") 87 | subs_to_eval = 0 88 | for submission in ds: 89 | if self.id_column not in submission: 90 | logger.error(f"Submission does not have an \"{self.id_column}\" column. 
Please set --id_column to the correct column name.") 91 | exit(1) 92 | if "year" not in submission: 93 | submission['year'] = 2024 # we assume it's IOI'2024 94 | id_key = (dsname, str(submission['year']), submission['problem_id'], submission[self.id_column]) 95 | if not id_key in completed_ids[dsname]: 96 | submission['dataset'] = dsname 97 | 98 | completed_ids[dsname].add(id_key) 99 | 100 | # source code parsing 101 | if 'code' not in submission or not submission['code'] or self.always_extract_code: 102 | # try extracting code from generation if it exists 103 | if 'generation' not in submission or "```cpp\n" not in submission['generation']: 104 | submission['code'] = None 105 | else: 106 | submission['code'] = submission['generation'].split("```cpp\n")[-1].split("```")[0] 107 | if submission['code'] and self.add_includes: 108 | submission['code'] = add_includes(submission['code'], submission['problem_id']) 109 | 110 | submissions_to_evaluate[(str(submission['year']), submission['problem_id'])].append(submission) 111 | subs_to_eval += 1 112 | logger.info(f"Found {subs_to_eval} submissions to evaluate for {dsname}") 113 | logger.info(f"Found {sum(len(v) for v in submissions_to_evaluate.values())} total submissions to evaluate") 114 | return submissions_to_evaluate 115 | 116 | def fetch_completed_submissions(self) -> tuple[dict[str, set], dict[str, list]]: 117 | completed_submissions = defaultdict(list) 118 | unique_ids = defaultdict(set) 119 | 120 | if self.override: 121 | logger.warning("Override flag active. Will not fetch completed submissions from local cache or hub. Will overwrite existing local results and on the hub.") 122 | for dsname in self.datasets_to_evaluate_names: 123 | if os.path.exists(f"{self.local_results_cache}/{dsname}.jsonl"): 124 | os.rename(f"{self.local_results_cache}/{dsname}.jsonl", f"{self.local_results_cache}/{dsname}.jsonl.bak") 125 | logger.info(f"Renamed {self.local_results_cache}/{dsname}.jsonl to {self.local_results_cache}/{dsname}.jsonl.bak") 126 | return unique_ids, completed_submissions 127 | 128 | logger.info(f"Fetching completed submissions from {self.local_results_cache} and {self.results_dataset_name}") 129 | for dsname in self.datasets_to_evaluate_names: 130 | local_results_path = f"{self.local_results_cache}/{dsname}.jsonl" 131 | # local results 132 | if os.path.exists(local_results_path): 133 | with open(local_results_path, 'r') as f: 134 | for line in f: 135 | line_data = json.loads(line) 136 | id_key = (dsname, str(line_data['year']), line_data['problem_id'], line_data[self.id_column]) 137 | if not id_key in unique_ids[dsname]: 138 | line_data['dataset'] = dsname 139 | completed_submissions[dsname].append(line_data) 140 | unique_ids[dsname].add(id_key) 141 | try: 142 | # hub results 143 | pushed_results = load_dataset( 144 | self.results_dataset_name, split="train", name=dsname) 145 | if pushed_results: 146 | for submission in pushed_results: 147 | id_key = (dsname, str(submission['year']), submission['problem_id'], submission[self.id_column]) 148 | if not id_key in unique_ids[dsname]: 149 | submission['dataset'] = dsname 150 | completed_submissions[dsname].append(submission) 151 | unique_ids[dsname].add(id_key) 152 | except Exception: 153 | pass 154 | logger.info(f"Found {len(completed_submissions[dsname])} completed submissions for {dsname}") 155 | 156 | return unique_ids, completed_submissions 157 | 158 | def load_problem_data(self, problems_to_fetch: set[tuple[int, str]]) -> dict[tuple[int, str], list[dict]]: 159 | problems = 
load_dataset("open-r1/ioi", split="train+test") 160 | problem_subtasks = defaultdict(list) 161 | 162 | for problem in problems: 163 | if (str(problem['year']), problem['id']) in problems_to_fetch: 164 | problem_subtasks[(str(problem['year']), problem['id'])].append(problem) 165 | 166 | return problem_subtasks 167 | 168 | async def evaluate_submissions(self, problem_subtasks: dict[tuple[int, str], list[dict]], submissions_to_evaluate: list[dict]) -> list[dict]: 169 | async with aiohttp.ClientSession( 170 | timeout=aiohttp.ClientTimeout(sock_read=30), 171 | connector=aiohttp.TCPConnector( 172 | limit=self.max_concurrent_requests, ttl_dns_cache=300, keepalive_timeout=self.timeout) 173 | ) as session: 174 | client = get_piston_client_from_env(session) if not self.dry_run else None 175 | active_tasks: Set[asyncio.Task] = set() 176 | 177 | new_results = defaultdict(list) 178 | 179 | with tqdm( 180 | total=sum(len(codes_to_eval) for codes_to_eval in submissions_to_evaluate.values()), 181 | desc="Evaluating submissions", 182 | unit="row", 183 | mininterval=2, 184 | smoothing=0.0001, 185 | ) as pbar: 186 | 187 | async def score_submission_on_all_subtasks(subtasks, submission): 188 | """Score a single submission on all subtasks""" 189 | try: 190 | all_subtask_results = await score_subtasks(client, subtasks, submission['code'] if not self.dry_run else None, test_batch_size=self.test_batch_size) 191 | async with self._file_lock: 192 | async with aiofiles.open(f'{self.local_results_cache}/{submission["dataset"]}.jsonl', mode="a") as f: 193 | target_subtask = submission.pop('subtask', None) 194 | target_subtask_results = [subtask_results for subtask_results in all_subtask_results if target_subtask and subtask_results.subtask == target_subtask] 195 | 196 | full_result_data = { 197 | **submission, 198 | "target_subtask": target_subtask, 199 | "code_compiles": bool(submission["code"]) and all(subtask_results.status != "CE" for subtask_results in all_subtask_results), 200 | "target_subtask_score": target_subtask_results[0].score if target_subtask_results else None, 201 | "target_subtask_status": target_subtask_results[0].status if target_subtask_results else None, 202 | "all_subtasks_points": sum([subtask_results.weighted_score for subtask_results in all_subtask_results]), 203 | "all_subtasks_results": [subtask_result.to_dict() for subtask_result in all_subtask_results], 204 | } 205 | await f.write(json.dumps(full_result_data) + "\n") 206 | await f.flush() 207 | return full_result_data 208 | except Exception as e: 209 | print(f"Error scoring submission: {e}") 210 | finally: 211 | pbar.set_postfix(active=len(pbar.active_tasks), refresh=False) 212 | pbar.update(1) 213 | 214 | pbar.active_tasks = active_tasks 215 | 216 | for (year, problem_name), subtasks in problem_subtasks.items(): 217 | codes_to_eval = submissions_to_evaluate[(year, problem_name)] 218 | print(f"Scoring {len(codes_to_eval)} submissions on {len(subtasks)} subtasks of {problem_name} ({len(set([test_name for subtask in subtasks for test_name in subtask['test_names']]))} test cases)") 219 | 220 | for submission in codes_to_eval: 221 | while len(active_tasks) >= self.max_concurrent_requests: 222 | done, active_tasks = await asyncio.wait( 223 | active_tasks, return_when=asyncio.FIRST_COMPLETED 224 | ) 225 | for task in done: 226 | try: 227 | result = await task 228 | if result: 229 | new_results[result['dataset']].append(result) 230 | except Exception as e: 231 | print(f"Task failed: {e}") 232 | 233 | task = 
asyncio.create_task(score_submission_on_all_subtasks(subtasks, submission)) 234 | active_tasks.add(task) 235 | task.add_done_callback(active_tasks.discard) 236 | pbar.set_postfix(active=len(active_tasks), refresh=True) 237 | 238 | if active_tasks: 239 | for new_result in (await asyncio.gather(*active_tasks, return_exceptions=True)): 240 | if isinstance(new_result, Exception): 241 | logger.error(f"Error scoring submission: {new_result}") 242 | else: 243 | if new_result: 244 | new_results[new_result['dataset']].append(new_result) 245 | 246 | return new_results 247 | 248 | def save_to_hub(self, evaluated_submissions: list[dict]): 249 | 250 | def add_messages_column(sample): 251 | messages = [ 252 | {"role": "user", "content": sample["prompt"]}, 253 | {"role": "assistant", "content": sample["generation"].strip()}, 254 | ] 255 | return {"messages": messages} 256 | 257 | 258 | for key, submissions in evaluated_submissions.items(): 259 | if not submissions: 260 | logger.warning(f"No submissions to push for {key}") 261 | continue 262 | dataset = Dataset.from_list(submissions) 263 | if self.add_messages_column: 264 | dataset = dataset.map(add_messages_column) 265 | dataset = dataset.remove_columns("dataset") 266 | dataset.push_to_hub(self.results_dataset_name, split="train", config_name=key, private=False) 267 | logger.info(f"Pushed {len(submissions)} submissions to {self.results_dataset_name}[{key}]") 268 | 269 | def publish_reports(self, evaluated_submissions: list[dict]): 270 | api = HfApi() 271 | for dataset, submissions in evaluated_submissions.items(): 272 | if not submissions: 273 | continue 274 | 275 | submissions_per_problem = defaultdict(list) 276 | for submission in submissions: 277 | submissions_per_problem[(submission['year'], submission['problem_id'])].append(submission) 278 | 279 | year_overview = defaultdict(list) 280 | for year, problem in sorted(submissions_per_problem.keys(), key=lambda x: (-x[0], x[1])): 281 | submissions = submissions_per_problem[(year, problem)] 282 | 283 | table_data = [ 284 | { 285 | "Submission": submission[self.id_column], 286 | "Target subtask": submission.get('target_subtask', '-'), 287 | "Total": submission["all_subtasks_points"], 288 | **{ 289 | subtask['subtask']: f"{subtask['weighted_score']}/{subtask['points']} ({subtask['status']})" 290 | for subtask in submission["all_subtasks_results"] 291 | } 292 | } 293 | for submission in submissions 294 | ] 295 | table_data.sort(key=lambda x: x["Total"], reverse=True) 296 | df = pd.DataFrame(table_data) 297 | 298 | all_submissions_score = get_problem_scores(submissions) 299 | limit_50_score = get_problem_scores(simulate_round_robin(submissions)) 300 | problem_overview = { 301 | "year": year, 302 | "problem": problem, 303 | "day": submissions[0].get('day', '-'), 304 | "number_submissions": len(submissions), 305 | "number_submissions_compiling": sum(1 for submission in submissions if submission['code_compiles']), 306 | "best_submission_score": max(submission['all_subtasks_points'] for submission in submissions), 307 | "all_submissions_score": all_submissions_score, 308 | "limit_50_score": limit_50_score, 309 | } 310 | year_overview[year].append(problem_overview) 311 | 312 | # individual problem report 313 | markdown_content = f"""# {year}: {problem} 314 | ## Overview 315 | - Number of submissions: **{problem_overview['number_submissions']}** 316 | - Submissions compiling: **{problem_overview['number_submissions_compiling']}** 317 | - Best individual submission: 
**{problem_overview['best_submission_score']}/100** 318 | 319 | - Score on this problem (no submission limit): **{problem_overview['all_submissions_score']:.2f}/100** 320 | - Score on this problem (limited to 50 submissions, round robin selection): **{limit_50_score:.2f}/100** 321 | 322 | ## Submissions 323 | {df.to_markdown(index=False)} 324 | """ 325 | api.upload_file( 326 | path_or_fileobj=markdown_content.encode(), 327 | path_in_repo=f"reports/{dataset}/{year}_{problem}.md", 328 | repo_id=self.results_dataset_name, 329 | repo_type="dataset" 330 | ) 331 | 332 | # collect stuff for the global overview. grouped per year 333 | global_overview_markdown = f"""# Global Overview 334 | - Number of submissions: **{sum(overview['number_submissions'] for overviews in year_overview.values() for overview in overviews)}** 335 | - Submissions compiling: **{sum(overview['number_submissions_compiling'] for overviews in year_overview.values() for overview in overviews)}** 336 | 337 | """ + "\n\n".join([f"""# {year} 338 | 339 | - Score (no submission limit): **{sum(problem_overview['all_submissions_score'] for problem_overview in year_overview[year] if problem_overview['day'] != "practice")}/600** 340 | - Score (limited to 50 submissions, round robin selection): **{sum(problem_overview['limit_50_score'] for problem_overview in year_overview[year] if problem_overview['day'] != "practice")}/600** 341 | 342 | """ + pd.DataFrame([ 343 | { 344 | "Day": problem_overview['day'], 345 | "Problem": problem_overview['problem'], 346 | "#submissions": problem_overview['number_submissions'], 347 | "#compiling": problem_overview['number_submissions_compiling'], 348 | "Best individual": f"{problem_overview['best_submission_score']}/100", 349 | "Score (50 limit)": f"{problem_overview['limit_50_score']}/100", 350 | "Score (no limit)": f"{problem_overview['all_submissions_score']}/100", 351 | "Full report": f"[link](https://huggingface.co/datasets/{self.results_dataset_name}/blob/main/reports/{dataset}/{year}_{problem_overview['problem']}.md)" 352 | } 353 | for problem_overview in problem_overviews 354 | ]).to_markdown(index=False) for year, problem_overviews in year_overview.items()]) 355 | api.upload_file( 356 | path_or_fileobj=global_overview_markdown.encode(), 357 | path_in_repo=f"reports/{dataset}/README.md", 358 | repo_id=self.results_dataset_name, 359 | repo_type="dataset" 360 | ) 361 | 362 | logger.info(f"Uploaded reports to https://huggingface.co/datasets/{self.results_dataset_name}/tree/main/reports/") 363 | 364 | def parse_datasets_to_evaluate(datasets_to_evaluate_str: str) -> list[str]: 365 | api = HfApi() 366 | org_datasets = {} 367 | 368 | datasets_to_evaluate = datasets_to_evaluate_str.split(",") 369 | parsed_datasets_to_evaluate = [] 370 | for dataset in datasets_to_evaluate: 371 | org, dataset_name = dataset.split("/") 372 | if "*" in dataset_name: 373 | if org not in org_datasets: 374 | org_datasets[org] = [dataset_entry.id.removeprefix(f"{org}/") for dataset_entry in api.list_datasets(author=org)] 375 | for candidate_dataset_name in org_datasets[org]: 376 | if fnmatch(candidate_dataset_name, dataset_name): 377 | parsed_datasets_to_evaluate.append(f"{org}/{candidate_dataset_name}") 378 | else: 379 | parsed_datasets_to_evaluate.append(dataset) 380 | logger.info(f"Parsed {len(parsed_datasets_to_evaluate)} datasets to evaluate: {','.join(parsed_datasets_to_evaluate)}") 381 | return parsed_datasets_to_evaluate 382 | 383 | if __name__ == "__main__": 384 | import argparse 385 | load_dotenv() 386 | parser = 
argparse.ArgumentParser() 387 | parser.add_argument("datasets_to_evaluate", type=str, help="comma separated list of datasets to evaluate. accepts wildcards on the org portion, e.g. ioi-leaderboard/ioi-eval.*-prompt-mem-limit") 388 | parser.add_argument("results_dataset_name", type=str, help="where to push the final results open-r1/ioi-test-results") 389 | parser.add_argument("--local_results_path", type=str, default="results") 390 | parser.add_argument("--id_column", type=str, default="uuid", help="column name to use as the unique identifier per problem for each submission") 391 | parser.add_argument("--max_concurrent_requests", type=int, default=10, help="maximum number of concurrent requests to be sent to piston") 392 | parser.add_argument("--test_batch_size", type=int, default=1, help="evaluate these many test cases in parallel, then check if any of them failed (0 score): if so, stop evaluating; otherwise continue with the next batch of test cases") 393 | parser.add_argument("--dry_run", action="store_true", help="do not actually send any requests to piston") 394 | parser.add_argument("--override", action="store_true", help="do not fetch completed submissions from local cache or hub. Will overwrite existing results on the hub") 395 | parser.add_argument('--timeout', type=int, default=60 * 10, help="timeout for the piston client requests keep alive") 396 | parser.add_argument('--add_includes', action="store_true", help="try to fix missing includes in the code") 397 | parser.add_argument('--add_messages_column', action="store_true", help="add a messages column to the results, for SFT") 398 | parser.add_argument('--always_extract_code', action="store_true", help="always extract code from generation, even if it already exists in the code column") 399 | args = parser.parse_args() 400 | 401 | runner = TestsRunner( 402 | datasets_to_evaluate=parse_datasets_to_evaluate(args.datasets_to_evaluate), 403 | results_dataset_name=args.results_dataset_name, 404 | local_results_cache=args.local_results_path, 405 | max_concurrent_requests=args.max_concurrent_requests, 406 | test_batch_size=args.test_batch_size, 407 | dry_run=args.dry_run, 408 | override=args.override, 409 | timeout=args.timeout, 410 | id_column=args.id_column, 411 | add_messages_column=args.add_messages_column, 412 | add_includes=args.add_includes, 413 | always_extract_code=args.always_extract_code 414 | ) 415 | 416 | uvloop.install() 417 | asyncio.run(runner.run_tests_pipeline()) 418 | -------------------------------------------------------------------------------- /generate/evaluate.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from collections import defaultdict 3 | import json 4 | from pathlib import Path 5 | from typing import Dict, List, Optional 6 | import random 7 | from datetime import datetime 8 | import uuid 9 | from datasets import Dataset, load_dataset 10 | from loguru import logger 11 | from tqdm.asyncio import tqdm 12 | import litellm 13 | from dotenv import load_dotenv 14 | import polars as pl 15 | import aiofiles 16 | from litellm.utils import ModelResponse 17 | 18 | class IOIEvaluator: 19 | def __init__(self, org_id: str, model_id: str, api_base: Optional[str] = None, subset: Optional[str] = None, 20 | num_generations: int = 50, num_retries: int = 10, 21 | concurrency: int = 10, num_problems: Optional[int] = None, 22 | last_subtask: bool = False, dry_run: bool = False, 23 | override: bool = False, model_postfix: Optional[str] = None, 24 | revision: 
Optional[str] = None, timeout: Optional[int] = 600, 25 | use_requests: bool = False, max_tokens: Optional[int] = None): 26 | self.org_id = org_id 27 | self.model_id = model_id 28 | self.api_base = api_base 29 | self.subset = subset 30 | self.num_generations = num_generations 31 | self.num_retries = num_retries 32 | self.concurrency = concurrency 33 | self.num_problems = num_problems 34 | self.last_subtask = last_subtask 35 | self.dry_run = dry_run 36 | self.override = override 37 | self.revision = revision 38 | # Create organization and model directories 39 | self.timeout = timeout 40 | self.use_litellm = not use_requests 41 | self.max_tokens = max_tokens 42 | 43 | # Tracking totals 44 | self.total_prompt_tokens = 0 45 | self.total_completion_tokens = 0 46 | self.total_cost = 0.0 47 | self.model_postfix = model_postfix 48 | 49 | # Semaphore for controlling concurrency 50 | self._semaphore = asyncio.Semaphore(concurrency) 51 | 52 | # HTTP session for direct API calls when not using litellm 53 | self._session = None 54 | 55 | if self.api_base: 56 | logger.info(f"Using API base: {self.api_base}") 57 | 58 | if not self.use_litellm: 59 | logger.info("Using direct asyncio requests instead of LiteLLM") 60 | 61 | if dry_run: 62 | logger.warning("Running in dry-run mode - no actual LLM calls will be made") 63 | 64 | # Create results directory 65 | self.model_dir = Path("results") / self.get_model_name() 66 | self.model_dir.mkdir(parents=True, exist_ok=True) 67 | 68 | # File path for the single JSONL file 69 | self.results_file = self.model_dir / "results.jsonl" 70 | 71 | # Lock for file access 72 | self._file_lock = asyncio.Lock() 73 | 74 | async def save_result_locally(self, result: Dict, year: int, problem_id: str, subtask: str, solution_number: int): 75 | """Save a single result to local JSONL storage with locking.""" 76 | # Ensure problem_id is included in the result 77 | result['year'] = year 78 | result['problem_id'] = problem_id 79 | result['subtask'] = subtask 80 | result['solution_number'] = solution_number 81 | 82 | try: 83 | # Use lock to prevent concurrent writes 84 | async with self._file_lock: 85 | async with aiofiles.open(self.results_file, 'a') as f: 86 | await f.write(json.dumps(result) + '\n') 87 | except Exception as e: 88 | logger.error(f"Failed to save result locally: {str(e)}") 89 | 90 | async def load_previous_results(self) -> Optional[pl.DataFrame]: 91 | """Load previous results from both HuggingFace Hub and local JSONL storage.""" 92 | if self.override: 93 | logger.info("Override mode enabled - not loading previous results") 94 | return None 95 | 96 | results_dfs = [] 97 | 98 | # Try loading from Hub 99 | repo_name = f"{self.org_id}/{self.get_model_name()}" 100 | try: 101 | logger.info(f"Attempting to load previous results from HuggingFace Hub: {repo_name}") 102 | dataset = load_dataset(repo_name, split="train") 103 | if dataset is not None: 104 | # Convert to pandas then to polars 105 | df = dataset.to_polars() 106 | 107 | # Add a column indicating if the result is local 108 | df = df.with_columns([ 109 | pl.lit(False).alias('is_local') 110 | ]) 111 | results_dfs.append(df) 112 | 113 | logger.info(f"Loaded {len(df)} previous results from HuggingFace Hub") 114 | except Exception as e: 115 | logger.info(f"Could not load from HuggingFace Hub: {str(e)}") 116 | 117 | # Try loading from local storage 118 | try: 119 | if self.results_file.exists(): 120 | results = [] 121 | async with self._file_lock: 122 | async with aiofiles.open(self.results_file, 'r') as f: 123 | async for 
line in f:
124 |                     try:
125 |                         result = json.loads(line.strip())
126 |                         results.append(result)
127 |                     except Exception as e:
128 |                         logger.error(f"Failed to parse JSONL line: {str(e)}")
129 | 
130 |             if results:
131 |                 local_df = pl.DataFrame(results).with_columns([
132 |                     pl.lit(True).alias('is_local')
133 |                 ])
134 |                 results_dfs.append(local_df)
135 |                 logger.info(f"Loaded {len(local_df)} previous results from local storage")
136 |         except Exception as e:
137 |             logger.error(f"Failed to load from local storage: {str(e)}")
138 | 
139 |         # Combine results if we have any
140 |         if results_dfs:
141 |             # Keep only the columns shared by hub and local results (common_columns below)
142 |             common_columns = ['generation', 'code', 'language', 'model_kwargs', 'metadata', 'uuid', 'year', 'problem_id', 'subtask', 'solution_number', 'is_local']
143 | 
144 |             # Add missing 'year' column with None values if needed
145 |             results_dfs = [df if 'year' in df.columns else df.with_columns(pl.lit(None).alias('year')) for df in results_dfs]
146 | 
147 |             # Drop columns that are not in common_columns
148 |             results_dfs = [df.select(common_columns) for df in results_dfs]
149 | 
150 |             # Ensure every metadata dict has a 'stop_reason' key (older results may lack it);
151 |             # the dict union keeps existing values and only fills in the default
152 |             results_dfs = [df.with_columns(pl.when(pl.col('metadata').is_not_null()).then(pl.col('metadata').map_elements(lambda x: {"stop_reason": "unknown"} | x)).otherwise(pl.col('metadata')).alias('metadata')) for df in results_dfs]
153 | 
154 |             # Concatenate the aligned dataframes
155 |             combined_df = pl.concat(results_dfs, how="vertical")
156 | 
157 |             # First sort by whether code exists (True first), then by source (local first)
158 |             # This ensures we keep entries with code when deduplicating
159 |             deduplicated_df = (
160 |                 combined_df
161 |                 .with_columns([
162 |                     # Add a column indicating if code exists and is non-empty
163 |                     pl.when((pl.col('code').is_not_null()) & (pl.col('code') != ""))
164 |                     .then(1)
165 |                     .otherwise(0)
166 |                     .alias('has_code'),
167 |                 ])
168 |                 # Sort by has_code (descending) and is_local (descending)
169 |                 .sort(['has_code', 'is_local'], descending=[True, True])
170 |                 # Keep first occurrence after sorting (prioritizing entries with code and local source)
171 |                 .unique(
172 |                     subset=["year", "problem_id", "subtask", "solution_number"],
173 |                     keep='first'
174 |                 )
175 |                 # Drop the temporary columns
176 |                 .drop(['has_code', 'is_local'])
177 |             )
178 | 
179 |             logger.info(f"Combined and deduplicated results: {len(deduplicated_df)} entries")
180 |             return deduplicated_df
181 | 
182 |         return None
183 | 
184 |     def get_dummy_response(self, prompt: str, seed: int) -> Dict:
185 |         """Generate a dummy response for dry runs."""
186 |         dummy_code = """```cpp
187 | int main() {
188 |     // This is a dummy solution
189 |     return 0;
190 | }
191 | ```"""
192 |         return {
193 |             "generation": f"This is a dummy response for testing purposes.\n{dummy_code}",
194 |             "code": "int main() {\n    // This is a dummy solution\n    return 0;\n}",
195 |             "language": "cpp",
196 |             "model_kwargs": {
197 |                 "seed": seed,
198 |             },
199 |             "metadata": {
200 |                 "usage": {
201 |                     'completion_tokens': 10,
202 |                     'prompt_tokens': len(prompt.split()),
203 |                     'total_tokens': len(prompt.split()) + 10,
204 |                     'cost': 0.0
205 |                 },
206 |                 "timestamp": datetime.now().isoformat(),
207 |                 "stop_reason": "length"  # Add stop reason for dummy response
208 |             }
209 |         }
210 | 
211 |     def extract_code(self, text: str) -> tuple[str, str]:
212 |         """Extract code from the response between ```cpp and ``` markers."""
213 |         try:
214 |             parts = text.split("```cpp\n")
215 |             if len(parts) > 1:
216 |                 code_block = parts[-1].split("```")[0]
217 |                 code = code_block.strip()
218 |                 if not code:
219 |                     logger.warning("Empty code block found")
220 |                     return "", "cpp"
221 |                 return code, "cpp"
222 |             logger.warning("No code block found in the response")
223 |             return "", "unknown"
224 |         except Exception as e:
225 |             logger.error(f"Failed to extract code: {str(e)}")
226 |             return "", "unknown"
227 | 
228 |     async def generate_completion(self, prompt: str, seed: int) -> Dict:
229 |         """Generate completion using direct asyncio HTTP requests."""
230 |         retry_budget = self.num_retries
231 | 
232 |         while retry_budget > 0:
233 |             try:
234 |                 await asyncio.sleep(random.uniform(0.0, 0.1))
235 |                 async with self._session.post(
236 |                     f"{self.api_base}/v1/chat/completions",
237 |                     json={
238 |                         "model": "default",
239 |                         "messages": [{"role": "user", "content": prompt}],
240 |                         "seed": seed,
241 |                         "temperature": 0.7,
242 |                         "top_p": 0.8,
243 |                         "max_tokens": self.max_tokens,
244 |                     },
245 |                     headers={"Authorization": "Bearer EMPTY"},
246 |                 ) as response:
247 |                     result = await response.json(content_type=None)
248 | 
249 |                     if result is None:
250 |                         logger.error("Received None response from API")
251 |                         retry_budget -= 1
252 |                         await asyncio.sleep(5)
253 |                         continue
254 | 
255 |                     # Extract response content
256 |                     message_content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
257 | 
258 |                     # Extract token usage
259 |                     usage = result.get("usage", {})
260 |                     completion_tokens = usage.get("completion_tokens", 0)
261 |                     prompt_tokens = usage.get("prompt_tokens", 0)
262 |                     total_tokens = usage.get("total_tokens", 0)
263 | 
264 |                     # Update totals
265 |                     self.total_prompt_tokens += prompt_tokens
266 |                     self.total_completion_tokens += completion_tokens
267 | 
268 |                     # Extract code
269 |                     code, language = self.extract_code(message_content)
270 | 
271 |                     response_dict = {
272 |                         "generation": message_content,
273 |                         "code": code,
274 |                         "language": language,
275 |                         "model_kwargs": {
276 |                             "seed": seed,
277 |                         },
278 |                         "metadata": {
279 |                             "usage": {
280 |                                 'completion_tokens': completion_tokens,
281 |                                 'prompt_tokens': prompt_tokens,
282 |                                 'total_tokens': total_tokens,
283 |                             },
284 |                             "timestamp": datetime.now().isoformat(),
285 |                             "stop_reason": result.get("choices", [{}])[0].get("finish_reason", "unknown")
286 |                         }
287 |                     }
288 | 
289 | 
290 |                     return response_dict
291 | 
292 |             except Exception as e:
293 |                 logger.exception(f"API error (will retry): {e}")
294 |                 retry_budget -= 1
295 |                 await asyncio.sleep(10)
296 | 
297 |         raise Exception("All retries failed for direct API call")
298 | 
299 | 
300 |     async def call_llm(self, prompt: str, seed: int) -> Dict:
301 |         """Call the LLM using LiteLLM's built-in retry mechanism or direct asyncio requests."""
302 |         if self.dry_run:
303 |             result = self.get_dummy_response(prompt, seed)
304 |             return result
305 | 
306 |         if not self.use_litellm:
307 |             return await self.generate_completion(prompt, seed)
308 | 
309 |         return await self.call_litellm(prompt, seed)
310 | 
311 |     async def call_litellm(self, prompt: str, seed: int) -> Dict:
312 |         model_name = self.model_id
313 |         kwargs = {}
314 |         if self.model_id.startswith("sglang/"):
315 |             model_name = model_name.replace("sglang/", "custom_openai/")
316 |             kwargs["api_base"] = self.api_base
317 |             kwargs["api_key"] = "sk-proj-1234567890"
318 | 
319 |         if self.max_tokens is not None:
320 |             kwargs["max_tokens"] = self.max_tokens
321 | 
322 |         response: ModelResponse = await litellm.acompletion(
323 |             model=model_name,
324 |             messages=[{"role": "user", "content": prompt, "cache_control": {"type": "ephemeral"}}],
325 |             seed=seed,
326 |             num_retries=self.num_retries,
327 |             top_p=0.8,
328 |             temperature=0.7,
329 |             timeout=self.timeout,
330 |             **kwargs
331 |         )
332 | 
333 |         # Extract stop reason
334 |         stop_reason = response.choices[0].finish_reason
335 | 
336 |         # Extract usage information safely
337 |         usage = {}
338 |         cost = 0.0
339 |         if hasattr(response, 'usage'):
340 |             try:
341 |                 completion_tokens = getattr(response.usage, 'completion_tokens', 0)
342 |                 prompt_tokens = getattr(response.usage, 'prompt_tokens', 0)
343 |                 total_tokens = getattr(response.usage, 'total_tokens', 0)
344 | 
345 |                 # Calculate cost using litellm
346 |                 try:
347 |                     cost = litellm.completion_cost(completion_response=response)
348 |                 except Exception as e:
349 |                     logger.warning(f"Failed to calculate cost: {str(e)}")
350 |                     cost = 0.0
351 | 
352 |                 usage = {
353 |                     'completion_tokens': completion_tokens,
354 |                     'prompt_tokens': prompt_tokens,
355 |                     'total_tokens': total_tokens,
356 |                     'cost': cost
357 |                 }
358 | 
359 |                 # Update totals
360 |                 self.total_prompt_tokens += prompt_tokens
361 |                 self.total_completion_tokens += completion_tokens
362 |                 self.total_cost += cost
363 | 
364 |             except Exception as e:
365 |                 logger.error(f"Failed to extract usage information: {str(e)}")
366 | 
367 |         message_content = response.choices[0].message.content if response.choices else ""
368 | 
369 |         # Extract code from the response
370 |         code, language = self.extract_code(message_content or "")
371 | 
372 |         result = {
373 |             "generation": message_content,
374 |             "code": code,
375 |             "language": language,
376 |             "model_kwargs": {
377 |                 "seed": seed,
378 |             },
379 |             "metadata": {
380 |                 "usage": usage,
381 |                 "timestamp": datetime.now().isoformat(),
382 |                 "stop_reason": stop_reason
383 |             }
384 |         }
385 |         return result
386 | 
387 |     async def create_solution_requests(self, subtasks: List[Dict]) -> List[Dict]:
388 |         """Prepare result entries for a single problem."""
389 |         results = []
390 |         for subtask in subtasks:
391 |             prompt = subtask['problem']
392 |             for i in range(self.num_generations):
393 |                 try:
394 |                     random_uuid = str(uuid.uuid4())
395 | 
396 |                     results.append({
397 |                         "year": subtask['year'],
398 |                         "problem_id": subtask['id'],
399 |                         "subtask": subtask["subtask"],
400 |                         "prompt": prompt,
401 |                         "generation": None,
402 |                         "code": "",
403 |                         "language": "unknown",
404 |                         "solution_number": i,
405 |                         "uuid": random_uuid,
406 |                         "model_kwargs": {"seed": i},
407 |                         "metadata": {
408 |                             "usage": {'completion_tokens': 0, 'prompt_tokens': 0, 'total_tokens': 0, 'cost': 0.0},
409 |                             "timestamp": datetime.now().isoformat()
410 |                         }
411 |                     })
412 |                 except Exception as e:
413 |                     logger.error(f"Failed to prepare prompts for problem {subtask['id']}, subtask {subtask['subtask']}: {str(e)}")
414 |                     return []
415 | 
416 |         return results
417 | 
418 |     async def run_evaluation(self):
419 |         """Run the evaluation for all problems."""
420 |         try:
421 |             # Create HTTP session if using direct API calls
422 |             if not self.use_litellm and not self.dry_run:
423 |                 import aiohttp
424 |                 self._session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout), connector=aiohttp.TCPConnector(limit=self.concurrency, ttl_dns_cache=300, keepalive_timeout=self.timeout))
425 | 
426 | 
427 |             logger.info(f"Loading IOI dataset for subset: {self.subset}")
428 |             dataset = load_dataset("open-r1/ioi", split=self.subset)
429 |             problem_subtasks = defaultdict(list)
430 |             for problem in dataset:
431 |                 problem_subtasks[(problem["year"], problem["id"])].append(problem)
432 |             problem_ids = list(problem_subtasks.keys())
433 |             if self.num_problems is not None:
434 |                 problem_ids = problem_ids[:self.num_problems]
435 |                 logger.info(f"Limited evaluation to first {self.num_problems} problems")
436 | 
437 |             logger.info(f"Starting evaluation of {len(problem_ids)} problems...")
438 | 
439 |             # Step 1: Generate all solution requests
440 |             all_solution_requests = []
441 |             for problem_id in tqdm(problem_ids, desc="Preparing solution requests"):
442 |                 subtasks = problem_subtasks[problem_id]
443 |                 if self.last_subtask:
444 |                     subtasks = [subtasks[-1]]
445 |                 requests = await self.create_solution_requests(subtasks)
446 |                 all_solution_requests.extend(requests)
447 | 
448 |             # Convert to Polars DataFrame for efficient operations
449 |             requests_df = pl.DataFrame(all_solution_requests)
450 |             logger.info(f"Created {len(requests_df)} solution requests")
451 | 
452 |             # Step 2: Load previous results
453 |             previous_df = None
454 |             if not self.override:
455 |                 previous_df = await self.load_previous_results()
456 |                 if previous_df is not None:
457 |                     logger.info(f"Loaded {len(previous_df)} previous results")
458 | 
459 | 
460 |             # Step 3: Merge solution requests with previous results efficiently
461 |             if previous_df is not None:
462 |                 # Keep only the columns we want to preserve from previous results
463 |                 preserve_cols = ['generation', 'code', 'language', 'metadata', 'model_kwargs']
464 | 
465 |                 preserve_cols_with_key = preserve_cols + ['year', 'problem_id', 'subtask', 'solution_number']
466 |                 previous_df = previous_df.select(preserve_cols_with_key).filter(pl.col('generation').is_not_null() & (pl.col('generation') != ""))
467 | 
468 |                 # Merge using polars, keeping all solution requests and only matching previous results
469 |                 merged_df = requests_df.join(
470 |                     previous_df,
471 |                     on=('year', 'problem_id', 'subtask', 'solution_number'),
472 |                     how='left',
473 |                     suffix='_prev'
474 |                 )
475 | 
476 |                 # Update values from previous results where they exist
477 |                 for col in preserve_cols:
478 |                     prev_col = f'{col}_prev'
479 |                     merged_df = merged_df.with_columns(
480 |                         pl.when(pl.col(prev_col).is_not_null())
481 |                         .then(pl.col(prev_col))
482 |                         .otherwise(pl.col(col))
483 |                         .alias(col)
484 |                     )
485 | 
486 |                 # Drop the _prev columns
487 |                 merged_df = merged_df.select([
488 |                     c for c in merged_df.columns if not c.endswith('_prev')
489 |                 ])
490 |             else:
491 |                 merged_df = requests_df
492 | 
493 |             # Count how many need to be generated
494 |             to_generate_df = merged_df.filter(
495 |                 (pl.col('generation').is_null()) |
496 |                 (pl.col('generation') == "")
497 |             )
498 | 
499 |             # Materialize the entries that still need to be generated
500 |             to_generate_dicts = to_generate_df.to_dicts()
501 |             logger.info(f"Need to generate {len(to_generate_df)} out of {len(merged_df)} total entries")
502 | 
503 |             if len(to_generate_df) == 0:
504 |                 logger.info("No generations needed - all results are already available")
505 |                 return
506 | 
507 |             # Run generations for entries without results
508 |             async def process_single(row: Dict) -> Dict:
509 |                 async with self._semaphore:
510 |                     try:
511 |                         llm_result = await self.call_llm(
512 |                             row["prompt"],
513 |                             row["model_kwargs"]["seed"]
514 |                         )
515 | 
516 |                         # Log progress and token usage
517 |                         if llm_result["metadata"].get("usage"):
518 |                             usage = llm_result["metadata"]["usage"]
519 |                             logger.info(
520 |                                 f"Problem {row['problem_id']} (Solution {row['solution_number']}) - "
521 |                                 f"Tokens: {usage.get('total_tokens', 0)} "
522 |                                 f"(prompt: {usage.get('prompt_tokens', 0)}, "
523 |                                 f"completion: {usage.get('completion_tokens', 0)}) - "
524 |                                 f"Cost: ${usage.get('cost', 0.0):.4f}"
525 |                             )
526 | 
527 |                         llm_result["uuid"] = row["uuid"]
528 | 
529 |                         # Save result immediately
530 |                         await self.save_result_locally(llm_result, row["year"], row["problem_id"], row["subtask"], row["solution_number"])
531 | 
532 |                         return llm_result
533 |                     except Exception as e:
534 |                         logger.error(f"Failed generation for problem {row['problem_id']}: {str(e)}")
535 |                         error_result = {
536 |                             "generation": "",
537 |                             "code": "",
538 |                             "language": "unknown",
539 |                             "uuid": row["uuid"],
540 |                             "metadata": {
541 |                                 "error": str(e),
542 |                                 "usage": {'completion_tokens': 0, 'prompt_tokens': 0, 'total_tokens': 0, 'cost': 0.0},
543 |                                 "timestamp": datetime.now().isoformat(),
544 |                                 "stop_reason": "error"  # Add stop reason for error case
545 |                             }
546 |                         }
547 |                         return error_result
548 | 
549 |             # Run generations in parallel with controlled concurrency
550 |             tasks = [process_single(row) for row in to_generate_dicts]
551 |             generated_results = await tqdm.gather(*tasks, desc="Running generations")
552 | 
553 |             # Convert generated results to DataFrame and update original DataFrame
554 |             generated_df = pl.DataFrame(generated_results)
555 | 
556 |             # Merge generated results with previous results
557 |             merged_df = merged_df.join(
558 |                 generated_df,
559 |                 on='uuid',
560 |                 how='left',
561 |                 suffix='_gen'
562 |             )
563 | 
564 |             # Update the old columns with the new values
565 |             for col in ['generation', 'code', 'language', 'metadata', 'model_kwargs']:
566 |                 merged_df = merged_df.with_columns(
567 |                     pl.when(pl.col('generation_gen').is_not_null() & (pl.col('generation_gen') != ""))
568 |                     .then(pl.col(f'{col}_gen'))
569 |                     .otherwise(pl.col(col))
570 |                     .alias(col)
571 |                 )
572 | 
573 |             # Drop the _gen columns
574 |             merged_df = merged_df.select([
575 |                 c for c in merged_df.columns if not c.endswith('_gen')
576 |             ])
577 | 
578 |             # Validate results before pushing to hub
579 |             valid_results = merged_df.filter(
580 |                 (pl.col('generation').is_not_null()) &
581 |                 (pl.col('generation') != "")
582 |             )
583 | 
584 |             total_expected = len(merged_df)
585 |             total_valid = len(valid_results)
586 | 
587 |             logger.info(f"Valid results: {total_valid}/{total_expected}")
588 | 
589 |             # Only push to hub if all results are valid
590 |             if total_valid == total_expected:
591 |                 # Convert to HF Dataset
592 |                 output_dataset = Dataset.from_polars(merged_df)
593 |                 model_name = self.get_model_name()
594 | 
595 |                 try:
596 |                     output_dataset.push_to_hub(f"{self.org_id}/{model_name}")
597 |                     logger.info(f"Pushed to hub: {self.org_id}/{model_name}")
598 |                 except Exception as e:
599 |                     logger.error(f"Failed to push to hub: {str(e)}")
600 |             else:
601 |                 logger.warning(
602 |                     f"Not pushing to hub - missing {total_expected - total_valid} valid results. "
603 |                     "Results saved locally and can be retried later."
604 |                 )
605 | 
606 |             # Log final statistics
607 | 
608 |             logger.info(
609 |                 f"Total tokens used: {self.total_prompt_tokens + self.total_completion_tokens} "
610 |                 f"(prompt: {self.total_prompt_tokens}, completion: {self.total_completion_tokens})"
611 |             )
612 |             logger.info(f"Total cost: ${self.total_cost:.4f}")
613 | 
614 |             # Clean up HTTP session if using direct API calls
615 |             if self._session is not None:
616 |                 await self._session.close()
617 |                 self._session = None
618 | 
619 |             return merged_df
620 |         except Exception as e:
621 |             # Clean up HTTP session if using direct API calls
622 |             if self._session is not None:
623 |                 await self._session.close()
624 |                 self._session = None
625 |             raise e
626 | 
627 | 
628 |     def get_model_name(self):
629 |         model_name = f"ioi-eval-{self.model_id.replace('/', '_')}"
630 |         if self.dry_run:
631 |             model_name = f"dummy-{model_name}"
632 | 
633 |         if self.revision:
634 |             model_name = f"{model_name}-{self.revision.replace('/', '_')}"
635 | 
636 |         if self.model_postfix:
637 |             model_name = f"{model_name}-{self.model_postfix}"
638 | 
639 |         return model_name
640 | 
641 | 
642 | def main():
643 |     load_dotenv()  # Load environment variables from .env file
644 | 
645 |     import argparse
646 |     parser = argparse.ArgumentParser(description="Evaluate LLMs on IOI problems")
647 |     parser.add_argument("--org_id", required=True, help="Organization ID")
648 |     parser.add_argument("--model_id", required=True, help="Model ID")
649 |     parser.add_argument("--api_base", help="API base URL for the model")
650 |     parser.add_argument("--subset", default="test", help="IOI subset to generate solutions for (train or test)")
651 |     parser.add_argument("--num_generations", type=int, default=50, help="Number of generations per problem")
652 |     parser.add_argument("--num_retries", type=int, default=10, help="Number of retries for failed API calls")
653 |     parser.add_argument("--concurrency", type=int, default=20, help="Number of concurrent generations")
654 |     parser.add_argument("--num_problems", type=int, default=None, help="Number of problems to evaluate (None for all)")
655 |     parser.add_argument("--last_subtask", action="store_true", help="Only evaluate the last subtask for each problem (usually the full problem)")
656 |     parser.add_argument("--dry_run", action="store_true", help="Run without making actual LLM calls")
657 |     parser.add_argument("--override", action="store_true", help="Override existing results and start fresh")
658 |     parser.add_argument("--model_postfix", help="Postfix for the model name")
659 |     parser.add_argument("--revision", help="Revision to use for the model")
660 |     parser.add_argument("--timeout", type=int, default=600, help="Timeout for the LLM call")
661 |     parser.add_argument("--use_requests", action="store_true", default=False, help="Use requests instead of litellm")
662 |     parser.add_argument("--max_tokens", type=int, default=None, help="Max tokens")
663 |     args = parser.parse_args()
664 | 
665 |     evaluator = IOIEvaluator(
666 |         org_id=args.org_id,
667 |         model_id=args.model_id,
668 |         api_base=args.api_base,
669 |         subset=args.subset,
670 |         num_generations=args.num_generations,
671 |         num_retries=args.num_retries,
672 |         concurrency=args.concurrency,
673 |         num_problems=args.num_problems,
674 |         last_subtask=args.last_subtask,
675 |         dry_run=args.dry_run,
676 |         override=args.override,
677 |         model_postfix=args.model_postfix,
678 |         revision=args.revision,
679 |         timeout=args.timeout,
680 |         use_requests=args.use_requests,
681 |         max_tokens=args.max_tokens
682 |     )
683 |     asyncio.run(evaluator.run_evaluation())
684 | 
685 | if __name__ == "__main__":
686 |     main()
--------------------------------------------------------------------------------