├── ioi-evals.png ├── generate ├── .env.template ├── requirements.txt ├── .gitignore ├── get_context_length.py ├── utils │ ├── get_context_length.py │ ├── open_router_usage.py │ └── check_failures.py ├── TODO.md ├── slurm_standalone │ ├── serve_router.slurm │ ├── debug.slurm │ └── serve_r1.slurm ├── README.md ├── run_ioi_slurm.py └── evaluate.py ├── run_tests ├── .env.template ├── requirements.txt ├── piston │ ├── launch_piston_workers.sh │ ├── launch_single_piston.sh │ └── README.md ├── utils.py ├── custom_setup │ ├── compile │ └── run ├── selection_simulator.py ├── README.md ├── scoring.py ├── piston_client.py └── tests_runner.py ├── .gitignore └── README.md /ioi-evals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/ioi/HEAD/ioi-evals.png -------------------------------------------------------------------------------- /generate/.env.template: -------------------------------------------------------------------------------- 1 | OPENROUTER_API_KEY= 2 | OPENAI_API_KEY= 3 | ANTHROPIC_API_KEY= -------------------------------------------------------------------------------- /run_tests/.env.template: -------------------------------------------------------------------------------- 1 | PISTON_ENDPOINTS=slurm 2 | PISTON_MAX_REQUESTS_PER_ENDPOINT=1 -------------------------------------------------------------------------------- /run_tests/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets>=3.3.2 2 | tqdm 3 | python-dotenv 4 | loguru 5 | aiohttp 6 | huggingface_hub 7 | aiofiles 8 | uvloop 9 | tabulate -------------------------------------------------------------------------------- /generate/requirements.txt: -------------------------------------------------------------------------------- 1 | litellm 2 | datasets>=3.3.2 3 | tqdm 4 | python-dotenv 5 | loguru 6 | aiohttp 7 | huggingface_hub 8 | setuptools 9 | transformers>=4.48.3 10 | aiofiles 11 | polars 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Environment variables 2 | 3 | # logs 4 | logs/* 5 | 6 | # env 7 | .env 8 | 9 | # Python 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | *.so 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # Virtual Environment 32 | venv/ 33 | env/ 34 | ENV/ 35 | .venv/ 36 | .env/ 37 | 38 | # IDE 39 | .idea/ 40 | .vscode/ 41 | *.swp 42 | *.swo 43 | .DS_Store 44 | 45 | # Project specific 46 | results/ 47 | -------------------------------------------------------------------------------- /generate/.gitignore: -------------------------------------------------------------------------------- 1 | # Environment variables 2 | 3 | # logs 4 | logs/* 5 | 6 | # env 7 | .env 8 | 9 | # Python 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | *.so 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # Virtual Environment 32 | venv/ 33 | env/ 34 | ENV/ 35 | .venv/ 36 | .env/ 37 | 38 | # IDE 39 | .idea/ 40 | .vscode/ 41 | *.swp 42 | *.swo 43 | .DS_Store 44 | 45 | # Project specific 46 | results/ 47 | 
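The two `.env.template` files above are meant to be copied to `.env` (which both `.gitignore` files exclude). As a minimal, illustrative sketch of how such values are typically read with `python-dotenv` (listed in both `requirements.txt` files); the variable names come from the templates, everything else here is an assumption:

```python
import os

from dotenv import load_dotenv  # python-dotenv, listed in both requirements.txt files

# Read key=value pairs from a local .env file into the process environment
load_dotenv()

# Variable names taken from the .env.template files above; the default is illustrative
piston_endpoints = os.getenv("PISTON_ENDPOINTS", "slurm")
openrouter_key = os.getenv("OPENROUTER_API_KEY")
print(piston_endpoints, "OpenRouter key set" if openrouter_key else "no OpenRouter key")
```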
--------------------------------------------------------------------------------
/run_tests/piston/launch_piston_workers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # this simple script will launch a bunch of piston workers on the HF science cluster
4 | 
5 | N_INSTANCES=${1:-5} # Default to 5 instances
6 | 
7 | for i in $(seq 1 $N_INSTANCES); do
8 |     # Find a random (hopefully) available port
9 |     PORT=$(comm -23 <(seq 2000 10000 | sort) <(ss -tan | awk '{print $4}' | cut -d':' -f2 | sort -u) | shuf | head -n1)
10 | 
11 |     # the job name format (`piston-worker-<PORT>`) is important: the evaluation code uses it to discover the list of workers
12 |     sbatch \
13 |         --job-name="piston-worker-$PORT" \
14 |         --export=ALL,PORT=$PORT \
15 |         /fsx/guilherme/piston/launch_single_piston.sh
16 | done
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # IOI
2 | 
3 | ![IOI Evals](ioi-evals.png)
4 | 
5 | ## Dataset links
6 | - [Problem statements dataset](https://huggingface.co/datasets/open-r1/ioi) (IOI’2020 - IOI’2024): `open-r1/ioi`
7 | - [Test cases](https://huggingface.co/datasets/open-r1/ioi-test-cases): `open-r1/ioi-test-cases`
8 | - [Official (ground truth) solutions](https://huggingface.co/datasets/open-r1/ioi-sample-solutions): `open-r1/ioi-sample-solutions`
9 | - [Evaluation data for 40+ leading models on IOI’2024](https://huggingface.co/datasets/open-r1/ioi-2024-model-solutions): `open-r1/ioi-2024-model-solutions`
10 | 
11 | ## Generating solutions
12 | To have models generate solutions to IOI problems, follow the instructions in the [generate](generate/README.md) directory.
13 | 
14 | ## Running tests
15 | To run tests on generated solutions, follow the instructions in the [run_tests](run_tests/README.md) directory.
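The datasets listed above can be pulled with the `datasets` library already used elsewhere in this repo. A minimal sketch: the `train` split name for `open-r1/ioi` is an assumption, while the per-year config for the test cases mirrors `run_tests/utils.py`:

```python
from datasets import load_dataset

# Problem statements (split name assumed to be "train")
problems = load_dataset("open-r1/ioi", split="train")
print(len(problems), problems.column_names)

# Test cases are stored per year, as loaded in run_tests/utils.py
tests_2024 = load_dataset("open-r1/ioi-test-cases", name="2024", split="train")
print(len(tests_2024))
```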
16 | -------------------------------------------------------------------------------- /generate/get_context_length.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | from typing import Dict, Any 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | def get_context_length(model_name: str) -> int: 8 | """Get maximum context length from model config.""" 9 | try: 10 | config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) 11 | # Check various possible context length attributes 12 | context_length = ( 13 | getattr(config, 'max_position_embeddings', None) or 14 | getattr(config, 'sliding_window', None) or 15 | getattr(config, 'max_sequence_length', None) or 16 | getattr(config, 'max_seq_len', None) or 17 | 4096 # Default fallback 18 | ) 19 | 20 | # Some models (like Qwen) might have sliding_window disabled 21 | if hasattr(config, 'use_sliding_window') and not config.use_sliding_window: 22 | # If sliding window is disabled, use max_position_embeddings instead 23 | context_length = getattr(config, 'max_position_embeddings', context_length) 24 | 25 | 26 | # Cap to 32k 27 | return min(context_length, 32768) 28 | except Exception as e: 29 | logger.warning(f"Could not get context length from config for {model_name}: {e}") 30 | return 4096 # Default fallback 31 | 32 | 33 | if __name__ == "__main__": 34 | import argparse 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument("--model_name", type=str, required=True) 37 | args = parser.parse_args() 38 | print(get_context_length(args.model_name)) 39 | -------------------------------------------------------------------------------- /generate/utils/get_context_length.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | from typing import Dict, Any 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | def get_context_length(model_name: str) -> int: 8 | """Get maximum context length from model config.""" 9 | try: 10 | config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) 11 | # Check various possible context length attributes 12 | context_length = ( 13 | getattr(config, 'max_position_embeddings', None) or 14 | getattr(config, 'sliding_window', None) or 15 | getattr(config, 'max_sequence_length', None) or 16 | getattr(config, 'max_seq_len', None) or 17 | 4096 # Default fallback 18 | ) 19 | 20 | # Some models (like Qwen) might have sliding_window disabled 21 | if hasattr(config, 'use_sliding_window') and not config.use_sliding_window: 22 | # If sliding window is disabled, use max_position_embeddings instead 23 | context_length = getattr(config, 'max_position_embeddings', context_length) 24 | 25 | 26 | # Cap to 32k 27 | return min(context_length, 32768) 28 | except Exception as e: 29 | logger.warning(f"Could not get context length from config for {model_name}: {e}") 30 | return 4096 # Default fallback 31 | 32 | 33 | if __name__ == "__main__": 34 | import argparse 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument("--model_name", type=str, required=True) 37 | args = parser.parse_args() 38 | print(get_context_length(args.model_name)) 39 | -------------------------------------------------------------------------------- /generate/TODO.md: -------------------------------------------------------------------------------- 1 | The tasks is to implement a simple repository for evaluation LLMs on IOI problems. 
2 | 
3 | 
4 | ## Used frameworks
5 | - You should use LiteLLM to call the LLM providers.
6 | - You should use asyncio to run the LLM calls asynchronously.
7 | 
8 | 
9 | ## Steps for evaluation:
10 | - We have a dataset of IOI 2024 problems here: https://huggingface.co/datasets/open-r1/ioi-2024 with the following format:
11 | ```
12 | {
13 |     'name': str,
14 |     'id': str,
15 |     'day': str,
16 |     'subtask': str,
17 |     'statement': str,
18 |     'score': str,
19 |     'time_limit': str,
20 | }
21 | ```
22 | - Each problem is split into multiple subtasks (they have the same id, but a different subtask column). You will iterate over the problems and get their subtasks.
23 | 
24 | - You will then take a problem and the subtasks and call a subtask sample function. The subtask sample function takes a problem and the last integer i, and returns the next subtask to solve.
25 | 
26 | - You will then create a prompt based on the subtasks and call the LLM with a random seed. Then generate the next subtask and repeat until you have 50 generations.
27 | 
28 | - This way you will get 50 generations for each problem, and save them as a dataset into org_id(arg)/model_id(arg). The resulting dataset will have the following format:
29 | ```
30 | {
31 |     'problem_id': str,
32 |     'subtask': str,
33 |     'prompt': str,
34 |     'generation': str,
35 |     'code': str,
36 |     'language': str,
37 |     'model_kwargs': dict,
38 |     'metadata': dict,
39 | }
40 | ```
41 | 
42 | Therefore, if you have 90 problems and 50 generations for each problem, the resulting dataset will have 4500 samples.
--------------------------------------------------------------------------------
/run_tests/piston/launch_single_piston.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=piston_worker
3 | #SBATCH --output=/fsx/guilherme/piston/worker-logs/%x-%j.out
4 | #SBATCH --error=/fsx/guilherme/piston/worker-logs/%x-%j.out # Redirect error logs to .out
5 | #SBATCH --cpus-per-task=2
6 | #SBATCH --mem-per-cpu=1950M
7 | #SBATCH --partition=hopper-cpu
8 | #SBATCH --time=48:00:00
9 | 
10 | # sometimes if a bunch of workers start at the same time pyxis dies
11 | sleep $(( RANDOM % 20 ))
12 | 
13 | # mounting the packages folder lets us not have to manually install the package on each instance
14 | # we use 63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a as the latest image requires isolate, which does not work on the HF science cluster (cgroups incompatibility)
15 | # feel free to try the latest image
16 | # the code you see below increases the very constrained piston default limits, and sets the repo url to the one hosting our IOI package
17 | srun --container-mounts=/fsx/guilherme/ioi2024/piston_files/packages:/piston/packages --container-image "ghcr.io#engineer-man/piston:sha256:63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a" \
18 |     bash -c "
19 | export PISTON_COMPILE_TIMEOUT=60000
20 | export PISTON_RUN_TIMEOUT=60000
21 | export PISTON_OUTPUT_MAX_SIZE=1000000000
22 | export PISTON_MAX_FILE_SIZE=1000000000
23 | export PISTON_DISABLE_NETWORKING=true
24 | export PISTON_REPO_URL=https://github.com/guipenedo/piston/releases/download/pkgs/index
25 | 
26 | sed -i '/app.use(body_parser.urlencoded/c\ app.use(body_parser.urlencoded({ extended: true, limit: \"512mb\" }));' src/index.js
27 | sed -i '/app.use(body_parser.json/c\ app.use(body_parser.json({ limit: \"512mb\" }));' src/index.js
28 | 
29 | # Start server in background
30 | node src
31 | "
32 | 
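Once a worker job launched by the script above is running, it can be sanity-checked over HTTP before pointing the evaluation at it. A hedged sketch using `requests` (already used in `generate/utils/open_router_usage.py`); the worker address is illustrative and the endpoints assume the standard Piston v2 API:

```python
import requests

# Illustrative address; real workers expose the port encoded in their piston-worker-<PORT> job name
worker = "http://ip-10-53-86-146:1234"

# List packages known to the worker; after first-time setup this should include cms_ioi 1.0.0
print(requests.get(f"{worker}/api/v2/packages", timeout=10).json())

# Install the IOI package on a fresh worker (same request as the curl example in run_tests/piston/README.md)
resp = requests.post(
    f"{worker}/api/v2/packages",
    json={"language": "cms_ioi", "version": "1.0.0"},
    timeout=600,
)
print(resp.status_code, resp.text)
```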
--------------------------------------------------------------------------------
/run_tests/utils.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from functools import lru_cache
3 | from itertools import islice
4 | 
5 | from datasets import load_dataset
6 | 
7 | 
8 | def add_includes(code: str, problem_id: str) -> str:
9 |     """
10 |     Fix common compilation errors for IOI problems.
11 |     """
12 |     if not code:
13 |         return code
14 |     # has most of the useful functions
15 |     code_header = '#include <bits/stdc++.h>\n'
16 |     # include the problem header
17 |     problem_header_include = f'#include "{problem_id}.h"'
18 |     if problem_header_include not in code:
19 |         code_header += problem_header_include + '\n'
20 |     # use namespace std since models forget std:: often
21 |     if "using namespace std;" not in code and "std::" not in code:
22 |         code_header += "\nusing namespace std;\n\n"
23 |     return code_header + code
24 | 
25 | @lru_cache
26 | def load_ioi_tests_for_year(year: int) -> dict[str, dict[str, tuple[str, str]]]:
27 |     """
28 |     Load IOI tests for a given year.
29 |     """
30 |     tests_dataset = load_dataset("open-r1/ioi-test-cases", name=f"{year}", split="train")
31 |     test_cases = defaultdict(dict)
32 |     for test_case in tests_dataset:
33 |         test_cases[test_case['problem_id']][test_case['test_name']] = test_case['test_input'], test_case['test_output']
34 |     return test_cases
35 | 
36 | def load_ioi_tests(year: int, problem_id: str) -> dict[str, tuple[str, str]]:
37 |     """
38 |     Load IOI tests for a given year and problem id.
39 |     """
40 |     return load_ioi_tests_for_year(year)[problem_id]
41 | 
42 | def batched(iterable, n):
43 |     "Batch data into lists of length n. The last batch may be shorter."
44 |     # batched('ABCDEFG', 3) --> ABC DEF G
45 |     if n < 1:
46 |         return iterable
47 |     it = iter(iterable)
48 |     while (batch := list(islice(it, n))):
49 |         yield batch
--------------------------------------------------------------------------------
/run_tests/custom_setup/compile:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | manager_files=() # Array to store manager filenames
4 | current_dir="$(pwd)"
5 | 
6 | # Checker compilation path
7 | checker_dir="$current_dir/checker"
8 | checker_src="$checker_dir/checker.cpp"
9 | 
10 | if [ -e "$checker_src" ]; then
11 |     echo "Compiling checker"
12 |     checker_exe="$checker_dir/checker"
13 |     g++ -x c++ -std=gnu++17 -O2 -o "$checker_exe" "$checker_src"
14 |     chmod +x "$checker_exe"
15 |     if [ $? -ne 0 ]; then
16 |         echo "Could not compile checker" >&2
17 |         exit 1
18 |     fi
19 |     echo "Compiled checker"
20 | else
21 |     echo "No checker found at $checker_src"
22 | fi
23 | 
24 | # Graders path
25 | graders_dir="$current_dir/graders"
26 | if [ ! -e "$graders_dir" ]; then
27 |     echo "Grader folder was not found" >&2
28 |     exit 1
29 | fi
30 | 
31 | # Find and compile manager if it exists
32 | manager_src="$graders_dir/manager.cpp"
33 | if [ -e "$manager_src" ]; then
34 |     echo "Compiling manager"
35 |     manager_exe="$graders_dir/manager"
36 |     g++ -x c++ -std=gnu++17 -O2 -o "$manager_exe" "$manager_src"
37 |     chmod +x "$manager_exe"
38 |     if [ $?
-ne 0 ]; then 39 | echo "Could not compile manager" >&2 40 | exit 1 41 | fi 42 | manager_files+=("manager") 43 | fi 44 | 45 | # Process other graders 46 | graders_list=($(ls "$graders_dir" | grep -v 'manager.cpp')) 47 | for grader_name in "${graders_list[@]}"; do 48 | manager_files+=("$grader_name") 49 | done 50 | 51 | # Extract problem name and compile necessary files 52 | problem_name='?' 53 | for file in "${manager_files[@]}"; do 54 | if [[ "$file" == *.h && "$file" != "testlib.h" ]]; then 55 | problem_name="${file%.h}" 56 | echo "Problem name: $problem_name" 57 | break 58 | fi 59 | done 60 | 61 | files_to_compile=("graders/$problem_name.cpp") 62 | [ -e graders/grader.cpp ] && files_to_compile+=("graders/grader.cpp") 63 | [ -e graders/stub.cpp ] && files_to_compile+=("graders/stub.cpp") 64 | 65 | g++ -DEVAL -std=gnu++17 -O2 -pipe -s -o graders/"$problem_name" "${files_to_compile[@]}" 66 | if [ $? -ne 0 ]; then 67 | echo "Failed to compile $problem_name" >&2 68 | exit 1 69 | fi 70 | chmod +x graders/"$problem_name" 71 | echo "Compiled $problem_name from ${files_to_compile[@]} successfully" 72 | 73 | echo "Manager files: ${manager_files[@]}" -------------------------------------------------------------------------------- /generate/utils/open_router_usage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | from dotenv import load_dotenv 4 | 5 | # Load environment variables from .env file 6 | load_dotenv() 7 | 8 | def get_openrouter_usage_and_credits(): 9 | # Get API key from environment variables 10 | api_key = os.getenv('OPENROUTER_API_KEY') 11 | 12 | if not api_key: 13 | raise ValueError("OPENROUTER_API_KEY not found in environment variables") 14 | 15 | # API endpoints 16 | usage_url = "https://openrouter.ai/api/v1/auth/key" 17 | credits_url = "https://openrouter.ai/api/v1/credits" 18 | 19 | # Headers required for OpenRouter API 20 | headers = { 21 | "Authorization": f"Bearer {api_key}", 22 | } 23 | 24 | try: 25 | # Fetch usage data 26 | usage_response = requests.get(usage_url, headers=headers) 27 | usage_response.raise_for_status() # Raise an exception for bad status codes 28 | usage_data = usage_response.json() 29 | 30 | # Fetch credits data 31 | credits_response = requests.get(credits_url, headers=headers) 32 | print(credits_response.json()) 33 | credits_response.raise_for_status() # Raise an exception for bad status codes 34 | credits_data = credits_response.json() 35 | 36 | return usage_data, credits_data 37 | except requests.exceptions.RequestException as e: 38 | print(f"Error fetching data: {e}") 39 | return None, None 40 | 41 | if __name__ == "__main__": 42 | usage_data, credits_data = get_openrouter_usage_and_credits() 43 | if usage_data: 44 | print("OpenRouter Usage Information:") 45 | data = usage_data.get('data', {}) 46 | print(f"Label: {data.get('label', 'N/A')}") 47 | print(f"Limit: {data.get('limit', 'N/A')}") 48 | print(f"Usage: {data.get('usage', 'N/A')}") 49 | print(f"Limit Remaining: {data.get('limit_remaining', 'N/A')}") 50 | print(f"Is Free Tier: {data.get('is_free_tier', 'N/A')}") 51 | rate_limit = data.get('rate_limit', {}) 52 | print(f"Rate Limit Requests: {rate_limit.get('requests', 'N/A')}") 53 | print(f"Rate Limit Interval: {rate_limit.get('interval', 'N/A')}") 54 | 55 | if credits_data: 56 | print("OpenRouter Credits Information:") 57 | data = credits_data.get('data', {}) 58 | print(f"Total Credits: {data.get('total_credits', 'N/A')}") 59 | print(f"Total Usage: {data.get('total_usage', 
'N/A')}") 60 | # Print any other relevant information from the response -------------------------------------------------------------------------------- /generate/slurm_standalone/serve_router.slurm: -------------------------------------------------------------------------------- 1 | # Credits to Anton Lozhkov 2 | #!/bin/bash 3 | #SBATCH --job-name=r1-router 4 | #SBATCH --partition=hopper-cpu 5 | #SBATCH --qos=high 6 | #SBATCH --nodes=1 7 | #SBATCH --cpus-per-task=8 8 | #SBATCH --mem-per-cpu=1875m 9 | #SBATCH --output=./logs/%x_%j_%n.out 10 | #SBATCH --error=./logs/%x_%j_%n.err 11 | #SBATCH --time=30-00:00:00 12 | #SBATCH --requeue 13 | 14 | set -exuo pipefail 15 | 16 | # Configuration variables 17 | ROUTER_PORT=39876 18 | SERVER_PORT=39877 # Must match the server script 19 | HEALTH_CHECK_TIMEOUT=10 # Timeout for health checks (seconds) 20 | UV_ENV=/fsx/hynek_kydlicek/projects/ioi-leaderboard/ioi-eval 21 | 22 | trap 'scontrol requeue ${SLURM_JOB_ID}; exit 15' SIGUSR1 23 | 24 | # Environment setup 25 | source ~/.bashrc 26 | source $UV_ENV/bin/activate 27 | 28 | # Start the router 29 | python -m sglang_router.launch_router \ 30 | --port "$ROUTER_PORT" \ 31 | --host 0.0.0.0 \ 32 | --worker-startup-timeout-secs 300 & 33 | 34 | ROUTER_PID=$! 35 | 36 | # Wait for router to start 37 | sleep 10 38 | if ! curl -s -o /dev/null "http://localhost:${ROUTER_PORT}/health"; then 39 | echo "Router failed to start" 40 | kill $ROUTER_PID 41 | exit 1 42 | fi 43 | 44 | echo "Router started successfully on port $ROUTER_PORT" 45 | echo "Scanning for running r1-server instances..." 46 | 47 | # Get a list of r1-server job IDs and register their servers 48 | while IFS= read -r jobid; do 49 | [[ -z "$jobid" ]] && continue 50 | 51 | # Use scontrol to get the nodelist for this job 52 | nodelist=$(scontrol show job "$jobid" | grep NodeList | tail -n1 | grep -oP 'NodeList=ip[^ ]+') 53 | [[ -z "$nodelist" ]] && continue 54 | nodelist=${nodelist#NodeList=} 55 | 56 | # Get first node from the nodelist 57 | first_node=$(scontrol show hostnames "$nodelist" | head -n1) 58 | [[ -z "$first_node" ]] && continue 59 | [[ "$first_node" == "(null)" ]] && continue 60 | 61 | # Convert hostname to IP format 62 | server_ip=$(echo "$first_node" | sed -E 's/ip-([0-9]+)-([0-9]+)-([0-9]+)-([0-9]+)/\1.\2.\3.\4/') 63 | server_url="http://${server_ip}:${SERVER_PORT}" 64 | 65 | echo "Found server node: $first_node (${server_ip})" 66 | 67 | # Check if server is responding and register it 68 | if timeout "$HEALTH_CHECK_TIMEOUT" curl -s -o /dev/null "http://${server_ip}:${SERVER_PORT}/health"; then 69 | if curl -s -X POST "http://localhost:${ROUTER_PORT}/add_worker?url=${server_url}"; then 70 | echo "Successfully registered $server_url" 71 | else 72 | echo "Failed to register $server_url" 73 | fi 74 | else 75 | echo "Server at $server_url not healthy yet, skipping registration" 76 | fi 77 | done < <(squeue -h -u "$USER" -n r1-server -t RUNNING -o "%i") 78 | 79 | # Just keep router running and healthy 80 | while true; do 81 | if ! 
curl -s -o /dev/null "http://localhost:${ROUTER_PORT}/health"; then 82 | echo "Error: Router health check failed" 83 | exit 1 84 | fi 85 | sleep 300 86 | done -------------------------------------------------------------------------------- /generate/slurm_standalone/debug.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ioi-eval-Qwen-Qwen2.5-7B-Instruct 3 | #SBATCH --partition=hopper-prod 4 | #SBATCH --qos=normal 5 | #SBATCH --nodes=1 6 | #SBATCH --gpus-per-node=8 7 | #SBATCH --exclusive 8 | #SBATCH --output=/fsx/hynek_kydlicek/logs/ioi-eval/ioi-eval-Qwen-Qwen2.5-7B-Instruct/%j-%x.out 9 | #SBATCH --error=/fsx/hynek_kydlicek/logs/ioi-eval/ioi-eval-Qwen-Qwen2.5-7B-Instruct/%j-%x.out 10 | #SBATCH --time=7-00:00:00 11 | #SBATCH --ntasks-per-node=1 12 | 13 | set -exuo pipefail 14 | 15 | SERVER_PORT=39877 16 | DIST_PORT=45000 17 | UV_ENV=/fsx/hynek_kydlicek/projects/ioi-leaderboard/test 18 | 19 | # random sleep (0-100) to prevent ddosing server 20 | sleep $((RANDOM % 100 + 1)) 21 | 22 | # Environment configuration 23 | export OUTLINES_CACHE_DIR=/scratch/serve_r1/ocache/ 24 | export TRITON_HOME=/scratch/serve_r1/triton/ 25 | export GLOO_SOCKET_IFNAME="enp71s0" 26 | export NCCL_SOCKET_IFNAME="enp71s0" 27 | 28 | # Evaluation script path 29 | EVAL_SCRIPT_PATH="/fsx/hynek_kydlicek/projects/ioi-leaderboard/evaluate.py" 30 | 31 | module load cuda/12.4 32 | source ~/.bashrc 33 | 34 | # Activate uv 35 | source $UV_ENV/bin/activate 36 | 37 | # FIRST_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1) 38 | # FIRST_NODE_IP=$(srun --nodes=1 -w "$FIRST_NODE" hostname --ip-address) 39 | FIRST_NODE_IP="$(hostname --ip-address)" 40 | 41 | # Launch servers synchronously across all nodes 42 | bash -c "python -m sglang.launch_server \ 43 | --model-path 'Qwen/Qwen2-0.5B' \ 44 | --tp 2 \ 45 | --dist-init-addr '$FIRST_NODE_IP:$DIST_PORT' \ 46 | --nnodes 1 \ 47 | --node-rank \$SLURM_PROCID \ 48 | --port '$SERVER_PORT' \ 49 | --host 0.0.0.0 \ 50 | --trust-remote-code \ 51 | --max-running-requests 100 \ 52 | --context-length 4096" & 53 | 54 | # Wait for server with timeout 55 | TIMEOUT=3600 # 1h, but model loading should take ~30min 56 | START_TIME=$(date +%s) 57 | echo "Waiting for SGLang server (http://$FIRST_NODE_IP:$SERVER_PORT)..." 58 | 59 | while true; do 60 | if curl -s -o /dev/null -w "%{http_code}" "http://$FIRST_NODE_IP:$SERVER_PORT/health" >/dev/null 2>&1; then 61 | echo "Server is ready at http://$FIRST_NODE_IP:$SERVER_PORT" 62 | break 63 | fi 64 | 65 | CURRENT_TIME=$(date +%s) 66 | if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then 67 | echo "Error: Server failed to start within $TIMEOUT seconds" 68 | exit 1 69 | fi 70 | 71 | echo "Still waiting... ($(($CURRENT_TIME - $START_TIME)) seconds elapsed)" 72 | sleep 60 73 | done 74 | 75 | echo "Checking available models..." 76 | curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/models" 77 | sleep 10 78 | 79 | echo "Executing sanity check..." 
80 | curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/completions" \
81 |     -H "Content-Type: application/json" \
82 |     -d '{
83 |         "model": "default",
84 |         "prompt": "hi, how are you?",
85 |         "max_tokens": 2048,
86 |         "temperature": 0.6
87 |     }'
88 | 
89 | python "$EVAL_SCRIPT_PATH" \
90 |     --model_id "sglang/Qwen/Qwen2-0.5B" \
91 |     --api_base "http://localhost:$SERVER_PORT/v1" \
92 |     --concurrency 100 \
93 |     --org_id=ioi-leaderboard --num_problems=6 --num_generations=1 --model_postfix=test --num_subtasks=2 --override
94 | 
95 | # Kill the server and exit
96 | pkill -f "python -m sglang.launch_server"
97 | exit 0
98 | 
--------------------------------------------------------------------------------
/run_tests/piston/README.md:
--------------------------------------------------------------------------------
1 | # Piston workers (Slurm)
2 | 
3 | We have built a [piston](https://github.com/engineer-man/piston) package to run IOI problems.
4 | 
5 | To launch a fleet of piston workers on a Slurm cluster, you can adapt the `/fsx` paths in `launch_piston_workers.sh` and `launch_single_piston.sh` and run:
6 | ```bash
7 | ./launch_piston_workers.sh (number of workers to launch)
8 | ```
9 | 
10 | This command will launch a Slurm job for each worker, which will be called `piston-worker-<port>`, where `<port>` is the port where the worker will be listening.
11 | 
12 | > [!TIP]
13 | > To accelerate evaluation, we recommend spinning up as many Piston workers as possible. For example, our evaluations are typically run with 1,500 workers.
14 | 
15 | ## First time setup
16 | 
17 | You will need to install the [IOI package](https://github.com/guipenedo/piston/tree/master/packages/cms_ioi/1.0.0) in the workers. To do so, run the following steps:
18 | 
19 | 1. Launch a single worker:
20 | ```bash
21 | ./launch_piston_workers.sh 1
22 | ```
23 | 
24 | 2. Assuming it's running on `ip-10-53-86-146:1234`, send the package install request:
25 | ```bash
26 | curl -X POST http://ip-10-53-86-146:1234/api/v2/packages -H "Content-Type: application/json" -d '{"language": "cms_ioi", "version": "1.0.0"}'
27 | ```
28 | 
29 | 3. You can now launch more workers; thanks to the shared mounted packages directory, they should already have the package installed.
30 | 
31 | To have the main script find the workers automatically, you can export the following environment variable:
32 | ```bash
33 | export PISTON_ENDPOINTS=slurm
34 | ```
35 | 
36 | You can also change `PISTON_MAX_REQUESTS_PER_ENDPOINT`, which limits how many simultaneous requests each worker will handle (1 by default). Keep in mind that this is a local, per-process limit: in distributed setups there is no global limit, so workers might sometimes be overwhelmed when several processes hit the same worker.
37 | 
38 | # Piston workers (local docker)
39 | This will launch a single worker in a docker container. Consider launching multiple workers for better scalability. Replace 2000 with the port you want to use.
40 | Make sure to change `/path/to/local/packages` to the path you want to persist for package installs.
41 | 42 | ```bash 43 | docker run -d \ 44 | --name piston_worker \ 45 | -v /path/to/local/packages:/piston/packages \ 46 | -e PORT=2000 \ 47 | -e PISTON_COMPILE_TIMEOUT=60000 \ 48 | -e PISTON_RUN_TIMEOUT=60000 \ 49 | -e PISTON_OUTPUT_MAX_SIZE=1000000000 \ 50 | -e PISTON_MAX_FILE_SIZE=1000000000 \ 51 | -e PISTON_DISABLE_NETWORKING=true \ 52 | -e PISTON_REPO_URL=https://github.com/guipenedo/piston/releases/download/pkgs/index \ 53 | -p 2000:2000 \ 54 | --entrypoint /bin/bash \ 55 | ghcr.io/engineer-man/piston@sha256:63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a \ 56 | -c "sed -i '/app.use(body_parser.urlencoded/c\ app.use(body_parser.urlencoded({ extended: true, limit: \"512mb\" }));' src/index.js && \ 57 | sed -i '/app.use(body_parser.json/c\ app.use(body_parser.json({ limit: \"512mb\" }));' src/index.js && \ 58 | node src" 59 | ``` 60 | 61 | Install the package: 62 | ```bash 63 | curl -X POST http://localhost:2000/api/v2/packages -H "Content-Type: application/json" -d '{"language": "cms_ioi", "version": "1.0.0"}' 64 | ``` 65 | 66 | Remember to set `PISTON_ENDPOINTS`: 67 | ```bash 68 | export PISTON_ENDPOINTS=http://localhost:2000/api/v2,http://localhost:2001/api/v2,http://localhost:2002/api/v2 69 | ``` 70 | -------------------------------------------------------------------------------- /generate/README.md: -------------------------------------------------------------------------------- 1 | # IOI Problem Evaluation 2 | 3 | This repository contains code for evaluating Language Models on IOI 2024 problems using LiteLLM. 4 | 5 | ## Installation 6 | 7 | 1. Clone the repository 8 | 2. Create a virtual environment with `uv` (to install `uv`, follow the [UV Installation Guide](https://docs.astral.sh/uv/getting-started/installation/)): 9 | ```bash 10 | uv venv ioi --python 3.11 && source ioi/bin/activate && uv pip install --upgrade pip 11 | ``` 12 | 3. Install dependencies: 13 | ```bash 14 | 15 | uv pip install torch~=2.5.1 --index-url https://download.pytorch.org/whl/cu124 16 | uv pip install sgl-kernel --force-reinstall --no-deps 17 | uv pip install "sglang[all]>=0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/ 18 | uv pip install -r requirements.txt 19 | ``` 20 | 21 | ## Environment Setup (In case you want to use remote models) 22 | 23 | 1. Copy the environment template: 24 | ```bash 25 | cp .env.template .env 26 | ``` 27 | 28 | 2. 
Edit `.env` and: 29 | - Uncomment the variables for the LLM providers you plan to use 30 | - Replace the placeholder values with your actual API keys 31 | - Optional: Configure proxy settings if needed 32 | 33 | Example `.env` for using OpenAI's GPT-4: 34 | ```bash 35 | OPENAI_API_KEY=your_actual_key_here 36 | OPENAI_ORGANIZATION=your_org_id # Optional 37 | ``` 38 | 39 | ## Usage 40 | 41 | ### Running with Remote Models 42 | 43 | Run the evaluation with remote models: 44 | ```bash 45 | python evaluate.py --org_id YOUR_ORG_ID --model_id YOUR_MODEL_ID [--num_generations 50] [--concurrency 5] 46 | ``` 47 | 48 | Command line arguments: 49 | - `--org_id`: Organization ID (required) 50 | - `--model_id`: Model ID in LiteLLM format (required) 51 | - `--api_base`: API base URL for the model (optional) 52 | - `--num_generations`: Number of generations per problem (default: 50) 53 | - `--num_retries`: Number of retries for failed API calls (default: 10) 54 | - `--concurrency`: Number of concurrent generations (default: 20) 55 | - `--num_problems`: Number of problems to evaluate (default: all) 56 | - `--num_subtasks`: Number of subtasks to evaluate per problem (default: 1, use -1 for all) 57 | - `--dry_run`: Run without making actual LLM calls 58 | - `--override`: Override existing results and start fresh 59 | - `--model_postfix`: Postfix for the model name 60 | - `--revision`: Revision to use for the model 61 | - `--timeout`: Timeout for the LLM call in seconds (default: 600) 62 | - `--use_requests`: Use requests instead of litellm 63 | - `--max_tokens`: Maximum number of tokens for generation 64 | 65 | ### Running with Locally Deployed Models (SGLang) 66 | 67 | For locally deployed models using SGLang, you can use the provided scripts: 68 | 69 | #### Using SLURM for Distributed Deployment 70 | 71 | For HPC environments with SLURM, use `run_ioi_slurm.py` to evaluate open models: 72 | 73 | ```bash 74 | python run_ioi_slurm.py --model "MODEL_PATH" --concurrency 30 --startup_delay 7200 --logs_dir "DIR_FOR_OUTPUT_LOGS" --slurm_dir "DIR_FOR_SLUR_SCRIPT" --uv_env "PATH_TO_UV_ENV" --eval_args "--org_id YOUR_ORG_ID" 75 | ``` 76 | 77 | ## Output 78 | 79 | The results will be saved in directory specified by `--logs_dir` with structure: 80 | 81 | ``` 82 | {org_id}/{revision}-{model_id}-{postfix}/ 83 | ``` 84 | 85 | The output includes: 86 | - Generated code solutions for each problem and subtask 87 | - Metrics on generation performance 88 | - Token usage statistics 89 | 90 | You can analyze the results using the saved data to evaluate the model's performance on competitive programming tasks. 
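For a quick look at what was produced, a results dataset can be inspected the same way `utils/check_failures.py` does. A minimal sketch; the dataset id below is illustrative and follows the `{org_id}/ioi-eval-sglang_<model>-<postfix>` naming used by that script:

```python
from datasets import load_dataset

# Illustrative id; real ids look like ioi-eval-sglang_<model with / replaced by _>[-postfix]
ds = load_dataset("ioi-leaderboard/ioi-eval-sglang_Qwen_Qwen2.5-7B-Instruct-test", split="train")

empty = sum(1 for gen in ds["generation"] if gen is None or gen == "")
print(f"{empty} empty generations out of {len(ds)} rows")
```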
-------------------------------------------------------------------------------- /run_tests/selection_simulator.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | 3 | 4 | def get_problem_scores(selected_dataset_samples) -> float: 5 | if not selected_dataset_samples: 6 | return 0.0 7 | 8 | subtask_scores = { 9 | subtask['subtask']: 0 for subtask in selected_dataset_samples[0]['all_subtasks_results'] 10 | } 11 | 12 | for submission in selected_dataset_samples: 13 | for subtask_result in submission['all_subtasks_results']: 14 | subtask_scores[subtask_result['subtask']] = max(subtask_scores[subtask_result['subtask']], subtask_result['weighted_score']) 15 | 16 | return sum(subtask_scores.values()) 17 | 18 | def get_submission_cot_length(submission) -> int: 19 | if "metadata" in submission: 20 | if 'output_tokens' in submission['metadata']['usage']: 21 | return submission['metadata']['usage']['output_tokens'] 22 | return submission['metadata']['usage']['completion_tokens'] 23 | # no token info. use pure length 24 | if 'generation' in submission: 25 | return len(submission['generation']) 26 | # crap... 27 | return 0 28 | 29 | def simulate_round_robin(all_submissions) -> float: 30 | if not all_submissions: 31 | return 0 32 | 33 | subtasks = [x['subtask'] for x in all_submissions[0]['all_subtasks_results']] 34 | submissions_by_target_subtask = {subtask: [] for subtask in subtasks} 35 | 36 | for submission in all_submissions: 37 | # if it failed to compile, skip 38 | if not submission["code"] or submission['all_subtasks_results'][0]['status'] == 'CE': 39 | continue 40 | submissions_by_target_subtask[submission['target_subtask']].append(submission) 41 | 42 | for target_subtask in submissions_by_target_subtask: 43 | # we only have access to the first subtask (examples/public test) 44 | submissions_by_target_subtask[target_subtask] = deque( 45 | sorted(submissions_by_target_subtask[target_subtask], 46 | key=lambda x: (x['all_subtasks_results'][0]['score'], get_submission_cot_length(x)), 47 | reverse=True) 48 | ) 49 | 50 | exhausted_subtasks = set([subtask for subtask in submissions_by_target_subtask if len(submissions_by_target_subtask[subtask]) == 0]) 51 | solved_subtasks = set([subtasks[0]]) # we don't explicitly care about solving the examples 52 | 53 | # only up to 50 submissions 54 | selected_submissions = [] 55 | 56 | subtask_i = len(subtasks) - 1 57 | 58 | while len(selected_submissions) < 50 and len(exhausted_subtasks.union(solved_subtasks)) < len(subtasks): 59 | subtask = subtasks[subtask_i] 60 | if subtask not in solved_subtasks and subtask not in exhausted_subtasks: 61 | sol = submissions_by_target_subtask[subtask].popleft() 62 | selected_submissions.append(sol) 63 | for subtask_to_check in range(len(sol['all_subtasks_results'])): 64 | if sol['all_subtasks_results'][subtask_to_check]['score'] == 1.0: 65 | solved_subtasks.add(subtask_to_check) 66 | if len(submissions_by_target_subtask[subtask]) == 0: 67 | exhausted_subtasks.add(subtask) 68 | subtask_i = (subtask_i - 1) % len(subtasks) 69 | 70 | remaining_submissions = deque(sorted( 71 | [submission for subtask_submissions in submissions_by_target_subtask.values() for submission in subtask_submissions], 72 | key=lambda x: (x['all_subtasks_results'][0]['score'], get_submission_cot_length(x), subtasks.index(x['target_subtask']) if x['target_subtask'] in subtasks else 0), reverse=True) 73 | ) 74 | while len(selected_submissions) < 50 and remaining_submissions: 75 | 
selected_submissions.append(remaining_submissions.popleft()) 76 | 77 | return selected_submissions 78 | -------------------------------------------------------------------------------- /generate/slurm_standalone/serve_r1.slurm: -------------------------------------------------------------------------------- 1 | # Credits to Anton Lozhkov 2 | #!/bin/bash 3 | #SBATCH --job-name=r1-server 4 | #SBATCH --partition=hopper-prod 5 | #SBATCH --qos=normal 6 | #SBATCH --nodes=2 7 | #SBATCH --gpus-per-node=8 8 | #SBATCH --exclusive 9 | #SBATCH --output=./logs/%x_%j_%n.out 10 | #SBATCH --error=./logs/%x_%j_%n.err 11 | #SBATCH --time=7-00:00:00 12 | #SBATCH --ntasks-per-node=1 13 | 14 | set -exuo pipefail 15 | 16 | MODEL_PATH="deepseek-ai/DeepSeek-R1" 17 | UV_ENV=/fsx/hynek_kydlicek/projects/ioi-leaderboard/ioi-eval 18 | ROUTER_ADDRESS="" 19 | SERVER_PORT=39877 20 | DIST_PORT=45000 21 | 22 | # TODO: Adjust these variables to your cluster configuration 23 | export OUTLINES_CACHE_DIR=/scratch/serve_r1/ocache/ 24 | export TRITON_HOME=/scratch/serve_r1/triton/ 25 | export GLOO_SOCKET_IFNAME="enp71s0" 26 | export NCCL_SOCKET_IFNAME="enp71s0" 27 | 28 | while getopts "m:e:r:h" opt; do 29 | case $opt in 30 | m) MODEL_PATH="$OPTARG" ;; 31 | r) ROUTER_ADDRESS="$OPTARG" ;; 32 | h|?) echo "Usage: sbatch $0 [-m MODEL_PATH] [-r ROUTER_ADDRESS]"; exit 1 ;; 33 | esac 34 | done 35 | 36 | # TODO: Environment setup, adjust to your cluster configuration 37 | module load cuda/12.4 38 | source ~/.bashrc 39 | source $UV_ENV/bin/activate 40 | 41 | FIRST_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1) 42 | FIRST_NODE_IP=$(srun --nodes=1 --ntasks=1 -w "$FIRST_NODE" hostname --ip-address) 43 | 44 | # Launch servers synchronously across all nodes 45 | # (--max-running-requests=56 is rough estimate to avoid too many evicted/preempted 16k-long requests) 46 | srun --nodes=2 --ntasks=2 --ntasks-per-node=1 \ 47 | bash -c "python -m sglang.launch_server \ 48 | --model-path '$MODEL_PATH' \ 49 | --tp 16 \ 50 | --dist-init-addr '$FIRST_NODE_IP:$DIST_PORT' \ 51 | --nnodes 2 \ 52 | --node-rank \$SLURM_PROCID \ 53 | --port '$SERVER_PORT' \ 54 | --host 0.0.0.0 \ 55 | --trust-remote-code \ 56 | --max-running-requests 24 \ 57 | --context-length 65536" & 58 | 59 | # Wait for server with timeout 60 | TIMEOUT=3600 # 1h, but model loading should take ~30min 61 | START_TIME=$(date +%s) 62 | echo "Waiting for SGLang server (http://$FIRST_NODE_IP:$SERVER_PORT)..." 63 | 64 | while true; do 65 | if curl -s -o /dev/null -w "%{http_code}" "http://$FIRST_NODE_IP:$SERVER_PORT/health" >/dev/null 2>&1; then 66 | echo "Server is ready at http://$FIRST_NODE_IP:$SERVER_PORT" 67 | break 68 | fi 69 | 70 | CURRENT_TIME=$(date +%s) 71 | if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then 72 | echo "Error: Server failed to start within $TIMEOUT seconds" 73 | exit 1 74 | fi 75 | 76 | echo "Still waiting... ($(($CURRENT_TIME - $START_TIME)) seconds elapsed)" 77 | sleep 60 78 | done 79 | 80 | # Register with router only if address was provided 81 | if [ -n "$ROUTER_ADDRESS" ]; then 82 | echo "Registering with router at $ROUTER_ADDRESS..." 83 | curl -X POST "http://$ROUTER_ADDRESS/add_worker?url=http://$FIRST_NODE_IP:$SERVER_PORT" || true 84 | sleep 10 85 | fi 86 | 87 | echo "Checking available models..." 88 | curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/models" 89 | sleep 10 90 | 91 | echo "Executing sanity check..." 
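# NOTE: the raw /v1/completions request below hand-builds DeepSeek-R1's chat template tokens
# (<|begin▁of▁sentence|><|User|>...<|Assistant|>) because the plain completions endpoint does not
# apply a chat template, unlike /v1/chat/completions.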
92 | curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/completions" \ 93 | -H "Content-Type: application/json" \ 94 | -d "{ 95 | \"model\": \"default\", 96 | \"prompt\": \"<|begin▁of▁sentence|><|User|>hi, how are you?<|Assistant|>\", 97 | \"max_tokens\": 2048, 98 | \"temperature\": 0.6 99 | }" 100 | 101 | # Keep the job running with health checks 102 | while true; do 103 | if ! curl -s -o /dev/null "http://$FIRST_NODE_IP:$SERVER_PORT/health"; then 104 | echo "Error: Server health check failed" 105 | exit 1 106 | fi 107 | sleep 300 108 | done -------------------------------------------------------------------------------- /generate/utils/check_failures.py: -------------------------------------------------------------------------------- 1 | from huggingface_hub import HfApi 2 | import argparse 3 | from datasets import load_dataset 4 | 5 | # Parse command line arguments 6 | parser = argparse.ArgumentParser(description="Check which models failed to create datasets on HuggingFace Hub") 7 | parser.add_argument("--model_postfix", default="new-prompt", help="Postfix for the model name") 8 | parser.add_argument("--org_id", default="ioi-leaderboard", help="Organization ID") 9 | parser.add_argument("--check_generations", action="store_true", help="Check that all generations are not null or empty") 10 | args = parser.parse_args() 11 | 12 | # Initialize the Hugging Face API 13 | api = HfApi() 14 | 15 | # Organization ID where datasets are stored 16 | org_id = args.org_id 17 | 18 | # Read models from the file 19 | with open("models_to_run.txt", "r") as f: 20 | models = [line.strip() for line in f if line.strip()] 21 | 22 | # Get all datasets in the organization 23 | try: 24 | all_datasets = api.list_datasets(author=org_id) 25 | dataset_names = [dataset.id for dataset in all_datasets] 26 | except Exception as e: 27 | print(f"Error fetching datasets: {e}") 28 | dataset_names = [] 29 | 30 | # Check which models have datasets 31 | successful_models = [] 32 | failed_models = [] 33 | incomplete_models = [] 34 | 35 | for model in models: 36 | # Format the model name the same way as in the evaluator 37 | model_name = f"ioi-eval-sglang_{model.replace('/', '_')}" 38 | 39 | # Check if there's a model with the specified postfix 40 | if args.model_postfix: 41 | model_name_with_postfix = f"{model_name}-{args.model_postfix}" 42 | else: 43 | model_name_with_postfix = model_name 44 | 45 | # Full dataset path 46 | full_dataset_path = f"{org_id}/{model_name_with_postfix}" 47 | 48 | if full_dataset_path in dataset_names: 49 | # Dataset exists 50 | if args.check_generations: 51 | try: 52 | # Load the dataset to check generations 53 | dataset = load_dataset(full_dataset_path, split="train") 54 | 55 | # Check if any generations are null or empty 56 | null_or_empty = sum(1 for gen in dataset["generation"] if gen is None or gen == "") 57 | 58 | if null_or_empty > 0: 59 | print(f"Model {model} has {null_or_empty} null or empty generations out of {len(dataset)}") 60 | incomplete_models.append(model) 61 | else: 62 | successful_models.append(model) 63 | except Exception as e: 64 | print(f"Error checking generations for {model}: {e}") 65 | failed_models.append(model) 66 | else: 67 | successful_models.append(model) 68 | else: 69 | failed_models.append(model) 70 | 71 | # Print results 72 | print(f"Total models: {len(models)}") 73 | print(f"Successful models: {len(successful_models)}") 74 | print(f"Failed models: {len(failed_models)}") 75 | if args.check_generations: 76 | print(f"Models with incomplete generations: {len(incomplete_models)}") 
77 | 78 | print("\nSuccessful models:") 79 | for model in successful_models: 80 | print(f" - {model}") 81 | 82 | print("\nFailed models:") 83 | for model in failed_models: 84 | print(f" - {model}") 85 | 86 | if args.check_generations and incomplete_models: 87 | print("\nModels with incomplete generations:") 88 | for model in incomplete_models: 89 | print(f" - {model}") 90 | 91 | # Create a new file with failed models 92 | if failed_models: 93 | failed_file = f"failed_models{'-' + args.model_postfix if args.model_postfix else ''}.txt" 94 | with open(failed_file, "w") as f: 95 | for model in failed_models: 96 | f.write(f"{model}\n") 97 | print(f"\nFailed models have been written to {failed_file}") 98 | 99 | # Create a new file with incomplete models 100 | if args.check_generations and incomplete_models: 101 | incomplete_file = f"incomplete_models{'-' + args.model_postfix if args.model_postfix else ''}.txt" 102 | with open(incomplete_file, "w") as f: 103 | for model in incomplete_models: 104 | f.write(f"{model}\n") 105 | print(f"\nIncomplete models have been written to {incomplete_file}") -------------------------------------------------------------------------------- /run_tests/README.md: -------------------------------------------------------------------------------- 1 | # IOI: Running tests 2 | 3 | ## Piston 4 | To evaluate, we rely on Piston (https://github.com/engineer-man/piston) to compile and run the code in a secure and fast sandbox environment. See the [piston](piston/README.md) directory for more details. 5 | To run the evaluation code below, spin up Piston workers and copy the `.env.template` file to `.env` and set the piston variables. 6 | 7 | ## Running the pipeline 8 | Install dependencies: 9 | ```bash 10 | pip install -r requirements.txt 11 | ``` 12 | 13 | Once you have piston setup and running, or have made the necessary changes to the `run_submission` function in [scoring.py](scoring.py) (see below for more info), you can run the pipeline with the following command: 14 | 15 | ```bash 16 | python tests_runner.py [-h] [--local_results_path LOCAL_RESULTS_PATH] [--id_column ID_COLUMN] [--max_concurrent_requests MAX_CONCURRENT_REQUESTS] [--test_batch_size TEST_BATCH_SIZE] [--dry_run] [--override] 17 | [--timeout TIMEOUT] [--add_includes] [--add_messages_column] 18 | datasets_to_evaluate results_dataset_name 19 | ``` 20 | ### Arguments 21 | - `datasets_to_evaluate`: The datasets to evaluate (HF hub ids), separated by commas. Also accepts wildcards on the dataset part, such as `open-r1/models-*-fix` 22 | - `results_dataset_name`: The name of the dataset to save the results to. 23 | - `local_results_path`: Path for local results cache, so that you can restart if the script dies. 24 | - `id_column`: The column name of the unique identifier for each submission. `uuid` by default 25 | - `max_concurrent_requests`: The maximum number of concurrent requests to make to the Piston API. Should be roughly the number of piston workers you have. 26 | - `test_batch_size`: Batch size for testing submissions. Will test this number at a time and then check if any scored 0.0. If so, the remaining tests are skipped. Increase if you have many more workers than submissions. 27 | - `dry_run`: If true, the script will not make any actual API calls to Piston. 28 | - `override`: If true, the script will override existing results in the results dataset. 29 | - `timeout`: Timeout for the Piston API calls. 
30 | - `add_includes`: If true, the script will attempt to fix some basic missing #include directives in the code. 31 | - `add_messages_column`: If true, the script will add the `messages` column to the results dataset formatted for SFT. 32 | 33 | ### Examples 34 | 35 | Running the pipeline on the official contest solutions with 1500 workers: 36 | 37 | ```bash 38 | python tests_runner.py open-r1/ioi-sample-solutions my_org/ioi-sample-solutions-results --id_column label --max_concurrent_requests 1500 39 | ``` 40 | Make sure to compare your results (look at the reports for each problem) to the official contest solutions in the [open-r1/ioi-sample-solutions](https://huggingface.co/datasets/open-r1/ioi-sample-solutions) dataset. 41 | 42 | 43 | Running on a dataset produced by evaluate.py: 44 | 45 | ```bash 46 | python tests_runner.py my_org/my-dataset my_org/my-dataset-results --max_concurrent_requests 1500 47 | ``` 48 | 49 | Besides the actual results dataset, the script will also generate and upload markdown reports to the dataset's repo under `reports/my-dataset/README.md` 50 | 51 | 52 | 53 | ## Evaluating without piston 54 | To evaluate in a different sandbox environment, you should change the `run_submission` function in [scoring.py](scoring.py). It should mount/create the following files inside the sandbox: 55 | - `graders/.cpp`: The submission code. 56 | - `input.txt`: The input for the problem. 57 | - `correct_output.txt`: The expected output for the problem. 58 | - all the files in `grader_files` 59 | Plus the following 2 very important files: 60 | - [`compile`](custom_setup/compile), the command to compile the submission code with all the grader/checker/manager files. 61 | - [`run`](custom_setup/run), the command to orchestrate the execution of the submission code, managers, time limits, output checking, etc. 62 | 63 | As `run` handles time limits, if you require a time limit for a sandbox, you can set a hard limit to 2 or 3 additional seconds from the problem's time limit. 64 | 65 | You should return a tuple of `(score, feedback)` from the function, where `score` is the execution's stdout, and `feedback` its stderr, and need to handle some special failure scenarios such as (piston example): 66 | 67 | ```python 68 | 69 | if 'compile' in response and response['compile']['code'] != 0: 70 | return "0", "Compilation error exit code " + str(response['compile']['code']) + "\n" + response['compile']['stderr'] 71 | 72 | if response['run']['code'] == 1 and "MemoryError" in response['run']['stderr']: 73 | return "0", "Memory limit exceeded" 74 | 75 | # successful result 76 | if response['run']['stdout']: 77 | return response['run']['stdout'], response['run']['stderr'] 78 | 79 | # hard time limit exceeded 80 | if response['run']['signal'] == 'SIGKILL': 81 | return "0", "Time limit exceeded" 82 | 83 | return '0', 'Unknown error' 84 | ``` 85 | -------------------------------------------------------------------------------- /run_tests/custom_setup/run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # disable stack limit so you don't get RE with recursion 3 | ulimit -s unlimited 4 | # some problems have 10MB+ input/output files in their test cases and you might get RE. uncomment if needed 5 | # ulimit -f 2097152 6 | 7 | # Check if grader_config.json exists 8 | if [ ! -f "graders/grader_config.json" ]; then 9 | echo "Error: graders/grader_config.json not found" >&2 10 | echo "Current directory contents:" >&2 11 | find . 
-type f -o -type d | sed -e 's/[^-][^\/]*\// |/g' -e 's/|\([^ ]\)/|-\1/' >&2 12 | exit 1 13 | fi 14 | 15 | # Read task type, code, and time limit from grader_config.json using grep and sed 16 | TASK_TYPE=$(grep -o '"task_type":[^,}]*' graders/grader_config.json | sed 's/"task_type":\s*"\([^"]*\)"/\1/') 17 | TASK_NAME=$(grep -o '"code":[^,}]*' graders/grader_config.json | sed 's/"code":\s*"\([^"]*\)"/\1/') 18 | TIME_LIMIT=$(grep -o '"time_limit":[^,}]*' graders/grader_config.json | sed 's/"time_limit":\s*\([^,}]*\)/\1/') 19 | MEMORY_LIMIT=$(grep -o '"memory_limit":[^,}]*' graders/grader_config.json | sed 's/"memory_limit":\s*\([^,}]*\)/\1/') 20 | TASK_EXECUTABLE="graders/$TASK_NAME" 21 | 22 | # Set memory limit in KB (convert from bytes) 23 | MEMORY_LIMIT_KB=0 24 | if [ -n "$MEMORY_LIMIT" ]; then 25 | MEMORY_LIMIT_KB=$((MEMORY_LIMIT / 1024)) 26 | # Set the memory limit for the entire script and all child processes 27 | ulimit -v $MEMORY_LIMIT_KB 28 | fi 29 | 30 | # "Securely" handle the correct output file 31 | CORRECT_OUTPUT="" 32 | if [ -f "correct_output.txt" ]; then 33 | # Read the content and immediately remove the file 34 | CORRECT_OUTPUT=$(cat correct_output.txt) 35 | rm -f correct_output.txt 36 | fi 37 | 38 | # Create a temporary file for solution output 39 | SOLUTION_OUTPUT=$(mktemp) 40 | 41 | # Global variables for process tracking 42 | declare -a ALL_PIDS 43 | declare -a FIFO_DIRS 44 | 45 | # Define cleanup function - simplified assuming timeout exists 46 | function cleanup { 47 | # Kill all tracked processes silently 48 | exec 2>/dev/null 49 | for pid in "${ALL_PIDS[@]:-}"; do 50 | kill -9 "$pid" 2>/dev/null || true 51 | done 52 | 53 | # Clean up FIFO directories 54 | for dir in "${FIFO_DIRS[@]:-}"; do 55 | [ -d "$dir" ] && rm -rf "$dir" 56 | done 57 | 58 | # Clean up temporary files 59 | rm -f "$SOLUTION_OUTPUT" || true 60 | exec 2>&2 61 | } 62 | 63 | # Set up signal handling 64 | trap cleanup EXIT INT TERM 65 | 66 | # Function to handle exit codes consistently across task types 67 | function handle_exit_code { 68 | local exit_code=$1 69 | 70 | # Check for known timeout exit codes: 71 | # - 124: standard timeout exit code 72 | # - 137: SIGKILL (128+9), used for hard timeouts 73 | # - 143: SIGTERM (128+15), can also be used for timeouts 74 | if [ $exit_code -eq 124 ] || [ $exit_code -eq 137 ] || [ $exit_code -eq 143 ]; then 75 | echo "0" 76 | echo "Time limit exceeded (${TIME_LIMIT}s)" >&2 77 | return 124 78 | # All other non-zero exit codes should be treated as runtime errors 79 | elif [ $exit_code -ne 0 ]; then 80 | echo "0" 81 | echo "Runtime error with exit code $exit_code" >&2 82 | return $exit_code 83 | fi 84 | 85 | # Success case - return 0 86 | return 0 87 | } 88 | 89 | # Function to run a command with timeout (simplified assuming timeout exists) 90 | function run_with_timeout { 91 | local soft_limit=$1; shift 92 | local command_to_run="$@" 93 | 94 | timeout --preserve-status "$soft_limit" "$@" 95 | return $? 96 | } 97 | 98 | case "$TASK_TYPE" in 99 | "Batch") 100 | # Simple batch execution with timeout 101 | run_with_timeout "$TIME_LIMIT" ./$TASK_EXECUTABLE < input.txt > "$SOLUTION_OUTPUT" 102 | exit_code=$? 103 | 104 | # Handle non-zero exit codes 105 | handle_exit_code $exit_code 106 | if [ $? -ne 0 ]; then 107 | exit $? 
108 | fi 109 | 110 | # Check the output if we have a correct output 111 | if [ -n "$CORRECT_OUTPUT" ]; then 112 | # Restore the correct output file 113 | echo "$CORRECT_OUTPUT" > correct_output.txt 114 | 115 | # Check if there's a custom checker 116 | if [ -f "checker/checker" ]; then 117 | # Let the checker handle everything 118 | ./checker/checker input.txt correct_output.txt "$SOLUTION_OUTPUT" 119 | exit $? 120 | else 121 | # Simple diff-based checking 122 | if diff -bq <(echo "$CORRECT_OUTPUT") "$SOLUTION_OUTPUT" >/dev/null; then 123 | echo "1" 124 | echo "Output is correct (diff)" >&2 125 | else 126 | echo "0" 127 | echo "Output isn't correct (diff)" >&2 128 | exit 0 129 | fi 130 | fi 131 | else 132 | # If no correct output was provided, just output the solution's output 133 | cat "$SOLUTION_OUTPUT" 134 | fi 135 | ;; 136 | 137 | "Communication") 138 | # Read Communication-specific parameters 139 | NUM_PROCESSES=$(grep -o '"task_type_parameters_Communication_num_processes":[^,}]*' graders/grader_config.json | sed 's/.*:\s*\([0-9]*\)/\1/' || true) 140 | if [ -z "$NUM_PROCESSES" ]; then 141 | NUM_PROCESSES=1 142 | fi 143 | USER_IO=$(grep -o '"task_type_parameters_Communication_user_io":[^,}]*' graders/grader_config.json | sed 's/.*:\s*"\([^"]*\)"/\1/' || echo "std_io") 144 | 145 | # Read custom manager arguments if they exist 146 | MANAGER_CUSTOM_ARGS="" 147 | if grep -q '"task_type_parameters_Communication_manager_args"' graders/grader_config.json; then 148 | MANAGER_CUSTOM_ARGS=$(grep -o '"task_type_parameters_Communication_manager_args":[^,}]*' graders/grader_config.json | sed 's/.*:\s*"\([^"]*\)"/\1/') 149 | fi 150 | 151 | # Create temporary directories for FIFOs 152 | for i in $(seq 0 $((NUM_PROCESSES-1))); do 153 | FIFO_DIRS[$i]=$(mktemp -d) 154 | 155 | # Create FIFOs for this process 156 | mkfifo "${FIFO_DIRS[$i]}/u${i}_to_m" 157 | mkfifo "${FIFO_DIRS[$i]}/m_to_u${i}" 158 | chmod 755 "${FIFO_DIRS[$i]}" 159 | chmod 666 "${FIFO_DIRS[$i]}/u${i}_to_m" "${FIFO_DIRS[$i]}/m_to_u${i}" 160 | done 161 | 162 | # Prepare manager arguments 163 | MANAGER_ARGS="" 164 | for i in $(seq 0 $((NUM_PROCESSES-1))); do 165 | MANAGER_ARGS="$MANAGER_ARGS ${FIFO_DIRS[$i]}/u${i}_to_m ${FIFO_DIRS[$i]}/m_to_u${i}" 166 | done 167 | 168 | # Add custom manager arguments if specified 169 | if [ -n "$MANAGER_CUSTOM_ARGS" ]; then 170 | MANAGER_ARGS="$MANAGER_ARGS $MANAGER_CUSTOM_ARGS" 171 | fi 172 | 173 | # Start all user processes first 174 | for i in $(seq 0 $((NUM_PROCESSES-1))); do 175 | if [ "$USER_IO" = "fifo_io" ]; then 176 | # Pass FIFOs as arguments 177 | ARGS="${FIFO_DIRS[$i]}/m_to_u${i} ${FIFO_DIRS[$i]}/u${i}_to_m" 178 | if [ "$NUM_PROCESSES" -ne 1 ]; then 179 | ARGS="$ARGS $i" 180 | fi 181 | ./$TASK_EXECUTABLE $ARGS & 182 | ALL_PIDS+=($!) 183 | else 184 | # Use stdin/stdout redirection 185 | if [ "$NUM_PROCESSES" -ne 1 ]; then 186 | ./$TASK_EXECUTABLE "$i" < "${FIFO_DIRS[$i]}/m_to_u${i}" > "${FIFO_DIRS[$i]}/u${i}_to_m" 2>/dev/null & 187 | ALL_PIDS+=($!) 188 | else 189 | ./$TASK_EXECUTABLE < "${FIFO_DIRS[$i]}/m_to_u${i}" > "${FIFO_DIRS[$i]}/u${i}_to_m" 2>/dev/null & 190 | ALL_PIDS+=($!) 191 | fi 192 | fi 193 | done 194 | 195 | # Run the manager with timeout using direct pipe from input.txt 196 | run_with_timeout "$TIME_LIMIT" ./graders/manager $MANAGER_ARGS < input.txt > "$SOLUTION_OUTPUT" 197 | 198 | exit_code=$? 199 | 200 | # Handle non-zero exit codes 201 | handle_exit_code $exit_code 202 | if [ $? -ne 0 ]; then 203 | exit $? 
204 | fi 205 | 206 | # Check the output if we have a correct output AND there's a checker (otherwise we assume the manager handles everything) 207 | if [ -n "$CORRECT_OUTPUT" ] && [ -f "checker/checker" ]; then 208 | # Restore the correct output file 209 | echo "$CORRECT_OUTPUT" > correct_output.txt 210 | 211 | # Let the checker handle it 212 | ./checker/checker input.txt correct_output.txt "$SOLUTION_OUTPUT" 213 | exit $? 214 | else 215 | # we assume the manager handles it 216 | cat "$SOLUTION_OUTPUT" 217 | fi 218 | ;; 219 | 220 | *) 221 | echo "0" 222 | echo "Unsupported task type \"$TASK_TYPE\"" >&2 223 | exit 1 224 | ;; 225 | esac -------------------------------------------------------------------------------- /generate/run_ioi_slurm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from math import ceil, gcd 3 | import os 4 | import argparse 5 | import subprocess 6 | from pathlib import Path 7 | from transformers import AutoConfig 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | DEFAULT_TP = 16 13 | MAX_CTX_LENGTH = None 14 | 15 | MODEL_CONFIGS = {} 16 | 17 | LOGS_DIR = "/fsx/hynek_kydlicek/logs/ioi-eval" 18 | SLURM_SCRIPT_DIR = "/fsx/hynek_kydlicek/slurm/ioi-eval/output" 19 | UV_ENV = "/fsx/hynek_kydlicek/projects/ioi-leaderboard/ioi-eval" 20 | 21 | 22 | def get_concurrency(model_name: str, concurrency: int) -> int: 23 | """Get concurrency from model config.""" 24 | return MODEL_CONFIGS.get(model_name, {}).get("concurrency", concurrency) 25 | 26 | 27 | def get_tp(model_name: str, revision: str) -> int: 28 | default_tp = MODEL_CONFIGS.get(model_name, {}).get("tp", DEFAULT_TP) 29 | try: 30 | config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True) 31 | 32 | # Check num_attention_heads and num_key_value_heads, and ensure that both are divisable by tp 33 | if hasattr(config, 'num_attention_heads'): 34 | if config.num_attention_heads % default_tp != 0: 35 | # Adjust tp to be the highest number that divides both num_attention_heads 36 | new_tp = gcd(config.num_attention_heads, default_tp) 37 | print(f"Adjusted tp for {model_name} from {default_tp} to {new_tp}") 38 | return new_tp 39 | return default_tp 40 | except Exception as e: 41 | print(f"Could not get tp from config for {model_name}: {e}") 42 | return default_tp 43 | 44 | def get_context_length(model_name: str, revision: str) -> int: 45 | """Get maximum context length from model config.""" 46 | try: 47 | config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True) 48 | # Check various possible context length attributes 49 | context_length = ( 50 | getattr(config, 'max_position_embeddings', None) or 51 | getattr(config, 'sliding_window', None) or 52 | getattr(config, 'max_sequence_length', None) or 53 | getattr(config, 'max_seq_len', None) or 54 | 4096 # Default fallback 55 | ) 56 | 57 | # Some models (like Qwen) might have sliding_window disabled 58 | if hasattr(config, 'use_sliding_window') and not config.use_sliding_window: 59 | # If sliding window is disabled, use max_position_embeddings instead 60 | context_length = getattr(config, 'max_position_embeddings', context_length) 61 | 62 | 63 | # cap to 64k 64 | if MAX_CTX_LENGTH is not None: 65 | context_length = min(context_length, MAX_CTX_LENGTH) 66 | return context_length 67 | except Exception as e: 68 | logger.warning(f"Could not get context length from config for {model_name}: {e}") 69 | return 4096 # Default fallback 70 | 71 | 
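# Illustrative note (not part of the original script): the helpers above choose the
# tensor-parallel degree and context length heuristically. For a hypothetical model
# whose attention-head count does not divide DEFAULT_TP, the gcd fallback shrinks tp:
#
#   from math import gcd
#   num_attention_heads = 28          # hypothetical value read from AutoConfig
#   tp = gcd(num_attention_heads, 16) # DEFAULT_TP = 16  ->  tp = 4
#
# The context length falls back through max_position_embeddings, sliding_window,
# max_sequence_length and max_seq_len before defaulting to 4096, and is optionally
# capped by MAX_CTX_LENGTH.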
def parse_args(): 72 | parser = argparse.ArgumentParser(description="Run IOI evaluation on a model using Slurm") 73 | parser.add_argument("--model", type=str, required=True, 74 | help="Model to evaluate (predefined model name)") 75 | parser.add_argument("--eval_args", type=str, required=True, 76 | help="Arguments to pass to the evaluation script") 77 | parser.add_argument("--time", type=str, default="7-00:00:00", 78 | help="Job time limit (default: 7 days)") 79 | parser.add_argument("--partition", type=str, default="hopper-prod", 80 | help="Slurm partition") 81 | parser.add_argument("--qos", type=str, default="normal", 82 | help="Slurm QOS") 83 | parser.add_argument("--startup_delay", type=int, default=3600, 84 | help="Delay in seconds before starting the server") 85 | parser.add_argument("--dry_run", action="store_true", 86 | help="Generate script but don't submit job") 87 | 88 | parser.add_argument("--revision", type=str, default=None, help="Revision to use for the model") 89 | parser.add_argument("--concurrency", type=int, default=100, 90 | help="Number of concurrent requests to the server") 91 | 92 | parser.add_argument("--uv_env", type=str, default=None, help="Path to the uv env") 93 | parser.add_argument("--logs_dir", type=str, default=None) 94 | parser.add_argument("--slurm_dir", type=str, default=None) 95 | 96 | return parser.parse_args() 97 | 98 | def create_slurm_script(args, logs_dir): 99 | # Override with custom values if provided 100 | concurrency = get_concurrency(args.model, args.concurrency) 101 | tp = get_tp(args.model, args.revision) 102 | context_length = get_context_length(args.model, args.revision) 103 | 104 | # Create a sanitized model name for the job name 105 | job_name = f"ioi-eval-{args.model.replace('/', '-')}" 106 | 107 | log_dir = logs_dir / job_name 108 | log_dir.mkdir(parents=True, exist_ok=True) 109 | 110 | n_nodes = ceil(tp / 8) 111 | tasks = n_nodes 112 | 113 | revision_arg = f"--revision {args.revision}" if args.revision else "" 114 | 115 | slurm_script = f"""#!/bin/bash 116 | #SBATCH --job-name={job_name} 117 | #SBATCH --partition={args.partition} 118 | #SBATCH --qos={args.qos} 119 | #SBATCH --nodes={n_nodes} 120 | #SBATCH --gpus-per-node=8 121 | #SBATCH --exclusive 122 | #SBATCH --output={log_dir}/%j-%x.out 123 | #SBATCH --error={log_dir}/%j-%x.out 124 | #SBATCH --time={args.time} 125 | #SBATCH --ntasks-per-node=1 126 | 127 | set -exuo pipefail 128 | 129 | SERVER_PORT=39877 130 | DIST_PORT=45000 131 | 132 | # random sleep (0-100) to prevent ddosing server 133 | sleep $((RANDOM % 100 + 1)) 134 | 135 | # Environment configuration 136 | export OUTLINES_CACHE_DIR=/scratch/serve_r1/ocache/ 137 | export TRITON_HOME=/scratch/serve_r1/triton/ 138 | export GLOO_SOCKET_IFNAME="enp71s0" 139 | export NCCL_SOCKET_IFNAME="enp71s0" 140 | 141 | # Evaluation script path 142 | EVAL_SCRIPT_PATH="/fsx/hynek_kydlicek/projects/ioi/generate/evaluate.py" 143 | 144 | module load cuda/12.4 145 | source ~/.bashrc 146 | 147 | # Activate uv 148 | source {args.uv_env or UV_ENV}/bin/activate 149 | 150 | FIRST_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1) 151 | FIRST_NODE_IP=$(srun --nodes=1 --ntasks=1 -w "$FIRST_NODE" hostname --ip-address) 152 | 153 | # Launch servers synchronously across all nodes 154 | srun --nodes={n_nodes} --ntasks={tasks} --ntasks-per-node=1 \\ 155 | bash -c "python -m sglang.launch_server \\ 156 | --model-path '{args.model}' \\ 157 | --tp {tp} \\ 158 | --dist-init-addr '$FIRST_NODE_IP:$DIST_PORT' \\ 159 | {revision_arg} \\ 160 | --nnodes 
{n_nodes} \\ 161 | --node-rank \\$SLURM_PROCID \\ 162 | --port '$SERVER_PORT' \\ 163 | --host 0.0.0.0 \\ 164 | --trust-remote-code \\ 165 | --max-running-requests {concurrency} \\ 166 | --context-length {context_length}" & 167 | 168 | # Wait for server with timeout 169 | TIMEOUT={args.startup_delay} # 1h, but model loading should take ~30min 170 | START_TIME=$(date +%s) 171 | echo "Waiting for SGLang server (http://$FIRST_NODE_IP:$SERVER_PORT)..." 172 | 173 | while true; do 174 | if curl -s -o /dev/null -w "%{{http_code}}" "http://$FIRST_NODE_IP:$SERVER_PORT/health" >/dev/null 2>&1; then 175 | echo "Server is ready at http://$FIRST_NODE_IP:$SERVER_PORT" 176 | break 177 | fi 178 | 179 | CURRENT_TIME=$(date +%s) 180 | if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then 181 | echo "Error: Server failed to start within $TIMEOUT seconds" 182 | exit 1 183 | fi 184 | 185 | echo "Still waiting... ($(($CURRENT_TIME - $START_TIME)) seconds elapsed)" 186 | sleep 60 187 | done 188 | 189 | echo "Checking available models..." 190 | curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/models" 191 | sleep 10 192 | 193 | echo "Executing sanity check..." 194 | curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/completions" \\ 195 | -H "Content-Type: application/json" \\ 196 | -d '{{ 197 | "model": "default", 198 | "prompt": "hi, how are you?", 199 | "max_tokens": 2048, 200 | "temperature": 0.6 201 | }}' 202 | 203 | python "$EVAL_SCRIPT_PATH" \\ 204 | --model_id "sglang/{args.model}" \\ 205 | {revision_arg} \\ 206 | --api_base "http://localhost:$SERVER_PORT/v1" \\ 207 | --concurrency {concurrency} \\ 208 | {args.eval_args} 209 | 210 | # Kill the server and exit 211 | pkill -f "python -m sglang.launch_server" 212 | exit 0 213 | """ 214 | 215 | return slurm_script, job_name 216 | 217 | def main(): 218 | args = parse_args() 219 | 220 | # Create output directory if it doesn't exist 221 | output_dir = Path(args.slurm_dir or SLURM_SCRIPT_DIR) 222 | output_dir.mkdir(parents=True, exist_ok=True) 223 | 224 | # Create logs directory if it doesn't exist 225 | logs_dir = Path(args.logs_dir or LOGS_DIR) 226 | logs_dir.mkdir(parents=True, exist_ok=True) 227 | 228 | # Generate the Slurm script 229 | slurm_script, job_name = create_slurm_script(args, logs_dir) 230 | 231 | # Create a timestamp for the filename 232 | from datetime import datetime 233 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 234 | 235 | # Save the script to a file 236 | script_path = output_dir / f"{job_name}_{timestamp}.slurm" 237 | with open(script_path, "w") as f: 238 | f.write(slurm_script) 239 | 240 | logger.info(f"Slurm script saved to: {script_path}") 241 | # Make the script executable 242 | os.chmod(script_path, 0o755) 243 | 244 | # Submit the job if not a dry run 245 | if not args.dry_run: 246 | try: 247 | result = subprocess.run( 248 | ["sbatch", str(script_path)], 249 | check=True, 250 | capture_output=True, 251 | text=True 252 | ) 253 | print(f"Job submitted: {result.stdout.strip()} find logs at {LOGS_DIR}/{job_name}") 254 | except subprocess.CalledProcessError as e: 255 | print(f"Error submitting job: {e}") 256 | print(f"Error output: {e.stderr}") 257 | else: 258 | print("Dry run - job not submitted") 259 | 260 | if __name__ == "__main__": 261 | main() -------------------------------------------------------------------------------- /run_tests/scoring.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from dataclasses import asdict, dataclass, field 3 | from typing import Union 4 | 5 | 
from piston_client import PistonClient 6 | from utils import batched, load_ioi_tests 7 | 8 | 9 | @dataclass 10 | class TestResult: 11 | """ 12 | Represents the result of a single test case execution. 13 | 14 | Attributes: 15 | test_name: Name of the test case 16 | score: Score achieved for this test (0.0 to 1.0) 17 | status: Status code of the test result (e.g., 'AC', 'WA', 'TLE') 18 | feedback: Detailed feedback message from the judge or an error message 19 | """ 20 | test_name: str 21 | score: float = 0.0 22 | status: str = 'SKIPPED' 23 | feedback: str = None 24 | 25 | @dataclass 26 | class SubtaskResult: 27 | """ 28 | Represents the result of a subtask containing multiple test cases. 29 | 30 | Attributes: 31 | problem: Problem identifier 32 | subtask: Subtask identifier 33 | points: Maximum points available for this subtask 34 | score_precision: Number of decimal places for score rounding 35 | test_results: List of individual test case results 36 | """ 37 | problem: str = None 38 | subtask: str = None 39 | 40 | points: float = 0.0 41 | score_precision: int = 2 42 | 43 | test_results: list[TestResult] = field(default_factory=list) 44 | 45 | @property 46 | def status(self): 47 | """ 48 | Determines the overall status of the subtask based on the worst status among test results. 49 | Status priorities are ordered from worst to best. 50 | 51 | Returns: 52 | str: The status with the highest priority (lowest value) 53 | """ 54 | status_prios = {'CE': -1, 'RE': 0, 'WA': 1, 'MLE': 2, 'TLE': 3, 'PA': 4, 'AC': 5, 'SKIPPED': 999} 55 | return min([x.status for x in self.test_results], key=lambda x: status_prios[x]) 56 | 57 | @property 58 | def score(self): 59 | """ 60 | Calculates the raw score for the subtask as the minimum score across all test results. 61 | 62 | Returns: 63 | float: The rounded minimum score 64 | """ 65 | return 0 if not self.test_results else round(min([test_result.score for test_result in self.test_results]), self.score_precision) 66 | 67 | @property 68 | def weighted_score(self): 69 | """ 70 | Calculates the weighted score by multiplying the raw score by the available points. 71 | 72 | Returns: 73 | float: The rounded weighted score 74 | """ 75 | return 0 if not self.test_results else round(min([test_result.score for test_result in self.test_results]) * self.points, self.score_precision) 76 | 77 | def to_dict(self): 78 | """ 79 | Converts the SubtaskResult to a dictionary representation. 80 | 81 | Returns: 82 | dict: Dictionary containing all subtask result data 83 | """ 84 | return { 85 | 'problem': self.problem, 86 | 'subtask': self.subtask, 87 | 'score': self.score, 88 | 'weighted_score': self.weighted_score, 89 | 'points': self.points, 90 | 'score_precision': self.score_precision, 91 | 'status': self.status, 92 | 'test_results': [asdict(test_result) for test_result in self.test_results] 93 | } 94 | 95 | def _extract_single_status(score: float, feedback: str) -> str: 96 | """ 97 | Determines the status code based on the score and feedback message. 
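    Example (illustrative only; the feedback strings mirror those emitted by the run script):
        _extract_single_status(0.0, "Time limit exceeded (2.0s)")   -> 'TLE'
        _extract_single_status(0.0, "Output isn't correct (diff)")  -> 'WA'
        _extract_single_status(0.5, "partially correct output")     -> 'PA'
        _extract_single_status(1.0, "Output is correct (diff)")     -> 'AC'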
98 | 99 | Args: 100 | score: The numeric score (0.0 to 1.0) 101 | feedback: The feedback message from the execution 102 | 103 | Returns: 104 | str: Status code ('CE', 'MLE', 'TLE', 'WA', 'RE', 'AC', or 'PA') 105 | """ 106 | if score == 0.0: 107 | if "Compilation error" in feedback: 108 | return 'CE' 109 | elif "Memory limit exceeded" in feedback: 110 | return 'MLE' 111 | elif "Time limit exceeded" in feedback: 112 | return 'TLE' 113 | elif "Output isn't correct" in feedback: 114 | return 'WA' 115 | else: 116 | return 'RE' 117 | elif score == 1.0: 118 | return 'AC' 119 | else: 120 | return 'PA' 121 | 122 | 123 | async def score_single_test_case(client: PistonClient, subtask: dict, test_name: str, test_input: str, test_output: str, submission: str) -> TestResult: 124 | """ 125 | Scores a single test case by running the submission against the provided input and output. 126 | 127 | Args: 128 | client: PistonClient instance for executing code 129 | subtask: Dictionary containing subtask configuration 130 | test_name: Name of the test case 131 | test_input: Input data for the test case 132 | test_output: Expected output for the test case 133 | submission: Source code of the submission 134 | 135 | Returns: 136 | TestResult: Result of the test case execution 137 | """ 138 | # Run submission for this test case 139 | score, feedback = await run_submission(client, subtask, test_input, submission, test_output) 140 | score = float(score) 141 | 142 | return TestResult(test_name=test_name, score=score, status=_extract_single_status(score, feedback), feedback=feedback) 143 | 144 | async def score_subtask(client: PistonClient, subtask: dict, submission: str, test_case_run_cache: Union[dict, None] = None, test_batch_size: int = 1) -> SubtaskResult: 145 | """ 146 | Scores all test cases in a subtask. 147 | 148 | Args: 149 | client: PistonClient instance for executing code 150 | subtask: Dictionary containing subtask configuration 151 | test_cases: Dictionary mapping test names to (input, output) tuples 152 | submission: Source code of the submission 153 | test_case_run_cache: Optional cache of previously run test cases 154 | test_batch_size: evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. 
155 | -1 to evaluate all test cases in parallel 156 | Returns: 157 | SubtaskResult: Result of the subtask evaluation 158 | """ 159 | subtask_result = SubtaskResult(problem=subtask['id'], subtask=subtask['subtask'], points=subtask['score'], score_precision=subtask['score_precision'], test_results=[]) 160 | 161 | # tests that are not cached 162 | tests_to_run = [ 163 | (ti, test_name) 164 | for ti, test_name in enumerate(subtask['test_names']) 165 | if test_case_run_cache is None or test_name not in test_case_run_cache 166 | ] 167 | 168 | # initialize test results with cached results or empty (SKIPPED) TestResult objects 169 | subtask_result.test_results = [ 170 | test_case_run_cache[test_name] if test_case_run_cache is not None and test_name in test_case_run_cache else 171 | TestResult(test_name=test_name) 172 | for test_name in subtask['test_names'] 173 | ] 174 | 175 | # we skip submissions where no code was extracted 176 | # no need to do anything, as we have a failed cached result 177 | if not submission or any(test_result.status != 'SKIPPED' and test_result.score == 0.0 for test_result in subtask_result.test_results): 178 | return subtask_result 179 | 180 | if "test_cases" in subtask: 181 | test_cases = subtask["test_cases"] 182 | if isinstance(subtask["test_cases"], list): 183 | test_cases = { 184 | test_name: test for test_name, test in zip(subtask["test_names"], subtask["test_cases"]) 185 | } 186 | else: 187 | test_cases = load_ioi_tests(subtask["year"], subtask["id"]) 188 | 189 | # run one batch, check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. 190 | for test_batch_to_run in batched(tests_to_run, test_batch_size): 191 | results = await asyncio.gather(*[ 192 | asyncio.create_task(score_single_test_case(client, subtask, test_name, test_cases[test_name][0], test_cases[test_name][1], submission)) 193 | for _, test_name in test_batch_to_run 194 | ]) 195 | for (ti, test_name), test_result in zip(test_batch_to_run, results): 196 | if test_case_run_cache is not None: 197 | test_case_run_cache[test_name] = test_result 198 | subtask_result.test_results[ti] = test_result 199 | 200 | # Stop early if it failed 201 | if any(test_result.score == 0.0 for test_result in results): 202 | break 203 | 204 | return subtask_result 205 | 206 | 207 | async def score_subtasks(client: PistonClient, subtasks: list[dict], submission: str, test_batch_size: int = 1) -> list[SubtaskResult]: 208 | """ 209 | Scores multiple subtasks for a submission. 210 | 211 | Args: 212 | client: PistonClient instance for executing code 213 | subtasks: List of dictionaries containing subtask configurations 214 | submission: Source code of the submission 215 | test_batch_size: evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. 216 | -1 to evaluate all test cases in parallel 217 | 218 | Returns: 219 | list[SubtaskResult]: Results for all subtasks 220 | """ 221 | # avoid rerunning tests present in multiple subtasks 222 | test_case_run_cache = {} 223 | 224 | return [ 225 | await score_subtask(client, subtask, submission, test_case_run_cache, test_batch_size) 226 | for subtask in subtasks 227 | ] 228 | 229 | async def run_submission(client: PistonClient, problem: dict, test_input: str, submission: str, test_output: str | None = None) -> tuple[str, str]: 230 | """ 231 | Executes a submission against a test case using the Piston execution environment. 
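    Example (illustrative sketch; the field values are hypothetical and only the keys
    actually read by this function are shown):
        problem = {
            "id": "example_problem",
            "time_limit": 1.0,            # seconds; +3s is added below as a hard Piston limit
            "memory_limit": 2147483648,   # forwarded unchanged as Piston's run_memory_limit
            "grader_files": [("graders/grader_config.json", '{"task_type": "Batch", ...}')],
        }
        score, feedback = await run_submission(client, problem, test_input="1 2\n", submission=source_code)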
232 | 233 | Args: 234 | client: PistonClient instance for executing code 235 | problem: Dictionary containing problem configuration 236 | test_input: Input data for the test case 237 | submission: Source code of the submission 238 | test_output: Optional expected output for the test case 239 | 240 | Returns: 241 | tuple[str, str]: A tuple containing (score, feedback) 242 | """ 243 | data = { 244 | "files": [ 245 | # the actual submission 246 | { 247 | "name": f"graders/{problem['id'].lower()}.cpp", 248 | "content": submission 249 | }, 250 | # pass the input 251 | { 252 | "name": "input.txt", 253 | "content": test_input 254 | }, 255 | # pass the expected output 256 | *([{ 257 | "name": "correct_output.txt", 258 | "content": test_output 259 | }] if test_output else []), 260 | # grader files 261 | *({ 262 | "name": name, 263 | "content": content 264 | } for name, content in problem['grader_files'] if content) 265 | ], 266 | 'run_timeout': round((problem['time_limit'] + 3) * 1000), # +3 seconds hard limit. time limits are handled by the ioi script 267 | 'run_memory_limit': problem['memory_limit'] 268 | } 269 | return await client.execute(data) 270 | -------------------------------------------------------------------------------- /run_tests/piston_client.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import random 4 | import re 5 | import subprocess 6 | from collections import Counter 7 | from functools import lru_cache 8 | 9 | import aiohttp 10 | 11 | 12 | class PistonError(Exception): 13 | pass 14 | 15 | @lru_cache(maxsize=1) 16 | def get_piston_client_from_env(session=None): 17 | piston_endpoints = os.getenv("PISTON_ENDPOINTS") 18 | if piston_endpoints is None: 19 | raise ValueError("For IOI problems Piston endpoints running our IOI package are required. Please add a list of valid Piston endpoints to a PISTON_ENDPOINTS varialbe in a `.env` file.") 20 | piston_endpoints = piston_endpoints.split(",") if piston_endpoints != "slurm" else get_slurm_piston_endpoints() 21 | random.shuffle(piston_endpoints) 22 | max_requests_per_endpoint = os.getenv("PISTON_MAX_REQUESTS_PER_ENDPOINT", "1") 23 | return PistonClient(piston_endpoints, session, max_requests_per_endpoint=int(max_requests_per_endpoint)) 24 | 25 | class PistonClient: 26 | """ 27 | A client that will automatically load balance across multiple Piston (https://github.com/engineer-man/piston) workers. 
28 | This assumes piston is running our custom cms_ioi package: https://github.com/guipenedo/piston/releases/ 29 | We recommend starting the instances with the following script as otherwise some IOI problems will hit default limits: 30 | ``` 31 | export PISTON_COMPILE_TIMEOUT=60000 32 | export PISTON_RUN_TIMEOUT=60000 33 | export PISTON_OUTPUT_MAX_SIZE=1000000000 34 | export PISTON_MAX_FILE_SIZE=1000000000 35 | export PISTON_DISABLE_NETWORKING=true 36 | export PISTON_REPO_URL=https://github.com/guipenedo/piston/releases/download/pkgs/index 37 | mkdir /piston 38 | 39 | sed -i '/app.use(body_parser.urlencoded/c\ app.use(body_parser.urlencoded({ extended: true, limit: \"512mb\" }));' src/index.js 40 | sed -i '/app.use(body_parser.json/c\ app.use(body_parser.json({ limit: \"512mb\" }));' src/index.js 41 | 42 | # Start server in background 43 | node src``` 44 | 45 | Piston docs for API usage: https://piston.readthedocs.io/en/latest/api-v2/ 46 | """ 47 | def __init__(self, base_endpoint: str | list[str] = "http://ip-10-53-80-65:3223/api/v2", session=None, max_requests_per_endpoint=1): 48 | self.max_requests_per_endpoint = max_requests_per_endpoint 49 | self.base_endpoints = [base_endpoint] if isinstance(base_endpoint, str) else base_endpoint 50 | if len(self.base_endpoints) == 0: 51 | raise ValueError("No Piston endpoints provided. Please check your PISTON_ENDPOINTS environment variable.") 52 | self.endpoint_ids = {endpoint: i for i, endpoint in enumerate(self.base_endpoints)} 53 | 54 | self._session = session 55 | self.endpoint_tokens = asyncio.Queue(maxsize=max_requests_per_endpoint * len(self.base_endpoints)) 56 | 57 | for _ in range(max_requests_per_endpoint): 58 | for base_endpoint in self.base_endpoints: 59 | self.endpoint_tokens.put_nowait(base_endpoint) 60 | self._endpoint_failures = Counter() 61 | self._unhealthy_endpoints = set() 62 | self._endpoint_failures_lock = asyncio.Lock() 63 | 64 | @property 65 | def session(self): 66 | if self._session is None: 67 | self._session = aiohttp.ClientSession( 68 | timeout=aiohttp.ClientTimeout(sock_read=10), 69 | connector=aiohttp.TCPConnector( 70 | limit=self.max_requests_per_endpoint * len(self.base_endpoints), 71 | ttl_dns_cache=300, 72 | keepalive_timeout=5 * 60 73 | ) 74 | ) 75 | return self._session 76 | 77 | async def _wait_for_endpoint(self): 78 | endpoint = await self.endpoint_tokens.get() 79 | return endpoint 80 | 81 | async def _release_endpoint(self, endpoint): 82 | await self.endpoint_tokens.put(endpoint) 83 | 84 | async def _send_request(self, endpoint, route, data=None, method="post"): 85 | async with self.session.request(method, f"{endpoint.rstrip('/')}/{route}", json=data, headers={"Content-Type": "application/json"}) as response: 86 | return await response.json(content_type=None) 87 | 88 | async def _send_to_all(self, route, data=None, method="post"): 89 | return await asyncio.gather(*[self._send_request(endpoint, route, data, method) for endpoint in self.base_endpoints]) 90 | 91 | async def _send_to_one(self, endpoint, route, data=None, method="post"): 92 | return await self._send_request(endpoint, route, data, method) 93 | 94 | async def install_package(self, language, version): 95 | return await self._send_to_all("packages", { 96 | "language": language, 97 | "version": version 98 | }, method="post") 99 | 100 | async def uninstall_package(self, language, version): 101 | return await self._send_to_all("packages", { 102 | "language": language, 103 | "version": version 104 | }, method="delete") 105 | 106 | async def 
get_supported_runtimes(self): 107 | return await self._send_to_all("runtimes", method="get") 108 | 109 | async def execute(self, data) -> tuple[str, str]: 110 | """ 111 | Requests to the IOI package return the score as a float in the stdout, as well as optional feedback/errors in stderr. 112 | Returns a tuple of (score, feedback). 113 | """ 114 | response = await self._send_execute(data) 115 | 116 | if 'message' in response: 117 | raise PistonError(response['message']) 118 | 119 | if 'compile' in response and response['compile']['code'] != 0: 120 | return "0", "Compilation error exit code " + str(response['compile']['code']) + "\n" + response['compile']['stderr'] 121 | 122 | if 'run' not in response: 123 | raise PistonError(response) 124 | 125 | if response['run']['code'] == 1 and "MemoryError" in response['run']['stderr']: 126 | return "0", "Memory limit exceeded" 127 | 128 | # successful result 129 | if response['run']['stdout']: 130 | return response['run']['stdout'], response['run']['stderr'] 131 | 132 | if response['run']['signal'] == 'SIGKILL': 133 | return "0", "Time limit exceeded" 134 | 135 | # other issues 136 | if response['run']['code'] != 0: 137 | raise PistonError(f"language={response['language']}, version={response['version']}, exit code={response['run']['code']}, stderr={response['run']['stderr']}, signal={response['run']['signal']}") 138 | return '0', 'Unknown error' 139 | 140 | async def _check_failed_endpoint(self, endpoint): 141 | async with self._endpoint_failures_lock: 142 | if endpoint in self._unhealthy_endpoints: 143 | return 144 | try: 145 | await asyncio.sleep(5) 146 | await self.get_supported_runtimes() 147 | except Exception as e: 148 | print(f"Error checking endpoint {endpoint}, dropping it ({e})") 149 | self._unhealthy_endpoints.add(endpoint) 150 | if len(self._unhealthy_endpoints) >= len(self.base_endpoints): 151 | raise PistonError("All endpoints are unhealthy. Please check your Piston workers.") 152 | 153 | async def _send_execute(self, data): 154 | data = data | { 155 | "language": "cms_ioi", 156 | "version": "*", 157 | } 158 | 159 | max_retries = 5 160 | base_delay = 1.0 161 | 162 | status = None 163 | endpoint = None 164 | 165 | for attempt in range(max_retries + 1): 166 | try: 167 | endpoint = await self._wait_for_endpoint() 168 | if attempt > 0: 169 | await asyncio.sleep(1) 170 | async with self.session.post(f"{endpoint.rstrip('/')}/execute", json=data, headers={"Content-Type": "application/json"}) as response: 171 | status = response.status 172 | res_json = await response.json(content_type=None) 173 | 174 | if status != 200: 175 | raise PistonError(f"Server error. status={status}") 176 | if res_json is None: 177 | raise PistonError(f"Empty response. 
status={status}") 178 | # piston overloaded 179 | if 'run' in res_json and "Resource temporarily unavailable" in res_json['run'].get('stderr', ''): 180 | raise PistonError(f"Piston overloaded: {res_json['run']['stderr']}") 181 | return res_json 182 | 183 | except (PistonError, asyncio.TimeoutError, aiohttp.ClientConnectionError, RuntimeError) as e: 184 | # Only retry if we haven't reached max retries yet 185 | if attempt < max_retries: 186 | # Calculate backoff with jitter 187 | delay = min(base_delay * (2 ** attempt), 10) # Exponential backoff, capped at 10 seconds 188 | jitter = delay * 0.2 * (2 * asyncio.get_event_loop().time() % 1 - 0.5) # Add ±10% jitter 189 | retry_delay = delay + jitter 190 | print(f"Retrying in {retry_delay:.2f} seconds [{self.endpoint_ids[endpoint]}] {endpoint}") 191 | 192 | # special case: worker died 193 | if isinstance(e, aiohttp.ClientConnectionError) and "Connect call failed" in str(e): 194 | await self._check_failed_endpoint(endpoint) 195 | else: 196 | # hopefully we won't get this one again 197 | await self._release_endpoint(endpoint) 198 | endpoint = None 199 | 200 | await asyncio.sleep(retry_delay) 201 | else: 202 | print(f"Giving up on retries. {e}") 203 | raise e 204 | except Exception as e: 205 | print(f"Propagating exception {type(e)}: {e}") 206 | raise e 207 | finally: 208 | # Ensure endpoint is always released, even if an exception occurs 209 | if endpoint is not None: 210 | try: 211 | await self._release_endpoint(endpoint) 212 | except Exception as e: 213 | print(f"Error releasing endpoint {endpoint}: {e}") 214 | endpoint = None 215 | 216 | 217 | def get_slurm_piston_endpoints(): 218 | """Get list of active piston worker endpoints from squeue output""" 219 | # Run squeue command to get job name, hostname and status, filtering for RUNNING state 220 | result = subprocess.run(['squeue', '--format="%j %N %T"', '--noheader', '--states=RUNNING'], capture_output=True, text=True) 221 | 222 | # Split output into lines and skip header 223 | lines = result.stdout.strip().split('\n') 224 | 225 | endpoints = [] 226 | for line in lines: 227 | # Parse job name from squeue output 228 | fields = line.split() 229 | job_name = fields[0].strip('"') # Remove quotes 230 | hostname = fields[1] 231 | 232 | # Extract port if job name matches pattern 233 | match = re.match(r'piston-worker-(\d+)', job_name) 234 | if match: 235 | port = match.group(1) 236 | endpoints.append(f"http://{hostname}:{port}/api/v2") 237 | 238 | return endpoints -------------------------------------------------------------------------------- /run_tests/tests_runner.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from fnmatch import fnmatch 3 | import json 4 | import os 5 | from typing import Set 6 | import asyncio 7 | import aiofiles 8 | import aiohttp 9 | from datasets import load_dataset, Dataset 10 | import pandas as pd 11 | from tqdm.asyncio import tqdm 12 | from selection_simulator import get_problem_scores, simulate_round_robin 13 | from utils import add_includes 14 | import uvloop 15 | from piston_client import get_piston_client_from_env 16 | from scoring import score_subtasks 17 | from dotenv import load_dotenv 18 | from loguru import logger 19 | from huggingface_hub import HfApi 20 | 21 | class TestsRunner: 22 | def __init__( 23 | self, 24 | datasets_to_evaluate: list[str] | str, 25 | results_dataset_name: str, 26 | local_results_cache: str = "results", 27 | max_concurrent_requests: int = 100, 28 | 
test_batch_size: int = 1, 29 | dry_run: bool = False, 30 | override: bool = False, 31 | timeout: int = 60 * 10, 32 | id_column: str = "uuid", 33 | add_messages_column: bool = False, 34 | add_includes: bool = True, 35 | always_extract_code: bool = False 36 | ): 37 | self.datasets_to_evaluate = datasets_to_evaluate if isinstance( 38 | datasets_to_evaluate, list) else [datasets_to_evaluate] 39 | self.datasets_to_evaluate_names = [dataset.split( 40 | '/')[-1].removeprefix('ioi-eval-') for dataset in self.datasets_to_evaluate] 41 | self.results_dataset_name = results_dataset_name 42 | self.local_results_cache = local_results_cache 43 | self.test_batch_size = test_batch_size 44 | self.dry_run = dry_run 45 | self.override = override 46 | self.timeout = timeout 47 | self.id_column = id_column 48 | self.add_messages_column = add_messages_column 49 | self.add_includes = add_includes 50 | self.max_concurrent_requests = max_concurrent_requests 51 | self.always_extract_code = always_extract_code 52 | os.makedirs(self.local_results_cache, exist_ok=True) 53 | 54 | if dry_run: 55 | logger.warning("Running in dry-run mode - no actual Piston calls will be made") 56 | 57 | # Lock for local file access 58 | self._file_lock = asyncio.Lock() 59 | 60 | async def run_tests_pipeline(self): 61 | # fetch completed submissions 62 | completed_ids, evaluated_submissions = self.fetch_completed_submissions() 63 | 64 | # fetch submissions to evaluate 65 | submissions_to_evaluate = self.fetch_submissions_to_evaluate(completed_ids) 66 | 67 | # load problem data 68 | problem_subtasks = self.load_problem_data(set(submissions_to_evaluate.keys())) 69 | 70 | # evaluate submissions 71 | new_evaluated_submissions = await self.evaluate_submissions(problem_subtasks, submissions_to_evaluate) 72 | 73 | # merge 74 | for key in set(evaluated_submissions.keys()).union(new_evaluated_submissions.keys()): 75 | evaluated_submissions[key].extend(new_evaluated_submissions[key]) 76 | 77 | # save all results 78 | self.save_to_hub(evaluated_submissions) 79 | 80 | # generate reports with the results for each dataset 81 | self.publish_reports(evaluated_submissions) 82 | 83 | def fetch_submissions_to_evaluate(self, completed_ids: dict[str, set]) -> dict[tuple[int, str], list[dict]]: 84 | submissions_to_evaluate = defaultdict(list) 85 | for dataset, dsname in zip(self.datasets_to_evaluate, self.datasets_to_evaluate_names): 86 | ds = load_dataset(dataset, split="train") 87 | subs_to_eval = 0 88 | for submission in ds: 89 | if self.id_column not in submission: 90 | logger.error(f"Submission does not have an \"{self.id_column}\" column. 
Please set --id_column to the correct column name.") 91 | exit(1) 92 | if "year" not in submission: 93 | submission['year'] = 2024 # we assume it's IOI'2024 94 | id_key = (dsname, str(submission['year']), submission['problem_id'], submission[self.id_column]) 95 | if not id_key in completed_ids[dsname]: 96 | submission['dataset'] = dsname 97 | 98 | completed_ids[dsname].add(id_key) 99 | 100 | # source code parsing 101 | if 'code' not in submission or not submission['code'] or self.always_extract_code: 102 | # try extracting code from generation if it exists 103 | if 'generation' not in submission or "```cpp\n" not in submission['generation']: 104 | submission['code'] = None 105 | else: 106 | submission['code'] = submission['generation'].split("```cpp\n")[-1].split("```")[0] 107 | if submission['code'] and self.add_includes: 108 | submission['code'] = add_includes(submission['code'], submission['problem_id']) 109 | 110 | submissions_to_evaluate[(str(submission['year']), submission['problem_id'])].append(submission) 111 | subs_to_eval += 1 112 | logger.info(f"Found {subs_to_eval} submissions to evaluate for {dsname}") 113 | logger.info(f"Found {sum(len(v) for v in submissions_to_evaluate.values())} total submissions to evaluate") 114 | return submissions_to_evaluate 115 | 116 | def fetch_completed_submissions(self) -> tuple[dict[str, set], dict[str, list]]: 117 | completed_submissions = defaultdict(list) 118 | unique_ids = defaultdict(set) 119 | 120 | if self.override: 121 | logger.warning("Override flag active. Will not fetch completed submissions from local cache or hub. Will overwrite existing local results and on the hub.") 122 | for dsname in self.datasets_to_evaluate_names: 123 | if os.path.exists(f"{self.local_results_cache}/{dsname}.jsonl"): 124 | os.rename(f"{self.local_results_cache}/{dsname}.jsonl", f"{self.local_results_cache}/{dsname}.jsonl.bak") 125 | logger.info(f"Renamed {self.local_results_cache}/{dsname}.jsonl to {self.local_results_cache}/{dsname}.jsonl.bak") 126 | return unique_ids, completed_submissions 127 | 128 | logger.info(f"Fetching completed submissions from {self.local_results_cache} and {self.results_dataset_name}") 129 | for dsname in self.datasets_to_evaluate_names: 130 | local_results_path = f"{self.local_results_cache}/{dsname}.jsonl" 131 | # local results 132 | if os.path.exists(local_results_path): 133 | with open(local_results_path, 'r') as f: 134 | for line in f: 135 | line_data = json.loads(line) 136 | id_key = (dsname, str(line_data['year']), line_data['problem_id'], line_data[self.id_column]) 137 | if not id_key in unique_ids[dsname]: 138 | line_data['dataset'] = dsname 139 | completed_submissions[dsname].append(line_data) 140 | unique_ids[dsname].add(id_key) 141 | try: 142 | # hub results 143 | pushed_results = load_dataset( 144 | self.results_dataset_name, split="train", name=dsname) 145 | if pushed_results: 146 | for submission in pushed_results: 147 | id_key = (dsname, str(submission['year']), submission['problem_id'], submission[self.id_column]) 148 | if not id_key in unique_ids[dsname]: 149 | submission['dataset'] = dsname 150 | completed_submissions[dsname].append(submission) 151 | unique_ids[dsname].add(id_key) 152 | except Exception: 153 | pass 154 | logger.info(f"Found {len(completed_submissions[dsname])} completed submissions for {dsname}") 155 | 156 | return unique_ids, completed_submissions 157 | 158 | def load_problem_data(self, problems_to_fetch: set[tuple[int, str]]) -> dict[tuple[int, str], list[dict]]: 159 | problems = 
load_dataset("open-r1/ioi", split="train+test") 160 | problem_subtasks = defaultdict(list) 161 | 162 | for problem in problems: 163 | if (str(problem['year']), problem['id']) in problems_to_fetch: 164 | problem_subtasks[(str(problem['year']), problem['id'])].append(problem) 165 | 166 | return problem_subtasks 167 | 168 | async def evaluate_submissions(self, problem_subtasks: dict[tuple[int, str], list[dict]], submissions_to_evaluate: list[dict]) -> list[dict]: 169 | async with aiohttp.ClientSession( 170 | timeout=aiohttp.ClientTimeout(sock_read=30), 171 | connector=aiohttp.TCPConnector( 172 | limit=self.max_concurrent_requests, ttl_dns_cache=300, keepalive_timeout=self.timeout) 173 | ) as session: 174 | client = get_piston_client_from_env(session) if not self.dry_run else None 175 | active_tasks: Set[asyncio.Task] = set() 176 | 177 | new_results = defaultdict(list) 178 | 179 | with tqdm( 180 | total=sum(len(codes_to_eval) for codes_to_eval in submissions_to_evaluate.values()), 181 | desc="Evaluating submissions", 182 | unit="row", 183 | mininterval=2, 184 | smoothing=0.0001, 185 | ) as pbar: 186 | 187 | async def score_submission_on_all_subtasks(subtasks, submission): 188 | """Score a single submission on all subtasks""" 189 | try: 190 | all_subtask_results = await score_subtasks(client, subtasks, submission['code'] if not self.dry_run else None, test_batch_size=self.test_batch_size) 191 | async with self._file_lock: 192 | async with aiofiles.open(f'{self.local_results_cache}/{submission["dataset"]}.jsonl', mode="a") as f: 193 | target_subtask = submission.pop('subtask', None) 194 | target_subtask_results = [subtask_results for subtask_results in all_subtask_results if target_subtask and subtask_results.subtask == target_subtask] 195 | 196 | full_result_data = { 197 | **submission, 198 | "target_subtask": target_subtask, 199 | "code_compiles": bool(submission["code"]) and all(subtask_results.status != "CE" for subtask_results in all_subtask_results), 200 | "target_subtask_score": target_subtask_results[0].score if target_subtask_results else None, 201 | "target_subtask_status": target_subtask_results[0].status if target_subtask_results else None, 202 | "all_subtasks_points": sum([subtask_results.weighted_score for subtask_results in all_subtask_results]), 203 | "all_subtasks_results": [subtask_result.to_dict() for subtask_result in all_subtask_results], 204 | } 205 | await f.write(json.dumps(full_result_data) + "\n") 206 | await f.flush() 207 | return full_result_data 208 | except Exception as e: 209 | print(f"Error scoring submission: {e}") 210 | finally: 211 | pbar.set_postfix(active=len(pbar.active_tasks), refresh=False) 212 | pbar.update(1) 213 | 214 | pbar.active_tasks = active_tasks 215 | 216 | for (year, problem_name), subtasks in problem_subtasks.items(): 217 | codes_to_eval = submissions_to_evaluate[(year, problem_name)] 218 | print(f"Scoring {len(codes_to_eval)} submissions on {len(subtasks)} subtasks of {problem_name} ({len(set([test_name for subtask in subtasks for test_name in subtask['test_names']]))} test cases)") 219 | 220 | for submission in codes_to_eval: 221 | while len(active_tasks) >= self.max_concurrent_requests: 222 | done, active_tasks = await asyncio.wait( 223 | active_tasks, return_when=asyncio.FIRST_COMPLETED 224 | ) 225 | for task in done: 226 | try: 227 | result = await task 228 | if result: 229 | new_results[result['dataset']].append(result) 230 | except Exception as e: 231 | print(f"Task failed: {e}") 232 | 233 | task = 
asyncio.create_task(score_submission_on_all_subtasks(subtasks, submission)) 234 | active_tasks.add(task) 235 | task.add_done_callback(active_tasks.discard) 236 | pbar.set_postfix(active=len(active_tasks), refresh=True) 237 | 238 | if active_tasks: 239 | for new_result in (await asyncio.gather(*active_tasks, return_exceptions=True)): 240 | if isinstance(new_result, Exception): 241 | logger.error(f"Error scoring submission: {new_result}") 242 | else: 243 | if new_result: 244 | new_results[new_result['dataset']].append(new_result) 245 | 246 | return new_results 247 | 248 | def save_to_hub(self, evaluated_submissions: list[dict]): 249 | 250 | def add_messages_column(sample): 251 | messages = [ 252 | {"role": "user", "content": sample["prompt"]}, 253 | {"role": "assistant", "content": sample["generation"].strip()}, 254 | ] 255 | return {"messages": messages} 256 | 257 | 258 | for key, submissions in evaluated_submissions.items(): 259 | if not submissions: 260 | logger.warning(f"No submissions to push for {key}") 261 | continue 262 | dataset = Dataset.from_list(submissions) 263 | if self.add_messages_column: 264 | dataset = dataset.map(add_messages_column) 265 | dataset = dataset.remove_columns("dataset") 266 | dataset.push_to_hub(self.results_dataset_name, split="train", config_name=key, private=False) 267 | logger.info(f"Pushed {len(submissions)} submissions to {self.results_dataset_name}[{key}]") 268 | 269 | def publish_reports(self, evaluated_submissions: list[dict]): 270 | api = HfApi() 271 | for dataset, submissions in evaluated_submissions.items(): 272 | if not submissions: 273 | continue 274 | 275 | submissions_per_problem = defaultdict(list) 276 | for submission in submissions: 277 | submissions_per_problem[(submission['year'], submission['problem_id'])].append(submission) 278 | 279 | year_overview = defaultdict(list) 280 | for year, problem in sorted(submissions_per_problem.keys(), key=lambda x: (-x[0], x[1])): 281 | submissions = submissions_per_problem[(year, problem)] 282 | 283 | table_data = [ 284 | { 285 | "Submission": submission[self.id_column], 286 | "Target subtask": submission.get('target_subtask', '-'), 287 | "Total": submission["all_subtasks_points"], 288 | **{ 289 | subtask['subtask']: f"{subtask['weighted_score']}/{subtask['points']} ({subtask['status']})" 290 | for subtask in submission["all_subtasks_results"] 291 | } 292 | } 293 | for submission in submissions 294 | ] 295 | table_data.sort(key=lambda x: x["Total"], reverse=True) 296 | df = pd.DataFrame(table_data) 297 | 298 | all_submissions_score = get_problem_scores(submissions) 299 | limit_50_score = get_problem_scores(simulate_round_robin(submissions)) 300 | problem_overview = { 301 | "year": year, 302 | "problem": problem, 303 | "day": submissions[0].get('day', '-'), 304 | "number_submissions": len(submissions), 305 | "number_submissions_compiling": sum(1 for submission in submissions if submission['code_compiles']), 306 | "best_submission_score": max(submission['all_subtasks_points'] for submission in submissions), 307 | "all_submissions_score": all_submissions_score, 308 | "limit_50_score": limit_50_score, 309 | } 310 | year_overview[year].append(problem_overview) 311 | 312 | # individual problem report 313 | markdown_content = f"""# {year}: {problem} 314 | ## Overview 315 | - Number of submissions: **{problem_overview['number_submissions']}** 316 | - Submissions compiling: **{problem_overview['number_submissions_compiling']}** 317 | - Best individual submission: 
**{problem_overview['best_submission_score']}/100** 318 | 319 | - Score on this problem (no submission limit): **{problem_overview['all_submissions_score']:.2f}/100** 320 | - Score on this problem (limited to 50 submissions, round robin selection): **{limit_50_score:.2f}/100** 321 | 322 | ## Submissions 323 | {df.to_markdown(index=False)} 324 | """ 325 | api.upload_file( 326 | path_or_fileobj=markdown_content.encode(), 327 | path_in_repo=f"reports/{dataset}/{year}_{problem}.md", 328 | repo_id=self.results_dataset_name, 329 | repo_type="dataset" 330 | ) 331 | 332 | # collect stuff for the global overview. grouped per year 333 | global_overview_markdown = f"""# Global Overview 334 | - Number of submissions: **{sum(overview['number_submissions'] for overviews in year_overview.values() for overview in overviews)}** 335 | - Submissions compiling: **{sum(overview['number_submissions_compiling'] for overviews in year_overview.values() for overview in overviews)}** 336 | 337 | """ + "\n\n".join([f"""# {year} 338 | 339 | - Score (no submission limit): **{sum(problem_overview['all_submissions_score'] for problem_overview in year_overview[year] if problem_overview['day'] != "practice")}/600** 340 | - Score (limited to 50 submissions, round robin selection): **{sum(problem_overview['limit_50_score'] for problem_overview in year_overview[year] if problem_overview['day'] != "practice")}/600** 341 | 342 | """ + pd.DataFrame([ 343 | { 344 | "Day": problem_overview['day'], 345 | "Problem": problem_overview['problem'], 346 | "#submissions": problem_overview['number_submissions'], 347 | "#compiling": problem_overview['number_submissions_compiling'], 348 | "Best individual": f"{problem_overview['best_submission_score']}/100", 349 | "Score (50 limit)": f"{problem_overview['limit_50_score']}/100", 350 | "Score (no limit)": f"{problem_overview['all_submissions_score']}/100", 351 | "Full report": f"[link](https://huggingface.co/datasets/{self.results_dataset_name}/blob/main/reports/{dataset}/{year}_{problem_overview['problem']}.md)" 352 | } 353 | for problem_overview in problem_overviews 354 | ]).to_markdown(index=False) for year, problem_overviews in year_overview.items()]) 355 | api.upload_file( 356 | path_or_fileobj=global_overview_markdown.encode(), 357 | path_in_repo=f"reports/{dataset}/README.md", 358 | repo_id=self.results_dataset_name, 359 | repo_type="dataset" 360 | ) 361 | 362 | logger.info(f"Uploaded reports to https://huggingface.co/datasets/{self.results_dataset_name}/tree/main/reports/") 363 | 364 | def parse_datasets_to_evaluate(datasets_to_evaluate_str: str) -> list[str]: 365 | api = HfApi() 366 | org_datasets = {} 367 | 368 | datasets_to_evaluate = datasets_to_evaluate_str.split(",") 369 | parsed_datasets_to_evaluate = [] 370 | for dataset in datasets_to_evaluate: 371 | org, dataset_name = dataset.split("/") 372 | if "*" in dataset_name: 373 | if org not in org_datasets: 374 | org_datasets[org] = [dataset_entry.id.removeprefix(f"{org}/") for dataset_entry in api.list_datasets(author=org)] 375 | for candidate_dataset_name in org_datasets[org]: 376 | if fnmatch(candidate_dataset_name, dataset_name): 377 | parsed_datasets_to_evaluate.append(f"{org}/{candidate_dataset_name}") 378 | else: 379 | parsed_datasets_to_evaluate.append(dataset) 380 | logger.info(f"Parsed {len(parsed_datasets_to_evaluate)} datasets to evaluate: {','.join(parsed_datasets_to_evaluate)}") 381 | return parsed_datasets_to_evaluate 382 | 383 | if __name__ == "__main__": 384 | import argparse 385 | load_dotenv() 386 | parser = 
argparse.ArgumentParser() 387 | parser.add_argument("datasets_to_evaluate", type=str, help="comma separated list of datasets to evaluate. accepts wildcards on the org portion, e.g. ioi-leaderboard/ioi-eval.*-prompt-mem-limit") 388 | parser.add_argument("results_dataset_name", type=str, help="where to push the final results open-r1/ioi-test-results") 389 | parser.add_argument("--local_results_path", type=str, default="results") 390 | parser.add_argument("--id_column", type=str, default="uuid", help="column name to use as the unique identifier per problem for each submission") 391 | parser.add_argument("--max_concurrent_requests", type=int, default=10, help="maximum number of concurrent requests to be sent to piston") 392 | parser.add_argument("--test_batch_size", type=int, default=1, help="evaluate these many test cases in parallel, then check if any of them failed (0 score): if so, stop evaluating; otherwise continue with the next batch of test cases") 393 | parser.add_argument("--dry_run", action="store_true", help="do not actually send any requests to piston") 394 | parser.add_argument("--override", action="store_true", help="do not fetch completed submissions from local cache or hub. Will overwrite existing results on the hub") 395 | parser.add_argument('--timeout', type=int, default=60 * 10, help="timeout for the piston client requests keep alive") 396 | parser.add_argument('--add_includes', action="store_true", help="try to fix missing includes in the code") 397 | parser.add_argument('--add_messages_column', action="store_true", help="add a messages column to the results, for SFT") 398 | parser.add_argument('--always_extract_code', action="store_true", help="always extract code from generation, even if it already exists in the code column") 399 | args = parser.parse_args() 400 | 401 | runner = TestsRunner( 402 | datasets_to_evaluate=parse_datasets_to_evaluate(args.datasets_to_evaluate), 403 | results_dataset_name=args.results_dataset_name, 404 | local_results_cache=args.local_results_path, 405 | max_concurrent_requests=args.max_concurrent_requests, 406 | test_batch_size=args.test_batch_size, 407 | dry_run=args.dry_run, 408 | override=args.override, 409 | timeout=args.timeout, 410 | id_column=args.id_column, 411 | add_messages_column=args.add_messages_column, 412 | add_includes=args.add_includes, 413 | always_extract_code=args.always_extract_code 414 | ) 415 | 416 | uvloop.install() 417 | asyncio.run(runner.run_tests_pipeline()) 418 | -------------------------------------------------------------------------------- /generate/evaluate.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from collections import defaultdict 3 | import json 4 | from pathlib import Path 5 | from typing import Dict, List, Optional 6 | import random 7 | from datetime import datetime 8 | import uuid 9 | from datasets import Dataset, load_dataset 10 | from loguru import logger 11 | from tqdm.asyncio import tqdm 12 | import litellm 13 | from dotenv import load_dotenv 14 | import polars as pl 15 | import aiofiles 16 | from litellm.utils import ModelResponse 17 | 18 | class IOIEvaluator: 19 | def __init__(self, org_id: str, model_id: str, api_base: Optional[str] = None, subset: Optional[str] = None, 20 | num_generations: int = 50, num_retries: int = 10, 21 | concurrency: int = 10, num_problems: Optional[int] = None, 22 | last_subtask: bool = False, dry_run: bool = False, 23 | override: bool = False, model_postfix: Optional[str] = None, 24 | revision: 
Optional[str] = None, timeout: Optional[int] = 600, 25 | use_requests: bool = False, max_tokens: Optional[int] = None): 26 | self.org_id = org_id 27 | self.model_id = model_id 28 | self.api_base = api_base 29 | self.subset = subset 30 | self.num_generations = num_generations 31 | self.num_retries = num_retries 32 | self.concurrency = concurrency 33 | self.num_problems = num_problems 34 | self.last_subtask = last_subtask 35 | self.dry_run = dry_run 36 | self.override = override 37 | self.revision = revision 38 | # Create organization and model directories 39 | self.timeout = timeout 40 | self.use_litellm = not use_requests 41 | self.max_tokens = max_tokens 42 | 43 | # Tracking totals 44 | self.total_prompt_tokens = 0 45 | self.total_completion_tokens = 0 46 | self.total_cost = 0.0 47 | self.model_postfix = model_postfix 48 | 49 | # Semaphore for controlling concurrency 50 | self._semaphore = asyncio.Semaphore(concurrency) 51 | 52 | # HTTP session for direct API calls when not using litellm 53 | self._session = None 54 | 55 | if self.api_base: 56 | logger.info(f"Using API base: {self.api_base}") 57 | 58 | if not self.use_litellm: 59 | logger.info("Using direct asyncio requests instead of LiteLLM") 60 | 61 | if dry_run: 62 | logger.warning("Running in dry-run mode - no actual LLM calls will be made") 63 | 64 | # Create results directory 65 | self.model_dir = Path("results") / self.get_model_name() 66 | self.model_dir.mkdir(parents=True, exist_ok=True) 67 | 68 | # File path for the single JSONL file 69 | self.results_file = self.model_dir / "results.jsonl" 70 | 71 | # Lock for file access 72 | self._file_lock = asyncio.Lock() 73 | 74 | async def save_result_locally(self, result: Dict, year: int, problem_id: str, subtask: str, solution_number: int): 75 | """Save a single result to local JSONL storage with locking.""" 76 | # Ensure problem_id is included in the result 77 | result['year'] = year 78 | result['problem_id'] = problem_id 79 | result['subtask'] = subtask 80 | result['solution_number'] = solution_number 81 | 82 | try: 83 | # Use lock to prevent concurrent writes 84 | async with self._file_lock: 85 | async with aiofiles.open(self.results_file, 'a') as f: 86 | await f.write(json.dumps(result) + '\n') 87 | except Exception as e: 88 | logger.error(f"Failed to save result locally: {str(e)}") 89 | 90 | async def load_previous_results(self) -> Optional[pl.DataFrame]: 91 | """Load previous results from both HuggingFace Hub and local JSONL storage.""" 92 | if self.override: 93 | logger.info("Override mode enabled - not loading previous results") 94 | return None 95 | 96 | results_dfs = [] 97 | 98 | # Try loading from Hub 99 | repo_name = f"{self.org_id}/{self.get_model_name()}" 100 | try: 101 | logger.info(f"Attempting to load previous results from HuggingFace Hub: {repo_name}") 102 | dataset = load_dataset(repo_name, split="train") 103 | if dataset is not None: 104 | # Convert to pandas then to polars 105 | df = dataset.to_polars() 106 | 107 | # Add a column indicating if the result is local 108 | df = df.with_columns([ 109 | pl.lit(False).alias('is_local') 110 | ]) 111 | results_dfs.append(df) 112 | 113 | logger.info(f"Loaded {len(df)} previous results from HuggingFace Hub") 114 | except Exception as e: 115 | logger.info(f"Could not load from HuggingFace Hub: {str(e)}") 116 | 117 | # Try loading from local storage 118 | try: 119 | if self.results_file.exists(): 120 | results = [] 121 | async with self._file_lock: 122 | async with aiofiles.open(self.results_file, 'r') as f: 123 | async for 
line in f:
124 |                     try:
125 |                         result = json.loads(line.strip())
126 |                         results.append(result)
127 |                     except Exception as e:
128 |                         logger.error(f"Failed to parse JSONL line: {str(e)}")
129 | 
130 |             if results:
131 |                 local_df = pl.DataFrame(results).with_columns([
132 |                     pl.lit(True).alias('is_local')
133 |                 ])
134 |                 results_dfs.append(local_df)
135 |                 logger.info(f"Loaded {len(local_df)} previous results from local storage")
136 |         except Exception as e:
137 |             logger.error(f"Failed to load from local storage: {str(e)}")
138 | 
139 |         # Combine results if we have any
140 |         if results_dfs:
141 |             # Keep only the columns shared by hub and local results (common_columns below)
142 |             common_columns = ['generation', 'code', 'language', 'model_kwargs', 'metadata', 'uuid', 'year', 'problem_id', 'subtask', 'solution_number', 'is_local']
143 | 
144 |             # Add missing 'year' column with None values if needed
145 |             results_dfs = [df if 'year' in df.columns else df.with_columns(pl.lit(None).alias('year')) for df in results_dfs]
146 | 
147 |             # Drop columns that are not in common_columns
148 |             results_dfs = [df.select(common_columns) for df in results_dfs]
149 | 
150 |             # Ensure every metadata dict has a 'stop_reason' key (older results may lack it);
151 |             # the dict union keeps existing values and only fills in the default
152 |             results_dfs = [df.with_columns(pl.when(pl.col('metadata').is_not_null()).then(pl.col('metadata').map_elements(lambda x: {"stop_reason": "unknown"} | x)).otherwise(pl.col('metadata')).alias('metadata')) for df in results_dfs]
153 | 
154 |             # Concatenate the aligned dataframes
155 |             combined_df = pl.concat(results_dfs, how="vertical")
156 | 
157 |             # First sort by whether code exists (True first), then by source (local first)
158 |             # This ensures we keep entries with code when deduplicating
159 |             deduplicated_df = (
160 |                 combined_df
161 |                 .with_columns([
162 |                     # Add a column indicating if code exists and is non-empty
163 |                     pl.when((pl.col('code').is_not_null()) & (pl.col('code') != ""))
164 |                     .then(1)
165 |                     .otherwise(0)
166 |                     .alias('has_code'),
167 |                 ])
168 |                 # Sort by has_code (descending) and is_local (descending)
169 |                 .sort(['has_code', 'is_local'], descending=[True, True])
170 |                 # Keep first occurrence after sorting (prioritizing entries with code and local source)
171 |                 .unique(
172 |                     subset=["year", "problem_id", "subtask", "solution_number"],
173 |                     keep='first'
174 |                 )
175 |                 # Drop the temporary columns
176 |                 .drop(['has_code', 'is_local'])
177 |             )
178 | 
179 |             logger.info(f"Combined and deduplicated results: {len(deduplicated_df)} entries")
180 |             return deduplicated_df
181 | 
182 |         return None
183 | 
184 |     def get_dummy_response(self, prompt: str, seed: int) -> Dict:
185 |         """Generate a dummy response for dry runs."""
186 |         dummy_code = """```cpp
187 | int main() {
188 |     // This is a dummy solution
189 |     return 0;
190 | }
191 | ```"""
192 |         return {
193 |             "generation": f"This is a dummy response for testing purposes.\n{dummy_code}",
194 |             "code": "int main() {\n    // This is a dummy solution\n    return 0;\n}",
195 |             "language": "cpp",
196 |             "model_kwargs": {
197 |                 "seed": seed,
198 |             },
199 |             "metadata": {
200 |                 "usage": {
201 |                     'completion_tokens': 10,
202 |                     'prompt_tokens': len(prompt.split()),
203 |                     'total_tokens': len(prompt.split()) + 10,
204 |                     'cost': 0.0
205 |                 },
206 |                 "timestamp": datetime.now().isoformat(),
207 |                 "stop_reason": "length"  # Add stop reason for dummy response
208 |             }
209 |         }
210 | 
211 |     def extract_code(self, text: str) -> tuple[str, str]:
212 |         """Extract code from the response between ```cpp and ``` markers."""
213 |         try:
214 |             parts = text.split("```cpp\n")
215 |             if len(parts) > 1:
216 |                 code_block = parts[-1].split("```")[0]
217 |                 code = code_block.strip()
218 |                 if not code:
219 |                     logger.warning("Empty code block found")
220 |                     return "", "cpp"
221 |                 return code, "cpp"
222 |             logger.warning("No code block found in the response")
223 |             return "", "unknown"
224 |         except Exception as e:
225 |             logger.error(f"Failed to extract code: {str(e)}")
226 |             return "", "unknown"
227 | 
228 |     async def generate_completion(self, prompt: str, seed: int) -> Dict:
229 |         """Generate completion using direct asyncio HTTP requests."""
230 |         retry_budget = self.num_retries
231 | 
232 |         while retry_budget > 0:
233 |             try:
234 |                 await asyncio.sleep(random.uniform(0.0, 0.1))
235 |                 async with self._session.post(
236 |                     f"{self.api_base}/v1/chat/completions",
237 |                     json={
238 |                         "model": "default",
239 |                         "messages": [{"role": "user", "content": prompt}],
240 |                         "seed": seed,
241 |                         "temperature": 0.7,
242 |                         "top_p": 0.8,
243 |                         "max_tokens": self.max_tokens,
244 |                     },
245 |                     headers={"Authorization": "Bearer EMPTY"},
246 |                 ) as response:
247 |                     result = await response.json(content_type=None)
248 | 
249 |                     if result is None:
250 |                         logger.error("Received None response from API")
251 |                         retry_budget -= 1
252 |                         await asyncio.sleep(5)
253 |                         continue
254 | 
255 |                     # Extract response content
256 |                     message_content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
257 | 
258 |                     # Extract token usage
259 |                     usage = result.get("usage", {})
260 |                     completion_tokens = usage.get("completion_tokens", 0)
261 |                     prompt_tokens = usage.get("prompt_tokens", 0)
262 |                     total_tokens = usage.get("total_tokens", 0)
263 | 
264 |                     # Update totals
265 |                     self.total_prompt_tokens += prompt_tokens
266 |                     self.total_completion_tokens += completion_tokens
267 | 
268 |                     # Extract code
269 |                     code, language = self.extract_code(message_content)
270 | 
271 |                     response_dict = {
272 |                         "generation": message_content,
273 |                         "code": code,
274 |                         "language": language,
275 |                         "model_kwargs": {
276 |                             "seed": seed,
277 |                         },
278 |                         "metadata": {
279 |                             "usage": {
280 |                                 'completion_tokens': completion_tokens,
281 |                                 'prompt_tokens': prompt_tokens,
282 |                                 'total_tokens': total_tokens,
283 |                             },
284 |                             "timestamp": datetime.now().isoformat(),
285 |                             "stop_reason": result.get("choices", [{}])[0].get("finish_reason", "unknown")
286 |                         }
287 |                     }
288 | 
289 | 
290 |                     return response_dict
291 | 
292 |             except Exception as e:
293 |                 logger.exception(f"API error (will retry): {e}")
294 |                 retry_budget -= 1
295 |                 await asyncio.sleep(10)
296 | 
297 |         raise Exception("All retries failed for direct API call")
298 | 
299 | 
300 |     async def call_llm(self, prompt: str, seed: int) -> Dict:
301 |         """Call the LLM using LiteLLM's built-in retry mechanism or direct asyncio requests."""
302 |         if self.dry_run:
303 |             result = self.get_dummy_response(prompt, seed)
304 |             return result
305 | 
306 |         if not self.use_litellm:
307 |             return await self.generate_completion(prompt, seed)
308 | 
309 |         return await self.call_litellm(prompt, seed)
310 | 
311 |     async def call_litellm(self, prompt: str, seed: int) -> Dict:
312 |         model_name = self.model_id
313 |         kwargs = {}
314 |         if self.model_id.startswith("sglang/"):
315 |             model_name = model_name.replace("sglang/", "custom_openai/")
316 |             kwargs["api_base"] = self.api_base
317 |             kwargs["api_key"] = "sk-proj-1234567890"
318 | 
319 |         if self.max_tokens is not None:
320 |             kwargs["max_tokens"] = self.max_tokens
321 | 
322 |         response: ModelResponse = await litellm.acompletion(
323 |             model=model_name,
324 |             messages=[{"role": "user", "content": prompt, "cache_control": {"type": "ephemeral"}}],
325 |             seed=seed,
326 |             num_retries=self.num_retries,
327 |             top_p=0.8,
328 |             temperature=0.7,
329 |             timeout=self.timeout,
330 |             **kwargs
331 |         )
332 | 
333 |         # Extract stop reason
334 |         stop_reason = response.choices[0].finish_reason
335 | 
336 |         # Extract usage information safely
337 |         usage = {}
338 |         cost = 0.0
339 |         if hasattr(response, 'usage'):
340 |             try:
341 |                 completion_tokens = getattr(response.usage, 'completion_tokens', 0)
342 |                 prompt_tokens = getattr(response.usage, 'prompt_tokens', 0)
343 |                 total_tokens = getattr(response.usage, 'total_tokens', 0)
344 | 
345 |                 # Calculate cost using litellm
346 |                 try:
347 |                     cost = litellm.completion_cost(completion_response=response)
348 |                 except Exception as e:
349 |                     logger.warning(f"Failed to calculate cost: {str(e)}")
350 |                     cost = 0.0
351 | 
352 |                 usage = {
353 |                     'completion_tokens': completion_tokens,
354 |                     'prompt_tokens': prompt_tokens,
355 |                     'total_tokens': total_tokens,
356 |                     'cost': cost
357 |                 }
358 | 
359 |                 # Update totals
360 |                 self.total_prompt_tokens += prompt_tokens
361 |                 self.total_completion_tokens += completion_tokens
362 |                 self.total_cost += cost
363 | 
364 |             except Exception as e:
365 |                 logger.error(f"Failed to extract usage information: {str(e)}")
366 | 
367 |         message_content = response.choices[0].message.content if response.choices else ""
368 | 
369 |         # Extract code from the response
370 |         code, language = self.extract_code(message_content or "")
371 | 
372 |         result = {
373 |             "generation": message_content,
374 |             "code": code,
375 |             "language": language,
376 |             "model_kwargs": {
377 |                 "seed": seed,
378 |             },
379 |             "metadata": {
380 |                 "usage": usage,
381 |                 "timestamp": datetime.now().isoformat(),
382 |                 "stop_reason": stop_reason
383 |             }
384 |         }
385 |         return result
386 | 
387 |     async def create_solution_requests(self, subtasks: List[Dict]) -> List[Dict]:
388 |         """Prepare result entries for a single problem."""
389 |         results = []
390 |         for subtask in subtasks:
391 |             prompt = subtask['problem']
392 |             for i in range(self.num_generations):
393 |                 try:
394 |                     random_uuid = str(uuid.uuid4())
395 | 
396 |                     results.append({
397 |                         "year": subtask['year'],
398 |                         "problem_id": subtask['id'],
399 |                         "subtask": subtask["subtask"],
400 |                         "prompt": prompt,
401 |                         "generation": None,
402 |                         "code": "",
403 |                         "language": "unknown",
404 |                         "solution_number": i,
405 |                         "uuid": random_uuid,
406 |                         "model_kwargs": {"seed": i},
407 |                         "metadata": {
408 |                             "usage": {'completion_tokens': 0, 'prompt_tokens': 0, 'total_tokens': 0, 'cost': 0.0},
409 |                             "timestamp": datetime.now().isoformat()
410 |                         }
411 |                     })
412 |                 except Exception as e:
413 |                     logger.error(f"Failed to prepare prompts for problem {subtask['id']}, subtask {subtask['subtask']}: {str(e)}")
414 |                     return []
415 | 
416 |         return results
417 | 
418 |     async def run_evaluation(self):
419 |         """Run the evaluation for all problems."""
420 |         try:
421 |             # Create HTTP session if using direct API calls
422 |             if not self.use_litellm and not self.dry_run:
423 |                 import aiohttp
424 |                 self._session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout), connector=aiohttp.TCPConnector(limit=self.concurrency, ttl_dns_cache=300, keepalive_timeout=self.timeout))
425 | 
426 | 
427 |             logger.info(f"Loading IOI dataset for subset: {self.subset}")
428 |             dataset = load_dataset("open-r1/ioi", split=self.subset)
429 |             problem_subtasks = defaultdict(list)
430 |             for problem in dataset:
431 |                 problem_subtasks[(problem["year"], problem["id"])].append(problem)
432 |             problem_ids = list(problem_subtasks.keys())
433 |             if self.num_problems is not None:
434 |                 problem_ids = problem_ids[:self.num_problems]
435 |                 logger.info(f"Limited evaluation to first {self.num_problems} problems")
436 | 
437 |             logger.info(f"Starting evaluation of {len(problem_ids)} problems...")
438 | 
439 |             # Step 1: Generate all solution requests
440 |             all_solution_requests = []
441 |             for problem_id in tqdm(problem_ids, desc="Preparing solution requests"):
442 |                 subtasks = problem_subtasks[problem_id]
443 |                 if self.last_subtask:
444 |                     subtasks = [subtasks[-1]]
445 |                 requests = await self.create_solution_requests(subtasks)
446 |                 all_solution_requests.extend(requests)
447 | 
448 |             # Convert to Polars DataFrame for efficient operations
449 |             requests_df = pl.DataFrame(all_solution_requests)
450 |             logger.info(f"Created {len(requests_df)} solution requests")
451 | 
452 |             # Step 2: Load previous results
453 |             previous_df = None
454 |             if not self.override:
455 |                 previous_df = await self.load_previous_results()
456 |                 if previous_df is not None:
457 |                     logger.info(f"Loaded {len(previous_df)} previous results")
458 | 
459 | 
460 |             # Step 3: Merge solution requests with previous results efficiently
461 |             if previous_df is not None:
462 |                 # Keep only the columns we want to preserve from previous results
463 |                 preserve_cols = ['generation', 'code', 'language', 'metadata', 'model_kwargs']
464 | 
465 |                 preserve_cols_with_key = preserve_cols + ['year', 'problem_id', 'subtask', 'solution_number']
466 |                 previous_df = previous_df.select(preserve_cols_with_key).filter(pl.col('generation').is_not_null() & (pl.col('generation') != ""))
467 | 
468 |                 # Merge using polars, keeping all solution requests and only matching previous results
469 |                 merged_df = requests_df.join(
470 |                     previous_df,
471 |                     on=('year', 'problem_id', 'subtask', 'solution_number'),
472 |                     how='left',
473 |                     suffix='_prev'
474 |                 )
475 | 
476 |                 # Update values from previous results where they exist
477 |                 for col in preserve_cols:
478 |                     prev_col = f'{col}_prev'
479 |                     merged_df = merged_df.with_columns(
480 |                         pl.when(pl.col(prev_col).is_not_null())
481 |                         .then(pl.col(prev_col))
482 |                         .otherwise(pl.col(col))
483 |                         .alias(col)
484 |                     )
485 | 
486 |                 # Drop the _prev columns
487 |                 merged_df = merged_df.select([
488 |                     c for c in merged_df.columns if not c.endswith('_prev')
489 |                 ])
490 |             else:
491 |                 merged_df = requests_df
492 | 
493 |             # Count how many need to be generated
494 |             to_generate_df = merged_df.filter(
495 |                 (pl.col('generation').is_null()) |
496 |                 (pl.col('generation') == "")
497 |             )
498 | 
499 |             # Materialize the entries that still need to be generated
500 |             to_generate_dicts = to_generate_df.to_dicts()
501 |             logger.info(f"Need to generate {len(to_generate_df)} out of {len(merged_df)} total entries")
502 | 
503 |             if len(to_generate_df) == 0:
504 |                 logger.info("No generations needed - all results are already available")
505 |                 return
506 | 
507 |             # Run generations for entries without results
508 |             async def process_single(row: Dict) -> Dict:
509 |                 async with self._semaphore:
510 |                     try:
511 |                         llm_result = await self.call_llm(
512 |                             row["prompt"],
513 |                             row["model_kwargs"]["seed"]
514 |                         )
515 | 
516 |                         # Log progress and token usage
517 |                         if llm_result["metadata"].get("usage"):
518 |                             usage = llm_result["metadata"]["usage"]
519 |                             logger.info(
520 |                                 f"Problem {row['problem_id']} (Solution {row['solution_number']}) - "
521 |                                 f"Tokens: {usage.get('total_tokens', 0)} "
522 |                                 f"(prompt: {usage.get('prompt_tokens', 0)}, "
523 |                                 f"completion: {usage.get('completion_tokens', 0)}) - "
524 |                                 f"Cost: ${usage.get('cost', 0.0):.4f}"
525 |                             )
526 | 
527 |                         llm_result["uuid"] = row["uuid"]
528 | 
529 |                         # Save result immediately
530 |                         await self.save_result_locally(llm_result, row["year"], row["problem_id"], row["subtask"], row["solution_number"])
531 | 
532 |                         return llm_result
533 |                     except Exception as e:
534 |                         logger.error(f"Failed generation for problem {row['problem_id']}: {str(e)}")
535 |                         error_result = {
536 |                             "generation": "",
537 |                             "code": "",
538 |                             "language": "unknown",
539 |                             "uuid": row["uuid"],
540 |                             "metadata": {
541 |                                 "error": str(e),
542 |                                 "usage": {'completion_tokens': 0, 'prompt_tokens': 0, 'total_tokens': 0, 'cost': 0.0},
543 |                                 "timestamp": datetime.now().isoformat(),
544 |                                 "stop_reason": "error"  # Add stop reason for error case
545 |                             }
546 |                         }
547 |                         return error_result
548 | 
549 |             # Run generations in parallel with controlled concurrency
550 |             tasks = [process_single(row) for row in to_generate_dicts]
551 |             generated_results = await tqdm.gather(*tasks, desc="Running generations")
552 | 
553 |             # Convert generated results to DataFrame and update original DataFrame
554 |             generated_df = pl.DataFrame(generated_results)
555 | 
556 |             # Merge generated results with previous results
557 |             merged_df = merged_df.join(
558 |                 generated_df,
559 |                 on='uuid',
560 |                 how='left',
561 |                 suffix='_gen'
562 |             )
563 | 
564 |             # Update the old columns with the new values
565 |             for col in ['generation', 'code', 'language', 'metadata', 'model_kwargs']:
566 |                 merged_df = merged_df.with_columns(
567 |                     pl.when(pl.col('generation_gen').is_not_null() & (pl.col('generation_gen') != ""))
568 |                     .then(pl.col(f'{col}_gen'))
569 |                     .otherwise(pl.col(col))
570 |                     .alias(col)
571 |                 )
572 | 
573 |             # Drop the _gen columns
574 |             merged_df = merged_df.select([
575 |                 c for c in merged_df.columns if not c.endswith('_gen')
576 |             ])
577 | 
578 |             # Validate results before pushing to hub
579 |             valid_results = merged_df.filter(
580 |                 (pl.col('generation').is_not_null()) &
581 |                 (pl.col('generation') != "")
582 |             )
583 | 
584 |             total_expected = len(merged_df)
585 |             total_valid = len(valid_results)
586 | 
587 |             logger.info(f"Valid results: {total_valid}/{total_expected}")
588 | 
589 |             # Only push to hub if all results are valid
590 |             if total_valid == total_expected:
591 |                 # Convert to HF Dataset
592 |                 output_dataset = Dataset.from_polars(merged_df)
593 |                 model_name = self.get_model_name()
594 | 
595 |                 try:
596 |                     output_dataset.push_to_hub(f"{self.org_id}/{model_name}")
597 |                     logger.info(f"Pushed to hub: {self.org_id}/{model_name}")
598 |                 except Exception as e:
599 |                     logger.error(f"Failed to push to hub: {str(e)}")
600 |             else:
601 |                 logger.warning(
602 |                     f"Not pushing to hub - missing {total_expected - total_valid} valid results. "
603 |                     "Results saved locally and can be retried later."
604 |                 )
605 | 
606 |             # Log final statistics
607 | 
608 |             logger.info(
609 |                 f"Total tokens used: {self.total_prompt_tokens + self.total_completion_tokens} "
610 |                 f"(prompt: {self.total_prompt_tokens}, completion: {self.total_completion_tokens})"
611 |             )
612 |             logger.info(f"Total cost: ${self.total_cost:.4f}")
613 | 
614 |             # Clean up HTTP session if using direct API calls
615 |             if self._session is not None:
616 |                 await self._session.close()
617 |                 self._session = None
618 | 
619 |             return merged_df
620 |         except Exception as e:
621 |             # Clean up HTTP session if using direct API calls
622 |             if self._session is not None:
623 |                 await self._session.close()
624 |                 self._session = None
625 |             raise e
626 | 
627 | 
628 |     def get_model_name(self):
629 |         model_name = f"ioi-eval-{self.model_id.replace('/', '_')}"
630 |         if self.dry_run:
631 |             model_name = f"dummy-{model_name}"
632 | 
633 |         if self.revision:
634 |             model_name = f"{model_name}-{self.revision.replace('/', '_')}"
635 | 
636 |         if self.model_postfix:
637 |             model_name = f"{model_name}-{self.model_postfix}"
638 | 
639 |         return model_name
640 | 
641 | 
642 | def main():
643 |     load_dotenv()  # Load environment variables from .env file
644 | 
645 |     import argparse
646 |     parser = argparse.ArgumentParser(description="Evaluate LLMs on IOI problems")
647 |     parser.add_argument("--org_id", required=True, help="Organization ID")
648 |     parser.add_argument("--model_id", required=True, help="Model ID")
649 |     parser.add_argument("--api_base", help="API base URL for the model")
650 |     parser.add_argument("--subset", default="test", help="IOI subset to generate solutions for (train or test)")
651 |     parser.add_argument("--num_generations", type=int, default=50, help="Number of generations per problem")
652 |     parser.add_argument("--num_retries", type=int, default=10, help="Number of retries for failed API calls")
653 |     parser.add_argument("--concurrency", type=int, default=20, help="Number of concurrent generations")
654 |     parser.add_argument("--num_problems", type=int, default=None, help="Number of problems to evaluate (None for all)")
655 |     parser.add_argument("--last_subtask", action="store_true", help="Only evaluate the last subtask for each problem (usually the full problem)")
656 |     parser.add_argument("--dry_run", action="store_true", help="Run without making actual LLM calls")
657 |     parser.add_argument("--override", action="store_true", help="Override existing results and start fresh")
658 |     parser.add_argument("--model_postfix", help="Postfix for the model name")
659 |     parser.add_argument("--revision", help="Revision to use for the model")
660 |     parser.add_argument("--timeout", type=int, default=600, help="Timeout for the LLM call")
661 |     parser.add_argument("--use_requests", action="store_true", default=False, help="Use requests instead of litellm")
662 |     parser.add_argument("--max_tokens", type=int, default=None, help="Max tokens")
663 |     args = parser.parse_args()
664 | 
665 |     evaluator = IOIEvaluator(
666 |         org_id=args.org_id,
667 |         model_id=args.model_id,
668 |         api_base=args.api_base,
669 |         subset=args.subset,
670 |         num_generations=args.num_generations,
671 |         num_retries=args.num_retries,
672 |         concurrency=args.concurrency,
673 |         num_problems=args.num_problems,
674 |         last_subtask=args.last_subtask,
675 |         dry_run=args.dry_run,
676 |         override=args.override,
677 |         model_postfix=args.model_postfix,
678 |         revision=args.revision,
679 |         timeout=args.timeout,
680 |         use_requests=args.use_requests,
681 |         max_tokens=args.max_tokens
682 |     )
683 |     asyncio.run(evaluator.run_evaluation())
684 | 
685 | if __name__ == "__main__":
686 |     main()
--------------------------------------------------------------------------------