├── reward_model
    ├── train
    │   ├── README.md
    │   ├── log.txt
    │   ├── download.py
    │   ├── requirements.txt
    │   ├── script
    │   │   └── train_qwen.sh
    │   ├── train.py
    │   ├── preprocess.py
    │   └── utils.py
    ├── run.sh
    ├── grade.py
    └── llm
    │   └── backend.py
├── points.npy
├── assets
    ├── comp.png
    ├── logo.png
    ├── steps.png
    ├── example.png
    └── overview.png
├── .gitignore
├── evolve_agent
    ├── __init__.py
    ├── prompt
    │   ├── __init__.py
    │   └── templates.py
    ├── llm
    │   ├── __init__.py
    │   ├── base.py
    │   ├── ensemble.py
    │   └── openai.py
    ├── utils
    │   ├── __init__.py
    │   ├── metrics_utils.py
    │   ├── format_utils.py
    │   ├── code_utils.py
    │   └── async_utils.py
    ├── evaluation_result.py
    ├── cli.py
    └── reward_model.py
├── benchmark
    ├── heilbronn_in_the_unit_square
    │   ├── points.npy
    │   ├── initial_proposal.txt
    │   ├── evaluator.py
    │   ├── visualization.py
    │   └── initial_program.py
    ├── MSTD
    │   ├── initial_program.py
    │   ├── initial_proposal.txt
    │   └── evaluator.py
    ├── minizing_raio_max_min_distance
    │   ├── initial_proposal.txt
    │   ├── evaluator.py
    │   └── initial_program.py
    ├── packing_circles
    │   ├── initial_proposal.txt
    │   ├── evaluator.py
    │   └── initial_program.py
    ├── third_autocorrelation_inequality
    │   ├── initial_proposal.txt
    │   ├── evaluator.py
    │   └── initial_program.py
    ├── littlewood_polynomials
    │   ├── initial_program.py
    │   ├── initial_proposal.txt
    │   └── evaluator.py
    ├── spherical_code
    │   ├── initial_proposal.txt
    │   ├── evaluator.py
    │   ├── initial_program.py
    │   └── visualization.py
    ├── kissing_number
    │   ├── initial_proposal.txt
    │   └── evaluator.py
    ├── autoconvolution_peak_minimization
    │   ├── initial_proposal.txt
    │   ├── evaluator.py
    │   └── initial_program.py
    └── human_best.txt
├── run.py
├── configs
    ├── island_config_example.yaml
    ├── README.md
    ├── island_examples.yaml
    └── default_config.yaml
├── compute.py
└── README.md


/reward_model/train/README.md:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reward_model/train/log.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/points.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/answers111/alpha-research/HEAD/points.npy


--------------------------------------------------------------------------------
/assets/comp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/answers111/alpha-research/HEAD/assets/comp.png


--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/answers111/alpha-research/HEAD/assets/logo.png


--------------------------------------------------------------------------------
/assets/steps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/answers111/alpha-research/HEAD/assets/steps.png


--------------------------------------------------------------------------------
/assets/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/answers111/alpha-research/HEAD/assets/example.png


--------------------------------------------------------------------------------
/assets/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/answers111/alpha-research/HEAD/assets/overview.png


--------------------------------------------------------------------------------
/reward_model/run.sh:
--------------------------------------------------------------------------------
1 | export CUDA_VISIBLE_DEVICES=0  # expose only GPU 0 to the grading process
2 | python /data/zhuotaodeng/yzj/alpha-research/idea-eval/grade.py  # NOTE(review): absolute, machine-specific path — confirm it corresponds to this repo's reward_model/grade.py


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__/
3 | results
4 | configs/kimi_config.yaml
5 | configs/oai_config.yaml
6 | configs/deepseek_config.yaml
7 | 


--------------------------------------------------------------------------------
/evolve_agent/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.1.0"
2 | 
3 | from evolve_agent.controller import EvolveAgent
4 | 
5 | __all__ = ["EvolveAgent"]
6 | 


--------------------------------------------------------------------------------
/benchmark/heilbronn_in_the_unit_square/points.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/answers111/alpha-research/HEAD/benchmark/heilbronn_in_the_unit_square/points.npy


--------------------------------------------------------------------------------
/reward_model/train/download.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM
2 | 
3 | model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-Coder-32B-Instruct')  # load by hub ID; Transformers downloads and caches the checkpoint if it is not already local
4 | 
5 | print(model)  # print the loaded model's repr as a quick sanity check


--------------------------------------------------------------------------------
/evolve_agent/prompt/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Prompt module initialization
3 | """
4 | 
5 | from evolve_agent.prompt.sampler import PromptSampler
6 | from evolve_agent.prompt.templates import TemplateManager
7 | 
8 | __all__ = ["PromptSampler", "TemplateManager"]
9 | 


--------------------------------------------------------------------------------
/evolve_agent/llm/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | LLM module initialization
 3 | """
 4 | 
 5 | from evolve_agent.llm.base import LLMInterface
 6 | from evolve_agent.llm.ensemble import LLMEnsemble
 7 | from evolve_agent.llm.openai import OpenAILLM
 8 | 
 9 | __all__ = ["LLMInterface", "OpenAILLM", "LLMEnsemble"]
10 | 


--------------------------------------------------------------------------------
/reward_model/train/requirements.txt:
--------------------------------------------------------------------------------
 1 | aiohttp
 2 | openai==0.28.1
 3 | httpx==0.25.1
 4 | markdown2==2.4.10
 5 | nh3==0.2.14
 6 | numpy==1.26.2
 7 | pydantic==1.10.13
 8 | psutil==5.9.6
 9 | requests==2.32.0
10 | rich==13.7.0
11 | tiktoken==0.5.1
12 | uvicorn==0.24.0.post1
13 | accelerate==0.25.0
14 | peft==0.6.2
15 | sentencepiece==0.1.99
16 | protobuf==4.23.4
17 | einops==0.7.0
18 | wandb==0.16.0
19 | torch==2.1.2
20 | transformers==4.37.1
21 | fastchat==0.1.0
22 | # conda config  (stray shell command fragment, not a pip requirement)


--------------------------------------------------------------------------------
/benchmark/MSTD/initial_program.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | 
 4 | def main():  # Build the baseline (A_ind, B_ind) pair of length-N 0/1 indicator arrays.
 5 |     N = 30
 6 |     # Conway MSTD set example; we take A=B for classical MSTD
 7 |     A = [0, 2, 3, 4, 7, 11, 12, 14]
 8 |     B = A[:]  # independent copy of A, so B_ind below equals A_ind
 9 |     A_ind = np.zeros(N, dtype=int); A_ind[A] = 1  # set entry i to 1 for every i in A
10 |     B_ind = np.zeros(N, dtype=int); B_ind[B] = 1
11 |     return A_ind, B_ind
12 | 
13 | 
14 | # Ensure the module-level globals A_indicators / B_indicators exist for the evaluator; build them via main() only if they are not already defined
15 | try:
16 |     A_indicators; B_indicators  # type: ignore[name-defined]
17 | except NameError:
18 |     A_indicators, B_indicators = main()
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/evolve_agent/llm/base.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Base LLM interface
 3 | """
 4 | 
 5 | from abc import ABC, abstractmethod
 6 | from typing import Any, Dict, List, Optional
 7 | 
 8 | 
 9 | class LLMInterface(ABC):
10 |     """Abstract interface for LLM backends; concrete subclasses must implement both async methods below"""
11 | 
12 |     @abstractmethod
13 |     async def generate(self, prompt: str, **kwargs) -> str:
14 |         """Generate text from a single prompt string; extra options are passed through **kwargs"""
15 |         pass
16 | 
17 |     @abstractmethod
18 |     async def generate_with_context(
19 |         self, system_message: str, messages: List[Dict[str, str]], **kwargs
20 |     ) -> str:
21 |         """Generate text from a system message plus a list of message dicts (the conversational context)"""
22 |         pass
23 | 


--------------------------------------------------------------------------------
/benchmark/minizing_raio_max_min_distance/initial_proposal.txt:
--------------------------------------------------------------------------------
1 | Problem. Arrange n points in [0,1]^d to optimize the dispersion/packing–covering tradeoff. The benchmark metric is
2 |   ratio = ((min pairwise distance) / (max pairwise distance))^2
3 | so that larger ratio is better (values in (0,1]).
4 | Evaluator. Given a program exposing max_min_dis_ratio(n,d), we obtain configurations for (n,d)=(16,2) and (14,3), then report this squared ratio for each case.
5 | 
6 | Baseline algorithm. The initial program employs enhanced simulated annealing with adaptive cooling, neighbor-repulsion moves, periodic smoothing via k-NN weighted averages, and a local refinement stage. KD-tree acceleration is used for nearest-neighbor queries; hyperparameters adapt to dimension.


--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
 1 | from evolve_agent import EvolveAgent    
 2 | import asyncio
 3 | import logging
 4 | import os
 5 | 
 6 | os.environ['CUDA_VISIBLE_DEVICES'] = '6'  # NOTE(review): hard-coded GPU index, set after evolve_agent is imported — confirm nothing initializes CUDA during that import
 7 | # logging.basicConfig(
 8 | #     level=logging.DEBUG, 
 9 | #     # format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', 
10 | #     # filename='app.log',  
11 | # )
12 | 
13 | evolve_agent = EvolveAgent(  # NOTE: results/ and configs/oai_config.yaml are git-ignored, so these inputs must be created locally before running
14 |     initial_program_path="results/initial_program.py",
15 |     evaluation_file="results/evaluator.py",
16 |     initial_proposal_path="results/initial_proposal.txt",
17 |     config_path="configs/oai_config.yaml"
18 | )
19 | 
20 | async def main():  # run 50 evolution iterations and print whatever run() returns
21 |     best_program = await evolve_agent.run(iterations=50) 
22 |     print(best_program)
23 | 
24 | asyncio.run(main())  # executes at module top level (no __main__ guard)
25 | # print(evolve_agent)
26 | 
27 | 


--------------------------------------------------------------------------------
/benchmark/packing_circles/initial_proposal.txt:
--------------------------------------------------------------------------------
 1 | Packing circles inside a unit square to maximize sum of radii
 2 | Given an integer n, place n disjoint circles in [0,1]^2 to maximize the total sum of radii.
 3 | 
 4 | Objective and metric
 5 | - Score = total sum of radii (larger is better).
 6 | - Validity: circles must be pairwise disjoint and fully inside the unit square.
 7 | 
 8 | Notes on records
 9 | - This variable-radius “sum of radii” objective is not the classical equal-radius packing; authoritative SOTA tables are not standardized.
10 | - Values reported in code or experiments should be treated as benchmarks rather than literature SOTA.
11 | 
12 | Goal
13 | - Create algorithms that increase the total sum of radii for n ∈ {26, 32} under the above validity constraints.
14 | 


--------------------------------------------------------------------------------
/benchmark/third_autocorrelation_inequality/initial_proposal.txt:
--------------------------------------------------------------------------------
1 | Third-order autocorrelation inequality (C3 upper bound)
2 | 
3 | Problem. For piecewise-constant nonnegative functions on a fixed support with unit mass, we evaluate an upper bound C_upper_bound derived from the maximum of the autoconvolution (normalized by squared L1 mass). The benchmark score is
4 |   score = 1 / C_upper_bound
5 | so larger score indicates a smaller upper bound and hence a better result.
6 | 
7 | Evaluator. The evaluator calls find_better_c3_upper_bound() from the target program to obtain step heights, computes the (normalized) autoconvolution maximum, and returns 1/C_upper_bound.
8 | 
9 | Baseline algorithm. A simple genetic algorithm over height sequences (tournament selection, one-point crossover, Gaussian mutation) serves as the baseline search method.


--------------------------------------------------------------------------------
/benchmark/littlewood_polynomials/initial_program.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | def rudin_shapiro(n: int):
 4 |     """First n signs of the Rudin–Shapiro sequence (±1): a[k] = (-1)^(number of adjacent '11' bit pairs in binary k)."""
 5 |     a = np.ones(n, dtype=int)
 6 |     for k in range(n):
 7 |         x, cnt, prev = k, 0, 0  # cnt tracks the parity of adjacent '11' bit pairs seen so far
 8 |         while x:  # scan the bits of k from least to most significant
 9 |             b = x & 1
10 |             if b & prev:  # saw '11'
11 |                 cnt ^= 1
12 |             prev = b
13 |             x >>= 1
14 |         a[k] = 1 if cnt == 0 else -1  # even count -> +1, odd count -> -1
15 |     return a
16 | 
17 | def random_littlewood(n: int, seed=0):  # n random ±1 coefficients drawn from a generator seeded with `seed`
18 |     rng = np.random.default_rng(seed)
19 |     return rng.choice([-1, 1], size=n).astype(int)
20 | 
21 | def main():  # Build and return the baseline coefficients: Rudin–Shapiro signs of length n = 512
22 |     n = 512
23 |     c = rudin_shapiro(n)
24 |     print(f"n={n}, coeffs={len(c)}")  # side effect: prints a one-line summary on every call
25 |     return c
26 | 
27 | if __name__ == "__main__":
28 |     coeffs = main()
29 | 
30 | # Ensure a module-level `coeffs` exists for evaluators that read it: on import the guard above is skipped, so main() runs here instead
31 | try:
32 |     coeffs  # type: ignore[name-defined]
33 | except NameError:
34 |     coeffs = main()
35 | 


--------------------------------------------------------------------------------
/benchmark/MSTD/initial_proposal.txt:
--------------------------------------------------------------------------------
 1 | MSTD (More Sums Than Differences) Benchmark
 2 | 
 3 | Objective
 4 | - Classical MSTD (enforced): Given A ⊂ {0,1,...,N-1} represented by a 0/1 indicator array of length N,
 5 |   maximize the ratio R = |A+A| / |A−A|.
 6 | - Score: score = R (higher is better).
 7 | - Comparisons should be made under the same N.
 8 | 
 9 | Default setup in this benchmark
10 | - N = 30.
11 | - Evaluator enforces A=B (classical setting). If a pair (A,B) is provided, B is ignored and A is used.
12 | 
13 | Known best for N = 30 (baseline)
14 | - Conway’s MSTD set A = {0,2,3,4,7,11,12,14} yields R ≈ 1.04.
15 | - This is the baseline included in initial_program.py.
16 | - Better ratios may exist for N=30; pushing R upwards is the optimization goal.
17 | 
18 | Notes
19 | - R>1 is rare and indicates sum-dominance.
20 | - The ratio depends strongly on N; do not compare ratios across different N without a normalization scheme.
21 | - If cross-N comparison is necessary, consider reporting both R and N, or use log R as an auxiliary measure.
22 | 


--------------------------------------------------------------------------------
/benchmark/heilbronn_in_the_unit_square/initial_proposal.txt:
--------------------------------------------------------------------------------
 1 | Heilbronn in the unit square: maximize smallest triangle area (n = 16)
 2 | 
 3 | Problem definition. Given n = 16 points in [0,1]^2, define Δ(P) as the minimum triangle area over all triples from configuration P. The task is to maximize Δ(P).
 4 | 
 5 | Metric (larger is better). Use the raw minimum triangle area:
 6 |   score = min_area.
 7 | For reference we also report scaled_min_area = n^{8/7 + 1/2000} * min_area (informational only).
 8 | 
 9 | Constraints. Points must lie in [0,1]^2.
10 | 
11 | Evaluator. The evaluator returns {"score" (= min_area), "scaled_min_area", "min_area", "n"}.
12 | 
13 | Baseline. The initial program seeds from hexagonal/grid/Poisson-disk variants and refines to increase the minimum triangle area for n = 16.
14 | 
15 | Human best. For n = 16, the best-known record is A = 7/341 ≈ 0.020528; see Erich Friedman's compilation [https://erich-friedman.github.io/packing/heilbronn/]. Global optimality has not been proven in general.
16 | 
17 | Goal. Construct point sets that maximize min_area (and thus also increase scaled_min_area accordingly) for n = 16.
18 | 


--------------------------------------------------------------------------------
/benchmark/third_autocorrelation_inequality/evaluator.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | def evaluate(program_path: str = "/data/zhuotaodeng/yzj/_para/alpha-research/math/initial_program.py"):  # NOTE(review): default is a machine-specific absolute path
 4 |     """
 5 |     Evaluate find_better_c3_upper_bound() from the given program file.
 6 |     Returns {"score": 1.0 / C_upper_bound}, or {"error": -10.0} if that call raises.
 7 |     """
 8 |     import importlib.util
 9 |     import sys
10 |     
11 |     # Load the module from the given path
12 |     spec = importlib.util.spec_from_file_location("program", program_path)
13 |     program = importlib.util.module_from_spec(spec)
14 |     sys.modules["program"] = program
15 |     spec.loader.exec_module(program)  # runs the program's top-level code; errors here propagate (outside the try below)
16 |     try:
17 |         height_sequence_3 = program.find_better_c3_upper_bound()
18 |     except:  # NOTE(review): bare except also catches KeyboardInterrupt/SystemExit
19 |         return {"error": -10.0}
20 |     
21 |     convolution_3 = np.convolve(height_sequence_3, height_sequence_3)  # discrete autoconvolution of the heights
22 |     C_upper_bound = abs(2 * len(height_sequence_3) * np.max(convolution_3) / (np.sum(height_sequence_3)**2))  # 2 * n * max(conv) / (sum of heights)^2
23 |     
24 |     return {"score": 1.0 / C_upper_bound}
25 | 
26 | print(evaluate())  # NOTE(review): no __main__ guard — this runs evaluate() with the default path whenever the module is imported


--------------------------------------------------------------------------------
/evolve_agent/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Utilities module initialization
 3 | """
 4 | 
 5 | from evolve_agent.utils.async_utils import (
 6 |     TaskPool,
 7 |     gather_with_concurrency,
 8 |     retry_async,
 9 |     run_in_executor,
10 | )
11 | from evolve_agent.utils.code_utils import (
12 |     apply_diff,
13 |     calculate_edit_distance,
14 |     extract_code_language,
15 |     extract_diffs,
16 |     format_diff_summary,
17 |     parse_evolve_blocks,
18 |     parse_full_rewrite,
19 | )
20 | from evolve_agent.utils.format_utils import (
21 |     format_metrics_safe,
22 |     format_improvement_safe,
23 | )
24 | from evolve_agent.utils.metrics_utils import (
25 |     safe_numeric_average,
26 |     safe_numeric_sum,
27 | )
28 | 
29 | __all__ = [
30 |     "TaskPool",
31 |     "gather_with_concurrency",
32 |     "retry_async",
33 |     "run_in_executor",
34 |     "apply_diff",
35 |     "calculate_edit_distance",
36 |     "extract_code_language",
37 |     "extract_diffs",
38 |     "format_diff_summary",
39 |     "parse_evolve_blocks",
40 |     "parse_full_rewrite",
41 |     "format_metrics_safe",
42 |     "format_improvement_safe",
43 |     "safe_numeric_average",
44 |     "safe_numeric_sum",
45 | ]
46 | 


--------------------------------------------------------------------------------
/benchmark/spherical_code/initial_proposal.txt:
--------------------------------------------------------------------------------
 1 | Spherical code (N=30 on S^2): maximize minimum pairwise angle on the unit sphere
 2 | 
 3 | Problem definition. Choose N=30 points on S^{2} to maximize the minimum pairwise angle θ_min = min_{i<j} arccos(⟨p_i,p_j⟩).
 4 | 
 5 | Constraints. Points are unit vectors (rows normalized). Metric is min_angle in radians.
 6 | 
 7 | Optimization goal. Maximize min_angle. The evaluator returns {score, min_angle, n, dimension}. All angles are in radians, and score = min_angle.
 8 | Best-known reference (for N=30 on S^2): cos(θ*) ≈ 0.7815518750949873 ⇒ θ* ≈ 0.6736467551690225 rad.
 9 | Reference table: Henry Cohn’s spherical codes data [https://cohn.mit.edu/spherical-codes/].
10 | 
11 | Best-known results (human). On S^2 (3D), small N optima coincide with symmetric polyhedra (e.g., tetrahedron, octahedron, icosahedral arrangements). For larger N, best codes come from numerical optimization; exact optimality is known only in limited cases.
12 | 
13 | Algorithm goal. Construct codes with larger min_angle. This baseline seeds with symmetric configurations and uses farthest-point max–min; stronger methods include energy minimization, projected gradient/coordinate descent, and stochastic max–min refinement.
14 | 
15 | 


--------------------------------------------------------------------------------
/reward_model/train/script/train_qwen.sh:
--------------------------------------------------------------------------------
 1 | export CUDA_VISIBLE_DEVICES=3,2
 2 | export WANDB_DISABLED=true
 3 | set -ex
 4 | LOG_PATH=./log.txt
 5 | SAVE_PATH=/data/zhuotaodeng/yzj/alpha-research/model/qwen25_grm_iclr_boxed
 6 | mkdir -p $SAVE_PATH
 7 | 
 8 | torchrun --nproc_per_node=2 \
 9 |     --master_port=20011 \
10 |     train.py \
11 |     --model_name_or_path /data/zhuotaodeng/yzj/download_from_modelscope/Qwen/Qwen2___5-7B-Instruct \
12 |     --data_path /data/zhuotaodeng/yzj/alpha-research/data/iclr_train_all_boxed.json \
13 |     --bf16 True \
14 |     --tf32 True \
15 |     --output_dir $SAVE_PATH \
16 |     --num_train_epochs 2 \
17 |     --per_device_train_batch_size 2 \
18 |     --per_device_eval_batch_size 2 \
19 |     --gradient_accumulation_steps 32 \
20 |     --evaluation_strategy "no" \
21 |     --save_strategy "steps" \
22 |     --save_steps 60 \
23 |     --save_total_limit 17 \
24 |     --learning_rate 1e-5 \
25 |     --weight_decay 0. \
26 |     --warmup_steps 100 \
27 |     --lr_scheduler_type "cosine" \
28 |     --logging_steps 1 \
29 |     --fsdp "full_shard auto_wrap" \
30 |     --fsdp_transformer_layer_cls_to_wrap 'Qwen2DecoderLayer' \
31 |     --model_max_length 16384 \
32 |     --gradient_checkpointing True \
33 |     --lazy_preprocess False
34 |      
35 |     > $LOG_PATH 2>&1
36 | 


--------------------------------------------------------------------------------
/benchmark/kissing_number/initial_proposal.txt:
--------------------------------------------------------------------------------
 1 | Kissing number in 11D: maximize valid contact points on the unit sphere
 2 | 
 3 | Problem. Given dimension d (here d = 11), construct as many unit vectors as possible on S^{d-1} so that every pair of distinct vectors has inner product ≤ 0.5. Equivalently, these are centers of equal unit spheres touching a central one without violating pairwise angle/packing constraints.
 4 | 
 5 | Constraints. Vectors must be unit length; for any distinct i ≠ j, ⟨v_i, v_j⟩ ≤ 0.5.
 6 | 
 7 | Optimization goal. Maximize the number of vectors (kissing points). The evaluator verifies unit norms and the 0.5 cap, then reports the count and dimension.
 8 | 
 9 | Best-known results (human). In 11D, the current constructive lower bound is 592 (Ganzhinov, PSU(4,2) construction). Upper bounds above 800 exist in the literature (exact best published bound depends on source); we focus on improving the constructive lower bound.
10 | 
11 | Algorithm goal. Create an algorithm that constructs large valid sets under the above constraints. This program follows a PSU(4,2)-based construction (when ATLAS data are available) yielding 592 in 11D, and otherwise uses a robust randomized MIS-style fallback to produce large valid configurations. The program outputs sphere_centers (unit vectors) for evaluator verification.
12 | 


--------------------------------------------------------------------------------
/benchmark/littlewood_polynomials/initial_proposal.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | Littlewood polynomials: minimize sup-norm on the unit circle
 3 | 
 4 | Problem definition. Choose coefficients c_k ∈ {±1} for P(z)=∑_{k=0}^{n−1} c_k z^k, |z|=1, to minimize the supremum norm ‖P‖_∞=max_{|z|=1}|P(z)|.
 5 | 
 6 | Constraints. Coefficients are ±1. Metric is supnorm estimated by FFT sampling on an equally spaced grid; denser grid → tighter upper bound.
 7 | 
 8 | Optimization goal. Minimize supnorm. The evaluator returns a single scalar: 1/supnorm if valid, else −1.0.
 9 | 
10 | Notes on bounds. For the Rudin–Shapiro construction of length n, a classical identity gives supnorm ≤ √(2n) (so the absolute constant C = √2 outside the √n). For the benchmark default n = 512, this yields supnorm ≤ √(1024) = 32 and thus score 1/32 = 0.03125.
11 | 
12 | Best-known results (human). The optimal value is Θ(√n). Upper bounds: Rudin–Shapiro and variants achieve ‖P‖_∞ ≤ C√n (C≈2 in practice). Lower bounds: ‖P‖_∞ ≥ c√n for an absolute c>0. Practical searches aim to reduce the constant for fixed n.
13 | 
14 | Algorithm goal. Construct ±1 sequences with smaller supnorm. This baseline uses Rudin–Shapiro; stronger methods include local flips, simulated annealing, coordinate descent on ±1 (with careful acceptance), and spectral heuristics guided by FFT magnitudes.
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/configs/island_config_example.yaml:
--------------------------------------------------------------------------------
 1 | # EvolveAgent Island-Based Evolution Configuration
 2 | # This configuration demonstrates the proper use of island-based evolution
 3 | 
 4 | # General settings
 5 | max_iterations: 1000
 6 | checkpoint_interval: 100
 7 | log_level: "INFO"
 8 | 
 9 | # LLM configuration
10 | llm:
11 |   primary_model: "gemini-2.0-flash-lite"
12 |   primary_model_weight: 0.8
13 |   secondary_model: "gemini-2.0-flash"
14 |   secondary_model_weight: 0.2
15 |   temperature: 0.7
16 |   top_p: 0.95
17 |   max_tokens: 4096
18 | 
19 | # Database configuration with proper island settings
20 | database:
21 |   population_size: 500
22 |   archive_size: 100
23 |   
24 |   # Island-based evolution settings
25 |   num_islands: 5                    # Number of separate populations
26 |   migration_interval: 50            # Migrate every 50 generations
27 |   migration_rate: 0.1               # Migrate 10% of top programs
28 |   
29 |   # Selection parameters
30 |   elite_selection_ratio: 0.1
31 |   exploration_ratio: 0.3
32 |   exploitation_ratio: 0.7
33 |   # Note: diversity_metric fixed to "edit_distance"
34 |   
35 |   # Feature map dimensions for MAP-Elites
36 |   feature_dimensions: ["score", "complexity"]
37 |   feature_bins: 10
38 | 
39 | # Prompt configuration
40 | prompt:
41 |   num_top_programs: 3
42 |   num_diverse_programs: 2
43 |   use_template_stochasticity: true
44 | 
45 | # Evaluator configuration
46 | evaluator:
47 |   timeout: 300
48 |   max_retries: 3
49 |   cascade_evaluation: true
50 |   parallel_evaluations: 4
51 | 
52 | # Evolution settings
53 | diff_based_evolution: true
54 | allow_full_rewrites: false
55 | max_code_length: 10000
56 | 


--------------------------------------------------------------------------------
/benchmark/minizing_raio_max_min_distance/evaluator.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import scipy as sp
 3 | 
 4 | 
 5 | def cal_ratio(construction_1):  # squared (min / max) ratio of the pairwise distances computed by pdist over construction_1
 6 |     pairwise_distances = sp.spatial.distance.pdist(construction_1)  # NOTE(review): relies on `import scipy` exposing scipy.spatial — confirm for the SciPy version in use
 7 |     min_distance = np.min(pairwise_distances)
 8 |     max_distance = np.max(pairwise_distances)
 9 |     ratio_squared = (min_distance / max_distance)**2
10 |     return ratio_squared
11 | 
12 | 
13 | def evaluate(program_path: str = "results/initial_program.py"):
14 |     """
15 |     Evaluate max_min_dis_ratio(n, d) from the given program file for (n, d) = (16, 2) and (14, 3); raises ValueError if it is not defined.
16 |     Returns the two squared min/max distance ratios, or a {"result": <negative code>, "error": <exception>} dict if a step fails.
17 |     """
18 |     import importlib.util
19 |     import sys
20 |     
21 |     # Load the module from the given path
22 |     spec = importlib.util.spec_from_file_location("program", program_path)
23 |     program = importlib.util.module_from_spec(spec)
24 |     sys.modules["program"] = program
25 |     spec.loader.exec_module(program)
26 | 
27 | 
28 |     # Check if 'max_min_dis_ratio' exists in the loaded module
29 |     if not hasattr(program, 'max_min_dis_ratio'):
30 |         raise ValueError(f"The file '{program_path}' does not define 'max_min_dis_ratio'.")
31 | 
32 |     try:
33 |         res_n16_d2, _ = program.max_min_dis_ratio(16, 2)  # each call must return a 2-item result; only the first item is used
34 |         res_n14_d3, _ = program.max_min_dis_ratio(14, 3)
35 |     except Exception as e1:
36 |         return {"result": -10.0, "error": e1}  # -10.0: calling (or unpacking) the program's function failed
37 |     
38 |     try:
39 |         ratio_n16_d2 = cal_ratio(res_n16_d2) # AlphaEvolve: 1 / 12.88926611203463 = 0.07758393622320406
40 |         ratio_n14_d3 = cal_ratio(res_n14_d3) # AlphaEvolve: 1 / 4.165849767 = 0.24004706263470807
41 | 
42 | 
43 |     except Exception as e:
44 |         return {"result": -1.0, "error": e}  # -1.0: the ratio could not be computed from the returned value
45 | 
46 |     results = {
47 |         "ratio_n16_d2": ratio_n16_d2,
48 |         "ratio_n14_d3": ratio_n14_d3, 
49 |     }
50 |     
51 |     return results
52 | 
53 | print(evaluate())  # NOTE(review): no __main__ guard — this runs evaluate() with the default path whenever the module is imported


--------------------------------------------------------------------------------
/benchmark/autoconvolution_peak_minimization/initial_proposal.txt:
--------------------------------------------------------------------------------
 1 | Autoconvolution peak minimization on a unit interval (standard normalization)
 2 | 
 3 | Problem definition. Let  
 4 | 
 5 | $$
 6 | \mathcal{F}=\Big\{\,f\in L^{1}\!\big([-{\tfrac12},{\tfrac12}]\big):\ f\ge 0,\ \int_{-1/2}^{1/2} f(x)\,dx=1\,\Big\}
 7 | $$
 8 | 
 9 | $$
10 | \qquad
11 | (f*f)(t)=\int_{\mathbb{R}} f(x)\,f(t-x)\,dx .
12 | $$
13 | 
14 | We seek to minimize the peak value of the autoconvolution:
15 | $$
16 | \mu_\infty \;=\; \inf_{f\in\mathcal{F}} \ \|f*f\|_\infty .
17 | $$
18 | 
19 | Constraints. Nonnegative density, unit mass (L1=1), support length 1 (here taken as [-1/2, 1/2]). In the implementation, f is represented by nonnegative step heights on a uniform grid and normalized to unit integral.
20 | 
21 | Optimization goal. Minimize \(\mu_\infty = \max_t (f*f)(t)\). Smaller is better.
22 | 
23 | Best-known human results. In this standard setup, the best currently published bounds are
24 | $$
25 | \boxed{0.64 \ \le\ \mu_\infty \ \le\ 0.75496}\,.
26 | $$
27 | The upper bound traces to work of Matolcsi–Vinuesa (after normalizing support length to 1), and the lower bound to Cloninger–Steinerberger.
28 | 
29 | Algorithm goal. Create an algorithm that constructs feasible densities with progressively smaller \(\mu_\infty\). This baseline program generates simple analytical candidates (box, triangle, cosine-squared, Gaussian) on a uniform grid, normalizes to unit mass, and computes autoconvolution via FFT to measure \(\mu_\infty\). It serves as a starting point for more advanced search/optimization methods.
30 | 
31 | References.
32 | - E. P. White, *An optimal $L^2$ autoconvolution inequality*, Canadian Mathematical Bulletin (2024).
33 | - M. Matolcsi and C. Vinuesa, *Improved bounds on the supremum of autoconvolutions*, J. Math. Anal. Appl. 372 (2010), 439–447.
34 | - A. Cloninger and S. Steinerberger, *On suprema of autoconvolutions with an application to Sidon sets*, Proc. Amer. Math. Soc. 145 (2017), 3191–3200.
35 | 


--------------------------------------------------------------------------------
/evolve_agent/evaluation_result.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Evaluation result structures for EvolveAgent
 3 | """
 4 | 
 5 | import json
 6 | from dataclasses import dataclass, field
 7 | from typing import Dict, Union
 8 | 
 9 | 
10 | @dataclass
11 | class EvaluationResult:
12 |     """
13 |     Result of program evaluation containing both metrics and optional artifacts
14 | 
15 |     This maintains backward compatibility with the existing dict[str, float] contract
16 |     while adding a side-channel for arbitrary artifacts (text or binary data).
17 |     """
18 | 
19 |     metrics: Dict[str, float]  # mandatory - existing contract
20 |     artifacts: Dict[str, Union[str, bytes]] = field(default_factory=dict)  # optional side-channel; a fresh empty dict per instance
21 | 
22 |     @classmethod
23 |     def from_dict(cls, metrics: Dict[str, float]) -> "EvaluationResult":
24 |         """Wrap a plain metrics dict in an EvaluationResult with no artifacts (backward compatibility)"""
25 |         return cls(metrics=metrics)
26 | 
27 |     def to_dict(self) -> Dict[str, float]:
28 |         """Backward compatibility - return just the metrics dict (the stored object, not a copy)"""
29 |         return self.metrics
30 | 
31 |     def has_artifacts(self) -> bool:
32 |         """Check if this result contains any artifacts"""
33 |         return bool(self.artifacts)
34 | 
35 |     def get_artifact_keys(self) -> list:
36 |         """Get list of artifact keys"""
37 |         return list(self.artifacts.keys())
38 | 
39 |     def get_artifact_size(self, key: str) -> int:
40 |         """Get size of one artifact in bytes: 0 if the key is absent, UTF-8 encoded length for str values"""
41 |         if key not in self.artifacts:
42 |             return 0
43 | 
44 |         value = self.artifacts[key]
45 |         if isinstance(value, str):
46 |             return len(value.encode("utf-8"))
47 |         elif isinstance(value, bytes):
48 |             return len(value)
49 |         else:  # fallback for values that are neither str nor bytes: size of their str() form
50 |             return len(str(value).encode("utf-8"))
51 | 
52 |     def get_total_artifact_size(self) -> int:
53 |         """Get total size of all artifacts in bytes (sum of get_artifact_size over every key)"""
54 |         return sum(self.get_artifact_size(key) for key in self.artifacts.keys())
55 | 


--------------------------------------------------------------------------------
/evolve_agent/utils/metrics_utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Safe calculation utilities for metrics containing mixed types
 3 | """
 4 | 
 5 | from typing import Any, Dict
 6 | 
 7 | 
 8 | def safe_numeric_average(metrics: Dict[str, Any]) -> float:
 9 |     """
10 |     Calculate the average of numeric values in a metrics dictionary,
11 |     safely ignoring non-numeric values like strings.
12 | 
13 |     Args:
14 |         metrics: Dictionary of metric names to values
15 | 
16 |     Returns:
17 |         Average of numeric values, or 0.0 if no numeric values found
18 |     """
19 |     if not metrics:
20 |         return 0.0
21 | 
22 |     numeric_values = []
23 |     for value in metrics.values():
24 |         if isinstance(value, (int, float)):
25 |             try:
26 |                 # Convert to float and check if it's a valid number
27 |                 float_val = float(value)
28 |                 if not (float_val != float_val):  # Check for NaN (NaN != NaN is True)
29 |                     numeric_values.append(float_val)
30 |             except (ValueError, TypeError, OverflowError):
31 |                 # Skip invalid numeric values
32 |                 continue
33 | 
34 |     if not numeric_values:
35 |         return 0.0
36 | 
37 |     return sum(numeric_values) / len(numeric_values)
38 | 
39 | 
def safe_numeric_sum(metrics: Dict[str, Any]) -> float:
    """
    Add up the numeric entries of a metrics dictionary.

    Values that are not ``int``/``float`` instances, values that cannot be
    converted to ``float``, and NaN values do not contribute to the total.

    Args:
        metrics: Dictionary of metric names to values

    Returns:
        Sum of the usable numeric values, or 0.0 when there are none
    """
    if not metrics:
        return 0.0

    running_total = 0.0
    for candidate in metrics.values():
        if not isinstance(candidate, (int, float)):
            continue
        try:
            as_float = float(candidate)
        except (ValueError, TypeError, OverflowError):
            # e.g. an int too large to be represented as a float
            continue
        if as_float == as_float:  # only NaN compares unequal to itself
            running_total += as_float

    return running_total
67 | 


--------------------------------------------------------------------------------
/benchmark/spherical_code/evaluator.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | import json
 4 | import numpy as np
 5 | from typing import Dict
 6 | import importlib.util
 7 | 
 8 | EPS = 1e-12
 9 | 
10 | def evaluate_spherical_code_min_angle(points: np.ndarray) -> Dict[str, float]:
11 |     P = np.asarray(points, dtype=float)
12 |     if P.ndim != 2 or P.shape[0] < 2:
13 |         return {"valid": 0.0, "min_angle": 0.0, "n": 0.0, "dimension": 0.0, "score": 0.0}
14 |     # normalize rows onto the sphere
15 |     norms = np.maximum(np.linalg.norm(P, axis=1, keepdims=True), EPS)
16 |     P = P / norms
17 |     n = P.shape[0]
18 |     d = P.shape[1]
19 |     min_angle = float("inf")
20 |     for i in range(n):
21 |         for j in range(i+1, n):
22 |             cosang = float(np.clip(np.dot(P[i], P[j]), -1.0, 1.0))
23 |             ang = float(np.arccos(cosang))
24 |             if ang < min_angle:
25 |                 min_angle = ang
26 |     return {"valid": 1.0, "min_angle": float(min_angle), "n": float(n), "dimension": float(d), "score": float(min_angle)}
27 | 
def evaluate(program_path: str):
    """
    Load a candidate program from ``program_path`` and score its point set.

    The points are taken from a module-level ``points`` attribute when present;
    otherwise ``main()`` is called and its ndarray return value (or a ``points``
    attribute set as a side effect) is used.

    Returns:
        Dict with "score", "min_angle", "n" and "dimension", or
        {"error": -1.0} when no points are found or anything raises.
    """
    try:
        spec = importlib.util.spec_from_file_location("program", program_path)
        module = importlib.util.module_from_spec(spec)
        sys.modules["program"] = module
        spec.loader.exec_module(module)

        if hasattr(module, 'points'):
            candidate = module.points
        elif hasattr(module, 'main'):
            returned = module.main()
            if isinstance(returned, np.ndarray):
                candidate = returned
            else:
                # main() may have published its result as a module attribute
                candidate = getattr(module, 'points', None)
        else:
            candidate = None

        if candidate is None:
            return {"error": -1.0}

        stats = evaluate_spherical_code_min_angle(candidate)
        return {key: stats[key] for key in ("score", "min_angle", "n", "dimension")}
    except Exception:
        return {"error": -1.0}
50 | 
if __name__ == "__main__":
    # Evaluate the program given on the command line; with no argument, use
    # the initial_program.py that sits next to this evaluator.
    try:
        here = os.path.dirname(__file__)
        default_path = os.path.join(here, "initial_program.py")
    except Exception:
        default_path = "initial_program.py"
    cli_args = sys.argv[1:]
    target = cli_args[0] if cli_args else default_path
    report = evaluate(target)
    print(json.dumps(report, ensure_ascii=False, indent=2))
58 | 


--------------------------------------------------------------------------------
/evolve_agent/utils/format_utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Utility functions for formatting output
 3 | """
 4 | 
 5 | from typing import Any, Dict
 6 | 
 7 | 
 8 | def format_metrics_safe(metrics: Dict[str, Any]) -> str:
 9 |     """
10 |     Safely format metrics dictionary for logging, handling both numeric and string values.
11 | 
12 |     Args:
13 |         metrics: Dictionary of metric names to values
14 | 
15 |     Returns:
16 |         Formatted string representation of metrics
17 |     """
18 |     if not metrics:
19 |         return ""
20 | 
21 |     formatted_parts = []
22 |     for name, value in metrics.items():
23 |         # Check if value is numeric (int, float)
24 |         if isinstance(value, (int, float)):
25 |             try:
26 |                 # Only apply float formatting to numeric values
27 |                 formatted_parts.append(f"{name}={value:.4f}")
28 |             except (ValueError, TypeError):
29 |                 # Fallback to string representation if formatting fails
30 |                 formatted_parts.append(f"{name}={value}")
31 |         else:
32 |             # For non-numeric values (strings, etc.), just convert to string
33 |             formatted_parts.append(f"{name}={value}")
34 | 
35 |     return ", ".join(formatted_parts)
36 | 
37 | 
def format_improvement_safe(parent_metrics: Dict[str, Any], child_metrics: Dict[str, Any]) -> str:
    """
    Render the per-metric change from parent to child as a log string.

    Only metrics present in both dictionaries with ``int``/``float`` values on
    both sides are reported; each is shown as a signed difference
    (child minus parent) with four decimal places.

    Args:
        parent_metrics: Parent program metrics
        child_metrics: Child program metrics

    Returns:
        Comma-separated ``metric=+/-diff`` pairs, or "" if either input is empty
    """
    if not parent_metrics or not child_metrics:
        return ""

    numeric = (int, float)
    deltas = []
    for metric, child_value in child_metrics.items():
        if metric not in parent_metrics:
            continue
        parent_value = parent_metrics[metric]
        if not (isinstance(child_value, numeric) and isinstance(parent_value, numeric)):
            continue
        try:
            deltas.append(f"{metric}={child_value - parent_value:+.4f}")
        except (ValueError, TypeError):
            # leave out anything that cannot be subtracted or formatted
            continue

    return ", ".join(deltas)
66 | 


--------------------------------------------------------------------------------
/benchmark/packing_circles/evaluator.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import numpy as np
 3 | import itertools
 4 | import random
 5 | 
# Seed the stdlib and NumPy global RNGs with a fixed value when this module is loaded.
random.seed(42)
np.random.seed(42)
 8 | 
 9 | 
def verify_circles(circles):
    """Checks that the circles are disjoint and lie inside a unit square.

    Args:
      circles: A list of tuples (x, y, radius) or numpy array of shape (num_circles, 3)

    Returns:
      bool: True if circles are valid (non-negative finite radii, pairwise
      disjoint, and within the unit square), False otherwise. Input that
      cannot be read as a numeric (num_circles, 3) array is reported as
      invalid rather than raising. An empty collection is vacuously valid.
    """
    try:
        circles = np.asarray(circles, dtype=float)
    except (TypeError, ValueError):
        return False

    if circles.size == 0:
        return True
    if circles.ndim != 2 or circles.shape[1] < 3:
        return False

    xs, ys, radii = circles[:, 0], circles[:, 1], circles[:, 2]

    # NaN/inf values and negative radii would slip through the comparisons
    # below (a negative radius makes both checks easier to satisfy).
    if not np.all(np.isfinite(circles[:, :3])) or np.any(radii < 0):
        return False

    # Check pairwise disjointness (touching circles are allowed)
    for circle1, circle2 in itertools.combinations(circles, 2):
        center_distance = np.sqrt((circle1[0] - circle2[0])**2 + (circle1[1] - circle2[1])**2)
        radii_sum = circle1[2] + circle2[2]
        if center_distance < radii_sum:
            return False

    # Check all circles lie inside the unit square [0,1]x[0,1]
    if np.any(np.minimum(xs, ys) - radii < 0) or np.any(np.maximum(xs, ys) + radii > 1):
        return False

    return True
36 | 
37 | 
def _radius_total(circles, expected_count):
    """Return the summed radii of a valid packing of exactly expected_count circles, else None."""
    try:
        arr = np.asarray(circles, dtype=float)
    except (TypeError, ValueError):
        return None
    if arr.ndim != 2 or arr.shape[0] != expected_count or arr.shape[1] < 3:
        return None
    if not np.all(np.isfinite(arr[:, :3])) or np.any(arr[:, 2] < 0):
        return None
    if not verify_circles(arr):
        return None
    return float(np.sum(arr[:, 2]))


def evaluate(program_path: str = "results/initial_program.py"):
    """
    Evaluate the pack_circles function from the given program file.

    ``pack_circles(n)`` is called for n = 26 and n = 32 and must return a
    ``(total, circles)`` pair. The totals used for scoring are recomputed from
    the returned circles, so a program cannot inflate its score by reporting a
    total that does not match its packing.

    Returns:
        dict with keys score, result_26, result_32 (score is the sum of the two
        radius totals); {"error": -10.0} if the program cannot be loaded or
        pack_circles raises; {"error": -1.0} if either packing is invalid or
        does not contain the requested number of circles.
    """
    import importlib.util
    import sys

    # Loading and running the candidate are both untrusted steps, so a failure
    # in either is reported as an error result instead of propagating.
    try:
        spec = importlib.util.spec_from_file_location("program", program_path)
        program = importlib.util.module_from_spec(spec)
        sys.modules["program"] = program
        spec.loader.exec_module(program)

        _, circles_26 = program.pack_circles(26)
        _, circles_32 = program.pack_circles(32)
    except Exception:
        return {"error": -10.0}

    total_r_26 = _radius_total(circles_26, 26)
    total_r_32 = _radius_total(circles_32, 32)
    if total_r_26 is None or total_r_32 is None:
        return {"error": -1.0}

    return {
        "score": total_r_26 + total_r_32,
        "result_26": total_r_26,
        "result_32": total_r_32
    }
72 | 
if __name__ == "__main__":
    # Only evaluate when run as a script. The previous unconditional call ran on
    # every import and pointed at an absolute path on one developer's machine.
    import sys

    # Optional first CLI argument: path of the program to evaluate; without it
    # evaluate() falls back to its own default path.
    print(evaluate(*sys.argv[1:2]))


--------------------------------------------------------------------------------
/benchmark/MSTD/evaluator.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import importlib.util
 3 | import sys
 4 | import os
 5 | import json
 6 | 
 7 | 
 8 | def _to_index_set(indicators):
 9 |     arr = np.asarray(indicators).astype(int)
10 |     return np.nonzero(arr)[0]
11 | 
12 | 
def mstd_ratio(A_idx, B_idx=None):
    """
    Classical MSTD ratio R = |A+A| / |A−A| for the index set ``A_idx``.

    ``B_idx`` is accepted for interface compatibility but always ignored, so
    the ratio is computed with B = A. Returns -1.0 for an empty set.
    """
    elements = A_idx  # classic setting: B is forced to equal A
    if len(elements) == 0:
        return -1.0

    sums = {int(a + b) for a in elements for b in elements}
    differences = {int(a - b) for a in elements for b in elements}
    if not differences:
        return -1.0
    return float(len(sums)) / float(len(differences))
30 | 
31 | 
def evaluate(program_path: str):
    """
    Load a candidate program and score its set by the classical MSTD ratio.

    The indicator vector is read from a module-level ``A_indicators`` attribute,
    or, if that is missing, from the tuple returned by ``main()`` (``(A,)`` or
    ``(A, B)``). Any ``B`` is read but not used: the ratio is always |A+A|/|A−A|.

    Returns:
        {"score": R, "ratio": R} on success (higher is better), or
        {"error": -1.0} when no set is found, the ratio is not positive,
        or anything raises.
    """
    try:
        spec = importlib.util.spec_from_file_location("program", program_path)
        module = importlib.util.module_from_spec(spec)
        sys.modules["program"] = module
        spec.loader.exec_module(module)

        # Accept either A_indicators (and optional B_indicators) or a main() returning them
        A = getattr(module, 'A_indicators', None)
        B = getattr(module, 'B_indicators', None)  # accepted but never used below
        if A is None and hasattr(module, 'main'):
            returned = module.main()
            if isinstance(returned, tuple):
                if len(returned) == 1:
                    (A,) = returned
                    B = None
                elif len(returned) == 2:
                    A, B = returned
        if A is None:
            return {"error": -1.0}

        # Enforce classic setting regardless of provided B
        R = mstd_ratio(_to_index_set(A), None)
        if R <= 0:
            return {"error": -1.0}
        # Higher is better: score = R
        return {"score": float(R), "ratio": float(R)}
    except Exception:
        return {"error": -1.0}
67 | 
68 | 
if __name__ == "__main__":
    # Evaluate the program given on the command line; with no argument, use
    # the initial_program.py that sits next to this evaluator.
    try:
        here = os.path.dirname(__file__)
        default_path = os.path.join(here, "initial_program.py")
    except Exception:
        default_path = "initial_program.py"
    cli_args = sys.argv[1:]
    target = cli_args[0] if cli_args else default_path
    report = evaluate(target)
    print(json.dumps(report, ensure_ascii=False))
76 | 
77 | 
78 | 


--------------------------------------------------------------------------------
/benchmark/autoconvolution_peak_minimization/evaluator.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import subprocess
 3 | import sys
 4 | import traceback
 5 | import os
 6 | import json
 7 | from typing import Dict
 8 | 
 9 | def evaluate_C1_upper_std(step_heights: np.ndarray) -> Dict[str, float]:
10 |     """
11 |     Standard-normalized C1 evaluation function.
12 |     - Project to feasible set: h >= 0 and ∫f = 1 (L1 normalization).
13 |     - Objective: mu_inf = max_t (f*f)(t) (smaller is better).
14 |     """
15 |     h = np.asarray(step_heights, dtype=float)
16 |     if h.size == 0 or np.any(h < 0):
17 |         return {"valid": 0.0, "mu_inf": float("inf"), "ratio": float("inf")}
18 |     K = int(len(h))
19 |     dx = 1.0 / K
20 |     integral = float(np.sum(h) * dx)
21 |     if integral <= 0:
22 |         return {"valid": 0.0, "mu_inf": float("inf"), "ratio": float("inf")}
23 |     h = h / integral
24 |     F = np.fft.fft(h, 2*K - 1)
25 |     conv = np.fft.ifft(F * F).real
26 |     conv = np.maximum(conv, 0.0)
27 |     mu_inf = float(np.max(conv) * dx)
28 |     return {"valid": 1.0, "mu_inf": mu_inf, "ratio": mu_inf, "integral": 1.0, "K": float(K)}
29 | 
def evaluate(program_path: str):
    """
    Evaluate a program that solves the autoconvolution peak minimization problem.

    Step heights are read from a module-level ``step_heights`` or ``h``
    attribute; if neither exists, ``main()`` is called and its ndarray return
    value (or one of those attributes set as a side effect) is used.

    Returns: dict with key 'score' = 1 / mu_inf (larger is better), or {'error': -1.0}
    """
    try:
        import importlib.util
        spec = importlib.util.spec_from_file_location("program", program_path)
        module = importlib.util.module_from_spec(spec)
        sys.modules["program"] = module
        spec.loader.exec_module(module)

        heights = None
        if hasattr(module, 'step_heights'):
            heights = module.step_heights
        elif hasattr(module, 'h'):
            heights = module.h
        elif hasattr(module, 'main'):
            returned = module.main()
            if isinstance(returned, np.ndarray):
                heights = returned
            elif hasattr(module, 'step_heights'):
                heights = module.step_heights
            elif hasattr(module, 'h'):
                heights = module.h

        if heights is None:
            return {"error": -1.0}

        outcome = evaluate_C1_upper_std(heights)
        if outcome["valid"] != 1.0:
            return {"error": -1.0}

        mu = float(outcome.get("mu_inf", float("inf")))
        if not (mu > 0 and np.isfinite(mu)):
            return {"error": -1.0}
        return {"score": 1.0 / mu}
    except Exception as e:
        return {"error": -1.0}
67 | 


--------------------------------------------------------------------------------
/configs/README.md:
--------------------------------------------------------------------------------
 1 | # OpenEvolve Configuration Files
 2 | 
 3 | This directory contains configuration files for OpenEvolve with examples for different use cases.
 4 | 
 5 | ## Configuration Files
 6 | 
 7 | ### `default_config.yaml`
 8 | The main configuration file containing all available options with sensible defaults. This file includes:
 9 | - Complete documentation for all configuration parameters
10 | - Default values for all settings
11 | - **Island-based evolution parameters** for proper evolutionary diversity
12 | 
13 | Use this file as a template for your own configurations.
14 | 
15 | ### `island_config_example.yaml`
16 | A practical example configuration demonstrating proper island-based evolution setup. Shows:
17 | - Recommended island settings for most use cases
18 | - Balanced migration parameters
19 | - Complete working configuration
20 | 
21 | ### `island_examples.yaml`
22 | Multiple example configurations for different scenarios:
23 | - **Maximum Diversity**: Many islands, frequent migration
24 | - **Focused Exploration**: Few islands, rare migration  
25 | - **Balanced Approach**: Default recommended settings
26 | - **Quick Exploration**: Small-scale rapid testing
27 | - **Large-Scale Evolution**: Complex optimization runs
28 | 
29 | Includes guidelines for choosing parameters based on your problem characteristics.
30 | 
31 | ## Island-Based Evolution Parameters
32 | 
33 | The key new parameters for proper evolutionary diversity are:
34 | 
35 | ```yaml
36 | database:
37 |   num_islands: 5                      # Number of separate populations
38 |   migration_interval: 50              # Migrate every N generations  
39 |   migration_rate: 0.1                 # Fraction of top programs to migrate
40 | ```
41 | 
42 | ### Parameter Guidelines
43 | 
44 | - **num_islands**: 3-10 for most problems (more = more diversity)
45 | - **migration_interval**: 25-100 generations (higher = more independence)
46 | - **migration_rate**: 0.05-0.2 (5%-20%, higher = faster knowledge sharing)
47 | 
48 | ### When to Use What
49 | 
50 | - **Complex problems** → More islands, less frequent migration
51 | - **Simple problems** → Fewer islands, more frequent migration
52 | - **Long runs** → More islands to maintain diversity
53 | - **Short runs** → Fewer islands for faster convergence
54 | 
55 | ## Usage
56 | 
57 | Copy any of these files as a starting point for your configuration:
58 | 
59 | ```bash
60 | cp configs/default_config.yaml my_config.yaml
61 | # Edit my_config.yaml for your specific needs
62 | ```
63 | 
64 | Then use with OpenEvolve:
65 | 
66 | ```python
67 | from openevolve import OpenEvolve
68 | evolve = OpenEvolve(
69 |     initial_program_path="program.py",
70 |     evaluation_file="evaluator.py", 
71 |     config_path="my_config.yaml"
72 | )
73 | ```
74 | 


--------------------------------------------------------------------------------
/benchmark/spherical_code/initial_program.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | def _normalize_rows(P):
 4 |     nrm = np.linalg.norm(P, axis=1, keepdims=True)
 5 |     nrm = np.maximum(nrm, 1e-12)
 6 |     return P / nrm
 7 | 
 8 | def seed_platonic(n):
 9 |     """Return a good symmetric seed on S^2 for some n; else None."""
10 |     if n == 2:   # antipodal
11 |         return np.array([[0,0,1],[0,0,-1]], dtype=float)
12 |     if n == 3:   # equilateral on equator
13 |         ang = 2*np.pi/3
14 |         return np.array([[1,0,0],[np.cos(ang),np.sin(ang),0],[np.cos(2*ang),np.sin(2*ang),0]], dtype=float)
15 |     if n == 4:   # tetrahedron
16 |         return _normalize_rows(np.array([[1,1,1],[1,-1,-1],[-1,1,-1],[-1,-1,1]], dtype=float))
17 |     if n == 6:   # octahedron
18 |         return np.array([[1,0,0],[-1,0,0],[0,1,0],[0,-1,0],[0,0,1],[0,0,-1]], dtype=float)
19 |     if n == 8:   # cube vertices
20 |         V = np.array([[sx,sy,sz] for sx in (-1,1) for sy in (-1,1) for sz in (-1,1)], dtype=float)
21 |         return _normalize_rows(V)
22 |     if n == 12:  # icosahedron (one realization)
23 |         phi = (1+np.sqrt(5))/2
24 |         V = []
25 |         for s in (-1,1):
26 |             V += [[0, s,  phi],[0, s, -phi],[ s,  phi,0],[ s, -phi,0],[ phi,0, s],[-phi,0, s]]
27 |         V = np.array(V, dtype=float)
28 |         return _normalize_rows(V)
29 |     return None
30 | 
def farthest_point_greedy(n, seed=None, rng=None):
    """Greedy max–min on S^2: start from seed (if any), then add points that maximize min angle.

    Args:
        n: Target number of points.
        seed: Optional (k, 3) array of starting points; rows are normalized to
            unit length. Without a seed the set starts from one random point.
            A seed that already has n or more rows is returned (normalized)
            without adding points.
        rng: NumPy random Generator used to draw candidates. When omitted, a
            fresh ``np.random.default_rng(0)`` is created on every call. The
            previous default was a single Generator built once at definition
            time, so repeated calls shared and advanced the same state.

    Returns:
        Array of unit vectors, one per row.
    """
    if rng is None:
        rng = np.random.default_rng(0)

    def random_unit(k):
        # k points drawn from an isotropic Gaussian and projected onto the sphere
        X = rng.normal(size=(k, 3))
        return _normalize_rows(X)

    if seed is None:
        P = random_unit(1)   # start with one random point
    else:
        P = _normalize_rows(seed)
    while len(P) < n:
        # generate candidates and pick the one with largest min angle to current set
        C = random_unit(2000)  # candidates per iteration (tune as needed)
        # cosines to existing points
        cos = C @ P.T
        # the largest cosine per candidate gives its smallest angle to the set -> maximize this
        min_ang = np.arccos(np.clip(np.max(cos, axis=1), -1.0, 1.0))
        idx = np.argmax(min_ang)
        P = np.vstack([P, C[idx:idx+1]])
    return P
50 | 
def main():
    """Build the n=30 point set from the (optional) symmetric seed and return it."""
    n = 30
    pts = farthest_point_greedy(n, seed=seed_platonic(n), rng=np.random.default_rng(42))
    print(f"n={n}, points={len(pts)}")
    return pts
57 | 
if __name__ == "__main__":
    points = main()
    # Save as an .npy file
    np.save("points.npy", points)
    print("已保存 points.npy")

# Evaluators may read a module-level `points`; when the module is imported
# rather than run as a script, compute it here.
if "points" not in globals():
    points = main()
69 | 


--------------------------------------------------------------------------------
/compute.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | 
# Reference score per benchmark. "higher_better" is the sign multiplier applied
# to (s_best - s_baseline) in compute_excel_best: 1 when a larger score is
# better, -1 when a smaller score is better.
baselines = {
    "packing_circles_26": {"s_baseline": 2.634, "higher_better": 1},
    "packind_circles_32": {"s_baseline": 2.936, "higher_better": 1},
    "minizing_raio_max_min_distance_d2_n16": {"s_baseline": 12.89, "higher_better": -1},
    "minizing_raio_max_min_distance_d3_n14": {"s_baseline": 4.168, "higher_better": -1},
    "third_autocorrelation_inequality": {"s_baseline": 1.4581, "higher_better": -1},
    # Added benchmarks (larger-is-better where applicable)
    "kissing_number_d11": {"s_baseline": 592.0, "higher_better": 1},
    "spherical_code_d3_n30": {"s_baseline": 0.6736467551690225, "higher_better": 1},
    "heilbronn_in_the_unit_square_n16": {"s_baseline": 7.0/341.0, "higher_better": 1},
    "littlewood_polynomials_n512": {"s_baseline": 0.04105, "higher_better": 1},
    #"riesz_energy_n20_s1": {"s_baseline": 0.001013, "higher_better": 1},
    "MSTD_n30": {"s_baseline": 1.04, "higher_better": 1},
    "autoconvolution_peak_minimization": {"s_baseline": 0.6667, "higher_better": 1}
}

# Best score recorded per benchmark ("s_best") and the "round" count that
# compute_excel_best divides by (in millions) when weighting the improvement.
# Keys must match those of `baselines`; an s_best of 0 is treated by
# compute_excel_best as "no result" and contributes nothing.
results = {
    "packing_circles_26": {"s_best": 2.6359829561164743, "round": 40657},
    "packind_circles_32": {"s_best": 2.939520304932057, "round": 40657},
    "minizing_raio_max_min_distance_d2_n16": {"s_best": 12.92, "round": 5000},
    "minizing_raio_max_min_distance_d3_n14": {"s_best": 5.198, "round": 5000},
    "third_autocorrelation_inequality": {"s_best": 0, "round": 5000},
    # Placeholders for newly added benchmarks (0 indicates not yet attempted)
    "kissing_number_d11": {"s_best": 502.0, "round": 5000},
    "spherical_code_d3_n30": {"s_best": 0.6381359964781541, "round": 5000},
    "heilbronn_in_the_unit_square_n16": {"s_best": 0, "round": 5000},
    "littlewood_polynomials_n512": {"s_best": 0, "round": 5000},
    #"riesz_energy_n20_s1": {"s_best": 0, "round": 5000},
    "MSTD_n30": {"s_best": 0, "round": 5000},
    "autoconvolution_peak_minimization": {"s_best": 0, "round": 5000}
}
34 | 
def compute_excel_best(results):
    """
    Average, over every benchmark in ``baselines``, the improvement over the
    baseline divided by the number of rounds in millions.

    For each benchmark the improvement is ``(s_best - s_baseline) * higher_better``
    clipped below at 0; an ``s_best`` of exactly 0 counts as no improvement.

    Args:
        results: Mapping of benchmark name to {"s_best": ..., "round": ...};
            it must contain every key of ``baselines``.

    Returns:
        Mean of the per-benchmark contributions.
    """
    num_problems = len(baselines)
    total = 0.0
    for problem, reference in baselines.items():
        s_baseline = reference['s_baseline']
        higher_better = reference['higher_better']
        outcome = results[problem]
        s_best = outcome['s_best']
        n_round = outcome['round']

        # s_best == 0 is taken to mean failure / not attempted
        s_excess = 0 if s_best == 0 else max((s_best - s_baseline) * higher_better, 0)
        total += s_excess / (n_round / 1000000)
    return total / num_problems
53 | 
54 | print(compute_excel_best(results))


--------------------------------------------------------------------------------
/benchmark/autoconvolution_peak_minimization/initial_program.py:
--------------------------------------------------------------------------------
 1 | 
 2 | #!/usr/bin/env python3
 3 | # -*- coding: utf-8 -*-
 4 | """
 5 | Autoconvolution Peak Minimization
 6 | =================================
 7 | 
 8 | This program generates step heights for a probability density function
 9 | that minimizes the maximum value of its autoconvolution.
10 | """
11 | 
12 | import numpy as np
13 | from typing import Dict
14 | 
def evaluate_C1_upper_std(step_heights: np.ndarray) -> Dict[str, float]:
    """
    Standard-normalized C1 (support [-1/2,1/2], dx=1/K).
    - Project to feasible set: h >= 0 and ∫f = 1 (L1 normalization).
    - Objective: mu_inf = max_t (f*f)(t) (smaller is better).

    Args:
        step_heights: 1-D array-like of K non-negative, finite step heights.

    Returns: {"valid", "mu_inf", "ratio"(=mu_inf), "integral"(=1.0), "K"}
    for feasible input. Input that is empty, not 1-D, negative, non-finite,
    or has zero mass returns {"valid": 0.0, "mu_inf": inf, "ratio": inf}.
    """
    h = np.asarray(step_heights, dtype=float)
    # NaN/inf heights pass the "h < 0" test and would otherwise be reported as
    # valid with a NaN objective; 0-d or multi-dimensional input has no
    # meaningful grid length.
    if h.ndim != 1 or h.size == 0 or not np.all(np.isfinite(h)) or np.any(h < 0):
        return {"valid": 0.0, "mu_inf": float("inf"), "ratio": float("inf")}
    K = int(h.size)
    dx = 1.0 / K

    integral = float(np.sum(h) * dx)
    if not np.isfinite(integral) or integral <= 0:
        return {"valid": 0.0, "mu_inf": float("inf"), "ratio": float("inf")}
    h = h / integral  # ∫f = 1

    F = np.fft.fft(h, 2*K - 1)          # linear autoconvolution via padding
    conv = np.fft.ifft(F * F).real
    conv = np.maximum(conv, 0.0)        # clamp tiny negatives

    mu_inf = float(np.max(conv) * dx)
    if not np.isfinite(mu_inf):
        return {"valid": 0.0, "mu_inf": float("inf"), "ratio": float("inf")}
    return {"valid": 1.0, "mu_inf": mu_inf, "ratio": mu_inf, "integral": 1.0, "K": float(K)}
39 | 
def make_candidate(K: int, kind: str = "cos2") -> np.ndarray:
    """
    Build an un-normalized array of K step heights from a simple analytic shape.

    The shape is sampled at K evenly spaced values of a coordinate running
    from -1 to 1 (both endpoints included).

    Args:
        K: Number of discretization points
        kind: Type of candidate function ("box", "triangle", "cos2", "gauss")

    Returns:
        Step heights array

    Raises:
        ValueError: if ``kind`` is not one of the supported names.
    """
    grid = np.linspace(-1.0, 1.0, K)

    if kind == "box":
        return np.ones(K)
    if kind == "triangle":
        tent = 1.0 - np.abs(grid)
        return np.where(tent < 0, 0.0, tent)
    if kind == "cos2":
        return np.cos(np.pi * grid / 2.0) ** 2
    if kind == "gauss":
        return np.exp(-4.0 * grid**2)

    raise ValueError(f"unknown kind={kind}")
64 | 
def main():
    """
    Generate the baseline step heights and report their autoconvolution peak.

    Builds a K=128 "cos2" candidate, evaluates it (printing its mu_inf), and
    returns the un-normalized heights.

    Returns:
        numpy.ndarray: Step heights array
    """
    K, kind = 128, "cos2"  # kind may be any of box/triangle/cos2/gauss
    step_heights = make_candidate(K, kind)

    # The evaluation is only used for the progress message below
    result = evaluate_C1_upper_std(step_heights)
    print(f"Generated {kind} candidate with K={K}, mu_inf={result['mu_inf']:.6f}")

    return step_heights
81 | 


--------------------------------------------------------------------------------
/evolve_agent/llm/ensemble.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Model ensemble for LLMs
 3 | """
 4 | 
 5 | import asyncio
 6 | import logging
 7 | import random
 8 | from typing import Dict, List, Optional, Tuple
 9 | 
10 | from evolve_agent.llm.base import LLMInterface
11 | from evolve_agent.llm.openai import OpenAILLM
12 | from evolve_agent.config import LLMModelConfig
13 | 
14 | logger = logging.getLogger(__name__)
15 | 
16 | 
class LLMEnsemble:
    """Weighted ensemble of LLMs.

    One ``OpenAILLM`` is built per model configuration. Single-response
    methods route each request to one model sampled according to the
    normalized configuration weights.
    """

    def __init__(self, models_cfg: List[LLMModelConfig]):
        self.models_cfg = models_cfg

        # Initialize models from the configuration
        self.models = [OpenAILLM(model_cfg) for model_cfg in models_cfg]

        # Extract and normalize model weights
        raw_weights = [model.weight for model in models_cfg]
        total = sum(raw_weights)
        if total != 0:
            self.weights = [w / total for w in raw_weights]
        else:
            # Weights summing to zero cannot be normalized (this used to raise
            # ZeroDivisionError); fall back to sampling uniformly.
            if raw_weights:
                logger.warning(
                    "LLM ensemble weights sum to zero; falling back to uniform weights"
                )
            self.weights = [1.0 / len(raw_weights) for _ in raw_weights]

        logger.info(
            f"Initialized LLM ensemble with models: "
            + ", ".join(
                f"{model.name} (weight: {weight:.2f})"
                for model, weight in zip(models_cfg, self.weights)
            )
        )

    async def generate(self, prompt: str, **kwargs) -> str:
        """Generate text using a randomly selected model based on weights"""
        model = self._sample_model()
        return await model.generate(prompt, **kwargs)

    async def generate_with_context(
        self, system_message: str, messages: List[Dict[str, str]], **kwargs
    ) -> str:
        """Generate text using a system message and conversational context"""
        model = self._sample_model()
        return await model.generate_with_context(system_message, messages, **kwargs)

    def _sample_model(self) -> LLMInterface:
        """Sample a model from the ensemble based on weights"""
        index = random.choices(range(len(self.models)), weights=self.weights, k=1)[0]
        return self.models[index]

    async def generate_multiple(self, prompt: str, n: int, **kwargs) -> List[str]:
        """Generate multiple texts in parallel (a model is sampled independently per text)"""
        tasks = [self.generate(prompt, **kwargs) for _ in range(n)]
        return await asyncio.gather(*tasks)

    async def parallel_generate(self, prompts: List[str], **kwargs) -> List[str]:
        """Generate responses for multiple prompts in parallel"""
        tasks = [self.generate(prompt, **kwargs) for prompt in prompts]
        return await asyncio.gather(*tasks)

    async def generate_all_with_context(
        self, system_message: str, messages: List[Dict[str, str]], **kwargs
    ) -> List[str]:
        """Query every model in the ensemble, one after another, with the same context.

        Returns:
            One response per model, in the same order as ``self.models``.
            Responses are returned as-is; no aggregation is performed here.
        """
        responses = []
        for model in self.models:
            responses.append(await model.generate_with_context(system_message, messages, **kwargs))
        return responses
74 | 


--------------------------------------------------------------------------------
/benchmark/littlewood_polynomials/evaluator.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import re
 3 | import os
 4 | import json
 5 | import importlib.util
 6 | import numpy as np
 7 | from typing import Dict
 8 | 
 9 | def evaluate_littlewood_supnorm(coeffs, num_grid: int = 16384) -> Dict[str, float]:
10 |     """
11 |     FFT-sampled sup-norm upper bound on |P(e^{it})|.
12 |     - coeffs: 1-D array-like of ±1
13 |     - num_grid: sampling resolution (larger -> tighter upper bound)
14 |     """
15 |     if num_grid < 8:
16 |         raise ValueError("num_grid too small")
17 |     c = np.atleast_1d(np.asarray(coeffs, dtype=float))     # ensure 1-D
18 |     if c.ndim != 1 or c.size == 0:
19 |         raise ValueError("coeffs must be a non-empty 1-D array")
20 | 
21 |     pad = np.zeros(int(num_grid), dtype=np.complex128)
22 |     pad[: c.size] = c.astype(np.complex128)
23 | 
24 |     values = np.fft.fft(pad)                               # samples of P on unit circle
25 |     supnorm = float(np.max(np.abs(values)))
26 |     return {"valid": 1.0, "supnorm": supnorm}
27 | 
def _read_coeffs_from_stdin():
    """
    Parse a coefficient sequence from standard input.

    Accepts lines like:
      n = 512
      1 -1 1 1 -1 ...
    or just lines of ±1's. Robust to extra spaces/newlines; a sequence may be
    split across several lines.

    Lines that start with an ``n =`` header (any spacing around ``=``, any
    letter case) are ignored so the declared length is not mistaken for a
    coefficient.

    Returns a 1-D integer numpy array with every signed integer found on the
    remaining lines, or None when stdin is empty or holds only header lines.
    """
    text = sys.stdin.read().strip()
    if not text:
        return None

    header = re.compile(r"n\s*=", re.IGNORECASE)
    stripped = (ln.strip() for ln in text.splitlines())
    # Keep every non-empty, non-header line; they are concatenated so that long
    # sequences wrapped over several lines are read as one.
    number_lines = [ln for ln in stripped if ln and not header.match(ln)]
    if not number_lines:
        return None

    nums = re.findall(r"[-+]?\d+", " ".join(number_lines))
    return np.asarray(list(map(int, nums)), dtype=int)
47 | 
def evaluate(program_path: str = "littlewood_polynomials/initial_program.py"):
    """
    Load a candidate program and score its Littlewood coefficient sequence.

    The coefficients are taken from the module-level `coeffs` if the program
    defines one; otherwise `main()` is called and either its ndarray return
    value or a `coeffs` attribute it created is used. As a last resort the
    sequence is read from stdin.

    Returns {"score": 1 / supnorm} on success and {"error": -1.0} for any
    failure (missing coefficients, invalid result, or any exception).
    """
    try:
        module_spec = importlib.util.spec_from_file_location("program", program_path)
        candidate = importlib.util.module_from_spec(module_spec)
        sys.modules["program"] = candidate
        module_spec.loader.exec_module(candidate)

        coeffs_obj = None
        if hasattr(candidate, 'coeffs'):
            coeffs_obj = candidate.coeffs
        elif hasattr(candidate, 'main'):
            returned = candidate.main()
            # main() may hand the array back directly or publish it as a
            # module-level `coeffs` while it runs.
            if isinstance(returned, np.ndarray):
                coeffs_obj = returned
            elif hasattr(candidate, 'coeffs'):
                coeffs_obj = candidate.coeffs

        if coeffs_obj is None:
            # fallback: try stdin for robustness
            coeffs_obj = _read_coeffs_from_stdin()
        if coeffs_obj is None:
            return {"error": -1.0}

        outcome = evaluate_littlewood_supnorm(coeffs_obj, num_grid=16384)
        if outcome.get("valid", 0.0) != 1.0:
            return {"error": -1.0}

        supnorm = float(outcome["supnorm"])
        if not (supnorm > 0 and np.isfinite(supnorm)):
            return {"error": -1.0}
        # Smaller sup-norm is better, so the score is its reciprocal.
        return {"score": 1.0 / supnorm}
    except Exception:
        return {"error": -1.0}
78 | 
if __name__ == "__main__":
    # Run the evaluation only when this file is executed as a script.
    # Unguarded, importing the module would evaluate the default program and
    # print its result as a side effect.
    # An optional first command-line argument overrides the default path.
    print(evaluate(*sys.argv[1:2]))


--------------------------------------------------------------------------------
/benchmark/heilbronn_in_the_unit_square/evaluator.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import importlib.util
 3 | import sys
 4 | import os
 5 | import json
 6 | from itertools import combinations
 7 | from typing import Dict
 8 | 
 9 | 
def _triangle_area(a, b, c) -> float:
    """Return the unsigned area of the triangle (a, b, c) from the 2-D cross product."""
    return abs((b[0]-a[0])*(c[1]-a[1]) - (b[1]-a[1])*(c[0]-a[0])) * 0.5


def evaluate_min_triangle_area(points: np.ndarray) -> Dict[str, float]:
    """
    Compute the minimum triangle area for 2D points.

    Every triple of points is examined; the search stops early once an area
    of at most 1e-18 is found.

    Metrics:
      - valid: 1.0 when the input could be scored, 0.0 otherwise
      - min_area: raw smallest triangle area (larger is better)
      - n: number of points
      - scaled_min_area: n^(8/7 + 1/2000) * min_area (informational)
      - score: equals min_area (larger is better)

    The input is rejected (valid = 0.0, all other metrics 0.0 except n) when
    it is not an (n, 2) array, has fewer than three points, or contains
    NaN/inf coordinates.

    NOTE(review): nothing here checks that the points lie inside the unit
    square — confirm whether that constraint is enforced elsewhere.
    """
    pts = np.asarray(points, dtype=float)
    if pts.ndim != 2 or pts.shape[1] != 2:
        return {"valid": 0.0, "min_area": 0.0, "n": 0.0, "scaled_min_area": 0.0, "score": 0.0}

    n = len(pts)
    invalid = {"valid": 0.0, "min_area": 0.0, "n": float(n), "scaled_min_area": 0.0, "score": 0.0}
    if n < 3:
        return invalid
    # NaN coordinates make every comparison below false, which used to leave
    # min_area at +inf while still reporting valid = 1.0.
    if not np.all(np.isfinite(pts)):
        return invalid

    min_area = float("inf")
    for i, j, k in combinations(range(n), 3):
        area = _triangle_area(pts[i], pts[j], pts[k])
        if area < min_area:
            min_area = area
            if min_area <= 1e-18:  # early stop: effectively degenerate
                break

    # Overflow on huge coordinates can still turn every area into inf/NaN.
    if not np.isfinite(min_area):
        return invalid

    exponent = (8.0/7.0) + (1.0/2000.0)
    scaled_min_area = (n ** exponent) * float(min_area)
    score = float(min_area)

    return {
        "valid": 1.0,
        "min_area": float(min_area),
        "n": float(n),
        "scaled_min_area": float(scaled_min_area),
        "score": score,
    }
49 | 
50 | 
51 | 
def evaluate(program_path: str):
    """
    Load a candidate program and score its point set by minimum triangle area.

    The points come from the module-level `points` if the program defines one;
    otherwise `main()` is called and either its ndarray return value or a
    `points` attribute it created is used.

    Returns a dict with `score` (the main metric, larger is better) plus the
    diagnostic `scaled_min_area`, `min_area` and `n`; returns {"error": -1.0}
    when no points are found or anything raises.
    """
    try:
        module_spec = importlib.util.spec_from_file_location("program", program_path)
        candidate = importlib.util.module_from_spec(module_spec)
        sys.modules["program"] = candidate
        module_spec.loader.exec_module(candidate)

        points = None
        if hasattr(candidate, 'points'):
            points = candidate.points
        elif hasattr(candidate, 'main'):
            returned = candidate.main()
            if isinstance(returned, np.ndarray):
                points = returned
            elif hasattr(candidate, 'points'):
                points = candidate.points

        if points is None:
            return {"error": -1.0}

        metrics = evaluate_min_triangle_area(points)
        # Forward only the reported fields; 'score' is the main metric.
        reported = ("score", "scaled_min_area", "min_area", "n")
        return {key: metrics[key] for key in reported}
    except Exception:
        return {"error": -1.0}
80 | 
81 | 
if __name__ == "__main__":
    # CLI for debugging: evaluate the program given as the first argument, or
    # the initial_program.py that sits next to this file.
    try:
        here = os.path.dirname(__file__)
        default_path = os.path.join(here, "initial_program.py")
    except Exception:
        default_path = "initial_program.py"
    cli_args = sys.argv[1:]
    target = cli_args[0] if cli_args else default_path
    report = evaluate(target)
    print(json.dumps(report, ensure_ascii=False, indent=2))
89 | 


--------------------------------------------------------------------------------
/configs/island_examples.yaml:
--------------------------------------------------------------------------------
 1 | # EvolveAgent Island-Based Evolution Configuration Examples
 2 | # Different configurations for various use cases
 3 | 
 4 | # Configuration for Maximum Diversity (Many Islands, Frequent Migration)
 5 | # Use this when you want to explore the search space thoroughly
 6 | # Good for: Complex problems, avoiding local optima, long runs
 7 | max_diversity:
 8 |   database:
 9 |     num_islands: 10                   # More islands = more diversity
10 |     migration_interval: 25            # More frequent migration
11 |     migration_rate: 0.2               # Higher migration rate
12 |     population_size: 1000
13 |     archive_size: 200
14 | 
15 | # Configuration for Focused Exploration (Few Islands, Rare Migration)  
16 | # Use this when you want deeper exploration within each island
17 | # Good for: Problems with clear structure, shorter runs
18 | focused_exploration:
19 |   database:
20 |     num_islands: 3                    # Fewer islands = deeper exploration
21 |     migration_interval: 100           # Less frequent migration
22 |     migration_rate: 0.05              # Lower migration rate
23 |     population_size: 500
24 |     archive_size: 50
25 | 
26 | # Configuration for Balanced Approach (Default Settings)
27 | # Use this as a starting point for most problems
28 | # Good for: General use, medium-length runs
29 | balanced:
30 |   database:
31 |     num_islands: 5                    # Balanced number of islands
32 |     migration_interval: 50            # Moderate migration frequency
33 |     migration_rate: 0.1               # Moderate migration rate
34 |     population_size: 1000
35 |     archive_size: 100
36 | 
37 | # Configuration for Quick Exploration (Small Scale)
38 | # Use this for rapid prototyping and testing
39 | # Good for: Small problems, quick experiments
40 | quick_exploration:
41 |   database:
42 |     num_islands: 3
43 |     migration_interval: 20
44 |     migration_rate: 0.15
45 |     population_size: 200
46 |     archive_size: 30
47 | 
48 | # Configuration for Large-Scale Evolution (High Performance)
49 | # Use this for complex problems requiring extensive search
50 | # Good for: Complex optimization, long evolutionary runs
51 | large_scale:
52 |   database:
53 |     num_islands: 15                   # Many islands for parallel exploration
54 |     migration_interval: 75            # Balanced migration timing
55 |     migration_rate: 0.08              # Conservative migration rate
56 |     population_size: 2000             # Large populations
57 |     archive_size: 300
58 | 
59 | # Guidelines for choosing parameters:
60 | #
61 | # num_islands:
62 | #   - More islands = more diversity, slower convergence
63 | #   - Fewer islands = faster convergence, risk of premature convergence
64 | #   - Recommended: 3-10 for most problems
65 | #
66 | # migration_interval:
67 | #   - Lower values = more frequent knowledge sharing
68 | #   - Higher values = more independent evolution
69 | #   - Recommended: 25-100 generations
70 | #
71 | # migration_rate:
72 | #   - Higher values = faster knowledge propagation
73 | #   - Lower values = preserve island diversity longer
74 | #   - Recommended: 0.05-0.2 (5%-20%)
75 | #
76 | # Rule of thumb:
77 | #   - Complex problems → More islands, less frequent migration
78 | #   - Simple problems → Fewer islands, more frequent migration
79 | #   - Long runs → More islands to maintain diversity
80 | #   - Short runs → Fewer islands for faster convergence
81 | 


--------------------------------------------------------------------------------
/benchmark/third_autocorrelation_inequality/initial_program.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import scipy.integrate
 3 | 
 4 | def calculate_c3_upper_bound(height_sequence):
 5 | 
 6 |     N = len(height_sequence)
 7 |     delta_x = 1 / (2 * N)
 8 | 
 9 |     def f(x):
10 |         if -0.25 <= x <= 0.25:
11 |             index = int((x - (-0.25)) / delta_x)
12 |             if index == N:
13 |                 index -= 1
14 |             return height_sequence[index]
15 |         else:
16 |             return 0.0
17 | 
18 |     integral_f = np.sum(height_sequence) * delta_x
19 |     integral_sq = integral_f**2
20 | 
21 |     if integral_sq < 1e-18:
22 |         return 0.0
23 | 
24 |     t_points = np.linspace(-0.5, 0.5, 2 * N + 1)
25 |     
26 |     max_conv_val = 0.0
27 |     for t_val in t_points:
28 | 
29 |         lower_bound = max(-0.25, t_val - 0.25)
30 |         upper_bound = min(0.25, t_val + 0.25)
31 | 
32 |         if upper_bound <= lower_bound:
33 |             convolution_val = 0.0
34 |         else:
35 |             def integrand(x):
36 |                 return f(x) * f(t_val - x)
37 |             
38 |             convolution_val, _ = scipy.integrate.quad(integrand, lower_bound, upper_bound, limit=100)
39 |         
40 |         if abs(convolution_val) > max_conv_val:
41 |             max_conv_val = abs(convolution_val)
42 | 
43 |     return max_conv_val / integral_sq
44 | 
def genetic_algorithm(population_size, num_intervals, generations, mutation_rate, crossover_rate):
    """
    Search for a height sequence with a simple genetic algorithm.

    Each generation scores every individual with calculate_c3_upper_bound,
    refills the population by binary tournament selection, applies single-point
    crossover to consecutive pairs and mutates one gene per selected
    individual (Gaussian step, clipped to [-2, 2]).

    Args:
        population_size: number of individuals; tournament selection draws two
            distinct competitors, so at least 2 are needed once generations > 0.
        num_intervals: number of heights per individual.
        generations: number of generations to run.
        mutation_rate: per-individual probability of mutating one gene.
        crossover_rate: per-pair probability of crossover.

    Returns:
        The highest-scoring individual seen (a copy), or None when
        generations is 0.

    NOTE(review): the search keeps the individual with the LARGEST ratio. A
    tighter upper bound would normally be a smaller value — confirm the
    intended direction against the benchmark's evaluator.
    """
    population = np.random.rand(population_size, num_intervals) * 2 - 1

    best_solution = None
    # Start from -inf so the first evaluated generation always yields a
    # solution, even if every score is 0.0.
    best_fitness = -np.inf

    for _ in range(generations):

        fitness_scores = np.array([calculate_c3_upper_bound(individual) for individual in population])

        current_best_idx = np.argmax(fitness_scores)
        if fitness_scores[current_best_idx] > best_fitness:
            best_fitness = fitness_scores[current_best_idx]
            best_solution = population[current_best_idx].copy()

        # Binary tournament selection: the fitter of two random individuals
        # survives.
        new_population = np.zeros_like(population)
        for i in range(population_size):
            competitors_indices = np.random.choice(population_size, 2, replace=False)
            winner_idx = competitors_indices[np.argmax(fitness_scores[competitors_indices])]
            new_population[i] = population[winner_idx].copy()

        # Single-point crossover on consecutive pairs. With an odd population
        # the last individual has no partner and is left untouched. A cut
        # point needs at least two genes.
        if num_intervals >= 2:
            for i in range(0, population_size - 1, 2):
                if np.random.rand() < crossover_rate:
                    # Copy the parents: new_population[i] is a view, so
                    # writing the first child would otherwise change
                    # "parent1" before the second child is built.
                    parent1 = new_population[i].copy()
                    parent2 = new_population[i + 1].copy()
                    crossover_point = np.random.randint(1, num_intervals)
                    # Swapping the tails yields both children.
                    new_population[i, crossover_point:] = parent2[crossover_point:]
                    new_population[i + 1, crossover_point:] = parent1[crossover_point:]

        for i in range(population_size):
            if np.random.rand() < mutation_rate:
                mutation_point = np.random.randint(num_intervals)
                new_population[i, mutation_point] += np.random.normal(0, 0.1)

                new_population[i, mutation_point] = np.clip(new_population[i, mutation_point], -2, 2)

        population = new_population

    return best_solution
88 | 
def find_better_c3_upper_bound():
    """
    Run the genetic search with its small default settings.

    Returns whatever genetic_algorithm returns for a population of 2
    individuals with 4 heights each, evolved for 10 generations with a
    mutation rate of 0.1 and a crossover rate of 0.8.
    """
    return genetic_algorithm(
        population_size=2,
        num_intervals=4,
        generations=10,
        mutation_rate=0.1,
        crossover_rate=0.8,
    )


--------------------------------------------------------------------------------
/benchmark/kissing_number/evaluator.py:
--------------------------------------------------------------------------------
 1 | #@title Verification
 2 | import numpy as np
 3 | import subprocess
 4 | import sys
 5 | import traceback
 6 | import os
 7 | import json
 8 | 
 9 | 
def verify_kissing_configuration(sphere_centers: np.ndarray, atol: float = 1e-9):
    """
    Verifies if the given points form a valid kissing number configuration.

    A valid kissing configuration of N vectors in D dimensions must satisfy:
    1. All vectors are unit vectors (norm is 1).
    2. The dot product of any two distinct vectors is at most 0.5.

    Args:
      sphere_centers: An array of shape (N, D) where N is the number of spheres
                      and D is the dimension. It is converted to a float array
                      internally; the caller's array is not modified.
      atol: Absolute tolerance for floating point comparisons.

    Raises:
      AssertionError: If the configuration is not valid. The checks raise
                      explicitly rather than through `assert` statements, so
                      they still run under `python -O`.
    """
    centers = np.asarray(sphere_centers, dtype=float)
    if centers.ndim != 2 or centers.shape[0] == 0:
        raise AssertionError(
            f"Verification failed: expected a non-empty array of shape (N, D), got shape {centers.shape}."
        )

    # 1. Check if all vectors are unit vectors.
    # rtol is set to 0 because np.allclose otherwise adds its default relative
    # tolerance (1e-5), which would swamp `atol`.
    norms = np.linalg.norm(centers, axis=1)
    if not np.allclose(norms, 1.0, rtol=0.0, atol=atol):
        raise AssertionError(
            f"Verification failed: Not all vectors are unit vectors. Norms range from {np.min(norms)} to {np.max(norms)}."
        )

    # 2. Check the dot products.
    # The dot product of two distinct vectors must be <= 0.5.
    dot_products = centers @ centers.T

    # The diagonal holds each vector's product with itself (1.0 for unit
    # vectors); mask it so only distinct pairs are considered.
    np.fill_diagonal(dot_products, -np.inf)

    max_dot_product = np.max(dot_products)

    # Written as "not (x <= limit)" so that a NaN product is rejected too.
    if not max_dot_product <= 0.5 + atol:
        raise AssertionError(
            f"Verification failed: Maximum dot product between distinct vectors is {max_dot_product}, which is greater than 0.5."
        )
44 | 
45 | 
def evaluate(program_path: str):
  """
  Score a program that proposes a kissing configuration.

  The configuration is read from the module-level `sphere_centers` if the
  program defines it, otherwise from the return value of `main()`.

  Returns a dict whose 'score' is the number of spheres (larger is better),
  together with 'num_spheres' and 'dimension'.
  On failure/invalid, returns {'score': -1.0, ...} with a flag describing the
  failure (and the traceback text under 'stderr' when an exception occurred).
  """
  try:
    # Imported here, as in the original, to load the candidate dynamically.
    import importlib.util

    module_spec = importlib.util.spec_from_file_location("program", program_path)
    candidate = importlib.util.module_from_spec(module_spec)
    sys.modules["program"] = candidate
    module_spec.loader.exec_module(candidate)

    # Prefer a module-level array; fall back to calling main().
    sphere_centers = None
    if hasattr(candidate, 'sphere_centers'):
      sphere_centers = candidate.sphere_centers
    elif hasattr(candidate, 'main'):
      sphere_centers = candidate.main()

    if sphere_centers is None:
      return {"score": -1.0, "no_sphere_centers": True}

    # Raises when the configuration is invalid; handled below.
    verify_kissing_configuration(sphere_centers)

    count = sphere_centers.shape[0]
    dim = sphere_centers.shape[1]
    return {
      "score": float(count),
      "num_spheres": float(count),
      "dimension": float(dim)
    }

  except Exception:
    return {"score": -1.0, "evaluation_error": True, "stderr": traceback.format_exc()}
89 | 
90 | 
if __name__ == "__main__":
  # Debugging CLI: evaluate the path given as the first argument, or the
  # initial_program.py located next to this file when none is given.
  try:
    here = os.path.dirname(__file__)
    default_path = os.path.join(here, "initial_program.py")
  except Exception:
    default_path = "initial_program.py"

  cli_args = sys.argv[1:]
  target = cli_args[0] if cli_args else default_path
  report = evaluate(target)
  print(json.dumps(report, ensure_ascii=False, indent=2))


--------------------------------------------------------------------------------
/benchmark/spherical_code/visualization.py:
--------------------------------------------------------------------------------
  1 | # viz_sphere_points.py
  2 | import argparse
  3 | import importlib.util
  4 | import sys
  5 | import os
  6 | import numpy as np
  7 | import matplotlib.pyplot as plt
  8 | from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (needed for 3D)
  9 | 
 10 | def _normalize_rows(P):
 11 |     nrm = np.linalg.norm(P, axis=1, keepdims=True)
 12 |     nrm = np.maximum(nrm, 1e-12)
 13 |     return P / nrm
 14 | 
 15 | def load_from_module(module_path: str):
 16 |     """
 17 |     动态加载模块：
 18 |     - 优先读取全局变量 `points`
 19 |     - 否则调用 `main()` 获取返回值
 20 |     """
 21 |     module_path = os.path.abspath(module_path)
 22 |     spec = importlib.util.spec_from_file_location("points_mod", module_path)
 23 |     if spec is None or spec.loader is None:
 24 |         raise RuntimeError(f"无法加载模块: {module_path}")
 25 |     mod = importlib.util.module_from_spec(spec)
 26 |     spec.loader.exec_module(mod)
 27 | 
 28 |     if hasattr(mod, "points"):
 29 |         pts = getattr(mod, "points")
 30 |     elif hasattr(mod, "main"):
 31 |         pts = mod.main()
 32 |     else:
 33 |         raise RuntimeError("模块中既无 `points` 变量，也无 `main()` 函数可获取点。")
 34 | 
 35 |     pts = np.asarray(pts, dtype=float)
 36 |     if pts.ndim != 2 or pts.shape[1] != 3:
 37 |         raise ValueError(f"模块返回的点形状异常: {pts.shape}, 期望 (N, 3)")
 38 |     return _normalize_rows(pts)
 39 | 
 40 | def load_from_npy(npy_path: str):
 41 |     pts = np.load(npy_path)
 42 |     pts = np.asarray(pts, dtype=float)
 43 |     if pts.ndim != 2 or pts.shape[1] != 3:
 44 |         raise ValueError(f"npy 形状异常: {pts.shape}, 期望 (N, 3)")
 45 |     return _normalize_rows(pts)
 46 | 
 47 | def load_from_csv(csv_path: str):
 48 |     pts = np.loadtxt(csv_path, delimiter=",")
 49 |     pts = np.asarray(pts, dtype=float)
 50 |     if pts.ndim != 2 or pts.shape[1] != 3:
 51 |         raise ValueError(f"csv 形状异常: {pts.shape}, 期望 (N, 3)")
 52 |     return _normalize_rows(pts)
 53 | 
 54 | def min_pairwise_angle_deg(P):
 55 |     """
 56 |     返回：
 57 |     - 最小夹角（度）
 58 |     - 对应的最大余弦相似度（最近的一对点）
 59 |     """
 60 |     # 计算上三角的点积
 61 |     dot = P @ P.T
 62 |     n = len(P)
 63 |     mask = np.triu(np.ones((n, n), dtype=bool), k=1)
 64 |     vals = dot[mask]
 65 |     max_cos = np.max(vals) if vals.size else 1.0
 66 |     max_cos = np.clip(max_cos, -1.0, 1.0)
 67 |     ang_min = np.degrees(np.arccos(max_cos))
 68 |     return ang_min, max_cos
 69 | 
 70 | def plot_points_on_sphere(P, title="Spherical Point Set"):
 71 |     fig = plt.figure(figsize=(7, 7))
 72 |     ax = fig.add_subplot(111, projection="3d")
 73 |     ax.set_box_aspect([1,1,1])
 74 | 
 75 |     # 画单位球面网格
 76 |     u = np.linspace(0, 2*np.pi, 60)
 77 |     v = np.linspace(0, np.pi, 30)
 78 |     x = np.outer(np.cos(u), np.sin(v))
 79 |     y = np.outer(np.sin(u), np.sin(v))
 80 |     z = np.outer(np.ones_like(u), np.cos(v))
 81 |     ax.plot_surface(x, y, z, alpha=0.15, linewidth=0, antialiased=True)
 82 | 
 83 |     # 画点
 84 |     ax.scatter(P[:,0], P[:,1], P[:,2], s=40, depthshade=True)
 85 | 
 86 |     # 坐标与视角
 87 |     ax.set_xlabel("x")
 88 |     ax.set_ylabel("y")
 89 |     ax.set_zlabel("z")
 90 |     ax.set_title(title)
 91 |     ax.view_init(elev=20, azim=45)
 92 | 
 93 |     plt.tight_layout()
 94 |     plt.show()
 95 | 
 96 | def main():
 97 |     parser = argparse.ArgumentParser(
 98 |         description="可视化 S^2 上的点集（从模块、.npy 或 .csv 读取）"
 99 |     )
100 |     src = parser.add_mutually_exclusive_group(required=True)
101 |     src.add_argument("--from-module", type=str, help="包含 points 或 main() 的 .py 文件路径（例如 sphere_points.py）")
102 |     src.add_argument("--from-npy", type=str, help="N×3 的 .npy 文件路径")
103 |     src.add_argument("--from-csv", type=str, help="N×3 的 .csv 文件路径（逗号分隔）")
104 |     parser.add_argument("--title", type=str, default="Spherical Point Set", help="图标题")
105 |     args = parser.parse_args()
106 | 
107 |     if args.from_module:
108 |         P = load_from_module(args.from_module)
109 |     elif args.from_npy:
110 |         P = load_from_npy(args.from_npy)
111 |     else:
112 |         P = load_from_csv(args.from_csv)
113 | 
114 |     ang_min_deg, max_cos = min_pairwise_angle_deg(P)
115 |     print(f"N = {len(P)}")
116 |     print(f"最小两点夹角 ≈ {ang_min_deg:.4f}°")
117 |     print(f"最近对的余弦相似度（最大点积） ≈ {max_cos:.6f}")
118 | 
119 |     plot_points_on_sphere(P, title=args.title)
120 | 
121 | if __name__ == "__main__":
122 |     main()
123 | 


--------------------------------------------------------------------------------
/reward_model/grade.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import numpy as np
 3 | import pandas as pd
 4 | from typing import List, Dict
 5 | import re
 6 | from vllm import LLM, SamplingParams
 7 | from datasets import load_dataset
 8 | from transformers import AutoTokenizer, AutoModelForCausalLM
 9 | from llm.backend import score_abstracts_with_vllm, score_abstracts_with_api
10 | 
11 | 
def evaluate_and_compare(data: List[Dict], model_results: List[Dict]) -> Dict:
    """Compare model scores with the reference average ratings.

    Args:
        data: Not used by this function; kept for the existing call signature.
        model_results: One dict per item with the keys "score", "avg_rating",
            "evaluation" and "abstract". A negative score marks an item the
            model could not score.

    Returns:
        A dict holding the per-item lists ("model_scores", "avg_ratings",
        "evaluations", "abstracts", "differences") and the summary metrics
        "mae", "mse" and "accuracy". "differences" holds |score - rating| per
        item, or -1 for an invalid score. The metrics use valid items only and
        are 0.0 when there are none. "accuracy" is the share of valid items
        where score and rating fall on the same side of 5.5.
    """
    model_scores = [res["score"] for res in model_results]
    avg_ratings = [res["avg_rating"] for res in model_results]
    evaluations = [res["evaluation"] for res in model_results]
    abstracts = [res["abstract"] for res in model_results]

    # -1 is the sentinel for "no valid model score".
    differences = [
        abs(ms - ar) if ms >= 0 else -1
        for ms, ar in zip(model_scores, avg_ratings)
    ]

    # (true label, predicted label) for every validly scored item;
    # a value above 5.5 counts as positive.
    label_pairs = [
        (1 if ar > 5.5 else 0, 1 if ms > 5.5 else 0)
        for ms, ar in zip(model_scores, avg_ratings)
        if ms >= 0
    ]

    usable = [d for d in differences if d >= 0]
    mae = np.mean(usable) if usable else 0.0
    mse = np.mean([d ** 2 for d in usable]) if usable else 0.0

    hits = sum(1 for truth, guess in label_pairs if truth == guess)
    accuracy = hits / len(label_pairs) if label_pairs else 0.0

    return {
        "model_scores": model_scores,
        "avg_ratings": avg_ratings,
        "evaluations": evaluations,
        "abstracts": abstracts,
        "differences": differences,
        "mae": mae,
        "mse": mse,
        "accuracy": accuracy,
    }
52 | 
def print_results(results: Dict):
    """Print a per-item table followed by the summary metrics.

    Expects the dict produced by evaluate_and_compare. Evaluation texts longer
    than 47 characters are cut and suffixed with "..."; a negative difference
    (invalid model score) is shown as "N/A".
    """
    print(f"{'Index':<6} {'Model Score':<12} {'Avg Rating':<12} {'Difference':<12} {'Evaluation':<50}")
    print("-" * 100)
    for i in range(len(results["model_scores"])):
        evaluation = results["evaluations"][i]
        if len(evaluation) > 47:
            eval_snippet = evaluation[:47] + "..."
        else:
            eval_snippet = evaluation

        difference = results["differences"][i]
        diff = difference if difference >= 0 else "N/A"
        print(f"{i+1:<6} {results['model_scores'][i]:<12.2f} {results['avg_ratings'][i]:<12.2f} {diff:<12} {eval_snippet:<50}")
    print("\nSummary Statistics (excluding invalid scores):")
    print(f"Mean Absolute Error (MAE): {results['mae']:.2f}")
    print(f"Mean Squared Error (MSE): {results['mse']:.2f}")
    print(f"Prediction Accuracy: {results['accuracy']:.2%}")  # shown as a percentage
64 | 
def main():
    """Score the evaluation set with the model and report the comparison.

    Loads the JSON dataset, scores it with the vLLM backend, prints the
    comparison against the reference ratings, and writes the results to
    'vllm_evaluation_results.json' and 'vllm_evaluation_results.csv' in the
    current working directory. Returns early if the dataset cannot be loaded.
    """
    try:
        dataset = load_dataset('json', data_files='/data/zhuotaodeng/yzj/alpha-research/data/iclr2025_eval_100.json', split='train')
        data = [dict(item) for item in dataset]  # Convert to List[Dict]
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    # Alternative scoring back-ends, kept for reference:
    # model_results = score_abstracts_with_vllm(data, '/data/zhuotaodeng/yzj/alpha-research/model/qwen25_grm_iclr_boxed/checkpoint-120')
    # model_results = score_abstracts_with_api(data, '/data/zhuotaodeng/yzj/alpha-research/idea-eval/results.jsonl')
    model_results = score_abstracts_with_vllm(data, model_name="/data/zhuotaodeng/yzj/download_from_modelscope/Qwen/Qwen3-8B")

    results = evaluate_and_compare(data, model_results)
    print_results(results)

    with open("vllm_evaluation_results.json", "w") as handle:
        json.dump(results, handle, indent=4)
    print("\nResults saved to 'vllm_evaluation_results.json'")

    table = pd.DataFrame({
        "Abstract": results["abstracts"],
        "Model_Score": results["model_scores"],
        "Avg_Rating": results["avg_ratings"],
        "Difference": results["differences"],
        "Evaluation": results["evaluations"]
    })
    table.to_csv("vllm_evaluation_results.csv", index=False)
    print("Results also saved to 'vllm_evaluation_results.csv'")


# Run the pipeline only when the file is executed directly.
if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/evolve_agent/llm/openai.py:
--------------------------------------------------------------------------------
  1 | """
  2 | OpenAI API interface for LLMs
  3 | """
  4 | 
  5 | import asyncio
  6 | import logging
  7 | import time
  8 | from typing import Any, Dict, List, Optional, Union
  9 | 
 10 | import openai
 11 | from openai import AsyncOpenAI
 12 | from openai import AsyncAzureOpenAI
 13 | 
 14 | from evolve_agent.config import LLMConfig
 15 | from evolve_agent.llm.base import LLMInterface
 16 | 
 17 | logger = logging.getLogger(__name__)
 18 | 
 19 | class OpenAILLM(LLMInterface):
 20 |     """LLM interface using OpenAI-compatible APIs"""
 21 | 
 22 |     def __init__(
 23 |         self,
 24 |         model_cfg: Optional[dict] = None,
 25 |     ):
 26 |         self.model = model_cfg.name
 27 |         self.system_message = model_cfg.system_message
 28 |         self.temperature = model_cfg.temperature
 29 |         self.top_p = model_cfg.top_p
 30 |         self.max_tokens = model_cfg.max_tokens
 31 |         self.timeout = model_cfg.timeout
 32 |         self.retries = model_cfg.retries
 33 |         self.retry_delay = model_cfg.retry_delay
 34 |         self.api_base = model_cfg.api_base
 35 |         self.api_key = model_cfg.api_key
 36 | 
 37 |         # Set up async API client
 38 |         self.client = AsyncOpenAI(
 39 |             api_key=self.api_key,
 40 |             base_url=self.api_base,
 41 |         )
 42 |         # self.client = AsyncOpenAI(
 43 |         #     api_key=self.api_key,
 44 |         #     azure_endpoint=self.api_base,
 45 |         #     api_version="2024-12-01-preview",
 46 |         # )
 47 | 
 48 |         logger.info(f"Initialized OpenAI LLM with model: {self.model}")
 49 | 
 50 |     async def generate(self, prompt: str, **kwargs) -> str:
 51 |         """Generate text from a prompt"""
 52 |         return await self.generate_with_context(
 53 |             system_message=self.system_message,
 54 |             messages=[{"role": "user", "content": prompt}],
 55 |             **kwargs,
 56 |         )
 57 | 
 58 |     async def generate_with_context(
 59 |         self, system_message: str, messages: List[Dict[str, str]], **kwargs
 60 |     ) -> str:
 61 |         """Generate text using a system message and conversational context"""
 62 |         # Prepare messages with system message
 63 |         formatted_messages = [{"role": "system", "content": system_message}]
 64 |         formatted_messages.extend(messages)
 65 | 
 66 |         # Set up generation parameters
 67 |         if self.api_base == "https://api.openai.com/v1" and str(self.model).lower().startswith("o"):
 68 |             # For o-series models
 69 |             params = {
 70 |                 "model": self.model,
 71 |                 "messages": formatted_messages,
 72 |                 "max_completion_tokens": kwargs.get("max_tokens", self.max_tokens),
 73 |             }
 74 |         else:
 75 |             params = {
 76 |                 "model": self.model,
 77 |                 "messages": formatted_messages,
 78 |                 "temperature": kwargs.get("temperature", self.temperature),
 79 |                 "top_p": kwargs.get("top_p", self.top_p),
 80 |                 "max_tokens": kwargs.get("max_tokens", self.max_tokens),
 81 |             }
 82 | 
 83 |         # Attempt the API call with retries
 84 |         retries = kwargs.get("retries", self.retries)
 85 |         retry_delay = kwargs.get("retry_delay", self.retry_delay)
 86 |         timeout = kwargs.get("timeout", self.timeout)
 87 | 
 88 |         for attempt in range(retries + 1):
 89 |             try:
 90 |                 response = await asyncio.wait_for(self._call_api(params), timeout=timeout)
 91 |                 return response
 92 |             except asyncio.TimeoutError:
 93 |                 if attempt < retries:
 94 |                     logger.warning(f"Timeout on attempt {attempt + 1}/{retries + 1}. Retrying...")
 95 |                     await asyncio.sleep(retry_delay)
 96 |                 else:
 97 |                     logger.error(f"All {retries + 1} attempts failed with timeout")
 98 |                     raise
 99 |             except Exception as e:
100 |                 if attempt < retries:
101 |                     logger.warning(
102 |                         f"Error on attempt {attempt + 1}/{retries + 1}: {str(e)}. Retrying..."
103 |                     )
104 |                     await asyncio.sleep(retry_delay)
105 |                 else:
106 |                     logger.error(f"All {retries + 1} attempts failed with error: {str(e)}")
107 |                     raise
108 | 
109 |     async def _call_api(self, params: Dict[str, Any]) -> str:
110 |         """Make the actual API call"""
111 |         # Use native async API call
112 |         response = await self.client.chat.completions.create(**params)
113 |         # Logging of system prompt, user message and response content
114 |         prompt = params["messages"][0]["content"] + '\n' + params["messages"][1]["content"]
115 |         logger.info('=' * 100)
116 |         logger.info(f"API parameters: {prompt}")
117 |         logger.info('=' * 100)
118 |         logger.info(f"API response: {response.choices[0].message.content}")
119 |         logger.info('=' * 100)
120 |         return response.choices[0].message.content
121 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <h1 align="center">
  2 | <img src="./assets/logo.png" width="200" alt="AlphaResearch" />
  3 | </h1>
  4 | 
  5 | <p align="center">
  6 |   <a href=""><b>[🌐 Website]</b></a> •
  7 |   <a href="https://arxiv.org/abs/2511.08522"><b>[📜 Paper]</b></a> •
  8 |   <a href="https://huggingface.co/alpha-research"><b>[🤗 HF Models]</b></a> •
  9 |   <a href="https://github.com/answers111/alpha-research"><b>[🐱 GitHub]</b></a>
 10 | </p>
 11 | <p align="center">
 12 | Repo for "<a href="https://arxiv.org/abs/2511.08522" target="_blank">AlphaResearch: Accelerating New Algorithm Discovery with Language Models</a>"
 13 | </p>
 14 | 
 15 | 
 16 | <div align="center">
 17 | 
 18 | <img src="./assets/comp.png" width="400" alt="alpha-research" />
 19 | <br>
 20 |     <em>Figure 1: Comparison of OpenEvolve (with program-based reward), ShinkaEvolve (with program-based reward) and AlphaResearch (with program-based and peer-review reward). </em>
 21 | </div>
 22 | 
 23 | # News
 24 | 
 25 | - [2025/11/12] 🔥🔥🔥 [AlphaResearch-RM-7B](https://huggingface.co/alpha-research/AlphaResearch-RM-Qwen-7B) released at [🤗 HuggingFace](https://huggingface.co/alpha-research)!
 26 | - [2025/11/12] AlphaResearch paper, repo, and website released.
 27 | 
 28 | ## AlphaResearch Pipeline
 29 | 
 30 | <img src="./assets/steps.png" width="800" alt="alpha-research" />
 31 | <br>
 32 |     <em>Figure 2: The launch of AlphaResearch contains two manual steps. 
 33 |     (1) Train reward models with real-world peer-reviewed records. (2) Prepare initial research proposals, initial programs and evaluation
 34 | program. </em>
 35 | </div>
 36 | 
 37 | ## 🚀 Run AlphaResearch
 38 | 
 39 | If you have `initial_program.py` and `initial_proposal.txt`, please run
 40 | ```
 41 | cd alpha-research
 42 | python run.py
 43 | ```
 44 | 
 45 | ## ⚖️ Benchmark
 46 | 
 47 | The table below lists the benchmark problems in AlphaResearchComp. AlphaEvolve has not publicly disclosed all of its test problems so far. To provide a more transparent evaluation, we curate and open-source a set of 8 frontier program-based
 48 | research tasks spanning geometry, number theory, harmonic analysis, and combinatorial optimization. 
 49 | They are either refined from prior work (e.g.,
 50 | AlphaEvolve) or collected from online repositories and domain experts.
 51 | 
 52 | | Problem | Human Best | Human Researcher |
 53 | |---------|------------|------------------|
 54 | | Packing circles (n=26) | 2.634 | David Cantrell (2011) |
 55 | | Packing circles (n=32) | 2.936 | Eckard Specht (2012) |
 56 | | Minimizing max-min distance ratio (d=2, n=16) | 12.89 | David Cantrell (2009) |
 57 | | Third autocorrelation inequality | 1.4581 | Carlos Vinuesa (2009) |
 58 | | Spherical code (n=30) minimizing upper bound | 0.67365 | Hardin & Sloane (1996,2002) |
 59 | | Autoconvolution peak minimization (upper bound) | 0.755 | Matolcsi-Vinuesa (2010) |
 60 | | Littlewood polynomials (n=512) | 32 | Rudin-Shapiro (1946/1952) |
 61 | | MSTSD (n=30) | 1.04 | Hegarty (2006/2007) |
 62 | 
 63 | ## ⚙️ Results
 64 | Results on AlphaResearchComp. ↑ indicates that a higher score is better and ↓ that a lower score is better.
 65 | 
 66 | | Problem | Human | AlphaResearch init | best | Excel@best |
 67 | |---------|-------|---------------------|------|------------|
 68 | | Packing circles (n=26) ↑ | 2.634 | 0 | 2.636 | 0.32% |
 69 | | Packing circles (n=32) ↑ | 2.936 | 0 | 2.939 | 0.10% |
 70 | | Minimizing max-min distance ratio ↓ | 12.89 | 15.55 | 12.92 | -0.23% |
 71 | | Third autocorrelation inequality ↓ | 1.458 | 35.746 | 1.546 | -6.03% |
 72 | | Spherical code (d=3, n=30) ↑ | 0.6736 | 0.5130 | 0.6735 | -0.01% |
 73 | | Autoconvolution peak minimization ↓ | 0.755 | 1.512 | 0.756 | -0.13% |
 74 | | Littlewood polynomials (n=512) ↑ | 32 | 32 | 32 | 0% |
 75 | | MSTSD (n=30) ↑ | 1.04 | 1.04 | 1.04 | 0% |
 76 | 
 77 | ## 🤖 EvolveAgent
 78 | 
 79 | We use [OpenEvolve](https://github.com/codelion/openevolve) as our evolutionary agent.
 80 | 
 81 | ## 🌲 Reward Model
 82 | 
 83 | We train Qwen2.5-7B-Instruct with ICLR(2017-2024) papers as our reward model.
 84 | 
 85 | 
 86 | - Train Dataset: Abstract and Review Score of  ICLR 2017-2024 papers  (24,445 in total) (knowledge cut-off date: Dec, 2023)
 87 | 
 88 | - Evaluation Dataset: Abstract and Review Score of 100 ICLR 2025 papers
 89 | (ICLR2025 Rebuttal started at Dec, 2024)
 90 | 
 91 | - Metric: positive score (>5.5), negative score(<=5.5), binary classification
 92 | 
 93 | ### ⚡️ Training
 94 | 
 95 | We open-source our complete training scripts for the community, and you may construct your own dataset for training.
 96 | To train a model, run the following command:
 97 | 
 98 | ```sh
 99 | bash alpha-research/reward_model/train/script/train_qwen.sh
100 | ```
101 | 
102 | ### 🪁 RM Results
103 | 
104 | | Model | Released Date (Knowledge Cutoff) | Accuracy (Binary) |
105 | | --- | --- | --- |
106 | | Human | Mar, 2025 (potential leakage) | 65.0% |
107 | | GPT-5 (medium) | Mar, 2025 (potential leakage) | 53.0% |
108 | | Qwen2.5-7B-Instruct | Sep, 2024 | 37.0% |
109 | | [AlphaResearch-RM-7B](https://huggingface.co/alpha-research/AlphaResearch-RM-Qwen-7B)  | Sep, 2024  | 72.0% |
110 | 
111 | ## 📖 License
112 | 
113 | This code repository is licensed under the MIT License. 
114 | 
115 | ## ☕️ Citation
116 | 
117 | If you find this repository helpful, please consider citing our paper:
118 | 
119 | ```
120 | @article{yu2025alpharesearch,
121 |   title={AlphaResearch: Accelerating New Algorithm Discovery with Language Models},
122 |   author={Yu, Zhaojian and Feng, Kaiyue and Zhao, Yilun and He, Shilin and Zhang, Xiao-Ping and Cohan, Arman},
123 |   journal={arXiv preprint arXiv:2511.08522},
124 |   year={2025}
125 | }
126 | ``` 
127 | 


--------------------------------------------------------------------------------
/evolve_agent/cli.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Command-line interface for EvolveAgent
  3 | """
  4 | 
  5 | import argparse
  6 | import asyncio
  7 | import logging
  8 | import os
  9 | import sys
 10 | from typing import Dict, List, Optional
 11 | 
 12 | from evolve_agent import EvolveAgent
 13 | from evolve_agent.config import Config, load_config
 14 | 
 15 | logger = logging.getLogger(__name__)
 16 | 
 17 | 
 18 | def parse_args() -> argparse.Namespace:
 19 |     """Parse command-line arguments"""
 20 |     parser = argparse.ArgumentParser(description="EvolveAgent - Evolutionary coding agent")
 21 | 
 22 |     parser.add_argument("initial_program", help="Path to the initial program file")
 23 | 
 24 |     parser.add_argument(
 25 |         "evaluation_file", help="Path to the evaluation file containing an 'evaluate' function"
 26 |     )
 27 | 
 28 |     parser.add_argument("--config", "-c", help="Path to configuration file (YAML)", default=None)
 29 | 
 30 |     parser.add_argument("--output", "-o", help="Output directory for results", default=None)
 31 | 
 32 |     parser.add_argument(
 33 |         "--iterations", "-i", help="Maximum number of iterations", type=int, default=None
 34 |     )
 35 | 
 36 |     parser.add_argument(
 37 |         "--target-score", "-t", help="Target score to reach", type=float, default=None
 38 |     )
 39 | 
 40 |     parser.add_argument(
 41 |         "--log-level",
 42 |         "-l",
 43 |         help="Logging level",
 44 |         choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
 45 |         default="INFO",
 46 |     )
 47 | 
 48 |     parser.add_argument(
 49 |         "--checkpoint",
 50 |         help="Path to checkpoint directory to resume from (e.g., evolve_agent_output/checkpoints/checkpoint_50)",
 51 |         default=None,
 52 |     )
 53 | 
 54 |     parser.add_argument("--api-base", help="Base URL for the LLM API", default=None)
 55 | 
 56 |     parser.add_argument("--primary-model", help="Primary LLM model name", default=None)
 57 | 
 58 |     parser.add_argument("--secondary-model", help="Secondary LLM model name", default=None)
 59 | 
 60 |     return parser.parse_args()
 61 | 
 62 | 
 63 | async def main_async() -> int:
 64 |     """
 65 |     Main asynchronous entry point
 66 | 
 67 |     Returns:
 68 |         Exit code
 69 |     """
 70 |     args = parse_args()
 71 | 
 72 |     # Check if files exist
 73 |     if not os.path.exists(args.initial_program):
 74 |         print(f"Error: Initial program file '{args.initial_program}' not found")
 75 |         return 1
 76 | 
 77 |     if not os.path.exists(args.evaluation_file):
 78 |         print(f"Error: Evaluation file '{args.evaluation_file}' not found")
 79 |         return 1
 80 | 
 81 |     # Create config object with command-line overrides
 82 |     config = None
 83 |     if args.api_base or args.primary_model or args.secondary_model:
 84 |         # Load base config from file or defaults
 85 |         config = load_config(args.config)
 86 | 
 87 |         # Apply command-line overrides
 88 |         if args.api_base:
 89 |             config.llm.api_base = args.api_base
 90 |             print(f"Using API base: {config.llm.api_base}")
 91 | 
 92 |         if args.primary_model:
 93 |             config.llm.primary_model = args.primary_model
 94 |             print(f"Using primary model: {config.llm.primary_model}")
 95 | 
 96 |         if args.secondary_model:
 97 |             config.llm.secondary_model = args.secondary_model
 98 |             print(f"Using secondary model: {config.llm.secondary_model}")
 99 | 
100 |     # Initialize EvolveAgent
101 |     try:
102 |         evolve_agent = EvolveAgent(
103 |             initial_program_path=args.initial_program,
104 |             evaluation_file=args.evaluation_file,
105 |             config=config,
106 |             config_path=args.config if config is None else None,
107 |             output_dir=args.output,
108 |         )
109 | 
110 |         # Load from checkpoint if specified
111 |         if args.checkpoint:
112 |             if not os.path.exists(args.checkpoint):
113 |                 print(f"Error: Checkpoint directory '{args.checkpoint}' not found")
114 |                 return 1
115 |             print(f"Loading checkpoint from {args.checkpoint}")
116 |             evolve_agent.database.load(args.checkpoint)
117 |             print(
118 |                 f"Checkpoint loaded successfully (iteration {evolve_agent.database.last_iteration})"
119 |             )
120 | 
121 |         # Override log level if specified
122 |         if args.log_level:
123 |             logging.getLogger().setLevel(getattr(logging, args.log_level))
124 | 
125 |         # Run evolution
126 |         best_program = await evolve_agent.run(
127 |             iterations=args.iterations,
128 |             target_score=args.target_score,
129 |         )
130 | 
131 |         # Get the checkpoint path
132 |         checkpoint_dir = os.path.join(evolve_agent.output_dir, "checkpoints")
133 |         latest_checkpoint = None
134 |         if os.path.exists(checkpoint_dir):
135 |             checkpoints = [
136 |                 os.path.join(checkpoint_dir, d)
137 |                 for d in os.listdir(checkpoint_dir)
138 |                 if os.path.isdir(os.path.join(checkpoint_dir, d))
139 |             ]
140 |             if checkpoints:
141 |                 latest_checkpoint = sorted(
142 |                     checkpoints, key=lambda x: int(x.split("_")[-1]) if "_" in x else 0
143 |                 )[-1]
144 | 
145 |         print(f"\nEvolution complete!")
146 |         print(f"Best program metrics:")
147 |         for name, value in best_program.metrics.items():
148 |             # Handle mixed types: format numbers as floats, others as strings
149 |             if isinstance(value, (int, float)):
150 |                 print(f"  {name}: {value:.4f}")
151 |             else:
152 |                 print(f"  {name}: {value}")
153 | 
154 |         if latest_checkpoint:
155 |             print(f"\nLatest checkpoint saved at: {latest_checkpoint}")
156 |             print(f"To resume, use: --checkpoint {latest_checkpoint}")
157 | 
158 |         return 0
159 | 
160 |     except Exception as e:
161 |         print(f"Error: {str(e)}")
162 |         import traceback
163 | 
164 |         traceback.print_exc()
165 |         return 1
166 | 
167 | 
def main() -> int:
    """
    Synchronous entry point that drives ``main_async`` to completion

    Returns:
        Exit code produced by ``main_async``
    """
    exit_code = asyncio.run(main_async())
    return exit_code
176 | 
177 | 
178 | if __name__ == "__main__":
179 |     sys.exit(main())
180 | 


--------------------------------------------------------------------------------
/reward_model/train/train.py:
--------------------------------------------------------------------------------
  1 | # This code is based on tatsu-lab/stanford_alpaca (https://github.com/tatsu-lab/stanford_alpaca).
  2 | 
  3 | from dataclasses import dataclass, field
  4 | import math
  5 | import pathlib
  6 | from typing import Dict, Optional
  7 | 
  8 | import transformers
  9 | from transformers import Trainer
 10 | from transformers.trainer_pt_utils import LabelSmoother
 11 | from transformers import set_seed
 12 | 
 13 | from preprocess import load_dataset, make_supervised_data_module, DataCollatorForSupervisedDataset
 14 | 
 15 | IGNORE_TOKEN_ID = LabelSmoother.ignore_index
 16 | 
 17 | IGNORE_INDEX = -100
 18 | DEFAULT_PAD_TOKEN = "[PAD]"
 19 | DEFAULT_EOS_TOKEN = "</s>"
 20 | DEFAULT_BOS_TOKEN = "<s>"
 21 | DEFAULT_UNK_TOKEN = "<unk>"
 22 | 
 23 | set_seed(42)
 24 | 
 25 | @dataclass
 26 | class ModelArguments:
 27 |     model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
 28 |     
 29 | 
 30 | @dataclass
 31 | class DataArguments:
 32 |     data_path: str = field(
 33 |         default=None, metadata={"help": "Path to the training data."}
 34 |     )
 35 |     eval_data_path: str = field(
 36 |         default=None, metadata={"help": "Path to the evaluation data."}
 37 |     )
 38 |     lazy_preprocess: bool = True
 39 | 
 40 | 
 41 | @dataclass
 42 | class TrainingArguments(transformers.TrainingArguments):
 43 |     cache_dir: Optional[str] = field(default=None)
 44 |     optim: str = field(default="adamw_torch")
 45 |     model_max_length: int = field(
 46 |         default=512,
 47 |         metadata={
 48 |             "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
 49 |         },
 50 |     )
 51 | 
 52 | 
 53 | def trainer_save_model_safe(trainer: transformers.Trainer):
 54 |     from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 55 |     from torch.distributed.fsdp import StateDictType, FullStateDictConfig
 56 | 
 57 |     save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
 58 |     with FSDP.state_dict_type(
 59 |         trainer.model, StateDictType.FULL_STATE_DICT, save_policy
 60 |     ):
 61 |         trainer.save_model()
 62 | 
 63 | 
 64 | def smart_tokenizer_and_embedding_resize(
 65 |     special_tokens_dict: Dict,
 66 |     tokenizer: transformers.PreTrainedTokenizer,
 67 |     model: transformers.PreTrainedModel,
 68 | ):
 69 |     """Resize tokenizer and embedding.
 70 | 
 71 |     Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
 72 |     """
 73 |     num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
 74 |     model.resize_token_embeddings(len(tokenizer))
 75 | 
 76 |     if num_new_tokens > 0:
 77 |         input_embeddings = model.get_input_embeddings().weight.data
 78 |         output_embeddings = model.get_output_embeddings().weight.data
 79 | 
 80 |         input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
 81 |             dim=0, keepdim=True
 82 |         )
 83 |         output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
 84 |             dim=0, keepdim=True
 85 |         )
 86 | 
 87 |         input_embeddings[-num_new_tokens:] = input_embeddings_avg
 88 |         output_embeddings[-num_new_tokens:] = output_embeddings_avg
 89 | 
 90 | 
 91 | def train():
 92 |     global local_rank
 93 | 
 94 |     parser = transformers.HfArgumentParser(
 95 |         (ModelArguments, DataArguments, TrainingArguments)
 96 |     )
 97 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 98 |     local_rank = training_args.local_rank
 99 | 
100 |     # Set RoPE scaling factor
101 |     config = transformers.AutoConfig.from_pretrained(
102 |         model_args.model_name_or_path,
103 |         cache_dir=training_args.cache_dir,
104 |     )
105 |     orig_ctx_len = getattr(config, "max_position_embeddings", None)
106 |     if orig_ctx_len and training_args.model_max_length > orig_ctx_len:
107 |         scaling_factor = float(math.ceil(training_args.model_max_length / orig_ctx_len))
108 |         config.rope_scaling = {"type": "linear", "factor": scaling_factor}
109 |     config.use_cache = False
110 | 
111 |     # Load model and tokenizer
112 |     tokenizer = transformers.AutoTokenizer.from_pretrained(
113 |         model_args.model_name_or_path,
114 |         cache_dir=training_args.cache_dir,
115 |         model_max_length = training_args.model_max_length,
116 |         truncation = True,
117 |         padding_side = "right",
118 |         trust_remote_code = True,
119 |         use_fast=True,
120 |     )
121 | 
122 |     # Load data
123 |     if '.json' in data_args.data_path:
124 |         data_module = make_supervised_data_module(tokenizer=tokenizer, data_path=data_args.data_path)
125 |     else:
126 |         train_dataset = load_dataset(data_args.data_path)
127 |         data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
128 |         data_module = dict(
129 |         train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator
130 |     )
131 |         
132 |     model = transformers.AutoModelForCausalLM.from_pretrained(
133 |         model_args.model_name_or_path,
134 |         config=config,
135 |         cache_dir=training_args.cache_dir,
136 |         use_flash_attention_2=True
137 |     )
138 | 
139 |     if local_rank == 0:
140 |         print(config)
141 |         print(model)
142 |         
143 |     # tokenizer.pad_token = tokenizer.unk_token
144 |     special_tokens_dict = dict()
145 |     if tokenizer.pad_token is None:
146 |         special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
147 |     if tokenizer.eos_token is None:
148 |         special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
149 |     if tokenizer.bos_token is None:
150 |         special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
151 |     if tokenizer.unk_token is None:
152 |         special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN
153 | 
154 |     smart_tokenizer_and_embedding_resize(
155 |         special_tokens_dict=special_tokens_dict,
156 |         tokenizer=tokenizer,
157 |         model=model,
158 |     )
159 | 
160 |     # Start trainner
161 |     trainer = Trainer(
162 |         model=model, tokenizer=tokenizer, args=training_args, **data_module
163 |     )
164 |     if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
165 |         trainer.train(resume_from_checkpoint=True)
166 |     else:
167 |         trainer.train()
168 | 
169 |     # Save model
170 |     model.config.use_cache = True
171 |     trainer.save_state()
172 |     trainer_save_model_safe(trainer)
173 | 
174 | 
175 | if __name__ == "__main__":
176 |     train()
177 | 


--------------------------------------------------------------------------------
/evolve_agent/utils/code_utils.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Utilities for code parsing, diffing, and manipulation
  3 | """
  4 | 
  5 | import re
  6 | from typing import Dict, List, Optional, Tuple, Union
  7 | 
  8 | 
  9 | def parse_evolve_blocks(code: str) -> List[Tuple[int, int, str]]:
 10 |     """
 11 |     Parse evolve blocks from code
 12 | 
 13 |     Args:
 14 |         code: Source code with evolve blocks
 15 | 
 16 |     Returns:
 17 |         List of tuples (start_line, end_line, block_content)
 18 |     """
 19 |     lines = code.split("\n")
 20 |     blocks = []
 21 | 
 22 |     in_block = False
 23 |     start_line = -1
 24 |     block_content = []
 25 | 
 26 |     for i, line in enumerate(lines):
 27 |         if "# EVOLVE-BLOCK-START" in line:
 28 |             in_block = True
 29 |             start_line = i
 30 |             block_content = []
 31 |         elif "# EVOLVE-BLOCK-END" in line and in_block:
 32 |             in_block = False
 33 |             blocks.append((start_line, i, "\n".join(block_content)))
 34 |         elif in_block:
 35 |             block_content.append(line)
 36 | 
 37 |     return blocks
 38 | 
 39 | 
 40 | def apply_diff(original_code: str, diff_text: str) -> str:
 41 |     """
 42 |     Apply a diff to the original code
 43 | 
 44 |     Args:
 45 |         original_code: Original source code
 46 |         diff_text: Diff in the SEARCH/REPLACE format
 47 | 
 48 |     Returns:
 49 |         Modified code
 50 |     """
 51 |     # Split into lines for easier processing
 52 |     original_lines = original_code.split("\n")
 53 |     result_lines = original_lines.copy()
 54 | 
 55 |     # Extract diff blocks
 56 |     diff_blocks = extract_diffs(diff_text)
 57 | 
 58 |     # Apply each diff block
 59 |     for search_text, replace_text in diff_blocks:
 60 |         search_lines = search_text.split("\n")
 61 |         replace_lines = replace_text.split("\n")
 62 | 
 63 |         # Find where the search pattern starts in the original code
 64 |         for i in range(len(result_lines) - len(search_lines) + 1):
 65 |             if result_lines[i : i + len(search_lines)] == search_lines:
 66 |                 # Replace the matched section
 67 |                 result_lines[i : i + len(search_lines)] = replace_lines
 68 |                 break
 69 | 
 70 |     return "\n".join(result_lines)
 71 | 
 72 | 
 73 | def extract_diffs(diff_text: str) -> List[Tuple[str, str]]:
 74 |     """
 75 |     Extract diff blocks from the diff text
 76 | 
 77 |     Args:
 78 |         diff_text: Diff in the SEARCH/REPLACE format
 79 | 
 80 |     Returns:
 81 |         List of tuples (search_text, replace_text)
 82 |     """
 83 |     diff_pattern = r"<<<<<<< SEARCH\n(.*?)=======\n(.*?)>>>>>>> REPLACE"
 84 |     diff_blocks = re.findall(diff_pattern, diff_text, re.DOTALL)
 85 |     return [(match[0].rstrip(), match[1].rstrip()) for match in diff_blocks]
 86 | 
 87 | 
 88 | def parse_full_rewrite(llm_response: str, language: str = "python") -> Optional[str]:
 89 |     """
 90 |     Extract a full rewrite from an LLM response
 91 | 
 92 |     Args:
 93 |         llm_response: Response from the LLM
 94 |         language: Programming language
 95 | 
 96 |     Returns:
 97 |         Extracted code or None if not found
 98 |     """
 99 |     code_block_pattern = r"```" + language + r"\n(.*?)```"
100 |     matches = re.findall(code_block_pattern, llm_response, re.DOTALL)
101 | 
102 |     if matches:
103 |         return matches[0].strip()
104 | 
105 |     # Fallback to any code block
106 |     code_block_pattern = r"```(.*?)```"
107 |     matches = re.findall(code_block_pattern, llm_response, re.DOTALL)
108 | 
109 |     if matches:
110 |         return matches[0].strip()
111 | 
112 |     # Fallback to plain text
113 |     return llm_response
114 | 
115 | 
def format_diff_summary(diff_blocks: List[Tuple[str, str]]) -> str:
    """
    Render one short, human-readable line per diff block

    Args:
        diff_blocks: List of (search_text, replace_text) tuples

    Returns:
        Summary string with one "Change N: ..." line per block
    """

    def _describe(lines: List[str]) -> str:
        # Multi-line sides are reported by size, single lines verbatim
        return f"{len(lines)} lines" if len(lines) > 1 else lines[0]

    entries = []
    for number, (search_text, replace_text) in enumerate(diff_blocks, start=1):
        old_lines = search_text.strip().split("\n")
        new_lines = replace_text.strip().split("\n")

        if len(old_lines) == 1 and len(new_lines) == 1:
            # One-line swap: show both sides in quotes
            entries.append(f"Change {number}: '{old_lines[0]}' to '{new_lines[0]}'")
            continue

        entries.append(
            f"Change {number}: Replace {_describe(old_lines)} with {_describe(new_lines)}"
        )

    return "\n".join(entries)
145 | 
146 | 
def calculate_edit_distance(code1: str, code2: str) -> int:
    """
    Calculate the Levenshtein edit distance between two code snippets

    Uses a two-row dynamic programme, so memory grows with the shorter input
    rather than with the product of both lengths.

    Args:
        code1: First code snippet
        code2: Second code snippet

    Returns:
        Edit distance (number of operations needed to transform code1 into code2)
    """
    if code1 == code2:
        return 0

    # The distance is symmetric, so iterate rows over the longer string and
    # keep the row length equal to the shorter one.
    if len(code1) < len(code2):
        code1, code2 = code2, code1

    # previous[j] is the distance between the processed prefix of code1 and code2[:j]
    previous = list(range(len(code2) + 1))

    for i, char1 in enumerate(code1, start=1):
        current = [i]  # turning code1[:i] into "" takes i deletions
        for j, char2 in enumerate(code2, start=1):
            cost = 0 if char1 == char2 else 1
            current.append(
                min(
                    previous[j] + 1,  # deletion
                    current[j - 1] + 1,  # insertion
                    previous[j - 1] + cost,  # substitution
                )
            )
        previous = current

    return previous[-1]
181 | 
182 | 
def extract_code_language(code: str) -> str:
    """
    Guess the language of a code snippet from line-start keywords

    Args:
        code: Code snippet

    Returns:
        Detected language or "unknown"
    """
    # Checked in order; the first signature found at the start of any line wins
    signatures = (
        (r"^(import|from|def|class)\s", "python"),
        (r"^(package|import java|public class)", "java"),
        (r"^(#include|int main|void main)", "cpp"),
        (r"^(function|var|let|const|console\.log)", "javascript"),
        (r"^(module|fn|let mut|impl)", "rust"),
        (r"^(SELECT|CREATE TABLE|INSERT INTO)", "sql"),
    )

    for pattern, language in signatures:
        if re.search(pattern, code, re.MULTILINE):
            return language

    return "unknown"
208 | 


--------------------------------------------------------------------------------
/configs/default_config.yaml:
--------------------------------------------------------------------------------
  1 | # EvolveAgent Default Configuration
  2 | # This file contains all available configuration options with sensible defaults
  3 | # You can use this as a template for your own configuration
  4 | 
  5 | # General settings
  6 | max_iterations: 1000                  # Maximum number of evolution iterations
  7 | checkpoint_interval: 50               # Save checkpoints every N iterations
  8 | log_level: "INFO"                     # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
  9 | log_dir: null                         # Custom directory for logs (default: output_dir/logs)
 10 | random_seed: null                     # Random seed for reproducibility (null = random)
 11 | 
 12 | # Evolution settings
 13 | diff_based_evolution: true            # Use diff-based evolution (true) or full rewrites (false)
 14 | allow_full_rewrites: false            # Allow occasional full rewrites even in diff-based mode
 15 | max_code_length: 100000                # Maximum allowed code length in characters
 16 | 
 17 | # LLM configuration
 18 | llm:
 19 |   # Models for evolution
 20 |   models:
 21 |     # List of available models with their weights
 22 |     - name: "deepseek-chat"
 23 |       weight: 1.0
 24 |     # - name: "gemini-2.0-flash"
 25 |     #   weight: 0
 26 | 
 27 |   # Models for LLM feedback
 28 |   # evaluator_models:
 29 |   #   # List of available models with their weights
 30 |   #   - name: "gemini-2.0-flash-lite"
 31 |   #     weight: 0.8
 32 |   #   - name: "gemini-2.0-flash"
 33 |   #     weight: 0.2
 34 | 
 35 |   # API configuration
 36 |   api_base: "https://api.deepseek.com"  # Base URL for API (change for non-OpenAI models)
 37 |   api_key: null                       # API key (null = read from the OPENAI_API_KEY env variable; never commit real keys)
 38 | 
 39 |   # Generation parameters
 40 |   temperature: 0.7                    # Temperature for generation (higher = more creative)
 41 |   top_p: 0.95                         # Top-p sampling parameter
 42 |   max_tokens: 8192                    # Maximum tokens to generate
 43 | 
 44 |   # Request parameters
 45 |   timeout: 300                        # Timeout for API requests in seconds
 46 |   retries: 3                          # Number of retries for failed requests
 47 |   retry_delay: 5                      # Delay between retries in seconds
 48 | 
 49 | # Prompt configuration
 50 | prompt:
 51 |   template_dir: null                  # Custom directory for prompt templates
 52 |   # system_message: "You are an expert coder helping to improve programs through evolution."
 53 |   # evaluator_system_message: "You are an expert code reviewer."
 54 | 
 55 |   # Number of examples to include in the prompt
 56 |   num_top_programs: 3                 # Number of top-performing programs to include
 57 |   num_diverse_programs: 2             # Number of diverse programs to include
 58 | 
 59 |   # Template stochasticity
 60 |   use_template_stochasticity: true    # Use random variations in templates for diversity
 61 |   template_variations:                # Different phrasings for parts of the template
 62 |     improvement_suggestion:
 63 |       - "Here's how we could improve this code:"
 64 |       - "I suggest the following improvements:"
 65 |       - "We can enhance this code by:"
 66 | 
 67 |   # Note: meta-prompting features are not yet implemented
 68 | 
 69 | # Database configuration
 70 | database:
 71 |   # General settings
 72 |   db_path: null                       # Path to persist database (null = in-memory only)
 73 |   in_memory: true                     # Keep database in memory for faster access
 74 |   log_prompts: true                  # If true, log all prompts and responses into the database
 75 | 
 76 |   # Evolutionary parameters
 77 |   population_size: 1000               # Maximum number of programs to keep in memory
 78 |   archive_size: 100                   # Size of elite archive
 79 |   num_islands: 5                      # Number of islands for island model (separate populations)
 80 | 
 81 |   # Island-based evolution parameters
 82 |   # Islands provide diversity by maintaining separate populations that evolve independently.
 83 |   # Migration periodically shares the best solutions between adjacent islands.
 84 |   migration_interval: 50              # Migrate between islands every N generations
 85 |   migration_rate: 0.1                 # Fraction of top programs to migrate (0.1 = 10%)
 86 | 
 87 |   # Selection parameters
 88 |   elite_selection_ratio: 0.1          # Ratio of elite programs to select
 89 |   exploration_ratio: 0.2              # Ratio of exploration vs exploitation
 90 |   exploitation_ratio: 0.7             # Ratio of exploitation vs random selection
 91 |   # Note: diversity_metric is fixed to "edit_distance" (feature_based not implemented)
 92 | 
 93 |   # Feature map dimensions for MAP-Elites
 94 |   feature_dimensions:                 # Dimensions for MAP-Elites feature map
 95 |     - "score"                         # Performance score
 96 |     - "complexity"                    # Code complexity (length)
 97 |   feature_bins: 10                    # Number of bins per dimension
 98 | 
 99 | # Evaluator configuration
100 | evaluator:
101 |   # General settings
102 |   timeout: 300                        # Maximum evaluation time in seconds
103 |   max_retries: 3                      # Maximum number of retries for evaluation
104 | 
105 |   # Note: resource limits (memory_limit_mb, cpu_limit) are not yet implemented
106 | 
107 |   # Evaluation strategies
108 |   cascade_evaluation: false            # Use cascade evaluation to filter bad solutions early
109 |   cascade_thresholds:                 # Thresholds for advancing to next evaluation stage
110 |     - 0.5                             # First stage threshold
111 |     - 0.75                            # Second stage threshold
112 |     - 0.9                             # Third stage threshold
113 | 
114 |   # Parallel evaluation
115 |   parallel_evaluations: 4             # Number of parallel evaluations
116 |   # Note: distributed evaluation is not yet implemented
117 | 
118 |   # LLM-based feedback (experimental)
119 |   use_llm_feedback: false             # Use LLM to evaluate code quality
120 |   llm_feedback_weight: 0.1            # Weight for LLM feedback in final score
121 | 
122 | # Reward model configuration
123 | rewardmodel:
124 |   model_type: vllm                  # Model type (vllm or api)
125 |   model_name: /data/zhuotaodeng/yzj/alpha_research_model/qwen25_grm_iclr_boxed/checkpoint-180                   # Model name (if null, uses default)
126 |   temperature: 0.7                    # Temperature for generation
127 |   top_p: 0.95                         # Top-p sampling parameter
128 |   max_tokens: 4096                    # Maximum tokens to generate
129 |   proposal_score_threshold: 5.5       # Only generate programs if proposal score >= threshold
130 |   # api_key: "<your-api-key>"          # API key for API models (prefer an environment variable; never commit real keys)
131 |   # base_url:  https://api.deepseek.com                     # Base URL for API models
132 |   jsonl_file: "results/reward_results.jsonl"         # JSONL file for results
133 |   max_retries: 50                     # Maximum number of retries
134 |   retry_delay: 5                      # Delay between retries in seconds
135 | 


--------------------------------------------------------------------------------
/reward_model/train/preprocess.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass, field
  2 | import logging
  3 | import pathlib
  4 | import copy
  5 | from typing import Dict, Optional, Sequence
  6 | import torch
  7 | from torch.utils.data import Dataset
  8 | 
  9 | import transformers
 10 | from transformers import AutoTokenizer
 11 | from datasets import load_dataset
 12 | 
 13 | IGNORE_INDEX = -100
 14 | DEFAULT_PAD_TOKEN = "[PAD]"
 15 | DEFAULT_EOS_TOKEN = "</s>"
 16 | DEFAULT_BOS_TOKEN = "<s>"
 17 | DEFAULT_UNK_TOKEN = "<unk>"
 18 | _MAGIC_SPLITTER_ = "-[[]]-this-is-really-our-highest-priority-[[]]-"
 19 | SYSTEM_PROMPT = "You are an expert reviewer tasked with evaluating the quality of a research proposal. "
 20 | 
 21 | 
 22 | class SupervisedDataset(Dataset):
 23 |     """Dataset for supervised fine-tuning."""
 24 | 
 25 |     def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
 26 |         super(SupervisedDataset, self).__init__()
 27 |         logging.warning("Loading data...")
 28 |         list_data_dict = load_dataset('json', data_files=data_path, split='train')
 29 |         logging.warning("Formatting inputs...")
 30 |         sources = [
 31 |             (
 32 |                 prompt_format(tokenizer, example)
 33 |             )
 34 |             for example in list_data_dict
 35 |         ]
 36 |         targets = [
 37 |             f"{example['response']}{'<|im_end|>'}" for example in list_data_dict
 38 |         ]
 39 | 
 40 |         logging.warning("Tokenizing inputs... This may take some time...")
 41 |         data_dict = preprocess(sources, targets, tokenizer)
 42 | 
 43 |         self.input_ids = data_dict["input_ids"]
 44 |         self.labels = data_dict["labels"]
 45 | 
 46 |     def __len__(self):
 47 |         return len(self.input_ids)
 48 | 
 49 |     def __getitem__(self, i) -> Dict[str, torch.Tensor]:
 50 |         return dict(input_ids=self.input_ids[i], labels=self.labels[i])
 51 |     
 52 | 
 53 | def save_dataset(dataset: SupervisedDataset, save_path: str):
 54 | 
 55 |     save_dir = pathlib.Path(save_path)
 56 |     save_dir.mkdir(parents=True, exist_ok=True)
 57 | 
 58 |     torch.save(dataset.input_ids, save_dir / "input_ids.pt")
 59 |     torch.save(dataset.labels, save_dir / "labels.pt")
 60 |     logging.info(f"Dataset saved to {save_dir}")
 61 | 
 62 | 
 63 | def load_from_pt(save_path: str) -> SupervisedDataset:
 64 | 
 65 |     save_dir = pathlib.Path(save_path)
 66 | 
 67 |     # Load input_ids and labels
 68 |     input_ids = torch.load(save_dir / "input_ids.pt")
 69 |     labels = torch.load(save_dir / "labels.pt")
 70 | 
 71 |     # Create an empty SupervisedDataset instance
 72 |     dataset = SupervisedDataset.__new__(SupervisedDataset)
 73 |     dataset.input_ids = input_ids
 74 |     dataset.labels = labels
 75 | 
 76 |     logging.info(f"Dataset loaded from {save_dir}")
 77 |     return dataset
 78 | 
 79 | 
 80 | def _tokenize_fn(
 81 |     strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer
 82 | ) -> Dict:
 83 |     """Tokenize a list of strings."""
 84 |     tokenized_list = [
 85 |         tokenizer(
 86 |             text,
 87 |             return_tensors="pt",
 88 |             padding="longest",
 89 |             max_length=tokenizer.model_max_length,
 90 |             truncation=True,
 91 |         )
 92 |         for text in strings
 93 |     ]
 94 |     input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
 95 |     input_ids_lens = labels_lens = [
 96 |         tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item()
 97 |         for tokenized in tokenized_list
 98 |     ]
 99 |     return dict(
100 |         input_ids=input_ids,
101 |         labels=labels,
102 |         input_ids_lens=input_ids_lens,
103 |         labels_lens=labels_lens,
104 |     )
105 | 
def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Tokenize prompt+target pairs and mask the prompt part of the labels.

    Returns:
        A dict with ``input_ids`` (token ids of ``source + target``) and
        ``labels`` (a deep copy in which the first ``len(source tokens)``
        positions are set to ``IGNORE_INDEX``).
    """
    full_texts = [prompt + answer for prompt, answer in zip(sources, targets)]

    tokenized_full = _tokenize_fn(full_texts, tokenizer)
    tokenized_prompts = _tokenize_fn(sources, tokenizer)

    input_ids = tokenized_full["input_ids"]
    labels = copy.deepcopy(input_ids)

    # Blank out the prompt prefix of every label row.
    prompt_lengths = tokenized_prompts["input_ids_lens"]
    for row, prompt_len in zip(labels, prompt_lengths):
        row[:prompt_len] = IGNORE_INDEX

    return {"input_ids": input_ids, "labels": labels}
121 | 
122 | 
@dataclass
class DataCollatorForSupervisedDataset:
    """Pad a list of ``input_ids`` / ``labels`` examples into one batch."""

    # Tokenizer whose ``pad_token_id`` is used for padding and the mask.
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Return padded ``input_ids``, ``labels`` and the ``attention_mask``.

        ``input_ids`` are padded with the tokenizer's pad id, ``labels`` with
        ``IGNORE_INDEX``; the mask is True wherever ``input_ids`` differs from
        the pad id.
        """
        id_sequences = [instance["input_ids"] for instance in instances]
        label_sequences = [instance["labels"] for instance in instances]

        padded_ids = torch.nn.utils.rnn.pad_sequence(
            id_sequences, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        padded_labels = torch.nn.utils.rnn.pad_sequence(
            label_sequences, batch_first=True, padding_value=IGNORE_INDEX
        )

        return {
            "input_ids": padded_ids,
            "labels": padded_labels,
            "attention_mask": padded_ids.ne(self.tokenizer.pad_token_id),
        }
144 | 
145 | 
def make_supervised_data_module(
    tokenizer: transformers.PreTrainedTokenizer, data_path
) -> Dict:
    """Build the training dataset and its collator.

    Returns:
        A dict with ``train_dataset``, ``eval_dataset`` (always ``None``) and
        ``data_collator``.
    """
    dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_path)
    collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    print(f"len={len(dataset)}")
    return {
        "train_dataset": dataset,
        "eval_dataset": None,
        "data_collator": collator,
    }
158 | 
159 | 
def prompt_format(tokenizer, example):
    """Render the chat prompt that precedes the assistant's answer.

    A placeholder assistant turn containing only ``_MAGIC_SPLITTER_`` is added
    so the rendered template can be cut right where the answer would begin.

    Args:
        tokenizer: Object providing ``apply_chat_template``.
        example: Mapping with a ``question`` string.

    Returns:
        The rendered template text up to (not including) the splitter.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": example['question'].strip()}
    assistant_stub = {"role": "assistant", "content": _MAGIC_SPLITTER_}

    rendered = tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_stub], tokenize=False
    )
    prefix, _, _ = rendered.partition(_MAGIC_SPLITTER_)
    return prefix
177 | 
if __name__ == '__main__':
    # Manual sanity check: decode one stored example and print the prompt and
    # the supervised (non-masked) part of its labels.
    tokenizer = AutoTokenizer.from_pretrained('/work/zhuotaodeng/yzj/pretrained_models_ms/Qwen/Qwen2___5-7B-Instruct')

    # The stored tensors were produced with:
    # data_path = '/data/zhuotaodeng/test-time-scaling/z1/data/openthought_evol-221k.json'
    # ds = make_supervised_data_module(tokenizer, data_path)
    # save_dataset(ds['train_dataset'],'/data/zhuotaodeng/test-time-scaling/z1/data/qwen')

    # The directory holds input_ids.pt / labels.pt written by save_dataset, so
    # it must be read back with load_from_pt; datasets.load_dataset cannot
    # parse these files and its result is not indexable by integer here.
    ds = load_from_pt('/data/zhuotaodeng/test-time-scaling/z1/data/qwen')
    data = ds[1]
    decoded_input = tokenizer.decode(data['input_ids'], skip_special_tokens=True)
    print("Decoded input_ids:", decoded_input)
    # Drop the masked prompt positions before decoding the labels.
    filtered_labels = data['labels'][data['labels'] != IGNORE_INDEX]
    decoded_labels = tokenizer.decode(filtered_labels, skip_special_tokens=True)
    print("Decoded labels:", decoded_labels)
191 | 


--------------------------------------------------------------------------------
/benchmark/heilbronn_in_the_unit_square/visualization.py:
--------------------------------------------------------------------------------
  1 | # viz_min_triangle.py
  2 | import argparse
  3 | import importlib.util
  4 | import json
  5 | import os
  6 | import sys
  7 | from itertools import combinations
  8 | 
  9 | import numpy as np
 10 | import matplotlib.pyplot as plt
 11 | 
 12 | 
 13 | # ---------- 评测与辅助函数（与您 evaluator 一致/兼容） ----------
 14 | 
 15 | def _triangle_area(a, b, c) -> float:
 16 |     return abs((b[0]-a[0])*(c[1]-a[1]) - (b[1]-a[1])*(c[0]-a[0])) * 0.5
 17 | 
 18 | def find_min_triangle(points: np.ndarray):
 19 |     """
 20 |     返回最小三角形：(i, j, k, min_area)
 21 |     若点数<3，返回 (-1, -1, -1, 0.0)
 22 |     """
 23 |     P = np.asarray(points, dtype=float)
 24 |     n = len(P)
 25 |     if n < 3:
 26 |         return -1, -1, -1, 0.0
 27 |     best = (-1, -1, -1, float("inf"))
 28 |     for i, j, k in combinations(range(n), 3):
 29 |         area = _triangle_area(P[i], P[j], P[k])
 30 |         if area < best[3]:
 31 |             best = (i, j, k, area)
 32 |             if area == 0.0:
 33 |                 break
 34 |     return best
 35 | 
 36 | def evaluate_min_triangle_area(points: np.ndarray):
 37 |     """
 38 |     与您当前 evaluator 保持一致的指标：
 39 |       - min_area：最小三角形面积（越大越好）
 40 |       - scaled_min_area：n^(8/7 + 1/2000) * min_area
 41 |       - score：等于 min_area（越大越好）
 42 |     """
 43 |     pts = np.asarray(points, dtype=float)
 44 |     if pts.ndim != 2 or pts.shape[1] != 2 or len(pts) < 3:
 45 |         return dict(valid=0.0, min_area=0.0, n=float(len(pts)),
 46 |                     scaled_min_area=0.0, score=0.0, argmin_triplet=(-1,-1,-1))
 47 |     i, j, k, min_area = find_min_triangle(pts)
 48 |     n = float(len(pts))
 49 |     exponent = (8.0/7.0) + (1.0/2000.0)
 50 |     scaled_min_area = (n ** exponent) * float(min_area)
 51 |     return dict(
 52 |         valid=1.0,
 53 |         min_area=float(min_area),
 54 |         n=n,
 55 |         scaled_min_area=float(scaled_min_area),
 56 |         score=float(min_area),
 57 |         argmin_triplet=(int(i), int(j), int(k))
 58 |     )
 59 | 
 60 | 
 61 | # ---------- 读取点数据（模块 / npy / csv） ----------
 62 | 
 63 | def load_from_module(module_path: str) -> np.ndarray:
 64 |     module_path = os.path.abspath(module_path)
 65 |     spec = importlib.util.spec_from_file_location("points_mod", module_path)
 66 |     if spec is None or spec.loader is None:
 67 |         raise RuntimeError(f"无法加载模块: {module_path}")
 68 |     mod = importlib.util.module_from_spec(spec)
 69 |     spec.loader.exec_module(mod)
 70 | 
 71 |     pts = None
 72 |     if hasattr(mod, "points"):
 73 |         pts = mod.points
 74 |     elif hasattr(mod, "main"):
 75 |         res = mod.main()
 76 |         try:
 77 |             pts = np.asarray(res, dtype=float)
 78 |         except Exception:
 79 |             pass
 80 |     if pts is None and hasattr(mod, "points"):
 81 |         pts = mod.points
 82 |     if pts is None:
 83 |         raise RuntimeError("模块中既无 `points` 变量，也无法从 `main()` 获取点。")
 84 | 
 85 |     pts = np.asarray(pts, dtype=float)
 86 |     if pts.ndim != 2 or pts.shape[1] != 2:
 87 |         raise ValueError(f"模块返回的点形状异常: {pts.shape}, 期望 (N,2)")
 88 |     return pts
 89 | 
 90 | def load_from_npy(npy_path: str) -> np.ndarray:
 91 |     pts = np.load(npy_path)
 92 |     pts = np.asarray(pts, dtype=float)
 93 |     if pts.ndim != 2 or pts.shape[1] != 2:
 94 |         raise ValueError(f"npy 形状异常: {pts.shape}, 期望 (N,2)")
 95 |     return pts
 96 | 
 97 | def load_from_csv(csv_path: str) -> np.ndarray:
 98 |     pts = np.loadtxt(csv_path, delimiter=",")
 99 |     pts = np.asarray(pts, dtype=float)
100 |     if pts.ndim != 2 or pts.shape[1] != 2:
101 |         raise ValueError(f"csv 形状异常: {pts.shape}, 期望 (N,2)")
102 |     return pts
103 | 
104 | 
105 | # ---------- 可视化 ----------
106 | 
def plot_points_and_min_triangle(points: np.ndarray,
                                 show_indices: bool = False,
                                 title_prefix: str = ""):
    """Plot the points in the unit square and highlight the smallest triangle.

    The metrics from ``evaluate_min_triangle_area`` are shown in the title and
    also printed to stdout as JSON. The highlighted triangle is taken from that
    same evaluation result, so the cubic-time search runs only once and the
    drawing always agrees with the reported ``argmin_triplet``.

    Args:
        points: Point coordinates; columns 0 and 1 are plotted as x and y.
        show_indices: Annotate each point with its index when True.
        title_prefix: Text placed in front of the metrics in the title.
    """
    pts = np.asarray(points, dtype=float)
    eval_res = evaluate_min_triangle_area(pts)
    i, j, k = eval_res["argmin_triplet"]

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_aspect("equal", adjustable="box")

    # Draw all points.
    ax.scatter(pts[:, 0], pts[:, 1], s=40, zorder=2)

    # Optional: label every point with its index.
    if show_indices:
        for idx, (x, y) in enumerate(pts):
            ax.text(x, y, str(idx), fontsize=9, ha="left", va="bottom")

    # Highlight the smallest triangle (indices are -1 when none exists).
    if i >= 0:
        tri = np.array([pts[i], pts[j], pts[k], pts[i]])
        ax.plot(tri[:, 0], tri[:, 1], linewidth=2.5, zorder=3)
        ax.scatter(pts[[i, j, k], 0], pts[[i, j, k], 1], s=70, zorder=4)

    # Grid and ticks.
    ax.set_xticks(np.linspace(0, 1, 6))
    ax.set_yticks(np.linspace(0, 1, 6))
    ax.grid(True, linestyle="--", alpha=0.3)

    # Title including the metrics.
    title = (
        f"{title_prefix}min_area={eval_res['min_area']:.8f} | "
        f"scaled_min_area={eval_res['scaled_min_area']:.6f} | "
        f"score={eval_res['score']:.8f} | "
        f"argmin={eval_res['argmin_triplet']}"
    )
    ax.set_title(title)
    plt.tight_layout()
    plt.show()

    # Also print the metrics as JSON so scripted callers can capture them.
    out = {
        "min_area": eval_res["min_area"],
        "scaled_min_area": eval_res["scaled_min_area"],
        "score": eval_res["score"],
        "argmin_triplet": eval_res["argmin_triplet"],
        "n": int(eval_res["n"]),
    }
    print(json.dumps(out, ensure_ascii=False, indent=2))
158 | 
159 | 
160 | # ---------- CLI ----------
161 | 
def main():
    """Command-line entry point: load a point set and plot its smallest triangle.

    Points come from ``--from-module``, ``--from-npy`` or ``--from-csv``
    (mutually exclusive). Without any of them, ``points.npy`` next to this
    script is used; if that file is missing the process exits with status 2.
    """
    parser = argparse.ArgumentParser(
        description="可视化 [0,1]^2 中点集，并高亮最小三角形（支持模块 / .npy / .csv）。若未提供来源，将自动尝试读取同目录下的 points.npy"
    )
    source_group = parser.add_mutually_exclusive_group(required=False)
    source_group.add_argument("--from-module", type=str, help="含 points 或 main() 的 Python 文件路径")
    source_group.add_argument("--from-npy", type=str, help="N×2 的 .npy 路径")
    source_group.add_argument("--from-csv", type=str, help="N×2 的 .csv 路径（逗号分隔）")
    parser.add_argument("--show-indices", action="store_true", help="是否标注点索引")
    parser.add_argument("--title", type=str, default="", help="标题前缀")
    args = parser.parse_args()

    # Try the explicit sources in priority order; the first one given wins.
    candidates = (
        (args.from_module, load_from_module),
        (args.from_npy, load_from_npy),
        (args.from_csv, load_from_csv),
    )
    for location, loader in candidates:
        if location:
            P = loader(location)
            break
    else:
        # No source given: fall back to points.npy in this script's directory.
        default_path = os.path.join(os.path.dirname(__file__), "points.npy")
        if not os.path.exists(default_path):
            print(
                "未提供输入来源，且未在本目录找到 points.npy。请先运行 initial_program.py 生成 points.npy，或通过 --from-* 指定输入。",
                file=sys.stderr,
            )
            sys.exit(2)
        P = load_from_npy(default_path)

    # Only warn about points outside [0,1]^2; the values are left untouched.
    all_inside = np.all((P >= 0.0) & (P <= 1.0))
    if not all_inside:
        print("⚠️ 警告：存在越界点（不在 [0,1]^2），图中仍会显示。", file=sys.stderr)

    plot_points_and_min_triangle(P, show_indices=args.show_indices, title_prefix=args.title)
196 | 
197 | if __name__ == "__main__":
198 |     main()
199 | 


--------------------------------------------------------------------------------
/reward_model/train/utils.py:
--------------------------------------------------------------------------------
  1 | import dataclasses
  2 | import logging
  3 | import math
  4 | import os
  5 | import io
  6 | import sys
  7 | import time
  8 | import json
  9 | from typing import Optional, Sequence, Union
 10 | 
 11 | import openai
 12 | import tqdm
 13 | from openai import openai_object
 14 | import copy
 15 | 
 16 | StrOrOpenAIObject = Union[str, openai_object.OpenAIObject]
 17 | 
 18 | openai_org = os.getenv("OPENAI_ORG")
 19 | if openai_org is not None:
 20 |     openai.organization = openai_org
 21 |     logging.warning(f"Switching to organization: {openai_org} for OAI API key.")
 22 | 
 23 | 
 24 | @dataclasses.dataclass
 25 | class OpenAIDecodingArguments(object):
 26 |     max_tokens: int = 1800
 27 |     temperature: float = 0.2
 28 |     top_p: float = 1.0
 29 |     n: int = 1
 30 |     stream: bool = False
 31 |     stop: Optional[Sequence[str]] = None
 32 |     presence_penalty: float = 0.0
 33 |     frequency_penalty: float = 0.0
 34 |     suffix: Optional[str] = None
 35 |     logprobs: Optional[int] = None
 36 |     echo: bool = False
 37 | 
 38 | 
 39 | def openai_completion(
 40 |     prompts: Union[str, Sequence[str], Sequence[dict[str, str]], dict[str, str]],
 41 |     decoding_args: OpenAIDecodingArguments,
 42 |     model_name="text-davinci-003",
 43 |     sleep_time=2,
 44 |     batch_size=1,
 45 |     max_instances=sys.maxsize,
 46 |     max_batches=sys.maxsize,
 47 |     return_text=False,
 48 |     **decoding_kwargs,
 49 | ) -> Union[Union[StrOrOpenAIObject], Sequence[StrOrOpenAIObject], Sequence[Sequence[StrOrOpenAIObject]],]:
 50 |     """Decode with OpenAI API.
 51 | 
 52 |     Args:
 53 |         prompts: A string or a list of strings to complete. If it is a chat model the strings should be formatted
 54 |             as explained here: https://github.com/openai/openai-python/blob/main/chatml.md. If it is a chat model
 55 |             it can also be a dictionary (or list thereof) as explained here:
 56 |             https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
 57 |         decoding_args: Decoding arguments.
 58 |         model_name: Model name. Can be either in the format of "org/model" or just "model".
 59 |         sleep_time: Time to sleep once the rate-limit is hit.
 60 |         batch_size: Number of prompts to send in a single request. Only for non chat model.
 61 |         max_instances: Maximum number of prompts to decode.
 62 |         max_batches: Maximum number of batches to decode. This argument will be deprecated in the future.
 63 |         return_text: If True, return text instead of full completion object (which contains things like logprob).
 64 |         decoding_kwargs: Additional decoding arguments. Pass in `best_of` and `logit_bias` if you need them.
 65 | 
 66 |     Returns:
 67 |         A completion or a list of completions.
 68 |         Depending on return_text, return_openai_object, and decoding_args.n, the completion type can be one of
 69 |             - a string (if return_text is True)
 70 |             - an openai_object.OpenAIObject object (if return_text is False)
 71 |             - a list of objects of the above types (if decoding_args.n > 1)
 72 |     """
 73 |     is_single_prompt = isinstance(prompts, (str, dict))
 74 |     if is_single_prompt:
 75 |         prompts = [prompts]
 76 | 
 77 |     if max_batches < sys.maxsize:
 78 |         logging.warning(
 79 |             "`max_batches` will be deprecated in the future, please use `max_instances` instead."
 80 |             "Setting `max_instances` to `max_batches * batch_size` for now."
 81 |         )
 82 |         max_instances = max_batches * batch_size
 83 | 
 84 |     prompts = prompts[:max_instances]
 85 |     num_prompts = len(prompts)
 86 |     prompt_batches = [
 87 |         prompts[batch_id * batch_size : (batch_id + 1) * batch_size]
 88 |         for batch_id in range(int(math.ceil(num_prompts / batch_size)))
 89 |     ]
 90 | 
 91 |     completions = []
 92 |     for batch_id, prompt_batch in tqdm.tqdm(
 93 |         enumerate(prompt_batches),
 94 |         desc="prompt_batches",
 95 |         total=len(prompt_batches),
 96 |     ):
 97 |         batch_decoding_args = copy.deepcopy(decoding_args)  # cloning the decoding_args
 98 | 
 99 |         while True:
100 |             try:
101 |                 shared_kwargs = dict(
102 |                     model=model_name,
103 |                     **batch_decoding_args.__dict__,
104 |                     **decoding_kwargs,
105 |                 )
106 |                 completion_batch = openai.Completion.create(prompt=prompt_batch, **shared_kwargs)
107 |                 choices = completion_batch.choices
108 | 
109 |                 for choice in choices:
110 |                     choice["total_tokens"] = completion_batch.usage.total_tokens
111 |                 completions.extend(choices)
112 |                 break
113 |             except openai.error.OpenAIError as e:
114 |                 logging.warning(f"OpenAIError: {e}.")
115 |                 if "Please reduce your prompt" in str(e):
116 |                     batch_decoding_args.max_tokens = int(batch_decoding_args.max_tokens * 0.8)
117 |                     logging.warning(f"Reducing target length to {batch_decoding_args.max_tokens}, Retrying...")
118 |                 else:
119 |                     logging.warning("Hit request rate limit; retrying...")
120 |                     time.sleep(sleep_time)  # Annoying rate limit on requests.
121 | 
122 |     if return_text:
123 |         completions = [completion.text for completion in completions]
124 |     if decoding_args.n > 1:
125 |         # make completions a nested list, where each entry is a consecutive decoding_args.n of original entries.
126 |         completions = [completions[i : i + decoding_args.n] for i in range(0, len(completions), decoding_args.n)]
127 |     if is_single_prompt:
128 |         # Return non-tuple if only 1 input and 1 generation.
129 |         (completions,) = completions
130 |     return completions
131 | 
132 | 
133 | def _make_w_io_base(f, mode: str):
134 |     if not isinstance(f, io.IOBase):
135 |         f_dirname = os.path.dirname(f)
136 |         if f_dirname != "":
137 |             os.makedirs(f_dirname, exist_ok=True)
138 |         f = open(f, mode=mode)
139 |     return f
140 | 
141 | 
142 | def _make_r_io_base(f, mode: str):
143 |     if not isinstance(f, io.IOBase):
144 |         f = open(f, mode=mode)
145 |     return f
146 | 
147 | 
def jdump(obj, f, mode="w", indent=4, default=str):
    """Dump a str or dictionary to a file in json format.

    Dicts and lists are serialized with ``json.dump``; strings are written
    verbatim. The file is always closed afterwards, including when
    serialization fails or ``obj`` has an unsupported type.

    Args:
        obj: An object to be written.
        f: A string path to the location on disk, or an open file object.
        mode: Mode for opening the file.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.

    Raises:
        ValueError: If ``obj`` is not a dict, list or str.
    """
    f = _make_w_io_base(f, mode)
    try:
        if isinstance(obj, (dict, list)):
            json.dump(obj, f, indent=indent, default=default)
        elif isinstance(obj, str):
            f.write(obj)
        else:
            raise ValueError(f"Unexpected type: {type(obj)}")
    finally:
        # Close on every path so a failed dump does not leak the handle.
        f.close()
166 | 
167 | 
def jload(f, mode="r"):
    """Load a .json file into a dictionary.

    Args:
        f: A string path or an open file object.
        mode: Mode for opening the file when ``f`` is a path.

    Returns:
        The parsed JSON content. The file is closed even if parsing fails.
    """
    f = _make_r_io_base(f, mode)
    try:
        return json.load(f)
    finally:
        # Close on every path so a malformed file does not leak the handle.
        f.close()
174 | 


--------------------------------------------------------------------------------
/evolve_agent/prompt/templates.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Prompt templates for EvolveAgent
  3 | """
  4 | 
  5 | import os
  6 | from pathlib import Path
  7 | from typing import Dict, List, Optional, Union
  8 | 
  9 | # Base system message template for evolution
 10 | BASE_SYSTEM_TEMPLATE = """You are an expert software developer tasked with iteratively improving a codebase.
 11 | Your job is to analyze the current program and suggest improvements based on feedback from previous attempts.
 12 | Focus on making targeted changes that will increase the program's performance metrics.
 13 | """
 14 | 
 15 | BASE_EVALUATOR_SYSTEM_TEMPLATE = """You are an expert code reviewer.
 16 | Your job is to analyze the provided code and evaluate it systematically."""
 17 | 
# User message template for diff-based evolution
# Placeholders: {metrics}, {improvement_areas}, {artifacts},
# {evolution_history}, {language}, {current_program}.
# NOTE: this template is not registered in DEFAULT_TEMPLATES by default; the
# "diff_user" key there maps to DIFF_USER_TEMPLATE_PROPOSAL instead.
DIFF_USER_TEMPLATE = """# Current Program Information
- Current performance metrics: {metrics}
- Areas identified for improvement: {improvement_areas}

{artifacts}

# Program Evolution History
{evolution_history}

# Current Program
```{language}
{current_program}
```

# Task
Suggest improvements to the program that will lead to better performance on the specified metrics.

You MUST use the exact SEARCH/REPLACE diff format shown below to indicate changes:

<<<<<<< SEARCH
# Original code to find and replace (must match exactly)
=======
# New replacement code
>>>>>>> REPLACE

Example of valid diff format:
<<<<<<< SEARCH
for i in range(m):
    for j in range(p):
        for k in range(n):
            C[i, j] += A[i, k] * B[k, j]
=======
# Reorder loops for better memory access pattern
for i in range(m):
    for k in range(n):
        for j in range(p):
            C[i, j] += A[i, k] * B[k, j]
>>>>>>> REPLACE

You can suggest multiple changes. Each SEARCH section must exactly match code in the current program.
Be thoughtful about your changes and explain your reasoning thoroughly.

IMPORTANT: Do not rewrite the entire program - focus on targeted improvements.
"""
 63 | 
 64 | 
# Proposal-aware user message template for diff-based evolution; this is the
# template registered under "diff_user" in DEFAULT_TEMPLATES.
# Placeholders: {parent_proposal_text}, {language}, {parent_program},
# {metrics}, {improvement_areas}, {artifacts}, {evolution_history},
# {current_proposal_text}.
# NOTE(review): the task text below refers to "the current program", but the
# only program embedded here is {parent_program} under "Previous Program" --
# confirm the wording is intended.
DIFF_USER_TEMPLATE_PROPOSAL = """# Previous Proposal: 
{parent_proposal_text}

# Previous Program:
```{language}
{parent_program}
```

# Previous Performance Metrics: 
{metrics}

# Areas Identified for Improvement: 
{improvement_areas}

{artifacts}

# Program Evolution History
{evolution_history}

# Current Proposal
{current_proposal_text}

# Task
Suggest improvements to the program that will lead to better performance on the specified metrics.

You MUST use the exact SEARCH/REPLACE diff format shown below to indicate changes:

<<<<<<< SEARCH
# Original code to find and replace (must match exactly)
=======
# New replacement code
>>>>>>> REPLACE

Example of valid diff format:
<<<<<<< SEARCH
for i in range(m):
    for j in range(p):
        for k in range(n):
            C[i, j] += A[i, k] * B[k, j]
=======
# Reorder loops for better memory access pattern
for i in range(m):
    for k in range(n):
        for j in range(p):
            C[i, j] += A[i, k] * B[k, j]
>>>>>>> REPLACE

You can suggest multiple changes. Each SEARCH section must exactly match code in the current program.
Be thoughtful about your changes and explain your reasoning thoroughly.

IMPORTANT: Do not rewrite the entire program - focus on targeted improvements.
"""
117 | 
118 | 
119 | 
# User message template for full rewrite
# Registered in DEFAULT_TEMPLATES under "full_rewrite_user".
# Placeholders: {metrics}, {improvement_areas}, {artifacts},
# {evolution_history}, {language} (used for both code fences),
# {current_program}.
FULL_REWRITE_USER_TEMPLATE = """# Current Program Information
- Current performance metrics: {metrics}
- Areas identified for improvement: {improvement_areas}

{artifacts}

# Program Evolution History
{evolution_history}

# Current Program
```{language}
{current_program}
```

# Task
Rewrite the program to improve its performance on the specified metrics.
Provide the complete new program code.

IMPORTANT: Make sure your rewritten program maintains the same inputs and outputs
as the original program, but with improved internal implementation.

```{language}
# Your rewritten program here
```
"""
146 | 
# Template for formatting evolution history
# Registered under "evolution_history".
# Placeholders: {previous_attempts}, {top_programs}.
EVOLUTION_HISTORY_TEMPLATE = """## Previous Attempts

{previous_attempts}

## Top Performing Programs

{top_programs}
"""

# Template for formatting a previous attempt
# Registered under "previous_attempt".
# Placeholders: {attempt_number}, {changes}, {performance}, {outcome}.
PREVIOUS_ATTEMPT_TEMPLATE = """### Attempt {attempt_number}
- Changes: {changes}
- Performance: {performance}
- Outcome: {outcome}
"""

# Template for formatting a top program
# Registered under "top_program".
# Placeholders: {program_number}, {score}, {language}, {program_snippet},
# {key_features}.
TOP_PROGRAM_TEMPLATE = """### Program {program_number} (Score: {score})
```{language}
{program_snippet}
```
Key features: {key_features}
"""
171 | 
# Template for evaluating a program via an LLM
# Registered under "evaluation". The only placeholder is {current_program}.
# The doubled braces ({{ and }}) around the JSON example are escapes that
# render as literal braces -- presumably the template is filled with
# str.format; confirm against the caller.
# NOTE(review): the code fence is hard-coded to "python" rather than using a
# {language} placeholder like the other templates.
EVALUATION_TEMPLATE = """Evaluate the following code on a scale of 0.0 to 1.0 for the following metrics:
1. Readability: How easy is the code to read and understand?
2. Maintainability: How easy would the code be to maintain and modify?
3. Efficiency: How efficient is the code in terms of time and space complexity?

For each metric, provide a score between 0.0 and 1.0, where 1.0 is best.

Code to evaluate:
```python
{current_program}
```

Return your evaluation as a JSON object with the following format:
{{
    "readability": [score],
    "maintainability": [score],
    "efficiency": [score],
    "reasoning": "[brief explanation of scores]"
}}
"""
193 | 
194 | 
# Default templates dictionary
# Maps each template name to its built-in template string. TemplateManager
# copies this mapping on construction, so per-instance additions and
# overrides do not modify it.
DEFAULT_TEMPLATES = {
    "system_message": BASE_SYSTEM_TEMPLATE,
    "evaluator_system_message": BASE_EVALUATOR_SYSTEM_TEMPLATE,
    # The plain diff template is disabled; "diff_user" is bound to the
    # proposal-aware variant at the end of this dict instead.
    # "diff_user": DIFF_USER_TEMPLATE,
    "full_rewrite_user": FULL_REWRITE_USER_TEMPLATE,
    "evolution_history": EVOLUTION_HISTORY_TEMPLATE,
    "previous_attempt": PREVIOUS_ATTEMPT_TEMPLATE,
    "top_program": TOP_PROGRAM_TEMPLATE,
    "evaluation": EVALUATION_TEMPLATE,
    "diff_user": DIFF_USER_TEMPLATE_PROPOSAL
}
207 | 
208 | 
class TemplateManager:
    """Registry of named prompt templates.

    A manager starts from a shallow copy of ``DEFAULT_TEMPLATES`` and can be
    extended or overridden from ``*.txt`` files in a directory or
    programmatically via :meth:`add_template`.
    """

    def __init__(self, template_dir: Optional[str] = None):
        """Create a manager pre-populated with the default templates.

        Args:
            template_dir: Optional directory of ``*.txt`` template files.
                It is ignored when falsy or when it is not an existing
                directory.
        """
        self.templates = DEFAULT_TEMPLATES.copy()

        # Load templates from directory if provided
        if template_dir and os.path.isdir(template_dir):
            self._load_templates_from_dir(template_dir)

    def _load_templates_from_dir(self, template_dir: str) -> None:
        """Load every ``*.txt`` file in ``template_dir`` as a template.

        The file stem (name without ``.txt``) becomes the template name, so a
        file named after an existing template overrides it.

        Args:
            template_dir: Directory to scan (non-recursively).
        """
        for file_path in Path(template_dir).glob("*.txt"):
            # Decode explicitly as UTF-8 so that prompt files containing
            # non-ASCII text load identically regardless of the platform's
            # default locale encoding.
            self.templates[file_path.stem] = file_path.read_text(encoding="utf-8")

    def get_template(self, template_name: str) -> str:
        """Return the template registered under ``template_name``.

        Raises:
            ValueError: If no template with that name is registered.
        """
        if template_name not in self.templates:
            raise ValueError(f"Template '{template_name}' not found")
        return self.templates[template_name]

    def add_template(self, template_name: str, template: str) -> None:
        """Register ``template`` under ``template_name``, replacing any existing entry."""
        self.templates[template_name] = template


--------------------------------------------------------------------------------
/evolve_agent/utils/async_utils.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Async utilities for EvolveAgent
  3 | """
  4 | 
  5 | import asyncio
  6 | import functools
  7 | import logging
  8 | import time
  9 | from typing import Any, Callable, Dict, List, Optional, TypeVar, Union
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | 
 13 | T = TypeVar("T")
 14 | 
 15 | 
 16 | def run_in_executor(f: Callable) -> Callable:
 17 |     """
 18 |     Decorator to run a synchronous function in an executor
 19 | 
 20 |     Args:
 21 |         f: Function to decorate
 22 | 
 23 |     Returns:
 24 |         Decorated function that runs in an executor
 25 |     """
 26 | 
 27 |     @functools.wraps(f)
 28 |     async def wrapper(*args: Any, **kwargs: Any) -> Any:
 29 |         loop = asyncio.get_event_loop()
 30 |         return await loop.run_in_executor(None, functools.partial(f, *args, **kwargs))
 31 | 
 32 |     return wrapper
 33 | 
 34 | 
 35 | async def run_with_timeout(
 36 |     coro: Callable, timeout: float, *args: Any, timeout_error_value: Any = None, **kwargs: Any
 37 | ) -> Any:
 38 |     """
 39 |     Run a coroutine with a timeout, returning a default value on timeout
 40 | 
 41 |     Args:
 42 |         coro: Coroutine function to run
 43 |         timeout: Timeout in seconds
 44 |         *args: Arguments to pass to the coroutine
 45 |         timeout_error_value: Value to return on timeout (default: {"error": 0.0, "timeout": True})
 46 |         **kwargs: Keyword arguments to pass to the coroutine
 47 | 
 48 |     Returns:
 49 |         Result of the coroutine or timeout_error_value on timeout
 50 |     """
 51 |     if timeout_error_value is None:
 52 |         timeout_error_value = {"error": 0.0, "timeout": True}
 53 | 
 54 |     try:
 55 |         return await asyncio.wait_for(coro(*args, **kwargs), timeout=timeout)
 56 |     except asyncio.TimeoutError:
 57 |         logger.warning(f"Operation timed out after {timeout}s")
 58 |         return timeout_error_value
 59 | 
 60 | 
 61 | async def run_sync_with_timeout(
 62 |     func: Callable, timeout: float, *args: Any, timeout_error_value: Any = None, **kwargs: Any
 63 | ) -> Any:
 64 |     """
 65 |     Run a synchronous function in an executor with a timeout
 66 | 
 67 |     Args:
 68 |         func: Synchronous function to run
 69 |         timeout: Timeout in seconds
 70 |         *args: Arguments to pass to the function
 71 |         timeout_error_value: Value to return on timeout (default: {"error": 0.0, "timeout": True})
 72 |         **kwargs: Keyword arguments to pass to the function
 73 | 
 74 |     Returns:
 75 |         Result of the function or timeout_error_value on timeout
 76 |     """
 77 |     if timeout_error_value is None:
 78 |         timeout_error_value = {"error": 0.0, "timeout": True}
 79 | 
 80 |     try:
 81 |         loop = asyncio.get_event_loop()
 82 |         task = loop.run_in_executor(None, functools.partial(func, *args, **kwargs))
 83 |         return await asyncio.wait_for(task, timeout=timeout)
 84 |     except asyncio.TimeoutError:
 85 |         logger.warning(f"Sync operation timed out after {timeout}s")
 86 |         return timeout_error_value
 87 | 
 88 | 
 89 | async def gather_with_concurrency(
 90 |     n: int, *tasks: asyncio.Future, return_exceptions: bool = False
 91 | ) -> List[Any]:
 92 |     """
 93 |     Run tasks with a concurrency limit
 94 | 
 95 |     Args:
 96 |         n: Maximum number of tasks to run concurrently
 97 |         *tasks: Tasks to run
 98 |         return_exceptions: Whether to return exceptions instead of raising them
 99 | 
100 |     Returns:
101 |         List of task results
102 |     """
103 |     semaphore = asyncio.Semaphore(n)
104 | 
105 |     async def sem_task(task: asyncio.Future) -> Any:
106 |         async with semaphore:
107 |             return await task
108 | 
109 |     return await asyncio.gather(
110 |         *(sem_task(task) for task in tasks), return_exceptions=return_exceptions
111 |     )
112 | 
113 | 
async def retry_async(
    coro: Callable,
    *args: Any,
    retries: int = 3,
    delay: float = 1.0,
    backoff: float = 2.0,
    exceptions: Union[type, tuple] = Exception,
    **kwargs: Any,
) -> Any:
    """
    Retry an async function with exponential backoff

    The coroutine function is always attempted at least once; a negative
    ``retries`` is treated as 0. Exceptions that do not match ``exceptions``
    propagate immediately without a retry.

    Args:
        coro: Coroutine function to retry
        *args: Arguments to pass to the coroutine
        retries: Maximum number of retries (in addition to the first attempt)
        delay: Initial delay between retries (seconds)
        backoff: Multiplier for delay between retries
        exceptions: Exception class, or tuple of exception classes, to catch
        **kwargs: Keyword arguments to pass to the coroutine

    Returns:
        Result of the coroutine

    Raises:
        The last exception caught if all retries fail
    """
    # Guarantee at least one attempt even if a caller passes a negative count.
    max_retries = max(retries, 0)
    current_delay = delay

    for attempt in range(max_retries + 1):
        try:
            return await coro(*args, **kwargs)
        except exceptions as e:
            if attempt >= max_retries:
                logger.error(
                    f"All {max_retries+1} attempts failed. Last error: {type(e).__name__}: {str(e)}"
                )
                # Bare raise keeps the original traceback of the final failure.
                raise
            logger.warning(
                f"Retry {attempt+1}/{max_retries} failed with {type(e).__name__}: {str(e)}. "
                f"Retrying in {current_delay:.2f}s..."
            )
            await asyncio.sleep(current_delay)
            current_delay *= backoff
165 | 
166 | 
class TaskPool:
    """
    Tracks asyncio tasks and caps how many of them run at the same time
    """

    def __init__(self, max_concurrency: int = 10):
        self.max_concurrency = max_concurrency
        # Created on first use (see the ``semaphore`` property).
        self._semaphore: Optional[asyncio.Semaphore] = None
        # Tasks created through create_task() that have not finished yet.
        self.tasks: List[asyncio.Task] = []

    @property
    def semaphore(self) -> asyncio.Semaphore:
        """Semaphore enforcing ``max_concurrency``, built on first access"""
        sem = self._semaphore
        if sem is None:
            sem = self._semaphore = asyncio.Semaphore(self.max_concurrency)
        return sem

    async def run(self, coro: Callable, *args: Any, **kwargs: Any) -> Any:
        """
        Call and await a coroutine function while holding a pool slot

        Args:
            coro: Coroutine function to call
            *args: Positional arguments forwarded to ``coro``
            **kwargs: Keyword arguments forwarded to ``coro``

        Returns:
            Whatever ``coro`` returns
        """
        async with self.semaphore:
            result = await coro(*args, **kwargs)
        return result

    def create_task(self, coro: Callable, *args: Any, **kwargs: Any) -> asyncio.Task:
        """
        Schedule ``coro`` as a task that runs through the pool and track it

        The task is dropped from ``self.tasks`` automatically once it is done.

        Args:
            coro: Coroutine function to call
            *args: Positional arguments forwarded to ``coro``
            **kwargs: Keyword arguments forwarded to ``coro``

        Returns:
            The created task
        """

        def _untrack(finished: asyncio.Task) -> None:
            self.tasks.remove(finished)

        tracked = asyncio.create_task(self.run(coro, *args, **kwargs))
        self.tasks.append(tracked)
        tracked.add_done_callback(_untrack)
        return tracked

    async def wait_all(self) -> None:
        """Await every task currently tracked by the pool"""
        if not self.tasks:
            return
        await asyncio.gather(*self.tasks)

    async def cancel_all(self) -> None:
        """Request cancellation of every tracked task and wait for them to finish"""
        for pending in self.tasks:
            pending.cancel()

        if not self.tasks:
            return
        # return_exceptions=True collects the cancellations instead of raising.
        await asyncio.gather(*self.tasks, return_exceptions=True)
228 | 


--------------------------------------------------------------------------------
/benchmark/human_best.txt:
--------------------------------------------------------------------------------
 1 | Alpha-Research Benchmark: Human-Best Values (with references)
 2 | 
 3 | Note: Some benchmarks depend on problem parameters (n, d, etc.). Where the benchmark code fixes typical parameters in its initial programs, those are used. Otherwise we cite the best-known general results or mark as open.
 4 | 
 5 | 1) kissing_number
 6 | - Objective: In dimension d, maximize the number of unit vectors with pairwise dot <= 1/2 (equivalently, kissing number K(d)).
 7 | - Benchmark default: d = 11 (per `initial_program.py`).
 8 | - Human best: K(11) ≥ 592.
 9 | - Reference: M. Ganzhinov, "Kissing number in dimension 11 is at least 592" (PSU(4,2) construction), arXiv preprint, 2024.
10 | - Larger-is-better human best: 592.0
- cite: https://arxiv.org/abs/2207.08266
12 | 
13 | 2) spherical_code
14 | - Objective: On S^{d-1} (unit sphere), maximize the minimal pairwise angle for n points.
15 | - Benchmark default: n = 30 on S^2 (per `initial_program.py`).
16 | - Human best (for default n=30): Best-known numerical minimal angle ≈ 0.673646755169... radians (unproven optimal); see Sloane's Tables of Spherical Codes.
17 | - References:
18 |   - N. J. A. Sloane, "Tables of Spherical Codes" (online tables; best-known values, many unproved for general n). [https://neilsloane.com/packings/]
19 |   - For comparison: when n = 12, the icosahedron is optimal with minimal angle = arccos(1/√5) ≈ 1.107148717 radians (≈ 63.4349488°) (classical result: Schütte–van der Waerden; Fejes Tóth).
20 | - Larger-is-better human best (n=30): ≈ 0.673646755169 radians
21 | - cite: https://neilsloane.com/packings/
22 | 
23 | 3) heilbronn_in_the_unit_square (n = 16)
24 | - Objective: Place n points in the unit square to maximize the smallest triangle area.
25 | - Metric (larger is better): min_area (raw smallest triangle area).
26 | - Benchmark default: n = 16.
- Human best (status): A = 7/341 ≈ 0.020528 is the best-known construction for n=16; to our knowledge, global optimality is not proved (conjectured best-known value in tables).
28 | - References:
29 |   - Erich Friedman's Heilbronn Problem page (n=16 entry; tables list best-known configurations, not general proofs of optimality). [https://erich-friedman.github.io/packing/heilbronn/]
30 | - cite: https://erich-friedman.github.io/packing/heilbronn/
31 | 
32 | 4) littlewood_polynomials
33 | - Objective: For ±1 coefficients c_k, minimize sup_{t} |∑ c_k e^{ikt}| on the unit circle (sup-norm). For degree n, the best growth is known to be on the order of √n.
34 | - Human best (general): There exist Littlewood polynomials with sup-norm ≤ C √n for an absolute constant C; exact optimal constants are unknown. Rudin–Shapiro polynomials give explicit O(√n) upper bounds.
35 | - References:
36 |   - P. Borwein and M. Mossinghoff, surveys on Littlewood polynomials (e.g., Experimental Mathematics 2008; related 2002–2010 papers).
37 |   - J.-P. Kahane, Some Random Series of Functions (re: random ±1 coefficients and bounds).
38 | - Larger-is-better metric: 1 / supnorm (grows like ≈ 1 / (C √n)); numeric value depends on n.
39 |   - For n = 512 (benchmark default): A Rudin–Shapiro construction yields supnorm ≤ √(2n) = 32 (with √n ≈ 22.627, √2 ≈ 1.414), hence the benchmark score 1/supnorm = 1/32 = 0.03125.
40 |   - Tighter constant for Rudin–Shapiro: The classical identity implies C = √2, i.e., supnorm ≤ √(2n) for length-n Rudin–Shapiro polynomials (tighter than the looser bound 2√n sometimes quoted).
41 | - cite: https://www.memphis.edu/msci/people/pbalistr/shapiro.pdf
42 | 
43 | 5) riesz_energy
44 | - Objective: On [0,1], minimize E_s(x_1,…,x_n) = ∑_{i<j} 1/|x_i - x_j|^s; benchmark uses s = 1.
- Human best (general guidance): In 1D, for s ∈ (0,1] many results and strong evidence favor nearly equally spaced configurations on [0,1]; exact formulas depend on whether endpoints are included. For equally spaced points including endpoints, i.e. x_i = i/(n-1) for i = 0,…,n-1, the energy for s = 1 is exactly E = (n-1)(n H_{n-1} - (n-1)), where H_m = 1 + 1/2 + … + 1/m is the m-th harmonic number.
46 | - References:
47 |   - D. P. Hardin and E. B. Saff (eds.), Discrete Energy on Rectifiable Sets, Springer, 2019.
48 |   - A. B. J. Kuijlaars and E. B. Saff, "Asymptotics for minimal discrete energy on the sphere," Trans. AMS 350 (1998) (general techniques; see also 1D discussions in the monograph above).
49 | - Larger-is-better metric: 1 / energy; for equally spaced n points incl. endpoints, 1 / E = 1 / ((n-1)(n H_{n-1} − (n-1))).
50 | - cite: https://www.math.vanderbilt.edu/saffeb/texts/261.pdf
51 | 
52 | 6) sum_vs_difference_set
53 | - Objective: Maximize |A+B| / |A−B| over finite sets A,B (discrete indicator formulation in benchmark). This connects to MSTD (more sums than differences) phenomena.
54 | - Human best (general): MSTD sets (|A+A| > |A−A|) exist and can give ratios strictly larger than 1; precise extremal ratios depend on constraints and are an open line of research.
55 | - References:
56 |   - M. B. Nathanson, "Sets with more sums than differences," Integers 7 (2007), #A5.
57 |   - I. Z. Ruzsa, Sumsets and structure (various surveys in additive combinatorics).
58 | - Larger-is-better metric: |A+B| / |A−B| (no conversion).
59 | - cite: https://arxiv.org/abs/math/0608148
60 | 
61 | 7) packing_circles
62 | - Objective: In the unit square, place n disjoint circles to maximize total sum of radii (benchmark’s objective). Note: This differs from the classical equal-radius packing problem.
63 | - Human best (general): The equal-radius variants have extensive tables; for maximizing sum of radii with variable sizes, sharp records are not standardized in the literature for n = 26, 32.
64 | - References:
65 |   - E. Specht, "Packings in squares and rectangles" (online tables) — equal-radius case.
66 |   - R. Graham, B. Lubachevsky, K. Nurmela, and P. Östergård, various papers on circle packing in a square.
67 | - Larger-is-better metric: total sum of radii (no conversion).
68 | 
69 | 8) minizing_raio_max_min_distance
70 | - Objective: For n points in [0,1]^d, minimize (max pairwise distance) / (min pairwise distance); benchmark queries (n,d) = (16,2) and (14,3).
71 | - Human best (general): This is a variant of dispersion/packing-covering tradeoff in a cube; sharp constants for the ratio under these constraints are not tabulated in the classical literature.
72 | - References: See general texts on sphere packing vs covering, and numerical optimization literature for blue-noise/Poisson-disc sampling in bounded domains.
73 | - Larger-is-better metric: use min/max (i.e., the reciprocal of the usual ratio). Examples from typical baselines: d=2, n=16 → ≈ 1/12.89 ≈ 0.0776; d=3, n=14 → ≈ 1/4.168 ≈ 0.2400.
74 | 
75 | 9) autoconvolution_peak_minimization
76 | - Objective: For nonnegative f on [0,1] with ∫ f = 1, minimize μ_∞ = sup_t (f * f)(t).
77 | - Human best (general): The exact optimum constant is open; best-known rigorous bounds are close to 1.5 (upper bounds from explicit constructions, lower bounds from analytic inequalities). Precise record values depend on smoothness/support constraints.
78 | - References: Surveys on autoconvolution inequalities (e.g., works following Erdős–Rényi type convolution problems; see also additive combinatorics notes and numerical studies in approximation theory).
79 | - Larger-is-better metric: 1 / μ_∞; indicative ≥ ≈ 1/1.5 ≈ 0.6667 given current upper bounds.
- cite: https://arxiv.org/pdf/2210.16437
81 | 
82 | 10) third_autocorrelation_inequality
83 | - Objective: Improve bounds for a third-order autocorrelation constant C_3 (benchmark computes an upper bound C_upper_bound and reports its reciprocal).
84 | - Human best (indicative): Recent numerical constructions report 1 / C_upper_bound ≈ 0.6869 for representative discretizations; exact best constant remains open.
85 | - References: Literature on higher-order autocorrelation and correlation-inequality problems in additive combinatorics and signal processing; see problem surveys and recent preprints.
86 | - Larger-is-better metric: 1 / C_upper_bound; indicative ≈ 0.6869.
87 | 
88 | Caveats and next steps
89 | - Several benchmarks encode families of problems parameterized by n and/or d; precise “human-best” values depend on those choices. Where known closed-form or sharp constants exist, they are reported; otherwise we cite authoritative surveys and note the open status.
90 | - If you’d like, specify exact (n,d) for `heilbronn_in_the_unit_square`, `packing_circles`, and `minizing_raio_max_min_distance`, and we can add numerical human-best targets or authoritative records if available.
91 | 


--------------------------------------------------------------------------------
/benchmark/minizing_raio_max_min_distance/initial_program.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from scipy.spatial.distance import pdist
  3 | from scipy.spatial import cKDTree
  4 | 
  5 | # (Removed) smooth_points — smoothing logic is now inlined to reduce indirection
  6 | 
  7 | 
  8 | def calculate_distances(points):
  9 |     """Calculates min, max, and ratio of pairwise Euclidean distances using scipy pdist."""
 10 |     if points.shape[0] < 2:
 11 |         return 0.0, 0.0, 0.0
 12 |     distances = pdist(points, metric='euclidean')
 13 |     eps = 1e-8
 14 |     min_dist = max(np.min(distances), eps)
 15 |     max_dist = np.max(distances)
 16 |     ratio = max_dist / min_dist
 17 |     return min_dist, max_dist, ratio
 18 | 
 19 | # (Removed) perturb_point — now inlined directly where used
 20 | 
 21 | def update_temperature(temperature, cooling_rate, accept_history, iteration, total_iters, initial_temperature, window_size=100):
 22 |     """
 23 |     Adaptive cooling with acceptance‐rate feedback and periodic reheating.
 24 |     """
 25 |     window = accept_history[-min(len(accept_history), window_size):]
 26 |     rate = sum(window) / len(window)
 27 |     # gentler correction: slow/fast cooling factors reduced
 28 |     if rate < 0.2:
 29 |         adj = 1.02
 30 |     elif rate > 0.8:
 31 |         adj = 0.98
 32 |     else:
 33 |         adj = 1.0
 34 |     temperature *= cooling_rate * adj
 35 |     # removed periodic reheating to maintain smoother cooling schedule
 36 |     # if (iteration + 1) % (total_iters // 4) == 0:
 37 |     #     temperature = initial_temperature
 38 |     return temperature
 39 | 
 40 | def max_min_dis_ratio(n: int, d: int, seed=None):
 41 |     """
 42 |     Finds n points in d-dimensional space to minimize the max/min distance ratio 
 43 |     using simulated annealing.
 44 | 
 45 |     Args:
 46 |         n (int): Number of points.
 47 |         d (int): Dimensionality of the space.
 48 | 
 49 |     Returns:
 50 |         tuple: (best_points, best_ratio)
 51 |     """
 52 | 
 53 |     # Adaptive hyperparameters based on dimensionality
 54 |     iterations = 3000 if d <= 2 else 6000  # increased sweeps for improved convergence
 55 |     initial_temperature = 10.0
 56 |     cooling_rate = 0.998 if d <= 2 else 0.996  # slower cooling for extended exploration
 57 |     perturbation_factor = 0.15 if d <= 2 else 0.12  # tuned smaller steps in 3D for better local refinement
 58 |     # relaxation factor for post-acceptance repulsive adjustment
 59 |     # relaxation_factor removed; using inline 0.1 * perturbation_factor below
 60 | 
 61 |     # 1. Initial State: reproducible random generator
 62 |     rng = np.random.default_rng(seed)
 63 |     # uniform random initialization in [0,1]^d for simplicity
 64 |     current_points = rng.random((n, d))
 65 |     
 66 |     _, _, current_ratio = calculate_distances(current_points)
 67 |     
 68 |     best_points = np.copy(current_points)
 69 |     best_ratio = current_ratio
 70 |     
 71 |     temperature = initial_temperature
 72 |     accept_history = []
 73 |     window_size = 50  # window for stagnation detection and adaptive injection
 74 |     # smoothing_interval remains, but smoothing_strength is fixed inlined above
 75 |     smoothing_interval = max(10, iterations // (20 if d <= 2 else 30))  # more frequent smoothing in 3D for improved uniformity
 76 |     
 77 |     for i in range(iterations):
 78 |         # Build KD-tree once per iteration for neighbor queries
 79 |         tree = cKDTree(current_points)
 80 |         # optional smoothing step using distance-weighted neighbor smoothing
 81 |         if (i + 1) % smoothing_interval == 0:
 82 |             # choose neighbor count based on dimension
 83 |             k_smooth = 6 if d > 2 else 4
 84 |             _, idxs = tree.query(current_points, k=k_smooth+1)
 85 |             neighbors = current_points[idxs[:,1:]]  # exclude self
 86 |             # compute inverse-distance weights
 87 |             diffs = neighbors - current_points[:, None, :]
 88 |             dists = np.linalg.norm(diffs, axis=2) + 1e-6
 89 |             weights = 1.0 / dists
 90 |             weights /= weights.sum(axis=1, keepdims=True)
 91 |             neighbor_means = (neighbors * weights[..., None]).sum(axis=1)
 92 |             blend = 0.6 if d > 2 else 0.7
 93 |             current_points = np.clip(current_points * blend + neighbor_means * (1 - blend), 0.0, 1.0)
 94 |             _, _, current_ratio = calculate_distances(current_points)
 95 |             if current_ratio < best_ratio:
 96 |                 best_points = current_points.copy()
 97 |                 best_ratio = current_ratio
 98 | 
 99 |         # 2. Generate Neighboring State: Perturb a random point
100 |         # Simplify scaling: rely on temperature to adjust step-size instead of best_ratio
101 |         # dynamic perturbation decays sublinearly with temperature for finer local moves
102 |         perturbation_strength = perturbation_factor * ((temperature / initial_temperature)**0.6 + 0.15)
103 |         
104 |         # Choose a random point to perturb
105 |         point_to_perturb_index = rng.integers(0, n)
106 |         
107 |         old_point = current_points[point_to_perturb_index].copy()
108 |         # Increase repulsive‐move frequency in low dimensions
109 |         # dynamic repulsion probability: stronger at high temperature, tapering off as we cool
110 |         if d > 2:
111 |             # reduce repulsion frequency in 3D for finer refinement
112 |             repulsion_prob = float(np.clip(temperature / initial_temperature, 0.2, 0.8))
113 |         else:
114 |             repulsion_prob = float(np.clip(temperature / initial_temperature + 0.1, 0.5, 0.95))
115 |         # start with a random jitter
116 |         # random jitter inlined for readability
117 |         candidate = old_point + rng.uniform(-perturbation_strength, perturbation_strength, size=old_point.shape)
118 |         if n > 1 and rng.random() < repulsion_prob:
119 |             # compute nearest neighbor via KD-tree for efficiency (reusing prebuilt tree)
120 |             _, nn_idxs = tree.query(old_point, k=2)
121 |             nn_idx = nn_idxs[1]
122 |             vec = old_point - current_points[nn_idx]
123 |             norm = np.linalg.norm(vec)
124 |             if norm > 1e-8:
125 |                 dir_vec = vec / norm
126 |                 candidate = old_point + perturbation_strength * dir_vec
127 |         # keep the point in [0,1]^d
128 |         current_points[point_to_perturb_index] = np.clip(candidate, 0.0, 1.0)
129 |         _, _, candidate_ratio = calculate_distances(current_points)
130 |         
131 |         # Acceptance criterion
132 |         delta = candidate_ratio - current_ratio
133 |         accept = (delta < 0) or (rng.random() < np.exp(-delta / temperature))
134 | 
135 |         if accept:
136 |             current_ratio = candidate_ratio
137 |             # Post-acceptance repulsive relaxation to improve local spacing
138 |             # reuse prebuilt KD-tree for repulsive relaxation
139 |             dists, idxs_nn = tree.query(current_points[point_to_perturb_index], k=2)
140 |             dir_vec = current_points[point_to_perturb_index] - current_points[idxs_nn[1]]
141 |             norm = np.linalg.norm(dir_vec)
142 |             if norm > 1e-8:
143 |                 # push away from nearest neighbor
144 |                 adjustment = 0.1 * perturbation_factor * dir_vec / norm
145 |                 current_points[point_to_perturb_index] = np.clip(
146 |                     current_points[point_to_perturb_index] + adjustment, 0.0, 1.0
147 |                 )
148 |                 # update ratio and best points after relaxation
149 |                 _, _, relaxed_ratio = calculate_distances(current_points)
150 |                 current_ratio = relaxed_ratio
151 |                 if relaxed_ratio < best_ratio:
152 |                     best_points = current_points.copy()
153 |                     best_ratio = relaxed_ratio
154 |             # also keep the standard best‐check for the candidate move
155 |             if current_ratio < best_ratio:
156 |                 best_points = current_points.copy()
157 |                 best_ratio = current_ratio
158 |         else:
159 |             current_points[point_to_perturb_index] = old_point
160 |         
161 |         # Update temperature with adaptive schedule
162 |         accept_history.append(accept)
163 |         temperature = update_temperature(temperature, cooling_rate, accept_history, i, iterations, initial_temperature)
164 |         # periodic mild reheating for 3D to escape deep minima
165 |         if d > 2 and (i + 1) % (iterations // 3) == 0:
166 |             temperature = max(temperature, initial_temperature * 0.3)
167 | 
168 |         # random injection to escape plateaus: reinitialize one point every 20% of iterations
169 |         # random injection only if we’ve stagnated (low acceptance in recent window)
170 |         if (i + 1) % max(1, iterations // 5) == 0 and len(accept_history) >= window_size \
171 |            and sum(accept_history[-window_size:]) / window_size < 0.1:
172 |             j = rng.integers(0, n)
173 |             current_points[j] = rng.random(d)
174 |             _, _, current_ratio = calculate_distances(current_points)
175 | 
176 |     # Local refinement stage: fine-tune best solution with small Gaussian perturbations
177 |     refine_iters = max(100, iterations // 20)
178 |     for _ in range(refine_iters):
179 |         idx = rng.integers(0, n)
180 |         old_point = best_points[idx].copy()
181 |         perturb = rng.normal(0, perturbation_factor * 0.05, size=d)
182 |         best_points[idx] = np.clip(old_point + perturb, 0.0, 1.0)
183 |         _, _, refined_ratio = calculate_distances(best_points)
184 |         if refined_ratio < best_ratio:
185 |             best_ratio = refined_ratio
186 |         else:
187 |             best_points[idx] = old_point
188 |     return best_points, best_ratio


--------------------------------------------------------------------------------
/reward_model/llm/backend.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import os
  3 | import json
  4 | import time
  5 | import numpy as np
  6 | import pandas as pd
  7 | from typing import List, Dict
  8 | 
  9 | from vllm import LLM, SamplingParams
 10 | from openai import OpenAI
 11 | from datasets import load_dataset
 12 | from transformers import AutoTokenizer, AutoModelForCausalLM
 13 | 
 14 | # SCORING_PROMPT = f"""
 15 | # You are an expert reviewer tasked with evaluating the quality of a research abstract. 
 16 | # Your goal is to assign a score between 1 and 10 based on the abstract's clarity, novelty, technical rigor, and potential impact. Here are the criteria:
 17 | # 1. Read the following abstract carefully and provide a score from 1 to 10. 
 18 | # 2. Score 6 means slightly higher than the boardline, 5 is slightly lower than the boardline.
 19 | # Write the score in the {BOX}.
 20 | # **idea**:
 21 | # """
 22 | 
 23 | 
# Literal placeholder shown to the model; parse_score_from_text() looks for the
# filled-in form \boxed{<number>} in the response.
BOX=r"\boxed{}"
# System message sent with every scoring request.
SYSTEM_PROMPT = "You are an expert reviewer tasked with evaluating the quality of a research proposal. "
# User-message prefix; callers append the proposal title and abstract directly after it.
# NOTE(review): "boardline" is presumably a typo for "borderline". It is left
# untouched because this text is sent to the model verbatim -- confirm against
# the prompt used for training before changing it.
SCORING_PROMPT = f"""
Your goal is to assign a score between 1 and 10 based on the proposal's clarity, novelty, technical rigor, and potential impact. Here are the criteria:
1. Read the following proposal carefully and provide a score from 1 to 10. 
2. Score 6 means slightly higher than the boardline, 5 is slightly lower than the boardline.
Write the score in the {BOX}.
**idea**:

"""
 34 | 
 35 | def parse_score_from_text(text: str) -> float:
 36 |     match = re.search(r'\\boxed\{(\d*\.?\d*)\}', text)
 37 |     if match:
 38 |         try:
 39 |             score = float(match.group(1))
 40 |             if 0 <= score <= 10:
 41 |                 return score
 42 |         except ValueError:
 43 |             pass
 44 |     return -1.0  
 45 | 
 46 | 
 47 | def score_abstracts_with_vllm(data: List[Dict], model_name: str) -> List[Dict]:
 48 | 
 49 |     llm = LLM(model=model_name, gpu_memory_utilization=0.95)
 50 |     tokenizer = AutoTokenizer.from_pretrained(model_name)
 51 | 
 52 |     prompts = [
 53 |         tokenizer.apply_chat_template(
 54 |             [
 55 |                 {
 56 |                     "role": "system",
 57 |                     "content": SYSTEM_PROMPT
 58 |                 },
 59 |                 {
 60 |                     "role": "user",
 61 |                     "content": SCORING_PROMPT + item["title"] + "\n" + item["abstract"]
 62 |                 }
 63 |             ],
 64 |             tokenize=False,
 65 |             add_generation_prompt=True,
 66 |             enable_thinking=False  
 67 |         )
 68 |         for item in data
 69 |     ]
 70 |     
 71 |     sampling_params = SamplingParams(
 72 |         temperature=0,
 73 |         top_p=1.0,
 74 |         max_tokens=1000, 
 75 |     )
 76 | 
 77 |     outputs = llm.generate(prompts, sampling_params)
 78 | 
 79 |     print(prompts[0])
 80 |     print(outputs[0].outputs[0].text)
 81 | 
 82 |     results = []
 83 |     for output, item in zip(outputs, data):
 84 |         output_text = output.outputs[0].text.strip()
 85 |         score = parse_score_from_text(output_text)
 86 |         results.append({
 87 |             "score": score,
 88 |             "evaluation": output_text,
 89 |             "abstract": item["abstract"],
 90 |             "avg_rating": item["avg_rating"]
 91 |         })
 92 | 
 93 |     return results
 94 | 
 95 | def load_processed_ids(jsonl_file: str) -> set:
 96 |     """
 97 |     Loads the IDs of already processed abstracts from the JSONL file.
 98 | 
 99 |     Args:
100 |         jsonl_file: Path to the JSONL file.
101 | 
102 |     Returns:
103 |         Set of processed abstract titles.
104 |     """
105 |     processed_ids = set()
106 |     if os.path.exists(jsonl_file):
107 |         with open(jsonl_file, 'r', encoding='utf-8') as f:
108 |             for line in f:
109 |                 try:
110 |                     data = json.loads(line.strip())
111 |                     if data.get('score', -1.0) != -1.0:  # Only include valid scores
112 |                         processed_ids.add(data['title'])
113 |                 except json.JSONDecodeError:
114 |                     print(f"Warning: Skipping invalid JSON line in {jsonl_file}")
115 |     return processed_ids
116 | 
def write_result_to_jsonl(result: Dict, jsonl_file: str):
    """
    Append one result record to the JSONL file, unless its score is the
    -1.0 "invalid" sentinel, in which case nothing is written.

    Args:
        result: Record to serialise; must contain a 'score' key.
        jsonl_file: Path of the JSONL file to append to.
    """
    if result['score'] == -1.0:
        return
    with open(jsonl_file, 'a', encoding='utf-8') as sink:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        sink.write(f"{json.dumps(result, ensure_ascii=False)}\n")
128 | 
def score_abstracts_with_api(data: List[Dict],
                             jsonl_file: str,
                             model_name: str = "deepseek-chat",
                             api_key: str = "sk-2c3f1f58031b4b86afdb6a8192ea02e2",
                             base_url: str = "https://api.deepseek.com",
                             max_retries: int = 50,
                             retry_delay: int = 5
                             ) -> List[Dict]:
    """
    Scores research proposals through an OpenAI-compatible chat API, writing
    valid results to a JSONL file incrementally.  Abstracts whose title already
    has a valid score in the JSONL file are skipped, so an interrupted run can
    be resumed.

    Args:
        data: List of dictionaries containing 'title', 'abstract', and 'avg_rating'.
        jsonl_file: Path to the JSONL file for storing results.
        model_name: Model to request (default: 'deepseek-chat').
        api_key: API key passed to the client.
        base_url: Base URL of the API endpoint.
        max_retries: Maximum number of attempts per abstract (default: 50).
        retry_delay: Seconds to wait between attempts (default: 5).

    Returns:
        List of dictionaries with 'title', 'score', 'evaluation', 'abstract', and
        'avg_rating': first the items scored in this call (score -1.0 when every
        attempt failed), then the previously stored records read back from the file.
    """
    # SECURITY NOTE(review): the default `api_key` above is a credential committed
    # to source control.  It is kept only so existing callers keep working; it
    # should be revoked and callers should pass a key loaded from the environment.
    client = OpenAI(api_key=api_key, base_url=base_url)

    # Load already processed abstracts (with valid scores) to skip them
    processed_ids = load_processed_ids(jsonl_file)
    results = []

    data_to_process = [item for item in data if item['title'] not in processed_ids]
    print(f"Total abstracts: {len(data)}, To process: {len(data_to_process)}, Already processed: {len(processed_ids)}")

    for item in data_to_process:
        prompt = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": SCORING_PROMPT + item["title"] + "\n" + item["abstract"]}
        ]
        retries = 0
        score = -1.0
        output_text = ""

        # Keep retrying until a valid score is obtained or max_retries is reached
        while score == -1.0 and retries < max_retries:
            try:
                response = client.chat.completions.create(
                    model=model_name,
                    messages=prompt,
                    temperature=0,
                    max_tokens=1000,
                    top_p=1.0
                )
                # The message content may be None; treat that as an empty answer.
                output_text = (response.choices[0].message.content or "").strip()
            except Exception as exc:
                # A failed request counts as one failed attempt instead of
                # aborting the whole run; results so far are already on disk.
                print(f"API error for abstract: {item['title']}: {exc}")
                output_text = ""

            score = parse_score_from_text(output_text)

            if score == -1.0:
                retries += 1
                print(f"Invalid score for abstract: {item['title']}, Retry {retries}/{max_retries}")
                if retries < max_retries:
                    time.sleep(retry_delay)  # Wait before retrying
            else:
                print(f"Prompt: {prompt}")
                print(f"Output: {output_text}")

        result = {
            "title": item["title"],
            "score": score,
            "evaluation": output_text,
            "abstract": item["abstract"],
            "avg_rating": item["avg_rating"]
        }

        # write_result_to_jsonl only persists records with a valid score
        write_result_to_jsonl(result, jsonl_file)
        results.append(result)

        if score == -1.0:
            print(f"Failed to get valid score for abstract: {item['title']} after {max_retries} retries")

    # Load previously processed results from JSONL to include in return
    if processed_ids:
        with open(jsonl_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    stored = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Warning: Skipping invalid JSON line in {jsonl_file}")
                    continue
                if isinstance(stored, dict) and stored.get('title') in processed_ids:
                    results.append(stored)

    return results
225 | 
if __name__ == '__main__':
    # Manual smoke test: score a single sample abstract with a local vLLM checkpoint.
    abst = """
Test-time scaling is a promising new approach to language modeling that uses extra test-time compute to improve performance. Recently, OpenAI's o1 model showed this capability but did not publicly share its methodology, leading to many replication efforts. We seek the simplest approach to achieve test-time scaling and strong reasoning performance. First, we curate a small dataset s1K of 1,000 questions paired with reasoning traces relying on three criteria we validate through ablations: difficulty, diversity, and quality. Second, we develop budget forcing to control test-time compute by forcefully terminating the model's thinking process or lengthening it by appending "Wait" multiple times to the model's generation when it tries to end. This can lead the model to double-check its answer, often fixing incorrect reasoning steps. After supervised finetuning the Qwen2.5-32B-Instruct language model on s1K and equipping it with budget forcing, our model s1-32B exceeds o1-preview on competition math questions by up to 27% (MATH and AIME24). Further, scaling s1-32B with budget forcing allows extrapolating beyond its performance without test-time intervention: from 50% to 57% on AIME24. """
    title = "s1: Simple test-time scaling"
    # avg_rating is a placeholder; the vLLM scorer only copies it into the result.
    data = [{"title": title, "abstract": abst, "avg_rating": 0}]
    # NOTE(review): the checkpoint path below is an absolute path on one
    # particular machine; it must be adjusted to run this elsewhere.
    print(score_abstracts_with_vllm(data, '/data/zhuotaodeng/yzj/alpha-research/model/qwen25_grm_iclr/checkpoint-240'))
    # print(score_abstracts_with_vllm(data, '/data/zhuotaodeng/yzj/download_from_modelscope/Qwen/Qwen3-8B'))


--------------------------------------------------------------------------------
/benchmark/heilbronn_in_the_unit_square/initial_program.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import os
  3 | import time
  4 | import argparse
  5 | from itertools import combinations
  6 | 
  7 | # =========================
  8 | # 基本几何与评估
  9 | # =========================
 10 | 
 11 | def triangle_area(a, b, c) -> float:
 12 |     # 三角形面积 = |(b-a) x (c-a)|/2
 13 |     return abs((b[0]-a[0])*(c[1]-a[1]) - (b[1]-a[1])*(c[0]-a[0])) * 0.5
 14 | 
 15 | _triplets_cache = {}
 16 | 
 17 | def _precompute_triplets(n: int):
 18 |     from itertools import combinations as _comb
 19 |     combs = np.array(list(_comb(range(n), 3)), dtype=int)
 20 |     I, J, K = combs[:, 0], combs[:, 1], combs[:, 2]
 21 |     return I, J, K, combs
 22 | 
 23 | def find_min_triangle(P: np.ndarray):
 24 |     """向量化求最小三角形：返回 (i,j,k,min_area)。若 n<3 返回 (-1,-1,-1,0.0)。"""
 25 |     n = len(P)
 26 |     if n < 3:
 27 |         return -1, -1, -1, 0.0
 28 |     if n not in _triplets_cache:
 29 |         _triplets_cache[n] = _precompute_triplets(n)
 30 |     I, J, K, combs = _triplets_cache[n]
 31 |     A = P[I]
 32 |     B = P[J]
 33 |     C = P[K]
 34 |     area = np.abs((B[:,0]-A[:,0])*(C[:,1]-A[:,1]) - (B[:,1]-A[:,1])*(C[:,0]-A[:,0])) * 0.5
 35 |     if area.size == 0:
 36 |         return -1, -1, -1, 0.0
 37 |     idx = int(np.argmin(area))
 38 |     i, j, k = combs[idx]
 39 |     return int(i), int(j), int(k), float(area[idx])
 40 | 
 41 | def min_triangle_area(P: np.ndarray) -> float:
 42 |     return find_min_triangle(P)[3]
 43 | 
 44 | def scaled_min_area(P: np.ndarray) -> float:
 45 |     n = float(len(P))
 46 |     exp = (8.0/7.0) + (1.0/2000.0)
 47 |     return (n ** exp) * min_triangle_area(P)
 48 | 
 49 | # =========================
 50 | # 初始化：多起点
 51 | # =========================
 52 | 
 53 | def jittered_grid_points(n, seed=0):
 54 |     rng = np.random.default_rng(seed)
 55 |     m = int(round(np.sqrt(n))); m = max(m, 2)
 56 |     xs = (np.arange(m) + 0.5) / m
 57 |     ys = (np.arange(m) + 0.5) / m
 58 |     X, Y = np.meshgrid(xs, ys)
 59 |     P = np.c_[X.ravel(), Y.ravel()]
 60 |     jitter = 0.12 / m
 61 |     P += rng.uniform(-jitter, jitter, size=P.shape)
 62 |     P = np.clip(P[:n], 0.0, 1.0)
 63 |     return P
 64 | 
 65 | def hex_lattice_points(n, seed=0):
 66 |     rng = np.random.default_rng(seed)
 67 |     a = 1.0 / np.sqrt(n)
 68 |     pts = []
 69 |     y = a/2
 70 |     row = 0
 71 |     while y < 1.0:
 72 |         x0 = (a/2) if (row % 2 == 1) else a
 73 |         x = x0
 74 |         while x < 1.0:
 75 |             pts.append([x, y])
 76 |             x += a
 77 |         y += np.sqrt(3)/2 * a
 78 |         row += 1
 79 |     P = np.array(pts, dtype=float)
 80 |     if len(P) < n:
 81 |         extra = rng.uniform(0, 1, size=(n - len(P), 2))
 82 |         P = np.vstack([P, extra])
 83 |     P = P[:n]
 84 |     P += rng.uniform(-0.08*a, 0.08*a, size=P.shape)
 85 |     P = np.clip(P, 0.0, 1.0)
 86 |     return P
 87 | 
def bridson_poisson_disk(n, r=None, k=30, seed=0):
    """
    Approximately generate at least n Poisson-disk points in the unit square,
    then subsample them uniformly (without replacement) down to n points.
    If generation stalls with fewer than n points, the rest is filled with
    uniform random points.

    n: number of points to return.
    r: target minimum spacing, ~ c / sqrt(n); defaults to 0.6 / sqrt(n).
    k: number of candidate positions tried around an active point before
       that point is retired.
    seed: seed for the NumPy random generator.
    """
    rng = np.random.default_rng(seed)
    if r is None:
        r = 0.6 / np.sqrt(n)  # slightly conservative spacing
    # Background grid used to limit distance checks to nearby cells;
    # each cell stores the index of a point in `pts`, or -1 when empty.
    cell_size = r / np.sqrt(2)
    grid_w = int(np.ceil(1.0 / cell_size))
    grid_h = int(np.ceil(1.0 / cell_size))
    grid = -np.ones((grid_h, grid_w), dtype=int)

    def grid_coords(pt):
        # Returns (row, column) of the cell containing pt.
        # NOTE(review): a coordinate of exactly 1.0 could map to an index equal
        # to the grid size when 1 / cell_size is an integer -- confirm whether
        # that can occur in practice.
        return int(pt[1] / cell_size), int(pt[0] / cell_size)

    def in_neighborhood(pt):
        # True when an already accepted point lies closer than r to pt;
        # only the surrounding 5x5 block of cells is inspected.
        gy, gx = grid_coords(pt)
        for yy in range(max(gy-2,0), min(gy+3, grid_h)):
            for xx in range(max(gx-2,0), min(gx+3, grid_w)):
                j = grid[yy, xx]
                if j >= 0:
                    if np.linalg.norm(pts[j] - pt) < r:
                        return True
        return False

    pts = []
    active = []

    # Initial point
    p0 = rng.uniform(0, 1, size=2)
    pts.append(p0); active.append(0)
    gy, gx = grid_coords(p0); grid[gy, gx] = 0

    # Grow the sample until no active points remain or max(2n, n + 10) points exist.
    while active and len(pts) < max(n*2, n+10):
        idx = rng.choice(active)
        base = pts[idx]
        found = False
        for _ in range(k):
            # Candidate at a random angle, at distance in [r, 2r) from the base point.
            rad = rng.uniform(r, 2*r)
            ang = rng.uniform(0, 2*np.pi)
            cand = base + rad * np.array([np.cos(ang), np.sin(ang)])
            if not (0 <= cand[0] <= 1 and 0 <= cand[1] <= 1):
                continue
            if not in_neighborhood(cand):
                pts.append(cand)
                gy, gx = grid_coords(cand); grid[gy, gx] = len(pts)-1
                active.append(len(pts)-1)
                found = True
                break
        if not found:
            # No valid candidate after k tries: retire this point.
            active.remove(idx)

    pts = np.array(pts)
    if len(pts) >= n:
        # Enough points: keep a uniform random subset of size n.
        idx = rng.choice(len(pts), size=n, replace=False)
        pts = pts[idx]
    else:
        # Too few points: pad with uniform random points (no spacing guarantee).
        extra = rng.uniform(0,1,size=(n-len(pts),2))
        pts = np.vstack([pts, extra])
    return pts
149 | 
150 | # =========================
151 | # 定向局部搜索（增大最小三角形）
152 | # =========================
153 | 
def normalize(v):
    """Return v scaled to unit Euclidean length.

    Vectors whose norm is at most 1e-12 are returned unchanged (the same
    object) to avoid dividing by a near-zero length.
    """
    length = np.linalg.norm(v)
    if length > 1e-12:
        return v / length
    return v
157 | 
def bump_point(P, idx, step, rng):
    """Return a copy of P in which point idx has received a small perturbation.

    The displacement is a uniform random jitter in [-step, step] per
    coordinate plus a slight pull toward the centre (0.5, 0.5); the moved
    point is clipped to [0, 1]^2.  P itself is not modified.
    """
    moved = P.copy()
    jitter = rng.uniform(-step, step, size=2)
    # Slight inward push, intended to reduce thin triangles hugging the border.
    inward = 0.15 * step * (0.5 - P[idx])
    target = P[idx] + jitter + inward
    moved[idx] = np.clip(target, 0.0, 1.0)
    return moved
166 | 
def bump_min_triangle_directed(P, step, rng):
    """
    Move the three vertices of the current smallest triangle, preferring the
    geometric direction that enlarges its area:
    - for vertex a, moving along the normal of the opposite edge (b, c)
      changes the triangle's area;
    - when the directed move does not help, a small random perturbation is
      tried instead, to avoid getting stuck.

    Returns the best configuration found (a new array), or P itself when it
    has fewer than three points.
    """
    i, j, k, _ = find_min_triangle(P)
    if i < 0:
        return P
    # Vertex positions are taken from the input P; they are not refreshed when
    # an earlier vertex of the same triangle is moved in the loop below.
    a, b, c = P[i], P[j], P[k]

    def move_along_normal(P, idx, other1, other2):
        # Try moving point idx by `step` along both normals of the edge
        # (other1, other2); return the better configuration and its min area.
        q = P.copy()
        base = other2 - other1
        # Normal perpendicular to base in 2D (both orientations are probed)
        n1 = normalize(np.array([ base[1], -base[0] ]))
        n2 = -n1
        cand1 = np.clip(P[idx] + step*n1, 0.0, 1.0)
        cand2 = np.clip(P[idx] + step*n2, 0.0, 1.0)
        # Pick the direction that yields the larger global min_area
        q1 = q.copy(); q1[idx] = cand1
        q2 = q.copy(); q2[idx] = cand2
        a1 = min_triangle_area(q1); a2 = min_triangle_area(q2)
        if a1 >= a2:
            return q1, a1
        else:
            return q2, a2

    # Try moving i, j, k in turn, keeping each move only if it improves the best area
    bestP = P.copy(); bestA = min_triangle_area(P)
    for (idx, o1, o2) in [(i, b, c), (j, c, a), (k, a, b)]:
        q, area_dir = move_along_normal(bestP, idx, o1, o2)
        if area_dir > bestA + 1e-15:
            bestP, bestA = q, area_dir
        else:
            # Directed move gave no improvement: fall back to a small random perturbation
            q = bump_point(bestP, idx, 0.6*step, rng)
            a_rand = min_triangle_area(q)
            if a_rand > bestA + 1e-15:
                bestP, bestA = q, a_rand
    return bestP
208 | 
def project_min_distance(P, dmin=1e-3, iters=1):
    """Soft constraint: nudge apart points that are closer than dmin.

    For `iters` sweeps, every point that has neighbours within dmin (at a
    non-zero distance) is shifted by 1e-3 times half the sum of the unit
    vectors pointing away from those neighbours, then clipped to [0, 1].
    Points are updated in place within a sweep, so later points see earlier
    moves.  Returns a new array; P is left untouched.
    """
    Q = P.copy()
    for _sweep in range(iters):
        for i in range(Q.shape[0]):
            offsets = Q - Q[i]
            gaps = np.linalg.norm(offsets, axis=1)
            too_close = (gaps < dmin) & (gaps > 0)
            if not too_close.any():
                continue
            unit_push = -offsets[too_close] / np.maximum(gaps[too_close][:, None], 1e-12)
            shift = 0.5 * unit_push.sum(axis=0)
            Q[i] = np.clip(Q[i] + 1e-3 * shift, 0.0, 1.0)
    return Q
222 | 
def improve(P0, iters=6000, step0=0.05, seed=0, patience=800, time_limit=None):
    """
    Greedy directed local search that tries to enlarge the smallest triangle.

    - Each iteration moves the vertices of the current smallest triangle;
    - every 30th iteration one random point is additionally perturbed, and
      every 50th iteration a soft minimum-distance separation is applied;
    - only strictly improving configurations are accepted;
    - the step size decays by a factor 0.7 every 400 iterations (floor 5e-4).

    The search stops after `iters` iterations, after `patience` consecutive
    non-improving iterations, or once `time_limit` seconds have elapsed.

    Returns:
        (best_points, best_min_area)
    """
    rng = np.random.default_rng(seed)
    current = np.clip(P0.copy(), 0.0, 1.0)
    best_points = current.copy()
    best_area = min_triangle_area(current)
    stale = 0
    started = time.time()
    step = step0

    for t in range(1, iters + 1):
        out_of_time = time_limit is not None and (time.time() - started) > time_limit
        if out_of_time:
            break

        trial = bump_min_triangle_directed(current, step, rng)
        # Occasionally perturb an arbitrary point to avoid local traps.
        if t % 30 == 0:
            victim = rng.integers(len(current))
            trial = bump_point(trial, victim, 0.5 * step, rng)
        # Soft separation so points do not get too close.
        if t % 50 == 0:
            trial = project_min_distance(trial, dmin=5e-3, iters=1)

        trial_area = min_triangle_area(trial)
        if trial_area > best_area + 1e-15:
            current = trial
            best_points, best_area = trial.copy(), trial_area
            stale = 0
        else:
            # Conservative: worse configurations are never accepted.
            stale += 1

        if stale >= patience:
            break

        # Step-size decay
        if t % 400 == 0:
            step = max(step * 0.7, 5e-4)

    return best_points, best_area
267 | 
268 | # =========================
269 | # 主流程：多起点 + 精修
270 | # =========================
271 | 
def multi_start_optimize(n=16, seeds=(42, 43, 44), iters=6000, step0=0.05, time_limit=None):
    """
    Multi-start search: build Hex / Grid / Poisson starting configurations for
    every seed, run a short coarse search on each, then refine the three best
    coarse results with a smaller step and return the best refined one.

    Args:
        n: Number of points.
        seeds: Seeds used to build the starting configurations.
        iters: Iteration budget; the coarse phase uses 25% of it (at least
            200) and the refinement phase 60% of it (at least 400) per run.
        step0: Initial step size of the coarse phase.
        time_limit: Optional overall wall-clock budget in seconds.

    Returns:
        (best_points, best_min_area).  When the time limit expires before any
        refinement run completes, the best coarse result (or, failing that,
        the first starting configuration) is returned instead of (None, -1.0).
        (None, -1.0) is only returned when `seeds` is empty.
    """
    cands = []
    for s in seeds:
        cands.append(hex_lattice_points(n, seed=s))
        cands.append(jittered_grid_points(n, seed=1000+s))
        cands.append(bridson_poisson_disk(n, seed=2000+s))
    bestP = None; bestA = -1.0
    t0 = time.time()
    # Coarse search: fewer iterations, quick screening
    coarse_iters = max(200, int(0.25 * iters))
    coarse_results = []
    for P0 in cands:
        remaining = None
        if time_limit is not None:
            elapsed = time.time() - t0
            remaining = max(0.0, time_limit - elapsed)
            if remaining <= 0:
                break
        P1, A1 = improve(P0, iters=coarse_iters, step0=step0, seed=12345, time_limit=remaining)
        coarse_results.append((A1, P1))

    if coarse_results:
        # Sort by area only; the key avoids comparing the point arrays.
        coarse_results.sort(key=lambda x: x[0], reverse=True)
        top_list = [P for (_, P) in coarse_results[:3]]
    else:
        top_list = cands[:1]

    # Refinement: smaller step size
    for idx, P0 in enumerate(top_list):
        remaining = None
        if time_limit is not None:
            elapsed = time.time() - t0
            remaining = max(0.0, time_limit - elapsed)
            if remaining <= 0:
                break
        P2, A2 = improve(P0, iters=max(400, int(0.6 * iters)), step0=0.02, seed=999+idx, time_limit=remaining)
        if A2 > bestA:
            bestP, bestA = P2, A2

    # The time budget ran out before any refinement finished: fall back to the
    # best configuration seen so far rather than returning None.
    if bestP is None:
        if coarse_results:
            bestA, bestP = coarse_results[0]
        elif cands:
            bestP = np.clip(cands[0], 0.0, 1.0)
            bestA = min_triangle_area(bestP)
    return bestP, bestA
314 | 
315 | # =========================
316 | # 入口：生成 points 并保存
317 | # =========================
318 | 
def main(argv=None):
    """Optimise a 16-point configuration and return it.

    Args:
        argv: Optional list of command-line arguments (without the program
            name).  ``None`` makes argparse read ``sys.argv[1:]``; pass ``[]``
            to run with the built-in defaults regardless of how the host
            process was started.

    Returns:
        Array with the best point configuration found.

    Raises:
        RuntimeError: If the optimiser produced no configuration.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--iters", type=int, default=2000)
    parser.add_argument("--step0", type=float, default=0.04)
    parser.add_argument("--seeds", type=str, default="7,11,19")
    parser.add_argument("--time-limit", type=float, default=None)
    args = parser.parse_args(argv)

    n = 16
    # An empty --seeds value falls back to the default seeds.
    seeds = tuple(int(s.strip()) for s in args.seeds.split(",") if s.strip()) or (7, 11, 19)
    bestP, bestA = multi_start_optimize(n=n, seeds=seeds, iters=args.iters, step0=args.step0, time_limit=args.time_limit)
    if bestP is None:
        raise RuntimeError("optimisation produced no point configuration (time limit too small?)")
    smin = scaled_min_area(bestP)
    print(f"n={n}, points={len(bestP)}")
    print(f"min_area = {bestA:.10f}")
    print(f"scaled_min_area = {smin:.10f}")
    return bestP

# Entry point: generate the points and save them next to this file.
if __name__ == "__main__":
    points = main()
    out_path = os.path.join(os.path.dirname(__file__), "points.npy")
    np.save(out_path, points)
    print(f"Saved points to {out_path}")

# Compatibility with external evaluators that import this module and read
# `points`: compute it at import time when it has not been set above.
try:
    points  # type: ignore[name-defined]
except NameError:
    # Use the defaults instead of the importing process's sys.argv, whose
    # unrelated arguments would make argparse exit.
    points = main([])
347 | 


--------------------------------------------------------------------------------
/evolve_agent/reward_model.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import os
  3 | import json
  4 | import time
  5 | from typing import List, Dict, Optional, Any
  6 | from pathlib import Path
  7 | 
  8 | import asyncio
  9 | # import aiofiles
 10 | import numpy as np
 11 | import pandas as pd
 12 | from vllm import LLM, SamplingParams
 13 | from openai import AsyncOpenAI
 14 | from transformers import AutoTokenizer
 15 | from datasets import load_dataset
 16 | 
 17 | from evolve_agent.config import RewardModelConfig
 18 | 
 19 | 
 20 | class RewardModel:
 21 |     """
 22 |     A class to score research abstracts or proposals based on clarity, novelty, technical rigor,
 23 |     and potential impact using either a local vLLM model or an external API model.
 24 |     """
 25 |     
 26 |     BOX = r"\boxed{}"
 27 |     SYSTEM_PROMPT = "You are an expert reviewer tasked with evaluating the quality of a research proposal."
 28 |     SCORING_PROMPT = f"""
 29 | Your goal is to assign a score between 1 and 10 based on the proposal's clarity, novelty, technical rigor, and potential impact. Here are the criteria:
 30 | 1. Read the following proposal carefully and provide a score from 1 to 10. 
 31 | 2. Score 6 means slightly higher than the borderline, 5 is slightly lower than the borderline.
 32 | Write the score in the {BOX}.
 33 | **idea**:
 34 | """
 35 | 
 36 |     def __init__(self, config: RewardModelConfig):
 37 |         """
 38 |         Initialize the RewardModel.
 39 | 
 40 |         Args:
 41 |             config (RewardModelConfig): Configuration object containing model_type, model_name, api_key, base_url,
 42 |                                        jsonl_file, max_retries, retry_delay, temperature, top_p, max_tokens.
 43 |         """
 44 |         self.config = config
 45 | 
 46 |         if self.config.model_type == "vllm":
 47 |             self.llm = LLM(model=self.config.model_name, gpu_memory_utilization=0.95)
 48 |             self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
 49 |         elif self.config.model_type == "api":
 50 |             if not self.config.api_key or not self.config.base_url:
 51 |                 raise ValueError("API key and base URL must be provided for API model type.")
 52 |             self.client = AsyncOpenAI(api_key=self.config.api_key, base_url=self.config.base_url)
 53 |         else:
 54 |             raise ValueError("model_type must be 'vllm' or 'api'.")
 55 | 
 56 |         # Ensure the directory for jsonl_file exists
 57 |         os.makedirs(os.path.dirname(self.config.jsonl_file) or ".", exist_ok=True)
 58 | 
 59 |     def parse_score_from_text(self, text: str) -> float:
 60 |         """
 61 |         Parse the score from the model's output text.
 62 | 
 63 |         Args:
 64 |             text (str): Model output containing the score in \boxed{} format.
 65 | 
 66 |         Returns:
 67 |             float: Parsed score between 0 and 10, or -1.0 if invalid.
 68 |         """
 69 |         match = re.search(r'\\boxed\{(\d*\.?\d*)\}', text)
 70 |         if match:
 71 |             try:
 72 |                 score = float(match.group(1))
 73 |                 if 0 <= score <= 10:
 74 |                     return score
 75 |             except ValueError:
 76 |                 pass
 77 |         return -1.0
 78 | 
 79 |     async def load_processed_ids(self) -> set:
 80 |         """
 81 |         Load titles of already processed abstracts from the JSONL file.
 82 | 
 83 |         Returns:
 84 |             set: Set of processed abstract titles with valid scores.
 85 |         """
 86 |         processed_ids = set()
 87 |         # if os.path.exists(self.config.jsonl_file):
 88 |         #     async with aiofiles.open(self.config.jsonl_file, 'r', encoding='utf-8') as f:
 89 |         #         async for line in f:
 90 |         #             try:
 91 |         #                 data = json.loads(line.strip())
 92 |         #                 if data.get('score', -1.0) != -1.0:
 93 |         #                     processed_ids.add(data['title'])
 94 |         #             except json.JSONDecodeError:
 95 |         #                 print(f"Warning: Skipping invalid JSON line in {self.config.jsonl_file}")
 96 |         return processed_ids
 97 | 
    async def write_result_to_jsonl(self, result: Dict):
        """
        Placeholder for persisting a single result to the JSONL file.

        The method is currently a no-op: the aiofiles-based implementation
        (which appended the record only when its score was not -1.0) is kept
        below as a comment, so nothing is written to disk.

        Args:
            result (Dict): Result dictionary containing title, score, evaluation, abstract, and gt_score.
        """
        # Disabled implementation, kept for reference:
        # if result['score'] != -1.0:
        #     async with aiofiles.open(self.config.jsonl_file, 'a', encoding='utf-8') as f:
        #         await f.write(json.dumps(result, ensure_ascii=False) + '\n')
108 | 
109 |     async def score_with_vllm(self, data: List[Dict]) -> List[Dict]:
110 |         """
111 |         Score abstracts using a local vLLM model.
112 | 
113 |         Args:
114 |             data (List[Dict]): List of dictionaries containing 'title', 'abstract', and 'gt_score'.
115 | 
116 |         Returns:
117 |             List[Dict]: List of results with 'title', 'score', 'evaluation', 'abstract', and 'gt_score'.
118 |         """
119 |         prompts = [
120 |             self.tokenizer.apply_chat_template(
121 |                 [
122 |                     {"role": "system", "content": self.SYSTEM_PROMPT},
123 |                     {"role": "user", "content": self.SCORING_PROMPT + item["title"] + "\n" + item["abstract"]}
124 |                 ],
125 |                 tokenize=False,
126 |                 add_generation_prompt=True,
127 |                 enable_thinking=False
128 |             )
129 |             for item in data
130 |         ]
131 | 
132 |         sampling_params = SamplingParams(
133 |             temperature=self.config.temperature,
134 |             top_p=self.config.top_p,
135 |             max_tokens=self.config.max_tokens,
136 |         )
137 | 
138 |         # vLLM is synchronous, so we run it in the default executor
139 |         outputs = await asyncio.get_event_loop().run_in_executor(None, lambda: self.llm.generate(prompts, sampling_params))
140 | 
141 |         results = []
142 |         for output, item in zip(outputs, data):
143 |             output_text = output.outputs[0].text.strip()
144 |             score = self.parse_score_from_text(output_text)
145 |             result = {
146 |                 "title": item["title"],
147 |                 "score": score,
148 |                 "evaluation": output_text,
149 |                 "abstract": item["abstract"],
150 |                 "gt_score": item["gt_score"]
151 |             }
152 |             await self.write_result_to_jsonl(result)
153 |             results.append(result)
154 | 
155 |         return results
156 | 
157 |     async def score_with_api(self, data: List[Dict]) -> List[Dict]:
158 |         """
159 |         Score abstracts using an external API model.
160 | 
161 |         Args:
162 |             data (List[Dict]): List of dictionaries containing 'title', 'abstract', and 'gt_score'.
163 | 
164 |         Returns:
165 |             List[Dict]: List of results with 'title', 'score', 'evaluation', 'abstract', and 'gt_score'.
166 |         """
167 |         processed_ids = await self.load_processed_ids()
168 |         data_to_process = [item for item in data if item['title'] not in processed_ids]
169 |         print(f"Total abstracts: {len(data)}, To process: {len(data_to_process)}, Already processed: {len(processed_ids)}")
170 | 
171 |         prompts = [
172 |             [
173 |                 {"role": "system", "content": self.SYSTEM_PROMPT},
174 |                 {"role": "user", "content": self.SCORING_PROMPT + item["title"] + "\n" + item["abstract"]}
175 |             ]
176 |             for item in data_to_process
177 |         ]
178 | 
179 |         results = []
180 |         for prompt, item in zip(prompts, data_to_process):
181 |             retries = 0
182 |             score = -1.0
183 |             output_text = ""
184 | 
185 |             while score == -1.0 and retries < self.config.max_retries:
186 |                 try:
187 |                     response = await self.client.chat.completions.create(
188 |                         model=self.config.model_name,
189 |                         messages=prompt,
190 |                         temperature=0,  # API uses fixed temperature as per original code
191 |                         max_tokens=1000,  # API uses fixed max_tokens as per original code
192 |                         top_p=1.0  # API uses fixed top_p as per original code
193 |                     )
194 |                     output_text = response.choices[0].message.content.strip()
195 |                     score = self.parse_score_from_text(output_text)
196 |                 except Exception as e:
197 |                     print(f"Error processing {item['title']}: {e}")
198 |                 
199 |                 if score == -1.0:
200 |                     retries += 1
201 |                     print(f"Invalid score for abstract: {item['title']}, Retry {retries}/{self.config.max_retries}")
202 |                     await asyncio.sleep(self.config.retry_delay)
203 | 
204 |             result = {
205 |                 "title": item["title"],
206 |                 "score": score,
207 |                 "gt_score": item["gt_score"],
208 |                 "evaluation": output_text,
209 |                 "abstract": item["abstract"]
210 |             }
211 |             await self.write_result_to_jsonl(result)
212 |             results.append(result)
213 | 
214 |             if score == -1.0:
215 |                 print(f"Failed to get valid score for abstract: {item['title']} after {self.config.max_retries} retries")
216 | 
217 |         # Include previously processed results
218 |         # if processed_ids:
219 |         #     async with aiofiles.open(self.config.jsonl_file, 'r', encoding='utf-8') as f:
220 |         #         async for line in f:
221 |         #             try:
222 |         #                 result = json.loads(line.strip())
223 |         #                 if result['title'] in processed_ids:
224 |         #                     results.append(result)
225 |         #             except json.JSONDecodeError:
226 |         #                 print(f"Warning: Skipping invalid JSON line in {self.config.jsonl_file}")
227 | 
228 |         return results
229 | 
230 |     async def score_research_proposal(self, data: List[Any]) -> List[Dict]:
231 |         """
232 |         Score abstracts using the configured model type.
233 | 
234 |         Args:
235 |             data (List[Dict]): List of dictionaries containing 'title', 'abstract', and 'gt_score'.
236 | 
237 |         Returns:
238 |             List[Dict]: List of results with 'title', 'score', 'evaluation', 'abstract', and 'gt_score'.
239 |         """
240 |         if isinstance(data[0], str):
241 |             data = [{"title": "", "gt_score":0, "abstract": d} for d in data]
242 | 
243 |         if self.config.model_type == "vllm":
244 |             return await self.score_with_vllm(data)
245 |         elif self.config.model_type == "api":
246 |             return await self.score_with_api(data)
247 |         else:
248 |             raise ValueError("Invalid model_type. Must be 'vllm' or 'api'.")
249 | 
if __name__ == "__main__":
    async def test_reward_model():
        """Smoke-test the API-backed reward model on two sample abstracts.

        Credentials are read from the environment so that no secret is stored
        in the source tree:

        - DEEPSEEK_API_KEY (required)
        - DEEPSEEK_BASE_URL (optional, defaults to https://api.deepseek.com)
        """
        import os  # local import: only this script entry point needs it

        # Sample data for testing
        sample_data = [
            {
                "title": "A Novel Approach to Quantum Computing",
                "abstract": "This proposal introduces a new quantum algorithm that enhances computational efficiency by leveraging entangled states in a scalable architecture. The approach is validated through simulations showing a 20% improvement over existing methods.",
                "gt_score": 7.5
            },
            {
                "title": "AI-Driven Climate Modeling",
                "abstract": "We propose an AI-based framework for improving climate predictions using deep learning to integrate heterogeneous environmental data. Preliminary results demonstrate enhanced accuracy in long-term forecasts.",
                "gt_score": 8.0
            }
        ]

        # NOTE: a vLLM run (model_type="vllm" with a local checkpoint path) is
        # intentionally not exercised here because it requires a local model
        # and GPU.

        # Test with API model (requires valid API key and base URL)
        try:
            api_key = os.environ.get("DEEPSEEK_API_KEY")
            if not api_key:
                raise RuntimeError("DEEPSEEK_API_KEY environment variable is not set")
            base_url = os.environ.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com")

            config = RewardModelConfig(
                model_type="api",
                model_name="deepseek-chat",
                api_key=api_key,
                base_url=base_url,
                jsonl_file="api_results.jsonl",
                max_retries=3,
                retry_delay=1
            )

            api_model = RewardModel(config)
            api_results = await api_model.score_research_proposal(sample_data)
            print("API Results:")
            for result in api_results:
                print(f"Title: {result['title']}, Score: {result['score']}, Evaluation: {result['evaluation'][:50]}...")
        except Exception as e:
            # Top-level boundary of a manual smoke test: report and exit cleanly.
            print(f"API test failed: {e}")

    # Run the async test
    asyncio.run(test_reward_model())
308 | 


--------------------------------------------------------------------------------
/benchmark/packing_circles/initial_program.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import random
  3 | from concurrent.futures import ThreadPoolExecutor
  4 | 
  5 | 
  6 | def pack_circles(n, square_size=1.0):
  7 |     """
  8 |     Pack n disjoint circles in a unit square using uniform tiling approach.
  9 |     Returns the sum of radii and list of circles (x, y, r).
 10 |     """
 11 | 
 12 |     def max_circle_radius(x, y, circles, square_size=1.0, skip_idx=None):
 13 |         """
 14 |         Compute the maximum radius for a circle centered at (x, y) that:
 15 |         - Stays within the unit square [0, square_size] × [0, square_size].
 16 |         - Does not overlap with existing circles.
 17 |         skip_idx: if provided, index in circles[] to ignore (self).
 18 |         """
 19 |         # Distance to nearest boundary of the unit square
 20 |         r_max = min(x, y, square_size - x, square_size - y)
 21 |         
 22 |         # Check distance to existing circles, exit early if r_max → 0
 23 |         # early exit if r_max is tiny, and avoid needless sqrt
 24 |         for idx, (cx, cy, cr) in enumerate(circles):
 25 |             if skip_idx == idx:
 26 |                 continue
 27 |             if r_max <= 1e-8:
 28 |                 break
 29 |             dx = x - cx
 30 |             dy = y - cy
 31 |             sep = r_max + cr
 32 |             if dx*dx + dy*dy < sep*sep:
 33 |                 # only compute sqrt when we know we can shrink
 34 |                 dist = math.sqrt(dx*dx + dy*dy)
 35 |                 r_max = min(r_max, dist - cr)
 36 |         return max(r_max, 0.0)
 37 | 
 38 |     def uniform_tiling_circles(n, square_size=1.0):
 39 |         """
 40 |         Uniformly tile the square with circles using optimal grid placement.
 41 |         """
 42 |         if n <= 0:
 43 |             return []
 44 |         
 45 |         circles = []
 46 |         
 47 |         # Calculate optimal grid dimensions
 48 |         # For n circles, find the best grid layout (rows x cols)
 49 |         best_layout = None
 50 |         best_total_radius = 0
 51 |         
 52 |         # Try different grid configurations
 53 |         for rows in range(1, min(n + 1, 20)):
 54 |             cols = math.ceil(n / rows)
 55 |             if cols > 20:  # Limit grid size
 56 |                 continue
 57 |                 
 58 |             # Calculate spacing
 59 |             spacing_x = square_size / (cols + 1)
 60 |             spacing_y = square_size / (rows + 1)
 61 |             
 62 |             # Use the smaller spacing to ensure circles fit
 63 |             min_spacing = min(spacing_x, spacing_y)
 64 |             
 65 |             # Calculate maximum radius for this layout
 66 |             max_radius = min_spacing / 2
 67 |             
 68 |             # Ensure radius doesn't exceed boundaries
 69 |             max_radius = min(max_radius, 
 70 |                            spacing_x / 2 - 1e-6, 
 71 |                            spacing_y / 2 - 1e-6)
 72 |             
 73 |             if max_radius <= 0:
 74 |                 continue
 75 |             
 76 |             # Place circles in uniform grid
 77 |             temp_circles = []
 78 |             count = 0
 79 |             
 80 |             for row in range(rows):
 81 |                 for col in range(cols):
 82 |                     if count >= n:
 83 |                         break
 84 |                     
 85 |                     x = spacing_x * (col + 1)
 86 |                     y = spacing_y * (row + 1)
 87 |                     
 88 |                     # Ensure circle stays within bounds
 89 |                     if (x - max_radius >= 0 and x + max_radius <= square_size and
 90 |                         y - max_radius >= 0 and y + max_radius <= square_size):
 91 |                         
 92 |                         temp_circles.append((x, y, max_radius))
 93 |                         count += 1
 94 |                 
 95 |                 if count >= n:
 96 |                     break
 97 |             
 98 |             # Calculate total radius for this layout
 99 |             total_radius = len(temp_circles) * max_radius
100 |             
101 |             if total_radius > best_total_radius and len(temp_circles) == n:
102 |                 best_total_radius = total_radius
103 |                 best_layout = temp_circles
104 |         
105 |         # If we found a valid layout, return it
106 |         if best_layout:
107 |             return best_layout
108 |         
109 |         # Fallback: use hexagonal packing for better density
110 |         return hexagonal_packing(n, square_size)
111 | 
112 |     def hexagonal_packing(n, square_size=1.0):
113 |         """
114 |         Use hexagonal close packing for better space utilization.
115 |         """
116 |         circles = []
117 |         
118 |         # Estimate number of rows and columns for hexagonal packing
119 |         # Hexagonal packing has rows offset by sqrt(3)/2 * diameter
120 |         
121 |         rows = int(math.sqrt(n * 2 / math.sqrt(3))) + 2
122 |         
123 |         count = 0
124 |         row = 0
125 |         
126 |         while count < n and row < rows:
127 |             # Calculate y position for this row
128 |             y = (row + 0.5) * (square_size / (rows + 1))
129 |             
130 |             # Number of circles in this row
131 |             if row % 2 == 0:
132 |                 cols = int(math.sqrt(n)) + 1
133 |             else:
134 |                 cols = int(math.sqrt(n))
135 |             
136 |             spacing_x = square_size / (cols + 1)
137 |             
138 |             for col in range(cols):
139 |                 if count >= n:
140 |                     break
141 |                 
142 |                 if row % 2 == 0:
143 |                     x = spacing_x * (col + 1)
144 |                 else:
145 |                     x = spacing_x * (col + 1) + spacing_x / 2
146 |                 
147 |                 # Calculate maximum radius for this position
148 |                 r = max_circle_radius(x, y, circles, square_size)
149 |                 
150 |                 if r > 0:
151 |                     circles.append((x, y, r))
152 |                     count += 1
153 |             
154 |             row += 1
155 |         
156 |         return circles
157 | 
158 |     def optimize_placement(n, square_size=1.0):
159 |         """
160 |         Optimize circle placement using uniform tiling with radius maximization.
161 |         """
162 |         circles = []
163 |         
164 |         # First, try hexagonal packing for high initial density
165 |         hex_circles = hexagonal_packing(n, square_size)
166 |         if len(hex_circles) == n:
167 |             # Ensure maximum radii for hex layout with stronger refinement
168 |             hex_refined = refine_circles(hex_circles, square_size, iterations=20)
169 |             return hex_refined
170 |         
171 |         # Fallback to uniform grid placement
172 |         grid_circles = uniform_tiling_circles(n, square_size)
173 |         if len(grid_circles) == n:
174 |             return grid_circles
175 |         
176 |         # If uniform tiling didn't work perfectly, use adaptive approach
177 |         # Calculate optimal radius based on density
178 |         area_per_circle = (square_size * square_size) / n
179 |         estimated_radius = math.sqrt(area_per_circle / math.pi) * 0.9  # Conservative estimate
180 |         
181 |         # Create grid with optimal spacing
182 |         spacing = estimated_radius * 2.1  # Include gap
183 |         
184 |         cols = int(square_size / spacing)
185 |         rows = int(square_size / spacing)
186 |         
187 |         actual_spacing_x = square_size / (cols + 1)
188 |         actual_spacing_y = square_size / (rows + 1)
189 |         
190 |         count = 0
191 |         for row in range(rows):
192 |             for col in range(cols):
193 |                 if count >= n:
194 |                     break
195 |                 
196 |                 x = actual_spacing_x * (col + 1)
197 |                 y = actual_spacing_y * (row + 1)
198 |                 
199 |                 # Calculate maximum possible radius
200 |                 r = max_circle_radius(x, y, circles, square_size)
201 |                 
202 |                 if r > 0:
203 |                     circles.append((x, y, r))
204 |                     count += 1
205 |             
206 |             if count >= n:
207 |                 break
208 |         
209 |         # If we still need more circles, use remaining space
210 |         remaining = n - len(circles)
211 |         if remaining > 0:
212 |             # Place remaining circles in remaining spaces
213 |             for i in range(remaining):
214 |                 # Try different positions systematically
215 |                 best_r = 0
216 |                 best_pos = (0.5, 0.5)
217 |                 
218 |                 # Fine grid search (increased resolution)
219 |                 grid_points = 100
220 |                 for gx in range(1, grid_points):
221 |                     for gy in range(1, grid_points):
222 |                         x = gx / grid_points
223 |                         y = gy / grid_points
224 |                         
225 |                         r = max_circle_radius(x, y, circles, square_size)
226 |                         if r > best_r:
227 |                             best_r = r
228 |                             best_pos = (x, y)
229 |                 
230 |                 if best_r > 0:
231 |                     circles.append((best_pos[0], best_pos[1], best_r))
232 |         
233 |         return circles
234 | 
235 |     def refine_circles(circles, square_size, iterations=80, perturb_interval=3):
236 |         """
237 |         Iteratively grow each circle to its maximum radius under non-overlap constraints.
238 |         Includes randomized update order, periodic micro-perturbation to escape
239 |         local minima, and a final local-center-perturbation pass for densification.
240 |         """
241 |         for it in range(iterations):
242 |             # randomize update order to avoid sweep-order bias
243 |             indices = list(range(len(circles)))
244 |             random.shuffle(indices)
245 |             for i in indices:
246 |                 x, y, _ = circles[i]
247 |                 # Compute maximal feasible radius here, skipping self
248 |                 r = max_circle_radius(x, y, circles, square_size, skip_idx=i)
249 |                 circles[i] = (x, y, r)
250 |             # Periodic micro-perturbation: jiggle a few circles
251 |             if it % perturb_interval == 0 and len(circles) > 0:
252 |                 subset = random.sample(indices, min(5, len(circles)))
253 |                 for j in subset:
254 |                     x0, y0, r0 = circles[j]
255 |                     dx = random.uniform(-0.03, 0.03)
256 |                     dy = random.uniform(-0.03, 0.03)
257 |                     nx = min(max(x0 + dx, 0), square_size)
258 |                     ny = min(max(y0 + dy, 0), square_size)
259 |                     # Compute maximal radius skipping self
260 |                     nr = max_circle_radius(nx, ny, circles, square_size, skip_idx=j)
261 |                     if nr > r0:
262 |                         circles[j] = (nx, ny, nr)
263 |         # Full local center-perturbation phase for final densification
264 |         for i in range(len(circles)):
265 |             x, y, r = circles[i]
266 |             best_x, best_y, best_r = x, y, r
267 |             delta = 0.1
268 |             for _ in range(20):
269 |                 dx = random.uniform(-delta, delta)
270 |                 dy = random.uniform(-delta, delta)
271 |                 nx = min(max(x + dx, 0), square_size)
272 |                 ny = min(max(y + dy, 0), square_size)
273 |                 # Compute maximal radius skipping self
274 |                 nr = max_circle_radius(nx, ny, circles, square_size, skip_idx=i)
275 |                 if nr > best_r:
276 |                     best_x, best_y, best_r = nx, ny, nr
277 |                 else:
278 |                     delta *= 0.9
279 |             circles[i] = (best_x, best_y, best_r)
280 |         
281 |         # Physics-inspired soft relaxation to escape persistent overlaps
282 |         for i in range(len(circles)):
283 |             x, y, r = circles[i]
284 |             fx, fy = 0.0, 0.0
285 |             for j, (xj, yj, rj) in enumerate(circles):
286 |                 if i == j:
287 |                     continue
288 |                 dx = x - xj
289 |                 dy = y - yj
290 |                 d = (dx*dx + dy*dy) ** 0.5
291 |                 overlap = (r + rj) - d
292 |                 if overlap > 0 and d > 1e-8:
293 |                     fx += dx / d * overlap
294 |                     fy += dy / d * overlap
295 |             # Nudge the center by 10% of the computed net “repulsive” force
296 |             nx = min(max(x + 0.1 * fx, 0), square_size)
297 |             ny = min(max(y + 0.1 * fy, 0), square_size)
298 |             nr = max_circle_radius(nx, ny, circles, square_size, skip_idx=i)
299 |             circles[i] = (nx, ny, nr)
300 |         return circles
301 | 
302 |     def multi_start_optimize(n, square_size, starts=None):
303 |         """
304 |         Parallel multi-start global → local optimization using ThreadPoolExecutor.
305 |         Number of starts adapts to problem size: max(100, 10*n).
306 |         """
307 |         if starts is None:
308 |             if n <= 50:
309 |                 starts = max(200, n * 20)
310 |             else:
311 |                 starts = max(100, n * 10)
312 |         # precompute hexagonal‐packing baseline
313 |         hex_circ = hexagonal_packing(n, square_size)
314 |         hex_sum = sum(r for _, _, r in hex_circ)
315 |         best_conf = None
316 |         best_sum = 0.0
317 | 
318 |         # single trial: seed → refine → score
319 |         def single_run(_):
320 |             conf0 = optimize_placement(n, square_size)
321 |             conf1 = refine_circles(conf0, square_size, iterations=40)
322 |             s1 = sum(r for _, _, r in conf1)
323 |             return s1, conf1
324 | 
325 |         # dispatch trials in parallel
326 |         with ThreadPoolExecutor() as executor:
327 |             for score, conf in executor.map(single_run, range(starts)):
328 |                 if score > best_sum:
329 |                     best_sum, best_conf = score, conf.copy()
330 |                 # early exit if near the hex-baseline
331 |                 if best_sum >= hex_sum * 0.995:
332 |                     break
333 | 
334 |         return best_conf
335 | 
336 |     # Use multi-start global → local optimization (adaptive number of starts)
337 |     circles = multi_start_optimize(n, square_size)
338 | 
339 |     # Quick 2-cluster remove-and-reinsert densification (extended iterations)
340 |     for _ in range(8):
341 |         # remove the two smallest circles to create a larger gap
342 |         smallest = sorted(range(len(circles)), key=lambda i: circles[i][2])[:2]
343 |         removed = [circles[i] for i in smallest]
344 |         # pop in reverse order to keep indices valid
345 |         for i in sorted(smallest, reverse=True):
346 |             circles.pop(i)
347 |         # refine the remaining configuration briefly
348 |         circles = refine_circles(circles, square_size, iterations=8)
349 |         # reinsert each removed circle with more sampling
350 |         for x_old, y_old, _ in removed:
351 |             best_r, best_pos = 0.0, (x_old, y_old)
352 |             for _ in range(500):
353 |                 x = random.uniform(0, square_size)
354 |                 y = random.uniform(0, square_size)
355 |                 r = max_circle_radius(x, y, circles, square_size)
356 |                 if r > best_r:
357 |                     best_r, best_pos = r, (x, y)
358 |             circles.append((best_pos[0], best_pos[1], best_r))
359 |         # final local polish after reinsertion
360 |         circles = refine_circles(circles, square_size, iterations=5)
361 |     # end 2-cluster remove-and-reinsert densification
362 | 
363 |     # Calculate total radius
364 |     total_radius = sum(circle[2] for circle in circles)
365 |     
366 |     return total_radius, circles


--------------------------------------------------------------------------------