├── reward_model ├── train │ ├── README.md │ ├── log.txt │ ├── download.py │ ├── requirements.txt │ ├── script │ │ └── train_qwen.sh │ ├── train.py │ ├── preprocess.py │ └── utils.py ├── run.sh ├── grade.py └── llm │ └── backend.py ├── points.npy ├── assets ├── comp.png ├── logo.png ├── steps.png ├── example.png └── overview.png ├── .gitignore ├── evolve_agent ├── __init__.py ├── prompt │ ├── __init__.py │ └── templates.py ├── llm │ ├── __init__.py │ ├── base.py │ ├── ensemble.py │ └── openai.py ├── utils │ ├── __init__.py │ ├── metrics_utils.py │ ├── format_utils.py │ ├── code_utils.py │ └── async_utils.py ├── evaluation_result.py ├── cli.py └── reward_model.py ├── benchmark ├── heilbronn_in_the_unit_square │ ├── points.npy │ ├── initial_proposal.txt │ ├── evaluator.py │ ├── visualization.py │ └── initial_program.py ├── MSTD │ ├── initial_program.py │ ├── initial_proposal.txt │ └── evaluator.py ├── minizing_raio_max_min_distance │ ├── initial_proposal.txt │ ├── evaluator.py │ └── initial_program.py ├── packing_circles │ ├── initial_proposal.txt │ ├── evaluator.py │ └── initial_program.py ├── third_autocorrelation_inequality │ ├── initial_proposal.txt │ ├── evaluator.py │ └── initial_program.py ├── littlewood_polynomials │ ├── initial_program.py │ ├── initial_proposal.txt │ └── evaluator.py ├── spherical_code │ ├── initial_proposal.txt │ ├── evaluator.py │ ├── initial_program.py │ └── visualization.py ├── kissing_number │ ├── initial_proposal.txt │ └── evaluator.py ├── autoconvolution_peak_minimization │ ├── initial_proposal.txt │ ├── evaluator.py │ └── initial_program.py └── human_best.txt ├── run.py ├── configs ├── island_config_example.yaml ├── README.md ├── island_examples.yaml └── default_config.yaml ├── compute.py └── README.md /reward_model/train/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reward_model/train/log.txt: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /points.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/answers111/alpha-research/HEAD/points.npy -------------------------------------------------------------------------------- /assets/comp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/answers111/alpha-research/HEAD/assets/comp.png -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/answers111/alpha-research/HEAD/assets/logo.png -------------------------------------------------------------------------------- /assets/steps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/answers111/alpha-research/HEAD/assets/steps.png -------------------------------------------------------------------------------- /assets/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/answers111/alpha-research/HEAD/assets/example.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/answers111/alpha-research/HEAD/assets/overview.png -------------------------------------------------------------------------------- /reward_model/run.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | python /data/zhuotaodeng/yzj/alpha-research/idea-eval/grade.py 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__/ 3 | results 4 | configs/kimi_config.yaml 5 | configs/oai_config.yaml 6 | configs/deepseek_config.yaml 7 | -------------------------------------------------------------------------------- /evolve_agent/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | 3 | from evolve_agent.controller import EvolveAgent 4 | 5 | __all__ = ["EvolveAgent"] 6 | -------------------------------------------------------------------------------- /benchmark/heilbronn_in_the_unit_square/points.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/answers111/alpha-research/HEAD/benchmark/heilbronn_in_the_unit_square/points.npy -------------------------------------------------------------------------------- /reward_model/train/download.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM 2 | 3 | model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-Coder-32B-Instruct') 4 | 5 | print(model) -------------------------------------------------------------------------------- /evolve_agent/prompt/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Prompt module initialization 3 | """ 4 | 5 | from evolve_agent.prompt.sampler import PromptSampler 6 | from evolve_agent.prompt.templates import TemplateManager 7 | 8 | __all__ = ["PromptSampler", "TemplateManager"] 9 | -------------------------------------------------------------------------------- /evolve_agent/llm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | LLM module initialization 3 | """ 4 | 5 | from 
def main():
    """Build the baseline MSTD indicator arrays for N = 30.

    The baseline is Conway's MSTD set {0, 2, 3, 4, 7, 11, 12, 14}; the
    classical setting uses the same set for both operands.

    Returns:
        A pair of independent length-30 integer 0/1 arrays (A, B), where B
        holds the same values as A.
    """
    size = 30
    conway_set = (0, 2, 3, 4, 7, 11, 12, 14)

    first = np.zeros(size, dtype=int)
    first[list(conway_set)] = 1

    # B equals A in the classical problem, but is returned as its own array
    # so that mutating one never affects the other.
    second = np.zeros(size, dtype=int)
    second[list(conway_set)] = 1
    return first, second
generate(self, prompt: str, **kwargs) -> str: 14 | """Generate text from a prompt""" 15 | pass 16 | 17 | @abstractmethod 18 | async def generate_with_context( 19 | self, system_message: str, messages: List[Dict[str, str]], **kwargs 20 | ) -> str: 21 | """Generate text using a system message and conversational context""" 22 | pass 23 | -------------------------------------------------------------------------------- /benchmark/minizing_raio_max_min_distance/initial_proposal.txt: -------------------------------------------------------------------------------- 1 | Problem. Arrange n points in [0,1]^d to optimize the dispersion/packing–covering tradeoff. The benchmark metric is 2 | ratio = (min pairwise distance) / (max pairwise distance) 3 | so that larger ratio is better (values in (0,1]). 4 | Evaluator. Given a program exposing max_min_dis_ratio(n,d), we obtain configurations for (n,d)=(16,2) and (14,3), then report ratio for each case. 5 | 6 | Baseline algorithm. The initial program employs enhanced simulated annealing with adaptive cooling, neighbor-repulsion moves, periodic smoothing via k-NN weighted averages, and a local refinement stage. KD-tree acceleration is used for nearest-neighbor queries; hyperparameters adapt to dimension. 
import asyncio
import logging
import os

from evolve_agent import EvolveAgent

# Restrict this process to a single GPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '6'

# For verbose diagnostics while debugging a run, enable:
# logging.basicConfig(level=logging.DEBUG)


async def main():
    """Run the evolution loop for 50 iterations and print what it returns.

    Reads the module-level ``evolve_agent`` instance, which is created in the
    ``__main__`` section below before this coroutine is started.
    """
    best_program = await evolve_agent.run(iterations=50)
    print(best_program)


if __name__ == "__main__":
    # Build the agent and start the run only when executed as a script, so
    # that merely importing this module does not launch a full evolution run.
    evolve_agent = EvolveAgent(
        initial_program_path="results/initial_program.py",
        evaluation_file="results/evaluator.py",
        initial_proposal_path="results/initial_proposal.txt",
        config_path="configs/oai_config.yaml"
    )
    asyncio.run(main())
14 | -------------------------------------------------------------------------------- /benchmark/third_autocorrelation_inequality/initial_proposal.txt: -------------------------------------------------------------------------------- 1 | Third-order autocorrelation inequality (C3 upper bound) 2 | 3 | Problem. For piecewise-constant nonnegative functions on a fixed support with unit mass, we evaluate an upper bound C_upper_bound derived from the maximum of the autoconvolution (normalized by squared L1 mass). The benchmark score is 4 | score = 1 / C_upper_bound 5 | so larger score indicates a smaller upper bound and hence a better result. 6 | 7 | Evaluator. The evaluator calls find_better_c3_upper_bound() from the target program to obtain step heights, computes the (normalized) autoconvolution maximum, and returns 1/C_upper_bound. 8 | 9 | Baseline algorithm. A simple genetic algorithm over height sequences (tournament selection, one-point crossover, Gaussian mutation) serves as the baseline search method. 
def rudin_shapiro(n: int):
    """Return the first n terms (each +1 or -1) of the Rudin–Shapiro sequence.

    Term k is -1 when the binary expansion of k contains an odd number of
    (possibly overlapping) adjacent "11" pairs, and +1 otherwise.
    """
    signs = np.ones(n, dtype=int)
    for index in range(n):
        # Bit i of (index & index >> 1) is set exactly when bits i and i+1 of
        # index are both set, so its popcount is the number of "11" pairs.
        adjacent_pairs = bin(index & (index >> 1)).count("1")
        if adjacent_pairs % 2 == 1:
            signs[index] = -1
    return signs


def random_littlewood(n: int, seed=0):
    """Return n pseudo-random ±1 coefficients drawn from a seeded generator."""
    generator = np.random.default_rng(seed)
    draw = generator.choice([-1, 1], size=n)
    return draw.astype(int)


def main():
    """Build the length-512 Rudin–Shapiro baseline, report its size, return it."""
    n = 512
    coefficients = rudin_shapiro(n)
    print(f"n={n}, coeffs={len(coefficients)}")
    return coefficients
def evaluate(program_path: str = "/data/zhuotaodeng/yzj/_para/alpha-research/math/initial_program.py"):
    """
    Score a candidate program on the third autocorrelation inequality benchmark.

    The module at ``program_path`` is imported and its
    ``find_better_c3_upper_bound()`` is called to obtain a 1-D sequence of
    step heights ``h``. The bound is computed as

        C_upper_bound = |2 * len(h) * max(h conv h) / sum(h)**2|

    and the score is its reciprocal, so a larger score means a smaller bound.

    Args:
        program_path: Path of the Python file to load and evaluate.

    Returns:
        ``{"score": 1 / C_upper_bound}`` on success, or ``{"error": -10.0}``
        when the candidate function is missing, raises, or returns heights
        that cannot be scored (not a non-empty 1-D numeric sequence, zero
        total mass, or a non-finite bound).

    Raises:
        Any exception raised while importing the program file itself
        (e.g. a missing file or a syntax error) is propagated.
    """
    import importlib.util
    import sys

    # Load the module from the given path
    spec = importlib.util.spec_from_file_location("program", program_path)
    program = importlib.util.module_from_spec(spec)
    sys.modules["program"] = program
    spec.loader.exec_module(program)

    try:
        heights = np.asarray(program.find_better_c3_upper_bound(), dtype=float)
    except Exception:
        # Only ordinary failures of the candidate are reported as an error
        # score; KeyboardInterrupt/SystemExit are deliberately not swallowed.
        return {"error": -10.0}

    if heights.ndim != 1 or heights.size == 0:
        return {"error": -10.0}

    total_mass = np.sum(heights)
    if total_mass == 0:
        # The normalisation divides by the squared mass.
        return {"error": -10.0}

    autoconvolution = np.convolve(heights, heights)
    C_upper_bound = abs(2 * len(heights) * np.max(autoconvolution) / (total_mass ** 2))
    if not np.isfinite(C_upper_bound) or C_upper_bound == 0:
        return {"error": -10.0}

    return {"score": float(1.0 / C_upper_bound)}
"safe_numeric_sum", 45 | ] 46 | -------------------------------------------------------------------------------- /benchmark/spherical_code/initial_proposal.txt: -------------------------------------------------------------------------------- 1 | Spherical code (N=30 on S^2): maximize minimum pairwise angle on the unit sphere 2 | 3 | Problem definition. Choose N=30 points on S^{2} to maximize the minimum pairwise angle θ_min = min_{i $LOG_PATH 2>&1 36 | -------------------------------------------------------------------------------- /benchmark/kissing_number/initial_proposal.txt: -------------------------------------------------------------------------------- 1 | Kissing number in 11D: maximize valid contact points on the unit sphere 2 | 3 | Problem. Given dimension d (here d = 11), construct as many unit vectors as possible on S^{d-1} so that every pair of distinct vectors has inner product ≤ 0.5. Equivalently, these are centers of equal unit spheres touching a central one without violating pairwise angle/packing constraints. 4 | 5 | Constraints. Vectors must be unit length; for any distinct i ≠ j, ⟨v_i, v_j⟩ ≤ 0.5. 6 | 7 | Optimization goal. Maximize the number of vectors (kissing points). The evaluator verifies unit norms and the 0.5 cap, then reports the count and dimension. 8 | 9 | Best-known results (human). In 11D, the current constructive lower bound is 592 (Ganzhinov, PSU(4,2) construction). Upper bounds above 800 exist in the literature (exact best published bound depends on source); we focus on improving the constructive lower bound. 10 | 11 | Algorithm goal. Create an algorithm that constructs large valid sets under the above constraints. This program follows a PSU(4,2)-based construction (when ATLAS data are available) yielding 592 in 11D, and otherwise uses a robust randomized MIS-style fallback to produce large valid configurations. The program outputs sphere_centers (unit vectors) for evaluator verification. 
12 | -------------------------------------------------------------------------------- /benchmark/littlewood_polynomials/initial_proposal.txt: -------------------------------------------------------------------------------- 1 | 2 | Littlewood polynomials: minimize sup-norm on the unit circle 3 | 4 | Problem definition. Choose coefficients c_k ∈ {±1} for P(z)=∑_{k=0}^{n−1} c_k z^k, |z|=1, to minimize the supremum norm ‖P‖_∞=max_{|z|=1}|P(z)|. 5 | 6 | Constraints. Coefficients are ±1. Metric is supnorm estimated by FFT sampling on an equally spaced grid; denser grid → tighter upper bound. 7 | 8 | Optimization goal. Minimize supnorm. The evaluator returns a single scalar: 1/supnorm if valid, else −1.0. 9 | 10 | Notes on bounds. For the Rudin–Shapiro construction of length n, a classical identity gives supnorm ≤ √(2n) (so the absolute constant C = √2 outside the √n). For the benchmark default n = 512, this yields supnorm ≤ √(1024) = 32 and thus score 1/32 = 0.03125. 11 | 12 | Best-known results (human). The optimal value is Θ(√n). Upper bounds: Rudin–Shapiro and variants achieve ‖P‖_∞ ≤ C√n (C≈2 in practice). Lower bounds: ‖P‖_∞ ≥ c√n for an absolute c>0. Practical searches aim to reduce the constant for fixed n. 13 | 14 | Algorithm goal. Construct ±1 sequences with smaller supnorm. This baseline uses Rudin–Shapiro; stronger methods include local flips, simulated annealing, coordinate descent on ±1 (with careful acceptance), and spectral heuristics guided by FFT magnitudes. 
15 | 16 | 17 | -------------------------------------------------------------------------------- /configs/island_config_example.yaml: -------------------------------------------------------------------------------- 1 | # EvolveAgent Island-Based Evolution Configuration 2 | # This configuration demonstrates the proper use of island-based evolution 3 | 4 | # General settings 5 | max_iterations: 1000 6 | checkpoint_interval: 100 7 | log_level: "INFO" 8 | 9 | # LLM configuration 10 | llm: 11 | primary_model: "gemini-2.0-flash-lite" 12 | primary_model_weight: 0.8 13 | secondary_model: "gemini-2.0-flash" 14 | secondary_model_weight: 0.2 15 | temperature: 0.7 16 | top_p: 0.95 17 | max_tokens: 4096 18 | 19 | # Database configuration with proper island settings 20 | database: 21 | population_size: 500 22 | archive_size: 100 23 | 24 | # Island-based evolution settings 25 | num_islands: 5 # Number of separate populations 26 | migration_interval: 50 # Migrate every 50 generations 27 | migration_rate: 0.1 # Migrate 10% of top programs 28 | 29 | # Selection parameters 30 | elite_selection_ratio: 0.1 31 | exploration_ratio: 0.3 32 | exploitation_ratio: 0.7 33 | # Note: diversity_metric fixed to "edit_distance" 34 | 35 | # Feature map dimensions for MAP-Elites 36 | feature_dimensions: ["score", "complexity"] 37 | feature_bins: 10 38 | 39 | # Prompt configuration 40 | prompt: 41 | num_top_programs: 3 42 | num_diverse_programs: 2 43 | use_template_stochasticity: true 44 | 45 | # Evaluator configuration 46 | evaluator: 47 | timeout: 300 48 | max_retries: 3 49 | cascade_evaluation: true 50 | parallel_evaluations: 4 51 | 52 | # Evolution settings 53 | diff_based_evolution: true 54 | allow_full_rewrites: false 55 | max_code_length: 10000 56 | -------------------------------------------------------------------------------- /benchmark/minizing_raio_max_min_distance/evaluator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 
def cal_ratio(construction_1):
    """
    Return the squared min/max pairwise-distance ratio of a point set.

    Args:
        construction_1: Array-like of shape (n_points, dimension).

    Returns:
        ``(min pairwise distance / max pairwise distance) ** 2`` as a float.

    Raises:
        ValueError: If fewer than two points are given, or if the largest
            pairwise distance is not positive (all points coincide or the
            distances are not numbers), so the ratio is undefined.
    """
    # Import the submodule explicitly: after a bare ``import scipy`` the
    # attribute ``scipy.spatial`` is not guaranteed to exist on every SciPy
    # release, which would surface here as an AttributeError.
    from scipy.spatial.distance import pdist

    pairwise_distances = pdist(construction_1)
    if pairwise_distances.size == 0:
        raise ValueError("cal_ratio requires at least two points.")

    min_distance = np.min(pairwise_distances)
    max_distance = np.max(pairwise_distances)
    if not max_distance > 0:
        raise ValueError("cal_ratio is undefined: the maximum pairwise distance is not positive.")

    ratio_squared = (min_distance / max_distance) ** 2
    return float(ratio_squared)


def evaluate(program_path: str = "results/initial_program.py"):
    """
    Score a candidate program on the min/max distance-ratio benchmark.

    The module at ``program_path`` is imported and its
    ``max_min_dis_ratio(n, d)`` is called for (n, d) = (16, 2) and (14, 3).
    The first element of each returned pair is taken as the point
    configuration and scored with ``cal_ratio``.

    Args:
        program_path: Path of the Python file to load and evaluate.

    Returns:
        ``{"ratio_n16_d2": ..., "ratio_n14_d3": ...}`` on success.
        ``{"result": -10.0, "error": <text>}`` if the candidate function fails.
        ``{"result": -1.0, "error": <text>}`` if scoring its output fails.
        The error is reported as ``"<ExceptionType>: <message>"`` text so the
        result dictionary contains only plain, serialisable values.

    Raises:
        ValueError: If the program does not define ``max_min_dis_ratio``.
    """
    import importlib.util
    import sys

    # Load the module from the given path
    spec = importlib.util.spec_from_file_location("program", program_path)
    program = importlib.util.module_from_spec(spec)
    sys.modules["program"] = program
    spec.loader.exec_module(program)

    # Check if 'max_min_dis_ratio' exists in the loaded module
    if not hasattr(program, 'max_min_dis_ratio'):
        raise ValueError(f"The file '{program_path}' does not define 'max_min_dis_ratio'.")

    try:
        res_n16_d2, _ = program.max_min_dis_ratio(16, 2)
        res_n14_d3, _ = program.max_min_dis_ratio(14, 3)
    except Exception as e1:
        return {"result": -10.0, "error": f"{type(e1).__name__}: {e1}"}

    try:
        ratio_n16_d2 = cal_ratio(res_n16_d2)  # AlphaEvolve: 1 / 12.88926611203463 = 0.07758393622320406
        ratio_n14_d3 = cal_ratio(res_n14_d3)  # AlphaEvolve: 1 / 4.165849767 = 0.24004706263470807
    except Exception as e:
        return {"result": -1.0, "error": f"{type(e).__name__}: {e}"}

    return {
        "ratio_n16_d2": ratio_n16_d2,
        "ratio_n14_d3": ratio_n14_d3,
    }
2 | 3 | Problem definition. Let 4 | 5 | $$ 6 | \mathcal{F}=\Big\{\,f\in L^{1}\!\big([-{\tfrac12},{\tfrac12}]\big):\ f\ge 0,\ \int_{-1/2}^{1/2} f(x)\,dx=1\,\Big\} 7 | $$ 8 | 9 | $$ 10 | \qquad 11 | (f*f)(t)=\int_{\mathbb{R}} f(x)\,f(t-x)\,dx . 12 | $$ 13 | 14 | We seek to minimize the peak value of the autoconvolution: 15 | $$ 16 | \mu_\infty \;=\; \inf_{f\in\mathcal{F}} \ \|f*f\|_\infty . 17 | $$ 18 | 19 | Constraints. Nonnegative density, unit mass (L1=1), support length 1 (here taken as [-1/2, 1/2]). In the implementation, f is represented by nonnegative step heights on a uniform grid and normalized to unit integral. 20 | 21 | Optimization goal. Minimize \(\mu_\infty = \max_t (f*f)(t)\). Smaller is better. 22 | 23 | Best-known human results. In this standard setup, the best currently published bounds are 24 | $$ 25 | \boxed{0.64 \ \le\ \mu_\infty \ \le\ 0.75496}\,. 26 | $$ 27 | The upper bound traces to work of Matolcsi–Vinuesa (after normalizing support length to 1), and the lower bound to Cloninger–Steinerberger. 28 | 29 | Algorithm goal. Create an algorithm that constructs feasible densities with progressively smaller \(\mu_\infty\). This baseline program generates simple analytical candidates (box, triangle, cosine-squared, Gaussian) on a uniform grid, normalizes to unit mass, and computes autoconvolution via FFT to measure \(\mu_\infty\). It serves as a starting point for more advanced search/optimization methods. 30 | 31 | References. 32 | - E. P. White, *An optimal $L^2$ autoconvolution inequality*, Canadian Mathematical Bulletin (2024). 33 | - M. Matolcsi and C. Vinuesa, *Improved bounds on the supremum of autoconvolutions*, J. Math. Anal. Appl. 372 (2010), 439–447. 34 | - A. Cloninger and S. Steinerberger, *On suprema of autoconvolutions with an application to Sidon sets*, Proc. Amer. Math. Soc. 145 (2017), 3191–3200. 
@dataclass
class EvaluationResult:
    """
    Outcome of evaluating one program: numeric metrics plus optional artifacts.

    The metrics dictionary is the long-standing evaluator contract; artifacts
    are an additional side-channel for arbitrary text or binary payloads.
    """

    # Mandatory metric-name -> value mapping (the existing contract).
    metrics: Dict[str, float]
    # Optional artifact-name -> payload mapping; a fresh dict per instance.
    artifacts: Dict[str, Union[str, bytes]] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, metrics: Dict[str, float]) -> "EvaluationResult":
        """Wrap a plain metrics dictionary in an EvaluationResult."""
        wrapped = cls(metrics=metrics)
        return wrapped

    def to_dict(self) -> Dict[str, float]:
        """Return only the metrics dictionary (the same object, not a copy)."""
        return self.metrics

    def has_artifacts(self) -> bool:
        """Report whether at least one artifact is stored."""
        return True if self.artifacts else False

    def get_artifact_keys(self) -> list:
        """Return the artifact names as a list."""
        return [*self.artifacts.keys()]

    def get_artifact_size(self, key: str) -> int:
        """Return the size in bytes of one artifact, or 0 if it is absent.

        Text is measured as UTF-8; bytes by length; any other value by the
        UTF-8 encoding of its ``str()`` form.
        """
        if key in self.artifacts:
            payload = self.artifacts[key]
            if isinstance(payload, bytes):
                return len(payload)
            text = payload if isinstance(payload, str) else str(payload)
            return len(text.encode("utf-8"))
        return 0

    def get_total_artifact_size(self) -> int:
        """Return the combined size in bytes of every stored artifact."""
        return sum(map(self.get_artifact_size, self.artifacts.keys()))
def _usable_float(value: Any):
    """Return ``value`` as a float if it is an int/float and not NaN, else None."""
    if not isinstance(value, (int, float)):
        return None
    try:
        converted = float(value)
    except (ValueError, TypeError, OverflowError):
        # e.g. an integer too large to be represented as a float
        return None
    # NaN is the only float that is not equal to itself.
    if converted != converted:
        return None
    return converted


def safe_numeric_average(metrics: Dict[str, Any]) -> float:
    """
    Average the numeric entries of a metrics dictionary.

    Values that are not int/float, that are NaN, or that cannot be converted
    to float are ignored.

    Args:
        metrics: Dictionary of metric names to values

    Returns:
        Mean of the usable values, or 0.0 when there are none
    """
    if not metrics:
        return 0.0

    usable = [
        number
        for number in map(_usable_float, metrics.values())
        if number is not None
    ]
    if usable:
        return sum(usable) / len(usable)
    return 0.0


def safe_numeric_sum(metrics: Dict[str, Any]) -> float:
    """
    Sum the numeric entries of a metrics dictionary.

    Values that are not int/float, that are NaN, or that cannot be converted
    to float are ignored.

    Args:
        metrics: Dictionary of metric names to values

    Returns:
        Sum of the usable values, or 0.0 when there are none
    """
    if not metrics:
        return 0.0

    running_total = 0.0
    for number in map(_usable_float, metrics.values()):
        if number is None:
            continue
        running_total += number
    return running_total
spec.loader.exec_module(program) 34 | 35 | pts = None 36 | if hasattr(program, 'points'): 37 | pts = program.points 38 | elif hasattr(program, 'main'): 39 | res = program.main() 40 | if isinstance(res, np.ndarray): 41 | pts = res 42 | elif hasattr(program, 'points'): 43 | pts = program.points 44 | if pts is None: 45 | return {"error": -1.0} 46 | result = evaluate_spherical_code_min_angle(pts) 47 | return {"score": result["score"], "min_angle": result["min_angle"], "n": result["n"], "dimension": result["dimension"]} 48 | except Exception: 49 | return {"error": -1.0} 50 | 51 | if __name__ == "__main__": 52 | try: 53 | default_path = os.path.join(os.path.dirname(__file__), "initial_program.py") 54 | except Exception: 55 | default_path = "initial_program.py" 56 | target = sys.argv[1] if len(sys.argv) > 1 else default_path 57 | print(json.dumps(evaluate(target), ensure_ascii=False, indent=2)) 58 | -------------------------------------------------------------------------------- /evolve_agent/utils/format_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for formatting output 3 | """ 4 | 5 | from typing import Any, Dict 6 | 7 | 8 | def format_metrics_safe(metrics: Dict[str, Any]) -> str: 9 | """ 10 | Safely format metrics dictionary for logging, handling both numeric and string values. 
def format_improvement_safe(parent_metrics: Dict[str, Any], child_metrics: Dict[str, Any]) -> str:
    """
    Render the child-minus-parent difference of shared numeric metrics.

    Metrics are reported in the child's iteration order. A metric is
    included only when it exists in both dictionaries and both values are
    ``int``/``float``; everything else is silently left out.

    Args:
        parent_metrics: Parent program metrics
        child_metrics: Child program metrics

    Returns:
        Comma-separated ``name=+0.0000`` entries, or "" when either
        dictionary is empty or nothing is comparable
    """
    if not parent_metrics or not child_metrics:
        return ""

    numeric = (int, float)
    deltas = []
    for key in child_metrics:
        if key not in parent_metrics:
            continue
        after = child_metrics[key]
        before = parent_metrics[key]
        if not (isinstance(after, numeric) and isinstance(before, numeric)):
            continue
        try:
            deltas.append(f"{key}={after - before:+.4f}")
        except (ValueError, TypeError):
            # Skip values that cannot be subtracted or formatted
            continue

    return ", ".join(deltas)
def verify_circles(circles):
    """Check that circles are pairwise disjoint and lie inside the unit square.

    Args:
        circles: A list of tuples (x, y, radius) or a numpy array of shape
            (num_circles, 3). Columns beyond the third are ignored.

    Returns:
        bool: True if every circle has a finite, non-negative radius, no two
        circles overlap (touching is allowed) and all circles lie inside
        [0, 1] x [0, 1]. False otherwise, including for input that is not a
        2-D array with at least three columns. An empty collection is
        vacuously valid.
    """
    circles = np.asarray(circles, dtype=float)
    if circles.size == 0:
        return True
    if circles.ndim != 2 or circles.shape[1] < 3:
        return False
    if not np.all(np.isfinite(circles[:, :3])):
        return False

    xs, ys, radii = circles[:, 0], circles[:, 1], circles[:, 2]

    # A negative radius would shrink the "sum of radii" in the overlap test
    # and relax the containment test, so it must be rejected explicitly.
    if np.any(radii < 0):
        return False

    # Check pairwise disjointness
    for first, second in itertools.combinations(circles, 2):
        center_distance = np.sqrt((first[0] - second[0]) ** 2 + (first[1] - second[1]) ** 2)
        if center_distance < first[2] + second[2]:
            return False

    # Check all circles lie inside the unit square [0,1]x[0,1]
    lowest = np.minimum(xs, ys) - radii
    highest = np.maximum(xs, ys) + radii
    if np.any(lowest < 0) or np.any(highest > 1):
        return False

    return True
def mstd_ratio(A_idx, B_idx=None):
    """
    Classical MSTD ratio R = |A+A| / |A-A| for a set of integer indices.

    ``B_idx`` is accepted for signature compatibility only; it is always
    discarded so that the ratio is computed in the classic A = B setting.

    Returns -1.0 when ``A_idx`` is empty.
    """
    # Enforce classic setting: whatever was passed as B is replaced by A
    elements = A_idx
    if len(elements) == 0:
        return -1.0

    sums = {int(left + right) for left in elements for right in elements}
    differences = {int(left - right) for left in elements for right in elements}

    if not differences:
        return -1.0
    return float(len(sums)) / float(len(differences))
def evaluate_C1_upper_std(step_heights: np.ndarray) -> Dict[str, float]:
    """
    Standard-normalized C1 evaluation function.

    - Feasibility: h must be a non-empty 1-D array of finite, non-negative
      heights with positive total mass; it is then rescaled so that ∫f = 1
      (L1 normalization with dx = 1/K).
    - Objective: mu_inf = max_t (f*f)(t) (smaller is better).

    Args:
        step_heights: Heights of the K equal-width steps (not necessarily
            normalized).

    Returns:
        On success ``{"valid": 1.0, "mu_inf", "ratio" (= mu_inf),
        "integral" (= 1.0), "K"}``; for infeasible input
        ``{"valid": 0.0, "mu_inf": inf, "ratio": inf}``.
    """
    infeasible = {"valid": 0.0, "mu_inf": float("inf"), "ratio": float("inf")}

    h = np.asarray(step_heights, dtype=float)
    # Non-1-D input would make K and the FFT axis disagree; NaN/Inf would
    # propagate into mu_inf while still being reported as valid.
    if h.ndim != 1 or h.size == 0:
        return infeasible
    if not np.all(np.isfinite(h)) or np.any(h < 0):
        return infeasible

    K = int(len(h))
    dx = 1.0 / K
    integral = float(np.sum(h) * dx)
    if integral <= 0:
        return infeasible
    h = h / integral  # now ∫f = 1

    # Zero-padding to 2K-1 turns the circular FFT product into the linear
    # autoconvolution.
    F = np.fft.fft(h, 2 * K - 1)
    conv = np.fft.ifft(F * F).real
    conv = np.maximum(conv, 0.0)  # clamp tiny negative round-off
    mu_inf = float(np.max(conv) * dx)
    return {"valid": 1.0, "mu_inf": mu_inf, "ratio": mu_inf, "integral": 1.0, "K": float(K)}
33 | 34 | Returns: dict with key 'score' = 1 / mu_inf (larger is better), or {'error': -1.0} 35 | """ 36 | try: 37 | import importlib.util 38 | spec = importlib.util.spec_from_file_location("program", program_path) 39 | program = importlib.util.module_from_spec(spec) 40 | sys.modules["program"] = program 41 | spec.loader.exec_module(program) 42 | step_heights = None 43 | if hasattr(program, 'step_heights'): 44 | step_heights = program.step_heights 45 | elif hasattr(program, 'h'): 46 | step_heights = program.h 47 | elif hasattr(program, 'main'): 48 | result = program.main() 49 | if isinstance(result, np.ndarray): 50 | step_heights = result 51 | elif hasattr(program, 'step_heights'): 52 | step_heights = program.step_heights 53 | elif hasattr(program, 'h'): 54 | step_heights = program.h 55 | if step_heights is None: 56 | return {"error": -1.0} 57 | result = evaluate_C1_upper_std(step_heights) 58 | if result["valid"] == 1.0: 59 | mu = float(result.get("mu_inf", float("inf"))) 60 | if mu > 0 and np.isfinite(mu): 61 | return {"score": 1.0 / mu} 62 | return {"error": -1.0} 63 | else: 64 | return {"error": -1.0} 65 | except Exception as e: 66 | return {"error": -1.0} 67 | -------------------------------------------------------------------------------- /configs/README.md: -------------------------------------------------------------------------------- 1 | # OpenEvolve Configuration Files 2 | 3 | This directory contains configuration files for OpenEvolve with examples for different use cases. 4 | 5 | ## Configuration Files 6 | 7 | ### `default_config.yaml` 8 | The main configuration file containing all available options with sensible defaults. This file includes: 9 | - Complete documentation for all configuration parameters 10 | - Default values for all settings 11 | - **Island-based evolution parameters** for proper evolutionary diversity 12 | 13 | Use this file as a template for your own configurations. 
14 | 15 | ### `island_config_example.yaml` 16 | A practical example configuration demonstrating proper island-based evolution setup. Shows: 17 | - Recommended island settings for most use cases 18 | - Balanced migration parameters 19 | - Complete working configuration 20 | 21 | ### `island_examples.yaml` 22 | Multiple example configurations for different scenarios: 23 | - **Maximum Diversity**: Many islands, frequent migration 24 | - **Focused Exploration**: Few islands, rare migration 25 | - **Balanced Approach**: Default recommended settings 26 | - **Quick Exploration**: Small-scale rapid testing 27 | - **Large-Scale Evolution**: Complex optimization runs 28 | 29 | Includes guidelines for choosing parameters based on your problem characteristics. 30 | 31 | ## Island-Based Evolution Parameters 32 | 33 | The key new parameters for proper evolutionary diversity are: 34 | 35 | ```yaml 36 | database: 37 | num_islands: 5 # Number of separate populations 38 | migration_interval: 50 # Migrate every N generations 39 | migration_rate: 0.1 # Fraction of top programs to migrate 40 | ``` 41 | 42 | ### Parameter Guidelines 43 | 44 | - **num_islands**: 3-10 for most problems (more = more diversity) 45 | - **migration_interval**: 25-100 generations (higher = more independence) 46 | - **migration_rate**: 0.05-0.2 (5%-20%, higher = faster knowledge sharing) 47 | 48 | ### When to Use What 49 | 50 | - **Complex problems** → More islands, less frequent migration 51 | - **Simple problems** → Fewer islands, more frequent migration 52 | - **Long runs** → More islands to maintain diversity 53 | - **Short runs** → Fewer islands for faster convergence 54 | 55 | ## Usage 56 | 57 | Copy any of these files as a starting point for your configuration: 58 | 59 | ```bash 60 | cp configs/default_config.yaml my_config.yaml 61 | # Edit my_config.yaml for your specific needs 62 | ``` 63 | 64 | Then use with OpenEvolve: 65 | 66 | ```python 67 | from openevolve import OpenEvolve 68 | evolve = 
def farthest_point_greedy(n, seed=None, rng=None):
    """Greedy max–min on S^2: start from seed (if any), then add points that maximize min angle.

    Args:
        n: Target number of points.
        seed: Optional array of starting points (one per row); rows are
            normalized onto the sphere. If it already has n or more rows it
            is returned (normalized) without adding anything.
        rng: ``numpy.random.Generator`` used to draw candidates. When
            omitted, a fresh ``default_rng(0)`` is created for this call, so
            repeated default calls are reproducible and do not share state.

    Returns:
        Array with one unit vector per row.
    """
    # A Generator used directly as the default value would be created once
    # at definition time and silently shared (and advanced) by every call.
    if rng is None:
        rng = np.random.default_rng(0)

    def random_unit(k):
        return _normalize_rows(rng.normal(size=(k, 3)))

    if seed is None:
        P = random_unit(1)  # start with one random point
    else:
        P = _normalize_rows(seed)

    while len(P) < n:
        # Draw candidates and keep the one farthest from the current set
        C = random_unit(2000)  # candidates per iteration (tune as needed)
        cos = C @ P.T  # cosines between candidates and existing points
        # Angle to the nearest existing point; this is what gets maximized
        min_ang = np.arccos(np.clip(np.max(cos, axis=1), -1.0, 1.0))
        idx = np.argmax(min_ang)
        P = np.vstack([P, C[idx:idx + 1]])
    return P
"autoconvolution_peak_minimization": {"s_baseline": 0.6667, "higher_better": 1} 17 | } 18 | 19 | results = { 20 | "packing_circles_26": {"s_best": 2.6359829561164743, "round": 40657}, 21 | "packind_circles_32": {"s_best": 2.939520304932057, "round": 40657}, 22 | "minizing_raio_max_min_distance_d2_n16": {"s_best": 12.92, "round": 5000}, 23 | "minizing_raio_max_min_distance_d3_n14": {"s_best": 5.198, "round": 5000}, 24 | "third_autocorrelation_inequality": {"s_best": 0, "round": 5000}, 25 | # Placeholders for newly added benchmarks (0 indicates not yet attempted) 26 | "kissing_number_d11": {"s_best": 502.0, "round": 5000}, 27 | "spherical_code_d3_n30": {"s_best": 0.6381359964781541, "round": 5000}, 28 | "heilbronn_in_the_unit_square_n16": {"s_best": 0, "round": 5000}, 29 | "littlewood_polynomials_n512": {"s_best": 0, "round": 5000}, 30 | #"riesz_energy_n20_s1": {"s_best": 0, "round": 5000}, 31 | "MSTD_n30": {"s_best": 0, "round": 5000}, 32 | "autoconvolution_peak_minimization": {"s_best": 0, "round": 5000} 33 | } 34 | 35 | def compute_excel_best(results): 36 | problems = list(baselines.keys()) 37 | num_problems = len(problems) 38 | total = 0.0 39 | for problem in problems: 40 | s_baseline = baselines[problem]['s_baseline'] 41 | higher_better = baselines[problem]['higher_better'] 42 | s_best = results[problem]['s_best'] 43 | n_round = results[problem]['round'] 44 | if s_best == 0: 45 | s_excess = 0 # Assuming s_best == 0 indicates failure/no improvement 46 | else: 47 | improvement = (s_best - s_baseline) * higher_better 48 | s_excess = max(improvement, 0) 49 | contrib = s_excess / (n_round / 1000000) 50 | total += contrib 51 | excel_best = total / num_problems 52 | return excel_best 53 | 54 | print(compute_excel_best(results)) -------------------------------------------------------------------------------- /benchmark/autoconvolution_peak_minimization/initial_program.py: -------------------------------------------------------------------------------- 1 | 2 | 
def evaluate_C1_upper_std(step_heights: np.ndarray) -> Dict[str, float]:
    """
    Standard-normalized C1 (support [-1/2,1/2], dx=1/K).

    - Feasibility: h must be a non-empty 1-D array of finite, non-negative
      heights with positive total mass; it is rescaled so that ∫f = 1
      (L1 normalization).
    - Objective: mu_inf = max_t (f*f)(t) (smaller is better).

    Returns: {"valid", "mu_inf", "ratio"(=mu_inf), "integral"(=1.0), "K"},
    or {"valid": 0.0, "mu_inf": inf, "ratio": inf} for infeasible input.
    """
    infeasible = {"valid": 0.0, "mu_inf": float("inf"), "ratio": float("inf")}

    h = np.asarray(step_heights, dtype=float)
    # Reject shapes for which K and the FFT axis would disagree, and
    # NaN/Inf heights that would otherwise be reported as valid.
    if h.ndim != 1 or h.size == 0:
        return infeasible
    if not np.all(np.isfinite(h)) or np.any(h < 0):
        return infeasible

    K = int(len(h))
    dx = 1.0 / K

    integral = float(np.sum(h) * dx)
    if integral <= 0:
        return infeasible
    h = h / integral  # ∫f = 1

    F = np.fft.fft(h, 2 * K - 1)  # linear autoconvolution via padding
    conv = np.fft.ifft(F * F).real
    conv = np.maximum(conv, 0.0)  # clamp tiny negatives

    mu_inf = float(np.max(conv) * dx)
    return {"valid": 1.0, "mu_inf": mu_inf, "ratio": mu_inf, "integral": 1.0, "K": float(K)}
class LLMEnsemble:
    """Weighted ensemble of LLMs.

    One ``OpenAILLM`` is created per model configuration. The single-response
    methods route each request to one model drawn at random with probability
    proportional to its configured weight.
    """

    def __init__(self, models_cfg: List[LLMModelConfig]):
        self.models_cfg = models_cfg

        # Initialize models from the configuration (same order as models_cfg)
        self.models = [OpenAILLM(model_cfg) for model_cfg in models_cfg]

        # Extract and normalize model weights so that they sum to 1
        self.weights = [model.weight for model in models_cfg]
        total = sum(self.weights)
        self.weights = [w / total for w in self.weights]

        logger.info(
            "Initialized LLM ensemble with models: %s",
            ", ".join(
                f"{model.name} (weight: {weight:.2f})"
                for model, weight in zip(models_cfg, self.weights)
            ),
        )

    async def generate(self, prompt: str, **kwargs) -> str:
        """Generate text using a randomly selected model based on weights"""
        model = self._sample_model()
        return await model.generate(prompt, **kwargs)

    async def generate_with_context(
        self, system_message: str, messages: List[Dict[str, str]], **kwargs
    ) -> str:
        """Generate text using a system message and conversational context"""
        model = self._sample_model()
        return await model.generate_with_context(system_message, messages, **kwargs)

    def _sample_model(self) -> LLMInterface:
        """Sample a model from the ensemble based on weights"""
        index = random.choices(range(len(self.models)), weights=self.weights, k=1)[0]
        return self.models[index]

    async def generate_multiple(self, prompt: str, n: int, **kwargs) -> List[str]:
        """Generate n texts for the same prompt in parallel (model sampled per text)"""
        tasks = [self.generate(prompt, **kwargs) for _ in range(n)]
        return await asyncio.gather(*tasks)

    async def parallel_generate(self, prompts: List[str], **kwargs) -> List[str]:
        """Generate responses for multiple prompts in parallel"""
        tasks = [self.generate(prompt, **kwargs) for prompt in prompts]
        return await asyncio.gather(*tasks)

    async def generate_all_with_context(
        self, system_message: str, messages: List[Dict[str, str]], **kwargs
    ) -> List[str]:
        """Query every model in the ensemble with the same context.

        Models are called one after another, in configuration order. No
        aggregation is performed here: the raw responses are returned and it
        is up to the caller to combine them.

        Returns:
            One response per model, in the same order as ``self.models``
        """
        responses = []
        for model in self.models:
            responses.append(await model.generate_with_context(system_message, messages, **kwargs))
        return responses
def evaluate_littlewood_supnorm(coeffs, num_grid: int = 16384) -> Dict[str, float]:
    """
    FFT-sampled estimate of the sup-norm of |P(e^{it})| on the unit circle.

    The maximum is taken over ``num_grid`` equally spaced sample points, so
    the reported value is a lower bound of the true sup-norm that tightens
    as ``num_grid`` grows.

    Args:
        coeffs: 1-D array-like of ±1 coefficients (a scalar is treated as a
            length-1 sequence)
        num_grid: sampling resolution; must be at least 8 and at least the
            number of coefficients

    Returns:
        ``{"valid": 1.0, "supnorm": value}`` for a genuine ±1 sequence, or
        ``{"valid": 0.0, "supnorm": inf}`` when any coefficient is not
        exactly +1 or -1 (including NaN).

    Raises:
        ValueError: if ``num_grid`` is too small or ``coeffs`` is empty or
            not 1-D.
    """
    if num_grid < 8:
        raise ValueError("num_grid too small")
    c = np.atleast_1d(np.asarray(coeffs, dtype=float))  # ensure 1-D
    if c.ndim != 1 or c.size == 0:
        raise ValueError("coeffs must be a non-empty 1-D array")
    if c.size > num_grid:
        raise ValueError("num_grid must be at least the number of coefficients")

    # Without this check a sequence of tiny coefficients would be reported
    # with an arbitrarily small sup-norm although it is not a Littlewood
    # polynomial at all.
    if not np.all(np.abs(c) == 1.0):
        return {"valid": 0.0, "supnorm": float("inf")}

    # Zero-padding to num_grid samples P at the num_grid-th roots of unity
    values = np.fft.fft(c, n=int(num_grid))
    supnorm = float(np.max(np.abs(values)))
    return {"valid": 1.0, "supnorm": supnorm}
34 | """ 35 | text = sys.stdin.read().strip() 36 | if not text: 37 | return None 38 | # Take the last line that contains numbers 39 | lines = [ln.strip() for ln in text.splitlines() if ln.strip()] 40 | # concatenate all non "n =" lines to support long sequences split across lines 41 | number_lines = [ln for ln in lines if not ln.lower().startswith("n =")] 42 | if not number_lines: 43 | return None 44 | joined = " ".join(number_lines) 45 | nums = re.findall(r"[-+]?\d+", joined) 46 | return np.asarray(list(map(int, nums)), dtype=int) 47 | 48 | def evaluate(program_path: str = "littlewood_polynomials/initial_program.py"): 49 | try: 50 | spec = importlib.util.spec_from_file_location("program", program_path) 51 | program = importlib.util.module_from_spec(spec) 52 | sys.modules["program"] = program 53 | spec.loader.exec_module(program) 54 | 55 | coeffs_obj = None 56 | if hasattr(program, 'coeffs'): 57 | coeffs_obj = program.coeffs 58 | elif hasattr(program, 'main'): 59 | res = program.main() 60 | if isinstance(res, np.ndarray): 61 | coeffs_obj = res 62 | elif hasattr(program, 'coeffs'): 63 | coeffs_obj = program.coeffs 64 | if coeffs_obj is None: 65 | # fallback: try stdin for robustness 66 | coeffs_obj = _read_coeffs_from_stdin() 67 | if coeffs_obj is None: 68 | return {"error": -1.0} 69 | result = evaluate_littlewood_supnorm(coeffs_obj, num_grid=16384) 70 | if result.get("valid", 0.0) != 1.0: 71 | return {"error": -1.0} 72 | supnorm = float(result["supnorm"]) 73 | if supnorm > 0 and np.isfinite(supnorm): 74 | return {"score": 1.0 / supnorm} 75 | return {"error": -1.0} 76 | except Exception: 77 | return {"error": -1.0} 78 | 79 | print(evaluate()) -------------------------------------------------------------------------------- /benchmark/heilbronn_in_the_unit_square/evaluator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import importlib.util 3 | import sys 4 | import os 5 | import json 6 | from itertools 
def _triangle_area(a, b, c) -> float:
    """Return the unsigned area of the triangle with 2D vertices a, b and c."""
    return abs((b[0]-a[0])*(c[1]-a[1]) - (b[1]-a[1])*(c[0]-a[0])) * 0.5


def evaluate_min_triangle_area(points: np.ndarray) -> Dict[str, float]:
    """
    Compute the minimum triangle area for 2D points.

    Args:
        points: array-like of shape (n, 2).

    Returns:
        A dict of floats:
        - valid: 1.0 when the input could be scored, else 0.0
        - min_area: raw smallest triangle area (larger is better)
        - n: number of points (0.0 when the shape itself is wrong)
        - scaled_min_area: n^(8/7 + 1/2000) * min_area (informational)
        - score: equals min_area (larger is better)

        Inputs with the wrong shape, fewer than 3 points, or any NaN/inf
        coordinate are reported as invalid with all metrics set to 0.0.
    """
    pts = np.asarray(points, dtype=float)
    if pts.ndim != 2 or pts.shape[1] != 2:
        return {"valid": 0.0, "min_area": 0.0, "n": 0.0, "scaled_min_area": 0.0, "score": 0.0}

    n = len(pts)
    # Non-finite coordinates make every area comparison False, which used to
    # leave min_area at +inf and leak `Infinity` into the JSON report.
    if n < 3 or not np.isfinite(pts).all():
        return {"valid": 0.0, "min_area": 0.0, "n": float(n), "scaled_min_area": 0.0, "score": 0.0}

    min_area = float("inf")
    for i, j, k in combinations(range(n), 3):
        area = _triangle_area(pts[i], pts[j], pts[k])
        if area < min_area:
            min_area = area
            if min_area <= 1e-18:  # early exit: a degenerate triangle cannot be beaten
                break

    exponent = (8.0/7.0) + (1.0/2000.0)
    scaled_min_area = (n ** exponent) * float(min_area)
    score = float(min_area) if np.isfinite(min_area) else 0.0

    return {
        "valid": 1.0,
        "min_area": float(min_area),
        "n": float(n),
        "scaled_min_area": float(scaled_min_area),
        "score": score,
    }


def evaluate(program_path: str):
    """Load the candidate program and score its point set.

    Points are taken from a module-level ``points`` attribute, or from the
    ``np.ndarray`` returned by ``main()`` (falling back to a ``points``
    attribute that ``main()`` created).

    Returns:
        A dict with ``score``, ``scaled_min_area``, ``min_area`` and ``n``,
        or ``{"error": -1.0}`` when no points are found or anything raises.
    """
    try:
        spec = importlib.util.spec_from_file_location("program", program_path)
        program = importlib.util.module_from_spec(spec)
        sys.modules["program"] = program
        spec.loader.exec_module(program)

        points = None
        if hasattr(program, 'points'):
            points = program.points
        elif hasattr(program, 'main'):
            res = program.main()
            if isinstance(res, np.ndarray):
                points = res
            elif hasattr(program, 'points'):
                points = program.points
        if points is None:
            return {"error": -1.0}
        result = evaluate_min_triangle_area(points)
        # Return both the diagnostic dict and rely on 'score' as the main metric (larger is better)
        return {
            "score": result["score"],
            "scaled_min_area": result["scaled_min_area"],
            "min_area": result["min_area"],
            "n": result["n"],
        }
    except Exception:
        # Evaluator contract: every failure is reported as an error score.
        return {"error": -1.0}


if __name__ == "__main__":
    # CLI for debugging: evaluate initial_program.py next to this file by
    # default, or the path given as the first argument.
    try:
        default_path = os.path.join(os.path.dirname(__file__), "initial_program.py")
    except Exception:
        default_path = "initial_program.py"
    target = sys.argv[1] if len(sys.argv) > 1 else default_path
    print(json.dumps(evaluate(target), ensure_ascii=False, indent=2))
starting point for most problems 28 | # Good for: General use, medium-length runs 29 | balanced: 30 | database: 31 | num_islands: 5 # Balanced number of islands 32 | migration_interval: 50 # Moderate migration frequency 33 | migration_rate: 0.1 # Moderate migration rate 34 | population_size: 1000 35 | archive_size: 100 36 | 37 | # Configuration for Quick Exploration (Small Scale) 38 | # Use this for rapid prototyping and testing 39 | # Good for: Small problems, quick experiments 40 | quick_exploration: 41 | database: 42 | num_islands: 3 43 | migration_interval: 20 44 | migration_rate: 0.15 45 | population_size: 200 46 | archive_size: 30 47 | 48 | # Configuration for Large-Scale Evolution (High Performance) 49 | # Use this for complex problems requiring extensive search 50 | # Good for: Complex optimization, long evolutionary runs 51 | large_scale: 52 | database: 53 | num_islands: 15 # Many islands for parallel exploration 54 | migration_interval: 75 # Balanced migration timing 55 | migration_rate: 0.08 # Conservative migration rate 56 | population_size: 2000 # Large populations 57 | archive_size: 300 58 | 59 | # Guidelines for choosing parameters: 60 | # 61 | # num_islands: 62 | # - More islands = more diversity, slower convergence 63 | # - Fewer islands = faster convergence, risk of premature convergence 64 | # - Recommended: 3-10 for most problems 65 | # 66 | # migration_interval: 67 | # - Lower values = more frequent knowledge sharing 68 | # - Higher values = more independent evolution 69 | # - Recommended: 25-100 generations 70 | # 71 | # migration_rate: 72 | # - Higher values = faster knowledge propagation 73 | # - Lower values = preserve island diversity longer 74 | # - Recommended: 0.05-0.2 (5%-20%) 75 | # 76 | # Rule of thumb: 77 | # - Complex problems → More islands, less frequent migration 78 | # - Simple problems → Fewer islands, more frequent migration 79 | # - Long runs → More islands to maintain diversity 80 | # - Short runs → Fewer islands for faster 
def calculate_c3_upper_bound(height_sequence):
    """Return max_t |(f*f)(t)| / (integral of f)^2 for a step function f.

    ``height_sequence`` holds the heights of ``N`` equal-width steps covering
    [-1/4, 1/4]; f is zero outside that interval. The autoconvolution is
    integrated numerically with ``scipy.integrate.quad`` at ``2N + 1`` evenly
    spaced values of t in [-1/2, 1/2].

    Returns 0.0 when the sequence is empty or when the squared integral of f
    is below 1e-18 (the ratio would be undefined).
    """
    N = len(height_sequence)
    if N == 0:
        # No steps: delta_x below would divide by zero.
        return 0.0
    delta_x = 1 / (2 * N)

    def f(x):
        if -0.25 <= x <= 0.25:
            index = int((x - (-0.25)) / delta_x)
            if index == N:
                # x == 0.25 lands one past the last step; clamp it.
                index -= 1
            return height_sequence[index]
        else:
            return 0.0

    integral_f = np.sum(height_sequence) * delta_x
    integral_sq = integral_f**2

    if integral_sq < 1e-18:
        return 0.0

    t_points = np.linspace(-0.5, 0.5, 2 * N + 1)

    max_conv_val = 0.0
    for t_val in t_points:

        # f(x) * f(t - x) can only be non-zero where both arguments are in [-1/4, 1/4].
        lower_bound = max(-0.25, t_val - 0.25)
        upper_bound = min(0.25, t_val + 0.25)

        if upper_bound <= lower_bound:
            convolution_val = 0.0
        else:
            def integrand(x):
                return f(x) * f(t_val - x)

            convolution_val, _ = scipy.integrate.quad(integrand, lower_bound, upper_bound, limit=100)

        if abs(convolution_val) > max_conv_val:
            max_conv_val = abs(convolution_val)

    return max_conv_val / integral_sq


def genetic_algorithm(population_size, num_intervals, generations, mutation_rate, crossover_rate):
    """Evolve step-function heights with a simple genetic algorithm.

    Each generation applies binary tournament selection, single-point
    crossover on consecutive pairs, and single-gene Gaussian mutation
    (sigma 0.1, clipped to [-2, 2]).

    Args:
        population_size: number of individuals per generation.
        num_intervals: number of step heights per individual.
        generations: number of generations to run.
        mutation_rate: per-individual probability of mutating one gene.
        crossover_rate: per-pair probability of crossover.

    Returns:
        The individual with the highest ``calculate_c3_upper_bound`` value
        seen in any generation, or ``None`` when ``generations`` is 0.

    NOTE(review): this keeps the individual with the LARGEST ratio, while the
    repository README marks this benchmark as lower-is-better -- confirm
    whether that is intentional for the seed program.
    """
    population = np.random.rand(population_size, num_intervals) * 2 - 1

    best_solution = None
    # Start below any attainable fitness so the first generation always
    # records a best individual (a 0.0 start could leave best_solution None).
    best_fitness = -np.inf

    # Same crossover-point range as before for num_intervals >= 3; widened just
    # enough for num_intervals == 2, where randint(1, 1) used to raise.
    crossover_high = max(num_intervals - 1, 2)
    can_crossover = num_intervals >= 2

    for gen in range(generations):

        fitness_scores = np.array([calculate_c3_upper_bound(individual) for individual in population])

        current_best_idx = np.argmax(fitness_scores)
        if fitness_scores[current_best_idx] > best_fitness:
            best_fitness = fitness_scores[current_best_idx]
            best_solution = population[current_best_idx].copy()

        # Binary tournament selection.
        new_population = np.zeros_like(population)
        for i in range(population_size):
            competitors_indices = np.random.choice(population_size, 2, replace=False)
            winner_idx = competitors_indices[np.argmax(fitness_scores[competitors_indices])]
            new_population[i] = population[winner_idx].copy()

        # Single-point crossover on consecutive pairs. Stop at population_size - 1
        # so an odd-sized population leaves its last individual unpaired
        # instead of indexing past the end.
        for i in range(0, population_size - 1, 2):
            if np.random.rand() < crossover_rate and can_crossover:
                # Copy the parents: rows are views, and without the copy the
                # first assignment overwrote parent1 before the second child
                # was built, so child 2 was always identical to parent2.
                parent1 = new_population[i].copy()
                parent2 = new_population[i+1].copy()
                crossover_point = np.random.randint(1, crossover_high)
                new_population[i] = np.concatenate((parent1[:crossover_point], parent2[crossover_point:]))
                new_population[i+1] = np.concatenate((parent2[:crossover_point], parent1[crossover_point:]))

        # Mutate at most one gene per individual.
        for i in range(population_size):
            if np.random.rand() < mutation_rate:
                mutation_point = np.random.randint(num_intervals)
                new_population[i, mutation_point] += np.random.normal(0, 0.1)

                new_population[i, mutation_point] = np.clip(new_population[i, mutation_point], -2, 2)

        population = new_population

    return best_solution


def find_better_c3_upper_bound():
    """Run the genetic algorithm with the built-in settings and return its best height sequence."""
    NUM_INTERVALS = 4
    POPULATION_SIZE = 2
    GENERATIONS = 10
    MUTATION_RATE = 0.1
    CROSSOVER_RATE = 0.8

    height_sequence_3 = genetic_algorithm(POPULATION_SIZE, NUM_INTERVALS, GENERATIONS, MUTATION_RATE, CROSSOVER_RATE)

    return height_sequence_3
def verify_kissing_configuration(sphere_centers: np.ndarray, atol: float = 1e-9):
    """
    Verifies if the given points form a valid kissing number configuration.

    A valid kissing configuration of N vectors in D dimensions must satisfy:
    1. All vectors are unit vectors (norm is 1).
    2. The dot product of any two distinct vectors is at most 0.5.

    Args:
        sphere_centers: An array-like of shape (N, D) where N is the number of spheres
                        and D is the dimension.
        atol: Absolute tolerance for floating point comparisons.

    Raises:
        AssertionError: If the configuration is not valid.
        ValueError: If the input is not 2-dimensional (or has no rows).
    """
    centers = np.asarray(sphere_centers, dtype=float)
    # Tuple-unpacking rejects anything that is not 2-D with a ValueError.
    _num_spheres, _dimension = centers.shape

    # 1. Check if all vectors are unit vectors.
    # NOTE: np.allclose also applies its default rtol (1e-5) on top of atol;
    # kept as-is so previously accepted configurations stay accepted.
    norms = np.linalg.norm(centers, axis=1)
    if not np.allclose(norms, 1.0, atol=atol):
        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        raise AssertionError(
            f"Verification failed: Not all vectors are unit vectors. Norms range from {np.min(norms)} to {np.max(norms)}."
        )

    # 2. Check the dot products.
    # The dot product of two distinct vectors must be <= 0.5.
    dot_products = centers @ centers.T

    # The diagonal holds each vector's dot product with itself (1.0 for unit
    # vectors); mask it out so only distinct pairs are considered.
    np.fill_diagonal(dot_products, -np.inf)

    max_dot_product = np.max(dot_products)

    # The condition is dot_product <= 0.5
    if not max_dot_product <= 0.5 + atol:
        raise AssertionError(
            f"Verification failed: Maximum dot product between distinct vectors is {max_dot_product}, which is greater than 0.5."
        )


def evaluate(program_path: str):
    """
    Evaluate a program that solves the kissing number problem.

    The program is imported as module ``program``; the configuration is read
    from its ``sphere_centers`` attribute or, failing that, from the return
    value of ``main()``.

    Returns dict with key 'score' = number of spheres (larger is better).
    On failure/invalid, returns {'score': -1.0, ...}.
    """
    try:
        # Use importlib.util to dynamically load the program module
        import importlib.util

        # Load the module from the given path
        spec = importlib.util.spec_from_file_location("program", program_path)
        program = importlib.util.module_from_spec(spec)
        sys.modules["program"] = program
        spec.loader.exec_module(program)

        # Look for sphere_centers in the loaded module
        sphere_centers = None
        if hasattr(program, 'sphere_centers'):
            sphere_centers = program.sphere_centers
        elif hasattr(program, 'main'):
            # If there's a main function, try calling it to get sphere_centers
            sphere_centers = program.main()

        if sphere_centers is None:
            return {"score": -1.0, "no_sphere_centers": True}

        # Accept nested lists as well as arrays; `.shape` is needed below.
        sphere_centers = np.asarray(sphere_centers, dtype=float)

        # Verify the kissing configuration
        verify_kissing_configuration(sphere_centers)

        # Calculate metrics
        num_spheres = sphere_centers.shape[0]
        dimension = sphere_centers.shape[1]

        # Return metrics with 'score'
        return {
            "score": float(num_spheres),
            "num_spheres": float(num_spheres),
            "dimension": float(dimension)
        }

    except Exception:
        # Any load/verification failure is reported with the traceback attached.
        return {"score": -1.0, "evaluation_error": True, "stderr": traceback.format_exc()}


if __name__ == "__main__":
    # CLI for debugging: evaluate initial_program.py by default, or a provided path
    try:
        default_path = os.path.join(os.path.dirname(__file__), "initial_program.py")
    except Exception:
        default_path = "initial_program.py"

    target = sys.argv[1] if len(sys.argv) > 1 else default_path
    print(json.dumps(evaluate(target), ensure_ascii=False, indent=2))
import os 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from mpl_toolkits.mplot3d import Axes3D # noqa: F401 (needed for 3D) 9 | 10 | def _normalize_rows(P): 11 | nrm = np.linalg.norm(P, axis=1, keepdims=True) 12 | nrm = np.maximum(nrm, 1e-12) 13 | return P / nrm 14 | 15 | def load_from_module(module_path: str): 16 | """ 17 | 动态加载模块: 18 | - 优先读取全局变量 `points` 19 | - 否则调用 `main()` 获取返回值 20 | """ 21 | module_path = os.path.abspath(module_path) 22 | spec = importlib.util.spec_from_file_location("points_mod", module_path) 23 | if spec is None or spec.loader is None: 24 | raise RuntimeError(f"无法加载模块: {module_path}") 25 | mod = importlib.util.module_from_spec(spec) 26 | spec.loader.exec_module(mod) 27 | 28 | if hasattr(mod, "points"): 29 | pts = getattr(mod, "points") 30 | elif hasattr(mod, "main"): 31 | pts = mod.main() 32 | else: 33 | raise RuntimeError("模块中既无 `points` 变量,也无 `main()` 函数可获取点。") 34 | 35 | pts = np.asarray(pts, dtype=float) 36 | if pts.ndim != 2 or pts.shape[1] != 3: 37 | raise ValueError(f"模块返回的点形状异常: {pts.shape}, 期望 (N, 3)") 38 | return _normalize_rows(pts) 39 | 40 | def load_from_npy(npy_path: str): 41 | pts = np.load(npy_path) 42 | pts = np.asarray(pts, dtype=float) 43 | if pts.ndim != 2 or pts.shape[1] != 3: 44 | raise ValueError(f"npy 形状异常: {pts.shape}, 期望 (N, 3)") 45 | return _normalize_rows(pts) 46 | 47 | def load_from_csv(csv_path: str): 48 | pts = np.loadtxt(csv_path, delimiter=",") 49 | pts = np.asarray(pts, dtype=float) 50 | if pts.ndim != 2 or pts.shape[1] != 3: 51 | raise ValueError(f"csv 形状异常: {pts.shape}, 期望 (N, 3)") 52 | return _normalize_rows(pts) 53 | 54 | def min_pairwise_angle_deg(P): 55 | """ 56 | 返回: 57 | - 最小夹角(度) 58 | - 对应的最大余弦相似度(最近的一对点) 59 | """ 60 | # 计算上三角的点积 61 | dot = P @ P.T 62 | n = len(P) 63 | mask = np.triu(np.ones((n, n), dtype=bool), k=1) 64 | vals = dot[mask] 65 | max_cos = np.max(vals) if vals.size else 1.0 66 | max_cos = np.clip(max_cos, -1.0, 1.0) 67 | ang_min = np.degrees(np.arccos(max_cos)) 68 | 
return ang_min, max_cos 69 | 70 | def plot_points_on_sphere(P, title="Spherical Point Set"): 71 | fig = plt.figure(figsize=(7, 7)) 72 | ax = fig.add_subplot(111, projection="3d") 73 | ax.set_box_aspect([1,1,1]) 74 | 75 | # 画单位球面网格 76 | u = np.linspace(0, 2*np.pi, 60) 77 | v = np.linspace(0, np.pi, 30) 78 | x = np.outer(np.cos(u), np.sin(v)) 79 | y = np.outer(np.sin(u), np.sin(v)) 80 | z = np.outer(np.ones_like(u), np.cos(v)) 81 | ax.plot_surface(x, y, z, alpha=0.15, linewidth=0, antialiased=True) 82 | 83 | # 画点 84 | ax.scatter(P[:,0], P[:,1], P[:,2], s=40, depthshade=True) 85 | 86 | # 坐标与视角 87 | ax.set_xlabel("x") 88 | ax.set_ylabel("y") 89 | ax.set_zlabel("z") 90 | ax.set_title(title) 91 | ax.view_init(elev=20, azim=45) 92 | 93 | plt.tight_layout() 94 | plt.show() 95 | 96 | def main(): 97 | parser = argparse.ArgumentParser( 98 | description="可视化 S^2 上的点集(从模块、.npy 或 .csv 读取)" 99 | ) 100 | src = parser.add_mutually_exclusive_group(required=True) 101 | src.add_argument("--from-module", type=str, help="包含 points 或 main() 的 .py 文件路径(例如 sphere_points.py)") 102 | src.add_argument("--from-npy", type=str, help="N×3 的 .npy 文件路径") 103 | src.add_argument("--from-csv", type=str, help="N×3 的 .csv 文件路径(逗号分隔)") 104 | parser.add_argument("--title", type=str, default="Spherical Point Set", help="图标题") 105 | args = parser.parse_args() 106 | 107 | if args.from_module: 108 | P = load_from_module(args.from_module) 109 | elif args.from_npy: 110 | P = load_from_npy(args.from_npy) 111 | else: 112 | P = load_from_csv(args.from_csv) 113 | 114 | ang_min_deg, max_cos = min_pairwise_angle_deg(P) 115 | print(f"N = {len(P)}") 116 | print(f"最小两点夹角 ≈ {ang_min_deg:.4f}°") 117 | print(f"最近对的余弦相似度(最大点积) ≈ {max_cos:.6f}") 118 | 119 | plot_points_on_sphere(P, title=args.title) 120 | 121 | if __name__ == "__main__": 122 | main() 123 | -------------------------------------------------------------------------------- /reward_model/grade.py: 
def evaluate_and_compare(data: List[Dict], model_results: List[Dict]) -> Dict:
    """Compare model scores against average human ratings.

    A negative model score marks an invalid prediction: its difference is
    recorded as -1 and it is left out of MAE, MSE and accuracy. Accuracy is
    the share of valid items whose model score and average rating fall on
    the same side of the 5.5 threshold.

    Args:
        data: the evaluation records (not read here).
        model_results: one dict per item with keys "score", "avg_rating",
            "evaluation" and "abstract".

    Returns:
        Dict with the per-item lists plus "differences", "mae", "mse" and
        "accuracy".
    """
    model_scores = [entry["score"] for entry in model_results]
    avg_ratings = [entry["avg_rating"] for entry in model_results]
    evaluations = [entry["evaluation"] for entry in model_results]
    abstracts = [entry["abstract"] for entry in model_results]

    scored_pairs = list(zip(model_scores, avg_ratings))

    # Absolute error per item; -1 flags an invalid (negative) model score.
    differences = [abs(score - rating) if score >= 0 else -1 for score, rating in scored_pairs]

    # (true_label, predicted_label) for valid items: 1 means "above 5.5".
    label_pairs = [
        (1 if rating > 5.5 else 0, 1 if score > 5.5 else 0)
        for score, rating in scored_pairs
        if score >= 0
    ]

    usable = [gap for gap in differences if gap >= 0]
    mae = np.mean(usable) if usable else 0.0
    mse = np.mean([gap ** 2 for gap in usable]) if usable else 0.0

    hits = sum(1 for truth, guess in label_pairs if truth == guess)
    accuracy = hits / len(label_pairs) if label_pairs else 0.0

    return {
        "model_scores": model_scores,
        "avg_ratings": avg_ratings,
        "evaluations": evaluations,
        "abstracts": abstracts,
        "differences": differences,
        "mae": mae,
        "mse": mse,
        "accuracy": accuracy,
    }


def print_results(results: Dict):
    """Print a per-item comparison table followed by the summary statistics."""
    print(f"{'Index':<6} {'Model Score':<12} {'Avg Rating':<12} {'Difference':<12} {'Evaluation':<50}")
    print("-" * 100)
    for idx, model_score in enumerate(results["model_scores"]):
        full_text = results["evaluations"][idx]
        # Keep the table readable: long evaluations are cut to 47 chars + "...".
        if len(full_text) > 47:
            eval_snippet = full_text[:47] + "..."
        else:
            eval_snippet = full_text
        gap = results["differences"][idx]
        diff = gap if gap >= 0 else "N/A"
        print(f"{idx+1:<6} {model_score:<12.2f} {results['avg_ratings'][idx]:<12.2f} {diff:<12} {eval_snippet:<50}")
    print("\nSummary Statistics (excluding invalid scores):")
    print(f"Mean Absolute Error (MAE): {results['mae']:.2f}")
    print(f"Mean Squared Error (MSE): {results['mse']:.2f}")
    print(f"Prediction Accuracy: {results['accuracy']:.2%}")  # Display accuracy as percentage


def main():
    """Score the evaluation set with the reward model and save the comparison as JSON and CSV."""
    try:
        records = load_dataset('json', data_files='/data/zhuotaodeng/yzj/alpha-research/data/iclr2025_eval_100.json', split='train')
        data = [dict(item) for item in records]  # Convert to List[Dict]
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    model_results = score_abstracts_with_vllm(data, model_name="/data/zhuotaodeng/yzj/download_from_modelscope/Qwen/Qwen3-8B")
    # model_results = score_abstracts_with_vllm(data, '/data/zhuotaodeng/yzj/alpha-research/model/qwen25_grm_iclr_boxed/checkpoint-120')
    # model_results = score_abstracts_with_api(data, '/data/zhuotaodeng/yzj/alpha-research/idea-eval/results.jsonl')

    results = evaluate_and_compare(data, model_results)

    print_results(results)

    with open("vllm_evaluation_results.json", "w") as f:
        json.dump(results, f, indent=4)
    print("\nResults saved to 'vllm_evaluation_results.json'")

    columns = {
        "Abstract": results["abstracts"],
        "Model_Score": results["model_scores"],
        "Avg_Rating": results["avg_ratings"],
        "Difference": results["differences"],
        "Evaluation": results["evaluations"],
    }
    df = pd.DataFrame(columns)
    df.to_csv("vllm_evaluation_results.csv", index=False)
    print("Results also saved to 'vllm_evaluation_results.csv'")


if __name__ == "__main__":
    main()
class OpenAILLM(LLMInterface):
    """LLM interface using OpenAI-compatible APIs"""

    def __init__(
        self,
        model_cfg: Optional[Any] = None,
    ):
        """Copy the generation settings from ``model_cfg`` and create the async client.

        ``model_cfg`` must expose the attributes read below (``name``,
        ``system_message``, ``temperature``, ``top_p``, ``max_tokens``,
        ``timeout``, ``retries``, ``retry_delay``, ``api_base``, ``api_key``).

        Raises:
            ValueError: if ``model_cfg`` is None.
        """
        if model_cfg is None:
            # The default exists only for signature compatibility; every
            # attribute below is required.
            raise ValueError("OpenAILLM requires a model configuration (model_cfg is None)")

        self.model = model_cfg.name
        self.system_message = model_cfg.system_message
        self.temperature = model_cfg.temperature
        self.top_p = model_cfg.top_p
        self.max_tokens = model_cfg.max_tokens
        self.timeout = model_cfg.timeout
        self.retries = model_cfg.retries
        self.retry_delay = model_cfg.retry_delay
        self.api_base = model_cfg.api_base
        self.api_key = model_cfg.api_key

        # Set up async API client
        self.client = AsyncOpenAI(
            api_key=self.api_key,
            base_url=self.api_base,
        )
        # self.client = AsyncOpenAI(
        #     api_key=self.api_key,
        #     azure_endpoint=self.api_base,
        #     api_version="2024-12-01-preview",
        # )

        logger.info(f"Initialized OpenAI LLM with model: {self.model}")

    async def generate(self, prompt: str, **kwargs) -> str:
        """Generate text from a prompt"""
        return await self.generate_with_context(
            system_message=self.system_message,
            messages=[{"role": "user", "content": prompt}],
            **kwargs,
        )

    async def generate_with_context(
        self, system_message: str, messages: List[Dict[str, str]], **kwargs
    ) -> str:
        """Generate text using a system message and conversational context

        Keyword overrides (each defaults to the configured value):
        ``temperature``, ``top_p``, ``max_tokens``, ``retries``,
        ``retry_delay``, ``timeout``.

        The call is attempted ``retries + 1`` times, sleeping ``retry_delay``
        seconds between attempts; the last failure is re-raised.
        """
        # Prepare messages with system message
        formatted_messages = [{"role": "system", "content": system_message}]
        formatted_messages.extend(messages)

        # Set up generation parameters
        if self.api_base == "https://api.openai.com/v1" and str(self.model).lower().startswith("o"):
            # For o-series models: no sampling parameters are sent and the
            # token limit goes under `max_completion_tokens`.
            params = {
                "model": self.model,
                "messages": formatted_messages,
                "max_completion_tokens": kwargs.get("max_tokens", self.max_tokens),
            }
        else:
            params = {
                "model": self.model,
                "messages": formatted_messages,
                "temperature": kwargs.get("temperature", self.temperature),
                "top_p": kwargs.get("top_p", self.top_p),
                "max_tokens": kwargs.get("max_tokens", self.max_tokens),
            }

        # Attempt the API call with retries
        retries = kwargs.get("retries", self.retries)
        retry_delay = kwargs.get("retry_delay", self.retry_delay)
        timeout = kwargs.get("timeout", self.timeout)

        for attempt in range(retries + 1):
            try:
                response = await asyncio.wait_for(self._call_api(params), timeout=timeout)
                return response
            except asyncio.TimeoutError:
                if attempt < retries:
                    logger.warning(f"Timeout on attempt {attempt + 1}/{retries + 1}. Retrying...")
                    await asyncio.sleep(retry_delay)
                else:
                    logger.error(f"All {retries + 1} attempts failed with timeout")
                    raise
            except Exception as e:
                if attempt < retries:
                    logger.warning(
                        f"Error on attempt {attempt + 1}/{retries + 1}: {str(e)}. Retrying..."
                    )
                    await asyncio.sleep(retry_delay)
                else:
                    logger.error(f"All {retries + 1} attempts failed with error: {str(e)}")
                    raise

    async def _call_api(self, params: Dict[str, Any]) -> str:
        """Make the actual API call"""
        # Use native async API call
        response = await self.client.chat.completions.create(**params)
        content = response.choices[0].message.content

        # Logging of system prompt, user message(s) and response content.
        # All messages are joined instead of indexing [0] and [1]: a request
        # with a different number of messages used to raise IndexError here,
        # after the API call had already succeeded, and trigger a retry.
        prompt = '\n'.join(str(message.get("content", "")) for message in params.get("messages", []))
        logger.info('=' * 100)
        logger.info(f"API parameters: {prompt}")
        logger.info('=' * 100)
        logger.info(f"API response: {content}")
        logger.info('=' * 100)
        return content

2 | AlphaResearch 3 |

4 | 5 |

6 | [🌐 Website] • 7 | [📜 Paper] • 8 | [🤗 HF Models] • 9 | [🐱 GitHub] 10 |

11 |

12 | Repo for "AlphaResearch: Accelerating New Algorithm Discovery with Language Models" 13 |

14 | 15 | 16 |
17 | 18 | alpha-research 19 |
20 | Figure 1: Comparison of OpenEvolve (with program-based reward), ShinkaEvolve (with program-based reward) and AlphaResearch (with program-based and peer-review reward). 21 |
22 | 23 | # News 24 | 25 | - [2025/11/12] 🔥🔥🔥 [AlphaResearch-RM-7B](https://huggingface.co/alpha-research/AlphaResearch-RM-Qwen-7B) released at [🤗 HuggingFace](https://huggingface.co/alpha-research)! 26 | - [2025/11/12] AlphaResearch paper, repo, and website released. 27 | 28 | ## AlphaResearch Pipeline 29 | 30 | alpha-research 31 |
32 | Figure 2: The launch of AlphaResearch contains two manual steps. 33 | (1) Train reward models with real-world peer-reviewed records. (2) Prepare initial research proposals, initial programs and evaluation 34 | program. 35 | 36 | 37 | ## 🚀 Run AlphaResearch 38 | 39 | If you have `initial_program.py` and `initial_proposal.txt`, please run 40 | ``` 41 | cd alpha-research 42 | python run.py 43 | ``` 44 | 45 | ## ⚖️ Benchmark 46 | 47 | The benchmark problems in AlphaResearchComp. AlphaEvolve has not publicly disclosed all the test problems so far. To provide a more transparent evaluation, we curate and open source a set of 8 frontier program-based 48 | research tasks spanning geometry, number theory, harmonic analysis, and combinatorial optimization. 49 | They are either refined from prior work (e.g., 50 | AlphaEvolve) or collected from online repositories and domain experts. 51 | 52 | | Problem | Human Best | Human Researcher | 53 | |---------|------------|------------------| 54 | | Packing circles (n=26) | 2.634 | David Cantrell (2011) | 55 | | Packing circles (n=32) | 2.936 | Eckard Specht (2012) | 56 | | Minimizing max-min distance ratio (d=2, n=16) | 12.89 | David Cantrell (2009) | 57 | | Third autocorrelation inequality | 1.4581 | Carlos Vinuesa (2009) | 58 | | Spherical code (n=30) minimizing upper bound | 0.67365 | Hardin & Sloane (1996,2002) | 59 | | Autoconvolution peak minimization (upper bound) | 0.755 | Matolcsi-Vinuesa (2010) | 60 | | Littlewood polynomials (n=512) | 32 | Rudin-Shapiro (1946/1952) | 61 | | MSTSD (n=30) | 1.04 | Hegarty (2006/2007) | 62 | 63 | ## ⚙️ Results 64 | Results on AlphaResearchComp. ↑ indicates that higher score is better and ↓ for lower.
65 | 66 | | Problem | Human | AlphaResearch init | best | Excel@best | 67 | |---------|-------|---------------------|------|------------| 68 | | Packing circles (n=26) ↑ | 2.634 | 0 | 2.636 | 0.32% | 69 | | Packing circles (n=32) ↑ | 2.936 | 0 | 2.939 | 0.10% | 70 | | Minimizing max-min distance ratio ↓ | 12.89 | 15.55 | 12.92 | -0.23% | 71 | | Third autocorrelation inequality ↓ | 1.458 | 35.746 | 1.546 | -6.03% | 72 | | Spherical code (d=3, n=30) ↑ | 0.6736 | 0.5130 | 0.6735 | -0.01% | 73 | | Autoconvolution peak minimization ↓ | 0.755 | 1.512 | 0.756 | -0.13% | 74 | | Littlewood polynomials (n=512) ↑ | 32 | 32 | 32 | 0% | 75 | | MSTSD (n=30) ↑ | 1.04 | 1.04 | 1.04 | 0% | 76 | 77 | ## 🤖 EvolveAgent 78 | 79 | We use [OpenEvolve](https://github.com/codelion/openevolve) as our evolutionary agent. 80 | 81 | ## 🌲 Reward Model 82 | 83 | We train Qwen2.5-7B-Instruct with ICLR(2017-2024) papers as our reward model. 84 | 85 | 86 | - Train Dataset: Abstract and Review Score of ICLR 2017-2024 papers (24,445 in total) (knowledge cut-off date: Dec, 2023) 87 | 88 | - Evaluation Dataset: Abstract and Review Score of 100 ICLR 2025 papers 89 | (ICLR2025 Rebuttal started at Dec, 2024) 90 | 91 | - Metric: positive score (>5.5), negative score(<=5.5), binary classification 92 | 93 | ### ⚡️ Training 94 | 95 | We open-source our complete training scripts for the community, and you may construct your own dataset for training. 
96 | To train a model, run the following command: 97 | 98 | ```sh 99 | bash alpha-research/reward_model/train/script/train_qwen.sh 100 | ``` 101 | 102 | ### 🪁 RM Results 103 | 104 | | Model | Released Date (Knowledge Cutoff) | Accuracy (Binary) | 105 | | --- | --- | --- | 106 | | Human | Mar, 2025 (potential leakage) | 65.0% | 107 | | GPT-5 (medium) | Mar, 2025 (potential leakage) | 53.0% | 108 | | Qwen2.5-7B-Instruct | Sep, 2024 | 37.0% | 109 | | [AlphaResearch-RM-7B](https://huggingface.co/alpha-research/AlphaResearch-RM-Qwen-7B) | Sep, 2024 | 72.0% | 110 | 111 | ## 📖 License 112 | 113 | This code repository is licensed under the MIT License. 114 | 115 | ## ☕️ Citation 116 | 117 | If you find this repository helpful, please consider citing our paper: 118 | 119 | ``` 120 | @article{yu2025alpharesearch, 121 | title={AlphaResearch: Accelerating New Algorithm Discovery with Language Models}, 122 | author={Yu, Zhaojian and Feng, Kaiyue and Zhao, Yilun and He, Shilin and Zhang, Xiao-Ping and Cohan, Arman}, 123 | journal={arXiv preprint arXiv:2511.08522}, 124 | year={2025} 125 | } 126 | ``` 127 | -------------------------------------------------------------------------------- /evolve_agent/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-line interface for EvolveAgent 3 | """ 4 | 5 | import argparse 6 | import asyncio 7 | import logging 8 | import os 9 | import sys 10 | from typing import Dict, List, Optional 11 | 12 | from evolve_agent import EvolveAgent 13 | from evolve_agent.config import Config, load_config 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def parse_args() -> argparse.Namespace: 19 | """Parse command-line arguments""" 20 | parser = argparse.ArgumentParser(description="EvolveAgent - Evolutionary coding agent") 21 | 22 | parser.add_argument("initial_program", help="Path to the initial program file") 23 | 24 | parser.add_argument( 25 | "evaluation_file", help="Path to the 
def _find_latest_checkpoint(checkpoint_dir: str) -> Optional[str]:
    """Return the path of the highest-numbered checkpoint directory, or None.

    Only sub-directories of *checkpoint_dir* are considered. The iteration
    number is taken from the text after the last underscore of the directory
    *name* (e.g. ``checkpoint_50`` -> 50).
    """
    if not os.path.isdir(checkpoint_dir):
        return None

    candidates = [
        os.path.join(checkpoint_dir, entry)
        for entry in os.listdir(checkpoint_dir)
        if os.path.isdir(os.path.join(checkpoint_dir, entry))
    ]
    if not candidates:
        return None

    def iteration_of(path: str) -> int:
        # Inspect the basename only: the parent path (for example
        # "evolve_agent_output/checkpoints") may itself contain underscores,
        # which previously made int() fail on unrelated path segments.
        suffix = os.path.basename(path).rpartition("_")[2]
        try:
            return int(suffix)
        except ValueError:
            # Directories without a numeric suffix sort below every real checkpoint.
            return -1

    return sorted(candidates, key=iteration_of)[-1]


async def main_async() -> int:
    """
    Main asynchronous entry point

    Validates the input files, builds the configuration (applying any
    command-line overrides), optionally restores a checkpoint, runs the
    evolution and prints the metrics of the best program.

    Returns:
        Exit code (0 on success, 1 on any error)
    """
    args = parse_args()

    # Check if files exist
    if not os.path.exists(args.initial_program):
        print(f"Error: Initial program file '{args.initial_program}' not found")
        return 1

    if not os.path.exists(args.evaluation_file):
        print(f"Error: Evaluation file '{args.evaluation_file}' not found")
        return 1

    # Create config object with command-line overrides.
    # The config is only loaded here when an override is requested; otherwise
    # the config path is handed to EvolveAgent below.
    config = None
    if args.api_base or args.primary_model or args.secondary_model:
        # Load base config from file or defaults
        config = load_config(args.config)

        # Apply command-line overrides
        if args.api_base:
            config.llm.api_base = args.api_base
            print(f"Using API base: {config.llm.api_base}")

        if args.primary_model:
            config.llm.primary_model = args.primary_model
            print(f"Using primary model: {config.llm.primary_model}")

        if args.secondary_model:
            config.llm.secondary_model = args.secondary_model
            print(f"Using secondary model: {config.llm.secondary_model}")

    # Initialize EvolveAgent
    try:
        evolve_agent = EvolveAgent(
            initial_program_path=args.initial_program,
            evaluation_file=args.evaluation_file,
            config=config,
            config_path=args.config if config is None else None,
            output_dir=args.output,
        )

        # Load from checkpoint if specified
        if args.checkpoint:
            if not os.path.exists(args.checkpoint):
                print(f"Error: Checkpoint directory '{args.checkpoint}' not found")
                return 1
            print(f"Loading checkpoint from {args.checkpoint}")
            evolve_agent.database.load(args.checkpoint)
            print(
                f"Checkpoint loaded successfully (iteration {evolve_agent.database.last_iteration})"
            )

        # Override log level if specified
        if args.log_level:
            logging.getLogger().setLevel(getattr(logging, args.log_level))

        # Run evolution
        best_program = await evolve_agent.run(
            iterations=args.iterations,
            target_score=args.target_score,
        )

        # Get the checkpoint path
        latest_checkpoint = _find_latest_checkpoint(
            os.path.join(evolve_agent.output_dir, "checkpoints")
        )

        print("\nEvolution complete!")
        print("Best program metrics:")
        for name, value in best_program.metrics.items():
            # Handle mixed types: format numbers as floats, others as strings
            if isinstance(value, (int, float)):
                print(f"  {name}: {value:.4f}")
            else:
                print(f"  {name}: {value}")

        if latest_checkpoint:
            print(f"\nLatest checkpoint saved at: {latest_checkpoint}")
            print(f"To resume, use: --checkpoint {latest_checkpoint}")

        return 0

    except Exception as e:
        # Top-level boundary of the CLI: report the failure and turn it into
        # a non-zero exit code instead of letting it propagate.
        print(f"Error: {str(e)}")
        import traceback

        traceback.print_exc()
        return 1
@dataclass
class DataArguments:
    """Arguments describing the data used for training."""

    # Both paths default to None, so they are annotated Optional[str]
    # rather than str to keep the annotation and the default consistent.
    data_path: Optional[str] = field(
        default=None, metadata={"help": "Path to the training data."}
    )
    eval_data_path: Optional[str] = field(
        default=None, metadata={"help": "Path to the evaluation data."}
    )
    lazy_preprocess: bool = True
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Add special tokens to the tokenizer and grow the model embeddings to match.

    Rows created for newly added tokens are initialised with the mean of the
    pre-existing rows, for both the input and the output embedding matrices.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    added = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if added <= 0:
        # Nothing new was registered, so there are no fresh rows to initialise.
        return

    tables = (
        model.get_input_embeddings().weight.data,
        model.get_output_embeddings().weight.data,
    )
    for table in tables:
        # The trailing `added` rows belong to the new tokens; everything before
        # them is the original vocabulary whose mean seeds the new rows.
        table[-added:] = table[:-added].mean(dim=0, keepdim=True)
training_args.model_max_length > orig_ctx_len: 107 | scaling_factor = float(math.ceil(training_args.model_max_length / orig_ctx_len)) 108 | config.rope_scaling = {"type": "linear", "factor": scaling_factor} 109 | config.use_cache = False 110 | 111 | # Load model and tokenizer 112 | tokenizer = transformers.AutoTokenizer.from_pretrained( 113 | model_args.model_name_or_path, 114 | cache_dir=training_args.cache_dir, 115 | model_max_length = training_args.model_max_length, 116 | truncation = True, 117 | padding_side = "right", 118 | trust_remote_code = True, 119 | use_fast=True, 120 | ) 121 | 122 | # Load data 123 | if '.json' in data_args.data_path: 124 | data_module = make_supervised_data_module(tokenizer=tokenizer, data_path=data_args.data_path) 125 | else: 126 | train_dataset = load_dataset(data_args.data_path) 127 | data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) 128 | data_module = dict( 129 | train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator 130 | ) 131 | 132 | model = transformers.AutoModelForCausalLM.from_pretrained( 133 | model_args.model_name_or_path, 134 | config=config, 135 | cache_dir=training_args.cache_dir, 136 | use_flash_attention_2=True 137 | ) 138 | 139 | if local_rank == 0: 140 | print(config) 141 | print(model) 142 | 143 | # tokenizer.pad_token = tokenizer.unk_token 144 | special_tokens_dict = dict() 145 | if tokenizer.pad_token is None: 146 | special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN 147 | if tokenizer.eos_token is None: 148 | special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN 149 | if tokenizer.bos_token is None: 150 | special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN 151 | if tokenizer.unk_token is None: 152 | special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN 153 | 154 | smart_tokenizer_and_embedding_resize( 155 | special_tokens_dict=special_tokens_dict, 156 | tokenizer=tokenizer, 157 | model=model, 158 | ) 159 | 160 | # Start trainner 161 | trainer = Trainer( 162 | model=model, 
def parse_evolve_blocks(code: str) -> List[Tuple[int, int, str]]:
    """
    Collect the regions delimited by evolve-block markers

    A region opens on a line containing "# EVOLVE-BLOCK-START" and closes on
    the next line containing "# EVOLVE-BLOCK-END". A new start marker seen
    inside an open region restarts that region; a region that is never
    closed is not reported.

    Args:
        code: Source code with evolve blocks

    Returns:
        List of tuples (start_line, end_line, block_content), where the line
        numbers are 0-based indices of the marker lines and block_content is
        the text strictly between them
    """
    found = []
    opened_at = None  # index of the start marker of the region being read
    body = []

    for lineno, text in enumerate(code.split("\n")):
        if "# EVOLVE-BLOCK-START" in text:
            opened_at, body = lineno, []
            continue

        if opened_at is None:
            # Outside any region: ordinary lines and stray end markers are ignored.
            continue

        if "# EVOLVE-BLOCK-END" in text:
            found.append((opened_at, lineno, "\n".join(body)))
            opened_at = None
        else:
            body.append(text)

    return found
def parse_full_rewrite(llm_response: str, language: str = "python") -> Optional[str]:
    """
    Extract a full rewrite from an LLM response

    The first match of the following attempts wins:
      1. a fenced block tagged with ``language``;
      2. any fenced block whose opening fence carries an (optional) language
         tag, in which case the tag is not part of the returned code;
      3. any text between two triple-backtick fences;
      4. the raw response.

    Args:
        llm_response: Response from the LLM
        language: Programming language

    Returns:
        Extracted code (stripped), or the unmodified response when it
        contains no fenced block
    """
    # re.escape keeps names such as "c++" from being interpreted as regex syntax.
    tagged = re.search(
        r"```" + re.escape(language) + r"\n(.*?)```", llm_response, re.DOTALL
    )
    if tagged:
        return tagged.group(1).strip()

    # Fallback to any code block. Consume an identifier-like info string on the
    # fence line so that "```js\n..." does not yield "js" as the first code line.
    any_fenced = re.search(r"```[\w+#.-]*[ \t]*\n(.*?)```", llm_response, re.DOTALL)
    if any_fenced:
        return any_fenced.group(1).strip()

    # Fallback to fences without a line break after the opener (inline blocks).
    inline = re.search(r"```(.*?)```", llm_response, re.DOTALL)
    if inline:
        return inline.group(1).strip()

    # Fallback to plain text
    return llm_response
def calculate_edit_distance(code1: str, code2: str) -> int:
    """
    Calculate the Levenshtein edit distance between two code snippets

    Uses the classic dynamic programme but keeps only two rows in memory, so
    the space cost is proportional to the shorter input instead of the
    product of both lengths.

    Args:
        code1: First code snippet
        code2: Second code snippet

    Returns:
        Edit distance (number of operations needed to transform code1 into code2)
    """
    if code1 == code2:
        return 0

    # The distance is symmetric, so iterate rows over the longer string and
    # keep the row length tied to the shorter one.
    if len(code1) < len(code2):
        code1, code2 = code2, code1

    # previous[j] is the distance between the processed prefix of code1 and code2[:j].
    previous = list(range(len(code2) + 1))

    for i, char1 in enumerate(code1, start=1):
        current = [i]  # turning code1[:i] into "" takes i deletions
        for j, char2 in enumerate(code2, start=1):
            cost = 0 if char1 == char2 else 1
            current.append(
                min(
                    previous[j] + 1,  # deletion
                    current[j - 1] + 1,  # insertion
                    previous[j - 1] + cost,  # substitution
                )
            )
        previous = current

    return previous[-1]
# EvolveAgent Default Configuration
# This file contains all available configuration options with sensible defaults
# You can use this as a template for your own configuration

# General settings
max_iterations: 1000                  # Maximum number of evolution iterations
checkpoint_interval: 50               # Save checkpoints every N iterations
log_level: "INFO"                     # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
log_dir: null                         # Custom directory for logs (default: output_dir/logs)
random_seed: null                     # Random seed for reproducibility (null = random)

# Evolution settings
diff_based_evolution: true            # Use diff-based evolution (true) or full rewrites (false)
allow_full_rewrites: false            # Allow occasional full rewrites even in diff-based mode
max_code_length: 100000               # Maximum allowed code length in characters

# LLM configuration
llm:
  # Models for evolution
  models:
    # List of available models with their weights
    - name: "deepseek-chat"
      weight: 1.0
    # - name: "gemini-2.0-flash"
    #   weight: 0

  # Models for LLM feedback
  # evaluator_models:
  # # List of available models with their weights
  #   - name: "gemini-2.0-flash-lite"
  #     weight: 0.8
  #   - name: "gemini-2.0-flash"
  #     weight: 0.2

  # API configuration
  api_base: "https://api.deepseek.com"  # Base URL for API (change for non-OpenAI models)
  # SECURITY: never commit a real key to this file. Leave null to fall back to
  # the OPENAI_API_KEY environment variable. The key that used to be stored
  # here was published with the repository and should be revoked.
  api_key: null                         # API key (defaults to OPENAI_API_KEY env variable)

  # Generation parameters
  temperature: 0.7                      # Temperature for generation (higher = more creative)
  top_p: 0.95                           # Top-p sampling parameter
  max_tokens: 8192                      # Maximum tokens to generate

  # Request parameters
  timeout: 300                          # Timeout for API requests in seconds
  retries: 3                            # Number of retries for failed requests
  retry_delay: 5                        # Delay between retries in seconds

# Prompt configuration
prompt:
  template_dir: null                    # Custom directory for prompt templates
  # system_message: "You are an expert coder helping to improve programs through evolution."
  # evaluator_system_message: "You are an expert code reviewer."

  # Number of examples to include in the prompt
  num_top_programs: 3                   # Number of top-performing programs to include
  num_diverse_programs: 2               # Number of diverse programs to include

  # Template stochasticity
  use_template_stochasticity: true      # Use random variations in templates for diversity
  template_variations:                  # Different phrasings for parts of the template
    improvement_suggestion:
      - "Here's how we could improve this code:"
      - "I suggest the following improvements:"
      - "We can enhance this code by:"

  # Note: meta-prompting features are not yet implemented

# Database configuration
database:
  # General settings
  db_path: null                         # Path to persist database (null = in-memory only)
  in_memory: true                       # Keep database in memory for faster access
  log_prompts: true                     # If true, log all prompts and responses into the database

  # Evolutionary parameters
  population_size: 1000                 # Maximum number of programs to keep in memory
  archive_size: 100                     # Size of elite archive
  num_islands: 5                        # Number of islands for island model (separate populations)

  # Island-based evolution parameters
  # Islands provide diversity by maintaining separate populations that evolve independently.
  # Migration periodically shares the best solutions between adjacent islands.
  migration_interval: 50                # Migrate between islands every N generations
  migration_rate: 0.1                   # Fraction of top programs to migrate (0.1 = 10%)

  # Selection parameters
  elite_selection_ratio: 0.1            # Ratio of elite programs to select
  exploration_ratio: 0.2                # Ratio of exploration vs exploitation
  exploitation_ratio: 0.7               # Ratio of exploitation vs random selection
  # Note: diversity_metric is fixed to "edit_distance" (feature_based not implemented)

  # Feature map dimensions for MAP-Elites
  feature_dimensions:                   # Dimensions for MAP-Elites feature map
    - "score"                           # Performance score
    - "complexity"                      # Code complexity (length)
  feature_bins: 10                      # Number of bins per dimension

# Evaluator configuration
evaluator:
  # General settings
  timeout: 300                          # Maximum evaluation time in seconds
  max_retries: 3                        # Maximum number of retries for evaluation

  # Note: resource limits (memory_limit_mb, cpu_limit) are not yet implemented

  # Evaluation strategies
  cascade_evaluation: false             # Use cascade evaluation to filter bad solutions early
  cascade_thresholds:                   # Thresholds for advancing to next evaluation stage
    - 0.5                               # First stage threshold
    - 0.75                              # Second stage threshold
    - 0.9                               # Third stage threshold

  # Parallel evaluation
  parallel_evaluations: 4               # Number of parallel evaluations
  # Note: distributed evaluation is not yet implemented

  # LLM-based feedback (experimental)
  use_llm_feedback: false               # Use LLM to evaluate code quality
  llm_feedback_weight: 0.1              # Weight for LLM feedback in final score

# Reward model configuration
rewardmodel:
  model_type: vllm                      # Model type (vllm or api)
  model_name: /data/zhuotaodeng/yzj/alpha_research_model/qwen25_grm_iclr_boxed/checkpoint-180  # Model name (if null, uses default)
  temperature: 0.7                      # Temperature for generation
  top_p: 0.95                           # Top-p sampling parameter
  max_tokens: 4096                      # Maximum tokens to generate
  proposal_score_threshold: 5.5         # Only generate programs if proposal score >= threshold
  # api_key: <your-api-key>             # API key for API models (do not commit real keys)
  # base_url: https://api.deepseek.com  # Base URL for API models
  jsonl_file: "results/reward_results.jsonl"  # JSONL file for results
  max_retries: 50                       # Maximum number of retries
  retry_delay: 5                        # Delay between retries in seconds
def _tokenize_fn(
    strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer
) -> Dict:
    """Tokenize each string on its own and report the non-padding lengths.

    Returns a dict with ``input_ids`` / ``labels`` (the same list of 1-D id
    tensors) and ``input_ids_lens`` / ``labels_lens`` (the same list of
    per-example counts of tokens that differ from the pad token).
    """
    id_tensors = []
    lengths = []
    for text in strings:
        encoded = tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        # Each call encodes a single text, so row 0 is the whole example.
        id_tensors.append(encoded.input_ids[0])
        lengths.append(encoded.input_ids.ne(tokenizer.pad_token_id).sum().item())

    # Labels deliberately alias the input ids here; the caller copies them
    # before masking.
    return {
        "input_ids": id_tensors,
        "labels": id_tensors,
        "input_ids_lens": lengths,
        "labels_lens": lengths,
    }
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Batch supervised fine-tuning examples by right-padding them to equal length."""

    # Supplies the pad token id used both for padding and for the attention mask.
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Pad ``input_ids`` and ``labels`` of *instances* and build the attention mask."""
        id_seqs = [item["input_ids"] for item in instances]
        label_seqs = [item["labels"] for item in instances]

        padded_ids = torch.nn.utils.rnn.pad_sequence(
            id_seqs, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        # Label padding uses IGNORE_INDEX rather than the pad token id.
        padded_labels = torch.nn.utils.rnn.pad_sequence(
            label_seqs, batch_first=True, padding_value=IGNORE_INDEX
        )

        return {
            "input_ids": padded_ids,
            "labels": padded_labels,
            # Every position holding the pad token id is masked out.
            "attention_mask": padded_ids.ne(self.tokenizer.pad_token_id),
        }
question = example['question'].strip() 162 | messages = [ 163 | { 164 | "role": "system", 165 | "content": SYSTEM_PROMPT 166 | }, 167 | { 168 | "role": "user", 169 | "content": question 170 | }, 171 | { 172 | "role": "assistant", 173 | "content": '' + _MAGIC_SPLITTER_ 174 | } 175 | ] 176 | return tokenizer.apply_chat_template(messages, tokenize=False).split(_MAGIC_SPLITTER_)[0] 177 | 178 | if __name__ == '__main__': 179 | tokenizer = AutoTokenizer.from_pretrained('/work/zhuotaodeng/yzj/pretrained_models_ms/Qwen/Qwen2___5-7B-Instruct') 180 | # data_path = '/data/zhuotaodeng/test-time-scaling/z1/data/openthought_evol-221k.json' 181 | # ds = make_supervised_data_module(tokenizer, data_path) 182 | # save_dataset(ds['train_dataset'],'/data/zhuotaodeng/test-time-scaling/z1/data/qwen') 183 | 184 | ds = load_dataset('/data/zhuotaodeng/test-time-scaling/z1/data/qwen') 185 | data = ds[1] 186 | decoded_input = tokenizer.decode(data['input_ids'], skip_special_tokens=True) 187 | print("Decoded input_ids:", decoded_input) 188 | filtered_labels = data['labels'][data['labels'] != -100] 189 | decoded_labels = tokenizer.decode(filtered_labels, skip_special_tokens=True) 190 | print("Decoded labels:", decoded_labels) 191 | -------------------------------------------------------------------------------- /benchmark/heilbronn_in_the_unit_square/visualization.py: -------------------------------------------------------------------------------- 1 | # viz_min_triangle.py 2 | import argparse 3 | import importlib.util 4 | import json 5 | import os 6 | import sys 7 | from itertools import combinations 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | # ---------- 评测与辅助函数(与您 evaluator 一致/兼容) ---------- 14 | 15 | def _triangle_area(a, b, c) -> float: 16 | return abs((b[0]-a[0])*(c[1]-a[1]) - (b[1]-a[1])*(c[0]-a[0])) * 0.5 17 | 18 | def find_min_triangle(points: np.ndarray): 19 | """ 20 | 返回最小三角形:(i, j, k, min_area) 21 | 若点数<3,返回 (-1, -1, -1, 0.0) 22 | """ 23 | P = 
def evaluate_min_triangle_area(points: np.ndarray):
    """
    Compute the smallest-triangle metrics for a planar point set.

    According to the original author's note, these metrics are meant to match
    the ones produced by the benchmark evaluator:
      - min_area: area of the smallest triangle (larger is better)
      - scaled_min_area: n^(8/7 + 1/2000) * min_area
      - score: equal to min_area (larger is better)

    Args:
        points: Array-like convertible to a float array of shape (N, 2).

    Returns:
        Dict with keys valid, min_area, n, scaled_min_area, score and
        argmin_triplet. For input that is not an (N, 2) array with N >= 3,
        valid is 0.0, the areas are 0.0 and argmin_triplet is (-1, -1, -1).
    """
    pts = np.asarray(points, dtype=float)
    # len() raises TypeError on a 0-d array, so take the count from the shape
    # to let scalar input reach the "invalid" result below.
    count = pts.shape[0] if pts.ndim >= 1 else 0
    if pts.ndim != 2 or pts.shape[1] != 2 or count < 3:
        return dict(valid=0.0, min_area=0.0, n=float(count),
                    scaled_min_area=0.0, score=0.0, argmin_triplet=(-1, -1, -1))
    i, j, k, min_area = find_min_triangle(pts)
    n = float(count)
    exponent = (8.0 / 7.0) + (1.0 / 2000.0)
    scaled_min_area = (n ** exponent) * float(min_area)
    return dict(
        valid=1.0,
        min_area=float(min_area),
        n=n,
        scaled_min_area=float(scaled_min_area),
        score=float(min_area),
        argmin_triplet=(int(i), int(j), int(k))
    )
pts.ndim != 2 or pts.shape[1] != 2: 87 | raise ValueError(f"模块返回的点形状异常: {pts.shape}, 期望 (N,2)") 88 | return pts 89 | 90 | def load_from_npy(npy_path: str) -> np.ndarray: 91 | pts = np.load(npy_path) 92 | pts = np.asarray(pts, dtype=float) 93 | if pts.ndim != 2 or pts.shape[1] != 2: 94 | raise ValueError(f"npy 形状异常: {pts.shape}, 期望 (N,2)") 95 | return pts 96 | 97 | def load_from_csv(csv_path: str) -> np.ndarray: 98 | pts = np.loadtxt(csv_path, delimiter=",") 99 | pts = np.asarray(pts, dtype=float) 100 | if pts.ndim != 2 or pts.shape[1] != 2: 101 | raise ValueError(f"csv 形状异常: {pts.shape}, 期望 (N,2)") 102 | return pts 103 | 104 | 105 | # ---------- 可视化 ---------- 106 | 107 | def plot_points_and_min_triangle(points: np.ndarray, 108 | show_indices: bool = False, 109 | title_prefix: str = ""): 110 | pts = np.asarray(points, dtype=float) 111 | (i, j, k, amin) = find_min_triangle(pts) 112 | eval_res = evaluate_min_triangle_area(pts) 113 | 114 | fig, ax = plt.subplots(figsize=(6, 6)) 115 | ax.set_xlim(0, 1) 116 | ax.set_ylim(0, 1) 117 | ax.set_aspect("equal", adjustable="box") 118 | 119 | # 画所有点 120 | ax.scatter(pts[:, 0], pts[:, 1], s=40, zorder=2) 121 | 122 | # 可选:标注索引 123 | if show_indices: 124 | for idx, (x, y) in enumerate(pts): 125 | ax.text(x, y, str(idx), fontsize=9, ha="left", va="bottom") 126 | 127 | # 高亮最小三角形 128 | if i >= 0: 129 | tri = np.array([pts[i], pts[j], pts[k], pts[i]]) 130 | ax.plot(tri[:, 0], tri[:, 1], linewidth=2.5, zorder=3) 131 | ax.scatter(pts[[i, j, k], 0], pts[[i, j, k], 1], s=70, zorder=4) 132 | 133 | # 网格与边框 134 | ax.set_xticks(np.linspace(0, 1, 6)) 135 | ax.set_yticks(np.linspace(0, 1, 6)) 136 | ax.grid(True, linestyle="--", alpha=0.3) 137 | 138 | # 标题(包含指标) 139 | title = ( 140 | f"{title_prefix}min_area={eval_res['min_area']:.8f} | " 141 | f"scaled_min_area={eval_res['scaled_min_area']:.6f} | " 142 | f"score={eval_res['score']:.8f} | " 143 | f"argmin={eval_res['argmin_triplet']}" 144 | ) 145 | ax.set_title(title) 146 | plt.tight_layout() 
147 | plt.show() 148 | 149 | # 同时在 stdout 打一份 JSON,方便脚本化调用时抓数值 150 | out = { 151 | "min_area": eval_res["min_area"], 152 | "scaled_min_area": eval_res["scaled_min_area"], 153 | "score": eval_res["score"], 154 | "argmin_triplet": eval_res["argmin_triplet"], 155 | "n": int(eval_res["n"]), 156 | } 157 | print(json.dumps(out, ensure_ascii=False, indent=2)) 158 | 159 | 160 | # ---------- CLI ---------- 161 | 162 | def main(): 163 | parser = argparse.ArgumentParser( 164 | description="可视化 [0,1]^2 中点集,并高亮最小三角形(支持模块 / .npy / .csv)。若未提供来源,将自动尝试读取同目录下的 points.npy" 165 | ) 166 | src = parser.add_mutually_exclusive_group(required=False) 167 | src.add_argument("--from-module", type=str, help="含 points 或 main() 的 Python 文件路径") 168 | src.add_argument("--from-npy", type=str, help="N×2 的 .npy 路径") 169 | src.add_argument("--from-csv", type=str, help="N×2 的 .csv 路径(逗号分隔)") 170 | parser.add_argument("--show-indices", action="store_true", help="是否标注点索引") 171 | parser.add_argument("--title", type=str, default="", help="标题前缀") 172 | args = parser.parse_args() 173 | 174 | if args.from_module: 175 | P = load_from_module(args.from_module) 176 | elif args.from_npy: 177 | P = load_from_npy(args.from_npy) 178 | elif args.from_csv: 179 | P = load_from_csv(args.from_csv) 180 | else: 181 | # 自动读取默认的 points.npy(位于本脚本同目录) 182 | default_path = os.path.join(os.path.dirname(__file__), "points.npy") 183 | if not os.path.exists(default_path): 184 | print( 185 | "未提供输入来源,且未在本目录找到 points.npy。请先运行 initial_program.py 生成 points.npy,或通过 --from-* 指定输入。", 186 | file=sys.stderr, 187 | ) 188 | sys.exit(2) 189 | P = load_from_npy(default_path) 190 | 191 | # 可选:如果任务要求必须在 [0,1]^2,可以做个提示(不改变数值) 192 | if not (np.all(P >= 0.0) and np.all(P <= 1.0)): 193 | print("⚠️ 警告:存在越界点(不在 [0,1]^2),图中仍会显示。", file=sys.stderr) 194 | 195 | plot_points_and_min_triangle(P, show_indices=args.show_indices, title_prefix=args.title) 196 | 197 | if __name__ == "__main__": 198 | main() 199 | 
@dataclasses.dataclass
class OpenAIDecodingArguments(object):
    """Decoding options that ``openai_completion`` forwards as keyword arguments.

    Every field is passed to ``openai.Completion.create`` via ``__dict__``.
    """

    max_tokens: int = 1800
    temperature: float = 0.2
    top_p: float = 1.0
    n: int = 1
    stream: bool = False
    stop: Optional[Sequence[str]] = None
    presence_penalty: float = 0.0
    frequency_penalty: float = 0.0
    suffix: Optional[str] = None
    logprobs: Optional[int] = None
    echo: bool = False


def openai_completion(
    prompts: Union[str, Sequence[str], Sequence[dict[str, str]], dict[str, str]],
    decoding_args: OpenAIDecodingArguments,
    model_name="text-davinci-003",
    sleep_time=2,
    batch_size=1,
    max_instances=sys.maxsize,
    max_batches=sys.maxsize,
    return_text=False,
    **decoding_kwargs,
) -> "Union[StrOrOpenAIObject, Sequence[StrOrOpenAIObject], Sequence[Sequence[StrOrOpenAIObject]]]":
    """Decode with OpenAI API.

    Args:
        prompts: A string or a list of strings to complete. If it is a chat model the strings should be formatted
            as explained here: https://github.com/openai/openai-python/blob/main/chatml.md. If it is a chat model
            it can also be a dictionary (or list thereof) as explained here:
            https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
        decoding_args: Decoding arguments.
        model_name: Model name. Can be either in the format of "org/model" or just "model".
        sleep_time: Time to sleep once the rate-limit is hit.
        batch_size: Number of prompts to send in a single request. Only for non chat model.
        max_instances: Maximum number of prompts to decode.
        max_batches: Maximum number of batches to decode. This argument will be deprecated in the future.
        return_text: If True, return text instead of full completion object (which contains things like logprob).
        decoding_kwargs: Additional decoding arguments. Pass in `best_of` and `logit_bias` if you need them.

    Returns:
        A completion or a list of completions.
        Depending on return_text and decoding_args.n, the completion type can be one of
        - a string (if return_text is True)
        - an openai_object.OpenAIObject object (if return_text is False)
        - a list of objects of the above types (if decoding_args.n > 1)
    """
    is_single_prompt = isinstance(prompts, (str, dict))
    if is_single_prompt:
        prompts = [prompts]

    if max_batches < sys.maxsize:
        logging.warning(
            "`max_batches` will be deprecated in the future, please use `max_instances` instead."
            "Setting `max_instances` to `max_batches * batch_size` for now."
        )
        max_instances = max_batches * batch_size

    prompts = prompts[:max_instances]
    num_prompts = len(prompts)
    prompt_batches = [
        prompts[batch_id * batch_size : (batch_id + 1) * batch_size]
        for batch_id in range(int(math.ceil(num_prompts / batch_size)))
    ]

    completions = []
    for prompt_batch in tqdm.tqdm(
        prompt_batches,
        desc="prompt_batches",
        total=len(prompt_batches),
    ):
        # Work on a copy so shrinking max_tokens for one batch does not leak
        # into the caller's object or into later batches.
        batch_decoding_args = copy.deepcopy(decoding_args)

        # NOTE(review): this loop retries until the request succeeds; every
        # OpenAIError that is not a prompt-length error is treated as a rate
        # limit, so a permanent failure (e.g. bad credentials) never exits.
        while True:
            try:
                shared_kwargs = dict(
                    model=model_name,
                    **batch_decoding_args.__dict__,
                    **decoding_kwargs,
                )
                completion_batch = openai.Completion.create(prompt=prompt_batch, **shared_kwargs)
                choices = completion_batch.choices

                # Attach the batch-level token usage to every choice.
                for choice in choices:
                    choice["total_tokens"] = completion_batch.usage.total_tokens
                completions.extend(choices)
                break
            except openai.error.OpenAIError as e:
                logging.warning(f"OpenAIError: {e}.")
                if "Please reduce your prompt" in str(e):
                    batch_decoding_args.max_tokens = int(batch_decoding_args.max_tokens * 0.8)
                    logging.warning(f"Reducing target length to {batch_decoding_args.max_tokens}, Retrying...")
                else:
                    logging.warning("Hit request rate limit; retrying...")
                    time.sleep(sleep_time)  # Annoying rate limit on requests.

    if return_text:
        completions = [completion.text for completion in completions]
    if decoding_args.n > 1:
        # make completions a nested list, where each entry is a consecutive decoding_args.n of original entries.
        completions = [completions[i : i + decoding_args.n] for i in range(0, len(completions), decoding_args.n)]
    if is_single_prompt:
        # Return non-tuple if only 1 input and 1 generation.
        (completions,) = completions
    return completions


def _make_w_io_base(f, mode: str):
    """Return a writable file object for `f`, creating parent directories when `f` is a path."""
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            os.makedirs(f_dirname, exist_ok=True)
        f = open(f, mode=mode)
    return f


def _make_r_io_base(f, mode: str):
    """Return a readable file object for `f`, opening it when `f` is a path."""
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f


def jdump(obj, f, mode="w", indent=4, default=str):
    """Dump a str, list or dictionary to a file in json format.

    Args:
        obj: An object to be written.
        f: A string path to the location on disk, or an already open file object.
        mode: Mode for opening the file.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.

    Raises:
        ValueError: if `obj` is not a dict, list or str.
    """
    f = _make_w_io_base(f, mode)
    try:
        if isinstance(obj, (dict, list)):
            json.dump(obj, f, indent=indent, default=default)
        elif isinstance(obj, str):
            f.write(obj)
        else:
            raise ValueError(f"Unexpected type: {type(obj)}")
    finally:
        # Close on every path so a failed dump does not leak the handle.
        f.close()


def jload(f, mode="r"):
    """Load a .json file (path or open file object) and return the parsed content."""
    f = _make_r_io_base(f, mode)
    try:
        return json.load(f)
    finally:
        # Close even when the content is not valid JSON.
        f.close()
12 | Focus on making targeted changes that will increase the program's performance metrics. 13 | """ 14 | 15 | BASE_EVALUATOR_SYSTEM_TEMPLATE = """You are an expert code reviewer. 16 | Your job is to analyze the provided code and evaluate it systematically.""" 17 | 18 | # User message template for diff-based evolution 19 | DIFF_USER_TEMPLATE = """# Current Program Information 20 | - Current performance metrics: {metrics} 21 | - Areas identified for improvement: {improvement_areas} 22 | 23 | {artifacts} 24 | 25 | # Program Evolution History 26 | {evolution_history} 27 | 28 | # Current Program 29 | ```{language} 30 | {current_program} 31 | ``` 32 | 33 | # Task 34 | Suggest improvements to the program that will lead to better performance on the specified metrics. 35 | 36 | You MUST use the exact SEARCH/REPLACE diff format shown below to indicate changes: 37 | 38 | <<<<<<< SEARCH 39 | # Original code to find and replace (must match exactly) 40 | ======= 41 | # New replacement code 42 | >>>>>>> REPLACE 43 | 44 | Example of valid diff format: 45 | <<<<<<< SEARCH 46 | for i in range(m): 47 | for j in range(p): 48 | for k in range(n): 49 | C[i, j] += A[i, k] * B[k, j] 50 | ======= 51 | # Reorder loops for better memory access pattern 52 | for i in range(m): 53 | for k in range(n): 54 | for j in range(p): 55 | C[i, j] += A[i, k] * B[k, j] 56 | >>>>>>> REPLACE 57 | 58 | You can suggest multiple changes. Each SEARCH section must exactly match code in the current program. 59 | Be thoughtful about your changes and explain your reasoning thoroughly. 60 | 61 | IMPORTANT: Do not rewrite the entire program - focus on targeted improvements. 
62 | """ 63 | 64 | 65 | DIFF_USER_TEMPLATE_PROPOSAL = """# Previous Proposal: 66 | {parent_proposal_text} 67 | 68 | # Previous Program: 69 | ```{language} 70 | {parent_program} 71 | ``` 72 | 73 | # Previous Performance Metrics: 74 | {metrics} 75 | 76 | # Areas Identified for Improvement: 77 | {improvement_areas} 78 | 79 | {artifacts} 80 | 81 | # Program Evolution History 82 | {evolution_history} 83 | 84 | # Current Proposal 85 | {current_proposal_text} 86 | 87 | # Task 88 | Suggest improvements to the program that will lead to better performance on the specified metrics. 89 | 90 | You MUST use the exact SEARCH/REPLACE diff format shown below to indicate changes: 91 | 92 | <<<<<<< SEARCH 93 | # Original code to find and replace (must match exactly) 94 | ======= 95 | # New replacement code 96 | >>>>>>> REPLACE 97 | 98 | Example of valid diff format: 99 | <<<<<<< SEARCH 100 | for i in range(m): 101 | for j in range(p): 102 | for k in range(n): 103 | C[i, j] += A[i, k] * B[k, j] 104 | ======= 105 | # Reorder loops for better memory access pattern 106 | for i in range(m): 107 | for k in range(n): 108 | for j in range(p): 109 | C[i, j] += A[i, k] * B[k, j] 110 | >>>>>>> REPLACE 111 | 112 | You can suggest multiple changes. Each SEARCH section must exactly match code in the current program. 113 | Be thoughtful about your changes and explain your reasoning thoroughly. 114 | 115 | IMPORTANT: Do not rewrite the entire program - focus on targeted improvements. 116 | """ 117 | 118 | 119 | 120 | # User message template for full rewrite 121 | FULL_REWRITE_USER_TEMPLATE = """# Current Program Information 122 | - Current performance metrics: {metrics} 123 | - Areas identified for improvement: {improvement_areas} 124 | 125 | {artifacts} 126 | 127 | # Program Evolution History 128 | {evolution_history} 129 | 130 | # Current Program 131 | ```{language} 132 | {current_program} 133 | ``` 134 | 135 | # Task 136 | Rewrite the program to improve its performance on the specified metrics. 
137 | Provide the complete new program code. 138 | 139 | IMPORTANT: Make sure your rewritten program maintains the same inputs and outputs 140 | as the original program, but with improved internal implementation. 141 | 142 | ```{language} 143 | # Your rewritten program here 144 | ``` 145 | """ 146 | 147 | # Template for formatting evolution history 148 | EVOLUTION_HISTORY_TEMPLATE = """## Previous Attempts 149 | 150 | {previous_attempts} 151 | 152 | ## Top Performing Programs 153 | 154 | {top_programs} 155 | """ 156 | 157 | # Template for formatting a previous attempt 158 | PREVIOUS_ATTEMPT_TEMPLATE = """### Attempt {attempt_number} 159 | - Changes: {changes} 160 | - Performance: {performance} 161 | - Outcome: {outcome} 162 | """ 163 | 164 | # Template for formatting a top program 165 | TOP_PROGRAM_TEMPLATE = """### Program {program_number} (Score: {score}) 166 | ```{language} 167 | {program_snippet} 168 | ``` 169 | Key features: {key_features} 170 | """ 171 | 172 | # Template for evaluating a program via an LLM 173 | EVALUATION_TEMPLATE = """Evaluate the following code on a scale of 0.0 to 1.0 for the following metrics: 174 | 1. Readability: How easy is the code to read and understand? 175 | 2. Maintainability: How easy would the code be to maintain and modify? 176 | 3. Efficiency: How efficient is the code in terms of time and space complexity? 177 | 178 | For each metric, provide a score between 0.0 and 1.0, where 1.0 is best. 
class TemplateManager:
    """Manages templates for prompt generation.

    Starts from a copy of ``DEFAULT_TEMPLATES`` and optionally overlays
    templates read from ``*.txt`` files in a directory, keyed by file stem.
    """

    def __init__(self, template_dir: Optional[str] = None):
        """Initialise with the defaults, then overlay `template_dir` if it is an existing directory."""
        self.templates = DEFAULT_TEMPLATES.copy()

        # Load templates from directory if provided
        if template_dir and os.path.isdir(template_dir):
            self._load_templates_from_dir(template_dir)

    def _load_templates_from_dir(self, template_dir: str) -> None:
        """Load every ``*.txt`` file in `template_dir`; the file stem becomes the template name."""
        # Sorted for a deterministic load order; decoded as UTF-8 explicitly so
        # non-ASCII prompt text does not depend on the platform default encoding.
        for file_path in sorted(Path(template_dir).glob("*.txt")):
            self.templates[file_path.stem] = file_path.read_text(encoding="utf-8")

    def get_template(self, template_name: str) -> str:
        """Get a template by name.

        Raises:
            ValueError: if no template with that name is registered.
        """
        if template_name not in self.templates:
            raise ValueError(f"Template '{template_name}' not found")
        return self.templates[template_name]

    def add_template(self, template_name: str, template: str) -> None:
        """Add or update a template"""
        self.templates[template_name] = template
def run_in_executor(f: Callable) -> Callable:
    """
    Decorator to run a synchronous function in an executor

    Args:
        f: Function to decorate

    Returns:
        Decorated coroutine function that runs `f` in the loop's default executor
    """

    @functools.wraps(f)
    async def wrapper(*args: Any, **kwargs: Any) -> Any:
        # Inside a coroutine a loop is always running; get_running_loop() is
        # the documented way to obtain it.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, functools.partial(f, *args, **kwargs))

    return wrapper


async def run_with_timeout(
    coro: Callable, timeout: float, *args: Any, timeout_error_value: Any = None, **kwargs: Any
) -> Any:
    """
    Run a coroutine with a timeout, returning a default value on timeout

    Args:
        coro: Coroutine function to run
        timeout: Timeout in seconds
        *args: Arguments to pass to the coroutine
        timeout_error_value: Value to return on timeout (default: {"error": 0.0, "timeout": True})
        **kwargs: Keyword arguments to pass to the coroutine

    Returns:
        Result of the coroutine or timeout_error_value on timeout
    """
    if timeout_error_value is None:
        timeout_error_value = {"error": 0.0, "timeout": True}

    try:
        return await asyncio.wait_for(coro(*args, **kwargs), timeout=timeout)
    except asyncio.TimeoutError:
        logger.warning(f"Operation timed out after {timeout}s")
        return timeout_error_value


async def run_sync_with_timeout(
    func: Callable, timeout: float, *args: Any, timeout_error_value: Any = None, **kwargs: Any
) -> Any:
    """
    Run a synchronous function in an executor with a timeout

    Args:
        func: Synchronous function to run
        timeout: Timeout in seconds
        *args: Arguments to pass to the function
        timeout_error_value: Value to return on timeout (default: {"error": 0.0, "timeout": True})
        **kwargs: Keyword arguments to pass to the function

    Returns:
        Result of the function or timeout_error_value on timeout
    """
    if timeout_error_value is None:
        timeout_error_value = {"error": 0.0, "timeout": True}

    try:
        loop = asyncio.get_running_loop()
        task = loop.run_in_executor(None, functools.partial(func, *args, **kwargs))
        return await asyncio.wait_for(task, timeout=timeout)
    except asyncio.TimeoutError:
        logger.warning(f"Sync operation timed out after {timeout}s")
        return timeout_error_value


async def gather_with_concurrency(
    n: int, *tasks: asyncio.Future, return_exceptions: bool = False
) -> List[Any]:
    """
    Run tasks with a concurrency limit

    Args:
        n: Maximum number of tasks to run concurrently
        *tasks: Awaitables to run
        return_exceptions: Whether to return exceptions instead of raising them

    Returns:
        List of task results, in the order the tasks were passed
    """
    semaphore = asyncio.Semaphore(n)

    async def sem_task(task: asyncio.Future) -> Any:
        # Each awaitable is only awaited while holding a semaphore slot.
        async with semaphore:
            return await task

    return await asyncio.gather(
        *(sem_task(task) for task in tasks), return_exceptions=return_exceptions
    )


async def retry_async(
    coro: Callable,
    *args: Any,
    retries: int = 3,
    delay: float = 1.0,
    backoff: float = 2.0,
    exceptions: Union[type, tuple] = Exception,
    **kwargs: Any,
) -> Any:
    """
    Retry an async function with exponential backoff

    Args:
        coro: Coroutine function to retry
        *args: Arguments to pass to the coroutine
        retries: Maximum number of retries; negative values are treated as 0,
            so the coroutine is always attempted at least once
        delay: Initial delay between retries (seconds)
        backoff: Multiplier for delay between retries
        exceptions: Exception class (or tuple of classes) to catch
        **kwargs: Keyword arguments to pass to the coroutine

    Returns:
        Result of the coroutine

    Raises:
        The last exception caught if all retries fail
    """
    max_retries = max(retries, 0)
    current_delay = delay

    for i in range(max_retries + 1):
        try:
            return await coro(*args, **kwargs)
        except exceptions as e:
            if i >= max_retries:
                logger.error(
                    f"All {max_retries+1} attempts failed. Last error: {type(e).__name__}: {str(e)}"
                )
                # Re-raise the final failure with its original traceback.
                raise
            logger.warning(
                f"Retry {i+1}/{max_retries} failed with {type(e).__name__}: {str(e)}. "
                f"Retrying in {current_delay:.2f}s..."
            )
            await asyncio.sleep(current_delay)
            current_delay *= backoff

    return None  # Unreachable: the loop always returns or raises


class TaskPool:
    """
    A simple task pool for managing and limiting concurrent tasks
    """

    def __init__(self, max_concurrency: int = 10):
        """Create a pool that lets at most `max_concurrency` coroutines run at once."""
        self.max_concurrency = max_concurrency
        self._semaphore: Optional[asyncio.Semaphore] = None
        self.tasks: List[asyncio.Task] = []

    @property
    def semaphore(self) -> asyncio.Semaphore:
        """Lazy-initialize the semaphore when first needed"""
        if self._semaphore is None:
            self._semaphore = asyncio.Semaphore(self.max_concurrency)
        return self._semaphore

    async def run(self, coro: Callable, *args: Any, **kwargs: Any) -> Any:
        """
        Run a coroutine in the pool

        Args:
            coro: Coroutine function to run
            *args: Arguments to pass to the coroutine
            **kwargs: Keyword arguments to pass to the coroutine

        Returns:
            Result of the coroutine
        """
        async with self.semaphore:
            return await coro(*args, **kwargs)

    def create_task(self, coro: Callable, *args: Any, **kwargs: Any) -> asyncio.Task:
        """
        Create and track a task in the pool

        Args:
            coro: Coroutine function to run
            *args: Arguments to pass to the coroutine
            **kwargs: Keyword arguments to pass to the coroutine

        Returns:
            Task object
        """
        task = asyncio.create_task(self.run(coro, *args, **kwargs))
        self.tasks.append(task)
        # Stop tracking the task as soon as it finishes.
        task.add_done_callback(lambda t: self.tasks.remove(t))
        return task

    async def wait_all(self) -> None:
        """Wait for all tasks in the pool to complete"""
        if self.tasks:
            await asyncio.gather(*self.tasks)

    async def cancel_all(self) -> None:
        """Cancel all tasks in the pool and wait for the cancellations to settle"""
        # Snapshot first: done-callbacks remove entries from self.tasks.
        pending = list(self.tasks)
        for task in pending:
            task.cancel()

        if pending:
            await asyncio.gather(*pending, return_exceptions=True)
10 | - Larger-is-better human best: 592.0 11 | - cite:https://arxiv.org/abs/2207.08266 12 | 13 | 2) spherical_code 14 | - Objective: On S^{d-1} (unit sphere), maximize the minimal pairwise angle for n points. 15 | - Benchmark default: n = 30 on S^2 (per `initial_program.py`). 16 | - Human best (for default n=30): Best-known numerical minimal angle ≈ 0.673646755169... radians (unproven optimal); see Sloane's Tables of Spherical Codes. 17 | - References: 18 | - N. J. A. Sloane, "Tables of Spherical Codes" (online tables; best-known values, many unproved for general n). [https://neilsloane.com/packings/] 19 | - For comparison: when n = 12, the icosahedron is optimal with minimal angle = arccos(1/√5) ≈ 1.107148717 radians (≈ 63.4349488°) (classical result: Schütte–van der Waerden; Fejes Tóth). 20 | - Larger-is-better human best (n=30): ≈ 0.673646755169 radians 21 | - cite: https://neilsloane.com/packings/ 22 | 23 | 3) heilbronn_in_the_unit_square (n = 16) 24 | - Objective: Place n points in the unit square to maximize the smallest triangle area. 25 | - Metric (larger is better): min_area (raw smallest triangle area). 26 | - Benchmark default: n = 16. 27 | - Human best (status): A = 7/341 ≈ 0.020526... is the best-known construction for n=16; to our knowledge, global optimality is not proved (conjectured best-known value in tables). 28 | - References: 29 | - Erich Friedman's Heilbronn Problem page (n=16 entry; tables list best-known configurations, not general proofs of optimality). [https://erich-friedman.github.io/packing/heilbronn/] 30 | - cite: https://erich-friedman.github.io/packing/heilbronn/ 31 | 32 | 4) littlewood_polynomials 33 | - Objective: For ±1 coefficients c_k, minimize sup_{t} |∑ c_k e^{ikt}| on the unit circle (sup-norm). For degree n, the best growth is known to be on the order of √n. 34 | - Human best (general): There exist Littlewood polynomials with sup-norm ≤ C √n for an absolute constant C; exact optimal constants are unknown. 
Rudin–Shapiro polynomials give explicit O(√n) upper bounds. 35 | - References: 36 | - P. Borwein and M. Mossinghoff, surveys on Littlewood polynomials (e.g., Experimental Mathematics 2008; related 2002–2010 papers). 37 | - J.-P. Kahane, Some Random Series of Functions (re: random ±1 coefficients and bounds). 38 | - Larger-is-better metric: 1 / supnorm (grows like ≈ 1 / (C √n)); numeric value depends on n. 39 | - For n = 512 (benchmark default): A Rudin–Shapiro construction yields supnorm ≤ √(2n) = 32 (with √n ≈ 22.627, √2 ≈ 1.414), hence the benchmark score 1/supnorm = 1/32 = 0.03125. 40 | - Tighter constant for Rudin–Shapiro: The classical identity implies C = √2, i.e., supnorm ≤ √(2n) for length-n Rudin–Shapiro polynomials (tighter than the looser bound 2√n sometimes quoted). 41 | - cite: https://www.memphis.edu/msci/people/pbalistr/shapiro.pdf 42 | 43 | 5) riesz_energy 44 | - Objective: On [0,1], minimize E_s(x_1,…,x_n) = ∑_{i<j} |x_i − x_j|^{−s}. [Extraction gap: the text between "i<" and ">" was dropped here, i.e. the remainder of item 5 and the heading/objective of item 6 (MSTD); restore it from the original file.] 54 | - Human best (general): sets with more sums than differences (|A+A| > |A−A|) exist and can give ratios strictly larger than 1; precise extremal ratios depend on constraints and are an open line of research. 55 | - References: 56 | - M. B. Nathanson, "Sets with more sums than differences," Integers 7 (2007), #A5. 57 | - I. Z. Ruzsa, Sumsets and structure (various surveys in additive combinatorics). 58 | - Larger-is-better metric: |A+B| / |A−B| (no conversion). 59 | - cite: https://arxiv.org/abs/math/0608148 60 | 61 | 7) packing_circles 62 | - Objective: In the unit square, place n disjoint circles to maximize total sum of radii (benchmark’s objective). Note: This differs from the classical equal-radius packing problem. 63 | - Human best (general): The equal-radius variants have extensive tables; for maximizing sum of radii with variable sizes, sharp records are not standardized in the literature for n = 26, 32. 64 | - References: 65 | - E. Specht, "Packings in squares and rectangles" (online tables) — equal-radius case. 66 | - R. Graham, B. Lubachevsky, K. Nurmela, and P.
Östergård, various papers on circle packing in a square. 67 | - Larger-is-better metric: total sum of radii (no conversion). 68 | 69 | 8) minizing_raio_max_min_distance 70 | - Objective: For n points in [0,1]^d, minimize (max pairwise distance) / (min pairwise distance); benchmark queries (n,d) = (16,2) and (14,3). 71 | - Human best (general): This is a variant of dispersion/packing-covering tradeoff in a cube; sharp constants for the ratio under these constraints are not tabulated in the classical literature. 72 | - References: See general texts on sphere packing vs covering, and numerical optimization literature for blue-noise/Poisson-disc sampling in bounded domains. 73 | - Larger-is-better metric: use min/max (i.e., the reciprocal of the usual ratio). Examples from typical baselines: d=2, n=16 → ≈ 1/12.89 ≈ 0.0776; d=3, n=14 → ≈ 1/4.168 ≈ 0.2400. 74 | 75 | 9) autoconvolution_peak_minimization 76 | - Objective: For nonnegative f on [0,1] with ∫ f = 1, minimize μ_∞ = sup_t (f * f)(t). 77 | - Human best (general): The exact optimum constant is open; best-known rigorous bounds are close to 1.5 (upper bounds from explicit constructions, lower bounds from analytic inequalities). Precise record values depend on smoothness/support constraints. 78 | - References: Surveys on autoconvolution inequalities (e.g., works following Erdős–Rényi type convolution problems; see also additive combinatorics notes and numerical studies in approximation theory). 79 | - Larger-is-better metric: 1 / μ_∞; indicative ≥ ≈ 1/1.5 ≈ 0.6667 given current upper bounds. 80 | - cite: https://arxiv.org/pdf/2210.16437?utm_source=chatgpt.com 81 | 82 | 10) third_autocorrelation_inequality 83 | - Objective: Improve bounds for a third-order autocorrelation constant C_3 (benchmark computes an upper bound C_upper_bound and reports its reciprocal). 
def calculate_distances(points):
    """Calculates min, max, and ratio of pairwise Euclidean distances using scipy pdist.

    Args:
        points: Array-like of shape (n, d).

    Returns:
        tuple: (min_dist, max_dist, ratio) with ratio = max_dist / min_dist.
        The minimum is clamped to 1e-8 so coincident points do not divide by
        zero. With fewer than two points returns (0.0, 0.0, 0.0).
    """
    points = np.asarray(points)
    if points.shape[0] < 2:
        return 0.0, 0.0, 0.0
    distances = pdist(points, metric='euclidean')
    eps = 1e-8
    min_dist = max(np.min(distances), eps)
    max_dist = np.max(distances)
    ratio = max_dist / min_dist
    return min_dist, max_dist, ratio

# (Removed) perturb_point — now inlined directly where used

def update_temperature(temperature, cooling_rate, accept_history, iteration, total_iters, initial_temperature, window_size=100):
    """
    Adaptive cooling with acceptance-rate feedback.

    The acceptance rate over the most recent `window_size` entries of
    `accept_history` nudges the geometric cooling step: a rate below 0.2
    cools slightly slower (factor 1.02), a rate above 0.8 slightly faster
    (factor 0.98). With no history yet, plain geometric cooling is applied.

    `iteration`, `total_iters` and `initial_temperature` are accepted for
    interface compatibility; the periodic reheating that used them has been
    removed to keep the cooling schedule smooth.

    Returns:
        The updated temperature.
    """
    window = accept_history[-min(len(accept_history), window_size):]
    if not window:
        # No acceptance data yet: skip the feedback term instead of dividing by zero.
        return temperature * cooling_rate
    rate = sum(window) / len(window)
    # gentler correction: slow/fast cooling factors reduced
    if rate < 0.2:
        adj = 1.02
    elif rate > 0.8:
        adj = 0.98
    else:
        adj = 1.0
    temperature *= cooling_rate * adj
    return temperature
Initial State: reproducible random generator 62 | rng = np.random.default_rng(seed) 63 | # uniform random initialization in [0,1]^d for simplicity 64 | current_points = rng.random((n, d)) 65 | 66 | _, _, current_ratio = calculate_distances(current_points) 67 | 68 | best_points = np.copy(current_points) 69 | best_ratio = current_ratio 70 | 71 | temperature = initial_temperature 72 | accept_history = [] 73 | window_size = 50 # window for stagnation detection and adaptive injection 74 | # smoothing_interval remains, but smoothing_strength is fixed inlined above 75 | smoothing_interval = max(10, iterations // (20 if d <= 2 else 30)) # more frequent smoothing in 3D for improved uniformity 76 | 77 | for i in range(iterations): 78 | # Build KD-tree once per iteration for neighbor queries 79 | tree = cKDTree(current_points) 80 | # optional smoothing step using distance-weighted neighbor smoothing 81 | if (i + 1) % smoothing_interval == 0: 82 | # choose neighbor count based on dimension 83 | k_smooth = 6 if d > 2 else 4 84 | _, idxs = tree.query(current_points, k=k_smooth+1) 85 | neighbors = current_points[idxs[:,1:]] # exclude self 86 | # compute inverse-distance weights 87 | diffs = neighbors - current_points[:, None, :] 88 | dists = np.linalg.norm(diffs, axis=2) + 1e-6 89 | weights = 1.0 / dists 90 | weights /= weights.sum(axis=1, keepdims=True) 91 | neighbor_means = (neighbors * weights[..., None]).sum(axis=1) 92 | blend = 0.6 if d > 2 else 0.7 93 | current_points = np.clip(current_points * blend + neighbor_means * (1 - blend), 0.0, 1.0) 94 | _, _, current_ratio = calculate_distances(current_points) 95 | if current_ratio < best_ratio: 96 | best_points = current_points.copy() 97 | best_ratio = current_ratio 98 | 99 | # 2. 
Generate Neighboring State: Perturb a random point 100 | # Simplify scaling: rely on temperature to adjust step-size instead of best_ratio 101 | # dynamic perturbation decays sublinearly with temperature for finer local moves 102 | perturbation_strength = perturbation_factor * ((temperature / initial_temperature)**0.6 + 0.15) 103 | 104 | # Choose a random point to perturb 105 | point_to_perturb_index = rng.integers(0, n) 106 | 107 | old_point = current_points[point_to_perturb_index].copy() 108 | # Increase repulsive‐move frequency in low dimensions 109 | # dynamic repulsion probability: stronger at high temperature, tapering off as we cool 110 | if d > 2: 111 | # reduce repulsion frequency in 3D for finer refinement 112 | repulsion_prob = float(np.clip(temperature / initial_temperature, 0.2, 0.8)) 113 | else: 114 | repulsion_prob = float(np.clip(temperature / initial_temperature + 0.1, 0.5, 0.95)) 115 | # start with a random jitter 116 | # random jitter inlined for readability 117 | candidate = old_point + rng.uniform(-perturbation_strength, perturbation_strength, size=old_point.shape) 118 | if n > 1 and rng.random() < repulsion_prob: 119 | # compute nearest neighbor via KD-tree for efficiency (reusing prebuilt tree) 120 | _, nn_idxs = tree.query(old_point, k=2) 121 | nn_idx = nn_idxs[1] 122 | vec = old_point - current_points[nn_idx] 123 | norm = np.linalg.norm(vec) 124 | if norm > 1e-8: 125 | dir_vec = vec / norm 126 | candidate = old_point + perturbation_strength * dir_vec 127 | # keep the point in [0,1]^d 128 | current_points[point_to_perturb_index] = np.clip(candidate, 0.0, 1.0) 129 | _, _, candidate_ratio = calculate_distances(current_points) 130 | 131 | # Acceptance criterion 132 | delta = candidate_ratio - current_ratio 133 | accept = (delta < 0) or (rng.random() < np.exp(-delta / temperature)) 134 | 135 | if accept: 136 | current_ratio = candidate_ratio 137 | # Post-acceptance repulsive relaxation to improve local spacing 138 | # reuse prebuilt KD-tree for 
repulsive relaxation 139 | dists, idxs_nn = tree.query(current_points[point_to_perturb_index], k=2) 140 | dir_vec = current_points[point_to_perturb_index] - current_points[idxs_nn[1]] 141 | norm = np.linalg.norm(dir_vec) 142 | if norm > 1e-8: 143 | # push away from nearest neighbor 144 | adjustment = 0.1 * perturbation_factor * dir_vec / norm 145 | current_points[point_to_perturb_index] = np.clip( 146 | current_points[point_to_perturb_index] + adjustment, 0.0, 1.0 147 | ) 148 | # update ratio and best points after relaxation 149 | _, _, relaxed_ratio = calculate_distances(current_points) 150 | current_ratio = relaxed_ratio 151 | if relaxed_ratio < best_ratio: 152 | best_points = current_points.copy() 153 | best_ratio = relaxed_ratio 154 | # also keep the standard best‐check for the candidate move 155 | if current_ratio < best_ratio: 156 | best_points = current_points.copy() 157 | best_ratio = current_ratio 158 | else: 159 | current_points[point_to_perturb_index] = old_point 160 | 161 | # Update temperature with adaptive schedule 162 | accept_history.append(accept) 163 | temperature = update_temperature(temperature, cooling_rate, accept_history, i, iterations, initial_temperature) 164 | # periodic mild reheating for 3D to escape deep minima 165 | if d > 2 and (i + 1) % (iterations // 3) == 0: 166 | temperature = max(temperature, initial_temperature * 0.3) 167 | 168 | # random injection to escape plateaus: reinitialize one point every 20% of iterations 169 | # random injection only if we’ve stagnated (low acceptance in recent window) 170 | if (i + 1) % max(1, iterations // 5) == 0 and len(accept_history) >= window_size \ 171 | and sum(accept_history[-window_size:]) / window_size < 0.1: 172 | j = rng.integers(0, n) 173 | current_points[j] = rng.random(d) 174 | _, _, current_ratio = calculate_distances(current_points) 175 | 176 | # Local refinement stage: fine-tune best solution with small Gaussian perturbations 177 | refine_iters = max(100, iterations // 20) 178 | for 
_ in range(refine_iters): 179 | idx = rng.integers(0, n) 180 | old_point = best_points[idx].copy() 181 | perturb = rng.normal(0, perturbation_factor * 0.05, size=d) 182 | best_points[idx] = np.clip(old_point + perturb, 0.0, 1.0) 183 | _, _, refined_ratio = calculate_distances(best_points) 184 | if refined_ratio < best_ratio: 185 | best_ratio = refined_ratio 186 | else: 187 | best_points[idx] = old_point 188 | return best_points, best_ratio -------------------------------------------------------------------------------- /reward_model/llm/backend.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import json 4 | import time 5 | import numpy as np 6 | import pandas as pd 7 | from typing import List, Dict 8 | 9 | from vllm import LLM, SamplingParams 10 | from openai import OpenAI 11 | from datasets import load_dataset 12 | from transformers import AutoTokenizer, AutoModelForCausalLM 13 | 14 | # SCORING_PROMPT = f""" 15 | # You are an expert reviewer tasked with evaluating the quality of a research abstract. 16 | # Your goal is to assign a score between 1 and 10 based on the abstract's clarity, novelty, technical rigor, and potential impact. Here are the criteria: 17 | # 1. Read the following abstract carefully and provide a score from 1 to 10. 18 | # 2. Score 6 means slightly higher than the boardline, 5 is slightly lower than the boardline. 19 | # Write the score in the {BOX}. 20 | # **idea**: 21 | # """ 22 | 23 | 24 | BOX=r"\boxed{}" 25 | SYSTEM_PROMPT = "You are an expert reviewer tasked with evaluating the quality of a research proposal. " 26 | SCORING_PROMPT = f""" 27 | Your goal is to assign a score between 1 and 10 based on the proposal's clarity, novelty, technical rigor, and potential impact. Here are the criteria: 28 | 1. Read the following proposal carefully and provide a score from 1 to 10. 29 | 2. Score 6 means slightly higher than the boardline, 5 is slightly lower than the boardline. 
# Matches \boxed{<number>}; an empty \boxed{} (the template echoed from the
# prompt) is deliberately not matched so that a later real score is found.
_BOXED_SCORE_RE = re.compile(r'\\boxed\{\s*(\d+(?:\.\d*)?|\.\d+)\s*\}')


def parse_score_from_text(text: str) -> float:
    """Extract the first in-range score written as ``\\boxed{<number>}``.

    Args:
        text: Model output.

    Returns:
        The first boxed number within [0, 10], or -1.0 when none is found.
    """
    for match in _BOXED_SCORE_RE.finditer(text):
        score = float(match.group(1))
        if 0 <= score <= 10:
            return score
    return -1.0


def score_abstracts_with_vllm(data: List[Dict], model_name: str) -> List[Dict]:
    """Score proposals with a local vLLM model.

    Args:
        data: Dictionaries containing 'title', 'abstract' and 'avg_rating'.
        model_name: Model name or checkpoint path given to vLLM and the tokenizer.

    Returns:
        One dictionary per input with 'title', 'score', 'evaluation',
        'abstract' and 'avg_rating'. 'score' is -1.0 when no valid score
        could be parsed.
    """
    if not data:
        # Nothing to score: avoid loading the model and indexing empty outputs.
        return []

    llm = LLM(model=model_name, gpu_memory_utilization=0.95)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    prompts = [
        tokenizer.apply_chat_template(
            [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": SCORING_PROMPT + item["title"] + "\n" + item["abstract"]},
            ],
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        for item in data
    ]

    sampling_params = SamplingParams(
        temperature=0,
        top_p=1.0,
        max_tokens=1000,
    )

    outputs = llm.generate(prompts, sampling_params)

    # Show one prompt/response pair as a sanity check.
    print(prompts[0])
    print(outputs[0].outputs[0].text)

    results = []
    for output, item in zip(outputs, data):
        output_text = output.outputs[0].text.strip()
        results.append({
            "title": item["title"],  # kept in line with the API scoring path
            "score": parse_score_from_text(output_text),
            "evaluation": output_text,
            "abstract": item["abstract"],
            "avg_rating": item["avg_rating"],
        })

    return results


def load_processed_ids(jsonl_file: str) -> set:
    """Load the titles of already scored abstracts from a JSONL file.

    Args:
        jsonl_file: Path to the JSONL file; a missing file yields an empty set.

    Returns:
        Set of titles whose record carries a valid score (anything but -1.0).
        Invalid JSON lines are reported and skipped; records without a title
        are skipped.
    """
    processed_ids = set()
    if not os.path.exists(jsonl_file):
        return processed_ids
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Skipping invalid JSON line in {jsonl_file}")
                continue
            if not isinstance(data, dict) or 'title' not in data:
                continue
            if data.get('score', -1.0) != -1.0:  # Only include valid scores
                processed_ids.add(data['title'])
    return processed_ids


def write_result_to_jsonl(result: Dict, jsonl_file: str):
    """Append a single result to the JSONL file if its score is valid.

    Args:
        result: Result dictionary; must contain 'score'.
        jsonl_file: Path to the JSONL file (opened in append mode).
    """
    if result['score'] == -1.0:  # Only write valid scores
        return
    with open(jsonl_file, 'a', encoding='utf-8') as f:
        f.write(json.dumps(result, ensure_ascii=False) + '\n')


def score_abstracts_with_api(data: List[Dict],
                            jsonl_file: str,
                            model_name: str = "deepseek-chat",
                            api_key: str = "",
                            base_url: str = "https://api.deepseek.com",
                            max_retries: int = 50,
                            retry_delay: int = 5
                            ) -> List[Dict]:
    """Score proposals through an OpenAI-compatible API, resuming from a JSONL file.

    Valid results are appended to ``jsonl_file`` as they arrive. Abstracts whose
    title already has a valid score in that file are not sent again; their
    stored records are appended to the returned list.

    Args:
        data: Dictionaries containing 'title', 'abstract' and 'avg_rating'.
        jsonl_file: Path to the JSONL file used for storing results.
        model_name: Model to query (default: 'deepseek-chat').
        api_key: API key. When empty, the DEEPSEEK_API_KEY and then the
            OPENAI_API_KEY environment variable is used.
        base_url: Base URL of the API endpoint.
        max_retries: Maximum number of attempts per abstract (default: 50).
        retry_delay: Seconds to wait after a failed attempt (default: 5).

    Returns:
        Dictionaries with 'title', 'score', 'evaluation', 'abstract' and
        'avg_rating'; 'score' is -1.0 when every attempt failed.

    Raises:
        ValueError: If no API key is passed and none is set in the environment.
    """
    # SECURITY: credentials must never be hard-coded as a default argument.
    # The key that used to be committed here should be treated as leaked and revoked.
    api_key = (api_key
               or os.environ.get("DEEPSEEK_API_KEY")
               or os.environ.get("OPENAI_API_KEY"))
    if not api_key:
        raise ValueError(
            "No API key supplied: pass api_key or set DEEPSEEK_API_KEY / OPENAI_API_KEY."
        )
    client = OpenAI(api_key=api_key, base_url=base_url)

    # Load already processed abstracts (with valid scores) to skip them.
    processed_ids = load_processed_ids(jsonl_file)
    results = []

    data_to_process = [item for item in data if item['title'] not in processed_ids]
    print(f"Total abstracts: {len(data)}, To process: {len(data_to_process)}, Already processed: {len(processed_ids)}")

    for item in data_to_process:
        prompt = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": SCORING_PROMPT + item["title"] + "\n" + item["abstract"]},
        ]
        retries = 0
        score = -1.0
        output_text = ""

        # Keep retrying until a valid score is obtained or max_retries is reached.
        while score == -1.0 and retries < max_retries:
            try:
                response = client.chat.completions.create(
                    model=model_name,
                    messages=prompt,
                    temperature=0,
                    max_tokens=1000,
                    top_p=1.0
                )
                # The message content may be None (e.g. a filtered response).
                output_text = (response.choices[0].message.content or "").strip()
                score = parse_score_from_text(output_text)
            except Exception as exc:
                # A transient API/network failure counts as a failed attempt
                # instead of aborting the whole batch.
                print(f"Error processing {item['title']}: {exc}")

            if score == -1.0:
                retries += 1
                print(f"Invalid score for abstract: {item['title']}, Retry {retries}/{max_retries}")
                time.sleep(retry_delay)  # Wait before retrying
            else:
                print(f"Prompt: {prompt}")
                print(f"Output: {output_text}")

        result = {
            "title": item["title"],
            "score": score,
            "evaluation": output_text,
            "abstract": item["abstract"],
            "avg_rating": item["avg_rating"]
        }

        # Written to the JSONL file only if the score is valid.
        write_result_to_jsonl(result, jsonl_file)
        results.append(result)

        if score == -1.0:
            print(f"Failed to get valid score for abstract: {item['title']} after {max_retries} retries")

    # Include previously processed results in the return value.
    if processed_ids:
        with open(jsonl_file, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    result = json.loads(line.strip())
                except json.JSONDecodeError:
                    print(f"Warning: Skipping invalid JSON line in {jsonl_file}")
                    continue
                if isinstance(result, dict) and result.get('title') in processed_ids:
                    results.append(result)

    return results


if __name__ == '__main__':
    abst = """
    Test-time scaling is a promising new approach to language modeling that uses extra test-time compute to improve performance. Recently, OpenAI's o1 model showed this capability but did not publicly share its methodology, leading to many replication efforts. We seek the simplest approach to achieve test-time scaling and strong reasoning performance. First, we curate a small dataset s1K of 1,000 questions paired with reasoning traces relying on three criteria we validate through ablations: difficulty, diversity, and quality. Second, we develop budget forcing to control test-time compute by forcefully terminating the model's thinking process or lengthening it by appending "Wait" multiple times to the model's generation when it tries to end. This can lead the model to double-check its answer, often fixing incorrect reasoning steps. After supervised finetuning the Qwen2.5-32B-Instruct language model on s1K and equipping it with budget forcing, our model s1-32B exceeds o1-preview on competition math questions by up to 27% (MATH and AIME24). Further, scaling s1-32B with budget forcing allows extrapolating beyond its performance without test-time intervention: from 50% to 57% on AIME24.
    """
    title = "s1: Simple test-time scaling"
    data = [{"title": title, "abstract": abst, "avg_rating": 0}]
    print(score_abstracts_with_vllm(data, '/data/zhuotaodeng/yzj/alpha-research/model/qwen25_grm_iclr/checkpoint-240'))
    # print(score_abstracts_with_vllm(data, '/data/zhuotaodeng/yzj/download_from_modelscope/Qwen/Qwen3-8B'))
""" 229 | title = "s1: Simple test-time scaling" 230 | data = [{"title": title, "abstract": abst, "avg_rating": 0}] 231 | print(score_abstracts_with_vllm(data, '/data/zhuotaodeng/yzj/alpha-research/model/qwen25_grm_iclr/checkpoint-240')) 232 | # print(score_abstracts_with_vllm(data, '/data/zhuotaodeng/yzj/download_from_modelscope/Qwen/Qwen3-8B')) -------------------------------------------------------------------------------- /benchmark/heilbronn_in_the_unit_square/initial_program.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import time 4 | import argparse 5 | from itertools import combinations 6 | 7 | # ========================= 8 | # 基本几何与评估 9 | # ========================= 10 | 11 | def triangle_area(a, b, c) -> float: 12 | # 三角形面积 = |(b-a) x (c-a)|/2 13 | return abs((b[0]-a[0])*(c[1]-a[1]) - (b[1]-a[1])*(c[0]-a[0])) * 0.5 14 | 15 | _triplets_cache = {} 16 | 17 | def _precompute_triplets(n: int): 18 | from itertools import combinations as _comb 19 | combs = np.array(list(_comb(range(n), 3)), dtype=int) 20 | I, J, K = combs[:, 0], combs[:, 1], combs[:, 2] 21 | return I, J, K, combs 22 | 23 | def find_min_triangle(P: np.ndarray): 24 | """向量化求最小三角形:返回 (i,j,k,min_area)。若 n<3 返回 (-1,-1,-1,0.0)。""" 25 | n = len(P) 26 | if n < 3: 27 | return -1, -1, -1, 0.0 28 | if n not in _triplets_cache: 29 | _triplets_cache[n] = _precompute_triplets(n) 30 | I, J, K, combs = _triplets_cache[n] 31 | A = P[I] 32 | B = P[J] 33 | C = P[K] 34 | area = np.abs((B[:,0]-A[:,0])*(C[:,1]-A[:,1]) - (B[:,1]-A[:,1])*(C[:,0]-A[:,0])) * 0.5 35 | if area.size == 0: 36 | return -1, -1, -1, 0.0 37 | idx = int(np.argmin(area)) 38 | i, j, k = combs[idx] 39 | return int(i), int(j), int(k), float(area[idx]) 40 | 41 | def min_triangle_area(P: np.ndarray) -> float: 42 | return find_min_triangle(P)[3] 43 | 44 | def scaled_min_area(P: np.ndarray) -> float: 45 | n = float(len(P)) 46 | exp = (8.0/7.0) + (1.0/2000.0) 47 | return (n 
import time
from itertools import combinations

import numpy as np

# =========================
# Basic geometry and evaluation
# =========================

def triangle_area(a, b, c) -> float:
    """Return the area of triangle (a, b, c): |(b - a) x (c - a)| / 2."""
    return abs((b[0]-a[0])*(c[1]-a[1]) - (b[1]-a[1])*(c[0]-a[0])) * 0.5


# Index triplets depend only on n, so they are computed once per n.
_triplets_cache = {}


def _precompute_triplets(n: int):
    """Return (I, J, K, combs): all index triplets i < j < k of range(n)."""
    combs = np.array(list(combinations(range(n), 3)), dtype=int)
    I, J, K = combs[:, 0], combs[:, 1], combs[:, 2]
    return I, J, K, combs


def find_min_triangle(P: np.ndarray):
    """Find the smallest-area triangle among all point triplets (vectorized).

    Returns:
        (i, j, k, min_area); (-1, -1, -1, 0.0) when there are fewer than 3 points.
    """
    n = len(P)
    if n < 3:
        return -1, -1, -1, 0.0
    if n not in _triplets_cache:
        _triplets_cache[n] = _precompute_triplets(n)
    I, J, K, combs = _triplets_cache[n]
    A = P[I]
    B = P[J]
    C = P[K]
    area = np.abs((B[:, 0]-A[:, 0])*(C[:, 1]-A[:, 1]) - (B[:, 1]-A[:, 1])*(C[:, 0]-A[:, 0])) * 0.5
    if area.size == 0:
        return -1, -1, -1, 0.0
    idx = int(np.argmin(area))
    i, j, k = combs[idx]
    return int(i), int(j), int(k), float(area[idx])


def min_triangle_area(P: np.ndarray) -> float:
    """Return the area of the smallest triangle spanned by three points of P."""
    return find_min_triangle(P)[3]


def scaled_min_area(P: np.ndarray) -> float:
    """Return n ** (8/7 + 1/2000) times the minimum triangle area of P."""
    n = float(len(P))
    exp = (8.0/7.0) + (1.0/2000.0)
    return (n ** exp) * min_triangle_area(P)

# =========================
# Initialization: multiple starting layouts
# =========================

def jittered_grid_points(n, seed=0):
    """Return n points on a jittered m x m grid in the unit square.

    m is the smallest integer (at least 2) with m * m >= n, so n points are
    always available; the first n grid cells in row-major order are used.
    """
    rng = np.random.default_rng(seed)
    # Rounding sqrt(n) to nearest could give m * m < n (e.g. n = 6 -> m = 2)
    # and silently return too few points, so round up instead.
    m = max(int(np.ceil(np.sqrt(n))), 2)
    while m * m < n:
        m += 1
    xs = (np.arange(m) + 0.5) / m
    ys = (np.arange(m) + 0.5) / m
    X, Y = np.meshgrid(xs, ys)
    P = np.c_[X.ravel(), Y.ravel()]
    jitter = 0.12 / m
    P += rng.uniform(-jitter, jitter, size=P.shape)
    P = np.clip(P[:n], 0.0, 1.0)
    return P


def hex_lattice_points(n, seed=0):
    """Return n points from a lightly jittered hexagonal lattice in the unit square.

    Missing points (lattice smaller than n) are filled with uniform random points.
    """
    rng = np.random.default_rng(seed)
    a = 1.0 / np.sqrt(n)
    pts = []
    y = a/2
    row = 0
    while y < 1.0:
        x = (a/2) if (row % 2 == 1) else a
        while x < 1.0:
            pts.append([x, y])
            x += a
        y += np.sqrt(3)/2 * a
        row += 1
    # reshape keeps a (0, 2) shape when the lattice is empty (n == 1),
    # so the vstack below still works.
    P = np.array(pts, dtype=float).reshape(-1, 2)
    if len(P) < n:
        extra = rng.uniform(0, 1, size=(n - len(P), 2))
        P = np.vstack([P, extra])
    P = P[:n]
    P += rng.uniform(-0.08*a, 0.08*a, size=P.shape)
    P = np.clip(P, 0.0, 1.0)
    return P


def bridson_poisson_disk(n, r=None, k=30, seed=0):
    """Approximate Poisson-disk sampling (Bridson style), reduced to n points.

    Generates up to max(2n, n + 10) points with pairwise spacing of at least r,
    then draws n of them uniformly without replacement. If fewer than n points
    could be placed, the rest are uniform random points.

    Args:
        n: Number of points to return.
        r: Target minimum spacing; defaults to 0.6 / sqrt(n).
        k: Candidate attempts per active point before it is retired.
        seed: Seed for the random generator.
    """
    rng = np.random.default_rng(seed)
    if r is None:
        r = 0.6 / np.sqrt(n)  # slightly conservative spacing
    cell_size = r / np.sqrt(2)
    grid_w = int(np.ceil(1.0 / cell_size))
    grid_h = int(np.ceil(1.0 / cell_size))
    grid = -np.ones((grid_h, grid_w), dtype=int)

    def grid_coords(pt):
        # Clamp: a coordinate of exactly 1.0 may map one cell past the grid.
        gy = min(int(pt[1] / cell_size), grid_h - 1)
        gx = min(int(pt[0] / cell_size), grid_w - 1)
        return gy, gx

    def in_neighborhood(pt):
        gy, gx = grid_coords(pt)
        for yy in range(max(gy-2, 0), min(gy+3, grid_h)):
            for xx in range(max(gx-2, 0), min(gx+3, grid_w)):
                j = grid[yy, xx]
                if j >= 0 and np.linalg.norm(pts[j] - pt) < r:
                    return True
        return False

    pts = []
    active = []

    # Initial point
    p0 = rng.uniform(0, 1, size=2)
    pts.append(p0)
    active.append(0)
    gy, gx = grid_coords(p0)
    grid[gy, gx] = 0

    while active and len(pts) < max(n*2, n+10):
        idx = rng.choice(active)
        base = pts[idx]
        found = False
        for _ in range(k):
            rad = rng.uniform(r, 2*r)
            ang = rng.uniform(0, 2*np.pi)
            cand = base + rad * np.array([np.cos(ang), np.sin(ang)])
            if not (0 <= cand[0] <= 1 and 0 <= cand[1] <= 1):
                continue
            if not in_neighborhood(cand):
                pts.append(cand)
                gy, gx = grid_coords(cand)
                grid[gy, gx] = len(pts)-1
                active.append(len(pts)-1)
                found = True
                break
        if not found:
            active.remove(idx)

    pts = np.array(pts)
    if len(pts) >= n:
        idx = rng.choice(len(pts), size=n, replace=False)
        pts = pts[idx]
    else:
        extra = rng.uniform(0, 1, size=(n-len(pts), 2))
        pts = np.vstack([pts, extra])
    return pts

# =========================
# Directed local search (enlarge the smallest triangle)
# =========================

def normalize(v):
    """Return v scaled to unit length; v is returned unchanged if (near) zero."""
    n = np.linalg.norm(v)
    return v / n if n > 1e-12 else v


def bump_point(P, idx, step, rng):
    """Return a copy of P with point idx perturbed and clipped to [0, 1]^2.

    The move is a uniform jitter in [-step, step] plus a slight push towards
    the center (0.5, 0.5), which reduces thin triangles hugging the border.
    """
    q = P.copy()
    jitter = rng.uniform(-step, step, size=2)
    inward = 0.15 * step * (0.5 - P[idx])
    q[idx] = np.clip(P[idx] + jitter + inward, 0.0, 1.0)
    return q


def bump_min_triangle_directed(P, step, rng):
    """Try to enlarge the current smallest triangle by moving its vertices.

    For each vertex in turn, it is moved by `step` along both normals of the
    opposite edge and the direction giving the larger global minimum area is
    tried. If that does not improve the minimum area, a random bump of that
    vertex is tried instead. Only strictly improving moves are kept.

    Returns:
        The improved configuration (a copy), or P itself when there are fewer
        than 3 points.
    """
    i, j, k, bestA = find_min_triangle(P)
    if i < 0:
        return P
    bestP = P.copy()

    def move_along_normal(Q, idx, o1, o2):
        # Edge normal is taken from the current positions of the other two
        # vertices, so earlier accepted moves in this call are respected.
        base = Q[o2] - Q[o1]
        n1 = normalize(np.array([base[1], -base[0]]))
        q1 = Q.copy()
        q1[idx] = np.clip(Q[idx] + step*n1, 0.0, 1.0)
        q2 = Q.copy()
        q2[idx] = np.clip(Q[idx] - step*n1, 0.0, 1.0)
        a1 = min_triangle_area(q1)
        a2 = min_triangle_area(q2)
        return (q1, a1) if a1 >= a2 else (q2, a2)

    # Try moving i, j, k in turn and keep every improvement.
    for (idx, o1, o2) in [(i, j, k), (j, k, i), (k, i, j)]:
        q, area_dir = move_along_normal(bestP, idx, o1, o2)
        if area_dir > bestA + 1e-15:
            bestP, bestA = q, area_dir
        else:
            # Directed move did not help: fall back to a small random bump.
            q = bump_point(bestP, idx, 0.6*step, rng)
            a_rand = min_triangle_area(q)
            if a_rand > bestA + 1e-15:
                bestP, bestA = q, a_rand
    return bestP


def project_min_distance(P, dmin=1e-3, iters=1):
    """Soft constraint: nudge points apart that are closer than dmin.

    Points are processed in order and updated in place on a copy of P, so
    later points see the already moved earlier ones.
    """
    Q = P.copy()
    for _ in range(iters):
        for i in range(len(Q)):
            diffs = Q - Q[i]
            dists = np.linalg.norm(diffs, axis=1)
            mask = (dists < dmin) & (dists > 0)
            if np.any(mask):
                repel = -diffs[mask]
                move = 0.5 * np.sum(repel / np.maximum(dists[mask][:, None], 1e-12), axis=0)
                Q[i] = np.clip(Q[i] + 1e-3*move, 0.0, 1.0)
    return Q


def improve(P0, iters=6000, step0=0.05, seed=0, patience=800, time_limit=None):
    """Greedy directed search that increases the minimum triangle area.

    Each iteration moves the vertices of the current smallest triangle, with
    an occasional random bump of another point (every 30 iterations) and a
    minimum-distance projection (every 50 iterations). Only strict
    improvements are accepted. The step shrinks by 0.7 every 400 iterations,
    down to 5e-4.

    Args:
        P0: Initial points; clipped to [0, 1]^2.
        iters: Maximum number of iterations.
        step0: Initial step size.
        seed: Seed for the random generator.
        patience: Stop after this many consecutive non-improving iterations.
        time_limit: Optional wall-clock limit in seconds.

    Returns:
        (bestP, bestA): best configuration found and its minimum triangle area.
    """
    rng = np.random.default_rng(seed)
    P = np.clip(P0.copy(), 0.0, 1.0)
    bestP = P.copy()
    bestA = min_triangle_area(P)
    no_improve = 0
    t0 = time.time()

    step = step0
    for t in range(1, iters+1):
        if time_limit is not None and (time.time() - t0) > time_limit:
            break
        Q = bump_min_triangle_directed(P, step, rng)
        # Occasionally perturb an arbitrary point to avoid local traps.
        if t % 30 == 0:
            idx = rng.integers(len(P))
            Q = bump_point(Q, idx, 0.5*step, rng)

        # Soft separation to avoid near-coincident points.
        if t % 50 == 0:
            Q = project_min_distance(Q, dmin=5e-3, iters=1)

        aQ = min_triangle_area(Q)
        if aQ > bestA + 1e-15:
            P = Q
            bestP, bestA = Q.copy(), aQ
            no_improve = 0
        else:
            # Conservative: worse configurations are never accepted.
            no_improve += 1

        if no_improve >= patience:
            break

        # Step decay
        if t % 400 == 0:
            step *= 0.7
            step = max(step, 5e-4)

    return bestP, bestA

# =========================
# Main pipeline: multi-start + refinement
# =========================

def multi_start_optimize(n=16, seeds=(42, 43, 44), iters=6000, step0=0.05, time_limit=None):
    """Run a coarse search from several starting layouts, then refine the best three.

    For every seed a hex lattice, a jittered grid and a Poisson-disk layout
    are generated. Each gets a short coarse search; the three best results
    are refined with a smaller step.

    Args:
        n: Number of points.
        seeds: Seeds for the starting layouts.
        iters: Iteration budget (coarse uses 25 %, refinement 60 %).
        step0: Initial step of the coarse search.
        time_limit: Optional wall-clock limit in seconds for the whole run.

    Returns:
        (bestP, bestA). When the time limit expires early, the best
        configuration reached so far is returned; (None, -1.0) only when
        `seeds` is empty.
    """
    cands = []
    for s in seeds:
        cands.append(hex_lattice_points(n, seed=s))
        cands.append(jittered_grid_points(n, seed=1000+s))
        cands.append(bridson_poisson_disk(n, seed=2000+s))
    bestP = None
    bestA = -1.0
    t0 = time.time()

    def remaining_time():
        if time_limit is None:
            return None
        return max(0.0, time_limit - (time.time() - t0))

    # Coarse search: few iterations, quick screening.
    coarse_iters = max(200, int(0.25 * iters))
    coarse_results = []
    for P0 in cands:
        remaining = remaining_time()
        if remaining is not None and remaining <= 0:
            break
        P1, A1 = improve(P0, iters=coarse_iters, step0=step0, seed=12345, time_limit=remaining)
        coarse_results.append((A1, P1))

    if coarse_results:
        coarse_results.sort(key=lambda x: x[0], reverse=True)
        top_list = [P for (_, P) in coarse_results[:3]]
        # Baseline, so an expired time limit never discards the coarse work.
        bestA, bestP = coarse_results[0]
    else:
        top_list = cands[:1]
        if top_list:
            bestP = np.clip(top_list[0], 0.0, 1.0)
            bestA = min_triangle_area(bestP)

    # Refinement: smaller step.
    for idx, P0 in enumerate(top_list):
        remaining = remaining_time()
        if remaining is not None and remaining <= 0:
            break
        P2, A2 = improve(P0, iters=max(400, int(0.6 * iters)), step0=0.02, seed=999+idx, time_limit=remaining)
        if A2 > bestA:
            bestP, bestA = P2, A2
    return bestP, bestA
# =========================
# Entry point: generate points and save them
# =========================

def main(argv=None):
    """Optimize a 16-point configuration and return it.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``; pass ``[]`` to use the built-in
            defaults regardless of the host process's command line.

    Returns:
        The best point configuration found.

    Raises:
        RuntimeError: If the optimizer returned no configuration.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--iters", type=int, default=2000)
    parser.add_argument("--step0", type=float, default=0.04)
    parser.add_argument("--seeds", type=str, default="7,11,19")
    parser.add_argument("--time-limit", type=float, default=None)
    args = parser.parse_args(argv)

    n = 16
    seeds = tuple(int(s.strip()) for s in args.seeds.split(",") if s.strip()) or (7, 11, 19)
    bestP, bestA = multi_start_optimize(n=n, seeds=seeds, iters=args.iters, step0=args.step0, time_limit=args.time_limit)
    if bestP is None:
        # Fail with a clear message instead of a TypeError further down.
        raise RuntimeError("multi_start_optimize returned no configuration (time limit too small?)")
    smin = scaled_min_area(bestP)
    print(f"n={n}, points={len(bestP)}")
    print(f"min_area = {bestA:.10f}")
    print(f"scaled_min_area = {smin:.10f}")
    return bestP


if __name__ == "__main__":
    points = main()
    out_path = os.path.join(os.path.dirname(__file__), "points.npy")
    np.save(out_path, points)
    print(f"Saved points to {out_path}")

# Compatibility with an external evaluator that imports this module and reads
# `points`. The importing process's sys.argv belongs to that process, so the
# defaults are used instead of parsing foreign command-line arguments.
try:
    points  # type: ignore[name-defined]
except NameError:
    points = main([])
external API model. 24 | """ 25 | 26 | BOX = r"\boxed{}" 27 | SYSTEM_PROMPT = "You are an expert reviewer tasked with evaluating the quality of a research proposal." 28 | SCORING_PROMPT = f""" 29 | Your goal is to assign a score between 1 and 10 based on the proposal's clarity, novelty, technical rigor, and potential impact. Here are the criteria: 30 | 1. Read the following proposal carefully and provide a score from 1 to 10. 31 | 2. Score 6 means slightly higher than the borderline, 5 is slightly lower than the borderline. 32 | Write the score in the {BOX}. 33 | **idea**: 34 | """ 35 | 36 | def __init__(self, config: RewardModelConfig): 37 | """ 38 | Initialize the RewardModel. 39 | 40 | Args: 41 | config (RewardModelConfig): Configuration object containing model_type, model_name, api_key, base_url, 42 | jsonl_file, max_retries, retry_delay, temperature, top_p, max_tokens. 43 | """ 44 | self.config = config 45 | 46 | if self.config.model_type == "vllm": 47 | self.llm = LLM(model=self.config.model_name, gpu_memory_utilization=0.95) 48 | self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name) 49 | elif self.config.model_type == "api": 50 | if not self.config.api_key or not self.config.base_url: 51 | raise ValueError("API key and base URL must be provided for API model type.") 52 | self.client = AsyncOpenAI(api_key=self.config.api_key, base_url=self.config.base_url) 53 | else: 54 | raise ValueError("model_type must be 'vllm' or 'api'.") 55 | 56 | # Ensure the directory for jsonl_file exists 57 | os.makedirs(os.path.dirname(self.config.jsonl_file) or ".", exist_ok=True) 58 | 59 | def parse_score_from_text(self, text: str) -> float: 60 | """ 61 | Parse the score from the model's output text. 62 | 63 | Args: 64 | text (str): Model output containing the score in \boxed{} format. 65 | 66 | Returns: 67 | float: Parsed score between 0 and 10, or -1.0 if invalid. 
68 | """ 69 | match = re.search(r'\\boxed\{(\d*\.?\d*)\}', text) 70 | if match: 71 | try: 72 | score = float(match.group(1)) 73 | if 0 <= score <= 10: 74 | return score 75 | except ValueError: 76 | pass 77 | return -1.0 78 | 79 | async def load_processed_ids(self) -> set: 80 | """ 81 | Load titles of already processed abstracts from the JSONL file. 82 | 83 | Returns: 84 | set: Set of processed abstract titles with valid scores. 85 | """ 86 | processed_ids = set() 87 | # if os.path.exists(self.config.jsonl_file): 88 | # async with aiofiles.open(self.config.jsonl_file, 'r', encoding='utf-8') as f: 89 | # async for line in f: 90 | # try: 91 | # data = json.loads(line.strip()) 92 | # if data.get('score', -1.0) != -1.0: 93 | # processed_ids.add(data['title']) 94 | # except json.JSONDecodeError: 95 | # print(f"Warning: Skipping invalid JSON line in {self.config.jsonl_file}") 96 | return processed_ids 97 | 98 | async def write_result_to_jsonl(self, result: Dict): 99 | """ 100 | Write a single result to the JSONL file if the score is valid. 101 | 102 | Args: 103 | result (Dict): Result dictionary containing title, score, evaluation, abstract, and gt_score. 104 | """ 105 | # if result['score'] != -1.0: 106 | # async with aiofiles.open(self.config.jsonl_file, 'a', encoding='utf-8') as f: 107 | # await f.write(json.dumps(result, ensure_ascii=False) + '\n') 108 | 109 | async def score_with_vllm(self, data: List[Dict]) -> List[Dict]: 110 | """ 111 | Score abstracts using a local vLLM model. 112 | 113 | Args: 114 | data (List[Dict]): List of dictionaries containing 'title', 'abstract', and 'gt_score'. 115 | 116 | Returns: 117 | List[Dict]: List of results with 'title', 'score', 'evaluation', 'abstract', and 'gt_score'. 
118 | """ 119 | prompts = [ 120 | self.tokenizer.apply_chat_template( 121 | [ 122 | {"role": "system", "content": self.SYSTEM_PROMPT}, 123 | {"role": "user", "content": self.SCORING_PROMPT + item["title"] + "\n" + item["abstract"]} 124 | ], 125 | tokenize=False, 126 | add_generation_prompt=True, 127 | enable_thinking=False 128 | ) 129 | for item in data 130 | ] 131 | 132 | sampling_params = SamplingParams( 133 | temperature=self.config.temperature, 134 | top_p=self.config.top_p, 135 | max_tokens=self.config.max_tokens, 136 | ) 137 | 138 | # vLLM is synchronous, so we run it in the default executor 139 | outputs = await asyncio.get_event_loop().run_in_executor(None, lambda: self.llm.generate(prompts, sampling_params)) 140 | 141 | results = [] 142 | for output, item in zip(outputs, data): 143 | output_text = output.outputs[0].text.strip() 144 | score = self.parse_score_from_text(output_text) 145 | result = { 146 | "title": item["title"], 147 | "score": score, 148 | "evaluation": output_text, 149 | "abstract": item["abstract"], 150 | "gt_score": item["gt_score"] 151 | } 152 | await self.write_result_to_jsonl(result) 153 | results.append(result) 154 | 155 | return results 156 | 157 | async def score_with_api(self, data: List[Dict]) -> List[Dict]: 158 | """ 159 | Score abstracts using an external API model. 160 | 161 | Args: 162 | data (List[Dict]): List of dictionaries containing 'title', 'abstract', and 'gt_score'. 163 | 164 | Returns: 165 | List[Dict]: List of results with 'title', 'score', 'evaluation', 'abstract', and 'gt_score'. 
166 | """ 167 | processed_ids = await self.load_processed_ids() 168 | data_to_process = [item for item in data if item['title'] not in processed_ids] 169 | print(f"Total abstracts: {len(data)}, To process: {len(data_to_process)}, Already processed: {len(processed_ids)}") 170 | 171 | prompts = [ 172 | [ 173 | {"role": "system", "content": self.SYSTEM_PROMPT}, 174 | {"role": "user", "content": self.SCORING_PROMPT + item["title"] + "\n" + item["abstract"]} 175 | ] 176 | for item in data_to_process 177 | ] 178 | 179 | results = [] 180 | for prompt, item in zip(prompts, data_to_process): 181 | retries = 0 182 | score = -1.0 183 | output_text = "" 184 | 185 | while score == -1.0 and retries < self.config.max_retries: 186 | try: 187 | response = await self.client.chat.completions.create( 188 | model=self.config.model_name, 189 | messages=prompt, 190 | temperature=0, # API uses fixed temperature as per original code 191 | max_tokens=1000, # API uses fixed max_tokens as per original code 192 | top_p=1.0 # API uses fixed top_p as per original code 193 | ) 194 | output_text = response.choices[0].message.content.strip() 195 | score = self.parse_score_from_text(output_text) 196 | except Exception as e: 197 | print(f"Error processing {item['title']}: {e}") 198 | 199 | if score == -1.0: 200 | retries += 1 201 | print(f"Invalid score for abstract: {item['title']}, Retry {retries}/{self.config.max_retries}") 202 | await asyncio.sleep(self.config.retry_delay) 203 | 204 | result = { 205 | "title": item["title"], 206 | "score": score, 207 | "gt_score": item["gt_score"], 208 | "evaluation": output_text, 209 | "abstract": item["abstract"] 210 | } 211 | await self.write_result_to_jsonl(result) 212 | results.append(result) 213 | 214 | if score == -1.0: 215 | print(f"Failed to get valid score for abstract: {item['title']} after {self.config.max_retries} retries") 216 | 217 | # Include previously processed results 218 | # if processed_ids: 219 | # async with 
aiofiles.open(self.config.jsonl_file, 'r', encoding='utf-8') as f: 220 | # async for line in f: 221 | # try: 222 | # result = json.loads(line.strip()) 223 | # if result['title'] in processed_ids: 224 | # results.append(result) 225 | # except json.JSONDecodeError: 226 | # print(f"Warning: Skipping invalid JSON line in {self.config.jsonl_file}") 227 | 228 | return results 229 | 230 | async def score_research_proposal(self, data: List[Any]) -> List[Dict]: 231 | """ 232 | Score abstracts using the configured model type. 233 | 234 | Args: 235 | data (List[Dict]): List of dictionaries containing 'title', 'abstract', and 'gt_score'. 236 | 237 | Returns: 238 | List[Dict]: List of results with 'title', 'score', 'evaluation', 'abstract', and 'gt_score'. 239 | """ 240 | if isinstance(data[0], str): 241 | data = [{"title": "", "gt_score":0, "abstract": d} for d in data] 242 | 243 | if self.config.model_type == "vllm": 244 | return await self.score_with_vllm(data) 245 | elif self.config.model_type == "api": 246 | return await self.score_with_api(data) 247 | else: 248 | raise ValueError("Invalid model_type. Must be 'vllm' or 'api'.") 249 | 250 | if __name__ == "__main__": 251 | async def test_reward_model(): 252 | # Sample data for testing 253 | sample_data = [ 254 | { 255 | "title": "A Novel Approach to Quantum Computing", 256 | "abstract": "This proposal introduces a new quantum algorithm that enhances computational efficiency by leveraging entangled states in a scalable architecture. The approach is validated through simulations showing a 20% improvement over existing methods.", 257 | "gt_score": 7.5 258 | }, 259 | { 260 | "title": "AI-Driven Climate Modeling", 261 | "abstract": "We propose an AI-based framework for improving climate predictions using deep learning to integrate heterogeneous environmental data. 
if __name__ == "__main__":
    async def test_reward_model():
        """Smoke-test the API scoring path on two sample proposals.

        Credentials are read from the environment so that no secret is stored
        in the source file:
            DEEPSEEK_API_KEY   - required; the test is skipped when unset.
            DEEPSEEK_BASE_URL  - optional; defaults to https://api.deepseek.com.
        """
        # Sample data for testing
        sample_data = [
            {
                "title": "A Novel Approach to Quantum Computing",
                "abstract": "This proposal introduces a new quantum algorithm that enhances computational efficiency by leveraging entangled states in a scalable architecture. The approach is validated through simulations showing a 20% improvement over existing methods.",
                "gt_score": 7.5
            },
            {
                "title": "AI-Driven Climate Modeling",
                "abstract": "We propose an AI-based framework for improving climate predictions using deep learning to integrate heterogeneous environmental data. Preliminary results demonstrate enhanced accuracy in long-term forecasts.",
                "gt_score": 8.0
            }
        ]

        # Test with vLLM model (commented out because it requires a local model and GPU)

        # try:
        #     vllm_model = RewardModel(
        #         model_type="vllm",
        #         model_name="/data/zhuotaodeng/yzj/alpha_research_model/qwen25_grm_iclr_boxed/checkpoint-180",
        #         jsonl_file="vllm_results.jsonl"
        #     )
        #     vllm_results = await vllm_model.score_research_proposal(sample_data)
        #     print("vLLM Results:")
        #     for result in vllm_results:
        #         print(f"Title: {result['title']}, Score: {result['score']}, Evaluation: {result['evaluation'][:50]}...")
        # except Exception as e:
        #     print(f"vLLM test failed: {e}")

        # Test with API model (requires valid API key and base URL).
        # Never hard-code the key here: anything committed to the repository
        # must be considered leaked.
        api_key = os.environ.get("DEEPSEEK_API_KEY")
        base_url = os.environ.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
        if not api_key:
            print("API test skipped: set the DEEPSEEK_API_KEY environment variable.")
            return

        try:
            config = RewardModelConfig(
                model_type="api",
                model_name="deepseek-chat",
                api_key=api_key,
                base_url=base_url,
                jsonl_file="api_results.jsonl",
                max_retries=3,
                retry_delay=1
            )

            api_model = RewardModel(config)
            api_results = await api_model.score_research_proposal(sample_data)
            print("API Results:")
            for result in api_results:
                print(f"Title: {result['title']}, Score: {result['score']}, Evaluation: {result['evaluation'][:50]}...")
        except Exception as e:
            # Top-level boundary of a manual smoke test: report and exit quietly.
            print(f"API test failed: {e}")

    # Run the async test
    asyncio.run(test_reward_model())
# benchmark/packing_circles/initial_program.py
import math
import random
from concurrent.futures import ThreadPoolExecutor


def pack_circles(n, square_size=1.0):
    """
    Pack n disjoint circles into the square [0, square_size] x [0, square_size],
    trying to maximize the sum of their radii.

    The search is randomized (it uses the global ``random`` state): several
    seeded-and-refined configurations are generated, the best one is kept, and
    a remove-and-reinsert phase then tries to improve it further. The best
    configuration seen is returned, so that phase can never make the result
    worse than the one it started from.

    Args:
        n: Number of circles to place. Values <= 0 yield an empty packing.
        square_size: Side length of the square.

    Returns:
        tuple: (total_radius, circles) where circles is a list of (x, y, r)
        tuples and total_radius is the sum of the r values.
    """

    def max_circle_radius(x, y, circles, square_size=1.0, skip_idx=None):
        """
        Compute the maximum radius for a circle centered at (x, y) that:
        - Stays within the square [0, square_size] x [0, square_size].
        - Does not overlap with existing circles.
        skip_idx: if provided, index in circles[] to ignore (self).
        Returns 0.0 when no positive radius is feasible.
        """
        # Distance to nearest boundary of the square
        r_max = min(x, y, square_size - x, square_size - y)

        for idx, (cx, cy, cr) in enumerate(circles):
            if skip_idx == idx:
                continue
            # Stop only once the radius is exhausted. Stopping at a small
            # positive value would return a radius that was never checked
            # against the remaining circles.
            if r_max <= 0.0:
                return 0.0
            dx = x - cx
            dy = y - cy
            sep = r_max + cr
            dist_sq = dx * dx + dy * dy
            if dist_sq < sep * sep:
                # only compute sqrt when we know we can shrink
                r_max = min(r_max, math.sqrt(dist_sq) - cr)
        return max(r_max, 0.0)

    def uniform_tiling_circles(n, square_size=1.0):
        """
        Place n equal circles on a rows x cols grid, trying row counts
        1..min(n, 19) (column count capped at 20) and keeping the layout with
        the largest total radius. Falls back to hexagonal_packing when no grid
        holds exactly n circles.
        """
        if n <= 0:
            return []

        best_layout = None
        best_total_radius = 0

        # Try different grid configurations
        for rows in range(1, min(n + 1, 20)):
            cols = math.ceil(n / rows)
            if cols > 20:  # Limit grid size
                continue

            # Grid points are spaced so that the outermost ones sit one
            # spacing away from the border.
            spacing_x = square_size / (cols + 1)
            spacing_y = square_size / (rows + 1)

            # Half of the tighter spacing, minus a small safety margin.
            max_radius = min(spacing_x, spacing_y) / 2 - 1e-6
            if max_radius <= 0:
                continue

            # Place circles in uniform grid
            temp_circles = []
            count = 0
            for row in range(rows):
                for col in range(cols):
                    if count >= n:
                        break

                    x = spacing_x * (col + 1)
                    y = spacing_y * (row + 1)

                    # Ensure circle stays within bounds
                    if (x - max_radius >= 0 and x + max_radius <= square_size and
                            y - max_radius >= 0 and y + max_radius <= square_size):
                        temp_circles.append((x, y, max_radius))
                        count += 1

                if count >= n:
                    break

            # Calculate total radius for this layout
            total_radius = len(temp_circles) * max_radius

            if total_radius > best_total_radius and len(temp_circles) == n:
                best_total_radius = total_radius
                best_layout = temp_circles

        # If we found a valid layout, return it
        if best_layout:
            return best_layout

        # Fallback: use hexagonal packing for better density
        return hexagonal_packing(n, square_size)

    def hexagonal_packing(n, square_size=1.0):
        """
        Seed up to n circles on staggered rows (odd rows are shifted right by
        half a column spacing), giving each the largest radius that fits next
        to the circles placed so far. May return fewer than n circles.
        """
        circles = []

        # Estimate number of rows for hexagonal packing
        rows = int(math.sqrt(n * 2 / math.sqrt(3))) + 2

        count = 0
        row = 0

        while count < n and row < rows:
            # Calculate y position for this row
            y = (row + 0.5) * (square_size / (rows + 1))

            # Number of circles in this row
            if row % 2 == 0:
                cols = int(math.sqrt(n)) + 1
            else:
                cols = int(math.sqrt(n))

            spacing_x = square_size / (cols + 1)

            for col in range(cols):
                if count >= n:
                    break

                if row % 2 == 0:
                    x = spacing_x * (col + 1)
                else:
                    x = spacing_x * (col + 1) + spacing_x / 2

                # Calculate maximum radius for this position
                r = max_circle_radius(x, y, circles, square_size)

                if r > 0:
                    circles.append((x, y, r))
                    count += 1

            row += 1

        return circles

    def optimize_placement(n, square_size=1.0):
        """
        Build one starting configuration: hexagonal seeding plus refinement if
        that yields n circles, otherwise a uniform grid, otherwise an adaptive
        grid topped up by a brute-force search for the largest remaining gaps.
        """
        circles = []

        # First, try hexagonal packing for high initial density
        hex_circles = hexagonal_packing(n, square_size)
        if len(hex_circles) == n:
            # Ensure maximum radii for hex layout with stronger refinement
            return refine_circles(hex_circles, square_size, iterations=20)

        # Fallback to uniform grid placement
        grid_circles = uniform_tiling_circles(n, square_size)
        if len(grid_circles) == n:
            return grid_circles

        # If uniform tiling didn't work perfectly, use adaptive approach
        # Calculate optimal radius based on density
        area_per_circle = (square_size * square_size) / n
        estimated_radius = math.sqrt(area_per_circle / math.pi) * 0.9  # Conservative estimate

        # Create grid with optimal spacing
        spacing = estimated_radius * 2.1  # Include gap

        cols = int(square_size / spacing)
        rows = int(square_size / spacing)

        actual_spacing_x = square_size / (cols + 1)
        actual_spacing_y = square_size / (rows + 1)

        count = 0
        for row in range(rows):
            for col in range(cols):
                if count >= n:
                    break

                x = actual_spacing_x * (col + 1)
                y = actual_spacing_y * (row + 1)

                # Calculate maximum possible radius
                r = max_circle_radius(x, y, circles, square_size)

                if r > 0:
                    circles.append((x, y, r))
                    count += 1

            if count >= n:
                break

        # If we still need more circles, use remaining space
        remaining = n - len(circles)
        grid_points = 100  # resolution of the gap search
        for _ in range(remaining):
            best_r = 0
            best_pos = (0.5, 0.5)

            # Exhaustive search over interior grid points for the largest gap
            for gx in range(1, grid_points):
                for gy in range(1, grid_points):
                    x = gx / grid_points
                    y = gy / grid_points

                    r = max_circle_radius(x, y, circles, square_size)
                    if r > best_r:
                        best_r = r
                        best_pos = (x, y)

            if best_r > 0:
                circles.append((best_pos[0], best_pos[1], best_r))

        return circles

    def refine_circles(circles, square_size, iterations=80, perturb_interval=3):
        """
        Iteratively grow each circle to its maximum radius under non-overlap constraints.
        Includes randomized update order, periodic micro-perturbation to escape
        local minima, and a final local-center-perturbation pass for densification.

        The list is modified in place and also returned.
        """
        for it in range(iterations):
            # randomize update order to avoid sweep-order bias
            indices = list(range(len(circles)))
            random.shuffle(indices)
            for i in indices:
                x, y, _ = circles[i]
                # Compute maximal feasible radius here, skipping self
                r = max_circle_radius(x, y, circles, square_size, skip_idx=i)
                circles[i] = (x, y, r)
            # Periodic micro-perturbation: jiggle a few circles
            if it % perturb_interval == 0 and len(circles) > 0:
                subset = random.sample(indices, min(5, len(circles)))
                for j in subset:
                    x0, y0, r0 = circles[j]
                    dx = random.uniform(-0.03, 0.03)
                    dy = random.uniform(-0.03, 0.03)
                    nx = min(max(x0 + dx, 0), square_size)
                    ny = min(max(y0 + dy, 0), square_size)
                    # Compute maximal radius skipping self
                    nr = max_circle_radius(nx, ny, circles, square_size, skip_idx=j)
                    # Keep the move only if it lets the circle grow
                    if nr > r0:
                        circles[j] = (nx, ny, nr)

        # Full local center-perturbation phase for final densification
        for i in range(len(circles)):
            x, y, r = circles[i]
            best_x, best_y, best_r = x, y, r
            delta = 0.1
            for _ in range(20):
                dx = random.uniform(-delta, delta)
                dy = random.uniform(-delta, delta)
                nx = min(max(x + dx, 0), square_size)
                ny = min(max(y + dy, 0), square_size)
                # Compute maximal radius skipping self
                nr = max_circle_radius(nx, ny, circles, square_size, skip_idx=i)
                if nr > best_r:
                    best_x, best_y, best_r = nx, ny, nr
                else:
                    # Shrink the sampling window after every failed attempt
                    delta *= 0.9
            circles[i] = (best_x, best_y, best_r)

        # Physics-inspired soft relaxation to escape persistent overlaps
        for i in range(len(circles)):
            x, y, r = circles[i]
            fx, fy = 0.0, 0.0
            for j, (xj, yj, rj) in enumerate(circles):
                if i == j:
                    continue
                dx = x - xj
                dy = y - yj
                d = math.sqrt(dx * dx + dy * dy)
                overlap = (r + rj) - d
                if overlap > 0 and d > 1e-8:
                    fx += dx / d * overlap
                    fy += dy / d * overlap
            # Nudge the center by 10% of the computed net "repulsive" force
            nx = min(max(x + 0.1 * fx, 0), square_size)
            ny = min(max(y + 0.1 * fy, 0), square_size)
            nr = max_circle_radius(nx, ny, circles, square_size, skip_idx=i)
            circles[i] = (nx, ny, nr)
        return circles

    def multi_start_optimize(n, square_size, starts=None):
        """
        Multi-start global -> local optimization dispatched through a
        ThreadPoolExecutor.

        When ``starts`` is None the number of trials adapts to the problem
        size: max(200, 20*n) for n <= 50, otherwise max(100, 10*n). The loop
        stops early once a trial reaches 99.5% of the plain hexagonal
        baseline. Always returns a list.
        """
        if starts is None:
            if n <= 50:
                starts = max(200, n * 20)
            else:
                starts = max(100, n * 10)
        # precompute hexagonal-packing baseline
        hex_circ = hexagonal_packing(n, square_size)
        hex_sum = sum(r for _, _, r in hex_circ)
        best_conf = None
        best_sum = 0.0

        # single trial: seed -> refine -> score
        def single_run(_):
            conf0 = optimize_placement(n, square_size)
            conf1 = refine_circles(conf0, square_size, iterations=40)
            s1 = sum(r for _, _, r in conf1)
            return s1, conf1

        # dispatch trials through the pool
        with ThreadPoolExecutor() as executor:
            for score, conf in executor.map(single_run, range(starts)):
                if score > best_sum:
                    best_sum, best_conf = score, conf.copy()
                # early exit if near the hex-baseline
                if best_sum >= hex_sum * 0.995:
                    break

        # No trial scored above zero: fall back to the baseline instead of None.
        return best_conf if best_conf is not None else hex_circ

    # Nothing to pack; the optimizer below assumes at least one circle.
    if n <= 0:
        return 0.0, []

    # Use multi-start global -> local optimization (adaptive number of starts)
    circles = multi_start_optimize(n, square_size)

    # Snapshot of the best configuration seen so far. refine_circles and pop()
    # mutate `circles` in place, hence the copy.
    best_circles = list(circles)
    best_total = sum(r for _, _, r in best_circles)

    # Quick 2-cluster remove-and-reinsert densification (extended iterations)
    for _ in range(8):
        # remove the two smallest circles to create a larger gap
        smallest = sorted(range(len(circles)), key=lambda i: circles[i][2])[:2]
        removed = [circles[i] for i in smallest]
        # pop in reverse order to keep indices valid
        for i in sorted(smallest, reverse=True):
            circles.pop(i)
        # refine the remaining configuration briefly
        circles = refine_circles(circles, square_size, iterations=8)
        # reinsert each removed circle with more sampling
        for x_old, y_old, _r_old in removed:
            best_r, best_pos = 0.0, (x_old, y_old)
            for _attempt in range(500):
                x = random.uniform(0, square_size)
                y = random.uniform(0, square_size)
                r = max_circle_radius(x, y, circles, square_size)
                if r > best_r:
                    best_r, best_pos = r, (x, y)
            circles.append((best_pos[0], best_pos[1], best_r))
        # final local polish after reinsertion
        circles = refine_circles(circles, square_size, iterations=5)

        # A round is random and may lose total radius; only remember gains.
        round_total = sum(r for _, _, r in circles)
        if round_total > best_total:
            best_total, best_circles = round_total, list(circles)
    # end 2-cluster remove-and-reinsert densification

    return best_total, best_circles