├── src ├── apebench │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ └── progress_tracker.py │ ├── inference │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── api_keys.example.py │ │ │ ├── chat_logger.py │ │ │ ├── parallel.py │ │ │ └── call_api.py │ │ ├── inference_pipelines │ │ │ ├── __init__.py │ │ │ ├── generate_judgement.py │ │ │ ├── generate_instruction.py │ │ │ ├── generate_patch.py │ │ │ └── base.py │ │ ├── prompts │ │ │ ├── __init__.py │ │ │ ├── judgement_generation_prompts.py │ │ │ ├── instruction_generation_prompts.py │ │ │ └── patch_generation_prompts.py │ │ └── run_inference.py │ ├── scripts │ │ ├── 1_generate_patches.py │ │ ├── 2_verify_patches.py │ │ └── 3_evaluate_patches.py │ ├── config │ │ ├── default_config.py │ │ └── config_manager.py │ └── evaluation_pipelines │ │ ├── verification_manager.py │ │ ├── evaluation_manager.py │ │ └── data_collector.py ├── __init__.py ├── eleanstic │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── sys_utils.py │ │ ├── lean_utils.py │ │ └── log_utils.py │ ├── core │ │ ├── __init__.py │ │ ├── status.py │ │ └── file_map.py │ └── config.yaml └── utils │ ├── __init__.py │ ├── lean_utils.py │ ├── file_utils.py │ └── colors.py ├── assets ├── diff.pdf ├── main.pdf ├── future.pdf ├── main1.pdf ├── main2.pdf ├── main3.pdf ├── main4.pdf ├── main5.pdf ├── gorilla.png ├── pipeline.jpg ├── road_map.jpg ├── APE_Bench_I_paper.pdf ├── combined_metrics_combined.pdf ├── combined_boxplot_analysis_left.pdf ├── combined_boxplot_analysis_main.pdf ├── filtered_pure_changes_comparison.pdf ├── filtered_pure_changes_individual_subplots.pdf ├── judgement_by_change_size_combined_metrics.pdf ├── filtered_pure_changes_individual_subplots_density.pdf └── filtered_mathlib4_commits_data_20250413_173625_100_waterfall.pdf ├── requirements.txt ├── .gitignore ├── LICENSE ├── configs └── config.yaml ├── submission.py ├── docs ├── 03_core_components │ ├── 03_2_apebench_data.md │ ├── 03_5_apebench_scripts_config.md │ └── 03_3_apebench_inference.md ├── 02_project_structure.md └── 04_troubleshooting.md └── run_ape_bench_example.sh /src/apebench/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | -------------------------------------------------------------------------------- /assets/diff.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/diff.pdf -------------------------------------------------------------------------------- /assets/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/main.pdf -------------------------------------------------------------------------------- /src/eleanstic/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 
2 | -------------------------------------------------------------------------------- /assets/future.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/future.pdf -------------------------------------------------------------------------------- /assets/main1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/main1.pdf -------------------------------------------------------------------------------- /assets/main2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/main2.pdf -------------------------------------------------------------------------------- /assets/main3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/main3.pdf -------------------------------------------------------------------------------- /assets/main4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/main4.pdf -------------------------------------------------------------------------------- /assets/main5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/main5.pdf -------------------------------------------------------------------------------- /assets/gorilla.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/gorilla.png -------------------------------------------------------------------------------- /assets/pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/pipeline.jpg -------------------------------------------------------------------------------- /assets/road_map.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/road_map.jpg -------------------------------------------------------------------------------- /assets/APE_Bench_I_paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/APE_Bench_I_paper.pdf -------------------------------------------------------------------------------- /assets/combined_metrics_combined.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/combined_metrics_combined.pdf -------------------------------------------------------------------------------- /assets/combined_boxplot_analysis_left.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/combined_boxplot_analysis_left.pdf -------------------------------------------------------------------------------- /assets/combined_boxplot_analysis_main.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/combined_boxplot_analysis_main.pdf -------------------------------------------------------------------------------- /assets/filtered_pure_changes_comparison.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/filtered_pure_changes_comparison.pdf -------------------------------------------------------------------------------- /assets/filtered_pure_changes_individual_subplots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/filtered_pure_changes_individual_subplots.pdf -------------------------------------------------------------------------------- /assets/judgement_by_change_size_combined_metrics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/judgement_by_change_size_combined_metrics.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | rapidfuzz 3 | tiktoken 4 | matplotlib 5 | pydantic 6 | portalocker 7 | colorlog 8 | plotly 9 | kaleido 10 | openai 11 | tenacity -------------------------------------------------------------------------------- /assets/filtered_pure_changes_individual_subplots_density.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/filtered_pure_changes_individual_subplots_density.pdf -------------------------------------------------------------------------------- /assets/filtered_mathlib4_commits_data_20250413_173625_100_waterfall.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/filtered_mathlib4_commits_data_20250413_173625_100_waterfall.pdf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | datasets/ 3 | __pycache__/ 4 | .vscode 5 | mathlib4/ 6 | ./*.parquet 7 | results*/ 8 | temp/ 9 | temp_data_sync/ 10 | chat_logs/ 11 | # outputs/ 12 | # logs/ 13 | sync_for_verify/ 14 | analyze_outputs/ 15 | old_*/ 16 | *.log 17 | verify_database/ 18 | plots/ 19 | progress 20 | outputs/ 21 | src/apebench/data_visualization 22 | src/apebench/inference/utils/api_keys.py -------------------------------------------------------------------------------- /src/eleanstic/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | """ 4 | eleanstic utility module 5 | """ 6 | 7 | from .log_utils import setup_logger, log_progress 8 | from .lean_utils import verify_with_lean, run_lake_build, run_command 9 | 10 | __all__ = [ 11 | 'setup_logger', 12 | 'log_progress', 13 | 'verify_with_lean', 14 | 'run_lake_build', 15 | 'run_command' 16 | ] -------------------------------------------------------------------------------- /src/apebench/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 
2 | 3 | """ 4 | Utility module for ApeBench 5 | """ 6 | 7 | from .metrics import extract_verification_data, extract_judgement_data, calculate_metrics, plot_metrics 8 | from .progress_tracker import ProgressTracker 9 | 10 | __all__ = [ 11 | 'extract_verification_data', 12 | 'extract_judgement_data', 13 | 'calculate_metrics', 14 | 'plot_metrics', 15 | 'ProgressTracker', 16 | ] 17 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | """ 3 | Utility module, providing file processing and visualization functionality 4 | """ 5 | 6 | from .file_utils import load_results, load_jsonl, save_jsonl, convert_to_serializable 7 | from .lean_file_parser import LeanFileAnalyzer 8 | 9 | __all__ = [ 10 | 'load_results', 11 | 'load_jsonl', 12 | 'save_jsonl', 13 | 'convert_to_serializable', 14 | 'LeanFileAnalyzer', 15 | ] 16 | -------------------------------------------------------------------------------- /src/eleanstic/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | """ 4 | eleanstic 核心模块,提供配置和协调功能 5 | """ 6 | 7 | from .config import ConfigManager 8 | from .coordinators import BuildCoordinator, VerifyCoordinator 9 | from .file_map import FileMapManager 10 | from .status import CommitStatus 11 | 12 | __all__ = [ 13 | 'ConfigManager', 14 | 'BuildCoordinator', 15 | 'VerifyCoordinator', 16 | 'FileMapManager', 17 | 'CommitStatus' 18 | ] 19 | -------------------------------------------------------------------------------- /src/apebench/inference/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | """ 4 | Utility Tools Used in the Inference Process 5 | """ 6 | 7 | from .call_api import chat, TOTAL_MODELS, REASONING_MODELS, UNSUPPORT_TEMPERATURE_MODELS 8 | from .diff_repair import DiffRepair, apply_diff 9 | from .parallel import process_with_retries 10 | 11 | __all__ = [ 12 | 'chat', 13 | 'TOTAL_MODELS', 14 | 'REASONING_MODELS', 15 | 'UNSUPPORT_TEMPERATURE_MODELS', 16 | 'DiffRepair', 17 | 'apply_diff', 18 | 'process_with_retries', 19 | ] 20 | -------------------------------------------------------------------------------- /src/apebench/inference/inference_pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | """ 3 | Inference Pipeline Module 4 | 5 | Contains pipelines for generating instructions, patches, and judgements. 
6 | """ 7 | 8 | from .base import BasePipeline 9 | from .generate_instruction import GenerateInstructionPipeline 10 | from .generate_patch import GeneratePatchPipeline 11 | from .generate_judgement import GenerateJudgementPipeline 12 | 13 | __all__ = [ 14 | 'BasePipeline', 15 | 'GenerateInstructionPipeline', 16 | 'GeneratePatchPipeline', 17 | 'GenerateJudgementPipeline' 18 | ] 19 | -------------------------------------------------------------------------------- /src/apebench/inference/utils/api_keys.example.py: -------------------------------------------------------------------------------- 1 | # API keys and endpoints for different language models 2 | # Fill in your actual API keys in this file 3 | 4 | # OpenAI API credentials (GPT models) 5 | openai_api_key = "your-openai-api-key" 6 | openai_base_url = "https://api.openai.com/v1" # Or your Azure OpenAI endpoint 7 | 8 | # Anthropic API credentials (Claude models) 9 | aws_claude_api_key = "your-anthropic-api-key" 10 | aws_claude_base_url = "https://api.anthropic.com" # Or your AWS Claude endpoint 11 | 12 | # Other API providers 13 | # DeepSeek models 14 | volces_api_key = "your-deepseek-api-key" 15 | volces_base_url = "https://api.deepseek.com" # Or other endpoint 16 | 17 | # Google API credentials 18 | google_api_key = "your-google-api-key" 19 | google_base_url = "https://generativelanguage.googleapis.com" # Or other specific Google AI service endpoint -------------------------------------------------------------------------------- /src/utils/lean_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | def remove_lean_comments(src: str) -> str: 4 | i = 0 5 | n = len(src) 6 | out = [] 7 | block_nest = 0 8 | while i < n: 9 | if block_nest == 0: 10 | if src.startswith("--", i): 11 | j = src.find("\n", i + 2) 12 | if j == -1: 13 | break 14 | else: 15 | out.append("\n") 16 | i = j + 1 17 | elif src.startswith("/-", i): 18 | block_nest = 1 19 | i += 2 20 | else: 21 | out.append(src[i]) 22 | i += 1 23 | else: 24 | if src.startswith("/-", i): 25 | block_nest += 1 26 | i += 2 27 | elif src.startswith("-/", i): 28 | block_nest -= 1 29 | i += 2 30 | else: 31 | i += 1 32 | 33 | return "".join(out) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Huajian Xin, Jacques Fleuriot, Wenda Li, Bytedance Ltd. and/or its affiliates 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /src/apebench/inference/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | """ 3 | Prompt Templates Used in the Inference Process 4 | """ 5 | 6 | from .instruction_generation_prompts import ( 7 | instruction_generation_system_prompt, 8 | instruction_generation_input_prompt, 9 | instruction_generation_input_prompt_without_lean_code 10 | ) 11 | 12 | from .patch_generation_prompts import ( 13 | patch_generation_system_prompt, 14 | patch_generation_reasoning_models_system_prompt, 15 | patch_generation_input_prompt, 16 | patch_generation_input_prompt_without_lean_code 17 | ) 18 | 19 | from .judgement_generation_prompts import ( 20 | judgement_generation_system_prompt, 21 | judgement_generation_input_prompt, 22 | judgement_generation_input_prompt_without_lean_code 23 | ) 24 | 25 | __all__ = [ 26 | 'instruction_generation_system_prompt', 27 | 'instruction_generation_input_prompt', 28 | 'instruction_generation_input_prompt_without_lean_code', 29 | 'patch_generation_system_prompt', 30 | 'patch_generation_reasoning_models_system_prompt', 31 | 'patch_generation_input_prompt', 32 | 'patch_generation_input_prompt_without_lean_code', 33 | 'judgement_generation_system_prompt', 34 | 'judgement_generation_input_prompt', 35 | 'judgement_generation_input_prompt_without_lean_code' 36 | ] 37 | -------------------------------------------------------------------------------- /src/apebench/scripts/1_generate_patches.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | #!/usr/bin/env python3 4 | """ 5 | Patch generation script, runs the patch generation process according to configuration. 
6 | """ 7 | 8 | import argparse 9 | import os 10 | import sys 11 | from typing import List 12 | 13 | def main(): 14 | """Script entry point""" 15 | # Parse command line arguments 16 | parser = argparse.ArgumentParser(description="Generate patches using multiple models and configurations") 17 | parser.add_argument("--config", type=str, default="config.yaml", help="Configuration file") 18 | args = parser.parse_args() 19 | 20 | # Ensure src can be imported 21 | script_dir = os.path.dirname(os.path.abspath(__file__)) 22 | root_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_dir))) 23 | if root_dir not in sys.path: 24 | sys.path.insert(0, root_dir) 25 | 26 | # Import modules 27 | from ..evaluation_pipelines.patch_generator import generate_patches 28 | from ..config.config_manager import ConfigManager 29 | 30 | # Load configuration to confirm generation section exists 31 | config = ConfigManager(args.config).get_config() 32 | if not hasattr(config, 'generation'): 33 | print(f"Error: Configuration file {args.config} does not have a 'generation' section") 34 | sys.exit(1) 35 | 36 | # Execute generation 37 | output_files = generate_patches(args.config) 38 | 39 | print(f"\nGeneration task completed successfully!") 40 | print(f"Generated {len(output_files)} patch files.") 41 | print(f"Next step: Run the verification script using the same configuration file.") 42 | 43 | if __name__ == "__main__": 44 | main() -------------------------------------------------------------------------------- /src/eleanstic/config.yaml: -------------------------------------------------------------------------------- 1 | # Eleanstic Environment Configuration File 2 | 3 | # Base Path Configuration 4 | paths: 5 | # Main Mathlib4 Git Repository Path 6 | mathlib_repo: "mathlib4" 7 | # Workspace Root Directory 8 | workspace_root: "verify_database" 9 | # Worktree Root Directory 10 | worktree_dir: "worktrees" 11 | # Content Storage Root Directory 12 | storage_dir: "storage" 13 | # Cache Directory 14 | cache_dir: "cache" 15 | # Log Directory 16 | log_dir: "logs" 17 | # Verification Results Directory 18 | verify_results_dir: "./verify_results" 19 | 20 | # Concurrency Settings 21 | concurrency: 22 | # Maximum Worker Processes 23 | max_workers: 180 24 | # Maximum Parallel File Storage Threads 25 | max_concurrent_file_storage: 8 26 | # Maximum Parallel Lean Verification Threads 27 | max_concurrent_lean_verifications: 64 28 | 29 | # Storage Settings 30 | storage: 31 | # Hash Algorithm (xxhash64/sha256) 32 | hash_algorithm: "sha256" 33 | # Whether to Delete Cache After Build 34 | remove_worktree_after_build: true 35 | 36 | # Cache Settings 37 | cache: 38 | # Number of Download Retries 39 | download_retries: 10 40 | # Download Timeout (seconds) 41 | download_timeout: 3600 42 | # Wait Time Between Retry Attempts (seconds) 43 | retry_wait: 30 44 | 45 | # Logging Settings 46 | logging: 47 | # Log Level (DEBUG/INFO/WARNING/ERROR/CRITICAL) 48 | level: "INFO" 49 | # Maximum Log File Size (MB) 50 | max_size_mb: 100 51 | # Number of Log Files to Keep 52 | backup_count: 10 53 | # Whether to Output to Console 54 | console_output: true 55 | # Whether to Use Colored Logs 56 | color_output: true 57 | 58 | # Verification Settings 59 | verification: 60 | # Verification Timeout (seconds) 61 | timeout: 120 62 | -------------------------------------------------------------------------------- /src/apebench/scripts/2_verify_patches.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) 
Bytedance Ltd. and/or its affiliates. 2 | 3 | """ 4 | Patch verification script, runs the patch verification process according to configuration. 5 | """ 6 | 7 | import argparse 8 | import os 9 | import sys 10 | from typing import List, Optional 11 | 12 | def main(): 13 | """Script entry point""" 14 | # Parse command line arguments 15 | parser = argparse.ArgumentParser(description="Verify generated patches") 16 | parser.add_argument("--config", type=str, default="config.yaml", help="Configuration file") 17 | parser.add_argument("--input_files", type=str, nargs="*", help="Optional list of generation output files") 18 | args = parser.parse_args() 19 | 20 | # Ensure src can be imported 21 | script_dir = os.path.dirname(os.path.abspath(__file__)) 22 | root_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_dir))) 23 | if root_dir not in sys.path: 24 | sys.path.insert(0, root_dir) 25 | 26 | # Import modules 27 | from ..evaluation_pipelines.verification_manager import verify_patches 28 | 29 | # Execute verification 30 | metrics = verify_patches(args.config, args.input_files) 31 | 32 | print(f"\nVerification completed successfully!") 33 | 34 | # Print verification metrics - using Markdown table format 35 | print("\n## Verification metrics") 36 | for model, model_metrics in metrics.items(): 37 | print(f"\n### Model: {model}") 38 | for key in model_metrics: 39 | temp, n_responses = key.split(',') 40 | print(f"\nTemperature {temp}, n_responses {n_responses}") 41 | 42 | # Create table header 43 | print("\n| Metric | Value |") 44 | print("|--------|-------|") 45 | 46 | # Create table body 47 | for metric_name, value in model_metrics[key].items(): 48 | print(f"| {metric_name} | {value:.4f} |") 49 | 50 | print(f"\nNext step: Run the evaluation script with the same config file.") 51 | 52 | if __name__ == "__main__": 53 | main() -------------------------------------------------------------------------------- /src/apebench/config/default_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 
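# Note: these defaults are the base layer; ConfigManager (src/apebench/config/config_manager.py) copies this dict and recursively merges any user-supplied YAML/JSON config over it, so the values below apply only where a config file does not override them.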
2 | """ 3 | Default configuration definition module 4 | """ 5 | 6 | DEFAULT_CONFIG = { 7 | # Global settings 8 | "project_dir": "./", 9 | "output_dir": "./outputs", 10 | "temp_dir": "./temp", 11 | "progress_log": "./logs/progress.json", 12 | 13 | # Data settings 14 | "input_file": "./datasets/ape_bench1_valid_test.parquet", 15 | 16 | # Data collection configuration 17 | "data_collection": { 18 | "dataset_dir": "datasets", 19 | "repo_url": "https://github.com/leanprover-community/mathlib4.git", 20 | "repo_path": "mathlib4", 21 | "max_diff_lines": 100, 22 | "latest_num_data": 2000, 23 | "instruction_model": "aws_sdk_claude37_sonnet@thinking", 24 | "judgement_model": "aws_sdk_claude37_sonnet@thinking", 25 | "max_workers": 8, 26 | "max_tokens": 20000, 27 | "thinking_budget_tokens": 16000 28 | }, 29 | 30 | # Patch generation configuration 31 | "generation": { 32 | "base_output_dir": "./outputs/patch", 33 | "parallel_models": True, # Different models executed in parallel 34 | "parallel_configs": False, # Same model with different configs executed serially 35 | "max_model_workers": 4, # Number of models to execute in parallel 36 | "models": [ 37 | { 38 | "name": "deepseek-v3-250324", 39 | "configs": [ 40 | {"temperature": 0.0, "n_responses": 1, "max_workers": 48}, 41 | {"temperature": 0.6, "n_responses": 20, "max_workers": 48} 42 | ] 43 | } 44 | ] 45 | }, 46 | 47 | # Verification configuration 48 | "verification": { 49 | "eleanstic_config": "./src/eleanstic/config.yaml", 50 | "max_workers": 128, 51 | "results_dir": "./verify_results" 52 | }, 53 | 54 | # Judgment generation configuration 55 | "judgement": { 56 | "model_name": "aws_sdk_claude37_sonnet@thinking", 57 | "temperature": 0.0, 58 | "n_responses": 1, 59 | "max_workers": 8 60 | }, 61 | 62 | # Evaluation configuration 63 | "evaluation": { 64 | "k_ratio": 0.8, 65 | "generate_plots": True, 66 | "plots_dir": "./plots" 67 | } 68 | } -------------------------------------------------------------------------------- /src/apebench/scripts/3_evaluate_patches.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | #!/usr/bin/env python3 4 | """ 5 | Patch evaluation script, runs the patch evaluation process according to configuration. 
6 | """ 7 | 8 | import argparse 9 | import os 10 | import sys 11 | import json 12 | from typing import Dict, Any, Optional 13 | 14 | def main(): 15 | """Script entry point""" 16 | # Parse command line arguments 17 | parser = argparse.ArgumentParser(description="Evaluate verified patches") 18 | parser.add_argument("--config", type=str, default="config.yaml", help="Configuration file") 19 | parser.add_argument("--merged_file", type=str, help="Optional merged results file from verification") 20 | args = parser.parse_args() 21 | 22 | # Ensure src can be imported 23 | script_dir = os.path.dirname(os.path.abspath(__file__)) 24 | root_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_dir))) 25 | if root_dir not in sys.path: 26 | sys.path.insert(0, root_dir) 27 | 28 | # Import modules 29 | from ..evaluation_pipelines.evaluation_manager import evaluate_patches 30 | 31 | # Execute evaluation 32 | verification_metrics, metrics = evaluate_patches(args.config, args.merged_file) 33 | 34 | # Print result summary 35 | print("\nEvaluation completed successfully!") 36 | print("Metrics summary:") 37 | 38 | # Print verification metrics - using Markdown table format 39 | print("\n## Verification metrics") 40 | for model, model_metrics in verification_metrics.items(): 41 | print(f"\n### Model: {model}") 42 | for key in model_metrics: 43 | temp, n_responses = key.split(',') 44 | print(f"\nTemperature {temp}, n_responses {n_responses}") 45 | 46 | # Create table header 47 | print("\n| Metric | Value |") 48 | print("|--------|-------|") 49 | 50 | # Create table body 51 | for metric_name, value in model_metrics[key].items(): 52 | print(f"| {metric_name} | {value * 100:.2f}% |") 53 | 54 | # Print judgment metrics - using Markdown table format 55 | print("\n## Judgement metrics") 56 | for model, model_metrics in metrics.items(): 57 | print(f"\n### Model: {model}") 58 | for key in model_metrics: 59 | temp, n_responses = key.split(',') 60 | print(f"\nTemperature {temp}, n_responses {n_responses}") 61 | 62 | # Create table header 63 | print("\n| Metric | Value |") 64 | print("|--------|-------|") 65 | 66 | # Create table body 67 | for metric_name, value in model_metrics[key].items(): 68 | print(f"| {metric_name} | {value * 100:.2f}% |") 69 | 70 | if __name__ == "__main__": 71 | main() -------------------------------------------------------------------------------- /configs/config.yaml: -------------------------------------------------------------------------------- 1 | project_dir: "./" 2 | output_dir: "./outputs" 3 | temp_dir: "./temp" 4 | progress_log: "./progress/config_progress.json" 5 | input_file: "./datasets/ape_bench1_test.parquet" 6 | 7 | # Data collection configuration 8 | data_collection: 9 | # Dataset directory 10 | dataset_dir: "datasets" 11 | # Code repository information 12 | repo_url: "https://github.com/leanprover-community/mathlib4.git" 13 | repo_path: "mathlib4" 14 | # Data collection parameters 15 | max_diff_lines: 100 16 | latest_num_data: 2000 17 | # Model configuration 18 | instruction_model: "aws_sdk_claude37_sonnet@thinking" 19 | judgement_model: "aws_sdk_claude37_sonnet@thinking" 20 | max_workers: 8 21 | max_tokens: 20000 22 | thinking_budget_tokens: 16000 23 | 24 | generation: 25 | base_output_dir: "./outputs/patch" 26 | parallel_models: true 27 | parallel_configs: false 28 | max_model_workers: 16 29 | models: 30 | - name: "deepseek-v3-250324" 31 | configs: 32 | - temperature: 0.0 33 | n_responses: 1 34 | max_workers: 48 35 | - temperature: 0.6 36 | n_responses: 20 37 | max_workers: 
48 38 | - name: "aws_sdk_claude37_sonnet" 39 | configs: 40 | - temperature: 0.0 41 | n_responses: 1 42 | max_workers: 4 43 | - temperature: 0.6 44 | n_responses: 20 45 | max_workers: 4 46 | - name: "aws_sdk_claude37_sonnet@thinking" 47 | configs: 48 | - temperature: 0.0 49 | n_responses: 20 50 | max_workers: 8 51 | - name: "deepseek-r1-250120" 52 | configs: 53 | - temperature: 0.0 54 | n_responses: 1 55 | max_workers: 32 56 | - temperature: 0.6 57 | n_responses: 20 58 | max_workers: 32 59 | - name: "gpt-4o-2024-08-06" 60 | configs: 61 | - temperature: 0.0 62 | n_responses: 1 63 | max_workers: 4 64 | - temperature: 0.6 65 | n_responses: 20 66 | max_workers: 4 67 | - name: "doubao-1-5-pro-32k-250115" 68 | configs: 69 | - temperature: 0.0 70 | n_responses: 1 71 | max_workers: 32 72 | - temperature: 0.6 73 | n_responses: 20 74 | max_workers: 64 75 | - name: "o3-mini" 76 | configs: 77 | - temperature: 0.0 78 | n_responses: 20 79 | max_workers: 4 80 | - name: "gemini-2.5-pro-preview-03-25" 81 | configs: 82 | - temperature: 0.0 83 | n_responses: 1 84 | max_workers: 64 85 | - temperature: 0.6 86 | n_responses: 20 87 | max_workers: 64 88 | 89 | verification: 90 | eleanstic_config: "./src/eleanstic/config.yaml" 91 | results_dir: "./verify_results" 92 | max_workers: 180 93 | 94 | judgement: 95 | model_name: "aws_sdk_claude37_sonnet@thinking" 96 | temperature: 0.0 97 | n_responses: 4 98 | max_workers: 64 99 | 100 | evaluation: 101 | k_ratio: 0.8 102 | generate_plots: true 103 | plots_dir: "./plots" -------------------------------------------------------------------------------- /src/eleanstic/utils/sys_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | """ 3 | A script to find and kill all processes containing a specific command pattern 4 | """ 5 | 6 | import os 7 | import subprocess 8 | import sys 9 | import signal 10 | 11 | def find_and_kill_processes(pattern): 12 | """ 13 | Find and kill all processes containing the specified pattern 14 | 15 | Args: 16 | pattern: Command pattern to search for 17 | 18 | Returns: 19 | killed_count: Number of processes killed 20 | """ 21 | # Use ps command to find all processes 22 | try: 23 | ps_output = subprocess.check_output( 24 | ["ps", "-ef"], 25 | universal_newlines=True 26 | ) 27 | except subprocess.SubprocessError as e: 28 | print(f"Error running ps command: {e}") 29 | return 0 30 | 31 | killed_count = 0 32 | current_pid = os.getpid() # Get current script's PID 33 | 34 | # Iterate through all process lines 35 | for line in ps_output.strip().split('\n')[1:]: # Skip header line 36 | parts = line.split() 37 | if len(parts) < 8: 38 | continue 39 | 40 | pid = int(parts[1]) 41 | cmd = ' '.join(parts[7:]) 42 | 43 | # If a matching process is found and it's not the current script itself 44 | if pattern in cmd and pid != current_pid: 45 | try: 46 | print(f"Terminating process {pid}: {cmd}") 47 | os.kill(pid, signal.SIGTERM) 48 | killed_count += 1 49 | except OSError as e: 50 | print(f"Error terminating process {pid}: {e}") 51 | 52 | return killed_count 53 | 54 | if __name__ == "__main__": 55 | # Command pattern to search for 56 | for pattern in ["eleanstic", "lean", "lake"]: 57 | print(f"Finding and terminating processes containing '{pattern}'...") 58 | killed = find_and_kill_processes(pattern) 59 | 60 | if killed == 0: 61 | print("No matching processes found") 62 | else: 63 | print(f"Successfully terminated {killed} processes") 64 | 65 | # Helper commands for 
monitoring disk space and directory sizes: 66 | # 1. Monitor free space on the mounted volume every 20 seconds: 67 | # while true; do echo "$(date) - Storage space: $(df -h | grep -E '/mnt/bd/ape-bench-dev$' | awk '{print $4}')"; sleep 20; done 68 | 69 | # 2. Monitor used space on root and size of verify database storage directory every 60 seconds: 70 | # while true; do echo "$(date) - Storage space: $(df -h | grep -E '/$' | awk '{print $3}') - Directory size: $(du -sh /mnt/bd/ape-bench-dev/ape-bench1/datasets/verify_database/storage/partitions 2>/dev/null || echo 'Cannot access')"; sleep 60; done 71 | 72 | # 3. Monitor used space on mounted volume and size of verify database storage partitions every 20 seconds: 73 | # while true; do echo "$(date) - Storage space: $(df -h | grep -E '/mnt/bd/ape-bench-dev$' | awk '{print $3}') - Directory size: $(du -sh /mnt/bd/ape-bench-dev/ape-bench1/datasets/verify_database/storage/partitions 2>/dev/null | awk '{print $1}')"; sleep 20; done -------------------------------------------------------------------------------- /src/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | import os 4 | import glob 5 | import json 6 | import pandas as pd 7 | from tqdm import tqdm 8 | import numpy as np 9 | 10 | def convert_to_serializable(obj): 11 | if isinstance(obj, (str, int, float, bool, type(None))): 12 | return obj 13 | elif isinstance(obj, (set, frozenset)): 14 | return list(obj) 15 | elif isinstance(obj, dict): 16 | return {key: convert_to_serializable(value) for key, value in obj.items()} 17 | elif isinstance(obj, (list, tuple)): 18 | return [convert_to_serializable(item) for item in obj] 19 | elif hasattr(obj, 'isoformat'): 20 | return obj.isoformat() 21 | elif isinstance(obj, np.ndarray): 22 | return [convert_to_serializable(item) for item in obj] 23 | else: 24 | try: 25 | return str(obj) 26 | except: 27 | return None 28 | 29 | def load_results(file_paths): 30 | """ 31 | Load results from files matching the pattern. 
32 | 33 | Args: 34 | file_paths (str): Directory containing result files 35 | file_pattern (str): Pattern to match result files 36 | 37 | Returns: 38 | pd.DataFrame: Combined DataFrame from all matching files 39 | """ 40 | if isinstance(file_paths, list): 41 | file_paths = [file_path for file_path_pattern in file_paths for file_path in glob.glob(file_path_pattern)] 42 | if isinstance(file_paths, str): 43 | file_paths = glob.glob(file_paths) 44 | 45 | if not file_paths: 46 | print(f"Warning: No files found matching {file_paths}") 47 | return pd.DataFrame() 48 | 49 | print(f"Found {len(file_paths)} files matching {file_paths}") 50 | 51 | all_data = [] 52 | for file_path in tqdm(file_paths, desc="Loading files"): 53 | try: 54 | if file_path.endswith('.parquet'): 55 | df = pd.read_parquet(file_path) 56 | elif file_path.endswith('.jsonl') or file_path.endswith('.json'): 57 | # Read JSONL files 58 | records = load_jsonl(file_path) 59 | df = pd.DataFrame(records) 60 | else: 61 | print(f"Warning: Unsupported file format for {file_path}") 62 | continue 63 | 64 | all_data.append(df) 65 | except Exception as e: 66 | print(f"Error loading {file_path}: {str(e)}") 67 | 68 | if not all_data: 69 | print(f"Warning: Could not load any data from files matching {file_path}") 70 | return pd.DataFrame() 71 | 72 | combined_data = pd.concat(all_data, ignore_index=True) 73 | print(f"Loaded {len(combined_data)} entries from {len(all_data)} files") 74 | 75 | return combined_data 76 | 77 | def load_jsonl(input_path): 78 | with open(input_path, 'r') as f: 79 | data = [json.loads(line.strip()) for line in f] 80 | return data 81 | 82 | def save_jsonl(data, output_path): 83 | with open(output_path, 'w') as f: 84 | if isinstance(data, pd.DataFrame): 85 | for _, row in data.iterrows(): 86 | f.write(json.dumps(row.to_dict(), ensure_ascii=False) + '\n') 87 | else: 88 | for item in data: 89 | f.write(json.dumps(item, ensure_ascii=False) + '\n') 90 | -------------------------------------------------------------------------------- /submission.py: -------------------------------------------------------------------------------- 1 | """DO NOT rename this file!""" 2 | import os 3 | import re 4 | import json 5 | import textwrap 6 | import sys 7 | 8 | import openai 9 | 10 | from tqdm import tqdm 11 | 12 | 13 | class Submission: 14 | """A submission template. """ 15 | 16 | def __init__(self, output_file: str): 17 | """You need to specify the following arguments.""" 18 | 19 | self.output_file = output_file 20 | 21 | self.task = "Auto_Formalization" # [Auto_Formalization, Auto_Informalization] 22 | self.phase = "development" # [development, final] 23 | 24 | self.base_url = "http://120.77.8.29:12345/v1/" # The base url of the model server 25 | # If you are using OpenAI API or have set API key for 26 | # your own model, please fill in your API key 27 | self.api_key = "EMPTY" 28 | self.model = "./Mistral-7B-Instruct-v0.2" # Your own model path, or GPTs 29 | self.prompt = textwrap.dedent(""" 30 | You are a math expert and familar with Lean 3 formal language. 31 | Now please translate the following statement and solution of a math 32 | word problem into Lean 3 formal solution. Please note that the 33 | informal solution and the formal solution need to be identical. 
34 | # Problem: {{informal_statement}} 35 | # Solution: {{informal_proof}} 36 | # Formal solution in Lean 3: 37 | """) 38 | 39 | # custom generation parameters 40 | self.max_tokens = 256 41 | self.temperature = 0.9 42 | self.top_p = 0.7 43 | self.frequency_penalty = 0.0 44 | 45 | def generate(self, prompt): 46 | """We DO NOT recommend modifying this function, as 47 | it will be used to test if the model is accessable""" 48 | 49 | openai.api_key = self.api_key 50 | openai.base_url = self.base_url 51 | 52 | messages = [ 53 | {"role": "user", "content": prompt}, 54 | ] 55 | 56 | completion = openai.chat.completions.create( 57 | model=self.model, messages=messages, max_tokens=self.max_tokens, 58 | temperature=self.temperature, top_p=self.top_p, 59 | frequency_penalty=self.frequency_penalty, 60 | ) 61 | 62 | return completion.choices[0].message.content 63 | 64 | def post_process(self, model_output: str): 65 | """You can post-process the model output here, 66 | such as extracting the formal proof from the model output.""" 67 | 68 | formal_proof = re.findall(r'```[\S\s]*```', model_output) 69 | if formal_proof == []: 70 | formal_proof = re.findall(r'```[\S\s]*', model_output) 71 | if formal_proof == []: 72 | formal_proof = [model_output] 73 | formal_proof = formal_proof[-1].strip() 74 | 75 | lean_code = "\n".join(formal_proof.strip().split("\n")[1:-1]) # remove ```lean ``` 76 | lean_code = re.sub(pattern=r'line [0-9]* ', repl='', string=lean_code) # remove line * 77 | 78 | return lean_code 79 | 80 | def run(self, input_data: str): 81 | """Run your model on the given input data, and store the 82 | predictions into the output file.""" 83 | 84 | with open(input_data, 'r', encoding="utf8") as f: 85 | datas = json.load(f) 86 | 87 | outputs = [] 88 | for data in tqdm(datas[:10], file=sys.stdout): 89 | input_text = self.prompt.format( 90 | informal_statement=data["informal_statement"], 91 | informal_proof=data["informal_proof"] 92 | ) 93 | 94 | output = self.generate(prompt=input_text) 95 | outputs.append(dict( 96 | name=data["name"], 97 | formal_proof=self.post_process(output), 98 | )) 99 | 100 | if not os.path.exists(self.output_file): 101 | os.makedirs(os.path.dirname(self.output_file), exist_ok=True) 102 | with open(self.output_file, 'w', encoding='utf8') as f: 103 | json.dump(outputs, f, indent=4, ensure_ascii=False) 104 | -------------------------------------------------------------------------------- /src/apebench/config/config_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 
2 | """ 3 | Configuration management module responsible for loading, validating, and providing access to configurations 4 | """ 5 | 6 | import os 7 | import json 8 | import yaml 9 | from typing import Dict, Any, Union 10 | from .default_config import DEFAULT_CONFIG 11 | 12 | class ConfigDict: 13 | """Class that allows dictionary data to be accessed via attributes""" 14 | 15 | def __init__(self, config_data: Dict[str, Any]): 16 | for key, value in config_data.items(): 17 | if isinstance(value, dict): 18 | setattr(self, key, ConfigDict(value)) 19 | else: 20 | setattr(self, key, value) 21 | 22 | def to_dict(self) -> Dict[str, Any]: 23 | """Convert configuration back to a dictionary""" 24 | result = {} 25 | for key, value in self.__dict__.items(): 26 | if isinstance(value, ConfigDict): 27 | result[key] = value.to_dict() 28 | else: 29 | result[key] = value 30 | return result 31 | 32 | class ConfigManager: 33 | """Configuration manager""" 34 | 35 | def __init__(self, config_file: str = None): 36 | """ 37 | Initialize the configuration manager 38 | 39 | Args: 40 | config_file: Optional path to a configuration file 41 | """ 42 | # Load default configuration 43 | self.config_data = DEFAULT_CONFIG.copy() 44 | 45 | # If a configuration file is provided, load and merge it 46 | if config_file and os.path.exists(config_file): 47 | self._load_from_file(config_file) 48 | 49 | # Convert to attribute access form 50 | if config_file and not 'progress_log' in self.config_data: 51 | self.config_data['progress_log'] = config_file[ : config_file.find('.')] + '_progress.json' 52 | self.config = ConfigDict(self.config_data) 53 | 54 | def _load_from_file(self, config_file: str) -> None: 55 | """Load configuration from file and merge it""" 56 | file_extension = os.path.splitext(config_file)[1].lower() 57 | 58 | try: 59 | if file_extension == '.json': 60 | with open(config_file, 'r') as f: 61 | user_config = json.load(f) 62 | elif file_extension in ('.yaml', '.yml'): 63 | with open(config_file, 'r') as f: 64 | user_config = yaml.safe_load(f) 65 | else: 66 | raise ValueError(f"Unsupported config file format: {file_extension}") 67 | 68 | # Recursively merge configurations 69 | self._merge_configs(self.config_data, user_config) 70 | except Exception as e: 71 | print(f"Error loading config file: {e}") 72 | 73 | def _merge_configs(self, base: Dict[str, Any], override: Dict[str, Any]) -> None: 74 | """Recursively merge configuration dictionaries""" 75 | for key, value in override.items(): 76 | if key in base and isinstance(base[key], dict) and isinstance(value, dict): 77 | self._merge_configs(base[key], value) 78 | else: 79 | base[key] = value 80 | 81 | def get_config(self) -> ConfigDict: 82 | """Get the configuration object""" 83 | return self.config 84 | 85 | def save_config(self, output_file: str) -> None: 86 | """Save current configuration to a file""" 87 | file_extension = os.path.splitext(output_file)[1].lower() 88 | 89 | try: 90 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 91 | 92 | if file_extension == '.json': 93 | with open(output_file, 'w') as f: 94 | json.dump(self.config_data, f, indent=2) 95 | elif file_extension in ('.yaml', '.yml'): 96 | with open(output_file, 'w') as f: 97 | yaml.dump(self.config_data, f, default_flow_style=False) 98 | else: 99 | raise ValueError(f"Unsupported config file format: {file_extension}") 100 | except Exception as e: 101 | print(f"Error saving config file: {e}") -------------------------------------------------------------------------------- 
/docs/03_core_components/03_2_apebench_data.md: -------------------------------------------------------------------------------- 1 | [English](#english-version) | [中文](#chinese-version) 2 | 3 | 4 | # 3.2 Data Handling: Tasks and Format 5 | 6 | This section describes how APE-Bench I tasks are structured, where the data comes from, and how it's handled by the `src/apebench/data/` modules. 7 | 8 | ## Task Format 9 | 10 | As specified in the APE-Bench I paper (Section 3.1), each task in the benchmark is a triplet: `(Instruction, PreFile, Patch)`. 11 | 12 | * **`Instruction`**: A natural language string describing the intended modification to a Lean file. This serves as the main prompt for the LLM being evaluated. 13 | * *Example*: "Refactor the proof of `theorem_xyz` to use `lemma_abc`." or "Add a new definition `new_function` with the following properties..." 14 | * **`PreFile`**: A string containing the complete Lean source code of the target file *before* the edit. This provides the full context for the LLM. 15 | * **`Patch`**: A string in the unified diff format that encodes the ground-truth edit. This patch, when applied to `PreFile`, should result in the desired post-edit state of the file. 16 | * This is used as the reference for evaluating LLM-generated patches, although direct diff matching is not the primary success metric (semantic correctness is key). 17 | 18 | Additional metadata associated with each task in the test set includes: 19 | * **Task ID**: A unique identifier for the task. 20 | * **Commit SHA**: The Mathlib4 commit from which the task was derived. 21 | * **File Path**: The path to the specific Lean file within the Mathlib commit. 22 | * **Task Category**: One of `Feature`, `Refactor`, or `Bug Fix` (as defined in paper Section 3.3). 23 | * **Difficulty Level**: One of `Easy`, `Medium`, or `Hard` (as defined in paper Section 3.3). 24 | 25 | ## Data Source 26 | 27 | The APE-Bench I dataset is hosted on Hugging Face: 28 | * **URL**: [https://huggingface.co/datasets/HuajianXin/APE-Bench_I](https://huggingface.co/datasets/HuajianXin/APE-Bench_I) 29 | 30 | During setup, you must clone this dataset into the `datasets/` directory within your project. The primary test dataset file is named `ape_bench1_test.parquet`. 31 | 32 | ## Data Handling in `src/apebench/data/` 33 | 34 | The modules within `src/apebench/data/` are responsible for: 35 | 36 | * **Loading Tasks**: Reading the benchmark data files (from `datasets/`) into memory, supporting both JSONL and Parquet formats. 37 | * **Parsing**: Extracting the `Instruction`, `PreFile`, `Patch`, and other metadata for each task. 38 | * **Data Representation**: Converting the raw data into Python objects for easier use throughout the application. 39 | * **Filtering/Selection**: Providing utilities to classify specific tasks based on criteria like ID, category, or difficulty. 40 | 41 | --- 42 | 43 | 44 | ## 中文翻译 (Chinese Translation) 45 | 46 | # 3.2 数据处理:任务与格式 47 | 48 | 本节描述 APE-Bench I 任务的结构、数据来源以及 `src/apebench/data/` 模块如何处理这些数据。 49 | 50 | ## 任务格式 51 | 52 | 正如 APE-Bench I 论文(第 3.1 节)所明确指出的,基准测试中的每个任务都是一个三元组:`(Instruction, PreFile, Patch)`。 53 | 54 | * **`Instruction` (指令)**:一个自然语言字符串,描述对 Lean 文件的预期修改。这是被评估 LLM 的主要提示。 55 | * *示例*:"将 `theorem_xyz` 的证明重构为使用 `lemma_abc`。"或"添加一个具有以下属性的新定义 `new_function`..." 
56 | * **`PreFile` (修改前文件)**:一个包含编辑前目标文件完整 Lean 源代码的字符串。这为 LLM 提供了完整的上下文。 57 | * **`Patch` (补丁)**:一个统一差异格式的字符串,编码了真实的编辑。当此补丁应用于 `PreFile` 时,应产生文件所需的编辑后状态。 58 | * 这被用作评估 LLM 生成补丁的参考,尽管直接的差异匹配不是主要的成功指标(语义正确性是关键)。 59 | 60 | 测试集中与每个任务相关的其他元数据包括: 61 | * **Task ID (任务 ID)**:任务的唯一标识符。 62 | * **Commit SHA (提交 SHA)**:任务来源的 Mathlib4 提交的 SHA 值。 63 | * **File Path (文件路径)**:Mathlib 提交中特定 Lean 文件的路径。 64 | * **Task Category (任务类别)**:`Feature` (功能)、`Refactor` (重构)或`Bug Fix` (错误修复)之一(根据论文第 3.3 节定义)。 65 | * **Difficulty Level (难度级别)**:`Easy` (简单)、`Medium` (中等)或`Hard` (困难)之一(根据论文第 3.3 节定义)。 66 | 67 | ## 数据来源 68 | 69 | APE-Bench I 数据集托管在 Hugging Face 上: 70 | * **URL**: [https://huggingface.co/datasets/HuajianXin/APE-Bench_I](https://huggingface.co/datasets/HuajianXin/APE-Bench_I) 71 | 72 | 在设置过程中,您必须将此数据集克隆到项目中的 `datasets/` 目录。主要的测试数据集文件名为 `ape_bench1_test.parquet`。 73 | 74 | ## `src/apebench/data/` 中的数据处理 75 | 76 | `src/apebench/data/` 中的模块负责: 77 | 78 | * **加载任务**:从 `datasets/` 内存中读取基准测试数据文件,支持 JSONL 和 Parquet 格式。 79 | * **解析**:为每个任务提取 `Instruction`、`PreFile`、`Patch` 和其他元数据。 80 | * **数据表示**:将原始数据转换为 Python 对象,以便在整个应用程序中更轻松地使用。 81 | * **筛选/选择**:提供根据 ID、类别或难度等标准分类。 82 | 83 | --- 84 | 85 | 下一节: [LLM 推理与 DiffRepair](./03_3_apebench_inference.md) -------------------------------------------------------------------------------- /src/apebench/inference/prompts/judgement_generation_prompts.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | judgement_generation_system_prompt = '''## Task 4 | Analyze the provided Lean4 code patch comprehensively to determine whether it correctly, completely, and clearly implements the specified tasks. The patch has already been verified by Lean's type checker, thus types and tactics are assumed correct. Your goal is to provide a thorough yet contextually flexible evaluation, assessing how well the patch aligns with task requirements and integrates into the existing codebase. 5 | 6 | Use the following steps and criteria **as references** to guide your analysis. **Do not mechanically adhere to these steps; instead, adapt their use according to the specific context and significance of each element in the provided code.** Aim for a comprehensive, flexible, and nuanced evaluation rather than a rigid checklist. 7 | 8 | --- 9 | 10 | ### Step 1: Task Understanding (Reference) 11 | - Summarize core and implied requirements clearly. 12 | - Identify any explicit or implicit constraints. 13 | - Clarify expected outcomes and note any ambiguities. 14 | 15 | --- 16 | 17 | ### Step 2: Original Code Analysis (Reference) 18 | - Provide a concise summary of the original code structure and purpose. 19 | - Highlight key definitions, lemmas, theorems, proofs, assumptions, or dependencies relevant to the patch. 20 | - Evaluate logical flow and proof structure as contextually appropriate. 21 | 22 | --- 23 | 24 | ### Step 3: Patch Examination (Reference) 25 | - Clearly describe the elements added, modified, or removed. 26 | - Evaluate the logical clarity, correctness, and efficacy of modifications. 27 | - Consider appropriate use of Lean4-specific features (e.g., inductive types, macros, notations). 28 | 29 | --- 30 | 31 | ### Step 4: Requirement Fulfillment Analysis (Reference) 32 | For each provided task, evaluate (as contextually relevant): 33 | - Accuracy and completeness of achieving core objectives. 34 | - Logical thoroughness and consideration of edge cases. 
35 | - Mathematical and type-theoretic correctness. 36 | - Consistency with existing design patterns and coding standards. 37 | 38 | --- 39 | 40 | ### Step 5: Implementation Quality Analysis (Reference) 41 | Evaluate implementation quality with respect to: 42 | - Mathematical abstraction, modularity, and hierarchical structure. 43 | - Clarity, naming conventions, and documentation effectiveness. 44 | - Logical decomposition, proof readability, and maintainability. 45 | - Software engineering principles (single responsibility, interface rationality). 46 | - Appropriate use of Lean-specific techniques (metaprogramming, universes, computational vs. proof separation). 47 | - Future-proofing, extensibility, and integration within mathlib standards. 48 | 49 | --- 50 | 51 | ### Step 6: Overall Judgement (Required) 52 | Based on your comprehensive analysis, provide structured final grades **without additional justification**, strictly using the JSON format below for clear information extraction: 53 | 54 | ```json 55 | { 56 | "TaskEvaluations": { 57 | "Task 1": "Excellent | Good | Acceptable | Poor | Unacceptable", 58 | "Task 2": "Excellent | Good | Acceptable | Poor | Unacceptable" 59 | // Add additional tasks as necessary 60 | }, 61 | "FinalOverallGrade": "Excellent | Good | Acceptable | Poor | Unacceptable" 62 | } 63 | ``` 64 | 65 | --- 66 | 67 | **Reminder:** Prioritize flexible, context-sensitive analysis. Reference provided steps and criteria only as guidelines, adapting your evaluation according to actual significance and context of the provided Lean4 code patch. 68 | ''' 69 | 70 | judgement_generation_input_prompt = '''# Lean4 Code Evaluation Request 71 | 72 | ## Original Source Code: {filename} 73 | 74 | ```lean 75 | {lean_code} 76 | ``` 77 | 78 | ## Task Requirements 79 | 80 | {instruction} 81 | 82 | ## Proposed Implementation 83 | 84 | ```diff 85 | {raw_patch} 86 | ``` 87 | 88 | Please evaluate whether this implementation properly fulfills the task requirements. 89 | ''' 90 | 91 | judgement_generation_input_prompt_without_lean_code = '''# Lean4 Code Evaluation Request 92 | 93 | ## Original Source Code Status 94 | 95 | This is a new file creation with no pre-existing code. 96 | 97 | ## Task Requirements 98 | 99 | {instruction} 100 | 101 | ## Proposed Implementation 102 | 103 | ```diff 104 | {raw_patch} 105 | ``` 106 | 107 | Please evaluate whether this implementation properly fulfills the task requirements. 108 | ''' -------------------------------------------------------------------------------- /src/apebench/inference/prompts/instruction_generation_prompts.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | instruction_generation_system_prompt = '''# Task Overview 4 | 5 | Your goal is to transform given Lean code modifications (diffs) for a given Lean file into structured, precise, and self-contained Lean exercises suitable for practicing mathematical reasoning and proof engineering. Each generated exercise should be concise yet comprehensive enough for practitioners to reconstruct the exact changes based solely on the provided exercise. 6 | 7 | You will complete the following three steps explicitly and systematically. Each step must clearly connect logically to the next, ensuring an integrated, coherent result. 8 | 9 | --- 10 | 11 | Step 1: Diff Analysis 12 | 13 | Instructions: 14 | - Carefully examine each diff hunk in detail. 
15 | - For **each modified Lean declaration** (`def`, `lemma`, `theorem`, `class`, `instance`, etc.): 16 | - Clearly state the diff hunk span (e.g., `@@ -12,7 +12,7 @@`). 17 | - Precisely describe what was **added, removed, or changed** within the declaration. 18 | - Clearly outline the mathematical meaning or implication of each modification. 19 | - Identify and summarize the overall mathematical context of the entire diff. 20 | 21 | --- 22 | 23 | Step 2: Dependency and Hierarchy Analysis 24 | 25 | Instructions: 26 | - Analyze the relationships among declarations identified in Step 1. 27 | - Explicitly classify declarations into: 28 | - **Core Contributions:** Declarations directly motivated by essential mathematical goals. 29 | - **Auxiliary Declarations:** Supporting or intermediate lemmas serving core contributions. 30 | - Clearly outline dependencies and hierarchical relationships among these declarations. 31 | - Explicitly state the core mathematical motivations and objectives driving the identified core contributions. 32 | 33 | --- 34 | 35 | Step 3: Exercise Generation 36 | 37 | Instructions: 38 | - Based explicitly on the Core Contributions identified in Step 2, generate one structured, self-contained Lean exercise for each core declaration. 39 | - Each exercise must: 40 | - Clearly reflect the overall mathematical context (from Step 1) and the core mathematical motivation (from Step 2). 41 | - Be formulated entirely in standard mathematical language in textbooks or academic literature, explicitly avoiding Lean-specific syntax or implementation details. 42 | - Allow practitioners to precisely reconstruct the intended modifications solely from your concise instructions. 43 | - Use imperative language for instructions ("Prove that…", "Define…", etc.). 44 | 45 | Response Format for Step 3: 46 | ``` 47 | # Exercises in Lean 48 | 49 | ## Exercise 1: [Concise and Descriptive Title Reflecting Mathematical Content] 50 | - **Diff Hunk Span:** `@@ -X,Y +X,Y @@` 51 | - **Task Category:** [Feature | Bug Fix | Refactor | Chore | Testing | Documentation | Formatting] 52 | - **Focus:** [Mathematical Concepts | Software Engineering] 53 | - **Difficulty:** [Very Easy | Easy | Medium | Hard | Very Hard] 54 | - **Task Nature:** [Substantial | Superficial] 55 | - **Problem Statement (Natural Mathematical Language):** 56 | Clearly state the mathematical statement to be proved or defined. Use concise, self-contained, textbook-style language understandable to mathematicians without referencing Lean-specific syntax. If the task involves modifying an existing statement (e.g., correcting an error or clarifying logic), precisely describe the intended conceptual adjustments in purely mathematical terms. Include LaTeX-formatted mathematical expressions as needed. Ensure that instructions are imperative (e.g., "Prove that...", "Define...") and explicitly indicate the logical or conceptual emphasis required by the modification. 57 | 58 | *(Repeat explicitly for each core contribution.)* 59 | ``` 60 | 61 | --- 62 | 63 | Ensure your responses strictly follow the provided formats and clearly adhere to each instruction, thus creating structured, integrated, and high-quality proof engineering exercises. 
64 | 65 | ''' 66 | 67 | instruction_generation_input_prompt = '''# Lean Code Modification Analysis Request 68 | 69 | ## Original File: {filename} 70 | 71 | ```lean 72 | {lean_code} 73 | ``` 74 | 75 | ## Proposed Changes (Patch): 76 | 77 | ```diff 78 | {raw_patch} 79 | ``` 80 | 81 | Please analyze these modifications according to the instructions provided. 82 | ''' 83 | 84 | instruction_generation_input_prompt_without_lean_code = '''# Lean Code Modification Analysis Request 85 | 86 | ## Original File Status 87 | This represents a new file creation - no pre-existing code. 88 | 89 | ## Proposed File Content (Patch): 90 | 91 | ```diff 92 | {raw_patch} 93 | ``` 94 | 95 | Please analyze this new file according to the instructions provided. 96 | ''' -------------------------------------------------------------------------------- /src/apebench/inference/prompts/patch_generation_prompts.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | patch_generation_system_prompt = """You are given a set of **Task Descriptions**, each specifying modifications to an existing Lean 4 codebase (which may be optional or only partially provided). Your goal is to generate a **unified diff patch** that implements **only** the specified changes in **Lean 4 syntax**, ensuring strict adherence to Lean 4 conventions. 4 | 5 | Follow these steps: 6 | 7 | ### **Step 1: Identify Key Proving Strategies** 8 | - For each Task Description, **analyze and summarize** the key strategies involved, such as: 9 | - Lemma rewriting 10 | - Data structure modification 11 | - Function renaming 12 | - Introducing new theorems or lemmas 13 | - Other conceptual or syntactical transformations 14 | - Highlight any specialized proof techniques or high-level ideas guiding your modifications. 15 | 16 | ### **Step 2: Declaration Inventory** 17 | - List all **relevant declarations** (definitions, lemmas, theorems, data types) to be **added, removed, or modified**. 18 | - For new Lean 4 declarations: 19 | - Provide **concise, academic-style statements** or descriptions. 20 | - Explain how they integrate into the overall codebase. 21 | 22 | ### **Step 3: Determine Modification Locations** 23 | - Identify **where each modification should be applied** within the given Lean 4 codebase. 24 | - Quote relevant **original Lean code** where applicable, indicating: 25 | - **Insertion points** for new definitions, lemmas, or theorems. 26 | - **Lines to be modified**, specifying which parts require updates. 27 | - **Removals**, justifying why specific lines or declarations should be deleted. 28 | 29 | ### **Step 4: Unified Diff Patch (Lean 4)** 30 | - Present the **final patch** in **unified diff format** with **at least three lines of context before and after** each modified hunk. 31 | - Ensure the patch contains **only** the specified changes—no extraneous edits. 32 | - **Strictly enforce Lean 4 syntax**: 33 | - Check that all modifications are **Lean 4-compliant** and follow best practices. 34 | - Avoid deprecated Lean 3 syntax or tactics. 35 | - Ensure consistency with **Lean 4's module system and proof style**. 36 | - All code must be valid **Lean 4 syntax**, with **no** placeholders (`sorry`, `admit`). 37 | - Do **not** interleave commentary within the diff—explanations belong in Steps 1–3. 38 | 39 | ### **Response Format** 40 | 41 | #### **Step 1: Key Strategies** 42 | [Summarize the main strategies for each Task Description.] 
43 | 44 | #### **Step 2: Declaration Inventory** 45 | [List modified, removed, or added declarations, providing concise descriptions for new ones.] 46 | 47 | #### **Step 3: Modification Locations** 48 | [Identify and quote the relevant Lean code where changes should be made. Specify insertion points, modifications, and removals.] 49 | 50 | #### **Step 4: Unified Diff Patch (Lean 4)** 51 | - **Overall Explanation of the Changes:** 52 | - [Provide a structured natural-language overview of the modifications.] 53 | - **Lean 4 Compliance Reminder:** 54 | - Clearly highlight how the diff strictly adheres to **Lean 4 syntax**, avoiding **Lean 3 syntax or tactics**. 55 | - Emphasize key changes in **Lean 4 module system, proof tactics, and syntax adaptations**. 56 | - **Final Patch in Unified Diff Format:** 57 | ```diff 58 | [Present the final patch in unified diff format, with at least three lines of context before and after each diff hunk. Ensure strict Lean 4 compliance.] 59 | ``` 60 | 61 | """ 62 | 63 | patch_generation_reasoning_models_system_prompt = """You are given a set of **Task Descriptions**, each specifying modifications to an existing Lean 4 codebase (which may be optional or only partially provided). Your task is to generate a **unified diff patch** that implements **only** the specified changes in **Lean 4 syntax**, ensuring strict adherence to Lean 4 conventions. 64 | 65 | Please provide the final patch in the following format: 66 | 67 | ```diff 68 | [Present the final patch in unified diff format, with at least three lines of context before and after each diff hunk. Ensure strict Lean 4 compliance.] 69 | ``` 70 | """ 71 | 72 | patch_generation_input_prompt = """# Lean4 Code Modification Task 73 | 74 | ## Task Requirements 75 | 76 | {instructions} 77 | 78 | ## Source Codebase: {filename} 79 | 80 | ```lean 81 | {lean_code} 82 | ``` 83 | 84 | Please generate a unified diff patch that implements all specified requirements while ensuring strict adherence to Lean4 syntax and conventions. 85 | """ 86 | 87 | patch_generation_input_prompt_without_lean_code = """# Lean4 Code Creation Task 88 | 89 | ## Task Requirements 90 | 91 | {instructions} 92 | 93 | ## Source Codebase Status 94 | 95 | This task requires creating a new file for {filename}. No existing code is provided. 96 | 97 | Please generate a unified diff patch that creates this file with all specified requirements while ensuring strict adherence to Lean4 syntax and conventions. 98 | """ -------------------------------------------------------------------------------- /src/apebench/inference/inference_pipelines/generate_judgement.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | from ..inference_pipelines.base import BasePipeline 4 | import re 5 | import logging 6 | import json 7 | from collections import Counter 8 | 9 | class GenerateJudgementPipeline(BasePipeline): 10 | """ 11 | Pipeline for generating judgements on the quality of generated patches. 12 | 13 | Assesses completeness, accuracy, scope, and coding style of implementations. 
14 | """ 15 | def __init__(self, args): 16 | super().__init__(args) 17 | from ..prompts import ( 18 | judgement_generation_system_prompt, 19 | judgement_generation_input_prompt, 20 | judgement_generation_input_prompt_without_lean_code 21 | ) 22 | self.system_prompt = judgement_generation_system_prompt 23 | self.input_prompt = judgement_generation_input_prompt 24 | self.input_prompt_without_lean_code = judgement_generation_input_prompt_without_lean_code 25 | self.criteria_list = ['unacceptable', 'poor', 'acceptable', 'good', 'excellent'] 26 | 27 | def get_input(self, row): 28 | if not 'exercises' in row: 29 | assert len(row['responses']) == 1, f"Expected 1 response, got {len(row['responses'])}" 30 | row['exercises'] = row['responses'][0]['exercises'] 31 | if not 'full_instruction' in row: 32 | exercises = row['exercises'] 33 | instructions = [f"- Task {idx + 1}: {exercise['title']}\n\n{exercise['instruction']}" for idx, exercise in enumerate(exercises)] 34 | full_instruction = '\n\n\n'.join(instructions) 35 | row['full_instruction'] = full_instruction 36 | else: 37 | full_instruction = row['full_instruction'] 38 | 39 | # Format input for verification 40 | if not row['content_before']: 41 | formatted_input = self.input_prompt_without_lean_code.format( 42 | instruction=full_instruction, 43 | raw_patch=row[self.args.patch_key].strip() 44 | ) 45 | else: 46 | formatted_input = self.input_prompt.format( 47 | instruction=full_instruction, 48 | raw_patch=row[self.args.patch_key].strip(), 49 | lean_code=row['content_before'].strip(), 50 | filename=row['file_path_after'] 51 | ) 52 | return formatted_input 53 | 54 | def initialize_metadata(self, row): 55 | """Initialize metadata for a row using Counter""" 56 | return { 57 | 'worst_judgement': None, 58 | 'majority_judgement': None, 59 | 'judgement_counter': Counter(), # Use Counter instead of list 60 | } 61 | 62 | def update_metadata_per_response(self, metadata, parsed_response): 63 | """Update metadata with response using Counter""" 64 | if parsed_response is not None and 'TaskEvaluations' in parsed_response: 65 | key = 'judgement' 66 | worst_key = f'worst_{key}' 67 | majority_key = f'majority_{key}' 68 | counter_key = f'{key}_counter' 69 | for task_evaluation in parsed_response['TaskEvaluations'].values(): 70 | value = task_evaluation.lower() 71 | if value in self.criteria_list: 72 | if metadata[worst_key] is None or self.criteria_list.index(value) < self.criteria_list.index(metadata[worst_key]): 73 | metadata[worst_key] = value 74 | metadata[counter_key].update([value]) 75 | if metadata[counter_key]: 76 | metadata[majority_key] = metadata[counter_key].most_common(1)[0][0] 77 | return metadata 78 | 79 | def update_metadata_per_row(self, metadata, responses): 80 | """Update metadata with responses""" 81 | counter_key = 'judgement_counter' 82 | majority_key = 'majority_judgement' 83 | if metadata[counter_key]: 84 | metadata[majority_key] = metadata[counter_key].most_common(1)[0][0] 85 | 86 | metadata.pop(counter_key) 87 | return metadata 88 | 89 | def parse_response(self, response, row): 90 | """Parse verification response into structured dictionary""" 91 | try: 92 | json_blocks = re.findall(r'```json(.*?)```', response, re.DOTALL) 93 | if len(json_blocks) == 0: 94 | json_blocks = re.findall(r'{.*"TaskEvaluations".*}', response, re.DOTALL) 95 | if len(json_blocks) == 0: 96 | raise ValueError(f"Expected 1 JSON block, got {len(json_blocks)}") 97 | json_block = json_blocks[-1] 98 | else: 99 | json_block = json_blocks[-1] 100 | parsed_response = 
json.loads(json_block) 101 | return parsed_response 102 | except Exception as e: 103 | logging.error(f"Error parsing GPT response: {e}") 104 | return None 105 | 106 | -------------------------------------------------------------------------------- /src/apebench/inference/utils/chat_logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | import os 4 | import json 5 | from datetime import datetime 6 | import logging 7 | from typing import Dict, Any, Optional 8 | import fcntl 9 | import threading 10 | from filelock import FileLock 11 | import uuid 12 | 13 | class ChatLogger: 14 | def __init__(self, log_dir: str = "chat_logs"): 15 | """ 16 | Initialize the ChatLogger. 17 | 18 | Args: 19 | log_dir (str): Directory where log files will be stored 20 | """ 21 | self.log_dir = log_dir 22 | self._setup_logging() 23 | self._lock = threading.Lock() 24 | 25 | def _setup_logging(self): 26 | """Set up the logging directory and basic configuration.""" 27 | # Create log directory if it doesn't exist 28 | os.makedirs(self.log_dir, exist_ok=True) 29 | 30 | # Set up basic logging with thread safety 31 | logging.basicConfig( 32 | level=logging.INFO, 33 | format='%(asctime)s - %(threadName)s - %(levelname)s - %(message)s' 34 | ) 35 | self.logger = logging.getLogger(__name__) 36 | 37 | def _get_log_filename(self) -> str: 38 | """Generate a filename for the current day's log.""" 39 | current_date = datetime.now().strftime("%Y-%m-%d") 40 | return os.path.join(self.log_dir, f"chat_log_{current_date}.jsonl") 41 | 42 | def log_chat(self, 43 | prompt: str, 44 | completion: Dict[str, Any], 45 | model_name: str, 46 | system_prompt: Optional[str] = None) -> None: 47 | """ 48 | Log a chat interaction to a JSONL file in a thread-safe manner. 49 | 50 | Args: 51 | prompt (str): The user prompt 52 | completion (Dict): The completion response 53 | model_name (str): Name of the model used 54 | system_prompt (Optional[str]): System prompt if used 55 | """ 56 | log_entry = { 57 | "id": str(uuid.uuid4()), # Add unique identifier for each log entry 58 | "timestamp": datetime.now().isoformat(), 59 | "model_name": model_name, 60 | "system_prompt": system_prompt, 61 | "prompt": prompt, 62 | "completion": completion, 63 | "thread_id": threading.get_ident() 64 | } 65 | 66 | log_file = self._get_log_filename() 67 | lock_file = f"{log_file}.lock" 68 | 69 | # Use FileLock for cross-process locking 70 | with FileLock(lock_file): 71 | try: 72 | with open(log_file, "a", encoding="utf-8") as f: 73 | # Use fcntl for file-level locking (UNIX systems only) 74 | fcntl.flock(f.fileno(), fcntl.LOCK_EX) 75 | try: 76 | json.dump(log_entry, f, ensure_ascii=False) 77 | f.write("\n") 78 | f.flush() # Ensure the write is committed to disk 79 | finally: 80 | fcntl.flock(f.fileno(), fcntl.LOCK_UN) 81 | self.logger.info(f"Successfully logged chat interaction (ID: {log_entry['id']}) to {log_file}") 82 | except Exception as e: 83 | self.logger.error(f"Failed to log chat interaction: {str(e)}") 84 | 85 | def get_chat_history(self, 86 | date_str: Optional[str] = None, 87 | thread_id: Optional[int] = None) -> list: 88 | """ 89 | Retrieve chat history for a specific date or current date if not specified. 
90 | 91 | Args: 92 | date_str (Optional[str]): Date in format 'YYYY-MM-DD' 93 | thread_id (Optional[int]): Filter logs by specific thread ID 94 | 95 | Returns: 96 | list: List of chat interactions for the specified date 97 | """ 98 | if date_str is None: 99 | date_str = datetime.now().strftime("%Y-%m-%d") 100 | 101 | log_file = os.path.join(self.log_dir, f"chat_log_{date_str}.jsonl") 102 | lock_file = f"{log_file}.lock" 103 | 104 | if not os.path.exists(log_file): 105 | return [] 106 | 107 | try: 108 | with FileLock(lock_file): 109 | with open(log_file, "r", encoding="utf-8") as f: 110 | # Use fcntl for file-level locking 111 | fcntl.flock(f.fileno(), fcntl.LOCK_SH) 112 | try: 113 | logs = [json.loads(line) for line in f] 114 | finally: 115 | fcntl.flock(f.fileno(), fcntl.LOCK_UN) 116 | 117 | # Filter by thread_id if specified 118 | if thread_id is not None: 119 | logs = [log for log in logs if log.get("thread_id") == thread_id] 120 | 121 | return logs 122 | except Exception as e: 123 | self.logger.error(f"Failed to read chat history: {str(e)}") 124 | return [] -------------------------------------------------------------------------------- /src/apebench/inference/run_inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | #!/usr/bin/env python3 4 | """ 5 | Unified entry point for the ApeBench pipeline system. 6 | 7 | This script provides a command-line interface to run different data processing 8 | pipelines for the ApeBench system, including instruction generation, patch 9 | generation, and judgment generation. 10 | """ 11 | 12 | import argparse 13 | import os 14 | import sys 15 | from datetime import datetime 16 | 17 | # Import pipeline classes 18 | from .inference_pipelines import GenerateInstructionPipeline, GeneratePatchPipeline, GenerateJudgementPipeline 19 | 20 | def parse_arguments(): 21 | """Parse command line arguments""" 22 | parser = argparse.ArgumentParser(description="ApeBench Unified Pipeline Entry Point") 23 | 24 | # General arguments for all pipelines 25 | parser.add_argument("--pipeline", type=str, required=True, 26 | choices=["instruction", "patch", "judgement"], 27 | help="Pipeline type to run") 28 | parser.add_argument("--input_file", type=str, required=True, help="Path to the input file") 29 | parser.add_argument("--output_dir", type=str, default="./outputs", 30 | help="Directory to save output files") 31 | parser.add_argument("--output_file", type=str, 32 | help="Path to the output file (if not specified, will be auto-generated)") 33 | parser.add_argument("--log_dir", type=str, default="./logs", 34 | help="Directory to save log files") 35 | parser.add_argument("--timestamp", type=str, 36 | help="Timestamp to use for filenames (default: current time)") 37 | parser.add_argument("--max_workers", type=int, default=1, 38 | help="Maximum number of parallel workers") 39 | parser.add_argument("--max_retries", type=int, default=10, 40 | help="Maximum number of retries for failed rows") 41 | parser.add_argument("--model_name", type=str, required=True, 42 | help="Name of the model to use for inference") 43 | parser.add_argument("--n_responses", type=int, default=1, 44 | help="Number of responses to generate for each input") 45 | parser.add_argument("--temperature", type=float, default=0.0, 46 | help="Temperature for the model") 47 | parser.add_argument("--max_tokens", type=int, default=8000, 48 | help="Maximum number of tokens to generate") 49 | 
parser.add_argument("--thinking_budget_tokens", type=int, default=6000, 50 | help="Budget tokens for thinking") 51 | 52 | # Arguments specific to the instruction pipeline 53 | parser.add_argument("--gold_diff_key", type=str, default="gold_diff", 54 | help="Key in the input data for the gold diff (for instruction pipeline)") 55 | 56 | # Arguments specific to the judgment pipeline 57 | parser.add_argument("--patch_key", type=str, default="best_gen_patch_comment_free", 58 | help="Key in the input data for the patch to judge (for judgment pipeline)") 59 | 60 | # Arguments specific to the patch pipeline 61 | parser.add_argument("--force_complete_prompt", action="store_true", 62 | help="Force complete prompt") 63 | parser.add_argument("--force_reasoning_prompt", action="store_true", 64 | help="Force consise prompt") 65 | 66 | return parser.parse_args() 67 | 68 | def select_pipeline(args): 69 | """Select the appropriate pipeline based on arguments""" 70 | if args.pipeline == "instruction": 71 | return GenerateInstructionPipeline(args) 72 | elif args.pipeline == "patch": 73 | return GeneratePatchPipeline(args) 74 | elif args.pipeline == "judgement": 75 | return GenerateJudgementPipeline(args) 76 | else: 77 | raise ValueError(f"Unknown pipeline type: {args.pipeline}") 78 | 79 | def main(): 80 | """Main entry point""" 81 | # Parse command line arguments 82 | args = parse_arguments() 83 | 84 | # Set default timestamp if not provided 85 | if args.timestamp is None: 86 | args.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') 87 | 88 | # Create output directory if it doesn't exist 89 | os.makedirs(args.output_dir, exist_ok=True) 90 | 91 | # Create log directory if it doesn't exist 92 | os.makedirs(args.log_dir, exist_ok=True) 93 | 94 | # Print banner 95 | print("\n" + "="*80) 96 | print(f" ApeBench Pipeline: {args.pipeline.upper()}") 97 | print("="*80 + "\n") 98 | 99 | # Select and initialize the appropriate pipeline 100 | pipeline = select_pipeline(args) 101 | 102 | # Run the pipeline 103 | print(f"Starting {args.pipeline} pipeline with model {args.model_name}...\n") 104 | total_processed, total_errors, failed_indices = pipeline.process_data() 105 | 106 | # Print summary 107 | print("\n" + "="*80) 108 | print(f" SUMMARY: {args.pipeline.upper()} PIPELINE") 109 | print("="*80) 110 | print(f"Total processed: {total_processed}") 111 | print(f"Total errors: {total_errors}") 112 | print(f"Failed indices: {len(failed_indices)}") 113 | print(f"Success rate: {(total_processed / (total_processed + total_errors) if total_processed + total_errors > 0 else 0) * 100:.2f}%") 114 | print("="*80 + "\n") 115 | 116 | return 0 117 | 118 | if __name__ == "__main__": 119 | sys.exit(main()) -------------------------------------------------------------------------------- /src/apebench/inference/inference_pipelines/generate_instruction.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | from ...inference.inference_pipelines.base import BasePipeline 4 | import re 5 | import logging 6 | from collections import Counter 7 | 8 | class GenerateInstructionPipeline(BasePipeline): 9 | """ 10 | Pipeline for generating instructions from patches. 11 | 12 | This processor extracts structured information about code contributions and exercises. 
13 | """ 14 | 15 | def __init__(self, args): 16 | super().__init__(args) 17 | from ..prompts import ( 18 | instruction_generation_system_prompt, 19 | instruction_generation_input_prompt, 20 | instruction_generation_input_prompt_without_lean_code 21 | ) 22 | self.system_prompt = instruction_generation_system_prompt 23 | self.input_prompt = instruction_generation_input_prompt 24 | self.input_prompt_without_lean_code = instruction_generation_input_prompt_without_lean_code 25 | self.task_nature_list = ['superficial', 'substantial'] 26 | self.difficulty_list = ['very easy', 'easy', 'medium', 'hard', 'very hard'] 27 | self.task_category_list = ['bug fix', 'refactor', 'feature'] 28 | 29 | def get_input(self, row): 30 | if not row['content_before']: 31 | formatted_input = self.input_prompt_without_lean_code.format( 32 | raw_patch=row[self.args.gold_diff_key].strip() 33 | ) 34 | else: 35 | formatted_input = self.input_prompt.format( 36 | raw_patch=row[self.args.gold_diff_key].strip(), 37 | lean_code=row['content_before'].strip(), 38 | filename=row['file_path_before'] 39 | ) 40 | return formatted_input 41 | 42 | def initialize_metadata(self, row): 43 | """Initialize metadata for a row using Counter""" 44 | return { 45 | 'worst_difficulty': None, 46 | 'worst_task_nature': None, 47 | 'majority_difficulty': None, 48 | 'majority_task_nature': None, 49 | 'majority_task_category': None, 50 | 'difficulty_counter': Counter(), 51 | 'task_nature_counter': Counter(), 52 | 'task_category_counter': Counter(), 53 | } 54 | 55 | def update_metadata_per_response(self, metadata, parsed_response): 56 | """Update metadata with response using Counter""" 57 | if parsed_response is not None: 58 | for exercise in parsed_response['exercises']: 59 | for key, criteria in zip( 60 | ('difficulty', 'task_nature', 'task_category'), 61 | (self.difficulty_list, self.task_nature_list, self.task_category_list), 62 | ): 63 | worst_key = f'worst_{key}' 64 | majority_key = f'majority_{key}' 65 | counter_key = f'{key}_counter' 66 | value = exercise[key].lower() 67 | if value in criteria: 68 | if worst_key in metadata and (metadata[worst_key] is None or criteria.index(value) < criteria.index(metadata[worst_key])): 69 | metadata[worst_key] = value 70 | metadata[counter_key].update([value]) 71 | if metadata[counter_key]: 72 | metadata[majority_key] = metadata[counter_key].most_common(1)[0][0] 73 | return metadata 74 | 75 | def update_metadata_per_row(self, metadata, responses): 76 | """Update metadata with responses""" 77 | for key in ('difficulty', 'task_nature'): 78 | counter_key = f'{key}_counter' 79 | majority_key = f'majority_{key}' 80 | if metadata[counter_key]: 81 | metadata[majority_key] = metadata[counter_key].most_common(1)[0][0] 82 | 83 | metadata.pop('difficulty_counter') 84 | metadata.pop('task_nature_counter') 85 | return metadata 86 | 87 | def _extract_exercises(self, exercise_text): 88 | # Extract Exercises 89 | split_pos = exercise_text.find('Exercises in Lean') 90 | assert split_pos != -1 91 | exercise_text = exercise_text[split_pos:].strip(' \n-') 92 | exercises = [] 93 | exercise_pattern = r'Exercise[\*\s:]*(\d+)[\*\s]*:[\*\s:]*(.*?)[-\*\s]*Diff Hunk Span.*?@@(.*?)@@.*?[-\*\s]*Task Category[-\*\s:]*(.*?)[-\*\s]*Focus[-\*\s:]*(.*?)[-\*\s]*Difficulty[-\*\s:]*(.*?)[-\*\s]*Task Nature[-\*\s:]*(.*?)[-\*\s]*Problem Statement.*?[-\*:]+(.*?)(?=[-\*#\s]*Exercise|$)' 94 | exercise_blocks = re.findall(exercise_pattern, exercise_text, re.DOTALL) 95 | for num, title, hunk_span, category, focus, difficulty, nature, instruction in 
exercise_blocks: 96 | exercises.append({ 97 | 'num': int(num), 98 | 'title': title.strip().strip(), 99 | 'hunk_span': hunk_span.strip(), 100 | 'focus': focus.strip(), 101 | 'difficulty': difficulty.strip(), 102 | 'task_category': category.strip(), 103 | 'task_nature': nature.strip(), 104 | 'instruction': instruction.strip() 105 | }) 106 | return exercises 107 | 108 | def parse_response(self, response, row): 109 | """Parse structured data from GPT response""" 110 | try: 111 | if '(Continue similarly' in response: 112 | return None 113 | 114 | exercises = self._extract_exercises(response) 115 | 116 | assert len(exercises) > 0 117 | return {"exercises": exercises} 118 | except Exception as e: 119 | logging.error(f"Error parsing GPT response: {e}") 120 | return None -------------------------------------------------------------------------------- /src/eleanstic/utils/lean_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | import os 4 | import time 5 | import subprocess 6 | from typing import List, Tuple 7 | import logging 8 | import threading 9 | import psutil 10 | import signal 11 | import random 12 | import tempfile 13 | import traceback 14 | 15 | def run_command(command: List[str], cwd: str, logger: logging.Logger, env: dict = None) -> Tuple[List[str], List[str], float, int]: 16 | # Record start time 17 | start_time = time.time() 18 | 19 | # Use subprocess.run to execute command and wait for result 20 | try: 21 | result = subprocess.run( 22 | command, 23 | cwd=cwd, 24 | stdout=subprocess.PIPE, 25 | stderr=subprocess.PIPE, 26 | universal_newlines=True, 27 | env=env, 28 | check=False # Don't automatically raise exceptions, let caller handle return code 29 | ) 30 | 31 | # Process output results 32 | stdout_lines = [] 33 | if result.stdout: 34 | stdout_lines = [line.strip() + '\n' for line in result.stdout.splitlines() if line.strip()] 35 | # for line in result.stdout.splitlines(): 36 | # if line.strip(): 37 | # logger.info(f"[lake build] {line.strip()}") 38 | 39 | stderr_lines = [] 40 | if result.stderr: 41 | stderr_lines = [line.strip() + '\n' for line in result.stderr.splitlines() if line.strip()] 42 | # for line in result.stderr.splitlines(): 43 | # if line.strip(): 44 | # logger.warning(f"[lake build stderr] {line.strip()}") 45 | 46 | returncode = result.returncode 47 | 48 | except Exception as e: 49 | logger.error(f"Command execution exception: {traceback.format_exc()}") 50 | stderr_lines = [f"Command execution exception: {traceback.format_exc()}\n"] 51 | returncode = -1 52 | 53 | # Calculate build time 54 | build_time = time.time() - start_time 55 | return stdout_lines, stderr_lines, build_time, returncode 56 | 57 | def run_lake_build(worktree_path: str, logger: logging.Logger, cache_dir: str = None) -> Tuple[bool, str]: 58 | """ 59 | Run lake build command 60 | 61 | Args: 62 | worktree_path: Git worktree path 63 | 64 | Returns: 65 | Tuple[bool, str]: (success status, message) 66 | """ 67 | try: 68 | # Check if worktree exists 69 | if not os.path.exists(worktree_path): 70 | return False, f"Worktree does not exist: {worktree_path}" 71 | 72 | # Execute lake build command 73 | logger.info(f"Starting to build Mathlib (worktree: {worktree_path})") 74 | 75 | # Set environment variables, specify cache directory 76 | env = os.environ.copy() 77 | if cache_dir: 78 | env["XDG_CACHE_HOME"] = cache_dir 79 | logger.info(f"Setting XDG_CACHE_HOME={cache_dir}") 80 | 81 | # Run lake build 82 | 
logger.info(f"Running lake build command") 83 | stdout_lines, stderr_lines, build_time, returncode = run_command( 84 | ["lake", "build"], 85 | worktree_path, 86 | logger, 87 | env 88 | ) 89 | 90 | # Check build result 91 | if returncode == 0: 92 | logger.info(f"lake build completed, time taken {build_time:.1f} seconds") 93 | return True, f"lake build completed, time taken {build_time:.1f} seconds" 94 | else: 95 | error_message = "\n".join(line for line in (stderr_lines + stdout_lines) if not '] Building' in line) 96 | logger.error(f"lake build failed, time taken {build_time:.1f} seconds") 97 | return False, f"lake build failed, exit code {returncode}\n{error_message}" 98 | 99 | except Exception as e: 100 | logger.error(f"Error executing lake build: {traceback.format_exc()}") 101 | return False, f"Error executing lake build: {traceback.format_exc()}" 102 | 103 | def parse_lean_output(output): 104 | """Parse Lean output, categorize each line as error, warning or info.""" 105 | results = [] 106 | for line in output.splitlines(): 107 | line_lower = line.lower() 108 | if "error" in line_lower: 109 | results.append({"type": "error", "message": line.strip()}) 110 | elif "warning" in line_lower: 111 | results.append({"type": "warning", "message": line.strip()}) 112 | else: 113 | results.append({"type": "info", "message": line.strip()}) 114 | return results 115 | 116 | def verify_with_lean(content, worktree_path, logger, timeout=600): 117 | """Verify Lean file content. 118 | 119 | By creating a temporary Lean file at the given worktree path, 120 | call `lake env lean` to verify the file, and parse the output. 121 | """ 122 | with tempfile.NamedTemporaryFile(mode='w+', suffix='.lean', encoding='utf-8', delete=False) as temp_file: 123 | temp_file.write(content) 124 | temp_file.flush() 125 | temp_file_name = temp_file.name 126 | try: 127 | process = subprocess.run( 128 | ["lake", "env", "lean", temp_file_name], 129 | capture_output=True, 130 | text=True, 131 | cwd=worktree_path, 132 | timeout=timeout 133 | ) 134 | lean_output = parse_lean_output(process.stdout) 135 | passed = (process.returncode == 0 and not any(r.get("type") == "error" for r in lean_output)) 136 | complete = passed and not any(r.get("type") == "warning" for r in lean_output) 137 | result = { 138 | "parsed_output": lean_output, 139 | "raw_output": process.stdout, 140 | "raw_stderr": process.stderr, 141 | "returncode": process.returncode, 142 | "pass": passed, 143 | "complete": complete 144 | } 145 | except Exception as e: 146 | result = { 147 | "lean_output": None, 148 | "system_error": traceback.format_exc(), 149 | "pass": False, 150 | "complete": False 151 | } 152 | # logger.error(traceback.format_exc()) 153 | finally: 154 | import os 155 | try: 156 | os.remove(temp_file_name) 157 | except Exception: 158 | pass 159 | return result -------------------------------------------------------------------------------- /src/utils/colors.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 
2 | 3 | from copy import deepcopy 4 | import random 5 | import numpy as np 6 | 7 | # Existing color palette 8 | colors = dict(red_colors = [ 9 | # "#672F2F", # dark blood red 10 | "#E16A54", # bright red 11 | "#BE3144", # red 12 | "#810000", # dark red 13 | ], 14 | green_colors = [ 15 | # "#99B19C", # light green 16 | # "#A9B388", # light grass green 17 | # "#3F4F44", # green 18 | "#5F6F52", # grass green 19 | "#2C3930", # dark green 20 | ], 21 | yellow_colors = [ 22 | "#DDA853", # earth yellow 23 | # "#ECE5C7", # very light brown 24 | # "#CDC2AE", # light brown 25 | "#A27B5C", # brown 26 | ], 27 | blue_colors = [ 28 | # "#C2DEDC", # light blue 29 | "#116A7B", # lake blue 30 | "#27548A", # blue 31 | "#123458", # dark blue 32 | ] 33 | ) 34 | 35 | def increase_grayscale(hex_color, amount=0.3): 36 | """ 37 | Increase the grayscale of a hex color by reducing saturation 38 | 39 | Parameters: 40 | hex_color -- hex color string (e.g. "#672F2F") 41 | amount -- degree of grayscale increase (0.0 to 1.0) 42 | 43 | Returns: 44 | Hex color with increased grayscale 45 | """ 46 | # Convert hex to RGB 47 | hex_color = hex_color.lstrip('#') 48 | r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16) 49 | 50 | # Reduce saturation (move towards average) 51 | avg = (r + g + b) // 3 52 | r = int(r * (1 - amount) + avg * amount) 53 | g = int(g * (1 - amount) + avg * amount) 54 | b = int(b * (1 - amount) + avg * amount) 55 | 56 | # Convert back to hex 57 | return f"#{r:02x}{g:02x}{b:02x}" 58 | 59 | class ColorPicker: 60 | def __init__(self, color_palette=None, grayscale_increment=0.1): 61 | """ 62 | Initialize the color picker 63 | 64 | Parameters: 65 | color_palette -- dictionary of color groups, defaults to module's colors 66 | grayscale_increment -- grayscale amount to increase when all colors are used 67 | """ 68 | if color_palette is None: 69 | color_palette = colors 70 | 71 | self.original_palette = deepcopy(color_palette) # Save original palette 72 | self.grayscale_increment = grayscale_increment 73 | self.current_palette = deepcopy(color_palette) # Working palette 74 | self.grayscale_level = 0 75 | self.used_colors = set() # Track all used colors 76 | 77 | def get_color(self): 78 | """ 79 | Get a color according to rules: 80 | Pick a color from the group with fewest remaining colors, without replacement 81 | When all colors are picked, increase grayscale and reset the palette 82 | 83 | Returns: 84 | Selected color 85 | """ 86 | # Check if all groups are empty 87 | while True: 88 | all_empty = True 89 | none_empty_list = [] 90 | for colors_list in self.current_palette.values(): 91 | if colors_list: 92 | all_empty = False 93 | none_empty_list.append(colors_list) 94 | 95 | # If all groups are empty, increase grayscale and reset palette 96 | if all_empty: 97 | self.grayscale_level += self.grayscale_increment 98 | self._reset_palette() 99 | else: 100 | break 101 | 102 | # Randomly select a group 103 | group = random.choice(none_empty_list) 104 | # Take middle color from selected group 105 | mu = (len(group) - 1) / 2 106 | sigma = len(group) / 4 107 | 108 | # Generate normal distribution index and constrain to list range 109 | idx = max(0, min(len(group) - 1, int(np.random.normal(mu, sigma)))) 110 | selected_color = group.pop(idx) 111 | self.used_colors.add(selected_color) 112 | 113 | return selected_color 114 | 115 | def _reset_palette(self): 116 | """ 117 | Reset current palette, increase grayscale for each color in original palette, ensure generated colors are unique 118 | """ 
119 | self.current_palette = {} 120 | 121 | for group, colors_list in self.original_palette.items(): 122 | self.current_palette[group] = [] 123 | for color in colors_list: 124 | # Calculate new color with base grayscale increase 125 | new_color = increase_grayscale(color, self.grayscale_level) 126 | 127 | # If this color has been used, adjust grayscale until a unique color is generated 128 | attempts = 0 129 | while new_color in self.used_colors and attempts < 10: 130 | # Gradually increase grayscale to get a new color 131 | adjustment = 0.05 + attempts * 0.02 132 | new_color = increase_grayscale(color, self.grayscale_level + adjustment) 133 | attempts += 1 134 | 135 | # If still can't find a unique color, generate a fine-tuned color 136 | if new_color in self.used_colors: 137 | # Extract RGB from original color 138 | base_color = new_color.lstrip('#') 139 | r = int(base_color[0:2], 16) 140 | g = int(base_color[2:4], 16) 141 | b = int(base_color[4:6], 16) 142 | 143 | # Fine-tune RGB values, ensure within valid range 144 | r = max(0, min(255, r + random.randint(-20, 20))) 145 | g = max(0, min(255, g + random.randint(-20, 20))) 146 | b = max(0, min(255, b + random.randint(-20, 20))) 147 | 148 | new_color = f"#{r:02x}{g:02x}{b:02x}" 149 | 150 | # Ensure final generated color is unique 151 | while new_color in self.used_colors: 152 | r = max(0, min(255, r + random.randint(-10, 10))) 153 | g = max(0, min(255, g + random.randint(-10, 10))) 154 | b = max(0, min(255, b + random.randint(-10, 10))) 155 | new_color = f"#{r:02x}{g:02x}{b:02x}" 156 | 157 | self.current_palette[group].append(new_color) 158 | 159 | # Usage example 160 | if __name__ == "__main__": 161 | picker = ColorPicker() 162 | for i in range(20): 163 | color = picker.get_color() 164 | print(f"Color #{i+1}: {color}") -------------------------------------------------------------------------------- /src/eleanstic/core/status.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | """ 4 | Commit Status Management Module 5 | 6 | Implements commit status storage through the file system, each commit's status is saved in a separate file. 7 | """ 8 | 9 | import os 10 | import json 11 | from datetime import datetime 12 | import traceback 13 | 14 | STATUS_PENDING = 'pending' 15 | STATUS_BUILDING = 'building' 16 | STATUS_COLLAPSED = 'collapsed' 17 | STATUS_READY = 'ready' 18 | STATUS_FAILED = 'failed' 19 | STATUS_FAILED_VERIFY = 'failed_verify' 20 | 21 | class CommitStatus: 22 | """ 23 | Commit Status Management Class, implemented using the file system 24 | 25 | Each commit's status is saved in a separate file. 
26 | Supports the following statuses: 27 | - 'pending': Waiting to start build 28 | - 'building': Currently building 29 | - 'ready': Build successful, not compressed 30 | - 'collapsed': Build successful, compressed 31 | - 'failed': Build failed 32 | - 'failed_verify': Built but verification failed 33 | """ 34 | 35 | def __init__(self, status_dir: str = "commit_status"): 36 | """ 37 | Initialize CommitStatus instance 38 | 39 | Args: 40 | status_dir: Directory to save status files 41 | """ 42 | self.status_dir = status_dir 43 | os.makedirs(self.status_dir, exist_ok=True) 44 | 45 | def _get_status_file(self, commit_id: str) -> str: 46 | """Get status file path for specified commit""" 47 | return os.path.join(self.status_dir, f"{commit_id}.json") 48 | 49 | def is_commit_processed(self, commit_id: str) -> bool: 50 | """ 51 | Determine if a commit has been successfully built 52 | 53 | Args: 54 | commit_id: Git commit ID 55 | 56 | Returns: 57 | bool: Returns True if status is 'ready' or 'collapsed', otherwise False 58 | """ 59 | status_file = self._get_status_file(commit_id) 60 | if not os.path.exists(status_file): 61 | return False 62 | 63 | try: 64 | with open(status_file, 'r') as f: 65 | status_data = json.load(f) 66 | return status_data.get('status') in [STATUS_READY, STATUS_COLLAPSED] 67 | except (json.JSONDecodeError, IOError) as e: 68 | print(f"Error reading commit status file ({commit_id}): {e}") 69 | return False 70 | 71 | def get_commit_status(self, commit_id: str): 72 | """ 73 | Get complete status information for a commit 74 | 75 | Args: 76 | commit_id: Git commit ID 77 | 78 | Returns: 79 | Dict: Dictionary containing status, message and timestamp; if not exists returns default dictionary with 'pending' status 80 | """ 81 | status_file = self._get_status_file(commit_id) 82 | if not os.path.exists(status_file): 83 | return { 84 | 'commit_id': commit_id, 85 | 'status': STATUS_PENDING, 86 | 'message': None, 87 | 'updated_at': datetime.now().isoformat() 88 | } 89 | 90 | try: 91 | with open(status_file, 'r') as f: 92 | return json.load(f) 93 | except (json.JSONDecodeError, IOError) as e: 94 | print(f"Error reading commit status file ({commit_id}): {e}") 95 | return { 96 | 'commit_id': commit_id, 97 | 'status': STATUS_PENDING, 98 | 'message': f"Status file read error: {traceback.format_exc()}", 99 | 'updated_at': datetime.now().isoformat() 100 | } 101 | 102 | def update_commit_status(self, commit_id: str, status: str, message = None, additional_data = None) -> bool: 103 | """ 104 | Update commit status, preserving previous status data 105 | 106 | Args: 107 | commit_id: Git commit ID 108 | status: New status ('pending', 'building', 'ready', 'collapsed', or 'failed') 109 | message: Optional status message 110 | additional_data: Optional dictionary with additional data to update 111 | 112 | Returns: 113 | bool: Returns True on successful update, False on failure 114 | """ 115 | # Get existing status data if available 116 | existing_data = self.get_commit_status(commit_id) 117 | 118 | # Update with new values 119 | existing_data['status'] = status 120 | if message is not None: 121 | existing_data['message'] = message 122 | existing_data['updated_at'] = datetime.now().isoformat() 123 | 124 | # Merge additional data if provided 125 | if additional_data: 126 | for key, value in additional_data.items(): 127 | existing_data[key] = value 128 | 129 | status_file = self._get_status_file(commit_id) 130 | 131 | # Ensure directory exists 132 | os.makedirs(os.path.dirname(status_file), exist_ok=True) 133 | 
134 | try: 135 | with open(status_file, 'w') as f: 136 | json.dump(existing_data, f, indent=2, ensure_ascii=False) 137 | return True 138 | except IOError as e: 139 | print(f"Failed to update commit status file ({commit_id}): {e}") 140 | return False 141 | 142 | def get_all_commits_status(self): 143 | """ 144 | Get status information for all commits 145 | 146 | Returns: 147 | List[Dict]: List of commit status information 148 | """ 149 | result = [] 150 | try: 151 | for filename in os.listdir(self.status_dir): 152 | if filename.endswith('.json'): 153 | commit_id = filename[:-5] # Remove .json suffix 154 | status = self.get_commit_status(commit_id) 155 | result.append(status) 156 | return result 157 | except OSError as e: 158 | print(f"Failed to read status directory: {e}") 159 | return [] 160 | 161 | def get_commits_by_status(self, status: str): 162 | """ 163 | Get all commits with a specified status 164 | 165 | Args: 166 | status: Status to filter by 167 | 168 | Returns: 169 | List[Dict]: List of commit status information matching the status 170 | """ 171 | all_statuses = self.get_all_commits_status() 172 | return [item for item in all_statuses if item.get('status') == status] -------------------------------------------------------------------------------- /run_ape_bench_example.sh: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | #!/bin/bash 4 | 5 | # --- Configuration --- 6 | # TODO: User should customize these paths and settings 7 | # Ensure this script is run from the root of the APE-Bench_I project. 8 | 9 | # Path to your local clone of the mathlib4 repository. 10 | # Eleanstic (src/eleanstic/config.yaml) must also be configured to point to this path. 11 | MATHLIB_REPO_PATH="./mathlib4" 12 | 13 | # Path to the directory where the APE-Bench_I dataset (from Hugging Face) is cloned. 14 | APE_BENCH_DATASET_DIR="./datasets" 15 | 16 | # Specific APE-Bench dataset file (e.g., .parquet) to be used for the benchmark. 17 | # This path should also be set in the main APE-Bench configuration file below. 18 | APE_BENCH_DATASET_FILE="${APE_BENCH_DATASET_DIR}/ape_bench1_test.parquet" 19 | 20 | # Main APE-Bench configuration file. 21 | # Ensure its 'project.input_file' points to APE_BENCH_DATASET_FILE. 22 | CONFIG_FILE="configs/config.yaml" 23 | 24 | # Eleanstic configuration file. 25 | # Ensure its 'paths.mathlib_repo' points to MATHLIB_REPO_PATH. 26 | ELEANSTIC_CONFIG_FILE="src/eleanstic/config.yaml" 27 | 28 | # --- Check for required commands --- 29 | echo "Checking for required dependencies..." 30 | 31 | # Check for lean command 32 | if ! command -v lean &> /dev/null; then 33 | echo "Error: 'lean' command not found. Please install Lean 4 before running this script." 34 | echo "Visit https://lean-lang.org/lean4/doc/quickstart.html for installation instructions." 35 | exit 1 36 | fi 37 | 38 | # Check for elan command 39 | if ! command -v elan &> /dev/null; then 40 | echo "Error: 'elan' command not found. Please install Elan (Lean version manager) before running this script." 41 | echo "Visit https://github.com/leanprover/elan for installation instructions." 42 | exit 1 43 | fi 44 | 45 | echo "All required dependencies are installed." 46 | echo "---------------------------------------------------------------------" 47 | 48 | # --- 1. Setup: Clone repositories (if not already present) --- 49 | echo "Step 1: Setting up repositories..." 50 | 51 | # Clone Mathlib4 52 | if [ ! 
-d "$MATHLIB_REPO_PATH" ]; then 53 | echo "Cloning mathlib4 to $MATHLIB_REPO_PATH..." 54 | git clone https://github.com/leanprover-community/mathlib4.git "$MATHLIB_REPO_PATH" 55 | if [ $? -ne 0 ]; then echo "Failed to clone mathlib4. Exiting."; exit 1; fi 56 | else 57 | echo "Mathlib4 repository found at $MATHLIB_REPO_PATH." 58 | fi 59 | 60 | if [ ! -f "$APE_BENCH_DATASET_FILE" ]; then 61 | echo "Obtaining APE-Bench_I dataset to $APE_BENCH_DATASET_DIR..." 62 | 63 | # Check if git lfs is installed and working 64 | if command -v git-lfs >/dev/null 2>&1 && git lfs install >/dev/null 2>&1; then 65 | echo "Git LFS is available. Cloning repository..." 66 | git clone https://huggingface.co/datasets/HuajianXin/APE-Bench_I "$APE_BENCH_DATASET_DIR" 67 | if [ $? -ne 0 ]; then 68 | echo "Failed to clone APE-Bench_I dataset. Exiting." 69 | exit 1 70 | fi 71 | else 72 | echo "Git LFS not available. Downloading files directly using curl..." 73 | mkdir -p "$APE_BENCH_DATASET_DIR" 74 | 75 | # Download main dataset file directly 76 | curl -L "https://huggingface.co/datasets/HuajianXin/APE-Bench_I/resolve/main/$(basename "$APE_BENCH_DATASET_FILE")" -o "$APE_BENCH_DATASET_FILE" 77 | 78 | if [ $? -ne 0 ]; then 79 | echo "Failed to download dataset file. Exiting." 80 | exit 1 81 | fi 82 | fi 83 | else 84 | echo "APE-Bench dataset file $APE_BENCH_DATASET_FILE already exists." 85 | fi 86 | 87 | echo "Repository setup complete." 88 | echo "---------------------------------------------------------------------" 89 | 90 | # --- 2. Eleanstic Build (Preprocessing Mathlib Commits) --- 91 | # This step preprocesses all Mathlib commits referenced in the target APE-Bench dataset file. 92 | # It uses Eleanstic and can be time-consuming for the first run. 93 | 94 | echo "Step 2: Eleanstic Build..." 95 | echo "IMPORTANT: Ensure Eleanstic configuration ($ELEANSTIC_CONFIG_FILE) is correct, especially 'paths.mathlib_repo'." 96 | echo "This will build Eleanstic data for commits in: $APE_BENCH_DATASET_FILE" 97 | 98 | # Assuming the parquet file contains a column named 'commit' for commit hashes. 99 | # Adjust --commit_id_key if your parquet uses a different column name for commit SHAs. 100 | python -m src.eleanstic.main \ 101 | --config "$ELEANSTIC_CONFIG_FILE" \ 102 | --input_file "$APE_BENCH_DATASET_FILE" \ 103 | --commit_id_key commit_hash \ 104 | build 105 | # --max_workers # Optional: adjust based on your system 106 | 107 | if [ $? -ne 0 ]; then echo "Eleanstic build failed. Exiting."; exit 1; fi 108 | echo "Eleanstic build complete." 109 | echo "---------------------------------------------------------------------" 110 | 111 | --- 3. Run APE-Bench Pipeline Scripts --- 112 | These scripts use the main APE-Bench configuration file ($CONFIG_FILE). 113 | Ensure $CONFIG_FILE is correctly set up, especially: 114 | project.input_file: should point to $APE_BENCH_DATASET_FILE 115 | generation, verification, judgement sections as per your needs. 116 | 117 | echo "Step 3.1: Generating Patches (using $CONFIG_FILE)..." 118 | python -m src.apebench.scripts.1_generate_patches --config "$CONFIG_FILE" 119 | if [ $? -ne 0 ]; then echo "Patch generation failed. Exiting."; exit 1; fi 120 | echo "Patch generation complete." 121 | echo "---------------------------------------------------------------------" 122 | 123 | 124 | echo "Step 3.2: Verifying Patches (using $CONFIG_FILE)..." 125 | python -m src.apebench.scripts.2_verify_patches --config "$CONFIG_FILE" 126 | if [ $? -ne 0 ]; then echo "Patch verification failed. 
Exiting."; exit 1; fi 127 | echo "Patch verification complete." 128 | echo "---------------------------------------------------------------------" 129 | 130 | 131 | echo "Step 3.3: Evaluating Patches (using $CONFIG_FILE)..." 132 | python -m src.apebench.scripts.3_evaluate_patches --config "$CONFIG_FILE" 133 | if [ $? -ne 0 ]; then echo "Patch evaluation failed. Exiting."; exit 1; fi 134 | echo "Patch evaluation complete." 135 | echo "---------------------------------------------------------------------" 136 | 137 | 138 | echo "APE-Bench pipeline finished successfully!" 139 | echo "Check the 'outputs/' directory for results." 140 | 141 | --- Optional: Rebuilding Data from Scratch --- 142 | If you need to regenerate the APE-Bench dataset itself (e.g., from new Mathlib commits), 143 | you can use the 0_collect_data.py script. This is an advanced step. 144 | echo "" 145 | echo "Optional: To rebuild the APE-Bench dataset from scratch, inspect and run:" 146 | echo "# python -m src.apebench.scripts.0_collect_data --config $CONFIG_FILE --repo_path $MATHLIB_REPO_PATH ... (other args)" 147 | 148 | exit 0 -------------------------------------------------------------------------------- /src/eleanstic/utils/log_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | """ 4 | Log utilities module 5 | Supports colored log output and log file rotation 6 | """ 7 | import os 8 | import logging 9 | import logging.handlers 10 | from pathlib import Path 11 | from typing import Optional, Dict, Any 12 | import colorlog 13 | 14 | def setup_logger( 15 | name: str = "commit_database", 16 | level: str = "INFO", 17 | log_dir: Optional[str] = None, 18 | log_file: Optional[str] = None, 19 | max_size_mb: int = 100, 20 | backup_count: int = 10, 21 | console_output: bool = True, 22 | color_output: bool = True 23 | ) -> logging.Logger: 24 | """ 25 | Set up logging system 26 | 27 | Args: 28 | name: Logger name 29 | level: Log level 30 | log_dir: Log directory 31 | log_file: Log file 32 | max_size_mb: Maximum log file size (MB) 33 | backup_count: Number of log files to keep 34 | console_output: Whether to output to console 35 | color_output: Whether to use colored logs 36 | 37 | Returns: 38 | logging.Logger: Configured logger 39 | """ 40 | # Convert log level 41 | level_map = { 42 | "DEBUG": logging.DEBUG, 43 | "INFO": logging.INFO, 44 | "WARNING": logging.WARNING, 45 | "ERROR": logging.ERROR, 46 | "CRITICAL": logging.CRITICAL 47 | } 48 | log_level = level_map.get(level.upper(), logging.INFO) 49 | 50 | # Create complete log file path 51 | full_log_path = None 52 | if log_dir and not log_file: 53 | log_path = Path(log_dir) 54 | if not log_path.exists(): 55 | log_path.mkdir(parents=True, exist_ok=True) 56 | full_log_path = str(log_path / f"{name}.log") 57 | elif log_file: 58 | full_log_path = log_file 59 | # if full_log_path: 60 | # console_output = False 61 | 62 | # Create logger with "name:log_file" as unique identifier 63 | # This way even if name is the same but log_file is different, different logger instances will be created 64 | logger_id = name if full_log_path is None else f"{name}:{full_log_path}" 65 | logger = logging.getLogger(logger_id) 66 | logger.setLevel(log_level) 67 | 68 | # Clear old handlers 69 | for handler in logger.handlers[:]: 70 | logger.removeHandler(handler) 71 | 72 | # Define log format - Use custom Formatter to keep simple name display 73 | class SimpleNameFormatter(logging.Formatter): 74 | def 
format(self, record): 75 | # Temporarily save original name 76 | original_name = record.name 77 | # Set to simple name (remove log_file path) 78 | if ':' in original_name: 79 | record.name = original_name.split(':', 1)[0] 80 | result = super().format(record) 81 | # Restore original name 82 | record.name = original_name 83 | return result 84 | 85 | log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 86 | 87 | # Add console output 88 | if console_output: 89 | console_handler = logging.StreamHandler() 90 | console_handler.setLevel(log_level) 91 | 92 | if color_output: 93 | # Colored logs 94 | colors = { 95 | 'DEBUG': 'cyan', 96 | 'INFO': 'green', 97 | 'WARNING': 'yellow', 98 | 'ERROR': 'red', 99 | 'CRITICAL': 'red,bg_white', 100 | } 101 | color_formatter = colorlog.ColoredFormatter( 102 | "%(log_color)s" + log_format, 103 | log_colors=colors 104 | ) 105 | # Replace with custom Formatter 106 | class SimpleNameColorFormatter(colorlog.ColoredFormatter): 107 | def format(self, record): 108 | # Temporarily save original name 109 | original_name = record.name 110 | # Set to simple name (remove log_file path) 111 | if ':' in original_name: 112 | record.name = original_name.split(':', 1)[0] 113 | result = super().format(record) 114 | # Restore original name 115 | record.name = original_name 116 | return result 117 | 118 | color_formatter = SimpleNameColorFormatter( 119 | "%(log_color)s" + log_format, 120 | log_colors=colors 121 | ) 122 | console_handler.setFormatter(color_formatter) 123 | else: 124 | # Regular logs 125 | formatter = SimpleNameFormatter(log_format) 126 | console_handler.setFormatter(formatter) 127 | 128 | logger.addHandler(console_handler) 129 | 130 | # Add file output 131 | if full_log_path: 132 | file_handler = logging.FileHandler( 133 | filename=full_log_path, 134 | encoding='utf-8' 135 | ) 136 | file_handler.setLevel(log_level) 137 | formatter = SimpleNameFormatter(log_format) 138 | file_handler.setFormatter(formatter) 139 | logger.addHandler(file_handler) 140 | # print(f"Logging {name}@{os.getpid()}: {full_log_path}") 141 | 142 | return logger 143 | 144 | def log_progress(logger, file_count, total_files, start_time, current_time, logging_ratio = 0.1, log_every_file = False, **kwargs): 145 | """ 146 | Log processing progress 147 | 148 | Args: 149 | file_count: Number of files processed 150 | total_files: Total number of files 151 | start_time: Start time 152 | current_time: Current time 153 | """ 154 | elapsed_time = current_time - start_time 155 | if log_every_file or file_count / total_files > logging_ratio: 156 | progress_percent = (file_count / total_files) * 100 if total_files > 0 else 0 157 | 158 | # Calculate estimated remaining time 159 | if file_count > 0 and progress_percent < 100: 160 | time_per_file = elapsed_time / file_count 161 | remaining_files = total_files - file_count 162 | estimated_remaining_time = time_per_file * remaining_files 163 | 164 | logger.info( 165 | f"Progress: {progress_percent:.2f}% ({file_count}/{total_files}) | " 166 | f"Time used: {elapsed_time:.2f}s | " 167 | f"Est. 
remaining: {estimated_remaining_time:.2f}s" + 168 | (" | " + " | ".join([f"{k}: {v}" for k, v in kwargs.items()]) if kwargs else "") 169 | ) 170 | else: 171 | logger.info( 172 | f"Progress: {progress_percent:.2f}% ({file_count}/{total_files}) | " 173 | f"Time used: {elapsed_time:.2f}s" + 174 | (" | " + " | ".join([f"{k}: {v}" for k, v in kwargs.items()]) if kwargs else "") 175 | ) -------------------------------------------------------------------------------- /docs/03_core_components/03_5_apebench_scripts_config.md: -------------------------------------------------------------------------------- 1 | [English](#english-version) | [中文](#chinese-version) 2 | 3 | 4 | # 3.5 Scripts and Configuration 5 | 6 | This section covers the scripts and configuration files that orchestrate the APE-Bench evaluation workflows. 7 | 8 | ## Scripts (`src/apebench/scripts/`) 9 | 10 | The `src/apebench/scripts/` directory typically contains Python scripts that serve as high-level entry points for various stages of the APE-Bench I workflow. 11 | 12 | **Common Functions of Scripts:** 13 | 14 | * **Running Full Experiments**: Scripts to orchestrate an end-to-end evaluation for one or more LLMs. This might involve: 15 | * Loading tasks from the dataset. 16 | * Calling the inference modules (`src/apebench/inference/`) to generate patches for all tasks. 17 | * Invoking the evaluation pipeline (`src/apebench/evaluation_pipelines/`) to perform syntactic and semantic checks. 18 | * Saving raw results and aggregated metrics. 19 | * **Data Preprocessing/Analysis**: Scripts for analyzing the APE-Bench I dataset itself, or for preprocessing data before an experiment. 20 | * **Result Aggregation and Reporting**: Scripts to collect results from multiple partial runs, compute final metrics (like those in the paper's tables and figures), and generate reports or visualizations. 21 | * This might use `src/apebench/evaluation_pipelines/gather_results.py` internally. 22 | * **Targeted Evaluations**: Scripts for running specific parts of the pipeline, e.g., only running inference for a new model, or only re-evaluating existing patches with a new semantic judge. 23 | 24 | **Usage:** 25 | 26 | These scripts are generally designed to be run from the command line. They would parse command-line arguments to specify things like: 27 | * Which LLM(s) to evaluate. 28 | * Paths to input data and output directories. 29 | * Configuration files to use. 30 | * Specific task IDs or categories to focus on. 31 | 32 | ## Configuration Files 33 | 34 | Configuration files allow for customizing the behavior of the APE-Bench framework without modifying the source code directly. 35 | 36 | ### 1. Eleanstic Configuration (`src/eleanstic/config.yaml`) 37 | 38 | * **Purpose**: Configures the Eleanstic service. 39 | * **Key Settings**: As detailed in the [Eleanstic documentation](./04_1_eleanstic.md): 40 | * `mathlib_repo_path`: Path to your local Mathlib4 clone. 41 | * `cas_store_path`: Location for Eleanstic's Content-Addressable Store. 42 | * `snapshots_path`: Location for Eleanstic's commit snapshots. 43 | * Parameters for concurrency, logging, etc. 44 | * **Importance**: Must be correctly set up before Eleanstic can be used, especially for the initial preprocessing of Mathlib commits. 45 | 46 | ### 2. APE-Bench Configuration (primarily in `src/apebench/config/`) 47 | 48 | This directory likely contains configuration files (e.g., YAML, JSON, or Python modules) for various aspects of the APE-Bench experiments. 
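Before the individual settings are enumerated below, the following sketch illustrates one way such a configuration could be modelled and validated in Python with `pydantic` (listed in `requirements.txt`). It is an illustration only: the field names (`project`, `input_file`, `generation`, `model_name`, and so on) are assumptions loosely inspired by the pipeline's command-line arguments, not the actual schema of `configs/config.yaml` or `src/apebench/config/`.

```python
# Illustrative sketch only: all field names here are assumptions for demonstration,
# not the real APE-Bench configuration schema.
from typing import List

import yaml
from pydantic import BaseModel


class ModelConfig(BaseModel):
    # Hypothetical per-model settings, mirroring run_inference.py arguments
    model_name: str
    temperature: float = 0.0
    max_tokens: int = 8000
    n_responses: int = 1


class ProjectConfig(BaseModel):
    # Hypothetical paths section; input_file would point to the benchmark parquet
    input_file: str
    output_dir: str = "./outputs"
    log_dir: str = "./logs"


class ApeBenchConfig(BaseModel):
    project: ProjectConfig
    generation: List[ModelConfig]


def load_config(path: str) -> ApeBenchConfig:
    """Load a YAML configuration file and validate it before any pipeline runs."""
    with open(path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # Raises a ValidationError if fields are missing or mistyped
    return ApeBenchConfig(**data)
```

Validating the configuration at load time in this way surfaces missing keys or mistyped values before any expensive patch generation or Lean verification is launched, which is one motivation for the `pydantic` dependency noted under Secondary Development.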
49 | 50 | * **Model Configurations**: 51 | * API keys (or paths to key files). 52 | * Model names/identifiers as used in APIs (e.g., `gpt-4o`, `claude-3-sonnet-20240229`). 53 | * Default generation parameters (temperature, max tokens, top_p) for each model. 54 | * API endpoint URLs if not standard. 55 | * **Path Configurations**: 56 | * Paths to the APE-Bench I dataset (`datasets/`). 57 | * Default directories for saving LLM-generated patches, evaluation results, logs, and analysis outputs. 58 | * **Experiment Parameters**: 59 | * Number of samples to generate per task ($n$ for pass@k). 60 | * Parameters for `DiffRepair` (e.g., matching thresholds). 61 | * Settings for the LLM-as-a-Judge (e.g., which model to use as judge, judge-specific prompting parameters). 62 | * **Feature Flags**: Flags to enable/disable certain parts of the pipeline (e.g., skip syntactic check, force re-generation of patches). 63 | 64 | ## Secondary Development 65 | 66 | * **Scripts**: 67 | * Develop new scripts for novel experimental workflows or more detailed analyses (e.g., generating specific plots, performing statistical tests on results). 68 | * Improve the command-line interface and modularity of existing scripts. 69 | * **Configuration**: 70 | * Refine the structure of configuration files for better organization or to support more complex experimental designs (e.g., using hierarchical configurations with tools like Hydra). 71 | * Add validation for configuration parameters (e.g., using `pydantic` as listed in `requirements.txt`) to catch errors early. 72 | * Standardize how different modules access configuration settings. 73 | 74 | Effectively using and managing scripts and configurations is key to running reproducible experiments and extending the APE-Bench I framework. 75 | 76 | --- 77 | 78 | Next: [Troubleshooting](./04_troubleshooting.md) 79 | 80 | 81 | 82 | ## 中文翻译 (Chinese Translation) 83 | 84 | # 3.5 脚本与配置 85 | 86 | 本节介绍协调 APE-Bench 评估工作流的脚本和配置文件。 87 | 88 | ## 脚本 (`src/apebench/scripts/`) 89 | 90 | `src/apebench/scripts/` 目录通常包含作为 APE-Bench I 工作流程各个阶段高级入口点的 Python 脚本。 91 | 92 | **脚本的常见功能:** 93 | 94 | * **运行完整实验**:用于为一个或多个 LLM 编排端到端评估的脚本。这可能涉及: 95 | * 从数据集中加载任务。 96 | * 调用推理模块 (`src/apebench/inference/`) 为所有任务生成补丁。 97 | * 调用评估流程 (`src/apebench/evaluation_pipelines/`) 执行语法和语义检查。 98 | * 保存原始结果和聚合指标。 99 | * **数据预处理/分析**:用于分析 APE-Bench I 数据集本身,或在实验前预处理数据的脚本。 100 | * **结果聚合和报告**:用于从多个部分运行中收集结果,计算最终指标(如论文表格和图中的指标),并生成报告或可视化的脚本。 101 | * 这可能在内部使用 `src/apebench/evaluation_pipelines/gather_results.py`。 102 | * **有针对性的评估**:用于运行流程特定部分的脚本,例如,仅为新模型运行推理,或仅使用新的语义裁判重新评估现有补丁。 103 | 104 | **用法:** 105 | 106 | 这些脚本通常设计为从命令行运行。它们会解析命令行参数以指定诸如以下内容: 107 | * 要评估的 LLM。 108 | * 输入数据和输出目录的路径。 109 | * 要使用的配置文件。 110 | * 要关注的特定任务 ID 或类别。 111 | 112 | ## 配置文件 113 | 114 | 配置文件允许在不直接修改源代码的情况下自定义 APE-Bench 框架的行为。 115 | 116 | ### 1. Eleanstic 配置 (`src/eleanstic/config.yaml`) 117 | 118 | * **目的**:配置 Eleanstic 服务。 119 | * **关键设置**:如 [Eleanstic 文档](./04_1_eleanstic.md) 中所述: 120 | * `mathlib_repo_path`:指向您的本地 Mathlib4 克隆的路径。 121 | * `cas_store_path`:Eleanstic 内容寻址存储的位置。 122 | * `snapshots_path`:Eleanstic 提交快照的位置。 123 | * 并发、日志记录等参数。 124 | * **重要性**:在使用 Eleanstic 之前必须正确设置,尤其是在对 Mathlib 提交进行初始预处理时。 125 | 126 | ### 2. 
APE-Bench 配置 (主要在 `src/apebench/config/` 中) 127 | 128 | 此目录可能包含 APE-Bench 实验各个方面的配置文件(例如 YAML、JSON 或 Python 模块)。 129 | 130 | * **模型配置**: 131 | * API 密钥(或密钥文件路径)。 132 | * API 中使用的模型名称/标识符(例如 `gpt-4o`、`claude-3-sonnet-20240229`)。 133 | * 每个模型的默认生成参数(温度、最大令牌数、top_p)。 134 | * 如果不是标准 API,则为 API 端点 URL。 135 | * **路径配置**: 136 | * 指向 APE-Bench I 数据集的路径 (`datasets/`)。 137 | * 用于保存 LLM 生成的补丁、评估结果、日志和分析输出的默认目录。 138 | * **实验参数**: 139 | * 每个任务生成的样本数(pass@k 中的 $n$)。 140 | * `DiffRepair` 的参数(例如匹配阈值)。 141 | * 作为裁判的 LLM 的设置(例如使用哪个模型作为裁判,裁判特定的提示参数)。 142 | * **功能标志**:用于启用/禁用流程某些部分的标志(例如跳过语法检查,强制重新生成补丁)。 143 | 144 | --- 145 | 146 | 下一节: [故障排除](./04_troubleshooting.md) -------------------------------------------------------------------------------- /src/apebench/inference/inference_pipelines/generate_patch.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | from ..inference_pipelines.base import BasePipeline 4 | from ..utils.diff_repair import DiffRepair, apply_diff, generate_diff 5 | from ..utils.call_api import REASONING_MODELS 6 | import re 7 | import logging 8 | from src.utils.lean_utils import remove_lean_comments 9 | 10 | class GeneratePatchPipeline(BasePipeline): 11 | """ 12 | Pipeline for generating patches based on task descriptions. 13 | 14 | Supports multiple prompt types and model configurations. 15 | """ 16 | def __init__(self, args): 17 | super().__init__(args) 18 | from ..prompts import ( 19 | patch_generation_system_prompt, 20 | patch_generation_reasoning_models_system_prompt, 21 | patch_generation_input_prompt, 22 | patch_generation_input_prompt_without_lean_code 23 | ) 24 | self.system_prompt = patch_generation_system_prompt if self.args.model_name not in REASONING_MODELS else patch_generation_reasoning_models_system_prompt 25 | assert not self.args.force_reasoning_prompt or not self.args.force_complete_prompt, "force_reasoning_prompt and force_complete_prompt cannot be both True" 26 | if self.args.force_reasoning_prompt: 27 | self.system_prompt = patch_generation_reasoning_models_system_prompt 28 | if self.args.force_complete_prompt: 29 | self.system_prompt = patch_generation_system_prompt 30 | self.input_prompt = patch_generation_input_prompt 31 | self.input_prompt_without_lean_code = patch_generation_input_prompt_without_lean_code 32 | self.strict_match_threshold = 0.5 33 | self.max_context_lines = 3 34 | 35 | @property 36 | def special_config(self): 37 | if self.args.force_complete_prompt: 38 | return '_force_complete_prompt' 39 | elif self.args.force_reasoning_prompt: 40 | return '_force_reasoning_prompt' 41 | else: 42 | return '' 43 | 44 | def parse_response(self, response, row): 45 | try: 46 | result = { 47 | 'gen_patch': None, 48 | 'gen_content_from_scratch': None, 49 | 'gen_patch_after_exact_repair': None, 50 | 'gen_content_after_exact_repair': None, 51 | 'gen_patch_after_robust_repair': None, 52 | 'gen_content_after_robust_repair': None 53 | } 54 | patch_match = re.search(r'```diff(.*?)```', response, re.DOTALL) 55 | best_gen_patch = None 56 | best_gen_patch_comment_free = None 57 | best_gen_content = None 58 | content_before = row['content_before'] 59 | content_before_comment_free = remove_lean_comments(content_before) 60 | if patch_match: 61 | patch = patch_match.group(1).strip() 62 | result['gen_patch'] = patch 63 | if not content_before: 64 | try: 65 | result['gen_content_from_scratch'] = apply_diff(content_before, patch) 66 | best_gen_patch = patch 67 | best_gen_content = 
result['gen_content_from_scratch'] 68 | content_after_comment_free = remove_lean_comments(result['gen_content_from_scratch']) 69 | best_gen_patch_comment_free = generate_diff(content_before_comment_free, content_after_comment_free) 70 | except Exception as e: 71 | pass 72 | else: 73 | try: 74 | repairer = DiffRepair(content_before, patch, strict_match_threshold=self.strict_match_threshold, max_context_lines=self.max_context_lines, exact_match=False) 75 | repaired_patch_text, full_new_content = repairer.repair() 76 | 77 | if full_new_content is not None: 78 | # Special case: DiffRepair returned full new content 79 | result[f'gen_content_after_robust_repair'] = full_new_content 80 | actual_diff = generate_diff(content_before, full_new_content) 81 | result[f'gen_patch_after_robust_repair'] = actual_diff 82 | best_gen_patch = actual_diff 83 | best_gen_content = full_new_content 84 | content_after_comment_free = remove_lean_comments(full_new_content) 85 | best_gen_patch_comment_free = generate_diff(content_before_comment_free, content_after_comment_free) 86 | elif repaired_patch_text is not None: 87 | # Standard case: DiffRepair returned a repaired patch text 88 | repaired_content = apply_diff(content_before, repaired_patch_text) 89 | result[f'gen_content_after_robust_repair'] = repaired_content 90 | 91 | actual_diff = generate_diff(content_before, repaired_content) 92 | result[f'gen_patch_after_robust_repair'] = actual_diff 93 | 94 | best_gen_patch = actual_diff 95 | best_gen_content = repaired_content 96 | content_after_comment_free = remove_lean_comments(repaired_content) 97 | best_gen_patch_comment_free = generate_diff(content_before_comment_free, content_after_comment_free) 98 | # else: an error occurred in repair, or it returned (None, None) - fields will remain None 99 | except Exception as e: 100 | pass 101 | result['best_gen_content'] = best_gen_content 102 | result['best_gen_patch'] = best_gen_patch 103 | result['best_gen_patch_comment_free'] = best_gen_patch_comment_free 104 | return result 105 | except Exception as e: 106 | logging.error(f"Error parsing GPT response: {e}") 107 | return None 108 | 109 | def get_input(self, row): 110 | """Generate prompt input for a row""" 111 | 112 | lean_code = row['content_before'] 113 | filename = row['file_path_after'] 114 | if not 'full_instruction' in row: 115 | instructions = '\n\n\n'.join([f"- Task {idx + 1}: {exercise['title']}\n\n{exercise['instruction']}" for idx, exercise in enumerate(row['instructions']['exercises'])]) 116 | row['full_instruction'] = instructions 117 | else: 118 | instructions = row['full_instruction'] 119 | 120 | if filename and lean_code: 121 | return self.input_prompt.format( 122 | lean_code=lean_code, 123 | instructions=instructions, 124 | filename=filename 125 | ) 126 | else: 127 | return self.input_prompt_without_lean_code.format( 128 | instructions=instructions, 129 | filename=filename 130 | ) 131 | -------------------------------------------------------------------------------- /src/apebench/evaluation_pipelines/verification_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 
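# Stage overview (mirrors the numbered comments inside verify_patches below):
# 1) collect generated patches via gather_results, 2) run Eleanstic verification,
# 3) collect verification results, 4) merge them with the original generation data,
# 5) compute verification metrics, 6) optionally plot them, 7) save the metrics JSON,
# and 8) record completion in the progress tracker.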
2 | """ 3 | Verification management module responsible for executing the patch verification process 4 | """ 5 | 6 | import os 7 | import subprocess 8 | import glob 9 | from datetime import datetime 10 | from typing import Dict, List, Any, Optional, Union 11 | 12 | from ..utils import ProgressTracker, extract_verification_data, calculate_metrics, plot_metrics 13 | 14 | def get_latest_results_dir(base_dir: str) -> str: 15 | """ 16 | Get the latest results directory 17 | 18 | Args: 19 | base_dir: Base directory 20 | 21 | Returns: 22 | Path to the latest results directory 23 | """ 24 | result_dirs = glob.glob(f"{base_dir}*") 25 | if not result_dirs: 26 | raise ValueError(f"No result directories found in {base_dir}") 27 | 28 | # Sort by timestamp 29 | latest_dir = max(result_dirs, key=os.path.getctime) 30 | return latest_dir 31 | 32 | def verify_patches(config_file: str, generation_output_files: Optional[List[str]] = None) -> str: 33 | """ 34 | Verify generated patches 35 | 36 | Args: 37 | config_file: Path to configuration file 38 | generation_output_files: Optional list of generation output files 39 | 40 | Returns: 41 | Path to the merged results file 42 | """ 43 | # Import here instead of at the top to avoid circular imports 44 | from ..config.config_manager import ConfigManager 45 | 46 | # Load configuration 47 | config = ConfigManager(config_file).get_config() 48 | 49 | # Initialize progress tracker 50 | progress_tracker = ProgressTracker(config.progress_log) 51 | 52 | print(f"Running patch verification with configuration from: {config_file}") 53 | 54 | # Check if verification is already completed 55 | verification_status = progress_tracker.get_verification_status() 56 | if verification_status.get("completed", False): 57 | print("Verification already completed") 58 | verification_status = progress_tracker.get_verification_status() 59 | verification_metrics = verification_status.get("metrics", {}) 60 | return verification_metrics 61 | 62 | # If no output files are provided, get them from the progress record 63 | if not generation_output_files: 64 | generation_output_files = progress_tracker.get_all_output_files() 65 | 66 | if not generation_output_files: 67 | raise ValueError("No generation output files found. Run patch generation first.") 68 | 69 | print(f"Found {len(generation_output_files)} generation output files") 70 | 71 | # Create temporary directory 72 | os.makedirs(config.temp_dir, exist_ok=True) 73 | 74 | # Create timestamp 75 | timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') 76 | 77 | # 1. Use gather_results.py to collect patch data 78 | print("Collecting patches for verification...") 79 | patch_collection_file = f"{config.temp_dir}/patches_for_verification_{timestamp}.jsonl" 80 | 81 | # Build gather_results.py command to collect patches 82 | collect_cmd = [ 83 | "python", "-m", "src.apebench.evaluation_pipelines.gather_results", 84 | "--pipeline", "patch", 85 | "--input_files", *generation_output_files, 86 | "--output_file", patch_collection_file, 87 | ] 88 | 89 | print(f"Executing: {' '.join(collect_cmd)}") 90 | subprocess.run(collect_cmd, check=True) 91 | 92 | # 2. 
Call eleanstic to perform verification 93 | print("Running Eleanstic verification...") 94 | verify_results_dir = os.path.join(config.verification.results_dir, f"results_{timestamp}") 95 | # Ensure results directory exists 96 | os.makedirs(verify_results_dir, exist_ok=True) 97 | 98 | verify_cmd = [ 99 | "python", "-m", "src.eleanstic.main", 100 | "--input_file", patch_collection_file, 101 | "--commit_id_key", "commit_hash", 102 | "--max_workers", str(config.verification.max_workers), 103 | "verify", 104 | "--code_key", "code", 105 | "--results_dir", verify_results_dir 106 | ] 107 | 108 | print(f"Executing: {' '.join(verify_cmd)}") 109 | subprocess.run(verify_cmd, check=True) 110 | 111 | # 3. Use gather_results.py to collect verification results 112 | print("Collecting verification results...") 113 | verification_output_file = f"{config.temp_dir}/verification_results_{timestamp}.jsonl" 114 | 115 | verify_collect_cmd = [ 116 | "python", "-m", "src.apebench.evaluation_pipelines.gather_results", 117 | "--pipeline", "verification", 118 | "--input_files", f"{verify_results_dir}/*.jsonl", 119 | "--output_file", verification_output_file, 120 | ] 121 | 122 | print(f"Executing: {' '.join(verify_collect_cmd)}") 123 | subprocess.run(verify_collect_cmd, check=True) 124 | 125 | # 4. Merge verification results with original generation data 126 | print("Merging verification results with original data...") 127 | merged_results_file = f"{config.output_dir}/merged_results_{timestamp}.jsonl" 128 | os.makedirs(os.path.dirname(merged_results_file), exist_ok=True) 129 | 130 | # Call gather_results.py merge functionality 131 | merge_cmd = [ 132 | "python", "-m", "src.apebench.evaluation_pipelines.gather_results", 133 | "--pipeline", "merge", # New pipeline type 134 | "--original_files", *generation_output_files, 135 | "--verification_file", verification_output_file, 136 | "--output_file", merged_results_file, 137 | ] 138 | 139 | print(f"Executing: {' '.join(merge_cmd)}") 140 | subprocess.run(merge_cmd, check=True) 141 | 142 | # 5. Calculate pass@k metrics for each model 143 | print("Calculating verification metrics...") 144 | verified_results = extract_verification_data(merged_results_file) 145 | metrics = calculate_metrics(verified_results, config) 146 | 147 | # 6. Generate visualizations 148 | if hasattr(config.evaluation, 'generate_plots') and config.evaluation.generate_plots: 149 | print("Generating verification metric plots...") 150 | plots_dir = getattr(config.evaluation, 'plots_dir', './verification_plots') 151 | os.makedirs(plots_dir, exist_ok=True) 152 | plot_metrics(metrics, plots_dir, f'verification_{timestamp}') 153 | print(f"Verification metric plots saved to: {plots_dir}") 154 | 155 | # 7. Save metrics 156 | metrics_file = f"{config.output_dir}/verification_metrics_{timestamp}.json" 157 | 158 | import json 159 | print('Saving verification metrics to: ', metrics_file) 160 | print('Metrics: ', metrics) 161 | with open(metrics_file, 'w') as f: 162 | json.dump(metrics, f, indent=2) 163 | 164 | # 8. Update progress tracking 165 | verification_status = { 166 | "completed": True, 167 | "timestamp": timestamp, 168 | "verification_output": verification_output_file, 169 | "merged_results": merged_results_file, 170 | "metrics_file": metrics_file, 171 | "metrics": metrics 172 | } 173 | 174 | progress_tracker.update_verification_status(verification_status) 175 | 176 | print(f"Verification completed. 
Results saved to: {merged_results_file}") 177 | 178 | return metrics -------------------------------------------------------------------------------- /docs/02_project_structure.md: -------------------------------------------------------------------------------- 1 | [English](#english-version) | [中文](#chinese-version) 2 | 3 | 4 | # 2. Project Structure 5 | 6 | This document provides a high-level overview of the APE-Bench I project's directory structure. 7 | 8 | ``` 9 | ape-bench/ 10 | ├── .git/ # Git repository data 11 | ├── .venv/ # Python virtual environment (recommended) 12 | ├── configs/ # General configuration files for experiments (if any) 13 | ├── datasets/ # Downloaded APE-Bench I dataset from Hugging Face 14 | ├── docs/ # This documentation 15 | │ ├── README.md 16 | │ ├── 01_introduction.md 17 | │ ├── ... (other documentation files) 18 | │ └── 04_core_components/ 19 | │ └── ... (component-specific docs) 20 | ├── paper.tex # LaTeX source for the research paper 21 | ├── README.md # Main project README (points to Hugging Face dataset) 22 | ├── requirements.txt # Python dependencies 23 | ├── src/ 24 | │ ├── __init__.py 25 | │ ├── apebench/ # Core logic for APE-Bench I framework 26 | │ │ ├── __init__.py 27 | │ │ ├── config/ # Configuration for APE-Bench components (models, paths) 28 | │ │ ├── data/ # Data loading, processing, task representation 29 | │ │ ├── evaluation_pipelines/ # Syntactic and semantic evaluation logic 30 | │ │ ├── inference/ # LLM interaction, patch generation, DiffRepair 31 | │ │ │ └── utils/ # Utilities for inference, e.g., diff_repair.py 32 | │ │ └── scripts/ # Scripts for running experiments, analysis 33 | │ │ └── utils/ # General utilities for apebench module 34 | │ ├── eleanstic/ # Eleanstic: version-aware syntactic verification 35 | │ │ ├── __init__.py 36 | │ │ ├── config.yaml # Configuration for Eleanstic 37 | │ │ ├── core/ # Core logic for Eleanstic (snapshotting, CAS) 38 | │ │ ├── main.py # Main script/entry point for Eleanstic operations 39 | │ │ └── utils/ # Utilities specific to Eleanstic 40 | │ └── utils/ # Shared utility functions (if any at src level) 41 | └── ... # Other project files (e.g., .gitignore) 42 | ``` 43 | 44 | ## Key Directories 45 | 46 | * **`configs/`**: May contain high-level configuration files for orchestrating different experimental setups. More specific configurations are often found within `src/apebench/config/` and `src/eleanstic/config.yaml`. 47 | 48 | * **`datasets/`**: This directory (created by you during setup) holds the actual benchmark data – the collection of (`Instruction`, `PreFile`, `Patch`) triplets. 49 | 50 | * **`docs/`**: Contains all the documentation files you are currently reading. 51 | 52 | * **`src/`**: The heart of the project, containing all source code. 53 | * **`src/apebench/`**: Implements the core APE-Bench I framework. This is where most of the logic for running experiments, interacting with LLMs, and evaluating results resides. 54 | * `config/`: Specific configurations for APE-Bench, such as model parameters, API endpoints, file paths relevant to benchmark runs. 55 | * `data/`: Modules for loading, parsing, and managing the APE-Bench I tasks from the `datasets/` directory. 56 | * `evaluation_pipelines/`: Contains the code for the two-stage evaluation process: syntactic verification (interfacing with Eleanstic) and semantic judgment (LLM-as-a-Judge). 57 | * `inference/`: Handles the generation of patches by LLMs. 
This includes constructing prompts, making API calls to various models, and processing their outputs. The critical `DiffRepair` utility (`inference/utils/diff_repair.py`) is also part of this module. 58 | * `scripts/`: Contains Python scripts that act as entry points for various operations, such as running a full evaluation pass for a model, generating specific analyses, or preparing data. 59 | * **`src/eleanstic/`**: A self-contained module that implements the Eleanstic system. Its primary role is to provide efficient and version-aware syntactic verification of Lean code by managing Mathlib build artifacts. 60 | * `config.yaml`: The main configuration file for Eleanstic, defining paths to Mathlib, storage locations, etc. 61 | * `core/`: The core implementation of Eleanstic's content-addressable storage, snapshot management, and environment restoration logic. 62 | * `main.py`: Often the main executable or entry point for Eleanstic operations like preprocessing Mathlib commits or servicing verification requests. 63 | 64 | Understanding this structure will help you navigate the codebase when trying to understand specific functionalities or when planning secondary development. 65 | 66 | --- 67 | 68 | 69 | ## 中文翻译 (Chinese Translation) 70 | 71 | # 2. 项目结构 72 | 73 | 本文档提供了 APE-Bench I 项目目录结构的高级概述。 74 | 75 | ``` 76 | ape-bench/ 77 | ├── .git/ # Git 仓库数据 78 | ├── .venv/ # Python 虚拟环境 (推荐) 79 | ├── configs/ # 实验的通用配置文件 (如果有) 80 | ├── datasets/ # 从 Hugging Face 下载的 APE-Bench I 数据集 81 | ├── docs/ # 本文档 82 | │ ├── README.md 83 | │ ├── 01_introduction.md 84 | │ ├── ... (其他文档文件) 85 | │ └── 04_core_components/ 86 | │ └── ... (组件特定文档) 87 | ├── paper.tex # 研究论文的 LaTeX 源文件 88 | ├── README.md # 项目主 README (指向 Hugging Face 数据集) 89 | ├── requirements.txt # Python 依赖 90 | ├── src/ 91 | │ ├── __init__.py 92 | │ ├── apebench/ # APE-Bench I 框架的核心逻辑 93 | │ │ ├── __init__.py 94 | │ │ ├── config/ # APE-Bench 组件的配置 (模型、路径) 95 | │ │ ├── data/ # 数据加载、处理、任务表示 96 | │ │ ├── evaluation_pipelines/ # 语法和语义评估逻辑 97 | │ │ ├── inference/ # LLM 交互、补丁生成、DiffRepair 98 | │ │ │ └── utils/ # 推理工具,例如 diff_repair.py 99 | │ │ └── scripts/ # 运行实验、分析的脚本 100 | │ │ └── utils/ # apebench 模块的通用工具 101 | │ ├── eleanstic/ # Eleanstic:版本感知的语法验证 102 | │ │ ├── __init__.py 103 | │ │ ├── config.yaml # Eleanstic 的配置文件 104 | │ │ ├── core/ # Eleanstic 的核心逻辑 (快照、CAS) 105 | │ │ ├── main.py # Eleanstic 操作的主脚本/入口点 106 | │ │ └── utils/ # Eleanstic 特定的工具 107 | │ └── utils/ # 共享的工具函数 (如果在 src 级别有的话) 108 | └── ... 
# 其他项目文件 (例如 .gitignore) 109 | ``` 110 | 111 | ## 关键目录 112 | 113 | * **`configs/`**: 可能包含用于编排不同实验设置的高级配置文件。更具体的配置通常位于 `src/apebench/config/` 和 `src/eleanstic/config.yaml` 中。 114 | 115 | * **`datasets/`**: 此目录(在设置过程中由您创建)包含实际的基准测试数据——(`指令`, `修改前文件`, `补丁`) 三元组的集合。 116 | 117 | * **`docs/`**: 包含您当前正在阅读的所有文档文件。 118 | 119 | * **`src/`**: 项目的核心,包含所有源代码。 120 | * **`src/apebench/`**: 实现核心 APE-Bench I 框架。大部分运行实验、与 LLM 交互以及评估结果的逻辑都位于此处。 121 | * `config/`: APE-Bench 的特定配置,例如模型参数、API 端点、与基准测试运行相关的文件路径。 122 | * `data/`: 用于从 `datasets/` 目录加载、解析和管理 APE-Bench I 任务的模块。 123 | * `evaluation_pipelines/`: 包含两阶段评估过程的代码:语法验证(与 Eleanstic 对接)和语义判断(作为裁判的 LLM)。 124 | * `inference/`: 处理由 LLM 生成补丁。这包括构建提示、调用各种模型的 API 以及处理其输出。关键的 `DiffRepair` 工具 (`inference/utils/diff_repair.py`) 也是此模块的一部分。 125 | * `scripts/`: 包含作为各种操作入口点的 Python 脚本,例如为模型运行完整的评估遍、生成特定分析或准备数据。 126 | * **`src/eleanstic/`**: 一个独立的模块,实现 Eleanstic 系统。其主要作用是通过管理 Mathlib 构建产物来提供高效且版本感知的 Lean 代码语法验证。 127 | * `config.yaml`: Eleanstic 的主配置文件,定义 Mathlib 的路径、存储位置等。 128 | * `core/`: Eleanstic 内容寻址存储、快照管理和环境恢复逻辑的核心实现。 129 | * `main.py`: 通常是 Eleanstic 操作(如预处理 Mathlib 提交或服务验证请求)的主要可执行文件或入口点。 130 | 131 | 理解此结构将有助于您在尝试理解特定功能或计划二次开发时浏览代码库。 132 | 133 | --- 134 | 135 | 下一节: [核心组件](./03_core_components/03_1_eleanstic.md) -------------------------------------------------------------------------------- /src/apebench/inference/utils/parallel.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | import pandas as pd 4 | import json 5 | import logging 6 | from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed 7 | import os 8 | import random 9 | import time 10 | 11 | def process_rows_parallel(data: pd.DataFrame, 12 | process_func: callable, 13 | output_file: str, 14 | max_workers: int = 2, 15 | config_info: dict = None): 16 | """Process rows in parallel using ThreadPoolExecutor. 
17 | 18 | Args: 19 | data: DataFrame containing rows to process 20 | process_func: Function to process each row 21 | output_file: Path to output JSON file 22 | max_workers: Number of parallel workers 23 | config_info: Dictionary containing configuration information for logging 24 | 25 | Returns: 26 | Tuple of (processed_count, error_count) 27 | """ 28 | processed_count = 0 29 | error_count = 0 30 | total_items = len(data) 31 | start_time = time.time() 32 | 33 | # Prepare configuration information display 34 | config_str = "" 35 | if config_info: 36 | config_items = [] 37 | if 'model_name' in config_info: 38 | config_items.append(f"Model: {config_info['model_name']}") 39 | if 'temperature' in config_info: 40 | config_items.append(f"Temp: {config_info['temperature']}") 41 | if 'n_responses' in config_info: 42 | config_items.append(f"Responses: {config_info['n_responses']}") 43 | config_str = " | ".join(config_items) 44 | if config_str: 45 | config_str = f"[{config_str}] " 46 | 47 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 48 | futures = {executor.submit(process_func, row) : i for i, row in data.iterrows()} 49 | for future in as_completed(futures): 50 | try: 51 | result = future.result() 52 | if result is not None: 53 | result_json = json.dumps(result, ensure_ascii=False) 54 | with open(output_file, 'a') as f: 55 | f.write(result_json + '\n') 56 | processed_count += 1 57 | else: 58 | error_count += 1 59 | except TimeoutError: 60 | error_count += 1 61 | logging.error(f"{config_str}Timeout occurred while processing row") 62 | except Exception as e: 63 | error_count += 1 64 | logging.error(f"{config_str}Unexpected error while processing result: {str(e)}") 65 | finally: 66 | current_item = processed_count + error_count 67 | current_time = time.time() 68 | elapsed_time = current_time - start_time 69 | progress_percent = (current_item / total_items) * 100 70 | 71 | # Calculate estimated remaining time 72 | if current_item > 0 and progress_percent < 100: 73 | time_per_item = elapsed_time / current_item 74 | remaining_items = total_items - current_item 75 | estimated_remaining_time = time_per_item * remaining_items 76 | 77 | print( 78 | f"{config_str}Progress: {progress_percent:.2f}% ({current_item}/{total_items}) | " 79 | f"Completed: {processed_count} | Errors: {error_count} | " 80 | f"Elapsed time: {elapsed_time / 3600:.2f} hours | " 81 | f"Est. remaining: {estimated_remaining_time / 3600:.2f} hours" 82 | ) 83 | else: 84 | print( 85 | f"{config_str}Progress: {progress_percent:.2f}% ({current_item}/{total_items}) | " 86 | f"Completed: {processed_count} | Errors: {error_count} | " 87 | f"Elapsed time: {elapsed_time / 3600:.2f} hours" 88 | ) 89 | 90 | return processed_count, error_count 91 | 92 | def check_missing_rows(data: pd.DataFrame, output_file: str): 93 | """Check which rows from the original data are missing in the output file. 
94 | 95 | Args: 96 | data: Original DataFrame with row indices 97 | output_file: Path to the output JSON file 98 | 99 | Returns: 100 | List of missing row indices 101 | """ 102 | processed_indices = set() 103 | 104 | if os.path.exists(output_file): 105 | with open(output_file, 'r') as f: 106 | for line in f: 107 | try: 108 | result = json.loads(line) 109 | if 'local_index' in result: 110 | processed_indices.add(result['local_index']) 111 | except json.JSONDecodeError: 112 | logging.error(f"Error decoding JSON line: {line}") 113 | 114 | all_indices = set(data.index.tolist()) 115 | missing_indices = list(all_indices - processed_indices) 116 | 117 | return missing_indices 118 | 119 | def process_with_retries(data: pd.DataFrame, 120 | process_func: callable, 121 | output_file: str, 122 | max_workers: int = 2, 123 | max_retries: int = 3, 124 | config_info: dict = None): 125 | """Process rows with automatic retries for failed rows. 126 | 127 | Args: 128 | data: DataFrame containing rows to process 129 | process_func: Function to process each row 130 | output_file: Path to output JSON file 131 | max_workers: Number of parallel workers 132 | max_retries: Maximum number of retry attempts for each batch of failures 133 | config_info: Dictionary containing configuration information for logging 134 | 135 | Returns: 136 | Tuple of (total_processed_count, total_error_count, final_missing_indices) 137 | """ 138 | total_processed = 0 139 | total_errors = 0 140 | retry_count = 0 141 | 142 | # Initial processing 143 | logging.info("Starting initial processing...") 144 | 145 | # Retry loop 146 | missing_indices = check_missing_rows(data, output_file) 147 | random.shuffle(missing_indices) 148 | 149 | config_str = "" 150 | if config_info: 151 | config_items = [] 152 | if 'model_name' in config_info: 153 | config_items.append(f"Model: {config_info['model_name']}") 154 | if 'temperature' in config_info: 155 | config_items.append(f"Temp: {config_info['temperature']}") 156 | if 'n_responses' in config_info: 157 | config_items.append(f"Responses: {config_info['n_responses']}") 158 | config_str = " | ".join(config_items) 159 | if config_str: 160 | config_str = f"[{config_str}] " 161 | 162 | while missing_indices and retry_count < max_retries: 163 | retry_count += 1 164 | 165 | print(f"{config_str}Retry attempt {retry_count}: Found {len(missing_indices)} missing rows") 166 | 167 | retry_data = data.loc[missing_indices] 168 | retry_processed, retry_errors = process_rows_parallel( 169 | retry_data, process_func, output_file, max_workers, config_info=config_info 170 | ) 171 | 172 | total_processed += retry_processed 173 | total_errors += retry_errors 174 | 175 | missing_indices = check_missing_rows(data, output_file) 176 | if not missing_indices: 177 | print(f"{config_str}All rows successfully processed") 178 | break 179 | 180 | if retry_count == max_retries: 181 | print(f"{config_str}Reached maximum retry attempts ({max_retries})") 182 | 183 | return total_processed, total_errors, missing_indices 184 | 185 | -------------------------------------------------------------------------------- /src/apebench/utils/progress_tracker.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 
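# Concurrency note: _load_progress takes a shared fcntl lock on the progress file for
# reads, while _save_progress serializes writers through an exclusive lock on
# "<progress_file>.lock" and re-reads/merges the on-disk state before writing it back,
# so multiple worker processes can update progress safely.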
2 | 3 | """ 4 | Progress tracking manager, used to record and manage the execution progress of the evaluation process 5 | """ 6 | 7 | import os 8 | import json 9 | import fcntl 10 | from datetime import datetime 11 | from typing import Dict, Any, List, Optional 12 | 13 | class ProgressTracker: 14 | """Track and manage evaluation process progress""" 15 | 16 | def __init__(self, progress_file: str): 17 | """ 18 | Initialize progress tracker 19 | 20 | Args: 21 | progress_file: Path to progress data file 22 | """ 23 | self.progress_file = progress_file 24 | self.data = self._load_progress() 25 | 26 | def _load_progress(self) -> Dict[str, Any]: 27 | """Load progress data, using file locks to ensure multi-process safety""" 28 | os.makedirs(os.path.dirname(self.progress_file), exist_ok=True) 29 | 30 | if os.path.exists(self.progress_file): 31 | try: 32 | with open(self.progress_file, 'r') as f: 33 | # Get shared lock (read lock) 34 | fcntl.flock(f, fcntl.LOCK_SH) 35 | try: 36 | data = json.load(f) 37 | finally: 38 | # Release lock 39 | fcntl.flock(f, fcntl.LOCK_UN) 40 | return data 41 | except Exception as e: 42 | print(f"Error loading progress file: {e}") 43 | # If loading fails, backup old file and create new one 44 | backup_file = f"{self.progress_file}.bak.{datetime.now().strftime('%Y%m%d%H%M%S')}" 45 | os.rename(self.progress_file, backup_file) 46 | print(f"Backed up problematic progress file to {backup_file}") 47 | 48 | # Initialize empty progress data 49 | return { 50 | "models": {}, 51 | "verification": {"completed": False}, 52 | "evaluation": {"completed": False}, 53 | "last_updated": None 54 | } 55 | 56 | def _save_progress(self, limited_update_keys: Optional[List[str]] = None) -> None: 57 | """Save progress data, using lock files to ensure multi-process safety""" 58 | self.data["last_updated"] = datetime.now().isoformat() 59 | os.makedirs(os.path.dirname(self.progress_file), exist_ok=True) 60 | 61 | # Create lock file path 62 | lock_file = f"{self.progress_file}.lock" 63 | 64 | try: 65 | # Open or create lock file 66 | with open(lock_file, 'w') as lock_f: 67 | # Get exclusive lock (write lock) 68 | fcntl.flock(lock_f, fcntl.LOCK_EX) 69 | try: 70 | # Read current data (if exists) 71 | current_data = self.data 72 | if os.path.exists(self.progress_file) and os.path.getsize(self.progress_file) > 0: 73 | try: 74 | with open(self.progress_file, 'r') as f: 75 | current_data = json.load(f) 76 | # Merge model data, preserve other parts unchanged 77 | current_data.update({k : v for k, v in self.data.items() if limited_update_keys is None or k in limited_update_keys}) 78 | current_data["last_updated"] = self.data["last_updated"] 79 | except (json.JSONDecodeError, ValueError): 80 | # If file is empty or format is wrong, use current data 81 | current_data = self.data 82 | 83 | # Update data in memory 84 | self.data = current_data 85 | 86 | # Write directly to original file 87 | with open(self.progress_file, 'w') as f: 88 | json.dump(self.data, f, indent=2) 89 | finally: 90 | # Release lock 91 | fcntl.flock(lock_f, fcntl.LOCK_UN) 92 | except Exception as e: 93 | print(f"Error saving progress file: {e}") 94 | 95 | def get_model_status(self, model_name: str) -> Dict[str, Any]: 96 | """ 97 | Get status of a specific model, forcibly reload latest data before getting 98 | 99 | Args: 100 | model_name: Model name 101 | 102 | Returns: 103 | Dictionary containing model status 104 | """ 105 | # Reload to get latest status 106 | self.data = self._load_progress() 107 | 108 | if model_name not in 
self.data["models"]: 109 | self.data["models"][model_name] = { 110 | "completed": False, 111 | "last_completed_config": -1, 112 | "output_files": [] 113 | } 114 | return self.data["models"][model_name] 115 | 116 | def update_model_status(self, model_name: str, status: Dict[str, Any]) -> None: 117 | """ 118 | Update model status 119 | 120 | Args: 121 | model_name: Model name 122 | status: New status dictionary 123 | """ 124 | self.data["models"][model_name] = status 125 | self._save_progress() 126 | 127 | def get_verification_status(self) -> Dict[str, Any]: 128 | """ 129 | Get verification phase status 130 | 131 | Returns: 132 | Verification status dictionary 133 | """ 134 | # Reload to get latest status 135 | self.data = self._load_progress() 136 | return self.data["verification"] 137 | 138 | def update_verification_status(self, status: Dict[str, Any]) -> None: 139 | """ 140 | Update verification phase status 141 | 142 | Args: 143 | status: New verification status dictionary 144 | """ 145 | self.data["verification"] = status 146 | self._save_progress() 147 | 148 | def get_evaluation_status(self) -> Dict[str, Any]: 149 | """ 150 | Get evaluation phase status 151 | 152 | Returns: 153 | Evaluation status dictionary 154 | """ 155 | # Reload to get latest status 156 | self.data = self._load_progress() 157 | return self.data["evaluation"] 158 | 159 | def update_evaluation_status(self, status: Dict[str, Any]) -> None: 160 | """ 161 | Update evaluation phase status 162 | 163 | Args: 164 | status: New evaluation status dictionary 165 | """ 166 | self.data["evaluation"] = status 167 | self._save_progress() 168 | 169 | def get_all_output_files(self) -> List[str]: 170 | """ 171 | Get output files for all completed models 172 | 173 | Returns: 174 | List of output file paths 175 | """ 176 | # Reload to get latest status 177 | self.data = self._load_progress() 178 | 179 | all_files = [] 180 | for model_name, model_status in self.data["models"].items(): 181 | if model_status.get("completed", False): 182 | all_files.extend(model_status.get("output_files", [])) 183 | return all_files 184 | 185 | def reset_progress(self, section: Optional[str] = None) -> None: 186 | """ 187 | Reset progress data 188 | 189 | Args: 190 | section: Section to reset, such as 'models', 'verification', 'evaluation', 191 | if None, reset all data 192 | """ 193 | if section is None: 194 | self.data = { 195 | "models": {}, 196 | "verification": {"completed": False}, 197 | "evaluation": {"completed": False}, 198 | "last_updated": None 199 | } 200 | elif section == 'models': 201 | self.data["models"] = {} 202 | elif section in self.data: 203 | self.data[section] = {"completed": False} 204 | 205 | self._save_progress() -------------------------------------------------------------------------------- /docs/03_core_components/03_3_apebench_inference.md: -------------------------------------------------------------------------------- 1 | [English](#english-version) | [中文](#chinese-version) 2 | 3 | 4 | # 3.3 LLM Inference and DiffRepair 5 | 6 | This section covers the process of generating patches using Large Language Models (LLMs) and the `DiffRepair` utility that post-processes these patches. The relevant code is located in `src/apebench/inference/` and its sub-modules like `src/apebench/inference/utils/diff_repair.py`. 7 | 8 | ## LLM Inference Process 9 | 10 | The core task for an LLM in APE-Bench I is to generate a patch (in unified diff format) that transforms a given `PreFile` according to an `Instruction`. 11 | 12 | 1. 
**Entry Point**: The main entry point for inference is `src/apebench/inference/run_inference.py`, which supports multiple pipelines including: 13 | * `patch` pipeline: For generating patched code based on instructions 14 | * `judgement` pipeline: For evaluating patches using LLM-as-Judge 15 | * `instruction` pipeline: For generating natural language instructions 16 | * Each pipeline is implemented as a specialized class in `src/apebench/inference/inference_pipelines/` 17 | 18 | 2. **Prompt Construction**: For each task, a prompt is constructed for the target LLM. This includes: 19 | * The `Instruction` (natural language command). 20 | * The `PreFile` (the full Lean code before edits). 21 | * Formatting instructions to guide the LLM to output correctly structured patches. 22 | * The prompt templates are specific to each pipeline type and are defined in separate modules under `src/apebench/inference/prompts/`. 23 | 24 | 3. **Model Invocation**: The inference framework supports various LLM providers: 25 | * The pipeline classes in `src/apebench/inference/inference_pipelines/` handle API authentication, request formatting, and response parsing for different APIs. 26 | * The `select_pipeline` function in `src/apebench/inference/run_inference.py` maps pipeline types to their respective pipeline classes. 27 | * Key parameters like `temperature`, `max_tokens`, and `n_responses` (for sampling multiple candidates) are passed to the appropriate API. 28 | 29 | 4. **Output Processing**: 30 | * The raw LLM output is parsed to extract the generated patches. 31 | * For the `patch` pipeline, `DiffRepair` is applied to the extracted patches (see below). 32 | * The processed outputs are saved to the specified output file in a structured format. 33 | 34 | 5. **Parallelism**: Processing is distributed across multiple workers (using `ProcessPoolExecutor`) to speed up inference for large datasets, controlled by the `--max_workers` parameter. 35 | 36 | The command to run inference looks like: 37 | ```bash 38 | python -m src.apebench.inference.run_inference \ 39 | --pipeline patch \ 40 | --input_file /path/to/tasks.jsonl \ 41 | --output_file /path/to/results.jsonl \ 42 | --model_name gpt-4o \ 43 | --temperature 0.8 \ 44 | --n_responses 20 \ 45 | --max_workers 4 46 | ``` 47 | 48 | ## DiffRepair: Fault-Tolerant Patch Recovery 49 | 50 | LLM-generated diffs are often "noisy" – they have incorrect line numbers, misaligned context lines, or formatting issues that prevent them from being applied cleanly using standard `patch` utilities. `DiffRepair` is a vital component designed to address this. 51 | 52 | * **Location**: `src/apebench/inference/utils/diff_repair.py` 53 | * **Purpose**: To transform noisy model-generated diffs into clean, structurally consistent, and applicable patches while preserving the original intent of the edit as much as possible. 54 | * **Mention in Paper**: Sections 5.1 (Patch Normalization) and Appendix A. 55 | 56 | **DiffRepair Workflow (as described in Appendix A of the paper):** 57 | 58 | 1. **Hunk Parsing**: The input diff text is parsed into individual "hunks" (segments of changes). 59 | 2. **Intent Localization (Fuzzy Matching)**: For each hunk, `DiffRepair` attempts to find the correct region in the `PreFile` where the change was intended. This is a crucial step and involves: 60 | * Comparing context lines from the hunk with lines in the `PreFile`. 61 | * Using fuzzy matching algorithms (e.g., Levenshtein distance, sequence matching) to tolerate minor discrepancies. 
62 | * The `_find_candidate_region_exact` and `_find_best_region_with_dp` methods in `diff_repair.py` implement sophisticated matching logic, including dynamic programming. 63 | 3. **Patch Reconstruction**: Once the target region is localized, `DiffRepair` reconstructs a clean diff hunk: 64 | * Re-aligning added and deleted lines to structurally valid positions relative to the correctly identified context from `PreFile`. 65 | * Augmenting missing context lines to satisfy unified diff format constraints. 66 | * Resolving line number offsets and potential hunk overlaps. 67 | 4. **Final Diff Generation**: The repaired hunks are combined into a final, clean unified diff string. 68 | 69 | **Key aspects of `DiffRepair` from the code (`diff_repair.py`):** 70 | * Handles both standard diffs with `@@ ... @@` headers and non-standard diffs. 71 | * Normalizes lines (stripping whitespace, lowercasing) for more robust matching. 72 | * Uses a combination of exact and fuzzy matching techniques. 73 | * The `repair()` method orchestrates the overall process for a given diff. 74 | * Filters overlapping hunks based on the significance of changes. 75 | 76 | The paper's Table 3, showing patch application success rates before and after repair, highlights the importance of `DiffRepair`. 77 | 78 | --- 79 | 80 | Next: [Evaluation Pipeline: Syntactic & Semantic Checks](./03_4_apebench_evaluation.md) 81 | 82 | 83 | 84 | ## 中文翻译 (Chinese Translation) 85 | 86 | # 3.3 LLM 推理与 DiffRepair 87 | 88 | 本节涵盖使用大型语言模型 (LLM) 生成补丁的过程以及对这些补丁进行后处理的 `DiffRepair` 实用程序。相关代码位于 `src/apebench/inference/` 及其子模块中,例如 `src/apebench/inference/utils/diff_repair.py`。 89 | 90 | ## LLM 推理过程 91 | 92 | LLM 在 APE-Bench I 中的核心任务是根据 `Instruction` 生成一个补丁(统一差异格式),以转换给定的 `PreFile`。 93 | 94 | 1. **入口点**:推理的主要入口点是 `src/apebench/inference/run_inference.py`,它支持多个流程,包括: 95 | * `patch` 流程:根据指令生成修补后的代码 96 | * `judgement` 流程:使用作为裁判的 LLM 评估补丁 97 | * `instruction` 流程:生成自然语言指令 98 | * 每个流程都在 `src/apebench/inference/inference_pipelines/` 中实现为专门的类 99 | 100 | 2. **提示构建**:为每个任务构建目标 LLM 的提示。这包括: 101 | * `Instruction` (自然语言命令)。 102 | * `PreFile` (编辑前的完整 Lean 代码)。 103 | * 格式化指令,以指导 LLM 输出结构正确的补丁。 104 | * 提示模板针对每种流程类型,并在 `src/apebench/inference/prompts/` 下的独立模块中定义。 105 | 106 | 3. **模型调用**:推理框架支持各种 LLM 提供商: 107 | * `src/apebench/inference/inference_pipelines/` 中的流程类处理不同 API 的 API 身份验证、请求格式化和响应解析。 108 | * `src/apebench/inference/run_inference.py` 中的 `select_pipeline` 函数将流程类型映射到它们各自的流程类。 109 | * 诸如 `temperature`、`max_tokens` 和 `n_responses`(用于对多个候选进行采样)等关键参数会传递给相应的 API。 110 | 111 | 4. **输出处理**: 112 | * 解析原始 LLM 输出以提取生成的补丁。 113 | * 对于 `patch` 流程,对提取的补丁应用 `DiffRepair`(见下文)。 114 | * 处理后的输出以结构化格式保存到指定的输出文件中。 115 | 116 | 5. **并行性**:处理过程分布在多个工作进程中(使用 `ProcessPoolExecutor`)以加速大型数据集的推理,由 `--max_workers` 参数控制。 117 | 118 | 运行推理的命令如下所示: 119 | ```bash 120 | python -m src.apebench.inference.run_inference \ 121 | --pipeline patch \ 122 | --input_file /path/to/tasks.jsonl \ 123 | --output_file /path/to/results.jsonl \ 124 | --model_name gpt-4o \ 125 | --temperature 0.8 \ 126 | --n_responses 20 \ 127 | --max_workers 4 128 | ``` 129 | 130 | ## DiffRepair:容错补丁恢复 131 | 132 | LLM 生成的差异通常是"嘈杂的"——它们具有不正确的行号、未对齐的上下文行或格式问题,从而阻止使用标准 `patch` 实用程序将其干净地应用。`DiffRepair` 是为解决此问题而设计的关键组件。 133 | 134 | * **位置**:`src/apebench/inference/utils/diff_repair.py` 135 | * **目的**:将模型生成的嘈杂差异转换为干净、结构一致且可应用的补丁,同时尽可能保留编辑的原始意图。 136 | * **论文提及**:第 5.1 节(补丁规范化)和附录 A。 137 | 138 | **DiffRepair 工作流程(如论文附录 A 所述):** 139 | 140 | 1. **Hunk 解析**:将输入的差异文本解析为单独的"Hunk"(更改段)。 141 | 2. 
**意图定位(模糊匹配)**:对于每个 Hunk,`DiffRepair` 尝试在 `PreFile` 中找到更改意图的正确区域。这是一个关键步骤,涉及: 142 | * 比较 Hunk 中的上下文行与 `PreFile` 中的行。 143 | * 使用模糊匹配算法(例如,Levenshtein 距离、序列匹配)来容忍微小的差异。 144 | * `diff_repair.py` 中的 `_find_candidate_region_exact` 和 `_find_best_region_with_dp` 方法实现了复杂的匹配逻辑,包括动态规划。 145 | 3. **补丁重建**:一旦定位到目标区域,`DiffRepair` 会重建一个干净的差异 Hunk: 146 | * 相对于从 `PreFile` 中正确识别的上下文,将添加和删除的行重新对齐到结构有效的位置。 147 | * 扩充缺失的上下文行以满足统一差异格式的约束。 148 | * 解决行号偏移和潜在的 Hunk 重叠。 149 | 4. **最终差异生成**:将修复后的 Hunk 组合成最终的、干净的统一差异字符串。 150 | 151 | **代码中 `DiffRepair` (`diff_repair.py`) 的关键方面:** 152 | * 处理带有 `@@ ... @@` 标头的标准差异和非标准差异。 153 | * 规范化行(去除空白、小写化)以实现更稳健的匹配。 154 | * 结合使用精确匹配和模糊匹配技术。 155 | * `repair()` 方法协调给定差异的整个过程。 156 | * 筛选重叠的 Hunk 基于更改的重要性。 157 | 158 | --- 159 | 160 | 下一节: [评估流程:语法与语义检查](./03_4_apebench_evaluation.md) -------------------------------------------------------------------------------- /src/eleanstic/core/file_map.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | """ 4 | File Mapping Manager Module 5 | Responsible for storing and retrieving file mapping relationships for each commit 6 | 7 | Uses a compact binary format to store file mappings, reducing disk space usage. 8 | Each file mapping record contains only relative path, file hash, and file type information. 9 | """ 10 | import os 11 | import struct 12 | import shutil 13 | import hashlib 14 | import traceback 15 | 16 | class FileMapManager: 17 | """ 18 | File Mapping Manager, responsible for storing and retrieving file mapping relationships for each commit 19 | 20 | Uses binary file-based storage instead of JSON to reduce disk space usage 21 | """ 22 | def __init__(self, storage_dir="storage", maps_dir="file_maps"): 23 | """Initialize file mapping manager 24 | 25 | Args: 26 | storage_dir: File content storage directory 27 | maps_dir: File mapping storage directory 28 | """ 29 | self.storage_dir = storage_dir 30 | self.maps_dir = maps_dir 31 | 32 | # Ensure directories exist 33 | os.makedirs(self.storage_dir, exist_ok=True) 34 | os.makedirs(self.maps_dir, exist_ok=True) 35 | 36 | def get_map_path(self, commit_id): 37 | """Get mapping file path for specified commit 38 | 39 | Args: 40 | commit_id: Commit ID 41 | 42 | Returns: 43 | Complete path to the mapping file 44 | """ 45 | return os.path.join(self.maps_dir, f"{commit_id}.bin") 46 | 47 | def store_file_mapping(self, commit_id, file_mappings): 48 | """Store commit file mappings, using binary format 49 | 50 | File format: 51 | - 4 bytes: Record count (unsigned int) 52 | - For each record: 53 | - 2 bytes: Path length (unsigned short) 54 | - 32 bytes: SHA-256 hash 55 | - 1 byte: File type (0: regular file, 1: symlink) 56 | - Variable length: Relative path string (UTF-8 encoded) 57 | 58 | Args: 59 | commit_id: Commit ID 60 | file_mappings: {relative_path: {"hash": file_hash, "type": file_type}} 61 | 62 | Returns: 63 | True on success, False on failure 64 | """ 65 | map_path = self.get_map_path(commit_id) 66 | 67 | try: 68 | with open(map_path, 'wb') as f: 69 | # Write record count 70 | f.write(struct.pack('!I', len(file_mappings))) 71 | 72 | # Write each record 73 | for rel_path, file_info in file_mappings.items(): 74 | path_bytes = rel_path.encode('utf-8') 75 | path_len = len(path_bytes) 76 | 77 | # Convert hash from hex string to binary 78 | hash_bin = bytes.fromhex(file_info["hash"]) 79 | 80 | # File type: 0 for regular file, 1 for symlink 81 | file_type = 1 if file_info["type"] == "symlink" else 0 82 | 
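                    # Fixed 35-byte record header, big-endian: H = 2-byte path length,
                    # 32s = raw SHA-256 digest, B = 1-byte type flag (0 regular, 1 symlink);
                    # the UTF-8 path bytes follow immediately, matching the 35-byte header
                    # reads performed in get_file_mapping below.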
83 | # Write record header 84 | f.write(struct.pack('!H32sB', path_len, hash_bin, file_type)) 85 | 86 | # Write path string 87 | f.write(path_bytes) 88 | 89 | return True 90 | except Exception as e: 91 | print(f"Failed to store file mapping: {traceback.format_exc()}") 92 | return False 93 | 94 | def get_file_mapping(self, commit_id): 95 | """Get commit file mappings, reading from binary format 96 | 97 | Args: 98 | commit_id: Commit ID 99 | 100 | Returns: 101 | File mapping dictionary, or empty dictionary if not found 102 | """ 103 | map_path = self.get_map_path(commit_id) 104 | 105 | if os.path.exists(map_path): 106 | try: 107 | with open(map_path, 'rb') as f: 108 | # Read record count 109 | record_count_data = f.read(4) 110 | if not record_count_data: 111 | return {} 112 | 113 | record_count = struct.unpack('!I', record_count_data)[0] 114 | 115 | # Read all records 116 | file_mappings = {} 117 | for _ in range(record_count): 118 | # Read record header 119 | header_data = f.read(35) # 2(path_len) + 32(hash) + 1(type) = 35 bytes 120 | if not header_data or len(header_data) < 35: 121 | break 122 | 123 | path_len, hash_bin, file_type = struct.unpack('!H32sB', header_data) 124 | 125 | # Read path string 126 | path_data = f.read(path_len) 127 | if not path_data or len(path_data) < path_len: 128 | break 129 | 130 | rel_path = path_data.decode('utf-8') 131 | 132 | # Convert hash to hex string 133 | file_hash = hash_bin.hex() 134 | 135 | # Convert file type 136 | type_str = "symlink" if file_type == 1 else "regular" 137 | 138 | # Store in mapping dictionary 139 | file_mappings[rel_path] = { 140 | "hash": file_hash, 141 | "type": type_str 142 | } 143 | 144 | return file_mappings 145 | except Exception as e: 146 | print(f"Failed to read file mapping: {traceback.format_exc()}") 147 | 148 | return {} 149 | 150 | def get_storage_path(self, file_hash): 151 | """Get storage path based on file hash 152 | 153 | Args: 154 | file_hash: File content hash 155 | 156 | Returns: 157 | Complete path to the file in storage system 158 | """ 159 | # Use first 4 digits of hash for two-level directory 160 | return os.path.join(self.storage_dir, file_hash[:2], file_hash[2:4], file_hash) 161 | 162 | def compute_file_hash(self, filepath): 163 | """Calculate file hash 164 | 165 | Args: 166 | filepath: File path 167 | 168 | Returns: 169 | SHA256 hash of the file 170 | """ 171 | hasher = hashlib.sha256() 172 | 173 | if os.path.islink(filepath): 174 | # For symlinks, hash the target path 175 | target = os.readlink(filepath) 176 | hasher.update(target.encode()) 177 | else: 178 | # For regular files, hash the content 179 | with open(filepath, 'rb') as f: 180 | for chunk in iter(lambda: f.read(4096), b''): 181 | hasher.update(chunk) 182 | 183 | return hasher.hexdigest() 184 | 185 | def restore_file(self, dest_path, file_hash, file_type): 186 | """Restore file from storage system 187 | 188 | Args: 189 | dest_path: Target file path 190 | file_hash: File hash 191 | file_type: File type ("regular" or "symlink") 192 | 193 | Returns: 194 | On success returns (True, message), on failure returns (False, error_message) 195 | """ 196 | storage_path = self.get_storage_path(file_hash) 197 | 198 | if not os.path.exists(storage_path): 199 | return False, f"File does not exist in storage: {storage_path}" 200 | 201 | # Ensure target directory exists 202 | os.makedirs(os.path.dirname(dest_path), exist_ok=True) 203 | 204 | # If target already exists, delete it first 205 | if os.path.exists(dest_path): 206 | if os.path.islink(dest_path) or not 
os.path.isdir(dest_path): 207 | os.remove(dest_path) 208 | 209 | try: 210 | if file_type == "symlink": 211 | # Restore symlink 212 | with open(storage_path, 'r') as f: 213 | link_target = f.read() 214 | os.symlink(link_target, dest_path) 215 | else: 216 | # Restore regular file 217 | shutil.copy2(storage_path, dest_path) 218 | return True, f"File restored successfully: {dest_path}" 219 | except Exception as e: 220 | return False, f"Failed to restore file {dest_path}: {traceback.format_exc()}" -------------------------------------------------------------------------------- /src/apebench/inference/inference_pipelines/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | import pandas as pd 4 | import json 5 | import os 6 | import logging 7 | import time 8 | import traceback 9 | from datetime import datetime 10 | from abc import ABC, abstractmethod 11 | 12 | from ..utils import process_with_retries 13 | from ..utils import chat 14 | from ....utils.file_utils import load_jsonl, convert_to_serializable 15 | import random 16 | 17 | class BasePipeline(ABC): 18 | """ 19 | Base class for data processing pipelines that interact with AI models. 20 | 21 | This abstract class provides common functionality for: 22 | - Loading and processing input data 23 | - Handling results and errors 24 | - Logging and output management 25 | """ 26 | 27 | def __init__(self, args): 28 | """Initialize with command-line arguments""" 29 | self.args = args 30 | # Set default timestamp if not provided 31 | if not hasattr(self.args, 'timestamp') or self.args.timestamp is None: 32 | self.args.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') 33 | self.setup_logging() 34 | self.print_script_configuration() 35 | 36 | self.system_prompt = None 37 | 38 | def setup_logging(self): 39 | """Configure logging based on arguments""" 40 | os.makedirs(self.args.log_dir, exist_ok=True) 41 | log_file = f'{self.args.log_dir}/{self.args.pipeline}/{self.args.timestamp}_{self.args.model_name}_{self.args.temperature}.log' 42 | os.makedirs(os.path.dirname(log_file), exist_ok=True) 43 | logging.basicConfig( 44 | level=logging.INFO, 45 | format='%(asctime)s - %(levelname)s - %(message)s', 46 | filename=log_file 47 | ) 48 | 49 | 50 | def print_script_configuration(self): 51 | print("\nScript Configuration:") 52 | print("---------------------") 53 | for arg, value in vars(self.args).items(): 54 | print(f"{arg}: {value}") 55 | print("---------------------\n") 56 | 57 | def load_data(self): 58 | """ 59 | Load data from input file with support for multiple formats 60 | 61 | Returns: 62 | pd.DataFrame: Loaded data 63 | """ 64 | if self.args.input_file.endswith('.parquet'): 65 | data = pd.read_parquet(self.args.input_file) 66 | elif self.args.input_file.endswith('.json'): 67 | data = pd.read_json(self.args.input_file, orient='records', lines=True) 68 | elif self.args.input_file.endswith('.jsonl'): 69 | data = load_jsonl(self.args.input_file) 70 | data = pd.DataFrame(data) 71 | else: 72 | raise ValueError(f"Unsupported file type: {self.args.input_file}") 73 | 74 | return data 75 | 76 | @abstractmethod 77 | def get_input(self, row): 78 | """Get input text for a row""" 79 | pass 80 | 81 | def initialize_metadata(self, row): 82 | """Initialize metadata for a row""" 83 | return {} 84 | 85 | def update_metadata_per_response(self, metadata, parsed_response): 86 | """Update metadata with response""" 87 | return metadata 88 | 89 | def 
update_metadata_per_row(self, metadata, responses): 90 | """Update metadata with responses""" 91 | return metadata 92 | 93 | def early_stop(self, metadata, responses): 94 | """Early stop if the metadata is good or bad enough""" 95 | return False 96 | 97 | def parse_response(self, response, row): 98 | """Parse the response from the GPT model""" 99 | return {} 100 | 101 | def process_row(self, row): 102 | """ 103 | Process a single row of data. 104 | 105 | Args: 106 | row (pd.Series): The row to process 107 | 108 | Returns: 109 | Dict or None: Processing result or None if processing failed 110 | """ 111 | try: 112 | row_dict = row.to_dict() 113 | row_dict = convert_to_serializable(row_dict) 114 | row_dict['local_index'] = row.name 115 | input_text = self.get_input(row_dict) 116 | 117 | responses = [] 118 | metadata = self.initialize_metadata(row_dict) 119 | for _ in range(self.args.n_responses): 120 | response = None 121 | try: 122 | response = chat( 123 | prompt=input_text, 124 | system_prompt=self.system_prompt, 125 | model_name=self.args.model_name, 126 | temperature=self.args.temperature, 127 | max_tokens=self.args.max_tokens, 128 | thinking_budget_tokens=self.args.thinking_budget_tokens 129 | ) 130 | parsed_response = self.parse_response(response['choices'][0]['message']['content'], row_dict) 131 | if parsed_response is not None: 132 | response['inference_params'].update({ 133 | 'temperature': self.args.temperature, 134 | 'n_responses': self.args.n_responses 135 | }) 136 | parsed_response.update({ 137 | 'raw_response': response['choices'][0], 138 | 'model': self.args.model_name, 139 | 'usage': response['usage'], 140 | 'inference_params': response['inference_params'] 141 | }) 142 | metadata = self.update_metadata_per_response(metadata, parsed_response) 143 | responses.append(parsed_response) 144 | if self.early_stop(metadata, responses): 145 | break 146 | except Exception as e: 147 | logging.error(f"Error processing row {row.name}: {traceback.format_exc()}") 148 | responses.append(response) 149 | time.sleep(random.randint(1, 5)) 150 | continue 151 | metadata = self.update_metadata_per_row(metadata, responses) 152 | return { 153 | **row_dict, 154 | **metadata, 155 | 'responses': responses 156 | } 157 | except Exception as e: 158 | logging.error(f"Error processing row {row.name}: {traceback.format_exc()}") 159 | time.sleep(random.randint(1, 5)) 160 | return None 161 | 162 | @property 163 | def special_config(self): 164 | return '' 165 | 166 | def process_data(self): 167 | """ 168 | Process all data with automatic retries for failures 169 | 170 | Returns: 171 | Tuple[int, int, List]: (processed_count, error_count, failed_indices) 172 | """ 173 | # Load data 174 | data = self.load_data() 175 | 176 | # Generate output file path if not provided 177 | if not hasattr(self.args, 'output_file') or self.args.output_file is None: 178 | _input_file_name = os.path.splitext(os.path.basename(self.args.input_file))[0] 179 | self.args.output_file = '/'.join([ 180 | self.args.output_dir, 181 | self.args.pipeline, 182 | f'{self.args.timestamp}__{_input_file_name}__{self.args.model_name}__{self.args.temperature}{self.special_config}.jsonl' 183 | ]) 184 | os.makedirs(os.path.dirname(self.args.output_file), exist_ok=True) 185 | 186 | print(f"Results will be saved to {self.args.output_file}") 187 | 188 | # Prepare configuration information dictionary 189 | config_info = { 190 | 'model_name': self.args.model_name, 191 | 'temperature': self.args.temperature, 192 | 'n_responses': self.args.n_responses 193 | } 194 
| 195 | # Process with automatic retries 196 | total_processed, total_errors, final_missing = process_with_retries( 197 | data=data, 198 | process_func=self.process_row, 199 | output_file=self.args.output_file, 200 | max_workers=self.args.max_workers, 201 | max_retries=self.args.max_retries, 202 | config_info=config_info 203 | ) 204 | 205 | # Save permanently failed indices if any 206 | if final_missing: 207 | os.makedirs('temp', exist_ok=True) 208 | missing_file = f'temp/missing_{self.args.pipeline}_{self.args.model_name}_{self.args.timestamp}.json' 209 | with open(missing_file, 'w') as f: 210 | json.dump({'missing_indices': final_missing}, f) 211 | logging.info(f"Saved {len(final_missing)} permanently failed indices to {missing_file}") 212 | 213 | logging.info(f"Final processing statistics - Successfully processed: {total_processed}, Total errors: {total_errors}") 214 | 215 | return total_processed, total_errors, final_missing -------------------------------------------------------------------------------- /src/apebench/inference/utils/call_api.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | 3 | import openai 4 | import logging 5 | import time 6 | import uuid 7 | from typing import Dict, Optional, Any 8 | from tenacity import retry, stop_after_attempt, wait_exponential, wait_random, wait_combine, retry_if_exception_type 9 | from ..utils.chat_logger import ChatLogger 10 | from ..utils.api_keys import * 11 | 12 | # List of supported model categories 13 | TOTAL_MODELS = ( 14 | 'o1', 'o3-mini', 'deepseek-r1-250120', 'aws_sdk_claude37_sonnet@thinking', 15 | 'gpt-4o-2024-08-06', 'gpt-4o-2024-11-20', 'aws_claude35_sdk_sonnet_v2', 16 | 'aws_sdk_claude37_sonnet', 'deepseek-v3-250324', 17 | 'deepseek-v3', 18 | 'doubao-1-5-pro-32k-250115', 19 | 'gpt-4.5-preview-2025-02-27' 20 | ) 21 | 22 | FORMAL_NAMES = { 23 | 'o1': 'OpenAI o1', 24 | 'o3-mini': 'OpenAI o3-mini', 25 | 'deepseek-r1-250120': 'DeepSeek R1', 26 | 'aws_sdk_claude37_sonnet@thinking': 'Claude 3.7 Sonnet (thinking)', 27 | 'gpt-4o-2024-08-06': 'GPT-4o', 28 | 'gpt-4o-2024-11-20': 'GPT-4o', 29 | 'aws_claude35_sdk_sonnet_v2': 'Claude 3.5 Sonnet', 30 | 'aws_sdk_claude37_sonnet': 'Claude 3.7 Sonnet', 31 | 'deepseek-v3-250324': 'DeepSeek V3 (0324)', 32 | 'deepseek-v3': 'DeepSeek V3', 33 | 'doubao-1-5-pro-32k-250115': 'Doubao 1.5 Pro', 34 | 'gpt-4.5-preview-2025-02-27': 'GPT-4.5', 35 | 'gemini-2.5-pro-preview-03-25': 'Gemini 2.5 Pro Preview', 36 | } 37 | 38 | REASONING_MODELS = ( 39 | 'o1', 'o3-mini', 'deepseek-r1-250120', 'aws_sdk_claude37_sonnet@thinking' 40 | ) 41 | 42 | UNSUPPORT_TEMPERATURE_MODELS = ( 43 | 'o3-mini', 'aws_sdk_claude37_sonnet@thinking' 44 | ) 45 | 46 | forbidden_params = { 47 | 'o3-mini': ['temperature'], 48 | 'aws_sdk_claude37_sonnet@thinking': ['temperature'], 49 | } 50 | 51 | def generate_logid() -> str: 52 | """ 53 | Generate a unique log ID 54 | 55 | Returns: 56 | str: UUID format unique ID 57 | """ 58 | return str(uuid.uuid4()) 59 | 60 | 61 | def create_client(model_name: str): 62 | """Create an appropriate client""" 63 | if 'deepseek' in model_name or 'doubao' in model_name: 64 | return openai.OpenAI( 65 | api_key=volces_api_key, 66 | base_url=volces_base_url, 67 | ) 68 | elif 'claude' in model_name: 69 | return openai.AzureOpenAI( 70 | azure_endpoint=aws_claude_base_url, 71 | api_version="2024-03-01-preview", 72 | api_key=aws_claude_api_key, 73 | ) 74 | elif 'gemini' in model_name: 75 | return openai.AzureOpenAI( 76 | 
azure_endpoint=google_base_url, 77 | api_version="2024-03-01-preview", 78 | api_key=google_api_key, 79 | ) 80 | else: 81 | return openai.AzureOpenAI( 82 | azure_endpoint=openai_base_url, 83 | api_version="2024-03-01-preview", 84 | api_key=openai_api_key, 85 | ) 86 | 87 | 88 | def prepare_inference_params( 89 | client: openai.OpenAI, 90 | model_name: str, 91 | messages: list, 92 | logid: str, 93 | temperature: float = 0.0, 94 | max_tokens: int = 8000, 95 | thinking_budget_tokens: int = 16000, 96 | reasoning_effort: str = 'high' 97 | ) -> Dict[str, Any]: 98 | """Prepare parameters for completion request""" 99 | params = { 100 | "model": model_name, 101 | "messages": messages, 102 | "temperature": temperature, 103 | "max_tokens": max_tokens, 104 | "extra_headers": {"X-TT-LOGID": logid}, 105 | } 106 | 107 | # Add thinking mode for Claude models 108 | if '@thinking' in model_name: 109 | params["model"] = model_name.replace('@thinking', '') 110 | params["temperature"] = 1.0 111 | params["extra_body"] = { 112 | "thinking": { 113 | "type": "enabled", 114 | "budget_tokens": thinking_budget_tokens 115 | } 116 | } 117 | params["max_tokens"] += thinking_budget_tokens 118 | 119 | # Add reasoning effort for o1 models 120 | # if model_name == 'o3-mini': 121 | # params["reasoning_effort"] = reasoning_effort 122 | 123 | if model_name in forbidden_params: 124 | for param in forbidden_params[model_name]: 125 | params.pop(param, None) 126 | return params 127 | 128 | 129 | @retry( 130 | stop=stop_after_attempt(5), # Retry up to 5 times 131 | wait=wait_combine( 132 | wait_exponential(multiplier=1, min=1, max=60), # Base exponential backoff: 1s, 2s, 4s, 8s, 16s 133 | wait_random(0, 2) # Add random jitter between 0-2 seconds 134 | ), 135 | retry=retry_if_exception_type((Exception,)), # Retry all exceptions 136 | reraise=True # Re-raise the exception at the end 137 | ) 138 | def execute_completion(client: openai.OpenAI, params: Dict[str, Any]): 139 | """Execute request with retry logic and jitter""" 140 | try: 141 | return client.chat.completions.create(**params) 142 | except Exception as e: 143 | logging.error(f"API call failed: {str(e)}") 144 | raise 145 | 146 | 147 | def chat( 148 | prompt: str, 149 | system_prompt: Optional[str] = None, 150 | model_name: str = 'gpt-4o-2024-08-06', 151 | print_result: bool = False, 152 | temperature: float = 0.0, 153 | n: int = 1, 154 | max_tokens: int = 8000, 155 | thinking_budget_tokens: int = 6000, 156 | logid: Optional[str] = None, 157 | log_chat: bool = True, 158 | **kwargs 159 | ) -> Dict[str, Any]: 160 | """ 161 | Generate conversational responses using specified model 162 | 163 | Parameters: 164 | prompt: User prompt text 165 | system_prompt: System prompt text (optional) 166 | model_name: Model name 167 | print_result: Whether to print results 168 | temperature: Sampling temperature 169 | n: Must be 1, otherwise throws an error 170 | max_tokens: Maximum tokens to generate 171 | logid: Custom log ID, automatically generated if None 172 | 173 | Returns: 174 | Dict: Model response result 175 | """ 176 | # Validate n parameter 177 | if n != 1: 178 | raise ValueError("This implementation only supports n=1, multiple sampling has been removed to simplify code") 179 | 180 | # Generate or use provided logid 181 | if logid is None: 182 | logid = generate_logid() 183 | 184 | # Initialize chat logger and timing 185 | chat_logger = ChatLogger() 186 | start_time = time.time() 187 | 188 | # Create message list 189 | messages = [{"role": "user", "content": prompt}] 190 | if 
system_prompt: 191 | if model_name.startswith('o'): 192 | messages = [{"role": "user", "content": system_prompt + "\n\n\n\n" + prompt}] 193 | else: 194 | messages.insert(0, {"role": "system", "content": system_prompt}) 195 | 196 | # Create appropriate client 197 | client = create_client(model_name) 198 | 199 | # Prepare API call parameters 200 | params = prepare_inference_params(client, model_name, messages, logid, temperature, max_tokens, thinking_budget_tokens) 201 | 202 | # Execute API call (with automatic retry) 203 | try: 204 | completion = execute_completion(client, params) 205 | result = completion.model_dump() 206 | result['inference_params'] = params 207 | 208 | # Calculate response time 209 | response_time = time.time() - start_time 210 | 211 | except Exception as e: 212 | logging.error(f"Request failed [logid: {logid}]: {str(e)}") 213 | raise 214 | 215 | # Print results (if needed) 216 | if print_result: 217 | print(f"LogID: {logid}") 218 | print(completion.model_dump_json()) 219 | print('\n\n--------------------------------\n\n') 220 | print(completion.choices[0].message.content) 221 | print('\n--------------------------------\n') 222 | print(f"Time taken: {response_time:.2f} seconds") 223 | 224 | # Log chat interaction 225 | if log_chat: 226 | chat_logger.log_chat( 227 | prompt=prompt, 228 | completion=result, 229 | model_name=model_name, 230 | system_prompt=system_prompt 231 | ) 232 | 233 | return result 234 | 235 | 236 | if __name__ == "__main__": 237 | system_prompt = None 238 | prompt = "What is the capital of France?" 239 | # model_name = "deepseek-r1-250120" 240 | model_name = "gemini-2.5-pro-preview-03-25" 241 | # model_name = "o3-mini" 242 | 243 | # Using automatically generated logid 244 | result = chat( 245 | prompt=prompt, 246 | system_prompt=system_prompt, 247 | model_name=model_name, 248 | print_result=True, 249 | n=1 250 | ) 251 | 252 | # Or using custom logid 253 | # custom_logid = f"api_call_{int(time.time())}" 254 | # result = chat( 255 | # prompt=prompt, 256 | # system_prompt=system_prompt, 257 | # model_name=model_name, 258 | # print_result=True, 259 | # n=1, 260 | # logid=custom_logid 261 | # ) -------------------------------------------------------------------------------- /src/apebench/evaluation_pipelines/evaluation_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 
2 | """ 3 | Evaluation management module responsible for executing the patch evaluation process 4 | """ 5 | 6 | import os 7 | import subprocess 8 | import json 9 | from datetime import datetime 10 | from typing import Dict, List, Any, Optional 11 | from ...utils import load_results, save_jsonl, load_jsonl 12 | from ..utils import ProgressTracker, calculate_metrics, plot_metrics, extract_judgement_data 13 | 14 | def filter_verified_data(merged_file: str) -> List[Dict[str, Any]]: 15 | """ 16 | Filter data that passed verification from the merged file 17 | 18 | Args: 19 | merged_file: Path to the merged results file 20 | 21 | Returns: 22 | List of verified data 23 | """ 24 | # Load merged data 25 | with open(merged_file, 'r') as f: 26 | merged_data = [json.loads(line) for line in f if line.strip()] 27 | 28 | # Filter items that passed verification 29 | verified_data = [] 30 | 31 | for item in merged_data: 32 | verified_responses = [] 33 | 34 | for response in item.get('responses', []): 35 | # Check verification result 36 | if response.get('verification_result', {}).get('complete', False): 37 | verified_responses.append(response) 38 | 39 | if verified_responses: 40 | # Create new item containing only verified responses 41 | verified_item = item.copy() 42 | verified_item['responses'] = verified_responses 43 | verified_data.append(verified_item) 44 | 45 | return verified_data 46 | 47 | def flatten_results(results): 48 | """ 49 | Flatten verification results 50 | """ 51 | flattened_results = [] 52 | for result in results: 53 | for response in result.get('responses', []): 54 | if response is not None and response.get('verification_result', {}).get('complete', False): 55 | if not 'best_gen_patch' in response: 56 | best_gen_patch = response['gen_patch'] 57 | if 'gen_patch_after_exact_repair' in response: 58 | best_gen_patch = response['gen_patch_after_exact_repair'] 59 | if 'gen_patch_after_robust_repair' in response: 60 | best_gen_patch = response['gen_patch_after_robust_repair'] 61 | response['best_gen_patch'] = best_gen_patch 62 | else: 63 | best_gen_patch = response['best_gen_patch'] 64 | flattened_result = result.copy() 65 | flattened_result['best_gen_patch'] = best_gen_patch 66 | # flattened_result['patch_generation_responses'] = flattened_result.pop('responses') 67 | flattened_result.update({k : response[k] for k in ('model', 'usage', 'inference_params', 'verification_result', 'best_gen_content', 'best_gen_patch', 'best_gen_patch_comment_free')}) 68 | flattened_result['raw_patch_generation_responses'] = response['raw_response'] 69 | flattened_results.append(flattened_result) 70 | return flattened_results 71 | 72 | def evaluate_patches(config_file: str, merged_results_file: Optional[str] = None) -> Dict[str, Any]: 73 | """ 74 | Evaluate the quality of verified patches 75 | 76 | Args: 77 | config_file: Path to configuration file 78 | merged_results_file: Optional path to merged results file 79 | 80 | Returns: 81 | Evaluation metrics 82 | """ 83 | # Import here instead of at the top to avoid circular imports 84 | from ..config.config_manager import ConfigManager 85 | 86 | # Load configuration 87 | config = ConfigManager(config_file).get_config() 88 | 89 | # Initialize progress tracker 90 | progress_tracker = ProgressTracker(config.progress_log) 91 | 92 | print(f"Running patch evaluation with configuration from: {config_file}") 93 | 94 | # Check if evaluation is already completed 95 | evaluation_status = progress_tracker.get_evaluation_status() 96 | if evaluation_status.get("completed", False): 
97 | print("Evaluation already completed") 98 | verification_status = progress_tracker.get_verification_status() 99 | verification_metrics = verification_status.get("metrics", {}) 100 | judgement_status = progress_tracker.get_evaluation_status() 101 | judgement_metrics = judgement_status.get("metrics", {}) 102 | return verification_metrics, judgement_metrics 103 | 104 | # If no merged results file is provided, get it from the progress record 105 | if not merged_results_file: 106 | verification_status = progress_tracker.get_verification_status() 107 | if verification_status.get("completed", False): 108 | merged_results_file = verification_status.get("merged_results", "") 109 | else: 110 | raise ValueError("Verification has not been completed. Run verify_patches first.") 111 | 112 | if not merged_results_file or not os.path.exists(merged_results_file): 113 | raise ValueError(f"Merged results file not found: {merged_results_file}") 114 | 115 | print(f"Using merged results file: {merged_results_file}") 116 | 117 | # Create temporary and output directories 118 | os.makedirs(config.temp_dir, exist_ok=True) 119 | os.makedirs(config.output_dir, exist_ok=True) 120 | 121 | # Create timestamp 122 | timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') 123 | 124 | # 1. Flatten verification results 125 | print("Flattening verification results...") 126 | merged_results = load_jsonl(merged_results_file) 127 | flattened_results = flatten_results(merged_results) 128 | flattened_results_file = f"{config.temp_dir}/flattened_verification_{timestamp}.jsonl" 129 | save_jsonl(flattened_results, flattened_results_file) 130 | print(f"Flattened {len(flattened_results)} results saved to: {flattened_results_file}") 131 | 132 | # 2. Run judgment generation pipeline 133 | print("Running judgement generation...") 134 | judgement_output_file = f"{config.output_dir}/judgement_{timestamp}.jsonl" 135 | 136 | judgement_cmd = [ 137 | "python", "-m", "src.apebench.inference.run_inference", 138 | "--pipeline", "judgement", 139 | "--input_file", flattened_results_file, 140 | "--output_file", judgement_output_file, 141 | "--model_name", config.judgement.model_name, 142 | "--temperature", str(config.judgement.temperature), 143 | "--n_responses", str(config.judgement.n_responses), 144 | "--max_workers", str(config.judgement.max_workers) 145 | ] 146 | 147 | if hasattr(config.judgement, 'max_tokens') and config.judgement.max_tokens: 148 | judgement_cmd.append("--max_tokens") 149 | judgement_cmd.append(str(config.judgement.max_tokens)) 150 | 151 | if hasattr(config.judgement, 'thinking_budget_tokens') and config.judgement.thinking_budget_tokens: 152 | judgement_cmd.append("--thinking_budget_tokens") 153 | judgement_cmd.append(str(config.judgement.thinking_budget_tokens)) 154 | 155 | print(f"Executing: {' '.join(judgement_cmd)}") 156 | subprocess.run(judgement_cmd, check=True) 157 | 158 | # 3. Collect and filter judgment results 159 | print("Filtering judgement results...") 160 | filtered_judgement_file = f"{config.output_dir}/filtered_judgement_{timestamp}.jsonl" 161 | 162 | filter_cmd = [ 163 | "python", "-m", "src.apebench.evaluation_pipelines.gather_results", 164 | "--pipeline", "judgement", 165 | "--input_files", judgement_output_file, 166 | "--output_file", filtered_judgement_file, 167 | ] 168 | 169 | print(f"Executing: {' '.join(filter_cmd)}") 170 | subprocess.run(filter_cmd, check=True) 171 | 172 | # 4. 
Calculate final evaluation metrics (using modified gather_results implementation) 173 | print("Calculating final evaluation metrics...") 174 | judgement_data = extract_judgement_data(filtered_judgement_file) 175 | metrics = calculate_metrics(judgement_data, config) 176 | 177 | # 5. Generate visualizations 178 | if hasattr(config.evaluation, 'generate_plots') and config.evaluation.generate_plots: 179 | print("Generating judgement metric plots...") 180 | plots_dir = getattr(config.evaluation, 'plots_dir', './judgement_plots') 181 | os.makedirs(plots_dir, exist_ok=True) 182 | plot_metrics(metrics, plots_dir, f'judgement_{timestamp}') 183 | 184 | # 6. Save metrics 185 | metrics_file = f"{config.output_dir}/judgement_metrics_{timestamp}.json" 186 | with open(metrics_file, 'w') as f: 187 | json.dump(metrics, f, indent=2) 188 | 189 | # 7. Update progress tracking 190 | evaluation_status = { 191 | "completed": True, 192 | "timestamp": timestamp, 193 | "judgement_output": judgement_output_file, 194 | "filtered_judgement": filtered_judgement_file, 195 | "metrics_file": metrics_file, 196 | "metrics": metrics 197 | } 198 | 199 | progress_tracker.update_evaluation_status(evaluation_status) 200 | 201 | print(f"Evaluation completed. Results saved to: {metrics_file}") 202 | 203 | # 9. Reload verification metrics 204 | verification_status = progress_tracker.get_verification_status() 205 | verification_metrics = verification_status.get("metrics", {}) 206 | 207 | return verification_metrics, metrics -------------------------------------------------------------------------------- /docs/04_troubleshooting.md: -------------------------------------------------------------------------------- 1 | [English](#english-version) | [中文](#chinese-version) 2 | 3 | 4 | # 4. Troubleshooting 5 | 6 | This section lists common issues encountered during setup or execution and provides potential solutions. 7 | 8 | ## Eleanstic Setup Issues 9 | 10 | * **Issue**: Eleanstic preprocessing fails or takes an extremely long time. 11 | * **Cause**: Insufficient disk space for Mathlib clones, `.lake` build artifacts (before Eleanstic processes them), or the Eleanstic CAS store. 12 | * **Solution**: Ensure ample free disk space (hundreds of GB may be needed temporarily for many commits). Check paths in `src/eleanstic/config.yaml` are correct and writable. 13 | * **Cause**: `lake build` errors for specific Mathlib commits (e.g., network issues during `lake exe cache get`, toolchain problems). 14 | * **Solution**: Ensure Lean and Lake are correctly installed and in PATH. Check Eleanstic logs for specific errors from `lake`. The `src/eleanstic/README.md` mentions retry mechanisms for `lake exe cache get`; ensure these are active or consider increasing retry attempts/timeouts if configurable. Some older Mathlib commits might have unique build issues; Eleanstic should ideally be robust to a few failing commits or allow skipping them if they are not critical for the benchmark set. 15 | * **Cause**: Incorrect `mathlib_repo_path` in `src/eleanstic/config.yaml`. 16 | * **Solution**: Verify the path points to a valid, up-to-date clone of `leanprover-community/mathlib4`. 17 | 18 | * **Issue**: Eleanstic CAS store grows excessively large despite deduplication. 19 | * **Cause**: If many binary files (e.g., compiled `.olean` files) have minor, non-semantic differences across commits that defeat simple content hashing. 20 | * **Solution**: This is an inherent challenge. Eleanstic's design aims to mitigate this. 
Ensure Eleanstic is correctly identifying and hashing files. For extreme cases, one might investigate more advanced binary diffing/patching for storage, but this would be a significant R&D effort for Eleanstic itself. 21 | 22 | ## LLM Inference Issues 23 | 24 | * **Issue**: API errors from LLMs (e.g., authentication, rate limits, model not found). 25 | * **Solution**: 26 | * **Authentication**: Double-check API keys are correctly set as environment variables or in `src/apebench/config/` model configuration files. 27 | * **Rate Limits**: Implement or enhance retry logic (e.g., exponential backoff, as provided by the `tenacity` library in `requirements.txt`) in the API calling modules in `src/apebench/inference/`. Consider reducing batch sizes or running inference for fewer tasks at a time. 28 | * **Model Not Found**: Ensure the model names in your configuration match the exact identifiers used by the LLM provider's API. 29 | 30 | * **Issue**: LLM outputs are not in the expected diff format. 31 | * **Solution**: Review and refine the prompting strategy used in `src/apebench/inference/`. Ensure prompts clearly instruct the LLM to output a unified diff. `DiffRepair` can handle some noise, but if the output is entirely unstructured, prompting is the primary fix. 32 | 33 | * **Issue**: `DiffRepair` fails to repair a patch or significantly alters its meaning. 34 | * **Cause**: The LLM-generated diff is too divergent from the `PreFile` context, or `DiffRepair`'s fuzzy matching thresholds are too strict/loose. 35 | * **Solution**: 36 | * Inspect the problematic raw diff and `PreFile`. 37 | * Experiment with `DiffRepair` parameters (e.g., `strict_match_threshold`, `exact_match` flag when initializing `DiffRepair` in the inference pipeline). 38 | * For systematic issues, this might indicate a need to improve `DiffRepair`'s algorithms (see [LLM Inference and DiffRepair - Secondary Development](./04_core_components/04_3_apebench_inference.md)). 39 | 40 | ## Evaluation Issues 41 | 42 | * **Issue**: Syntactic verification (Lean compile) fails for patches that seem correct. 43 | * **Cause**: Eleanstic might not be restoring the *exact* correct versioned environment (e.g., wrong snapshot, issue during file restoration from CAS). 44 | * **Solution**: Verify Eleanstic setup. Check logs from Eleanstic and the Lean compiler for specific errors. Ensure the task's commit SHA is correctly mapped to the Eleanstic snapshot. 45 | * **Cause**: The patch, even if repaired, introduces subtle Lean errors not obvious at first glance. 46 | * **Solution**: Manually apply the patch to the `PreFile` (from the correct Mathlib commit, checked out locally) and try to compile with `lake env lean ` to debug the Lean error directly. 47 | 48 | * **Issue**: Semantic Judgement (LLM-as-a-Judge) gives unexpected results. 49 | * **Cause**: Prompting issues for the judge LLM; instability in judge LLM responses. 50 | * **Solution**: Review the semantic evaluation prompts. Ensure the `sample@4` voting is working as expected. The APE-Bench I paper uses Claude Sonnet 3.7 (thinking mode); using a different judge model might require re-calibrating expectations or prompts. 51 | 52 | ## General Issues 53 | 54 | * **Issue**: Python `ModuleNotFoundError` or `ImportError`. 55 | * **Solution**: Ensure your virtual environment is activated (`source venv/bin/activate`). Verify all dependencies in `requirements.txt` are installed correctly (`pip install -r requirements.txt`). 
Check `PYTHONPATH` if using complex project structures, though this should generally not be needed if the project is structured as a proper Python package. 56 | 57 | * **Issue**: Slow performance. 58 | * **Cause**: LLM API calls can be slow. Eleanstic preprocessing is intensive but one-time per commit. Disk I/O for Eleanstic CAS on slow drives. 59 | * **Solution**: 60 | * Use faster LLM models if available (though this changes the experiment). 61 | * Ensure Eleanstic CAS and snapshot directories are on fast storage (SSD recommended). 62 | * For inference, consider parallelizing API calls across multiple tasks if your API quotas and local resources allow (scripts in `src/apebench/scripts/` might already do this). 63 | 64 | If you encounter an issue not listed here, please check existing GitHub issues for the project (if available) or consider reporting a new one with detailed information: steps to reproduce, error messages, relevant configuration, and environment details. 65 | 66 | --- 67 | 68 | Next: [Development and Contribution Guide](./05_development_contribution.md) 69 | 70 | 71 | 72 | ## 中文翻译 (Chinese Translation) 73 | 74 | # 4. 故障排除 75 | 76 | 本节列出了在设置或执行过程中遇到的常见问题及其潜在的解决方案。 77 | 78 | ## Eleanstic 设置问题 79 | 80 | * **问题**:Eleanstic 预处理失败或耗时过长。 81 | * **原因**:Mathlib 克隆、`.lake` 构建产物(在 Eleanstic 处理它们之前)或 Eleanstic CAS 存储的磁盘空间不足。 82 | * **解决方案**:确保有足够的可用磁盘空间(对于许多提交,可能临时需要数百 GB)。检查 `src/eleanstic/config.yaml` 中的路径是否正确且可写。 83 | * **原因**:特定 Mathlib 提交的 `lake build` 错误(例如,`lake exe cache get` 期间的网络问题,工具链问题)。 84 | * **解决方案**:确保 Lean 和 Lake 已正确安装并在 PATH 中。检查 Eleanstic 日志以获取来自 `lake` 的特定错误。`src/eleanstic/README.md` 提到了 `lake exe cache get` 的重试机制;确保这些机制已激活,或者如果可配置,则考虑增加重试次数/超时。一些较旧的 Mathlib 提交可能存在独特的构建问题;理想情况下,Eleanstic 应该能够容忍少量失败的提交,或者如果它们对基准测试集不重要,则允许跳过它们。 85 | * **原因**:`src/eleanstic/config.yaml` 中的 `mathlib_repo_path` 不正确。 86 | * **解决方案**:验证该路径指向 `leanprover-community/mathlib4` 的有效、最新的克隆。 87 | 88 | * **问题**:尽管进行了重复数据删除,Eleanstic CAS 存储仍然过度增长。 89 | * **原因**:如果许多二进制文件(例如,编译的 `.olean` 文件)在不同提交之间存在微小的、非语义的差异,从而破坏了简单的内容哈希。 90 | * **解决方案**:这是一个固有的挑战。Eleanstic 的设计旨在缓解此问题。确保 Eleanstic 正确识别和哈希文件。对于极端情况,可以研究更高级的二进制差异/补丁存储方法,但这将是 Eleanstic 本身的重大研发工作。 91 | 92 | ## LLM 推理问题 93 | 94 | * **问题**:来自 LLM 的 API 错误(例如,身份验证、速率限制、模型未找到)。 95 | * **解决方案**: 96 | * **身份验证**:仔细检查 API 密钥是否已正确设置为环境变量或在 `src/apebench/config/` 模型配置文件中。 97 | * **速率限制**:在 `src/apebench/inference/` 的 API 调用模块中实现或增强重试逻辑(例如,指数退避,如 `requirements.txt` 中的 `tenacity` 库所提供)。考虑减少批处理大小或一次运行较少任务的推理。 98 | * **模型未找到**:确保配置中的模型名称与 LLM 提供商 API 使用的确切标识符匹配。 99 | 100 | * **问题**:LLM 输出未采用预期的差异格式。 101 | * **解决方案**:审查并优化 `src/apebench/inference/` 中使用的提示策略。确保提示明确指示 LLM 输出统一差异格式。`DiffRepair` 可以处理一些噪音,但如果输出完全没有结构,则提示是主要的解决方法。 102 | 103 | * **问题**:`DiffRepair` 无法修复补丁或显著改变其含义。 104 | * **原因**:LLM 生成的差异与 `PreFile` 上下文过于偏离,或者 `DiffRepair` 的模糊匹配阈值过于严格/宽松。 105 | * **解决方案**: 106 | * 检查有问题的原始差异和 `PreFile`。 107 | * 试验 `DiffRepair` 参数(例如,在推理流程中初始化 `DiffRepair` 时的 `strict_match_threshold`、`exact_match` 标志)。 108 | * 对于系统性问题,这可能表明需要改进 `DiffRepair` 的算法(请参阅[LLM 推理与 DiffRepair - 二次开发](./04_core_components/04_3_apebench_inference.md))。 109 | 110 | ## 评估问题 111 | 112 | * **问题**:对于看起来正确的补丁,语法验证(Lean 编译)失败。 113 | * **原因**:Eleanstic 可能没有恢复*完全*正确的版本化环境(例如,错误的快照,从 CAS 恢复文件时出现问题)。 114 | * **解决方案**:验证 Eleanstic 设置。检查 Eleanstic 和 Lean 编译器的日志以获取特定错误。确保任务的提交 SHA 正确映射到 Eleanstic 快照。 115 | * **原因**:即使修复后,补丁仍引入了乍一看并不明显的细微 Lean 错误。 116 | * **解决方案**:将补丁手动应用于 `PreFile`(来自正确的 Mathlib 提交,本地检出),并尝试使用 `lake env lean `进行编译以直接调试 Lean 错误。 117 | 118 | * **问题**:语义判断(作为裁判的 LLM)给出意外结果。 119 | * **原因**:裁判 LLM 
的提示问题;裁判 LLM 响应的不稳定性。 120 | * **解决方案**:审查语义评估提示。确保 `sample@4` 投票按预期工作。APE-Bench I 论文使用 Claude Sonnet 3.7(思考模式);使用不同的裁判模型可能需要重新校准期望或提示。 121 | 122 | ## 一般问题 123 | 124 | * **问题**:Python `ModuleNotFoundError` 或 `ImportError`。 125 | * **解决方案**:确保您的虚拟环境已激活 (`source venv/bin/activate`)。验证 `requirements.txt` 中的所有依赖项均已正确安装 (`pip install -r requirements.txt`)。如果使用复杂的项目结构,请检查 `PYTHONPATH`,尽管如果项目结构为正确的 Python 包,则通常不需要这样做。 126 | 127 | * **问题**:性能缓慢。 128 | * **原因**:LLM API 调用可能很慢。Eleanstic 预处理计算量大,但每个提交仅执行一次。慢速驱动器上 Eleanstic CAS 的磁盘 I/O。 129 | * **解决方案**: 130 | * 如果可用,请使用更快的 LLM 模型(尽管这会改变实验)。 131 | * 确保 Eleanstic CAS 和快照目录位于快速存储设备上(建议使用 SSD)。 132 | * 对于推理,如果您的 API 配额和本地资源允许,请考虑跨多个任务并行化 API 调用(`src/apebench/scripts/` 中的脚本可能已经这样做了)。 133 | 134 | 如果您遇到此处未列出的问题,请检查项目的现有 GitHub 问题(如果可用),或考虑报告一个新问题并提供详细信息:重现步骤、错误消息、相关配置和环境详细信息。 135 | 136 | --- 137 | 138 | 下一节: [开发与贡献指南](./05_development_contribution.md) -------------------------------------------------------------------------------- /src/apebench/evaluation_pipelines/data_collector.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates. 2 | """ 3 | Data Collection Management Module, responsible for executing the dataset creation workflow 4 | """ 5 | 6 | import os 7 | import subprocess 8 | import logging 9 | from datetime import datetime 10 | from typing import Dict, Any, Optional 11 | import pandas as pd 12 | 13 | # Configure logging 14 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 15 | logger = logging.getLogger(__name__) 16 | 17 | def collect_data(config_file: str) -> Dict[str, Any]: 18 | """ 19 | Run data collection tasks according to configuration 20 | 21 | Args: 22 | config_file: Configuration file path 23 | 24 | Returns: 25 | Key file path information generated during the collection process 26 | """ 27 | # Import here instead of at the top to avoid circular imports 28 | from ..config.config_manager import ConfigManager 29 | 30 | # Load configuration 31 | config_manager = ConfigManager(config_file) 32 | config = config_manager.get_config() 33 | 34 | # Use data_collection section of the configuration 35 | data_config = config.data_collection 36 | 37 | # Create output directory 38 | os.makedirs(data_config.dataset_dir, exist_ok=True) 39 | 40 | # Get data collection timestamp 41 | data_collection_date = datetime.now().strftime('%Y%m%d_%H%M%S') 42 | 43 | # Set key path variables 44 | repo_path = data_config.repo_path 45 | repo_name = os.path.basename(repo_path) 46 | max_diff_lines = data_config.max_diff_lines 47 | 48 | # Build base filename 49 | base_filename = f"{repo_name}_commits_data_{data_collection_date}_{max_diff_lines}" 50 | 51 | # Record all generated files 52 | output_files = {} 53 | 54 | # Step 1: Clone repository 55 | if not os.path.exists(repo_path): 56 | logger.info(f"Cloning {data_config.repo_url} to {repo_path}") 57 | subprocess.run(["git", "clone", data_config.repo_url, repo_path], check=True) 58 | else: 59 | logger.info(f"Repository {repo_path} already exists, skipping clone") 60 | 61 | # Step 2: Collect commit data 62 | raw_data_path = os.path.join(data_config.dataset_dir, f"{base_filename}.parquet") 63 | logger.info(f"Collecting commit data to {raw_data_path}") 64 | 65 | collect_cmd = [ 66 | "python", "src/apebench/data/collect_commit_data.py", 67 | "--repo_path", repo_path, 68 | "--output_path", raw_data_path, 69 | "--max_diff_lines", str(max_diff_lines) 70 | ] 71 | subprocess.run(collect_cmd, 
check=True) 72 | output_files["raw_data"] = raw_data_path 73 | 74 | # Step 3: Filter commit data 75 | filtered_data_path = os.path.join(data_config.dataset_dir, f"filtered_{base_filename}.parquet") 76 | length_plot_path = os.path.join(data_config.dataset_dir, f"filtered_{base_filename}_filtered_length_distribution.png") 77 | 78 | logger.info(f"Filtering data to {filtered_data_path}") 79 | filter_cmd = [ 80 | "python", "src/apebench/data/filter_commit_data.py", 81 | "--file_path", raw_data_path, 82 | "--output_path", filtered_data_path, 83 | "--length_distribution_plot_path", length_plot_path 84 | ] 85 | subprocess.run(filter_cmd, check=True) 86 | output_files["filtered_data"] = filtered_data_path 87 | output_files["length_plot"] = length_plot_path 88 | 89 | # Step 4: Build database 90 | logger.info("Building database") 91 | build_cmd = [ 92 | "python", "-m", "src.eleanstic.main", 93 | "--input_file", filtered_data_path, 94 | "build" 95 | ] 96 | subprocess.run(build_cmd, check=True) 97 | 98 | # Step 5: Verify filtered data 99 | verify_result_dir = os.path.join(data_config.dataset_dir, "verify_results", f"filtered_{base_filename}") 100 | os.makedirs(os.path.dirname(verify_result_dir), exist_ok=True) 101 | 102 | logger.info(f"Verifying filtered data, saving results to {verify_result_dir}") 103 | verify_cmd = [ 104 | "python", "-m", "src.eleanstic.main", 105 | "--input_file", filtered_data_path, 106 | "verify", 107 | "--code_key", "content_after", 108 | "--results_dir", verify_result_dir 109 | ] 110 | subprocess.run(verify_cmd, check=True) 111 | output_files["verify_results"] = verify_result_dir 112 | 113 | # Step 6: Filter verification data 114 | verified_data_path = os.path.join(data_config.dataset_dir, f"filtered_{base_filename}_verified.parquet") 115 | 116 | logger.info(f"Filtering verification data to {verified_data_path}") 117 | filter_results_cmd = [ 118 | "python", "src/apebench/data/filter_results.py", 119 | "--pipeline", "verification", 120 | "--input_files", f"{verify_result_dir}/*.jsonl", 121 | "--output_file", verified_data_path, 122 | "--reset_index_by_date" 123 | ] 124 | subprocess.run(filter_results_cmd, check=True) 125 | output_files["verified_data"] = verified_data_path 126 | 127 | # Step 7: Extract latest data 128 | latest_num_data = data_config.latest_num_data 129 | latest_data_path = os.path.join(data_config.dataset_dir, f"filtered_{base_filename}_verified_latest_{latest_num_data}.jsonl") 130 | 131 | logger.info(f"Extracting latest {latest_num_data} records to {latest_data_path}") 132 | # Use pandas to read and save data, instead of executing Python commands 133 | df = pd.read_parquet(verified_data_path) 134 | df.sort_values(by='date', ascending=False, inplace=True) 135 | df = df.head(latest_num_data) 136 | df.to_json(latest_data_path, orient='records', lines=True) 137 | output_files["latest_data"] = latest_data_path 138 | 139 | # Ensure output directories exist 140 | os.makedirs(config.output_dir, exist_ok=True) 141 | os.makedirs(os.path.join(config.output_dir, "instruction"), exist_ok=True) 142 | os.makedirs(os.path.join(config.output_dir, "judgement"), exist_ok=True) 143 | 144 | # Step 8: Generate instruction data 145 | instruction_model_name = data_config.instruction_model 146 | instruction_output_path = os.path.join( 147 | config.output_dir, 148 | "instruction", 149 | f"filtered_{base_filename}_verified_latest_{latest_num_data}_instruction_{instruction_model_name}.jsonl" 150 | ) 151 | 152 | logger.info(f"Generating instruction data to {instruction_output_path}") 153 | 
instruction_cmd = [ 154 | "python", "src/apebench/inference/run_inference.py", 155 | "--pipeline", "instruction", 156 | "--input_file", latest_data_path, 157 | "--output_file", instruction_output_path, 158 | "--model_name", instruction_model_name, 159 | "--max_workers", str(data_config.max_workers), 160 | "--n_responses", "1", 161 | "--temperature", "0", 162 | "--max_tokens", str(data_config.max_tokens), 163 | "--thinking_budget_tokens", str(data_config.thinking_budget_tokens) 164 | ] 165 | subprocess.run(instruction_cmd, check=True) 166 | output_files["instruction_output"] = instruction_output_path 167 | 168 | # Create instruction data directory 169 | os.makedirs(os.path.join(data_config.dataset_dir, "instruction"), exist_ok=True) 170 | 171 | instruction_data_path = os.path.join( 172 | data_config.dataset_dir, 173 | "instruction", 174 | f"filtered_{base_filename}_verified_latest_{latest_num_data}_instruction_{instruction_model_name}.jsonl" 175 | ) 176 | 177 | logger.info(f"Filtering instruction data to {instruction_data_path}") 178 | filter_instruction_cmd = [ 179 | "python", "src/apebench/data/filter_results.py", 180 | "--pipeline", "instruction", 181 | "--input_files", instruction_output_path, 182 | "--output_file", instruction_data_path, 183 | "--extract_exercise_info" 184 | ] 185 | subprocess.run(filter_instruction_cmd, check=True) 186 | output_files["instruction_data"] = instruction_data_path 187 | 188 | # Step 9: Verify through judgement of golden differences 189 | judgement_model_name = data_config.judgement_model 190 | judgement_output_path = os.path.join( 191 | config.output_dir, 192 | "judgement", 193 | f"filtered_{base_filename}_verified_latest_{latest_num_data}_judgement_{judgement_model_name}.jsonl" 194 | ) 195 | 196 | logger.info(f"Executing judgement verification to {judgement_output_path}") 197 | judgement_cmd = [ 198 | "python", "src/apebench/inference/run_inference.py", 199 | "--pipeline", "judgement", 200 | "--input_file", instruction_data_path, 201 | "--output_file", judgement_output_path, 202 | "--model_name", judgement_model_name, 203 | "--max_workers", str(data_config.max_workers), 204 | "--n_responses", "1", 205 | "--temperature", "0", 206 | "--max_tokens", str(data_config.max_tokens), 207 | "--thinking_budget_tokens", str(data_config.thinking_budget_tokens), 208 | "--patch_key", "gold_diff" 209 | ] 210 | subprocess.run(judgement_cmd, check=True) 211 | output_files["judgement_output"] = judgement_output_path 212 | 213 | # Create judgement data directory 214 | os.makedirs(os.path.join(data_config.dataset_dir, "judgement"), exist_ok=True) 215 | 216 | judgement_data_path = os.path.join( 217 | data_config.dataset_dir, 218 | "judgement", 219 | f"filtered_{base_filename}_verified_latest_{latest_num_data}_judgement_{judgement_model_name}.jsonl" 220 | ) 221 | 222 | logger.info(f"Filtering judgement data to {judgement_data_path}") 223 | filter_judgement_cmd = [ 224 | "python", "src/apebench/data/filter_results.py", 225 | "--pipeline", "judgement", 226 | "--input_files", judgement_output_path, 227 | "--output_file", judgement_data_path 228 | ] 229 | subprocess.run(filter_judgement_cmd, check=True) 230 | output_files["judgement_data"] = judgement_data_path 231 | 232 | logger.info(f"Data collection complete! Final data path: {judgement_data_path}") 233 | 234 | return output_files --------------------------------------------------------------------------------
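
The data-collection and evaluation entry points above (`collect_data` in `data_collector.py` and `evaluate_patches` in `evaluation_manager.py`) are plain functions that take a configuration-file path, so they can be chained from a small driver script. The sketch below is one possible wiring, assuming `configs/config.yaml` (present in the repository tree) is the intended configuration; the patch-generation and verification steps that belong between the two calls (see `scripts/1_generate_patches.py`, `scripts/2_verify_patches.py`, and `verification_manager.py`) are only noted as comments because their exact function signatures are not shown here.

```python
# Minimal driver sketch chaining the entry points shown above. The import paths follow the
# repository layout, but this script is illustrative and not part of the repository itself.
from src.apebench.evaluation_pipelines.data_collector import collect_data
from src.apebench.evaluation_pipelines.evaluation_manager import evaluate_patches

CONFIG_FILE = "configs/config.yaml"  # assumed configuration path, taken from the repo tree


def main() -> None:
    # Step 1: build the dataset (clone the repo, filter commits, generate instructions, ...).
    output_files = collect_data(CONFIG_FILE)
    print("Collected artifacts:", output_files)

    # Step 2: patch generation and verification would run here
    # (scripts/1_generate_patches.py, scripts/2_verify_patches.py); omitted in this sketch.

    # Step 3: judge the verified patches and report both metric sets.
    verification_metrics, judgement_metrics = evaluate_patches(CONFIG_FILE)
    print("Verification metrics:", verification_metrics)
    print("Judgement metrics:", judgement_metrics)


if __name__ == "__main__":
    main()
```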
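
Similarly, the `process_row` / `process_data` loop in `inference/inference_pipelines/base.py` (shown earlier) is designed to be specialised by overriding a few hooks (`get_input`, `parse_response`, `update_metadata_per_row`, `early_stop`). The sketch below only illustrates the shape of such a subclass: the base-class name `BaseInferencePipeline`, its constructor signature, and the argument object are assumptions inferred from the attributes referenced in `base.py`; the shipped subclasses live in `generate_patch.py`, `generate_instruction.py`, and `generate_judgement.py`.

```python
# Illustrative subclass sketch; `BaseInferencePipeline` and the Namespace fields below are
# assumptions based on the attributes used in base.py, not the repository's documented API.
from argparse import Namespace
from src.apebench.inference.inference_pipelines.base import BaseInferencePipeline  # assumed name


class ToySummaryPipeline(BaseInferencePipeline):
    system_prompt = "You are a concise assistant."

    def get_input(self, row_dict):
        # Build the user prompt from whatever columns the input JSONL provides.
        return f"Summarise the following instruction:\n\n{row_dict.get('instruction', '')}"

    def parse_response(self, response_text, row_dict):
        # Returning a dict (rather than None) lets process_row attach usage and raw_response.
        return {"summary": response_text.strip()}


if __name__ == "__main__":
    # All field names below are referenced by process_row / process_data in base.py.
    args = Namespace(
        input_file="data/tasks.jsonl", output_file=None, output_dir="outputs",
        pipeline="toy_summary", timestamp="20250101_000000",
        model_name="gpt-4o-2024-08-06", temperature=0.0, n_responses=1,
        max_tokens=8000, thinking_budget_tokens=6000,
        max_workers=4, max_retries=3,
    )
    ToySummaryPipeline(args).process_data()
```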