├── src
│ ├── apebench
│ │ ├── __init__.py
│ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ └── progress_tracker.py
│ │ ├── inference
│ │ │ ├── utils
│ │ │ │ ├── __init__.py
│ │ │ │ ├── api_keys.example.py
│ │ │ │ ├── chat_logger.py
│ │ │ │ ├── parallel.py
│ │ │ │ └── call_api.py
│ │ │ ├── inference_pipelines
│ │ │ │ ├── __init__.py
│ │ │ │ ├── generate_judgement.py
│ │ │ │ ├── generate_instruction.py
│ │ │ │ ├── generate_patch.py
│ │ │ │ └── base.py
│ │ │ ├── prompts
│ │ │ │ ├── __init__.py
│ │ │ │ ├── judgement_generation_prompts.py
│ │ │ │ ├── instruction_generation_prompts.py
│ │ │ │ └── patch_generation_prompts.py
│ │ │ └── run_inference.py
│ │ ├── scripts
│ │ │ ├── 1_generate_patches.py
│ │ │ ├── 2_verify_patches.py
│ │ │ └── 3_evaluate_patches.py
│ │ ├── config
│ │ │ ├── default_config.py
│ │ │ └── config_manager.py
│ │ └── evaluation_pipelines
│ │   ├── verification_manager.py
│ │   ├── evaluation_manager.py
│ │   └── data_collector.py
│ ├── __init__.py
│ ├── eleanstic
│ │ ├── __init__.py
│ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ ├── sys_utils.py
│ │ │ ├── lean_utils.py
│ │ │ └── log_utils.py
│ │ ├── core
│ │ │ ├── __init__.py
│ │ │ ├── status.py
│ │ │ └── file_map.py
│ │ └── config.yaml
│ └── utils
│   ├── __init__.py
│   ├── lean_utils.py
│   ├── file_utils.py
│   └── colors.py
├── assets
│ ├── diff.pdf
│ ├── main.pdf
│ ├── future.pdf
│ ├── main1.pdf
│ ├── main2.pdf
│ ├── main3.pdf
│ ├── main4.pdf
│ ├── main5.pdf
│ ├── gorilla.png
│ ├── pipeline.jpg
│ ├── road_map.jpg
│ ├── APE_Bench_I_paper.pdf
│ ├── combined_metrics_combined.pdf
│ ├── combined_boxplot_analysis_left.pdf
│ ├── combined_boxplot_analysis_main.pdf
│ ├── filtered_pure_changes_comparison.pdf
│ ├── filtered_pure_changes_individual_subplots.pdf
│ ├── judgement_by_change_size_combined_metrics.pdf
│ ├── filtered_pure_changes_individual_subplots_density.pdf
│ └── filtered_mathlib4_commits_data_20250413_173625_100_waterfall.pdf
├── requirements.txt
├── .gitignore
├── LICENSE
├── configs
│ └── config.yaml
├── submission.py
├── docs
│ ├── 03_core_components
│ │ ├── 03_2_apebench_data.md
│ │ ├── 03_5_apebench_scripts_config.md
│ │ └── 03_3_apebench_inference.md
│ ├── 02_project_structure.md
│ └── 04_troubleshooting.md
└── run_ape_bench_example.sh
/src/apebench/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
--------------------------------------------------------------------------------
/assets/diff.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/diff.pdf
--------------------------------------------------------------------------------
/assets/main.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/main.pdf
--------------------------------------------------------------------------------
/src/eleanstic/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
--------------------------------------------------------------------------------
/assets/future.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/future.pdf
--------------------------------------------------------------------------------
/assets/main1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/main1.pdf
--------------------------------------------------------------------------------
/assets/main2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/main2.pdf
--------------------------------------------------------------------------------
/assets/main3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/main3.pdf
--------------------------------------------------------------------------------
/assets/main4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/main4.pdf
--------------------------------------------------------------------------------
/assets/main5.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/main5.pdf
--------------------------------------------------------------------------------
/assets/gorilla.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/gorilla.png
--------------------------------------------------------------------------------
/assets/pipeline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/pipeline.jpg
--------------------------------------------------------------------------------
/assets/road_map.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/road_map.jpg
--------------------------------------------------------------------------------
/assets/APE_Bench_I_paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/APE_Bench_I_paper.pdf
--------------------------------------------------------------------------------
/assets/combined_metrics_combined.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/combined_metrics_combined.pdf
--------------------------------------------------------------------------------
/assets/combined_boxplot_analysis_left.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/combined_boxplot_analysis_left.pdf
--------------------------------------------------------------------------------
/assets/combined_boxplot_analysis_main.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/combined_boxplot_analysis_main.pdf
--------------------------------------------------------------------------------
/assets/filtered_pure_changes_comparison.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/filtered_pure_changes_comparison.pdf
--------------------------------------------------------------------------------
/assets/filtered_pure_changes_individual_subplots.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/filtered_pure_changes_individual_subplots.pdf
--------------------------------------------------------------------------------
/assets/judgement_by_change_size_combined_metrics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/judgement_by_change_size_combined_metrics.pdf
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | rapidfuzz
3 | tiktoken
4 | matplotlib
5 | pydantic
6 | portalocker
7 | colorlog
8 | plotly
9 | kaleido
10 | openai
11 | tenacity
--------------------------------------------------------------------------------
/assets/filtered_pure_changes_individual_subplots_density.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/filtered_pure_changes_individual_subplots_density.pdf
--------------------------------------------------------------------------------
/assets/filtered_mathlib4_commits_data_20250413_173625_100_waterfall.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xinhjBrant/APE-Bench_I/HEAD/assets/filtered_mathlib4_commits_data_20250413_173625_100_waterfall.pdf
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | datasets/
3 | __pycache__/
4 | .vscode
5 | mathlib4/
6 | ./*.parquet
7 | results*/
8 | temp/
9 | temp_data_sync/
10 | chat_logs/
11 | # outputs/
12 | # logs/
13 | sync_for_verify/
14 | analyze_outputs/
15 | old_*/
16 | *.log
17 | verify_database/
18 | plots/
19 | progress
20 | outputs/
21 | src/apebench/data_visualization
22 | src/apebench/inference/utils/api_keys.py
--------------------------------------------------------------------------------
/src/eleanstic/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | """
4 | eleanstic utility module
5 | """
6 |
7 | from .log_utils import setup_logger, log_progress
8 | from .lean_utils import verify_with_lean, run_lake_build, run_command
9 |
10 | __all__ = [
11 | 'setup_logger',
12 | 'log_progress',
13 | 'verify_with_lean',
14 | 'run_lake_build',
15 | 'run_command'
16 | ]
--------------------------------------------------------------------------------
/src/apebench/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | """
4 | Utility module for ApeBench
5 | """
6 |
7 | from .metrics import extract_verification_data, extract_judgement_data, calculate_metrics, plot_metrics
8 | from .progress_tracker import ProgressTracker
9 |
10 | __all__ = [
11 | 'extract_verification_data',
12 | 'extract_judgement_data',
13 | 'calculate_metrics',
14 | 'plot_metrics',
15 | 'ProgressTracker',
16 | ]
17 |
--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 | """
3 | Utility module, providing file processing and visualization functionality
4 | """
5 |
6 | from .file_utils import load_results, load_jsonl, save_jsonl, convert_to_serializable
7 | from .lean_file_parser import LeanFileAnalyzer
8 |
9 | __all__ = [
10 | 'load_results',
11 | 'load_jsonl',
12 | 'save_jsonl',
13 | 'convert_to_serializable',
14 | 'LeanFileAnalyzer',
15 | ]
16 |
--------------------------------------------------------------------------------
/src/eleanstic/core/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | """
4 | Eleanstic core module, providing configuration and coordination functionality
5 | """
6 |
7 | from .config import ConfigManager
8 | from .coordinators import BuildCoordinator, VerifyCoordinator
9 | from .file_map import FileMapManager
10 | from .status import CommitStatus
11 |
12 | __all__ = [
13 | 'ConfigManager',
14 | 'BuildCoordinator',
15 | 'VerifyCoordinator',
16 | 'FileMapManager',
17 | 'CommitStatus'
18 | ]
19 |
--------------------------------------------------------------------------------
/src/apebench/inference/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | """
4 | Utility Tools Used in the Inference Process
5 | """
6 |
7 | from .call_api import chat, TOTAL_MODELS, REASONING_MODELS, UNSUPPORT_TEMPERATURE_MODELS
8 | from .diff_repair import DiffRepair, apply_diff
9 | from .parallel import process_with_retries
10 |
11 | __all__ = [
12 | 'chat',
13 | 'TOTAL_MODELS',
14 | 'REASONING_MODELS',
15 | 'UNSUPPORT_TEMPERATURE_MODELS',
16 | 'DiffRepair',
17 | 'apply_diff',
18 | 'process_with_retries',
19 | ]
20 |
--------------------------------------------------------------------------------
/src/apebench/inference/inference_pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 | """
3 | Inference Pipeline Module
4 |
5 | Contains pipelines for generating instructions, patches, and judgements.
6 | """
7 |
8 | from .base import BasePipeline
9 | from .generate_instruction import GenerateInstructionPipeline
10 | from .generate_patch import GeneratePatchPipeline
11 | from .generate_judgement import GenerateJudgementPipeline
12 |
13 | __all__ = [
14 | 'BasePipeline',
15 | 'GenerateInstructionPipeline',
16 | 'GeneratePatchPipeline',
17 | 'GenerateJudgementPipeline'
18 | ]
19 |
--------------------------------------------------------------------------------
/src/apebench/inference/utils/api_keys.example.py:
--------------------------------------------------------------------------------
1 | # API keys and endpoints for different language models
2 | # Copy this file to api_keys.py in the same directory and fill in your actual API keys
3 |
4 | # OpenAI API credentials (GPT models)
5 | openai_api_key = "your-openai-api-key"
6 | openai_base_url = "https://api.openai.com/v1" # Or your Azure OpenAI endpoint
7 |
8 | # Anthropic API credentials (Claude models)
9 | aws_claude_api_key = "your-anthropic-api-key"
10 | aws_claude_base_url = "https://api.anthropic.com" # Or your AWS Claude endpoint
11 |
12 | # Other API providers
13 | # DeepSeek models
14 | volces_api_key = "your-deepseek-api-key"
15 | volces_base_url = "https://api.deepseek.com" # Or other endpoint
16 |
17 | # Google API credentials
18 | google_api_key = "your-google-api-key"
19 | google_base_url = "https://generativelanguage.googleapis.com" # Or other specific Google AI service endpoint
--------------------------------------------------------------------------------
/src/utils/lean_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | def remove_lean_comments(src: str) -> str:
4 | i = 0
5 | n = len(src)
6 | out = []
7 | block_nest = 0
8 | while i < n:
9 | if block_nest == 0:
10 | if src.startswith("--", i):
11 | j = src.find("\n", i + 2)
12 | if j == -1:
13 | break
14 | else:
15 | out.append("\n")
16 | i = j + 1
17 | elif src.startswith("/-", i):
18 | block_nest = 1
19 | i += 2
20 | else:
21 | out.append(src[i])
22 | i += 1
23 | else:
24 | if src.startswith("/-", i):
25 | block_nest += 1
26 | i += 2
27 | elif src.startswith("-/", i):
28 | block_nest -= 1
29 | i += 2
30 | else:
31 | i += 1
32 |
33 | return "".join(out)
--------------------------------------------------------------------------------
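
Usage sketch for `remove_lean_comments` (editor's illustration, not part of the repository; it assumes the repository root is on `PYTHONPATH` and uses a made-up Lean snippet):

```python
# Illustrative call to remove_lean_comments; the Lean sample below is hypothetical.
from src.utils.lean_utils import remove_lean_comments

sample = (
    "-- a line comment\n"
    "/- a block comment\n   spanning two lines -/\n"
    "theorem one_add_one : 1 + 1 = 2 := by decide  -- trailing note\n"
)

print(remove_lean_comments(sample))
# Line comments collapse to bare newlines and block comments are dropped,
# so only the theorem line (plus blank lines) survives.
```
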
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Huajian Xin, Jacques Fleuriot, Wenda Li, Bytedance Ltd. and/or its affiliates
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/src/apebench/inference/prompts/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 | """
3 | Prompt Templates Used in the Inference Process
4 | """
5 |
6 | from .instruction_generation_prompts import (
7 | instruction_generation_system_prompt,
8 | instruction_generation_input_prompt,
9 | instruction_generation_input_prompt_without_lean_code
10 | )
11 |
12 | from .patch_generation_prompts import (
13 | patch_generation_system_prompt,
14 | patch_generation_reasoning_models_system_prompt,
15 | patch_generation_input_prompt,
16 | patch_generation_input_prompt_without_lean_code
17 | )
18 |
19 | from .judgement_generation_prompts import (
20 | judgement_generation_system_prompt,
21 | judgement_generation_input_prompt,
22 | judgement_generation_input_prompt_without_lean_code
23 | )
24 |
25 | __all__ = [
26 | 'instruction_generation_system_prompt',
27 | 'instruction_generation_input_prompt',
28 | 'instruction_generation_input_prompt_without_lean_code',
29 | 'patch_generation_system_prompt',
30 | 'patch_generation_reasoning_models_system_prompt',
31 | 'patch_generation_input_prompt',
32 | 'patch_generation_input_prompt_without_lean_code',
33 | 'judgement_generation_system_prompt',
34 | 'judgement_generation_input_prompt',
35 | 'judgement_generation_input_prompt_without_lean_code'
36 | ]
37 |
--------------------------------------------------------------------------------
/src/apebench/scripts/1_generate_patches.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
3 |
4 | """
5 | Patch generation script, runs the patch generation process according to configuration.
6 | """
7 |
8 | import argparse
9 | import os
10 | import sys
11 | from typing import List
12 |
13 | def main():
14 | """Script entry point"""
15 | # Parse command line arguments
16 | parser = argparse.ArgumentParser(description="Generate patches using multiple models and configurations")
17 | parser.add_argument("--config", type=str, default="config.yaml", help="Configuration file")
18 | args = parser.parse_args()
19 |
20 | # Ensure src can be imported
21 | script_dir = os.path.dirname(os.path.abspath(__file__))
22 | root_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_dir)))
23 | if root_dir not in sys.path:
24 | sys.path.insert(0, root_dir)
25 |
26 | # Import modules
27 | from ..evaluation_pipelines.patch_generator import generate_patches
28 | from ..config.config_manager import ConfigManager
29 |
30 | # Load configuration to confirm generation section exists
31 | config = ConfigManager(args.config).get_config()
32 | if not hasattr(config, 'generation'):
33 | print(f"Error: Configuration file {args.config} does not have a 'generation' section")
34 | sys.exit(1)
35 |
36 | # Execute generation
37 | output_files = generate_patches(args.config)
38 |
39 | print(f"\nGeneration task completed successfully!")
40 | print(f"Generated {len(output_files)} patch files.")
41 | print(f"Next step: Run the verification script using the same configuration file.")
42 |
43 | if __name__ == "__main__":
44 | main()
--------------------------------------------------------------------------------
/src/eleanstic/config.yaml:
--------------------------------------------------------------------------------
1 | # Eleanstic Environment Configuration File
2 |
3 | # Base Path Configuration
4 | paths:
5 | # Main Mathlib4 Git Repository Path
6 | mathlib_repo: "mathlib4"
7 | # Workspace Root Directory
8 | workspace_root: "verify_database"
9 | # Worktree Root Directory
10 | worktree_dir: "worktrees"
11 | # Content Storage Root Directory
12 | storage_dir: "storage"
13 | # Cache Directory
14 | cache_dir: "cache"
15 | # Log Directory
16 | log_dir: "logs"
17 | # Verification Results Directory
18 | verify_results_dir: "./verify_results"
19 |
20 | # Concurrency Settings
21 | concurrency:
22 | # Maximum Worker Processes
23 | max_workers: 180
24 | # Maximum Parallel File Storage Threads
25 | max_concurrent_file_storage: 8
26 | # Maximum Parallel Lean Verification Threads
27 | max_concurrent_lean_verifications: 64
28 |
29 | # Storage Settings
30 | storage:
31 | # Hash Algorithm (xxhash64/sha256)
32 | hash_algorithm: "sha256"
33 |   # Whether to Remove the Worktree After Build
34 | remove_worktree_after_build: true
35 |
36 | # Cache Settings
37 | cache:
38 | # Number of Download Retries
39 | download_retries: 10
40 | # Download Timeout (seconds)
41 | download_timeout: 3600
42 | # Wait Time Between Retry Attempts (seconds)
43 | retry_wait: 30
44 |
45 | # Logging Settings
46 | logging:
47 | # Log Level (DEBUG/INFO/WARNING/ERROR/CRITICAL)
48 | level: "INFO"
49 | # Maximum Log File Size (MB)
50 | max_size_mb: 100
51 | # Number of Log Files to Keep
52 | backup_count: 10
53 | # Whether to Output to Console
54 | console_output: true
55 | # Whether to Use Colored Logs
56 | color_output: true
57 |
58 | # Verification Settings
59 | verification:
60 | # Verification Timeout (seconds)
61 | timeout: 120
62 |
--------------------------------------------------------------------------------
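
The `ConfigManager` imported in `src/eleanstic/core/__init__.py` is not included in this dump, so the snippet below is only a hedged sketch of reading this file with plain PyYAML:

```python
# Hypothetical direct read of src/eleanstic/config.yaml with PyYAML;
# Eleanstic's own ConfigManager may expose these values differently.
import yaml

with open("src/eleanstic/config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["paths"]["mathlib_repo"])                   # "mathlib4"
print(cfg["concurrency"]["max_workers"])              # 180
print(cfg["storage"]["remove_worktree_after_build"])  # True
print(cfg["verification"]["timeout"])                 # 120
```
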
/src/apebench/scripts/2_verify_patches.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | """
4 | Patch verification script, runs the patch verification process according to configuration.
5 | """
6 |
7 | import argparse
8 | import os
9 | import sys
10 | from typing import List, Optional
11 |
12 | def main():
13 | """Script entry point"""
14 | # Parse command line arguments
15 | parser = argparse.ArgumentParser(description="Verify generated patches")
16 | parser.add_argument("--config", type=str, default="config.yaml", help="Configuration file")
17 | parser.add_argument("--input_files", type=str, nargs="*", help="Optional list of generation output files")
18 | args = parser.parse_args()
19 |
20 | # Ensure src can be imported
21 | script_dir = os.path.dirname(os.path.abspath(__file__))
22 | root_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_dir)))
23 | if root_dir not in sys.path:
24 | sys.path.insert(0, root_dir)
25 |
26 | # Import modules
27 | from ..evaluation_pipelines.verification_manager import verify_patches
28 |
29 | # Execute verification
30 | metrics = verify_patches(args.config, args.input_files)
31 |
32 | print(f"\nVerification completed successfully!")
33 |
34 | # Print verification metrics - using Markdown table format
35 | print("\n## Verification metrics")
36 | for model, model_metrics in metrics.items():
37 | print(f"\n### Model: {model}")
38 | for key in model_metrics:
39 | temp, n_responses = key.split(',')
40 | print(f"\nTemperature {temp}, n_responses {n_responses}")
41 |
42 | # Create table header
43 | print("\n| Metric | Value |")
44 | print("|--------|-------|")
45 |
46 | # Create table body
47 | for metric_name, value in model_metrics[key].items():
48 | print(f"| {metric_name} | {value:.4f} |")
49 |
50 | print(f"\nNext step: Run the evaluation script with the same config file.")
51 |
52 | if __name__ == "__main__":
53 | main()
--------------------------------------------------------------------------------
/src/apebench/config/default_config.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 | """
3 | Default configuration definition module
4 | """
5 |
6 | DEFAULT_CONFIG = {
7 | # Global settings
8 | "project_dir": "./",
9 | "output_dir": "./outputs",
10 | "temp_dir": "./temp",
11 | "progress_log": "./logs/progress.json",
12 |
13 | # Data settings
14 | "input_file": "./datasets/ape_bench1_valid_test.parquet",
15 |
16 | # Data collection configuration
17 | "data_collection": {
18 | "dataset_dir": "datasets",
19 | "repo_url": "https://github.com/leanprover-community/mathlib4.git",
20 | "repo_path": "mathlib4",
21 | "max_diff_lines": 100,
22 | "latest_num_data": 2000,
23 | "instruction_model": "aws_sdk_claude37_sonnet@thinking",
24 | "judgement_model": "aws_sdk_claude37_sonnet@thinking",
25 | "max_workers": 8,
26 | "max_tokens": 20000,
27 | "thinking_budget_tokens": 16000
28 | },
29 |
30 | # Patch generation configuration
31 | "generation": {
32 | "base_output_dir": "./outputs/patch",
33 | "parallel_models": True, # Different models executed in parallel
34 | "parallel_configs": False, # Same model with different configs executed serially
35 | "max_model_workers": 4, # Number of models to execute in parallel
36 | "models": [
37 | {
38 | "name": "deepseek-v3-250324",
39 | "configs": [
40 | {"temperature": 0.0, "n_responses": 1, "max_workers": 48},
41 | {"temperature": 0.6, "n_responses": 20, "max_workers": 48}
42 | ]
43 | }
44 | ]
45 | },
46 |
47 | # Verification configuration
48 | "verification": {
49 | "eleanstic_config": "./src/eleanstic/config.yaml",
50 | "max_workers": 128,
51 | "results_dir": "./verify_results"
52 | },
53 |
54 | # Judgment generation configuration
55 | "judgement": {
56 | "model_name": "aws_sdk_claude37_sonnet@thinking",
57 | "temperature": 0.0,
58 | "n_responses": 1,
59 | "max_workers": 8
60 | },
61 |
62 | # Evaluation configuration
63 | "evaluation": {
64 | "k_ratio": 0.8,
65 | "generate_plots": True,
66 | "plots_dir": "./plots"
67 | }
68 | }
--------------------------------------------------------------------------------
/src/apebench/scripts/3_evaluate_patches.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
3 |
4 | """
5 | Patch evaluation script, runs the patch evaluation process according to configuration.
6 | """
7 |
8 | import argparse
9 | import os
10 | import sys
11 | import json
12 | from typing import Dict, Any, Optional
13 |
14 | def main():
15 | """Script entry point"""
16 | # Parse command line arguments
17 | parser = argparse.ArgumentParser(description="Evaluate verified patches")
18 | parser.add_argument("--config", type=str, default="config.yaml", help="Configuration file")
19 | parser.add_argument("--merged_file", type=str, help="Optional merged results file from verification")
20 | args = parser.parse_args()
21 |
22 | # Ensure src can be imported
23 | script_dir = os.path.dirname(os.path.abspath(__file__))
24 | root_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_dir)))
25 | if root_dir not in sys.path:
26 | sys.path.insert(0, root_dir)
27 |
28 | # Import modules
29 | from ..evaluation_pipelines.evaluation_manager import evaluate_patches
30 |
31 | # Execute evaluation
32 | verification_metrics, metrics = evaluate_patches(args.config, args.merged_file)
33 |
34 | # Print result summary
35 | print("\nEvaluation completed successfully!")
36 | print("Metrics summary:")
37 |
38 | # Print verification metrics - using Markdown table format
39 | print("\n## Verification metrics")
40 | for model, model_metrics in verification_metrics.items():
41 | print(f"\n### Model: {model}")
42 | for key in model_metrics:
43 | temp, n_responses = key.split(',')
44 | print(f"\nTemperature {temp}, n_responses {n_responses}")
45 |
46 | # Create table header
47 | print("\n| Metric | Value |")
48 | print("|--------|-------|")
49 |
50 | # Create table body
51 | for metric_name, value in model_metrics[key].items():
52 | print(f"| {metric_name} | {value * 100:.2f}% |")
53 |
54 | # Print judgment metrics - using Markdown table format
55 | print("\n## Judgement metrics")
56 | for model, model_metrics in metrics.items():
57 | print(f"\n### Model: {model}")
58 | for key in model_metrics:
59 | temp, n_responses = key.split(',')
60 | print(f"\nTemperature {temp}, n_responses {n_responses}")
61 |
62 | # Create table header
63 | print("\n| Metric | Value |")
64 | print("|--------|-------|")
65 |
66 | # Create table body
67 | for metric_name, value in model_metrics[key].items():
68 | print(f"| {metric_name} | {value * 100:.2f}% |")
69 |
70 | if __name__ == "__main__":
71 | main()
--------------------------------------------------------------------------------
/configs/config.yaml:
--------------------------------------------------------------------------------
1 | project_dir: "./"
2 | output_dir: "./outputs"
3 | temp_dir: "./temp"
4 | progress_log: "./progress/config_progress.json"
5 | input_file: "./datasets/ape_bench1_test.parquet"
6 |
7 | # Data collection configuration
8 | data_collection:
9 | # Dataset directory
10 | dataset_dir: "datasets"
11 | # Code repository information
12 | repo_url: "https://github.com/leanprover-community/mathlib4.git"
13 | repo_path: "mathlib4"
14 | # Data collection parameters
15 | max_diff_lines: 100
16 | latest_num_data: 2000
17 | # Model configuration
18 | instruction_model: "aws_sdk_claude37_sonnet@thinking"
19 | judgement_model: "aws_sdk_claude37_sonnet@thinking"
20 | max_workers: 8
21 | max_tokens: 20000
22 | thinking_budget_tokens: 16000
23 |
24 | generation:
25 | base_output_dir: "./outputs/patch"
26 | parallel_models: true
27 | parallel_configs: false
28 | max_model_workers: 16
29 | models:
30 | - name: "deepseek-v3-250324"
31 | configs:
32 | - temperature: 0.0
33 | n_responses: 1
34 | max_workers: 48
35 | - temperature: 0.6
36 | n_responses: 20
37 | max_workers: 48
38 | - name: "aws_sdk_claude37_sonnet"
39 | configs:
40 | - temperature: 0.0
41 | n_responses: 1
42 | max_workers: 4
43 | - temperature: 0.6
44 | n_responses: 20
45 | max_workers: 4
46 | - name: "aws_sdk_claude37_sonnet@thinking"
47 | configs:
48 | - temperature: 0.0
49 | n_responses: 20
50 | max_workers: 8
51 | - name: "deepseek-r1-250120"
52 | configs:
53 | - temperature: 0.0
54 | n_responses: 1
55 | max_workers: 32
56 | - temperature: 0.6
57 | n_responses: 20
58 | max_workers: 32
59 | - name: "gpt-4o-2024-08-06"
60 | configs:
61 | - temperature: 0.0
62 | n_responses: 1
63 | max_workers: 4
64 | - temperature: 0.6
65 | n_responses: 20
66 | max_workers: 4
67 | - name: "doubao-1-5-pro-32k-250115"
68 | configs:
69 | - temperature: 0.0
70 | n_responses: 1
71 | max_workers: 32
72 | - temperature: 0.6
73 | n_responses: 20
74 | max_workers: 64
75 | - name: "o3-mini"
76 | configs:
77 | - temperature: 0.0
78 | n_responses: 20
79 | max_workers: 4
80 | - name: "gemini-2.5-pro-preview-03-25"
81 | configs:
82 | - temperature: 0.0
83 | n_responses: 1
84 | max_workers: 64
85 | - temperature: 0.6
86 | n_responses: 20
87 | max_workers: 64
88 |
89 | verification:
90 | eleanstic_config: "./src/eleanstic/config.yaml"
91 | results_dir: "./verify_results"
92 | max_workers: 180
93 |
94 | judgement:
95 | model_name: "aws_sdk_claude37_sonnet@thinking"
96 | temperature: 0.0
97 | n_responses: 4
98 | max_workers: 64
99 |
100 | evaluation:
101 | k_ratio: 0.8
102 | generate_plots: true
103 | plots_dir: "./plots"
--------------------------------------------------------------------------------
/src/eleanstic/utils/sys_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 | """
3 | A script to find and kill all processes containing a specific command pattern
4 | """
5 |
6 | import os
7 | import subprocess
8 | import sys
9 | import signal
10 |
11 | def find_and_kill_processes(pattern):
12 | """
13 | Find and kill all processes containing the specified pattern
14 |
15 | Args:
16 | pattern: Command pattern to search for
17 |
18 | Returns:
19 | killed_count: Number of processes killed
20 | """
21 | # Use ps command to find all processes
22 | try:
23 | ps_output = subprocess.check_output(
24 | ["ps", "-ef"],
25 | universal_newlines=True
26 | )
27 | except subprocess.SubprocessError as e:
28 | print(f"Error running ps command: {e}")
29 | return 0
30 |
31 | killed_count = 0
32 | current_pid = os.getpid() # Get current script's PID
33 |
34 | # Iterate through all process lines
35 | for line in ps_output.strip().split('\n')[1:]: # Skip header line
36 | parts = line.split()
37 | if len(parts) < 8:
38 | continue
39 |
40 | pid = int(parts[1])
41 | cmd = ' '.join(parts[7:])
42 |
43 | # If a matching process is found and it's not the current script itself
44 | if pattern in cmd and pid != current_pid:
45 | try:
46 | print(f"Terminating process {pid}: {cmd}")
47 | os.kill(pid, signal.SIGTERM)
48 | killed_count += 1
49 | except OSError as e:
50 | print(f"Error terminating process {pid}: {e}")
51 |
52 | return killed_count
53 |
54 | if __name__ == "__main__":
55 | # Command pattern to search for
56 | for pattern in ["eleanstic", "lean", "lake"]:
57 | print(f"Finding and terminating processes containing '{pattern}'...")
58 | killed = find_and_kill_processes(pattern)
59 |
60 | if killed == 0:
61 | print("No matching processes found")
62 | else:
63 | print(f"Successfully terminated {killed} processes")
64 |
65 | # Helper commands for monitoring disk space and directory sizes:
66 | # 1. Monitor free space on the mounted volume every 20 seconds:
67 | # while true; do echo "$(date) - Storage space: $(df -h | grep -E '/mnt/bd/ape-bench-dev$' | awk '{print $4}')"; sleep 20; done
68 |
69 | # 2. Monitor used space on root and size of verify database storage directory every 60 seconds:
70 | # while true; do echo "$(date) - Storage space: $(df -h | grep -E '/$' | awk '{print $3}') - Directory size: $(du -sh /mnt/bd/ape-bench-dev/ape-bench1/datasets/verify_database/storage/partitions 2>/dev/null || echo 'Cannot access')"; sleep 60; done
71 |
72 | # 3. Monitor used space on mounted volume and size of verify database storage partitions every 20 seconds:
73 | # while true; do echo "$(date) - Storage space: $(df -h | grep -E '/mnt/bd/ape-bench-dev$' | awk '{print $3}') - Directory size: $(du -sh /mnt/bd/ape-bench-dev/ape-bench1/datasets/verify_database/storage/partitions 2>/dev/null | awk '{print $1}')"; sleep 20; done
--------------------------------------------------------------------------------
/src/utils/file_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | import os
4 | import glob
5 | import json
6 | import pandas as pd
7 | from tqdm import tqdm
8 | import numpy as np
9 |
10 | def convert_to_serializable(obj):
11 | if isinstance(obj, (str, int, float, bool, type(None))):
12 | return obj
13 | elif isinstance(obj, (set, frozenset)):
14 | return list(obj)
15 | elif isinstance(obj, dict):
16 | return {key: convert_to_serializable(value) for key, value in obj.items()}
17 | elif isinstance(obj, (list, tuple)):
18 | return [convert_to_serializable(item) for item in obj]
19 | elif hasattr(obj, 'isoformat'):
20 | return obj.isoformat()
21 | elif isinstance(obj, np.ndarray):
22 | return [convert_to_serializable(item) for item in obj]
23 | else:
24 | try:
25 | return str(obj)
26 | except:
27 | return None
28 |
29 | def load_results(file_paths):
30 | """
31 | Load results from files matching the pattern.
32 |
33 | Args:
34 |         file_paths (str or list): A glob pattern, or a list of glob patterns,
35 |             matching result files (.parquet, .jsonl, or .json)
36 |
37 | Returns:
38 | pd.DataFrame: Combined DataFrame from all matching files
39 | """
40 | if isinstance(file_paths, list):
41 | file_paths = [file_path for file_path_pattern in file_paths for file_path in glob.glob(file_path_pattern)]
42 | if isinstance(file_paths, str):
43 | file_paths = glob.glob(file_paths)
44 |
45 | if not file_paths:
46 | print(f"Warning: No files found matching {file_paths}")
47 | return pd.DataFrame()
48 |
49 | print(f"Found {len(file_paths)} files matching {file_paths}")
50 |
51 | all_data = []
52 | for file_path in tqdm(file_paths, desc="Loading files"):
53 | try:
54 | if file_path.endswith('.parquet'):
55 | df = pd.read_parquet(file_path)
56 | elif file_path.endswith('.jsonl') or file_path.endswith('.json'):
57 | # Read JSONL files
58 | records = load_jsonl(file_path)
59 | df = pd.DataFrame(records)
60 | else:
61 | print(f"Warning: Unsupported file format for {file_path}")
62 | continue
63 |
64 | all_data.append(df)
65 | except Exception as e:
66 | print(f"Error loading {file_path}: {str(e)}")
67 |
68 | if not all_data:
69 |         print(f"Warning: Could not load any data from files matching {file_paths}")
70 | return pd.DataFrame()
71 |
72 | combined_data = pd.concat(all_data, ignore_index=True)
73 | print(f"Loaded {len(combined_data)} entries from {len(all_data)} files")
74 |
75 | return combined_data
76 |
77 | def load_jsonl(input_path):
78 | with open(input_path, 'r') as f:
79 | data = [json.loads(line.strip()) for line in f]
80 | return data
81 |
82 | def save_jsonl(data, output_path):
83 | with open(output_path, 'w') as f:
84 | if isinstance(data, pd.DataFrame):
85 | for _, row in data.iterrows():
86 | f.write(json.dumps(row.to_dict(), ensure_ascii=False) + '\n')
87 | else:
88 | for item in data:
89 | f.write(json.dumps(item, ensure_ascii=False) + '\n')
90 |
--------------------------------------------------------------------------------
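
A brief, hedged usage sketch for these helpers (the paths below are placeholders):

```python
# Hypothetical round trip with the helpers above; all paths are placeholders.
from src.utils.file_utils import load_results, load_jsonl, save_jsonl

# Combine every matching .parquet/.jsonl result file into one DataFrame.
df = load_results(["outputs/patch/*.jsonl", "outputs/patch/*.parquet"])

# save_jsonl accepts either a DataFrame or a list of dicts.
save_jsonl(df.head(10), "outputs/sample_results.jsonl")
records = load_jsonl("outputs/sample_results.jsonl")
print(len(records))
```
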
/submission.py:
--------------------------------------------------------------------------------
1 | """DO NOT rename this file!"""
2 | import os
3 | import re
4 | import json
5 | import textwrap
6 | import sys
7 |
8 | import openai
9 |
10 | from tqdm import tqdm
11 |
12 |
13 | class Submission:
14 | """A submission template. """
15 |
16 | def __init__(self, output_file: str):
17 | """You need to specify the following arguments."""
18 |
19 | self.output_file = output_file
20 |
21 | self.task = "Auto_Formalization" # [Auto_Formalization, Auto_Informalization]
22 | self.phase = "development" # [development, final]
23 |
24 | self.base_url = "http://120.77.8.29:12345/v1/" # The base url of the model server
25 | # If you are using OpenAI API or have set API key for
26 | # your own model, please fill in your API key
27 | self.api_key = "EMPTY"
28 | self.model = "./Mistral-7B-Instruct-v0.2" # Your own model path, or GPTs
29 | self.prompt = textwrap.dedent("""
30 |         You are a math expert and familiar with Lean 3 formal language.
31 | Now please translate the following statement and solution of a math
32 | word problem into Lean 3 formal solution. Please note that the
33 | informal solution and the formal solution need to be identical.
34 |         # Problem: {informal_statement}
35 |         # Solution: {informal_proof}
36 | # Formal solution in Lean 3:
37 | """)
38 |
39 | # custom generation parameters
40 | self.max_tokens = 256
41 | self.temperature = 0.9
42 | self.top_p = 0.7
43 | self.frequency_penalty = 0.0
44 |
45 | def generate(self, prompt):
46 | """We DO NOT recommend modifying this function, as
47 |         it will be used to test if the model is accessible"""
48 |
49 | openai.api_key = self.api_key
50 | openai.base_url = self.base_url
51 |
52 | messages = [
53 | {"role": "user", "content": prompt},
54 | ]
55 |
56 | completion = openai.chat.completions.create(
57 | model=self.model, messages=messages, max_tokens=self.max_tokens,
58 | temperature=self.temperature, top_p=self.top_p,
59 | frequency_penalty=self.frequency_penalty,
60 | )
61 |
62 | return completion.choices[0].message.content
63 |
64 | def post_process(self, model_output: str):
65 | """You can post-process the model output here,
66 | such as extracting the formal proof from the model output."""
67 |
68 | formal_proof = re.findall(r'```[\S\s]*```', model_output)
69 | if formal_proof == []:
70 | formal_proof = re.findall(r'```[\S\s]*', model_output)
71 | if formal_proof == []:
72 | formal_proof = [model_output]
73 | formal_proof = formal_proof[-1].strip()
74 |
75 | lean_code = "\n".join(formal_proof.strip().split("\n")[1:-1]) # remove ```lean ```
76 | lean_code = re.sub(pattern=r'line [0-9]* ', repl='', string=lean_code) # remove line *
77 |
78 | return lean_code
79 |
80 | def run(self, input_data: str):
81 | """Run your model on the given input data, and store the
82 | predictions into the output file."""
83 |
84 | with open(input_data, 'r', encoding="utf8") as f:
85 | datas = json.load(f)
86 |
87 | outputs = []
88 | for data in tqdm(datas[:10], file=sys.stdout):
89 | input_text = self.prompt.format(
90 | informal_statement=data["informal_statement"],
91 | informal_proof=data["informal_proof"]
92 | )
93 |
94 | output = self.generate(prompt=input_text)
95 | outputs.append(dict(
96 | name=data["name"],
97 | formal_proof=self.post_process(output),
98 | ))
99 |
100 | if not os.path.exists(self.output_file):
101 | os.makedirs(os.path.dirname(self.output_file), exist_ok=True)
102 | with open(self.output_file, 'w', encoding='utf8') as f:
103 | json.dump(outputs, f, indent=4, ensure_ascii=False)
104 |
--------------------------------------------------------------------------------
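
A minimal driver sketch for this template (editor's illustration; `dev_data.json` is a placeholder whose records are assumed to carry the `name`, `informal_statement`, and `informal_proof` fields used in `run`):

```python
# Hypothetical driver for the Submission template above; the input path is a placeholder.
from submission import Submission

submission = Submission(output_file="outputs/predictions.json")
submission.run("dev_data.json")   # writes predictions as a JSON list to output_file
```
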
/src/apebench/config/config_manager.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 | """
3 | Configuration management module responsible for loading, validating, and providing access to configurations
4 | """
5 |
6 | import os
7 | import json
8 | import yaml
9 | from typing import Dict, Any, Union
10 | from .default_config import DEFAULT_CONFIG
11 |
12 | class ConfigDict:
13 | """Class that allows dictionary data to be accessed via attributes"""
14 |
15 | def __init__(self, config_data: Dict[str, Any]):
16 | for key, value in config_data.items():
17 | if isinstance(value, dict):
18 | setattr(self, key, ConfigDict(value))
19 | else:
20 | setattr(self, key, value)
21 |
22 | def to_dict(self) -> Dict[str, Any]:
23 | """Convert configuration back to a dictionary"""
24 | result = {}
25 | for key, value in self.__dict__.items():
26 | if isinstance(value, ConfigDict):
27 | result[key] = value.to_dict()
28 | else:
29 | result[key] = value
30 | return result
31 |
32 | class ConfigManager:
33 | """Configuration manager"""
34 |
35 | def __init__(self, config_file: str = None):
36 | """
37 | Initialize the configuration manager
38 |
39 | Args:
40 | config_file: Optional path to a configuration file
41 | """
42 | # Load default configuration
43 | self.config_data = DEFAULT_CONFIG.copy()
44 |
45 | # If a configuration file is provided, load and merge it
46 | if config_file and os.path.exists(config_file):
47 | self._load_from_file(config_file)
48 |
49 | # Convert to attribute access form
50 | if config_file and not 'progress_log' in self.config_data:
51 |             self.config_data['progress_log'] = os.path.splitext(config_file)[0] + '_progress.json'
52 | self.config = ConfigDict(self.config_data)
53 |
54 | def _load_from_file(self, config_file: str) -> None:
55 | """Load configuration from file and merge it"""
56 | file_extension = os.path.splitext(config_file)[1].lower()
57 |
58 | try:
59 | if file_extension == '.json':
60 | with open(config_file, 'r') as f:
61 | user_config = json.load(f)
62 | elif file_extension in ('.yaml', '.yml'):
63 | with open(config_file, 'r') as f:
64 | user_config = yaml.safe_load(f)
65 | else:
66 | raise ValueError(f"Unsupported config file format: {file_extension}")
67 |
68 | # Recursively merge configurations
69 | self._merge_configs(self.config_data, user_config)
70 | except Exception as e:
71 | print(f"Error loading config file: {e}")
72 |
73 | def _merge_configs(self, base: Dict[str, Any], override: Dict[str, Any]) -> None:
74 | """Recursively merge configuration dictionaries"""
75 | for key, value in override.items():
76 | if key in base and isinstance(base[key], dict) and isinstance(value, dict):
77 | self._merge_configs(base[key], value)
78 | else:
79 | base[key] = value
80 |
81 | def get_config(self) -> ConfigDict:
82 | """Get the configuration object"""
83 | return self.config
84 |
85 | def save_config(self, output_file: str) -> None:
86 | """Save current configuration to a file"""
87 | file_extension = os.path.splitext(output_file)[1].lower()
88 |
89 | try:
90 | os.makedirs(os.path.dirname(output_file), exist_ok=True)
91 |
92 | if file_extension == '.json':
93 | with open(output_file, 'w') as f:
94 | json.dump(self.config_data, f, indent=2)
95 | elif file_extension in ('.yaml', '.yml'):
96 | with open(output_file, 'w') as f:
97 | yaml.dump(self.config_data, f, default_flow_style=False)
98 | else:
99 | raise ValueError(f"Unsupported config file format: {file_extension}")
100 | except Exception as e:
101 | print(f"Error saving config file: {e}")
--------------------------------------------------------------------------------
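
A short usage sketch mirroring how the scripts call the manager (`ConfigManager(args.config).get_config()`); the override file and printed values assume `configs/config.yaml` as shown in this dump, and the repository root is assumed to be on `PYTHONPATH`:

```python
# Load defaults, overlay configs/config.yaml, and read merged values via attributes.
from src.apebench.config.config_manager import ConfigManager

manager = ConfigManager("configs/config.yaml")
config = manager.get_config()

print(config.output_dir)                  # "./outputs"
print(config.generation.base_output_dir)  # "./outputs/patch"
print(config.judgement.model_name)        # "aws_sdk_claude37_sonnet@thinking"

# Optionally persist the merged configuration (output path is illustrative).
manager.save_config("outputs/merged_config.yaml")
```
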
/docs/03_core_components/03_2_apebench_data.md:
--------------------------------------------------------------------------------
1 | [English](#english-version) | [中文](#chinese-version)
2 |
3 |
4 | # 3.2 Data Handling: Tasks and Format
5 |
6 | This section describes how APE-Bench I tasks are structured, where the data comes from, and how it's handled by the `src/apebench/data/` modules.
7 |
8 | ## Task Format
9 |
10 | As specified in the APE-Bench I paper (Section 3.1), each task in the benchmark is a triplet: `(Instruction, PreFile, Patch)`.
11 |
12 | * **`Instruction`**: A natural language string describing the intended modification to a Lean file. This serves as the main prompt for the LLM being evaluated.
13 | * *Example*: "Refactor the proof of `theorem_xyz` to use `lemma_abc`." or "Add a new definition `new_function` with the following properties..."
14 | * **`PreFile`**: A string containing the complete Lean source code of the target file *before* the edit. This provides the full context for the LLM.
15 | * **`Patch`**: A string in the unified diff format that encodes the ground-truth edit. This patch, when applied to `PreFile`, should result in the desired post-edit state of the file.
16 | * This is used as the reference for evaluating LLM-generated patches, although direct diff matching is not the primary success metric (semantic correctness is key).
17 |
18 | Additional metadata associated with each task in the test set includes:
19 | * **Task ID**: A unique identifier for the task.
20 | * **Commit SHA**: The Mathlib4 commit from which the task was derived.
21 | * **File Path**: The path to the specific Lean file within the Mathlib commit.
22 | * **Task Category**: One of `Feature`, `Refactor`, or `Bug Fix` (as defined in paper Section 3.3).
23 | * **Difficulty Level**: One of `Easy`, `Medium`, or `Hard` (as defined in paper Section 3.3).
24 |
25 | ## Data Source
26 |
27 | The APE-Bench I dataset is hosted on Hugging Face:
28 | * **URL**: [https://huggingface.co/datasets/HuajianXin/APE-Bench_I](https://huggingface.co/datasets/HuajianXin/APE-Bench_I)
29 |
30 | During setup, you must clone this dataset into the `datasets/` directory within your project. The primary test dataset file is named `ape_bench1_test.parquet`.
31 |
32 | ## Data Handling in `src/apebench/data/`
33 |
34 | The modules within `src/apebench/data/` are responsible for:
35 |
36 | * **Loading Tasks**: Reading the benchmark data files (from `datasets/`) into memory, supporting both JSONL and Parquet formats.
37 | * **Parsing**: Extracting the `Instruction`, `PreFile`, `Patch`, and other metadata for each task.
38 | * **Data Representation**: Converting the raw data into Python objects for easier use throughout the application.
39 | * **Filtering/Selection**: Providing utilities to filter or select specific tasks based on criteria like ID, category, or difficulty.
40 |
41 | ---
42 |
43 |
44 | ## 中文翻译 (Chinese Translation)
45 |
46 | # 3.2 数据处理:任务与格式
47 |
48 | 本节描述 APE-Bench I 任务的结构、数据来源以及 `src/apebench/data/` 模块如何处理这些数据。
49 |
50 | ## 任务格式
51 |
52 | 正如 APE-Bench I 论文(第 3.1 节)所明确指出的,基准测试中的每个任务都是一个三元组:`(Instruction, PreFile, Patch)`。
53 |
54 | * **`Instruction` (指令)**:一个自然语言字符串,描述对 Lean 文件的预期修改。这是被评估 LLM 的主要提示。
55 | * *示例*:"将 `theorem_xyz` 的证明重构为使用 `lemma_abc`。"或"添加一个具有以下属性的新定义 `new_function`..."
56 | * **`PreFile` (修改前文件)**:一个包含编辑前目标文件完整 Lean 源代码的字符串。这为 LLM 提供了完整的上下文。
57 | * **`Patch` (补丁)**:一个统一差异格式的字符串,编码了真实的编辑。当此补丁应用于 `PreFile` 时,应产生文件所需的编辑后状态。
58 | * 这被用作评估 LLM 生成补丁的参考,尽管直接的差异匹配不是主要的成功指标(语义正确性是关键)。
59 |
60 | 测试集中与每个任务相关的其他元数据包括:
61 | * **Task ID (任务 ID)**:任务的唯一标识符。
62 | * **Commit SHA (提交 SHA)**:任务来源的 Mathlib4 提交的 SHA 值。
63 | * **File Path (文件路径)**:Mathlib 提交中特定 Lean 文件的路径。
64 | * **Task Category (任务类别)**:`Feature` (功能)、`Refactor` (重构)或`Bug Fix` (错误修复)之一(根据论文第 3.3 节定义)。
65 | * **Difficulty Level (难度级别)**:`Easy` (简单)、`Medium` (中等)或`Hard` (困难)之一(根据论文第 3.3 节定义)。
66 |
67 | ## 数据来源
68 |
69 | APE-Bench I 数据集托管在 Hugging Face 上:
70 | * **URL**: [https://huggingface.co/datasets/HuajianXin/APE-Bench_I](https://huggingface.co/datasets/HuajianXin/APE-Bench_I)
71 |
72 | 在设置过程中,您必须将此数据集克隆到项目中的 `datasets/` 目录。主要的测试数据集文件名为 `ape_bench1_test.parquet`。
73 |
74 | ## `src/apebench/data/` 中的数据处理
75 |
76 | `src/apebench/data/` 中的模块负责:
77 |
78 | * **加载任务**:将 `datasets/` 中的基准测试数据文件读入内存,支持 JSONL 和 Parquet 格式。
79 | * **解析**:为每个任务提取 `Instruction`、`PreFile`、`Patch` 和其他元数据。
80 | * **数据表示**:将原始数据转换为 Python 对象,以便在整个应用程序中更轻松地使用。
81 | * **筛选/选择**:提供根据 ID、类别或难度等标准筛选或选择特定任务的工具。
82 |
83 | ---
84 |
85 | 下一节: [LLM 推理与 DiffRepair](./03_3_apebench_inference.md)
--------------------------------------------------------------------------------
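
To make the `(Instruction, PreFile, Patch)` format described above concrete, here is a hedged sketch of peeking at the test split with pandas; the exact parquet column names are an assumption, so the schema is printed first:

```python
# Hypothetical inspection of the APE-Bench I test split; column names are assumed
# to mirror the (Instruction, PreFile, Patch) triplet described in this document.
import pandas as pd

tasks = pd.read_parquet("datasets/ape_bench1_test.parquet")
print(tasks.columns.tolist())        # check the real schema before relying on it

example = tasks.iloc[0]
print(example.get("instruction"))    # natural-language edit request (assumed name)
print(example.get("patch"))          # ground-truth unified diff (assumed name)
```
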
/src/apebench/inference/prompts/judgement_generation_prompts.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | judgement_generation_system_prompt = '''## Task
4 | Analyze the provided Lean4 code patch comprehensively to determine whether it correctly, completely, and clearly implements the specified tasks. The patch has already been verified by Lean's type checker, thus types and tactics are assumed correct. Your goal is to provide a thorough yet contextually flexible evaluation, assessing how well the patch aligns with task requirements and integrates into the existing codebase.
5 |
6 | Use the following steps and criteria **as references** to guide your analysis. **Do not mechanically adhere to these steps; instead, adapt their use according to the specific context and significance of each element in the provided code.** Aim for a comprehensive, flexible, and nuanced evaluation rather than a rigid checklist.
7 |
8 | ---
9 |
10 | ### Step 1: Task Understanding (Reference)
11 | - Summarize core and implied requirements clearly.
12 | - Identify any explicit or implicit constraints.
13 | - Clarify expected outcomes and note any ambiguities.
14 |
15 | ---
16 |
17 | ### Step 2: Original Code Analysis (Reference)
18 | - Provide a concise summary of the original code structure and purpose.
19 | - Highlight key definitions, lemmas, theorems, proofs, assumptions, or dependencies relevant to the patch.
20 | - Evaluate logical flow and proof structure as contextually appropriate.
21 |
22 | ---
23 |
24 | ### Step 3: Patch Examination (Reference)
25 | - Clearly describe the elements added, modified, or removed.
26 | - Evaluate the logical clarity, correctness, and efficacy of modifications.
27 | - Consider appropriate use of Lean4-specific features (e.g., inductive types, macros, notations).
28 |
29 | ---
30 |
31 | ### Step 4: Requirement Fulfillment Analysis (Reference)
32 | For each provided task, evaluate (as contextually relevant):
33 | - Accuracy and completeness of achieving core objectives.
34 | - Logical thoroughness and consideration of edge cases.
35 | - Mathematical and type-theoretic correctness.
36 | - Consistency with existing design patterns and coding standards.
37 |
38 | ---
39 |
40 | ### Step 5: Implementation Quality Analysis (Reference)
41 | Evaluate implementation quality with respect to:
42 | - Mathematical abstraction, modularity, and hierarchical structure.
43 | - Clarity, naming conventions, and documentation effectiveness.
44 | - Logical decomposition, proof readability, and maintainability.
45 | - Software engineering principles (single responsibility, interface rationality).
46 | - Appropriate use of Lean-specific techniques (metaprogramming, universes, computational vs. proof separation).
47 | - Future-proofing, extensibility, and integration within mathlib standards.
48 |
49 | ---
50 |
51 | ### Step 6: Overall Judgement (Required)
52 | Based on your comprehensive analysis, provide structured final grades **without additional justification**, strictly using the JSON format below for clear information extraction:
53 |
54 | ```json
55 | {
56 | "TaskEvaluations": {
57 | "Task 1": "Excellent | Good | Acceptable | Poor | Unacceptable",
58 | "Task 2": "Excellent | Good | Acceptable | Poor | Unacceptable"
59 | // Add additional tasks as necessary
60 | },
61 | "FinalOverallGrade": "Excellent | Good | Acceptable | Poor | Unacceptable"
62 | }
63 | ```
64 |
65 | ---
66 |
67 | **Reminder:** Prioritize flexible, context-sensitive analysis. Reference provided steps and criteria only as guidelines, adapting your evaluation according to actual significance and context of the provided Lean4 code patch.
68 | '''
69 |
70 | judgement_generation_input_prompt = '''# Lean4 Code Evaluation Request
71 |
72 | ## Original Source Code: {filename}
73 |
74 | ```lean
75 | {lean_code}
76 | ```
77 |
78 | ## Task Requirements
79 |
80 | {instruction}
81 |
82 | ## Proposed Implementation
83 |
84 | ```diff
85 | {raw_patch}
86 | ```
87 |
88 | Please evaluate whether this implementation properly fulfills the task requirements.
89 | '''
90 |
91 | judgement_generation_input_prompt_without_lean_code = '''# Lean4 Code Evaluation Request
92 |
93 | ## Original Source Code Status
94 |
95 | This is a new file creation with no pre-existing code.
96 |
97 | ## Task Requirements
98 |
99 | {instruction}
100 |
101 | ## Proposed Implementation
102 |
103 | ```diff
104 | {raw_patch}
105 | ```
106 |
107 | Please evaluate whether this implementation properly fulfills the task requirements.
108 | '''
--------------------------------------------------------------------------------
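
A small sketch of how these templates are filled before being sent to a model; the file name, Lean code, instruction, and diff below are placeholders:

```python
# Fill the judgement input template with placeholder task data and build a chat payload.
from src.apebench.inference.prompts import (
    judgement_generation_system_prompt,
    judgement_generation_input_prompt,
)

user_message = judgement_generation_input_prompt.format(
    filename="Mathlib/Algebra/Example.lean",   # placeholder path
    lean_code="theorem foo : 1 + 1 = 2 := by decide",
    instruction="Rename `foo` to `one_add_one` without changing the proof.",
    raw_patch="--- a/Mathlib/Algebra/Example.lean\n+++ b/Mathlib/Algebra/Example.lean\n@@ -1 +1 @@ ...",
)

messages = [
    {"role": "system", "content": judgement_generation_system_prompt},
    {"role": "user", "content": user_message},
]
```
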
/src/apebench/inference/prompts/instruction_generation_prompts.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | instruction_generation_system_prompt = '''# Task Overview
4 |
5 | Your goal is to transform given Lean code modifications (diffs) for a given Lean file into structured, precise, and self-contained Lean exercises suitable for practicing mathematical reasoning and proof engineering. Each generated exercise should be concise yet comprehensive enough for practitioners to reconstruct the exact changes based solely on the provided exercise.
6 |
7 | You will complete the following three steps explicitly and systematically. Each step must clearly connect logically to the next, ensuring an integrated, coherent result.
8 |
9 | ---
10 |
11 | Step 1: Diff Analysis
12 |
13 | Instructions:
14 | - Carefully examine each diff hunk in detail.
15 | - For **each modified Lean declaration** (`def`, `lemma`, `theorem`, `class`, `instance`, etc.):
16 | - Clearly state the diff hunk span (e.g., `@@ -12,7 +12,7 @@`).
17 | - Precisely describe what was **added, removed, or changed** within the declaration.
18 | - Clearly outline the mathematical meaning or implication of each modification.
19 | - Identify and summarize the overall mathematical context of the entire diff.
20 |
21 | ---
22 |
23 | Step 2: Dependency and Hierarchy Analysis
24 |
25 | Instructions:
26 | - Analyze the relationships among declarations identified in Step 1.
27 | - Explicitly classify declarations into:
28 | - **Core Contributions:** Declarations directly motivated by essential mathematical goals.
29 | - **Auxiliary Declarations:** Supporting or intermediate lemmas serving core contributions.
30 | - Clearly outline dependencies and hierarchical relationships among these declarations.
31 | - Explicitly state the core mathematical motivations and objectives driving the identified core contributions.
32 |
33 | ---
34 |
35 | Step 3: Exercise Generation
36 |
37 | Instructions:
38 | - Based explicitly on the Core Contributions identified in Step 2, generate one structured, self-contained Lean exercise for each core declaration.
39 | - Each exercise must:
40 | - Clearly reflect the overall mathematical context (from Step 1) and the core mathematical motivation (from Step 2).
41 | - Be formulated entirely in standard mathematical language in textbooks or academic literature, explicitly avoiding Lean-specific syntax or implementation details.
42 | - Allow practitioners to precisely reconstruct the intended modifications solely from your concise instructions.
43 | - Use imperative language for instructions ("Prove that…", "Define…", etc.).
44 |
45 | Response Format for Step 3:
46 | ```
47 | # Exercises in Lean
48 |
49 | ## Exercise 1: [Concise and Descriptive Title Reflecting Mathematical Content]
50 | - **Diff Hunk Span:** `@@ -X,Y +X,Y @@`
51 | - **Task Category:** [Feature | Bug Fix | Refactor | Chore | Testing | Documentation | Formatting]
52 | - **Focus:** [Mathematical Concepts | Software Engineering]
53 | - **Difficulty:** [Very Easy | Easy | Medium | Hard | Very Hard]
54 | - **Task Nature:** [Substantial | Superficial]
55 | - **Problem Statement (Natural Mathematical Language):**
56 | Clearly state the mathematical statement to be proved or defined. Use concise, self-contained, textbook-style language understandable to mathematicians without referencing Lean-specific syntax. If the task involves modifying an existing statement (e.g., correcting an error or clarifying logic), precisely describe the intended conceptual adjustments in purely mathematical terms. Include LaTeX-formatted mathematical expressions as needed. Ensure that instructions are imperative (e.g., "Prove that...", "Define...") and explicitly indicate the logical or conceptual emphasis required by the modification.
57 |
58 | *(Repeat explicitly for each core contribution.)*
59 | ```
60 |
61 | ---
62 |
63 | Ensure your responses strictly follow the provided formats and clearly adhere to each instruction, thus creating structured, integrated, and high-quality proof engineering exercises.
64 |
65 | '''
66 |
67 | instruction_generation_input_prompt = '''# Lean Code Modification Analysis Request
68 |
69 | ## Original File: {filename}
70 |
71 | ```lean
72 | {lean_code}
73 | ```
74 |
75 | ## Proposed Changes (Patch):
76 |
77 | ```diff
78 | {raw_patch}
79 | ```
80 |
81 | Please analyze these modifications according to the instructions provided.
82 | '''
83 |
84 | instruction_generation_input_prompt_without_lean_code = '''# Lean Code Modification Analysis Request
85 |
86 | ## Original File Status
87 | This represents a new file creation - no pre-existing code.
88 |
89 | ## Proposed File Content (Patch):
90 |
91 | ```diff
92 | {raw_patch}
93 | ```
94 |
95 | Please analyze this new file according to the instructions provided.
96 | '''
--------------------------------------------------------------------------------
/src/apebench/inference/prompts/patch_generation_prompts.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | patch_generation_system_prompt = """You are given a set of **Task Descriptions**, each specifying modifications to an existing Lean 4 codebase (which may be optional or only partially provided). Your goal is to generate a **unified diff patch** that implements **only** the specified changes in **Lean 4 syntax**, ensuring strict adherence to Lean 4 conventions.
4 |
5 | Follow these steps:
6 |
7 | ### **Step 1: Identify Key Proving Strategies**
8 | - For each Task Description, **analyze and summarize** the key strategies involved, such as:
9 | - Lemma rewriting
10 | - Data structure modification
11 | - Function renaming
12 | - Introducing new theorems or lemmas
13 | - Other conceptual or syntactical transformations
14 | - Highlight any specialized proof techniques or high-level ideas guiding your modifications.
15 |
16 | ### **Step 2: Declaration Inventory**
17 | - List all **relevant declarations** (definitions, lemmas, theorems, data types) to be **added, removed, or modified**.
18 | - For new Lean 4 declarations:
19 | - Provide **concise, academic-style statements** or descriptions.
20 | - Explain how they integrate into the overall codebase.
21 |
22 | ### **Step 3: Determine Modification Locations**
23 | - Identify **where each modification should be applied** within the given Lean 4 codebase.
24 | - Quote relevant **original Lean code** where applicable, indicating:
25 | - **Insertion points** for new definitions, lemmas, or theorems.
26 | - **Lines to be modified**, specifying which parts require updates.
27 | - **Removals**, justifying why specific lines or declarations should be deleted.
28 |
29 | ### **Step 4: Unified Diff Patch (Lean 4)**
30 | - Present the **final patch** in **unified diff format** with **at least three lines of context before and after** each modified hunk.
31 | - Ensure the patch contains **only** the specified changes—no extraneous edits.
32 | - **Strictly enforce Lean 4 syntax**:
33 | - Check that all modifications are **Lean 4-compliant** and follow best practices.
34 | - Avoid deprecated Lean 3 syntax or tactics.
35 | - Ensure consistency with **Lean 4's module system and proof style**.
36 | - All code must be valid **Lean 4 syntax**, with **no** placeholders (`sorry`, `admit`).
37 | - Do **not** interleave commentary within the diff—explanations belong in Steps 1–3.
38 |
39 | ### **Response Format**
40 |
41 | #### **Step 1: Key Strategies**
42 | [Summarize the main strategies for each Task Description.]
43 |
44 | #### **Step 2: Declaration Inventory**
45 | [List modified, removed, or added declarations, providing concise descriptions for new ones.]
46 |
47 | #### **Step 3: Modification Locations**
48 | [Identify and quote the relevant Lean code where changes should be made. Specify insertion points, modifications, and removals.]
49 |
50 | #### **Step 4: Unified Diff Patch (Lean 4)**
51 | - **Overall Explanation of the Changes:**
52 | - [Provide a structured natural-language overview of the modifications.]
53 | - **Lean 4 Compliance Reminder:**
54 | - Clearly highlight how the diff strictly adheres to **Lean 4 syntax**, avoiding **Lean 3 syntax or tactics**.
55 | - Emphasize key changes in **Lean 4 module system, proof tactics, and syntax adaptations**.
56 | - **Final Patch in Unified Diff Format:**
57 | ```diff
58 | [Present the final patch in unified diff format, with at least three lines of context before and after each diff hunk. Ensure strict Lean 4 compliance.]
59 | ```
60 |
61 | """
62 |
63 | patch_generation_reasoning_models_system_prompt = """You are given a set of **Task Descriptions**, each specifying modifications to an existing Lean 4 codebase (which may be optional or only partially provided). Your task is to generate a **unified diff patch** that implements **only** the specified changes in **Lean 4 syntax**, ensuring strict adherence to Lean 4 conventions.
64 |
65 | Please provide the final patch in the following format:
66 |
67 | ```diff
68 | [Present the final patch in unified diff format, with at least three lines of context before and after each diff hunk. Ensure strict Lean 4 compliance.]
69 | ```
70 | """
71 |
72 | patch_generation_input_prompt = """# Lean4 Code Modification Task
73 |
74 | ## Task Requirements
75 |
76 | {instructions}
77 |
78 | ## Source Codebase: {filename}
79 |
80 | ```lean
81 | {lean_code}
82 | ```
83 |
84 | Please generate a unified diff patch that implements all specified requirements while ensuring strict adherence to Lean4 syntax and conventions.
85 | """
86 |
87 | patch_generation_input_prompt_without_lean_code = """# Lean4 Code Creation Task
88 |
89 | ## Task Requirements
90 |
91 | {instructions}
92 |
93 | ## Source Codebase Status
94 |
95 | This task requires creating a new file for {filename}. No existing code is provided.
96 |
97 | Please generate a unified diff patch that creates this file with all specified requirements while ensuring strict adherence to Lean4 syntax and conventions.
98 | """
--------------------------------------------------------------------------------
/src/apebench/inference/inference_pipelines/generate_judgement.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | from ..inference_pipelines.base import BasePipeline
4 | import re
5 | import logging
6 | import json
7 | from collections import Counter
8 |
9 | class GenerateJudgementPipeline(BasePipeline):
10 | """
11 | Pipeline for generating judgements on the quality of generated patches.
12 |
13 | Assesses completeness, accuracy, scope, and coding style of implementations.
14 | """
15 | def __init__(self, args):
16 | super().__init__(args)
17 | from ..prompts import (
18 | judgement_generation_system_prompt,
19 | judgement_generation_input_prompt,
20 | judgement_generation_input_prompt_without_lean_code
21 | )
22 | self.system_prompt = judgement_generation_system_prompt
23 | self.input_prompt = judgement_generation_input_prompt
24 | self.input_prompt_without_lean_code = judgement_generation_input_prompt_without_lean_code
25 | self.criteria_list = ['unacceptable', 'poor', 'acceptable', 'good', 'excellent']
26 |
27 | def get_input(self, row):
28 |         if 'exercises' not in row:
29 |             assert len(row['responses']) == 1, f"Expected 1 response, got {len(row['responses'])}"
30 |             row['exercises'] = row['responses'][0]['exercises']
31 |         if 'full_instruction' not in row:
32 | exercises = row['exercises']
33 | instructions = [f"- Task {idx + 1}: {exercise['title']}\n\n{exercise['instruction']}" for idx, exercise in enumerate(exercises)]
34 | full_instruction = '\n\n\n'.join(instructions)
35 | row['full_instruction'] = full_instruction
36 | else:
37 | full_instruction = row['full_instruction']
38 |
39 | # Format input for verification
40 | if not row['content_before']:
41 | formatted_input = self.input_prompt_without_lean_code.format(
42 | instruction=full_instruction,
43 | raw_patch=row[self.args.patch_key].strip()
44 | )
45 | else:
46 | formatted_input = self.input_prompt.format(
47 | instruction=full_instruction,
48 | raw_patch=row[self.args.patch_key].strip(),
49 | lean_code=row['content_before'].strip(),
50 | filename=row['file_path_after']
51 | )
52 | return formatted_input
53 |
54 | def initialize_metadata(self, row):
55 | """Initialize metadata for a row using Counter"""
56 | return {
57 | 'worst_judgement': None,
58 | 'majority_judgement': None,
59 | 'judgement_counter': Counter(), # Use Counter instead of list
60 | }
61 |
62 | def update_metadata_per_response(self, metadata, parsed_response):
63 | """Update metadata with response using Counter"""
64 | if parsed_response is not None and 'TaskEvaluations' in parsed_response:
65 | key = 'judgement'
66 | worst_key = f'worst_{key}'
67 | majority_key = f'majority_{key}'
68 | counter_key = f'{key}_counter'
69 | for task_evaluation in parsed_response['TaskEvaluations'].values():
70 | value = task_evaluation.lower()
71 | if value in self.criteria_list:
72 | if metadata[worst_key] is None or self.criteria_list.index(value) < self.criteria_list.index(metadata[worst_key]):
73 | metadata[worst_key] = value
74 | metadata[counter_key].update([value])
75 | if metadata[counter_key]:
76 | metadata[majority_key] = metadata[counter_key].most_common(1)[0][0]
77 | return metadata
78 |
79 | def update_metadata_per_row(self, metadata, responses):
80 | """Update metadata with responses"""
81 | counter_key = 'judgement_counter'
82 | majority_key = 'majority_judgement'
83 | if metadata[counter_key]:
84 | metadata[majority_key] = metadata[counter_key].most_common(1)[0][0]
85 |
86 | metadata.pop(counter_key)
87 | return metadata
88 |
89 | def parse_response(self, response, row):
90 | """Parse verification response into structured dictionary"""
91 | try:
92 | json_blocks = re.findall(r'```json(.*?)```', response, re.DOTALL)
93 | if len(json_blocks) == 0:
94 | json_blocks = re.findall(r'{.*"TaskEvaluations".*}', response, re.DOTALL)
95 | if len(json_blocks) == 0:
96 |                     raise ValueError("No JSON block containing 'TaskEvaluations' found in response")
97 | json_block = json_blocks[-1]
98 | else:
99 | json_block = json_blocks[-1]
100 | parsed_response = json.loads(json_block)
101 | return parsed_response
102 | except Exception as e:
103 | logging.error(f"Error parsing GPT response: {e}")
104 | return None
105 |
106 |
--------------------------------------------------------------------------------
/src/apebench/inference/utils/chat_logger.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | import os
4 | import json
5 | from datetime import datetime
6 | import logging
7 | from typing import Dict, Any, Optional
8 | import fcntl
9 | import threading
10 | from filelock import FileLock
11 | import uuid
12 |
13 | class ChatLogger:
14 | def __init__(self, log_dir: str = "chat_logs"):
15 | """
16 | Initialize the ChatLogger.
17 |
18 | Args:
19 | log_dir (str): Directory where log files will be stored
20 | """
21 | self.log_dir = log_dir
22 | self._setup_logging()
23 | self._lock = threading.Lock()
24 |
25 | def _setup_logging(self):
26 | """Set up the logging directory and basic configuration."""
27 | # Create log directory if it doesn't exist
28 | os.makedirs(self.log_dir, exist_ok=True)
29 |
30 | # Set up basic logging with thread safety
31 | logging.basicConfig(
32 | level=logging.INFO,
33 | format='%(asctime)s - %(threadName)s - %(levelname)s - %(message)s'
34 | )
35 | self.logger = logging.getLogger(__name__)
36 |
37 | def _get_log_filename(self) -> str:
38 | """Generate a filename for the current day's log."""
39 | current_date = datetime.now().strftime("%Y-%m-%d")
40 | return os.path.join(self.log_dir, f"chat_log_{current_date}.jsonl")
41 |
42 | def log_chat(self,
43 | prompt: str,
44 | completion: Dict[str, Any],
45 | model_name: str,
46 | system_prompt: Optional[str] = None) -> None:
47 | """
48 | Log a chat interaction to a JSONL file in a thread-safe manner.
49 |
50 | Args:
51 | prompt (str): The user prompt
52 | completion (Dict): The completion response
53 | model_name (str): Name of the model used
54 | system_prompt (Optional[str]): System prompt if used
55 | """
56 | log_entry = {
57 | "id": str(uuid.uuid4()), # Add unique identifier for each log entry
58 | "timestamp": datetime.now().isoformat(),
59 | "model_name": model_name,
60 | "system_prompt": system_prompt,
61 | "prompt": prompt,
62 | "completion": completion,
63 | "thread_id": threading.get_ident()
64 | }
65 |
66 | log_file = self._get_log_filename()
67 | lock_file = f"{log_file}.lock"
68 |
69 | # Use FileLock for cross-process locking
70 | with FileLock(lock_file):
71 | try:
72 | with open(log_file, "a", encoding="utf-8") as f:
73 | # Use fcntl for file-level locking (UNIX systems only)
74 | fcntl.flock(f.fileno(), fcntl.LOCK_EX)
75 | try:
76 | json.dump(log_entry, f, ensure_ascii=False)
77 | f.write("\n")
78 | f.flush() # Ensure the write is committed to disk
79 | finally:
80 | fcntl.flock(f.fileno(), fcntl.LOCK_UN)
81 | self.logger.info(f"Successfully logged chat interaction (ID: {log_entry['id']}) to {log_file}")
82 | except Exception as e:
83 | self.logger.error(f"Failed to log chat interaction: {str(e)}")
84 |
85 | def get_chat_history(self,
86 | date_str: Optional[str] = None,
87 | thread_id: Optional[int] = None) -> list:
88 | """
89 | Retrieve chat history for a specific date or current date if not specified.
90 |
91 | Args:
92 | date_str (Optional[str]): Date in format 'YYYY-MM-DD'
93 | thread_id (Optional[int]): Filter logs by specific thread ID
94 |
95 | Returns:
96 | list: List of chat interactions for the specified date
97 | """
98 | if date_str is None:
99 | date_str = datetime.now().strftime("%Y-%m-%d")
100 |
101 | log_file = os.path.join(self.log_dir, f"chat_log_{date_str}.jsonl")
102 | lock_file = f"{log_file}.lock"
103 |
104 | if not os.path.exists(log_file):
105 | return []
106 |
107 | try:
108 | with FileLock(lock_file):
109 | with open(log_file, "r", encoding="utf-8") as f:
110 | # Use fcntl for file-level locking
111 | fcntl.flock(f.fileno(), fcntl.LOCK_SH)
112 | try:
113 | logs = [json.loads(line) for line in f]
114 | finally:
115 | fcntl.flock(f.fileno(), fcntl.LOCK_UN)
116 |
117 | # Filter by thread_id if specified
118 | if thread_id is not None:
119 | logs = [log for log in logs if log.get("thread_id") == thread_id]
120 |
121 | return logs
122 | except Exception as e:
123 | self.logger.error(f"Failed to read chat history: {str(e)}")
124 | return []
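125 | 
126 | # Usage sketch (illustrative, not part of the library API): the shape of the
127 | # completion dict depends on the API client in use; values below are placeholders.
128 | if __name__ == "__main__":
129 |     demo_logger = ChatLogger(log_dir="chat_logs_demo")
130 |     demo_logger.log_chat(
131 |         prompt="What is 2 + 2?",
132 |         completion={"content": "4"},
133 |         model_name="example-model",
134 |         system_prompt="You are a helpful assistant."
135 |     )
136 |     history = demo_logger.get_chat_history()
137 |     print(f"Logged {len(history)} interaction(s) today.")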
--------------------------------------------------------------------------------
/src/apebench/inference/run_inference.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
3 |
4 | """
5 | Unified entry point for the ApeBench pipeline system.
6 |
7 | This script provides a command-line interface to run different data processing
8 | pipelines for the ApeBench system, including instruction generation, patch
9 | generation, and judgment generation.
10 | """
11 |
12 | import argparse
13 | import os
14 | import sys
15 | from datetime import datetime
16 |
17 | # Import pipeline classes
18 | from .inference_pipelines import GenerateInstructionPipeline, GeneratePatchPipeline, GenerateJudgementPipeline
19 |
20 | def parse_arguments():
21 | """Parse command line arguments"""
22 | parser = argparse.ArgumentParser(description="ApeBench Unified Pipeline Entry Point")
23 |
24 | # General arguments for all pipelines
25 | parser.add_argument("--pipeline", type=str, required=True,
26 | choices=["instruction", "patch", "judgement"],
27 | help="Pipeline type to run")
28 | parser.add_argument("--input_file", type=str, required=True, help="Path to the input file")
29 | parser.add_argument("--output_dir", type=str, default="./outputs",
30 | help="Directory to save output files")
31 | parser.add_argument("--output_file", type=str,
32 | help="Path to the output file (if not specified, will be auto-generated)")
33 | parser.add_argument("--log_dir", type=str, default="./logs",
34 | help="Directory to save log files")
35 | parser.add_argument("--timestamp", type=str,
36 | help="Timestamp to use for filenames (default: current time)")
37 | parser.add_argument("--max_workers", type=int, default=1,
38 | help="Maximum number of parallel workers")
39 | parser.add_argument("--max_retries", type=int, default=10,
40 | help="Maximum number of retries for failed rows")
41 | parser.add_argument("--model_name", type=str, required=True,
42 | help="Name of the model to use for inference")
43 | parser.add_argument("--n_responses", type=int, default=1,
44 | help="Number of responses to generate for each input")
45 | parser.add_argument("--temperature", type=float, default=0.0,
46 | help="Temperature for the model")
47 | parser.add_argument("--max_tokens", type=int, default=8000,
48 | help="Maximum number of tokens to generate")
49 | parser.add_argument("--thinking_budget_tokens", type=int, default=6000,
50 | help="Budget tokens for thinking")
51 |
52 | # Arguments specific to the instruction pipeline
53 | parser.add_argument("--gold_diff_key", type=str, default="gold_diff",
54 | help="Key in the input data for the gold diff (for instruction pipeline)")
55 |
56 | # Arguments specific to the judgment pipeline
57 | parser.add_argument("--patch_key", type=str, default="best_gen_patch_comment_free",
58 | help="Key in the input data for the patch to judge (for judgment pipeline)")
59 |
60 | # Arguments specific to the patch pipeline
61 | parser.add_argument("--force_complete_prompt", action="store_true",
62 | help="Force complete prompt")
63 | parser.add_argument("--force_reasoning_prompt", action="store_true",
64 |                        help="Force concise prompt for reasoning models")
65 |
66 | return parser.parse_args()
67 |
68 | def select_pipeline(args):
69 | """Select the appropriate pipeline based on arguments"""
70 | if args.pipeline == "instruction":
71 | return GenerateInstructionPipeline(args)
72 | elif args.pipeline == "patch":
73 | return GeneratePatchPipeline(args)
74 | elif args.pipeline == "judgement":
75 | return GenerateJudgementPipeline(args)
76 | else:
77 | raise ValueError(f"Unknown pipeline type: {args.pipeline}")
78 |
79 | def main():
80 | """Main entry point"""
81 | # Parse command line arguments
82 | args = parse_arguments()
83 |
84 | # Set default timestamp if not provided
85 | if args.timestamp is None:
86 | args.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
87 |
88 | # Create output directory if it doesn't exist
89 | os.makedirs(args.output_dir, exist_ok=True)
90 |
91 | # Create log directory if it doesn't exist
92 | os.makedirs(args.log_dir, exist_ok=True)
93 |
94 | # Print banner
95 | print("\n" + "="*80)
96 | print(f" ApeBench Pipeline: {args.pipeline.upper()}")
97 | print("="*80 + "\n")
98 |
99 | # Select and initialize the appropriate pipeline
100 | pipeline = select_pipeline(args)
101 |
102 | # Run the pipeline
103 | print(f"Starting {args.pipeline} pipeline with model {args.model_name}...\n")
104 | total_processed, total_errors, failed_indices = pipeline.process_data()
105 |
106 | # Print summary
107 | print("\n" + "="*80)
108 | print(f" SUMMARY: {args.pipeline.upper()} PIPELINE")
109 | print("="*80)
110 | print(f"Total processed: {total_processed}")
111 | print(f"Total errors: {total_errors}")
112 | print(f"Failed indices: {len(failed_indices)}")
113 | print(f"Success rate: {(total_processed / (total_processed + total_errors) if total_processed + total_errors > 0 else 0) * 100:.2f}%")
114 | print("="*80 + "\n")
115 |
116 | return 0
117 |
118 | if __name__ == "__main__":
119 | sys.exit(main())
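120 | 
121 | # Example invocation (illustrative; the input file path and model name are placeholders):
122 | #   python -m src.apebench.inference.run_inference \
123 | #       --pipeline patch \
124 | #       --input_file ./datasets/ape_bench1_test.parquet \
125 | #       --model_name <model-identifier> \
126 | #       --n_responses 1 \
127 | #       --max_workers 4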
--------------------------------------------------------------------------------
/src/apebench/inference/inference_pipelines/generate_instruction.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | from ...inference.inference_pipelines.base import BasePipeline
4 | import re
5 | import logging
6 | from collections import Counter
7 |
8 | class GenerateInstructionPipeline(BasePipeline):
9 | """
10 | Pipeline for generating instructions from patches.
11 |
12 | This processor extracts structured information about code contributions and exercises.
13 | """
14 |
15 | def __init__(self, args):
16 | super().__init__(args)
17 | from ..prompts import (
18 | instruction_generation_system_prompt,
19 | instruction_generation_input_prompt,
20 | instruction_generation_input_prompt_without_lean_code
21 | )
22 | self.system_prompt = instruction_generation_system_prompt
23 | self.input_prompt = instruction_generation_input_prompt
24 | self.input_prompt_without_lean_code = instruction_generation_input_prompt_without_lean_code
25 | self.task_nature_list = ['superficial', 'substantial']
26 | self.difficulty_list = ['very easy', 'easy', 'medium', 'hard', 'very hard']
27 | self.task_category_list = ['bug fix', 'refactor', 'feature']
28 |
29 | def get_input(self, row):
30 | if not row['content_before']:
31 | formatted_input = self.input_prompt_without_lean_code.format(
32 | raw_patch=row[self.args.gold_diff_key].strip()
33 | )
34 | else:
35 | formatted_input = self.input_prompt.format(
36 | raw_patch=row[self.args.gold_diff_key].strip(),
37 | lean_code=row['content_before'].strip(),
38 | filename=row['file_path_before']
39 | )
40 | return formatted_input
41 |
42 | def initialize_metadata(self, row):
43 | """Initialize metadata for a row using Counter"""
44 | return {
45 | 'worst_difficulty': None,
46 | 'worst_task_nature': None,
47 | 'majority_difficulty': None,
48 | 'majority_task_nature': None,
49 | 'majority_task_category': None,
50 | 'difficulty_counter': Counter(),
51 | 'task_nature_counter': Counter(),
52 | 'task_category_counter': Counter(),
53 | }
54 |
55 | def update_metadata_per_response(self, metadata, parsed_response):
56 | """Update metadata with response using Counter"""
57 | if parsed_response is not None:
58 | for exercise in parsed_response['exercises']:
59 | for key, criteria in zip(
60 | ('difficulty', 'task_nature', 'task_category'),
61 | (self.difficulty_list, self.task_nature_list, self.task_category_list),
62 | ):
63 | worst_key = f'worst_{key}'
64 | majority_key = f'majority_{key}'
65 | counter_key = f'{key}_counter'
66 | value = exercise[key].lower()
67 | if value in criteria:
68 | if worst_key in metadata and (metadata[worst_key] is None or criteria.index(value) < criteria.index(metadata[worst_key])):
69 | metadata[worst_key] = value
70 | metadata[counter_key].update([value])
71 | if metadata[counter_key]:
72 | metadata[majority_key] = metadata[counter_key].most_common(1)[0][0]
73 | return metadata
74 |
75 | def update_metadata_per_row(self, metadata, responses):
76 | """Update metadata with responses"""
77 | for key in ('difficulty', 'task_nature'):
78 | counter_key = f'{key}_counter'
79 | majority_key = f'majority_{key}'
80 | if metadata[counter_key]:
81 | metadata[majority_key] = metadata[counter_key].most_common(1)[0][0]
82 |
83 | metadata.pop('difficulty_counter')
84 | metadata.pop('task_nature_counter')
85 | return metadata
86 |
87 | def _extract_exercises(self, exercise_text):
88 | # Extract Exercises
89 | split_pos = exercise_text.find('Exercises in Lean')
90 | assert split_pos != -1
91 | exercise_text = exercise_text[split_pos:].strip(' \n-')
92 | exercises = []
93 | exercise_pattern = r'Exercise[\*\s:]*(\d+)[\*\s]*:[\*\s:]*(.*?)[-\*\s]*Diff Hunk Span.*?@@(.*?)@@.*?[-\*\s]*Task Category[-\*\s:]*(.*?)[-\*\s]*Focus[-\*\s:]*(.*?)[-\*\s]*Difficulty[-\*\s:]*(.*?)[-\*\s]*Task Nature[-\*\s:]*(.*?)[-\*\s]*Problem Statement.*?[-\*:]+(.*?)(?=[-\*#\s]*Exercise|$)'
94 | exercise_blocks = re.findall(exercise_pattern, exercise_text, re.DOTALL)
95 | for num, title, hunk_span, category, focus, difficulty, nature, instruction in exercise_blocks:
96 | exercises.append({
97 | 'num': int(num),
98 |                 'title': title.strip(),
99 | 'hunk_span': hunk_span.strip(),
100 | 'focus': focus.strip(),
101 | 'difficulty': difficulty.strip(),
102 | 'task_category': category.strip(),
103 | 'task_nature': nature.strip(),
104 | 'instruction': instruction.strip()
105 | })
106 | return exercises
107 |
108 | def parse_response(self, response, row):
109 | """Parse structured data from GPT response"""
110 | try:
111 | if '(Continue similarly' in response:
112 | return None
113 |
114 | exercises = self._extract_exercises(response)
115 |
116 | assert len(exercises) > 0
117 | return {"exercises": exercises}
118 | except Exception as e:
119 | logging.error(f"Error parsing GPT response: {e}")
120 | return None
--------------------------------------------------------------------------------
/src/eleanstic/utils/lean_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | import os
4 | import time
5 | import subprocess
6 | from typing import List, Tuple
7 | import logging
8 | import threading
9 | import psutil
10 | import signal
11 | import random
12 | import tempfile
13 | import traceback
14 |
15 | def run_command(command: List[str], cwd: str, logger: logging.Logger, env: dict = None) -> Tuple[List[str], List[str], float, int]:
16 | # Record start time
17 |     start_time = time.time()
18 |     stdout_lines = []  # pre-initialize so the exception branch below can still return it
19 | # Use subprocess.run to execute command and wait for result
20 | try:
21 | result = subprocess.run(
22 | command,
23 | cwd=cwd,
24 | stdout=subprocess.PIPE,
25 | stderr=subprocess.PIPE,
26 | universal_newlines=True,
27 | env=env,
28 | check=False # Don't automatically raise exceptions, let caller handle return code
29 | )
30 |
31 | # Process output results
32 | stdout_lines = []
33 | if result.stdout:
34 | stdout_lines = [line.strip() + '\n' for line in result.stdout.splitlines() if line.strip()]
35 | # for line in result.stdout.splitlines():
36 | # if line.strip():
37 | # logger.info(f"[lake build] {line.strip()}")
38 |
39 | stderr_lines = []
40 | if result.stderr:
41 | stderr_lines = [line.strip() + '\n' for line in result.stderr.splitlines() if line.strip()]
42 | # for line in result.stderr.splitlines():
43 | # if line.strip():
44 | # logger.warning(f"[lake build stderr] {line.strip()}")
45 |
46 | returncode = result.returncode
47 |
48 | except Exception as e:
49 | logger.error(f"Command execution exception: {traceback.format_exc()}")
50 | stderr_lines = [f"Command execution exception: {traceback.format_exc()}\n"]
51 | returncode = -1
52 |
53 | # Calculate build time
54 | build_time = time.time() - start_time
55 | return stdout_lines, stderr_lines, build_time, returncode
56 |
57 | def run_lake_build(worktree_path: str, logger: logging.Logger, cache_dir: str = None) -> Tuple[bool, str]:
58 | """
59 | Run lake build command
60 |
61 | Args:
62 | worktree_path: Git worktree path
63 |
64 | Returns:
65 | Tuple[bool, str]: (success status, message)
66 | """
67 | try:
68 | # Check if worktree exists
69 | if not os.path.exists(worktree_path):
70 | return False, f"Worktree does not exist: {worktree_path}"
71 |
72 | # Execute lake build command
73 | logger.info(f"Starting to build Mathlib (worktree: {worktree_path})")
74 |
75 | # Set environment variables, specify cache directory
76 | env = os.environ.copy()
77 | if cache_dir:
78 | env["XDG_CACHE_HOME"] = cache_dir
79 | logger.info(f"Setting XDG_CACHE_HOME={cache_dir}")
80 |
81 | # Run lake build
82 | logger.info(f"Running lake build command")
83 | stdout_lines, stderr_lines, build_time, returncode = run_command(
84 | ["lake", "build"],
85 | worktree_path,
86 | logger,
87 | env
88 | )
89 |
90 | # Check build result
91 | if returncode == 0:
92 | logger.info(f"lake build completed, time taken {build_time:.1f} seconds")
93 | return True, f"lake build completed, time taken {build_time:.1f} seconds"
94 | else:
95 | error_message = "\n".join(line for line in (stderr_lines + stdout_lines) if not '] Building' in line)
96 | logger.error(f"lake build failed, time taken {build_time:.1f} seconds")
97 | return False, f"lake build failed, exit code {returncode}\n{error_message}"
98 |
99 | except Exception as e:
100 | logger.error(f"Error executing lake build: {traceback.format_exc()}")
101 | return False, f"Error executing lake build: {traceback.format_exc()}"
102 |
103 | def parse_lean_output(output):
104 | """Parse Lean output, categorize each line as error, warning or info."""
105 | results = []
106 | for line in output.splitlines():
107 | line_lower = line.lower()
108 | if "error" in line_lower:
109 | results.append({"type": "error", "message": line.strip()})
110 | elif "warning" in line_lower:
111 | results.append({"type": "warning", "message": line.strip()})
112 | else:
113 | results.append({"type": "info", "message": line.strip()})
114 | return results
115 |
116 | def verify_with_lean(content, worktree_path, logger, timeout=600):
117 | """Verify Lean file content.
118 |
119 | By creating a temporary Lean file at the given worktree path,
120 | call `lake env lean` to verify the file, and parse the output.
121 | """
122 | with tempfile.NamedTemporaryFile(mode='w+', suffix='.lean', encoding='utf-8', delete=False) as temp_file:
123 | temp_file.write(content)
124 | temp_file.flush()
125 | temp_file_name = temp_file.name
126 | try:
127 | process = subprocess.run(
128 | ["lake", "env", "lean", temp_file_name],
129 | capture_output=True,
130 | text=True,
131 | cwd=worktree_path,
132 | timeout=timeout
133 | )
134 | lean_output = parse_lean_output(process.stdout)
135 | passed = (process.returncode == 0 and not any(r.get("type") == "error" for r in lean_output))
136 | complete = passed and not any(r.get("type") == "warning" for r in lean_output)
137 | result = {
138 | "parsed_output": lean_output,
139 | "raw_output": process.stdout,
140 | "raw_stderr": process.stderr,
141 | "returncode": process.returncode,
142 | "pass": passed,
143 | "complete": complete
144 | }
145 | except Exception as e:
146 | result = {
147 |             "parsed_output": None,
148 | "system_error": traceback.format_exc(),
149 | "pass": False,
150 | "complete": False
151 | }
152 | # logger.error(traceback.format_exc())
153 | finally:
154 | import os
155 | try:
156 | os.remove(temp_file_name)
157 | except Exception:
158 | pass
159 | return result
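160 | 
161 | # Usage sketch (illustrative, not part of the library): assumes a Mathlib worktree
162 | # already prepared (e.g., via Eleanstic) and the `lake` toolchain on PATH; the path
163 | # below is a placeholder.
164 | if __name__ == "__main__":
165 |     demo_worktree = "/path/to/mathlib4_worktree"
166 |     demo_logger = logging.getLogger("lean_utils_demo")
167 |     if os.path.isdir(demo_worktree):
168 |         result = verify_with_lean("import Mathlib\n#eval 1 + 1", demo_worktree, demo_logger)
169 |         print(f"pass={result['pass']}, complete={result['complete']}")
170 |     else:
171 |         demo_logger.warning("Set demo_worktree to a built Mathlib worktree to run this example.")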
--------------------------------------------------------------------------------
/src/utils/colors.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | from copy import deepcopy
4 | import random
5 | import numpy as np
6 |
7 | # Existing color palette
8 | colors = dict(red_colors = [
9 | # "#672F2F", # dark blood red
10 | "#E16A54", # bright red
11 | "#BE3144", # red
12 | "#810000", # dark red
13 | ],
14 | green_colors = [
15 | # "#99B19C", # light green
16 | # "#A9B388", # light grass green
17 | # "#3F4F44", # green
18 | "#5F6F52", # grass green
19 | "#2C3930", # dark green
20 | ],
21 | yellow_colors = [
22 | "#DDA853", # earth yellow
23 | # "#ECE5C7", # very light brown
24 | # "#CDC2AE", # light brown
25 | "#A27B5C", # brown
26 | ],
27 | blue_colors = [
28 | # "#C2DEDC", # light blue
29 | "#116A7B", # lake blue
30 | "#27548A", # blue
31 | "#123458", # dark blue
32 | ]
33 | )
34 |
35 | def increase_grayscale(hex_color, amount=0.3):
36 | """
37 | Increase the grayscale of a hex color by reducing saturation
38 |
39 | Parameters:
40 | hex_color -- hex color string (e.g. "#672F2F")
41 | amount -- degree of grayscale increase (0.0 to 1.0)
42 |
43 | Returns:
44 | Hex color with increased grayscale
45 | """
46 | # Convert hex to RGB
47 | hex_color = hex_color.lstrip('#')
48 | r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
49 |
50 | # Reduce saturation (move towards average)
51 | avg = (r + g + b) // 3
52 | r = int(r * (1 - amount) + avg * amount)
53 | g = int(g * (1 - amount) + avg * amount)
54 | b = int(b * (1 - amount) + avg * amount)
55 |
56 | # Convert back to hex
57 | return f"#{r:02x}{g:02x}{b:02x}"
58 |
59 | class ColorPicker:
60 | def __init__(self, color_palette=None, grayscale_increment=0.1):
61 | """
62 | Initialize the color picker
63 |
64 | Parameters:
65 | color_palette -- dictionary of color groups, defaults to module's colors
66 | grayscale_increment -- grayscale amount to increase when all colors are used
67 | """
68 | if color_palette is None:
69 | color_palette = colors
70 |
71 | self.original_palette = deepcopy(color_palette) # Save original palette
72 | self.grayscale_increment = grayscale_increment
73 | self.current_palette = deepcopy(color_palette) # Working palette
74 | self.grayscale_level = 0
75 | self.used_colors = set() # Track all used colors
76 |
77 | def get_color(self):
78 | """
79 | Get a color according to rules:
80 | Pick a color from the group with fewest remaining colors, without replacement
81 | When all colors are picked, increase grayscale and reset the palette
82 |
83 | Returns:
84 | Selected color
85 | """
86 | # Check if all groups are empty
87 | while True:
88 | all_empty = True
89 | none_empty_list = []
90 | for colors_list in self.current_palette.values():
91 | if colors_list:
92 | all_empty = False
93 | none_empty_list.append(colors_list)
94 |
95 | # If all groups are empty, increase grayscale and reset palette
96 | if all_empty:
97 | self.grayscale_level += self.grayscale_increment
98 | self._reset_palette()
99 | else:
100 | break
101 |
102 | # Randomly select a group
103 | group = random.choice(none_empty_list)
104 |         # Sample an index near the middle of the selected group (normally distributed)
105 | mu = (len(group) - 1) / 2
106 | sigma = len(group) / 4
107 |
108 | # Generate normal distribution index and constrain to list range
109 | idx = max(0, min(len(group) - 1, int(np.random.normal(mu, sigma))))
110 | selected_color = group.pop(idx)
111 | self.used_colors.add(selected_color)
112 |
113 | return selected_color
114 |
115 | def _reset_palette(self):
116 | """
117 | Reset current palette, increase grayscale for each color in original palette, ensure generated colors are unique
118 | """
119 | self.current_palette = {}
120 |
121 | for group, colors_list in self.original_palette.items():
122 | self.current_palette[group] = []
123 | for color in colors_list:
124 | # Calculate new color with base grayscale increase
125 | new_color = increase_grayscale(color, self.grayscale_level)
126 |
127 | # If this color has been used, adjust grayscale until a unique color is generated
128 | attempts = 0
129 | while new_color in self.used_colors and attempts < 10:
130 | # Gradually increase grayscale to get a new color
131 | adjustment = 0.05 + attempts * 0.02
132 | new_color = increase_grayscale(color, self.grayscale_level + adjustment)
133 | attempts += 1
134 |
135 | # If still can't find a unique color, generate a fine-tuned color
136 | if new_color in self.used_colors:
137 | # Extract RGB from original color
138 | base_color = new_color.lstrip('#')
139 | r = int(base_color[0:2], 16)
140 | g = int(base_color[2:4], 16)
141 | b = int(base_color[4:6], 16)
142 |
143 | # Fine-tune RGB values, ensure within valid range
144 | r = max(0, min(255, r + random.randint(-20, 20)))
145 | g = max(0, min(255, g + random.randint(-20, 20)))
146 | b = max(0, min(255, b + random.randint(-20, 20)))
147 |
148 | new_color = f"#{r:02x}{g:02x}{b:02x}"
149 |
150 | # Ensure final generated color is unique
151 | while new_color in self.used_colors:
152 | r = max(0, min(255, r + random.randint(-10, 10)))
153 | g = max(0, min(255, g + random.randint(-10, 10)))
154 | b = max(0, min(255, b + random.randint(-10, 10)))
155 | new_color = f"#{r:02x}{g:02x}{b:02x}"
156 |
157 | self.current_palette[group].append(new_color)
158 |
159 | # Usage example
160 | if __name__ == "__main__":
161 | picker = ColorPicker()
162 | for i in range(20):
163 | color = picker.get_color()
164 | print(f"Color #{i+1}: {color}")
--------------------------------------------------------------------------------
/src/eleanstic/core/status.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | """
4 | Commit Status Management Module
5 |
6 | Implements commit status storage through the file system, each commit's status is saved in a separate file.
7 | """
8 |
9 | import os
10 | import json
11 | from datetime import datetime
12 | import traceback
13 |
14 | STATUS_PENDING = 'pending'
15 | STATUS_BUILDING = 'building'
16 | STATUS_COLLAPSED = 'collapsed'
17 | STATUS_READY = 'ready'
18 | STATUS_FAILED = 'failed'
19 | STATUS_FAILED_VERIFY = 'failed_verify'
20 |
21 | class CommitStatus:
22 | """
23 | Commit Status Management Class, implemented using the file system
24 |
25 | Each commit's status is saved in a separate file.
26 | Supports the following statuses:
27 | - 'pending': Waiting to start build
28 | - 'building': Currently building
29 | - 'ready': Build successful, not compressed
30 | - 'collapsed': Build successful, compressed
31 | - 'failed': Build failed
32 | - 'failed_verify': Built but verification failed
33 | """
34 |
35 | def __init__(self, status_dir: str = "commit_status"):
36 | """
37 | Initialize CommitStatus instance
38 |
39 | Args:
40 | status_dir: Directory to save status files
41 | """
42 | self.status_dir = status_dir
43 | os.makedirs(self.status_dir, exist_ok=True)
44 |
45 | def _get_status_file(self, commit_id: str) -> str:
46 | """Get status file path for specified commit"""
47 | return os.path.join(self.status_dir, f"{commit_id}.json")
48 |
49 | def is_commit_processed(self, commit_id: str) -> bool:
50 | """
51 | Determine if a commit has been successfully built
52 |
53 | Args:
54 | commit_id: Git commit ID
55 |
56 | Returns:
57 | bool: Returns True if status is 'ready' or 'collapsed', otherwise False
58 | """
59 | status_file = self._get_status_file(commit_id)
60 | if not os.path.exists(status_file):
61 | return False
62 |
63 | try:
64 | with open(status_file, 'r') as f:
65 | status_data = json.load(f)
66 | return status_data.get('status') in [STATUS_READY, STATUS_COLLAPSED]
67 | except (json.JSONDecodeError, IOError) as e:
68 | print(f"Error reading commit status file ({commit_id}): {e}")
69 | return False
70 |
71 | def get_commit_status(self, commit_id: str):
72 | """
73 | Get complete status information for a commit
74 |
75 | Args:
76 | commit_id: Git commit ID
77 |
78 | Returns:
79 | Dict: Dictionary containing status, message and timestamp; if not exists returns default dictionary with 'pending' status
80 | """
81 | status_file = self._get_status_file(commit_id)
82 | if not os.path.exists(status_file):
83 | return {
84 | 'commit_id': commit_id,
85 | 'status': STATUS_PENDING,
86 | 'message': None,
87 | 'updated_at': datetime.now().isoformat()
88 | }
89 |
90 | try:
91 | with open(status_file, 'r') as f:
92 | return json.load(f)
93 | except (json.JSONDecodeError, IOError) as e:
94 | print(f"Error reading commit status file ({commit_id}): {e}")
95 | return {
96 | 'commit_id': commit_id,
97 | 'status': STATUS_PENDING,
98 | 'message': f"Status file read error: {traceback.format_exc()}",
99 | 'updated_at': datetime.now().isoformat()
100 | }
101 |
102 | def update_commit_status(self, commit_id: str, status: str, message = None, additional_data = None) -> bool:
103 | """
104 | Update commit status, preserving previous status data
105 |
106 | Args:
107 | commit_id: Git commit ID
108 | status: New status ('pending', 'building', 'ready', 'collapsed', or 'failed')
109 | message: Optional status message
110 | additional_data: Optional dictionary with additional data to update
111 |
112 | Returns:
113 | bool: Returns True on successful update, False on failure
114 | """
115 | # Get existing status data if available
116 | existing_data = self.get_commit_status(commit_id)
117 |
118 | # Update with new values
119 | existing_data['status'] = status
120 | if message is not None:
121 | existing_data['message'] = message
122 | existing_data['updated_at'] = datetime.now().isoformat()
123 |
124 | # Merge additional data if provided
125 | if additional_data:
126 | for key, value in additional_data.items():
127 | existing_data[key] = value
128 |
129 | status_file = self._get_status_file(commit_id)
130 |
131 | # Ensure directory exists
132 | os.makedirs(os.path.dirname(status_file), exist_ok=True)
133 |
134 | try:
135 | with open(status_file, 'w') as f:
136 | json.dump(existing_data, f, indent=2, ensure_ascii=False)
137 | return True
138 | except IOError as e:
139 | print(f"Failed to update commit status file ({commit_id}): {e}")
140 | return False
141 |
142 | def get_all_commits_status(self):
143 | """
144 | Get status information for all commits
145 |
146 | Returns:
147 | List[Dict]: List of commit status information
148 | """
149 | result = []
150 | try:
151 | for filename in os.listdir(self.status_dir):
152 | if filename.endswith('.json'):
153 | commit_id = filename[:-5] # Remove .json suffix
154 | status = self.get_commit_status(commit_id)
155 | result.append(status)
156 | return result
157 | except OSError as e:
158 | print(f"Failed to read status directory: {e}")
159 | return []
160 |
161 | def get_commits_by_status(self, status: str):
162 | """
163 | Get all commits with a specified status
164 |
165 | Args:
166 | status: Status to filter by
167 |
168 | Returns:
169 | List[Dict]: List of commit status information matching the status
170 | """
171 | all_statuses = self.get_all_commits_status()
172 | return [item for item in all_statuses if item.get('status') == status]
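173 | 
174 | # Usage sketch (illustrative, not part of the library): tracks a single (placeholder)
175 | # commit through a build and queries its status.
176 | if __name__ == "__main__":
177 |     status_store = CommitStatus(status_dir="commit_status_demo")
178 |     demo_commit = "0123456789abcdef"  # placeholder commit id
179 |     status_store.update_commit_status(demo_commit, STATUS_BUILDING, message="build started")
180 |     status_store.update_commit_status(demo_commit, STATUS_READY, additional_data={"build_time_s": 42.0})
181 |     print(status_store.is_commit_processed(demo_commit))          # True once 'ready' or 'collapsed'
182 |     print(len(status_store.get_commits_by_status(STATUS_READY)))  # number of commits marked 'ready'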
--------------------------------------------------------------------------------
/run_ape_bench_example.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
3 |
4 |
5 | # --- Configuration ---
6 | # TODO: User should customize these paths and settings
7 | # Ensure this script is run from the root of the APE-Bench_I project.
8 |
9 | # Path to your local clone of the mathlib4 repository.
10 | # Eleanstic (src/eleanstic/config.yaml) must also be configured to point to this path.
11 | MATHLIB_REPO_PATH="./mathlib4"
12 |
13 | # Path to the directory where the APE-Bench_I dataset (from Hugging Face) is cloned.
14 | APE_BENCH_DATASET_DIR="./datasets"
15 |
16 | # Specific APE-Bench dataset file (e.g., .parquet) to be used for the benchmark.
17 | # This path should also be set in the main APE-Bench configuration file below.
18 | APE_BENCH_DATASET_FILE="${APE_BENCH_DATASET_DIR}/ape_bench1_test.parquet"
19 |
20 | # Main APE-Bench configuration file.
21 | # Ensure its 'project.input_file' points to APE_BENCH_DATASET_FILE.
22 | CONFIG_FILE="configs/config.yaml"
23 |
24 | # Eleanstic configuration file.
25 | # Ensure its 'paths.mathlib_repo' points to MATHLIB_REPO_PATH.
26 | ELEANSTIC_CONFIG_FILE="src/eleanstic/config.yaml"
27 |
28 | # --- Check for required commands ---
29 | echo "Checking for required dependencies..."
30 |
31 | # Check for lean command
32 | if ! command -v lean &> /dev/null; then
33 | echo "Error: 'lean' command not found. Please install Lean 4 before running this script."
34 | echo "Visit https://lean-lang.org/lean4/doc/quickstart.html for installation instructions."
35 | exit 1
36 | fi
37 |
38 | # Check for elan command
39 | if ! command -v elan &> /dev/null; then
40 | echo "Error: 'elan' command not found. Please install Elan (Lean version manager) before running this script."
41 | echo "Visit https://github.com/leanprover/elan for installation instructions."
42 | exit 1
43 | fi
44 |
45 | echo "All required dependencies are installed."
46 | echo "---------------------------------------------------------------------"
47 |
48 | # --- 1. Setup: Clone repositories (if not already present) ---
49 | echo "Step 1: Setting up repositories..."
50 |
51 | # Clone Mathlib4
52 | if [ ! -d "$MATHLIB_REPO_PATH" ]; then
53 | echo "Cloning mathlib4 to $MATHLIB_REPO_PATH..."
54 | git clone https://github.com/leanprover-community/mathlib4.git "$MATHLIB_REPO_PATH"
55 | if [ $? -ne 0 ]; then echo "Failed to clone mathlib4. Exiting."; exit 1; fi
56 | else
57 | echo "Mathlib4 repository found at $MATHLIB_REPO_PATH."
58 | fi
59 |
60 | if [ ! -f "$APE_BENCH_DATASET_FILE" ]; then
61 | echo "Obtaining APE-Bench_I dataset to $APE_BENCH_DATASET_DIR..."
62 |
63 | # Check if git lfs is installed and working
64 | if command -v git-lfs >/dev/null 2>&1 && git lfs install >/dev/null 2>&1; then
65 | echo "Git LFS is available. Cloning repository..."
66 | git clone https://huggingface.co/datasets/HuajianXin/APE-Bench_I "$APE_BENCH_DATASET_DIR"
67 | if [ $? -ne 0 ]; then
68 | echo "Failed to clone APE-Bench_I dataset. Exiting."
69 | exit 1
70 | fi
71 | else
72 | echo "Git LFS not available. Downloading files directly using curl..."
73 | mkdir -p "$APE_BENCH_DATASET_DIR"
74 |
75 | # Download main dataset file directly
76 | curl -L "https://huggingface.co/datasets/HuajianXin/APE-Bench_I/resolve/main/$(basename "$APE_BENCH_DATASET_FILE")" -o "$APE_BENCH_DATASET_FILE"
77 |
78 | if [ $? -ne 0 ]; then
79 | echo "Failed to download dataset file. Exiting."
80 | exit 1
81 | fi
82 | fi
83 | else
84 | echo "APE-Bench dataset file $APE_BENCH_DATASET_FILE already exists."
85 | fi
86 |
87 | echo "Repository setup complete."
88 | echo "---------------------------------------------------------------------"
89 |
90 | # --- 2. Eleanstic Build (Preprocessing Mathlib Commits) ---
91 | # This step preprocesses all Mathlib commits referenced in the target APE-Bench dataset file.
92 | # It uses Eleanstic and can be time-consuming for the first run.
93 |
94 | echo "Step 2: Eleanstic Build..."
95 | echo "IMPORTANT: Ensure Eleanstic configuration ($ELEANSTIC_CONFIG_FILE) is correct, especially 'paths.mathlib_repo'."
96 | echo "This will build Eleanstic data for commits in: $APE_BENCH_DATASET_FILE"
97 |
99 | # Assuming the parquet file contains a column named 'commit_hash' for commit SHAs.
99 | # Adjust --commit_id_key if your parquet uses a different column name for commit SHAs.
100 | python -m src.eleanstic.main \
101 | --config "$ELEANSTIC_CONFIG_FILE" \
102 | --input_file "$APE_BENCH_DATASET_FILE" \
103 | --commit_id_key commit_hash \
104 | build
105 | # --max_workers # Optional: adjust based on your system
106 |
107 | if [ $? -ne 0 ]; then echo "Eleanstic build failed. Exiting."; exit 1; fi
108 | echo "Eleanstic build complete."
109 | echo "---------------------------------------------------------------------"
110 |
111 | # --- 3. Run APE-Bench Pipeline Scripts ---
112 | # These scripts use the main APE-Bench configuration file ($CONFIG_FILE).
113 | # Ensure $CONFIG_FILE is correctly set up, especially:
114 | #   project.input_file: should point to $APE_BENCH_DATASET_FILE
115 | #   generation, verification, judgement sections as per your needs.
116 |
117 | echo "Step 3.1: Generating Patches (using $CONFIG_FILE)..."
118 | python -m src.apebench.scripts.1_generate_patches --config "$CONFIG_FILE"
119 | if [ $? -ne 0 ]; then echo "Patch generation failed. Exiting."; exit 1; fi
120 | echo "Patch generation complete."
121 | echo "---------------------------------------------------------------------"
122 |
123 |
124 | echo "Step 3.2: Verifying Patches (using $CONFIG_FILE)..."
125 | python -m src.apebench.scripts.2_verify_patches --config "$CONFIG_FILE"
126 | if [ $? -ne 0 ]; then echo "Patch verification failed. Exiting."; exit 1; fi
127 | echo "Patch verification complete."
128 | echo "---------------------------------------------------------------------"
129 |
130 |
131 | echo "Step 3.3: Evaluating Patches (using $CONFIG_FILE)..."
132 | python -m src.apebench.scripts.3_evaluate_patches --config "$CONFIG_FILE"
133 | if [ $? -ne 0 ]; then echo "Patch evaluation failed. Exiting."; exit 1; fi
134 | echo "Patch evaluation complete."
135 | echo "---------------------------------------------------------------------"
136 |
137 |
138 | echo "APE-Bench pipeline finished successfully!"
139 | echo "Check the 'outputs/' directory for results."
140 |
141 | # --- Optional: Rebuilding Data from Scratch ---
142 | # If you need to regenerate the APE-Bench dataset itself (e.g., from new Mathlib commits),
143 | # you can use the 0_collect_data.py script. This is an advanced step.
144 | echo ""
145 | echo "Optional: To rebuild the APE-Bench dataset from scratch, inspect and run:"
146 | echo "# python -m src.apebench.scripts.0_collect_data --config $CONFIG_FILE --repo_path $MATHLIB_REPO_PATH ... (other args)"
147 |
148 | exit 0
--------------------------------------------------------------------------------
/src/eleanstic/utils/log_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | """
4 | Log utilities module
5 | Supports colored log output and log file rotation
6 | """
7 | import os
8 | import logging
9 | import logging.handlers
10 | from pathlib import Path
11 | from typing import Optional, Dict, Any
12 | import colorlog
13 |
14 | def setup_logger(
15 | name: str = "commit_database",
16 | level: str = "INFO",
17 | log_dir: Optional[str] = None,
18 | log_file: Optional[str] = None,
19 | max_size_mb: int = 100,
20 | backup_count: int = 10,
21 | console_output: bool = True,
22 | color_output: bool = True
23 | ) -> logging.Logger:
24 | """
25 | Set up logging system
26 |
27 | Args:
28 | name: Logger name
29 | level: Log level
30 | log_dir: Log directory
31 | log_file: Log file
32 | max_size_mb: Maximum log file size (MB)
33 | backup_count: Number of log files to keep
34 | console_output: Whether to output to console
35 | color_output: Whether to use colored logs
36 |
37 | Returns:
38 | logging.Logger: Configured logger
39 | """
40 | # Convert log level
41 | level_map = {
42 | "DEBUG": logging.DEBUG,
43 | "INFO": logging.INFO,
44 | "WARNING": logging.WARNING,
45 | "ERROR": logging.ERROR,
46 | "CRITICAL": logging.CRITICAL
47 | }
48 | log_level = level_map.get(level.upper(), logging.INFO)
49 |
50 | # Create complete log file path
51 | full_log_path = None
52 | if log_dir and not log_file:
53 | log_path = Path(log_dir)
54 | if not log_path.exists():
55 | log_path.mkdir(parents=True, exist_ok=True)
56 | full_log_path = str(log_path / f"{name}.log")
57 | elif log_file:
58 | full_log_path = log_file
59 | # if full_log_path:
60 | # console_output = False
61 |
62 | # Create logger with "name:log_file" as unique identifier
63 | # This way even if name is the same but log_file is different, different logger instances will be created
64 | logger_id = name if full_log_path is None else f"{name}:{full_log_path}"
65 | logger = logging.getLogger(logger_id)
66 | logger.setLevel(log_level)
67 |
68 | # Clear old handlers
69 | for handler in logger.handlers[:]:
70 | logger.removeHandler(handler)
71 |
72 | # Define log format - Use custom Formatter to keep simple name display
73 | class SimpleNameFormatter(logging.Formatter):
74 | def format(self, record):
75 | # Temporarily save original name
76 | original_name = record.name
77 | # Set to simple name (remove log_file path)
78 | if ':' in original_name:
79 | record.name = original_name.split(':', 1)[0]
80 | result = super().format(record)
81 | # Restore original name
82 | record.name = original_name
83 | return result
84 |
85 | log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
86 |
87 | # Add console output
88 | if console_output:
89 | console_handler = logging.StreamHandler()
90 | console_handler.setLevel(log_level)
91 |
92 | if color_output:
93 | # Colored logs
94 | colors = {
95 | 'DEBUG': 'cyan',
96 | 'INFO': 'green',
97 | 'WARNING': 'yellow',
98 | 'ERROR': 'red',
99 | 'CRITICAL': 'red,bg_white',
100 | }
101 | color_formatter = colorlog.ColoredFormatter(
102 | "%(log_color)s" + log_format,
103 | log_colors=colors
104 | )
105 | # Replace with custom Formatter
106 | class SimpleNameColorFormatter(colorlog.ColoredFormatter):
107 | def format(self, record):
108 | # Temporarily save original name
109 | original_name = record.name
110 | # Set to simple name (remove log_file path)
111 | if ':' in original_name:
112 | record.name = original_name.split(':', 1)[0]
113 | result = super().format(record)
114 | # Restore original name
115 | record.name = original_name
116 | return result
117 |
118 | color_formatter = SimpleNameColorFormatter(
119 | "%(log_color)s" + log_format,
120 | log_colors=colors
121 | )
122 | console_handler.setFormatter(color_formatter)
123 | else:
124 | # Regular logs
125 | formatter = SimpleNameFormatter(log_format)
126 | console_handler.setFormatter(formatter)
127 |
128 | logger.addHandler(console_handler)
129 |
130 | # Add file output
131 | if full_log_path:
132 | file_handler = logging.FileHandler(
133 | filename=full_log_path,
134 | encoding='utf-8'
135 | )
136 | file_handler.setLevel(log_level)
137 | formatter = SimpleNameFormatter(log_format)
138 | file_handler.setFormatter(formatter)
139 | logger.addHandler(file_handler)
140 | # print(f"Logging {name}@{os.getpid()}: {full_log_path}")
141 |
142 | return logger
143 |
144 | def log_progress(logger, file_count, total_files, start_time, current_time, logging_ratio = 0.1, log_every_file = False, **kwargs):
145 | """
146 | Log processing progress
147 |
148 |     Args:
149 |         logger: Logger used to emit the progress message
150 |         file_count / total_files: Number of files processed so far / total number of files
151 |         start_time / current_time: Timestamps used to compute elapsed and estimated remaining time
152 |         logging_ratio, log_every_file, **kwargs: Logging frequency controls and extra fields to report
153 |     """
154 | elapsed_time = current_time - start_time
155 | if log_every_file or file_count / total_files > logging_ratio:
156 | progress_percent = (file_count / total_files) * 100 if total_files > 0 else 0
157 |
158 | # Calculate estimated remaining time
159 | if file_count > 0 and progress_percent < 100:
160 | time_per_file = elapsed_time / file_count
161 | remaining_files = total_files - file_count
162 | estimated_remaining_time = time_per_file * remaining_files
163 |
164 | logger.info(
165 | f"Progress: {progress_percent:.2f}% ({file_count}/{total_files}) | "
166 | f"Time used: {elapsed_time:.2f}s | "
167 | f"Est. remaining: {estimated_remaining_time:.2f}s" +
168 | (" | " + " | ".join([f"{k}: {v}" for k, v in kwargs.items()]) if kwargs else "")
169 | )
170 | else:
171 | logger.info(
172 | f"Progress: {progress_percent:.2f}% ({file_count}/{total_files}) | "
173 | f"Time used: {elapsed_time:.2f}s" +
174 | (" | " + " | ".join([f"{k}: {v}" for k, v in kwargs.items()]) if kwargs else "")
175 | )
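176 | 
177 | # Usage sketch (illustrative, not part of the library): a colored console logger that
178 | # also writes to logs_demo/demo.log, plus progress reporting for a small loop.
179 | if __name__ == "__main__":
180 |     import time
181 |     demo_logger = setup_logger(name="demo", level="DEBUG", log_dir="logs_demo")
182 |     start = time.time()
183 |     total = 5
184 |     for i in range(1, total + 1):
185 |         time.sleep(0.1)
186 |         log_progress(demo_logger, i, total, start, time.time(), log_every_file=True, stage="demo")
187 |     demo_logger.info("done")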
--------------------------------------------------------------------------------
/docs/03_core_components/03_5_apebench_scripts_config.md:
--------------------------------------------------------------------------------
1 | [English](#english-version) | [中文](#chinese-version)
2 |
3 |
4 | # 3.5 Scripts and Configuration
5 |
6 | This section covers the scripts and configuration files that orchestrate the APE-Bench evaluation workflows.
7 |
8 | ## Scripts (`src/apebench/scripts/`)
9 |
10 | The `src/apebench/scripts/` directory contains Python scripts that serve as high-level entry points for the main stages of the APE-Bench I workflow (patch generation, verification, and evaluation).
11 |
12 | **Common Functions of Scripts:**
13 |
14 | * **Running Full Experiments**: Scripts to orchestrate an end-to-end evaluation for one or more LLMs. This might involve:
15 | * Loading tasks from the dataset.
16 | * Calling the inference modules (`src/apebench/inference/`) to generate patches for all tasks.
17 | * Invoking the evaluation pipeline (`src/apebench/evaluation_pipelines/`) to perform syntactic and semantic checks.
18 | * Saving raw results and aggregated metrics.
19 | * **Data Preprocessing/Analysis**: Scripts for analyzing the APE-Bench I dataset itself, or for preprocessing data before an experiment.
20 | * **Result Aggregation and Reporting**: Scripts to collect results from multiple partial runs, compute final metrics (like those in the paper's tables and figures), and generate reports or visualizations.
21 | * This might use `src/apebench/evaluation_pipelines/gather_results.py` internally.
22 | * **Targeted Evaluations**: Scripts for running specific parts of the pipeline, e.g., only running inference for a new model, or only re-evaluating existing patches with a new semantic judge.
23 |
24 | **Usage:**
25 |
26 | These scripts are generally designed to be run from the command line and parse arguments that specify, for example (an illustrative invocation is sketched after this list):
27 | * Which LLM(s) to evaluate.
28 | * Paths to input data and output directories.
29 | * Configuration files to use.
30 | * Specific task IDs or categories to focus on.
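
A minimal sketch of what such an invocation might look like, assuming the numbered scripts under `src/apebench/scripts/` accept a configuration file and a model list (the exact flag names are defined by each script's argument parser and may differ):

```bash
# Illustrative only: flag names and defaults are assumptions, not the scripts' documented CLI.
python src/apebench/scripts/1_generate_patches.py \
    --config configs/config.yaml \
    --models gpt-4o \
    --output_dir outputs/patch_generation/
```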
31 |
32 | ## Configuration Files
33 |
34 | Configuration files allow for customizing the behavior of the APE-Bench framework without modifying the source code directly.
35 |
36 | ### 1. Eleanstic Configuration (`src/eleanstic/config.yaml`)
37 |
38 | * **Purpose**: Configures the Eleanstic service.
39 | * **Key Settings**: As detailed in the [Eleanstic documentation](./03_1_eleanstic.md); an illustrative sketch follows this list:
40 | * `mathlib_repo_path`: Path to your local Mathlib4 clone.
41 | * `cas_store_path`: Location for Eleanstic's Content-Addressable Store.
42 | * `snapshots_path`: Location for Eleanstic's commit snapshots.
43 | * Parameters for concurrency, logging, etc.
44 | * **Importance**: Must be correctly set up before Eleanstic can be used, especially for the initial preprocessing of Mathlib commits.
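
A minimal illustrative sketch of such a `config.yaml`, using the key names listed above (the real file may use a different layout or additional settings):

```yaml
# Illustrative values only; adjust paths and concurrency to your environment.
mathlib_repo_path: /data/mathlib4            # local Mathlib4 clone
cas_store_path: /data/eleanstic/cas          # content-addressable store
snapshots_path: /data/eleanstic/snapshots    # per-commit snapshots
max_workers: 8                               # concurrency for preprocessing/verification
log_level: INFO
```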
45 |
46 | ### 2. APE-Bench Configuration (primarily in `src/apebench/config/`)
47 |
48 | This directory likely contains configuration files (e.g., YAML, JSON, or Python modules) for various aspects of the APE-Bench experiments; a hypothetical layout is sketched after the list below.
49 |
50 | * **Model Configurations**:
51 | * API keys (or paths to key files).
52 | * Model names/identifiers as used in APIs (e.g., `gpt-4o`, `claude-3-sonnet-20240229`).
53 | * Default generation parameters (temperature, max tokens, top_p) for each model.
54 | * API endpoint URLs if not standard.
55 | * **Path Configurations**:
56 | * Paths to the APE-Bench I dataset (`datasets/`).
57 | * Default directories for saving LLM-generated patches, evaluation results, logs, and analysis outputs.
58 | * **Experiment Parameters**:
59 | * Number of samples to generate per task ($n$ for pass@k).
60 | * Parameters for `DiffRepair` (e.g., matching thresholds).
61 | * Settings for the LLM-as-a-Judge (e.g., which model to use as judge, judge-specific prompting parameters).
62 | * **Feature Flags**: Flags to enable/disable certain parts of the pipeline (e.g., skip syntactic check, force re-generation of patches).
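
A hypothetical sketch of how these settings could be grouped (the actual files under `src/apebench/config/` may organize them differently; the `DiffRepair` values mirror the defaults used in `generate_patch.py`):

```yaml
# Hypothetical structure for illustration only.
models:
  gpt-4o:
    temperature: 0.8
    max_tokens: 8192
    n_responses: 20
paths:
  dataset_dir: datasets/
  output_dir: outputs/
experiment:
  diff_repair:
    strict_match_threshold: 0.5
    max_context_lines: 3
  judge_model: claude-3-sonnet-20240229
flags:
  skip_syntactic_check: false
  force_regenerate_patches: false
```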
63 |
64 | ## Secondary Development
65 |
66 | * **Scripts**:
67 | * Develop new scripts for novel experimental workflows or more detailed analyses (e.g., generating specific plots, performing statistical tests on results).
68 | * Improve the command-line interface and modularity of existing scripts.
69 | * **Configuration**:
70 | * Refine the structure of configuration files for better organization or to support more complex experimental designs (e.g., using hierarchical configurations with tools like Hydra).
71 | * Add validation for configuration parameters (e.g., using `pydantic` as listed in `requirements.txt`) to catch errors early; a minimal sketch is shown after this list.
72 | * Standardize how different modules access configuration settings.
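
A minimal sketch of pydantic-based validation, assuming hypothetical field names that would need to be aligned with the project's actual configuration keys:

```python
# Sketch only: the model/field names are assumptions, not the repository's real config schema.
from pydantic import BaseModel, Field

class ModelConfig(BaseModel):
    model_name: str
    temperature: float = Field(0.8, ge=0.0, le=2.0)
    max_tokens: int = Field(4096, gt=0)
    n_responses: int = Field(1, gt=0)

class ExperimentConfig(BaseModel):
    input_file: str
    output_dir: str
    models: list[ModelConfig]

# Raises a ValidationError early if, e.g., temperature is out of range.
config = ExperimentConfig(
    input_file="datasets/apebench_tasks.jsonl",
    output_dir="outputs/",
    models=[ModelConfig(model_name="gpt-4o", temperature=0.8, n_responses=20)],
)
```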
73 |
74 | Effectively using and managing scripts and configurations is key to running reproducible experiments and extending the APE-Bench I framework.
75 |
76 | ---
77 |
78 | Next: [Troubleshooting](../04_troubleshooting.md)
79 |
80 |
81 |
82 | ## 中文翻译 (Chinese Translation)
83 |
84 | # 3.5 脚本与配置
85 |
86 | 本节介绍协调 APE-Bench 评估工作流的脚本和配置文件。
87 |
88 | ## 脚本 (`src/apebench/scripts/`)
89 |
90 | `src/apebench/scripts/` 目录通常包含作为 APE-Bench I 工作流程各个阶段高级入口点的 Python 脚本。
91 |
92 | **脚本的常见功能:**
93 |
94 | * **运行完整实验**:用于为一个或多个 LLM 编排端到端评估的脚本。这可能涉及:
95 | * 从数据集中加载任务。
96 | * 调用推理模块 (`src/apebench/inference/`) 为所有任务生成补丁。
97 | * 调用评估流程 (`src/apebench/evaluation_pipelines/`) 执行语法和语义检查。
98 | * 保存原始结果和聚合指标。
99 | * **数据预处理/分析**:用于分析 APE-Bench I 数据集本身,或在实验前预处理数据的脚本。
100 | * **结果聚合和报告**:用于从多个部分运行中收集结果,计算最终指标(如论文表格和图中的指标),并生成报告或可视化的脚本。
101 | * 这可能在内部使用 `src/apebench/evaluation_pipelines/gather_results.py`。
102 | * **有针对性的评估**:用于运行流程特定部分的脚本,例如,仅为新模型运行推理,或仅使用新的语义裁判重新评估现有补丁。
103 |
104 | **用法:**
105 |
106 | 这些脚本通常设计为从命令行运行。它们会解析命令行参数以指定诸如以下内容:
107 | * 要评估的 LLM。
108 | * 输入数据和输出目录的路径。
109 | * 要使用的配置文件。
110 | * 要关注的特定任务 ID 或类别。
111 |
112 | ## 配置文件
113 |
114 | 配置文件允许在不直接修改源代码的情况下自定义 APE-Bench 框架的行为。
115 |
116 | ### 1. Eleanstic 配置 (`src/eleanstic/config.yaml`)
117 |
118 | * **目的**:配置 Eleanstic 服务。
119 | * **关键设置**:如 [Eleanstic 文档](./03_1_eleanstic.md) 中所述:
120 | * `mathlib_repo_path`:指向您的本地 Mathlib4 克隆的路径。
121 | * `cas_store_path`:Eleanstic 内容寻址存储的位置。
122 | * `snapshots_path`:Eleanstic 提交快照的位置。
123 | * 并发、日志记录等参数。
124 | * **重要性**:在使用 Eleanstic 之前必须正确设置,尤其是在对 Mathlib 提交进行初始预处理时。
125 |
126 | ### 2. APE-Bench 配置 (主要在 `src/apebench/config/` 中)
127 |
128 | 此目录可能包含 APE-Bench 实验各个方面的配置文件(例如 YAML、JSON 或 Python 模块)。
129 |
130 | * **模型配置**:
131 | * API 密钥(或密钥文件路径)。
132 | * API 中使用的模型名称/标识符(例如 `gpt-4o`、`claude-3-sonnet-20240229`)。
133 | * 每个模型的默认生成参数(温度、最大令牌数、top_p)。
134 | * 如果不是标准 API,则为 API 端点 URL。
135 | * **路径配置**:
136 | * 指向 APE-Bench I 数据集的路径 (`datasets/`)。
137 | * 用于保存 LLM 生成的补丁、评估结果、日志和分析输出的默认目录。
138 | * **实验参数**:
139 | * 每个任务生成的样本数(pass@k 中的 $n$)。
140 | * `DiffRepair` 的参数(例如匹配阈值)。
141 | * 作为裁判的 LLM 的设置(例如使用哪个模型作为裁判,裁判特定的提示参数)。
142 | * **功能标志**:用于启用/禁用流程某些部分的标志(例如跳过语法检查,强制重新生成补丁)。
143 |
144 | ---
145 |
146 | 下一节: [故障排除](../04_troubleshooting.md)
--------------------------------------------------------------------------------
/src/apebench/inference/inference_pipelines/generate_patch.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | from ..inference_pipelines.base import BasePipeline
4 | from ..utils.diff_repair import DiffRepair, apply_diff, generate_diff
5 | from ..utils.call_api import REASONING_MODELS
6 | import re
7 | import logging
8 | from src.utils.lean_utils import remove_lean_comments
9 |
10 | class GeneratePatchPipeline(BasePipeline):
11 | """
12 | Pipeline for generating patches based on task descriptions.
13 |
14 | Supports multiple prompt types and model configurations.
15 | """
16 | def __init__(self, args):
17 | super().__init__(args)
18 | from ..prompts import (
19 | patch_generation_system_prompt,
20 | patch_generation_reasoning_models_system_prompt,
21 | patch_generation_input_prompt,
22 | patch_generation_input_prompt_without_lean_code
23 | )
24 | self.system_prompt = patch_generation_system_prompt if self.args.model_name not in REASONING_MODELS else patch_generation_reasoning_models_system_prompt
25 | assert not self.args.force_reasoning_prompt or not self.args.force_complete_prompt, "force_reasoning_prompt and force_complete_prompt cannot be both True"
26 | if self.args.force_reasoning_prompt:
27 | self.system_prompt = patch_generation_reasoning_models_system_prompt
28 | if self.args.force_complete_prompt:
29 | self.system_prompt = patch_generation_system_prompt
30 | self.input_prompt = patch_generation_input_prompt
31 | self.input_prompt_without_lean_code = patch_generation_input_prompt_without_lean_code
32 | self.strict_match_threshold = 0.5
33 | self.max_context_lines = 3
34 |
35 | @property
36 | def special_config(self):
37 | if self.args.force_complete_prompt:
38 | return '_force_complete_prompt'
39 | elif self.args.force_reasoning_prompt:
40 | return '_force_reasoning_prompt'
41 | else:
42 | return ''
43 |
44 | def parse_response(self, response, row):
45 | try:
46 | result = {
47 | 'gen_patch': None,
48 | 'gen_content_from_scratch': None,
49 | 'gen_patch_after_exact_repair': None,
50 | 'gen_content_after_exact_repair': None,
51 | 'gen_patch_after_robust_repair': None,
52 | 'gen_content_after_robust_repair': None
53 | }
54 | patch_match = re.search(r'```diff(.*?)```', response, re.DOTALL)
55 | best_gen_patch = None
56 | best_gen_patch_comment_free = None
57 | best_gen_content = None
58 | content_before = row['content_before']
59 | content_before_comment_free = remove_lean_comments(content_before)
60 | if patch_match:
61 | patch = patch_match.group(1).strip()
62 | result['gen_patch'] = patch
63 | if not content_before:
64 | try:
65 | result['gen_content_from_scratch'] = apply_diff(content_before, patch)
66 | best_gen_patch = patch
67 | best_gen_content = result['gen_content_from_scratch']
68 | content_after_comment_free = remove_lean_comments(result['gen_content_from_scratch'])
69 | best_gen_patch_comment_free = generate_diff(content_before_comment_free, content_after_comment_free)
70 | except Exception as e:
71 | pass
72 | else:
73 | try:
74 | repairer = DiffRepair(content_before, patch, strict_match_threshold=self.strict_match_threshold, max_context_lines=self.max_context_lines, exact_match=False)
75 | repaired_patch_text, full_new_content = repairer.repair()
76 |
77 | if full_new_content is not None:
78 | # Special case: DiffRepair returned full new content
79 | result[f'gen_content_after_robust_repair'] = full_new_content
80 | actual_diff = generate_diff(content_before, full_new_content)
81 | result[f'gen_patch_after_robust_repair'] = actual_diff
82 | best_gen_patch = actual_diff
83 | best_gen_content = full_new_content
84 | content_after_comment_free = remove_lean_comments(full_new_content)
85 | best_gen_patch_comment_free = generate_diff(content_before_comment_free, content_after_comment_free)
86 | elif repaired_patch_text is not None:
87 | # Standard case: DiffRepair returned a repaired patch text
88 | repaired_content = apply_diff(content_before, repaired_patch_text)
89 | result[f'gen_content_after_robust_repair'] = repaired_content
90 |
91 | actual_diff = generate_diff(content_before, repaired_content)
92 | result[f'gen_patch_after_robust_repair'] = actual_diff
93 |
94 | best_gen_patch = actual_diff
95 | best_gen_content = repaired_content
96 | content_after_comment_free = remove_lean_comments(repaired_content)
97 | best_gen_patch_comment_free = generate_diff(content_before_comment_free, content_after_comment_free)
98 | # else: an error occurred in repair, or it returned (None, None) - fields will remain None
99 | except Exception as e:
100 | pass
101 | result['best_gen_content'] = best_gen_content
102 | result['best_gen_patch'] = best_gen_patch
103 | result['best_gen_patch_comment_free'] = best_gen_patch_comment_free
104 | return result
105 | except Exception as e:
106 | logging.error(f"Error parsing GPT response: {e}")
107 | return None
108 |
109 | def get_input(self, row):
110 | """Generate prompt input for a row"""
111 |
112 | lean_code = row['content_before']
113 | filename = row['file_path_after']
114 |         if 'full_instruction' not in row:
115 | instructions = '\n\n\n'.join([f"- Task {idx + 1}: {exercise['title']}\n\n{exercise['instruction']}" for idx, exercise in enumerate(row['instructions']['exercises'])])
116 | row['full_instruction'] = instructions
117 | else:
118 | instructions = row['full_instruction']
119 |
120 | if filename and lean_code:
121 | return self.input_prompt.format(
122 | lean_code=lean_code,
123 | instructions=instructions,
124 | filename=filename
125 | )
126 | else:
127 | return self.input_prompt_without_lean_code.format(
128 | instructions=instructions,
129 | filename=filename
130 | )
131 |
--------------------------------------------------------------------------------
/src/apebench/evaluation_pipelines/verification_manager.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 | """
3 | Verification management module responsible for executing the patch verification process
4 | """
5 |
6 | import os
7 | import subprocess
8 | import glob
9 | from datetime import datetime
10 | from typing import Dict, List, Any, Optional, Union
11 |
12 | from ..utils import ProgressTracker, extract_verification_data, calculate_metrics, plot_metrics
13 |
14 | def get_latest_results_dir(base_dir: str) -> str:
15 | """
16 | Get the latest results directory
17 |
18 | Args:
19 | base_dir: Base directory
20 |
21 | Returns:
22 | Path to the latest results directory
23 | """
24 | result_dirs = glob.glob(f"{base_dir}*")
25 | if not result_dirs:
26 | raise ValueError(f"No result directories found in {base_dir}")
27 |
28 |     # Pick the most recently created results directory
29 | latest_dir = max(result_dirs, key=os.path.getctime)
30 | return latest_dir
31 |
32 | def verify_patches(config_file: str, generation_output_files: Optional[List[str]] = None) -> str:
33 | """
34 | Verify generated patches
35 |
36 | Args:
37 | config_file: Path to configuration file
38 | generation_output_files: Optional list of generation output files
39 |
40 | Returns:
41 |         Dictionary of verification metrics (also written to a timestamped JSON metrics file)
42 | """
43 | # Import here instead of at the top to avoid circular imports
44 | from ..config.config_manager import ConfigManager
45 |
46 | # Load configuration
47 | config = ConfigManager(config_file).get_config()
48 |
49 | # Initialize progress tracker
50 | progress_tracker = ProgressTracker(config.progress_log)
51 |
52 | print(f"Running patch verification with configuration from: {config_file}")
53 |
54 | # Check if verification is already completed
55 | verification_status = progress_tracker.get_verification_status()
56 | if verification_status.get("completed", False):
57 | print("Verification already completed")
58 | verification_status = progress_tracker.get_verification_status()
59 | verification_metrics = verification_status.get("metrics", {})
60 | return verification_metrics
61 |
62 | # If no output files are provided, get them from the progress record
63 | if not generation_output_files:
64 | generation_output_files = progress_tracker.get_all_output_files()
65 |
66 | if not generation_output_files:
67 | raise ValueError("No generation output files found. Run patch generation first.")
68 |
69 | print(f"Found {len(generation_output_files)} generation output files")
70 |
71 | # Create temporary directory
72 | os.makedirs(config.temp_dir, exist_ok=True)
73 |
74 | # Create timestamp
75 | timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
76 |
77 | # 1. Use gather_results.py to collect patch data
78 | print("Collecting patches for verification...")
79 | patch_collection_file = f"{config.temp_dir}/patches_for_verification_{timestamp}.jsonl"
80 |
81 | # Build gather_results.py command to collect patches
82 | collect_cmd = [
83 | "python", "-m", "src.apebench.evaluation_pipelines.gather_results",
84 | "--pipeline", "patch",
85 | "--input_files", *generation_output_files,
86 | "--output_file", patch_collection_file,
87 | ]
88 |
89 | print(f"Executing: {' '.join(collect_cmd)}")
90 | subprocess.run(collect_cmd, check=True)
91 |
92 | # 2. Call eleanstic to perform verification
93 | print("Running Eleanstic verification...")
94 | verify_results_dir = os.path.join(config.verification.results_dir, f"results_{timestamp}")
95 | # Ensure results directory exists
96 | os.makedirs(verify_results_dir, exist_ok=True)
97 |
98 | verify_cmd = [
99 | "python", "-m", "src.eleanstic.main",
100 | "--input_file", patch_collection_file,
101 | "--commit_id_key", "commit_hash",
102 | "--max_workers", str(config.verification.max_workers),
103 | "verify",
104 | "--code_key", "code",
105 | "--results_dir", verify_results_dir
106 | ]
107 |
108 | print(f"Executing: {' '.join(verify_cmd)}")
109 | subprocess.run(verify_cmd, check=True)
110 |
111 | # 3. Use gather_results.py to collect verification results
112 | print("Collecting verification results...")
113 | verification_output_file = f"{config.temp_dir}/verification_results_{timestamp}.jsonl"
114 |
115 | verify_collect_cmd = [
116 | "python", "-m", "src.apebench.evaluation_pipelines.gather_results",
117 | "--pipeline", "verification",
118 | "--input_files", f"{verify_results_dir}/*.jsonl",
119 | "--output_file", verification_output_file,
120 | ]
121 |
122 | print(f"Executing: {' '.join(verify_collect_cmd)}")
123 | subprocess.run(verify_collect_cmd, check=True)
124 |
125 | # 4. Merge verification results with original generation data
126 | print("Merging verification results with original data...")
127 | merged_results_file = f"{config.output_dir}/merged_results_{timestamp}.jsonl"
128 | os.makedirs(os.path.dirname(merged_results_file), exist_ok=True)
129 |
130 | # Call gather_results.py merge functionality
131 | merge_cmd = [
132 | "python", "-m", "src.apebench.evaluation_pipelines.gather_results",
133 | "--pipeline", "merge", # New pipeline type
134 | "--original_files", *generation_output_files,
135 | "--verification_file", verification_output_file,
136 | "--output_file", merged_results_file,
137 | ]
138 |
139 | print(f"Executing: {' '.join(merge_cmd)}")
140 | subprocess.run(merge_cmd, check=True)
141 |
142 | # 5. Calculate pass@k metrics for each model
143 | print("Calculating verification metrics...")
144 | verified_results = extract_verification_data(merged_results_file)
145 | metrics = calculate_metrics(verified_results, config)
146 |
147 | # 6. Generate visualizations
148 | if hasattr(config.evaluation, 'generate_plots') and config.evaluation.generate_plots:
149 | print("Generating verification metric plots...")
150 | plots_dir = getattr(config.evaluation, 'plots_dir', './verification_plots')
151 | os.makedirs(plots_dir, exist_ok=True)
152 | plot_metrics(metrics, plots_dir, f'verification_{timestamp}')
153 | print(f"Verification metric plots saved to: {plots_dir}")
154 |
155 | # 7. Save metrics
156 | metrics_file = f"{config.output_dir}/verification_metrics_{timestamp}.json"
157 |
158 | import json
159 | print('Saving verification metrics to: ', metrics_file)
160 | print('Metrics: ', metrics)
161 | with open(metrics_file, 'w') as f:
162 | json.dump(metrics, f, indent=2)
163 |
164 | # 8. Update progress tracking
165 | verification_status = {
166 | "completed": True,
167 | "timestamp": timestamp,
168 | "verification_output": verification_output_file,
169 | "merged_results": merged_results_file,
170 | "metrics_file": metrics_file,
171 | "metrics": metrics
172 | }
173 |
174 | progress_tracker.update_verification_status(verification_status)
175 |
176 | print(f"Verification completed. Results saved to: {merged_results_file}")
177 |
178 | return metrics
--------------------------------------------------------------------------------
/docs/02_project_structure.md:
--------------------------------------------------------------------------------
1 | [English](#english-version) | [中文](#chinese-version)
2 |
3 |
4 | # 2. Project Structure
5 |
6 | This document provides a high-level overview of the APE-Bench I project's directory structure.
7 |
8 | ```
9 | ape-bench/
10 | ├── .git/ # Git repository data
11 | ├── .venv/ # Python virtual environment (recommended)
12 | ├── configs/ # General configuration files for experiments (if any)
13 | ├── datasets/ # Downloaded APE-Bench I dataset from Hugging Face
14 | ├── docs/ # This documentation
15 | │ ├── README.md
16 | │ ├── 01_introduction.md
17 | │ ├── ... (other documentation files)
18 | │   └── 03_core_components/
19 | │ └── ... (component-specific docs)
20 | ├── paper.tex # LaTeX source for the research paper
21 | ├── README.md # Main project README (points to Hugging Face dataset)
22 | ├── requirements.txt # Python dependencies
23 | ├── src/
24 | │ ├── __init__.py
25 | │ ├── apebench/ # Core logic for APE-Bench I framework
26 | │ │ ├── __init__.py
27 | │ │ ├── config/ # Configuration for APE-Bench components (models, paths)
28 | │ │ ├── data/ # Data loading, processing, task representation
29 | │ │ ├── evaluation_pipelines/ # Syntactic and semantic evaluation logic
30 | │ │ ├── inference/ # LLM interaction, patch generation, DiffRepair
31 | │ │ │ └── utils/ # Utilities for inference, e.g., diff_repair.py
32 | │ │ └── scripts/ # Scripts for running experiments, analysis
33 | │ │ └── utils/ # General utilities for apebench module
34 | │ ├── eleanstic/ # Eleanstic: version-aware syntactic verification
35 | │ │ ├── __init__.py
36 | │ │ ├── config.yaml # Configuration for Eleanstic
37 | │ │ ├── core/ # Core logic for Eleanstic (snapshotting, CAS)
38 | │ │ ├── main.py # Main script/entry point for Eleanstic operations
39 | │ │ └── utils/ # Utilities specific to Eleanstic
40 | │ └── utils/ # Shared utility functions (if any at src level)
41 | └── ... # Other project files (e.g., .gitignore)
42 | ```
43 |
44 | ## Key Directories
45 |
46 | * **`configs/`**: May contain high-level configuration files for orchestrating different experimental setups. More specific configurations are often found within `src/apebench/config/` and `src/eleanstic/config.yaml`.
47 |
48 | * **`datasets/`**: This directory (created by you during setup) holds the actual benchmark data – the collection of (`Instruction`, `PreFile`, `Patch`) triplets.
49 |
50 | * **`docs/`**: Contains all the documentation files you are currently reading.
51 |
52 | * **`src/`**: The heart of the project, containing all source code.
53 | * **`src/apebench/`**: Implements the core APE-Bench I framework. This is where most of the logic for running experiments, interacting with LLMs, and evaluating results resides.
54 | * `config/`: Specific configurations for APE-Bench, such as model parameters, API endpoints, file paths relevant to benchmark runs.
55 | * `data/`: Modules for loading, parsing, and managing the APE-Bench I tasks from the `datasets/` directory.
56 | * `evaluation_pipelines/`: Contains the code for the two-stage evaluation process: syntactic verification (interfacing with Eleanstic) and semantic judgment (LLM-as-a-Judge).
57 | * `inference/`: Handles the generation of patches by LLMs. This includes constructing prompts, making API calls to various models, and processing their outputs. The critical `DiffRepair` utility (`inference/utils/diff_repair.py`) is also part of this module.
58 | * `scripts/`: Contains Python scripts that act as entry points for various operations, such as running a full evaluation pass for a model, generating specific analyses, or preparing data.
59 | * **`src/eleanstic/`**: A self-contained module that implements the Eleanstic system. Its primary role is to provide efficient and version-aware syntactic verification of Lean code by managing Mathlib build artifacts.
60 | * `config.yaml`: The main configuration file for Eleanstic, defining paths to Mathlib, storage locations, etc.
61 | * `core/`: The core implementation of Eleanstic's content-addressable storage, snapshot management, and environment restoration logic.
62 | * `main.py`: Often the main executable or entry point for Eleanstic operations like preprocessing Mathlib commits or servicing verification requests.
63 |
64 | Understanding this structure will help you navigate the codebase when trying to understand specific functionalities or when planning secondary development.
65 |
66 | ---
67 |
68 |
69 | ## 中文翻译 (Chinese Translation)
70 |
71 | # 2. 项目结构
72 |
73 | 本文档提供了 APE-Bench I 项目目录结构的高级概述。
74 |
75 | ```
76 | ape-bench/
77 | ├── .git/ # Git 仓库数据
78 | ├── .venv/ # Python 虚拟环境 (推荐)
79 | ├── configs/ # 实验的通用配置文件 (如果有)
80 | ├── datasets/ # 从 Hugging Face 下载的 APE-Bench I 数据集
81 | ├── docs/ # 本文档
82 | │ ├── README.md
83 | │ ├── 01_introduction.md
84 | │ ├── ... (其他文档文件)
85 | │   └── 03_core_components/
86 | │ └── ... (组件特定文档)
87 | ├── paper.tex # 研究论文的 LaTeX 源文件
88 | ├── README.md # 项目主 README (指向 Hugging Face 数据集)
89 | ├── requirements.txt # Python 依赖
90 | ├── src/
91 | │ ├── __init__.py
92 | │ ├── apebench/ # APE-Bench I 框架的核心逻辑
93 | │ │ ├── __init__.py
94 | │ │ ├── config/ # APE-Bench 组件的配置 (模型、路径)
95 | │ │ ├── data/ # 数据加载、处理、任务表示
96 | │ │ ├── evaluation_pipelines/ # 语法和语义评估逻辑
97 | │ │ ├── inference/ # LLM 交互、补丁生成、DiffRepair
98 | │ │ │ └── utils/ # 推理工具,例如 diff_repair.py
99 | │ │ └── scripts/ # 运行实验、分析的脚本
100 | │ │ └── utils/ # apebench 模块的通用工具
101 | │ ├── eleanstic/ # Eleanstic:版本感知的语法验证
102 | │ │ ├── __init__.py
103 | │ │ ├── config.yaml # Eleanstic 的配置文件
104 | │ │ ├── core/ # Eleanstic 的核心逻辑 (快照、CAS)
105 | │ │ ├── main.py # Eleanstic 操作的主脚本/入口点
106 | │ │ └── utils/ # Eleanstic 特定的工具
107 | │ └── utils/ # 共享的工具函数 (如果在 src 级别有的话)
108 | └── ... # 其他项目文件 (例如 .gitignore)
109 | ```
110 |
111 | ## 关键目录
112 |
113 | * **`configs/`**: 可能包含用于编排不同实验设置的高级配置文件。更具体的配置通常位于 `src/apebench/config/` 和 `src/eleanstic/config.yaml` 中。
114 |
115 | * **`datasets/`**: 此目录(在设置过程中由您创建)包含实际的基准测试数据——(`指令`, `修改前文件`, `补丁`) 三元组的集合。
116 |
117 | * **`docs/`**: 包含您当前正在阅读的所有文档文件。
118 |
119 | * **`src/`**: 项目的核心,包含所有源代码。
120 | * **`src/apebench/`**: 实现核心 APE-Bench I 框架。大部分运行实验、与 LLM 交互以及评估结果的逻辑都位于此处。
121 | * `config/`: APE-Bench 的特定配置,例如模型参数、API 端点、与基准测试运行相关的文件路径。
122 | * `data/`: 用于从 `datasets/` 目录加载、解析和管理 APE-Bench I 任务的模块。
123 | * `evaluation_pipelines/`: 包含两阶段评估过程的代码:语法验证(与 Eleanstic 对接)和语义判断(作为裁判的 LLM)。
124 | * `inference/`: 处理由 LLM 生成补丁。这包括构建提示、调用各种模型的 API 以及处理其输出。关键的 `DiffRepair` 工具 (`inference/utils/diff_repair.py`) 也是此模块的一部分。
125 | * `scripts/`: 包含作为各种操作入口点的 Python 脚本,例如为模型运行完整的评估遍、生成特定分析或准备数据。
126 | * **`src/eleanstic/`**: 一个独立的模块,实现 Eleanstic 系统。其主要作用是通过管理 Mathlib 构建产物来提供高效且版本感知的 Lean 代码语法验证。
127 | * `config.yaml`: Eleanstic 的主配置文件,定义 Mathlib 的路径、存储位置等。
128 | * `core/`: Eleanstic 内容寻址存储、快照管理和环境恢复逻辑的核心实现。
129 | * `main.py`: 通常是 Eleanstic 操作(如预处理 Mathlib 提交或服务验证请求)的主要可执行文件或入口点。
130 |
131 | 理解此结构将有助于您在尝试理解特定功能或计划二次开发时浏览代码库。
132 |
133 | ---
134 |
135 | 下一节: [核心组件](./03_core_components/03_1_eleanstic.md)
--------------------------------------------------------------------------------
/src/apebench/inference/utils/parallel.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | import pandas as pd
4 | import json
5 | import logging
6 | from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed
7 | import os
8 | import random
9 | import time
10 |
11 | def process_rows_parallel(data: pd.DataFrame,
12 | process_func: callable,
13 | output_file: str,
14 | max_workers: int = 2,
15 | config_info: dict = None):
16 | """Process rows in parallel using ThreadPoolExecutor.
17 |
18 | Args:
19 | data: DataFrame containing rows to process
20 | process_func: Function to process each row
21 |         output_file: Path to the output JSONL file (results are appended line by line)
22 | max_workers: Number of parallel workers
23 | config_info: Dictionary containing configuration information for logging
24 |
25 | Returns:
26 | Tuple of (processed_count, error_count)
27 | """
28 | processed_count = 0
29 | error_count = 0
30 | total_items = len(data)
31 | start_time = time.time()
32 |
33 | # Prepare configuration information display
34 | config_str = ""
35 | if config_info:
36 | config_items = []
37 | if 'model_name' in config_info:
38 | config_items.append(f"Model: {config_info['model_name']}")
39 | if 'temperature' in config_info:
40 | config_items.append(f"Temp: {config_info['temperature']}")
41 | if 'n_responses' in config_info:
42 | config_items.append(f"Responses: {config_info['n_responses']}")
43 | config_str = " | ".join(config_items)
44 | if config_str:
45 | config_str = f"[{config_str}] "
46 |
47 | with ThreadPoolExecutor(max_workers=max_workers) as executor:
48 | futures = {executor.submit(process_func, row) : i for i, row in data.iterrows()}
49 | for future in as_completed(futures):
50 | try:
51 | result = future.result()
52 | if result is not None:
53 | result_json = json.dumps(result, ensure_ascii=False)
54 | with open(output_file, 'a') as f:
55 | f.write(result_json + '\n')
56 | processed_count += 1
57 | else:
58 | error_count += 1
59 | except TimeoutError:
60 | error_count += 1
61 | logging.error(f"{config_str}Timeout occurred while processing row")
62 | except Exception as e:
63 | error_count += 1
64 | logging.error(f"{config_str}Unexpected error while processing result: {str(e)}")
65 | finally:
66 | current_item = processed_count + error_count
67 | current_time = time.time()
68 | elapsed_time = current_time - start_time
69 | progress_percent = (current_item / total_items) * 100
70 |
71 | # Calculate estimated remaining time
72 | if current_item > 0 and progress_percent < 100:
73 | time_per_item = elapsed_time / current_item
74 | remaining_items = total_items - current_item
75 | estimated_remaining_time = time_per_item * remaining_items
76 |
77 | print(
78 | f"{config_str}Progress: {progress_percent:.2f}% ({current_item}/{total_items}) | "
79 | f"Completed: {processed_count} | Errors: {error_count} | "
80 | f"Elapsed time: {elapsed_time / 3600:.2f} hours | "
81 | f"Est. remaining: {estimated_remaining_time / 3600:.2f} hours"
82 | )
83 | else:
84 | print(
85 | f"{config_str}Progress: {progress_percent:.2f}% ({current_item}/{total_items}) | "
86 | f"Completed: {processed_count} | Errors: {error_count} | "
87 | f"Elapsed time: {elapsed_time / 3600:.2f} hours"
88 | )
89 |
90 | return processed_count, error_count
91 |
92 | def check_missing_rows(data: pd.DataFrame, output_file: str):
93 | """Check which rows from the original data are missing in the output file.
94 |
95 | Args:
96 | data: Original DataFrame with row indices
97 |         output_file: Path to the output JSONL file
98 |
99 | Returns:
100 | List of missing row indices
101 | """
102 | processed_indices = set()
103 |
104 | if os.path.exists(output_file):
105 | with open(output_file, 'r') as f:
106 | for line in f:
107 | try:
108 | result = json.loads(line)
109 | if 'local_index' in result:
110 | processed_indices.add(result['local_index'])
111 | except json.JSONDecodeError:
112 | logging.error(f"Error decoding JSON line: {line}")
113 |
114 | all_indices = set(data.index.tolist())
115 | missing_indices = list(all_indices - processed_indices)
116 |
117 | return missing_indices
118 |
119 | def process_with_retries(data: pd.DataFrame,
120 | process_func: callable,
121 | output_file: str,
122 | max_workers: int = 2,
123 | max_retries: int = 3,
124 | config_info: dict = None):
125 | """Process rows with automatic retries for failed rows.
126 |
127 | Args:
128 | data: DataFrame containing rows to process
129 | process_func: Function to process each row
130 |         output_file: Path to the output JSONL file (results are appended line by line)
131 | max_workers: Number of parallel workers
132 | max_retries: Maximum number of retry attempts for each batch of failures
133 | config_info: Dictionary containing configuration information for logging
134 |
135 | Returns:
136 | Tuple of (total_processed_count, total_error_count, final_missing_indices)
137 | """
138 | total_processed = 0
139 | total_errors = 0
140 | retry_count = 0
141 |
142 |     # Note: there is no separate initial pass; every row not yet present in the
143 |     # output file is treated as missing, so the first loop iteration below processes all rows.
144 |     logging.info("Starting initial processing...")
145 | 
146 | missing_indices = check_missing_rows(data, output_file)
147 | random.shuffle(missing_indices)
148 |
149 | config_str = ""
150 | if config_info:
151 | config_items = []
152 | if 'model_name' in config_info:
153 | config_items.append(f"Model: {config_info['model_name']}")
154 | if 'temperature' in config_info:
155 | config_items.append(f"Temp: {config_info['temperature']}")
156 | if 'n_responses' in config_info:
157 | config_items.append(f"Responses: {config_info['n_responses']}")
158 | config_str = " | ".join(config_items)
159 | if config_str:
160 | config_str = f"[{config_str}] "
161 |
162 | while missing_indices and retry_count < max_retries:
163 | retry_count += 1
164 |
165 | print(f"{config_str}Retry attempt {retry_count}: Found {len(missing_indices)} missing rows")
166 |
167 | retry_data = data.loc[missing_indices]
168 | retry_processed, retry_errors = process_rows_parallel(
169 | retry_data, process_func, output_file, max_workers, config_info=config_info
170 | )
171 |
172 | total_processed += retry_processed
173 | total_errors += retry_errors
174 |
175 | missing_indices = check_missing_rows(data, output_file)
176 | if not missing_indices:
177 | print(f"{config_str}All rows successfully processed")
178 | break
179 |
180 | if retry_count == max_retries:
181 | print(f"{config_str}Reached maximum retry attempts ({max_retries})")
182 |
183 | return total_processed, total_errors, missing_indices
184 |
185 |
--------------------------------------------------------------------------------
/src/apebench/utils/progress_tracker.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | """
4 | Progress tracking manager, used to record and manage the execution progress of the evaluation process
5 | """
6 |
7 | import os
8 | import json
9 | import fcntl
10 | from datetime import datetime
11 | from typing import Dict, Any, List, Optional
12 |
13 | class ProgressTracker:
14 | """Track and manage evaluation process progress"""
15 |
16 | def __init__(self, progress_file: str):
17 | """
18 | Initialize progress tracker
19 |
20 | Args:
21 | progress_file: Path to progress data file
22 | """
23 | self.progress_file = progress_file
24 | self.data = self._load_progress()
25 |
26 | def _load_progress(self) -> Dict[str, Any]:
27 | """Load progress data, using file locks to ensure multi-process safety"""
28 | os.makedirs(os.path.dirname(self.progress_file), exist_ok=True)
29 |
30 | if os.path.exists(self.progress_file):
31 | try:
32 | with open(self.progress_file, 'r') as f:
33 | # Get shared lock (read lock)
34 | fcntl.flock(f, fcntl.LOCK_SH)
35 | try:
36 | data = json.load(f)
37 | finally:
38 | # Release lock
39 | fcntl.flock(f, fcntl.LOCK_UN)
40 | return data
41 | except Exception as e:
42 | print(f"Error loading progress file: {e}")
43 | # If loading fails, backup old file and create new one
44 | backup_file = f"{self.progress_file}.bak.{datetime.now().strftime('%Y%m%d%H%M%S')}"
45 | os.rename(self.progress_file, backup_file)
46 | print(f"Backed up problematic progress file to {backup_file}")
47 |
48 | # Initialize empty progress data
49 | return {
50 | "models": {},
51 | "verification": {"completed": False},
52 | "evaluation": {"completed": False},
53 | "last_updated": None
54 | }
55 |
56 | def _save_progress(self, limited_update_keys: Optional[List[str]] = None) -> None:
57 | """Save progress data, using lock files to ensure multi-process safety"""
58 | self.data["last_updated"] = datetime.now().isoformat()
59 | os.makedirs(os.path.dirname(self.progress_file), exist_ok=True)
60 |
61 | # Create lock file path
62 | lock_file = f"{self.progress_file}.lock"
63 |
64 | try:
65 | # Open or create lock file
66 | with open(lock_file, 'w') as lock_f:
67 | # Get exclusive lock (write lock)
68 | fcntl.flock(lock_f, fcntl.LOCK_EX)
69 | try:
70 | # Read current data (if exists)
71 | current_data = self.data
72 | if os.path.exists(self.progress_file) and os.path.getsize(self.progress_file) > 0:
73 | try:
74 | with open(self.progress_file, 'r') as f:
75 | current_data = json.load(f)
76 | # Merge model data, preserve other parts unchanged
77 | current_data.update({k : v for k, v in self.data.items() if limited_update_keys is None or k in limited_update_keys})
78 | current_data["last_updated"] = self.data["last_updated"]
79 | except (json.JSONDecodeError, ValueError):
80 | # If file is empty or format is wrong, use current data
81 | current_data = self.data
82 |
83 | # Update data in memory
84 | self.data = current_data
85 |
86 | # Write directly to original file
87 | with open(self.progress_file, 'w') as f:
88 | json.dump(self.data, f, indent=2)
89 | finally:
90 | # Release lock
91 | fcntl.flock(lock_f, fcntl.LOCK_UN)
92 | except Exception as e:
93 | print(f"Error saving progress file: {e}")
94 |
95 | def get_model_status(self, model_name: str) -> Dict[str, Any]:
96 | """
97 | Get status of a specific model, forcibly reload latest data before getting
98 |
99 | Args:
100 | model_name: Model name
101 |
102 | Returns:
103 | Dictionary containing model status
104 | """
105 | # Reload to get latest status
106 | self.data = self._load_progress()
107 |
108 | if model_name not in self.data["models"]:
109 | self.data["models"][model_name] = {
110 | "completed": False,
111 | "last_completed_config": -1,
112 | "output_files": []
113 | }
114 | return self.data["models"][model_name]
115 |
116 | def update_model_status(self, model_name: str, status: Dict[str, Any]) -> None:
117 | """
118 | Update model status
119 |
120 | Args:
121 | model_name: Model name
122 | status: New status dictionary
123 | """
124 | self.data["models"][model_name] = status
125 | self._save_progress()
126 |
127 | def get_verification_status(self) -> Dict[str, Any]:
128 | """
129 | Get verification phase status
130 |
131 | Returns:
132 | Verification status dictionary
133 | """
134 | # Reload to get latest status
135 | self.data = self._load_progress()
136 | return self.data["verification"]
137 |
138 | def update_verification_status(self, status: Dict[str, Any]) -> None:
139 | """
140 | Update verification phase status
141 |
142 | Args:
143 | status: New verification status dictionary
144 | """
145 | self.data["verification"] = status
146 | self._save_progress()
147 |
148 | def get_evaluation_status(self) -> Dict[str, Any]:
149 | """
150 | Get evaluation phase status
151 |
152 | Returns:
153 | Evaluation status dictionary
154 | """
155 | # Reload to get latest status
156 | self.data = self._load_progress()
157 | return self.data["evaluation"]
158 |
159 | def update_evaluation_status(self, status: Dict[str, Any]) -> None:
160 | """
161 | Update evaluation phase status
162 |
163 | Args:
164 | status: New evaluation status dictionary
165 | """
166 | self.data["evaluation"] = status
167 | self._save_progress()
168 |
169 | def get_all_output_files(self) -> List[str]:
170 | """
171 | Get output files for all completed models
172 |
173 | Returns:
174 | List of output file paths
175 | """
176 | # Reload to get latest status
177 | self.data = self._load_progress()
178 |
179 | all_files = []
180 | for model_name, model_status in self.data["models"].items():
181 | if model_status.get("completed", False):
182 | all_files.extend(model_status.get("output_files", []))
183 | return all_files
184 |
185 | def reset_progress(self, section: Optional[str] = None) -> None:
186 | """
187 | Reset progress data
188 |
189 | Args:
190 | section: Section to reset, such as 'models', 'verification', 'evaluation',
191 | if None, reset all data
192 | """
193 | if section is None:
194 | self.data = {
195 | "models": {},
196 | "verification": {"completed": False},
197 | "evaluation": {"completed": False},
198 | "last_updated": None
199 | }
200 | elif section == 'models':
201 | self.data["models"] = {}
202 | elif section in self.data:
203 | self.data[section] = {"completed": False}
204 |
205 | self._save_progress()
--------------------------------------------------------------------------------
/docs/03_core_components/03_3_apebench_inference.md:
--------------------------------------------------------------------------------
1 | [English](#english-version) | [中文](#chinese-version)
2 |
3 |
4 | # 3.3 LLM Inference and DiffRepair
5 |
6 | This section covers the process of generating patches using Large Language Models (LLMs) and the `DiffRepair` utility that post-processes these patches. The relevant code is located in `src/apebench/inference/` and its sub-modules like `src/apebench/inference/utils/diff_repair.py`.
7 |
8 | ## LLM Inference Process
9 |
10 | The core task for an LLM in APE-Bench I is to generate a patch (in unified diff format) that transforms a given `PreFile` according to an `Instruction`.
11 |
12 | 1. **Entry Point**: The main entry point for inference is `src/apebench/inference/run_inference.py`, which supports multiple pipelines including:
13 | * `patch` pipeline: For generating patched code based on instructions
14 | * `judgement` pipeline: For evaluating patches using LLM-as-Judge
15 | * `instruction` pipeline: For generating natural language instructions
16 | * Each pipeline is implemented as a specialized class in `src/apebench/inference/inference_pipelines/`
17 |
18 | 2. **Prompt Construction**: For each task, a prompt is constructed for the target LLM. This includes:
19 | * The `Instruction` (natural language command).
20 | * The `PreFile` (the full Lean code before edits).
21 | * Formatting instructions to guide the LLM to output correctly structured patches.
22 | * The prompt templates are specific to each pipeline type and are defined in separate modules under `src/apebench/inference/prompts/`.
23 |
24 | 3. **Model Invocation**: The inference framework supports various LLM providers:
25 | * The pipeline classes in `src/apebench/inference/inference_pipelines/` handle API authentication, request formatting, and response parsing for different APIs.
26 | * The `select_pipeline` function in `src/apebench/inference/run_inference.py` maps pipeline types to their respective pipeline classes.
27 | * Key parameters like `temperature`, `max_tokens`, and `n_responses` (for sampling multiple candidates) are passed to the appropriate API.
28 |
29 | 4. **Output Processing**:
30 | * The raw LLM output is parsed to extract the generated patches.
31 | * For the `patch` pipeline, `DiffRepair` is applied to the extracted patches (see below).
32 | * The processed outputs are saved to the specified output file in a structured format.
33 |
34 | 5. **Parallelism**: Processing is distributed across multiple workers (a thread pool, e.g. `ThreadPoolExecutor` in `src/apebench/inference/utils/parallel.py`) to speed up inference for large datasets, controlled by the `--max_workers` parameter.
35 |
36 | The command to run inference looks like:
37 | ```bash
38 | python -m src.apebench.inference.run_inference \
39 | --pipeline patch \
40 | --input_file /path/to/tasks.jsonl \
41 | --output_file /path/to/results.jsonl \
42 | --model_name gpt-4o \
43 | --temperature 0.8 \
44 | --n_responses 20 \
45 | --max_workers 4
46 | ```
47 |
48 | ## DiffRepair: Fault-Tolerant Patch Recovery
49 |
50 | LLM-generated diffs are often "noisy" – they have incorrect line numbers, misaligned context lines, or formatting issues that prevent them from being applied cleanly using standard `patch` utilities. `DiffRepair` is a vital component designed to address this.
51 |
52 | * **Location**: `src/apebench/inference/utils/diff_repair.py`
53 | * **Purpose**: To transform noisy model-generated diffs into clean, structurally consistent, and applicable patches while preserving the original intent of the edit as much as possible.
54 | * **Mention in Paper**: Sections 5.1 (Patch Normalization) and Appendix A.
55 |
56 | **DiffRepair Workflow (as described in Appendix A of the paper):**
57 |
58 | 1. **Hunk Parsing**: The input diff text is parsed into individual "hunks" (segments of changes).
59 | 2. **Intent Localization (Fuzzy Matching)**: For each hunk, `DiffRepair` attempts to find the correct region in the `PreFile` where the change was intended. This is a crucial step and involves:
60 | * Comparing context lines from the hunk with lines in the `PreFile`.
61 | * Using fuzzy matching algorithms (e.g., Levenshtein distance, sequence matching) to tolerate minor discrepancies.
62 | * The `_find_candidate_region_exact` and `_find_best_region_with_dp` methods in `diff_repair.py` implement sophisticated matching logic, including dynamic programming.
63 | 3. **Patch Reconstruction**: Once the target region is localized, `DiffRepair` reconstructs a clean diff hunk:
64 | * Re-aligning added and deleted lines to structurally valid positions relative to the correctly identified context from `PreFile`.
65 | * Augmenting missing context lines to satisfy unified diff format constraints.
66 | * Resolving line number offsets and potential hunk overlaps.
67 | 4. **Final Diff Generation**: The repaired hunks are combined into a final, clean unified diff string.
68 |
69 | **Key aspects of `DiffRepair` from the code (`diff_repair.py`):**
70 | * Handles both standard diffs with `@@ ... @@` headers and non-standard diffs.
71 | * Normalizes lines (stripping whitespace, lowercasing) for more robust matching.
72 | * Uses a combination of exact and fuzzy matching techniques.
73 | * The `repair()` method orchestrates the overall process for a given diff.
74 | * Filters overlapping hunks based on the significance of changes.
75 |
76 | The paper's Table 3, showing patch application success rates before and after repair, highlights the importance of `DiffRepair`.
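
The snippet below is a usage sketch distilled from how `GeneratePatchPipeline` (in `src/apebench/inference/inference_pipelines/generate_patch.py`) invokes `DiffRepair`; `content_before` and `noisy_patch` stand for the pre-edit Lean file and the raw model-generated diff, and the threshold values mirror that pipeline's defaults:

```python
# Usage sketch; content_before and noisy_patch are assumed to be defined by the caller.
from src.apebench.inference.utils.diff_repair import DiffRepair, apply_diff, generate_diff

repairer = DiffRepair(
    content_before,            # full Lean file before the edit (PreFile)
    noisy_patch,               # raw diff text extracted from the LLM response
    strict_match_threshold=0.5,
    max_context_lines=3,
    exact_match=False,
)
repaired_patch_text, full_new_content = repairer.repair()

if full_new_content is not None:
    # Special case: DiffRepair reconstructed the whole file directly
    clean_patch = generate_diff(content_before, full_new_content)
elif repaired_patch_text is not None:
    # Standard case: apply the repaired hunks, then re-derive a canonical diff
    new_content = apply_diff(content_before, repaired_patch_text)
    clean_patch = generate_diff(content_before, new_content)
```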
77 |
78 | ---
79 |
80 | Next: [Evaluation Pipeline: Syntactic & Semantic Checks](./03_4_apebench_evaluation.md)
81 |
82 |
83 |
84 | ## 中文翻译 (Chinese Translation)
85 |
86 | # 3.3 LLM 推理与 DiffRepair
87 |
88 | 本节涵盖使用大型语言模型 (LLM) 生成补丁的过程以及对这些补丁进行后处理的 `DiffRepair` 实用程序。相关代码位于 `src/apebench/inference/` 及其子模块中,例如 `src/apebench/inference/utils/diff_repair.py`。
89 |
90 | ## LLM 推理过程
91 |
92 | LLM 在 APE-Bench I 中的核心任务是根据 `Instruction` 生成一个补丁(统一差异格式),以转换给定的 `PreFile`。
93 |
94 | 1. **入口点**:推理的主要入口点是 `src/apebench/inference/run_inference.py`,它支持多个流程,包括:
95 | * `patch` 流程:根据指令生成修补后的代码
96 | * `judgement` 流程:使用作为裁判的 LLM 评估补丁
97 | * `instruction` 流程:生成自然语言指令
98 | * 每个流程都在 `src/apebench/inference/inference_pipelines/` 中实现为专门的类
99 |
100 | 2. **提示构建**:为每个任务构建目标 LLM 的提示。这包括:
101 | * `Instruction` (自然语言命令)。
102 | * `PreFile` (编辑前的完整 Lean 代码)。
103 | * 格式化指令,以指导 LLM 输出结构正确的补丁。
104 | * 提示模板针对每种流程类型,并在 `src/apebench/inference/prompts/` 下的独立模块中定义。
105 |
106 | 3. **模型调用**:推理框架支持各种 LLM 提供商:
107 | * `src/apebench/inference/inference_pipelines/` 中的流程类处理不同 API 的 API 身份验证、请求格式化和响应解析。
108 | * `src/apebench/inference/run_inference.py` 中的 `select_pipeline` 函数将流程类型映射到它们各自的流程类。
109 | * 诸如 `temperature`、`max_tokens` 和 `n_responses`(用于对多个候选进行采样)等关键参数会传递给相应的 API。
110 |
111 | 4. **输出处理**:
112 | * 解析原始 LLM 输出以提取生成的补丁。
113 | * 对于 `patch` 流程,对提取的补丁应用 `DiffRepair`(见下文)。
114 | * 处理后的输出以结构化格式保存到指定的输出文件中。
115 |
116 | 5. **并行性**:处理过程分布在多个工作线程中(使用 `src/apebench/inference/utils/parallel.py` 中的 `ThreadPoolExecutor`)以加速大型数据集的推理,由 `--max_workers` 参数控制。
117 |
118 | 运行推理的命令如下所示:
119 | ```bash
120 | python -m src.apebench.inference.run_inference \
121 | --pipeline patch \
122 | --input_file /path/to/tasks.jsonl \
123 | --output_file /path/to/results.jsonl \
124 | --model_name gpt-4o \
125 | --temperature 0.8 \
126 | --n_responses 20 \
127 | --max_workers 4
128 | ```
129 |
130 | ## DiffRepair:容错补丁恢复
131 |
132 | LLM 生成的差异通常是"嘈杂的"——它们具有不正确的行号、未对齐的上下文行或格式问题,从而阻止使用标准 `patch` 实用程序将其干净地应用。`DiffRepair` 是为解决此问题而设计的关键组件。
133 |
134 | * **位置**:`src/apebench/inference/utils/diff_repair.py`
135 | * **目的**:将模型生成的嘈杂差异转换为干净、结构一致且可应用的补丁,同时尽可能保留编辑的原始意图。
136 | * **论文提及**:第 5.1 节(补丁规范化)和附录 A。
137 |
138 | **DiffRepair 工作流程(如论文附录 A 所述):**
139 |
140 | 1. **Hunk 解析**:将输入的差异文本解析为单独的"Hunk"(更改段)。
141 | 2. **意图定位(模糊匹配)**:对于每个 Hunk,`DiffRepair` 尝试在 `PreFile` 中找到更改意图的正确区域。这是一个关键步骤,涉及:
142 | * 比较 Hunk 中的上下文行与 `PreFile` 中的行。
143 | * 使用模糊匹配算法(例如,Levenshtein 距离、序列匹配)来容忍微小的差异。
144 | * `diff_repair.py` 中的 `_find_candidate_region_exact` 和 `_find_best_region_with_dp` 方法实现了复杂的匹配逻辑,包括动态规划。
145 | 3. **补丁重建**:一旦定位到目标区域,`DiffRepair` 会重建一个干净的差异 Hunk:
146 | * 相对于从 `PreFile` 中正确识别的上下文,将添加和删除的行重新对齐到结构有效的位置。
147 | * 扩充缺失的上下文行以满足统一差异格式的约束。
148 | * 解决行号偏移和潜在的 Hunk 重叠。
149 | 4. **最终差异生成**:将修复后的 Hunk 组合成最终的、干净的统一差异字符串。
150 |
151 | **代码中 `DiffRepair` (`diff_repair.py`) 的关键方面:**
152 | * 处理带有 `@@ ... @@` 标头的标准差异和非标准差异。
153 | * 规范化行(去除空白、小写化)以实现更稳健的匹配。
154 | * 结合使用精确匹配和模糊匹配技术。
155 | * `repair()` 方法协调给定差异的整个过程。
156 | * 筛选重叠的 Hunk 基于更改的重要性。
157 |
158 | ---
159 |
160 | 下一节: [评估流程:语法与语义检查](./03_4_apebench_evaluation.md)
--------------------------------------------------------------------------------
/src/eleanstic/core/file_map.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | """
4 | File Mapping Manager Module
5 | Responsible for storing and retrieving file mapping relationships for each commit
6 |
7 | Uses a compact binary format to store file mappings, reducing disk space usage.
8 | Each file mapping record contains only relative path, file hash, and file type information.
9 | """
10 | import os
11 | import struct
12 | import shutil
13 | import hashlib
14 | import traceback
15 |
16 | class FileMapManager:
17 | """
18 | File Mapping Manager, responsible for storing and retrieving file mapping relationships for each commit
19 |
20 | Uses binary file-based storage instead of JSON to reduce disk space usage
21 | """
22 | def __init__(self, storage_dir="storage", maps_dir="file_maps"):
23 | """Initialize file mapping manager
24 |
25 | Args:
26 | storage_dir: File content storage directory
27 | maps_dir: File mapping storage directory
28 | """
29 | self.storage_dir = storage_dir
30 | self.maps_dir = maps_dir
31 |
32 | # Ensure directories exist
33 | os.makedirs(self.storage_dir, exist_ok=True)
34 | os.makedirs(self.maps_dir, exist_ok=True)
35 |
36 | def get_map_path(self, commit_id):
37 | """Get mapping file path for specified commit
38 |
39 | Args:
40 | commit_id: Commit ID
41 |
42 | Returns:
43 | Complete path to the mapping file
44 | """
45 | return os.path.join(self.maps_dir, f"{commit_id}.bin")
46 |
47 | def store_file_mapping(self, commit_id, file_mappings):
48 | """Store commit file mappings, using binary format
49 |
50 | File format:
51 | - 4 bytes: Record count (unsigned int)
52 | - For each record:
53 | - 2 bytes: Path length (unsigned short)
54 | - 32 bytes: SHA-256 hash
55 | - 1 byte: File type (0: regular file, 1: symlink)
56 | - Variable length: Relative path string (UTF-8 encoded)
57 |
58 | Args:
59 | commit_id: Commit ID
60 | file_mappings: {relative_path: {"hash": file_hash, "type": file_type}}
61 |
62 | Returns:
63 | True on success, False on failure
64 | """
65 | map_path = self.get_map_path(commit_id)
66 |
67 | try:
68 | with open(map_path, 'wb') as f:
69 | # Write record count
70 | f.write(struct.pack('!I', len(file_mappings)))
71 |
72 | # Write each record
73 | for rel_path, file_info in file_mappings.items():
74 | path_bytes = rel_path.encode('utf-8')
75 | path_len = len(path_bytes)
76 |
77 | # Convert hash from hex string to binary
78 | hash_bin = bytes.fromhex(file_info["hash"])
79 |
80 | # File type: 0 for regular file, 1 for symlink
81 | file_type = 1 if file_info["type"] == "symlink" else 0
82 |
83 | # Write record header
84 | f.write(struct.pack('!H32sB', path_len, hash_bin, file_type))
85 |
86 | # Write path string
87 | f.write(path_bytes)
88 |
89 | return True
90 | except Exception as e:
91 | print(f"Failed to store file mapping: {traceback.format_exc()}")
92 | return False
93 |
94 | def get_file_mapping(self, commit_id):
95 | """Get commit file mappings, reading from binary format
96 |
97 | Args:
98 | commit_id: Commit ID
99 |
100 | Returns:
101 | File mapping dictionary, or empty dictionary if not found
102 | """
103 | map_path = self.get_map_path(commit_id)
104 |
105 | if os.path.exists(map_path):
106 | try:
107 | with open(map_path, 'rb') as f:
108 | # Read record count
109 | record_count_data = f.read(4)
110 | if not record_count_data:
111 | return {}
112 |
113 | record_count = struct.unpack('!I', record_count_data)[0]
114 |
115 | # Read all records
116 | file_mappings = {}
117 | for _ in range(record_count):
118 | # Read record header
119 | header_data = f.read(35) # 2(path_len) + 32(hash) + 1(type) = 35 bytes
120 | if not header_data or len(header_data) < 35:
121 | break
122 |
123 | path_len, hash_bin, file_type = struct.unpack('!H32sB', header_data)
124 |
125 | # Read path string
126 | path_data = f.read(path_len)
127 | if not path_data or len(path_data) < path_len:
128 | break
129 |
130 | rel_path = path_data.decode('utf-8')
131 |
132 | # Convert hash to hex string
133 | file_hash = hash_bin.hex()
134 |
135 | # Convert file type
136 | type_str = "symlink" if file_type == 1 else "regular"
137 |
138 | # Store in mapping dictionary
139 | file_mappings[rel_path] = {
140 | "hash": file_hash,
141 | "type": type_str
142 | }
143 |
144 | return file_mappings
145 | except Exception as e:
146 | print(f"Failed to read file mapping: {traceback.format_exc()}")
147 |
148 | return {}
149 |
150 | def get_storage_path(self, file_hash):
151 | """Get storage path based on file hash
152 |
153 | Args:
154 | file_hash: File content hash
155 |
156 | Returns:
157 | Complete path to the file in storage system
158 | """
159 |         # Use the first 4 hex characters of the hash as a two-level directory prefix
160 | return os.path.join(self.storage_dir, file_hash[:2], file_hash[2:4], file_hash)
161 |
162 | def compute_file_hash(self, filepath):
163 | """Calculate file hash
164 |
165 | Args:
166 | filepath: File path
167 |
168 | Returns:
169 | SHA256 hash of the file
170 | """
171 | hasher = hashlib.sha256()
172 |
173 | if os.path.islink(filepath):
174 | # For symlinks, hash the target path
175 | target = os.readlink(filepath)
176 | hasher.update(target.encode())
177 | else:
178 | # For regular files, hash the content
179 | with open(filepath, 'rb') as f:
180 | for chunk in iter(lambda: f.read(4096), b''):
181 | hasher.update(chunk)
182 |
183 | return hasher.hexdigest()
184 |
185 | def restore_file(self, dest_path, file_hash, file_type):
186 | """Restore file from storage system
187 |
188 | Args:
189 | dest_path: Target file path
190 | file_hash: File hash
191 | file_type: File type ("regular" or "symlink")
192 |
193 | Returns:
194 | On success returns (True, message), on failure returns (False, error_message)
195 | """
196 | storage_path = self.get_storage_path(file_hash)
197 |
198 | if not os.path.exists(storage_path):
199 | return False, f"File does not exist in storage: {storage_path}"
200 |
201 | # Ensure target directory exists
202 | os.makedirs(os.path.dirname(dest_path), exist_ok=True)
203 |
204 | # If target already exists, delete it first
205 | if os.path.exists(dest_path):
206 | if os.path.islink(dest_path) or not os.path.isdir(dest_path):
207 | os.remove(dest_path)
208 |
209 | try:
210 | if file_type == "symlink":
211 | # Restore symlink
212 | with open(storage_path, 'r') as f:
213 | link_target = f.read()
214 | os.symlink(link_target, dest_path)
215 | else:
216 | # Restore regular file
217 | shutil.copy2(storage_path, dest_path)
218 | return True, f"File restored successfully: {dest_path}"
219 | except Exception as e:
220 | return False, f"Failed to restore file {dest_path}: {traceback.format_exc()}"
--------------------------------------------------------------------------------
/src/apebench/inference/inference_pipelines/base.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | import pandas as pd
4 | import json
5 | import os
6 | import logging
7 | import time
8 | import traceback
9 | from datetime import datetime
10 | from abc import ABC, abstractmethod
11 |
12 | from ..utils import process_with_retries
13 | from ..utils import chat
14 | from ....utils.file_utils import load_jsonl, convert_to_serializable
15 | import random
16 |
17 | class BasePipeline(ABC):
18 | """
19 | Base class for data processing pipelines that interact with AI models.
20 |
21 | This abstract class provides common functionality for:
22 | - Loading and processing input data
23 | - Handling results and errors
24 | - Logging and output management
25 | """
26 |
27 | def __init__(self, args):
28 | """Initialize with command-line arguments"""
29 | self.args = args
30 | # Set default timestamp if not provided
31 | if not hasattr(self.args, 'timestamp') or self.args.timestamp is None:
32 | self.args.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
33 | self.setup_logging()
34 | self.print_script_configuration()
35 |
36 | self.system_prompt = None
37 |
38 | def setup_logging(self):
39 | """Configure logging based on arguments"""
40 | os.makedirs(self.args.log_dir, exist_ok=True)
41 | log_file = f'{self.args.log_dir}/{self.args.pipeline}/{self.args.timestamp}_{self.args.model_name}_{self.args.temperature}.log'
42 | os.makedirs(os.path.dirname(log_file), exist_ok=True)
43 | logging.basicConfig(
44 | level=logging.INFO,
45 | format='%(asctime)s - %(levelname)s - %(message)s',
46 | filename=log_file
47 | )
48 |
49 |
50 | def print_script_configuration(self):
51 | print("\nScript Configuration:")
52 | print("---------------------")
53 | for arg, value in vars(self.args).items():
54 | print(f"{arg}: {value}")
55 | print("---------------------\n")
56 |
57 | def load_data(self):
58 | """
59 | Load data from input file with support for multiple formats
60 |
61 | Returns:
62 | pd.DataFrame: Loaded data
63 | """
64 | if self.args.input_file.endswith('.parquet'):
65 | data = pd.read_parquet(self.args.input_file)
66 | elif self.args.input_file.endswith('.json'):
67 | data = pd.read_json(self.args.input_file, orient='records', lines=True)
68 | elif self.args.input_file.endswith('.jsonl'):
69 | data = load_jsonl(self.args.input_file)
70 | data = pd.DataFrame(data)
71 | else:
72 | raise ValueError(f"Unsupported file type: {self.args.input_file}")
73 |
74 | return data
75 |
76 | @abstractmethod
77 | def get_input(self, row):
78 | """Get input text for a row"""
79 | pass
80 |
81 | def initialize_metadata(self, row):
82 | """Initialize metadata for a row"""
83 | return {}
84 |
85 | def update_metadata_per_response(self, metadata, parsed_response):
86 | """Update metadata with response"""
87 | return metadata
88 |
89 | def update_metadata_per_row(self, metadata, responses):
90 | """Update metadata with responses"""
91 | return metadata
92 |
93 | def early_stop(self, metadata, responses):
94 |         """Return True to stop requesting further responses once the accumulated metadata is decisive"""
95 | return False
96 |
97 | def parse_response(self, response, row):
98 |         """Parse the raw response text returned by the model"""
99 | return {}
100 |
101 | def process_row(self, row):
102 | """
103 | Process a single row of data.
104 |
105 | Args:
106 | row (pd.Series): The row to process
107 |
108 | Returns:
109 | Dict or None: Processing result or None if processing failed
110 | """
111 | try:
112 | row_dict = row.to_dict()
113 | row_dict = convert_to_serializable(row_dict)
114 | row_dict['local_index'] = row.name
115 | input_text = self.get_input(row_dict)
116 |
117 | responses = []
118 | metadata = self.initialize_metadata(row_dict)
119 | for _ in range(self.args.n_responses):
120 | response = None
121 | try:
122 | response = chat(
123 | prompt=input_text,
124 | system_prompt=self.system_prompt,
125 | model_name=self.args.model_name,
126 | temperature=self.args.temperature,
127 | max_tokens=self.args.max_tokens,
128 | thinking_budget_tokens=self.args.thinking_budget_tokens
129 | )
130 | parsed_response = self.parse_response(response['choices'][0]['message']['content'], row_dict)
131 | if parsed_response is not None:
132 | response['inference_params'].update({
133 | 'temperature': self.args.temperature,
134 | 'n_responses': self.args.n_responses
135 | })
136 | parsed_response.update({
137 | 'raw_response': response['choices'][0],
138 | 'model': self.args.model_name,
139 | 'usage': response['usage'],
140 | 'inference_params': response['inference_params']
141 | })
142 | metadata = self.update_metadata_per_response(metadata, parsed_response)
143 | responses.append(parsed_response)
144 | if self.early_stop(metadata, responses):
145 | break
146 | except Exception as e:
147 | logging.error(f"Error processing row {row.name}: {traceback.format_exc()}")
148 | responses.append(response)
149 | time.sleep(random.randint(1, 5))
150 | continue
151 | metadata = self.update_metadata_per_row(metadata, responses)
152 | return {
153 | **row_dict,
154 | **metadata,
155 | 'responses': responses
156 | }
157 | except Exception as e:
158 | logging.error(f"Error processing row {row.name}: {traceback.format_exc()}")
159 | time.sleep(random.randint(1, 5))
160 | return None
161 |
162 | @property
163 | def special_config(self):
164 | return ''
165 |
166 | def process_data(self):
167 | """
168 | Process all data with automatic retries for failures
169 |
170 | Returns:
171 | Tuple[int, int, List]: (processed_count, error_count, failed_indices)
172 | """
173 | # Load data
174 | data = self.load_data()
175 |
176 | # Generate output file path if not provided
177 | if not hasattr(self.args, 'output_file') or self.args.output_file is None:
178 | _input_file_name = os.path.splitext(os.path.basename(self.args.input_file))[0]
179 | self.args.output_file = '/'.join([
180 | self.args.output_dir,
181 | self.args.pipeline,
182 | f'{self.args.timestamp}__{_input_file_name}__{self.args.model_name}__{self.args.temperature}{self.special_config}.jsonl'
183 | ])
184 | os.makedirs(os.path.dirname(self.args.output_file), exist_ok=True)
185 |
186 | print(f"Results will be saved to {self.args.output_file}")
187 |
188 | # Prepare configuration information dictionary
189 | config_info = {
190 | 'model_name': self.args.model_name,
191 | 'temperature': self.args.temperature,
192 | 'n_responses': self.args.n_responses
193 | }
194 |
195 | # Process with automatic retries
196 | total_processed, total_errors, final_missing = process_with_retries(
197 | data=data,
198 | process_func=self.process_row,
199 | output_file=self.args.output_file,
200 | max_workers=self.args.max_workers,
201 | max_retries=self.args.max_retries,
202 | config_info=config_info
203 | )
204 |
205 | # Save permanently failed indices if any
206 | if final_missing:
207 | os.makedirs('temp', exist_ok=True)
208 | missing_file = f'temp/missing_{self.args.pipeline}_{self.args.model_name}_{self.args.timestamp}.json'
209 | with open(missing_file, 'w') as f:
210 | json.dump({'missing_indices': final_missing}, f)
211 | logging.info(f"Saved {len(final_missing)} permanently failed indices to {missing_file}")
212 |
213 | logging.info(f"Final processing statistics - Successfully processed: {total_processed}, Total errors: {total_errors}")
214 |
215 | return total_processed, total_errors, final_missing
--------------------------------------------------------------------------------
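`BasePipeline` leaves `get_input` (and usually `parse_response`) to subclasses. Below is a minimal sketch of a hypothetical subclass, only to illustrate the extension points if it were placed alongside the other pipelines; the real implementations live in `generate_patch.py`, `generate_instruction.py`, and `generate_judgement.py` and differ in detail. The field names `instruction` and `content_before` are placeholders.

```python
# Hypothetical subclass illustrating the BasePipeline extension points.
from .base import BasePipeline

class EchoPipeline(BasePipeline):
    def get_input(self, row):
        # Build the user prompt for one row of the input DataFrame.
        return f"Instruction:\n{row['instruction']}\n\nFile before edit:\n{row['content_before']}"

    def parse_response(self, response, row):
        # Keep the raw model text; returning None would mark the response as unparsable.
        return {"gen_text": response}
```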
/src/apebench/inference/utils/call_api.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 |
3 | import openai
4 | import logging
5 | import time
6 | import uuid
7 | from typing import Dict, Optional, Any
8 | from tenacity import retry, stop_after_attempt, wait_exponential, wait_random, wait_combine, retry_if_exception_type
9 | from ..utils.chat_logger import ChatLogger
10 | from ..utils.api_keys import *
11 |
12 | # List of supported model categories
13 | TOTAL_MODELS = (
14 | 'o1', 'o3-mini', 'deepseek-r1-250120', 'aws_sdk_claude37_sonnet@thinking',
15 | 'gpt-4o-2024-08-06', 'gpt-4o-2024-11-20', 'aws_claude35_sdk_sonnet_v2',
16 | 'aws_sdk_claude37_sonnet', 'deepseek-v3-250324',
17 | 'deepseek-v3',
18 | 'doubao-1-5-pro-32k-250115',
19 | 'gpt-4.5-preview-2025-02-27'
20 | )
21 |
22 | FORMAL_NAMES = {
23 | 'o1': 'OpenAI o1',
24 | 'o3-mini': 'OpenAI o3-mini',
25 | 'deepseek-r1-250120': 'DeepSeek R1',
26 | 'aws_sdk_claude37_sonnet@thinking': 'Claude 3.7 Sonnet (thinking)',
27 | 'gpt-4o-2024-08-06': 'GPT-4o',
28 | 'gpt-4o-2024-11-20': 'GPT-4o',
29 | 'aws_claude35_sdk_sonnet_v2': 'Claude 3.5 Sonnet',
30 | 'aws_sdk_claude37_sonnet': 'Claude 3.7 Sonnet',
31 | 'deepseek-v3-250324': 'DeepSeek V3 (0324)',
32 | 'deepseek-v3': 'DeepSeek V3',
33 | 'doubao-1-5-pro-32k-250115': 'Doubao 1.5 Pro',
34 | 'gpt-4.5-preview-2025-02-27': 'GPT-4.5',
35 | 'gemini-2.5-pro-preview-03-25': 'Gemini 2.5 Pro Preview',
36 | }
37 |
38 | REASONING_MODELS = (
39 | 'o1', 'o3-mini', 'deepseek-r1-250120', 'aws_sdk_claude37_sonnet@thinking'
40 | )
41 |
42 | UNSUPPORT_TEMPERATURE_MODELS = (
43 | 'o3-mini', 'aws_sdk_claude37_sonnet@thinking'
44 | )
45 |
46 | forbidden_params = {
47 | 'o3-mini': ['temperature'],
48 | 'aws_sdk_claude37_sonnet@thinking': ['temperature'],
49 | }
50 |
51 | def generate_logid() -> str:
52 | """
53 | Generate a unique log ID
54 |
55 | Returns:
56 | str: UUID format unique ID
57 | """
58 | return str(uuid.uuid4())
59 |
60 |
61 | def create_client(model_name: str):
62 | """Create an appropriate client"""
63 | if 'deepseek' in model_name or 'doubao' in model_name:
64 | return openai.OpenAI(
65 | api_key=volces_api_key,
66 | base_url=volces_base_url,
67 | )
68 | elif 'claude' in model_name:
69 | return openai.AzureOpenAI(
70 | azure_endpoint=aws_claude_base_url,
71 | api_version="2024-03-01-preview",
72 | api_key=aws_claude_api_key,
73 | )
74 | elif 'gemini' in model_name:
75 | return openai.AzureOpenAI(
76 | azure_endpoint=google_base_url,
77 | api_version="2024-03-01-preview",
78 | api_key=google_api_key,
79 | )
80 | else:
81 | return openai.AzureOpenAI(
82 | azure_endpoint=openai_base_url,
83 | api_version="2024-03-01-preview",
84 | api_key=openai_api_key,
85 | )
86 |
87 |
88 | def prepare_inference_params(
89 | client: openai.OpenAI,
90 | model_name: str,
91 | messages: list,
92 | logid: str,
93 | temperature: float = 0.0,
94 | max_tokens: int = 8000,
95 | thinking_budget_tokens: int = 16000,
96 | reasoning_effort: str = 'high'
97 | ) -> Dict[str, Any]:
98 | """Prepare parameters for completion request"""
99 | params = {
100 | "model": model_name,
101 | "messages": messages,
102 | "temperature": temperature,
103 | "max_tokens": max_tokens,
104 | "extra_headers": {"X-TT-LOGID": logid},
105 | }
106 |
107 | # Add thinking mode for Claude models
108 | if '@thinking' in model_name:
109 | params["model"] = model_name.replace('@thinking', '')
110 | params["temperature"] = 1.0
111 | params["extra_body"] = {
112 | "thinking": {
113 | "type": "enabled",
114 | "budget_tokens": thinking_budget_tokens
115 | }
116 | }
117 | params["max_tokens"] += thinking_budget_tokens
118 |
119 |     # Reasoning-effort parameter for OpenAI o-series models (currently disabled)
120 | # if model_name == 'o3-mini':
121 | # params["reasoning_effort"] = reasoning_effort
122 |
123 | if model_name in forbidden_params:
124 | for param in forbidden_params[model_name]:
125 | params.pop(param, None)
126 | return params
127 |
128 |
129 | @retry(
130 | stop=stop_after_attempt(5), # Retry up to 5 times
131 | wait=wait_combine(
132 | wait_exponential(multiplier=1, min=1, max=60), # Base exponential backoff: 1s, 2s, 4s, 8s, 16s
133 | wait_random(0, 2) # Add random jitter between 0-2 seconds
134 | ),
135 | retry=retry_if_exception_type((Exception,)), # Retry all exceptions
136 | reraise=True # Re-raise the exception at the end
137 | )
138 | def execute_completion(client: openai.OpenAI, params: Dict[str, Any]):
139 | """Execute request with retry logic and jitter"""
140 | try:
141 | return client.chat.completions.create(**params)
142 | except Exception as e:
143 | logging.error(f"API call failed: {str(e)}")
144 | raise
145 |
146 |
147 | def chat(
148 | prompt: str,
149 | system_prompt: Optional[str] = None,
150 | model_name: str = 'gpt-4o-2024-08-06',
151 | print_result: bool = False,
152 | temperature: float = 0.0,
153 | n: int = 1,
154 | max_tokens: int = 8000,
155 | thinking_budget_tokens: int = 6000,
156 | logid: Optional[str] = None,
157 | log_chat: bool = True,
158 | **kwargs
159 | ) -> Dict[str, Any]:
160 | """
161 | Generate conversational responses using specified model
162 |
163 | Parameters:
164 | prompt: User prompt text
165 | system_prompt: System prompt text (optional)
166 | model_name: Model name
167 | print_result: Whether to print results
168 | temperature: Sampling temperature
169 | n: Must be 1, otherwise throws an error
170 | max_tokens: Maximum tokens to generate
171 | logid: Custom log ID, automatically generated if None
172 |
173 | Returns:
174 | Dict: Model response result
175 | """
176 | # Validate n parameter
177 | if n != 1:
178 | raise ValueError("This implementation only supports n=1, multiple sampling has been removed to simplify code")
179 |
180 | # Generate or use provided logid
181 | if logid is None:
182 | logid = generate_logid()
183 |
184 | # Initialize chat logger and timing
185 | chat_logger = ChatLogger()
186 | start_time = time.time()
187 |
188 | # Create message list
189 | messages = [{"role": "user", "content": prompt}]
190 | if system_prompt:
191 | if model_name.startswith('o'):
192 | messages = [{"role": "user", "content": system_prompt + "\n\n\n\n" + prompt}]
193 | else:
194 | messages.insert(0, {"role": "system", "content": system_prompt})
195 |
196 | # Create appropriate client
197 | client = create_client(model_name)
198 |
199 | # Prepare API call parameters
200 | params = prepare_inference_params(client, model_name, messages, logid, temperature, max_tokens, thinking_budget_tokens)
201 |
202 | # Execute API call (with automatic retry)
203 | try:
204 | completion = execute_completion(client, params)
205 | result = completion.model_dump()
206 | result['inference_params'] = params
207 |
208 | # Calculate response time
209 | response_time = time.time() - start_time
210 |
211 | except Exception as e:
212 | logging.error(f"Request failed [logid: {logid}]: {str(e)}")
213 | raise
214 |
215 | # Print results (if needed)
216 | if print_result:
217 | print(f"LogID: {logid}")
218 | print(completion.model_dump_json())
219 | print('\n\n--------------------------------\n\n')
220 | print(completion.choices[0].message.content)
221 | print('\n--------------------------------\n')
222 | print(f"Time taken: {response_time:.2f} seconds")
223 |
224 | # Log chat interaction
225 | if log_chat:
226 | chat_logger.log_chat(
227 | prompt=prompt,
228 | completion=result,
229 | model_name=model_name,
230 | system_prompt=system_prompt
231 | )
232 |
233 | return result
234 |
235 |
236 | if __name__ == "__main__":
237 | system_prompt = None
238 | prompt = "What is the capital of France?"
239 | # model_name = "deepseek-r1-250120"
240 | model_name = "gemini-2.5-pro-preview-03-25"
241 | # model_name = "o3-mini"
242 |
243 | # Using automatically generated logid
244 | result = chat(
245 | prompt=prompt,
246 | system_prompt=system_prompt,
247 | model_name=model_name,
248 | print_result=True,
249 | n=1
250 | )
251 |
252 | # Or using custom logid
253 | # custom_logid = f"api_call_{int(time.time())}"
254 | # result = chat(
255 | # prompt=prompt,
256 | # system_prompt=system_prompt,
257 | # model_name=model_name,
258 | # print_result=True,
259 | # n=1,
260 | # logid=custom_logid
261 | # )
--------------------------------------------------------------------------------
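For `@thinking` model names, `prepare_inference_params` strips the suffix, moves the thinking budget into `extra_body`, and (via `forbidden_params`) drops `temperature` from the final payload. A minimal sketch of that transformation, assuming `api_keys.py` has been created from `api_keys.example.py` so the module imports cleanly:

```python
# Sketch: inspect how request parameters are rewritten for a thinking-mode model.
# Requires src/apebench/inference/utils/api_keys.py to exist (copy api_keys.example.py).
from src.apebench.inference.utils.call_api import prepare_inference_params, generate_logid

params = prepare_inference_params(
    client=None,  # the client argument is not used when building the parameter dict
    model_name="aws_sdk_claude37_sonnet@thinking",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    logid=generate_logid(),
    max_tokens=8000,
    thinking_budget_tokens=16000,
)
print(params["model"])                    # "aws_sdk_claude37_sonnet" (suffix removed)
print(params["max_tokens"])               # 24000 (base budget plus thinking budget)
print(params["extra_body"]["thinking"])   # {"type": "enabled", "budget_tokens": 16000}
print("temperature" in params)            # False: listed in forbidden_params for this model
```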
/src/apebench/evaluation_pipelines/evaluation_manager.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 | """
3 | Evaluation management module responsible for executing the patch evaluation process
4 | """
5 |
6 | import os
7 | import subprocess
8 | import json
9 | from datetime import datetime
10 | from typing import Dict, List, Any, Optional
11 | from ...utils import load_results, save_jsonl, load_jsonl
12 | from ..utils import ProgressTracker, calculate_metrics, plot_metrics, extract_judgement_data
13 |
14 | def filter_verified_data(merged_file: str) -> List[Dict[str, Any]]:
15 | """
16 | Filter data that passed verification from the merged file
17 |
18 | Args:
19 | merged_file: Path to the merged results file
20 |
21 | Returns:
22 | List of verified data
23 | """
24 | # Load merged data
25 | with open(merged_file, 'r') as f:
26 | merged_data = [json.loads(line) for line in f if line.strip()]
27 |
28 | # Filter items that passed verification
29 | verified_data = []
30 |
31 | for item in merged_data:
32 | verified_responses = []
33 |
34 | for response in item.get('responses', []):
35 | # Check verification result
36 | if response.get('verification_result', {}).get('complete', False):
37 | verified_responses.append(response)
38 |
39 | if verified_responses:
40 | # Create new item containing only verified responses
41 | verified_item = item.copy()
42 | verified_item['responses'] = verified_responses
43 | verified_data.append(verified_item)
44 |
45 | return verified_data
46 |
47 | def flatten_results(results):
48 | """
49 | Flatten verification results
50 | """
51 | flattened_results = []
52 | for result in results:
53 | for response in result.get('responses', []):
54 | if response is not None and response.get('verification_result', {}).get('complete', False):
55 | if not 'best_gen_patch' in response:
56 | best_gen_patch = response['gen_patch']
57 | if 'gen_patch_after_exact_repair' in response:
58 | best_gen_patch = response['gen_patch_after_exact_repair']
59 | if 'gen_patch_after_robust_repair' in response:
60 | best_gen_patch = response['gen_patch_after_robust_repair']
61 | response['best_gen_patch'] = best_gen_patch
62 | else:
63 | best_gen_patch = response['best_gen_patch']
64 | flattened_result = result.copy()
65 | flattened_result['best_gen_patch'] = best_gen_patch
66 | # flattened_result['patch_generation_responses'] = flattened_result.pop('responses')
67 | flattened_result.update({k : response[k] for k in ('model', 'usage', 'inference_params', 'verification_result', 'best_gen_content', 'best_gen_patch', 'best_gen_patch_comment_free')})
68 | flattened_result['raw_patch_generation_responses'] = response['raw_response']
69 | flattened_results.append(flattened_result)
70 | return flattened_results
71 |
72 | def evaluate_patches(config_file: str, merged_results_file: Optional[str] = None) -> Dict[str, Any]:
73 | """
74 | Evaluate the quality of verified patches
75 |
76 | Args:
77 | config_file: Path to configuration file
78 | merged_results_file: Optional path to merged results file
79 |
80 | Returns:
81 | Evaluation metrics
82 | """
83 | # Import here instead of at the top to avoid circular imports
84 | from ..config.config_manager import ConfigManager
85 |
86 | # Load configuration
87 | config = ConfigManager(config_file).get_config()
88 |
89 | # Initialize progress tracker
90 | progress_tracker = ProgressTracker(config.progress_log)
91 |
92 | print(f"Running patch evaluation with configuration from: {config_file}")
93 |
94 | # Check if evaluation is already completed
95 | evaluation_status = progress_tracker.get_evaluation_status()
96 | if evaluation_status.get("completed", False):
97 | print("Evaluation already completed")
98 | verification_status = progress_tracker.get_verification_status()
99 | verification_metrics = verification_status.get("metrics", {})
100 | judgement_status = progress_tracker.get_evaluation_status()
101 | judgement_metrics = judgement_status.get("metrics", {})
102 | return verification_metrics, judgement_metrics
103 |
104 | # If no merged results file is provided, get it from the progress record
105 | if not merged_results_file:
106 | verification_status = progress_tracker.get_verification_status()
107 | if verification_status.get("completed", False):
108 | merged_results_file = verification_status.get("merged_results", "")
109 | else:
110 | raise ValueError("Verification has not been completed. Run verify_patches first.")
111 |
112 | if not merged_results_file or not os.path.exists(merged_results_file):
113 | raise ValueError(f"Merged results file not found: {merged_results_file}")
114 |
115 | print(f"Using merged results file: {merged_results_file}")
116 |
117 | # Create temporary and output directories
118 | os.makedirs(config.temp_dir, exist_ok=True)
119 | os.makedirs(config.output_dir, exist_ok=True)
120 |
121 | # Create timestamp
122 | timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
123 |
124 | # 1. Flatten verification results
125 | print("Flattening verification results...")
126 | merged_results = load_jsonl(merged_results_file)
127 | flattened_results = flatten_results(merged_results)
128 | flattened_results_file = f"{config.temp_dir}/flattened_verification_{timestamp}.jsonl"
129 | save_jsonl(flattened_results, flattened_results_file)
130 | print(f"Flattened {len(flattened_results)} results saved to: {flattened_results_file}")
131 |
132 | # 2. Run judgment generation pipeline
133 | print("Running judgement generation...")
134 | judgement_output_file = f"{config.output_dir}/judgement_{timestamp}.jsonl"
135 |
136 | judgement_cmd = [
137 | "python", "-m", "src.apebench.inference.run_inference",
138 | "--pipeline", "judgement",
139 | "--input_file", flattened_results_file,
140 | "--output_file", judgement_output_file,
141 | "--model_name", config.judgement.model_name,
142 | "--temperature", str(config.judgement.temperature),
143 | "--n_responses", str(config.judgement.n_responses),
144 | "--max_workers", str(config.judgement.max_workers)
145 | ]
146 |
147 | if hasattr(config.judgement, 'max_tokens') and config.judgement.max_tokens:
148 | judgement_cmd.append("--max_tokens")
149 | judgement_cmd.append(str(config.judgement.max_tokens))
150 |
151 | if hasattr(config.judgement, 'thinking_budget_tokens') and config.judgement.thinking_budget_tokens:
152 | judgement_cmd.append("--thinking_budget_tokens")
153 | judgement_cmd.append(str(config.judgement.thinking_budget_tokens))
154 |
155 | print(f"Executing: {' '.join(judgement_cmd)}")
156 | subprocess.run(judgement_cmd, check=True)
157 |
158 | # 3. Collect and filter judgment results
159 | print("Filtering judgement results...")
160 | filtered_judgement_file = f"{config.output_dir}/filtered_judgement_{timestamp}.jsonl"
161 |
162 | filter_cmd = [
163 | "python", "-m", "src.apebench.evaluation_pipelines.gather_results",
164 | "--pipeline", "judgement",
165 | "--input_files", judgement_output_file,
166 | "--output_file", filtered_judgement_file,
167 | ]
168 |
169 | print(f"Executing: {' '.join(filter_cmd)}")
170 | subprocess.run(filter_cmd, check=True)
171 |
172 | # 4. Calculate final evaluation metrics (using modified gather_results implementation)
173 | print("Calculating final evaluation metrics...")
174 | judgement_data = extract_judgement_data(filtered_judgement_file)
175 | metrics = calculate_metrics(judgement_data, config)
176 |
177 | # 5. Generate visualizations
178 | if hasattr(config.evaluation, 'generate_plots') and config.evaluation.generate_plots:
179 | print("Generating judgement metric plots...")
180 | plots_dir = getattr(config.evaluation, 'plots_dir', './judgement_plots')
181 | os.makedirs(plots_dir, exist_ok=True)
182 | plot_metrics(metrics, plots_dir, f'judgement_{timestamp}')
183 |
184 | # 6. Save metrics
185 | metrics_file = f"{config.output_dir}/judgement_metrics_{timestamp}.json"
186 | with open(metrics_file, 'w') as f:
187 | json.dump(metrics, f, indent=2)
188 |
189 | # 7. Update progress tracking
190 | evaluation_status = {
191 | "completed": True,
192 | "timestamp": timestamp,
193 | "judgement_output": judgement_output_file,
194 | "filtered_judgement": filtered_judgement_file,
195 | "metrics_file": metrics_file,
196 | "metrics": metrics
197 | }
198 |
199 | progress_tracker.update_evaluation_status(evaluation_status)
200 |
201 | print(f"Evaluation completed. Results saved to: {metrics_file}")
202 |
203 |     # 8. Reload verification metrics
204 | verification_status = progress_tracker.get_verification_status()
205 | verification_metrics = verification_status.get("metrics", {})
206 |
207 | return verification_metrics, metrics
--------------------------------------------------------------------------------
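`flatten_results` expects each merged record to carry a `responses` list in which every verified response exposes the fields copied into the flattened row. A minimal sketch of that shape; the values are placeholders and only the keys (taken from the function above) matter:

```python
# Illustrative shape of one merged record as consumed by flatten_results.
merged_record = {
    "local_index": 0,
    "instruction": "Add a lemma ...",
    "responses": [
        {
            "verification_result": {"complete": True},   # only complete responses are kept
            "gen_patch": "--- a/... (raw generated diff)",
            "gen_patch_after_exact_repair": "--- a/... (preferred over gen_patch)",
            # "gen_patch_after_robust_repair" would take precedence if present
            "best_gen_content": "... full file after applying the patch ...",
            "best_gen_patch_comment_free": "--- a/... (comments stripped)",
            "model": "gpt-4o-2024-08-06",
            "usage": {"prompt_tokens": 1200, "completion_tokens": 300},
            "inference_params": {"temperature": 0.0, "n_responses": 1},
            "raw_response": {"message": {"content": "..."}},
        }
    ],
}
```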
/docs/04_troubleshooting.md:
--------------------------------------------------------------------------------
1 | [English](#4-troubleshooting) | [中文](#中文翻译-chinese-translation)
2 |
3 |
4 | # 4. Troubleshooting
5 |
6 | This section lists common issues encountered during setup or execution and provides potential solutions.
7 |
8 | ## Eleanstic Setup Issues
9 |
10 | * **Issue**: Eleanstic preprocessing fails or takes an extremely long time.
11 | * **Cause**: Insufficient disk space for Mathlib clones, `.lake` build artifacts (before Eleanstic processes them), or the Eleanstic CAS store.
12 | * **Solution**: Ensure ample free disk space (hundreds of GB may be needed temporarily for many commits). Check paths in `src/eleanstic/config.yaml` are correct and writable.
13 | * **Cause**: `lake build` errors for specific Mathlib commits (e.g., network issues during `lake exe cache get`, toolchain problems).
14 | * **Solution**: Ensure Lean and Lake are correctly installed and in PATH. Check Eleanstic logs for specific errors from `lake`. The `src/eleanstic/README.md` mentions retry mechanisms for `lake exe cache get`; ensure these are active or consider increasing retry attempts/timeouts if configurable. Some older Mathlib commits might have unique build issues; Eleanstic should ideally be robust to a few failing commits or allow skipping them if they are not critical for the benchmark set.
15 | * **Cause**: Incorrect `mathlib_repo_path` in `src/eleanstic/config.yaml`.
16 | * **Solution**: Verify the path points to a valid, up-to-date clone of `leanprover-community/mathlib4`.
17 |
18 | * **Issue**: Eleanstic CAS store grows excessively large despite deduplication.
19 | * **Cause**: If many binary files (e.g., compiled `.olean` files) have minor, non-semantic differences across commits that defeat simple content hashing.
20 | * **Solution**: This is an inherent challenge. Eleanstic's design aims to mitigate this. Ensure Eleanstic is correctly identifying and hashing files. For extreme cases, one might investigate more advanced binary diffing/patching for storage, but this would be a significant R&D effort for Eleanstic itself.
21 |
22 | ## LLM Inference Issues
23 |
24 | * **Issue**: API errors from LLMs (e.g., authentication, rate limits, model not found).
25 | * **Solution**:
26 | * **Authentication**: Double-check API keys are correctly set as environment variables or in `src/apebench/config/` model configuration files.
27 |     * **Rate Limits**: Implement or enhance retry logic (e.g., exponential backoff, as provided by the `tenacity` library in `requirements.txt`) in the API calling modules in `src/apebench/inference/`. Consider reducing batch sizes or running inference for fewer tasks at a time. A minimal backoff sketch is shown after this list.
28 | * **Model Not Found**: Ensure the model names in your configuration match the exact identifiers used by the LLM provider's API.
29 |
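A minimal retry sketch using the `tenacity` library already listed in `requirements.txt`; `call_model` is a placeholder, not the project's actual API wrapper:

```python
# Minimal sketch: exponential backoff with jitter around an arbitrary API call.
from tenacity import retry, stop_after_attempt, wait_exponential, wait_random, wait_combine

@retry(
    stop=stop_after_attempt(5),
    wait=wait_combine(wait_exponential(multiplier=1, min=1, max=60), wait_random(0, 2)),
    reraise=True,
)
def call_model(client, **params):
    # Placeholder for whichever client call your pipeline makes.
    return client.chat.completions.create(**params)
```
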
30 | * **Issue**: LLM outputs are not in the expected diff format.
31 | * **Solution**: Review and refine the prompting strategy used in `src/apebench/inference/`. Ensure prompts clearly instruct the LLM to output a unified diff. `DiffRepair` can handle some noise, but if the output is entirely unstructured, prompting is the primary fix.
32 |
33 | * **Issue**: `DiffRepair` fails to repair a patch or significantly alters its meaning.
34 | * **Cause**: The LLM-generated diff is too divergent from the `PreFile` context, or `DiffRepair`'s fuzzy matching thresholds are too strict/loose.
35 | * **Solution**:
36 | * Inspect the problematic raw diff and `PreFile`.
37 | * Experiment with `DiffRepair` parameters (e.g., `strict_match_threshold`, `exact_match` flag when initializing `DiffRepair` in the inference pipeline).
38 |     * For systematic issues, this might indicate a need to improve `DiffRepair`'s algorithms (see [LLM Inference and DiffRepair - Secondary Development](./03_core_components/03_3_apebench_inference.md)).
39 |
40 | ## Evaluation Issues
41 |
42 | * **Issue**: Syntactic verification (Lean compile) fails for patches that seem correct.
43 | * **Cause**: Eleanstic might not be restoring the *exact* correct versioned environment (e.g., wrong snapshot, issue during file restoration from CAS).
44 | * **Solution**: Verify Eleanstic setup. Check logs from Eleanstic and the Lean compiler for specific errors. Ensure the task's commit SHA is correctly mapped to the Eleanstic snapshot.
45 | * **Cause**: The patch, even if repaired, introduces subtle Lean errors not obvious at first glance.
46 |         * **Solution**: Manually apply the patch to the `PreFile` (from the correct Mathlib commit, checked out locally) and try to compile with `lake env lean <file>` to debug the Lean error directly; a minimal sketch of this loop follows below.
47 |
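A minimal sketch of that manual debugging loop; the paths are placeholders, and it assumes a local `mathlib4` checkout pinned to the task's commit:

```python
# Minimal sketch: apply a candidate patch and compile only the touched file.
# Paths are placeholders; run against a mathlib4 checkout at the task's commit.
import subprocess

subprocess.run(["git", "apply", "candidate.patch"], cwd="mathlib4", check=True)
subprocess.run(["lake", "env", "lean", "Mathlib/Algebra/Example.lean"], cwd="mathlib4", check=True)
```
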
48 | * **Issue**: Semantic Judgement (LLM-as-a-Judge) gives unexpected results.
49 | * **Cause**: Prompting issues for the judge LLM; instability in judge LLM responses.
50 | * **Solution**: Review the semantic evaluation prompts. Ensure the `sample@4` voting is working as expected. The APE-Bench I paper uses Claude Sonnet 3.7 (thinking mode); using a different judge model might require re-calibrating expectations or prompts.
51 |
52 | ## General Issues
53 |
54 | * **Issue**: Python `ModuleNotFoundError` or `ImportError`.
55 | * **Solution**: Ensure your virtual environment is activated (`source venv/bin/activate`). Verify all dependencies in `requirements.txt` are installed correctly (`pip install -r requirements.txt`). Check `PYTHONPATH` if using complex project structures, though this should generally not be needed if the project is structured as a proper Python package.
56 |
57 | * **Issue**: Slow performance.
58 | * **Cause**: LLM API calls can be slow. Eleanstic preprocessing is intensive but one-time per commit. Disk I/O for Eleanstic CAS on slow drives.
59 | * **Solution**:
60 | * Use faster LLM models if available (though this changes the experiment).
61 | * Ensure Eleanstic CAS and snapshot directories are on fast storage (SSD recommended).
62 | * For inference, consider parallelizing API calls across multiple tasks if your API quotas and local resources allow (scripts in `src/apebench/scripts/` might already do this).
63 |
64 | If you encounter an issue not listed here, please check existing GitHub issues for the project (if available) or consider reporting a new one with detailed information: steps to reproduce, error messages, relevant configuration, and environment details.
65 |
66 | ---
67 |
68 | Next: [Development and Contribution Guide](./05_development_contribution.md)
69 |
70 |
71 |
72 | ## 中文翻译 (Chinese Translation)
73 |
74 | # 4. 故障排除
75 |
76 | 本节列出了在设置或执行过程中遇到的常见问题及其潜在的解决方案。
77 |
78 | ## Eleanstic 设置问题
79 |
80 | * **问题**:Eleanstic 预处理失败或耗时过长。
81 | * **原因**:Mathlib 克隆、`.lake` 构建产物(在 Eleanstic 处理它们之前)或 Eleanstic CAS 存储的磁盘空间不足。
82 | * **解决方案**:确保有足够的可用磁盘空间(对于许多提交,可能临时需要数百 GB)。检查 `src/eleanstic/config.yaml` 中的路径是否正确且可写。
83 | * **原因**:特定 Mathlib 提交的 `lake build` 错误(例如,`lake exe cache get` 期间的网络问题,工具链问题)。
84 | * **解决方案**:确保 Lean 和 Lake 已正确安装并在 PATH 中。检查 Eleanstic 日志以获取来自 `lake` 的特定错误。`src/eleanstic/README.md` 提到了 `lake exe cache get` 的重试机制;确保这些机制已激活,或者如果可配置,则考虑增加重试次数/超时。一些较旧的 Mathlib 提交可能存在独特的构建问题;理想情况下,Eleanstic 应该能够容忍少量失败的提交,或者如果它们对基准测试集不重要,则允许跳过它们。
85 | * **原因**:`src/eleanstic/config.yaml` 中的 `mathlib_repo_path` 不正确。
86 | * **解决方案**:验证该路径指向 `leanprover-community/mathlib4` 的有效、最新的克隆。
87 |
88 | * **问题**:尽管进行了重复数据删除,Eleanstic CAS 存储仍然过度增长。
89 | * **原因**:如果许多二进制文件(例如,编译的 `.olean` 文件)在不同提交之间存在微小的、非语义的差异,从而破坏了简单的内容哈希。
90 | * **解决方案**:这是一个固有的挑战。Eleanstic 的设计旨在缓解此问题。确保 Eleanstic 正确识别和哈希文件。对于极端情况,可以研究更高级的二进制差异/补丁存储方法,但这将是 Eleanstic 本身的重大研发工作。
91 |
92 | ## LLM 推理问题
93 |
94 | * **问题**:来自 LLM 的 API 错误(例如,身份验证、速率限制、模型未找到)。
95 | * **解决方案**:
96 | * **身份验证**:仔细检查 API 密钥是否已正确设置为环境变量或在 `src/apebench/config/` 模型配置文件中。
97 | * **速率限制**:在 `src/apebench/inference/` 的 API 调用模块中实现或增强重试逻辑(例如,指数退避,如 `requirements.txt` 中的 `tenacity` 库所提供)。考虑减少批处理大小或一次运行较少任务的推理。
98 | * **模型未找到**:确保配置中的模型名称与 LLM 提供商 API 使用的确切标识符匹配。
99 |
100 | * **问题**:LLM 输出未采用预期的差异格式。
101 | * **解决方案**:审查并优化 `src/apebench/inference/` 中使用的提示策略。确保提示明确指示 LLM 输出统一差异格式。`DiffRepair` 可以处理一些噪音,但如果输出完全没有结构,则提示是主要的解决方法。
102 |
103 | * **问题**:`DiffRepair` 无法修复补丁或显著改变其含义。
104 | * **原因**:LLM 生成的差异与 `PreFile` 上下文过于偏离,或者 `DiffRepair` 的模糊匹配阈值过于严格/宽松。
105 | * **解决方案**:
106 | * 检查有问题的原始差异和 `PreFile`。
107 | * 试验 `DiffRepair` 参数(例如,在推理流程中初始化 `DiffRepair` 时的 `strict_match_threshold`、`exact_match` 标志)。
108 |     * 对于系统性问题,这可能表明需要改进 `DiffRepair` 的算法(请参阅[LLM 推理与 DiffRepair - 二次开发](./03_core_components/03_3_apebench_inference.md))。
109 |
110 | ## 评估问题
111 |
112 | * **问题**:对于看起来正确的补丁,语法验证(Lean 编译)失败。
113 | * **原因**:Eleanstic 可能没有恢复*完全*正确的版本化环境(例如,错误的快照,从 CAS 恢复文件时出现问题)。
114 | * **解决方案**:验证 Eleanstic 设置。检查 Eleanstic 和 Lean 编译器的日志以获取特定错误。确保任务的提交 SHA 正确映射到 Eleanstic 快照。
115 | * **原因**:即使修复后,补丁仍引入了乍一看并不明显的细微 Lean 错误。
116 |     * **解决方案**:将补丁手动应用于 `PreFile`(来自正确的 Mathlib 提交,本地检出),并尝试使用 `lake env lean <file>` 进行编译以直接调试 Lean 错误。
117 |
118 | * **问题**:语义判断(作为裁判的 LLM)给出意外结果。
119 | * **原因**:裁判 LLM 的提示问题;裁判 LLM 响应的不稳定性。
120 | * **解决方案**:审查语义评估提示。确保 `sample@4` 投票按预期工作。APE-Bench I 论文使用 Claude Sonnet 3.7(思考模式);使用不同的裁判模型可能需要重新校准期望或提示。
121 |
122 | ## 一般问题
123 |
124 | * **问题**:Python `ModuleNotFoundError` 或 `ImportError`。
125 | * **解决方案**:确保您的虚拟环境已激活 (`source venv/bin/activate`)。验证 `requirements.txt` 中的所有依赖项均已正确安装 (`pip install -r requirements.txt`)。如果使用复杂的项目结构,请检查 `PYTHONPATH`,尽管如果项目结构为正确的 Python 包,则通常不需要这样做。
126 |
127 | * **问题**:性能缓慢。
128 | * **原因**:LLM API 调用可能很慢。Eleanstic 预处理计算量大,但每个提交仅执行一次。慢速驱动器上 Eleanstic CAS 的磁盘 I/O。
129 | * **解决方案**:
130 | * 如果可用,请使用更快的 LLM 模型(尽管这会改变实验)。
131 | * 确保 Eleanstic CAS 和快照目录位于快速存储设备上(建议使用 SSD)。
132 | * 对于推理,如果您的 API 配额和本地资源允许,请考虑跨多个任务并行化 API 调用(`src/apebench/scripts/` 中的脚本可能已经这样做了)。
133 |
134 | 如果您遇到此处未列出的问题,请检查项目的现有 GitHub 问题(如果可用),或考虑报告一个新问题并提供详细信息:重现步骤、错误消息、相关配置和环境详细信息。
135 |
136 | ---
137 |
138 | 下一节: [开发与贡献指南](./05_development_contribution.md)
--------------------------------------------------------------------------------
/src/apebench/evaluation_pipelines/data_collector.py:
--------------------------------------------------------------------------------
1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates.
2 | """
3 | Data Collection Management Module, responsible for executing the dataset creation workflow
4 | """
5 |
6 | import os
7 | import subprocess
8 | import logging
9 | from datetime import datetime
10 | from typing import Dict, Any, Optional
11 | import pandas as pd
12 |
13 | # Configure logging
14 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
15 | logger = logging.getLogger(__name__)
16 |
17 | def collect_data(config_file: str) -> Dict[str, Any]:
18 | """
19 | Run data collection tasks according to configuration
20 |
21 | Args:
22 | config_file: Configuration file path
23 |
24 | Returns:
25 | Key file path information generated during the collection process
26 | """
27 | # Import here instead of at the top to avoid circular imports
28 | from ..config.config_manager import ConfigManager
29 |
30 | # Load configuration
31 | config_manager = ConfigManager(config_file)
32 | config = config_manager.get_config()
33 |
34 | # Use data_collection section of the configuration
35 | data_config = config.data_collection
36 |
37 | # Create output directory
38 | os.makedirs(data_config.dataset_dir, exist_ok=True)
39 |
40 | # Get data collection timestamp
41 | data_collection_date = datetime.now().strftime('%Y%m%d_%H%M%S')
42 |
43 | # Set key path variables
44 | repo_path = data_config.repo_path
45 | repo_name = os.path.basename(repo_path)
46 | max_diff_lines = data_config.max_diff_lines
47 |
48 | # Build base filename
49 | base_filename = f"{repo_name}_commits_data_{data_collection_date}_{max_diff_lines}"
50 |
51 | # Record all generated files
52 | output_files = {}
53 |
54 | # Step 1: Clone repository
55 | if not os.path.exists(repo_path):
56 | logger.info(f"Cloning {data_config.repo_url} to {repo_path}")
57 | subprocess.run(["git", "clone", data_config.repo_url, repo_path], check=True)
58 | else:
59 | logger.info(f"Repository {repo_path} already exists, skipping clone")
60 |
61 | # Step 2: Collect commit data
62 | raw_data_path = os.path.join(data_config.dataset_dir, f"{base_filename}.parquet")
63 | logger.info(f"Collecting commit data to {raw_data_path}")
64 |
65 | collect_cmd = [
66 | "python", "src/apebench/data/collect_commit_data.py",
67 | "--repo_path", repo_path,
68 | "--output_path", raw_data_path,
69 | "--max_diff_lines", str(max_diff_lines)
70 | ]
71 | subprocess.run(collect_cmd, check=True)
72 | output_files["raw_data"] = raw_data_path
73 |
74 | # Step 3: Filter commit data
75 | filtered_data_path = os.path.join(data_config.dataset_dir, f"filtered_{base_filename}.parquet")
76 | length_plot_path = os.path.join(data_config.dataset_dir, f"filtered_{base_filename}_filtered_length_distribution.png")
77 |
78 | logger.info(f"Filtering data to {filtered_data_path}")
79 | filter_cmd = [
80 | "python", "src/apebench/data/filter_commit_data.py",
81 | "--file_path", raw_data_path,
82 | "--output_path", filtered_data_path,
83 | "--length_distribution_plot_path", length_plot_path
84 | ]
85 | subprocess.run(filter_cmd, check=True)
86 | output_files["filtered_data"] = filtered_data_path
87 | output_files["length_plot"] = length_plot_path
88 |
89 | # Step 4: Build database
90 | logger.info("Building database")
91 | build_cmd = [
92 | "python", "-m", "src.eleanstic.main",
93 | "--input_file", filtered_data_path,
94 | "build"
95 | ]
96 | subprocess.run(build_cmd, check=True)
97 |
98 | # Step 5: Verify filtered data
99 | verify_result_dir = os.path.join(data_config.dataset_dir, "verify_results", f"filtered_{base_filename}")
100 | os.makedirs(os.path.dirname(verify_result_dir), exist_ok=True)
101 |
102 | logger.info(f"Verifying filtered data, saving results to {verify_result_dir}")
103 | verify_cmd = [
104 | "python", "-m", "src.eleanstic.main",
105 | "--input_file", filtered_data_path,
106 | "verify",
107 | "--code_key", "content_after",
108 | "--results_dir", verify_result_dir
109 | ]
110 | subprocess.run(verify_cmd, check=True)
111 | output_files["verify_results"] = verify_result_dir
112 |
113 | # Step 6: Filter verification data
114 | verified_data_path = os.path.join(data_config.dataset_dir, f"filtered_{base_filename}_verified.parquet")
115 |
116 | logger.info(f"Filtering verification data to {verified_data_path}")
117 | filter_results_cmd = [
118 | "python", "src/apebench/data/filter_results.py",
119 | "--pipeline", "verification",
120 | "--input_files", f"{verify_result_dir}/*.jsonl",
121 | "--output_file", verified_data_path,
122 | "--reset_index_by_date"
123 | ]
124 | subprocess.run(filter_results_cmd, check=True)
125 | output_files["verified_data"] = verified_data_path
126 |
127 | # Step 7: Extract latest data
128 | latest_num_data = data_config.latest_num_data
129 | latest_data_path = os.path.join(data_config.dataset_dir, f"filtered_{base_filename}_verified_latest_{latest_num_data}.jsonl")
130 |
131 | logger.info(f"Extracting latest {latest_num_data} records to {latest_data_path}")
132 |     # Use pandas directly here instead of shelling out to a separate script
133 | df = pd.read_parquet(verified_data_path)
134 | df.sort_values(by='date', ascending=False, inplace=True)
135 | df = df.head(latest_num_data)
136 | df.to_json(latest_data_path, orient='records', lines=True)
137 | output_files["latest_data"] = latest_data_path
138 |
139 | # Ensure output directories exist
140 | os.makedirs(config.output_dir, exist_ok=True)
141 | os.makedirs(os.path.join(config.output_dir, "instruction"), exist_ok=True)
142 | os.makedirs(os.path.join(config.output_dir, "judgement"), exist_ok=True)
143 |
144 | # Step 8: Generate instruction data
145 | instruction_model_name = data_config.instruction_model
146 | instruction_output_path = os.path.join(
147 | config.output_dir,
148 | "instruction",
149 | f"filtered_{base_filename}_verified_latest_{latest_num_data}_instruction_{instruction_model_name}.jsonl"
150 | )
151 |
152 | logger.info(f"Generating instruction data to {instruction_output_path}")
153 | instruction_cmd = [
154 | "python", "src/apebench/inference/run_inference.py",
155 | "--pipeline", "instruction",
156 | "--input_file", latest_data_path,
157 | "--output_file", instruction_output_path,
158 | "--model_name", instruction_model_name,
159 | "--max_workers", str(data_config.max_workers),
160 | "--n_responses", "1",
161 | "--temperature", "0",
162 | "--max_tokens", str(data_config.max_tokens),
163 | "--thinking_budget_tokens", str(data_config.thinking_budget_tokens)
164 | ]
165 | subprocess.run(instruction_cmd, check=True)
166 | output_files["instruction_output"] = instruction_output_path
167 |
168 | # Create instruction data directory
169 | os.makedirs(os.path.join(data_config.dataset_dir, "instruction"), exist_ok=True)
170 |
171 | instruction_data_path = os.path.join(
172 | data_config.dataset_dir,
173 | "instruction",
174 | f"filtered_{base_filename}_verified_latest_{latest_num_data}_instruction_{instruction_model_name}.jsonl"
175 | )
176 |
177 | logger.info(f"Filtering instruction data to {instruction_data_path}")
178 | filter_instruction_cmd = [
179 | "python", "src/apebench/data/filter_results.py",
180 | "--pipeline", "instruction",
181 | "--input_files", instruction_output_path,
182 | "--output_file", instruction_data_path,
183 | "--extract_exercise_info"
184 | ]
185 | subprocess.run(filter_instruction_cmd, check=True)
186 | output_files["instruction_data"] = instruction_data_path
187 |
188 | # Step 9: Verify through judgement of golden differences
189 | judgement_model_name = data_config.judgement_model
190 | judgement_output_path = os.path.join(
191 | config.output_dir,
192 | "judgement",
193 | f"filtered_{base_filename}_verified_latest_{latest_num_data}_judgement_{judgement_model_name}.jsonl"
194 | )
195 |
196 | logger.info(f"Executing judgement verification to {judgement_output_path}")
197 | judgement_cmd = [
198 | "python", "src/apebench/inference/run_inference.py",
199 | "--pipeline", "judgement",
200 | "--input_file", instruction_data_path,
201 | "--output_file", judgement_output_path,
202 | "--model_name", judgement_model_name,
203 | "--max_workers", str(data_config.max_workers),
204 | "--n_responses", "1",
205 | "--temperature", "0",
206 | "--max_tokens", str(data_config.max_tokens),
207 | "--thinking_budget_tokens", str(data_config.thinking_budget_tokens),
208 | "--patch_key", "gold_diff"
209 | ]
210 | subprocess.run(judgement_cmd, check=True)
211 | output_files["judgement_output"] = judgement_output_path
212 |
213 | # Create judgement data directory
214 | os.makedirs(os.path.join(data_config.dataset_dir, "judgement"), exist_ok=True)
215 |
216 | judgement_data_path = os.path.join(
217 | data_config.dataset_dir,
218 | "judgement",
219 | f"filtered_{base_filename}_verified_latest_{latest_num_data}_judgement_{judgement_model_name}.jsonl"
220 | )
221 |
222 | logger.info(f"Filtering judgement data to {judgement_data_path}")
223 | filter_judgement_cmd = [
224 | "python", "src/apebench/data/filter_results.py",
225 | "--pipeline", "judgement",
226 | "--input_files", judgement_output_path,
227 | "--output_file", judgement_data_path
228 | ]
229 | subprocess.run(filter_judgement_cmd, check=True)
230 | output_files["judgement_data"] = judgement_data_path
231 |
232 | logger.info(f"Data collection complete! Final data path: {judgement_data_path}")
233 |
234 | return output_files
--------------------------------------------------------------------------------
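`collect_data` reads a handful of attributes from `config.data_collection` (plus `config.output_dir`). A minimal sketch of the fields it touches, using `SimpleNamespace` purely for illustration; the real values come from the YAML configuration handled by `ConfigManager`, and every value below is a placeholder:

```python
# Illustrative stand-in for config.data_collection, listing only the attributes
# read by collect_data above. Real runs load these from the YAML config instead.
from types import SimpleNamespace

data_collection = SimpleNamespace(
    repo_url="https://github.com/leanprover-community/mathlib4.git",
    repo_path="mathlib4",           # local clone location
    dataset_dir="datasets",         # where raw/filtered/verified data are written
    max_diff_lines=100,             # diff-size cutoff used in the output filenames
    latest_num_data=200,            # how many most-recent records to keep
    instruction_model="aws_sdk_claude37_sonnet@thinking",   # placeholder model choices
    judgement_model="aws_sdk_claude37_sonnet@thinking",
    max_workers=8,
    max_tokens=8000,
    thinking_budget_tokens=16000,
)
```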