├── app ├── __init__.py ├── agents │ ├── __init__.py │ ├── write_dockerfile_agent │ │ ├── __init__.py │ │ └── write_dockerfile_agent.py │ ├── write_eval_script_agent │ │ └── __init__.py │ ├── context_retrieval_agent │ │ └── __init__.py │ ├── test_analysis_agent │ │ └── __init__.py │ ├── train_env_gen_agent │ │ ├── tools │ │ │ ├── finish.py │ │ │ ├── execute_bash.py │ │ │ └── search.py │ │ └── prompt.py │ └── agent.py ├── model │ ├── __init__.py │ ├── register.py │ ├── gemini.py │ ├── groq.py │ ├── claude.py │ ├── bedrock.py │ ├── ollama.py │ ├── gptlitellm.py │ └── common.py ├── globals.py ├── globals_mut.py ├── task.py ├── log.py ├── data_structures.py ├── post_process.py └── raw_tasks.py ├── data_collection ├── collect │ ├── __init__.py │ ├── SetupBench-lite │ │ ├── batch_12.txt │ │ ├── batch_need_test_resources.txt │ │ ├── batch_11.txt │ │ ├── batch_1.txt │ │ ├── batch_7.txt │ │ ├── batch_9.txt │ │ ├── batch_10.txt │ │ ├── batch_8.txt │ │ ├── batch_6.txt │ │ ├── batch_3.txt │ │ ├── batch_4.txt │ │ ├── batch_5.txt │ │ └── batch_2.txt │ ├── get_top_repos.py │ ├── print_pulls.py │ ├── README.md │ ├── get_version.py │ └── build_dataset.py ├── versioning │ ├── __init__.py │ ├── get_version_mix.sh │ ├── README.md │ ├── constants.py │ ├── merge_final_data.py │ └── get_versions_by_git.py └── README.md ├── figure └── overview.png ├── .pre-commit-config.yaml ├── LICENSE ├── evaluation ├── README.md └── docker_utils.py ├── run └── run.sh ├── requirements.txt ├── scripts ├── compute_cost.py └── judge_fail2pass.py ├── .gitignore └── README.md /app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_collection/collect/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_collection/versioning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /figure/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepSoftwareAnalytics/swe-factory/HEAD/figure/overview.png -------------------------------------------------------------------------------- /app/agents/write_dockerfile_agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .write_dockerfile_agent import WriteDockerfileAgent, write_dockerfile_utils -------------------------------------------------------------------------------- /app/agents/write_eval_script_agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .write_eval_script_agent import WriteEvalScriptAgent, write_eval_script_utils -------------------------------------------------------------------------------- /app/agents/context_retrieval_agent/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.context_retrieval_agent import ContextRetrievalAgent, context_retrieval_utils -------------------------------------------------------------------------------- /app/agents/test_analysis_agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .test_analysis_agent import TestAnalysisAgent, test_analysis_utils 2 | from . import docker_utils -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_12.txt: -------------------------------------------------------------------------------- 1 | mochajs__mocha-1878 2 | mochajs__mocha-1698 3 | mochajs__mocha-1337 4 | mochajs__mocha-1243 5 | mochajs__mocha-1224 6 | mochajs__mocha-1110 7 | mochajs__mocha-795 8 | mochajs__mocha-577 9 | mochajs__mocha-635 10 | mochajs__mocha-368 11 | mochajs__mocha-462 12 | -------------------------------------------------------------------------------- /data_collection/README.md: -------------------------------------------------------------------------------- 1 | # Raw Issue Data Collection Project 2 | 3 | This code is dedicated to the collection and versioning of raw issue data. 4 | 5 | ## Process Overview 6 | 7 | The data handling process is divided into two main stages: 8 | 9 | 1. **Data Collection**: For instructions on how to collect raw task instances, please refer to the documentation in the `collect` directory. 10 | 2. **Data Versioning**: Once the data is collected, please follow the versioning guidelines outlined in the `versioning` directory to properly label and manage the dataset. -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3.11 3 | 4 | repos: 5 | - repo: https://github.com/asottile/pyupgrade 6 | rev: v3.15.2 7 | hooks: 8 | - id: pyupgrade 9 | args: ["--py311-plus"] 10 | 11 | - repo: https://github.com/astral-sh/ruff-pre-commit 12 | rev: v0.3.7 13 | hooks: 14 | - id: ruff 15 | args: ["--fix"] 16 | 17 | - repo: https://github.com/pycqa/isort 18 | rev: 5.13.2 19 | hooks: 20 | - id: isort 21 | name: isort (python) 22 | args: ["--profile", "black"] 23 | 24 | - repo: https://github.com/psf/black 25 | rev: 24.4.0 26 | hooks: 27 | - id: black 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | LICENSE 2 | 3 | The source code within this repository is dual-licensed. You may choose to use it under the terms of the GNU Affero General Public License (https://www.gnu.org/licenses/agpl-3.0.en.html#license-text) for non-commercial purposes, or you can obtain a commercial license for commercial use. 
4 | 5 | For non-commercial uses and licensing of this code and its derivatives (including academic purposes), an open-source licence is granted in accordance with the following terms and conditions - 6 | 7 | · GNU Affero General Public License (https://www.gnu.org/licenses/agpl-3.0.en.html#license-text) 8 | 9 | For commercial use and licensing of this code, please contact - 10 | 11 | · Yanlin Wang ( wangylin36@mail.sysu.edu.cn ) 12 | 13 | -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation Framework 2 | 3 | This directory provides the evaluation framework for the GitHub issue resolution task. 4 | 5 | ## Fail2Pass Validation 6 | 7 | Ensure your dataset contains both a Dockerfile and an evaluation script. Then run the following command to generate Fail2Pass test logs; when performing Fail2Pass validation, set `predictions_path` to `gold` and use the `--is_judge_fail2pass` flag. The logs will be saved under `run_instances/mypy_fail2pass_check/gold`. 8 | After running this command, you will find two test log files: `test_output_after_apply.txt` and `test_output_prev_apply.txt`. 9 | ```bash 10 | python run_evaluation.py \ 11 | --dataset_name "output/git-4.1-mini/mypy/results/results.json" \ 12 | --predictions_path "gold" \ 13 | --max_workers 5 \ 14 | --run_id "mypy_fail2pass_check" \ 15 | --output_path "run_instances" \ 16 | --timeout 3600 \ 17 | --is_judge_fail2pass 18 | ``` 19 | 20 | ## Evaluation 21 | 22 | Once you have a validated GitHub issue resolution dataset (including a Dockerfile and an evaluation script), you can run the evaluation using the following command: 23 | 24 | ```bash 25 | python run_evaluation.py \ 26 | --dataset_name "mypy_valid.json" \ 27 | --predictions_path "predictions.json" \ 28 | --max_workers 5 \ 29 | --run_id "mypy_evaluation" \ 30 | --output_path "run_instances" \ 31 | --timeout 3600 32 | ``` -------------------------------------------------------------------------------- /app/globals.py: -------------------------------------------------------------------------------- 1 | """ 2 | Values of global configuration variables. 
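These are plain module-level settings; values that must be mutated across worker processes live in globals_mut.py instead. 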
3 | """ 4 | 5 | # Overall output directory for results 6 | output_dir: str = "" 7 | 8 | # upper bound of the number of conversation rounds for the agent 9 | conv_round_limit: int = 15 10 | 11 | context_retrieval_round_limit: int = 15 12 | 13 | # whether to perform sbfl 14 | enable_sbfl: bool = False 15 | 16 | # whether to perform layered search 17 | enable_layered: bool = True 18 | 19 | # whether to perform our own validation 20 | enable_validation: bool = False 21 | 22 | # whether to do angelic debugging 23 | enable_angelic: bool = False 24 | 25 | # whether to do perfect angelic debugging 26 | enable_perfect_angelic: bool = False 27 | 28 | 29 | # A special mode to only save SBFL result and exit 30 | only_save_sbfl_result: bool = False 31 | 32 | # timeout for test cmd execution, currently set to 5 min 33 | test_exec_timeout: int = 300 34 | 35 | 36 | # Used with disable_patch_generation - constrains or extends the amount of context retrieval rounds 37 | context_generation_limit: int = -1 38 | 39 | get_version: bool = False 40 | 41 | enable_web_search: bool = False 42 | 43 | agent_mode: str = "multi_agent" 44 | 45 | disable_memory_pool: bool = False 46 | 47 | disable_context_retrieval: bool = False 48 | 49 | disable_run_test: bool = False 50 | 51 | disable_download_test_resources: bool = False 52 | 53 | using_ubuntu_only: bool = False -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_need_test_resources.txt: -------------------------------------------------------------------------------- 1 | nodejs__undici-3842 2 | eclipse-vertx__vert.x-3657 3 | eclipse-vertx__vert.x-1300 4 | python-pillow__Pillow-8056 5 | python-pillow__Pillow-7823 6 | python-pillow__Pillow-7883 7 | python-pillow__Pillow-7496 8 | python-pillow__Pillow-7274 9 | python-pillow__Pillow-7151 10 | python-pillow__Pillow-7111 11 | python-pillow__Pillow-6954 12 | python-pillow__Pillow-6852 13 | python-pillow__Pillow-6517 14 | python-pillow__Pillow-6481 15 | python-pillow__Pillow-6381 16 | python-pillow__Pillow-6097 17 | python-pillow__Pillow-6086 18 | python-pillow__Pillow-5756 19 | python-pillow__Pillow-5557 20 | python-pillow__Pillow-5208 21 | python-pillow__Pillow-5125 22 | python-pillow__Pillow-4664 23 | python-pillow__Pillow-4471 24 | python-pillow__Pillow-4240 25 | python-pillow__Pillow-4063 26 | python-pillow__Pillow-4147 27 | python-pillow__Pillow-3897 28 | python-pillow__Pillow-3673 29 | python-pillow__Pillow-3625 30 | python-pillow__Pillow-3588 31 | python-pillow__Pillow-3532 32 | python-pillow__Pillow-3479 33 | python-pillow__Pillow-3364 34 | python-pillow__Pillow-3023 35 | python-pillow__Pillow-2899 36 | python-pillow__Pillow-2328 37 | python-pillow__Pillow-1985 38 | python-pillow__Pillow-1539 39 | python-pillow__Pillow-1152 40 | python-pillow__Pillow-1302 41 | python-pillow__Pillow-537 42 | python-pillow__Pillow-525 43 | python-pillow__Pillow-380 44 | python-pillow__Pillow-333 45 | -------------------------------------------------------------------------------- /app/globals_mut.py: -------------------------------------------------------------------------------- 1 | """ 2 | A global store, for values that can be mutated in multiprocessing, along with their related values. 
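The counters below are multiprocessing.Value objects and are only updated while holding their locks, via the incre_* helper functions. 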
3 | """ 4 | 5 | import multiprocessing 6 | 7 | # to be set at beginning 8 | total_num_tasks = 0 9 | num_completed_tasks = multiprocessing.Value("i", 0) 10 | 11 | 12 | # to be set at beginning 13 | total_num_task_groups = 0 14 | num_completed_task_groups = multiprocessing.Value("i", 0) 15 | 16 | 17 | def init_total_num_tasks(n: int): 18 | global total_num_tasks 19 | total_num_tasks = n 20 | 21 | 22 | def init_total_num_task_groups(n: int): 23 | global total_num_task_groups 24 | total_num_task_groups = n 25 | 26 | 27 | def incre_completed_tasks() -> int: 28 | with num_completed_tasks.get_lock(): 29 | num_completed_tasks.value += 1 30 | return num_completed_tasks.value 31 | 32 | 33 | def incre_completed_task_groups() -> int: 34 | with num_completed_task_groups.get_lock(): 35 | num_completed_task_groups.value += 1 36 | return num_completed_task_groups.value 37 | 38 | 39 | def incre_task_return_msg() -> str: 40 | completed = incre_completed_tasks() 41 | completed_groups = num_completed_task_groups.value 42 | return f">>> Completed {completed}/{total_num_tasks} tasks. For groups, completed {completed_groups}/{total_num_task_groups} so far." 43 | 44 | 45 | def incre_task_group_return_msg() -> str: 46 | completed = incre_completed_task_groups() 47 | return f">>>>>> Completed {completed}/{total_num_task_groups} task groups." 48 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_11.txt: -------------------------------------------------------------------------------- 1 | python-attrs__attrs-367 2 | python-attrs__attrs-394 3 | python-attrs__attrs-383 4 | python-attrs__attrs-286 5 | python-attrs__attrs-292 6 | python-attrs__attrs-343 7 | python-attrs__attrs-277 8 | python-attrs__attrs-229 9 | python-attrs__attrs-60 10 | python-attrs__attrs-181 11 | python-attrs__attrs-186 12 | mochajs__mocha-5292 13 | mochajs__mocha-5325 14 | mochajs__mocha-5165 15 | mochajs__mocha-5231 16 | mochajs__mocha-5198 17 | mochajs__mocha-5032 18 | mochajs__mocha-4985 19 | mochajs__mocha-5074 20 | mochajs__mocha-4842 21 | mochajs__mocha-4835 22 | mochajs__mocha-4771 23 | mochajs__mocha-4807 24 | mochajs__mocha-4746 25 | mochajs__mocha-4668 26 | mochajs__mocha-4614 27 | mochajs__mocha-4638 28 | mochajs__mocha-4557 29 | mochajs__mocha-4607 30 | mochajs__mocha-4418 31 | mochajs__mocha-4382 32 | mochajs__mocha-4315 33 | mochajs__mocha-4165 34 | mochajs__mocha-4234 35 | mochajs__mocha-4147 36 | mochajs__mocha-4063 37 | mochajs__mocha-4068 38 | mochajs__mocha-3834 39 | mochajs__mocha-3816 40 | mochajs__mocha-3767 41 | mochajs__mocha-3737 42 | mochajs__mocha-3699 43 | mochajs__mocha-3632 44 | mochajs__mocha-3375 45 | mochajs__mocha-3222 46 | mochajs__mocha-3268 47 | mochajs__mocha-3024 48 | mochajs__mocha-3143 49 | mochajs__mocha-2746 50 | mochajs__mocha-2696 51 | mochajs__mocha-2642 52 | mochajs__mocha-2513 53 | mochajs__mocha-2479 54 | mochajs__mocha-2499 55 | mochajs__mocha-2345 56 | mochajs__mocha-2094 57 | mochajs__mocha-2081 58 | mochajs__mocha-1965 59 | mochajs__mocha-1410 60 | mochajs__mocha-1520 61 | -------------------------------------------------------------------------------- /data_collection/versioning/get_version_mix.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | usage() { 5 | cat >&2 <<EOF 6 | Usage: $0 <instance_file> <output_dir> [testbed_dir] 7 | 8 | <instance_file> Original task instance file (.json or .jsonl) 9 | <output_dir> Directory to store intermediate and final results 10 | [testbed_dir] (Optional) Temporary 
directory for git-clone, defaults to ./testbed 11 | EOF 12 | exit 1 13 | } 14 | 15 | # Check arguments 16 | if [ "$#" -lt 2 ] || [ "$#" -gt 3 ]; then 17 | usage 18 | fi 19 | 20 | INSTANCE="$1" 21 | OUTDIR="$2" 22 | # Use the third argument if it exists, otherwise default to ./testbed 23 | TESTBED="${3:-./testbed}" 24 | 25 | echo "🔧 Using testbed directory: $TESTBED" 26 | echo "🔧 Using output directory: $OUTDIR" 27 | 28 | mkdir -p "$TESTBED" "$OUTDIR" 29 | 30 | # 1. Extract by-github versions 31 | echo "👉 1. Getting by-github versions..." 32 | python get_versions.py \ 33 | --instances_path "$INSTANCE" \ 34 | --num_workers 100 \ 35 | --retrieval_method github \ 36 | --output_dir "$OUTDIR" 37 | 38 | # 2. Extract by-git versions 39 | echo "👉 2. Getting by-git versions..." 40 | python get_versions_by_git.py \ 41 | --instance_path "$INSTANCE" \ 42 | --testbed "$TESTBED" \ 43 | --max_workers 100 \ 44 | --output_dir "$OUTDIR" \ 45 | --last_stage_output_dir "$OUTDIR" 46 | 47 | # 3. Merge into the final version 48 | echo "👉 3. Merging both results into the final version..." 49 | python merge_final_data.py "$OUTDIR" 50 | 51 | echo "✅ All done. Results are saved in $OUTDIR" -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_1.txt: -------------------------------------------------------------------------------- 1 | assertj__assertj-3820 2 | assertj__assertj-3735 3 | assertj__assertj-3724 4 | assertj__assertj-3325 5 | assertj__assertj-3318 6 | assertj__assertj-3691 7 | assertj__assertj-3120 8 | assertj__assertj-3056 9 | assertj__assertj-2726 10 | assertj__assertj-2685 11 | assertj__assertj-2549 12 | assertj__assertj-2410 13 | assertj__assertj-2297 14 | assertj__assertj-2247 15 | assertj__assertj-2200 16 | assertj__assertj-2193 17 | assertj__assertj-2042 18 | assertj__assertj-1983 19 | assertj__assertj-1890 20 | assertj__assertj-1769 21 | assertj__assertj-1743 22 | assertj__assertj-1629 23 | assertj__assertj-1332 24 | assertj__assertj-1568 25 | assertj__assertj-1511 26 | assertj__assertj-1243 27 | assertj__assertj-1204 28 | assertj__assertj-1184 29 | assertj__assertj-1134 30 | assertj__assertj-1014 31 | assertj__assertj-813 32 | assertj__assertj-656 33 | assertj__assertj-54 34 | assertj__assertj-73 35 | assertj__assertj-169 36 | assertj__assertj-225 37 | assertj__assertj-101 38 | assertj__assertj-120 39 | assertj__assertj-613 40 | nodejs__undici-4178 41 | nodejs__undici-4131 42 | nodejs__undici-4112 43 | nodejs__undici-4088 44 | nodejs__undici-3977 45 | nodejs__undici-3855 46 | nodejs__undici-3941 47 | nodejs__undici-3833 48 | nodejs__undici-3842 49 | nodejs__undici-3758 50 | nodejs__undici-3631 51 | nodejs__undici-3566 52 | nodejs__undici-3495 53 | nodejs__undici-3505 54 | nodejs__undici-3294 55 | nodejs__undici-3251 56 | nodejs__undici-3206 57 | nodejs__undici-3169 58 | nodejs__undici-3105 59 | nodejs__undici-3047 60 | nodejs__undici-3005 61 | -------------------------------------------------------------------------------- /run/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | MODEL="deepseek/deepseek-chat-v3-0324" 5 | # "google/gemini-2.5-flash-preview" 6 | # "gpt-4.1-mini" 7 | REPO_NAME="SetupBench-lite" 8 | BASE_TASK_DIR="data_collection/collect/${REPO_NAME}" 9 | TASKS_MAP="${BASE_TASK_DIR}/merged_instances_versions.jsonl" 10 | SETUP_DIR="testbed" 11 | ROUND=5 12 | NUM_PROCS=5 13 | TEMP=0.2 14 | BATCH_COUNT=1 15 | # we split 
SetupBench-lite into 17 batches; each batch contains 40 raw issue instances. 16 | for f in "$TASKS_MAP"; do 17 | if [ ! -f "$f" ]; then 18 | echo "❌ Missing file: $f" 19 | exit 1 20 | fi 21 | done 22 | 23 | cleanup() { 24 | docker ps -a -q | xargs -r docker rm -f || true 25 | docker image prune -af || true 26 | rm -rf "$SETUP_DIR" 27 | } 28 | 29 | for idx in $(seq 1 $BATCH_COUNT); do 30 | TASK_LIST_FILE="${BASE_TASK_DIR}/batch_${idx}.txt" 31 | if [ ! -f "$TASK_LIST_FILE" ]; then 32 | echo "⚠️ Skipping missing ${TASK_LIST_FILE}" 33 | continue 34 | fi 35 | 36 | cleanup 37 | 38 | OUT_DIR="output_test1/${REPO_NAME}/${MODEL}/round_${ROUND}_batch_${idx}" 39 | RESULT_DIR="output_test1/${REPO_NAME}/${MODEL}/results" 40 | mkdir -p "$OUT_DIR" 41 | 42 | echo "▶️ Running batch_${idx} with normal mode" 43 | 44 | python app/main.py swe-bench \ 45 | --model "$MODEL" \ 46 | --tasks-map "$TASKS_MAP" \ 47 | --task-list-file "$TASK_LIST_FILE" \ 48 | --num-processes "$NUM_PROCS" \ 49 | --model-temperature "$TEMP" \ 50 | --conv-round-limit "$ROUND" \ 51 | --output-dir "$OUT_DIR" \ 52 | --setup-dir "$SETUP_DIR" \ 53 | --results-path "$RESULT_DIR" 54 | done 55 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_7.txt: -------------------------------------------------------------------------------- 1 | iamkun__dayjs-2377 2 | iamkun__dayjs-2231 3 | iamkun__dayjs-1571 4 | iamkun__dayjs-1321 5 | iamkun__dayjs-1414 6 | iamkun__dayjs-1229 7 | iamkun__dayjs-1086 8 | iamkun__dayjs-1101 9 | iamkun__dayjs-1003 10 | iamkun__dayjs-1023 11 | iamkun__dayjs-996 12 | iamkun__dayjs-980 13 | iamkun__dayjs-851 14 | iamkun__dayjs-1112 15 | iamkun__dayjs-891 16 | iamkun__dayjs-768 17 | iamkun__dayjs-719 18 | iamkun__dayjs-678 19 | iamkun__dayjs-539 20 | iamkun__dayjs-528 21 | iamkun__dayjs-453 22 | iamkun__dayjs-76 23 | iamkun__dayjs-55 24 | iamkun__dayjs-337 25 | iamkun__dayjs-1161 26 | iamkun__dayjs-952 27 | iamkun__dayjs-162 28 | python-pillow__Pillow-8852 29 | python-pillow__Pillow-8792 30 | python-pillow__Pillow-8701 31 | python-pillow__Pillow-8535 32 | python-pillow__Pillow-8602 33 | python-pillow__Pillow-8635 34 | python-pillow__Pillow-8476 35 | python-pillow__Pillow-8422 36 | python-pillow__Pillow-8231 37 | python-pillow__Pillow-8366 38 | python-pillow__Pillow-8056 39 | python-pillow__Pillow-8063 40 | python-pillow__Pillow-7948 41 | python-pillow__Pillow-7870 42 | python-pillow__Pillow-7823 43 | python-pillow__Pillow-7883 44 | python-pillow__Pillow-7496 45 | python-pillow__Pillow-7481 46 | python-pillow__Pillow-7383 47 | python-pillow__Pillow-7420 48 | python-pillow__Pillow-7412 49 | python-pillow__Pillow-7302 50 | python-pillow__Pillow-7274 51 | python-pillow__Pillow-7151 52 | python-pillow__Pillow-7078 53 | python-pillow__Pillow-7111 54 | python-pillow__Pillow-6954 55 | python-pillow__Pillow-6890 56 | python-pillow__Pillow-6852 57 | python-pillow__Pillow-6830 58 | python-pillow__Pillow-6819 59 | python-pillow__Pillow-6783 60 | python-pillow__Pillow-6647 61 | -------------------------------------------------------------------------------- /app/agents/train_env_gen_agent/tools/finish.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Description: A simple finish tool with a "submit" command. 5 | 6 | Notes about the `submit` command: 7 | * When invoked with `--result`, the provided string is used for submitting required task results (e.g., localization files). 
8 | * If no `--result` is provided, it defaults to an empty string. 9 | 10 | **Parameters:** 11 | 1. **command** (`string`, required): The command to run. Currently allowed option is: `submit`. 12 | - Allowed value: [`submit`] 13 | 2. **result** (`string`, optional): The result text to submit. Defaults to an empty string. 14 | """ 15 | 16 | import argparse 17 | import sys 18 | 19 | 20 | def submit(result: str = ""): 21 | """ 22 | Submits a final result, printing a message that includes the result. 23 | """ 24 | print("<<>>") 25 | # if result: 26 | # print(f"Final result submitted: {result}") 27 | # else: 28 | # print("No result provided.") 29 | # You can add more logic here as needed 30 | 31 | 32 | def main(): 33 | parser = argparse.ArgumentParser( 34 | description="submit tool: run the `submit` command with an optional `--result` argument." 35 | ) 36 | parser.add_argument("command", help="Subcommand to run (currently only `submit`).") 37 | parser.add_argument( 38 | "--result", help="The result text to submit (optional).", default="" 39 | ) 40 | 41 | args = parser.parse_args() 42 | 43 | if args.command == "submit": 44 | submit(args.result) 45 | else: 46 | print(f"Unknown command '{args.command}'. Only `submit` is supported.") 47 | sys.exit(1) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_9.txt: -------------------------------------------------------------------------------- 1 | python-pillow__Pillow-3338 2 | python-pillow__Pillow-3233 3 | python-pillow__Pillow-3086 4 | python-pillow__Pillow-3023 5 | python-pillow__Pillow-2899 6 | python-pillow__Pillow-2852 7 | python-pillow__Pillow-2683 8 | python-pillow__Pillow-2410 9 | python-pillow__Pillow-2641 10 | python-pillow__Pillow-2399 11 | python-pillow__Pillow-2330 12 | python-pillow__Pillow-2328 13 | python-pillow__Pillow-2262 14 | python-pillow__Pillow-2115 15 | python-pillow__Pillow-2131 16 | python-pillow__Pillow-2103 17 | python-pillow__Pillow-1988 18 | python-pillow__Pillow-1985 19 | python-pillow__Pillow-1647 20 | python-pillow__Pillow-1686 21 | python-pillow__Pillow-1594 22 | python-pillow__Pillow-1539 23 | python-pillow__Pillow-1401 24 | python-pillow__Pillow-1400 25 | python-pillow__Pillow-1152 26 | python-pillow__Pillow-1302 27 | python-pillow__Pillow-997 28 | python-pillow__Pillow-808 29 | python-pillow__Pillow-898 30 | python-pillow__Pillow-669 31 | python-pillow__Pillow-638 32 | python-pillow__Pillow-537 33 | python-pillow__Pillow-525 34 | python-pillow__Pillow-380 35 | python-pillow__Pillow-364 36 | python-pillow__Pillow-333 37 | python-pillow__Pillow-171 38 | python-pillow__Pillow-228 39 | python-pillow__Pillow-64 40 | pallets__click-2840 41 | pallets__click-2622 42 | pallets__click-2607 43 | pallets__click-2591 44 | pallets__click-2397 45 | pallets__click-2333 46 | pallets__click-2271 47 | pallets__click-2151 48 | pallets__click-2094 49 | pallets__click-2219 50 | pallets__click-2030 51 | pallets__click-1998 52 | pallets__click-1840 53 | pallets__click-1829 54 | pallets__click-1785 55 | pallets__click-1784 56 | pallets__click-1786 57 | pallets__click-1543 58 | pallets__click-1402 59 | pallets__click-1318 60 | pallets__click-1304 61 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_10.txt: -------------------------------------------------------------------------------- 1 | pallets__click-1261 2 | pallets__click-1167 3 
| pallets__click-1014 4 | pallets__click-1098 5 | pallets__click-865 6 | pallets__click-552 7 | pallets__click-706 8 | pallets__click-545 9 | pallets__click-240 10 | pallets__click-212 11 | pallets__click-123 12 | reduxjs__redux-toolkit-4758 13 | reduxjs__redux-toolkit-4768 14 | reduxjs__redux-toolkit-4762 15 | reduxjs__redux-toolkit-4869 16 | reduxjs__redux-toolkit-4732 17 | reduxjs__redux-toolkit-4204 18 | reduxjs__redux-toolkit-4084 19 | reduxjs__redux-toolkit-4082 20 | reduxjs__redux-toolkit-4055 21 | reduxjs__redux-toolkit-3878 22 | reduxjs__redux-toolkit-3800 23 | reduxjs__redux-toolkit-3414 24 | reduxjs__redux-toolkit-3388 25 | reduxjs__redux-toolkit-3188 26 | reduxjs__redux-toolkit-3116 27 | reduxjs__redux-toolkit-3089 28 | reduxjs__redux-toolkit-2835 29 | reduxjs__redux-toolkit-2804 30 | reduxjs__redux-toolkit-2595 31 | reduxjs__redux-toolkit-2363 32 | reduxjs__redux-toolkit-2225 33 | reduxjs__redux-toolkit-2000 34 | reduxjs__redux-toolkit-1984 35 | reduxjs__redux-toolkit-1662 36 | reduxjs__redux-toolkit-1496 37 | reduxjs__redux-toolkit-1520 38 | python-attrs__attrs-1417 39 | python-attrs__attrs-1410 40 | python-attrs__attrs-1383 41 | python-attrs__attrs-1329 42 | python-attrs__attrs-1172 43 | python-attrs__attrs-1267 44 | python-attrs__attrs-1009 45 | python-attrs__attrs-1319 46 | python-attrs__attrs-1122 47 | python-attrs__attrs-969 48 | python-attrs__attrs-806 49 | python-attrs__attrs-760 50 | python-attrs__attrs-830 51 | python-attrs__attrs-886 52 | python-attrs__attrs-763 53 | python-attrs__attrs-684 54 | python-attrs__attrs-712 55 | python-attrs__attrs-660 56 | python-attrs__attrs-607 57 | python-attrs__attrs-563 58 | python-attrs__attrs-586 59 | python-attrs__attrs-489 60 | python-attrs__attrs-556 61 | -------------------------------------------------------------------------------- /data_collection/versioning/README.md: -------------------------------------------------------------------------------- 1 | # Data Versioning 2 | 3 | This directory provides a hybrid strategy to accurately version task instances. It combines two methods for robust results, prioritizing accuracy while maintaining automation. 4 | 5 | *** 6 | 7 | ### Our Approach 8 | 9 | Our pipeline intelligently combines two methods: 10 | 11 | 1. **Pattern-Based Method (Primary)** 12 | Inspired by SWE-bench, this method uses a predefined map of repository paths (e.g., `__init__.py`, `package.json`) and regex patterns to find the exact version string. It is extremely fast and accurate for supported projects. 13 | 14 | 2. **Git-Based Method (Fallback)** 15 | This fully automated method infers the version by finding the nearest tag to a commit using `git describe --tags`. It requires no manual setup but is more time-consuming due to the need for repository cloning and checkout operations. 16 | 17 | If a pattern is not defined for a repository, the system will automatically use the Git-Based Method for that task. However, for the best overall performance and accuracy, we still recommend using both methods together. This hybrid approach ensures we can efficiently retrieve version information for the vast majority of task instances. 18 | *** 19 | 20 | ### How to Use 21 | 22 | Run the provided shell script to execute the entire versioning pipeline. The script runs both methods and merges the results into a final versioned file. 23 | 24 | **Command:** 25 | ```bash 26 | bash get_version_mix.sh <instance_file> <output_dir> [testbed_dir] 27 | ``` 28 | 29 | **Arguments:** 30 | 31 | * `<instance_file>`: **(Required)** Path to your input task instances file. 
32 | * `<output_dir>`: **(Required)** Directory to store the results. 33 | * `[testbed_dir]`: **(Optional)** Temporary directory for cloning repos. Defaults to `./testbed`. 34 | 35 | The final, merged output will be saved in the `<output_dir>`. 36 | 37 | 38 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_8.txt: -------------------------------------------------------------------------------- 1 | python-pillow__Pillow-6582 2 | python-pillow__Pillow-6517 3 | python-pillow__Pillow-6500 4 | python-pillow__Pillow-6481 5 | python-pillow__Pillow-6431 6 | python-pillow__Pillow-6381 7 | python-pillow__Pillow-6265 8 | python-pillow__Pillow-6234 9 | python-pillow__Pillow-6242 10 | python-pillow__Pillow-6188 11 | python-pillow__Pillow-6128 12 | python-pillow__Pillow-6124 13 | python-pillow__Pillow-6101 14 | python-pillow__Pillow-6097 15 | python-pillow__Pillow-6086 16 | python-pillow__Pillow-6054 17 | python-pillow__Pillow-5891 18 | python-pillow__Pillow-5845 19 | python-pillow__Pillow-5839 20 | python-pillow__Pillow-5696 21 | python-pillow__Pillow-5756 22 | python-pillow__Pillow-5647 23 | python-pillow__Pillow-5609 24 | python-pillow__Pillow-5572 25 | python-pillow__Pillow-5557 26 | python-pillow__Pillow-5554 27 | python-pillow__Pillow-5549 28 | python-pillow__Pillow-5437 29 | python-pillow__Pillow-5425 30 | python-pillow__Pillow-5417 31 | python-pillow__Pillow-5330 32 | python-pillow__Pillow-5313 33 | python-pillow__Pillow-5208 34 | python-pillow__Pillow-5139 35 | python-pillow__Pillow-5125 36 | python-pillow__Pillow-4966 37 | python-pillow__Pillow-4677 38 | python-pillow__Pillow-4749 39 | python-pillow__Pillow-4741 40 | python-pillow__Pillow-4664 41 | python-pillow__Pillow-4605 42 | python-pillow__Pillow-4474 43 | python-pillow__Pillow-4471 44 | python-pillow__Pillow-4283 45 | python-pillow__Pillow-4240 46 | python-pillow__Pillow-4063 47 | python-pillow__Pillow-4003 48 | python-pillow__Pillow-4147 49 | python-pillow__Pillow-3897 50 | python-pillow__Pillow-3859 51 | python-pillow__Pillow-3825 52 | python-pillow__Pillow-3778 53 | python-pillow__Pillow-3673 54 | python-pillow__Pillow-3625 55 | python-pillow__Pillow-3588 56 | python-pillow__Pillow-3532 57 | python-pillow__Pillow-3558 58 | python-pillow__Pillow-3513 59 | python-pillow__Pillow-3479 60 | python-pillow__Pillow-3364 61 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_6.txt: -------------------------------------------------------------------------------- 1 | checkstyle__checkstyle-14553 2 | checkstyle__checkstyle-14497 3 | checkstyle__checkstyle-13667 4 | checkstyle__checkstyle-13357 5 | checkstyle__checkstyle-13320 6 | checkstyle__checkstyle-12487 7 | checkstyle__checkstyle-12831 8 | checkstyle__checkstyle-12516 9 | checkstyle__checkstyle-12444 10 | checkstyle__checkstyle-12105 11 | checkstyle__checkstyle-11972 12 | checkstyle__checkstyle-11517 13 | checkstyle__checkstyle-11601 14 | checkstyle__checkstyle-11383 15 | checkstyle__checkstyle-11264 16 | checkstyle__checkstyle-11245 17 | checkstyle__checkstyle-11482 18 | checkstyle__checkstyle-11173 19 | checkstyle__checkstyle-10930 20 | checkstyle__checkstyle-10958 21 | checkstyle__checkstyle-10904 22 | checkstyle__checkstyle-10922 23 | checkstyle__checkstyle-10857 24 | checkstyle__checkstyle-10762 25 | checkstyle__checkstyle-10280 26 | checkstyle__checkstyle-10216 27 | checkstyle__checkstyle-9942 28 | checkstyle__checkstyle-9759 29 | 
checkstyle__checkstyle-9728 30 | checkstyle__checkstyle-9539 31 | checkstyle__checkstyle-9744 32 | checkstyle__checkstyle-9370 33 | checkstyle__checkstyle-9209 34 | checkstyle__checkstyle-9261 35 | checkstyle__checkstyle-9018 36 | checkstyle__checkstyle-8984 37 | checkstyle__checkstyle-8913 38 | checkstyle__checkstyle-8907 39 | checkstyle__checkstyle-8720 40 | checkstyle__checkstyle-8420 41 | checkstyle__checkstyle-8273 42 | checkstyle__checkstyle-8103 43 | checkstyle__checkstyle-8127 44 | checkstyle__checkstyle-8070 45 | checkstyle__checkstyle-8008 46 | checkstyle__checkstyle-7899 47 | checkstyle__checkstyle-7894 48 | checkstyle__checkstyle-7853 49 | checkstyle__checkstyle-7851 50 | checkstyle__checkstyle-7798 51 | checkstyle__checkstyle-7193 52 | checkstyle__checkstyle-7172 53 | checkstyle__checkstyle-6882 54 | checkstyle__checkstyle-6567 55 | checkstyle__checkstyle-6515 56 | checkstyle__checkstyle-4463 57 | checkstyle__checkstyle-3366 58 | checkstyle__checkstyle-1485 59 | checkstyle__checkstyle-1399 60 | iamkun__dayjs-2369 61 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_3.txt: -------------------------------------------------------------------------------- 1 | apollographql__apollo-client-1664 2 | apollographql__apollo-client-1540 3 | apollographql__apollo-client-1661 4 | apollographql__apollo-client-1492 5 | apollographql__apollo-client-1270 6 | apollographql__apollo-client-1169 7 | apollographql__apollo-client-1095 8 | apollographql__apollo-client-1069 9 | apollographql__apollo-client-1054 10 | apollographql__apollo-client-683 11 | apollographql__apollo-client-581 12 | apollographql__apollo-client-493 13 | apollographql__apollo-client-465 14 | apollographql__apollo-client-445 15 | apollographql__apollo-client-313 16 | apollographql__apollo-client-201 17 | apollographql__apollo-client-200 18 | apollographql__apollo-client-140 19 | apollographql__apollo-client-111 20 | apollographql__apollo-client-133 21 | eclipse-vertx__vert.x-5347 22 | eclipse-vertx__vert.x-5273 23 | eclipse-vertx__vert.x-5137 24 | eclipse-vertx__vert.x-4904 25 | eclipse-vertx__vert.x-5346 26 | eclipse-vertx__vert.x-4616 27 | eclipse-vertx__vert.x-4597 28 | eclipse-vertx__vert.x-4485 29 | eclipse-vertx__vert.x-4423 30 | eclipse-vertx__vert.x-4413 31 | eclipse-vertx__vert.x-4377 32 | eclipse-vertx__vert.x-4311 33 | eclipse-vertx__vert.x-4307 34 | eclipse-vertx__vert.x-4225 35 | eclipse-vertx__vert.x-4191 36 | eclipse-vertx__vert.x-4172 37 | eclipse-vertx__vert.x-4164 38 | eclipse-vertx__vert.x-4160 39 | eclipse-vertx__vert.x-4134 40 | eclipse-vertx__vert.x-4080 41 | eclipse-vertx__vert.x-4125 42 | eclipse-vertx__vert.x-4053 43 | eclipse-vertx__vert.x-4037 44 | eclipse-vertx__vert.x-3946 45 | eclipse-vertx__vert.x-3913 46 | eclipse-vertx__vert.x-3853 47 | eclipse-vertx__vert.x-3800 48 | eclipse-vertx__vert.x-3764 49 | eclipse-vertx__vert.x-3754 50 | eclipse-vertx__vert.x-3718 51 | eclipse-vertx__vert.x-3663 52 | eclipse-vertx__vert.x-3657 53 | eclipse-vertx__vert.x-3604 54 | eclipse-vertx__vert.x-3607 55 | eclipse-vertx__vert.x-3559 56 | eclipse-vertx__vert.x-3428 57 | eclipse-vertx__vert.x-3418 58 | eclipse-vertx__vert.x-3384 59 | eclipse-vertx__vert.x-3247 60 | eclipse-vertx__vert.x-3197 61 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_4.txt: -------------------------------------------------------------------------------- 1 | eclipse-vertx__vert.x-3101 
2 | eclipse-vertx__vert.x-3016 3 | eclipse-vertx__vert.x-2929 4 | eclipse-vertx__vert.x-2883 5 | eclipse-vertx__vert.x-2726 6 | eclipse-vertx__vert.x-2723 7 | eclipse-vertx__vert.x-2724 8 | eclipse-vertx__vert.x-2631 9 | eclipse-vertx__vert.x-2458 10 | eclipse-vertx__vert.x-2392 11 | eclipse-vertx__vert.x-2366 12 | eclipse-vertx__vert.x-2309 13 | eclipse-vertx__vert.x-2354 14 | eclipse-vertx__vert.x-2209 15 | eclipse-vertx__vert.x-2108 16 | eclipse-vertx__vert.x-2083 17 | eclipse-vertx__vert.x-2074 18 | eclipse-vertx__vert.x-2073 19 | eclipse-vertx__vert.x-2064 20 | eclipse-vertx__vert.x-2017 21 | eclipse-vertx__vert.x-1907 22 | eclipse-vertx__vert.x-1799 23 | eclipse-vertx__vert.x-1770 24 | eclipse-vertx__vert.x-1615 25 | eclipse-vertx__vert.x-1604 26 | eclipse-vertx__vert.x-1565 27 | eclipse-vertx__vert.x-1476 28 | eclipse-vertx__vert.x-1366 29 | eclipse-vertx__vert.x-1300 30 | eclipse-vertx__vert.x-1287 31 | tailwindlabs__tailwindcss-17647 32 | tailwindlabs__tailwindcss-17754 33 | tailwindlabs__tailwindcss-17301 34 | tailwindlabs__tailwindcss-16800 35 | tailwindlabs__tailwindcss-16631 36 | tailwindlabs__tailwindcss-16078 37 | tailwindlabs__tailwindcss-16103 38 | tailwindlabs__tailwindcss-16069 39 | tailwindlabs__tailwindcss-15576 40 | tailwindlabs__tailwindcss-15318 41 | tailwindlabs__tailwindcss-15183 42 | tailwindlabs__tailwindcss-15003 43 | tailwindlabs__tailwindcss-14962 44 | tailwindlabs__tailwindcss-14981 45 | tailwindlabs__tailwindcss-14993 46 | tailwindlabs__tailwindcss-14744 47 | tailwindlabs__tailwindcss-14747 48 | tailwindlabs__tailwindcss-14877 49 | tailwindlabs__tailwindcss-14269 50 | tailwindlabs__tailwindcss-13949 51 | tailwindlabs__tailwindcss-13770 52 | tailwindlabs__tailwindcss-13379 53 | tailwindlabs__tailwindcss-11470 54 | tailwindlabs__tailwindcss-12404 55 | tailwindlabs__tailwindcss-11157 56 | tailwindlabs__tailwindcss-12113 57 | tailwindlabs__tailwindcss-11002 58 | tailwindlabs__tailwindcss-10288 59 | tailwindlabs__tailwindcss-10074 60 | tailwindlabs__tailwindcss-10601 61 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_5.txt: -------------------------------------------------------------------------------- 1 | tailwindlabs__tailwindcss-9704 2 | tailwindlabs__tailwindcss-9405 3 | tailwindlabs__tailwindcss-9319 4 | tailwindlabs__tailwindcss-9208 5 | tailwindlabs__tailwindcss-8773 6 | tailwindlabs__tailwindcss-8687 7 | tailwindlabs__tailwindcss-8622 8 | tailwindlabs__tailwindcss-8448 9 | tailwindlabs__tailwindcss-8125 10 | tailwindlabs__tailwindcss-7789 11 | tailwindlabs__tailwindcss-8091 12 | tailwindlabs__tailwindcss-7565 13 | tailwindlabs__tailwindcss-7291 14 | tailwindlabs__tailwindcss-7163 15 | tailwindlabs__tailwindcss-6519 16 | tailwindlabs__tailwindcss-6469 17 | tailwindlabs__tailwindcss-5470 18 | tailwindlabs__tailwindcss-5245 19 | tailwindlabs__tailwindcss-4852 20 | tailwindlabs__tailwindcss-4471 21 | tailwindlabs__tailwindcss-4263 22 | tailwindlabs__tailwindcss-4214 23 | tailwindlabs__tailwindcss-2951 24 | tailwindlabs__tailwindcss-2331 25 | tailwindlabs__tailwindcss-2322 26 | tailwindlabs__tailwindcss-2271 27 | tailwindlabs__tailwindcss-2211 28 | tailwindlabs__tailwindcss-2108 29 | tailwindlabs__tailwindcss-2075 30 | tailwindlabs__tailwindcss-1083 31 | tailwindlabs__tailwindcss-1680 32 | tailwindlabs__tailwindcss-1094 33 | tailwindlabs__tailwindcss-1799 34 | tailwindlabs__tailwindcss-992 35 | tailwindlabs__tailwindcss-847 36 | tailwindlabs__tailwindcss-681 37 | 
tailwindlabs__tailwindcss-516 38 | tailwindlabs__tailwindcss-497 39 | tailwindlabs__tailwindcss-418 40 | tailwindlabs__tailwindcss-255 41 | tailwindlabs__tailwindcss-77 42 | tailwindlabs__tailwindcss-82 43 | checkstyle__checkstyle-16515 44 | checkstyle__checkstyle-16605 45 | checkstyle__checkstyle-16418 46 | checkstyle__checkstyle-15969 47 | checkstyle__checkstyle-15822 48 | checkstyle__checkstyle-15686 49 | checkstyle__checkstyle-15681 50 | checkstyle__checkstyle-15430 51 | checkstyle__checkstyle-15334 52 | checkstyle__checkstyle-15337 53 | checkstyle__checkstyle-15358 54 | checkstyle__checkstyle-15212 55 | checkstyle__checkstyle-15199 56 | checkstyle__checkstyle-15127 57 | checkstyle__checkstyle-14983 58 | checkstyle__checkstyle-14882 59 | checkstyle__checkstyle-14804 60 | checkstyle__checkstyle-14623 61 | -------------------------------------------------------------------------------- /app/agents/train_env_gen_agent/tools/execute_bash.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Description: Execute a bash command in the terminal, with Python version compatibility. 5 | 6 | Parameters: 7 | command (string, required): The bash command to execute, passed as a single positional argument. For example: 'python my_script.py'. 8 | """ 9 | 10 | import argparse 11 | import subprocess 12 | import sys 13 | 14 | BLOCKED_BASH_COMMANDS = ["git", "ipython", "jupyter", "nohup"] 15 | 16 | 17 | def run_command(cmd): 18 | try: 19 | # Try to use the new parameters (Python 3.7+) 20 | return subprocess.run(cmd, shell=True, capture_output=True, text=True) 21 | except TypeError: 22 | # Fallback for Python 3.5 and 3.6: 23 | return subprocess.run( 24 | cmd, 25 | shell=True, 26 | stdout=subprocess.PIPE, 27 | stderr=subprocess.PIPE, 28 | universal_newlines=True, 29 | ) 30 | 31 | 32 | def main(): 33 | parser = argparse.ArgumentParser(description="Execute a bash command.") 34 | parser.add_argument( 35 | "command", 36 | type=str, 37 | help="The command (and optional arguments) to execute. For example: 'python my_script.py'", 38 | ) 39 | args = parser.parse_args() 40 | 41 | # Check if the command is blocked 42 | first_token = args.command.strip().split()[0] 43 | if first_token in BLOCKED_BASH_COMMANDS: 44 | print( 45 | f"Bash command '{first_token}' is not allowed. " 46 | "Please use a different command or tool."
47 | ) 48 | sys.exit(1) 49 | 50 | result = run_command(args.command) 51 | 52 | if result.returncode != 0: 53 | print(f"Error executing command:\n") 54 | print("[STDOUT]\n") 55 | print(result.stdout.strip(), "\n") 56 | print("[STDERR]\n") 57 | print(result.stderr.strip()) 58 | sys.exit(result.returncode) 59 | 60 | print("[STDOUT]\n") 61 | print(result.stdout.strip(), "\n") 62 | print("[STDERR]\n") 63 | print(result.stderr.strip()) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_2.txt: -------------------------------------------------------------------------------- 1 | nodejs__undici-2992 2 | nodejs__undici-2939 3 | apollographql__apollo-client-12450 4 | apollographql__apollo-client-12451 5 | apollographql__apollo-client-12497 6 | apollographql__apollo-client-12533 7 | apollographql__apollo-client-12379 8 | apollographql__apollo-client-12300 9 | apollographql__apollo-client-12224 10 | apollographql__apollo-client-12254 11 | apollographql__apollo-client-12121 12 | apollographql__apollo-client-12367 13 | apollographql__apollo-client-12052 14 | apollographql__apollo-client-11944 15 | apollographql__apollo-client-11921 16 | apollographql__apollo-client-11923 17 | apollographql__apollo-client-11799 18 | apollographql__apollo-client-11638 19 | apollographql__apollo-client-11403 20 | apollographql__apollo-client-11200 21 | apollographql__apollo-client-11180 22 | apollographql__apollo-client-11078 23 | apollographql__apollo-client-10853 24 | apollographql__apollo-client-10937 25 | apollographql__apollo-client-10809 26 | apollographql__apollo-client-10766 27 | apollographql__apollo-client-10450 28 | apollographql__apollo-client-10368 29 | apollographql__apollo-client-10499 30 | apollographql__apollo-client-10340 31 | apollographql__apollo-client-10143 32 | apollographql__apollo-client-10134 33 | apollographql__apollo-client-9808 34 | apollographql__apollo-client-9369 35 | apollographql__apollo-client-9328 36 | apollographql__apollo-client-9222 37 | apollographql__apollo-client-8718 38 | apollographql__apollo-client-8574 39 | apollographql__apollo-client-8438 40 | apollographql__apollo-client-8394 41 | apollographql__apollo-client-8372 42 | apollographql__apollo-client-7581 43 | apollographql__apollo-client-7657 44 | apollographql__apollo-client-7146 45 | apollographql__apollo-client-7075 46 | apollographql__apollo-client-7055 47 | apollographql__apollo-client-6587 48 | apollographql__apollo-client-6710 49 | apollographql__apollo-client-6589 50 | apollographql__apollo-client-6691 51 | apollographql__apollo-client-6448 52 | apollographql__apollo-client-6353 53 | apollographql__apollo-client-5116 54 | apollographql__apollo-client-4765 55 | apollographql__apollo-client-3956 56 | apollographql__apollo-client-3580 57 | apollographql__apollo-client-2710 58 | apollographql__apollo-client-2362 59 | apollographql__apollo-client-2345 60 | apollographql__apollo-client-1801 61 | -------------------------------------------------------------------------------- /data_collection/collect/get_top_repos.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import argparse 4 | import os 5 | import sys 6 | 7 | def fetch_top_repos(language: str, output_path: str, top_n: int, token: str): 8 | headers = { 9 | "Accept": "application/vnd.github+json", 10 | "Authorization": f"token {token}" 11 | } 12 | 13 | url = 
"https://api.github.com/search/repositories" 14 | params = { 15 | "q": f"language:{language}", 16 | "sort": "stars", 17 | "order": "desc", 18 | "per_page": 100, 19 | "page": 1 20 | } 21 | 22 | print(f"📡 Fetching top {top_n} repositories for language: {language}") 23 | repos = [] 24 | 25 | while len(repos) < top_n: 26 | response = requests.get(url, headers=headers, params=params) 27 | if response.status_code != 200: 28 | print(f"❌ Error: {response.status_code} - {response.json().get('message')}") 29 | break 30 | 31 | data = response.json().get("items", []) 32 | if not data: 33 | break 34 | 35 | for repo in data: 36 | repos.append({ 37 | "name": repo["full_name"], 38 | "stars": repo["stargazers_count"], 39 | "url": repo["html_url"], 40 | "description": repo["description"], 41 | "owner": repo["owner"]["login"], 42 | "language": repo["language"] 43 | }) 44 | 45 | params["page"] += 1 46 | 47 | os.makedirs(output_path, exist_ok=True) 48 | output_file = os.path.join(output_path, f"{language.lower()}_top_{top_n}_repos.json") 49 | print(f"💾 Saving {min(top_n, len(repos))} repos to {output_file}") 50 | with open(output_file, mode='w', encoding='utf-8') as f: 51 | json.dump(repos[:top_n], f, indent=2, ensure_ascii=False) 52 | 53 | print("✅ Done!") 54 | 55 | def main(): 56 | parser = argparse.ArgumentParser(description="Fetch top GitHub repos by language") 57 | parser.add_argument("--language", type=str, required=True, help="Programming language (e.g., Python)") 58 | parser.add_argument("--output_path", type=str, required=True, help="Directory to save the result JSON") 59 | parser.add_argument("--top_n", type=int, default=500, help="Number of top repositories to fetch") 60 | args = parser.parse_args() 61 | 62 | token = os.environ.get("GITHUB_TOKEN") 63 | if not token: 64 | print("❌ GitHub token not found. 
Please set the environment variable `GITHUB_TOKEN`.") 65 | sys.exit(1) 66 | 67 | fetch_top_repos( 68 | language=args.language, 69 | output_path=args.output_path, 70 | top_n=args.top_n, 71 | token=token 72 | ) 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohappyeyeballs==2.4.3 2 | aiohttp==3.10.2 3 | aiosignal==1.3.1 4 | annotated-types==0.6.0 5 | antlr4-tools==0.2.1 6 | anyio==4.2.0 7 | astroid==3.2.3 8 | asttokens==2.4.1 9 | attrs==23.2.0 10 | beautifulsoup4==4.12.3 11 | Brotli==1.1.0 12 | certifi==2024.7.4 13 | cffi==1.17.1 14 | cfgv==3.4.0 15 | chain==1.0 16 | charset-normalizer==3.4.0 17 | click==8.1.7 18 | colorama==0.4.6 19 | coverage==7.5.3 20 | Cython==3.0.8 21 | datasets==3.5.0 22 | dill==0.3.8 23 | discord==2.3.2 24 | discord.py==2.3.2 25 | distlib==0.3.8 26 | distro==1.9.0 27 | docker==7.1.0 28 | docstring-parser==0.15 29 | emojis==0.7.0 30 | executing==2.1.0 31 | filelock==3.13.1 32 | frozenlist==1.4.1 33 | fsspec==2024.6.1 34 | ghapi==1.0.5 35 | h11==0.14.0 36 | h2==4.1.0 37 | hpack==4.0.0 38 | httpcore==1.0.5 39 | httpx==0.27.0 40 | huggingface-hub==0.30.2 41 | hyperframe==6.0.1 42 | icecream==2.1.3 43 | identify==2.5.33 44 | idna==3.7 45 | importlib-metadata==7.0.1 46 | install-jdk==1.1.0 47 | isort==5.13.2 48 | javalang==0.13.0 49 | Jinja2==3.1.4 50 | jiter==0.8.0 51 | jsonschema==4.22.0 52 | jsonschema-specifications==2023.12.1 53 | libclang==18.1.1 54 | linkify-it-py==2.0.2 55 | litellm==1.44.8 56 | loguru==0.7.2 57 | lxml==5.1.0 58 | markdown-it-py 59 | MarkupSafe==2.1.5 60 | mccabe==0.7.0 61 | mdit-py-plugins==0.4.0 62 | mdurl 63 | more-itertools==10.2.0 64 | mpmath==1.3.0 65 | multidict==6.0.4 66 | multiprocess==0.70.16 67 | natsort==8.4.0 68 | networkx==3.2.1 69 | nodeenv==1.8.0 70 | numpy==1.26.4 71 | # nvidia-cublas-cu12==12.1.3.1 72 | # nvidia-cuda-cupti-cu12==12.1.105 73 | # nvidia-cuda-nvrtc-cu12==12.1.105 74 | # nvidia-cuda-runtime-cu12==12.1.105 75 | # nvidia-cudnn-cu12==8.9.2.26 76 | # nvidia-cufft-cu12==11.0.2.54 77 | # nvidia-curand-cu12==10.3.2.106 78 | # nvidia-cusolver-cu12==11.4.5.107 79 | # nvidia-cusparse-cu12==12.1.0.106 80 | # nvidia-nccl-cu12==2.19.3 81 | # nvidia-nvjitlink-cu12==12.4.99 82 | # nvidia-nvtx-cu12==12.1.105 83 | ollama==0.3.3 84 | openai==1.50.2 85 | opt-einsum==3.3.0 86 | packaging==23.2 87 | pandas==2.2.3 88 | platformdirs==4.1.0 89 | polars==0.20.31 90 | pre-commit==3.6.0 91 | pyarrow==19.0.1 92 | pycparser==2.22 93 | pydantic==2.5.3 94 | pydantic_core==2.14.6 95 | Pygments==2.17.2 96 | pylint==3.2.3 97 | pyro-api==0.1.2 98 | pyro-ppl==1.9.0 99 | PySocks==1.7.1 100 | python-dateutil==2.9.0.post0 101 | python-dotenv==1.0.0 102 | pytz==2025.2 103 | PyYAML==6.0.1 104 | referencing==0.32.1 105 | regex==2024.5.15 106 | requests==2.32.3 107 | rich==13.7.1 108 | rpds-py==0.16.2 109 | semver==3.0.2 110 | setuptools==68.2.2 111 | six==1.16.0 112 | slack_sdk==3.26.2 113 | sniffio==1.3.0 114 | soupsieve==2.5 115 | sympy==1.13.0 116 | tenacity==8.2.3 117 | termcolor==2.4.0 118 | textual==0.52.1 119 | tiktoken==0.7.0 120 | timeout-decorator==0.5.0 121 | tokenizers==0.19.1 122 | tomlkit==0.13.0 123 | torch==2.2.1 124 | tqdm==4.66.4 125 | tree-sitter==0.21.3 126 | tree-sitter-c==0.21.4 127 | tree-sitter-cpp==0.22.2 128 | tree-sitter-java==0.21.0 129 | tree-sitter-languages==1.10.2 130 | triton==2.2.0 131 | types-jsonschema==4.21.0.20240311 132 | 
typing_extensions 133 | tzdata==2025.2 134 | uc-micro-py==1.0.2 135 | unidiff==0.7.5 136 | unittest-xml-reporting==3.2.0 137 | urllib3==2.2.3 138 | virtualenv==20.25.0 139 | wheel==0.41.2 140 | xxhash==3.5.0 141 | yarl==1.9.4 142 | zipp==3.19.2 143 | zstandard==0.22.0 144 | -------------------------------------------------------------------------------- /data_collection/collect/print_pulls.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Given the `<owner/repo>` name of a GitHub repo, this script writes the raw information for all the repo's PRs to a single `.jsonl` file.""" 4 | 5 | import argparse 6 | import json 7 | import logging 8 | import os 9 | from typing import Optional 10 | from tqdm import tqdm 11 | from fastcore.xtras import obj2dict 12 | from utils import Repo 13 | 14 | logging.basicConfig( 15 | level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 16 | ) 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | def log_all_pulls(repo: Repo, output: str, mode: str, pr_data_list=None): 21 | """ 22 | Iterate over all pull requests in a repository and log them to a file 23 | 24 | Args: 25 | repo (Repo): repository object 26 | output (str): output file name 27 | """ 28 | # Create output directory if it doesn't exist 29 | output_dir = os.path.dirname(output) 30 | if output_dir: 31 | os.makedirs(output_dir, exist_ok=True) 32 | 33 | if mode == 'swebench': 34 | with open(output, "w") as output_file: 35 | for pull in repo.get_all_pulls(): 36 | setattr(pull, "resolved_issues", repo.extract_resolved_issues(pull)) 37 | print(json.dumps(obj2dict(pull)), end="\n", flush=True, file=output_file) 38 | else: 39 | pulls = repo.get_all_pulls_with_official_github_api() 40 | print(f'total prs number: {len(pulls)}') 41 | with open(output, 'a') as f: 42 | for pull in tqdm(pulls): 43 | if pr_data_list and pull['number'] in pr_data_list: 44 | continue 45 | else: 46 | issues = repo.extract_resolved_issues_with_official_github_api(pull) 47 | pull["resolved_issues"] = issues 48 | json.dump(pull, f) 49 | f.write('\n') # write a newline to separate the JSON objects 50 | 51 | 52 | 53 | 54 | def main(repo_name: str, output: str, token: Optional[str] = None, mode: Optional[str] = 'swebench'): 55 | """ 56 | Logic for logging all pull requests in a repository 57 | 58 | Args: 59 | repo_name (str): name of the repository 60 | output (str): output file name 61 | token (str, optional): GitHub token 62 | """ 63 | if token is None: 64 | token = os.environ["GITHUB_TOKEN"] 65 | try: 66 | owner, repo = repo_name.split("/") 67 | except: 68 | print(repo_name) 69 | logger.info(repo_name) 70 | repo = Repo(owner, repo, token=token) 71 | if os.path.exists(output): 72 | pr_data_list = [] 73 | with open(output, 'r', encoding='utf-8') as f: 74 | for line in f: 75 | pr_data_list.append(json.loads(line)['number']) 76 | log_all_pulls(repo, output, mode, pr_data_list) 77 | else: 78 | log_all_pulls(repo, output, mode) 79 | 80 | 81 | if __name__ == "__main__": 82 | parser = argparse.ArgumentParser(description=__doc__) 83 | parser.add_argument("repo_name", type=str, help="Name of the repository") 84 | parser.add_argument("output", type=str, help="Output file name") 85 | parser.add_argument("--token", type=str, help="GitHub token") 86 | parser.add_argument("--mode", type=str, default='omnigirl', help="Collection mode: 'swebench' or 'omnigirl' (default)") 87 | args = parser.parse_args() 88 | main(**vars(args)) 89 | -------------------------------------------------------------------------------- /scripts/compute_cost.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def count_finished_status_and_cost(directory): 6 | # status.json statistics 7 | finished_count = 0 8 | total_status_files = 0 9 | 10 | # cost.json statistics 11 | total_tokens_sum = 0.0 12 | total_input_tokens_sum = 0.0 13 | total_output_tokens_sum = 0.0 14 | total_elapsed_seconds = 0.0 15 | total_cost_files = 0 16 | 17 | # meta.json statistics (if needed) 18 | total_meta_files = 0 19 | 20 | for root, _, files in os.walk(directory): 21 | for file in files: 22 | path = os.path.join(root, file) 23 | 24 | if file == "status.json": 25 | total_status_files += 1 26 | try: 27 | data = json.load(open(path, encoding="utf-8")) 28 | if data.get("is_finish") is True: 29 | finished_count += 1 30 | except Exception as e: 31 | print(f"Error reading {path}: {e}") 32 | 33 | elif file == "cost.json": 34 | total_cost_files += 1 35 | try: 36 | data = json.load(open(path, encoding="utf-8")) 37 | # total_tokens 38 | total_tokens_sum += float(data.get("total_tokens", 0)) 39 | # total_input_tokens 40 | total_input_tokens_sum += float(data.get("total_input_tokens", 0)) 41 | # total_output_tokens 42 | total_output_tokens_sum += float(data.get("total_output_tokens", 0)) 43 | # elapsed_seconds 44 | total_elapsed_seconds += float(data.get("elapsed_seconds", 0)) 45 | except Exception as e: 46 | print(f"Error reading {path}: {e}") 47 | 48 | elif file == "meta.json": 49 | total_meta_files += 1 50 | 51 | # Print the results 52 | print(f"Total 'status.json' files found: {total_status_files}") 53 | print(f"Files with 'is_finish = true': {finished_count}") 54 | print(f"Total 'meta.json' files found: {total_meta_files}") 55 | print(f"Total 'cost.json' files found: {total_cost_files}") 56 | 57 | if total_cost_files: 58 | avg_total = total_tokens_sum / total_cost_files 59 | avg_input = total_input_tokens_sum / total_cost_files 60 | avg_output = total_output_tokens_sum / total_cost_files 61 | avg_elapsed = total_elapsed_seconds / total_cost_files 62 | 63 | print(f"Sum of 'total_tokens': {total_tokens_sum}") 64 | print(f"Average 'total_tokens': {avg_total:.2f}") 65 | print(f"Sum of 'total_input_tokens': {total_input_tokens_sum}") 66 | print(f"Average 'total_input_tokens': {avg_input:.2f}") 67 | print(f"Sum of 'total_output_tokens': {total_output_tokens_sum}") 68 | print(f"Average 'total_output_tokens': {avg_output:.2f}") 69 | print(f"Sum of 'elapsed_seconds': {total_elapsed_seconds}") 70 | print(f"Average 'elapsed_seconds': {avg_elapsed:.2f}") 71 | else: 72 | print("No 'cost.json' files found, cannot compute averages.") 73 | 74 | if __name__ == "__main__": 75 | parser = argparse.ArgumentParser( 76 | description="Summarize status.json, cost.json (total_tokens/input/output tokens and elapsed_seconds), and meta.json files under a directory." 77 | ) 78 | parser.add_argument("directory", help="Path of the target directory") 79 | args = parser.parse_args() 80 | count_finished_status_and_cost(args.directory) 81 | -------------------------------------------------------------------------------- /app/model/register.py: -------------------------------------------------------------------------------- 1 | from app.model import ( 2 | azure, 3 | bedrock, 4 | claude, 5 | common, 6 | gemini, 7 | gpt, 8 | gptlitellm, 9 | groq, 10 | ollama, 11 | ) 12 | 13 | 14 | def register_all_models() -> None: 15 | """ 16 | Register all models. This is called in main. 
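Each call adds one model instance to the registry in app.model.common; the default selection is set at the end via common.SELECTED_MODEL. 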
17 | """ 18 | common.register_model(gpt.Gpt4o_20241120()) 19 | common.register_model(gpt.Gpt4o_20240806()) 20 | common.register_model(gpt.Gpt4o_20240513()) 21 | common.register_model(gpt.Gpt4o_mini_20240718()) 22 | common.register_model(gpt.Gpt4_Turbo20240409()) 23 | common.register_model(gpt.Gpt4_0125Preview()) 24 | common.register_model(gpt.Gpt4_1106Preview()) 25 | common.register_model(gpt.Gpt35_Turbo0125()) 26 | common.register_model(gpt.Gpt35_Turbo1106()) 27 | common.register_model(gpt.Gpt35_Turbo16k_0613()) 28 | common.register_model(gpt.Gpt35_Turbo0613()) 29 | common.register_model(gpt.Gpt4_0613()) 30 | common.register_model(gpt.Gpt_o1mini()) 31 | common.register_model(gpt.Qwen25_72B()) 32 | common.register_model(gpt.DeepSeekV25()) 33 | common.register_model(gpt.DeepSeekV3()) 34 | common.register_model(gpt.DeepSeek()) 35 | common.register_model(gpt.Gpt4_1()) 36 | common.register_model(gpt.Gpt4_1_mini()) 37 | common.register_model(gpt.Gpt5_mini()) 38 | common.register_model(gpt.Gemini_2_5_flash_preview()) 39 | common.register_model(gpt.Gemini_2_5_flash_lite_preview()) 40 | common.register_model(gpt.Kimi_k2()) 41 | common.register_model(gpt.Gpt4_1_nano()) 42 | common.register_model(gpt.Claude3_5Sonnet()) 43 | common.register_model(gpt.Claude3_7Sonnet()) 44 | common.register_model(claude.Claude3Opus()) 45 | common.register_model(claude.Claude3Sonnet()) 46 | common.register_model(claude.Claude3Haiku()) 47 | # common.register_model(claude.Claude3_5Sonnet()) 48 | 49 | common.register_model(bedrock.AnthropicClaude3Opus()) 50 | common.register_model(bedrock.AnthropicClaude3Sonnet()) 51 | common.register_model(bedrock.AnthropicClaude3Haiku()) 52 | 53 | common.register_model(ollama.Llama3_8B()) 54 | common.register_model(ollama.Llama3_70B()) 55 | 56 | common.register_model(groq.Llama3_8B()) 57 | common.register_model(groq.Llama3_70B()) 58 | common.register_model(groq.Mixtral_8x7B()) 59 | common.register_model(groq.Gemma_7B()) 60 | 61 | common.register_model(gptlitellm.Gpt4o_20240513LiteLLM()) 62 | common.register_model(gptlitellm.Gpt4_Turbo20240409LiteLLM()) 63 | common.register_model(gptlitellm.Gpt4_0125PreviewLiteLLM()) 64 | common.register_model(gptlitellm.Gpt4_1106PreviewLiteLLM()) 65 | common.register_model(gptlitellm.Gpt35_Turbo0125LiteLLM()) 66 | common.register_model(gptlitellm.Gpt35_Turbo1106LiteLLM()) 67 | common.register_model(gptlitellm.Gpt35_Turbo16k_0613LiteLLM()) 68 | common.register_model(gptlitellm.Gpt35_Turbo0613LiteLLM()) 69 | common.register_model(gptlitellm.Gpt4_0613LiteLLM()) 70 | 71 | 72 | common.register_model(azure.AzureGpt4()) 73 | common.register_model(azure.AzureGpt4o()) 74 | common.register_model(azure.AzureGpt35_Turbo()) 75 | common.register_model(azure.AzureGpt35_Turbo16k()) 76 | common.register_model(azure.AzureGpt_o1mini()) 77 | 78 | common.register_model(gemini.GeminiPro()) 79 | common.register_model(gemini.Gemini15Pro()) 80 | 81 | # register default model as selected 82 | common.SELECTED_MODEL = gpt.Gpt35_Turbo0125() 83 | -------------------------------------------------------------------------------- /data_collection/collect/README.md: -------------------------------------------------------------------------------- 1 | # Data Collection Process Overview 2 | 3 | This directory provides code to collect raw issue data using GitHub APIs and predefined patterns. This implementation currently supports collecting issues from repositories in Python, Java, JavaScript, and TypeScript. We welcome PRs to support more languages! 4 | 5 | 6 | 1. 
**Fetch Popular Repositories** 7 | 8 | - Use the `get_top_repos.py` script to find and save a list of the most popular (by stars) repositories for a given language. 9 | 10 | - **Note**: This script requires a GitHub Personal Access Token to be set as an environment variable. 11 | 12 | Example: 13 | ```bash 14 | export GITHUB_TOKEN=<your_token> # Set your token first 15 | python get_top_repos.py --language Python --output_path data/popular_repos --top_n 100 16 | ``` 17 | Where: 18 | - `--language`: The programming language to search for (e.g., 'Python', 'Java'). (Required) 19 | - `--output_path`: The directory where the output JSON file will be saved. (Required) 20 | - `--top_n`: The number of top-starred repositories to fetch (default: 500). 21 | - The output will be saved in the specified path, in a file named, for instance, `python_top_100_repos.json`. 22 | 23 | 2. **Raw PR Data Collection** 24 | 25 | - Use the `print_pulls.py` script to collect raw PR data from GitHub repositories. 26 | 27 | Example: 28 | ```bash 29 | export GITHUB_TOKEN=<your_token> # Set your token first 30 | python print_pulls.py python-attrs/attrs data/python-attrs/attrs/prs.jsonl 31 | ``` 32 | 33 | Where: 34 | - `<repo_name>`: GitHub repository name in "owner/repo" format (e.g., "octocat/Hello-World"). 35 | - `<output_path>`: Path for the output JSONL file (e.g., "data/prs.jsonl"). 36 | - `--token`: GitHub personal access token (defaults to the `GITHUB_TOKEN` environment variable). 37 | 38 | 3. **Raw Task Instance Construction** 39 | - Use the `build_dataset.py` script to process collected PR data and construct task instances. 40 | 41 | Example: 42 | ```bash 43 | export GITHUB_TOKEN=<your_token> # Set your token first 44 | python build_dataset.py data/python-attrs/attrs/prs.jsonl data/python-attrs/attrs/instances.jsonl --language python 45 | ``` 46 | 47 | Where: 48 | - `<pr_file>`: Path to the input PR JSONL file from the previous step. 49 | - `<output_path>`: Path for the output task instance JSONL file. 50 | - `--language`: The programming language of the repository. Accepts `python`, `java`, or `js`. Use `js` for both JavaScript and TypeScript repositories. 51 | - `--token`: Optional GitHub token (defaults to the `GITHUB_TOKEN` environment variable). 52 | 53 | 4. **Versioning** 54 | - Use the `get_version.py` script to assign version numbers to the raw instances. 55 | 56 | - **Note on Strategy**: The script works by checking out an instance's `base_commit` and parsing the output of the `git describe --tags` command (a minimal sketch of this strategy is shown at the end of this document). 57 | 58 | This method's success **depends entirely on a repository's tagging practices**. It may fail or produce inaccurate versions if release tags are inconsistent or not used. For more reliable results, please follow the complete [versioning documentation](../versioning). 59 | 60 | 61 | Example: 62 | ```bash 63 | python get_version.py --instance_path data/python-attrs/attrs/instances.jsonl --testbed github --max-workers 20 64 | ``` 65 | 66 | Where: 67 | - `--instance_path`: Path to the task instances file (required). 68 | - `--testbed`: A temporary working directory for cloning repositories. 69 | - `--max-workers`: The number of parallel processes to use (default: 10). 70 | - The results will be saved to a new file with a `_versions` suffix (e.g., `instances_versions.jsonl`). 
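For illustration, here is a minimal, hypothetical sketch of the tag-parsing strategy described in step 4. The function name and the `major.minor` truncation are assumptions for this example, not the actual `get_version.py` implementation:

```python
import re
import subprocess


def version_from_git_describe(repo_dir: str, base_commit: str) -> str | None:
    """Best-effort version lookup: check out the commit, then parse `git describe --tags`."""
    subprocess.run(["git", "checkout", "-q", base_commit], cwd=repo_dir, check=True)
    described = subprocess.run(
        ["git", "describe", "--tags"],
        cwd=repo_dir, capture_output=True, text=True, check=True,
    ).stdout.strip()  # e.g. "v23.1.0-12-gabc1234": nearest tag, commit distance, short hash
    match = re.match(r"v?(\d+\.\d+)", described)  # keep only major.minor
    return match.group(1) if match else None  # None when the tag does not look like a version
```

If `git describe` exits non-zero (for example, in a repository with no tags at all), `subprocess.run` raises `CalledProcessError`, which is exactly the failure mode the note above warns about: the approach is only as reliable as the repository's tagging habits.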
-------------------------------------------------------------------------------- /app/task.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import subprocess 5 | from abc import ABC, abstractmethod 6 | from dataclasses import dataclass 7 | from os.path import join as pjoin 8 | from tempfile import mkstemp 9 | import shutil 10 | import app.utils as apputils 11 | from app import globals, log 12 | from app import utils as app_utils 13 | 14 | from app.log import log_and_print 15 | from docker import DockerClient 16 | 17 | class Task(ABC): 18 | @property 19 | @abstractmethod 20 | def project_path(self) -> str: 21 | raise NotImplementedError("abstract method") 22 | 23 | @abstractmethod 24 | def get_issue_statement(self) -> str: 25 | raise NotImplementedError("abstract method") 26 | 27 | @abstractmethod 28 | def setup_project(self) -> None: 29 | """Set up the project before starting to resolve the task.""" 30 | raise NotImplementedError("abstract method") 31 | 32 | @abstractmethod 33 | def reset_project(self) -> None: 34 | """Reset project to initial state.""" 35 | raise NotImplementedError("abstract method") 36 | 37 | 38 | 39 | @dataclass(kw_only=True) 40 | class SweTask(Task): 41 | task_id: str 42 | problem_statement: str 43 | repo_path: str 44 | repo_cache_path: str 45 | commit: str 46 | # env_name: str 47 | repo_name: str 48 | # pre_install_cmds: list[str] 49 | # install_cmd: str 50 | # test_cmd: str 51 | patch: str 52 | test_patch: str 53 | # testcases_passing: list[str] 54 | # testcases_failing: list[str] 55 | language: str 56 | # image_urls: list[str] 57 | # reference_setup: dict 58 | version: str 59 | client: DockerClient 60 | task_info: dict 61 | @property 62 | def project_path(self) -> str: 63 | return self.repo_path 64 | 65 | 66 | @project_path.setter 67 | def project_path(self, value: str) -> None: 68 | self.repo_path = value 69 | 70 | def get_issue_statement(self) -> str: 71 | return self.problem_statement 72 | 73 | 74 | def setup_project(self) -> None: 75 | # get the correct version of the project and commit-specific pip install 76 | task = self 77 | with apputils.cd(task.project_path): 78 | apputils.repo_reset_and_clean_checkout(task.commit) 79 | 80 | 81 | # apply the test modifications to this task 82 | 83 | # commit the current changes, so that resetting later does not erase them 84 | with apputils.cd(task.project_path): 85 | apputils.repo_commit_current_changes() 86 | 87 | def reset_project(self) -> None: 88 | with apputils.cd(self.repo_path): 89 | apputils.repo_reset_and_clean_checkout(self.commit) 90 | 91 | def remove_project(self) -> None: 92 | """Remove the entire project repository.""" 93 | if os.path.exists(self.repo_path): 94 | shutil.rmtree(self.repo_path) 95 | log_and_print(f"Removed project repository at {self.repo_path}") 96 | 97 | 98 | 99 | 100 | 101 | @dataclass(kw_only=True) 102 | class PlainTask(Task): 103 | """ 104 | Tasks that only contain a codebase and an issue description (no test suite). 
105 | """ 106 | 107 | commit_hash: str 108 | local_path: str 109 | problem_statement: str 110 | 111 | @property 112 | def project_path(self) -> str: 113 | return self.local_path 114 | 115 | def setup_project(self) -> None: 116 | with apputils.cd(self.project_path): 117 | apputils.repo_reset_and_clean_checkout(self.commit_hash) 118 | 119 | def reset_project(self) -> None: 120 | with apputils.cd(self.project_path): 121 | apputils.repo_reset_and_clean_checkout(self.commit_hash) 122 | 123 | def get_issue_statement(self) -> str: 124 | return self.problem_statement 125 | 126 | 127 | -------------------------------------------------------------------------------- /scripts/judge_fail2pass.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | import argparse 5 | import multiprocessing 6 | from tqdm import tqdm 7 | from dotenv import load_dotenv 8 | from openai import OpenAI 9 | 10 | # --- Configuration --- 11 | load_dotenv() # Load environment variables from .env file 12 | 13 | PREV_FILE_NAME = "test_output_prev_apply.txt" 14 | AFTER_FILE_NAME = "test_output_after_apply.txt" 15 | EXIT_CODE_RE = re.compile(r"echo OMNIGRIL_EXIT_CODE=(\d+)") 16 | 17 | 18 | def extract_exit_code(content: str) -> int | None: 19 | """Extracts the exit code from the content; returns None if not found.""" 20 | m = EXIT_CODE_RE.search(content) 21 | return int(m.group(1)) if m else None 22 | 23 | 24 | def process_subdirectory(subdir): 25 | prev_path = os.path.join(subdir, PREV_FILE_NAME) 26 | after_path = os.path.join(subdir, AFTER_FILE_NAME) 27 | 28 | # missing outputs or unparsable -> error 29 | if not (os.path.isfile(prev_path) and os.path.isfile(after_path)): 30 | return "error" 31 | 32 | prev_content = open(prev_path, encoding="utf-8", errors="ignore").read() 33 | after_content = open(after_path, encoding="utf-8", errors="ignore").read() 34 | prev_exit = extract_exit_code(prev_content) 35 | after_exit = extract_exit_code(after_content) 36 | 37 | if prev_exit is None or after_exit is None: 38 | return "error" 39 | 40 | prev_fail = (prev_exit != 0) 41 | after_pass = (after_exit == 0) 42 | 43 | if prev_fail and after_pass: 44 | return "fail2pass" 45 | elif prev_fail and not after_pass: 46 | return "fail2fail" 47 | elif not prev_fail and after_pass: 48 | return "pass2pass" 49 | elif not prev_fail and not after_pass: 50 | return "pass2fail" 51 | else: 52 | return "error" 53 | 54 | 55 | def classify_and_write_json(src_folder: str, output_json: str, processes: int): 56 | # Collect subdirectories 57 | subs = [os.path.join(src_folder, d) 58 | for d in os.listdir(src_folder) 59 | if os.path.isdir(os.path.join(src_folder, d))] 60 | 61 | # Parallel processing 62 | with multiprocessing.Pool(processes) as pool: 63 | statuses = list(tqdm( 64 | pool.imap(process_subdirectory, subs), 65 | total=len(subs), desc="Classifying" 66 | )) 67 | 68 | # Build category mapping 69 | cats = {"fail2pass": [], "fail2fail": [], "pass2pass": [], "pass2fail": [], "error": []} 70 | for subdir, status in zip(subs, statuses): 71 | inst_id = os.path.basename(subdir) 72 | cats.setdefault(status, []).append(inst_id) 73 | 74 | # Print summary 75 | print("Classification summary:") 76 | for cat, ids in cats.items(): 77 | print(f" {cat}: {len(ids)}") 78 | 79 | # Write structured JSON 80 | summary = {"total": len(subs), "categories": cats} 81 | with open(output_json, 'w', encoding="utf-8") as f: 82 | json.dump(summary, f, indent=2) 83 | print(f"Summary JSON written to '{output_json}'") 
84 | 85 | 86 | def main(): 87 | parser = argparse.ArgumentParser( 88 | description="Classify subdirectories by test exit codes and output summary JSON.") 89 | parser.add_argument("target_folder", help="Folder containing subdirs to classify.") 90 | parser.add_argument("output_json", help="Path for summary JSON output.") 91 | parser.add_argument("--processes", type=int, default=20, help="Number of worker processes.") 92 | args = parser.parse_args() 93 | 94 | if not os.path.isdir(args.target_folder): 95 | parser.error(f"Folder not found: {args.target_folder}") 96 | if args.processes < 1: 97 | parser.error("--processes must be >= 1") 98 | 99 | classify_and_write_json(args.target_folder, args.output_json, args.processes) 100 | 101 | if __name__ == "__main__": 102 | multiprocessing.freeze_support() 103 | main() 104 | -------------------------------------------------------------------------------- /app/agents/train_env_gen_agent/prompt.py: -------------------------------------------------------------------------------- 1 | SYSTEM_PROMPT = """ 2 | You are tasked with adapting a Dockerfile and its evaluation script (eval script) so that they can run seamlessly in a coding agent evaluation environment. 3 | 4 | Context & Constraints 5 | 1. Container startup 6 | The coding agent will always start the container like this: 7 | self.container = self.client.containers.run( 8 | docker_image, 9 | ["/bin/bash", "-l"], 10 | name=ctr_name, 11 | detach=True, 12 | tty=True, 13 | stdin_open=True 14 | ) 15 | Ensure the image is compatible with this startup (login shell, interactive mode, etc.). 16 | 17 | 2. Command execution 18 | All commands are executed using: 19 | future = executor.submit( 20 | self.container.exec_run, 21 | cmd=["/bin/sh", "-c", command], 22 | workdir='/testbed', 23 | stdout=True, 24 | stderr=True 25 | ) 26 | Ensure that this execution pattern works (commands are run in /testbed by default). 27 | 28 | 3. Repository location 29 | - The target repository must be cloned directly into /testbed. 30 | - Do NOT create subdirectories like /testbed/mypy; it must be /testbed. 31 | 32 | 4. Workdir & virtual environment 33 | - The final working directory when the container starts must be /testbed. 34 | - If a virtual environment (e.g., conda or venv) is used, activate it automatically by adding the activation command to ~/.bash_profile so that it’s active when the agent attaches with /bin/bash -l. 35 | 36 | 5. Install coding agent tools 37 | - Pre-install the required tools: 38 | git clone https://github.com/gnohgnailoug/r2e_tools.git /root/r2e_tools 39 | pip install -e /root/r2e_tools 40 | - This ensures the agent can run search -h successfully. 41 | 42 | 6. Adjust eval script if needed 43 | - If you modify paths in the Dockerfile (e.g., moving the repository from /testbed/mypy to /testbed), also update the eval script accordingly so it still runs correctly. 44 | 45 | 7. No other changes 46 | - Do NOT change the environment setup or testing commands (e.g., Maven/pytest commands remain unchanged). 47 | - Only make changes required for compatibility with the coding agent. 48 | 49 | Deliverables 50 | - Rewritten Dockerfile: Fully adapted to the above constraints. 51 | - Updated eval script: Ensure it works with the new paths and environment settings. 52 | 53 | Evaluation Criteria 54 | Your output will be tested as follows: 55 | 1. Container builds successfully. 56 | 2. Tool works: Running run_command("search -h") inside the container should succeed. 57 | 3. 
Eval script works: After copying it to /run_tests.sh, running run_command("bash /run_tests.sh") should successfully execute the tests. 58 | 59 | Task: 60 | Rewrite the Dockerfile and eval script accordingly. Do not change any build/test logic except what’s needed for path and environment adaptation. Ensure full compliance with the above constraints. 61 | """ 62 | 63 | USER_PROMPT=""" 64 | You are given an original Dockerfile and an evaluation script. 65 | 66 | Your task is to modify them only as needed to make them fully compatible with the coding agent environment, based on the system constraints provided. 67 | 68 | Important: 69 | - Do not change any core build or test logic in the Dockerfile or evaluation script. 70 | - Only make minimal adjustments necessary for compatibility (e.g., paths, working directory, virtual environment activation, tool installation). 71 | - If a file does not require modification, return "" instead of repeating its content. 72 | - Always provide the full content for any modified file. 73 | 74 | Original Dockerfile: 75 | ```dockerfile 76 | {{ORIGINAL_DOCKERFILE}} 77 | ``` 78 | 79 | Original Evaluation Script: 80 | ```bash 81 | {{ORIGINAL_EVAL_SCRIPT}} 82 | ``` 83 | 84 | Your task: Rewrite these files as needed to meet the system constraints. 85 | 86 | Return your answer strictly in the following JSON format (valid JSON, no extra text outside JSON): 87 | ```json 88 | { 89 | "dockerfile": "string (full modified Dockerfile or )", 90 | "eval_script": "string (full modified eval script or )", 91 | "notes": "string (explanation of what was changed and why)" 92 | } 93 | ``` 94 | """ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | output*/ 29 | testbed/ 30 | data/ 31 | evaluation/run_instance/ 32 | evaluation/run_instance* 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | # *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
164 | #.idea/ 165 | evaluation/reports/ 166 | output/ 167 | # *data_collection/collect/*.sh 168 | *data_collection/collect/temp 169 | *evaluation/temp 170 | temp/ 171 | evaluation/*.sh 172 | run_collect*/ 173 | run_collect_train 174 | scripts/*.json 175 | scripts/dataset/ -------------------------------------------------------------------------------- /app/agents/agent.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from app.data_structures import MessageThread, FunctionCallIntent 3 | from app.log import log_exception 4 | from loguru import logger 5 | import os 6 | import json 7 | from collections.abc import Callable, Mapping 8 | 9 | class Agent(ABC): 10 | """ 11 | Abstract base class for all agents. 12 | Provides per-agent message thread, tool call tracking, and default dispatch_intent. 13 | """ 14 | api_functions: list[str] = [] 15 | 16 | def __init__(self, agent_id): 17 | # Each agent has its own thread 18 | self.msg_thread = MessageThread() 19 | self.agent_id = agent_id 20 | # Tracking of tool calls 21 | self.tool_call_sequence: list[dict] = [] 22 | self.tool_call_layers: list[list[dict]] = [] 23 | self.curr_tool: str | None = None 24 | self.iteration_num = 0 25 | self.finish_status = True 26 | 27 | def add_user_message(self, text: str): 28 | """add a user message to the thread""" 29 | self.msg_thread.add_user(text) 30 | 31 | def add_system_message(self, text: str): 32 | """add a system message to the thread""" 33 | self.msg_thread.add_system(text) 34 | 35 | def add_model_message(self, text: str,tools: list): 36 | """add a model message to the thread""" 37 | self.msg_thread.add_model(text,tools) 38 | 39 | @abstractmethod 40 | def run_task(self, print_callback=None) -> tuple[str, str, bool]: 41 | """ 42 | Execute the agent's primary function. 43 | Returns: 44 | - output (str): raw tool or LLM output 45 | - summary (str): one-line summary 46 | - success (bool): whether the action succeeded 47 | """ 48 | pass 49 | 50 | def init_msg_thread(self) -> None: 51 | pass 52 | 53 | def dispatch_intent( 54 | self, 55 | intent: FunctionCallIntent, 56 | # message_thread: MessageThread = None, 57 | # print_callback: Callable[[dict], None] | None = None, 58 | ) -> tuple[str, str, bool]: 59 | """ 60 | Dispatch a FunctionCallIntent to call the agent's tool methods. 61 | """ 62 | 63 | 64 | if intent.func_name not in self.api_functions: 65 | error = f"Unknown function name {intent.func_name}." 66 | summary = "You called a tool that does not exist." 67 | return error, summary, False 68 | 69 | func_obj = getattr(self, intent.func_name) 70 | try: 71 | self.curr_tool = intent.func_name 72 | # If function expects thread 73 | # if 'message_thread' in func_obj.__code__.co_varnames: 74 | # call_res = func_obj(message_thread, print_callback=print_callback) 75 | # else: 76 | call_res = func_obj(**intent.arg_values) 77 | except Exception as e: 78 | log_exception(e) 79 | error = str(e) 80 | summary = "Tool raised an exception." 
81 | call_res = (error, summary, False) 82 | 83 | logger.debug("Result of dispatch_intent: {}", call_res) 84 | 85 | # Record the call 86 | result, _, ok = call_res 87 | self.tool_call_sequence.append(intent.to_dict_with_result(ok, result, self.agent_id)) 88 | # if not self.tool_call_layers: 89 | # self.tool_call_layers.append([]) 90 | # self.tool_call_layers[-1].append(intent.to_dict_with_result(ok, result, self.agent_id)) 91 | 92 | return call_res 93 | 94 | def start_new_layer(self): 95 | self.tool_call_layers.append([]) 96 | 97 | def reset_tool_sequence(self): 98 | self.tool_call_sequence = [] 99 | 100 | def dump_tool_sequence(self, output_dir: str): 101 | os.makedirs(output_dir, exist_ok=True) 102 | seq_file = os.path.join(output_dir, 'tool_sequence.json') 103 | # layer_file = os.path.join(output_dir, 'agent_tool_layers.json') 104 | with open(seq_file, 'w') as f: 105 | json.dump(self.tool_call_sequence, f, indent=2) 106 | # with open(layer_file, 'w') as f: 107 | # json.dump(self.tool_call_layers, f, indent=2) 108 | -------------------------------------------------------------------------------- /data_collection/versioning/constants.py: -------------------------------------------------------------------------------- 1 | # Constants - Task Instance Version File 2 | MAP_REPO_TO_VERSION_PATHS = { 3 | "dbt-labs/dbt-core": ["core/dbt/version.py", "core/dbt/__init__.py"], 4 | "django/django": ["django/__init__.py"], 5 | "huggingface/transformers": ["src/transformers/__init__.py"], 6 | "marshmallow-code/marshmallow": ["src/marshmallow/__init__.py"], 7 | "mwaskom/seaborn": ["seaborn/__init__.py"], 8 | "pallets/flask": ["src/flask/__init__.py", "flask/__init__.py"], 9 | "psf/requests": ["requests/__version__.py", "requests/__init__.py"], 10 | "pyca/cryptography": [ 11 | "src/cryptography/__about__.py", 12 | "src/cryptography/__init__.py", 13 | ], 14 | "pylint-dev/astroid": ["astroid/__pkginfo__.py", "astroid/__init__.py"], 15 | "pylint-dev/pylint": ["pylint/__pkginfo__.py", "pylint/__init__.py"], 16 | "pytest-dev/pytest": ["src/_pytest/_version.py", "_pytest/_version.py"], 17 | "pyvista/pyvista": ["pyvista/_version.py", "pyvista/__init__.py"], 18 | "Qiskit/qiskit": ["qiskit/VERSION.txt"], 19 | "scikit-learn/scikit-learn": ["sklearn/__init__.py"], 20 | "sphinx-doc/sphinx": ["sphinx/__init__.py"], 21 | "sympy/sympy": ["sympy/release.py", "sympy/__init__.py"], 22 | "pillow/pillow": ["src/PIL/_version.py"], 23 | 'dateutil/dateutil': ['NEWS'], 24 | 'python/mypy': ['mypy/version.py'], 25 | 'redis/redis-py': ['setup.py', 'redis/__init__.py'], 26 | 'tqdm/tqdm': ['tqdm/_version.py'], 27 | 'prettier/prettier': ['package.json'], 28 | 'tailwindlabs/tailwindcss': ['package.json'], 29 | 'jestjs/jest': ['lerna.json', 'package.json'], 30 | 'webpack/webpack': ['package.json'], 31 | 'apollographql/apollo-client': ['package.json'], 32 | 'iamkun/dayjs': ['CHANGELOG.md'], 33 | 'babel/babel': ['package.json'], 34 | 'statsmodels/statsmodels': ['docs/source/release/index.rst'], 35 | "assertj/assertj": ['pom.xml'], 36 | "netty/netty": ['pom.xml'], 37 | "google/gson": ['pom.xml'], 38 | } 39 | 40 | # Constants - Task Instance Version Regex Pattern 41 | MAP_REPO_TO_VERSION_PATTERNS = { 42 | k: [r'__version__ = [\'"](.*)[\'"]', r"VERSION = \((.*)\)"] 43 | 44 | for k in [ 45 | "dbt-labs/dbt-core", 46 | "django/django", 47 | "huggingface/transformers", 48 | "marshmallow-code/marshmallow", 49 | "mwaskom/seaborn", 50 | "pallets/flask", 51 | "psf/requests", 52 | "pyca/cryptography", 53 | "pylint-dev/astroid", 54 | "pylint-dev/pylint", 55 | 
"scikit-learn/scikit-learn", 56 | "sphinx-doc/sphinx", 57 | "sympy/sympy", 58 | 'python/mypy', 59 | ] 60 | } 61 | MAP_REPO_TO_VERSION_PATTERNS.update({ 62 | k: [ 63 | r'\[\s*(\d+\.\d+\.\d+)\s*\]' 64 | ] for k in ['iamkun/dayjs'] 65 | }) 66 | MAP_REPO_TO_VERSION_PATTERNS.update({ 67 | k: [ 68 | r'version(\d+\.\d+(?:\.\d+)?(?:-\d+)?)' 69 | ] for k in ['statsmodels/statsmodels'] 70 | }) 71 | MAP_REPO_TO_VERSION_PATTERNS.update( 72 | { 73 | k: [ 74 | r'"version":\s*"([^"]+)"' 75 | 76 | ] 77 | for k in [ 78 | 79 | 'prettier/prettier', 80 | 'tailwindlabs/tailwindcss', 81 | 'jestjs/jest', 82 | 'webpack/webpack', 83 | 'babel/babel', 84 | 'apollographql/apollo-client' 85 | ] 86 | } 87 | ) 88 | 89 | MAP_REPO_TO_VERSION_PATTERNS.update({ 90 | k:[ 91 | r'(\d+(\.\d+)*\.[A-Za-z][A-Za-z0-9\-]*)<\/version>', r'(\d+\.\d+\.\d+(?:-\w+)?)' 92 | ] 93 | for k in [ 94 | "netty/netty", 95 | "assertj/assertj", 96 | "google/gson" 97 | ]} 98 | ) 99 | 100 | 101 | 102 | MAP_REPO_TO_VERSION_PATTERNS.update( 103 | { 104 | k: [ 105 | r'__version__ = [\'"](.*)[\'"]', 106 | r'__version__ = version = [\'"](.*)[\'"]', 107 | r"VERSION = \((.*)\)", 108 | ] 109 | for k in ["pytest-dev/pytest", "matplotlib/matplotlib"] 110 | } 111 | ) 112 | MAP_REPO_TO_VERSION_PATTERNS.update({k: [r"Version\s+(\d+\.\d+(?:\.\d+)?)"] for k in ["dateutil/dateutil"]}) 113 | MAP_REPO_TO_VERSION_PATTERNS.update({k: [r"(.*)"] for k in ["Qiskit/qiskit"]}) 114 | MAP_REPO_TO_VERSION_PATTERNS.update({k: [r"version_info = [\d]+,[\d\s]+,"] for k in ["pyvista/pyvista"]}) 115 | MAP_REPO_TO_VERSION_PATTERNS.update({k:[r"version_info = [\d]+, [\d]+, [\d\s]+"] for k in ['tqdm/tqdm']}) 116 | MAP_REPO_TO_VERSION_PATTERNS.update( 117 | { 118 | k: [ 119 | 120 | r'version="(.*?)"', 121 | r'__version__ = [\'"](.*)[\'"]', 122 | 123 | ] 124 | for k in ['redis/redis-py'] 125 | } 126 | ) 127 | SWE_BENCH_URL_RAW = 'https://github.com/' 128 | # SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/" -------------------------------------------------------------------------------- /app/model/gemini.py: -------------------------------------------------------------------------------- 1 | """ 2 | For models other than those from OpenAI, use LiteLLM if possible. 3 | """ 4 | 5 | import os 6 | import sys 7 | from typing import Literal 8 | 9 | import litellm 10 | from litellm.utils import Choices, Message, ModelResponse 11 | from openai import BadRequestError 12 | from tenacity import retry, stop_after_attempt, wait_random_exponential 13 | 14 | from app.log import log_and_print 15 | from app.model import common 16 | from app.model.common import Model 17 | 18 | 19 | class GeminiModel(Model): 20 | """ 21 | Base class for creating Singleton instances of Gemini models. 22 | """ 23 | 24 | _instances = {} 25 | 26 | def __new__(cls): 27 | if cls not in cls._instances: 28 | cls._instances[cls] = super().__new__(cls) 29 | cls._instances[cls]._initialized = False 30 | return cls._instances[cls] 31 | 32 | def __init__( 33 | self, 34 | name: str, 35 | cost_per_input: float, 36 | cost_per_output: float, 37 | parallel_tool_call: bool = False, 38 | ): 39 | if self._initialized: 40 | return 41 | super().__init__(name, cost_per_input, cost_per_output, parallel_tool_call) 42 | self._initialized = True 43 | 44 | def setup(self) -> None: 45 | """ 46 | Check API key. 
47 | """ 48 | self.check_api_key() 49 | 50 | def check_api_key(self) -> str: 51 | key_name = "GEMINI_API_KEY" 52 | credential_name = "GOOGLE_APPLICATION_CREDENTIALS" 53 | 54 | gemini_key = os.getenv(key_name) 55 | credential_key = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") 56 | if not (gemini_key or credential_key): 57 | print(f"Please set the {key_name} or {credential_name} env var") 58 | sys.exit(1) 59 | return gemini_key or credential_key 60 | 61 | def extract_resp_content(self, chat_message: Message) -> str: 62 | """ 63 | Given a chat completion message, extract the content from it. 64 | """ 65 | content = chat_message.content 66 | if content is None: 67 | return "" 68 | else: 69 | return content 70 | 71 | @retry(wait=wait_random_exponential(min=30, max=600), stop=stop_after_attempt(3)) 72 | def call( 73 | self, 74 | messages: list[dict], 75 | top_p=1, 76 | tools=None, 77 | response_format: Literal["text", "json_object"] = "text", 78 | **kwargs, 79 | ): 80 | # FIXME: ignore tools field since we don't use tools now 81 | try: 82 | prefill_content = "{" 83 | if response_format == "json_object": # prefill 84 | messages.append({"role": "assistant", "content": prefill_content}) 85 | 86 | response = litellm.completion( 87 | model=self.name, 88 | messages=messages, 89 | temperature=common.MODEL_TEMP, 90 | max_tokens=1024, 91 | top_p=top_p, 92 | stream=False, 93 | ) 94 | assert isinstance(response, ModelResponse) 95 | resp_usage = response.usage 96 | assert resp_usage is not None 97 | input_tokens = int(resp_usage.prompt_tokens) 98 | output_tokens = int(resp_usage.completion_tokens) 99 | cost = self.calc_cost(input_tokens, output_tokens) 100 | 101 | common.thread_cost.process_cost += cost 102 | common.thread_cost.process_input_tokens += input_tokens 103 | common.thread_cost.process_output_tokens += output_tokens 104 | 105 | first_resp_choice = response.choices[0] 106 | assert isinstance(first_resp_choice, Choices) 107 | resp_msg: Message = first_resp_choice.message 108 | content = self.extract_resp_content(resp_msg) 109 | if response_format == "json_object": 110 | # prepend the prefilled character 111 | if not content.startswith(prefill_content): 112 | content = prefill_content + content 113 | 114 | return content, cost, input_tokens, output_tokens 115 | 116 | except BadRequestError as e: 117 | if e.code == "context_length_exceeded": 118 | log_and_print("Context length exceeded") 119 | raise e 120 | 121 | 122 | class GeminiPro(GeminiModel): 123 | def __init__(self): 124 | super().__init__( 125 | "gemini-1.0-pro-002", 0.00000035, 0.00000105, parallel_tool_call=True 126 | ) 127 | self.note = "Gemini 1.0 from Google" 128 | 129 | 130 | class Gemini15Pro(GeminiModel): 131 | def __init__(self): 132 | super().__init__( 133 | "gemini-1.5-pro-preview-0409", 134 | 0.00000035, 135 | 0.00000105, 136 | parallel_tool_call=True, 137 | ) 138 | self.note = "Gemini 1.5 from Google" 139 | -------------------------------------------------------------------------------- /data_collection/versioning/merge_final_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import json 4 | import sys 5 | from pathlib import Path 6 | import logging 7 | 8 | logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") 9 | logger = logging.getLogger(__name__) 10 | 11 | def read_instances(path: Path): 12 | """ 13 | Reads instance files, returns an empty list if it does not exist. 14 | Supports .json and .jsonl formats. 
15 | """ 16 | if not path.exists(): 17 | logger.warning(f"File not found: {path}, treating as an empty list") 18 | return [] 19 | try: 20 | text = path.read_text(encoding='utf-8') 21 | if path.suffix.lower() == '.jsonl': 22 | return [json.loads(line) for line in text.splitlines() if line.strip()] 23 | else: 24 | return json.loads(text) 25 | except Exception as e: 26 | logger.error(f"Failed to read or parse file ({path}): {e}") 27 | sys.exit(1) 28 | 29 | def write_instances(instances, path: Path): 30 | """ 31 | Writes the list of instances, automatically selecting json or jsonl format based on the suffix. 32 | """ 33 | path.parent.mkdir(parents=True, exist_ok=True) 34 | try: 35 | if path.suffix.lower() == '.jsonl': 36 | with path.open('w', encoding='utf-8') as f: 37 | for inst in instances: 38 | f.write(json.dumps(inst, ensure_ascii=False) + '\n') 39 | else: 40 | path.write_text( 41 | json.dumps(instances, indent=2, ensure_ascii=False), encoding='utf-8' 42 | ) 43 | except Exception as e: 44 | logger.error(f"Failed to write file ({path}): {e}") 45 | sys.exit(1) 46 | 47 | def merge(primary, secondary): 48 | """ 49 | Takes primary as the main list and absorbs all pull_numbers from secondary 50 | that are not in primary, ensuring pull_number uniqueness. 51 | """ 52 | seen = {inst.get('pull_number') for inst in primary if 'pull_number' in inst} 53 | out = list(primary) 54 | for inst in secondary: 55 | pn = inst.get('pull_number') 56 | if pn is None: 57 | logger.warning("Skipping entry with missing pull_number") 58 | continue 59 | if pn not in seen: 60 | out.append(inst) 61 | seen.add(pn) 62 | return out 63 | 64 | def find_version_file(directory: Path, suffix: str): 65 | """ 66 | Finds a version file in the directory ending with suffix, supporting .json or .jsonl. 67 | Returns the first Path found, or None. 68 | """ 69 | # First, look for a fixed format: dirname + suffix + ext 70 | for ext in ('.json', '.jsonl'): 71 | candidate = directory / f"{directory.name}{suffix}{ext}" 72 | if candidate.exists(): 73 | return candidate 74 | # Then, use wildcards 75 | for ext in ('.json', '.jsonl'): 76 | matches = list(directory.glob(f"*{suffix}{ext}")) 77 | if matches: 78 | return matches[0] 79 | return None 80 | 81 | def main(): 82 | parser = argparse.ArgumentParser( 83 | description="Merges `_versions_by_github` and `_versions_by_git` files in the same directory, and outputs `_versions_final`" 84 | ) 85 | parser.add_argument("input_dir", help="Directory containing the version files") 86 | args = parser.parse_args() 87 | 88 | directory = Path(args.input_dir) 89 | if not directory.is_dir(): 90 | logger.error(f"Input is not a directory: {directory}") 91 | sys.exit(1) 92 | 93 | # 1. Find the two version files 94 | github_file = find_version_file(directory, "_versions_by_github") 95 | git_file = find_version_file(directory, "_versions_by_git") 96 | 97 | # 2. 
Read the files 98 | if github_file: 99 | logger.info(f"Using GitHub version file: {github_file.name}") 100 | primary = read_instances(github_file) 101 | ext = github_file.suffix 102 | else: 103 | logger.info("Could not find `_versions_by_github`, setting primary list to empty") 104 | primary = [] 105 | ext = None 106 | 107 | if git_file: 108 | logger.info(f"Using Git checkout version file: {git_file.name}") 109 | secondary = read_instances(git_file) 110 | if ext is None: 111 | ext = git_file.suffix 112 | else: 113 | logger.info("Could not find `_versions_by_git`, setting secondary list to empty") 114 | secondary = [] 115 | if ext is None: 116 | ext = ".json" 117 | 118 | # 3. Merge 119 | merged = merge(primary, secondary) 120 | 121 | # 3.1 Sort by pull_number in descending order (converted to int) 122 | try: 123 | merged.sort(key=lambda x: int(x.get('pull_number', 0)), reverse=True) 124 | except (ValueError, TypeError): 125 | merged.sort(key=lambda x: x.get('pull_number', ""), reverse=True) 126 | 127 | # 4. Write to versions_final 128 | output_path = directory / f"{directory.name}_versions_final{ext}" 129 | write_instances(merged, output_path) 130 | logger.info( 131 | f"✅ Merge complete: {len(primary)} + {len(merged)-len(primary)} new = {len(merged)} total entries, written to {output_path.name}" 132 | ) 133 | 134 | if __name__ == "__main__": 135 | main() -------------------------------------------------------------------------------- /app/model/groq.py: -------------------------------------------------------------------------------- 1 | """ 2 | Interfacing with Groq cloud. 3 | """ 4 | 5 | import os 6 | import sys 7 | from typing import Literal 8 | 9 | import litellm 10 | from litellm.utils import Choices, Message, ModelResponse 11 | from openai import BadRequestError 12 | from tenacity import retry, stop_after_attempt, wait_random_exponential 13 | 14 | from app.log import log_and_print 15 | from app.model import common 16 | from app.model.common import Model 17 | 18 | # litellm.set_verbose = True 19 | 20 | 21 | class GroqModel(Model): 22 | """ 23 | Base class for creating Singleton instances of Groq models. 24 | We use native API from Groq through LiteLLM. 25 | """ 26 | 27 | _instances = {} 28 | 29 | def __new__(cls): 30 | if cls not in cls._instances: 31 | cls._instances[cls] = super().__new__(cls) 32 | cls._instances[cls]._initialized = False 33 | return cls._instances[cls] 34 | 35 | def __init__( 36 | self, 37 | name: str, 38 | cost_per_input: float, 39 | cost_per_output: float, 40 | parallel_tool_call: bool = False, 41 | ): 42 | if self._initialized: 43 | return 44 | super().__init__(name, cost_per_input, cost_per_output, parallel_tool_call) 45 | self._initialized = True 46 | 47 | def setup(self) -> None: 48 | """ 49 | Check Groq API key. 50 | """ 51 | self.check_api_key() 52 | 53 | def check_api_key(self) -> str: 54 | """ 55 | Check for the GROQ_API_KEY environment variable. 56 | """ 57 | key = os.environ.get("GROQ_API_KEY") 58 | if not key: 59 | log_and_print("Please set the GROQ_API_KEY env var") 60 | sys.exit(1) 61 | return key 62 | 63 | def extract_resp_content(self, chat_message: Message) -> str: 64 | """ 65 | Given a chat completion message, extract the content from it. 
66 | """ 67 | content = chat_message.content 68 | if content is None: 69 | return "" 70 | else: 71 | return content 72 | 73 | @retry(wait=wait_random_exponential(min=30, max=600), stop=stop_after_attempt(3)) 74 | def call( 75 | self, 76 | messages: list[dict], 77 | top_p=1, 78 | tools=None, 79 | response_format: Literal["text", "json_object"] = "text", 80 | **kwargs, 81 | ): 82 | """ 83 | Calls the Groq API to generate completions for the given inputs. 84 | """ 85 | # FIXME: ignore tools field since we don't use tools now 86 | try: 87 | # groq models - prefilling response with { increase the success rate 88 | # of producing json output 89 | prefill_content = "{" 90 | if response_format == "json_object": # prefill 91 | messages.append({"role": "assistant", "content": prefill_content}) 92 | 93 | response = litellm.completion( 94 | model=self.name, 95 | messages=messages, 96 | temperature=common.MODEL_TEMP, 97 | max_tokens=1024, 98 | top_p=top_p, 99 | stream=False, 100 | ) 101 | assert isinstance(response, ModelResponse) 102 | resp_usage = response.usage 103 | assert resp_usage is not None 104 | input_tokens = int(resp_usage.prompt_tokens) 105 | output_tokens = int(resp_usage.completion_tokens) 106 | cost = self.calc_cost(input_tokens, output_tokens) 107 | 108 | common.thread_cost.process_cost += cost 109 | common.thread_cost.process_input_tokens += input_tokens 110 | common.thread_cost.process_output_tokens += output_tokens 111 | 112 | first_resp_choice = response.choices[0] 113 | assert isinstance(first_resp_choice, Choices) 114 | resp_msg: Message = first_resp_choice.message 115 | content = self.extract_resp_content(resp_msg) 116 | if response_format == "json_object": 117 | # prepend the prefilled character 118 | if not content.startswith(prefill_content): 119 | content = prefill_content + content 120 | return content, cost, input_tokens, output_tokens 121 | 122 | except BadRequestError as e: 123 | if e.code == "context_length_exceeded": 124 | log_and_print("Context length exceeded") 125 | raise e 126 | 127 | 128 | class Llama3_8B(GroqModel): 129 | def __init__(self): 130 | super().__init__( 131 | "groq/llama3-8b-8192", 0.00000005, 0.00000010, parallel_tool_call=True 132 | ) 133 | self.note = "The champion of the Llama series with 8B params from Meta" 134 | 135 | 136 | class Llama3_70B(GroqModel): 137 | def __init__(self): 138 | super().__init__( 139 | "groq/llama3-70b-8192", 0.00000059, 0.00000079, parallel_tool_call=True 140 | ) 141 | self.note = "Llama lastest model with 70B params" 142 | 143 | 144 | class Mixtral_8x7B(GroqModel): 145 | def __init__(self): 146 | super().__init__( 147 | "groq/mixtral-8x7b-32768", 0.00000027, 0.00000027, parallel_tool_call=True 148 | ) 149 | self.note = "Balanced blend of speed and power from Mixtral team with 8 layers and 7B parameters" 150 | 151 | 152 | class Gemma_7B(GroqModel): 153 | def __init__(self): 154 | super().__init__( 155 | "groq/gemma-7b-it", 0.0000001, 0.0000001, parallel_tool_call=True 156 | ) 157 | self.note = "A state-of-the-art open model from Google, boasting 7B parameters" 158 | -------------------------------------------------------------------------------- /app/model/claude.py: -------------------------------------------------------------------------------- 1 | """ 2 | For models other than those from OpenAI, use LiteLLM if possible. 
3 | """ 4 | 5 | import os 6 | import sys 7 | from typing import Literal 8 | 9 | import litellm 10 | from litellm.utils import Choices, Message, ModelResponse 11 | from openai import BadRequestError 12 | from tenacity import retry, stop_after_attempt, wait_random_exponential 13 | 14 | from app.log import log_and_print 15 | from app.model import common 16 | from app.model.common import Model 17 | 18 | 19 | class AnthropicModel(Model): 20 | """ 21 | Base class for creating Singleton instances of Antropic models. 22 | """ 23 | 24 | _instances = {} 25 | 26 | def __new__(cls): 27 | if cls not in cls._instances: 28 | cls._instances[cls] = super().__new__(cls) 29 | cls._instances[cls]._initialized = False 30 | return cls._instances[cls] 31 | 32 | def __init__( 33 | self, 34 | name: str, 35 | cost_per_input: float, 36 | cost_per_output: float, 37 | max_output_token: int = 4096, 38 | parallel_tool_call: bool = False, 39 | ): 40 | if self._initialized: 41 | return 42 | super().__init__(name, cost_per_input, cost_per_output, parallel_tool_call) 43 | self.max_output_token = max_output_token 44 | self._initialized = True 45 | 46 | def setup(self) -> None: 47 | """ 48 | Check API key. 49 | """ 50 | self.check_api_key() 51 | 52 | def check_api_key(self) -> str: 53 | key_name = "ANTHROPIC_API_KEY" 54 | key = os.getenv(key_name) 55 | if not key: 56 | print(f"Please set the {key_name} env var") 57 | sys.exit(1) 58 | return key 59 | 60 | def extract_resp_content(self, chat_message: Message) -> str: 61 | """ 62 | Given a chat completion message, extract the content from it. 63 | """ 64 | content = chat_message.content 65 | if content is None: 66 | return "" 67 | else: 68 | return content 69 | 70 | @retry(wait=wait_random_exponential(min=30, max=600), stop=stop_after_attempt(3)) 71 | def call( 72 | self, 73 | messages: list[dict], 74 | top_p=1, 75 | tools=None, 76 | response_format: Literal["text", "json_object"] = "text", 77 | temperature: float | None = None, 78 | **kwargs, 79 | ): 80 | # FIXME: ignore tools field since we don't use tools now 81 | if temperature is None: 82 | temperature = common.MODEL_TEMP 83 | 84 | try: 85 | # antropic models - prefilling response with { increase the success rate 86 | # of producing json output 87 | prefill_content = "{" 88 | if response_format == "json_object": # prefill 89 | messages.append({"role": "assistant", "content": prefill_content}) 90 | 91 | response = litellm.completion( 92 | model=self.name, 93 | messages=messages, 94 | temperature=temperature, 95 | max_tokens=self.max_output_token, 96 | top_p=top_p, 97 | stream=False, 98 | ) 99 | assert isinstance(response, ModelResponse) 100 | resp_usage = response.usage 101 | assert resp_usage is not None 102 | input_tokens = int(resp_usage.prompt_tokens) 103 | output_tokens = int(resp_usage.completion_tokens) 104 | cost = self.calc_cost(input_tokens, output_tokens) 105 | 106 | common.thread_cost.process_cost += cost 107 | common.thread_cost.process_input_tokens += input_tokens 108 | common.thread_cost.process_output_tokens += output_tokens 109 | 110 | first_resp_choice = response.choices[0] 111 | assert isinstance(first_resp_choice, Choices) 112 | resp_msg: Message = first_resp_choice.message 113 | content = self.extract_resp_content(resp_msg) 114 | if response_format == "json_object": 115 | # prepend the prefilled character 116 | if not content.startswith(prefill_content): 117 | content = prefill_content + content 118 | return content, cost, input_tokens, output_tokens 119 | 120 | except BadRequestError as e: 121 | if 
e.code == "context_length_exceeded": 122 | log_and_print("Context length exceeded") 123 | raise e 124 | 125 | 126 | class Claude3Opus(AnthropicModel): 127 | def __init__(self): 128 | super().__init__( 129 | "claude-3-opus-20240229", 0.000015, 0.000075, parallel_tool_call=True 130 | ) 131 | self.note = "Most powerful model among Claude 3" 132 | 133 | 134 | class Claude3Sonnet(AnthropicModel): 135 | def __init__(self): 136 | super().__init__( 137 | "claude-3-sonnet-20240229", 0.000003, 0.000015, parallel_tool_call=True 138 | ) 139 | self.note = "Most balanced (intelligence and speed) model from Anthropic" 140 | 141 | 142 | class Claude3Haiku(AnthropicModel): 143 | def __init__(self): 144 | super().__init__( 145 | "claude-3-haiku-20240307", 0.00000025, 0.00000125, parallel_tool_call=True 146 | ) 147 | self.note = "Fastest model from Anthropic" 148 | 149 | 150 | # class Claude3_5Sonnet(AnthropicModel): 151 | # def __init__(self): 152 | # super().__init__( 153 | # "claude-3-5-sonnet-20240620", 154 | # 0.000003, 155 | # 0.000015, 156 | # max_output_token=8192, 157 | # parallel_tool_call=True, 158 | # ) 159 | # self.note = "Most intelligent model from Anthropic" -------------------------------------------------------------------------------- /app/agents/write_dockerfile_agent/write_dockerfile_agent.py: -------------------------------------------------------------------------------- 1 | from app.data_structures import MessageThread 2 | from app.agents.write_dockerfile_agent import write_dockerfile_utils 3 | from app.agents.agent import Agent 4 | from app.task import Task 5 | import os 6 | import shutil 7 | from loguru import logger 8 | import re 9 | from app.log import ( 10 | print_acr, 11 | print_banner, 12 | print_retrieval, 13 | ) 14 | from os.path import join as pjoin 15 | 16 | 17 | class WriteDockerfileAgent(Agent): 18 | """ 19 | LLM-based agent for creating or modifying a Dockerfile via direct chat. 20 | Manages its own create/modify logic, output directories, and retry behavior. 21 | """ 22 | api_functions: list[str] = [] 23 | def __init__(self, task: Task, output_dir: str, repo_basic_info: str, using_ubuntu_only: bool = False): 24 | super().__init__(agent_id="WriteDockerfileAgent") 25 | self.msg_thread = MessageThread() 26 | self.task = task 27 | self.output_dir = os.path.abspath(output_dir) 28 | self.run_count = 0 29 | self.reference_setup = None 30 | self.repo_basic_info = repo_basic_info 31 | self.init_msg_thread() 32 | self.using_ubuntu_only = using_ubuntu_only 33 | 34 | 35 | def init_msg_thread(self) -> None: 36 | self.msg_thread = MessageThread() 37 | self.add_system_message(write_dockerfile_utils.get_system_prompt_dockerfile()) 38 | self.add_user_message(self.repo_basic_info) 39 | 40 | def add_reference_message(self) -> None: 41 | if self.reference_setup: 42 | reference_version = self.reference_setup['version'] 43 | reference_dockerfile = self.reference_setup['dockerfile'] 44 | reference_text = ( 45 | f"I found a Dockerfile from version {reference_version} of this repo that worked well in a similar setup. " 46 | "You might consider it as a reference—if its configuration aligns with your current environment, it could " 47 | "save you some effort. Otherwise, feel free to adapt or disregard as needed:\n\n" 48 | f"{reference_dockerfile}" 49 | ) 50 | self.add_user_message(reference_text) 51 | 52 | 53 | def run_task(self, print_callback=None) -> tuple[str, str, bool]: 54 | """ 55 | Create or modify a Dockerfile based on the given message_thread context. 
56 | Handles versioning, directory management, and fallback copy logic. 57 | """ 58 | # 1. Determine previous vs current output paths 59 | print_banner(f"Iteration ROUND {self.iteration_num}: Dockerfile Generation ") 60 | prev_dir = self.get_latest_write_dockerfile_output_dir() 61 | prev_file = os.path.join(prev_dir, 'Dockerfile') 62 | self.run_count += 1 63 | curr_dir = self.get_latest_write_dockerfile_output_dir() 64 | os.makedirs(curr_dir, exist_ok=True) 65 | self.add_reference_message() 66 | # 2. Inject either modify or init prompt 67 | if os.path.exists(prev_file): 68 | modify_prompt = write_dockerfile_utils.get_user_prompt_modify_dockerfile() 69 | # add previous Dockerfile content 70 | prev_content = self._read_file(prev_file) 71 | self.add_user_message(f"Previous dockerfile:\n{prev_content}\n") 72 | self.add_user_message(modify_prompt) 73 | else: 74 | if self.using_ubuntu_only: 75 | self.add_user_message(write_dockerfile_utils.get_user_prompt_init_dockerfile_using_ubuntu_only()) 76 | else: 77 | self.add_user_message(write_dockerfile_utils.get_user_prompt_init_dockerfile()) 78 | 79 | # 3. Delegate to the retryable writer 80 | task_output = write_dockerfile_utils.write_dockerfile_with_retries( 81 | self.msg_thread, 82 | curr_dir, 83 | self.task, 84 | print_callback=print_callback 85 | ) 86 | 87 | # 4. Post-process: validate or fallback copy 88 | dockerfile_path = os.path.join(curr_dir, 'Dockerfile') 89 | if not os.path.isfile(dockerfile_path): 90 | 91 | # fallback: copy previous 92 | if os.path.exists(prev_file): 93 | shutil.copy(prev_file, dockerfile_path) 94 | summary = "Dockerfile generation failed." 95 | is_ok = False 96 | else: 97 | summary = "Dockerfile created/updated successfully." 98 | is_ok = True 99 | 100 | dockerfile_output_dir = self.get_latest_write_dockerfile_output_dir() 101 | conversation_file = pjoin(dockerfile_output_dir, f"conversation.json") 102 | self.msg_thread.save_to_file(conversation_file) 103 | # self.init_msg_thread() 104 | return task_output, summary, is_ok 105 | 106 | def _read_file(self, path: str) -> str: 107 | try: 108 | with open(path, 'r') as f: 109 | return f.read() 110 | except Exception: 111 | return "" 112 | 113 | def get_latest_write_dockerfile_output_dir(self) -> str: 114 | """ 115 | Return the directory of the most recent Dockerfile outputs. 116 | """ 117 | return os.path.join(self.output_dir, f"write_dockerfile_agent_{self.run_count}") 118 | 119 | def get_latest_dockerfile(self) -> str: 120 | """ 121 | Read and return contents of the latest generated Dockerfile. 122 | """ 123 | path = os.path.join(self.get_latest_write_dockerfile_output_dir(), 'Dockerfile') 124 | try: 125 | with open(path, 'r') as f: 126 | return f.read() 127 | except Exception as e: 128 | logger.error(f"Failed to read latest Dockerfile at {path}: {e}") 129 | return "" 130 | -------------------------------------------------------------------------------- /app/model/bedrock.py: -------------------------------------------------------------------------------- 1 | """ 2 | For models other than those from OpenAI, use LiteLLM if possible. 
3 | """ 4 | 5 | import os 6 | import sys 7 | from typing import Literal 8 | 9 | import litellm 10 | from litellm.utils import Choices, Message, ModelResponse 11 | from openai import BadRequestError 12 | from tenacity import retry, stop_after_attempt, wait_random_exponential 13 | 14 | from app.log import log_and_print 15 | from app.model import common 16 | from app.model.common import Model 17 | 18 | 19 | class BedrockModel(Model): 20 | """ 21 | Base class for creating Singleton instances of Amazon Bedrock models. 22 | """ 23 | 24 | _instances = {} 25 | 26 | def __new__(cls): 27 | if cls not in cls._instances: 28 | cls._instances[cls] = super().__new__(cls) 29 | cls._instances[cls]._initialized = False 30 | return cls._instances[cls] 31 | 32 | def __init__( 33 | self, 34 | name: str, 35 | cost_per_input: float, 36 | cost_per_output: float, 37 | parallel_tool_call: bool = False, 38 | ): 39 | if self._initialized: 40 | return 41 | super().__init__(name, cost_per_input, cost_per_output, parallel_tool_call) 42 | self._model_provider = self.name.split(".")[0] 43 | self._initialized = True 44 | 45 | def setup(self) -> None: 46 | """ 47 | Check API key. 48 | """ 49 | self.check_api_key() 50 | 51 | def check_api_key(self) -> str: 52 | # See https://litellm.vercel.app/docs/providers/bedrock 53 | required_env_vars = [ 54 | "AWS_ACCESS_KEY_ID", 55 | "AWS_SECRET_ACCESS_KEY", 56 | "AWS_REGION_NAME", 57 | ] 58 | if len(set(os.environ).intersection(required_env_vars)) != len( 59 | required_env_vars 60 | ): 61 | print( 62 | "Missing env vars. Please refer to https://litellm.vercel.app/docs/providers/bedrock" 63 | ) 64 | sys.exit(1) 65 | return os.getenv(required_env_vars[-1]) 66 | 67 | def extract_resp_content(self, chat_message: Message) -> str: 68 | """ 69 | Given a chat completion message, extract the content from it. 
70 | """ 71 | content = chat_message.content 72 | if content is None: 73 | return "" 74 | else: 75 | return content 76 | 77 | @retry(wait=wait_random_exponential(min=30, max=600), stop=stop_after_attempt(3)) 78 | def call( 79 | self, 80 | messages: list[dict], 81 | top_p=1, 82 | tools=None, 83 | response_format: Literal["text", "json_object"] = "text", 84 | **kwargs, 85 | ): 86 | try: 87 | if self._model_provider == "bedrock/anthropic": 88 | # antropic models - prefilling response with { increase the success rate 89 | # of producing json output 90 | prefill_content = "{" 91 | if response_format == "json_object": # prefill 92 | messages.append({"role": "assistant", "content": prefill_content}) 93 | 94 | response = litellm.completion( 95 | model=self.name, 96 | messages=messages, 97 | temperature=common.MODEL_TEMP, 98 | max_tokens=1024, 99 | top_p=top_p, 100 | stream=False, 101 | ) 102 | assert isinstance(response, ModelResponse) 103 | resp_usage = response.usage 104 | assert resp_usage is not None 105 | input_tokens = int(resp_usage.prompt_tokens) 106 | output_tokens = int(resp_usage.completion_tokens) 107 | cost = self.calc_cost(input_tokens, output_tokens) 108 | 109 | common.thread_cost.process_cost += cost 110 | common.thread_cost.process_input_tokens += input_tokens 111 | common.thread_cost.process_output_tokens += output_tokens 112 | 113 | first_resp_choice = response.choices[0] 114 | assert isinstance(first_resp_choice, Choices) 115 | resp_msg: Message = first_resp_choice.message 116 | content = self.extract_resp_content(resp_msg) 117 | if response_format == "json_object": 118 | # prepend the prefilled character 119 | if not content.startswith(prefill_content): 120 | content = prefill_content + content 121 | return content, cost, input_tokens, output_tokens 122 | 123 | except BadRequestError as e: 124 | if e.code == "context_length_exceeded": 125 | log_and_print("Context length exceeded") 126 | raise e 127 | 128 | 129 | class AnthropicClaude2(BedrockModel): 130 | def __init__(self): 131 | super().__init__( 132 | "bedrock/anthropic.claude-v2:1", 133 | 0.00000025, 134 | 0.00000125, 135 | parallel_tool_call=True, 136 | ) 137 | self.note = "Older Claude model" 138 | 139 | 140 | class AnthropicClaude3Opus(BedrockModel): 141 | def __init__(self): 142 | super().__init__( 143 | "bedrock/anthropic.claude-3-opus-20240229-v1:0", 144 | 0.000015, 145 | 0.000075, 146 | parallel_tool_call=True, 147 | ) 148 | self.note = "Most powerful model from Antropic" 149 | 150 | 151 | class AnthropicClaude3Sonnet(BedrockModel): 152 | def __init__(self): 153 | super().__init__( 154 | "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", 155 | 0.000003, 156 | 0.000015, 157 | parallel_tool_call=True, 158 | ) 159 | self.note = "Most balanced (intelligence and speed) model from Antropic" 160 | 161 | 162 | class AnthropicClaude3Haiku(BedrockModel): 163 | def __init__(self): 164 | super().__init__( 165 | "bedrock/anthropic.claude-3-haiku-20240307-v1:0", 166 | 0.00000025, 167 | 0.00000125, 168 | parallel_tool_call=True, 169 | ) 170 | self.note = "Fastest model from Antropic" 171 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 👉🏻 SWE-Factory 👈🏻 2 | 3 | Your automated factory for GitHub Issue Resolution Training Data and Evaluation Benchmarks. 
4 | 5 | [![SVG Banners](https://svg-banners.vercel.app/api?type=origin&text1=SWE-Factory%20🧑‍💻&text2=✨%20Build%20Your%20Own%20SWE-Bench%20and%20SWE-Gym,%20Automatically!&width=900&height=200)](https://github.com/Akshay090/svg-banners) 6 | 7 | 8 |

9 | 📃 Paper 10 | • 11 | 🤗 Data & Models 12 |

13 | 14 | ## 📰 News 15 | * **[Sep. 17, 2025]**: *Build your own SWE-Gym with SWE-Factory!* We trained a series of LLMs on 2,809 Python task instances constructed with our framework, all demonstrating clear performance improvements. For instance, the resolve rate of the fine-tuned Qwen2.5-Coder-14B-instruct model increased from 5.8% to 21.0%. The training trajectories sampled from our 2,809 task instances and the fine-tuned models are open-sourced on 🤗 Huggingface. 16 | * **[Sep. 15, 2025]**: We released SWE-Factory 1.5, which is more robust and has a higher success rate. 17 | 18 | 19 | ## ✨ Key Features 20 | 21 | - **An automated pipeline** for GitHub issue resolution data collection, reducing your manual effort! 22 | - **Reliable and reproducible Docker-based evaluation environments** 23 | - **Automatic environment construction using the LLM-powered multi-agent system, SWE-Builder** 24 | - **Support for multiple programming languages** (we have evaluated Python, Java, JS, and TS extensively) 25 | 26 | ## 📦 Environment Setup 27 | 28 | Our experiments are conducted with Docker version 27.0.3-1 on Ubuntu 22.04.4 LTS. 29 | 30 | To get started, run the following commands to set up the environment: 31 | 32 | ```bash 33 | conda create --name swe-factory python=3.12.5 -y 34 | conda activate swe-factory 35 | pip install -r requirements.txt 36 | ``` 37 | 38 | ## 🚀 Running SWE-Factory 39 | 40 | ### 📍 Stage I: Raw Issue Data Collection 41 | 42 | We use GitHub APIs and predefined patterns to collect raw issue data (e.g., `python-mypy-instances.jsonl`). Check the detailed tutorial in the [data_collection/collect](./data_collection/collect) directory. 43 | 44 | ### 🛠 Stage II: Automated Evaluation Environment Setup via SWE-Builder 45 | 46 | After collecting raw issue data, set up the evaluation environments by running: 47 | 48 | ```bash 49 | export OPENAI_API_BASE_URL= 50 | export OPENAI_KEY= 51 | 52 | python app/main.py swe-bench \ 53 | --model gpt-4.1-mini \ 54 | --tasks-map "python-mypy-instances.jsonl" \ 55 | --num-processes 10 \ 56 | --model-temperature 0.2 \ 57 | --conv-round-limit 10 \ 58 | --output-dir "output/git-4.1-mini/mypy" \ 59 | --setup-dir "testbed" \ 60 | --results-path "output/git-4.1-mini/mypy/results" 61 | ``` 62 | 63 | We employ SWE-Builder, an LLM-based multi-agent system consisting of: 64 | 65 | 1. **🔍 Repository Explorer** 66 | - Gathers environment setup and test commands automatically. 67 | 68 | 2. **🐳 Environment Manager** 69 | - Generates Dockerfiles for reproducible test environments. 70 | 71 | 3. **📝 Test Manager** 72 | - Writes evaluation scripts to run tests inside containers. 73 | 74 | 4. **🔬 Test Analyst** 75 | - Validates generated environments and orchestrates iterative refinement. 76 | 77 | 5. **💾 Evaluation Environment Memory Pool** 78 | - Reuses previously successful setups for efficiency and consistency. 
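Each line of the `--tasks-map` file above is one raw task instance. As a reference, here is a minimal sketch of such a record (the field values are hypothetical; the versioning scripts in `data_collection` assert that `repo`, `base_commit`, and `instance_id` are present in every task):

```python
import json

# Hypothetical raw task record; repo, instance_id, and base_commit are
# the keys the data_collection versioning scripts require. The
# instance_id follows the owner__name-number convention seen in the
# SetupBench-lite batch files.
record = {
    "repo": "mochajs/mocha",
    "instance_id": "mochajs__mocha-1878",
    "base_commit": "<40-character commit SHA>",
}

with open("tasks.jsonl", "a", encoding="utf-8") as f:  # illustrative filename
    f.write(json.dumps(record) + "\n")
```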
79 | 80 | ![Overview](figure/overview.png) 81 | 82 | #### 📊 SWE-Builder Evaluation Results 83 | 84 | We evaluated SWE-Builder using three base models: 85 | 86 | | Base Model | F2P (Fail2Pass) Rate (%) | Output Rate (%) | Cost (USD) | Time (min) | 87 | |---------------------------|--------------------------|------------------|------------|------------| 88 | | GPT-4.1-mini | 50.2 (337/671) | 64.8 (435/671) | 0.047 | 26.3 | 89 | | DeepSeek-v3-0324 | 42.0 (282/671) | 53.4 (358/671) | 0.037 | 23.0 | 90 | | Kimi-K2 | 47.8 (321/671) | 63.2 (424/671) | 0.056 | 30.2 | 91 | 92 | To reproduce these experiments: 93 | 94 | ```bash 95 | export OPENAI_API_BASE_URL= 96 | export OPENAI_KEY= 97 | bash run/run.sh 98 | ``` 99 | 100 | ### ✅ Stage III: Fail2Pass Validation 101 | 102 | After generating evaluation environments, perform Fail2Pass validation: 103 | 104 | 1. Obtain test logs before and after applying the ground-truth patch. Check [evaluation](./evaluation) for detailed instructions. 105 | 106 | 2. Run automated Fail2Pass validation: 107 | 108 | ```bash 109 | python scripts/judge_fail2pass.py evaluation/run_instance/mypy_gpt-4.1-mini/gold fail2pass_status.json 110 | ``` 111 | 112 | The validated instances can then be filtered using the generated `fail2pass_status.json`. 113 | 114 | **Note:** Although our automated validation demonstrates high precision, manual checks are recommended to ensure dataset quality, particularly to identify and filter out error-to-pass cases. 115 | 116 | ## 📌 Using Your Own Dataset 117 | 118 | After building your dataset for evaluation and training, check the [evaluation](./evaluation) directory for detailed instructions on how to run tests and obtain test execution feedback. 119 | 120 | ## 📖 Citation 121 | 122 | If SWE-Factory helps your research or projects, star ⭐ our repo or cite us: 123 | 124 | ```bibtex 125 | @article{guo2025swefactory, 126 | title={SWE-Factory: Your Automated Factory for Issue Resolution Training Data and Evaluation Benchmarks}, 127 | author={Lianghong Guo and Yanlin Wang and Caihua Li and Pengyu Yang and Jiachi Chen and Wei Tao and Yingtian Zou and Duyu Tang and Zibin Zheng}, 128 | journal={arXiv preprint arXiv:2506.10954}, 129 | year={2025}, 130 | url={https://arxiv.org/abs/2506.10954}, 131 | } 132 | ``` 133 | 134 | ## 🙏 Acknowledgements 135 | 136 | - We build upon prior research that is foundational to our work: **[R2E-Gym](https://github.com/R2E-Gym/R2E-Gym/)**, **[SWE-bench](https://arxiv.org/abs/2310.06770)**, **[AutoCodeRover](https://arxiv.org/abs/2404.05427)**, **[Magis](https://arxiv.org/abs/2403.17927)**, and **[OmniGIRL](https://arxiv.org/abs/2505.04606)**. 137 | - Huge thanks to the open-source developer community for your invaluable contributions to software engineering research. ❤️ 138 | -------------------------------------------------------------------------------- /app/model/ollama.py: -------------------------------------------------------------------------------- 1 | """ 2 | For models other than those from OpenAI, use LiteLLM if possible. 3 | Create all models managed by Ollama here, since they need to talk to ollama server. 
4 | """ 5 | 6 | import sys 7 | from collections.abc import Mapping 8 | from copy import deepcopy 9 | from typing import Literal, cast 10 | 11 | import httpx 12 | import ollama 13 | import timeout_decorator 14 | from ollama._types import Message, Options 15 | from openai.types.chat import ChatCompletionMessage 16 | 17 | from app.model import common 18 | from app.model.common import Model 19 | 20 | 21 | class OllamaModel(Model): 22 | """ 23 | Base class for creating Singleton instances of Ollama models. 24 | """ 25 | 26 | _instances = {} 27 | 28 | def __new__(cls): 29 | if cls not in cls._instances: 30 | cls._instances[cls] = super().__new__(cls) 31 | cls._instances[cls]._initialized = False 32 | return cls._instances[cls] 33 | 34 | def __init__(self, name: str): 35 | if self._initialized: 36 | return 37 | # local models are free 38 | super().__init__(name, 0.0, 0.0) 39 | self.client: ollama.Client | None = None 40 | self._initialized = True 41 | 42 | def setup(self) -> None: 43 | """ 44 | Check API key. 45 | """ 46 | self.check_api_key() 47 | try: 48 | self.send_empty_request() 49 | print(f"Model {self.name} is up and running.") 50 | except timeout_decorator.TimeoutError as e: 51 | print( 52 | "Ollama server is taking too long (more than 2 mins) to respond. Please check whether it's running.", 53 | e, 54 | ) 55 | sys.exit(1) 56 | except Exception as e: 57 | print("Could not communicate with ollama server due to exception.", e) 58 | sys.exit(1) 59 | 60 | @timeout_decorator.timeout(120) # 2 min 61 | def send_empty_request(self): 62 | """ 63 | Send an empty request to the model, for two purposes 64 | (1) check whether the model is up and running 65 | (2) preload the model for faster response time (models will be kept in memory for 5 mins after loaded) 66 | (see https://github.com/ollama/ollama/blob/main/docs/faq.md#how-can-i-pre-load-a-model-to-get-faster-response-times) 67 | """ 68 | # localhost is used when (1) running both ACR and ollama on host machine; and 69 | # (2) running ollama in host, and ACR in container with --net=host 70 | local_client = ollama.Client(host="http://localhost:11434") 71 | # docker_host_client is used when running ollama in host and ACR in container, and 72 | # Docker Desktop is installed 73 | docker_host_client = ollama.Client(host="http://host.docker.internal:11434") 74 | try: 75 | local_client.chat(model=self.name, messages=[]) 76 | self.client = local_client 77 | return 78 | except httpx.ConnectError: 79 | # failed to connect to client at localhost 80 | pass 81 | 82 | try: 83 | docker_host_client.chat(model=self.name, messages=[]) 84 | self.client = docker_host_client 85 | except httpx.ConnectError: 86 | # also failed to connect via host.docker.internal 87 | print("Could not connect to ollama server.") 88 | sys.exit(1) 89 | 90 | def check_api_key(self) -> str: 91 | return "No key required for local models." 92 | 93 | def extract_resp_content( 94 | self, chat_completion_message: ChatCompletionMessage 95 | ) -> str: 96 | """ 97 | Given a chat completion message, extract the content from it. 
98 | """ 99 | content = chat_completion_message.content 100 | if content is None: 101 | return "" 102 | else: 103 | return content 104 | 105 | def call( 106 | self, 107 | messages: list[dict], 108 | top_p=1, 109 | tools=None, 110 | response_format: Literal["text", "json_object"] = "text", 111 | **kwargs, 112 | ): 113 | stop_words = ["assistant", "\n\n \n\n"] 114 | json_stop_words = deepcopy(stop_words) 115 | json_stop_words.append("```") 116 | json_stop_words.append(" " * 10) 117 | # FIXME: ignore tools field since we don't use tools now 118 | 119 | assert self.client is not None 120 | try: 121 | # build up options for ollama 122 | options = {"temperature": common.MODEL_TEMP, "top_p": top_p} 123 | if response_format == "json_object": 124 | # additional instructions for json mode 125 | json_instruction = { 126 | "role": "user", 127 | "content": "Stop your response after a valid json is generated.", 128 | } 129 | messages.append(json_instruction) 130 | # give more stop words and lower max_token for json mode 131 | options.update({"stop": json_stop_words, "num_predict": 128}) 132 | response = self.client.chat( 133 | model=self.name, 134 | messages=cast(list[Message], messages), 135 | format="json", 136 | options=cast(Options, options), 137 | stream=False, 138 | ) 139 | else: 140 | options.update({"stop": stop_words, "num_predict": 1024}) 141 | response = self.client.chat( 142 | model=self.name, 143 | messages=cast(list[Message], messages), 144 | options=cast(Options, options), 145 | stream=False, 146 | ) 147 | 148 | assert isinstance(response, Mapping) 149 | resp_msg = response.get("message", None) 150 | if resp_msg is None: 151 | return "", 0, 0, 0 152 | 153 | content: str = resp_msg.get("content", "") 154 | return content, 0, 0, 0 155 | 156 | except Exception as e: 157 | # FIXME: catch appropriate exception from ollama 158 | raise e 159 | 160 | 161 | class Llama3_8B(OllamaModel): 162 | def __init__(self): 163 | super().__init__("llama3") 164 | self.note = "Llama3 8B model." 165 | 166 | 167 | class Llama3_70B(OllamaModel): 168 | def __init__(self): 169 | super().__init__("llama3:70b") 170 | self.note = "Llama3 70B model." 
171 | -------------------------------------------------------------------------------- /data_collection/collect/get_version.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import subprocess 4 | import re 5 | import json 6 | import argparse 7 | from contextlib import contextmanager 8 | from typing import List, Dict 9 | from concurrent.futures import ProcessPoolExecutor, as_completed 10 | 11 | @contextmanager 12 | def cd(newdir): 13 | prevdir = os.getcwd() 14 | os.chdir(os.path.expanduser(newdir)) 15 | try: 16 | yield 17 | finally: 18 | os.chdir(prevdir) 19 | 20 | def run_command(cmd: list[str], **kwargs) -> subprocess.CompletedProcess: 21 | try: 22 | return subprocess.run(cmd, check=True, **kwargs) 23 | except subprocess.CalledProcessError as e: 24 | print(f"Error running command: {cmd}, {e}") 25 | raise 26 | 27 | def get_version_by_git(cloned_dir: str) -> str: 28 | if not os.path.isdir(cloned_dir): 29 | raise NotADirectoryError(f"Invalid directory: {cloned_dir}") 30 | with cd(cloned_dir): 31 | result = run_command(["git", "describe", "--tags"], capture_output=True, text=True) 32 | version = result.stdout.strip() 33 | print(f"✔️ Current version: {version}") 34 | match = re.search(r"(\d+\.\d+)(?:\.\d+)?", version) 35 | if match: 36 | return match.group(1) 37 | raise RuntimeError(f"Unrecognized version: {version}") 38 | 39 | def get_instances(instance_path: str) -> List[Dict]: 40 | if instance_path.endswith((".jsonl", ".jsonl.all")): 41 | with open(instance_path, encoding="utf-8") as f: 42 | return [json.loads(line) for line in f] 43 | with open(instance_path, encoding="utf-8") as f: 44 | return json.load(f) 45 | 46 | def prepare_repo_cache(tasks: List[Dict], cache_dir: str) -> Dict[str, str]: 47 | os.makedirs(cache_dir, exist_ok=True) 48 | repo_cache = {} 49 | for task in tasks: 50 | repo = task["repo"] 51 | if repo in repo_cache: 52 | continue 53 | repo_url = f"https://github.com/{repo}.git" 54 | local_path = os.path.join(cache_dir, repo.replace("/", "__")) 55 | try: 56 | run_command(["git", "clone", repo_url, local_path], capture_output=True) 57 | repo_cache[repo] = local_path 58 | print(f"✅ Cached repo: {repo}") 59 | except Exception as e: 60 | print(f"❌ Failed to clone {repo}: {e}") 61 | return repo_cache 62 | 63 | def process_repo_task(task: Dict, testbed: str, repo_cache: Dict[str, str]) -> Dict | None: 64 | instance_id = task["instance_id"] 65 | repo = task["repo"] 66 | base_commit = task["base_commit"] 67 | repo_dir = os.path.join(testbed, instance_id) 68 | os.makedirs(repo_dir, exist_ok=True) 69 | 70 | try: 71 | cached_repo = repo_cache.get(repo) 72 | if not cached_repo or not os.path.exists(cached_repo): 73 | raise RuntimeError(f"Missing cached repo for {repo}") 74 | shutil.copytree(cached_repo, repo_dir, dirs_exist_ok=True) 75 | with cd(repo_dir): 76 | run_command(["git", "checkout", base_commit], capture_output=True) 77 | version = get_version_by_git(repo_dir) 78 | result = task.copy() 79 | result["version"] = version 80 | return result 81 | except Exception as e: 82 | print(f"❌ Failed: {instance_id} | {e}") 83 | return None 84 | finally: 85 | shutil.rmtree(repo_dir, ignore_errors=True) 86 | 87 | def process_repos(tasks: List[Dict], testbed: str, repo_cache: Dict[str, str], max_workers: int = 4) -> tuple[List[Dict], List[Dict]]: 88 | os.makedirs(testbed, exist_ok=True) 89 | results, failures = [], [] 90 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 91 | future_to_task = { 92 | 
executor.submit(process_repo_task, t, testbed, repo_cache): t for t in tasks 93 | } 94 | for future in as_completed(future_to_task): 95 | task = future_to_task[future] 96 | try: 97 | result = future.result() 98 | if result: 99 | results.append(result) 100 | else: 101 | failures.append(task) 102 | except Exception as e: 103 | print(f"Unexpected error in {task['instance_id']}: {e}") 104 | failures.append(task) 105 | return results, failures 106 | 107 | def save_results(results: List[Dict], output_path: str): 108 | os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) 109 | if output_path.endswith((".jsonl", ".jsonl.all")): 110 | with open(output_path, "w", encoding="utf-8") as f: 111 | for r in results: 112 | f.write(json.dumps(r) + "\n") 113 | else: 114 | with open(output_path, "w", encoding="utf-8") as f: 115 | json.dump(results, f, indent=2, ensure_ascii=False) 116 | 117 | def generate_output_path(instance_path: str, suffix="_versions") -> str: 118 | base, ext = os.path.splitext(instance_path) 119 | return f"{base}{suffix}{ext}" 120 | 121 | def main(): 122 | parser = argparse.ArgumentParser() 123 | parser.add_argument("--instance_path", type=str, required=True, help="Path to input task file (.json or .jsonl)") 124 | parser.add_argument("--testbed", type=str, required=True, help="Temp working directory for cloning repos") 125 | parser.add_argument("--max-workers", type=int, default=10, help="Number of processes (default: 10)") 126 | args = parser.parse_args() 127 | 128 | try: 129 | tasks = get_instances(args.instance_path) 130 | except Exception as e: 131 | print(f"❌ Error reading instance file: {e}") 132 | return 133 | 134 | required_keys = {"repo", "base_commit", "instance_id"} 135 | for t in tasks: 136 | if not required_keys.issubset(t): 137 | print(f"Invalid task format: {t}") 138 | return 139 | 140 | repo_cache_dir = os.path.join(args.testbed, "_cache") 141 | repo_cache = prepare_repo_cache(tasks, repo_cache_dir) 142 | 143 | results, failures = process_repos(tasks, args.testbed, repo_cache, args.max_workers) 144 | 145 | output_path = generate_output_path(args.instance_path, "_versions") 146 | save_results(results, output_path) 147 | print(f"\n✅ {len(results)} results saved to {output_path}") 148 | 149 | if failures: 150 | fail_path = generate_output_path(args.instance_path, "_failures") 151 | save_results(failures, fail_path) 152 | print(f"⚠️ {len(failures)} failures saved to {fail_path}") 153 | 154 | for r in results: 155 | print(json.dumps(r, indent=2, ensure_ascii=False)) 156 | 157 | if __name__ == "__main__": 158 | main() 159 | -------------------------------------------------------------------------------- /app/model/gptlitellm.py: -------------------------------------------------------------------------------- 1 | """ 2 | OpenAI models accessed through LiteLLM. 3 | """ 4 | 5 | import os 6 | import sys 7 | from typing import Literal 8 | 9 | import litellm 10 | from litellm.utils import Choices, Message, ModelResponse 11 | from openai import BadRequestError 12 | from tenacity import retry, stop_after_attempt, wait_random_exponential 13 | 14 | from app.log import log_and_print 15 | from app.model import common 16 | from app.model.common import Model 17 | 18 | 19 | class OpenaiLiteLLMModel(Model): 20 | """ 21 | Base class for creating Singleton instances of OpenAI models. 
22 | """ 23 | 24 | _instances = {} 25 | 26 | def __new__(cls): 27 | if cls not in cls._instances: 28 | cls._instances[cls] = super().__new__(cls) 29 | cls._instances[cls]._initialized = False 30 | return cls._instances[cls] 31 | 32 | def __init__( 33 | self, 34 | name: str, 35 | cost_per_input: float, 36 | cost_per_output: float, 37 | parallel_tool_call: bool = False, 38 | ): 39 | if self._initialized: 40 | return 41 | super().__init__(name, cost_per_input, cost_per_output, parallel_tool_call) 42 | self._initialized = True 43 | 44 | def setup(self) -> None: 45 | """ 46 | Check API key. 47 | """ 48 | self.check_api_key() 49 | 50 | def check_api_key(self) -> str: 51 | key_name = "OPENAI_KEY" 52 | key = os.getenv(key_name) 53 | if not key: 54 | print(f"Please set the {key_name} env var") 55 | sys.exit(1) 56 | os.environ["OPENAI_API_KEY"] = key 57 | return key 58 | 59 | def extract_resp_content(self, chat_message: Message) -> str: 60 | """ 61 | Given a chat completion message, extract the content from it. 62 | """ 63 | content = chat_message.content 64 | if content is None: 65 | return "" 66 | else: 67 | return content 68 | 69 | @retry(wait=wait_random_exponential(min=30, max=600), stop=stop_after_attempt(3)) 70 | def call( 71 | self, 72 | messages: list[dict], 73 | top_p=1, 74 | tools=None, 75 | response_format: Literal["text", "json_object"] = "text", 76 | **kwargs, 77 | ): 78 | # FIXME: ignore tools field since we don't use tools now 79 | try: 80 | prefill_content = "{" 81 | if response_format == "json_object": # prefill 82 | messages.append({"role": "assistant", "content": prefill_content}) 83 | 84 | response = litellm.completion( 85 | model=( 86 | self.name 87 | if not self.name.startswith("litellm-") 88 | else self.name[len("litellm-") :] 89 | ), 90 | messages=messages, 91 | temperature=common.MODEL_TEMP, 92 | max_tokens=4096, 93 | response_format={"type": response_format}, 94 | top_p=top_p, 95 | base_url=os.getenv("OPENAI_API_BASE_URL", None), 96 | stream=False, 97 | ) 98 | assert isinstance(response, ModelResponse) 99 | resp_usage = response.usage 100 | assert resp_usage is not None 101 | input_tokens = int(resp_usage.prompt_tokens) 102 | output_tokens = int(resp_usage.completion_tokens) 103 | cost = self.calc_cost(input_tokens, output_tokens) 104 | 105 | common.thread_cost.process_cost += cost 106 | common.thread_cost.process_input_tokens += input_tokens 107 | common.thread_cost.process_output_tokens += output_tokens 108 | 109 | first_resp_choice = response.choices[0] 110 | assert isinstance(first_resp_choice, Choices) 111 | resp_msg: Message = first_resp_choice.message 112 | content = self.extract_resp_content(resp_msg) 113 | if response_format == "json_object": 114 | # prepend the prefilled character 115 | if not content.startswith(prefill_content): 116 | content = prefill_content + content 117 | 118 | return content, cost, input_tokens, output_tokens 119 | 120 | except BadRequestError as e: 121 | if e.code == "context_length_exceeded": 122 | log_and_print("Context length exceeded") 123 | raise e 124 | 125 | 126 | class Gpt4o_20240513LiteLLM(OpenaiLiteLLMModel): 127 | def __init__(self): 128 | super().__init__( 129 | "litellm-gpt-4o-2024-05-13", 0.000005, 0.000015, parallel_tool_call=True 130 | ) 131 | self.note = "Multimodal model. Up to Oct 2023." 
132 | 133 | 134 | class Gpt4_Turbo20240409LiteLLM(OpenaiLiteLLMModel): 135 | def __init__(self): 136 | super().__init__( 137 | "litellm-gpt-4-turbo-2024-04-09", 0.00001, 0.00003, parallel_tool_call=True 138 | ) 139 | self.note = "Turbo with vision. Up to Dec 2023." 140 | 141 | 142 | class Gpt4_0125PreviewLiteLLM(OpenaiLiteLLMModel): 143 | def __init__(self): 144 | super().__init__( 145 | "litellm-gpt-4-0125-preview", 0.00001, 0.00003, parallel_tool_call=True 146 | ) 147 | self.note = "Turbo. Up to Dec 2023." 148 | 149 | 150 | class Gpt4_1106PreviewLiteLLM(OpenaiLiteLLMModel): 151 | def __init__(self): 152 | super().__init__( 153 | "litellm-gpt-4-1106-preview", 0.00001, 0.00003, parallel_tool_call=True 154 | ) 155 | self.note = "Turbo. Up to Apr 2023." 156 | 157 | 158 | class Gpt35_Turbo0125LiteLLM(OpenaiLiteLLMModel): 159 | # cheapest gpt model 160 | def __init__(self): 161 | super().__init__( 162 | "litellm-gpt-3.5-turbo-0125", 0.0000005, 0.0000015, parallel_tool_call=True 163 | ) 164 | self.note = "Turbo. Up to Sep 2021." 165 | 166 | 167 | class Gpt35_Turbo1106LiteLLM(OpenaiLiteLLMModel): 168 | def __init__(self): 169 | super().__init__( 170 | "litellm-gpt-3.5-turbo-1106", 0.000001, 0.000002, parallel_tool_call=True 171 | ) 172 | self.note = "Turbo. Up to Sep 2021." 173 | 174 | 175 | class Gpt35_Turbo16k_0613LiteLLM(OpenaiLiteLLMModel): 176 | def __init__(self): 177 | super().__init__("litellm-gpt-3.5-turbo-16k-0613", 0.000003, 0.000004) 178 | self.note = "Turbo. Deprecated. Up to Sep 2021." 179 | 180 | 181 | class Gpt35_Turbo0613LiteLLM(OpenaiLiteLLMModel): 182 | def __init__(self): 183 | super().__init__("litellm-gpt-3.5-turbo-0613", 0.0000015, 0.000002) 184 | self.note = "Turbo. Deprecated. Only 4k window. Up to Sep 2021." 185 | 186 | 187 | class Gpt4_0613LiteLLM(OpenaiLiteLLMModel): 188 | def __init__(self): 189 | super().__init__("litellm-gpt-4-0613", 0.00003, 0.00006) 190 | self.note = "Not turbo. Up to Sep 2021." 191 | 192 | -------------------------------------------------------------------------------- /app/log.py: -------------------------------------------------------------------------------- 1 | import time 2 | from collections.abc import Callable 3 | from os import get_terminal_size 4 | 5 | from loguru import logger 6 | from rich.console import Console 7 | from rich.markdown import Markdown 8 | from rich.markup import escape 9 | from rich.panel import Panel 10 | import logging 11 | from pathlib import Path 12 | import threading 13 | 14 | logger_lock = threading.Lock() 15 | 16 | def terminal_width(): 17 | try: 18 | return get_terminal_size().columns 19 | except OSError: 20 | return 80 21 | 22 | 23 | WIDTH = min(120, terminal_width() - 10) 24 | 25 | console = Console() 26 | 27 | print_stdout = True 28 | 29 | 30 | def log_exception(exception): 31 | logger.exception(exception) 32 | 33 | 34 | def print_banner(msg: str) -> None: 35 | if not print_stdout: 36 | return 37 | 38 | banner = f" {msg} ".center(WIDTH, "=") 39 | console.print() 40 | console.print(banner, style="bold") 41 | console.print() 42 | 43 | 44 | def replace_html_tags(content: str): 45 | """ 46 | Helper method to process the content before printing to markdown. 
47 | """ 48 | replace_dict = { 49 | "": "[file]", 50 | "": "[class]", 51 | "": "[func]", 52 | "": "[method]", 53 | "": "[code]", 54 | "": "[original]", 55 | "": "[patched]", 56 | "": "[/file]", 57 | "": "[/class]", 58 | "": "[/func]", 59 | "": "[/method]", 60 | "": "[/code]", 61 | "": "[/original]", 62 | "": "[/patched]", 63 | } 64 | for key, value in replace_dict.items(): 65 | content = content.replace(key, value) 66 | return content 67 | 68 | 69 | def print_acr( 70 | msg: str, desc="", print_callback: Callable[[dict], None] | None = None 71 | ) -> None: 72 | if not print_stdout: 73 | return 74 | 75 | msg = replace_html_tags(msg) 76 | markdown = Markdown(msg) 77 | 78 | name = "SweEnvSetupAgent" 79 | if desc: 80 | title = f"{name} ({desc})" 81 | else: 82 | title = name 83 | 84 | panel = Panel( 85 | markdown, title=title, title_align="left", border_style="magenta", width=WIDTH 86 | ) 87 | console.print(panel) 88 | 89 | if print_callback: 90 | print_callback( 91 | {"title": f"{name} ({desc})", "message": msg, "category": "SweEnvSetupAgent"} 92 | ) 93 | 94 | 95 | def print_retrieval( 96 | msg: str, desc="", print_callback: Callable[[dict], None] | None = None 97 | ) -> None: 98 | if not print_stdout: 99 | return 100 | 101 | msg = replace_html_tags(msg) 102 | markdown = Markdown(msg) 103 | 104 | name = "Context Retrieval Agent" 105 | if desc: 106 | title = f"{name} ({desc})" 107 | else: 108 | title = name 109 | 110 | panel = Panel( 111 | markdown, title=title, title_align="left", border_style="blue", width=WIDTH 112 | ) 113 | console.print(panel) 114 | if print_callback: 115 | print_callback( 116 | { 117 | "title": f"{name} ({desc})", 118 | "message": msg, 119 | "category": "context_retrieval_agent", 120 | } 121 | ) 122 | 123 | 124 | def print_patch_generation( 125 | msg: str, desc="", print_callback: Callable[[dict], None] | None = None 126 | ) -> None: 127 | if not print_stdout: 128 | return 129 | 130 | msg = replace_html_tags(msg) 131 | markdown = Markdown(msg) 132 | 133 | name = "Patch Generation" 134 | if desc: 135 | title = f"{name} ({desc})" 136 | else: 137 | title = name 138 | 139 | panel = Panel( 140 | markdown, title=title, title_align="left", border_style="yellow", width=WIDTH 141 | ) 142 | console.print(panel) 143 | if print_callback: 144 | print_callback( 145 | { 146 | "title": f"{name} ({desc})", 147 | "message": msg, 148 | "category": "patch_generation", 149 | } 150 | ) 151 | 152 | 153 | def print_fix_loc_generation( 154 | msg: str, desc="", print_callback: Callable[[dict], None] | None = None 155 | ) -> None: 156 | if not print_stdout: 157 | return 158 | 159 | msg = replace_html_tags(msg) 160 | markdown = Markdown(msg) 161 | 162 | name = "Fix Location Generation" 163 | if desc: 164 | title = f"{name} ({desc})" 165 | else: 166 | title = name 167 | 168 | panel = Panel( 169 | markdown, title=title, title_align="left", border_style="green", width=WIDTH 170 | ) 171 | console.print(panel) 172 | if print_callback: 173 | print_callback( 174 | { 175 | "title": f"{name} ({desc})", 176 | "message": msg, 177 | "category": "fix_loc_generation", 178 | } 179 | ) 180 | 181 | 182 | def print_issue(content: str) -> None: 183 | if not print_stdout: 184 | return 185 | 186 | title = "Issue description" 187 | panel = Panel( 188 | escape(content), 189 | title=title, 190 | title_align="left", 191 | border_style="red", 192 | width=WIDTH, 193 | ) 194 | console.print(panel) 195 | 196 | 197 | def log_and_print(msg): 198 | logger.info(msg) 199 | if print_stdout: 200 | console.print(msg) 201 | 202 | 203 | 
def log_and_cprint(msg, **kwargs): 204 | logger.info(msg) 205 | if print_stdout: 206 | console.print(msg, **kwargs) 207 | 208 | 209 | def log_and_always_print(msg): 210 | """ 211 | A mode which always prints to stdout, no matter what. 212 | Useful when running multiple tasks and we just want to see the important information. 213 | """ 214 | logger.info(msg) 215 | # always include time for important messages 216 | t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 217 | console.print(f"\n[{t}] {msg}") 218 | 219 | 220 | def print_with_time(msg): 221 | """ 222 | Print a msg to console with timestamp. 223 | """ 224 | t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 225 | console.print(f"\n[{t}] {msg}") 226 | 227 | 228 | def setup_logger(instance_id: str, log_file: Path, mode="w"): 229 | """ 230 | This logger is used for logging the build process of images and containers. 231 | It writes logs to the log file. 232 | """ 233 | with logger_lock: 234 | log_file.parent.mkdir(parents=True, exist_ok=True) 235 | new_logger = logging.getLogger(f"{instance_id}.{log_file.name}") 236 | handler = logging.FileHandler(log_file, mode=mode) 237 | formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") 238 | handler.setFormatter(formatter) 239 | new_logger.addHandler(handler) 240 | new_logger.setLevel(logging.INFO) 241 | new_logger.propagate = False 242 | setattr(new_logger, "log_file", log_file) 243 | return new_logger 244 | 245 | def close_logger(new_logger): 246 | # To avoid too many open files 247 | with logger_lock: 248 | for handler in new_logger.handlers: 249 | handler.close() 250 | new_logger.removeHandler(handler) -------------------------------------------------------------------------------- /app/data_structures.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections.abc import Mapping 3 | from dataclasses import dataclass 4 | from pprint import pformat 5 | import base64 6 | import httpx 7 | 8 | from openai.types.chat import ChatCompletionMessageToolCall 9 | from openai.types.chat.chat_completion_message_tool_call import ( 10 | Function as OpenaiFunction, 11 | ) 12 | 13 | 14 | @dataclass 15 | class MethodId: 16 | class_name: str 17 | method_name: str 18 | 19 | def __str__(self): 20 | if self.class_name: 21 | return f"{self.class_name}.{self.method_name}" 22 | return self.method_name 23 | 24 | def __hash__(self): 25 | return hash((self.class_name, self.method_name)) 26 | 27 | 28 | class FunctionCallIntent: 29 | """An intent to call a tool function. 30 | 31 | This object is created from an OpenAI API response. 32 | """ 33 | 34 | def __init__( 35 | self, 36 | func_name: str, 37 | arguments: Mapping[str, str], 38 | openai_func: OpenaiFunction | None, 39 | ): 40 | self.func_name = func_name 41 | self.arg_values = dict() 42 | self.arg_values.update(arguments) 43 | # record the original openai function object, 44 | # which is used when we want to tell the model that it has 45 | # previously called this function/tool 46 | self.openai_func = openai_func or OpenaiFunction( 47 | arguments=json.dumps(arguments), name=func_name 48 | ) 49 | 50 | def __str__(self): 51 | return f"Call function `{self.func_name}` with arguments {self.arg_values}."
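A short sketch of how `FunctionCallIntent` can be used (the tool name and arguments are hypothetical; when no `OpenaiFunction` is supplied, the constructor synthesizes one from the name and the JSON-encoded arguments):

```python
from app.data_structures import FunctionCallIntent

intent = FunctionCallIntent("execute_bash", {"command": "ls"}, None)
print(intent)
# Call function `execute_bash` with arguments {'command': 'ls'}.
print(intent.to_dict_with_result(call_ok=True, result="app\n", agent_id="env"))
```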
52 | 53 | def to_dict(self): 54 | return {"func_name": self.func_name, "arguments": self.arg_values} 55 | 56 | def to_dict_with_result(self, call_ok: bool, result: str | None = None, agent_id: str | None = None): 57 | return { 58 | "func_name": self.func_name, 59 | "arguments": self.arg_values, 60 | "call_ok": call_ok, 61 | "result": result, 62 | "agent_id": agent_id 63 | } 64 | 65 | 66 | class MessageThread: 67 | """ 68 | Represents a thread of conversation with the model. 69 | Abstracted into a class so that we can dump this to a file at any point. 70 | """ 71 | 72 | def __init__(self, messages=None): 73 | self.messages: list[dict] = messages or [] 74 | 75 | def add(self, role: str, message: str): 76 | """ 77 | Add a new message to the thread. 78 | Args: 79 | role (str): The role of the new message. 80 | message (str): The content of the new message. 81 | """ 82 | self.messages.append({"role": role, "content": message}) 83 | 84 | def add_system(self, message: str): 85 | self.messages.append({"role": "system", "content": message}) 86 | 87 | def add_user(self, message: str): 88 | self.messages.append({"role": "user", "content": message}) 89 | 90 | def add_image(self, messages: list[str]): 91 | def encode_image(image_path): 92 | with open(image_path, "rb") as image_file: 93 | return base64.b64encode(image_file.read()).decode("utf-8") 94 | 95 | for idx, message in enumerate(messages): 96 | if 'imgur' in message: 97 | if 'Ow4tDFX' in message: 98 | image1_data = encode_image('temp.jpeg') 99 | else: 100 | image1_data = encode_image('temp1.jpeg') 101 | self.messages.append({ 102 | "role": "user", 103 | "content": [ 104 | { 105 | "type": "text", 106 | "text": "<image>:\n" 107 | }, 108 | { 109 | "type": "image_url", 110 | "image_url": {"url": f"data:image/jpeg;base64,{image1_data}"}, 111 | } 112 | ] 113 | }) 114 | else: 115 | message = message.replace('snipboard', 'i.snipboard') 116 | self.messages.append({"role": "user", "content": [ 117 | {'type': 'text', 'text': f'<image_{idx}>:\n'}, 118 | { 119 | "type": "image_url", 120 | "image_url": { 121 | "url": message 122 | }, 123 | }, 124 | 125 | ]}) 126 | 127 | def add_tool(self, message: str, tool_call_id: str): 128 | m = {"role": "tool", "content": message, "tool_call_id": tool_call_id} 129 | self.messages.append(m) 130 | 131 | def add_model( 132 | self, message: str | None, tools: list[ChatCompletionMessageToolCall] 133 | ): 134 | # let's serialize tools into json first 135 | json_tools = [] 136 | for tool in tools: 137 | this_tool_dict = {} 138 | this_tool_dict["id"] = tool.id 139 | this_tool_dict["type"] = tool.type 140 | # now serialize function as well 141 | func_obj: OpenaiFunction = tool.function 142 | func_args: str = func_obj.arguments 143 | func_name: str = func_obj.name 144 | this_tool_dict["function"] = {"name": func_name, "arguments": func_args} 145 | json_tools.append(this_tool_dict) 146 | 147 | if json_tools == []: 148 | # there are no tool calls from the model last time, 149 | # the best we could do is to return the generated text 150 | self.messages.append({"role": "assistant", "content": message}) 151 | else: 152 | self.messages.append( 153 | {"role": "assistant", "content": None, "tool_calls": json_tools} 154 | ) 155 | 156 | def to_msg(self) -> list[dict]: 157 | """ 158 | Convert to the format to be consumed by the model. 159 | Returns: 160 | List[Dict]: The message thread. 
161 | """ 162 | return self.messages 163 | 164 | def __str__(self): 165 | return pformat(self.messages, width=160, sort_dicts=False) 166 | 167 | def save_to_file(self, file_path: str): 168 | """ 169 | Save the current state of the message thread to a file. 170 | Args: 171 | file_path (str): The path to the file. 172 | """ 173 | with open(file_path, "w") as f: 174 | json.dump(self.messages, f, indent=4) 175 | 176 | def get_round_number(self) -> int: 177 | """ 178 | From the current message history, decide how many rounds have been completed. 179 | """ 180 | completed_rounds = 0 181 | for message in self.messages: 182 | if message["role"] == "assistant": 183 | completed_rounds += 1 184 | return completed_rounds 185 | 186 | @classmethod 187 | def load_from_file(cls, file_path: str): 188 | """ 189 | Load the message thread from a file. 190 | Args: 191 | file_path (str): The path to the file. 192 | Returns: 193 | MessageThread: The message thread. 194 | """ 195 | with open(file_path) as f: 196 | messages = json.load(f) 197 | return cls(messages) 198 | -------------------------------------------------------------------------------- /data_collection/versioning/get_versions_by_git.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import shutil 4 | import subprocess 5 | import re 6 | import json 7 | import argparse 8 | from contextlib import contextmanager 9 | from typing import List, Dict 10 | from concurrent.futures import ProcessPoolExecutor, as_completed 11 | import glob 12 | @contextmanager 13 | def cd(newdir): 14 | prevdir = os.getcwd() 15 | os.chdir(os.path.expanduser(newdir)) 16 | try: 17 | yield 18 | finally: 19 | os.chdir(prevdir) 20 | 21 | 22 | def run_command(cmd: List[str], **kwargs) -> subprocess.CompletedProcess: 23 | try: 24 | return subprocess.run(cmd, check=True, **kwargs) 25 | except subprocess.CalledProcessError as e: 26 | print(f"Error running command: {cmd}, {e}") 27 | raise 28 | 29 | 30 | def get_version_by_git(cloned_dir: str) -> str: 31 | if not os.path.isdir(cloned_dir): 32 | raise NotADirectoryError(f"Invalid directory: {cloned_dir}") 33 | with cd(cloned_dir): 34 | result = run_command(["git", "describe", "--tags"], capture_output=True, text=True) 35 | version = result.stdout.strip() 36 | print(f"✔️ Current version: {version}") 37 | match = re.search(r"(\d+\.\d+)(?:\.\d+)?", version) 38 | if match: 39 | return match.group(1) 40 | raise RuntimeError(f"Unrecognized version: {version}") 41 | 42 | 43 | def get_instances(instance_path: str) -> List[Dict]: 44 | if instance_path.endswith((".jsonl", ".jsonl.all")): 45 | with open(instance_path, encoding="utf-8") as f: 46 | return [json.loads(line) for line in f] 47 | with open(instance_path, encoding="utf-8") as f: 48 | return json.load(f) 49 | 50 | 51 | def prepare_repo_cache(tasks: List[Dict], cache_dir: str) -> Dict[str, str]: 52 | os.makedirs(cache_dir, exist_ok=True) 53 | repo_cache = {} 54 | for task in tasks: 55 | repo = task["repo"] 56 | if repo in repo_cache: 57 | continue 58 | repo_url = f"https://github.com/{repo}.git" 59 | local_path = os.path.join(cache_dir, repo.replace("/", "__")) 60 | try: 61 | run_command(["git", "clone", repo_url, local_path], capture_output=True) 62 | repo_cache[repo] = local_path 63 | print(f"✅ Cached repo: {repo}") 64 | except Exception as e: 65 | print(f"❌ Failed to clone {repo}: {e}") 66 | return repo_cache 67 | 68 | 69 | def process_repo_task(task: Dict, testbed: str, repo_cache: Dict[str, str]) -> Dict | None: 
70 | instance_id = task["instance_id"] 71 | repo = task["repo"] 72 | base_commit = task["base_commit"] 73 | repo_dir = os.path.join(testbed, instance_id) 74 | os.makedirs(repo_dir, exist_ok=True) 75 | try: 76 | cached_repo = repo_cache.get(repo) 77 | if not cached_repo or not os.path.exists(cached_repo): 78 | raise RuntimeError(f"Missing cached repo for {repo}") 79 | shutil.copytree(cached_repo, repo_dir, dirs_exist_ok=True) 80 | with cd(repo_dir): 81 | run_command(["git", "checkout", base_commit], capture_output=True) 82 | version = get_version_by_git(repo_dir) 83 | result = task.copy() 84 | result["version"] = version 85 | return result 86 | except Exception as e: 87 | print(f"❌ Failed: {instance_id} | {e}") 88 | return None 89 | finally: 90 | shutil.rmtree(repo_dir, ignore_errors=True) 91 | 92 | 93 | def process_repos(tasks: List[Dict], testbed: str, repo_cache: Dict[str, str], max_workers: int = 4) -> List[Dict]: 94 | os.makedirs(testbed, exist_ok=True) 95 | results = [] 96 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 97 | futures = [executor.submit(process_repo_task, t, testbed, repo_cache) for t in tasks] 98 | for future in as_completed(futures): 99 | res = future.result() 100 | if res: 101 | results.append(res) 102 | return results 103 | 104 | 105 | def save_results(results: List[Dict], output_path: str): 106 | os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) 107 | if output_path.endswith((".jsonl", ".jsonl.all")): 108 | with open(output_path, "w", encoding="utf-8") as f: 109 | for r in results: 110 | f.write(json.dumps(r, ensure_ascii=False) + "\n") 111 | else: 112 | with open(output_path, "w", encoding="utf-8") as f: 113 | json.dump(results, f, indent=2, ensure_ascii=False) 114 | 115 | 116 | def generate_output_path(instance_path: str, suffix: str) -> str: 117 | base, ext = os.path.splitext(instance_path) 118 | ext = '.json'  # always emit .json, regardless of the input extension 119 | return f"{base}{suffix}{ext}" 120 | 121 | def find_github_file(output_dir: str) -> str | None: 122 | """ 123 | Locate the *_versions_by_github output file produced by the previous stage. 124 | """ 125 | # match any _versions_by_github.json or .jsonl file in the directory 126 | for ext in ('json', 'jsonl'): 127 | pattern = os.path.join(output_dir, f"*_versions_by_github.{ext}") 128 | matches = glob.glob(pattern) 129 | if matches: 130 | return matches[0] 131 | return None 132 | 133 | 134 | def main(): 135 | parser = argparse.ArgumentParser() 136 | parser.add_argument("--instance_path", "-i", type=str, required=True, 137 | help="Path to input task file (.json or .jsonl)") 138 | parser.add_argument("--testbed", "-t", type=str, default="testbed", 139 | help="Temp working directory for cloning repos") 140 | parser.add_argument("--max_workers", "-w", type=int, default=10, 141 | help="Number of parallel workers") 142 | parser.add_argument("--output_dir", "-d", type=str, default=None, 143 | help="Directory to save output (keeps original filename + suffix)") 144 | parser.add_argument("--last_stage_output_dir", "-l", type=str, default=None, 145 | help="Directory containing the previous stage's *_versions_by_github output, used to skip already-processed tasks") 146 | args = parser.parse_args() 147 | 148 | 149 | try: 150 | tasks = get_instances(args.instance_path) 151 | except Exception as e: 152 | print(f"❌ Error reading instance file: {e}") 153 | return 154 | 155 | 156 | github_file = find_github_file(args.last_stage_output_dir) if args.last_stage_output_dir else None 157 | 158 | if github_file: 159 | try: 160 | processed = get_instances(github_file) 161 | seen = {item.get('instance_id') for item in processed if 'instance_id' in item} 162 | before = len(tasks) 163 | tasks = [t for t in tasks if t.get('instance_id') 
not in seen] 164 | print(f"ℹ️ Skipped {before - len(tasks)} tasks already in {os.path.basename(github_file)}") 165 | except Exception as e: 166 | print(f"⚠️ Failed to read GitHub versions file: {e}") 167 | 168 | for t in tasks: 169 | if not {"repo", "base_commit", "instance_id"}.issubset(t): 170 | print(f"Invalid task format: {t}") 171 | return 172 | 173 | 174 | cache_dir = os.path.join(args.testbed, "_cache") 175 | repo_cache = prepare_repo_cache(tasks, cache_dir) 176 | results = process_repos(tasks, args.testbed, repo_cache, args.max_workers) 177 | 178 | tmp = generate_output_path(args.instance_path, "_versions_by_git") 179 | if args.output_dir: 180 | output_path = os.path.join(args.output_dir, os.path.basename(tmp)) 181 | else: 182 | output_path = tmp 183 | 184 | save_results(results, output_path) 185 | print(f"\n✅ {len(results)} results saved to {output_path}") 186 | 187 | if __name__ == "__main__": 188 | main() 189 | -------------------------------------------------------------------------------- /app/model/common.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import threading 4 | from abc import ABC, abstractmethod 5 | from typing import Literal 6 | 7 | import litellm 8 | from litellm import cost_per_token 9 | from litellm.utils import Choices, Message, ModelResponse 10 | from openai import BadRequestError 11 | from tenacity import retry, stop_after_attempt, wait_random_exponential 12 | 13 | from app.log import log_and_cprint, log_and_print 14 | 15 | # Variables for each process. Since models are singleton objects, their references are copied 16 | # to each process, but they all point to the same objects. For safe updating costs per process, 17 | # we define the accumulators here. 18 | 19 | thread_cost = threading.local() 20 | thread_cost.process_cost = 0.0 21 | thread_cost.process_input_tokens = 0 22 | thread_cost.process_output_tokens = 0 23 | 24 | 25 | class Model(ABC): 26 | def __init__( 27 | self, 28 | name: str, 29 | cost_per_input: float, 30 | cost_per_output: float, 31 | parallel_tool_call: bool = False, 32 | ): 33 | self.name: str = name 34 | # cost stats - zero for local models 35 | self.cost_per_input: float = cost_per_input 36 | self.cost_per_output: float = cost_per_output 37 | # whether the model supports parallel tool call 38 | self.parallel_tool_call: bool = parallel_tool_call 39 | 40 | @abstractmethod 41 | def check_api_key(self) -> str: 42 | raise NotImplementedError("abstract base class") 43 | 44 | @abstractmethod 45 | def setup(self) -> None: 46 | raise NotImplementedError("abstract base class") 47 | 48 | @abstractmethod 49 | def call(self, messages: list[dict], **kwargs): 50 | raise NotImplementedError("abstract base class") 51 | 52 | def calc_cost(self, input_tokens: int, output_tokens: int) -> float: 53 | """ 54 | Calculates the cost of a request based on the number of input/output tokens. 
55 | """ 56 | input_cost = self.cost_per_input * input_tokens 57 | output_cost = self.cost_per_output * output_tokens 58 | cost = input_cost + output_cost 59 | log_and_cprint( 60 | f"Model API request cost info: " 61 | f"input_tokens={input_tokens}, output_tokens={output_tokens}, cost={cost:.6f}", 62 | style="yellow", 63 | ) 64 | return cost 65 | 66 | def get_overall_exec_stats(self): 67 | return { 68 | "model": self.name, 69 | "input_cost_per_token": self.cost_per_input, 70 | "output_cost_per_token": self.cost_per_output, 71 | "total_input_tokens": thread_cost.process_input_tokens, 72 | "total_output_tokens": thread_cost.process_output_tokens, 73 | "total_tokens": thread_cost.process_input_tokens 74 | + thread_cost.process_output_tokens, 75 | "total_cost": thread_cost.process_cost, 76 | } 77 | 78 | 79 | class LiteLLMGeneric(Model): 80 | """ 81 | Base class for creating instances of LiteLLM-supported models. 82 | """ 83 | 84 | _instances = {} 85 | 86 | def __new__(cls, model_name: str, cost_per_input: float, cost_per_output: float): 87 | if model_name not in cls._instances: 88 | cls._instances[model_name] = super().__new__(cls) 89 | cls._instances[model_name]._initialized = False 90 | return cls._instances[model_name] 91 | 92 | def __init__( 93 | self, 94 | name: str, 95 | cost_per_input: float, 96 | cost_per_output: float, 97 | parallel_tool_call: bool = False, 98 | ): 99 | if self._initialized: 100 | return 101 | super().__init__(name, cost_per_input, cost_per_output, parallel_tool_call) 102 | self._initialized = True 103 | 104 | def setup(self) -> None: 105 | """ 106 | Check API key. 107 | """ 108 | pass 109 | 110 | def check_api_key(self) -> str: 111 | return "" 112 | 113 | def extract_resp_content(self, chat_message: Message) -> str: 114 | """ 115 | Given a chat completion message, extract the content from it. 
116 | """ 117 | content = chat_message.content 118 | if content is None: 119 | return "" 120 | else: 121 | return content 122 | 123 | @retry(wait=wait_random_exponential(min=30, max=600), stop=stop_after_attempt(3)) 124 | def call( 125 | self, 126 | messages: list[dict], 127 | top_p=1, 128 | tools=None, 129 | response_format: Literal["text", "json_object"] = "text", 130 | **kwargs, 131 | ): 132 | # FIXME: ignore tools field since we don't use tools now 133 | try: 134 | prefill_content = "{" 135 | if response_format == "json_object": # prefill 136 | messages.append({"role": "assistant", "content": prefill_content}) 137 | 138 | response = litellm.completion( 139 | model=self.name, 140 | messages=messages, 141 | temperature=MODEL_TEMP, 142 | max_tokens=os.getenv("ACR_TOKEN_LIMIT", 1024), 143 | response_format=( 144 | {"type": response_format} if "gpt" in self.name else None 145 | ), 146 | top_p=top_p, 147 | stream=False, 148 | ) 149 | assert isinstance(response, ModelResponse) 150 | resp_usage = response.usage 151 | assert resp_usage is not None 152 | input_tokens = int(resp_usage.prompt_tokens) 153 | output_tokens = int(resp_usage.completion_tokens) 154 | cost = self.calc_cost(input_tokens, output_tokens) 155 | 156 | thread_cost.process_cost += cost 157 | thread_cost.process_input_tokens += input_tokens 158 | thread_cost.process_output_tokens += output_tokens 159 | 160 | first_resp_choice = response.choices[0] 161 | assert isinstance(first_resp_choice, Choices) 162 | resp_msg: Message = first_resp_choice.message 163 | content = self.extract_resp_content(resp_msg) 164 | if response_format == "json_object": 165 | # prepend the prefilled character 166 | if not content.startswith(prefill_content): 167 | content = prefill_content + content 168 | 169 | return content, cost, input_tokens, output_tokens 170 | 171 | except BadRequestError as e: 172 | if e.code == "context_length_exceeded": 173 | log_and_print("Context length exceeded") 174 | raise e 175 | 176 | 177 | MODEL_HUB = {} 178 | 179 | 180 | def register_model(model: Model): 181 | global MODEL_HUB 182 | MODEL_HUB[model.name] = model 183 | 184 | 185 | def get_all_model_names(): 186 | return list(MODEL_HUB.keys()) 187 | 188 | 189 | # To be set at runtime - the selected model for a run 190 | SELECTED_MODEL: Model 191 | 192 | 193 | def set_model(model_name: str): 194 | global SELECTED_MODEL 195 | if model_name not in MODEL_HUB and not model_name.startswith("litellm-generic-"): 196 | print(f"Invalid model name: {model_name}") 197 | sys.exit(1) 198 | if model_name.startswith("litellm-generic-"): 199 | real_model_name = model_name.removeprefix("litellm-generic-") 200 | prompt_tokens = 5 201 | completion_tokens = 10 202 | prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = ( 203 | cost_per_token( 204 | model=real_model_name, 205 | prompt_tokens=prompt_tokens, 206 | completion_tokens=completion_tokens, 207 | ) 208 | ) 209 | # litellm.set_verbose = True 210 | SELECTED_MODEL = LiteLLMGeneric( 211 | real_model_name, 212 | prompt_tokens_cost_usd_dollar, 213 | completion_tokens_cost_usd_dollar, 214 | ) 215 | else: 216 | SELECTED_MODEL = MODEL_HUB[model_name] 217 | SELECTED_MODEL.setup() 218 | 219 | 220 | # the model temperature to use 221 | # For OpenAI models: this value should be from 0 to 2 222 | MODEL_TEMP: float = 0.0 223 | -------------------------------------------------------------------------------- /app/agents/train_env_gen_agent/tools/search.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python3 2 | 3 | """ 4 | Description: Search for a term in either a directory or a single file. 5 | 6 | Behavior: 7 | * If `--path` points to a directory (default is `.`), we recursively search all non-hidden files and directories. 8 | * If `--path` points to a file, we run `grep -n` on that file to find line numbers containing the search term. 9 | * If more than 100 files match (directory search scenario), the tool will stop listing and inform you to narrow your search. 10 | * If no files are found that match your search term, the tool will inform you of that as well. 11 | 12 | **Parameters:** 13 | 1. **search_term** (`string`, required): The term to search for in files. 14 | 2. **path** (`string`, optional): The file or directory in which to search. If not provided, defaults to the current directory (i.e., `.`). 15 | """ 16 | 17 | import argparse 18 | import os 19 | import sys 20 | import subprocess 21 | 22 | def search_in_directory(search_term: str, directory: str = ".", python_only: bool = False): 23 | """ 24 | Searches for `search_term` in all non-hidden files under `directory` 25 | (or only in .py files if `python_only=True`), excluding hidden directories. 26 | Prints how many matches were found per file. 27 | """ 28 | directory = os.path.realpath(directory) 29 | 30 | if not os.path.isdir(directory): 31 | print(f"Directory '{directory}' not found or not a directory.") 32 | sys.exit(1) 33 | 34 | matches = {} 35 | num_files_matched = 0 36 | 37 | for root, dirs, files in os.walk(directory): 38 | # Exclude hidden directories 39 | dirs[:] = [d for d in dirs if not d.startswith(".")] 40 | for file in files: 41 | # Skip hidden files 42 | if file.startswith("."): 43 | continue 44 | 45 | # If --python_only is set, only search .py files 46 | if python_only and not file.endswith(".py"): 47 | continue 48 | 49 | filepath = os.path.join(root, file) 50 | try: 51 | with open(filepath, "r", errors="ignore") as f: 52 | file_matches = 0 53 | for line_num, line in enumerate(f, 1): 54 | if search_term in line: 55 | file_matches += 1 56 | if file_matches > 0: 57 | matches[filepath] = file_matches 58 | num_files_matched += 1 59 | except (UnicodeDecodeError, PermissionError): 60 | # Skip files that can't be read 61 | continue 62 | 63 | if not matches: 64 | print(f'No matches found for "{search_term}" in {directory}') 65 | sys.exit(0) 66 | 67 | # Summarize 68 | num_matches = sum(matches.values()) 69 | if num_files_matched > 100: 70 | print( 71 | f'More than {num_files_matched} files matched for "{search_term}" in {directory}. ' 72 | "Please narrow your search." 73 | ) 74 | sys.exit(0) 75 | 76 | print(f'Found {num_matches} matches for "{search_term}" in {directory}:') 77 | 78 | # Print matched files 79 | for filepath, count in matches.items(): 80 | relative_path = os.path.relpath(filepath, start=os.getcwd()) 81 | if not relative_path.startswith("./"): 82 | relative_path = "./" + relative_path 83 | print(f"{relative_path} ({count} matches)") 84 | 85 | print(f'End of matches for "{search_term}" in {directory}') 86 | 87 | def search_in_directory_old(search_term: str, directory: str = ".", python_only=False): 88 | """ 89 | Searches for `search_term` in all non-hidden files under `directory`, 90 | excluding hidden directories. Prints how many matches were found per file. 
91 | """ 92 | directory = os.path.realpath(directory) 93 | 94 | if not os.path.isdir(directory): 95 | print(f"Directory '{directory}' not found or not a directory.") 96 | sys.exit(1) 97 | 98 | matches = {} 99 | num_files_matched = 0 100 | 101 | for root, dirs, files in os.walk(directory): 102 | # Exclude hidden directories 103 | dirs[:] = [d for d in dirs if not d.startswith(".")] 104 | for file in files: 105 | # Skip hidden files 106 | if file.startswith("."): 107 | continue 108 | filepath = os.path.join(root, file) 109 | try: 110 | with open(filepath, "r", errors="ignore") as f: 111 | file_matches = 0 112 | for line_num, line in enumerate(f, 1): 113 | if search_term in line: 114 | file_matches += 1 115 | if file_matches > 0: 116 | matches[filepath] = file_matches 117 | num_files_matched += 1 118 | except (UnicodeDecodeError, PermissionError): 119 | # Skip files that can't be read 120 | continue 121 | 122 | if not matches: 123 | print(f'No matches found for "{search_term}" in {directory}') 124 | sys.exit(0) 125 | 126 | # Summarize 127 | num_matches = sum(matches.values()) 128 | if num_files_matched > 100: 129 | print( 130 | f'More than {num_files_matched} files matched for "{search_term}" in {directory}. ' 131 | "Please narrow your search." 132 | ) 133 | sys.exit(0) 134 | 135 | print(f'Found {num_matches} matches for "{search_term}" in {directory}:') 136 | 137 | # Print matched files 138 | for filepath, count in matches.items(): 139 | # Convert absolute path to relative path 140 | relative_path = os.path.relpath(filepath, start=os.getcwd()) 141 | if not relative_path.startswith("./"): 142 | relative_path = "./" + relative_path 143 | print(f"{relative_path} ({count} matches)") 144 | 145 | print(f'End of matches for "{search_term}" in {directory}') 146 | 147 | 148 | def search_in_file(search_term: str, filepath: str): 149 | """ 150 | Uses grep -n to search for `search_term` in a single file. 151 | Prints lines (with line numbers) where matches occur. 
152 | """ 153 | filepath = os.path.realpath(filepath) 154 | 155 | if not os.path.isfile(filepath): 156 | print(f"File '{filepath}' not found or is not a file.") 157 | sys.exit(1) 158 | 159 | try: 160 | # Try modern parameters if Python 3.7+ (capture_output, text) 161 | result = subprocess.run( 162 | ["grep", "-n", search_term, filepath], 163 | capture_output=True, 164 | text=True 165 | ) 166 | except TypeError: 167 | # Fallback for Python 3.5/3.6 168 | result = subprocess.run( 169 | ["grep", "-n", search_term, filepath], 170 | stdout=subprocess.PIPE, 171 | stderr=subprocess.PIPE, 172 | universal_newlines=True 173 | ) 174 | 175 | if result.returncode != 0: 176 | # grep exit code = 1 means "no matches", other non-zero exit code is a real error 177 | if result.returncode == 1: 178 | print(f'No matches found for "{search_term}" in {filepath}') 179 | sys.exit(0) 180 | else: 181 | # Something else went wrong 182 | print(f"Error executing grep:\n{result.stderr}") 183 | sys.exit(result.returncode) 184 | 185 | # Print the grep output directly 186 | print(f'Matches for "{search_term}" in {filepath}:') 187 | # Depending on the fallback, the output is in result.stdout 188 | print(result.stdout.strip()) 189 | # try: 190 | # # Run grep -n 191 | # result = subprocess.run( 192 | # ["grep", "-n", search_term, filepath], capture_output=True, text=True 193 | # ) 194 | # if result.returncode != 0: 195 | # # grep exit code = 1 means no matches 196 | # print(f'No matches found for "{search_term}" in {filepath}') 197 | # sys.exit(0) 198 | # # Print grep output directly 199 | # print(f'Matches for "{search_term}" in {filepath}:') 200 | # print(result.stdout.strip()) 201 | # except FileNotFoundError: 202 | # print( 203 | # "`grep` is not available on this system. Please install or use another method." 204 | # ) 205 | # sys.exit(1) 206 | 207 | 208 | def main(): 209 | parser = argparse.ArgumentParser( 210 | description="search tool: run subcommands such as `search` for files or directories." 211 | ) 212 | parser.add_argument( 213 | "--search_term", help="Term to search for in files.", required=True 214 | ) 215 | parser.add_argument( 216 | "--path", 217 | help="File or directory to search in (defaults to current dir).", 218 | default=".", 219 | ) 220 | # NEW ARGUMENT: 221 | parser.add_argument( 222 | "--python_only", 223 | default=True, 224 | help="If set, only search for matches in .py files when searching a directory." 
225 | ) 226 | 227 | args = parser.parse_args() 228 | # Check if path is a file or a directory 229 | if os.path.isfile(args.path): 230 | search_in_file(args.search_term, args.path) 231 | else: 232 | search_in_directory(args.search_term, args.path, python_only=args.python_only) 233 | 234 | 235 | if __name__ == "__main__": 236 | main() -------------------------------------------------------------------------------- /data_collection/collect/build_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import json 5 | import logging 6 | import os 7 | from typing import Optional 8 | from datetime import datetime 9 | from utils import Repo, extract_patches, extract_problem_statement_and_hints, extract_problem_statement_and_hints_with_official_github_api 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 13 | ) 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def create_instance(repo: Repo, pull: dict, output_path: str, mode: str ='swebench') -> dict: 18 | """ 19 | Create a single task instance from a pull request, where task instance is: 20 | 21 | { 22 | repo (str): owner/repo this task instance is from, 23 | pull_number (int): number of PR this task instance is from, 24 | base_commit (str): SHA of the base commit PR is based on, 25 | patch (str): reference solution as .patch (apply to base commit), 26 | test_patch (str): test suite as .patch (apply to base commit), 27 | } 28 | """ 29 | # try: 30 | patch, test_patch, request_success = extract_patches(pull, repo) 31 | # except Exception as e: 32 | # logger.info(e) 33 | # patch = "" 34 | # test_patch = "" 35 | instance_id = (repo.repo.full_name + "-" + str(pull["number"])).replace("/", "__") 36 | successful_path = os.path.join(os.path.dirname(output_path), "successful_requests.txt") 37 | if request_success: 38 | with open(successful_path, "a") as f: 39 | f.write(instance_id + "\n") 40 | 41 | if mode =='swebench': 42 | 43 | problem_statement, hints = extract_problem_statement_and_hints(pull, repo) 44 | else: 45 | problem_statement, hints = extract_problem_statement_and_hints_with_official_github_api(pull, repo) 46 | return { 47 | "repo": repo.repo.full_name, 48 | "pull_number": pull["number"], 49 | "instance_id": instance_id, 50 | "issue_numbers": pull["resolved_issues"], 51 | "base_commit": pull["base"]["sha"], 52 | "patch": patch, 53 | "test_patch": test_patch, 54 | "problem_statement": problem_statement, 55 | "hints_text": hints, 56 | "created_at": pull["created_at"], 57 | } 58 | 59 | 60 | def is_valid_pull(pull: dict) -> bool: 61 | """ 62 | Check whether PR has an associated issue and is merged 63 | 64 | Args: 65 | pull (dict): pull request object 66 | Returns: 67 | bool: whether PR is valid 68 | """ 69 | if pull["merged_at"] is None: 70 | # logger.info(f" not merged") 71 | return False 72 | if "resolved_issues" not in pull or len(pull["resolved_issues"]) < 1: 73 | # logger.info(f"no resolved_issues") 74 | return False 75 | 76 | return True 77 | 78 | 79 | def is_valid_instance(instance: dict) -> bool: 80 | """ 81 | Check whether task instance has all required fields for task instance creation 82 | 83 | Args: 84 | instance (dict): task instance object 85 | Returns: 86 | bool: whether task instance is valid 87 | """ 88 | if instance["patch"] is None or instance["patch"] == "": 89 | logger.info(f"Instance {instance['pull_number']} no patch") 90 | return False 91 | if 
instance["problem_statement"] is None or instance["problem_statement"] == "": 92 | logger.info(f"Instance {instance['pull_number']} no problem statement") 93 | return False 94 | return True 95 | 96 | 97 | def has_test_patch(instance: dict) -> bool: 98 | """ 99 | Check whether task instance has a test suite 100 | 101 | Args: 102 | instance (dict): task instance object 103 | Returns: 104 | bool: whether task instance has a test suite 105 | """ 106 | if instance["test_patch"] is None or instance["test_patch"].strip() == "": 107 | logger.info(f"Instance {instance['pull_number']} no test patch") 108 | return False 109 | return True 110 | 111 | def main(pr_file: str, output: str, token: Optional[str] = None,mode: Optional[str] = 'swebench',language: Optional[str] = 'python', cutoff_date: Optional[str] = None): 112 | """ 113 | Main thread for creating task instances from pull requests 114 | 115 | Args: 116 | pr_file (str): path to pull request JSONL file 117 | output (str): output file name 118 | token (str): GitHub token 119 | """ 120 | logger.info(f'Language: {language}') 121 | logger.info(f'mode: {mode}') 122 | cutoff_date = datetime.strptime(cutoff_date, "%Y-%m-%dT%H:%M:%SZ") 123 | if token is None: 124 | # Get GitHub token from environment variable if not provided 125 | token = os.environ["GITHUB_TOKEN"] 126 | 127 | def load_repo(repo_name,language): 128 | # Return repo object for a given repo name 129 | owner, repo = repo_name.split("/") 130 | return Repo(owner, repo, token=token,language=language) 131 | 132 | repos = dict() 133 | completed = 0 134 | with_tests = 0 135 | total_instances = 0 136 | all_output = output + ".all" 137 | seen_prs = set() 138 | 139 | successful_path = os.path.join(os.path.dirname(output), "successful_requests.txt") 140 | 141 | if not os.path.exists(successful_path): 142 | with open(successful_path, "w") as f: 143 | pass 144 | 145 | successful_instances = set() 146 | with open(successful_path, "r") as f: 147 | for line in f: 148 | successful_instances.add(line.strip()) 149 | 150 | # Continue where we left off if output file already exists 151 | if os.path.exists(all_output): 152 | with open(all_output) as f: 153 | for line in f: 154 | pr = json.loads(line) 155 | if "instance_id" not in pr: 156 | pr["instance_id"] = ( 157 | pr["repo"] + "-" + str(pr["pull_number"]) 158 | ).replace("/", "__") 159 | instance_id = pr["instance_id"] 160 | seen_prs.add(instance_id) 161 | if datetime.strptime(pr["created_at"], "%Y-%m-%dT%H:%M:%SZ") >= cutoff_date: 162 | logger.info(f"Instance {instance_id} created_at {pr['created_at']} exceeds cutoff_date {cutoff_date}") 163 | continue 164 | if is_valid_instance(pr): 165 | completed += 1 166 | if has_test_patch(pr): 167 | with_tests += 1 168 | logger.info(f"{len(seen_prs)} instance_ids previously recorded") 169 | original_output_path = output 170 | # Write to .all file for all PRs 171 | write_mode_all = "w" if not os.path.exists(all_output) else "a" 172 | with open(all_output, write_mode_all) as all_output: 173 | # Write to output file for PRs with test suites 174 | write_mode = "w" if not os.path.exists(output) else "a" 175 | with open(output, write_mode) as output: 176 | for ix, line in enumerate(open(pr_file)): 177 | total_instances += 1 178 | pull = json.loads(line) 179 | if ix % 100 == 0: 180 | logger.info( 181 | f"[{pull['base']['repo']['full_name']}] ( Up to {ix} checked ) {completed} valid, {with_tests} with tests." 
182 |                     )
183 |                 # Construct instance fields
184 |                 instance_id = (
185 |                     pull["base"]["repo"]["full_name"] + "-" + str(pull["number"])
186 |                 )
187 |                 instance_id = instance_id.replace("/", "__")
188 | 
189 |                 if instance_id in seen_prs:
190 |                     seen_prs -= {instance_id}
191 |                     continue
192 | 
193 |                 if instance_id in successful_instances:
194 |                     continue
195 | 
196 |                 if not is_valid_pull(pull):
197 |                     # Throw out invalid PRs
198 |                     continue
199 |                 # Create task instance
200 |                 repo_name = pull["base"]["repo"]["full_name"]
201 |                 if repo_name not in repos:
202 |                     repos[repo_name] = load_repo(repo_name, language)
203 |                 repo = repos[repo_name]
204 |                 instance = create_instance(repo, pull, original_output_path, mode)
205 |                 if is_valid_instance(instance):
206 |                     # If valid, write to .all output file
207 |                     print(
208 |                         json.dumps(instance), end="\n", flush=True, file=all_output
209 |                     )  # write all instances to a separate file
210 |                     completed += 1
211 |                     if has_test_patch(instance):
212 |                         # If has test suite, write to output file
213 |                         print(json.dumps(instance), end="\n", flush=True, file=output)
214 |                         with_tests += 1
215 |     logger.info(
216 |         f"Total instances: {total_instances}, completed: {completed}, with tests: {with_tests}"
217 |     )
218 |     logger.info(f"Didn't see {len(seen_prs)} instances previously recorded")
219 |     logger.info("\n".join(sorted(seen_prs)))
220 | 
221 | 
222 | if __name__ == "__main__":
223 |     parser = argparse.ArgumentParser()
224 |     parser.add_argument("pr_file", type=str, help="Path to pull request JSONL file")
225 |     parser.add_argument("output", type=str, help="Output file name")
226 |     parser.add_argument("--token", type=str, help="GitHub token")
227 |     parser.add_argument("--mode", type=str, default='omnigirl', help="Collection mode ('swebench' or 'omnigirl')")
228 |     parser.add_argument("--cutoff_date", type=str, default="2025-03-31T23:59:59Z", help="Cutoff date for filtering PRs in YYYY-MM-DDTHH:MM:SSZ format")
229 |     parser.add_argument("--language", type=str, help="language")
230 | 
231 |     args = parser.parse_args()
232 |     logger.info("Arguments parsed; starting dataset build")
233 |     main(**vars(args))
234 | 
--------------------------------------------------------------------------------
/app/post_process.py:
--------------------------------------------------------------------------------
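Before the module body, a minimal sketch of how the `ExtractStatus` enum defined below is meant to behave (illustrative only: it assumes the package is importable as `app.post_process`, and note that only `NO_SETUP` and `APPLICABLE_SETUP` participate in the `__lt__` ordering):

```python
from app.post_process import ExtractStatus

# __lt__ below orders NO_SETUP < APPLICABLE_SETUP, so max() picks the latter.
statuses = [ExtractStatus.NO_SETUP, ExtractStatus.APPLICABLE_SETUP]
best = ExtractStatus.max(statuses)
assert best is ExtractStatus.APPLICABLE_SETUP

# Each status maps to a bucket directory under the experiment dir.
print(best.to_dir_name("output/expr"))  # -> output/expr/applicable_setup
```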
3 | """ 4 | 5 | import json 6 | import os 7 | import shutil 8 | import subprocess 9 | from collections import defaultdict 10 | from collections.abc import Mapping 11 | from enum import Enum 12 | from glob import glob 13 | from os.path import join as pjoin 14 | from shutil import move 15 | 16 | from app import utils as apputils 17 | from app.model import common 18 | 19 | 20 | 21 | 22 | # track status of patch extraction 23 | class ExtractStatus(str, Enum): 24 | # APPLICABLE_PATCH = "APPLICABLE_PATCH" 25 | # MATCHED_BUT_EMPTY_ORIGIN = "MATCHED_BUT_EMPTY_ORIGIN" 26 | # MATCHED_BUT_EMPTY_DIFF = "MATCHED_BUT_EMPTY_DIFF" 27 | # RAW_PATCH_BUT_UNMATCHED = "RAW_PATCH_BUT_UNMATCHED" 28 | # RAW_PATCH_BUT_UNPARSED = "RAW_PATCH_BUT_UNPARSED" 29 | # NO_PATCH = "NO_PATCH" 30 | IS_VALID_JSON = "IS_VALID_JSON" 31 | NOT_VALID_JSON = "NOT_VALID_JSON" 32 | NO_SETUP = "NO_SETUP" 33 | APPLICABLE_SETUP = "APPLICABLE_SETUP" 34 | 35 | def __lt__(self, other): 36 | # order from min to max 37 | order = [ 38 | self.NO_SETUP, 39 | # self.RAW_PATCH_BUT_UNPARSED, 40 | # self.RAW_PATCH_BUT_UNMATCHED, 41 | # self.MATCHED_BUT_EMPTY_DIFF, 42 | # self.MATCHED_BUT_EMPTY_ORIGIN, 43 | self.APPLICABLE_SETUP, 44 | ] 45 | self_index = order.index(self) 46 | other_index = order.index(other) 47 | return self_index < other_index 48 | 49 | def __eq__(self, other): 50 | return self is other 51 | 52 | def __hash__(self): 53 | return hash(self.value) 54 | 55 | def to_dir_name(self, expr_dir: str): 56 | return pjoin(expr_dir, self.value.lower()) 57 | 58 | @staticmethod 59 | def max(statuses): 60 | return sorted(statuses)[-1] 61 | 62 | 63 | def record_extract_status(individual_expr_dir: str, extract_status: ExtractStatus): 64 | """ 65 | Write extract status to file, so that we can read it again when 66 | classifying patches 67 | """ 68 | # there is 1-to-1 correspondence between agent_patch_raw and extract_status 69 | # FIXME: it might be better to record these status in memory so they can be easily managed. 70 | record_file = pjoin(individual_expr_dir, "extract_status.json") 71 | if not os.path.isfile(record_file): 72 | # record for the first time 73 | with open(record_file, "w") as f: 74 | json.dump({"extract_status": [extract_status]}, f, indent=4) 75 | else: 76 | with open(record_file) as f: 77 | record = json.load(f) 78 | record["extract_status"].append(extract_status) 79 | with open(record_file, "w") as f: 80 | json.dump(record, f, indent=4) 81 | 82 | 83 | def read_extract_status(individual_expr_dir: str) -> tuple[ExtractStatus, int]: 84 | """ 85 | Read extract status from file. If there are multiple status recorded, read the best one. 86 | Returns: 87 | - The best extract status 88 | - The index of the best status in the list of all statuses. 
89 |     """
90 |     # the presence of a Dockerfile is used as the record of a successful setup
91 |     record_file = pjoin(individual_expr_dir, "Dockerfile")
92 |     if not os.path.isfile(record_file):
93 |         # if no Dockerfile was written, the run never reached the point
94 |         # of producing a working setup
95 |         return ExtractStatus.NO_SETUP, -1
96 |     else:
97 |         return ExtractStatus.APPLICABLE_SETUP, 1
98 |     # with open(record_file) as f:
99 |     #     record = json.load(f)
100 |     # # convert string to enum type
101 |     # all_status = [ExtractStatus(s) for s in record["extract_status"]]
102 | 
103 |     # best_status = ExtractStatus.max(all_status)
104 |     # best_idx = all_status.index(best_status)
105 |     # return best_status, best_idx
106 | 
107 | 
108 | 
109 | 
110 | 
111 | 
112 | def organize_experiment_results(expr_dir: str):
113 |     """
114 |     Assuming setups have already been extracted, organize the experiment result
115 |     directories into a few categories and move them there.
116 |     """
117 |     # (1) find all the task experiment directories
118 |     task_exp_names = [
119 |         x
120 |         for x in os.listdir(expr_dir)
121 |         if os.path.isdir(pjoin(expr_dir, x))
122 |         and "__" in x  # for filtering out other dirs like "applicable_setup"
123 |     ]
124 |     task_exp_dirs = [pjoin(expr_dir, x) for x in task_exp_names]
125 | 
126 |     # start organizing
127 |     for extract_status in ExtractStatus:
128 |         os.makedirs(extract_status.to_dir_name(expr_dir), exist_ok=True)
129 | 
130 |     for task_dir in task_exp_dirs:
131 |         extract_status, _ = read_extract_status(task_dir)
132 |         corresponding_dir = extract_status.to_dir_name(expr_dir)
133 |         shutil.move(task_dir, corresponding_dir)
134 | 
135 | 
136 | 
137 | def extract_swe_bench_input(dir: str):
138 |     """
139 |     After Dockerfiles and eval scripts have been generated, this function collects
140 |     them and writes a single file that can be used by swe-bench.
141 | 
142 |     Returns:
143 |         - path to swe-bench input file.
144 |     """
145 |     # only look into the applicable_setup dir, since we have already done
146 |     # the categorization
147 |     applicable_res_dir = pjoin(dir, "applicable_setup")
148 |     # figure out what tasks have applicable setups
149 |     task_dirs = [
150 |         x
151 |         for x in os.listdir(applicable_res_dir)
152 |         if os.path.isdir(pjoin(applicable_res_dir, x))
153 |     ]
154 |     task_dirs = [pjoin(applicable_res_dir, x) for x in task_dirs]
155 |     # patch_files = [pjoin(x, "agent_patch_raw") for x in task_dirs]
156 |     # patch_files = [os.path.abspath(x) for x in patch_files]
157 | 
158 |     # Each applicable task directory is expected to contain a generated
159 |     # Dockerfile and, alongside it, an eval.sh evaluation script.
160 |     # We collect the Dockerfile from each task directory, pair it with the
161 |     # metadata in meta.json, and read the completion flag from status.json;
162 |     # directories without a status.json are skipped.
163 |     docker_files = []
164 |     for x in task_dirs:
165 |         extracted_dockerfile = glob(pjoin(x, "Dockerfile"))
166 |         docker_files.append(extracted_dockerfile[0])
167 | 
168 |     docker_files = [os.path.abspath(x) for x in docker_files]
169 | 
170 |     # patch_files = [x for x in patch_files if os.path.isfile(x)]
171 |     docker_files = [x for x in docker_files if os.path.isfile(x)]
172 | 
173 |     all_results = []
174 |     final_results = []
175 |     for docker_file in docker_files:
176 |         # task_dir = os.path.dirname(os.path.dirname(docker_file))
177 |         task_dir = os.path.dirname(docker_file)
178 |         meta_file = pjoin(task_dir, "meta.json")
179 |         with open(meta_file) as f:
180 |             meta = json.load(f)
181 |         status_file = pjoin(task_dir, "status.json")
182 |         status = NotImplemented
183 |         if os.path.exists(status_file):
184 |             with open(status_file) as f:
185 |                 status_meta = json.load(f)
186 |             status = status_meta['is_finish']
187 | 
188 |         else:
189 |             continue
190 |         task_id = meta["task_id"]
191 |         this_result = {}
192 | 
193 |         this_result["instance_id"] = task_id
194 |         this_result["model_name_or_path"] = common.SELECTED_MODEL.name
195 |         docker_content = ""
196 |         eval_script_content = ""
197 |         if os.path.exists(docker_file):
198 |             with open(docker_file) as f:
199 |                 docker_content = f.read()
200 |         eval_script_file = docker_file.replace('Dockerfile', 'eval.sh')
201 |         if os.path.exists(eval_script_file):
202 |             with open(eval_script_file) as f:
203 |                 eval_script_content = f.read()
204 |         # if not docker_content:
205 |         #     # empty Dockerfile, don't bother sending it to swe-bench
206 |         #     continue
207 |         this_result["dockerfile"] = docker_content
208 |         this_result["eval_script"] = eval_script_content
209 |         this_result['version'] = meta['task_info']['version']
210 |         this_result['repo'] = meta['task_info']['repo']
211 |         this_result['patch'] = meta['task_info']['patch']
212 |         this_result['status'] = status
213 |         all_results.append(this_result)
214 |         if status is True:
215 |             final_results.append(this_result)
216 | 
217 |     final_predictions_file = pjoin(dir, "predictions.json")
218 |     raw_predictions_file = pjoin(dir, "raw_predictions.json")
219 |     with open(final_predictions_file, "w") as f:
220 |         json.dump(final_results, f, indent=4)
221 | 
222 |     with open(raw_predictions_file, "w") as f:
223 |         json.dump(all_results, f, indent=4)
224 | 
225 |     return final_predictions_file
226 | 
227 | 
228 | def is_valid_json(json_str: str) -> tuple[ExtractStatus, list | dict | None]:
229 |     """
230 |     Check whether a json string is valid.
231 |     """
232 |     try:
233 |         data = json.loads(json_str)
234 |     except json.decoder.JSONDecodeError:
235 |         return ExtractStatus.NOT_VALID_JSON, None
236 |     return ExtractStatus.IS_VALID_JSON, data
237 | 
238 | 
239 | """
240 | Main entries of the module.
241 | """
242 | 
243 | 
244 | 
245 | def un_classify_expr_dir(expr_dir: str):
246 |     individual_expr_dirs = []
247 |     for individual_expr_dir in glob(pjoin(expr_dir, "*", "*__*")):
248 |         assert "info.log" in os.listdir(
249 |             individual_expr_dir
250 |         ), f"{individual_expr_dir} has no info.log"
251 |         individual_expr_dirs.append(individual_expr_dir)
252 | 
253 |     for d in individual_expr_dirs:
254 |         move(d, expr_dir)
255 | 
256 | 
257 | 
258 | 
259 | def organize_and_form_input(expr_dir):
260 |     """
261 |     Organize the experiment directories into categories and form the swe-bench input file.
262 | Args: 263 | - expr_dir: the overall experiment directory. 264 | """ 265 | organize_experiment_results(expr_dir) 266 | swe_input_file = extract_swe_bench_input(expr_dir) 267 | return swe_input_file 268 | -------------------------------------------------------------------------------- /evaluation/docker_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import docker 4 | import os 5 | import signal 6 | import tarfile 7 | import threading 8 | import traceback 9 | from pathlib import Path 10 | 11 | from docker.models.containers import Container 12 | 13 | HEREDOC_DELIMITER = "EOF_1399519320" # different from dataset HEREDOC_DELIMITERs! 14 | 15 | 16 | def copy_to_container(container: Container, src: Path, dst: Path): 17 | """ 18 | Copy a file from local to a docker container 19 | 20 | Args: 21 | container (Container): Docker container to copy to 22 | src (Path): Source file path 23 | dst (Path): Destination file path in the container 24 | """ 25 | # Check if destination path is valid 26 | if os.path.dirname(dst) == "": 27 | raise ValueError( 28 | f"Destination path parent directory cannot be empty!, dst: {dst}" 29 | ) 30 | 31 | # temporary tar file 32 | tar_path = src.with_suffix(".tar") 33 | with tarfile.open(tar_path, "w") as tar: 34 | tar.add(src, arcname=src.name) 35 | 36 | # get bytes for put_archive cmd 37 | with open(tar_path, "rb") as tar_file: 38 | data = tar_file.read() 39 | 40 | # Make directory if necessary 41 | container.exec_run(f"mkdir -p {dst.parent}") 42 | 43 | # Send tar file to container and extract 44 | container.put_archive(os.path.dirname(dst), data) 45 | container.exec_run(f"tar -xf {dst}.tar -C {dst.parent}") 46 | 47 | # clean up in locally and in container 48 | tar_path.unlink() 49 | container.exec_run(f"rm {dst}.tar") 50 | 51 | 52 | def write_to_container(container: Container, data: str, dst: Path): 53 | """ 54 | Write a string to a file in a docker container 55 | """ 56 | # echo with heredoc to file 57 | command = f"cat <<'{HEREDOC_DELIMITER}' > {dst}\n{data}\n{HEREDOC_DELIMITER}" 58 | container.exec_run(command) 59 | 60 | 61 | def remove_image(client, image_id, logger=None): 62 | """ 63 | Remove a Docker image by ID. 64 | 65 | Args: 66 | client (docker.DockerClient): Docker client. 67 | image_id (str): Image ID. 68 | rm_image (bool): Whether to remove the image. 69 | logger (logging.Logger): Logger to use for output. If None, print to stdout. 70 | """ 71 | if not logger: 72 | # if logger is None, print to stdout 73 | log_info = print 74 | log_error = print 75 | raise_error = True 76 | elif logger == "quiet": 77 | # if logger is "quiet", don't print anything 78 | log_info = lambda x: None 79 | log_error = lambda x: None 80 | raise_error = True 81 | else: 82 | # if logger is a logger object, use it 83 | log_error = logger.info 84 | log_info = logger.info 85 | raise_error = False 86 | 87 | try: 88 | log_info(f"Attempting to remove image {image_id}...") 89 | client.images.remove(image_id, force=True) 90 | log_info(f"Image {image_id} removed.") 91 | except Exception as e: 92 | if raise_error: 93 | raise e 94 | log_error( 95 | f"Failed to remove image {image_id}: {e}\n" f"{traceback.format_exc()}" 96 | ) 97 | 98 | 99 | def cleanup_container(client, container, logger): 100 | """ 101 | Stop and remove a Docker container. 102 | Performs this forcefully if the container cannot be stopped with the python API. 103 | 104 | Args: 105 | client (docker.DockerClient): Docker client. 
106 | container (docker.models.containers.Container): Container to remove. 107 | logger (logging.Logger): Logger to use for output. If None, print to stdout 108 | """ 109 | if not container: 110 | return 111 | 112 | container_id = container.id 113 | 114 | if not logger: 115 | # if logger is None, print to stdout 116 | log_error = print 117 | log_info = print 118 | raise_error = True 119 | elif logger == "quiet": 120 | # if logger is "quiet", don't print anything 121 | log_info = lambda x: None 122 | log_error = lambda x: None 123 | raise_error = True 124 | else: 125 | # if logger is a logger object, use it 126 | log_error = logger.info 127 | log_info = logger.info 128 | raise_error = False 129 | 130 | # Attempt to stop the container 131 | try: 132 | if container: 133 | log_info(f"Attempting to stop container {container.name}...") 134 | container.stop(timeout=15) 135 | except Exception as e: 136 | log_error( 137 | f"Failed to stop container {container.name}: {e}. Trying to forcefully kill..." 138 | ) 139 | try: 140 | # Get the PID of the container 141 | container_info = client.api.inspect_container(container_id) 142 | pid = container_info["State"].get("Pid", 0) 143 | 144 | # If container PID found, forcefully kill the container 145 | if pid > 0: 146 | log_info( 147 | f"Forcefully killing container {container.name} with PID {pid}..." 148 | ) 149 | os.kill(pid, signal.SIGKILL) 150 | else: 151 | log_error(f"PID for container {container.name}: {pid} - not killing.") 152 | except Exception as e2: 153 | if raise_error: 154 | raise e2 155 | log_error( 156 | f"Failed to forcefully kill container {container.name}: {e2}\n" 157 | f"{traceback.format_exc()}" 158 | ) 159 | 160 | # Attempt to remove the container 161 | try: 162 | log_info(f"Attempting to remove container {container.name}...") 163 | container.remove(force=True) 164 | log_info(f"Container {container.name} removed.") 165 | except Exception as e: 166 | if raise_error: 167 | raise e 168 | log_error( 169 | f"Failed to remove container {container.name}: {e}\n" 170 | f"{traceback.format_exc()}" 171 | ) 172 | 173 | 174 | def exec_run_with_timeout(container, cmd, timeout: int|None=60): 175 | """ 176 | Run a command in a container with a timeout. 177 | 178 | Args: 179 | container (docker.Container): Container to run the command in. 180 | cmd (str): Command to run. 181 | timeout (int): Timeout in seconds. 182 | """ 183 | # Local variables to store the result of executing the command 184 | exec_result = None 185 | exec_id = None 186 | exception = None 187 | 188 | # Wrapper function to run the command 189 | def run_command(): 190 | nonlocal exec_result, exec_id, exception 191 | try: 192 | exec_id = container.client.api.exec_create(container.id, cmd)["Id"] 193 | exec_result = container.client.api.exec_start(exec_id) 194 | except Exception as e: 195 | exception = e 196 | 197 | # Start the command in a separate thread 198 | thread = threading.Thread(target=run_command) 199 | thread.start() 200 | thread.join(timeout) 201 | 202 | if exception: 203 | raise exception 204 | 205 | # If the thread is still alive, the command timed out 206 | if thread.is_alive(): 207 | raise TimeoutError(f"Command '{cmd}' timed out after {timeout} seconds") 208 | 209 | return exec_result 210 | 211 | 212 | def find_dependent_images(client: docker.DockerClient, image_name: str): 213 | """ 214 | Find all images that are built upon `image_name` image 215 | 216 | Args: 217 | client (docker.DockerClient): Docker client. 218 | image_name (str): Name of the base image. 
219 | """ 220 | dependent_images = [] 221 | 222 | # Get all local images 223 | all_images = client.images.list() 224 | 225 | # Get the ID of the base image 226 | try: 227 | base_image = client.images.get(image_name) 228 | base_image_id = base_image.id 229 | except docker.errors.ImageNotFound: 230 | print(f"Base image {image_name} not found.") 231 | return [] 232 | 233 | for image in all_images: 234 | # Skip the base image itself 235 | if image.id == base_image_id: 236 | continue 237 | 238 | # Check if the base image is in this image's history 239 | history = image.history() 240 | for layer in history: 241 | if layer['Id'] == base_image_id: 242 | # If found, add this image to the dependent images list 243 | tags = image.tags 244 | dependent_images.append(tags[0] if tags else image.id) 245 | break 246 | 247 | return dependent_images 248 | 249 | 250 | def list_images(client: docker.DockerClient): 251 | """ 252 | List all images from the Docker client. 253 | """ 254 | # don't use this in multi-threaded context 255 | return {tag for i in client.images.list(all=True) for tag in i.tags} 256 | 257 | 258 | def clean_images( 259 | client: docker.DockerClient, 260 | prior_images: set, 261 | cache_level: str, 262 | clean: bool 263 | ): 264 | """ 265 | Clean Docker images based on cache level and clean flag. 266 | 267 | Args: 268 | client (docker.DockerClient): Docker client. 269 | prior_images (set): Set of images that existed before the current run. 270 | cache (str): Cache level to use. 271 | clean (bool): Whether to clean; remove images that are higher in the cache hierarchy than the current 272 | cache level. E.g. if cache_level is set to env, remove all previously built instances images. if 273 | clean is false, previously built instances images will not be removed, but instance images built 274 | in the current run will be removed. 275 | """ 276 | images = list_images(client) 277 | removed = 0 278 | print(f"Cleaning cached images...") 279 | for image_name in images: 280 | if should_remove(image_name, cache_level, clean, prior_images): 281 | try: 282 | remove_image(client, image_name, "quiet") 283 | removed += 1 284 | except Exception as e: 285 | print(f"Error removing image {image_name}: {e}") 286 | continue 287 | print(f"Removed {removed} images.") 288 | 289 | 290 | def should_remove( 291 | image_name: str, 292 | cache_level: str, 293 | clean: bool, 294 | prior_images: set 295 | ): 296 | """ 297 | Determine if an image should be removed based on cache level and clean flag. 
298 | """ 299 | existed_before = image_name in prior_images 300 | if image_name.startswith("setup"): 301 | # if cache_level in {"none"} and (clean or not existed_before): 302 | # return True 303 | return True 304 | elif image_name.startswith("sweb.base"): 305 | if cache_level in {"none"} and (clean or not existed_before): 306 | return True 307 | elif image_name.startswith("sweb.env"): 308 | if cache_level in {"none", "base"} and (clean or not existed_before): 309 | return True 310 | elif image_name.startswith("sweb.eval"): 311 | if cache_level in {"none", "base", "env"} and (clean or not existed_before): 312 | return True 313 | return False 314 | -------------------------------------------------------------------------------- /app/raw_tasks.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import shutil 5 | from abc import ABC, abstractmethod 6 | from os.path import join as pjoin 7 | from pathlib import Path 8 | 9 | import httpx 10 | 11 | from app import utils as app_utils 12 | from app.log import log_and_print 13 | from app.task import PlainTask, SweTask, Task 14 | from docker import DockerClient 15 | 16 | class RawTask(ABC): 17 | @property 18 | @abstractmethod 19 | def task_id(self) -> str: 20 | raise NotImplementedError("abstract base class") 21 | 22 | @abstractmethod 23 | def to_task(self) -> Task: 24 | raise NotImplementedError("abstract base class") 25 | 26 | @abstractmethod 27 | def dump_meta_data(self, output_dir: str) -> None: 28 | raise NotImplementedError("abstract base class") 29 | 30 | 31 | class RawSweTask(RawTask): 32 | """ 33 | Encapsulate everything required to run one task. 34 | """ 35 | 36 | def __init__(self, task_id: str, setup_info: dict, task_info: dict,client:DockerClient = None): 37 | # a counter str, format "1/150", which means first task out of 150 38 | # id from the benchmark 39 | self._task_id = task_id 40 | # setup_info (Dict): keys: ['repo_path', 'env_name', 'pre_install', 'install','test_cmd'] 41 | self.setup_info = setup_info 42 | # task_info (Dict): keys: ['base_commit', 'hints_text', 'created_at', 43 | # 'test_patch', 'repo', 'problem_statement', 'version', 'instance_id', 44 | # 'FAIL_TO_PASS', 'PASS_TO_PASS', 'environment_setup_commit'] 45 | self.task_info = task_info 46 | self.client = client 47 | @property 48 | def task_id(self) -> str: 49 | return self._task_id 50 | 51 | def to_task(self) -> SweTask: 52 | task_id = self.task_id 53 | setup_info = self.setup_info 54 | task_info = self.task_info 55 | language = task_info.get('language','None') 56 | client = self.client 57 | return SweTask( 58 | task_id=task_id, 59 | problem_statement=task_info["problem_statement"], 60 | repo_path=setup_info["repo_path"], 61 | repo_cache_path=setup_info["repo_cache_path"], 62 | # env_name=setup_info["env_name"], 63 | # pre_install_cmds=setup_info["pre_install"], 64 | # install_cmd=setup_info["install"], 65 | # command to run the relevant tests, 66 | # test_cmd=setup_info["test_cmd"], 67 | commit=task_info["base_commit"], 68 | repo_name=task_info["repo"], 69 | # modifications to the test suite for this task instance, 70 | patch=task_info["patch"], 71 | test_patch=task_info["test_patch"], 72 | # testcases_passing=task_info["PASS_TO_PASS"], 73 | # testcases_failing=task_info["FAIL_TO_PASS"], 74 | language=language, 75 | # image_urls=task_info['image_urls'], 76 | # reference_setup=task_info['reference_setup'], 77 | version=task_info['version'], 78 | client = client, 79 | task_info = task_info 
80 | ) 81 | 82 | def dump_meta_data(self, output_dir: str): 83 | meta = { 84 | "task_id": self.task_id, 85 | "setup_info": self.setup_info, 86 | "task_info": self.task_info, 87 | } 88 | with open(pjoin(output_dir, "meta.json"), "w") as f: 89 | json.dump(meta, f, indent=4) 90 | with open(pjoin(output_dir, "problem_statement.txt"), "w") as f: 91 | f.write(self.task_info["problem_statement"]) 92 | with open(pjoin(output_dir, "developer_patch.diff"), "w") as f: 93 | f.write(self.task_info["patch"]) 94 | 95 | 96 | class RawGithubTask(RawTask): 97 | """ 98 | Encapsulate everything required to run ACR on a fresh issue from the internet. 99 | """ 100 | 101 | def __init__( 102 | self, 103 | task_id: str, 104 | clone_link: str, 105 | commit_hash: str | None, 106 | issue_link: str, 107 | setup_dir: str, 108 | use_comments: bool = False, 109 | ): 110 | self._task_id = task_id 111 | self.clone_link = clone_link 112 | # if commit_hash is None, assume using the HEAD of default branch 113 | self.commit_hash = commit_hash 114 | self.issue_link = issue_link 115 | self.setup_dir = setup_dir 116 | self.use_comments = use_comments 117 | self.clone_path = pjoin(self.setup_dir, self.task_id) 118 | self.problem_statement, self.created_at = self.fetch_issue() 119 | self.clone_repo() 120 | 121 | @property 122 | def task_id(self) -> str: 123 | return self._task_id 124 | 125 | def clone_repo(self): 126 | clone_path = Path(self.clone_path) 127 | if os.path.exists(clone_path): 128 | log_and_print( 129 | f"Path {clone_path} already exists. Removing it to get a fresh clone." 130 | ) 131 | shutil.rmtree(clone_path) 132 | app_utils.clone_repo(self.clone_link, str(clone_path.parent), clone_path.name) 133 | log_and_print(f"Cloned source code to {clone_path}.") 134 | if self.commit_hash is None: 135 | # let's get the current commit hash 136 | with app_utils.cd(clone_path): 137 | self.commit_hash = app_utils.get_current_commit_hash() 138 | 139 | def dump_meta_data(self, output_dir: str): 140 | meta = { 141 | "task_info": { 142 | "base_commit": self.commit_hash, 143 | "created_at": self.created_at, 144 | "problem_statement": self.problem_statement, 145 | "instance_id": self.task_id, 146 | }, 147 | "setup_info": {"repo_path": self.clone_path}, 148 | } 149 | 150 | meta_file = pjoin(output_dir, "meta.json") 151 | 152 | with open(meta_file, "w") as f: 153 | json.dump(meta, f, indent=4) 154 | 155 | def fetch_issue(self): 156 | if "github.com" not in self.issue_link: 157 | raise NotImplementedError("Only GitHub issues are supported for now.") 158 | 159 | retrieved_issue = self.fetch_github_issue(self.issue_link, self.use_comments) 160 | 161 | if retrieved_issue is None: 162 | raise RuntimeError( 163 | f"Failed to retrieve issue information from {self.issue_link}" 164 | ) 165 | 166 | title, body, created_at = retrieved_issue 167 | 168 | body = self.process_links(body) 169 | 170 | problem_statement = f"{title}\n{body}" 171 | 172 | return problem_statement, created_at 173 | 174 | @classmethod 175 | def process_links(cls, body: str): 176 | code_pattern = re.compile( 177 | r"https://github.com/(.*?)/blob/(.*)/(.*)#L(\d+)-L(\d+)" 178 | ) 179 | replacements = [] 180 | 181 | for code_links in code_pattern.finditer(body): 182 | repo_name = code_links.group(1) 183 | commit = code_links.group(2) 184 | file_path = code_links.group(3) 185 | start_line = int(code_links.group(4)) 186 | end_line = int(code_links.group(5)) 187 | 188 | file_contents = httpx.get( 189 | f"https://raw.githubusercontent.com/{repo_name}/{commit}/{file_path}" 190 | 
).text.splitlines()
191 |             fragment = "\n".join(file_contents[start_line - 1 : end_line])
192 | 
193 |             replacements.append((code_links.group(0), f"\n```\n{fragment}\n```\n"))
194 | 
195 |         for code_link, replacement in replacements:
196 |             body = body.replace(code_link, code_link + replacement)
197 |         return body
198 | 
199 |     @classmethod
200 |     def fetch_github_issue(
201 |         cls, issue_url: str, use_comments: bool = False
202 |     ) -> tuple[str, str, str]:
203 |         """Fetch the title, body, and creation time of a GitHub issue from its URL."""
204 | 
205 |         # Example issue URL: https://github.com/owner/repo/issues/123
206 | 
207 |         _, owner, repo, _, issue_number = issue_url.rsplit("/", 4)
208 | 
209 |         api_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}"
210 |         comments_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/comments"
211 | 
212 |         issue_response = httpx.get(api_url)
213 | 
214 |         if issue_response.status_code != 200:
215 |             raise RuntimeError(
216 |                 f"Failed to fetch issue information: {issue_response.status_code}"
217 |             )
218 | 
219 |         issue_info = issue_response.json()
220 | 
221 |         title = issue_info["title"]
222 |         body = issue_info["body"]
223 | 
224 |         if use_comments:
225 |             comments_response = httpx.get(comments_url)
226 |             if comments_response.status_code != 200:
227 |                 raise RuntimeError(
228 |                     f"Failed to fetch comments information: {comments_response.status_code}"
229 |                 )
230 | 
231 |             comments_info = comments_response.json()
232 |             for comment in comments_info:
233 |                 if (
234 |                     "user" not in comment
235 |                     or comment["user"]["type"] == "Bot"
236 |                     or comment["user"]["login"] == "acr-bot"
237 |                 ):
238 |                     continue
239 | 
240 |                 body += (
241 |                     f"\nUser: {comment['user']['login']}\nComment: {comment['body']}"
242 |                 )
243 | 
244 |         created_at = issue_info["created_at"]
245 | 
246 |         return title, body, created_at
247 | 
248 |     def to_task(self) -> PlainTask:
249 |         return PlainTask(
250 |             commit_hash=self.commit_hash,
251 |             local_path=self.clone_path,
252 |             problem_statement=self.problem_statement,
253 |         )
254 | 
255 | 
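Between the two classes, a small self-contained sketch of what `process_links` above extracts from a GitHub line-range permalink (illustrative only: the `acme/widgets` URL is made up; the real method then fetches the raw file and appends the referenced lines after the link as a fenced block):

```python
import re

# The same pattern process_links() uses to spot line-range permalinks.
code_pattern = re.compile(r"https://github.com/(.*?)/blob/(.*)/(.*)#L(\d+)-L(\d+)")

body = "See https://github.com/acme/widgets/blob/abc123/src/core.py#L10-L12 for context."
m = code_pattern.search(body)
if m:
    repo_name, commit, file_path = m.group(1), m.group(2), m.group(3)
    start_line, end_line = int(m.group(4)), int(m.group(5))
    # The greedy groups split commit/path loosely, but the reassembled URL is correct.
    raw_url = f"https://raw.githubusercontent.com/{repo_name}/{commit}/{file_path}"
    print(raw_url, start_line, end_line)
    # -> https://raw.githubusercontent.com/acme/widgets/abc123/src/core.py 10 12
```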
259 | """ 260 | 261 | def __init__(self, task_id: str, local_repo: str, issue_file: str): 262 | self._task_id = task_id 263 | self.local_repo = local_repo 264 | self.issue_file = issue_file 265 | self.commit_hash = self.init_local_repo() 266 | self.problem_statement = self.read_issue_from_file() 267 | 268 | @property 269 | def task_id(self) -> str: 270 | return self._task_id 271 | 272 | def init_local_repo(self): 273 | with app_utils.cd(self.local_repo): 274 | if not app_utils.is_git_repo(): 275 | # non git repo - let's make it a git repo first 276 | app_utils.initialize_git_repo_and_commit() 277 | commit = app_utils.get_current_commit_hash() 278 | return commit 279 | 280 | def read_issue_from_file(self) -> str: 281 | # ignore encoding errors so at least we can have some issue content 282 | issue = Path(self.issue_file).read_text(errors="ignore") 283 | return issue 284 | 285 | def dump_meta_data(self, output_dir: str): 286 | meta = { 287 | "task_info": { 288 | "base_commit": self.commit_hash, 289 | "problem_statement": self.problem_statement, 290 | "instance_id": self.task_id, 291 | }, 292 | "setup_info": {"repo_path": self.local_repo}, 293 | } 294 | 295 | meta_file = pjoin(output_dir, "meta.json") 296 | 297 | with open(meta_file, "w") as f: 298 | json.dump(meta, f, indent=4) 299 | 300 | def to_task(self) -> PlainTask: 301 | return PlainTask( 302 | commit_hash=self.commit_hash, 303 | local_path=self.local_repo, 304 | problem_statement=self.problem_statement, 305 | ) 306 | --------------------------------------------------------------------------------