├── app ├── __init__.py ├── agents │ ├── __init__.py │ ├── write_dockerfile_agent │ │ ├── __init__.py │ │ └── write_dockerfile_agent.py │ ├── write_eval_script_agent │ │ └── __init__.py │ ├── context_retrieval_agent │ │ └── __init__.py │ ├── test_analysis_agent │ │ └── __init__.py │ ├── train_env_gen_agent │ │ ├── tools │ │ │ ├── finish.py │ │ │ ├── execute_bash.py │ │ │ └── search.py │ │ └── prompt.py │ └── agent.py ├── model │ ├── __init__.py │ ├── register.py │ ├── gemini.py │ ├── groq.py │ ├── claude.py │ ├── bedrock.py │ ├── ollama.py │ ├── gptlitellm.py │ └── common.py ├── globals.py ├── globals_mut.py ├── task.py ├── log.py ├── data_structures.py ├── post_process.py └── raw_tasks.py ├── data_collection ├── collect │ ├── __init__.py │ ├── SetupBench-lite │ │ ├── batch_12.txt │ │ ├── batch_need_test_resources.txt │ │ ├── batch_11.txt │ │ ├── batch_1.txt │ │ ├── batch_7.txt │ │ ├── batch_9.txt │ │ ├── batch_10.txt │ │ ├── batch_8.txt │ │ ├── batch_6.txt │ │ ├── batch_3.txt │ │ ├── batch_4.txt │ │ ├── batch_5.txt │ │ └── batch_2.txt │ ├── get_top_repos.py │ ├── print_pulls.py │ ├── README.md │ ├── get_version.py │ └── build_dataset.py ├── versioning │ ├── __init__.py │ ├── get_version_mix.sh │ ├── README.md │ ├── constants.py │ ├── merge_final_data.py │ └── get_versions_by_git.py └── README.md ├── figure └── overview.png ├── .pre-commit-config.yaml ├── LICENSE ├── evaluation ├── README.md └── docker_utils.py ├── run └── run.sh ├── requirements.txt ├── scripts ├── compute_cost.py └── judge_fail2pass.py ├── .gitignore └── README.md /app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_collection/collect/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_collection/versioning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /figure/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepSoftwareAnalytics/swe-factory/HEAD/figure/overview.png -------------------------------------------------------------------------------- /app/agents/write_dockerfile_agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .write_dockerfile_agent import WriteDockerfileAgent, write_dockerfile_utils -------------------------------------------------------------------------------- /app/agents/write_eval_script_agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .write_eval_script_agent import WriteEvalScriptAgent, write_eval_script_utils -------------------------------------------------------------------------------- /app/agents/context_retrieval_agent/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.context_retrieval_agent import ContextRetrievalAgent, context_retrieval_utils -------------------------------------------------------------------------------- /app/agents/test_analysis_agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .test_analysis_agent import TestAnalysisAgent, test_analysis_utils 2 | from . import docker_utils -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_12.txt: -------------------------------------------------------------------------------- 1 | mochajs__mocha-1878 2 | mochajs__mocha-1698 3 | mochajs__mocha-1337 4 | mochajs__mocha-1243 5 | mochajs__mocha-1224 6 | mochajs__mocha-1110 7 | mochajs__mocha-795 8 | mochajs__mocha-577 9 | mochajs__mocha-635 10 | mochajs__mocha-368 11 | mochajs__mocha-462 12 | -------------------------------------------------------------------------------- /data_collection/README.md: -------------------------------------------------------------------------------- 1 | # Raw Issue Data Collection Project 2 | 3 | This code is dedicated to the collection and versioning of raw issue data. 4 | 5 | ## Process Overview 6 | 7 | The data handling process is divided into two main stages: 8 | 9 | 1. **Data Collection**: For instructions on how to collect raw task instances, please refer to the documentation in the `collect` directory. 10 | 2. **Data Versioning**: Once the data is collected, please follow the versioning guidelines outlined in the `versioning` directory to properly label and manage the dataset. -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3.11 3 | 4 | repos: 5 | - repo: https://github.com/asottile/pyupgrade 6 | rev: v3.15.2 7 | hooks: 8 | - id: pyupgrade 9 | args: ["--py311-plus"] 10 | 11 | - repo: https://github.com/astral-sh/ruff-pre-commit 12 | rev: v0.3.7 13 | hooks: 14 | - id: ruff 15 | args: ["--fix"] 16 | 17 | - repo: https://github.com/pycqa/isort 18 | rev: 5.13.2 19 | hooks: 20 | - id: isort 21 | name: isort (python) 22 | args: ["--profile", "black"] 23 | 24 | - repo: https://github.com/psf/black 25 | rev: 24.4.0 26 | hooks: 27 | - id: black 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | LICENSE 2 | 3 | The source code within this repository is dual-licensed. You may choose to use it under the terms of the GNU Affero General Public License (https://www.gnu.org/licenses/agpl-3.0.en.html#license-text) for non-commercial purposes, or you can obtain a commercial license for commercial use. 
4 | 5 | For non-commercial uses and licensing of this code and its derivatives (including academic purposes), an open-source licence is granted in accordance with the following terms and conditions - 6 | 7 | · GNU Affero General Public License (https://www.gnu.org/licenses/agpl-3.0.en.html#license-text) 8 | 9 | For commercial use and licensing of this code, please contact - 10 | 11 | · Yanlin Wang ( wangylin36@mail.sysu.edu.cn ) 12 | 13 | -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation Framework 2 | 3 | This directory provides the evaluation framework for the GitHub issue resolution task. 4 | 5 | ## Fail2Pass Validation 6 | 7 | Ensure your dataset contains both a Dockerfile and an evaluation script. Then run the following command to generate Fail2Pass test logs; when performing Fail2Pass validation, set `predictions_path` to `gold` and use the `--is_judge_fail2pass` flag. The logs will be saved under `run_instances/mypy_fail2pass_check/gold`. 8 | After running this command, you will find two test log files: `test_output_after_apply.txt` and `test_output_prev_apply.txt`. 9 | ```bash 10 | python run_evaluation.py \ 11 | --dataset_name "output/git-4.1-mini/mypy/results/results.json" \ 12 | --predictions_path "gold" \ 13 | --max_workers 5 \ 14 | --run_id "mypy_fail2pass_check" \ 15 | --output_path "run_instances" \ 16 | --timeout 3600 \ 17 | --is_judge_fail2pass 18 | ``` 19 | 20 | ## Evaluation 21 | 22 | Once you have a validated GitHub issue resolution dataset (including a Dockerfile and an evaluation script), you can run the evaluation using the following command: 23 | 24 | ```bash 25 | python run_evaluation.py \ 26 | --dataset_name "mypy_valid.json" \ 27 | --predictions_path "predictions.json" \ 28 | --max_workers 5 \ 29 | --run_id "mypy_evaluation" \ 30 | --output_path "run_instances" \ 31 | --timeout 3600 32 | ``` -------------------------------------------------------------------------------- /app/globals.py: -------------------------------------------------------------------------------- 1 | """ 2 | Values of global configuration variables. 
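These are plain module-level settings; values that must be mutated across worker processes live in globals_mut.py instead. 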
3 | """ 4 | 5 | # Overall output directory for results 6 | output_dir: str = "" 7 | 8 | # upper bound of the number of conversation rounds for the agent 9 | conv_round_limit: int = 15 10 | 11 | context_retrieval_round_limit: int = 15 12 | 13 | # whether to perform sbfl 14 | enable_sbfl: bool = False 15 | 16 | # whether to perform layered search 17 | enable_layered: bool = True 18 | 19 | # whether to perform our own validation 20 | enable_validation: bool = False 21 | 22 | # whether to do angelic debugging 23 | enable_angelic: bool = False 24 | 25 | # whether to do perfect angelic debugging 26 | enable_perfect_angelic: bool = False 27 | 28 | 29 | # A special mode to only save SBFL result and exit 30 | only_save_sbfl_result: bool = False 31 | 32 | # timeout for test cmd execution, currently set to 5 min 33 | test_exec_timeout: int = 300 34 | 35 | 36 | # Used with disable_patch_generation - constrains or extends the amount of context retrieval rounds 37 | context_generation_limit: int = -1 38 | 39 | get_version: bool = False 40 | 41 | enable_web_search: bool = False 42 | 43 | agent_mode: str = "multi_agent" 44 | 45 | disable_memory_pool: bool = False 46 | 47 | disable_context_retrieval: bool = False 48 | 49 | disable_run_test: bool = False 50 | 51 | disable_download_test_resources: bool = False 52 | 53 | using_ubuntu_only: bool = False -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_need_test_resources.txt: -------------------------------------------------------------------------------- 1 | nodejs__undici-3842 2 | eclipse-vertx__vert.x-3657 3 | eclipse-vertx__vert.x-1300 4 | python-pillow__Pillow-8056 5 | python-pillow__Pillow-7823 6 | python-pillow__Pillow-7883 7 | python-pillow__Pillow-7496 8 | python-pillow__Pillow-7274 9 | python-pillow__Pillow-7151 10 | python-pillow__Pillow-7111 11 | python-pillow__Pillow-6954 12 | python-pillow__Pillow-6852 13 | python-pillow__Pillow-6517 14 | python-pillow__Pillow-6481 15 | python-pillow__Pillow-6381 16 | python-pillow__Pillow-6097 17 | python-pillow__Pillow-6086 18 | python-pillow__Pillow-5756 19 | python-pillow__Pillow-5557 20 | python-pillow__Pillow-5208 21 | python-pillow__Pillow-5125 22 | python-pillow__Pillow-4664 23 | python-pillow__Pillow-4471 24 | python-pillow__Pillow-4240 25 | python-pillow__Pillow-4063 26 | python-pillow__Pillow-4147 27 | python-pillow__Pillow-3897 28 | python-pillow__Pillow-3673 29 | python-pillow__Pillow-3625 30 | python-pillow__Pillow-3588 31 | python-pillow__Pillow-3532 32 | python-pillow__Pillow-3479 33 | python-pillow__Pillow-3364 34 | python-pillow__Pillow-3023 35 | python-pillow__Pillow-2899 36 | python-pillow__Pillow-2328 37 | python-pillow__Pillow-1985 38 | python-pillow__Pillow-1539 39 | python-pillow__Pillow-1152 40 | python-pillow__Pillow-1302 41 | python-pillow__Pillow-537 42 | python-pillow__Pillow-525 43 | python-pillow__Pillow-380 44 | python-pillow__Pillow-333 45 | -------------------------------------------------------------------------------- /app/globals_mut.py: -------------------------------------------------------------------------------- 1 | """ 2 | A global store, for values that can be mutated in multiprocessing, along with their related values. 
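The counters below are multiprocessing.Value objects and are only updated while holding their locks, via the incre_* helper functions. 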
3 | """ 4 | 5 | import multiprocessing 6 | 7 | # to be set at beginning 8 | total_num_tasks = 0 9 | num_completed_tasks = multiprocessing.Value("i", 0) 10 | 11 | 12 | # to be set at beginning 13 | total_num_task_groups = 0 14 | num_completed_task_groups = multiprocessing.Value("i", 0) 15 | 16 | 17 | def init_total_num_tasks(n: int): 18 | global total_num_tasks 19 | total_num_tasks = n 20 | 21 | 22 | def init_total_num_task_groups(n: int): 23 | global total_num_task_groups 24 | total_num_task_groups = n 25 | 26 | 27 | def incre_completed_tasks() -> int: 28 | with num_completed_tasks.get_lock(): 29 | num_completed_tasks.value += 1 30 | return num_completed_tasks.value 31 | 32 | 33 | def incre_completed_task_groups() -> int: 34 | with num_completed_task_groups.get_lock(): 35 | num_completed_task_groups.value += 1 36 | return num_completed_task_groups.value 37 | 38 | 39 | def incre_task_return_msg() -> str: 40 | completed = incre_completed_tasks() 41 | completed_groups = num_completed_task_groups.value 42 | return f">>> Completed {completed}/{total_num_tasks} tasks. For groups, completed {completed_groups}/{total_num_task_groups} so far." 43 | 44 | 45 | def incre_task_group_return_msg() -> str: 46 | completed = incre_completed_task_groups() 47 | return f">>>>>> Completed {completed}/{total_num_task_groups} task groups." 48 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_11.txt: -------------------------------------------------------------------------------- 1 | python-attrs__attrs-367 2 | python-attrs__attrs-394 3 | python-attrs__attrs-383 4 | python-attrs__attrs-286 5 | python-attrs__attrs-292 6 | python-attrs__attrs-343 7 | python-attrs__attrs-277 8 | python-attrs__attrs-229 9 | python-attrs__attrs-60 10 | python-attrs__attrs-181 11 | python-attrs__attrs-186 12 | mochajs__mocha-5292 13 | mochajs__mocha-5325 14 | mochajs__mocha-5165 15 | mochajs__mocha-5231 16 | mochajs__mocha-5198 17 | mochajs__mocha-5032 18 | mochajs__mocha-4985 19 | mochajs__mocha-5074 20 | mochajs__mocha-4842 21 | mochajs__mocha-4835 22 | mochajs__mocha-4771 23 | mochajs__mocha-4807 24 | mochajs__mocha-4746 25 | mochajs__mocha-4668 26 | mochajs__mocha-4614 27 | mochajs__mocha-4638 28 | mochajs__mocha-4557 29 | mochajs__mocha-4607 30 | mochajs__mocha-4418 31 | mochajs__mocha-4382 32 | mochajs__mocha-4315 33 | mochajs__mocha-4165 34 | mochajs__mocha-4234 35 | mochajs__mocha-4147 36 | mochajs__mocha-4063 37 | mochajs__mocha-4068 38 | mochajs__mocha-3834 39 | mochajs__mocha-3816 40 | mochajs__mocha-3767 41 | mochajs__mocha-3737 42 | mochajs__mocha-3699 43 | mochajs__mocha-3632 44 | mochajs__mocha-3375 45 | mochajs__mocha-3222 46 | mochajs__mocha-3268 47 | mochajs__mocha-3024 48 | mochajs__mocha-3143 49 | mochajs__mocha-2746 50 | mochajs__mocha-2696 51 | mochajs__mocha-2642 52 | mochajs__mocha-2513 53 | mochajs__mocha-2479 54 | mochajs__mocha-2499 55 | mochajs__mocha-2345 56 | mochajs__mocha-2094 57 | mochajs__mocha-2081 58 | mochajs__mocha-1965 59 | mochajs__mocha-1410 60 | mochajs__mocha-1520 61 | -------------------------------------------------------------------------------- /data_collection/versioning/get_version_mix.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | usage() { 5 | cat >&2 <<EOF 6 | Usage: $0 <instance_file> <output_dir> [testbed_dir] 7 | 8 | <instance_file> Original task instance file (.json or .jsonl) 9 | <output_dir> Directory to store intermediate and final results 10 | [testbed_dir] (Optional) Temporary 
directory for git-clone, defaults to ./testbed 11 | EOF 12 | exit 1 13 | } 14 | 15 | # Check arguments 16 | if [ "$#" -lt 2 ] || [ "$#" -gt 3 ]; then 17 | usage 18 | fi 19 | 20 | INSTANCE="$1" 21 | OUTDIR="$2" 22 | # Use the third argument if it exists, otherwise default to ./testbed 23 | TESTBED="${3:-./testbed}" 24 | 25 | echo "🔧 Using testbed directory: $TESTBED" 26 | echo "🔧 Using output directory: $OUTDIR" 27 | 28 | mkdir -p "$TESTBED" "$OUTDIR" 29 | 30 | # 1. Extract by-github versions 31 | echo "👉 1. Getting by-github versions..." 32 | python get_versions.py \ 33 | --instances_path "$INSTANCE" \ 34 | --num_workers 100 \ 35 | --retrieval_method github \ 36 | --output_dir "$OUTDIR" 37 | 38 | # 2. Extract by-git versions 39 | echo "👉 2. Getting by-git versions..." 40 | python get_versions_by_git.py \ 41 | --instance_path "$INSTANCE" \ 42 | --testbed "$TESTBED" \ 43 | --max_workers 100 \ 44 | --output_dir "$OUTDIR" \ 45 | --last_stage_output_dir "$OUTDIR" 46 | 47 | # 3. Merge into the final version 48 | echo "👉 3. Merging both results into the final version..." 49 | python merge_final_data.py "$OUTDIR" 50 | 51 | echo "✅ All done. Results are saved in $OUTDIR" -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_1.txt: -------------------------------------------------------------------------------- 1 | assertj__assertj-3820 2 | assertj__assertj-3735 3 | assertj__assertj-3724 4 | assertj__assertj-3325 5 | assertj__assertj-3318 6 | assertj__assertj-3691 7 | assertj__assertj-3120 8 | assertj__assertj-3056 9 | assertj__assertj-2726 10 | assertj__assertj-2685 11 | assertj__assertj-2549 12 | assertj__assertj-2410 13 | assertj__assertj-2297 14 | assertj__assertj-2247 15 | assertj__assertj-2200 16 | assertj__assertj-2193 17 | assertj__assertj-2042 18 | assertj__assertj-1983 19 | assertj__assertj-1890 20 | assertj__assertj-1769 21 | assertj__assertj-1743 22 | assertj__assertj-1629 23 | assertj__assertj-1332 24 | assertj__assertj-1568 25 | assertj__assertj-1511 26 | assertj__assertj-1243 27 | assertj__assertj-1204 28 | assertj__assertj-1184 29 | assertj__assertj-1134 30 | assertj__assertj-1014 31 | assertj__assertj-813 32 | assertj__assertj-656 33 | assertj__assertj-54 34 | assertj__assertj-73 35 | assertj__assertj-169 36 | assertj__assertj-225 37 | assertj__assertj-101 38 | assertj__assertj-120 39 | assertj__assertj-613 40 | nodejs__undici-4178 41 | nodejs__undici-4131 42 | nodejs__undici-4112 43 | nodejs__undici-4088 44 | nodejs__undici-3977 45 | nodejs__undici-3855 46 | nodejs__undici-3941 47 | nodejs__undici-3833 48 | nodejs__undici-3842 49 | nodejs__undici-3758 50 | nodejs__undici-3631 51 | nodejs__undici-3566 52 | nodejs__undici-3495 53 | nodejs__undici-3505 54 | nodejs__undici-3294 55 | nodejs__undici-3251 56 | nodejs__undici-3206 57 | nodejs__undici-3169 58 | nodejs__undici-3105 59 | nodejs__undici-3047 60 | nodejs__undici-3005 61 | -------------------------------------------------------------------------------- /run/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | MODEL="deepseek/deepseek-chat-v3-0324" 5 | # "google/gemini-2.5-flash-preview" 6 | # "gpt-4.1-mini" 7 | REPO_NAME="SetupBench-lite" 8 | BASE_TASK_DIR="data_collection/collect/${REPO_NAME}" 9 | TASKS_MAP="${BASE_TASK_DIR}/merged_instances_versions.jsonl" 10 | SETUP_DIR="testbed" 11 | ROUND=5 12 | NUM_PROCS=5 13 | TEMP=0.2 14 | BATCH_COUNT=1 15 | # we split 
SetupBench-lite into 17 batches; each batch contains 40 raw issue instances. 16 | for f in "$TASKS_MAP"; do 17 | if [ ! -f "$f" ]; then 18 | echo "❌ Missing file: $f" 19 | exit 1 20 | fi 21 | done 22 | 23 | cleanup() { 24 | docker ps -a -q | xargs -r docker rm -f || true 25 | docker image prune -af || true 26 | rm -rf "$SETUP_DIR" 27 | } 28 | 29 | for idx in $(seq 1 $BATCH_COUNT); do 30 | TASK_LIST_FILE="${BASE_TASK_DIR}/batch_${idx}.txt" 31 | if [ ! -f "$TASK_LIST_FILE" ]; then 32 | echo "⚠️ Skipping missing ${TASK_LIST_FILE}" 33 | continue 34 | fi 35 | 36 | cleanup 37 | 38 | OUT_DIR="output_test1/${REPO_NAME}/${MODEL}/round_${ROUND}_batch_${idx}" 39 | RESULT_DIR="output_test1/${REPO_NAME}/${MODEL}/results" 40 | mkdir -p "$OUT_DIR" 41 | 42 | echo "▶️ Running batch_${idx} with normal mode" 43 | 44 | python app/main.py swe-bench \ 45 | --model "$MODEL" \ 46 | --tasks-map "$TASKS_MAP" \ 47 | --task-list-file "$TASK_LIST_FILE" \ 48 | --num-processes "$NUM_PROCS" \ 49 | --model-temperature "$TEMP" \ 50 | --conv-round-limit "$ROUND" \ 51 | --output-dir "$OUT_DIR" \ 52 | --setup-dir "$SETUP_DIR" \ 53 | --results-path "$RESULT_DIR" 54 | done 55 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_7.txt: -------------------------------------------------------------------------------- 1 | iamkun__dayjs-2377 2 | iamkun__dayjs-2231 3 | iamkun__dayjs-1571 4 | iamkun__dayjs-1321 5 | iamkun__dayjs-1414 6 | iamkun__dayjs-1229 7 | iamkun__dayjs-1086 8 | iamkun__dayjs-1101 9 | iamkun__dayjs-1003 10 | iamkun__dayjs-1023 11 | iamkun__dayjs-996 12 | iamkun__dayjs-980 13 | iamkun__dayjs-851 14 | iamkun__dayjs-1112 15 | iamkun__dayjs-891 16 | iamkun__dayjs-768 17 | iamkun__dayjs-719 18 | iamkun__dayjs-678 19 | iamkun__dayjs-539 20 | iamkun__dayjs-528 21 | iamkun__dayjs-453 22 | iamkun__dayjs-76 23 | iamkun__dayjs-55 24 | iamkun__dayjs-337 25 | iamkun__dayjs-1161 26 | iamkun__dayjs-952 27 | iamkun__dayjs-162 28 | python-pillow__Pillow-8852 29 | python-pillow__Pillow-8792 30 | python-pillow__Pillow-8701 31 | python-pillow__Pillow-8535 32 | python-pillow__Pillow-8602 33 | python-pillow__Pillow-8635 34 | python-pillow__Pillow-8476 35 | python-pillow__Pillow-8422 36 | python-pillow__Pillow-8231 37 | python-pillow__Pillow-8366 38 | python-pillow__Pillow-8056 39 | python-pillow__Pillow-8063 40 | python-pillow__Pillow-7948 41 | python-pillow__Pillow-7870 42 | python-pillow__Pillow-7823 43 | python-pillow__Pillow-7883 44 | python-pillow__Pillow-7496 45 | python-pillow__Pillow-7481 46 | python-pillow__Pillow-7383 47 | python-pillow__Pillow-7420 48 | python-pillow__Pillow-7412 49 | python-pillow__Pillow-7302 50 | python-pillow__Pillow-7274 51 | python-pillow__Pillow-7151 52 | python-pillow__Pillow-7078 53 | python-pillow__Pillow-7111 54 | python-pillow__Pillow-6954 55 | python-pillow__Pillow-6890 56 | python-pillow__Pillow-6852 57 | python-pillow__Pillow-6830 58 | python-pillow__Pillow-6819 59 | python-pillow__Pillow-6783 60 | python-pillow__Pillow-6647 61 | -------------------------------------------------------------------------------- /app/agents/train_env_gen_agent/tools/finish.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Description: A simple finish tool with a "submit" command. 5 | 6 | Notes about the `submit` command: 7 | * When invoked with `--result`, the provided string is used for submitting required task results (e.g., localization files). 
8 | * If no `--result` is provided, it defaults to an empty string. 9 | 10 | **Parameters:** 11 | 1. **command** (`string`, required): The command to run. Currently allowed option is: `submit`. 12 | - Allowed value: [`submit`] 13 | 2. **result** (`string`, optional): The result text to submit. Defaults to an empty string. 14 | """ 15 | 16 | import argparse 17 | import sys 18 | 19 | 20 | def submit(result: str = ""): 21 | """ 22 | Submits a final result, printing a message that includes the result. 23 | """ 24 | print("<<>>") 25 | # if result: 26 | # print(f"Final result submitted: {result}") 27 | # else: 28 | # print("No result provided.") 29 | # You can add more logic here as needed 30 | 31 | 32 | def main(): 33 | parser = argparse.ArgumentParser( 34 | description="submit tool: run the `submit` command with an optional `--result` argument." 35 | ) 36 | parser.add_argument("command", help="Subcommand to run (currently only `submit`).") 37 | parser.add_argument( 38 | "--result", help="The result text to submit (optional).", default="" 39 | ) 40 | 41 | args = parser.parse_args() 42 | 43 | if args.command == "submit": 44 | submit(args.result) 45 | else: 46 | print(f"Unknown command '{args.command}'. Only `submit` is supported.") 47 | sys.exit(1) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_9.txt: -------------------------------------------------------------------------------- 1 | python-pillow__Pillow-3338 2 | python-pillow__Pillow-3233 3 | python-pillow__Pillow-3086 4 | python-pillow__Pillow-3023 5 | python-pillow__Pillow-2899 6 | python-pillow__Pillow-2852 7 | python-pillow__Pillow-2683 8 | python-pillow__Pillow-2410 9 | python-pillow__Pillow-2641 10 | python-pillow__Pillow-2399 11 | python-pillow__Pillow-2330 12 | python-pillow__Pillow-2328 13 | python-pillow__Pillow-2262 14 | python-pillow__Pillow-2115 15 | python-pillow__Pillow-2131 16 | python-pillow__Pillow-2103 17 | python-pillow__Pillow-1988 18 | python-pillow__Pillow-1985 19 | python-pillow__Pillow-1647 20 | python-pillow__Pillow-1686 21 | python-pillow__Pillow-1594 22 | python-pillow__Pillow-1539 23 | python-pillow__Pillow-1401 24 | python-pillow__Pillow-1400 25 | python-pillow__Pillow-1152 26 | python-pillow__Pillow-1302 27 | python-pillow__Pillow-997 28 | python-pillow__Pillow-808 29 | python-pillow__Pillow-898 30 | python-pillow__Pillow-669 31 | python-pillow__Pillow-638 32 | python-pillow__Pillow-537 33 | python-pillow__Pillow-525 34 | python-pillow__Pillow-380 35 | python-pillow__Pillow-364 36 | python-pillow__Pillow-333 37 | python-pillow__Pillow-171 38 | python-pillow__Pillow-228 39 | python-pillow__Pillow-64 40 | pallets__click-2840 41 | pallets__click-2622 42 | pallets__click-2607 43 | pallets__click-2591 44 | pallets__click-2397 45 | pallets__click-2333 46 | pallets__click-2271 47 | pallets__click-2151 48 | pallets__click-2094 49 | pallets__click-2219 50 | pallets__click-2030 51 | pallets__click-1998 52 | pallets__click-1840 53 | pallets__click-1829 54 | pallets__click-1785 55 | pallets__click-1784 56 | pallets__click-1786 57 | pallets__click-1543 58 | pallets__click-1402 59 | pallets__click-1318 60 | pallets__click-1304 61 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_10.txt: -------------------------------------------------------------------------------- 1 | pallets__click-1261 2 | pallets__click-1167 3 
| pallets__click-1014 4 | pallets__click-1098 5 | pallets__click-865 6 | pallets__click-552 7 | pallets__click-706 8 | pallets__click-545 9 | pallets__click-240 10 | pallets__click-212 11 | pallets__click-123 12 | reduxjs__redux-toolkit-4758 13 | reduxjs__redux-toolkit-4768 14 | reduxjs__redux-toolkit-4762 15 | reduxjs__redux-toolkit-4869 16 | reduxjs__redux-toolkit-4732 17 | reduxjs__redux-toolkit-4204 18 | reduxjs__redux-toolkit-4084 19 | reduxjs__redux-toolkit-4082 20 | reduxjs__redux-toolkit-4055 21 | reduxjs__redux-toolkit-3878 22 | reduxjs__redux-toolkit-3800 23 | reduxjs__redux-toolkit-3414 24 | reduxjs__redux-toolkit-3388 25 | reduxjs__redux-toolkit-3188 26 | reduxjs__redux-toolkit-3116 27 | reduxjs__redux-toolkit-3089 28 | reduxjs__redux-toolkit-2835 29 | reduxjs__redux-toolkit-2804 30 | reduxjs__redux-toolkit-2595 31 | reduxjs__redux-toolkit-2363 32 | reduxjs__redux-toolkit-2225 33 | reduxjs__redux-toolkit-2000 34 | reduxjs__redux-toolkit-1984 35 | reduxjs__redux-toolkit-1662 36 | reduxjs__redux-toolkit-1496 37 | reduxjs__redux-toolkit-1520 38 | python-attrs__attrs-1417 39 | python-attrs__attrs-1410 40 | python-attrs__attrs-1383 41 | python-attrs__attrs-1329 42 | python-attrs__attrs-1172 43 | python-attrs__attrs-1267 44 | python-attrs__attrs-1009 45 | python-attrs__attrs-1319 46 | python-attrs__attrs-1122 47 | python-attrs__attrs-969 48 | python-attrs__attrs-806 49 | python-attrs__attrs-760 50 | python-attrs__attrs-830 51 | python-attrs__attrs-886 52 | python-attrs__attrs-763 53 | python-attrs__attrs-684 54 | python-attrs__attrs-712 55 | python-attrs__attrs-660 56 | python-attrs__attrs-607 57 | python-attrs__attrs-563 58 | python-attrs__attrs-586 59 | python-attrs__attrs-489 60 | python-attrs__attrs-556 61 | -------------------------------------------------------------------------------- /data_collection/versioning/README.md: -------------------------------------------------------------------------------- 1 | # Data Versioning 2 | 3 | This directory provides a hybrid strategy to accurately version task instances. It combines two methods for robust results, prioritizing accuracy while maintaining automation. 4 | 5 | *** 6 | 7 | ### Our Approach 8 | 9 | Our pipeline intelligently combines two methods: 10 | 11 | 1. **Pattern-Based Method (Primary)** 12 | Inspired by SWE-bench, this method uses a predefined map of repository paths (e.g., `__init__.py`, `package.json`) and regex patterns to find the exact version string. It is extremely fast and accurate for supported projects. 13 | 14 | 2. **Git-Based Method (Fallback)** 15 | This fully automated method infers the version by finding the nearest tag to a commit using `git describe --tags`. It requires no manual setup but is more time-consuming due to the need for repository cloning and checkout operations. 16 | 17 | If a pattern is not defined for a repository, the system will automatically use the Git-Based Method for that task. However, for the best overall performance and accuracy, we still recommend using both methods together. This hybrid approach ensures we can efficiently retrieve version information for the vast majority of task instances. 18 | *** 19 | 20 | ### How to Use 21 | 22 | Run the provided shell script to execute the entire versioning pipeline. The script runs both methods and merges the results into a final versioned file. 23 | 24 | **Command:** 25 | ```bash 26 | bash get_version_mix.sh <instance_file> <output_dir> [testbed_dir] 27 | ``` 28 | 29 | **Arguments:** 30 | 31 | * `<instance_file>`: **(Required)** Path to your input task instances file. 
32 | * `<output_dir>`: **(Required)** Directory to store the results. 33 | * `[testbed_dir]`: **(Optional)** Temporary directory for cloning repos. Defaults to `./testbed`. 34 | 35 | The final, merged output will be saved in the `<output_dir>`. 36 | 37 | 38 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_8.txt: -------------------------------------------------------------------------------- 1 | python-pillow__Pillow-6582 2 | python-pillow__Pillow-6517 3 | python-pillow__Pillow-6500 4 | python-pillow__Pillow-6481 5 | python-pillow__Pillow-6431 6 | python-pillow__Pillow-6381 7 | python-pillow__Pillow-6265 8 | python-pillow__Pillow-6234 9 | python-pillow__Pillow-6242 10 | python-pillow__Pillow-6188 11 | python-pillow__Pillow-6128 12 | python-pillow__Pillow-6124 13 | python-pillow__Pillow-6101 14 | python-pillow__Pillow-6097 15 | python-pillow__Pillow-6086 16 | python-pillow__Pillow-6054 17 | python-pillow__Pillow-5891 18 | python-pillow__Pillow-5845 19 | python-pillow__Pillow-5839 20 | python-pillow__Pillow-5696 21 | python-pillow__Pillow-5756 22 | python-pillow__Pillow-5647 23 | python-pillow__Pillow-5609 24 | python-pillow__Pillow-5572 25 | python-pillow__Pillow-5557 26 | python-pillow__Pillow-5554 27 | python-pillow__Pillow-5549 28 | python-pillow__Pillow-5437 29 | python-pillow__Pillow-5425 30 | python-pillow__Pillow-5417 31 | python-pillow__Pillow-5330 32 | python-pillow__Pillow-5313 33 | python-pillow__Pillow-5208 34 | python-pillow__Pillow-5139 35 | python-pillow__Pillow-5125 36 | python-pillow__Pillow-4966 37 | python-pillow__Pillow-4677 38 | python-pillow__Pillow-4749 39 | python-pillow__Pillow-4741 40 | python-pillow__Pillow-4664 41 | python-pillow__Pillow-4605 42 | python-pillow__Pillow-4474 43 | python-pillow__Pillow-4471 44 | python-pillow__Pillow-4283 45 | python-pillow__Pillow-4240 46 | python-pillow__Pillow-4063 47 | python-pillow__Pillow-4003 48 | python-pillow__Pillow-4147 49 | python-pillow__Pillow-3897 50 | python-pillow__Pillow-3859 51 | python-pillow__Pillow-3825 52 | python-pillow__Pillow-3778 53 | python-pillow__Pillow-3673 54 | python-pillow__Pillow-3625 55 | python-pillow__Pillow-3588 56 | python-pillow__Pillow-3532 57 | python-pillow__Pillow-3558 58 | python-pillow__Pillow-3513 59 | python-pillow__Pillow-3479 60 | python-pillow__Pillow-3364 61 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_6.txt: -------------------------------------------------------------------------------- 1 | checkstyle__checkstyle-14553 2 | checkstyle__checkstyle-14497 3 | checkstyle__checkstyle-13667 4 | checkstyle__checkstyle-13357 5 | checkstyle__checkstyle-13320 6 | checkstyle__checkstyle-12487 7 | checkstyle__checkstyle-12831 8 | checkstyle__checkstyle-12516 9 | checkstyle__checkstyle-12444 10 | checkstyle__checkstyle-12105 11 | checkstyle__checkstyle-11972 12 | checkstyle__checkstyle-11517 13 | checkstyle__checkstyle-11601 14 | checkstyle__checkstyle-11383 15 | checkstyle__checkstyle-11264 16 | checkstyle__checkstyle-11245 17 | checkstyle__checkstyle-11482 18 | checkstyle__checkstyle-11173 19 | checkstyle__checkstyle-10930 20 | checkstyle__checkstyle-10958 21 | checkstyle__checkstyle-10904 22 | checkstyle__checkstyle-10922 23 | checkstyle__checkstyle-10857 24 | checkstyle__checkstyle-10762 25 | checkstyle__checkstyle-10280 26 | checkstyle__checkstyle-10216 27 | checkstyle__checkstyle-9942 28 | checkstyle__checkstyle-9759 29 | 
checkstyle__checkstyle-9728 30 | checkstyle__checkstyle-9539 31 | checkstyle__checkstyle-9744 32 | checkstyle__checkstyle-9370 33 | checkstyle__checkstyle-9209 34 | checkstyle__checkstyle-9261 35 | checkstyle__checkstyle-9018 36 | checkstyle__checkstyle-8984 37 | checkstyle__checkstyle-8913 38 | checkstyle__checkstyle-8907 39 | checkstyle__checkstyle-8720 40 | checkstyle__checkstyle-8420 41 | checkstyle__checkstyle-8273 42 | checkstyle__checkstyle-8103 43 | checkstyle__checkstyle-8127 44 | checkstyle__checkstyle-8070 45 | checkstyle__checkstyle-8008 46 | checkstyle__checkstyle-7899 47 | checkstyle__checkstyle-7894 48 | checkstyle__checkstyle-7853 49 | checkstyle__checkstyle-7851 50 | checkstyle__checkstyle-7798 51 | checkstyle__checkstyle-7193 52 | checkstyle__checkstyle-7172 53 | checkstyle__checkstyle-6882 54 | checkstyle__checkstyle-6567 55 | checkstyle__checkstyle-6515 56 | checkstyle__checkstyle-4463 57 | checkstyle__checkstyle-3366 58 | checkstyle__checkstyle-1485 59 | checkstyle__checkstyle-1399 60 | iamkun__dayjs-2369 61 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_3.txt: -------------------------------------------------------------------------------- 1 | apollographql__apollo-client-1664 2 | apollographql__apollo-client-1540 3 | apollographql__apollo-client-1661 4 | apollographql__apollo-client-1492 5 | apollographql__apollo-client-1270 6 | apollographql__apollo-client-1169 7 | apollographql__apollo-client-1095 8 | apollographql__apollo-client-1069 9 | apollographql__apollo-client-1054 10 | apollographql__apollo-client-683 11 | apollographql__apollo-client-581 12 | apollographql__apollo-client-493 13 | apollographql__apollo-client-465 14 | apollographql__apollo-client-445 15 | apollographql__apollo-client-313 16 | apollographql__apollo-client-201 17 | apollographql__apollo-client-200 18 | apollographql__apollo-client-140 19 | apollographql__apollo-client-111 20 | apollographql__apollo-client-133 21 | eclipse-vertx__vert.x-5347 22 | eclipse-vertx__vert.x-5273 23 | eclipse-vertx__vert.x-5137 24 | eclipse-vertx__vert.x-4904 25 | eclipse-vertx__vert.x-5346 26 | eclipse-vertx__vert.x-4616 27 | eclipse-vertx__vert.x-4597 28 | eclipse-vertx__vert.x-4485 29 | eclipse-vertx__vert.x-4423 30 | eclipse-vertx__vert.x-4413 31 | eclipse-vertx__vert.x-4377 32 | eclipse-vertx__vert.x-4311 33 | eclipse-vertx__vert.x-4307 34 | eclipse-vertx__vert.x-4225 35 | eclipse-vertx__vert.x-4191 36 | eclipse-vertx__vert.x-4172 37 | eclipse-vertx__vert.x-4164 38 | eclipse-vertx__vert.x-4160 39 | eclipse-vertx__vert.x-4134 40 | eclipse-vertx__vert.x-4080 41 | eclipse-vertx__vert.x-4125 42 | eclipse-vertx__vert.x-4053 43 | eclipse-vertx__vert.x-4037 44 | eclipse-vertx__vert.x-3946 45 | eclipse-vertx__vert.x-3913 46 | eclipse-vertx__vert.x-3853 47 | eclipse-vertx__vert.x-3800 48 | eclipse-vertx__vert.x-3764 49 | eclipse-vertx__vert.x-3754 50 | eclipse-vertx__vert.x-3718 51 | eclipse-vertx__vert.x-3663 52 | eclipse-vertx__vert.x-3657 53 | eclipse-vertx__vert.x-3604 54 | eclipse-vertx__vert.x-3607 55 | eclipse-vertx__vert.x-3559 56 | eclipse-vertx__vert.x-3428 57 | eclipse-vertx__vert.x-3418 58 | eclipse-vertx__vert.x-3384 59 | eclipse-vertx__vert.x-3247 60 | eclipse-vertx__vert.x-3197 61 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_4.txt: -------------------------------------------------------------------------------- 1 | eclipse-vertx__vert.x-3101 
2 | eclipse-vertx__vert.x-3016 3 | eclipse-vertx__vert.x-2929 4 | eclipse-vertx__vert.x-2883 5 | eclipse-vertx__vert.x-2726 6 | eclipse-vertx__vert.x-2723 7 | eclipse-vertx__vert.x-2724 8 | eclipse-vertx__vert.x-2631 9 | eclipse-vertx__vert.x-2458 10 | eclipse-vertx__vert.x-2392 11 | eclipse-vertx__vert.x-2366 12 | eclipse-vertx__vert.x-2309 13 | eclipse-vertx__vert.x-2354 14 | eclipse-vertx__vert.x-2209 15 | eclipse-vertx__vert.x-2108 16 | eclipse-vertx__vert.x-2083 17 | eclipse-vertx__vert.x-2074 18 | eclipse-vertx__vert.x-2073 19 | eclipse-vertx__vert.x-2064 20 | eclipse-vertx__vert.x-2017 21 | eclipse-vertx__vert.x-1907 22 | eclipse-vertx__vert.x-1799 23 | eclipse-vertx__vert.x-1770 24 | eclipse-vertx__vert.x-1615 25 | eclipse-vertx__vert.x-1604 26 | eclipse-vertx__vert.x-1565 27 | eclipse-vertx__vert.x-1476 28 | eclipse-vertx__vert.x-1366 29 | eclipse-vertx__vert.x-1300 30 | eclipse-vertx__vert.x-1287 31 | tailwindlabs__tailwindcss-17647 32 | tailwindlabs__tailwindcss-17754 33 | tailwindlabs__tailwindcss-17301 34 | tailwindlabs__tailwindcss-16800 35 | tailwindlabs__tailwindcss-16631 36 | tailwindlabs__tailwindcss-16078 37 | tailwindlabs__tailwindcss-16103 38 | tailwindlabs__tailwindcss-16069 39 | tailwindlabs__tailwindcss-15576 40 | tailwindlabs__tailwindcss-15318 41 | tailwindlabs__tailwindcss-15183 42 | tailwindlabs__tailwindcss-15003 43 | tailwindlabs__tailwindcss-14962 44 | tailwindlabs__tailwindcss-14981 45 | tailwindlabs__tailwindcss-14993 46 | tailwindlabs__tailwindcss-14744 47 | tailwindlabs__tailwindcss-14747 48 | tailwindlabs__tailwindcss-14877 49 | tailwindlabs__tailwindcss-14269 50 | tailwindlabs__tailwindcss-13949 51 | tailwindlabs__tailwindcss-13770 52 | tailwindlabs__tailwindcss-13379 53 | tailwindlabs__tailwindcss-11470 54 | tailwindlabs__tailwindcss-12404 55 | tailwindlabs__tailwindcss-11157 56 | tailwindlabs__tailwindcss-12113 57 | tailwindlabs__tailwindcss-11002 58 | tailwindlabs__tailwindcss-10288 59 | tailwindlabs__tailwindcss-10074 60 | tailwindlabs__tailwindcss-10601 61 | -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_5.txt: -------------------------------------------------------------------------------- 1 | tailwindlabs__tailwindcss-9704 2 | tailwindlabs__tailwindcss-9405 3 | tailwindlabs__tailwindcss-9319 4 | tailwindlabs__tailwindcss-9208 5 | tailwindlabs__tailwindcss-8773 6 | tailwindlabs__tailwindcss-8687 7 | tailwindlabs__tailwindcss-8622 8 | tailwindlabs__tailwindcss-8448 9 | tailwindlabs__tailwindcss-8125 10 | tailwindlabs__tailwindcss-7789 11 | tailwindlabs__tailwindcss-8091 12 | tailwindlabs__tailwindcss-7565 13 | tailwindlabs__tailwindcss-7291 14 | tailwindlabs__tailwindcss-7163 15 | tailwindlabs__tailwindcss-6519 16 | tailwindlabs__tailwindcss-6469 17 | tailwindlabs__tailwindcss-5470 18 | tailwindlabs__tailwindcss-5245 19 | tailwindlabs__tailwindcss-4852 20 | tailwindlabs__tailwindcss-4471 21 | tailwindlabs__tailwindcss-4263 22 | tailwindlabs__tailwindcss-4214 23 | tailwindlabs__tailwindcss-2951 24 | tailwindlabs__tailwindcss-2331 25 | tailwindlabs__tailwindcss-2322 26 | tailwindlabs__tailwindcss-2271 27 | tailwindlabs__tailwindcss-2211 28 | tailwindlabs__tailwindcss-2108 29 | tailwindlabs__tailwindcss-2075 30 | tailwindlabs__tailwindcss-1083 31 | tailwindlabs__tailwindcss-1680 32 | tailwindlabs__tailwindcss-1094 33 | tailwindlabs__tailwindcss-1799 34 | tailwindlabs__tailwindcss-992 35 | tailwindlabs__tailwindcss-847 36 | tailwindlabs__tailwindcss-681 37 | 
tailwindlabs__tailwindcss-516 38 | tailwindlabs__tailwindcss-497 39 | tailwindlabs__tailwindcss-418 40 | tailwindlabs__tailwindcss-255 41 | tailwindlabs__tailwindcss-77 42 | tailwindlabs__tailwindcss-82 43 | checkstyle__checkstyle-16515 44 | checkstyle__checkstyle-16605 45 | checkstyle__checkstyle-16418 46 | checkstyle__checkstyle-15969 47 | checkstyle__checkstyle-15822 48 | checkstyle__checkstyle-15686 49 | checkstyle__checkstyle-15681 50 | checkstyle__checkstyle-15430 51 | checkstyle__checkstyle-15334 52 | checkstyle__checkstyle-15337 53 | checkstyle__checkstyle-15358 54 | checkstyle__checkstyle-15212 55 | checkstyle__checkstyle-15199 56 | checkstyle__checkstyle-15127 57 | checkstyle__checkstyle-14983 58 | checkstyle__checkstyle-14882 59 | checkstyle__checkstyle-14804 60 | checkstyle__checkstyle-14623 61 | -------------------------------------------------------------------------------- /app/agents/train_env_gen_agent/tools/execute_bash.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Description: Execute a bash command in the terminal, with Python version compatibility. 5 | 6 | Parameters: 7 | command (string, required): The bash command to execute, passed as a single positional argument. For example: 'python my_script.py'. 8 | """ 9 | 10 | import argparse 11 | import subprocess 12 | import sys 13 | 14 | BLOCKED_BASH_COMMANDS = ["git", "ipython", "jupyter", "nohup"] 15 | 16 | 17 | def run_command(cmd): 18 | try: 19 | # Try to use the new parameters (Python 3.7+) 20 | return subprocess.run(cmd, shell=True, capture_output=True, text=True) 21 | except TypeError: 22 | # Fallback for Python 3.5 and 3.6: 23 | return subprocess.run( 24 | cmd, 25 | shell=True, 26 | stdout=subprocess.PIPE, 27 | stderr=subprocess.PIPE, 28 | universal_newlines=True, 29 | ) 30 | 31 | 32 | def main(): 33 | parser = argparse.ArgumentParser(description="Execute a bash command.") 34 | parser.add_argument( 35 | "command", 36 | type=str, 37 | help="The command (and optional arguments) to execute. For example: 'python my_script.py'", 38 | ) 39 | args = parser.parse_args() 40 | 41 | # Check if the command is blocked 42 | first_token = args.command.strip().split()[0] 43 | if first_token in BLOCKED_BASH_COMMANDS: 44 | print( 45 | f"Bash command '{first_token}' is not allowed. " 46 | "Please use a different command or tool."
47 | ) 48 | sys.exit(1) 49 | 50 | result = run_command(args.command) 51 | 52 | if result.returncode != 0: 53 | print(f"Error executing command:\n") 54 | print("[STDOUT]\n") 55 | print(result.stdout.strip(), "\n") 56 | print("[STDERR]\n") 57 | print(result.stderr.strip()) 58 | sys.exit(result.returncode) 59 | 60 | print("[STDOUT]\n") 61 | print(result.stdout.strip(), "\n") 62 | print("[STDERR]\n") 63 | print(result.stderr.strip()) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() -------------------------------------------------------------------------------- /data_collection/collect/SetupBench-lite/batch_2.txt: -------------------------------------------------------------------------------- 1 | nodejs__undici-2992 2 | nodejs__undici-2939 3 | apollographql__apollo-client-12450 4 | apollographql__apollo-client-12451 5 | apollographql__apollo-client-12497 6 | apollographql__apollo-client-12533 7 | apollographql__apollo-client-12379 8 | apollographql__apollo-client-12300 9 | apollographql__apollo-client-12224 10 | apollographql__apollo-client-12254 11 | apollographql__apollo-client-12121 12 | apollographql__apollo-client-12367 13 | apollographql__apollo-client-12052 14 | apollographql__apollo-client-11944 15 | apollographql__apollo-client-11921 16 | apollographql__apollo-client-11923 17 | apollographql__apollo-client-11799 18 | apollographql__apollo-client-11638 19 | apollographql__apollo-client-11403 20 | apollographql__apollo-client-11200 21 | apollographql__apollo-client-11180 22 | apollographql__apollo-client-11078 23 | apollographql__apollo-client-10853 24 | apollographql__apollo-client-10937 25 | apollographql__apollo-client-10809 26 | apollographql__apollo-client-10766 27 | apollographql__apollo-client-10450 28 | apollographql__apollo-client-10368 29 | apollographql__apollo-client-10499 30 | apollographql__apollo-client-10340 31 | apollographql__apollo-client-10143 32 | apollographql__apollo-client-10134 33 | apollographql__apollo-client-9808 34 | apollographql__apollo-client-9369 35 | apollographql__apollo-client-9328 36 | apollographql__apollo-client-9222 37 | apollographql__apollo-client-8718 38 | apollographql__apollo-client-8574 39 | apollographql__apollo-client-8438 40 | apollographql__apollo-client-8394 41 | apollographql__apollo-client-8372 42 | apollographql__apollo-client-7581 43 | apollographql__apollo-client-7657 44 | apollographql__apollo-client-7146 45 | apollographql__apollo-client-7075 46 | apollographql__apollo-client-7055 47 | apollographql__apollo-client-6587 48 | apollographql__apollo-client-6710 49 | apollographql__apollo-client-6589 50 | apollographql__apollo-client-6691 51 | apollographql__apollo-client-6448 52 | apollographql__apollo-client-6353 53 | apollographql__apollo-client-5116 54 | apollographql__apollo-client-4765 55 | apollographql__apollo-client-3956 56 | apollographql__apollo-client-3580 57 | apollographql__apollo-client-2710 58 | apollographql__apollo-client-2362 59 | apollographql__apollo-client-2345 60 | apollographql__apollo-client-1801 61 | -------------------------------------------------------------------------------- /data_collection/collect/get_top_repos.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import argparse 4 | import os 5 | import sys 6 | 7 | def fetch_top_repos(language: str, output_path: str, top_n: int, token: str): 8 | headers = { 9 | "Accept": "application/vnd.github+json", 10 | "Authorization": f"token {token}" 11 | } 12 | 13 | url = 
"https://api.github.com/search/repositories" 14 | params = { 15 | "q": f"language:{language}", 16 | "sort": "stars", 17 | "order": "desc", 18 | "per_page": 100, 19 | "page": 1 20 | } 21 | 22 | print(f"📡 Fetching top {top_n} repositories for language: {language}") 23 | repos = [] 24 | 25 | while len(repos) < top_n: 26 | response = requests.get(url, headers=headers, params=params) 27 | if response.status_code != 200: 28 | print(f"❌ Error: {response.status_code} - {response.json().get('message')}") 29 | break 30 | 31 | data = response.json().get("items", []) 32 | if not data: 33 | break 34 | 35 | for repo in data: 36 | repos.append({ 37 | "name": repo["full_name"], 38 | "stars": repo["stargazers_count"], 39 | "url": repo["html_url"], 40 | "description": repo["description"], 41 | "owner": repo["owner"]["login"], 42 | "language": repo["language"] 43 | }) 44 | 45 | params["page"] += 1 46 | 47 | os.makedirs(output_path, exist_ok=True) 48 | output_file = os.path.join(output_path, f"{language.lower()}_top_{top_n}_repos.json") 49 | print(f"💾 Saving {min(top_n, len(repos))} repos to {output_file}") 50 | with open(output_file, mode='w', encoding='utf-8') as f: 51 | json.dump(repos[:top_n], f, indent=2, ensure_ascii=False) 52 | 53 | print("✅ Done!") 54 | 55 | def main(): 56 | parser = argparse.ArgumentParser(description="Fetch top GitHub repos by language") 57 | parser.add_argument("--language", type=str, required=True, help="Programming language (e.g., Python)") 58 | parser.add_argument("--output_path", type=str, required=True, help="Directory to save the result JSON") 59 | parser.add_argument("--top_n", type=int, default=500, help="Number of top repositories to fetch") 60 | args = parser.parse_args() 61 | 62 | token = os.environ.get("GITHUB_TOKEN") 63 | if not token: 64 | print("❌ GitHub token not found. 
Please set the environment variable `GITHUB_TOKEN`.") 65 | sys.exit(1) 66 | 67 | fetch_top_repos( 68 | language=args.language, 69 | output_path=args.output_path, 70 | top_n=args.top_n, 71 | token=token 72 | ) 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohappyeyeballs==2.4.3 2 | aiohttp==3.10.2 3 | aiosignal==1.3.1 4 | annotated-types==0.6.0 5 | antlr4-tools==0.2.1 6 | anyio==4.2.0 7 | astroid==3.2.3 8 | asttokens==2.4.1 9 | attrs==23.2.0 10 | beautifulsoup4==4.12.3 11 | Brotli==1.1.0 12 | certifi==2024.7.4 13 | cffi==1.17.1 14 | cfgv==3.4.0 15 | chain==1.0 16 | charset-normalizer==3.4.0 17 | click==8.1.7 18 | colorama==0.4.6 19 | coverage==7.5.3 20 | Cython==3.0.8 21 | datasets==3.5.0 22 | dill==0.3.8 23 | discord==2.3.2 24 | discord.py==2.3.2 25 | distlib==0.3.8 26 | distro==1.9.0 27 | docker==7.1.0 28 | docstring-parser==0.15 29 | emojis==0.7.0 30 | executing==2.1.0 31 | filelock==3.13.1 32 | frozenlist==1.4.1 33 | fsspec==2024.6.1 34 | ghapi==1.0.5 35 | h11==0.14.0 36 | h2==4.1.0 37 | hpack==4.0.0 38 | httpcore==1.0.5 39 | httpx==0.27.0 40 | huggingface-hub==0.30.2 41 | hyperframe==6.0.1 42 | icecream==2.1.3 43 | identify==2.5.33 44 | idna==3.7 45 | importlib-metadata==7.0.1 46 | install-jdk==1.1.0 47 | isort==5.13.2 48 | javalang==0.13.0 49 | Jinja2==3.1.4 50 | jiter==0.8.0 51 | jsonschema==4.22.0 52 | jsonschema-specifications==2023.12.1 53 | libclang==18.1.1 54 | linkify-it-py==2.0.2 55 | litellm==1.44.8 56 | loguru==0.7.2 57 | lxml==5.1.0 58 | markdown-it-py 59 | MarkupSafe==2.1.5 60 | mccabe==0.7.0 61 | mdit-py-plugins==0.4.0 62 | mdurl 63 | more-itertools==10.2.0 64 | mpmath==1.3.0 65 | multidict==6.0.4 66 | multiprocess==0.70.16 67 | natsort==8.4.0 68 | networkx==3.2.1 69 | nodeenv==1.8.0 70 | numpy==1.26.4 71 | # nvidia-cublas-cu12==12.1.3.1 72 | # nvidia-cuda-cupti-cu12==12.1.105 73 | # nvidia-cuda-nvrtc-cu12==12.1.105 74 | # nvidia-cuda-runtime-cu12==12.1.105 75 | # nvidia-cudnn-cu12==8.9.2.26 76 | # nvidia-cufft-cu12==11.0.2.54 77 | # nvidia-curand-cu12==10.3.2.106 78 | # nvidia-cusolver-cu12==11.4.5.107 79 | # nvidia-cusparse-cu12==12.1.0.106 80 | # nvidia-nccl-cu12==2.19.3 81 | # nvidia-nvjitlink-cu12==12.4.99 82 | # nvidia-nvtx-cu12==12.1.105 83 | ollama==0.3.3 84 | openai==1.50.2 85 | opt-einsum==3.3.0 86 | packaging==23.2 87 | pandas==2.2.3 88 | platformdirs==4.1.0 89 | polars==0.20.31 90 | pre-commit==3.6.0 91 | pyarrow==19.0.1 92 | pycparser==2.22 93 | pydantic==2.5.3 94 | pydantic_core==2.14.6 95 | Pygments==2.17.2 96 | pylint==3.2.3 97 | pyro-api==0.1.2 98 | pyro-ppl==1.9.0 99 | PySocks==1.7.1 100 | python-dateutil==2.9.0.post0 101 | python-dotenv==1.0.0 102 | pytz==2025.2 103 | PyYAML==6.0.1 104 | referencing==0.32.1 105 | regex==2024.5.15 106 | requests==2.32.3 107 | rich==13.7.1 108 | rpds-py==0.16.2 109 | semver==3.0.2 110 | setuptools==68.2.2 111 | six==1.16.0 112 | slack_sdk==3.26.2 113 | sniffio==1.3.0 114 | soupsieve==2.5 115 | sympy==1.13.0 116 | tenacity==8.2.3 117 | termcolor==2.4.0 118 | textual==0.52.1 119 | tiktoken==0.7.0 120 | timeout-decorator==0.5.0 121 | tokenizers==0.19.1 122 | tomlkit==0.13.0 123 | torch==2.2.1 124 | tqdm==4.66.4 125 | tree-sitter==0.21.3 126 | tree-sitter-c==0.21.4 127 | tree-sitter-cpp==0.22.2 128 | tree-sitter-java==0.21.0 129 | tree-sitter-languages==1.10.2 130 | triton==2.2.0 131 | types-jsonschema==4.21.0.20240311 132 | 
typing_extensions 133 | tzdata==2025.2 134 | uc-micro-py==1.0.2 135 | unidiff==0.7.5 136 | unittest-xml-reporting==3.2.0 137 | urllib3==2.2.3 138 | virtualenv==20.25.0 139 | wheel==0.41.2 140 | xxhash==3.5.0 141 | yarl==1.9.4 142 | zipp==3.19.2 143 | zstandard==0.22.0 144 | -------------------------------------------------------------------------------- /data_collection/collect/print_pulls.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Given the `<owner/repo>` name of a GitHub repo, this script writes the raw information for all the repo's PRs to a single `.jsonl` file.""" 4 | 5 | import argparse 6 | import json 7 | import logging 8 | import os 9 | from typing import Optional 10 | from tqdm import tqdm 11 | from fastcore.xtras import obj2dict 12 | from utils import Repo 13 | 14 | logging.basicConfig( 15 | level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 16 | ) 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | def log_all_pulls(repo: Repo, output: str, mode: str, pr_data_list=None): 21 | """ 22 | Iterate over all pull requests in a repository and log them to a file 23 | 24 | Args: 25 | repo (Repo): repository object 26 | output (str): output file name 27 | """ 28 | # Create output directory if it doesn't exist 29 | output_dir = os.path.dirname(output) 30 | if output_dir: 31 | os.makedirs(output_dir, exist_ok=True) 32 | 33 | if mode == 'swebench': 34 | with open(output, "w") as output_file: 35 | for pull in repo.get_all_pulls(): 36 | setattr(pull, "resolved_issues", repo.extract_resolved_issues(pull)) 37 | print(json.dumps(obj2dict(pull)), end="\n", flush=True, file=output_file) 38 | else: 39 | pulls = repo.get_all_pulls_with_official_github_api() 40 | print(f'total prs number: {len(pulls)}') 41 | with open(output, 'a') as f: 42 | for pull in tqdm(pulls): 43 | if pr_data_list and pull['number'] in pr_data_list: 44 | continue 45 | else: 46 | issues = repo.extract_resolved_issues_with_official_github_api(pull) 47 | pull["resolved_issues"] = issues 48 | json.dump(pull, f) 49 | f.write('\n') # write a newline to separate the JSON objects 50 | 51 | 52 | 53 | 54 | def main(repo_name: str, output: str, token: Optional[str] = None, mode: Optional[str] = 'swebench'): 55 | """ 56 | Logic for logging all pull requests in a repository 57 | 58 | Args: 59 | repo_name (str): name of the repository 60 | output (str): output file name 61 | token (str, optional): GitHub token 62 | """ 63 | if token is None: 64 | token = os.environ["GITHUB_TOKEN"] 65 | try: 66 | owner, repo = repo_name.split("/") 67 | except: 68 | print(repo_name) 69 | logger.info(repo_name) 70 | repo = Repo(owner, repo, token=token) 71 | if os.path.exists(output): 72 | pr_data_list = [] 73 | with open(output, 'r', encoding='utf-8') as f: 74 | for line in f: 75 | pr_data_list.append(json.loads(line)['number']) 76 | log_all_pulls(repo, output, mode, pr_data_list) 77 | else: 78 | log_all_pulls(repo, output, mode) 79 | 80 | 81 | if __name__ == "__main__": 82 | parser = argparse.ArgumentParser(description=__doc__) 83 | parser.add_argument("repo_name", type=str, help="Name of the repository") 84 | parser.add_argument("output", type=str, help="Output file name") 85 | parser.add_argument("--token", type=str, help="GitHub token") 86 | parser.add_argument("--mode", type=str, default='omnigirl', help="Collection mode: 'swebench' or 'omnigirl' (default)") 87 | args = parser.parse_args() 88 | main(**vars(args)) 89 | -------------------------------------------------------------------------------- /scripts/compute_cost.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def count_finished_status_and_cost(directory): 6 | # status.json statistics 7 | finished_count = 0 8 | total_status_files = 0 9 | 10 | # cost.json statistics 11 | total_tokens_sum = 0.0 12 | total_input_tokens_sum = 0.0 13 | total_output_tokens_sum = 0.0 14 | total_elapsed_seconds = 0.0 15 | total_cost_files = 0 16 | 17 | # meta.json statistics (if needed) 18 | total_meta_files = 0 19 | 20 | for root, _, files in os.walk(directory): 21 | for file in files: 22 | path = os.path.join(root, file) 23 | 24 | if file == "status.json": 25 | total_status_files += 1 26 | try: 27 | data = json.load(open(path, encoding="utf-8")) 28 | if data.get("is_finish") is True: 29 | finished_count += 1 30 | except Exception as e: 31 | print(f"Error reading {path}: {e}") 32 | 33 | elif file == "cost.json": 34 | total_cost_files += 1 35 | try: 36 | data = json.load(open(path, encoding="utf-8")) 37 | # total_tokens 38 | total_tokens_sum += float(data.get("total_tokens", 0)) 39 | # total_input_tokens 40 | total_input_tokens_sum += float(data.get("total_input_tokens", 0)) 41 | # total_output_tokens 42 | total_output_tokens_sum += float(data.get("total_output_tokens", 0)) 43 | # elapsed_seconds 44 | total_elapsed_seconds += float(data.get("elapsed_seconds", 0)) 45 | except Exception as e: 46 | print(f"Error reading {path}: {e}") 47 | 48 | elif file == "meta.json": 49 | total_meta_files += 1 50 | 51 | # Print the results 52 | print(f"Total 'status.json' files found: {total_status_files}") 53 | print(f"Files with 'is_finish = true': {finished_count}") 54 | print(f"Total 'meta.json' files found: {total_meta_files}") 55 | print(f"Total 'cost.json' files found: {total_cost_files}") 56 | 57 | if total_cost_files: 58 | avg_total = total_tokens_sum / total_cost_files 59 | avg_input = total_input_tokens_sum / total_cost_files 60 | avg_output = total_output_tokens_sum / total_cost_files 61 | avg_elapsed = total_elapsed_seconds / total_cost_files 62 | 63 | print(f"Sum of 'total_tokens': {total_tokens_sum}") 64 | print(f"Average 'total_tokens': {avg_total:.2f}") 65 | print(f"Sum of 'total_input_tokens': {total_input_tokens_sum}") 66 | print(f"Average 'total_input_tokens': {avg_input:.2f}") 67 | print(f"Sum of 'total_output_tokens': {total_output_tokens_sum}") 68 | print(f"Average 'total_output_tokens': {avg_output:.2f}") 69 | print(f"Sum of 'elapsed_seconds': {total_elapsed_seconds}") 70 | print(f"Average 'elapsed_seconds': {avg_elapsed:.2f}") 71 | else: 72 | print("No 'cost.json' files found, cannot compute averages.") 73 | 74 | if __name__ == "__main__": 75 | parser = argparse.ArgumentParser( 76 | description="Summarize status.json, cost.json (total_tokens/input/output tokens and elapsed_seconds), and meta.json files under a directory." 77 | ) 78 | parser.add_argument("directory", help="Path of the target directory") 79 | args = parser.parse_args() 80 | count_finished_status_and_cost(args.directory) 81 | -------------------------------------------------------------------------------- /app/model/register.py: -------------------------------------------------------------------------------- 1 | from app.model import ( 2 | azure, 3 | bedrock, 4 | claude, 5 | common, 6 | gemini, 7 | gpt, 8 | gptlitellm, 9 | groq, 10 | ollama, 11 | ) 12 | 13 | 14 | def register_all_models() -> None: 15 | """ 16 | Register all models. This is called in main. 
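Each call adds one model instance to the registry in app.model.common; the default selection is set at the end via common.SELECTED_MODEL. 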
17 | """ 18 | common.register_model(gpt.Gpt4o_20241120()) 19 | common.register_model(gpt.Gpt4o_20240806()) 20 | common.register_model(gpt.Gpt4o_20240513()) 21 | common.register_model(gpt.Gpt4o_mini_20240718()) 22 | common.register_model(gpt.Gpt4_Turbo20240409()) 23 | common.register_model(gpt.Gpt4_0125Preview()) 24 | common.register_model(gpt.Gpt4_1106Preview()) 25 | common.register_model(gpt.Gpt35_Turbo0125()) 26 | common.register_model(gpt.Gpt35_Turbo1106()) 27 | common.register_model(gpt.Gpt35_Turbo16k_0613()) 28 | common.register_model(gpt.Gpt35_Turbo0613()) 29 | common.register_model(gpt.Gpt4_0613()) 30 | common.register_model(gpt.Gpt_o1mini()) 31 | common.register_model(gpt.Qwen25_72B()) 32 | common.register_model(gpt.DeepSeekV25()) 33 | common.register_model(gpt.DeepSeekV3()) 34 | common.register_model(gpt.DeepSeek()) 35 | common.register_model(gpt.Gpt4_1()) 36 | common.register_model(gpt.Gpt4_1_mini()) 37 | common.register_model(gpt.Gpt5_mini()) 38 | common.register_model(gpt.Gemini_2_5_flash_preview()) 39 | common.register_model(gpt.Gemini_2_5_flash_lite_preview()) 40 | common.register_model(gpt.Kimi_k2()) 41 | common.register_model(gpt.Gpt4_1_nano()) 42 | common.register_model(gpt.Claude3_5Sonnet()) 43 | common.register_model(gpt.Claude3_7Sonnet()) 44 | common.register_model(claude.Claude3Opus()) 45 | common.register_model(claude.Claude3Sonnet()) 46 | common.register_model(claude.Claude3Haiku()) 47 | # common.register_model(claude.Claude3_5Sonnet()) 48 | 49 | common.register_model(bedrock.AnthropicClaude3Opus()) 50 | common.register_model(bedrock.AnthropicClaude3Sonnet()) 51 | common.register_model(bedrock.AnthropicClaude3Haiku()) 52 | 53 | common.register_model(ollama.Llama3_8B()) 54 | common.register_model(ollama.Llama3_70B()) 55 | 56 | common.register_model(groq.Llama3_8B()) 57 | common.register_model(groq.Llama3_70B()) 58 | common.register_model(groq.Mixtral_8x7B()) 59 | common.register_model(groq.Gemma_7B()) 60 | 61 | common.register_model(gptlitellm.Gpt4o_20240513LiteLLM()) 62 | common.register_model(gptlitellm.Gpt4_Turbo20240409LiteLLM()) 63 | common.register_model(gptlitellm.Gpt4_0125PreviewLiteLLM()) 64 | common.register_model(gptlitellm.Gpt4_1106PreviewLiteLLM()) 65 | common.register_model(gptlitellm.Gpt35_Turbo0125LiteLLM()) 66 | common.register_model(gptlitellm.Gpt35_Turbo1106LiteLLM()) 67 | common.register_model(gptlitellm.Gpt35_Turbo16k_0613LiteLLM()) 68 | common.register_model(gptlitellm.Gpt35_Turbo0613LiteLLM()) 69 | common.register_model(gptlitellm.Gpt4_0613LiteLLM()) 70 | 71 | 72 | common.register_model(azure.AzureGpt4()) 73 | common.register_model(azure.AzureGpt4o()) 74 | common.register_model(azure.AzureGpt35_Turbo()) 75 | common.register_model(azure.AzureGpt35_Turbo16k()) 76 | common.register_model(azure.AzureGpt_o1mini()) 77 | 78 | common.register_model(gemini.GeminiPro()) 79 | common.register_model(gemini.Gemini15Pro()) 80 | 81 | # register default model as selected 82 | common.SELECTED_MODEL = gpt.Gpt35_Turbo0125() 83 | -------------------------------------------------------------------------------- /data_collection/collect/README.md: -------------------------------------------------------------------------------- 1 | # Data Collection Process Overview 2 | 3 | This directory provides code to collect raw issue data using GitHub APIs and predefined patterns. This implementation currently supports collecting issues from repositories in Python, Java, JavaScript, and TypeScript. We welcome PRs to support more languages! 4 | 5 | 6 | 1. 
**Fetch Popular Repositories** 7 | 8 | - Use the `get_top_repos.py` script to find and save a list of the most popular (by stars) repositories for a given language. 9 | 10 | - **Note**: This script requires a GitHub Personal Access Token to be set as an environment variable. 11 | 12 | Example: 13 | ```bash 14 | export GITHUB_TOKEN=<your_token> # Set your token first 15 | python get_top_repos.py --language Python --output_path data/popular_repos --top_n 100 16 | ``` 17 | Where: 18 | - `--language`: The programming language to search for (e.g., 'Python', 'Java'). (Required) 19 | - `--output_path`: The directory where the output JSON file will be saved. (Required) 20 | - `--top_n`: The number of top-starred repositories to fetch (default: 500). 21 | - The output will be saved in the specified path, in a file named, for instance, `python_top_100_repos.json`. 22 | 23 | 2. **Raw PR Data Collection** 24 | 25 | - Use the `print_pulls.py` script to collect raw PR data from GitHub repositories. 26 | 27 | Example: 28 | ```bash 29 | export GITHUB_TOKEN=<your_token> # Set your token first 30 | python print_pulls.py python-attrs/attrs data/python-attrs/attrs/prs.jsonl 31 | ``` 32 | 33 | Where: 34 | - `<repo_name>`: GitHub repository name in "owner/repo" format (e.g., "octocat/Hello-World"). 35 | - `<output_path>`: Path for the output JSONL file (e.g., "data/prs.jsonl"). 36 | - `--token`: GitHub personal access token (defaults to the `GITHUB_TOKEN` environment variable). 37 | 38 | 3. **Raw Task Instance Construction** 39 | - Use the `build_dataset.py` script to process collected PR data and construct task instances. 40 | 41 | Example: 42 | ```bash 43 | export GITHUB_TOKEN=<your_token> # Set your token first 44 | python build_dataset.py data/python-attrs/attrs/prs.jsonl data/python-attrs/attrs/instances.jsonl --language python 45 | ``` 46 | 47 | Where: 48 | - `<pr_file>`: Path to the input PR JSONL file from the previous step. 49 | - `<output_path>`: Path for the output task instance JSONL file. 50 | - `--language`: The programming language of the repository. Accepts `python`, `java`, or `js`. Use `js` for both JavaScript and TypeScript repositories. 51 | - `--token`: Optional GitHub token (defaults to the `GITHUB_TOKEN` environment variable). 52 | 53 | 4. **Versioning** 54 | - Use the `get_version.py` script to assign version numbers to the raw instances. 55 | 56 | - **Note on Strategy**: The script works by checking out an instance's `base_commit` and parsing the output of the `git describe --tags` command (a minimal sketch of this strategy is shown at the end of this document). 57 | 58 | This method's success **depends entirely on a repository's tagging practices**. It may fail or produce inaccurate versions if release tags are inconsistent or not used. For more reliable results, please follow the complete [versioning documentation](../versioning). 59 | 60 | 61 | Example: 62 | ```bash 63 | python get_version.py --instance_path data/python-attrs/attrs/instances.jsonl --testbed github --max-workers 20 64 | ``` 65 | 66 | Where: 67 | - `--instance_path`: Path to the task instances file (required). 68 | - `--testbed`: A temporary working directory for cloning repositories. 69 | - `--max-workers`: The number of parallel processes to use (default: 10). 70 | - The results will be saved to a new file with a `_versions` suffix (e.g., `instances_versions.jsonl`). 
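For illustration, here is a minimal, hypothetical sketch of the tag-parsing strategy described in step 4. The function name and the `major.minor` truncation are assumptions for this example, not the actual `get_version.py` implementation:

```python
import re
import subprocess


def version_from_git_describe(repo_dir: str, base_commit: str) -> str | None:
    """Best-effort version lookup: check out the commit, then parse `git describe --tags`."""
    subprocess.run(["git", "checkout", "-q", base_commit], cwd=repo_dir, check=True)
    described = subprocess.run(
        ["git", "describe", "--tags"],
        cwd=repo_dir, capture_output=True, text=True, check=True,
    ).stdout.strip()  # e.g. "v23.1.0-12-gabc1234": nearest tag, commit distance, short hash
    match = re.match(r"v?(\d+\.\d+)", described)  # keep only major.minor
    return match.group(1) if match else None  # None when the tag does not look like a version
```

If `git describe` exits non-zero (for example, in a repository with no tags at all), `subprocess.run` raises `CalledProcessError`, which is exactly the failure mode the note above warns about: the approach is only as reliable as the repository's tagging habits.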
-------------------------------------------------------------------------------- /app/task.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import subprocess 5 | from abc import ABC, abstractmethod 6 | from dataclasses import dataclass 7 | from os.path import join as pjoin 8 | from tempfile import mkstemp 9 | import shutil 10 | import app.utils as apputils 11 | from app import globals, log 12 | from app import utils as app_utils 13 | 14 | from app.log import log_and_print 15 | from docker import DockerClient 16 | 17 | class Task(ABC): 18 | @property 19 | @abstractmethod 20 | def project_path(self) -> str: 21 | raise NotImplementedError("abstract method") 22 | 23 | @abstractmethod 24 | def get_issue_statement(self) -> str: 25 | raise NotImplementedError("abstract method") 26 | 27 | @abstractmethod 28 | def setup_project(self) -> None: 29 | """Set up the project before starting to resolve the task.""" 30 | raise NotImplementedError("abstract method") 31 | 32 | @abstractmethod 33 | def reset_project(self) -> None: 34 | """Reset project to initial state.""" 35 | raise NotImplementedError("abstract method") 36 | 37 | 38 | 39 | @dataclass(kw_only=True) 40 | class SweTask(Task): 41 | task_id: str 42 | problem_statement: str 43 | repo_path: str 44 | repo_cache_path: str 45 | commit: str 46 | # env_name: str 47 | repo_name: str 48 | # pre_install_cmds: list[str] 49 | # install_cmd: str 50 | # test_cmd: str 51 | patch: str 52 | test_patch: str 53 | # testcases_passing: list[str] 54 | # testcases_failing: list[str] 55 | language: str 56 | # image_urls: list[str] 57 | # reference_setup: dict 58 | version: str 59 | client: DockerClient 60 | task_info: dict 61 | @property 62 | def project_path(self) -> str: 63 | return self.repo_path 64 | 65 | 66 | @project_path.setter 67 | def project_path(self, value: str) -> None: 68 | self.repo_path = value 69 | 70 | def get_issue_statement(self) -> str: 71 | return self.problem_statement 72 | 73 | 74 | def setup_project(self) -> None: 75 | # get the correct version of the project and commit-specific pip install 76 | task = self 77 | with apputils.cd(task.project_path): 78 | apputils.repo_reset_and_clean_checkout(task.commit) 79 | 80 | 81 | # apply the test modifications to this task 82 | 83 | # commit the current changes, so that resetting later does not erase them 84 | with apputils.cd(task.project_path): 85 | apputils.repo_commit_current_changes() 86 | 87 | def reset_project(self) -> None: 88 | with apputils.cd(self.repo_path): 89 | apputils.repo_reset_and_clean_checkout(self.commit) 90 | 91 | def remove_project(self) -> None: 92 | """Remove the entire project repository.""" 93 | if os.path.exists(self.repo_path): 94 | shutil.rmtree(self.repo_path) 95 | log_and_print(f"Removed project repository at {self.repo_path}") 96 | 97 | 98 | 99 | 100 | 101 | @dataclass(kw_only=True) 102 | class PlainTask(Task): 103 | """ 104 | Tasks that only contain a codebase and an issue description (no test suite). 
105 | """ 106 | 107 | commit_hash: str 108 | local_path: str 109 | problem_statement: str 110 | 111 | @property 112 | def project_path(self) -> str: 113 | return self.local_path 114 | 115 | def setup_project(self) -> None: 116 | with apputils.cd(self.project_path): 117 | apputils.repo_reset_and_clean_checkout(self.commit_hash) 118 | 119 | def reset_project(self) -> None: 120 | with apputils.cd(self.project_path): 121 | apputils.repo_reset_and_clean_checkout(self.commit_hash) 122 | 123 | def get_issue_statement(self) -> str: 124 | return self.problem_statement 125 | 126 | 127 | -------------------------------------------------------------------------------- /scripts/judge_fail2pass.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | import argparse 5 | import multiprocessing 6 | from tqdm import tqdm 7 | from dotenv import load_dotenv 8 | from openai import OpenAI 9 | 10 | # --- Configuration --- 11 | load_dotenv() # Load environment variables from .env file 12 | 13 | PREV_FILE_NAME = "test_output_prev_apply.txt" 14 | AFTER_FILE_NAME = "test_output_after_apply.txt" 15 | EXIT_CODE_RE = re.compile(r"echo OMNIGRIL_EXIT_CODE=(\d+)") 16 | 17 | 18 | def extract_exit_code(content: str) -> int | None: 19 | """Extracts the exit code from the content; returns None if not found.""" 20 | m = EXIT_CODE_RE.search(content) 21 | return int(m.group(1)) if m else None 22 | 23 | 24 | def process_subdirectory(subdir): 25 | prev_path = os.path.join(subdir, PREV_FILE_NAME) 26 | after_path = os.path.join(subdir, AFTER_FILE_NAME) 27 | 28 | # missing outputs or unparsable -> error 29 | if not (os.path.isfile(prev_path) and os.path.isfile(after_path)): 30 | return "error" 31 | 32 | prev_content = open(prev_path, encoding="utf-8", errors="ignore").read() 33 | after_content = open(after_path, encoding="utf-8", errors="ignore").read() 34 | prev_exit = extract_exit_code(prev_content) 35 | after_exit = extract_exit_code(after_content) 36 | 37 | if prev_exit is None or after_exit is None: 38 | return "error" 39 | 40 | prev_fail = (prev_exit != 0) 41 | after_pass = (after_exit == 0) 42 | 43 | if prev_fail and after_pass: 44 | return "fail2pass" 45 | elif prev_fail and not after_pass: 46 | return "fail2fail" 47 | elif not prev_fail and after_pass: 48 | return "pass2pass" 49 | elif not prev_fail and not after_pass: 50 | return "pass2fail" 51 | else: 52 | return "error" 53 | 54 | 55 | def classify_and_write_json(src_folder: str, output_json: str, processes: int): 56 | # Collect subdirectories 57 | subs = [os.path.join(src_folder, d) 58 | for d in os.listdir(src_folder) 59 | if os.path.isdir(os.path.join(src_folder, d))] 60 | 61 | # Parallel processing 62 | with multiprocessing.Pool(processes) as pool: 63 | statuses = list(tqdm( 64 | pool.imap(process_subdirectory, subs), 65 | total=len(subs), desc="Classifying" 66 | )) 67 | 68 | # Build category mapping 69 | cats = {"fail2pass": [], "fail2fail": [], "pass2pass": [], "pass2fail": [], "error": []} 70 | for subdir, status in zip(subs, statuses): 71 | inst_id = os.path.basename(subdir) 72 | cats.setdefault(status, []).append(inst_id) 73 | 74 | # Print summary 75 | print("Classification summary:") 76 | for cat, ids in cats.items(): 77 | print(f" {cat}: {len(ids)}") 78 | 79 | # Write structured JSON 80 | summary = {"total": len(subs), "categories": cats} 81 | with open(output_json, 'w', encoding="utf-8") as f: 82 | json.dump(summary, f, indent=2) 83 | print(f"Summary JSON written to '{output_json}'") 
84 | 85 | 86 | def main(): 87 | parser = argparse.ArgumentParser( 88 | description="Classify subdirectories by test exit codes and output summary JSON.") 89 | parser.add_argument("target_folder", help="Folder containing subdirs to classify.") 90 | parser.add_argument("output_json", help="Path for summary JSON output.") 91 | parser.add_argument("--processes", type=int, default=20, help="Number of worker processes.") 92 | args = parser.parse_args() 93 | 94 | if not os.path.isdir(args.target_folder): 95 | parser.error(f"Folder not found: {args.target_folder}") 96 | if args.processes < 1: 97 | parser.error("--processes must be >= 1") 98 | 99 | classify_and_write_json(args.target_folder, args.output_json, args.processes) 100 | 101 | if __name__ == "__main__": 102 | multiprocessing.freeze_support() 103 | main() 104 | -------------------------------------------------------------------------------- /app/agents/train_env_gen_agent/prompt.py: -------------------------------------------------------------------------------- 1 | SYSTEM_PROMPT = """ 2 | You are tasked with adapting a Dockerfile and its evaluation script (eval script) so that they can run seamlessly in a coding agent evaluation environment. 3 | 4 | Context & Constraints 5 | 1. Container startup 6 | The coding agent will always start the container like this: 7 | self.container = self.client.containers.run( 8 | docker_image, 9 | ["/bin/bash", "-l"], 10 | name=ctr_name, 11 | detach=True, 12 | tty=True, 13 | stdin_open=True 14 | ) 15 | Ensure the image is compatible with this startup (login shell, interactive mode, etc.). 16 | 17 | 2. Command execution 18 | All commands are executed using: 19 | future = executor.submit( 20 | self.container.exec_run, 21 | cmd=["/bin/sh", "-c", command], 22 | workdir='/testbed', 23 | stdout=True, 24 | stderr=True 25 | ) 26 | Ensure that this execution pattern works (commands are run in /testbed by default). 27 | 28 | 3. Repository location 29 | - The target repository must be cloned directly into /testbed. 30 | - Do NOT create subdirectories like /testbed/mypy; it must be /testbed. 31 | 32 | 4. Workdir & virtual environment 33 | - The final working directory when the container starts must be /testbed. 34 | - If a virtual environment (e.g., conda or venv) is used, activate it automatically by adding the activation command to ~/.bash_profile so that it’s active when the agent attaches with /bin/bash -l. 35 | 36 | 5. Install coding agent tools 37 | - Pre-install the required tools: 38 | git clone https://github.com/gnohgnailoug/r2e_tools.git /root/r2e_tools 39 | pip install -e /root/r2e_tools 40 | - This ensures the agent can run search -h successfully. 41 | 42 | 6. Adjust eval script if needed 43 | - If you modify paths in the Dockerfile (e.g., moving the repository from /testbed/mypy to /testbed), also update the eval script accordingly so it still runs correctly. 44 | 45 | 7. No other changes 46 | - Do NOT change the environment setup or testing commands (e.g., Maven/pytest commands remain unchanged). 47 | - Only make changes required for compatibility with the coding agent. 48 | 49 | Deliverables 50 | - Rewritten Dockerfile: Fully adapted to the above constraints. 51 | - Updated eval script: Ensure it works with the new paths and environment settings. 52 | 53 | Evaluation Criteria 54 | Your output will be tested as follows: 55 | 1. Container builds successfully. 56 | 2. Tool works: Running run_command("search -h") inside the container should succeed. 57 | 3. 
Eval script works: After copying it to /run_tests.sh, running run_command("bash /run_tests.sh") should successfully execute the tests. 58 | 59 | Task: 60 | Rewrite the Dockerfile and eval script accordingly. Do not change any build/test logic except what’s needed for path and environment adaptation. Ensure full compliance with the above constraints. 61 | """ 62 | 63 | USER_PROMPT=""" 64 | You are given an original Dockerfile and an evaluation script. 65 | 66 | Your task is to modify them only as needed to make them fully compatible with the coding agent environment, based on the system constraints provided. 67 | 68 | Important: 69 | - Do not change any core build or test logic in the Dockerfile or evaluation script. 70 | - Only make minimal adjustments necessary for compatibility (e.g., paths, working directory, virtual environment activation, tool installation). 71 | - If a file does not require modification, return "" instead of repeating its content. 72 | - Always provide the full content for any modified file. 73 | 74 | Original Dockerfile: 75 | ```dockerfile 76 | {{ORIGINAL_DOCKERFILE}} 77 | ``` 78 | 79 | Original Evaluation Script: 80 | ```bash 81 | {{ORIGINAL_EVAL_SCRIPT}} 82 | ``` 83 | 84 | Your task: Rewrite these files as needed to meet the system constraints. 85 | 86 | Return your answer strictly in the following JSON format (valid JSON, no extra text outside JSON): 87 | ```json 88 | { 89 | "dockerfile": "string (full modified Dockerfile or )", 90 | "eval_script": "string (full modified eval script or )", 91 | "notes": "string (explanation of what was changed and why)" 92 | } 93 | ``` 94 | """ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | output*/ 29 | testbed/ 30 | data/ 31 | evaluation/run_instance/ 32 | evaluation/run_instance* 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | # *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
164 | #.idea/ 165 | evaluation/reports/ 166 | output/ 167 | # *data_collection/collect/*.sh 168 | *data_collection/collect/temp 169 | *evaluation/temp 170 | temp/ 171 | evaluation/*.sh 172 | run_collect*/ 173 | run_collect_train 174 | scripts/*.json 175 | scripts/dataset/ -------------------------------------------------------------------------------- /app/agents/agent.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from app.data_structures import MessageThread, FunctionCallIntent 3 | from app.log import log_exception 4 | from loguru import logger 5 | import os 6 | import json 7 | from collections.abc import Callable, Mapping 8 | 9 | class Agent(ABC): 10 | """ 11 | Abstract base class for all agents. 12 | Provides per-agent message thread, tool call tracking, and default dispatch_intent. 13 | """ 14 | api_functions: list[str] = [] 15 | 16 | def __init__(self, agent_id): 17 | # Each agent has its own thread 18 | self.msg_thread = MessageThread() 19 | self.agent_id = agent_id 20 | # Tracking of tool calls 21 | self.tool_call_sequence: list[dict] = [] 22 | self.tool_call_layers: list[list[dict]] = [] 23 | self.curr_tool: str | None = None 24 | self.iteration_num = 0 25 | self.finish_status = True 26 | 27 | def add_user_message(self, text: str): 28 | """add a user message to the thread""" 29 | self.msg_thread.add_user(text) 30 | 31 | def add_system_message(self, text: str): 32 | """add a system message to the thread""" 33 | self.msg_thread.add_system(text) 34 | 35 | def add_model_message(self, text: str,tools: list): 36 | """add a model message to the thread""" 37 | self.msg_thread.add_model(text,tools) 38 | 39 | @abstractmethod 40 | def run_task(self, print_callback=None) -> tuple[str, str, bool]: 41 | """ 42 | Execute the agent's primary function. 43 | Returns: 44 | - output (str): raw tool or LLM output 45 | - summary (str): one-line summary 46 | - success (bool): whether the action succeeded 47 | """ 48 | pass 49 | 50 | def init_msg_thread(self) -> None: 51 | pass 52 | 53 | def dispatch_intent( 54 | self, 55 | intent: FunctionCallIntent, 56 | # message_thread: MessageThread = None, 57 | # print_callback: Callable[[dict], None] | None = None, 58 | ) -> tuple[str, str, bool]: 59 | """ 60 | Dispatch a FunctionCallIntent to call the agent's tool methods. 61 | """ 62 | 63 | 64 | if intent.func_name not in self.api_functions: 65 | error = f"Unknown function name {intent.func_name}." 66 | summary = "You called a tool that does not exist." 67 | return error, summary, False 68 | 69 | func_obj = getattr(self, intent.func_name) 70 | try: 71 | self.curr_tool = intent.func_name 72 | # If function expects thread 73 | # if 'message_thread' in func_obj.__code__.co_varnames: 74 | # call_res = func_obj(message_thread, print_callback=print_callback) 75 | # else: 76 | call_res = func_obj(**intent.arg_values) 77 | except Exception as e: 78 | log_exception(e) 79 | error = str(e) 80 | summary = "Tool raised an exception." 
81 | call_res = (error, summary, False) 82 | 83 | logger.debug("Result of dispatch_intent: {}", call_res) 84 | 85 | # Record the call 86 | result, _, ok = call_res 87 | self.tool_call_sequence.append(intent.to_dict_with_result(ok, result, self.agent_id)) 88 | # if not self.tool_call_layers: 89 | # self.tool_call_layers.append([]) 90 | # self.tool_call_layers[-1].append(intent.to_dict_with_result(ok, result, self.agent_id)) 91 | 92 | return call_res 93 | 94 | def start_new_layer(self): 95 | self.tool_call_layers.append([]) 96 | 97 | def reset_tool_sequence(self): 98 | self.tool_call_sequence = [] 99 | 100 | def dump_tool_sequence(self, output_dir: str): 101 | os.makedirs(output_dir, exist_ok=True) 102 | seq_file = os.path.join(output_dir, 'tool_sequence.json') 103 | # layer_file = os.path.join(output_dir, 'agent_tool_layers.json') 104 | with open(seq_file, 'w') as f: 105 | json.dump(self.tool_call_sequence, f, indent=2) 106 | # with open(layer_file, 'w') as f: 107 | # json.dump(self.tool_call_layers, f, indent=2) 108 | -------------------------------------------------------------------------------- /data_collection/versioning/constants.py: -------------------------------------------------------------------------------- 1 | # Constants - Task Instance Version File 2 | MAP_REPO_TO_VERSION_PATHS = { 3 | "dbt-labs/dbt-core": ["core/dbt/version.py", "core/dbt/__init__.py"], 4 | "django/django": ["django/__init__.py"], 5 | "huggingface/transformers": ["src/transformers/__init__.py"], 6 | "marshmallow-code/marshmallow": ["src/marshmallow/__init__.py"], 7 | "mwaskom/seaborn": ["seaborn/__init__.py"], 8 | "pallets/flask": ["src/flask/__init__.py", "flask/__init__.py"], 9 | "psf/requests": ["requests/__version__.py", "requests/__init__.py"], 10 | "pyca/cryptography": [ 11 | "src/cryptography/__about__.py", 12 | "src/cryptography/__init__.py", 13 | ], 14 | "pylint-dev/astroid": ["astroid/__pkginfo__.py", "astroid/__init__.py"], 15 | "pylint-dev/pylint": ["pylint/__pkginfo__.py", "pylint/__init__.py"], 16 | "pytest-dev/pytest": ["src/_pytest/_version.py", "_pytest/_version.py"], 17 | "pyvista/pyvista": ["pyvista/_version.py", "pyvista/__init__.py"], 18 | "Qiskit/qiskit": ["qiskit/VERSION.txt"], 19 | "scikit-learn/scikit-learn": ["sklearn/__init__.py"], 20 | "sphinx-doc/sphinx": ["sphinx/__init__.py"], 21 | "sympy/sympy": ["sympy/release.py", "sympy/__init__.py"], 22 | "pillow/pillow": ["src/PIL/_version.py"], 23 | 'dateutil/dateutil': ['NEWS'], 24 | 'python/mypy': ['mypy/version.py'], 25 | 'redis/redis-py': ['setup.py', 'redis/__init__.py'], 26 | 'tqdm/tqdm': ['tqdm/_version.py'], 27 | 'prettier/prettier': ['package.json'], 28 | 'tailwindlabs/tailwindcss': ['package.json'], 29 | 'jestjs/jest': ['lerna.json', 'package.json'], 30 | 'webpack/webpack': ['package.json'], 31 | 'apollographql/apollo-client': ['package.json'], 32 | 'iamkun/dayjs': ['CHANGELOG.md'], 33 | 'babel/babel': ['package.json'], 34 | 'statsmodels/statsmodels': ['docs/source/release/index.rst'], 35 | "assertj/assertj": ['pom.xml'], 36 | "netty/netty": ['pom.xml'], 37 | "google/gson": ['pom.xml'], 38 | } 39 | 40 | # Constants - Task Instance Version Regex Pattern 41 | MAP_REPO_TO_VERSION_PATTERNS = { 42 | k: [r'__version__ = [\'"](.*)[\'"]', r"VERSION = \((.*)\)"] 43 | 44 | for k in [ 45 | "dbt-labs/dbt-core", 46 | "django/django", 47 | "huggingface/transformers", 48 | "marshmallow-code/marshmallow", 49 | "mwaskom/seaborn", 50 | "pallets/flask", 51 | "psf/requests", 52 | "pyca/cryptography", 53 | "pylint-dev/astroid", 54 | "pylint-dev/pylint", 55 | 
"scikit-learn/scikit-learn", 56 | "sphinx-doc/sphinx", 57 | "sympy/sympy", 58 | 'python/mypy', 59 | ] 60 | } 61 | MAP_REPO_TO_VERSION_PATTERNS.update({ 62 | k: [ 63 | r'\[\s*(\d+\.\d+\.\d+)\s*\]' 64 | ] for k in ['iamkun/dayjs'] 65 | }) 66 | MAP_REPO_TO_VERSION_PATTERNS.update({ 67 | k: [ 68 | r'version(\d+\.\d+(?:\.\d+)?(?:-\d+)?)' 69 | ] for k in ['statsmodels/statsmodels'] 70 | }) 71 | MAP_REPO_TO_VERSION_PATTERNS.update( 72 | { 73 | k: [ 74 | r'"version":\s*"([^"]+)"' 75 | 76 | ] 77 | for k in [ 78 | 79 | 'prettier/prettier', 80 | 'tailwindlabs/tailwindcss', 81 | 'jestjs/jest', 82 | 'webpack/webpack', 83 | 'babel/babel', 84 | 'apollographql/apollo-client' 85 | ] 86 | } 87 | ) 88 | 89 | MAP_REPO_TO_VERSION_PATTERNS.update({ 90 | k:[ 91 | r'(\d+(\.\d+)*\.[A-Za-z][A-Za-z0-9\-]*)<\/version>', r'(\d+\.\d+\.\d+(?:-\w+)?)' 92 | ] 93 | for k in [ 94 | "netty/netty", 95 | "assertj/assertj", 96 | "google/gson" 97 | ]} 98 | ) 99 | 100 | 101 | 102 | MAP_REPO_TO_VERSION_PATTERNS.update( 103 | { 104 | k: [ 105 | r'__version__ = [\'"](.*)[\'"]', 106 | r'__version__ = version = [\'"](.*)[\'"]', 107 | r"VERSION = \((.*)\)", 108 | ] 109 | for k in ["pytest-dev/pytest", "matplotlib/matplotlib"] 110 | } 111 | ) 112 | MAP_REPO_TO_VERSION_PATTERNS.update({k: [r"Version\s+(\d+\.\d+(?:\.\d+)?)"] for k in ["dateutil/dateutil"]}) 113 | MAP_REPO_TO_VERSION_PATTERNS.update({k: [r"(.*)"] for k in ["Qiskit/qiskit"]}) 114 | MAP_REPO_TO_VERSION_PATTERNS.update({k: [r"version_info = [\d]+,[\d\s]+,"] for k in ["pyvista/pyvista"]}) 115 | MAP_REPO_TO_VERSION_PATTERNS.update({k:[r"version_info = [\d]+, [\d]+, [\d\s]+"] for k in ['tqdm/tqdm']}) 116 | MAP_REPO_TO_VERSION_PATTERNS.update( 117 | { 118 | k: [ 119 | 120 | r'version="(.*?)"', 121 | r'__version__ = [\'"](.*)[\'"]', 122 | 123 | ] 124 | for k in ['redis/redis-py'] 125 | } 126 | ) 127 | SWE_BENCH_URL_RAW = 'https://github.com/' 128 | # SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/" -------------------------------------------------------------------------------- /app/model/gemini.py: -------------------------------------------------------------------------------- 1 | """ 2 | For models other than those from OpenAI, use LiteLLM if possible. 3 | """ 4 | 5 | import os 6 | import sys 7 | from typing import Literal 8 | 9 | import litellm 10 | from litellm.utils import Choices, Message, ModelResponse 11 | from openai import BadRequestError 12 | from tenacity import retry, stop_after_attempt, wait_random_exponential 13 | 14 | from app.log import log_and_print 15 | from app.model import common 16 | from app.model.common import Model 17 | 18 | 19 | class GeminiModel(Model): 20 | """ 21 | Base class for creating Singleton instances of Gemini models. 22 | """ 23 | 24 | _instances = {} 25 | 26 | def __new__(cls): 27 | if cls not in cls._instances: 28 | cls._instances[cls] = super().__new__(cls) 29 | cls._instances[cls]._initialized = False 30 | return cls._instances[cls] 31 | 32 | def __init__( 33 | self, 34 | name: str, 35 | cost_per_input: float, 36 | cost_per_output: float, 37 | parallel_tool_call: bool = False, 38 | ): 39 | if self._initialized: 40 | return 41 | super().__init__(name, cost_per_input, cost_per_output, parallel_tool_call) 42 | self._initialized = True 43 | 44 | def setup(self) -> None: 45 | """ 46 | Check API key. 
47 | """ 48 | self.check_api_key() 49 | 50 | def check_api_key(self) -> str: 51 | key_name = "GEMINI_API_KEY" 52 | credential_name = "GOOGLE_APPLICATION_CREDENTIALS" 53 | 54 | gemini_key = os.getenv(key_name) 55 | credential_key = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") 56 | if not (gemini_key or credential_key): 57 | print(f"Please set the {key_name} or {credential_name} env var") 58 | sys.exit(1) 59 | return gemini_key or credential_key 60 | 61 | def extract_resp_content(self, chat_message: Message) -> str: 62 | """ 63 | Given a chat completion message, extract the content from it. 64 | """ 65 | content = chat_message.content 66 | if content is None: 67 | return "" 68 | else: 69 | return content 70 | 71 | @retry(wait=wait_random_exponential(min=30, max=600), stop=stop_after_attempt(3)) 72 | def call( 73 | self, 74 | messages: list[dict], 75 | top_p=1, 76 | tools=None, 77 | response_format: Literal["text", "json_object"] = "text", 78 | **kwargs, 79 | ): 80 | # FIXME: ignore tools field since we don't use tools now 81 | try: 82 | prefill_content = "{" 83 | if response_format == "json_object": # prefill 84 | messages.append({"role": "assistant", "content": prefill_content}) 85 | 86 | response = litellm.completion( 87 | model=self.name, 88 | messages=messages, 89 | temperature=common.MODEL_TEMP, 90 | max_tokens=1024, 91 | top_p=top_p, 92 | stream=False, 93 | ) 94 | assert isinstance(response, ModelResponse) 95 | resp_usage = response.usage 96 | assert resp_usage is not None 97 | input_tokens = int(resp_usage.prompt_tokens) 98 | output_tokens = int(resp_usage.completion_tokens) 99 | cost = self.calc_cost(input_tokens, output_tokens) 100 | 101 | common.thread_cost.process_cost += cost 102 | common.thread_cost.process_input_tokens += input_tokens 103 | common.thread_cost.process_output_tokens += output_tokens 104 | 105 | first_resp_choice = response.choices[0] 106 | assert isinstance(first_resp_choice, Choices) 107 | resp_msg: Message = first_resp_choice.message 108 | content = self.extract_resp_content(resp_msg) 109 | if response_format == "json_object": 110 | # prepend the prefilled character 111 | if not content.startswith(prefill_content): 112 | content = prefill_content + content 113 | 114 | return content, cost, input_tokens, output_tokens 115 | 116 | except BadRequestError as e: 117 | if e.code == "context_length_exceeded": 118 | log_and_print("Context length exceeded") 119 | raise e 120 | 121 | 122 | class GeminiPro(GeminiModel): 123 | def __init__(self): 124 | super().__init__( 125 | "gemini-1.0-pro-002", 0.00000035, 0.00000105, parallel_tool_call=True 126 | ) 127 | self.note = "Gemini 1.0 from Google" 128 | 129 | 130 | class Gemini15Pro(GeminiModel): 131 | def __init__(self): 132 | super().__init__( 133 | "gemini-1.5-pro-preview-0409", 134 | 0.00000035, 135 | 0.00000105, 136 | parallel_tool_call=True, 137 | ) 138 | self.note = "Gemini 1.5 from Google" 139 | -------------------------------------------------------------------------------- /data_collection/versioning/merge_final_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import json 4 | import sys 5 | from pathlib import Path 6 | import logging 7 | 8 | logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") 9 | logger = logging.getLogger(__name__) 10 | 11 | def read_instances(path: Path): 12 | """ 13 | Reads instance files, returns an empty list if it does not exist. 14 | Supports .json and .jsonl formats. 
15 | """ 16 | if not path.exists(): 17 | logger.warning(f"File not found: {path}, treating as an empty list") 18 | return [] 19 | try: 20 | text = path.read_text(encoding='utf-8') 21 | if path.suffix.lower() == '.jsonl': 22 | return [json.loads(line) for line in text.splitlines() if line.strip()] 23 | else: 24 | return json.loads(text) 25 | except Exception as e: 26 | logger.error(f"Failed to read or parse file ({path}): {e}") 27 | sys.exit(1) 28 | 29 | def write_instances(instances, path: Path): 30 | """ 31 | Writes the list of instances, automatically selecting json or jsonl format based on the suffix. 32 | """ 33 | path.parent.mkdir(parents=True, exist_ok=True) 34 | try: 35 | if path.suffix.lower() == '.jsonl': 36 | with path.open('w', encoding='utf-8') as f: 37 | for inst in instances: 38 | f.write(json.dumps(inst, ensure_ascii=False) + '\n') 39 | else: 40 | path.write_text( 41 | json.dumps(instances, indent=2, ensure_ascii=False), encoding='utf-8' 42 | ) 43 | except Exception as e: 44 | logger.error(f"Failed to write file ({path}): {e}") 45 | sys.exit(1) 46 | 47 | def merge(primary, secondary): 48 | """ 49 | Takes primary as the main list and absorbs all pull_numbers from secondary 50 | that are not in primary, ensuring pull_number uniqueness. 51 | """ 52 | seen = {inst.get('pull_number') for inst in primary if 'pull_number' in inst} 53 | out = list(primary) 54 | for inst in secondary: 55 | pn = inst.get('pull_number') 56 | if pn is None: 57 | logger.warning("Skipping entry with missing pull_number") 58 | continue 59 | if pn not in seen: 60 | out.append(inst) 61 | seen.add(pn) 62 | return out 63 | 64 | def find_version_file(directory: Path, suffix: str): 65 | """ 66 | Finds a version file in the directory ending with suffix, supporting .json or .jsonl. 67 | Returns the first Path found, or None. 68 | """ 69 | # First, look for a fixed format: dirname + suffix + ext 70 | for ext in ('.json', '.jsonl'): 71 | candidate = directory / f"{directory.name}{suffix}{ext}" 72 | if candidate.exists(): 73 | return candidate 74 | # Then, use wildcards 75 | for ext in ('.json', '.jsonl'): 76 | matches = list(directory.glob(f"*{suffix}{ext}")) 77 | if matches: 78 | return matches[0] 79 | return None 80 | 81 | def main(): 82 | parser = argparse.ArgumentParser( 83 | description="Merges `_versions_by_github` and `_versions_by_git` files in the same directory, and outputs `_versions_final`" 84 | ) 85 | parser.add_argument("input_dir", help="Directory containing the version files") 86 | args = parser.parse_args() 87 | 88 | directory = Path(args.input_dir) 89 | if not directory.is_dir(): 90 | logger.error(f"Input is not a directory: {directory}") 91 | sys.exit(1) 92 | 93 | # 1. Find the two version files 94 | github_file = find_version_file(directory, "_versions_by_github") 95 | git_file = find_version_file(directory, "_versions_by_git") 96 | 97 | # 2. 
Read the files 98 | if github_file: 99 | logger.info(f"Using GitHub version file: {github_file.name}") 100 | primary = read_instances(github_file) 101 | ext = github_file.suffix 102 | else: 103 | logger.info("Could not find `_versions_by_github`, setting primary list to empty") 104 | primary = [] 105 | ext = None 106 | 107 | if git_file: 108 | logger.info(f"Using Git checkout version file: {git_file.name}") 109 | secondary = read_instances(git_file) 110 | if ext is None: 111 | ext = git_file.suffix 112 | else: 113 | logger.info("Could not find `_versions_by_git`, setting secondary list to empty") 114 | secondary = [] 115 | if ext is None: 116 | ext = ".json" 117 | 118 | # 3. Merge 119 | merged = merge(primary, secondary) 120 | 121 | # 3.1 Sort by pull_number in descending order (converted to int) 122 | try: 123 | merged.sort(key=lambda x: int(x.get('pull_number', 0)), reverse=True) 124 | except (ValueError, TypeError): 125 | merged.sort(key=lambda x: x.get('pull_number', ""), reverse=True) 126 | 127 | # 4. Write to versions_final 128 | output_path = directory / f"{directory.name}_versions_final{ext}" 129 | write_instances(merged, output_path) 130 | logger.info( 131 | f"✅ Merge complete: {len(primary)} + {len(merged)-len(primary)} new = {len(merged)} total entries, written to {output_path.name}" 132 | ) 133 | 134 | if __name__ == "__main__": 135 | main() -------------------------------------------------------------------------------- /app/model/groq.py: -------------------------------------------------------------------------------- 1 | """ 2 | Interfacing with Groq cloud. 3 | """ 4 | 5 | import os 6 | import sys 7 | from typing import Literal 8 | 9 | import litellm 10 | from litellm.utils import Choices, Message, ModelResponse 11 | from openai import BadRequestError 12 | from tenacity import retry, stop_after_attempt, wait_random_exponential 13 | 14 | from app.log import log_and_print 15 | from app.model import common 16 | from app.model.common import Model 17 | 18 | # litellm.set_verbose = True 19 | 20 | 21 | class GroqModel(Model): 22 | """ 23 | Base class for creating Singleton instances of Groq models. 24 | We use native API from Groq through LiteLLM. 25 | """ 26 | 27 | _instances = {} 28 | 29 | def __new__(cls): 30 | if cls not in cls._instances: 31 | cls._instances[cls] = super().__new__(cls) 32 | cls._instances[cls]._initialized = False 33 | return cls._instances[cls] 34 | 35 | def __init__( 36 | self, 37 | name: str, 38 | cost_per_input: float, 39 | cost_per_output: float, 40 | parallel_tool_call: bool = False, 41 | ): 42 | if self._initialized: 43 | return 44 | super().__init__(name, cost_per_input, cost_per_output, parallel_tool_call) 45 | self._initialized = True 46 | 47 | def setup(self) -> None: 48 | """ 49 | Check Groq API key. 50 | """ 51 | self.check_api_key() 52 | 53 | def check_api_key(self) -> str: 54 | """ 55 | Check for the GROQ_API_KEY environment variable. 56 | """ 57 | key = os.environ.get("GROQ_API_KEY") 58 | if not key: 59 | log_and_print("Please set the GROQ_API_KEY env var") 60 | sys.exit(1) 61 | return key 62 | 63 | def extract_resp_content(self, chat_message: Message) -> str: 64 | """ 65 | Given a chat completion message, extract the content from it. 
66 | """ 67 | content = chat_message.content 68 | if content is None: 69 | return "" 70 | else: 71 | return content 72 | 73 | @retry(wait=wait_random_exponential(min=30, max=600), stop=stop_after_attempt(3)) 74 | def call( 75 | self, 76 | messages: list[dict], 77 | top_p=1, 78 | tools=None, 79 | response_format: Literal["text", "json_object"] = "text", 80 | **kwargs, 81 | ): 82 | """ 83 | Calls the Groq API to generate completions for the given inputs. 84 | """ 85 | # FIXME: ignore tools field since we don't use tools now 86 | try: 87 | # groq models - prefilling response with { increase the success rate 88 | # of producing json output 89 | prefill_content = "{" 90 | if response_format == "json_object": # prefill 91 | messages.append({"role": "assistant", "content": prefill_content}) 92 | 93 | response = litellm.completion( 94 | model=self.name, 95 | messages=messages, 96 | temperature=common.MODEL_TEMP, 97 | max_tokens=1024, 98 | top_p=top_p, 99 | stream=False, 100 | ) 101 | assert isinstance(response, ModelResponse) 102 | resp_usage = response.usage 103 | assert resp_usage is not None 104 | input_tokens = int(resp_usage.prompt_tokens) 105 | output_tokens = int(resp_usage.completion_tokens) 106 | cost = self.calc_cost(input_tokens, output_tokens) 107 | 108 | common.thread_cost.process_cost += cost 109 | common.thread_cost.process_input_tokens += input_tokens 110 | common.thread_cost.process_output_tokens += output_tokens 111 | 112 | first_resp_choice = response.choices[0] 113 | assert isinstance(first_resp_choice, Choices) 114 | resp_msg: Message = first_resp_choice.message 115 | content = self.extract_resp_content(resp_msg) 116 | if response_format == "json_object": 117 | # prepend the prefilled character 118 | if not content.startswith(prefill_content): 119 | content = prefill_content + content 120 | return content, cost, input_tokens, output_tokens 121 | 122 | except BadRequestError as e: 123 | if e.code == "context_length_exceeded": 124 | log_and_print("Context length exceeded") 125 | raise e 126 | 127 | 128 | class Llama3_8B(GroqModel): 129 | def __init__(self): 130 | super().__init__( 131 | "groq/llama3-8b-8192", 0.00000005, 0.00000010, parallel_tool_call=True 132 | ) 133 | self.note = "The champion of the Llama series with 8B params from Meta" 134 | 135 | 136 | class Llama3_70B(GroqModel): 137 | def __init__(self): 138 | super().__init__( 139 | "groq/llama3-70b-8192", 0.00000059, 0.00000079, parallel_tool_call=True 140 | ) 141 | self.note = "Llama lastest model with 70B params" 142 | 143 | 144 | class Mixtral_8x7B(GroqModel): 145 | def __init__(self): 146 | super().__init__( 147 | "groq/mixtral-8x7b-32768", 0.00000027, 0.00000027, parallel_tool_call=True 148 | ) 149 | self.note = "Balanced blend of speed and power from Mixtral team with 8 layers and 7B parameters" 150 | 151 | 152 | class Gemma_7B(GroqModel): 153 | def __init__(self): 154 | super().__init__( 155 | "groq/gemma-7b-it", 0.0000001, 0.0000001, parallel_tool_call=True 156 | ) 157 | self.note = "A state-of-the-art open model from Google, boasting 7B parameters" 158 | -------------------------------------------------------------------------------- /app/model/claude.py: -------------------------------------------------------------------------------- 1 | """ 2 | For models other than those from OpenAI, use LiteLLM if possible. 
3 | """ 4 | 5 | import os 6 | import sys 7 | from typing import Literal 8 | 9 | import litellm 10 | from litellm.utils import Choices, Message, ModelResponse 11 | from openai import BadRequestError 12 | from tenacity import retry, stop_after_attempt, wait_random_exponential 13 | 14 | from app.log import log_and_print 15 | from app.model import common 16 | from app.model.common import Model 17 | 18 | 19 | class AnthropicModel(Model): 20 | """ 21 | Base class for creating Singleton instances of Antropic models. 22 | """ 23 | 24 | _instances = {} 25 | 26 | def __new__(cls): 27 | if cls not in cls._instances: 28 | cls._instances[cls] = super().__new__(cls) 29 | cls._instances[cls]._initialized = False 30 | return cls._instances[cls] 31 | 32 | def __init__( 33 | self, 34 | name: str, 35 | cost_per_input: float, 36 | cost_per_output: float, 37 | max_output_token: int = 4096, 38 | parallel_tool_call: bool = False, 39 | ): 40 | if self._initialized: 41 | return 42 | super().__init__(name, cost_per_input, cost_per_output, parallel_tool_call) 43 | self.max_output_token = max_output_token 44 | self._initialized = True 45 | 46 | def setup(self) -> None: 47 | """ 48 | Check API key. 49 | """ 50 | self.check_api_key() 51 | 52 | def check_api_key(self) -> str: 53 | key_name = "ANTHROPIC_API_KEY" 54 | key = os.getenv(key_name) 55 | if not key: 56 | print(f"Please set the {key_name} env var") 57 | sys.exit(1) 58 | return key 59 | 60 | def extract_resp_content(self, chat_message: Message) -> str: 61 | """ 62 | Given a chat completion message, extract the content from it. 63 | """ 64 | content = chat_message.content 65 | if content is None: 66 | return "" 67 | else: 68 | return content 69 | 70 | @retry(wait=wait_random_exponential(min=30, max=600), stop=stop_after_attempt(3)) 71 | def call( 72 | self, 73 | messages: list[dict], 74 | top_p=1, 75 | tools=None, 76 | response_format: Literal["text", "json_object"] = "text", 77 | temperature: float | None = None, 78 | **kwargs, 79 | ): 80 | # FIXME: ignore tools field since we don't use tools now 81 | if temperature is None: 82 | temperature = common.MODEL_TEMP 83 | 84 | try: 85 | # antropic models - prefilling response with { increase the success rate 86 | # of producing json output 87 | prefill_content = "{" 88 | if response_format == "json_object": # prefill 89 | messages.append({"role": "assistant", "content": prefill_content}) 90 | 91 | response = litellm.completion( 92 | model=self.name, 93 | messages=messages, 94 | temperature=temperature, 95 | max_tokens=self.max_output_token, 96 | top_p=top_p, 97 | stream=False, 98 | ) 99 | assert isinstance(response, ModelResponse) 100 | resp_usage = response.usage 101 | assert resp_usage is not None 102 | input_tokens = int(resp_usage.prompt_tokens) 103 | output_tokens = int(resp_usage.completion_tokens) 104 | cost = self.calc_cost(input_tokens, output_tokens) 105 | 106 | common.thread_cost.process_cost += cost 107 | common.thread_cost.process_input_tokens += input_tokens 108 | common.thread_cost.process_output_tokens += output_tokens 109 | 110 | first_resp_choice = response.choices[0] 111 | assert isinstance(first_resp_choice, Choices) 112 | resp_msg: Message = first_resp_choice.message 113 | content = self.extract_resp_content(resp_msg) 114 | if response_format == "json_object": 115 | # prepend the prefilled character 116 | if not content.startswith(prefill_content): 117 | content = prefill_content + content 118 | return content, cost, input_tokens, output_tokens 119 | 120 | except BadRequestError as e: 121 | if 
e.code == "context_length_exceeded": 122 | log_and_print("Context length exceeded") 123 | raise e 124 | 125 | 126 | class Claude3Opus(AnthropicModel): 127 | def __init__(self): 128 | super().__init__( 129 | "claude-3-opus-20240229", 0.000015, 0.000075, parallel_tool_call=True 130 | ) 131 | self.note = "Most powerful model among Claude 3" 132 | 133 | 134 | class Claude3Sonnet(AnthropicModel): 135 | def __init__(self): 136 | super().__init__( 137 | "claude-3-sonnet-20240229", 0.000003, 0.000015, parallel_tool_call=True 138 | ) 139 | self.note = "Most balanced (intelligence and speed) model from Anthropic" 140 | 141 | 142 | class Claude3Haiku(AnthropicModel): 143 | def __init__(self): 144 | super().__init__( 145 | "claude-3-haiku-20240307", 0.00000025, 0.00000125, parallel_tool_call=True 146 | ) 147 | self.note = "Fastest model from Anthropic" 148 | 149 | 150 | # class Claude3_5Sonnet(AnthropicModel): 151 | # def __init__(self): 152 | # super().__init__( 153 | # "claude-3-5-sonnet-20240620", 154 | # 0.000003, 155 | # 0.000015, 156 | # max_output_token=8192, 157 | # parallel_tool_call=True, 158 | # ) 159 | # self.note = "Most intelligent model from Anthropic" -------------------------------------------------------------------------------- /app/agents/write_dockerfile_agent/write_dockerfile_agent.py: -------------------------------------------------------------------------------- 1 | from app.data_structures import MessageThread 2 | from app.agents.write_dockerfile_agent import write_dockerfile_utils 3 | from app.agents.agent import Agent 4 | from app.task import Task 5 | import os 6 | import shutil 7 | from loguru import logger 8 | import re 9 | from app.log import ( 10 | print_acr, 11 | print_banner, 12 | print_retrieval, 13 | ) 14 | from os.path import join as pjoin 15 | 16 | 17 | class WriteDockerfileAgent(Agent): 18 | """ 19 | LLM-based agent for creating or modifying a Dockerfile via direct chat. 20 | Manages its own create/modify logic, output directories, and retry behavior. 21 | """ 22 | api_functions: list[str] = [] 23 | def __init__(self, task: Task, output_dir: str, repo_basic_info: str, using_ubuntu_only: bool = False): 24 | super().__init__(agent_id="WriteDockerfileAgent") 25 | self.msg_thread = MessageThread() 26 | self.task = task 27 | self.output_dir = os.path.abspath(output_dir) 28 | self.run_count = 0 29 | self.reference_setup = None 30 | self.repo_basic_info = repo_basic_info 31 | self.init_msg_thread() 32 | self.using_ubuntu_only = using_ubuntu_only 33 | 34 | 35 | def init_msg_thread(self) -> None: 36 | self.msg_thread = MessageThread() 37 | self.add_system_message(write_dockerfile_utils.get_system_prompt_dockerfile()) 38 | self.add_user_message(self.repo_basic_info) 39 | 40 | def add_reference_message(self) -> None: 41 | if self.reference_setup: 42 | reference_version = self.reference_setup['version'] 43 | reference_dockerfile = self.reference_setup['dockerfile'] 44 | reference_text = ( 45 | f"I found a Dockerfile from version {reference_version} of this repo that worked well in a similar setup. " 46 | "You might consider it as a reference—if its configuration aligns with your current environment, it could " 47 | "save you some effort. Otherwise, feel free to adapt or disregard as needed:\n\n" 48 | f"{reference_dockerfile}" 49 | ) 50 | self.add_user_message(reference_text) 51 | 52 | 53 | def run_task(self, print_callback=None) -> tuple[str, str, bool]: 54 | """ 55 | Create or modify a Dockerfile based on the given message_thread context. 
56 | Handles versioning, directory management, and fallback copy logic. 57 | """ 58 | # 1. Determine previous vs current output paths 59 | print_banner(f"Iteration ROUND {self.iteration_num}: Dockerfile Generation ") 60 | prev_dir = self.get_latest_write_dockerfile_output_dir() 61 | prev_file = os.path.join(prev_dir, 'Dockerfile') 62 | self.run_count += 1 63 | curr_dir = self.get_latest_write_dockerfile_output_dir() 64 | os.makedirs(curr_dir, exist_ok=True) 65 | self.add_reference_message() 66 | # 2. Inject either modify or init prompt 67 | if os.path.exists(prev_file): 68 | modify_prompt = write_dockerfile_utils.get_user_prompt_modify_dockerfile() 69 | # add previous Dockerfile content 70 | prev_content = self._read_file(prev_file) 71 | self.add_user_message(f"Previous dockerfile:\n{prev_content}\n") 72 | self.add_user_message(modify_prompt) 73 | else: 74 | if self.using_ubuntu_only: 75 | self.add_user_message(write_dockerfile_utils.get_user_prompt_init_dockerfile_using_ubuntu_only()) 76 | else: 77 | self.add_user_message(write_dockerfile_utils.get_user_prompt_init_dockerfile()) 78 | 79 | # 3. Delegate to the retryable writer 80 | task_output = write_dockerfile_utils.write_dockerfile_with_retries( 81 | self.msg_thread, 82 | curr_dir, 83 | self.task, 84 | print_callback=print_callback 85 | ) 86 | 87 | # 4. Post-process: validate or fallback copy 88 | dockerfile_path = os.path.join(curr_dir, 'Dockerfile') 89 | if not os.path.isfile(dockerfile_path): 90 | 91 | # fallback: copy previous 92 | if os.path.exists(prev_file): 93 | shutil.copy(prev_file, dockerfile_path) 94 | summary = "Dockerfile generation failed." 95 | is_ok = False 96 | else: 97 | summary = "Dockerfile created/updated successfully." 98 | is_ok = True 99 | 100 | dockerfile_output_dir = self.get_latest_write_dockerfile_output_dir() 101 | conversation_file = pjoin(dockerfile_output_dir, f"conversation.json") 102 | self.msg_thread.save_to_file(conversation_file) 103 | # self.init_msg_thread() 104 | return task_output, summary, is_ok 105 | 106 | def _read_file(self, path: str) -> str: 107 | try: 108 | with open(path, 'r') as f: 109 | return f.read() 110 | except Exception: 111 | return "" 112 | 113 | def get_latest_write_dockerfile_output_dir(self) -> str: 114 | """ 115 | Return the directory of the most recent Dockerfile outputs. 116 | """ 117 | return os.path.join(self.output_dir, f"write_dockerfile_agent_{self.run_count}") 118 | 119 | def get_latest_dockerfile(self) -> str: 120 | """ 121 | Read and return contents of the latest generated Dockerfile. 122 | """ 123 | path = os.path.join(self.get_latest_write_dockerfile_output_dir(), 'Dockerfile') 124 | try: 125 | with open(path, 'r') as f: 126 | return f.read() 127 | except Exception as e: 128 | logger.error(f"Failed to read latest Dockerfile at {path}: {e}") 129 | return "" 130 | -------------------------------------------------------------------------------- /app/model/bedrock.py: -------------------------------------------------------------------------------- 1 | """ 2 | For models other than those from OpenAI, use LiteLLM if possible. 
3 | """ 4 | 5 | import os 6 | import sys 7 | from typing import Literal 8 | 9 | import litellm 10 | from litellm.utils import Choices, Message, ModelResponse 11 | from openai import BadRequestError 12 | from tenacity import retry, stop_after_attempt, wait_random_exponential 13 | 14 | from app.log import log_and_print 15 | from app.model import common 16 | from app.model.common import Model 17 | 18 | 19 | class BedrockModel(Model): 20 | """ 21 | Base class for creating Singleton instances of Amazon Bedrock models. 22 | """ 23 | 24 | _instances = {} 25 | 26 | def __new__(cls): 27 | if cls not in cls._instances: 28 | cls._instances[cls] = super().__new__(cls) 29 | cls._instances[cls]._initialized = False 30 | return cls._instances[cls] 31 | 32 | def __init__( 33 | self, 34 | name: str, 35 | cost_per_input: float, 36 | cost_per_output: float, 37 | parallel_tool_call: bool = False, 38 | ): 39 | if self._initialized: 40 | return 41 | super().__init__(name, cost_per_input, cost_per_output, parallel_tool_call) 42 | self._model_provider = self.name.split(".")[0] 43 | self._initialized = True 44 | 45 | def setup(self) -> None: 46 | """ 47 | Check API key. 48 | """ 49 | self.check_api_key() 50 | 51 | def check_api_key(self) -> str: 52 | # See https://litellm.vercel.app/docs/providers/bedrock 53 | required_env_vars = [ 54 | "AWS_ACCESS_KEY_ID", 55 | "AWS_SECRET_ACCESS_KEY", 56 | "AWS_REGION_NAME", 57 | ] 58 | if len(set(os.environ).intersection(required_env_vars)) != len( 59 | required_env_vars 60 | ): 61 | print( 62 | "Missing env vars. Please refer to https://litellm.vercel.app/docs/providers/bedrock" 63 | ) 64 | sys.exit(1) 65 | return os.getenv(required_env_vars[-1]) 66 | 67 | def extract_resp_content(self, chat_message: Message) -> str: 68 | """ 69 | Given a chat completion message, extract the content from it. 
70 | """ 71 | content = chat_message.content 72 | if content is None: 73 | return "" 74 | else: 75 | return content 76 | 77 | @retry(wait=wait_random_exponential(min=30, max=600), stop=stop_after_attempt(3)) 78 | def call( 79 | self, 80 | messages: list[dict], 81 | top_p=1, 82 | tools=None, 83 | response_format: Literal["text", "json_object"] = "text", 84 | **kwargs, 85 | ): 86 | try: 87 | if self._model_provider == "bedrock/anthropic": 88 | # antropic models - prefilling response with { increase the success rate 89 | # of producing json output 90 | prefill_content = "{" 91 | if response_format == "json_object": # prefill 92 | messages.append({"role": "assistant", "content": prefill_content}) 93 | 94 | response = litellm.completion( 95 | model=self.name, 96 | messages=messages, 97 | temperature=common.MODEL_TEMP, 98 | max_tokens=1024, 99 | top_p=top_p, 100 | stream=False, 101 | ) 102 | assert isinstance(response, ModelResponse) 103 | resp_usage = response.usage 104 | assert resp_usage is not None 105 | input_tokens = int(resp_usage.prompt_tokens) 106 | output_tokens = int(resp_usage.completion_tokens) 107 | cost = self.calc_cost(input_tokens, output_tokens) 108 | 109 | common.thread_cost.process_cost += cost 110 | common.thread_cost.process_input_tokens += input_tokens 111 | common.thread_cost.process_output_tokens += output_tokens 112 | 113 | first_resp_choice = response.choices[0] 114 | assert isinstance(first_resp_choice, Choices) 115 | resp_msg: Message = first_resp_choice.message 116 | content = self.extract_resp_content(resp_msg) 117 | if response_format == "json_object": 118 | # prepend the prefilled character 119 | if not content.startswith(prefill_content): 120 | content = prefill_content + content 121 | return content, cost, input_tokens, output_tokens 122 | 123 | except BadRequestError as e: 124 | if e.code == "context_length_exceeded": 125 | log_and_print("Context length exceeded") 126 | raise e 127 | 128 | 129 | class AnthropicClaude2(BedrockModel): 130 | def __init__(self): 131 | super().__init__( 132 | "bedrock/anthropic.claude-v2:1", 133 | 0.00000025, 134 | 0.00000125, 135 | parallel_tool_call=True, 136 | ) 137 | self.note = "Older Claude model" 138 | 139 | 140 | class AnthropicClaude3Opus(BedrockModel): 141 | def __init__(self): 142 | super().__init__( 143 | "bedrock/anthropic.claude-3-opus-20240229-v1:0", 144 | 0.000015, 145 | 0.000075, 146 | parallel_tool_call=True, 147 | ) 148 | self.note = "Most powerful model from Antropic" 149 | 150 | 151 | class AnthropicClaude3Sonnet(BedrockModel): 152 | def __init__(self): 153 | super().__init__( 154 | "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", 155 | 0.000003, 156 | 0.000015, 157 | parallel_tool_call=True, 158 | ) 159 | self.note = "Most balanced (intelligence and speed) model from Antropic" 160 | 161 | 162 | class AnthropicClaude3Haiku(BedrockModel): 163 | def __init__(self): 164 | super().__init__( 165 | "bedrock/anthropic.claude-3-haiku-20240307-v1:0", 166 | 0.00000025, 167 | 0.00000125, 168 | parallel_tool_call=True, 169 | ) 170 | self.note = "Fastest model from Antropic" 171 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 👉🏻 SWE-Factory 👈🏻 2 | 3 | Your automated factory for GitHub Issue Resolution Training Data and Evaluation Benchmarks. 
4 | 5 | [![SVG Banners](https://svg-banners.vercel.app/api?type=origin&text1=SWE-Factory%20🧑‍💻&text2=✨%20Build%20Your%20Own%20SWE-Bench%20and%20SWE-Gym,%20Automatically!&width=900&height=200)](https://github.com/Akshay090/svg-banners) 6 | 7 | 8 |

9 | 📃 Paper 10 | • 11 | 🤗 Data & Models 12 |

13 | 14 | ## 📰 News 15 | * **[Sep. 17, 2025]**: *Build your own SWE-Gym with SWE-Factory!* We trained a series of LLMs on 2,809 Python task instances constructed with our framework, all demonstrating clear performance improvements. For instance, the resolve rate of the fine-tuned Qwen2.5-Coder-14B-instruct model increased from 5.8% to 21.0%. The training trajectories sampled from our 2,809 task instances and the fine-tuned models are open-sourced on 🤗 Huggingface. 16 | * **[Sep. 15, 2025]**: We released SWE-Factory 1.5, which is more robust and has a higher success rate. 17 | 18 | 19 | ## ✨ Key Features 20 | 21 | - **An automated pipeline** for GitHub issue resolution data collection, reducing your manual effort! 22 | - **Reliable and reproducible Docker-based evaluation environments** 23 | - **Automatic environment construction using the LLM-powered multi-agent system, SWE-Builder** 24 | - **Support for multiple programming languages** (we have evaluated Python, Java, JS, and TS extensively) 25 | 26 | ## 📦 Environment Setup 27 | 28 | Our experiments are conducted with Docker version 27.0.3-1 on Ubuntu 22.04.4 LTS. 29 | 30 | To get started, run the following commands to set up the environment: 31 | 32 | ```bash 33 | conda create --name swe-factory python=3.12.5 -y 34 | conda activate swe-factory 35 | pip install -r requirements.txt 36 | ``` 37 | 38 | ## 🚀 Running SWE-Factory 39 | 40 | ### 📍 Stage I: Raw Issue Data Collection 41 | 42 | We use GitHub APIs and predefined patterns to collect raw issue data (e.g., `python-mypy-instances.jsonl`). Check the detailed tutorial in the [data_collection/collect](./data_collection/collect) directory. 43 | 44 | ### 🛠 Stage II: Automated Evaluation Environment Setup via SWE-Builder 45 | 46 | After collecting raw issue data, set up the evaluation environments by running: 47 | 48 | ```bash 49 | export OPENAI_API_BASE_URL= 50 | export OPENAI_KEY= 51 | 52 | python app/main.py swe-bench \ 53 | --model gpt-4.1-mini \ 54 | --tasks-map "python-mypy-instances.jsonl" \ 55 | --num-processes 10 \ 56 | --model-temperature 0.2 \ 57 | --conv-round-limit 10 \ 58 | --output-dir "output/git-4.1-mini/mypy" \ 59 | --setup-dir "testbed" \ 60 | --results-path "output/git-4.1-mini/mypy/results" 61 | ``` 62 | 63 | We employ SWE-Builder, an LLM-based multi-agent system consisting of: 64 | 65 | 1. **🔍 Repository Explorer** 66 | - Gathers environment setup and test commands automatically. 67 | 68 | 2. **🐳 Environment Manager** 69 | - Generates Dockerfiles for reproducible test environments. 70 | 71 | 3. **📝 Test Manager** 72 | - Writes evaluation scripts to run tests inside containers. 73 | 74 | 4. **🔬 Test Analyst** 75 | - Validates generated environments and orchestrates iterative refinement. 76 | 77 | 5. **💾 Evaluation Environment Memory Pool** 78 | - Reuses previously successful setups for efficiency and consistency. 
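Each line of the `--tasks-map` file above is one raw task instance. As a reference, here is a minimal sketch of such a record (the field values are hypothetical; the versioning scripts in `data_collection` assert that `repo`, `base_commit`, and `instance_id` are present in every task):

```python
import json

# Hypothetical raw task record; repo, instance_id, and base_commit are
# the keys the data_collection versioning scripts require. The
# instance_id follows the owner__name-number convention seen in the
# SetupBench-lite batch files.
record = {
    "repo": "mochajs/mocha",
    "instance_id": "mochajs__mocha-1878",
    "base_commit": "<40-character commit SHA>",
}

with open("tasks.jsonl", "a", encoding="utf-8") as f:  # illustrative filename
    f.write(json.dumps(record) + "\n")
```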
79 | 80 | ![Overview](figure/overview.png) 81 | 82 | #### 📊 SWE-Builder Evaluation Results 83 | 84 | We evaluated SWE-Builder using three base models: 85 | 86 | | Base Model | F2P (Fail2Pass) Rate (%) | Output Rate (%) | Cost (USD) | Time (min) | 87 | |---------------------------|--------------------------|------------------|------------|------------| 88 | | GPT-4.1-mini | 50.2 (337/671) | 64.8 (435/671) | 0.047 | 26.3 | 89 | | DeepSeek-v3-0324 | 42.0 (282/671) | 53.4 (358/671) | 0.037 | 23.0 | 90 | | Kimi-K2 | 47.8 (321/671) | 63.2 (424/671) | 0.056 | 30.2 | 91 | 92 | To reproduce these experiments: 93 | 94 | ```bash 95 | export OPENAI_API_BASE_URL= 96 | export OPENAI_KEY= 97 | bash run/run.sh 98 | ``` 99 | 100 | ### ✅ Stage III: Fail2Pass Validation 101 | 102 | After generating evaluation environments, perform Fail2Pass validation: 103 | 104 | 1. Obtain test logs before and after applying the ground-truth patch. Check [evaluation](./evaluation) for detailed instructions. 105 | 106 | 2. Run automated Fail2Pass validation: 107 | 108 | ```bash 109 | python scripts/judge_fail2pass.py evaluation/run_instance/mypy_gpt-4.1-mini/gold fail2pass_status.json 110 | ``` 111 | 112 | The validated instances can then be filtered using the generated `fail2pass_status.json`. 113 | 114 | **Note:** Although our automated validation demonstrates high precision, manual checks are recommended to ensure dataset quality, particularly to identify and filter out error-to-pass cases. 115 | 116 | ## 📌 Using Your Own Dataset 117 | 118 | After building your dataset for evaluation and training, check the [evaluation](./evaluation) directory for detailed instructions on how to run tests and obtain test execution feedback. 119 | 120 | ## 📖 Citation 121 | 122 | If SWE-Factory helps your research or projects, star ⭐ our repo or cite us: 123 | 124 | ```bibtex 125 | @article{guo2025swefactory, 126 | title={SWE-Factory: Your Automated Factory for Issue Resolution Training Data and Evaluation Benchmarks}, 127 | author={Lianghong Guo and Yanlin Wang and Caihua Li and Pengyu Yang and Jiachi Chen and Wei Tao and Yingtian Zou and Duyu Tang and Zibin Zheng}, 128 | journal={arXiv preprint arXiv:2506.10954}, 129 | year={2025}, 130 | url={https://arxiv.org/abs/2506.10954}, 131 | } 132 | ``` 133 | 134 | ## 🙏 Acknowledgements 135 | 136 | - We build upon prior research that is foundational to our work: **[R2E-Gym](https://github.com/R2E-Gym/R2E-Gym/)**, **[SWE-bench](https://arxiv.org/abs/2310.06770)**, **[AutoCodeRover](https://arxiv.org/abs/2404.05427)**, **[Magis](https://arxiv.org/abs/2403.17927)**, and **[OmniGIRL](https://arxiv.org/abs/2505.04606)**. 137 | - Huge thanks to the open-source developer community for your invaluable contributions to software engineering research. ❤️ 138 | -------------------------------------------------------------------------------- /app/model/ollama.py: -------------------------------------------------------------------------------- 1 | """ 2 | For models other than those from OpenAI, use LiteLLM if possible. 3 | Create all models managed by Ollama here, since they need to talk to ollama server. 
4 | """ 5 | 6 | import sys 7 | from collections.abc import Mapping 8 | from copy import deepcopy 9 | from typing import Literal, cast 10 | 11 | import httpx 12 | import ollama 13 | import timeout_decorator 14 | from ollama._types import Message, Options 15 | from openai.types.chat import ChatCompletionMessage 16 | 17 | from app.model import common 18 | from app.model.common import Model 19 | 20 | 21 | class OllamaModel(Model): 22 | """ 23 | Base class for creating Singleton instances of Ollama models. 24 | """ 25 | 26 | _instances = {} 27 | 28 | def __new__(cls): 29 | if cls not in cls._instances: 30 | cls._instances[cls] = super().__new__(cls) 31 | cls._instances[cls]._initialized = False 32 | return cls._instances[cls] 33 | 34 | def __init__(self, name: str): 35 | if self._initialized: 36 | return 37 | # local models are free 38 | super().__init__(name, 0.0, 0.0) 39 | self.client: ollama.Client | None = None 40 | self._initialized = True 41 | 42 | def setup(self) -> None: 43 | """ 44 | Check API key. 45 | """ 46 | self.check_api_key() 47 | try: 48 | self.send_empty_request() 49 | print(f"Model {self.name} is up and running.") 50 | except timeout_decorator.TimeoutError as e: 51 | print( 52 | "Ollama server is taking too long (more than 2 mins) to respond. Please check whether it's running.", 53 | e, 54 | ) 55 | sys.exit(1) 56 | except Exception as e: 57 | print("Could not communicate with ollama server due to exception.", e) 58 | sys.exit(1) 59 | 60 | @timeout_decorator.timeout(120) # 2 min 61 | def send_empty_request(self): 62 | """ 63 | Send an empty request to the model, for two purposes 64 | (1) check whether the model is up and running 65 | (2) preload the model for faster response time (models will be kept in memory for 5 mins after loaded) 66 | (see https://github.com/ollama/ollama/blob/main/docs/faq.md#how-can-i-pre-load-a-model-to-get-faster-response-times) 67 | """ 68 | # localhost is used when (1) running both ACR and ollama on host machine; and 69 | # (2) running ollama in host, and ACR in container with --net=host 70 | local_client = ollama.Client(host="http://localhost:11434") 71 | # docker_host_client is used when running ollama in host and ACR in container, and 72 | # Docker Desktop is installed 73 | docker_host_client = ollama.Client(host="http://host.docker.internal:11434") 74 | try: 75 | local_client.chat(model=self.name, messages=[]) 76 | self.client = local_client 77 | return 78 | except httpx.ConnectError: 79 | # failed to connect to client at localhost 80 | pass 81 | 82 | try: 83 | docker_host_client.chat(model=self.name, messages=[]) 84 | self.client = docker_host_client 85 | except httpx.ConnectError: 86 | # also failed to connect via host.docker.internal 87 | print("Could not connect to ollama server.") 88 | sys.exit(1) 89 | 90 | def check_api_key(self) -> str: 91 | return "No key required for local models." 92 | 93 | def extract_resp_content( 94 | self, chat_completion_message: ChatCompletionMessage 95 | ) -> str: 96 | """ 97 | Given a chat completion message, extract the content from it. 
98 | """ 99 | content = chat_completion_message.content 100 | if content is None: 101 | return "" 102 | else: 103 | return content 104 | 105 | def call( 106 | self, 107 | messages: list[dict], 108 | top_p=1, 109 | tools=None, 110 | response_format: Literal["text", "json_object"] = "text", 111 | **kwargs, 112 | ): 113 | stop_words = ["assistant", "\n\n \n\n"] 114 | json_stop_words = deepcopy(stop_words) 115 | json_stop_words.append("```") 116 | json_stop_words.append(" " * 10) 117 | # FIXME: ignore tools field since we don't use tools now 118 | 119 | assert self.client is not None 120 | try: 121 | # build up options for ollama 122 | options = {"temperature": common.MODEL_TEMP, "top_p": top_p} 123 | if response_format == "json_object": 124 | # additional instructions for json mode 125 | json_instruction = { 126 | "role": "user", 127 | "content": "Stop your response after a valid json is generated.", 128 | } 129 | messages.append(json_instruction) 130 | # give more stop words and lower max_token for json mode 131 | options.update({"stop": json_stop_words, "num_predict": 128}) 132 | response = self.client.chat( 133 | model=self.name, 134 | messages=cast(list[Message], messages), 135 | format="json", 136 | options=cast(Options, options), 137 | stream=False, 138 | ) 139 | else: 140 | options.update({"stop": stop_words, "num_predict": 1024}) 141 | response = self.client.chat( 142 | model=self.name, 143 | messages=cast(list[Message], messages), 144 | options=cast(Options, options), 145 | stream=False, 146 | ) 147 | 148 | assert isinstance(response, Mapping) 149 | resp_msg = response.get("message", None) 150 | if resp_msg is None: 151 | return "", 0, 0, 0 152 | 153 | content: str = resp_msg.get("content", "") 154 | return content, 0, 0, 0 155 | 156 | except Exception as e: 157 | # FIXME: catch appropriate exception from ollama 158 | raise e 159 | 160 | 161 | class Llama3_8B(OllamaModel): 162 | def __init__(self): 163 | super().__init__("llama3") 164 | self.note = "Llama3 8B model." 165 | 166 | 167 | class Llama3_70B(OllamaModel): 168 | def __init__(self): 169 | super().__init__("llama3:70b") 170 | self.note = "Llama3 70B model." 
171 | -------------------------------------------------------------------------------- /data_collection/collect/get_version.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import subprocess 4 | import re 5 | import json 6 | import argparse 7 | from contextlib import contextmanager 8 | from typing import List, Dict 9 | from concurrent.futures import ProcessPoolExecutor, as_completed 10 | 11 | @contextmanager 12 | def cd(newdir): 13 | prevdir = os.getcwd() 14 | os.chdir(os.path.expanduser(newdir)) 15 | try: 16 | yield 17 | finally: 18 | os.chdir(prevdir) 19 | 20 | def run_command(cmd: list[str], **kwargs) -> subprocess.CompletedProcess: 21 | try: 22 | return subprocess.run(cmd, check=True, **kwargs) 23 | except subprocess.CalledProcessError as e: 24 | print(f"Error running command: {cmd}, {e}") 25 | raise 26 | 27 | def get_version_by_git(cloned_dir: str) -> str: 28 | if not os.path.isdir(cloned_dir): 29 | raise NotADirectoryError(f"Invalid directory: {cloned_dir}") 30 | with cd(cloned_dir): 31 | result = run_command(["git", "describe", "--tags"], capture_output=True, text=True) 32 | version = result.stdout.strip() 33 | print(f"✔️ Current version: {version}") 34 | match = re.search(r"(\d+\.\d+)(?:\.\d+)?", version) 35 | if match: 36 | return match.group(1) 37 | raise RuntimeError(f"Unrecognized version: {version}") 38 | 39 | def get_instances(instance_path: str) -> List[Dict]: 40 | if instance_path.endswith((".jsonl", ".jsonl.all")): 41 | with open(instance_path, encoding="utf-8") as f: 42 | return [json.loads(line) for line in f] 43 | with open(instance_path, encoding="utf-8") as f: 44 | return json.load(f) 45 | 46 | def prepare_repo_cache(tasks: List[Dict], cache_dir: str) -> Dict[str, str]: 47 | os.makedirs(cache_dir, exist_ok=True) 48 | repo_cache = {} 49 | for task in tasks: 50 | repo = task["repo"] 51 | if repo in repo_cache: 52 | continue 53 | repo_url = f"https://github.com/{repo}.git" 54 | local_path = os.path.join(cache_dir, repo.replace("/", "__")) 55 | try: 56 | run_command(["git", "clone", repo_url, local_path], capture_output=True) 57 | repo_cache[repo] = local_path 58 | print(f"✅ Cached repo: {repo}") 59 | except Exception as e: 60 | print(f"❌ Failed to clone {repo}: {e}") 61 | return repo_cache 62 | 63 | def process_repo_task(task: Dict, testbed: str, repo_cache: Dict[str, str]) -> Dict | None: 64 | instance_id = task["instance_id"] 65 | repo = task["repo"] 66 | base_commit = task["base_commit"] 67 | repo_dir = os.path.join(testbed, instance_id) 68 | os.makedirs(repo_dir, exist_ok=True) 69 | 70 | try: 71 | cached_repo = repo_cache.get(repo) 72 | if not cached_repo or not os.path.exists(cached_repo): 73 | raise RuntimeError(f"Missing cached repo for {repo}") 74 | shutil.copytree(cached_repo, repo_dir, dirs_exist_ok=True) 75 | with cd(repo_dir): 76 | run_command(["git", "checkout", base_commit], capture_output=True) 77 | version = get_version_by_git(repo_dir) 78 | result = task.copy() 79 | result["version"] = version 80 | return result 81 | except Exception as e: 82 | print(f"❌ Failed: {instance_id} | {e}") 83 | return None 84 | finally: 85 | shutil.rmtree(repo_dir, ignore_errors=True) 86 | 87 | def process_repos(tasks: List[Dict], testbed: str, repo_cache: Dict[str, str], max_workers: int = 4) -> tuple[List[Dict], List[Dict]]: 88 | os.makedirs(testbed, exist_ok=True) 89 | results, failures = [], [] 90 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 91 | future_to_task = { 92 | 
executor.submit(process_repo_task, t, testbed, repo_cache): t for t in tasks 93 | } 94 | for future in as_completed(future_to_task): 95 | task = future_to_task[future] 96 | try: 97 | result = future.result() 98 | if result: 99 | results.append(result) 100 | else: 101 | failures.append(task) 102 | except Exception as e: 103 | print(f"Unexpected error in {task['instance_id']}: {e}") 104 | failures.append(task) 105 | return results, failures 106 | 107 | def save_results(results: List[Dict], output_path: str): 108 | os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) 109 | if output_path.endswith((".jsonl", ".jsonl.all")): 110 | with open(output_path, "w", encoding="utf-8") as f: 111 | for r in results: 112 | f.write(json.dumps(r) + "\n") 113 | else: 114 | with open(output_path, "w", encoding="utf-8") as f: 115 | json.dump(results, f, indent=2, ensure_ascii=False) 116 | 117 | def generate_output_path(instance_path: str, suffix="_versions") -> str: 118 | base, ext = os.path.splitext(instance_path) 119 | return f"{base}{suffix}{ext}" 120 | 121 | def main(): 122 | parser = argparse.ArgumentParser() 123 | parser.add_argument("--instance_path", type=str, required=True, help="Path to input task file (.json or .jsonl)") 124 | parser.add_argument("--testbed", type=str, required=True, help="Temp working directory for cloning repos") 125 | parser.add_argument("--max-workers", type=int, default=10, help="Number of processes (default: 10)") 126 | args = parser.parse_args() 127 | 128 | try: 129 | tasks = get_instances(args.instance_path) 130 | except Exception as e: 131 | print(f"❌ Error reading instance file: {e}") 132 | return 133 | 134 | required_keys = {"repo", "base_commit", "instance_id"} 135 | for t in tasks: 136 | if not required_keys.issubset(t): 137 | print(f"Invalid task format: {t}") 138 | return 139 | 140 | repo_cache_dir = os.path.join(args.testbed, "_cache") 141 | repo_cache = prepare_repo_cache(tasks, repo_cache_dir) 142 | 143 | results, failures = process_repos(tasks, args.testbed, repo_cache, args.max_workers) 144 | 145 | output_path = generate_output_path(args.instance_path, "_versions") 146 | save_results(results, output_path) 147 | print(f"\n✅ {len(results)} results saved to {output_path}") 148 | 149 | if failures: 150 | fail_path = generate_output_path(args.instance_path, "_failures") 151 | save_results(failures, fail_path) 152 | print(f"⚠️ {len(failures)} failures saved to {fail_path}") 153 | 154 | for r in results: 155 | print(json.dumps(r, indent=2, ensure_ascii=False)) 156 | 157 | if __name__ == "__main__": 158 | main() 159 | -------------------------------------------------------------------------------- /app/model/gptlitellm.py: -------------------------------------------------------------------------------- 1 | """ 2 | OpenAI models accessed through LiteLLM. 3 | """ 4 | 5 | import os 6 | import sys 7 | from typing import Literal 8 | 9 | import litellm 10 | from litellm.utils import Choices, Message, ModelResponse 11 | from openai import BadRequestError 12 | from tenacity import retry, stop_after_attempt, wait_random_exponential 13 | 14 | from app.log import log_and_print 15 | from app.model import common 16 | from app.model.common import Model 17 | 18 | 19 | class OpenaiLiteLLMModel(Model): 20 | """ 21 | Base class for creating Singleton instances of OpenAI models. 
22 | """ 23 | 24 | _instances = {} 25 | 26 | def __new__(cls): 27 | if cls not in cls._instances: 28 | cls._instances[cls] = super().__new__(cls) 29 | cls._instances[cls]._initialized = False 30 | return cls._instances[cls] 31 | 32 | def __init__( 33 | self, 34 | name: str, 35 | cost_per_input: float, 36 | cost_per_output: float, 37 | parallel_tool_call: bool = False, 38 | ): 39 | if self._initialized: 40 | return 41 | super().__init__(name, cost_per_input, cost_per_output, parallel_tool_call) 42 | self._initialized = True 43 | 44 | def setup(self) -> None: 45 | """ 46 | Check API key. 47 | """ 48 | self.check_api_key() 49 | 50 | def check_api_key(self) -> str: 51 | key_name = "OPENAI_KEY" 52 | key = os.getenv(key_name) 53 | if not key: 54 | print(f"Please set the {key_name} env var") 55 | sys.exit(1) 56 | os.environ["OPENAI_API_KEY"] = key 57 | return key 58 | 59 | def extract_resp_content(self, chat_message: Message) -> str: 60 | """ 61 | Given a chat completion message, extract the content from it. 62 | """ 63 | content = chat_message.content 64 | if content is None: 65 | return "" 66 | else: 67 | return content 68 | 69 | @retry(wait=wait_random_exponential(min=30, max=600), stop=stop_after_attempt(3)) 70 | def call( 71 | self, 72 | messages: list[dict], 73 | top_p=1, 74 | tools=None, 75 | response_format: Literal["text", "json_object"] = "text", 76 | **kwargs, 77 | ): 78 | # FIXME: ignore tools field since we don't use tools now 79 | try: 80 | prefill_content = "{" 81 | if response_format == "json_object": # prefill 82 | messages.append({"role": "assistant", "content": prefill_content}) 83 | 84 | response = litellm.completion( 85 | model=( 86 | self.name 87 | if not self.name.startswith("litellm-") 88 | else self.name[len("litellm-") :] 89 | ), 90 | messages=messages, 91 | temperature=common.MODEL_TEMP, 92 | max_tokens=4096, 93 | response_format={"type": response_format}, 94 | top_p=top_p, 95 | base_url=os.getenv("OPENAI_API_BASE_URL", None), 96 | stream=False, 97 | ) 98 | assert isinstance(response, ModelResponse) 99 | resp_usage = response.usage 100 | assert resp_usage is not None 101 | input_tokens = int(resp_usage.prompt_tokens) 102 | output_tokens = int(resp_usage.completion_tokens) 103 | cost = self.calc_cost(input_tokens, output_tokens) 104 | 105 | common.thread_cost.process_cost += cost 106 | common.thread_cost.process_input_tokens += input_tokens 107 | common.thread_cost.process_output_tokens += output_tokens 108 | 109 | first_resp_choice = response.choices[0] 110 | assert isinstance(first_resp_choice, Choices) 111 | resp_msg: Message = first_resp_choice.message 112 | content = self.extract_resp_content(resp_msg) 113 | if response_format == "json_object": 114 | # prepend the prefilled character 115 | if not content.startswith(prefill_content): 116 | content = prefill_content + content 117 | 118 | return content, cost, input_tokens, output_tokens 119 | 120 | except BadRequestError as e: 121 | if e.code == "context_length_exceeded": 122 | log_and_print("Context length exceeded") 123 | raise e 124 | 125 | 126 | class Gpt4o_20240513LiteLLM(OpenaiLiteLLMModel): 127 | def __init__(self): 128 | super().__init__( 129 | "litellm-gpt-4o-2024-05-13", 0.000005, 0.000015, parallel_tool_call=True 130 | ) 131 | self.note = "Multimodal model. Up to Oct 2023." 
132 | 133 | 134 | class Gpt4_Turbo20240409LiteLLM(OpenaiLiteLLMModel): 135 | def __init__(self): 136 | super().__init__( 137 | "litellm-gpt-4-turbo-2024-04-09", 0.00001, 0.00003, parallel_tool_call=True 138 | ) 139 | self.note = "Turbo with vision. Up to Dec 2023." 140 | 141 | 142 | class Gpt4_0125PreviewLiteLLM(OpenaiLiteLLMModel): 143 | def __init__(self): 144 | super().__init__( 145 | "litellm-gpt-4-0125-preview", 0.00001, 0.00003, parallel_tool_call=True 146 | ) 147 | self.note = "Turbo. Up to Dec 2023." 148 | 149 | 150 | class Gpt4_1106PreviewLiteLLM(OpenaiLiteLLMModel): 151 | def __init__(self): 152 | super().__init__( 153 | "litellm-gpt-4-1106-preview", 0.00001, 0.00003, parallel_tool_call=True 154 | ) 155 | self.note = "Turbo. Up to Apr 2023." 156 | 157 | 158 | class Gpt35_Turbo0125LiteLLM(OpenaiLiteLLMModel): 159 | # cheapest gpt model 160 | def __init__(self): 161 | super().__init__( 162 | "litellm-gpt-3.5-turbo-0125", 0.0000005, 0.0000015, parallel_tool_call=True 163 | ) 164 | self.note = "Turbo. Up to Sep 2021." 165 | 166 | 167 | class Gpt35_Turbo1106LiteLLM(OpenaiLiteLLMModel): 168 | def __init__(self): 169 | super().__init__( 170 | "litellm-gpt-3.5-turbo-1106", 0.000001, 0.000002, parallel_tool_call=True 171 | ) 172 | self.note = "Turbo. Up to Sep 2021." 173 | 174 | 175 | class Gpt35_Turbo16k_0613LiteLLM(OpenaiLiteLLMModel): 176 | def __init__(self): 177 | super().__init__("litellm-gpt-3.5-turbo-16k-0613", 0.000003, 0.000004) 178 | self.note = "Turbo. Deprecated. Up to Sep 2021." 179 | 180 | 181 | class Gpt35_Turbo0613LiteLLM(OpenaiLiteLLMModel): 182 | def __init__(self): 183 | super().__init__("litellm-gpt-3.5-turbo-0613", 0.0000015, 0.000002) 184 | self.note = "Turbo. Deprecated. Only 4k window. Up to Sep 2021." 185 | 186 | 187 | class Gpt4_0613LiteLLM(OpenaiLiteLLMModel): 188 | def __init__(self): 189 | super().__init__("litellm-gpt-4-0613", 0.00003, 0.00006) 190 | self.note = "Not turbo. Up to Sep 2021." 191 | 192 | -------------------------------------------------------------------------------- /app/log.py: -------------------------------------------------------------------------------- 1 | import time 2 | from collections.abc import Callable 3 | from os import get_terminal_size 4 | 5 | from loguru import logger 6 | from rich.console import Console 7 | from rich.markdown import Markdown 8 | from rich.markup import escape 9 | from rich.panel import Panel 10 | import logging 11 | from pathlib import Path 12 | import threading 13 | 14 | logger_lock = threading.Lock() 15 | 16 | def terminal_width(): 17 | try: 18 | return get_terminal_size().columns 19 | except OSError: 20 | return 80 21 | 22 | 23 | WIDTH = min(120, terminal_width() - 10) 24 | 25 | console = Console() 26 | 27 | print_stdout = True 28 | 29 | 30 | def log_exception(exception): 31 | logger.exception(exception) 32 | 33 | 34 | def print_banner(msg: str) -> None: 35 | if not print_stdout: 36 | return 37 | 38 | banner = f" {msg} ".center(WIDTH, "=") 39 | console.print() 40 | console.print(banner, style="bold") 41 | console.print() 42 | 43 | 44 | def replace_html_tags(content: str): 45 | """ 46 | Helper method to process the content before printing to markdown. 
47 | """ 48 | replace_dict = { 49 | "": "[file]", 50 | "": "[class]", 51 | "": "[func]", 52 | "": "[method]", 53 | "": "[code]", 54 | "": "[original]", 55 | "": "[patched]", 56 | "": "[/file]", 57 | "": "[/class]", 58 | "": "[/func]", 59 | "": "[/method]", 60 | "": "[/code]", 61 | "": "[/original]", 62 | "": "[/patched]", 63 | } 64 | for key, value in replace_dict.items(): 65 | content = content.replace(key, value) 66 | return content 67 | 68 | 69 | def print_acr( 70 | msg: str, desc="", print_callback: Callable[[dict], None] | None = None 71 | ) -> None: 72 | if not print_stdout: 73 | return 74 | 75 | msg = replace_html_tags(msg) 76 | markdown = Markdown(msg) 77 | 78 | name = "SweEnvSetupAgent" 79 | if desc: 80 | title = f"{name} ({desc})" 81 | else: 82 | title = name 83 | 84 | panel = Panel( 85 | markdown, title=title, title_align="left", border_style="magenta", width=WIDTH 86 | ) 87 | console.print(panel) 88 | 89 | if print_callback: 90 | print_callback( 91 | {"title": f"{name} ({desc})", "message": msg, "category": "SweEnvSetupAgent"} 92 | ) 93 | 94 | 95 | def print_retrieval( 96 | msg: str, desc="", print_callback: Callable[[dict], None] | None = None 97 | ) -> None: 98 | if not print_stdout: 99 | return 100 | 101 | msg = replace_html_tags(msg) 102 | markdown = Markdown(msg) 103 | 104 | name = "Context Retrieval Agent" 105 | if desc: 106 | title = f"{name} ({desc})" 107 | else: 108 | title = name 109 | 110 | panel = Panel( 111 | markdown, title=title, title_align="left", border_style="blue", width=WIDTH 112 | ) 113 | console.print(panel) 114 | if print_callback: 115 | print_callback( 116 | { 117 | "title": f"{name} ({desc})", 118 | "message": msg, 119 | "category": "context_retrieval_agent", 120 | } 121 | ) 122 | 123 | 124 | def print_patch_generation( 125 | msg: str, desc="", print_callback: Callable[[dict], None] | None = None 126 | ) -> None: 127 | if not print_stdout: 128 | return 129 | 130 | msg = replace_html_tags(msg) 131 | markdown = Markdown(msg) 132 | 133 | name = "Patch Generation" 134 | if desc: 135 | title = f"{name} ({desc})" 136 | else: 137 | title = name 138 | 139 | panel = Panel( 140 | markdown, title=title, title_align="left", border_style="yellow", width=WIDTH 141 | ) 142 | console.print(panel) 143 | if print_callback: 144 | print_callback( 145 | { 146 | "title": f"{name} ({desc})", 147 | "message": msg, 148 | "category": "patch_generation", 149 | } 150 | ) 151 | 152 | 153 | def print_fix_loc_generation( 154 | msg: str, desc="", print_callback: Callable[[dict], None] | None = None 155 | ) -> None: 156 | if not print_stdout: 157 | return 158 | 159 | msg = replace_html_tags(msg) 160 | markdown = Markdown(msg) 161 | 162 | name = "Fix Location Generation" 163 | if desc: 164 | title = f"{name} ({desc})" 165 | else: 166 | title = name 167 | 168 | panel = Panel( 169 | markdown, title=title, title_align="left", border_style="green", width=WIDTH 170 | ) 171 | console.print(panel) 172 | if print_callback: 173 | print_callback( 174 | { 175 | "title": f"{name} ({desc})", 176 | "message": msg, 177 | "category": "fix_loc_generation", 178 | } 179 | ) 180 | 181 | 182 | def print_issue(content: str) -> None: 183 | if not print_stdout: 184 | return 185 | 186 | title = "Issue description" 187 | panel = Panel( 188 | escape(content), 189 | title=title, 190 | title_align="left", 191 | border_style="red", 192 | width=WIDTH, 193 | ) 194 | console.print(panel) 195 | 196 | 197 | def log_and_print(msg): 198 | logger.info(msg) 199 | if print_stdout: 200 | console.print(msg) 201 | 202 | 203 | 
def log_and_cprint(msg, **kwargs): 204 | logger.info(msg) 205 | if print_stdout: 206 | console.print(msg, **kwargs) 207 | 208 | 209 | def log_and_always_print(msg): 210 | """ 211 | A mode which always prints to stdout, no matter what. 212 | Useful when running multiple tasks and we just want to see the important information. 213 | """ 214 | logger.info(msg) 215 | # always include time for important messages 216 | t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 217 | console.print(f"\n[{t}] {msg}") 218 | 219 | 220 | def print_with_time(msg): 221 | """ 222 | Print a msg to console with timestamp. 223 | """ 224 | t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 225 | console.print(f"\n[{t}] {msg}") 226 | 227 | 228 | def setup_logger(instance_id: str, log_file: Path, mode="w"): 229 | """ 230 | This logger is used for logging the build process of images and containers. 231 | It writes logs to the log file. 232 | """ 233 | with logger_lock: 234 | log_file.parent.mkdir(parents=True, exist_ok=True) 235 | new_logger = logging.getLogger(f"{instance_id}.{log_file.name}") 236 | handler = logging.FileHandler(log_file, mode=mode) 237 | formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") 238 | handler.setFormatter(formatter) 239 | new_logger.addHandler(handler) 240 | new_logger.setLevel(logging.INFO) 241 | new_logger.propagate = False 242 | setattr(new_logger, "log_file", log_file) 243 | return new_logger 244 | 245 | def close_logger(new_logger): 246 | # To avoid too many open files 247 | with logger_lock: 248 | for handler in new_logger.handlers: 249 | handler.close() 250 | new_logger.removeHandler(handler) -------------------------------------------------------------------------------- /app/data_structures.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections.abc import Mapping 3 | from dataclasses import dataclass 4 | from pprint import pformat 5 | import base64 6 | import httpx 7 | 8 | from openai.types.chat import ChatCompletionMessageToolCall 9 | from openai.types.chat.chat_completion_message_tool_call import ( 10 | Function as OpenaiFunction, 11 | ) 12 | 13 | 14 | @dataclass 15 | class MethodId: 16 | class_name: str 17 | method_name: str 18 | 19 | def __str__(self): 20 | if self.class_name: 21 | return f"{self.class_name}.{self.method_name}" 22 | return self.method_name 23 | 24 | def __hash__(self): 25 | return hash((self.class_name, self.method_name)) 26 | 27 | 28 | class FunctionCallIntent: 29 | """An intent to call a tool function. 30 | 31 | This object is created from an OpenAI API response. 32 | """ 33 | 34 | def __init__( 35 | self, 36 | func_name: str, 37 | arguments: Mapping[str, str], 38 | openai_func: OpenaiFunction | None, 39 | ): 40 | self.func_name = func_name 41 | self.arg_values = dict() 42 | self.arg_values.update(arguments) 43 | # record the original openai function object, 44 | # which is used when we want to tell the model that it has 45 | # previously called this function/tool 46 | self.openai_func = openai_func or OpenaiFunction( 47 | arguments=json.dumps(arguments), name=func_name 48 | ) 49 | 50 | def __str__(self): 51 | return f"Call function `{self.func_name}` with arguments {self.arg_values}."
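A short sketch of how `FunctionCallIntent` can be used (the tool name and arguments are hypothetical; when no `OpenaiFunction` is supplied, the constructor synthesizes one from the name and the JSON-encoded arguments):

```python
from app.data_structures import FunctionCallIntent

intent = FunctionCallIntent("execute_bash", {"command": "ls"}, None)
print(intent)
# Call function `execute_bash` with arguments {'command': 'ls'}.
print(intent.to_dict_with_result(call_ok=True, result="app\n", agent_id="env"))
```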
52 | 53 | def to_dict(self): 54 | return {"func_name": self.func_name, "arguments": self.arg_values} 55 | 56 | def to_dict_with_result(self, call_ok: bool, result: str | None = None, agent_id: str | None = None): 57 | return { 58 | "func_name": self.func_name, 59 | "arguments": self.arg_values, 60 | "call_ok": call_ok, 61 | "result": result, 62 | "agent_id": agent_id 63 | } 64 | 65 | 66 | class MessageThread: 67 | """ 68 | Represents a thread of conversation with the model. 69 | Abstracted into a class so that we can dump this to a file at any point. 70 | """ 71 | 72 | def __init__(self, messages=None): 73 | self.messages: list[dict] = messages or [] 74 | 75 | def add(self, role: str, message: str): 76 | """ 77 | Add a new message to the thread. 78 | Args: 79 | role (str): The role of the new message. 80 | message (str): The content of the new message. 81 | """ 82 | self.messages.append({"role": role, "content": message}) 83 | 84 | def add_system(self, message: str): 85 | self.messages.append({"role": "system", "content": message}) 86 | 87 | def add_user(self, message: str): 88 | self.messages.append({"role": "user", "content": message}) 89 | 90 | def add_image(self, messages: list[str]): 91 | def encode_image(image_path): 92 | with open(image_path, "rb") as image_file: 93 | return base64.b64encode(image_file.read()).decode("utf-8") 94 | 95 | for idx, message in enumerate(messages): 96 | if 'imgur' in message: 97 | if 'Ow4tDFX' in message: 98 | image1_data = encode_image('temp.jpeg') 99 | else: 100 | image1_data = encode_image('temp1.jpeg') 101 | self.messages.append({ 102 | "role": "user", 103 | "content": [ 104 | { 105 | "type": "text", 106 | "text": "<image>:\n" 107 | }, 108 | { 109 | "type": "image_url", 110 | "image_url": {"url": f"data:image/jpeg;base64,{image1_data}"}, 111 | } 112 | ] 113 | }) 114 | else: 115 | message = message.replace('snipboard', 'i.snipboard') 116 | self.messages.append({"role": "user", "content": [ 117 | {'type': 'text', 'text': f'<image_{idx}>:\n'}, 118 | { 119 | "type": "image_url", 120 | "image_url": { 121 | "url": message 122 | }, 123 | }, 124 | 125 | ]}) 126 | 127 | def add_tool(self, message: str, tool_call_id: str): 128 | m = {"role": "tool", "content": message, "tool_call_id": tool_call_id} 129 | self.messages.append(m) 130 | 131 | def add_model( 132 | self, message: str | None, tools: list[ChatCompletionMessageToolCall] 133 | ): 134 | # let's serialize tools into json first 135 | json_tools = [] 136 | for tool in tools: 137 | this_tool_dict = {} 138 | this_tool_dict["id"] = tool.id 139 | this_tool_dict["type"] = tool.type 140 | # now serialize function as well 141 | func_obj: OpenaiFunction = tool.function 142 | func_args: str = func_obj.arguments 143 | func_name: str = func_obj.name 144 | this_tool_dict["function"] = {"name": func_name, "arguments": func_args} 145 | json_tools.append(this_tool_dict) 146 | 147 | if json_tools == []: 148 | # there are no tool calls from the model last time, 149 | # the best we could do is to return the generated text 150 | self.messages.append({"role": "assistant", "content": message}) 151 | else: 152 | self.messages.append( 153 | {"role": "assistant", "content": None, "tool_calls": json_tools} 154 | ) 155 | 156 | def to_msg(self) -> list[dict]: 157 | """ 158 | Convert to the format to be consumed by the model. 159 | Returns: 160 | List[Dict]: The message thread. 
161 | """ 162 | return self.messages 163 | 164 | def __str__(self): 165 | return pformat(self.messages, width=160, sort_dicts=False) 166 | 167 | def save_to_file(self, file_path: str): 168 | """ 169 | Save the current state of the message thread to a file. 170 | Args: 171 | file_path (str): The path to the file. 172 | """ 173 | with open(file_path, "w") as f: 174 | json.dump(self.messages, f, indent=4) 175 | 176 | def get_round_number(self) -> int: 177 | """ 178 | From the current message history, decide how many rounds have been completed. 179 | """ 180 | completed_rounds = 0 181 | for message in self.messages: 182 | if message["role"] == "assistant": 183 | completed_rounds += 1 184 | return completed_rounds 185 | 186 | @classmethod 187 | def load_from_file(cls, file_path: str): 188 | """ 189 | Load the message thread from a file. 190 | Args: 191 | file_path (str): The path to the file. 192 | Returns: 193 | MessageThread: The message thread. 194 | """ 195 | with open(file_path) as f: 196 | messages = json.load(f) 197 | return cls(messages) 198 | -------------------------------------------------------------------------------- /data_collection/versioning/get_versions_by_git.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import shutil 4 | import subprocess 5 | import re 6 | import json 7 | import argparse 8 | from contextlib import contextmanager 9 | from typing import List, Dict 10 | from concurrent.futures import ProcessPoolExecutor, as_completed 11 | import glob 12 | @contextmanager 13 | def cd(newdir): 14 | prevdir = os.getcwd() 15 | os.chdir(os.path.expanduser(newdir)) 16 | try: 17 | yield 18 | finally: 19 | os.chdir(prevdir) 20 | 21 | 22 | def run_command(cmd: List[str], **kwargs) -> subprocess.CompletedProcess: 23 | try: 24 | return subprocess.run(cmd, check=True, **kwargs) 25 | except subprocess.CalledProcessError as e: 26 | print(f"Error running command: {cmd}, {e}") 27 | raise 28 | 29 | 30 | def get_version_by_git(cloned_dir: str) -> str: 31 | if not os.path.isdir(cloned_dir): 32 | raise NotADirectoryError(f"Invalid directory: {cloned_dir}") 33 | with cd(cloned_dir): 34 | result = run_command(["git", "describe", "--tags"], capture_output=True, text=True) 35 | version = result.stdout.strip() 36 | print(f"✔️ Current version: {version}") 37 | match = re.search(r"(\d+\.\d+)(?:\.\d+)?", version) 38 | if match: 39 | return match.group(1) 40 | raise RuntimeError(f"Unrecognized version: {version}") 41 | 42 | 43 | def get_instances(instance_path: str) -> List[Dict]: 44 | if instance_path.endswith((".jsonl", ".jsonl.all")): 45 | with open(instance_path, encoding="utf-8") as f: 46 | return [json.loads(line) for line in f] 47 | with open(instance_path, encoding="utf-8") as f: 48 | return json.load(f) 49 | 50 | 51 | def prepare_repo_cache(tasks: List[Dict], cache_dir: str) -> Dict[str, str]: 52 | os.makedirs(cache_dir, exist_ok=True) 53 | repo_cache = {} 54 | for task in tasks: 55 | repo = task["repo"] 56 | if repo in repo_cache: 57 | continue 58 | repo_url = f"https://github.com/{repo}.git" 59 | local_path = os.path.join(cache_dir, repo.replace("/", "__")) 60 | try: 61 | run_command(["git", "clone", repo_url, local_path], capture_output=True) 62 | repo_cache[repo] = local_path 63 | print(f"✅ Cached repo: {repo}") 64 | except Exception as e: 65 | print(f"❌ Failed to clone {repo}: {e}") 66 | return repo_cache 67 | 68 | 69 | def process_repo_task(task: Dict, testbed: str, repo_cache: Dict[str, str]) -> Dict | None: 
70 | instance_id = task["instance_id"] 71 | repo = task["repo"] 72 | base_commit = task["base_commit"] 73 | repo_dir = os.path.join(testbed, instance_id) 74 | os.makedirs(repo_dir, exist_ok=True) 75 | try: 76 | cached_repo = repo_cache.get(repo) 77 | if not cached_repo or not os.path.exists(cached_repo): 78 | raise RuntimeError(f"Missing cached repo for {repo}") 79 | shutil.copytree(cached_repo, repo_dir, dirs_exist_ok=True) 80 | with cd(repo_dir): 81 | run_command(["git", "checkout", base_commit], capture_output=True) 82 | version = get_version_by_git(repo_dir) 83 | result = task.copy() 84 | result["version"] = version 85 | return result 86 | except Exception as e: 87 | print(f"❌ Failed: {instance_id} | {e}") 88 | return None 89 | finally: 90 | shutil.rmtree(repo_dir, ignore_errors=True) 91 | 92 | 93 | def process_repos(tasks: List[Dict], testbed: str, repo_cache: Dict[str, str], max_workers: int = 4) -> List[Dict]: 94 | os.makedirs(testbed, exist_ok=True) 95 | results = [] 96 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 97 | futures = [executor.submit(process_repo_task, t, testbed, repo_cache) for t in tasks] 98 | for future in as_completed(futures): 99 | res = future.result() 100 | if res: 101 | results.append(res) 102 | return results 103 | 104 | 105 | def save_results(results: List[Dict], output_path: str): 106 | os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) 107 | if output_path.endswith((".jsonl", ".jsonl.all")): 108 | with open(output_path, "w", encoding="utf-8") as f: 109 | for r in results: 110 | f.write(json.dumps(r, ensure_ascii=False) + "\n") 111 | else: 112 | with open(output_path, "w", encoding="utf-8") as f: 113 | json.dump(results, f, indent=2, ensure_ascii=False) 114 | 115 | 116 | def generate_output_path(instance_path: str, suffix: str) -> str: 117 | base, ext = os.path.splitext(instance_path) 118 | ext = '.json'  # always emit .json, regardless of the input extension 119 | return f"{base}{suffix}{ext}" 120 | 121 | def find_github_file(output_dir: str) -> str | None: 122 | """ 123 | Locate the *_versions_by_github output file produced by the previous stage. 124 | """ 125 | # match any _versions_by_github.json or .jsonl file in the directory 126 | for ext in ('json', 'jsonl'): 127 | pattern = os.path.join(output_dir, f"*_versions_by_github.{ext}") 128 | matches = glob.glob(pattern) 129 | if matches: 130 | return matches[0] 131 | return None 132 | 133 | 134 | def main(): 135 | parser = argparse.ArgumentParser() 136 | parser.add_argument("--instance_path", "-i", type=str, required=True, 137 | help="Path to input task file (.json or .jsonl)") 138 | parser.add_argument("--testbed", "-t", type=str, default="testbed", 139 | help="Temp working directory for cloning repos") 140 | parser.add_argument("--max_workers", "-w", type=int, default=10, 141 | help="Number of parallel workers") 142 | parser.add_argument("--output_dir", "-d", type=str, default=None, 143 | help="Directory to save output (keeps original filename + suffix)") 144 | parser.add_argument("--last_stage_output_dir", "-l", type=str, default=None, 145 | help="Directory containing the previous stage's *_versions_by_github output, used to skip already-processed tasks") 146 | args = parser.parse_args() 147 | 148 | 149 | try: 150 | tasks = get_instances(args.instance_path) 151 | except Exception as e: 152 | print(f"❌ Error reading instance file: {e}") 153 | return 154 | 155 | 156 | github_file = find_github_file(args.last_stage_output_dir) if args.last_stage_output_dir else None 157 | 158 | if github_file: 159 | try: 160 | processed = get_instances(github_file) 161 | seen = {item.get('instance_id') for item in processed if 'instance_id' in item} 162 | before = len(tasks) 163 | tasks = [t for t in tasks if t.get('instance_id') 
not in seen] 164 | print(f"ℹ️ Skipped {before - len(tasks)} tasks already in {os.path.basename(github_file)}") 165 | except Exception as e: 166 | print(f"⚠️ Failed to read GitHub versions file: {e}") 167 | 168 | for t in tasks: 169 | if not {"repo", "base_commit", "instance_id"}.issubset(t): 170 | print(f"Invalid task format: {t}") 171 | return 172 | 173 | 174 | cache_dir = os.path.join(args.testbed, "_cache") 175 | repo_cache = prepare_repo_cache(tasks, cache_dir) 176 | results = process_repos(tasks, args.testbed, repo_cache, args.max_workers) 177 | 178 | tmp = generate_output_path(args.instance_path, "_versions_by_git") 179 | if args.output_dir: 180 | output_path = os.path.join(args.output_dir, os.path.basename(tmp)) 181 | else: 182 | output_path = tmp 183 | 184 | save_results(results, output_path) 185 | print(f"\n✅ {len(results)} results saved to {output_path}") 186 | 187 | if __name__ == "__main__": 188 | main() 189 | -------------------------------------------------------------------------------- /app/model/common.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import threading 4 | from abc import ABC, abstractmethod 5 | from typing import Literal 6 | 7 | import litellm 8 | from litellm import cost_per_token 9 | from litellm.utils import Choices, Message, ModelResponse 10 | from openai import BadRequestError 11 | from tenacity import retry, stop_after_attempt, wait_random_exponential 12 | 13 | from app.log import log_and_cprint, log_and_print 14 | 15 | # Variables for each process. Since models are singleton objects, their references are copied 16 | # to each process, but they all point to the same objects. For safe updating costs per process, 17 | # we define the accumulators here. 18 | 19 | thread_cost = threading.local() 20 | thread_cost.process_cost = 0.0 21 | thread_cost.process_input_tokens = 0 22 | thread_cost.process_output_tokens = 0 23 | 24 | 25 | class Model(ABC): 26 | def __init__( 27 | self, 28 | name: str, 29 | cost_per_input: float, 30 | cost_per_output: float, 31 | parallel_tool_call: bool = False, 32 | ): 33 | self.name: str = name 34 | # cost stats - zero for local models 35 | self.cost_per_input: float = cost_per_input 36 | self.cost_per_output: float = cost_per_output 37 | # whether the model supports parallel tool call 38 | self.parallel_tool_call: bool = parallel_tool_call 39 | 40 | @abstractmethod 41 | def check_api_key(self) -> str: 42 | raise NotImplementedError("abstract base class") 43 | 44 | @abstractmethod 45 | def setup(self) -> None: 46 | raise NotImplementedError("abstract base class") 47 | 48 | @abstractmethod 49 | def call(self, messages: list[dict], **kwargs): 50 | raise NotImplementedError("abstract base class") 51 | 52 | def calc_cost(self, input_tokens: int, output_tokens: int) -> float: 53 | """ 54 | Calculates the cost of a request based on the number of input/output tokens. 
55 | """ 56 | input_cost = self.cost_per_input * input_tokens 57 | output_cost = self.cost_per_output * output_tokens 58 | cost = input_cost + output_cost 59 | log_and_cprint( 60 | f"Model API request cost info: " 61 | f"input_tokens={input_tokens}, output_tokens={output_tokens}, cost={cost:.6f}", 62 | style="yellow", 63 | ) 64 | return cost 65 | 66 | def get_overall_exec_stats(self): 67 | return { 68 | "model": self.name, 69 | "input_cost_per_token": self.cost_per_input, 70 | "output_cost_per_token": self.cost_per_output, 71 | "total_input_tokens": thread_cost.process_input_tokens, 72 | "total_output_tokens": thread_cost.process_output_tokens, 73 | "total_tokens": thread_cost.process_input_tokens 74 | + thread_cost.process_output_tokens, 75 | "total_cost": thread_cost.process_cost, 76 | } 77 | 78 | 79 | class LiteLLMGeneric(Model): 80 | """ 81 | Base class for creating instances of LiteLLM-supported models. 82 | """ 83 | 84 | _instances = {} 85 | 86 | def __new__(cls, model_name: str, cost_per_input: float, cost_per_output: float): 87 | if model_name not in cls._instances: 88 | cls._instances[model_name] = super().__new__(cls) 89 | cls._instances[model_name]._initialized = False 90 | return cls._instances[model_name] 91 | 92 | def __init__( 93 | self, 94 | name: str, 95 | cost_per_input: float, 96 | cost_per_output: float, 97 | parallel_tool_call: bool = False, 98 | ): 99 | if self._initialized: 100 | return 101 | super().__init__(name, cost_per_input, cost_per_output, parallel_tool_call) 102 | self._initialized = True 103 | 104 | def setup(self) -> None: 105 | """ 106 | Check API key. 107 | """ 108 | pass 109 | 110 | def check_api_key(self) -> str: 111 | return "" 112 | 113 | def extract_resp_content(self, chat_message: Message) -> str: 114 | """ 115 | Given a chat completion message, extract the content from it. 
116 | """ 117 | content = chat_message.content 118 | if content is None: 119 | return "" 120 | else: 121 | return content 122 | 123 | @retry(wait=wait_random_exponential(min=30, max=600), stop=stop_after_attempt(3)) 124 | def call( 125 | self, 126 | messages: list[dict], 127 | top_p=1, 128 | tools=None, 129 | response_format: Literal["text", "json_object"] = "text", 130 | **kwargs, 131 | ): 132 | # FIXME: ignore tools field since we don't use tools now 133 | try: 134 | prefill_content = "{" 135 | if response_format == "json_object": # prefill 136 | messages.append({"role": "assistant", "content": prefill_content}) 137 | 138 | response = litellm.completion( 139 | model=self.name, 140 | messages=messages, 141 | temperature=MODEL_TEMP, 142 | max_tokens=os.getenv("ACR_TOKEN_LIMIT", 1024), 143 | response_format=( 144 | {"type": response_format} if "gpt" in self.name else None 145 | ), 146 | top_p=top_p, 147 | stream=False, 148 | ) 149 | assert isinstance(response, ModelResponse) 150 | resp_usage = response.usage 151 | assert resp_usage is not None 152 | input_tokens = int(resp_usage.prompt_tokens) 153 | output_tokens = int(resp_usage.completion_tokens) 154 | cost = self.calc_cost(input_tokens, output_tokens) 155 | 156 | thread_cost.process_cost += cost 157 | thread_cost.process_input_tokens += input_tokens 158 | thread_cost.process_output_tokens += output_tokens 159 | 160 | first_resp_choice = response.choices[0] 161 | assert isinstance(first_resp_choice, Choices) 162 | resp_msg: Message = first_resp_choice.message 163 | content = self.extract_resp_content(resp_msg) 164 | if response_format == "json_object": 165 | # prepend the prefilled character 166 | if not content.startswith(prefill_content): 167 | content = prefill_content + content 168 | 169 | return content, cost, input_tokens, output_tokens 170 | 171 | except BadRequestError as e: 172 | if e.code == "context_length_exceeded": 173 | log_and_print("Context length exceeded") 174 | raise e 175 | 176 | 177 | MODEL_HUB = {} 178 | 179 | 180 | def register_model(model: Model): 181 | global MODEL_HUB 182 | MODEL_HUB[model.name] = model 183 | 184 | 185 | def get_all_model_names(): 186 | return list(MODEL_HUB.keys()) 187 | 188 | 189 | # To be set at runtime - the selected model for a run 190 | SELECTED_MODEL: Model 191 | 192 | 193 | def set_model(model_name: str): 194 | global SELECTED_MODEL 195 | if model_name not in MODEL_HUB and not model_name.startswith("litellm-generic-"): 196 | print(f"Invalid model name: {model_name}") 197 | sys.exit(1) 198 | if model_name.startswith("litellm-generic-"): 199 | real_model_name = model_name.removeprefix("litellm-generic-") 200 | prompt_tokens = 5 201 | completion_tokens = 10 202 | prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = ( 203 | cost_per_token( 204 | model=real_model_name, 205 | prompt_tokens=prompt_tokens, 206 | completion_tokens=completion_tokens, 207 | ) 208 | ) 209 | # litellm.set_verbose = True 210 | SELECTED_MODEL = LiteLLMGeneric( 211 | real_model_name, 212 | prompt_tokens_cost_usd_dollar, 213 | completion_tokens_cost_usd_dollar, 214 | ) 215 | else: 216 | SELECTED_MODEL = MODEL_HUB[model_name] 217 | SELECTED_MODEL.setup() 218 | 219 | 220 | # the model temperature to use 221 | # For OpenAI models: this value should be from 0 to 2 222 | MODEL_TEMP: float = 0.0 223 | -------------------------------------------------------------------------------- /app/agents/train_env_gen_agent/tools/search.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python3 2 | 3 | """ 4 | Description: Search for a term in either a directory or a single file. 5 | 6 | Behavior: 7 | * If `--path` points to a directory (default is `.`), we recursively search all non-hidden files and directories. 8 | * If `--path` points to a file, we run `grep -n` on that file to find line numbers containing the search term. 9 | * If more than 100 files match (directory search scenario), the tool will stop listing and inform you to narrow your search. 10 | * If no files are found that match your search term, the tool will inform you of that as well. 11 | 12 | **Parameters:** 13 | 1. **search_term** (`string`, required): The term to search for in files. 14 | 2. **path** (`string`, optional): The file or directory in which to search. If not provided, defaults to the current directory (i.e., `.`). 15 | """ 16 | 17 | import argparse 18 | import os 19 | import sys 20 | import subprocess 21 | 22 | def search_in_directory(search_term: str, directory: str = ".", python_only: bool = False): 23 | """ 24 | Searches for `search_term` in all non-hidden files under `directory` 25 | (or only in .py files if `python_only=True`), excluding hidden directories. 26 | Prints how many matches were found per file. 27 | """ 28 | directory = os.path.realpath(directory) 29 | 30 | if not os.path.isdir(directory): 31 | print(f"Directory '{directory}' not found or not a directory.") 32 | sys.exit(1) 33 | 34 | matches = {} 35 | num_files_matched = 0 36 | 37 | for root, dirs, files in os.walk(directory): 38 | # Exclude hidden directories 39 | dirs[:] = [d for d in dirs if not d.startswith(".")] 40 | for file in files: 41 | # Skip hidden files 42 | if file.startswith("."): 43 | continue 44 | 45 | # If --python_only is set, only search .py files 46 | if python_only and not file.endswith(".py"): 47 | continue 48 | 49 | filepath = os.path.join(root, file) 50 | try: 51 | with open(filepath, "r", errors="ignore") as f: 52 | file_matches = 0 53 | for line_num, line in enumerate(f, 1): 54 | if search_term in line: 55 | file_matches += 1 56 | if file_matches > 0: 57 | matches[filepath] = file_matches 58 | num_files_matched += 1 59 | except (UnicodeDecodeError, PermissionError): 60 | # Skip files that can't be read 61 | continue 62 | 63 | if not matches: 64 | print(f'No matches found for "{search_term}" in {directory}') 65 | sys.exit(0) 66 | 67 | # Summarize 68 | num_matches = sum(matches.values()) 69 | if num_files_matched > 100: 70 | print( 71 | f'More than {num_files_matched} files matched for "{search_term}" in {directory}. ' 72 | "Please narrow your search." 73 | ) 74 | sys.exit(0) 75 | 76 | print(f'Found {num_matches} matches for "{search_term}" in {directory}:') 77 | 78 | # Print matched files 79 | for filepath, count in matches.items(): 80 | relative_path = os.path.relpath(filepath, start=os.getcwd()) 81 | if not relative_path.startswith("./"): 82 | relative_path = "./" + relative_path 83 | print(f"{relative_path} ({count} matches)") 84 | 85 | print(f'End of matches for "{search_term}" in {directory}') 86 | 87 | def search_in_directory_old(search_term: str, directory: str = ".", python_only=False): 88 | """ 89 | Searches for `search_term` in all non-hidden files under `directory`, 90 | excluding hidden directories. Prints how many matches were found per file. 
91 | """ 92 | directory = os.path.realpath(directory) 93 | 94 | if not os.path.isdir(directory): 95 | print(f"Directory '{directory}' not found or not a directory.") 96 | sys.exit(1) 97 | 98 | matches = {} 99 | num_files_matched = 0 100 | 101 | for root, dirs, files in os.walk(directory): 102 | # Exclude hidden directories 103 | dirs[:] = [d for d in dirs if not d.startswith(".")] 104 | for file in files: 105 | # Skip hidden files 106 | if file.startswith("."): 107 | continue 108 | filepath = os.path.join(root, file) 109 | try: 110 | with open(filepath, "r", errors="ignore") as f: 111 | file_matches = 0 112 | for line_num, line in enumerate(f, 1): 113 | if search_term in line: 114 | file_matches += 1 115 | if file_matches > 0: 116 | matches[filepath] = file_matches 117 | num_files_matched += 1 118 | except (UnicodeDecodeError, PermissionError): 119 | # Skip files that can't be read 120 | continue 121 | 122 | if not matches: 123 | print(f'No matches found for "{search_term}" in {directory}') 124 | sys.exit(0) 125 | 126 | # Summarize 127 | num_matches = sum(matches.values()) 128 | if num_files_matched > 100: 129 | print( 130 | f'More than {num_files_matched} files matched for "{search_term}" in {directory}. ' 131 | "Please narrow your search." 132 | ) 133 | sys.exit(0) 134 | 135 | print(f'Found {num_matches} matches for "{search_term}" in {directory}:') 136 | 137 | # Print matched files 138 | for filepath, count in matches.items(): 139 | # Convert absolute path to relative path 140 | relative_path = os.path.relpath(filepath, start=os.getcwd()) 141 | if not relative_path.startswith("./"): 142 | relative_path = "./" + relative_path 143 | print(f"{relative_path} ({count} matches)") 144 | 145 | print(f'End of matches for "{search_term}" in {directory}') 146 | 147 | 148 | def search_in_file(search_term: str, filepath: str): 149 | """ 150 | Uses grep -n to search for `search_term` in a single file. 151 | Prints lines (with line numbers) where matches occur. 
152 | """ 153 | filepath = os.path.realpath(filepath) 154 | 155 | if not os.path.isfile(filepath): 156 | print(f"File '{filepath}' not found or is not a file.") 157 | sys.exit(1) 158 | 159 | try: 160 | # Try modern parameters if Python 3.7+ (capture_output, text) 161 | result = subprocess.run( 162 | ["grep", "-n", search_term, filepath], 163 | capture_output=True, 164 | text=True 165 | ) 166 | except TypeError: 167 | # Fallback for Python 3.5/3.6 168 | result = subprocess.run( 169 | ["grep", "-n", search_term, filepath], 170 | stdout=subprocess.PIPE, 171 | stderr=subprocess.PIPE, 172 | universal_newlines=True 173 | ) 174 | 175 | if result.returncode != 0: 176 | # grep exit code = 1 means "no matches", other non-zero exit code is a real error 177 | if result.returncode == 1: 178 | print(f'No matches found for "{search_term}" in {filepath}') 179 | sys.exit(0) 180 | else: 181 | # Something else went wrong 182 | print(f"Error executing grep:\n{result.stderr}") 183 | sys.exit(result.returncode) 184 | 185 | # Print the grep output directly 186 | print(f'Matches for "{search_term}" in {filepath}:') 187 | # Depending on the fallback, the output is in result.stdout 188 | print(result.stdout.strip()) 189 | # try: 190 | # # Run grep -n 191 | # result = subprocess.run( 192 | # ["grep", "-n", search_term, filepath], capture_output=True, text=True 193 | # ) 194 | # if result.returncode != 0: 195 | # # grep exit code = 1 means no matches 196 | # print(f'No matches found for "{search_term}" in {filepath}') 197 | # sys.exit(0) 198 | # # Print grep output directly 199 | # print(f'Matches for "{search_term}" in {filepath}:') 200 | # print(result.stdout.strip()) 201 | # except FileNotFoundError: 202 | # print( 203 | # "`grep` is not available on this system. Please install or use another method." 204 | # ) 205 | # sys.exit(1) 206 | 207 | 208 | def main(): 209 | parser = argparse.ArgumentParser( 210 | description="search tool: run subcommands such as `search` for files or directories." 211 | ) 212 | parser.add_argument( 213 | "--search_term", help="Term to search for in files.", required=True 214 | ) 215 | parser.add_argument( 216 | "--path", 217 | help="File or directory to search in (defaults to current dir).", 218 | default=".", 219 | ) 220 | # NEW ARGUMENT: 221 | parser.add_argument( 222 | "--python_only", 223 | default=True, 224 | help="If set, only search for matches in .py files when searching a directory." 
225 | ) 226 | 227 | args = parser.parse_args() 228 | # Check if path is a file or a directory 229 | if os.path.isfile(args.path): 230 | search_in_file(args.search_term, args.path) 231 | else: 232 | search_in_directory(args.search_term, args.path, python_only=args.python_only) 233 | 234 | 235 | if __name__ == "__main__": 236 | main() -------------------------------------------------------------------------------- /data_collection/collect/build_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import json 5 | import logging 6 | import os 7 | from typing import Optional 8 | from datetime import datetime 9 | from utils import Repo, extract_patches, extract_problem_statement_and_hints, extract_problem_statement_and_hints_with_official_github_api 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 13 | ) 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def create_instance(repo: Repo, pull: dict, output_path: str, mode: str ='swebench') -> dict: 18 | """ 19 | Create a single task instance from a pull request, where task instance is: 20 | 21 | { 22 | repo (str): owner/repo this task instance is from, 23 | pull_number (int): number of PR this task instance is from, 24 | base_commit (str): SHA of the base commit PR is based on, 25 | patch (str): reference solution as .patch (apply to base commit), 26 | test_patch (str): test suite as .patch (apply to base commit), 27 | } 28 | """ 29 | # try: 30 | patch, test_patch, request_success = extract_patches(pull, repo) 31 | # except Exception as e: 32 | # logger.info(e) 33 | # patch = "" 34 | # test_patch = "" 35 | instance_id = (repo.repo.full_name + "-" + str(pull["number"])).replace("/", "__") 36 | successful_path = os.path.join(os.path.dirname(output_path), "successful_requests.txt") 37 | if request_success: 38 | with open(successful_path, "a") as f: 39 | f.write(instance_id + "\n") 40 | 41 | if mode =='swebench': 42 | 43 | problem_statement, hints = extract_problem_statement_and_hints(pull, repo) 44 | else: 45 | problem_statement, hints = extract_problem_statement_and_hints_with_official_github_api(pull, repo) 46 | return { 47 | "repo": repo.repo.full_name, 48 | "pull_number": pull["number"], 49 | "instance_id": instance_id, 50 | "issue_numbers": pull["resolved_issues"], 51 | "base_commit": pull["base"]["sha"], 52 | "patch": patch, 53 | "test_patch": test_patch, 54 | "problem_statement": problem_statement, 55 | "hints_text": hints, 56 | "created_at": pull["created_at"], 57 | } 58 | 59 | 60 | def is_valid_pull(pull: dict) -> bool: 61 | """ 62 | Check whether PR has an associated issue and is merged 63 | 64 | Args: 65 | pull (dict): pull request object 66 | Returns: 67 | bool: whether PR is valid 68 | """ 69 | if pull["merged_at"] is None: 70 | # logger.info(f" not merged") 71 | return False 72 | if "resolved_issues" not in pull or len(pull["resolved_issues"]) < 1: 73 | # logger.info(f"no resolved_issues") 74 | return False 75 | 76 | return True 77 | 78 | 79 | def is_valid_instance(instance: dict) -> bool: 80 | """ 81 | Check whether task instance has all required fields for task instance creation 82 | 83 | Args: 84 | instance (dict): task instance object 85 | Returns: 86 | bool: whether task instance is valid 87 | """ 88 | if instance["patch"] is None or instance["patch"] == "": 89 | logger.info(f"Instance {instance['pull_number']} no patch") 90 | return False 91 | if 
instance["problem_statement"] is None or instance["problem_statement"] == "": 92 | logger.info(f"Instance {instance['pull_number']} no problem statement") 93 | return False 94 | return True 95 | 96 | 97 | def has_test_patch(instance: dict) -> bool: 98 | """ 99 | Check whether task instance has a test suite 100 | 101 | Args: 102 | instance (dict): task instance object 103 | Returns: 104 | bool: whether task instance has a test suite 105 | """ 106 | if instance["test_patch"] is None or instance["test_patch"].strip() == "": 107 | logger.info(f"Instance {instance['pull_number']} no test patch") 108 | return False 109 | return True 110 | 111 | def main(pr_file: str, output: str, token: Optional[str] = None,mode: Optional[str] = 'swebench',language: Optional[str] = 'python', cutoff_date: Optional[str] = None): 112 | """ 113 | Main thread for creating task instances from pull requests 114 | 115 | Args: 116 | pr_file (str): path to pull request JSONL file 117 | output (str): output file name 118 | token (str): GitHub token 119 | """ 120 | logger.info(f'Language: {language}') 121 | logger.info(f'mode: {mode}') 122 | cutoff_date = datetime.strptime(cutoff_date, "%Y-%m-%dT%H:%M:%SZ") 123 | if token is None: 124 | # Get GitHub token from environment variable if not provided 125 | token = os.environ["GITHUB_TOKEN"] 126 | 127 | def load_repo(repo_name,language): 128 | # Return repo object for a given repo name 129 | owner, repo = repo_name.split("/") 130 | return Repo(owner, repo, token=token,language=language) 131 | 132 | repos = dict() 133 | completed = 0 134 | with_tests = 0 135 | total_instances = 0 136 | all_output = output + ".all" 137 | seen_prs = set() 138 | 139 | successful_path = os.path.join(os.path.dirname(output), "successful_requests.txt") 140 | 141 | if not os.path.exists(successful_path): 142 | with open(successful_path, "w") as f: 143 | pass 144 | 145 | successful_instances = set() 146 | with open(successful_path, "r") as f: 147 | for line in f: 148 | successful_instances.add(line.strip()) 149 | 150 | # Continue where we left off if output file already exists 151 | if os.path.exists(all_output): 152 | with open(all_output) as f: 153 | for line in f: 154 | pr = json.loads(line) 155 | if "instance_id" not in pr: 156 | pr["instance_id"] = ( 157 | pr["repo"] + "-" + str(pr["pull_number"]) 158 | ).replace("/", "__") 159 | instance_id = pr["instance_id"] 160 | seen_prs.add(instance_id) 161 | if datetime.strptime(pr["created_at"], "%Y-%m-%dT%H:%M:%SZ") >= cutoff_date: 162 | logger.info(f"Instance {instance_id} created_at {pr['created_at']} exceeds cutoff_date {cutoff_date}") 163 | continue 164 | if is_valid_instance(pr): 165 | completed += 1 166 | if has_test_patch(pr): 167 | with_tests += 1 168 | logger.info(f"{len(seen_prs)} instance_ids previously recorded") 169 | original_output_path = output 170 | # Write to .all file for all PRs 171 | write_mode_all = "w" if not os.path.exists(all_output) else "a" 172 | with open(all_output, write_mode_all) as all_output: 173 | # Write to output file for PRs with test suites 174 | write_mode = "w" if not os.path.exists(output) else "a" 175 | with open(output, write_mode) as output: 176 | for ix, line in enumerate(open(pr_file)): 177 | total_instances += 1 178 | pull = json.loads(line) 179 | if ix % 100 == 0: 180 | logger.info( 181 | f"[{pull['base']['repo']['full_name']}] ( Up to {ix} checked ) {completed} valid, {with_tests} with tests." 
182 |                     )
183 |                 # Construct instance fields
184 |                 instance_id = (
185 |                     pull["base"]["repo"]["full_name"] + "-" + str(pull["number"])
186 |                 )
187 |                 instance_id = instance_id.replace("/", "__")
188 | 
189 |                 if instance_id in seen_prs:
190 |                     seen_prs -= {instance_id}
191 |                     continue
192 | 
193 |                 if instance_id in successful_instances:
194 |                     continue
195 | 
196 |                 if not is_valid_pull(pull):
197 |                     # Throw out invalid PRs
198 |                     continue
199 |                 # Create task instance
200 |                 repo_name = pull["base"]["repo"]["full_name"]
201 |                 if repo_name not in repos:
202 |                     repos[repo_name] = load_repo(repo_name, language)
203 |                 repo = repos[repo_name]
204 |                 instance = create_instance(repo, pull, original_output_path, mode)
205 |                 if is_valid_instance(instance):
206 |                     # If valid, write to .all output file
207 |                     print(
208 |                         json.dumps(instance), end="\n", flush=True, file=all_output
209 |                     )  # write all instances to a separate file
210 |                     completed += 1
211 |                     if has_test_patch(instance):
212 |                         # If has test suite, write to output file
213 |                         print(json.dumps(instance), end="\n", flush=True, file=output)
214 |                         with_tests += 1
215 |     logger.info(
216 |         f"Total instances: {total_instances}, completed: {completed}, with tests: {with_tests}"
217 |     )
218 |     logger.info(f"Didn't see {len(seen_prs)} instances previously recorded")
219 |     logger.info("\n".join(sorted(seen_prs)))
220 | 
221 | 
222 | if __name__ == "__main__":
223 |     parser = argparse.ArgumentParser()
224 |     parser.add_argument("pr_file", type=str, help="Path to pull request JSONL file")
225 |     parser.add_argument("output", type=str, help="Output file name")
226 |     parser.add_argument("--token", type=str, help="GitHub token")
227 |     parser.add_argument("--mode", type=str, default='omnigirl', help="Collection mode ('swebench' or 'omnigirl')")
228 |     parser.add_argument("--cutoff_date", type=str, default="2025-03-31T23:59:59Z", help="Cutoff date for filtering PRs in YYYY-MM-DDTHH:MM:SSZ format")
229 |     parser.add_argument("--language", type=str, help="language")
230 | 
231 |     args = parser.parse_args()
232 |     logger.info("Arguments parsed; starting dataset build")
233 |     main(**vars(args))
234 | 
--------------------------------------------------------------------------------
/app/post_process.py:
--------------------------------------------------------------------------------
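Before the module body, a minimal sketch of how the `ExtractStatus` enum defined below is meant to behave (illustrative only: it assumes the package is importable as `app.post_process`, and note that only `NO_SETUP` and `APPLICABLE_SETUP` participate in the `__lt__` ordering):

```python
from app.post_process import ExtractStatus

# __lt__ below orders NO_SETUP < APPLICABLE_SETUP, so max() picks the latter.
statuses = [ExtractStatus.NO_SETUP, ExtractStatus.APPLICABLE_SETUP]
best = ExtractStatus.max(statuses)
assert best is ExtractStatus.APPLICABLE_SETUP

# Each status maps to a bucket directory under the experiment dir.
print(best.to_dir_name("output/expr"))  # -> output/expr/applicable_setup
```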
3 | """ 4 | 5 | import json 6 | import os 7 | import shutil 8 | import subprocess 9 | from collections import defaultdict 10 | from collections.abc import Mapping 11 | from enum import Enum 12 | from glob import glob 13 | from os.path import join as pjoin 14 | from shutil import move 15 | 16 | from app import utils as apputils 17 | from app.model import common 18 | 19 | 20 | 21 | 22 | # track status of patch extraction 23 | class ExtractStatus(str, Enum): 24 | # APPLICABLE_PATCH = "APPLICABLE_PATCH" 25 | # MATCHED_BUT_EMPTY_ORIGIN = "MATCHED_BUT_EMPTY_ORIGIN" 26 | # MATCHED_BUT_EMPTY_DIFF = "MATCHED_BUT_EMPTY_DIFF" 27 | # RAW_PATCH_BUT_UNMATCHED = "RAW_PATCH_BUT_UNMATCHED" 28 | # RAW_PATCH_BUT_UNPARSED = "RAW_PATCH_BUT_UNPARSED" 29 | # NO_PATCH = "NO_PATCH" 30 | IS_VALID_JSON = "IS_VALID_JSON" 31 | NOT_VALID_JSON = "NOT_VALID_JSON" 32 | NO_SETUP = "NO_SETUP" 33 | APPLICABLE_SETUP = "APPLICABLE_SETUP" 34 | 35 | def __lt__(self, other): 36 | # order from min to max 37 | order = [ 38 | self.NO_SETUP, 39 | # self.RAW_PATCH_BUT_UNPARSED, 40 | # self.RAW_PATCH_BUT_UNMATCHED, 41 | # self.MATCHED_BUT_EMPTY_DIFF, 42 | # self.MATCHED_BUT_EMPTY_ORIGIN, 43 | self.APPLICABLE_SETUP, 44 | ] 45 | self_index = order.index(self) 46 | other_index = order.index(other) 47 | return self_index < other_index 48 | 49 | def __eq__(self, other): 50 | return self is other 51 | 52 | def __hash__(self): 53 | return hash(self.value) 54 | 55 | def to_dir_name(self, expr_dir: str): 56 | return pjoin(expr_dir, self.value.lower()) 57 | 58 | @staticmethod 59 | def max(statuses): 60 | return sorted(statuses)[-1] 61 | 62 | 63 | def record_extract_status(individual_expr_dir: str, extract_status: ExtractStatus): 64 | """ 65 | Write extract status to file, so that we can read it again when 66 | classifying patches 67 | """ 68 | # there is 1-to-1 correspondence between agent_patch_raw and extract_status 69 | # FIXME: it might be better to record these status in memory so they can be easily managed. 70 | record_file = pjoin(individual_expr_dir, "extract_status.json") 71 | if not os.path.isfile(record_file): 72 | # record for the first time 73 | with open(record_file, "w") as f: 74 | json.dump({"extract_status": [extract_status]}, f, indent=4) 75 | else: 76 | with open(record_file) as f: 77 | record = json.load(f) 78 | record["extract_status"].append(extract_status) 79 | with open(record_file, "w") as f: 80 | json.dump(record, f, indent=4) 81 | 82 | 83 | def read_extract_status(individual_expr_dir: str) -> tuple[ExtractStatus, int]: 84 | """ 85 | Read extract status from file. If there are multiple status recorded, read the best one. 86 | Returns: 87 | - The best extract status 88 | - The index of the best status in the list of all statuses. 
89 |     """
90 |     # the presence of a Dockerfile is used as the record of a successful setup
91 |     record_file = pjoin(individual_expr_dir, "Dockerfile")
92 |     if not os.path.isfile(record_file):
93 |         # if no Dockerfile was written, the run never reached the point
94 |         # of producing a working setup
95 |         return ExtractStatus.NO_SETUP, -1
96 |     else:
97 |         return ExtractStatus.APPLICABLE_SETUP, 1
98 |     # with open(record_file) as f:
99 |     #     record = json.load(f)
100 |     # # convert string to enum type
101 |     # all_status = [ExtractStatus(s) for s in record["extract_status"]]
102 | 
103 |     # best_status = ExtractStatus.max(all_status)
104 |     # best_idx = all_status.index(best_status)
105 |     # return best_status, best_idx
106 | 
107 | 
108 | 
109 | 
110 | 
111 | 
112 | def organize_experiment_results(expr_dir: str):
113 |     """
114 |     Assuming setups have already been extracted, organize the experiment result
115 |     directories into a few categories and move them there.
116 |     """
117 |     # (1) find all the task experiment directories
118 |     task_exp_names = [
119 |         x
120 |         for x in os.listdir(expr_dir)
121 |         if os.path.isdir(pjoin(expr_dir, x))
122 |         and "__" in x  # for filtering out other dirs like "applicable_setup"
123 |     ]
124 |     task_exp_dirs = [pjoin(expr_dir, x) for x in task_exp_names]
125 | 
126 |     # start organizing
127 |     for extract_status in ExtractStatus:
128 |         os.makedirs(extract_status.to_dir_name(expr_dir), exist_ok=True)
129 | 
130 |     for task_dir in task_exp_dirs:
131 |         extract_status, _ = read_extract_status(task_dir)
132 |         corresponding_dir = extract_status.to_dir_name(expr_dir)
133 |         shutil.move(task_dir, corresponding_dir)
134 | 
135 | 
136 | 
137 | def extract_swe_bench_input(dir: str):
138 |     """
139 |     After Dockerfiles and eval scripts have been generated, this function collects
140 |     them and writes a single file that can be used by swe-bench.
141 | 
142 |     Returns:
143 |         - path to swe-bench input file.
144 |     """
145 |     # only look into the applicable_setup dir, since we have already done
146 |     # the categorization
147 |     applicable_res_dir = pjoin(dir, "applicable_setup")
148 |     # figure out what tasks have applicable setups
149 |     task_dirs = [
150 |         x
151 |         for x in os.listdir(applicable_res_dir)
152 |         if os.path.isdir(pjoin(applicable_res_dir, x))
153 |     ]
154 |     task_dirs = [pjoin(applicable_res_dir, x) for x in task_dirs]
155 |     # patch_files = [pjoin(x, "agent_patch_raw") for x in task_dirs]
156 |     # patch_files = [os.path.abspath(x) for x in patch_files]
157 | 
158 |     # Each applicable task directory is expected to contain a generated
159 |     # Dockerfile and, alongside it, an eval.sh evaluation script.
160 |     # We collect the Dockerfile from each task directory, pair it with the
161 |     # metadata in meta.json, and read the completion flag from status.json;
162 |     # directories without a status.json are skipped.
163 |     docker_files = []
164 |     for x in task_dirs:
165 |         extracted_dockerfile = glob(pjoin(x, "Dockerfile"))
166 |         docker_files.append(extracted_dockerfile[0])
167 | 
168 |     docker_files = [os.path.abspath(x) for x in docker_files]
169 | 
170 |     # patch_files = [x for x in patch_files if os.path.isfile(x)]
171 |     docker_files = [x for x in docker_files if os.path.isfile(x)]
172 | 
173 |     all_results = []
174 |     final_results = []
175 |     for docker_file in docker_files:
176 |         # task_dir = os.path.dirname(os.path.dirname(docker_file))
177 |         task_dir = os.path.dirname(docker_file)
178 |         meta_file = pjoin(task_dir, "meta.json")
179 |         with open(meta_file) as f:
180 |             meta = json.load(f)
181 |         status_file = pjoin(task_dir, "status.json")
182 |         status = NotImplemented
183 |         if os.path.exists(status_file):
184 |             with open(status_file) as f:
185 |                 status_meta = json.load(f)
186 |             status = status_meta['is_finish']
187 | 
188 |         else:
189 |             continue
190 |         task_id = meta["task_id"]
191 |         this_result = {}
192 | 
193 |         this_result["instance_id"] = task_id
194 |         this_result["model_name_or_path"] = common.SELECTED_MODEL.name
195 |         docker_content = ""
196 |         eval_script_content = ""
197 |         if os.path.exists(docker_file):
198 |             with open(docker_file) as f:
199 |                 docker_content = f.read()
200 |         eval_script_file = docker_file.replace('Dockerfile', 'eval.sh')
201 |         if os.path.exists(eval_script_file):
202 |             with open(eval_script_file) as f:
203 |                 eval_script_content = f.read()
204 |         # if not docker_content:
205 |         #     # empty Dockerfile, don't bother sending it to swe-bench
206 |         #     continue
207 |         this_result["dockerfile"] = docker_content
208 |         this_result["eval_script"] = eval_script_content
209 |         this_result['version'] = meta['task_info']['version']
210 |         this_result['repo'] = meta['task_info']['repo']
211 |         this_result['patch'] = meta['task_info']['patch']
212 |         this_result['status'] = status
213 |         all_results.append(this_result)
214 |         if status is True:
215 |             final_results.append(this_result)
216 | 
217 |     final_predictions_file = pjoin(dir, "predictions.json")
218 |     raw_predictions_file = pjoin(dir, "raw_predictions.json")
219 |     with open(final_predictions_file, "w") as f:
220 |         json.dump(final_results, f, indent=4)
221 | 
222 |     with open(raw_predictions_file, "w") as f:
223 |         json.dump(all_results, f, indent=4)
224 | 
225 |     return final_predictions_file
226 | 
227 | 
228 | def is_valid_json(json_str: str) -> tuple[ExtractStatus, list | dict | None]:
229 |     """
230 |     Check whether a json string is valid.
231 |     """
232 |     try:
233 |         data = json.loads(json_str)
234 |     except json.decoder.JSONDecodeError:
235 |         return ExtractStatus.NOT_VALID_JSON, None
236 |     return ExtractStatus.IS_VALID_JSON, data
237 | 
238 | 
239 | """
240 | Main entries of the module.
241 | """
242 | 
243 | 
244 | 
245 | def un_classify_expr_dir(expr_dir: str):
246 |     individual_expr_dirs = []
247 |     for individual_expr_dir in glob(pjoin(expr_dir, "*", "*__*")):
248 |         assert "info.log" in os.listdir(
249 |             individual_expr_dir
250 |         ), f"{individual_expr_dir} has no info.log"
251 |         individual_expr_dirs.append(individual_expr_dir)
252 | 
253 |     for d in individual_expr_dirs:
254 |         move(d, expr_dir)
255 | 
256 | 
257 | 
258 | 
259 | def organize_and_form_input(expr_dir):
260 |     """
261 |     Organize the experiment directories into categories and form the swe-bench input file.
262 | Args: 263 | - expr_dir: the overall experiment directory. 264 | """ 265 | organize_experiment_results(expr_dir) 266 | swe_input_file = extract_swe_bench_input(expr_dir) 267 | return swe_input_file 268 | -------------------------------------------------------------------------------- /evaluation/docker_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import docker 4 | import os 5 | import signal 6 | import tarfile 7 | import threading 8 | import traceback 9 | from pathlib import Path 10 | 11 | from docker.models.containers import Container 12 | 13 | HEREDOC_DELIMITER = "EOF_1399519320" # different from dataset HEREDOC_DELIMITERs! 14 | 15 | 16 | def copy_to_container(container: Container, src: Path, dst: Path): 17 | """ 18 | Copy a file from local to a docker container 19 | 20 | Args: 21 | container (Container): Docker container to copy to 22 | src (Path): Source file path 23 | dst (Path): Destination file path in the container 24 | """ 25 | # Check if destination path is valid 26 | if os.path.dirname(dst) == "": 27 | raise ValueError( 28 | f"Destination path parent directory cannot be empty!, dst: {dst}" 29 | ) 30 | 31 | # temporary tar file 32 | tar_path = src.with_suffix(".tar") 33 | with tarfile.open(tar_path, "w") as tar: 34 | tar.add(src, arcname=src.name) 35 | 36 | # get bytes for put_archive cmd 37 | with open(tar_path, "rb") as tar_file: 38 | data = tar_file.read() 39 | 40 | # Make directory if necessary 41 | container.exec_run(f"mkdir -p {dst.parent}") 42 | 43 | # Send tar file to container and extract 44 | container.put_archive(os.path.dirname(dst), data) 45 | container.exec_run(f"tar -xf {dst}.tar -C {dst.parent}") 46 | 47 | # clean up in locally and in container 48 | tar_path.unlink() 49 | container.exec_run(f"rm {dst}.tar") 50 | 51 | 52 | def write_to_container(container: Container, data: str, dst: Path): 53 | """ 54 | Write a string to a file in a docker container 55 | """ 56 | # echo with heredoc to file 57 | command = f"cat <<'{HEREDOC_DELIMITER}' > {dst}\n{data}\n{HEREDOC_DELIMITER}" 58 | container.exec_run(command) 59 | 60 | 61 | def remove_image(client, image_id, logger=None): 62 | """ 63 | Remove a Docker image by ID. 64 | 65 | Args: 66 | client (docker.DockerClient): Docker client. 67 | image_id (str): Image ID. 68 | rm_image (bool): Whether to remove the image. 69 | logger (logging.Logger): Logger to use for output. If None, print to stdout. 70 | """ 71 | if not logger: 72 | # if logger is None, print to stdout 73 | log_info = print 74 | log_error = print 75 | raise_error = True 76 | elif logger == "quiet": 77 | # if logger is "quiet", don't print anything 78 | log_info = lambda x: None 79 | log_error = lambda x: None 80 | raise_error = True 81 | else: 82 | # if logger is a logger object, use it 83 | log_error = logger.info 84 | log_info = logger.info 85 | raise_error = False 86 | 87 | try: 88 | log_info(f"Attempting to remove image {image_id}...") 89 | client.images.remove(image_id, force=True) 90 | log_info(f"Image {image_id} removed.") 91 | except Exception as e: 92 | if raise_error: 93 | raise e 94 | log_error( 95 | f"Failed to remove image {image_id}: {e}\n" f"{traceback.format_exc()}" 96 | ) 97 | 98 | 99 | def cleanup_container(client, container, logger): 100 | """ 101 | Stop and remove a Docker container. 102 | Performs this forcefully if the container cannot be stopped with the python API. 103 | 104 | Args: 105 | client (docker.DockerClient): Docker client. 
106 | container (docker.models.containers.Container): Container to remove. 107 | logger (logging.Logger): Logger to use for output. If None, print to stdout 108 | """ 109 | if not container: 110 | return 111 | 112 | container_id = container.id 113 | 114 | if not logger: 115 | # if logger is None, print to stdout 116 | log_error = print 117 | log_info = print 118 | raise_error = True 119 | elif logger == "quiet": 120 | # if logger is "quiet", don't print anything 121 | log_info = lambda x: None 122 | log_error = lambda x: None 123 | raise_error = True 124 | else: 125 | # if logger is a logger object, use it 126 | log_error = logger.info 127 | log_info = logger.info 128 | raise_error = False 129 | 130 | # Attempt to stop the container 131 | try: 132 | if container: 133 | log_info(f"Attempting to stop container {container.name}...") 134 | container.stop(timeout=15) 135 | except Exception as e: 136 | log_error( 137 | f"Failed to stop container {container.name}: {e}. Trying to forcefully kill..." 138 | ) 139 | try: 140 | # Get the PID of the container 141 | container_info = client.api.inspect_container(container_id) 142 | pid = container_info["State"].get("Pid", 0) 143 | 144 | # If container PID found, forcefully kill the container 145 | if pid > 0: 146 | log_info( 147 | f"Forcefully killing container {container.name} with PID {pid}..." 148 | ) 149 | os.kill(pid, signal.SIGKILL) 150 | else: 151 | log_error(f"PID for container {container.name}: {pid} - not killing.") 152 | except Exception as e2: 153 | if raise_error: 154 | raise e2 155 | log_error( 156 | f"Failed to forcefully kill container {container.name}: {e2}\n" 157 | f"{traceback.format_exc()}" 158 | ) 159 | 160 | # Attempt to remove the container 161 | try: 162 | log_info(f"Attempting to remove container {container.name}...") 163 | container.remove(force=True) 164 | log_info(f"Container {container.name} removed.") 165 | except Exception as e: 166 | if raise_error: 167 | raise e 168 | log_error( 169 | f"Failed to remove container {container.name}: {e}\n" 170 | f"{traceback.format_exc()}" 171 | ) 172 | 173 | 174 | def exec_run_with_timeout(container, cmd, timeout: int|None=60): 175 | """ 176 | Run a command in a container with a timeout. 177 | 178 | Args: 179 | container (docker.Container): Container to run the command in. 180 | cmd (str): Command to run. 181 | timeout (int): Timeout in seconds. 182 | """ 183 | # Local variables to store the result of executing the command 184 | exec_result = None 185 | exec_id = None 186 | exception = None 187 | 188 | # Wrapper function to run the command 189 | def run_command(): 190 | nonlocal exec_result, exec_id, exception 191 | try: 192 | exec_id = container.client.api.exec_create(container.id, cmd)["Id"] 193 | exec_result = container.client.api.exec_start(exec_id) 194 | except Exception as e: 195 | exception = e 196 | 197 | # Start the command in a separate thread 198 | thread = threading.Thread(target=run_command) 199 | thread.start() 200 | thread.join(timeout) 201 | 202 | if exception: 203 | raise exception 204 | 205 | # If the thread is still alive, the command timed out 206 | if thread.is_alive(): 207 | raise TimeoutError(f"Command '{cmd}' timed out after {timeout} seconds") 208 | 209 | return exec_result 210 | 211 | 212 | def find_dependent_images(client: docker.DockerClient, image_name: str): 213 | """ 214 | Find all images that are built upon `image_name` image 215 | 216 | Args: 217 | client (docker.DockerClient): Docker client. 218 | image_name (str): Name of the base image. 
219 | """ 220 | dependent_images = [] 221 | 222 | # Get all local images 223 | all_images = client.images.list() 224 | 225 | # Get the ID of the base image 226 | try: 227 | base_image = client.images.get(image_name) 228 | base_image_id = base_image.id 229 | except docker.errors.ImageNotFound: 230 | print(f"Base image {image_name} not found.") 231 | return [] 232 | 233 | for image in all_images: 234 | # Skip the base image itself 235 | if image.id == base_image_id: 236 | continue 237 | 238 | # Check if the base image is in this image's history 239 | history = image.history() 240 | for layer in history: 241 | if layer['Id'] == base_image_id: 242 | # If found, add this image to the dependent images list 243 | tags = image.tags 244 | dependent_images.append(tags[0] if tags else image.id) 245 | break 246 | 247 | return dependent_images 248 | 249 | 250 | def list_images(client: docker.DockerClient): 251 | """ 252 | List all images from the Docker client. 253 | """ 254 | # don't use this in multi-threaded context 255 | return {tag for i in client.images.list(all=True) for tag in i.tags} 256 | 257 | 258 | def clean_images( 259 | client: docker.DockerClient, 260 | prior_images: set, 261 | cache_level: str, 262 | clean: bool 263 | ): 264 | """ 265 | Clean Docker images based on cache level and clean flag. 266 | 267 | Args: 268 | client (docker.DockerClient): Docker client. 269 | prior_images (set): Set of images that existed before the current run. 270 | cache (str): Cache level to use. 271 | clean (bool): Whether to clean; remove images that are higher in the cache hierarchy than the current 272 | cache level. E.g. if cache_level is set to env, remove all previously built instances images. if 273 | clean is false, previously built instances images will not be removed, but instance images built 274 | in the current run will be removed. 275 | """ 276 | images = list_images(client) 277 | removed = 0 278 | print(f"Cleaning cached images...") 279 | for image_name in images: 280 | if should_remove(image_name, cache_level, clean, prior_images): 281 | try: 282 | remove_image(client, image_name, "quiet") 283 | removed += 1 284 | except Exception as e: 285 | print(f"Error removing image {image_name}: {e}") 286 | continue 287 | print(f"Removed {removed} images.") 288 | 289 | 290 | def should_remove( 291 | image_name: str, 292 | cache_level: str, 293 | clean: bool, 294 | prior_images: set 295 | ): 296 | """ 297 | Determine if an image should be removed based on cache level and clean flag. 
298 | """ 299 | existed_before = image_name in prior_images 300 | if image_name.startswith("setup"): 301 | # if cache_level in {"none"} and (clean or not existed_before): 302 | # return True 303 | return True 304 | elif image_name.startswith("sweb.base"): 305 | if cache_level in {"none"} and (clean or not existed_before): 306 | return True 307 | elif image_name.startswith("sweb.env"): 308 | if cache_level in {"none", "base"} and (clean or not existed_before): 309 | return True 310 | elif image_name.startswith("sweb.eval"): 311 | if cache_level in {"none", "base", "env"} and (clean or not existed_before): 312 | return True 313 | return False 314 | -------------------------------------------------------------------------------- /app/raw_tasks.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import shutil 5 | from abc import ABC, abstractmethod 6 | from os.path import join as pjoin 7 | from pathlib import Path 8 | 9 | import httpx 10 | 11 | from app import utils as app_utils 12 | from app.log import log_and_print 13 | from app.task import PlainTask, SweTask, Task 14 | from docker import DockerClient 15 | 16 | class RawTask(ABC): 17 | @property 18 | @abstractmethod 19 | def task_id(self) -> str: 20 | raise NotImplementedError("abstract base class") 21 | 22 | @abstractmethod 23 | def to_task(self) -> Task: 24 | raise NotImplementedError("abstract base class") 25 | 26 | @abstractmethod 27 | def dump_meta_data(self, output_dir: str) -> None: 28 | raise NotImplementedError("abstract base class") 29 | 30 | 31 | class RawSweTask(RawTask): 32 | """ 33 | Encapsulate everything required to run one task. 34 | """ 35 | 36 | def __init__(self, task_id: str, setup_info: dict, task_info: dict,client:DockerClient = None): 37 | # a counter str, format "1/150", which means first task out of 150 38 | # id from the benchmark 39 | self._task_id = task_id 40 | # setup_info (Dict): keys: ['repo_path', 'env_name', 'pre_install', 'install','test_cmd'] 41 | self.setup_info = setup_info 42 | # task_info (Dict): keys: ['base_commit', 'hints_text', 'created_at', 43 | # 'test_patch', 'repo', 'problem_statement', 'version', 'instance_id', 44 | # 'FAIL_TO_PASS', 'PASS_TO_PASS', 'environment_setup_commit'] 45 | self.task_info = task_info 46 | self.client = client 47 | @property 48 | def task_id(self) -> str: 49 | return self._task_id 50 | 51 | def to_task(self) -> SweTask: 52 | task_id = self.task_id 53 | setup_info = self.setup_info 54 | task_info = self.task_info 55 | language = task_info.get('language','None') 56 | client = self.client 57 | return SweTask( 58 | task_id=task_id, 59 | problem_statement=task_info["problem_statement"], 60 | repo_path=setup_info["repo_path"], 61 | repo_cache_path=setup_info["repo_cache_path"], 62 | # env_name=setup_info["env_name"], 63 | # pre_install_cmds=setup_info["pre_install"], 64 | # install_cmd=setup_info["install"], 65 | # command to run the relevant tests, 66 | # test_cmd=setup_info["test_cmd"], 67 | commit=task_info["base_commit"], 68 | repo_name=task_info["repo"], 69 | # modifications to the test suite for this task instance, 70 | patch=task_info["patch"], 71 | test_patch=task_info["test_patch"], 72 | # testcases_passing=task_info["PASS_TO_PASS"], 73 | # testcases_failing=task_info["FAIL_TO_PASS"], 74 | language=language, 75 | # image_urls=task_info['image_urls'], 76 | # reference_setup=task_info['reference_setup'], 77 | version=task_info['version'], 78 | client = client, 79 | task_info = task_info 
80 | ) 81 | 82 | def dump_meta_data(self, output_dir: str): 83 | meta = { 84 | "task_id": self.task_id, 85 | "setup_info": self.setup_info, 86 | "task_info": self.task_info, 87 | } 88 | with open(pjoin(output_dir, "meta.json"), "w") as f: 89 | json.dump(meta, f, indent=4) 90 | with open(pjoin(output_dir, "problem_statement.txt"), "w") as f: 91 | f.write(self.task_info["problem_statement"]) 92 | with open(pjoin(output_dir, "developer_patch.diff"), "w") as f: 93 | f.write(self.task_info["patch"]) 94 | 95 | 96 | class RawGithubTask(RawTask): 97 | """ 98 | Encapsulate everything required to run ACR on a fresh issue from the internet. 99 | """ 100 | 101 | def __init__( 102 | self, 103 | task_id: str, 104 | clone_link: str, 105 | commit_hash: str | None, 106 | issue_link: str, 107 | setup_dir: str, 108 | use_comments: bool = False, 109 | ): 110 | self._task_id = task_id 111 | self.clone_link = clone_link 112 | # if commit_hash is None, assume using the HEAD of default branch 113 | self.commit_hash = commit_hash 114 | self.issue_link = issue_link 115 | self.setup_dir = setup_dir 116 | self.use_comments = use_comments 117 | self.clone_path = pjoin(self.setup_dir, self.task_id) 118 | self.problem_statement, self.created_at = self.fetch_issue() 119 | self.clone_repo() 120 | 121 | @property 122 | def task_id(self) -> str: 123 | return self._task_id 124 | 125 | def clone_repo(self): 126 | clone_path = Path(self.clone_path) 127 | if os.path.exists(clone_path): 128 | log_and_print( 129 | f"Path {clone_path} already exists. Removing it to get a fresh clone." 130 | ) 131 | shutil.rmtree(clone_path) 132 | app_utils.clone_repo(self.clone_link, str(clone_path.parent), clone_path.name) 133 | log_and_print(f"Cloned source code to {clone_path}.") 134 | if self.commit_hash is None: 135 | # let's get the current commit hash 136 | with app_utils.cd(clone_path): 137 | self.commit_hash = app_utils.get_current_commit_hash() 138 | 139 | def dump_meta_data(self, output_dir: str): 140 | meta = { 141 | "task_info": { 142 | "base_commit": self.commit_hash, 143 | "created_at": self.created_at, 144 | "problem_statement": self.problem_statement, 145 | "instance_id": self.task_id, 146 | }, 147 | "setup_info": {"repo_path": self.clone_path}, 148 | } 149 | 150 | meta_file = pjoin(output_dir, "meta.json") 151 | 152 | with open(meta_file, "w") as f: 153 | json.dump(meta, f, indent=4) 154 | 155 | def fetch_issue(self): 156 | if "github.com" not in self.issue_link: 157 | raise NotImplementedError("Only GitHub issues are supported for now.") 158 | 159 | retrieved_issue = self.fetch_github_issue(self.issue_link, self.use_comments) 160 | 161 | if retrieved_issue is None: 162 | raise RuntimeError( 163 | f"Failed to retrieve issue information from {self.issue_link}" 164 | ) 165 | 166 | title, body, created_at = retrieved_issue 167 | 168 | body = self.process_links(body) 169 | 170 | problem_statement = f"{title}\n{body}" 171 | 172 | return problem_statement, created_at 173 | 174 | @classmethod 175 | def process_links(cls, body: str): 176 | code_pattern = re.compile( 177 | r"https://github.com/(.*?)/blob/(.*)/(.*)#L(\d+)-L(\d+)" 178 | ) 179 | replacements = [] 180 | 181 | for code_links in code_pattern.finditer(body): 182 | repo_name = code_links.group(1) 183 | commit = code_links.group(2) 184 | file_path = code_links.group(3) 185 | start_line = int(code_links.group(4)) 186 | end_line = int(code_links.group(5)) 187 | 188 | file_contents = httpx.get( 189 | f"https://raw.githubusercontent.com/{repo_name}/{commit}/{file_path}" 190 | 
).text.splitlines()
191 |             fragment = "\n".join(file_contents[start_line - 1 : end_line])
192 | 
193 |             replacements.append((code_links.group(0), f"\n```\n{fragment}\n```\n"))
194 | 
195 |         for code_link, replacement in replacements:
196 |             body = body.replace(code_link, code_link + replacement)
197 |         return body
198 | 
199 |     @classmethod
200 |     def fetch_github_issue(
201 |         cls, issue_url: str, use_comments: bool = False
202 |     ) -> tuple[str, str, str]:
203 |         """Fetch the title, body, and creation time of a GitHub issue from its URL."""
204 | 
205 |         # Example issue URL: https://github.com/owner/repo/issues/123
206 | 
207 |         _, owner, repo, _, issue_number = issue_url.rsplit("/", 4)
208 | 
209 |         api_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}"
210 |         comments_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/comments"
211 | 
212 |         issue_response = httpx.get(api_url)
213 | 
214 |         if issue_response.status_code != 200:
215 |             raise RuntimeError(
216 |                 f"Failed to fetch issue information: {issue_response.status_code}"
217 |             )
218 | 
219 |         issue_info = issue_response.json()
220 | 
221 |         title = issue_info["title"]
222 |         body = issue_info["body"]
223 | 
224 |         if use_comments:
225 |             comments_response = httpx.get(comments_url)
226 |             if comments_response.status_code != 200:
227 |                 raise RuntimeError(
228 |                     f"Failed to fetch comments information: {comments_response.status_code}"
229 |                 )
230 | 
231 |             comments_info = comments_response.json()
232 |             for comment in comments_info:
233 |                 if (
234 |                     "user" not in comment
235 |                     or comment["user"]["type"] == "Bot"
236 |                     or comment["user"]["login"] == "acr-bot"
237 |                 ):
238 |                     continue
239 | 
240 |                 body += (
241 |                     f"\nUser: {comment['user']['login']}\nComment: {comment['body']}"
242 |                 )
243 | 
244 |         created_at = issue_info["created_at"]
245 | 
246 |         return title, body, created_at
247 | 
248 |     def to_task(self) -> PlainTask:
249 |         return PlainTask(
250 |             commit_hash=self.commit_hash,
251 |             local_path=self.clone_path,
252 |             problem_statement=self.problem_statement,
253 |         )
254 | 
255 | 
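Between the two classes, a small self-contained sketch of what `process_links` above extracts from a GitHub line-range permalink (illustrative only: the `acme/widgets` URL is made up; the real method then fetches the raw file and appends the referenced lines after the link as a fenced block):

```python
import re

# The same pattern process_links() uses to spot line-range permalinks.
code_pattern = re.compile(r"https://github.com/(.*?)/blob/(.*)/(.*)#L(\d+)-L(\d+)")

body = "See https://github.com/acme/widgets/blob/abc123/src/core.py#L10-L12 for context."
m = code_pattern.search(body)
if m:
    repo_name, commit, file_path = m.group(1), m.group(2), m.group(3)
    start_line, end_line = int(m.group(4)), int(m.group(5))
    # The greedy groups split commit/path loosely, but the reassembled URL is correct.
    raw_url = f"https://raw.githubusercontent.com/{repo_name}/{commit}/{file_path}"
    print(raw_url, start_line, end_line)
    # -> https://raw.githubusercontent.com/acme/widgets/abc123/src/core.py 10 12
```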
259 | """ 260 | 261 | def __init__(self, task_id: str, local_repo: str, issue_file: str): 262 | self._task_id = task_id 263 | self.local_repo = local_repo 264 | self.issue_file = issue_file 265 | self.commit_hash = self.init_local_repo() 266 | self.problem_statement = self.read_issue_from_file() 267 | 268 | @property 269 | def task_id(self) -> str: 270 | return self._task_id 271 | 272 | def init_local_repo(self): 273 | with app_utils.cd(self.local_repo): 274 | if not app_utils.is_git_repo(): 275 | # non git repo - let's make it a git repo first 276 | app_utils.initialize_git_repo_and_commit() 277 | commit = app_utils.get_current_commit_hash() 278 | return commit 279 | 280 | def read_issue_from_file(self) -> str: 281 | # ignore encoding errors so at least we can have some issue content 282 | issue = Path(self.issue_file).read_text(errors="ignore") 283 | return issue 284 | 285 | def dump_meta_data(self, output_dir: str): 286 | meta = { 287 | "task_info": { 288 | "base_commit": self.commit_hash, 289 | "problem_statement": self.problem_statement, 290 | "instance_id": self.task_id, 291 | }, 292 | "setup_info": {"repo_path": self.local_repo}, 293 | } 294 | 295 | meta_file = pjoin(output_dir, "meta.json") 296 | 297 | with open(meta_file, "w") as f: 298 | json.dump(meta, f, indent=4) 299 | 300 | def to_task(self) -> PlainTask: 301 | return PlainTask( 302 | commit_hash=self.commit_hash, 303 | local_path=self.local_repo, 304 | problem_statement=self.problem_statement, 305 | ) 306 | --------------------------------------------------------------------------------