├── swebench ├── collect │ ├── __init__.py │ ├── run_build_dataset_ft.sh │ ├── run_get_tasks_pipeline.sh │ ├── make_repo │ │ ├── call_make_repo.py │ │ └── make_repo.sh │ ├── make_lite │ │ ├── README.md │ │ ├── make_lite.py │ │ └── criteria.py │ ├── cleanup │ │ ├── delete_gh_workflows.py │ │ └── remove_envs.py │ ├── print_pulls.py │ ├── build_dataset_ft.py │ ├── get_top_pypi.py │ ├── README.md │ ├── check_validation.ipynb │ ├── get_tasks_pipeline.py │ └── build_dataset.py ├── harness │ ├── __init__.py │ ├── remove_containers.py │ ├── dockerfiles.py │ ├── prepare_images.py │ └── grading.py ├── inference │ ├── __init__.py │ ├── llamao │ │ ├── __init__.py │ │ └── distributed_attention.py │ ├── make_datasets │ │ ├── __init__.py │ │ ├── eval_retrieval.py │ │ ├── README.md │ │ ├── tokenize_dataset.py │ │ └── create_text_dataset.py │ ├── README.md │ └── run_live.py ├── versioning │ ├── __init__.py │ ├── run_get_versions.sh │ ├── utils.py │ ├── extract_web │ │ ├── get_versions_pydicom.py │ │ ├── get_versions_xarray.py │ │ ├── get_versions_matplotlib.py │ │ ├── get_versions_astropy.py │ │ ├── get_versions_pvlib-python.py │ │ └── get_versions_sqlfluff.py │ ├── README.md │ └── constants.py └── __init__.py ├── setup.cfg ├── assets ├── figures │ ├── teaser.png │ ├── collection.png │ ├── evaluation.png │ ├── validation.png │ └── swellama_banner.png ├── build_deploy.sh ├── evaluation.md └── collection.md ├── pyproject.toml ├── scripts ├── run_validation.sh ├── run_get_versions.sh ├── filter_empty_version.py ├── build_images.sh ├── docker │ └── push_all_images.sh └── eval │ ├── convert_od_output_to_swe_json.py │ ├── update_output_with_eval.py │ └── eval_infer.sh ├── README.md ├── codecov.yml ├── tests ├── test_cli.py ├── test_collect_cli.py └── test_evaluation.py ├── LICENSE ├── docs ├── 20240406_devin_validate │ ├── report.md │ └── get_devin_preds.ipynb ├── 20240415_eval_bug │ ├── sweep_conda_links.py │ └── check_harness.ipynb ├── README_JP.md ├── README_CN.md ├── README_TW.md └── 20240627_docker │ └── README.md ├── setup.py ├── CHANGELOG.md └── Original_README.md /swebench/collect/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /swebench/harness/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /swebench/inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /swebench/versioning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /swebench/inference/llamao/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /swebench/inference/make_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | version = attr: swebench.__version__ 3 | license_files = LICENSE -------------------------------------------------------------------------------- 
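The `[metadata]` block above uses setuptools' `attr:` directive, so the distribution version is read from `swebench.__version__` (defined in `swebench/__init__.py`, which appears later in this listing as `2.0.13`) instead of being hard-coded in `setup.cfg`. The snippet below is a small illustrative sketch, not a file from the repository, showing how that attribute resolves once the package is importable:

```python
# Illustrative sketch (not part of the repository): resolve the same attribute
# that `version = attr: swebench.__version__` points at in setup.cfg.
import importlib
import re


def resolve_version(package: str = "swebench", attr: str = "__version__") -> str:
    module = importlib.import_module(package)  # assumes the package is installed/importable
    version = getattr(module, attr)            # e.g. "2.0.13"
    if not re.fullmatch(r"\d+(\.\d+)*", version):
        raise ValueError(f"unexpected version string: {version!r}")
    return version


if __name__ == "__main__":
    print(resolve_version())
```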
/assets/figures/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-Gym/SWE-Bench-Fork/HEAD/assets/figures/teaser.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ['setuptools>=42'] 3 | build-backend = 'setuptools.build_meta' -------------------------------------------------------------------------------- /assets/figures/collection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-Gym/SWE-Bench-Fork/HEAD/assets/figures/collection.png -------------------------------------------------------------------------------- /assets/figures/evaluation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-Gym/SWE-Bench-Fork/HEAD/assets/figures/evaluation.png -------------------------------------------------------------------------------- /assets/figures/validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-Gym/SWE-Bench-Fork/HEAD/assets/figures/validation.png -------------------------------------------------------------------------------- /assets/figures/swellama_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-Gym/SWE-Bench-Fork/HEAD/assets/figures/swellama_banner.png -------------------------------------------------------------------------------- /assets/build_deploy.sh: -------------------------------------------------------------------------------- 1 | # !bin/bash 2 | 3 | python3 -m build 4 | 5 | python3 -m twine upload --skip-existing --repository pypi dist/* 6 | # python3 -m twine upload --skip-existing --repository testpypi dist/* -------------------------------------------------------------------------------- /swebench/collect/run_build_dataset_ft.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python build_dataset_ft.py \ 4 | --instances_path "" \ 5 | --output_path "" \ 6 | --eval_path "" -------------------------------------------------------------------------------- /scripts/run_validation.sh: -------------------------------------------------------------------------------- 1 | REPO_NAME=$1 # e.g. getmoto__moto 2 | VERSION_DATA=data/interim/versioned/${REPO_NAME}_versions.non-empty.jsonl 3 | 4 | python swebench/harness/run_validation.py \ 5 | --dataset_name $VERSION_DATA \ 6 | --run_id test \ 7 | --cache_level instance \ 8 | --max_workers 8 9 | 10 | # --force_rebuild true 11 | -------------------------------------------------------------------------------- /swebench/collect/run_get_tasks_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # If you'd like to parallelize, do the following: 4 | # * Create a .env file in this folder 5 | # * Declare GITHUB_TOKENS=token1,token2,token3... 
6 | 7 | python get_tasks_pipeline.py \ 8 | --repos 'scikit-learn/scikit-learn', 'pallets/flask' \ 9 | --path_prs '' \ 10 | --path_tasks '' -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## SWE-Bench-Fork for SWE-Gym 2 | [SWE-Gym Dataset + Models](https://huggingface.co/SWE-Gym) 3 | 4 | [Project page](https://github.com/SWE-Gym/SWE-Gym) 5 | 6 | This fork contains environment setup files for the 11 additional repos used in the SWE-Gym dataset, as well as an improved version of the instance collection pipeline. 7 | 8 | We plan to upstream the changes and merge with SWE-Bench soon. 9 | 10 | The original README.md is available [here](Original_README.md). 11 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | # Configuration for codecov 2 | coverage: 3 | status: 4 | project: 5 | default: 6 | # If we get < 45% coverage, codecov is gonna mark it a failure 7 | target: 45% 8 | threshold: null 9 | patch: 10 | default: 11 | # Codecov won't mark it as a failure if a patch is not covered well 12 | informational: true 13 | github_checks: 14 | # Don't mark lines that aren't covered 15 | annotations: false 16 | 17 | -------------------------------------------------------------------------------- /swebench/collect/make_repo/call_make_repo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import subprocess 4 | 5 | repos = ["Repos here"] 6 | 7 | for repo in repos: 8 | print(f"Making mirror repo for {repo}") 9 | out_make = subprocess.run( 10 | f"./make_repo.sh {repo}", 11 | shell=True, 12 | stdout=subprocess.DEVNULL, 13 | stderr=subprocess.DEVNULL, 14 | ) 15 | if out_make.returncode != 0: 16 | print(f"Error making mirror repo for {repo}") 17 | else: 18 | print(f"Success making mirror repo for {repo}") 19 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | 4 | def test_smoke_test(): 5 | cmd = ["python", "-m", "swebench.harness.run_evaluation", "--help"] 6 | result = subprocess.run(cmd, capture_output=True) 7 | print(result.stdout) 8 | print(result.stderr) 9 | assert result.returncode == 0 10 | 11 | 12 | def test_one_instance(): 13 | cmd = ["python", "-m", "swebench.harness.run_evaluation", "--predictions_path", "gold", "--max_workers", "1", "--instance_ids", "sympy__sympy-20590", "--run_id", "validate-gold"] 14 | result = subprocess.run(cmd, capture_output=True) 15 | print(result.stdout) 16 | print(result.stderr) 17 | assert result.returncode == 0 -------------------------------------------------------------------------------- /scripts/run_get_versions.sh: -------------------------------------------------------------------------------- 1 | 2 | REPO_NAME=$1 # e.g., getmoto__moto 3 | INSTANCE_PATH=/SWE-Bench/data/raw/${REPO_NAME}.jsonl 4 | OUTPUT_DIR=/SWE-Bench/data/interim/versioned 5 | CONDA_PATH=/miniconda3/condabin/conda 6 | TESTBED_PATH=/SWE-Bench/data/testbed 7 | 8 | pushd swebench/versioning 9 | 10 | python get_versions.py \ 11 | --instances_path $INSTANCE_PATH \ 12 | --retrieval_method github \ 13 | --conda_env temp \ 14 | --num_workers 4 \ 15 | --path_conda $CONDA_PATH \ 16 | --output_dir $OUTPUT_DIR \ 17 | --testbed $TESTBED_PATH
18 | 19 | popd 20 | 21 | OUTPUT_PATH=$OUTPUT_DIR/${REPO_NAME}_versions.json 22 | python3 scripts/filter_empty_version.py $OUTPUT_PATH 23 | -------------------------------------------------------------------------------- /swebench/versioning/run_get_versions.sh: -------------------------------------------------------------------------------- 1 | # Example call for getting versions by building the repo locally 2 | python get_versions.py \ 3 | --path_tasks "" \ 4 | --retrieval_method build \ 5 | --conda_env "" \ 6 | --num_threads 10 \ 7 | --path_conda "" \ 8 | --testbed "" 9 | 10 | # Example call for getting versions from github web interface 11 | python get_versions.py \ 12 | --path_tasks "" \ 13 | --retrieval_method github \ 14 | --num_workers 25 \ 15 | --output_dir "" -------------------------------------------------------------------------------- /scripts/filter_empty_version.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("input_path", type=str) 6 | args = parser.parse_args() 7 | 8 | output_path = args.input_path.replace(".json", ".non-empty.jsonl") 9 | df = pd.read_json(args.input_path, lines=False) 10 | print(df.head()) 11 | print(f"Got {len(df)} instances") 12 | df = df[df["version"].notna()] 13 | df["version"] = df["version"].astype(str) 14 | print(f"Got {len(df)} non-empty versions instances") 15 | df.to_json(output_path, lines=True, orient="records") 16 | print(f"Filtered {args.input_path} to {output_path}") 17 | 18 | print("Version Stats:") 19 | print(df["version"].value_counts()) 20 | 21 | print("Unique versions:") 22 | print(sorted(list(df["version"].unique()))) 23 | -------------------------------------------------------------------------------- /scripts/build_images.sh: -------------------------------------------------------------------------------- 1 | DATASET_NAME=$1 2 | if [ -z "$DATASET_NAME" ]; then 3 | DATASET_NAME="princeton-nlp/SWE-bench_Lite" 4 | echo "Using default dataset name: $DATASET_NAME" 5 | fi 6 | SPLIT=$2 7 | if [ -z "$SPLIT" ]; then 8 | SPLIT="test" 9 | echo "Using default split: $SPLIT" 10 | fi 11 | 12 | MAX_WORKERS=$3 13 | if [ -z "$MAX_WORKERS" ]; then 14 | MAX_WORKERS=4 15 | echo "Using default max workers: $MAX_WORKERS" 16 | fi 17 | 18 | RUN_ID="build-images-${DATASET_NAME//\//__}-${SPLIT}" 19 | echo "Using dataset name: $DATASET_NAME" 20 | echo "Using split: $SPLIT" 21 | echo "Using max workers: $MAX_WORKERS" 22 | echo "Using run id: $RUN_ID" 23 | echo "================================================" 24 | 25 | python -m swebench.harness.run_evaluation \ 26 | --dataset_name $DATASET_NAME \ 27 | --split $SPLIT \ 28 | --predictions_path gold \ 29 | --max_workers $MAX_WORKERS \ 30 | --run_id $RUN_ID \ 31 | --cache_level instance 32 | -------------------------------------------------------------------------------- /tests/test_collect_cli.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | 4 | def test_collect_smoke_test(): 5 | cmd = ["python", "-m", "swebench.collect.print_pulls", "--help"] 6 | result = subprocess.run(cmd, capture_output=True) 7 | print(result.stdout) 8 | print(result.stderr) 9 | assert result.returncode == 0 10 | 11 | 12 | def test_collect_one(tmp_path): 13 | cmd = ["python", "-m", "swebench.collect.print_pulls", "pvlib/pvlib-python", str(tmp_path/ "out.txt"), "--max_pulls", "1"] 14 | print(" ".join(cmd)) 15 | result = 
subprocess.run(cmd, capture_output=True) 16 | print(result.stdout) 17 | print(result.stderr) 18 | assert result.returncode == 0 19 | 20 | 21 | def test_collect_ds(tmp_path): 22 | cmd = ["python", "-m", "swebench.collect.build_dataset", "tests/test_data/pvlib.jsonl", str(tmp_path/ "out.jsonl")] 23 | print(" ".join(cmd)) 24 | result = subprocess.run(cmd, capture_output=True) 25 | print(result.stdout) 26 | print(result.stderr) 27 | assert result.returncode == 0 -------------------------------------------------------------------------------- /tests/test_evaluation.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import docker 4 | 5 | from swebench.harness.constants import ( 6 | FAIL_TO_PASS, 7 | PASS_TO_PASS, 8 | KEY_INSTANCE_ID, 9 | KEY_MODEL, 10 | ) 11 | from swebench.harness.run_evaluation import make_run_report 12 | 13 | TEST_INSTANCE = collections.defaultdict(lambda: "test") 14 | TEST_INSTANCE[PASS_TO_PASS] = '[]' 15 | TEST_INSTANCE["repo"] = 'pvlib/pvlib-python' 16 | TEST_INSTANCE["version"] = '0.1' 17 | TEST_INSTANCE[FAIL_TO_PASS] = '[]' 18 | 19 | def test_make_run_report(tmpdir) -> None: 20 | client = docker.from_env() 21 | with tmpdir.as_cwd(): 22 | output_path = make_run_report( 23 | { 24 | "test": { 25 | KEY_INSTANCE_ID: "test", 26 | KEY_MODEL: "test" 27 | } 28 | }, 29 | [TEST_INSTANCE], 30 | client, 31 | "test" 32 | ) 33 | assert output_path.is_file() 34 | report = json.loads(output_path.read_text()) 35 | assert report["schema_version"] == 2 -------------------------------------------------------------------------------- /scripts/docker/push_all_images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DOCKER_NAMESPACE=$1 4 | INSTANCE_ID_FILE=$2 5 | 6 | if [ -z "$DOCKER_NAMESPACE" ] || [ -z "$INSTANCE_ID_FILE" ]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # target namespace 12 | image_list=$(docker image ls --format '{{.Repository}}:{{.Tag}}' | grep sweb | grep -v $DOCKER_NAMESPACE) 13 | instance_ids=$(cat $INSTANCE_ID_FILE) 14 | 15 | # KEEP images that are IN the instance_ids 16 | image_list=$(echo "$image_list" | grep -f <(echo "$instance_ids")) 17 | 18 | echo "# of images to push: $(echo "$image_list" | wc -l)" 19 | 20 | # There are three tiers of images 21 | # - base 22 | # - env 23 | # - eval (instance level) 24 | 25 | for image in $image_list; do 26 | echo "Tagging $image" 27 | # rename image by replace "__" with "_s_" to comply with docker naming convention 28 | new_image_name=${image//__/_s_} 29 | docker tag $image $DOCKER_NAMESPACE/$new_image_name 30 | echo "Tagged $image to $DOCKER_NAMESPACE/$new_image_name" 31 | 32 | docker push $DOCKER_NAMESPACE/$new_image_name 33 | echo "Pushed $image" 34 | done 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Carlos E Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, Karthik R Narasimhan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to 
do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/20240406_devin_validate/report.md: -------------------------------------------------------------------------------- 1 | # Validating Devin's Results 2 | April 6, 2024 3 | 4 | In this report, we briefly detail our validation of [Devin](https://www.cognition-labs.com/introducing-devin), an AI software engineer released by [Cognition Labs](https://www.cognition-labs.com/) that resolves an impressive 13.86% of issues on a random 25% subset of SWE-bench. 5 | 6 | The Cognition Labs team released their own [report on Devin's performance on SWE-bench](https://www.cognition-labs.com/post/swe-bench-technical-report), which includes a much more thorough deep dive into where Devin excels and struggles. 7 | 8 | Our report focuses solely on validating Devin's performance. To this end, we do the following: 9 | 1. Compile the open-sourced Devin predictions ([Github repository](https://github.com/CognitionAI/devin-swebench-results/tree/main)) into a SWE-bench evaluation-compatible `.jsonl` file. 10 | 2. Run evaluation on these predictions with: 11 | ```shell 12 | python evaluation.py \ 13 | --predictions_path devin_all_preds.jsonl \ 14 | --swe_bench_tasks swe-bench.json \ 15 | --log_dir ./results/ \ 16 | --testbed ./testbed/ \ 17 | --skip_existing \ 18 | --timeout 1200 \ 19 | --verbose 20 | ``` 21 | 22 | [To Do: Results] 23 | 24 | ✍️ Carlos & John -------------------------------------------------------------------------------- /swebench/collect/make_lite/README.md: -------------------------------------------------------------------------------- 1 | ## SWE-bench *Lite* 2 | This directory contains the scripts used to make the *lite* version of SWE-bench. The *lite* version is a subset of the full SWE-bench, that filters out certain types of instances to make evaluation on SWE-bench a bit cheaper and more accessible. 3 | 4 | SWE-bench lite consists of 300 test instances and 23 development instances; both subsets of the full SWE-bench splits. We filter the full SWE-bench according to the following criteria to get *lite*: 5 | - We remove instances with images, external hyperlinks, references to specific commit shas and references to other pull requests or issues. 6 | - We remove instances that have fewer than 40 words in the problem statement. 7 | - We remove instances that edit more than 1 file. 8 | - We remove instances where the gold patch has more than 3 edit hunks (see [patch](https://man7.org/linux/man-pages/man1/patch.1.html)). 9 | - We remove instances that create or remove files. 10 | - We remove instances that contain tests with error message checks. 11 | - Finally, we sample 300 test instances and 23 development instances from the remaining instances. 
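A rough sketch of how two of the checks above might be implemented is shown below; the actual filters live in `criteria.py` in this directory and may differ in detail, so treat the function bodies as illustrative only.

```python
# Illustrative sketch of two Lite filters; the real implementations are in
# swebench/collect/make_lite/criteria.py and may differ in detail.
import re


def leq_n_words(problem_statement: str, n: int = 40) -> bool:
    # True if the problem statement contains at most n whitespace-separated words.
    return len(problem_statement.split()) <= n


def leq_n_hunks(patch_text: str, n: int = 3) -> bool:
    # True if the gold patch contains at most n edit hunks, counted via
    # unified-diff hunk headers of the form "@@ -l,s +l,s @@".
    hunk_header = re.compile(r"^@@ -\d+(?:,\d+)? \+\d+(?:,\d+)? @@", re.MULTILINE)
    return len(hunk_header.findall(patch_text)) <= n
```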
12 | 13 | See `make_lite.py` for the script that makes the *lite* version of SWE-bench, or download the *lite* version from the Hugging Face datasets [princeton-nlp/SWE-bench_Lite](https://huggingface.co/datasets/princeton-nlp/SWE-bench_Lite) 14 | -------------------------------------------------------------------------------- /swebench/versioning/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def get_instances(instance_path: str) -> list: 5 | """ 6 | Get task instances from given path 7 | 8 | Args: 9 | instance_path (str): Path to task instances 10 | Returns: 11 | task_instances (list): List of task instances 12 | """ 13 | if any([instance_path.endswith(x) for x in [".jsonl", ".jsonl.all"]]): 14 | task_instances = list() 15 | with open(instance_path) as f: 16 | for line in f.readlines(): 17 | task_instances.append(json.loads(line)) 18 | return task_instances 19 | 20 | with open(instance_path) as f: 21 | task_instances = json.load(f) 22 | return task_instances 23 | 24 | 25 | def split_instances(input_list: list, n: int) -> list: 26 | """ 27 | Split a list into n approximately equal length sublists 28 | 29 | Args: 30 | input_list (list): List to split 31 | n (int): Number of sublists to split into 32 | Returns: 33 | result (list): List of sublists 34 | """ 35 | avg_length = len(input_list) // n 36 | remainder = len(input_list) % n 37 | result, start = [], 0 38 | 39 | for i in range(n): 40 | length = avg_length + 1 if i < remainder else avg_length 41 | sublist = input_list[start : start + length] 42 | result.append(sublist) 43 | start += length 44 | 45 | return result 46 | -------------------------------------------------------------------------------- /swebench/versioning/extract_web/get_versions_pydicom.py: -------------------------------------------------------------------------------- 1 | import datetime, json, requests, sys 2 | from bs4 import BeautifulSoup 3 | 4 | sys.path.append("../../harness") 5 | from utils import get_instances 6 | 7 | PATH_TASKS_PYDICOM = "" 8 | PATH_TASKS_PYDICOM_V = "" 9 | 10 | data_tasks = get_instances(PATH_TASKS_PYDICOM) 11 | resp = requests.get('https://pydicom.github.io/pydicom/dev/faq/index.html') 12 | soup = BeautifulSoup(resp.text, "html.parser") 13 | release_table = soup.find("table", {"class": "docutils align-default"}) 14 | 15 | times = [] 16 | for row in release_table.find_all('tr'): 17 | cells = row.find_all('td') 18 | if len(cells) == 3: 19 | version = cells[0].text.strip() 20 | date = cells[1].text.strip().strip('~') 21 | if date == 'Jan 2024': 22 | date = '2024-01-01' 23 | else: 24 | date = datetime.strptime(date, "%B %Y").strftime("%Y-%m-%d") 25 | python_versions = max(cells[2].text.strip().split(', ')) 26 | times.append((date, version)) 27 | 28 | times = sorted(times, key=lambda x: x[0], reverse=True) 29 | for task in data_tasks: 30 | created_at = task["created_at"].split("T")[0] 31 | found = False 32 | for t in times: 33 | if t[0] < created_at: 34 | task["version"] = t[1] 35 | found = True 36 | break 37 | if not found: 38 | task["version"] = times[-1][1] 39 | 40 | with open(PATH_TASKS_PYDICOM_V, 'w') as f: 41 | json.dump(data_tasks, fp=f) -------------------------------------------------------------------------------- /assets/evaluation.md: -------------------------------------------------------------------------------- 1 | # Evaluating with SWE-bench 2 | John Yang • November 6, 2023 3 | 4 | In this tutorial, we will explain how to evaluate models and methods using 
SWE-bench. 5 | 6 | ## 🤖 Creating Predictions 7 | For each task instance of the SWE-bench dataset, given an issue (`problem_statement`) + codebase (`repo` + `base_commit`), your model should attempt to write a diff patch prediction. For full details on the SWE-bench task, please refer to Section 2 of the main paper. 8 | 9 | Each prediction must be formatted as follows: 10 | ```json 11 | { 12 | "instance_id": "", 13 | "model_patch": "<.patch file content string>", 14 | "model_name_or_path": "", 15 | } 16 | ``` 17 | 18 | Store multiple predictions in a `.json` file formatted as `[, ,... ]`. It is not necessary to generate predictions for every task instance. 19 | 20 | If you'd like examples, the [swe-bench/experiments](https://github.com/swe-bench/experiments) GitHub repository contains many examples of well formed patches. 21 | 22 | ## 🔄 Running Evaluation 23 | Evaluate model predictions on SWE-bench Lite using the evaluation harness with the following command: 24 | ```bash 25 | python -m swebench.harness.run_evaluation \ 26 | --dataset_name princeton-nlp/SWE-bench_Lite \ 27 | --predictions_path \ 28 | --max_workers \ 29 | --run_id 30 | # use --predictions_path 'gold' to verify the gold patches 31 | # use --run_id to name the evaluation run 32 | ``` 33 | -------------------------------------------------------------------------------- /swebench/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "2.0.13" 2 | 3 | from swebench.collect.build_dataset import main as build_dataset 4 | from swebench.collect.get_tasks_pipeline import main as get_tasks_pipeline 5 | from swebench.collect.print_pulls import main as print_pulls 6 | 7 | from swebench.harness.constants import ( 8 | KEY_INSTANCE_ID, 9 | KEY_MODEL, 10 | KEY_PREDICTION, 11 | MAP_REPO_VERSION_TO_SPECS, 12 | ) 13 | 14 | from swebench.harness.docker_build import ( 15 | build_image, 16 | build_base_images, 17 | build_env_images, 18 | build_instance_images, 19 | build_instance_image, 20 | close_logger, 21 | setup_logger, 22 | ) 23 | 24 | from swebench.harness.docker_utils import ( 25 | cleanup_container, 26 | remove_image, 27 | copy_to_container, 28 | exec_run_with_timeout, 29 | list_images, 30 | ) 31 | 32 | from swebench.harness.grading import ( 33 | compute_fail_to_pass, 34 | compute_pass_to_pass, 35 | get_logs_eval, 36 | get_eval_report, 37 | get_resolution_status, 38 | ResolvedStatus, 39 | TestStatus, 40 | ) 41 | 42 | from swebench.harness.log_parsers import ( 43 | MAP_REPO_TO_PARSER, 44 | ) 45 | 46 | from swebench.harness.run_evaluation import ( 47 | main as run_evaluation, 48 | ) 49 | 50 | from swebench.harness.utils import ( 51 | get_environment_yml, 52 | get_requirements, 53 | ) 54 | 55 | from swebench.versioning.constants import ( 56 | MAP_REPO_TO_VERSION_PATHS, 57 | MAP_REPO_TO_VERSION_PATTERNS, 58 | ) 59 | 60 | from swebench.versioning.get_versions import ( 61 | get_version, 62 | map_version_to_task_instances, 63 | get_versions_from_build, 64 | get_versions_from_web, 65 | ) 66 | 67 | from swebench.versioning.utils import ( 68 | split_instances, 69 | ) -------------------------------------------------------------------------------- /swebench/harness/remove_containers.py: -------------------------------------------------------------------------------- 1 | import json 2 | from argparse import ArgumentParser 3 | 4 | import docker 5 | 6 | """ 7 | Script for removing containers associated with specified instance IDs. 
8 | """ 9 | 10 | def main(instance_ids, predictions_path): 11 | all_ids = set() 12 | if predictions_path: 13 | with open(predictions_path, "r") as f: 14 | predictions = json.loads(f.read()) 15 | for pred in predictions: 16 | all_ids.add(pred["instance_id"]) 17 | 18 | if instance_ids: 19 | all_ids |= set(instance_ids) 20 | 21 | if not all_ids: 22 | print("No instance IDs provided, exiting.") 23 | return 24 | 25 | for instance_id in all_ids: 26 | try: 27 | client = docker.from_env() 28 | container = client.containers.get(f"sweb.eval.{instance_id}") 29 | container.stop() 30 | container.remove() 31 | print(f"Removed container {instance_id}") 32 | except docker.errors.NotFound: 33 | print(f"Container {instance_id} not found, skipping.") 34 | except Exception as e: 35 | print(f"Error removing container {instance_id}: {e}") 36 | continue 37 | 38 | 39 | if __name__ == "__main__": 40 | parser = ArgumentParser(description=__doc__) 41 | parser.add_argument( 42 | "--instance_ids", 43 | help="Instance IDs to remove containers for", 44 | ) 45 | parser.add_argument( 46 | "--predictions_path", 47 | help="Path to predictions file", 48 | ) 49 | args = parser.parse_args() 50 | instance_ids = [i.strip() for i in args.instance_ids.split(",")] if args.instance_ids else [] 51 | main( 52 | instance_ids=instance_ids, 53 | predictions_path=args.predictions_path, 54 | ) 55 | -------------------------------------------------------------------------------- /swebench/versioning/extract_web/get_versions_xarray.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import requests 5 | import sys 6 | 7 | from datetime import datetime 8 | 9 | sys.path.append("../../harness") 10 | from utils import get_instances 11 | 12 | PATH_TASKS_XARRAY = "" 13 | 14 | # Get raw xarray dataset 15 | data_tasks = get_instances(PATH_TASKS_XARRAY) 16 | 17 | # Get version to date from xarray home page 18 | resp = requests.get("https://docs.xarray.dev/en/stable/whats-new.html") 19 | pattern = ( 20 | r'v(.*) \((.*)\)' 21 | ) 22 | matches = re.findall(pattern, resp.text) 23 | matches = list(set(matches)) 24 | matches = [x[1:] for x in matches] 25 | 26 | # Get (date, version) pairs 27 | date_formats = ["%B %d %Y", "%d %B %Y"] 28 | keep_major_minor = lambda x, sep: ".".join(x.strip().split(sep)[:2]) 29 | 30 | times = [] 31 | for match in matches: 32 | parts = match[0].split("-") 33 | version = keep_major_minor(".".join(parts[0:3]), ".") 34 | date_str = " ".join(parts[3:]) 35 | 36 | for f_ in date_formats: 37 | try: 38 | date_obj = datetime.strptime(date_str, f_) 39 | times.append((date_obj.strftime("%Y-%m-%d"), version)) 40 | except: 41 | continue 42 | break 43 | 44 | times = sorted(times, key=lambda x: x[0])[::-1] 45 | 46 | for task in data_tasks: 47 | created_at = task["created_at"].split("T")[0] 48 | found = False 49 | for t in times: 50 | if t[0] < created_at: 51 | task["version"] = t[1] 52 | found = True 53 | break 54 | if not found: 55 | task["version"] = None 56 | 57 | # Save xarray versioned data to repository 58 | with open( 59 | os.path.join(PATH_TASKS_XARRAY, "xarray-task-instances_versions.json"), 60 | "w", 61 | ) as f: 62 | json.dump(data_tasks, fp=f) 63 | -------------------------------------------------------------------------------- /swebench/versioning/README.md: -------------------------------------------------------------------------------- 1 | # Versioning 2 | To enable execution based evaluation, SWE-bench assigns each task instances a `version` (with 
respect to its repository), where the `version` is then a key for the installation instructions. 3 | 4 | This folder contains code for assigning the version of a task instance based on its repository. 5 | 6 | ## 🔧 General Purpose 7 | `get_versions.py` script is a general purpose tool for getting version from either A. reading the GitHub repository or B. from building the repository locally and locating the appropriate version files. 8 | Given a list of candidate task instances, the script assigns each task instance a new `version: ` key/value pair. 9 | 10 | This script can be invoked via the `./run_get_version.sh` script, where the arguments are: 11 | ``` 12 | python get_versions.py \ 13 | --instances_path [Required] [folder] Patch to candidate task instances \ 14 | --retrieval_method [Required] [choice] Method to retrieve versions ("build", "mix", or "github") \ 15 | --cleanup [Required] [bool] Remove testbed and conda environments upon task completion \ 16 | --conda_env [Required] [str] Name of conda environment to run task installation within \ 17 | --num_workers [Required] [int] Number of processes to parallelize on \ 18 | --path_conda [Required] [folder] Path to miniconda or anaconda installation \ 19 | --output_dir [Required] [folder] Path to directory to write versioned task instances to (overwrite by default) \ 20 | --testbed [Required] [folder] Path to testbed directory, for cloning GitHub repos to 21 | ``` 22 | 23 | ## 🌐 Repository Website-Based 24 | The `extract_web/get_versions_*.py` files are repository specific scripts that crawl the website of the PyPI package to find versions and their cut off dates. 25 | This script can be easily adapted to other repositories to check task instances' `creation_date` against the version dates. -------------------------------------------------------------------------------- /swebench/versioning/extract_web/get_versions_matplotlib.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import requests 5 | import sys 6 | 7 | from datetime import datetime 8 | 9 | sys.path.append("../../harness") 10 | from utils import get_instances 11 | 12 | PATH_TASKS_MATPLOTLIB = "" 13 | 14 | # Get raw matplotlib dataset 15 | data_tasks = get_instances(PATH_TASKS_MATPLOTLIB) 16 | 17 | # Get version to date from matplotlib home page 18 | resp = requests.get("https://matplotlib.org/stable/users/release_notes#past-versions") 19 | pattern = r'What\'s new in Matplotlib (.*)' 20 | matches = re.findall(pattern, resp.text) 21 | matches = list(set(matches)) 22 | 23 | # Get (date, version) pairs 24 | date_format = "%b %d, %Y" 25 | keep_major_minor = lambda x, sep: ".".join(x.strip().split(sep)[:2]) 26 | 27 | times = [] 28 | for match in matches: 29 | version, s = match[0], match[1] 30 | if "(" not in s: 31 | continue 32 | version = keep_major_minor(version, ".") 33 | date_string = s[s.find("(") + 1 : s.find(")")] 34 | date_obj = datetime.strptime(date_string, date_format) 35 | times.append((date_obj.strftime("%Y-%m-%d"), version)) 36 | times = sorted(times, key=lambda x: x[0])[::-1] 37 | 38 | for task in data_tasks: 39 | created_at = task["created_at"].split("T")[0] 40 | for t in times: 41 | if t[0] < created_at: 42 | task["version"] = t[1] 43 | break 44 | 45 | # Construct map of versions to task instances 46 | map_v_to_t = {} 47 | for t in data_tasks: 48 | if t["version"] not in map_v_to_t: 49 | map_v_to_t[t["version"]] = [] 50 | map_v_to_t[t["version"]].append(t) 51 | 52 | # Save matplotlib 
versioned data to repository 53 | with open( 54 | os.path.join(PATH_TASKS_MATPLOTLIB, "matplotlib-task-instances_versions.json"), 55 | "w", 56 | ) as f: 57 | json.dump(data_tasks, fp=f) 58 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('README.md', 'r', encoding='utf-8') as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name='swebench', 8 | author='John Yang', 9 | author_email='byjohnyang@gmail.com', 10 | description='The official SWE-bench package - a benchmark for evaluating LMs on software engineering', 11 | keywords='nlp, benchmark, code', 12 | long_description=long_description, 13 | long_description_content_type='text/markdown', 14 | url='https://swebench.com', 15 | project_urls={ 16 | 'Documentation': 'https://github.com/princeton-nlp/SWE-bench', 17 | 'Bug Reports': 'http://github.com/princeton-nlp/SWE-bench/issues', 18 | 'Source Code': 'http://github.com/princeton-nlp/SWE-bench', 19 | 'Website': 'https://swebench.com', 20 | }, 21 | packages=setuptools.find_packages(), 22 | classifiers=[ 23 | 'Programming Language :: Python :: 3.8', 24 | 'Programming Language :: Python :: 3.9', 25 | 'Programming Language :: Python :: 3 :: Only', 26 | 'License :: OSI Approved :: MIT License', 27 | 'Operating System :: OS Independent', 28 | ], 29 | python_requires='>=3.8', 30 | install_requires=[ 31 | 'beautifulsoup4', 32 | 'chardet', 33 | 'datasets', 34 | 'docker', 35 | 'ghapi', 36 | 'GitPython', 37 | 'pre-commit', 38 | 'python-dotenv', 39 | 'requests', 40 | 'rich', 41 | 'unidiff', 42 | 'tqdm', 43 | ], 44 | extras_require={ 45 | 'inference': [ 46 | 'tiktoken', 47 | 'openai', 48 | 'anthropic', 49 | 'transformers', 50 | 'peft', 51 | 'sentencepiece', 52 | 'protobuf', 53 | 'torch', 54 | 'flash_attn', 55 | 'triton', 56 | 'jedi', 57 | 'tenacity', 58 | ], 59 | }, 60 | include_package_data=True, 61 | ) -------------------------------------------------------------------------------- /swebench/collect/cleanup/delete_gh_workflows.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import subprocess 6 | 7 | 8 | def main(repo_url): 9 | """ 10 | Remove .github/workflows folder from all branches of a repo 11 | 12 | Args: 13 | repo_url (str): URL of the target repo 14 | """ 15 | # Get list of remote branches 16 | branches_command = subprocess.run( 17 | ["git", "ls-remote", "--heads", repo_url], capture_output=True, text=True 18 | ) 19 | branches = branches_command.stdout.strip().split("\n") 20 | branches = [branch.split()[1] for branch in branches] 21 | subprocess.run( 22 | ["git", "clone", repo_url, "temp_repo"], 23 | stderr=subprocess.DEVNULL, 24 | stdout=subprocess.DEVNULL, 25 | ) 26 | 27 | # Iterate through all branches 28 | os.chdir("temp_repo") 29 | for branch in branches: 30 | # Switch to branch 31 | print(f"--------------\nProcessing branch: {branch}") 32 | branch = branch.split("/")[-1] 33 | subprocess.run(["git", "checkout", branch]) 34 | 35 | workflows_path = os.path.join(".github", "workflows") 36 | if os.path.exists(workflows_path): 37 | # Remove .github/workflows folder if it exists 38 | print(f"Deleting .github/workflows folder from branch: {branch}") 39 | subprocess.run(["rm", "-rf", workflows_path]) 40 | subprocess.run(["git", "add", "-A"]) 41 | subprocess.run(["git", "commit", "-m", "Remove .github/workflows 
folder"]) 42 | subprocess.run(["git", "push"]) 43 | else: 44 | print(f".github/workflows folder not found in branch: {branch}") 45 | 46 | os.chdir("..") 47 | subprocess.run(["rm", "-rf", "temp_repo"]) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("--repo_url", type=str, required=True) 53 | args = parser.parse_args() 54 | main(**vars(args)) 55 | -------------------------------------------------------------------------------- /swebench/harness/dockerfiles.py: -------------------------------------------------------------------------------- 1 | # IF you change the base image, you need to rebuild all images (run with --force_rebuild) 2 | _DOCKERFILE_BASE = r""" 3 | FROM --platform={platform} ubuntu:22.04 4 | 5 | ARG DEBIAN_FRONTEND=noninteractive 6 | ENV TZ=Etc/UTC 7 | 8 | RUN apt update && apt install -y \ 9 | wget \ 10 | git \ 11 | build-essential \ 12 | libffi-dev \ 13 | libtiff-dev \ 14 | python3 \ 15 | python3-pip \ 16 | python-is-python3 \ 17 | jq \ 18 | curl \ 19 | locales \ 20 | locales-all \ 21 | tzdata \ 22 | && rm -rf /var/lib/apt/lists/* 23 | 24 | # Download and install conda 25 | RUN wget 'https://repo.anaconda.com/miniconda/Miniconda3-py311_24.7.1-0-Linux-x86_64.sh' -O miniconda.sh \ 26 | && bash miniconda.sh -b -p /opt/miniconda3 27 | # Add conda to PATH 28 | ENV PATH=/opt/miniconda3/bin:$PATH 29 | # Add conda to shell startup scripts like .bashrc (DO NOT REMOVE THIS) 30 | RUN conda init --all 31 | RUN conda config --append channels conda-forge 32 | 33 | RUN adduser --disabled-password --gecos 'dog' nonroot 34 | """ 35 | 36 | _DOCKERFILE_ENV = r"""FROM --platform={platform} sweb.base.{arch}:latest 37 | 38 | COPY ./setup_env.sh /root/ 39 | RUN chmod +x /root/setup_env.sh 40 | RUN /bin/bash -c "source ~/.bashrc && /root/setup_env.sh" 41 | 42 | WORKDIR /testbed/ 43 | 44 | # Automatically activate the testbed environment 45 | RUN echo "source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed" > /root/.bashrc 46 | """ 47 | 48 | _DOCKERFILE_INSTANCE = r"""FROM --platform={platform} {env_image_name} 49 | 50 | COPY ./setup_repo.sh /root/ 51 | RUN /bin/bash /root/setup_repo.sh 52 | 53 | WORKDIR /testbed/ 54 | """ 55 | 56 | 57 | def get_dockerfile_base(platform, arch): 58 | if arch == "arm64": 59 | conda_arch = "aarch64" 60 | else: 61 | conda_arch = arch 62 | return _DOCKERFILE_BASE.format(platform=platform, conda_arch=conda_arch) 63 | 64 | 65 | def get_dockerfile_env(platform, arch): 66 | return _DOCKERFILE_ENV.format(platform=platform, arch=arch) 67 | 68 | 69 | def get_dockerfile_instance(platform, env_image_name): 70 | return _DOCKERFILE_INSTANCE.format(platform=platform, env_image_name=env_image_name) 71 | -------------------------------------------------------------------------------- /swebench/collect/make_repo/make_repo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Mirror repository to https://github.com/swe-bench 4 | # Usage make_repo.sh {gh organization}/{gh repository} 5 | 6 | # Abort on error 7 | set -euo pipefail 8 | 9 | REPO_TARGET=$1 10 | 11 | # Check if the target repository exists 12 | gh repo view "$REPO_TARGET" > /dev/null || exit 1 13 | 14 | # Set the organization and repository names 15 | ORG_NAME="swe-train" 16 | NEW_REPO_NAME="${REPO_TARGET//\//__}" 17 | 18 | # Check if the new repository already exists 19 | # gh repo view "$ORG_NAME/$NEW_REPO_NAME" > /dev/null 2>&1 20 | # if [ $? 
-eq 0 ]; then 21 | # echo "The repository $ORG_NAME/$NEW_REPO_NAME already exists." 22 | # exit 1 23 | # else 24 | # # Create mirror repository 25 | gh repo create "$ORG_NAME/$NEW_REPO_NAME" --private 26 | # fi 27 | 28 | # Check if the repository creation was successful 29 | if [ $? -eq 0 ]; then 30 | echo "** Repository created successfully at $ORG_NAME/$NEW_REPO_NAME." 31 | else 32 | echo "Failed to create the repository." 33 | exit 1 34 | fi 35 | 36 | # Clone the target repository 37 | echo "** Cloning $REPO_TARGET..." 38 | TARGET_REPO_DIR="${REPO_TARGET##*/}.git" 39 | 40 | # Check if the local repository directory already exists 41 | if [ -d "$TARGET_REPO_DIR" ]; then 42 | echo "The local repository directory $TARGET_REPO_DIR already exists." 43 | exit 1 44 | fi 45 | 46 | git clone --bare git@github.com:$REPO_TARGET.git 47 | 48 | # Push files to the mirror repository 49 | echo "** Performing mirror push of files to $ORG_NAME/$NEW_REPO_NAME..." 50 | cd "$TARGET_REPO_DIR"; git push --mirror git@github.com:$ORG_NAME/$NEW_REPO_NAME 51 | 52 | # Remove the target repository 53 | cd ..; rm -rf "$TARGET_REPO_DIR" 54 | 55 | # Clone the mirror repository 56 | git clone git@github.com:$ORG_NAME/$NEW_REPO_NAME.git 57 | 58 | # Delete .github/workflows if it exists 59 | if [ -d "$NEW_REPO_NAME/.github/workflows" ]; then 60 | # Remove the directory 61 | rm -rf "$NEW_REPO_NAME/.github/workflows" 62 | 63 | # Commit and push the changes 64 | cd "$NEW_REPO_NAME"; 65 | git add -A; 66 | git commit -m "Removed .github/workflows"; 67 | git push origin main; # Change 'master' to your desired branch 68 | cd ..; 69 | else 70 | echo "$REPO_NAME/.github/workflows does not exist. No action required." 71 | fi 72 | 73 | rm -rf "$NEW_REPO_NAME" 74 | -------------------------------------------------------------------------------- /swebench/versioning/extract_web/get_versions_astropy.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import requests 5 | import sys 6 | 7 | from datetime import datetime 8 | 9 | sys.path.append("../../harness") 10 | from utils import get_instances 11 | 12 | PATH_TASKS_ASTROPY = "" 13 | 14 | # Get raw astropy dataset 15 | data_tasks = get_instances(PATH_TASKS_ASTROPY) 16 | 17 | # Get version to date from astropy homepage 18 | resp = requests.get("https://docs.astropy.org/en/latest/changelog.html") 19 | pattern = ( 20 | r'Version (.*)' 21 | ) 22 | matches = re.findall(pattern, resp.text) 23 | matches = list(set(matches)) 24 | 25 | # Get (date, version) pairs 26 | date_format = "%Y-%m-%d" 27 | keep_major_minor = lambda x, sep: ".".join(x.strip().split(sep)[:2]) 28 | 29 | # Iterate through matches, construct (version, date) pairs 30 | times = [] 31 | for match in matches: 32 | match_parts = match[1].split(" ") 33 | version, date = match_parts[0], match_parts[1].strip(")").strip("(") 34 | version = keep_major_minor(version, ".") 35 | date_obj = datetime.strptime(date, date_format) 36 | times.append((date_obj.strftime("%Y-%m-%d"), version)) 37 | 38 | # Group times by major/minor version 39 | map_version_to_times = {} 40 | for time in times: 41 | if time[1] not in map_version_to_times: 42 | map_version_to_times[time[1]] = [] 43 | map_version_to_times[time[1]].append(time[0]) 44 | 45 | # Pick the most recent time as the version cut off date 46 | version_to_time = [(k, max(v)) for k, v in map_version_to_times.items()] 47 | version_to_time = sorted(version_to_time, key=lambda x: x[0])[::-1] 48 | 49 | # Assign version 
to each task instance 50 | for task in data_tasks: 51 | created_at = task["created_at"].split("T")[0] 52 | for t in version_to_time: 53 | found = False 54 | if t[1] < created_at: 55 | task["version"] = t[0] 56 | found = True 57 | break 58 | if not found: 59 | task["version"] = version_to_time[-1][0] 60 | 61 | # Construct map of versions to task instances 62 | map_v_to_t = {} 63 | for task in data_tasks: 64 | if task["version"] not in map_v_to_t: 65 | map_v_to_t[task["version"]] = [] 66 | map_v_to_t[task["version"]].append(t) 67 | 68 | # Save matplotlib versioned data to repository 69 | with open( 70 | os.path.join(PATH_TASKS_ASTROPY, "astropy-task-instances_versions.json"), 71 | "w", 72 | ) as f: 73 | json.dump(data_tasks, fp=f) 74 | -------------------------------------------------------------------------------- /scripts/eval/convert_od_output_to_swe_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import pandas as pd 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('od_output_file', type=str) 8 | args = parser.parse_args() 9 | output_filepath = args.od_output_file.replace('.jsonl', '.swebench.jsonl') 10 | print(f'Converting {args.od_output_file} to {output_filepath}') 11 | 12 | od_format = pd.read_json(args.od_output_file, orient='records', lines=True) 13 | # model name is the folder name of od_output_file 14 | model_name = os.path.basename(os.path.dirname(args.od_output_file)) 15 | 16 | 17 | def process_git_patch(patch): 18 | if not isinstance(patch, str): 19 | return '' 20 | 21 | if not patch.strip(): 22 | # skip empty patches 23 | return '' 24 | 25 | patch = patch.replace('\r\n', '\n') 26 | # There might be some weird characters at the beginning of the patch 27 | # due to some OpenHands inference command outputs 28 | 29 | # FOR EXAMPLE: 30 | # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90 31 | # 0 32 | # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py 33 | # new file mode 100644 34 | # index 0000000000..fc13db5948 35 | 36 | # We "find" the first line that starts with "diff" and then we remove lines before it 37 | lines = patch.split('\n') 38 | for i, line in enumerate(lines): 39 | if line.startswith('diff --git'): 40 | patch = '\n'.join(lines[i:]) 41 | break 42 | 43 | patch = patch.rstrip() + '\n' # Make sure the last line ends with a newline 44 | return patch 45 | 46 | 47 | def convert_row_to_swebench_format(row): 48 | if 'git_patch' in row: 49 | model_patch = row['git_patch'] 50 | elif 'test_result' in row and 'git_patch' in row['test_result']: 51 | model_patch = row['test_result']['git_patch'] 52 | else: 53 | raise ValueError(f'Row {row} does not have a git_patch') 54 | 55 | return { 56 | 'instance_id': row['instance_id'], 57 | 'model_patch': process_git_patch(model_patch), 58 | 'model_name_or_path': model_name, 59 | } 60 | 61 | 62 | swebench_format = od_format.apply(convert_row_to_swebench_format, axis=1) 63 | swebench_format.to_json(output_filepath, lines=True, orient='records') 64 | -------------------------------------------------------------------------------- /swebench/versioning/extract_web/get_versions_pvlib-python.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import requests 4 | import sys 5 | 6 | from datetime import datetime 7 | 8 | sys.path.append("/n/fs/nlp-jy1682/swe-bench/public/harness") 9 | from utils import get_instances 10 | 
sys.path = sys.path[:-1] 11 | 12 | PATH_TASKS_PVLIB = "" 13 | PATH_TASKS_PVLIB_V = "" 14 | WEBPAGE = "https://pvlib-python.readthedocs.io/en/stable/whatsnew.html" 15 | PATTERN = r'\n\s+v(.*)\n\s+<\/a>' 16 | DATE_FORMAT = "%B %d, %Y" 17 | 18 | # Get raw astropy dataset 19 | data_tasks = get_instances(PATH_TASKS_PVLIB) 20 | 21 | # Get version to date from astropy homepage 22 | resp = requests.get(WEBPAGE) 23 | matches = re.findall(PATTERN, resp.text) 24 | matches = list(set(matches)) 25 | 26 | # Get (date, version) pairs 27 | keep_major_minor = lambda x, sep: ".".join(x.strip().split(sep)[:2]) 28 | 29 | # Iterate through matches, construct (version, date) pairs 30 | times = [] 31 | for match in matches: 32 | match_parts = match[1].split(" (") 33 | version = '.'.join(match_parts[0].split('.')[:-1]) 34 | date = match_parts[1].strip(')').strip('(') 35 | date_obj = datetime.strptime(date, DATE_FORMAT) 36 | times.append((date_obj.strftime("%Y-%m-%d"), version)) 37 | 38 | # Group times by major/minor version 39 | map_version_to_times = {} 40 | for time in times: 41 | if time[1] not in map_version_to_times: 42 | map_version_to_times[time[1]] = [] 43 | map_version_to_times[time[1]].append(time[0]) 44 | 45 | # Pick the most recent time as the version cut off date 46 | version_to_time = [(k, max(v)) for k, v in map_version_to_times.items()] 47 | version_to_time = sorted(version_to_time, key=lambda x: x[0])[::-1] 48 | 49 | # Assign version to each task instance 50 | for task in data_tasks: 51 | created_at = task["created_at"].split("T")[0] 52 | for t in version_to_time: 53 | found = False 54 | if t[1] < created_at: 55 | task["version"] = t[0] 56 | found = True 57 | break 58 | if not found: 59 | task["version"] = version_to_time[-1][0] 60 | 61 | # Construct map of versions to task instances 62 | map_v_to_t = {} 63 | for task in data_tasks: 64 | if task["version"] not in map_v_to_t: 65 | map_v_to_t[task["version"]] = [] 66 | map_v_to_t[task["version"]].append(t) 67 | 68 | # Save matplotlib versioned data to repository 69 | with open(PATH_TASKS_PVLIB_V, "w") as f: 70 | json.dump(data_tasks, fp=f) -------------------------------------------------------------------------------- /swebench/collect/make_lite/make_lite.py: -------------------------------------------------------------------------------- 1 | from criteria import ( 2 | contains_git_commit_hash, 3 | contains_hyperlinks, 4 | contains_image, 5 | contains_issue_reference, 6 | contains_non_modified_files, 7 | contains_pytest_match_arg, 8 | leq_n_code_lines, 9 | leq_n_files, 10 | leq_n_hunks, 11 | leq_n_words, 12 | ) 13 | from datasets import load_dataset, disable_caching, DatasetDict 14 | disable_caching() 15 | 16 | 17 | def filter_problem_statement(instance): 18 | problem_statement = instance["problem_statement"] 19 | repo = instance["repo"] 20 | if leq_n_words(problem_statement, 40) or \ 21 | contains_hyperlinks(problem_statement, repo) or \ 22 | contains_issue_reference(problem_statement, repo) or \ 23 | contains_git_commit_hash(problem_statement) or \ 24 | contains_image(problem_statement): 25 | return False 26 | return True 27 | 28 | 29 | def filter_patch(instance): 30 | patch_text = instance["patch"] 31 | if contains_non_modified_files(patch_text) or \ 32 | not leq_n_files(patch_text, 1) or \ 33 | not leq_n_hunks(patch_text, 3): 34 | return False 35 | return True 36 | 37 | 38 | def filter_patch_test(instance): 39 | patch_text = instance["test_patch"] 40 | if contains_pytest_match_arg(patch_text): 41 | return False 42 | return True 43 | 44 | 45 | 
def apply_filters(dset, filters, name=''): 46 | print(f'Starting with {len(dset)} instances', end='') 47 | if name: 48 | print(f' for {name}.') 49 | else: 50 | print('.') 51 | for _filter in filters: 52 | dset = dset.filter(_filter, desc=f'Applying {_filter.__name__}') 53 | print(f'After filtering {len(dset)}.') 54 | return dset 55 | 56 | 57 | def take_subset(dset, n, name=''): 58 | dset = dset.sort("instance_id") 59 | print(f'Starting with {len(dset)} instances', end='') 60 | if name: 61 | print(f' for {name}.') 62 | else: 63 | print('.') 64 | dset = dset.shuffle(seed=42).select(range(n)) 65 | print(f'Sampled {len(dset)} instances.') 66 | return dset 67 | 68 | 69 | if __name__ == "__main__": 70 | # Load the dataset 71 | dev = load_dataset("princeton-nlp/SWE-bench")['dev'] 72 | test = load_dataset("princeton-nlp/SWE-bench")['test'] 73 | 74 | test = apply_filters(test, [filter_problem_statement, filter_patch, filter_patch_test], 'test') 75 | test = take_subset(test, 300, 'test') 76 | dev = apply_filters(dev, [filter_problem_statement, filter_patch, filter_patch_test], 'dev') 77 | dset = DatasetDict({'dev': dev, 'test': test}) 78 | # Save the filtered dataset to disk 79 | dset.save_to_disk("SWE-bench_lite") 80 | print("Saved to SWE-bench_lite.") -------------------------------------------------------------------------------- /swebench/inference/llamao/distributed_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import torch 7 | 8 | from typing import Any 9 | from torch import Tensor 10 | from torch.nn import Module 11 | 12 | import torch.distributed as dist 13 | 14 | class SeqAllToAll(torch.autograd.Function): 15 | @staticmethod 16 | def forward(ctx: Any, input: Tensor, scatter_idx: int, gather_idx: int, group: Any) -> Tensor: 17 | ctx.scatter_idx = scatter_idx 18 | ctx.gather_idx = gather_idx 19 | ctx.group = group 20 | 21 | world_size = dist.get_world_size(group) 22 | 23 | input_list = [t.contiguous() for t in torch.tensor_split(input, world_size, scatter_idx)] 24 | output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)] 25 | 26 | dist.all_to_all(output_list, input_list, group=group) 27 | return torch.cat(output_list, dim=gather_idx).contiguous() 28 | 29 | @staticmethod 30 | def backward(ctx: Any, *grad_output: Tensor) -> tuple[Tensor, None, None, None]: 31 | return (SeqAllToAll.apply(*grad_output, ctx.gather_idx, ctx.scatter_idx, ctx.group), None, None, None) 32 | 33 | 34 | class DistributedAttention(torch.nn.Module): 35 | """Initialization. 
36 | 37 | Arguments: 38 | local_attention (Module): local attention with q,k,v 39 | scatter_idx (int): scatter_idx for all2all comm 40 | gather_idx (int): gather_idx for all2all comm 41 | """ 42 | 43 | def __init__( 44 | self, 45 | local_attention: Module, 46 | scatter_idx: int = -2, 47 | gather_idx: int = 1, 48 | ) -> None: 49 | 50 | super().__init__() 51 | self.local_attn = local_attention 52 | self.scatter_idx = scatter_idx # head axis 53 | self.gather_idx = gather_idx # seq axis 54 | 55 | def forward(self, query: Tensor, key_values: Tensor, group: Any = None, **kwargs) -> Tensor: 56 | """ forward 57 | 58 | Arguments: 59 | query (Tensor): query input to the layer 60 | key (Tensor): key input to the layer 61 | value (Tensor): value input to the layer 62 | args: other args 63 | 64 | Returns: 65 | * output (Tensor): context output 66 | """ 67 | #in shape : e.g., [s/p:h:] 68 | query_heads = SeqAllToAll.apply(query, self.scatter_idx, self.gather_idx, group) 69 | key_values_heads = SeqAllToAll.apply(key_values, self.scatter_idx, self.gather_idx, group) 70 | 71 | #out shape : e.g., [s:h/p:] 72 | output_heads = self.local_attn(query_heads, key_values_heads, **kwargs) 73 | 74 | #out e.g., [s/p::h] 75 | return SeqAllToAll.apply(output_heads, self.gather_idx, self.scatter_idx, group) -------------------------------------------------------------------------------- /docs/20240415_eval_bug/sweep_conda_links.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | """ 5 | This script is used to sweep through a list of conda links and run the evaluation script on each one. 6 | 7 | It was originally invoked from the swebench/harness/ folder. 8 | """ 9 | 10 | conda_links = [ 11 | "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.9.0-0-Linux-x86_64.sh", 12 | "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Linux-x86_64.sh", 13 | "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.9.0-0-Linux-x86_64.sh", 14 | "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.10.0-1-Linux-x86_64.sh", 15 | "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.10.0-1-Linux-x86_64.sh", 16 | "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.10.0-1-Linux-x86_64.sh", 17 | "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.10.0-1-Linux-x86_64.sh", 18 | "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-1-Linux-x86_64.sh", 19 | "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-1-Linux-x86_64.sh", 20 | "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-1-Linux-x86_64.sh", 21 | "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.11.0-1-Linux-x86_64.sh", 22 | "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-2-Linux-x86_64.sh", 23 | "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-Linux-x86_64.sh", 24 | "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh", 25 | "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.11.0-2-Linux-x86_64.sh", 26 | ] 27 | 28 | for conda_link in conda_links: 29 | version = conda_link.split("/")[-1]\ 30 | .split("-", 1)[1]\ 31 | .rsplit("-", 2)[0]\ 32 | .replace(".", "_")\ 33 | .replace("-", "_") 34 | os.makedirs(f"/n/fs/p-swe-bench/results/{version}/", exist_ok=True) 35 | 36 | cmd = ( 37 | "python evaluation.py " 38 | "--predictions_path /n/fs/p-swe-bench/data/original/gold_preds.jsonl " 39 | "--swe_bench_tasks /n/fs/p-swe-bench/data/original/swe-bench.json " 40 | f"--log_dir 
/n/fs/p-swe-bench/results/{version}/ " 41 | f"--conda_link {conda_link} " 42 | "--testbed /n/fs/p-swe-bench/testbed/ " 43 | "--timeout 1200 " 44 | "--verbose " 45 | ) 46 | 47 | # Run subprocess 48 | subprocess.run(cmd, shell=True) 49 | 50 | # Move results, scorecard to results/{version} log_dir 51 | subprocess.run( 52 | f"mv /n/fs/p-swe-bench/data/original/results.json /n/fs/p-swe-bench/results/{version}/results.json", 53 | shell=True 54 | ) 55 | subprocess.run( 56 | f"mv /n/fs/p-swe-bench/data/original/scorecard.json /n/fs/p-swe-bench/results/{version}/scorecard.json", 57 | shell=True 58 | ) 59 | 60 | # Clear testbed 61 | subprocess.run(f"rm -rf /n/fs/p-swe-bench/testbed/*", shell=True) 62 | -------------------------------------------------------------------------------- /swebench/inference/make_datasets/eval_retrieval.py: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env python 3 | 4 | """This script can be used to evaluate the BM25 retrieval results for a dataset created with create_text_dataset.py with the --retrieval_file option and --file_source bm25.""" 5 | 6 | import re 7 | import numpy as np 8 | from datasets import load_dataset, disable_caching, load_from_disk 9 | from argparse import ArgumentParser 10 | import logging 11 | 12 | disable_caching() 13 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def main(dataset_name_or_path, split): 18 | try: 19 | dataset = load_dataset(dataset_name_or_path, split=split) 20 | except: 21 | dataset = load_from_disk(dataset_name_or_path, split=split) 22 | print(f'Evaluating {len(dataset)} instances from {dataset_name_or_path} {split} split') 23 | instance_files_pattern = re.compile(r'\[start of ([\w\.\-\/]+)\]\n(?:.+?)\n\[end of \1\]', re.DOTALL) 24 | patch_files_pattern = re.compile(r'\-\-\- a/(.+)') 25 | patch_files = {instance['instance_id']: instance['patch'] for instance in dataset} 26 | recalls_any = list() 27 | recalls_all = list() 28 | recalls = list() 29 | for datum in dataset: 30 | instance_id = datum['instance_id'] 31 | retrieved_files = instance_files_pattern.findall(datum['text']) 32 | if retrieved_files and 'readme' in retrieved_files[0].lower(): 33 | retrieved_files = retrieved_files[1:] # first file is usually the readme, we don't want to count that 34 | retrieved_files = set(retrieved_files) 35 | gold_files = set(patch_files_pattern.findall(patch_files[instance_id])) 36 | if len(gold_files) == 0: 37 | print(f"WARNING: Instance {datum['instance_id']} has no gold files") 38 | continue 39 | if len(retrieved_files) == 0: 40 | print(f"WARNING: Instance {datum['instance_id']} has no retrieved files") 41 | continue 42 | recall = len(retrieved_files.intersection(gold_files)) / len(gold_files) 43 | recalls.append(recall) 44 | recalls_any.append(int(recall > 0)) 45 | recalls_all.append(int(recall == 1)) 46 | recalls = np.array(recalls) 47 | recalls_any = np.array(recalls_any) 48 | recalls_all = np.array(recalls_all) 49 | print(f"Avg Recall: {np.mean(recalls)*100:.2f}") 50 | print(f"All Recall: {np.mean(recalls_all)*100:.2f}") 51 | print(f"Any Recall: {np.mean(recalls_any)*100:.2f}") 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = ArgumentParser(description=__doc__) 56 | parser.add_argument('--dataset_name_or_path', type=str, default='princeton-nlp/SWE-bench_bm25_13K') 57 | parser.add_argument('--split', type=str, default='test') 58 | args = parser.parse_args() 59 | 
main(**vars(args)) 60 | -------------------------------------------------------------------------------- /swebench/collect/print_pulls.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Given the `` of a GitHub repo, this script writes the raw information for all the repo's PRs to a single `.jsonl` file.""" 4 | 5 | from __future__ import annotations 6 | 7 | import argparse 8 | import json 9 | import logging 10 | import os 11 | 12 | from datetime import datetime 13 | from fastcore.xtras import obj2dict 14 | from swebench.collect.utils import Repo 15 | from typing import Optional 16 | 17 | logging.basicConfig( 18 | level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 19 | ) 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def log_all_pulls( 24 | repo: Repo, 25 | output: str, 26 | max_pulls: int = None, 27 | cutoff_date: str = None, 28 | ) -> None: 29 | """ 30 | Iterate over all pull requests in a repository and log them to a file 31 | 32 | Args: 33 | repo (Repo): repository object 34 | output (str): output file name 35 | """ 36 | cutoff_date = datetime.strptime(cutoff_date, "%Y%m%d") \ 37 | .strftime("%Y-%m-%dT%H:%M:%SZ") \ 38 | if cutoff_date is not None else None 39 | 40 | with open(output, "w") as file: 41 | for i_pull, pull in enumerate(repo.get_all_pulls()): 42 | setattr(pull, "resolved_issues", repo.extract_resolved_issues(pull)) 43 | print(json.dumps(obj2dict(pull)), end="\n", flush=True, file=file) 44 | if max_pulls is not None and i_pull >= max_pulls: 45 | break 46 | if cutoff_date is not None and pull.created_at < cutoff_date: 47 | break 48 | 49 | def main( 50 | repo_name: str, 51 | output: str, 52 | token: Optional[str] = None, 53 | max_pulls: int = None, 54 | cutoff_date: str = None, 55 | ): 56 | """ 57 | Logic for logging all pull requests in a repository 58 | 59 | Args: 60 | repo_name (str): name of the repository 61 | output (str): output file name 62 | token (str, optional): GitHub token 63 | """ 64 | if token is None: 65 | token = os.environ.get("GITHUB_TOKEN") 66 | owner, repo = repo_name.split("/") 67 | repo = Repo(owner, repo, token=token) 68 | log_all_pulls(repo, output, max_pulls=max_pulls, cutoff_date=cutoff_date) 69 | 70 | 71 | if __name__ == "__main__": 72 | parser = argparse.ArgumentParser(description=__doc__) 73 | parser.add_argument("repo_name", type=str, help="Name of the repository") 74 | parser.add_argument("output", type=str, help="Output file name") 75 | parser.add_argument("--token", type=str, help="GitHub token") 76 | parser.add_argument("--max_pulls", type=int, help="Maximum number of pulls to log", default=None) 77 | parser.add_argument("--cutoff_date", type=str, help="Cutoff date for PRs to consider in format YYYYMMDD", default=None) 78 | args = parser.parse_args() 79 | main(**vars(args)) 80 | -------------------------------------------------------------------------------- /swebench/collect/build_dataset_ft.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import json 6 | import os 7 | import random 8 | 9 | from tqdm import tqdm 10 | from datetime import datetime 11 | 12 | 13 | def main(instances_path: str, output_path: str, eval_path: str, seed: int): 14 | """ 15 | Combine all non-eval task instances into a single fine tuning dataset 16 | 17 | Args: 18 | instances_path (str): Path to directory containing all candidate task instances 19 | 
output_path (str): Path to save output fine tuning dataset to 20 | eval_path (str): Path to directory containing all eval task instances 21 | seed (int): Random seed 22 | """ 23 | # Define output file name 24 | random.seed(seed) 25 | SWE_PRS_FT_DATASET = ( 26 | f"SWE_PRS_FT_DATASET_{datetime.now().strftime('%Y%m%d%H')}_{seed}.jsonl" 27 | ) 28 | destination = os.path.join(output_path, SWE_PRS_FT_DATASET) 29 | total_insts, total_repos = 0, 0 30 | 31 | # Gather Evaluation Set Task Instances 32 | eval_instances = [] 33 | for x in glob.glob(os.path.join(eval_path, "*-task-instances.jsonl")): 34 | with open(x) as f: 35 | eval_instances.extend(f.readlines()) 36 | eval_instances = set(eval_instances) 37 | 38 | # Create fine tuning dataset 39 | with open(destination, "w") as f_out: 40 | for dataset_path in tqdm( 41 | glob.glob(os.path.join(instances_path, "*-task-instances.jsonl.all")) 42 | ): 43 | total_repos += 1 44 | with open(dataset_path) as f: 45 | lines = f.readlines() 46 | 47 | # Remove data from evaluation dataset 48 | lines = [line for line in lines if line not in eval_instances] 49 | 50 | # Shuffle lines 51 | random.shuffle(lines) 52 | 53 | # Keep 500 lines per dataset 54 | for line in lines[:500]: 55 | line = json.loads(line) 56 | if "test_patch" in line: 57 | del line["test_patch"] 58 | f_out.write(json.dumps(line) + "\n") 59 | total_insts += 1 60 | 61 | print( 62 | f"Fine tuning dataset saved to {destination} ({total_insts} instances from {total_repos} repos)" 63 | ) 64 | 65 | 66 | if __name__ == "__main__": 67 | parser = argparse.ArgumentParser() 68 | parser.add_argument( 69 | "--instances_path", 70 | type=str, 71 | help="Path to directory containing all candidate task instances", 72 | ) 73 | parser.add_argument( 74 | "--output_path", type=str, help="Path to save output fine tuning dataset to" 75 | ) 76 | parser.add_argument( 77 | "--eval_path", 78 | type=str, 79 | help="Path to directory containing all eval task instances", 80 | ) 81 | parser.add_argument("--seed", type=int, default=42, help="Random seed") 82 | args = parser.parse_args() 83 | main(**vars(args)) 84 | -------------------------------------------------------------------------------- /swebench/versioning/extract_web/get_versions_sqlfluff.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import sys 5 | from ghapi.core import GhApi 6 | 7 | sys.path.append("../../harness") 8 | from utils import get_instances 9 | 10 | GITHUB_TOKEN = "" 11 | PATH_TASKS_SQLFLUFF = "" 12 | PATH_TO_SAVE = "" 13 | 14 | # Get raw sqlfluff dataset 15 | data_tasks = get_instances(PATH_TASKS_SQLFLUFF) 16 | 17 | # Get all GitHub releases 18 | api = GhApi(token=GITHUB_TOKEN) 19 | 20 | releases, i = [], 0 21 | while True: 22 | temp = api.repos.list_releases('sqlfluff', 'sqlfluff', 100, i + 1) 23 | releases.extend(temp) 24 | if len(temp) < 100: 25 | break 26 | i += 1 27 | pairs = [(x['name'], x['published_at']) for x in releases] 28 | 29 | def process(x): 30 | """Extract version number from name""" 31 | if x.startswith('SQLFluff '): 32 | x = x[len('SQLFluff '):] 33 | pattern = re.compile(r'\[[\d\.\w]*\] - \d*-\d*-\d*') 34 | matches = pattern.findall(x) 35 | if len(matches) > 0: 36 | parts = x.split(' - ') 37 | version = parts[0].replace('[', '').replace(']', '') 38 | version = version.rsplit('.', 1)[0] 39 | return (version, parts[1]) 40 | 41 | pattern = re.compile(r'\d\.\d\.[\d\.]*') 42 | matches = pattern.findall(x) 43 | if len(matches) > 0: 44 | version = matches[0] 45 | 
version = version.rsplit('.', 1)[0] 46 | return (version, None) 47 | 48 | return (None, None) 49 | 50 | # Collect version/date pairs 51 | version_date_map = {} 52 | for pair in pairs: 53 | pair_rv = process(pair[0]) 54 | if pair_rv[0] == None: 55 | continue 56 | version = pair_rv[0] 57 | if version.startswith('Bugfix Release '): 58 | version = version[len('Bugfix Release '):] 59 | date = pair[1] if pair_rv[1] == None else pair_rv[1] 60 | if version in version_date_map: 61 | version_date_map[version] = max( 62 | version_date_map[version], 63 | date 64 | ) 65 | else: 66 | version_date_map[version] = date 67 | 68 | # Get (date, version) pairs 69 | times = [(v, k) for k, v in version_date_map.items()] 70 | times = sorted(times, key=lambda x: x[0])[::-1] 71 | 72 | # Iterate through data_tasks and assign versions 73 | for task in data_tasks: 74 | created_at = task["created_at"].split("T")[0] 75 | set_version = False 76 | for t in times: 77 | if t[0] < created_at: 78 | task["version"] = t[1] 79 | set_version = True 80 | break 81 | if not set_version: 82 | task["version"] = None 83 | 84 | # Save sqlfluff versioned data to repository 85 | versioned_path = "sqlfluff-task-instances_versions.json" 86 | with open( 87 | os.path.join(PATH_TO_SAVE, versioned_path), 88 | "w", 89 | ) as f: 90 | json.dump(data_tasks, fp=f) 91 | 92 | # Print all versions 93 | versioned = json.load(open(os.path.join(PATH_TO_SAVE, versioned_path))) 94 | print(sorted(list({t['version'] for t in versioned if t['version'] is not None}))) -------------------------------------------------------------------------------- /swebench/versioning/constants.py: -------------------------------------------------------------------------------- 1 | # Constants - Task Instance Version File 2 | MAP_REPO_TO_VERSION_PATHS = { 3 | "dbt-labs/dbt-core": ["core/dbt/version.py", "core/dbt/__init__.py"], 4 | "django/django": ["django/__init__.py"], 5 | "huggingface/transformers": ["src/transformers/__init__.py"], 6 | "marshmallow-code/marshmallow": ["src/marshmallow/__init__.py"], 7 | "mwaskom/seaborn": ["seaborn/__init__.py"], 8 | "pallets/flask": ["src/flask/__init__.py", "flask/__init__.py"], 9 | "psf/requests": ["requests/__version__.py", "requests/__init__.py", "src/requests/__version__.py"], 10 | "pyca/cryptography": [ 11 | "src/cryptography/__about__.py", 12 | "src/cryptography/__init__.py", 13 | ], 14 | "pylint-dev/astroid": ["astroid/__pkginfo__.py", "astroid/__init__.py"], 15 | "pylint-dev/pylint": ["pylint/__pkginfo__.py", "pylint/__init__.py"], 16 | "pytest-dev/pytest": ["src/_pytest/_version.py", "_pytest/_version.py" ], 17 | "pyvista/pyvista": ["pyvista/_version.py", "pyvista/__init__.py"], 18 | "Qiskit/qiskit": ["qiskit/VERSION.txt"], 19 | "scikit-learn/scikit-learn": ["sklearn/__init__.py"], 20 | "sphinx-doc/sphinx": ["sphinx/__init__.py"], 21 | "sympy/sympy": ["sympy/release.py", "sympy/__init__.py"], 22 | "facebookresearch/hydra": ["hydra/__init__.py"], 23 | 24 | } 25 | 26 | # Cosntants - Task Instance Version Regex Pattern 27 | MAP_REPO_TO_VERSION_PATTERNS = { 28 | k: [r'__version__ = [\'"](.*)[\'"]', r"VERSION = \((.*)\)"] 29 | for k in [ 30 | "dbt-labs/dbt-core", 31 | "django/django", 32 | "huggingface/transformers", 33 | "marshmallow-code/marshmallow", 34 | "mwaskom/seaborn", 35 | "pallets/flask", 36 | "psf/requests", 37 | "pyca/cryptography", 38 | "pylint-dev/astroid", 39 | "pylint-dev/pylint", 40 | "scikit-learn/scikit-learn", 41 | "sphinx-doc/sphinx", 42 | "sympy/sympy", 43 | "modin-project/modin", 44 | "facebookresearch/hydra" 
45 | ] 46 | } 47 | MAP_REPO_TO_VERSION_PATTERNS.update( 48 | { 49 | k: [ 50 | r'__version__ = [\'"](.*)[\'"]', 51 | r'__version__ = version = [\'"](.*)[\'"]', 52 | r"VERSION = \((.*)\)", 53 | ] 54 | for k in ["pytest-dev/pytest", "matplotlib/matplotlib"] 55 | } 56 | ) 57 | MAP_REPO_TO_VERSION_PATTERNS.update({k: [r"(.*)"] for k in ["Qiskit/qiskit"]}) 58 | MAP_REPO_TO_VERSION_PATTERNS.update({k: [r"version_info = [\d]+,[\d\s]+,"] for k in ["pyvista/pyvista"]}) 59 | 60 | SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/" 61 | 62 | # python/mypy 63 | MAP_REPO_TO_VERSION_PATHS.update({"python/mypy": ["mypy/version.py"]}) 64 | MAP_REPO_TO_VERSION_PATTERNS.update({"python/mypy": [r'__version__ = [\'"](.*)[\'"]', r"VERSION = \((.*)\)"]}) 65 | 66 | # getmoto/moto 67 | MAP_REPO_TO_VERSION_PATHS.update({"getmoto/moto": ["moto/__init__.py"]}) 68 | MAP_REPO_TO_VERSION_PATTERNS.update({"getmoto/moto": [r'__version__ = [\'"](.*)[\'"]', r"VERSION = \((.*)\)"]}) 69 | 70 | # conan-io/conan 71 | MAP_REPO_TO_VERSION_PATHS.update({"conan-io/conan": ["conans/__init__.py"]}) 72 | MAP_REPO_TO_VERSION_PATTERNS.update({"conan-io/conan": [r'__version__ = [\'"](.*)[\'"]', r"VERSION = \((.*)\)"]}) 73 | -------------------------------------------------------------------------------- /docs/20240415_eval_bug/check_harness.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "a5079bf3-97cd-40f3-ba6a-35cd662f7439", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import json\n", 11 | "\n", 12 | "from datasets import load_dataset\n", 13 | "from swebench import MAP_VERSION_TO_INSTALL" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "93b467a6-a450-49e7-9283-5bcb520f7f23", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "data = load_dataset(\"princeton-nlp/SWE-bench\", split=\"test\")\n", 24 | "\n", 25 | "# NOTE: We have not released the gold predictions, so this is just a placeholder and will not work\n", 26 | "golds = [json.loads(x) for x in open(\"gold_preds.jsonl\")]\n", 27 | "golds = {x['instance_id']: x for x in golds}" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "id": "889859d0-f5e7-4670-a133-0dc8fb1cdf75", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "repo_version_pairs = [\n", 38 | " (repo, version)\n", 39 | " for repo, version_map in MAP_VERSION_TO_INSTALL.items()\n", 40 | " for version in version_map.keys()\n", 41 | "]" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "id": "b9aaa3d4-4d8f-4b8c-b516-61c98ab0ccde", 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/plain": [ 53 | "126" 54 | ] 55 | }, 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "check_harness = []\n", 63 | "for repo, version in repo_version_pairs:\n", 64 | " subset = [x for x in data if x['repo'] == repo and x['version'] == version]\n", 65 | " if len(subset) == 0:\n", 66 | " continue\n", 67 | " subset = sorted(subset, key=lambda x: x['created_at'], reverse=False)\n", 68 | " inst_id = subset[-1]['instance_id']\n", 69 | " check_harness.append(golds[inst_id])\n", 70 | "len(check_harness)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 5, 76 | "id": "70591c07-8cc3-4f7f-800a-07eec5d4e5ff", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "with 
open(\"check-harness.jsonl\", \"w\") as f:\n", 81 | " for gold_pred in check_harness:\n", 82 | " print(json.dumps(gold_pred), file=f, flush=True)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "651f2c61-ed58-403c-ac6a-71f5375e8b59", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3 (ipykernel)", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.11.7" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 5 115 | } 116 | -------------------------------------------------------------------------------- /swebench/collect/cleanup/remove_envs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | import subprocess 6 | 7 | from multiprocessing import Pool 8 | 9 | 10 | def get_conda_env_names(output: str) -> list: 11 | """ 12 | Parse conda environments (`conda env list`) created for a particular conda installation 13 | 14 | Args: 15 | output (str): Output of `conda env list` command 16 | """ 17 | lines = output.split("\n") 18 | env_names = [] 19 | for line in lines: 20 | if line.startswith("#"): 21 | continue 22 | if line.strip() == "": 23 | continue 24 | if " " in line: 25 | env_name = line.split(" ")[0] 26 | env_names.append(env_name) 27 | return [x for x in env_names if len(x) > 0] 28 | 29 | 30 | def delete_folders_with_prefix(prefix, conda_path): 31 | """ 32 | Find and rm folders with a particular prefix in the conda installation's env folder 33 | 34 | Args: 35 | prefix (str): Prefix of folders to remove 36 | conda_path (str): Path to conda installation 37 | """ 38 | envs_folder = os.path.join(conda_path, "envs") 39 | command = f'find {envs_folder} -type d -name "{prefix}*" -exec rm -rf {{}} +' 40 | subprocess.run(command.split(" ")) 41 | 42 | 43 | def remove_environment(env_name, prefix): 44 | """ 45 | Remove all conda environments with a particular prefix from a conda installation 46 | """ 47 | if env_name.startswith(prefix): 48 | print(f"Removing {env_name}") 49 | conda_cmd = "conda remove -n " + env_name + " --all -y" 50 | cmd = conda_source + " && " + conda_cmd 51 | try: 52 | conda_create_output = subprocess.run(cmd.split(), check=True, capture_output=True, text=True) 53 | except subprocess.CalledProcessError as e: 54 | print(f"Error: {e}") 55 | print(f"Error output: {e.stderr}") 56 | raise e 57 | print(f"Output: {conda_create_output.stdout}") 58 | 59 | 60 | if __name__ == "__main__": 61 | """ 62 | Logic for removing conda environments and their folders from a conda installation 63 | """ 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument("prefix", type=str, help="Prefix for environments to delete") 66 | parser.add_argument( 67 | "--conda_path", 68 | type=str, 69 | help="Path to miniconda installation", 70 | ) 71 | args = parser.parse_args() 72 | 73 | # Remove conda environments with a specific prefix 74 | conda_source = "source " + os.path.join(args.conda_path, "etc/profile.d/conda.sh") 75 | check_env = conda_source + " && " + "conda env list" 76 | try: 77 | conda_envs = subprocess.run(check_env.split(" "), check=True, capture_output=True) 78 | 
except subprocess.CalledProcessError as e: 79 | print(f"Error: {e}") 80 | print(f"Error output: {e.stderr.decode('utf-8')}") 81 | raise e 82 | conda_envs_names = get_conda_env_names(conda_envs.stdout.decode("utf-8")) 83 | 84 | # Remove conda environments in parallel 85 | num_processes = 25 86 | pool = Pool(num_processes) 87 | pool.starmap( 88 | remove_environment, zip(conda_envs_names, [args.prefix] * len(conda_envs_names)) 89 | ) 90 | 91 | # Remove env folder with the same prefix 92 | print( 93 | f"Removing miniconda folder for environments with {args.prefix} from {args.conda_path}" 94 | ) 95 | delete_folders_with_prefix(args.prefix, args.conda_path) 96 | print(f"Done!") 97 | -------------------------------------------------------------------------------- /scripts/eval/update_output_with_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | from collections import defaultdict 5 | 6 | import pandas as pd 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('input_file', type=str) 10 | args = parser.parse_args() 11 | 12 | dirname = os.path.dirname(args.input_file) 13 | report_json = os.path.join(dirname, 'report.json') 14 | 15 | df = pd.read_json(args.input_file, lines=True) 16 | 17 | output_md_filepath = os.path.join(dirname, 'README.md') 18 | instance_id_to_status = defaultdict( 19 | lambda: {'resolved': False, 'empty_generation': False} 20 | ) 21 | if os.path.exists(report_json): 22 | with open(report_json, 'r') as f: 23 | report = json.load(f) 24 | 25 | output_md = ( 26 | "# SWE-bench Report\n" 27 | "This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n" 28 | "## Summary\n" 29 | f"- total instances: {report['total_instances']}\n" 30 | f"- submitted instances: {report['submitted_instances']}\n" 31 | f"- completed instances: {report['completed_instances']}\n" 32 | f"- empty patch instances: {report['empty_patch_instances']}\n" 33 | f"- resolved instances: {report['resolved_instances']}\n" 34 | f"- unresolved instances: {report['unresolved_instances']}\n" 35 | f"- error instances: {report['error_instances']}\n" 36 | f"- unstopped instances: {report['unstopped_instances']}\n" 37 | ) 38 | 39 | output_md += '\n## Resolved Instances\n' 40 | # instance_id to status 41 | for instance_id in report['resolved_ids']: 42 | instance_id_to_status[instance_id]['resolved'] = True 43 | output_md += ( 44 | f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n' 45 | ) 46 | 47 | output_md += '\n## Unresolved Instances\n' 48 | for instance_id in report['unresolved_ids']: 49 | output_md += ( 50 | f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n' 51 | ) 52 | 53 | output_md += '\n## Error Instances\n' 54 | for instance_id in report['error_ids']: 55 | instance_id_to_status[instance_id]['error_eval'] = True 56 | output_md += ( 57 | f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n' 58 | ) 59 | 60 | output_md += '\n## Empty Patch Instances\n' 61 | for instance_id in report['empty_patch_ids']: 62 | instance_id_to_status[instance_id]['empty_generation'] = True 63 | output_md += ( 64 | f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n' 65 | ) 66 | 67 | output_md += '\n## Incomplete Instances\n' 68 | for instance_id in report['incomplete_ids']: 69 | output_md += ( 
70 | f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n' 71 | ) 72 | 73 | # Apply the status to the dataframe 74 | def apply_report(row): 75 | instance_id = row['instance_id'] 76 | if instance_id in instance_id_to_status: 77 | return dict(instance_id_to_status[instance_id]) 78 | return row.get('report', {}) 79 | 80 | df['report'] = df.apply(apply_report, axis=1) 81 | 82 | 83 | if os.path.exists(args.input_file + '.bak'): 84 | conf = input('Existing backup file found. Do you want to overwrite it? (y/n)') 85 | if conf != 'y': 86 | exit() 87 | os.remove(args.input_file + '.bak') 88 | 89 | # backup the original file 90 | os.rename(args.input_file, args.input_file + '.bak') 91 | df.to_json(args.input_file, orient='records', lines=True) 92 | 93 | with open(output_md_filepath, 'w') as f: 94 | f.write(output_md) 95 | -------------------------------------------------------------------------------- /swebench/harness/prepare_images.py: -------------------------------------------------------------------------------- 1 | import docker 2 | import resource 3 | 4 | from argparse import ArgumentParser 5 | 6 | from swebench.harness.constants import KEY_INSTANCE_ID 7 | from swebench.harness.docker_build import build_instance_images 8 | from swebench.harness.docker_utils import list_images 9 | from swebench.harness.test_spec import make_test_spec 10 | from swebench.harness.utils import load_swebench_dataset, str2bool 11 | 12 | 13 | def filter_dataset_to_build( 14 | dataset: list, 15 | instance_ids: list, 16 | client: docker.DockerClient, 17 | force_rebuild: bool 18 | ): 19 | """ 20 | Filter the dataset to only include instances that need to be built. 21 | 22 | Args: 23 | dataset (list): List of instances (usually all of SWE-bench dev/test split) 24 | instance_ids (list): List of instance IDs to build. 25 | client (docker.DockerClient): Docker client. 26 | force_rebuild (bool): Whether to force rebuild all images. 27 | """ 28 | # Get existing images 29 | existing_images = list_images(client) 30 | data_to_build = [] 31 | 32 | # Check if all instance IDs are in the dataset 33 | not_in_dataset = set(instance_ids).difference(set([instance[KEY_INSTANCE_ID] for instance in dataset])) 34 | if not_in_dataset: 35 | raise ValueError(f"Instance IDs not found in dataset: {not_in_dataset}") 36 | 37 | for instance in dataset: 38 | if instance[KEY_INSTANCE_ID] not in instance_ids: 39 | # Skip instances not in the list 40 | continue 41 | 42 | # Check if the instance needs to be built (based on force_rebuild flag and existing images) 43 | spec = make_test_spec(instance) 44 | if force_rebuild: 45 | data_to_build.append(instance) 46 | elif spec.instance_image_key not in existing_images: 47 | data_to_build.append(instance) 48 | 49 | return data_to_build 50 | 51 | 52 | def main( 53 | dataset_name, 54 | split, 55 | instance_ids, 56 | max_workers, 57 | force_rebuild, 58 | open_file_limit, 59 | ): 60 | """ 61 | Build Docker images for the specified instances. 62 | 63 | Args: 64 | instance_ids (list): List of instance IDs to build. 65 | max_workers (int): Number of workers for parallel processing. 66 | force_rebuild (bool): Whether to force rebuild all images. 67 | open_file_limit (int): Open file limit. 
68 | """ 69 | # Set open file limit 70 | resource.setrlimit(resource.RLIMIT_NOFILE, (open_file_limit, open_file_limit)) 71 | client = docker.from_env() 72 | 73 | # Filter out instances that were not specified 74 | dataset = load_swebench_dataset(dataset_name, split) 75 | dataset = filter_dataset_to_build(dataset, instance_ids, client, force_rebuild) 76 | 77 | # Build images for remaining instances 78 | successful, failed = build_instance_images( 79 | client=client, 80 | dataset=dataset, 81 | force_rebuild=force_rebuild, 82 | max_workers=max_workers, 83 | ) 84 | print(f"Successfully built {len(successful)} images") 85 | print(f"Failed to build {len(failed)} images") 86 | 87 | 88 | if __name__ == "__main__": 89 | parser = ArgumentParser() 90 | parser.add_argument("--dataset_name", type=str, default="princeton-nlp/SWE-bench_Lite", help="Name of the dataset to use") 91 | parser.add_argument("--split", type=str, default="test", help="Split to use") 92 | parser.add_argument("--instance_ids", nargs="+", type=str, help="Instance IDs to run (space separated)") 93 | parser.add_argument("--max_workers", type=int, default=4, help="Max workers for parallel processing") 94 | parser.add_argument("--force_rebuild", type=str2bool, default=False, help="Force rebuild images") 95 | parser.add_argument("--open_file_limit", type=int, default=8192, help="Open file limit") 96 | args = parser.parse_args() 97 | main(**vars(args)) 98 | -------------------------------------------------------------------------------- /swebench/inference/README.md: -------------------------------------------------------------------------------- 1 | # SWE-bench Inference 2 | In this sub-package, we provide various tools to get started on SWE-bench inference. 3 | In particular, we provide the following important scripts and sub-packages: 4 | 5 | - `make_datasets`, this sub-package contains scripts to generate new datasets for SWE-bench inference with your own prompts and issues. 6 | - `run_api.py`, this script is used to generate API model generations for a given dataset. 7 | - `run_llama.py`, this script is used to run inference using Llama models, i.e. SWE-Llama. 8 | - `run_live.py`, this script is used to generate model generations for new issues on GitHub in real time. 9 | 10 | ## Installation 11 | To install the dependencies for this sub-package, you can run the following command: 12 | ```bash 13 | pip install -e .[inference] 14 | ``` 15 | 16 | ## `make_datasets` 17 | For more information on how to use this sub-package, please refer to the [README](./make_datasets/README.md) in the `make_datasets` sub-package. 18 | 19 | ## Run API inference on test datasets 20 | 21 | This python script is designed to run inference on a dataset using either the OpenAI or Anthropic API, depending on the model specified. It sorts instances by length and continually writes the outputs to a specified file, so that the script can be stopped and restarted without losing progress. 22 | 23 | For instance, to run this script on SWE-bench with the ``Oracle`` context and Anthropic's Claude 2 model, you can run the following command: 24 | ```bash 25 | export ANTHROPIC_API_KEY= 26 | python -m swebench.inference.run_api --dataset_name_or_path princeton-nlp/SWE-bench_oracle --model_name_or_path claude-2 --output_dir ./outputs 27 | ``` 28 | 29 | You can also specify further options: 30 | 31 | - `--split`: To specify the dataset split to use (default is "test"). 32 | - `--shard_id` and `--num_shards`: To process only a shard of the data. 
33 | - `--model_args`: A string containing comma-separated key=value pairs for arguments to pass to the model. (e.g. `--model_args="temperature=0.2,top_p=0.95"`) 34 | - `--max_cost`: The maximum cost to spend on inference total. 35 | 36 | 37 | ## Run inference using Llama models (i.e. SWE-Llama) 38 | 39 | You can run inference using [SWE-Llama](https://huggingface.co/princeton-nlp/SWE-Llama-13b) with the `run_llama.py` script. 40 | This script is similar to `run_api.py`, but it is designed to run inference using Llama models. 41 | 42 | For instance, to run this script on SWE-bench with the ``Oracle`` context and SWE-Llama, you can run the following command: 43 | ```bash 44 | python -m swebench.inference.run_llama \ 45 | --dataset_path princeton-nlp/SWE-bench_oracle \ 46 | --model_name_or_path princeton-nlp/SWE-Llama-13b \ 47 | --output_dir ./outputs \ 48 | --temperature 0 49 | ``` 50 | 51 | You can also specify further options: 52 | - `--split`: To specify the dataset split to use (default is "test"). 53 | - `--shard_id` and `--num_shards`: To process only a shard of the data. 54 | - `--temperature`: The temperature to use for sampling (default is 0). 55 | - `--top_p`: The top_p to use for sampling (default is 1). 56 | - `--peft_path`: The path or hf name for the PEFT adapter. 57 | 58 | 59 | ## Run live inference on open GitHub issues 60 | 61 | Follow instructions [here](https://github.com/castorini/pyserini/blob/master/docs/installation.md) to install [Pyserini](https://github.com/castorini/pyserini), to perform BM25 retrieval, and [here](https://github.com/facebookresearch/faiss/blob/main/INSTALL.md) to install [Faiss](https://github.com/facebookresearch/faiss). 62 | 63 | Then run `run_live.py` to try solving a new issue. For example, you can try solving [this issue](https://github.com/huggingface/transformers/issues/26706 ) by running the following command: 64 | 65 | ```bash 66 | export OPENAI_API_KEY= 67 | python -m swebench.inference.run_live --model_name gpt-3.5-turbo-1106 \ 68 | --issue_url https://github.com/huggingface/transformers/issues/26706 69 | ``` 70 | -------------------------------------------------------------------------------- /swebench/collect/get_top_pypi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os, json 4 | import argparse 5 | 6 | from bs4 import BeautifulSoup 7 | from ghapi.core import GhApi 8 | from selenium import webdriver 9 | from selenium.webdriver.common.by import By 10 | 11 | 12 | gh_token = os.environ.get("GITHUB_TOKEN") 13 | if not gh_token: 14 | msg = "Please set the GITHUB_TOKEN environment variable." 
15 | raise ValueError(msg) 16 | api = GhApi(token="gh_token") 17 | 18 | 19 | def get_package_stats(data_tasks, f): 20 | """ 21 | Get package stats from pypi page 22 | 23 | Args: 24 | data_tasks (list): List of packages + HTML 25 | f (str): File to write to 26 | """ 27 | # Adjust access type if file already exists 28 | content = None 29 | access_type = "w" 30 | if os.path.exists(f): 31 | with open(f) as fp_: 32 | content = fp_.read() 33 | access_type = "a" 34 | fp_.close() 35 | 36 | # Extra package title, pypi URL, stars, pulls, and github URL 37 | with open(f, access_type) as fp_: 38 | for idx, chunk in enumerate(data_tasks): 39 | # Get package name and pypi URL 40 | package_name = chunk["title"] 41 | package_url = chunk["href"] 42 | if content is not None and package_url in content: 43 | continue 44 | 45 | # Get github URL 46 | package_github = None 47 | driver.get(package_url) 48 | soup = BeautifulSoup(driver.page_source, "html.parser") 49 | for link in soup.find_all("a", class_="vertical-tabs__tab--with-icon"): 50 | found = False 51 | for x in ["Source", "Code", "Homepage"]: 52 | if ( 53 | x.lower() in link.get_text().lower() 54 | and "github" in link["href"].lower() 55 | ): 56 | package_github = link["href"] 57 | found = True 58 | break 59 | if found: 60 | break 61 | 62 | # Get stars and pulls from github API 63 | stars_count, pulls_count = None, None 64 | if package_github is not None: 65 | repo_parts = package_github.split("/")[-2:] 66 | owner, name = repo_parts[0], repo_parts[1] 67 | 68 | try: 69 | repo = api.repos.get(owner, name) 70 | stars_count = int(repo["stargazers_count"]) 71 | issues = api.issues.list_for_repo(owner, name) 72 | pulls_count = int(issues[0]["number"]) 73 | except: 74 | pass 75 | 76 | # Write to file 77 | print( 78 | json.dumps( 79 | { 80 | "rank": idx, 81 | "name": package_name, 82 | "url": package_url, 83 | "github": package_github, 84 | "stars": stars_count, 85 | "pulls": pulls_count, 86 | } 87 | ), 88 | file=fp_, 89 | flush=True, 90 | ) 91 | 92 | 93 | if __name__ == "__main__": 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument("--max-repos", help="Maximum number of repos to get", type=int, default=5000) 96 | args = parser.parse_args() 97 | 98 | # Start selenium driver to get top 5000 pypi page 99 | url_top_pypi = "https://hugovk.github.io/top-pypi-packages/" 100 | driver = webdriver.Chrome() 101 | driver.get(url_top_pypi) 102 | button = driver.find_element(By.CSS_SELECTOR, 'button[ng-click="show(8000)"]') 103 | button.click() 104 | 105 | # Retrieve HTML for packages from page 106 | soup = BeautifulSoup(driver.page_source, "html.parser") 107 | package_list = soup.find("div", {"class": "list"}) 108 | packages = package_list.find_all("a", class_="ng-scope") 109 | 110 | get_package_stats(packages[:args.max_repos], "pypi_rankings.jsonl") 111 | -------------------------------------------------------------------------------- /swebench/collect/README.md: -------------------------------------------------------------------------------- 1 | # Data Collection 2 | This folder includes the code for the first two parts of the benchmark construction procedure as described in the paper, specifically 1. Repo selection and data scraping, and 2. Attribute-based filtering. 3 | 4 | We include a comprehensive [tutorial](https://github.com/princeton-nlp/SWE-bench/tree/main/assets/collection.md) that describes the end-to-end procedure for collecting evaluation task instances from PyPI repositories. 
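For orientation, a typical end-to-end run of the collection pipeline described below might look like the following sketch. The flag names are assumptions inferred from `get_tasks_pipeline.py` and `run_get_tasks_pipeline.sh`; check those files for the authoritative interface.

```bash
# Hypothetical invocation sketch -- flag names are assumptions, not a verified interface;
# see get_tasks_pipeline.py / run_get_tasks_pipeline.sh for the actual arguments.
export GITHUB_TOKEN=<your GitHub token>
python -m swebench.collect.get_tasks_pipeline \
    --repos "owner/name" "owner2/name2" \
    --path_prs <folder to save PR .jsonl files> \
    --path_tasks <folder to save task instance .jsonl files>
```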
5 | 6 | > SWE-bench's collection pipeline is currently designed to target PyPI packages. We hope to expand SWE-bench to more repositories and languages in the future. 7 | 8 | 9 | 10 | ## Collection Procedure 11 | To run collection on your own repositories, run the `run_get_tasks_pipeline.sh` script. Given a repository or list of repositories (formatted as `owner/name`), for each repository this command will generate... 12 | * `-prs.jsonl` file containing the [metadata for every pull request](https://docs.github.com/rest/reference/pulls#list-pull-requests) from the repository. 13 | * `-task-instances.jsonl.all` file containing all *valid* task instances (has associated issues + gold patch). 14 |     * This file's values can be used for fine tuning purposes. 15 | * `-task-instances.jsonl` file containing *valid* task instances that also have associated *tests*. 16 |     * This file's values are candidate task instances. Once validated, they can be used for evaluation purposes. 17 |     * The `.jsonl.all` file includes these task instances as well. 18 | 19 | ## Directory Overview 20 | In this section, we briefly describe each of the files in this directory and its usage details. 21 | 22 | **🧐 GitHub Repository Selection** 23 | * `get_top_pypi.py` 24 |     * Purpose: Retrieves the PyPI URL, GitHub URL, # of ⭐, and # of Issues + PRs for the [top 5000](https://hugovk.github.io/top-pypi-packages/) most downloaded PyPI packages. 25 |     * Usage: `python get_top_pypi.py` 26 | 27 | **⛏️ GitHub Data Collection** 28 | * `print_pulls.py` 29 |     * Purpose: Given the `` of a GitHub repo, this script writes the raw information for all the repo's PRs to a single `.jsonl` file 30 |     * Usage: `python print_pulls.py --token ` 31 | * `build_dataset.py` 32 |     * Purpose: Given the path to a PRs `.jsonl` file generated by `print_pulls.py`, this script attempts to convert each PR to a task instance. It creates a `.jsonl.all` file for any PRs with an issue and a `.jsonl` file for any PRs with both an issue and modifications to that repository's tests. 33 |     * Usage: `python build_dataset.py --token ` 34 | * `get_tasks_pipeline.py` 35 |     * Purpose: Automates invocation of the repo → task instance construction pipeline (`print_pulls.py` + `build_dataset.py`) for multiple repositories 36 |     * Usage: `./run_get_tasks_pipeline.sh` (Check file for arguments) 37 | 38 | **🎵 Fine Tuning Dataset Construction** 39 | * `build_dataset_ft.py` 40 |     * Purpose: Given the path to a collection of `.jsonl.all` files generated by `build_dataset.py`, this is a simple script to combine all such files into a single `.jsonl` that can be used to construct an instruction tuning dataset based on [problem statement + original code, code Δ] pairs. 41 |     * Usage: `./run_build_dataset_ft.sh` (Check file for arguments) 42 | 43 | **🪞 Mirroring Repositories** 44 | * `make_repo.sh` 45 |     * Purpose: A script for creating a [mirror repository](https://docs.github.com/en/repositories/creating-and-managing-repositories/duplicating-a-repository) of an existing repository on GitHub. Examples available under the [swe-bench organization](https://github.com/orgs/swe-bench/repositories). 46 |     * Usage: `python call_make_repo.py` (Check file for arguments) 47 | 48 | **🧹 Clean Up** 49 | * `delete_gh_workflows.py` 50 |     * Purpose: Recurring workflows from mirror repositories can clog up your inbox for the email account associated with your GitHub token. Given a repo URL, this will automate removing the `.github/workflows` folder from all branches of a repository.
51 |     * Usage: `python delete_gh_workflows.py ` 52 | * `remove_envs.py` 53 |     * Purpose: SWE Bench's evaluation + validation harnesses rely on the creation of multiple virtual environments with conda to speed up benchmark evaluation. Use this script to parallelize conda environment removal for environments named with the same prefix. 54 |     * Usage: `python remove_envs.py  --conda_path ` 55 | -------------------------------------------------------------------------------- /docs/README_JP.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

6 | 7 | Kawi the SWE-Llama 8 | 9 |

10 | 11 |
12 | 13 | | [日本語](docs/README_JP.md) | [English](https://github.com/princeton-nlp/SWE-bench) | [中文简体](docs/README_CN.md) | [中文繁體](docs/README_TW.md) | 14 | 15 |
16 | 17 | 18 | --- 19 |

20 | ICLR 2024 の論文 SWE-bench: Can Language Models Resolve Real-World GitHub Issues? のコードとデータ 21 |
22 |
23 | 24 | Build 25 | 26 | 27 | License 28 | 29 |

30 | 31 | パブリックリーダーボードは[ウェブサイト](http://swe-bench.github.io)を、SWE-bench ベンチマークの最新アップデート情報は [change log](https://github.com/princeton-nlp/SWE-bench/blob/master/CHANGELOG.md) を参照してください。 32 | 33 | ## 👋 概要 34 | SWE-bench は、GitHub から収集された実世界のソフトウェアの課題に関する大規模言語モデルを評価するためのベンチマークです。 35 | *コードベース*と*イシュー*が与えられ、言語モデルは記述された問題を解決する*パッチ*を生成するタスクを行います。 36 | 37 | 38 | 39 | ## 🚀 セットアップ 40 | SWE-bench をソースからビルドするには、以下の手順に従ってください: 41 | 1. このリポジトリをローカルにクローンする 42 | 2. リポジトリに `cd` で移動する 43 | 3. `conda env create -f environment.yml` を実行して、`swe-bench` という名前の conda 環境を作成する 44 | 4. `conda activate swe-bench` で環境をアクティベートする 45 | 46 | ## 💽 使用法 47 | SWE-bench データセットは直接ダウンロードするか ([dev](https://drive.google.com/uc?export=download&id=1SbOxHiR0eXlq2azPSSOIDZz-Hva0ETpX), [test](https://drive.google.com/uc?export=download&id=164g55i3_B78F6EphCZGtgSrd2GneFyRM) セット)、[HuggingFace](https://huggingface.co/datasets/princeton-nlp/SWE-bench) からダウンロードできます。 48 | 49 | SWE-Bench を使用するには、以下のことができます: 50 | * 前処理済みのデータセットで独自のモデルを学習する 51 | * 既存のモデル (ディスクにあるLLaMAのようなモデルやGPT-4のようなAPIでアクセスできるモデル) で[推論](https://github.com/princeton-nlp/SWE-bench/blob/master/inference/)を実行する。推論ステップでは、レポとイシューを取得し、モデルにそれを修正するためのコードを生成させます。 52 | * SWE-bench に対してモデルを[評価](https://github.com/princeton-nlp/SWE-bench/blob/master/harness/)する。これは、SWE-Benchのタスクとモデルが提案したソリューションを受け取り、その正確性を評価するためのものです。 53 | * 独自のリポジトリに対してSWE-benchの[データ収集手順](https://github.com/princeton-nlp/SWE-bench/blob/master/collect/)を実行し、新しいSWE-Benchタスクを作成する。 54 | 55 | ## ⬇️ ダウンロード 56 | | データセット | モデル | 57 | | - | - | 58 | | [🤗 SWE-bench](https://huggingface.co/datasets/princeton-nlp/SWE-bench) | [🦙 SWE-Llama 13b](https://huggingface.co/princeton-nlp/SWE-Llama-13b) | 59 | | [🤗 "Oracle" Retrieval](https://huggingface.co/datasets/princeton-nlp/SWE-bench_oracle) | [🦙 SWE-Llama 13b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-13b-peft) | 60 | | [🤗 BM25 Retrieval 13K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_13K) | [🦙 SWE-Llama 7b](https://huggingface.co/princeton-nlp/SWE-Llama-7b) | 61 | | [🤗 BM25 Retrieval 27K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_27K) | [🦙 SWE-Llama 7b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-7b-peft) | 62 | | [🤗 BM25 Retrieval 40K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_40K) | | 63 | | [🤗 BM25 Retrieval 50K (Llamaトークン)](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_50k_llama) | | 64 | 65 | ## 🍎 チュートリアル 66 | SWE-benchの様々な部分の使い方についても、以下のブログ記事を書いています。 67 | 特定のトピックについての投稿を見たい場合は、issueでお知らせください。 68 | * [2023年11月1日] SWE-Benchの評価タスクの収集について ([🔗](https://github.com/princeton-nlp/SWE-bench/tree/main/assets/collection.md)) 69 | * [2023年11月6日] SWE-benchでの評価について ([🔗](https://github.com/princeton-nlp/SWE-bench/tree/main/assets/evaluation.md)) 70 | 71 | ## 💫 貢献 72 | NLP、機械学習、ソフトウェア工学の研究コミュニティからのフィードバックを歓迎します。貢献、プルリクエスト、issueを歓迎します! 73 | そのためには、新しいプルリクエストまたはissueを提出し、それぞれのテンプレートに従って記入してください。すぐにフォローアップします! 74 | 75 | 連絡先: [Carlos E. Jimenez](http://www.carlosejimenez.com/) と [John Yang](https://john-b-yang.github.io/) (Email: {carlosej, jy1682}@princeton.edu) 76 | 77 | ## ✍️ 引用 78 | 私たちの研究が役立つと思われる場合は、以下の引用をご利用ください。 79 | ``` 80 | @inproceedings{jimenez2024swebench, 81 | title={SWE-bench: Can Language Models Resolve Real-World GitHub Issues?}, 82 | author={Carlos E. 
Jimenez and John Yang and Alexander Wettig and Shunyu Yao and Kexin Pei and Ofir Press and Karthik Narasimhan}, 83 | booktitle={The Twelfth International Conference on Learning Representations}, 84 | year={2024}, 85 | url={https://openreview.net/forum?id=VTF8yNQM66} 86 | } 87 | ``` 88 | 89 | ## 🪪 ライセンス 90 | MIT。`LICENSE.md`を確認してください。 91 | -------------------------------------------------------------------------------- /docs/20240406_devin_validate/get_devin_preds.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f7e54da5-97a7-4447-ba2b-0ad24dd3de20", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import json\n", 11 | "\n", 12 | "from glob import glob\n", 13 | "from unidiff import PatchSet" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "6383506c-3405-4344-bfdd-6008c30a8e26", 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "Cloning into 'devin-swebench-results'...\n", 27 | "remote: Enumerating objects: 582, done.\u001b[K\n", 28 | "remote: Counting objects: 100% (582/582), done.\u001b[K\n", 29 | "remote: Compressing objects: 100% (570/570), done.\u001b[K\n", 30 | "remote: Total 582 (delta 12), reused 579 (delta 9), pack-reused 0\u001b[K\n", 31 | "Receiving objects: 100% (582/582), 571.31 KiB | 6.35 MiB/s, done.\n", 32 | "Resolving deltas: 100% (12/12), done.\n", 33 | "Updating files: 100% (580/580), done.\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "!git clone git@github.com:CognitionAI/devin-swebench-results.git" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 7, 44 | "id": "0afd1c6b-88e7-4e18-b065-f035f85c34b0", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "def convert_devin_txt_to_pred(pred_file):\n", 49 | " inst_id = pred_file.split(\"/\")[-1].split(\"-diff\")[0]\n", 50 | " pred = open(pred_file).read()\n", 51 | " try:\n", 52 | " PatchSet(pred)\n", 53 | " except:\n", 54 | " print(f\"{inst_id}: Prediction patch is malformed\")\n", 55 | " return {\n", 56 | " \"model_name_or_path\": \"devin-20240406\",\n", 57 | " \"instance_id\": inst_id,\n", 58 | " \"model_patch\": pred\n", 59 | " }" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 8, 65 | "id": "f81ead15-bc56-4cc7-ba0a-de2b68473c2c", 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "570" 72 | ] 73 | }, 74 | "execution_count": 8, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "predictions = []\n", 81 | "for pred_file in \\\n", 82 | " glob(\"devin-swebench-results/output_diffs/fail/*.txt\") + \\\n", 83 | " glob(\"devin-swebench-results/output_diffs/pass/*.txt\"):\n", 84 | " predictions.append(convert_devin_txt_to_pred(pred_file))\n", 85 | "len(predictions)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 9, 91 | "id": "cf22d4d5-5ba7-4b7a-a298-676f1955da0c", 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "{'model_name_or_path': 'devin-20240406',\n", 98 | " 'instance_id': 'django__django-16745',\n", 99 | " 'model_patch': 'diff --git a/django/core/validators.py b/django/core/validators.py\\nindex 6c622f5788..7a1aff3fe5 100644\\n--- a/django/core/validators.py\\n+++ b/django/core/validators.py\\n@@ -397,8 +397,9 @@ class StepValueValidator(BaseValidator):\\n message = _(\"Ensure this 
value is a multiple of step size %(limit_value)s.\")\\n code = \"step_size\"\\n \\n- def compare(self, a, b):\\n- return not math.isclose(math.remainder(a, b), 0, abs_tol=1e-9)\\n+ def compare(self, a, b, min_value=0):\\n+ offset = a - min_value\\n+ return not math.isclose(math.remainder(offset, b), 0, abs_tol=1e-9)\\n \\n \\n @deconstructible\\n'}" 100 | ] 101 | }, 102 | "execution_count": 9, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "predictions[0]" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 10, 114 | "id": "00c2e805-cf64-4975-bd23-0b5d2be8576d", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "with open(\"devin_predictions.jsonl\", \"w\") as f:\n", 119 | " for pred in predictions:\n", 120 | " print(json.dumps(pred), file=f, flush=True)" 121 | ] 122 | } 123 | ], 124 | "metadata": { 125 | "kernelspec": { 126 | "display_name": "Python 3 (ipykernel)", 127 | "language": "python", 128 | "name": "python3" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": { 132 | "name": "ipython", 133 | "version": 3 134 | }, 135 | "file_extension": ".py", 136 | "mimetype": "text/x-python", 137 | "name": "python", 138 | "nbconvert_exporter": "python", 139 | "pygments_lexer": "ipython3", 140 | "version": "3.11.7" 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 5 145 | } 146 | -------------------------------------------------------------------------------- /docs/README_CN.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Kawi the SWE-Llama 4 | 5 |

6 | 7 |
8 | 9 | | [日本語](docs/README_JP.md) | [English](https://github.com/princeton-nlp/SWE-bench) | [中文简体](docs/README_CN.md) | [中文繁體](docs/README_TW.md) | 10 | 11 |
12 | 13 | 14 | --- 15 |

16 | 您可以在我们的ICLR 2024的论文《SWE-bench: Can Language Models Resolve Real-World GitHub Issues?》中找到我们的代码和数据 17 |
18 |
19 | 20 | Build 21 | 22 | 23 | License 24 | 25 | 26 | 27 | 28 |

29 | 30 | 请访问我们的[网站](http://swe-bench.github.io)查看公共排行榜,并查看[更改日志](https://github.com/princeton-nlp/SWE-bench/blob/master/CHANGELOG.md)以获取有关 SWE-bench 基准最新更新的信息。 31 | 32 | ## 👋 总览 33 | SWE-bench 是一个用于评估大型语言模型的基准,这些模型是从 GitHub 收集的真实软件问题。 34 | 给定一个 *代码库* 和一个 *问题*,语言模型的任务是生成一个 *补丁* 来解决所描述的问题。 35 | 36 | 37 | 38 | ## 🚀 设置 39 | 要从源代码构建 SWE-bench,请按照以下步骤操作: 40 | 1. 克隆此仓库到本地 41 | 2. `cd` 进入仓库 42 | 3. 运行 `conda env create -f environment.yml` 创建名为 `swe-bench` 的 conda 环境 43 | 4. 使用 `conda activate swe-bench` 激活环境 44 | 45 | ## 💽 使用 46 | 你可以直接下载 SWE-bench 数据集 ([开发](https://drive.google.com/uc?export=download&id=1SbOxHiR0eXlq2azPSSOIDZz-Hva0ETpX), [测试](https://drive.google.com/uc?export=download&id=164g55i3_B78F6EphCZGtgSrd2GneFyRM) 集) 或从 [HuggingFace](https://huggingface.co/datasets/princeton-nlp/SWE-bench) 下载。 47 | 要使用 SWE-Bench,你可以: 48 | * 在我们预处理的数据集上训练自己的模型 49 | * 在现有模型上运行 [推理](https://github.com/princeton-nlp/SWE-bench/blob/master/inference/) (不管是本地的模型,比如LLaMA,还是通过API访问的模型,比如GPT-4)。推理步骤是你获取一个仓库和一个问题,让模型尝试去修复它。 50 | * 对模型进行 [评估](https://github.com/princeton-nlp/SWE-bench/blob/master/harness/)。这是你拿到一个 SWE-Bench 任务和一个模型提出的解决方案,然后评估其正确性。 51 | * 在你自己的仓库上运行 SWE-bench 的 [数据收集过程](https://github.com/princeton-nlp/SWE-bench/blob/master/collect/),以创建新的 SWE-Bench 任务。 52 | 53 | ## ⬇️ 下载 54 | | 数据集 | 模型 | 55 | |----------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------| 56 | | [🤗 SWE-bench](https://huggingface.co/datasets/princeton-nlp/SWE-bench) | [🦙 SWE-Llama 13b](https://huggingface.co/princeton-nlp/SWE-Llama-13b) | 57 | | [🤗 "Oracle" Retrieval](https://huggingface.co/datasets/princeton-nlp/SWE-bench_oracle) | [🦙 SWE-Llama 13b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-13b-peft) | 58 | | [🤗 BM25 Retrieval 13K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_13K) | [🦙 SWE-Llama 7b](https://huggingface.co/princeton-nlp/SWE-Llama-7b) | 59 | | [🤗 BM25 Retrieval 27K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_27K) | [🦙 SWE-Llama 7b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-7b-peft) | 60 | | [🤗 BM25 Retrieval 40K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_40K) | | 61 | | [🤗 BM25 Retrieval 50K (Llama tokens)](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_50k_llama) | | 62 | 63 | ## 🍎 教程 64 | 我们还写了关于如何使用SWE-bench不同部分的博客文章。 65 | 如果您想看到关于特定主题的文章,请通过问题告诉我们。 66 | * [Nov 1. 2023] Collecting Evaluation Tasks for SWE-Bench ([🔗](https://github.com/princeton-nlp/SWE-bench/tree/main/assets/collection.md)) 67 | * [Nov 6. 2023] Evaluating on SWE-bench ([🔗](https://github.com/princeton-nlp/SWE-bench/tree/main/assets/evaluation.md)) 68 | 69 | ## 💫 贡献 70 | 我们欢迎来自更广泛的自然语言处理、机器学习和软件工程研究社区的反馈。我们欢迎任何贡献、PR或问题! 71 | 为此,请提交新的PR或问题,并相应地填写相应的模板。我们将尽快跟进! 72 | 73 | 联系人: [Carlos E. Jimenez](http://www.carlosejimenez.com/) 和 [John Yang](https://john-b-yang.github.io/) (Email: {carlosej, jy1682}@princeton.edu). 74 | 75 | ## ✍️ 引用 76 | 如果你觉得我们的工作有帮助,请使用以下引用。 77 | ``` 78 | @inproceedings{ 79 | jimenez2024swebench, 80 | title={{SWE}-bench: Can Language Models Resolve Real-world Github Issues?}, 81 | author={Carlos E Jimenez and John Yang and Alexander Wettig and Shunyu Yao and Kexin Pei and Ofir Press and Karthik R Narasimhan}, 82 | booktitle={The Twelfth International Conference on Learning Representations}, 83 | year={2024}, 84 | url={https://openreview.net/forum?id=VTF8yNQM66} 85 | } 86 | ``` 87 | 88 | ## 🪪 许可证 89 | MIT. 
参考 `LICENSE.md`. 90 | -------------------------------------------------------------------------------- /docs/README_TW.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Kawi the SWE-Llama 4 | 5 |

6 | 7 |
8 | 9 | | [日本語](docs/README_JP.md) | [English](https://github.com/princeton-nlp/SWE-bench) | [中文简体](docs/README_CN.md) | [中文繁體](docs/README_TW.md) | 10 | 11 |
12 | 13 | 14 | --- 15 |

16 | 你可以在我們的ICLR 2024的論文《SWE-bench: Can Language Models Resolve Real-World GitHub Issues?》中找到我們的代碼和數據 17 |
18 |
19 | 20 | Build 21 | 22 | 23 | License 24 | 25 | 26 | 27 | 28 |

29 | 30 | 請訪問我們的[網站](http://swe-bench.github.io)查看公共排行榜,並查看[更改日誌](https://github.com/princeton-nlp/SWE-bench/blob/master/CHANGELOG.md)以獲取有關 SWE-bench 基準最新更新的信息。 31 | 32 | ## 👋 縱覽 33 | SWE-bench 是一個用於評估大型語言模型的基準,這些模型是從 GitHub 收集的真實軟體問題。 34 | 給定一個 *代碼庫* 和一個 *問題*,語言模型的任務是生成一個 *修補程式* 來解決所描述的問題。 35 | 36 | 37 | 38 | ## 🚀 設置 39 | 要從源代碼構建 SWE-bench,請按照以下步驟操作: 40 | 1. 克隆此倉庫到本地 41 | 2. `cd` 進入倉庫 42 | 3. 運行 `conda env create -f environment.yml` 創建名為 `swe-bench` 的 conda 環境 43 | 4. 使用 `conda activate swe-bench` 激活環境 44 | 45 | ## 💽 使用 46 | 你可以直接下載 SWE-bench 數據集 ([開發](https://drive.google.com/uc?export=download&id=1SbOxHiR0eXlq2azPSSOIDZz-Hva0ETpX), [測試](https://drive.google.com/uc?export=download&id=164g55i3_B78F6EphCZGtgSrd2GneFyRM) 集) 或從 [HuggingFace](https://huggingface.co/datasets/princeton-nlp/SWE-bench) 下載。 47 | 要使用 SWE-Bench,你可以: 48 | * 在我們預處理的數據集上訓練自己的模型 49 | * 在現有模型上運行 [推理](https://github.com/princeton-nlp/SWE-bench/blob/master/inference/)(不管是本地的模型,比如LLaMA,還是通過API訪問的模型,比如GPT-4)。推理步驟是你獲取一個倉庫和一個問題,讓模型嘗試去修復它。 50 | * 對模型進行 [評估](https://github.com/princeton-nlp/SWE-bench/blob/master/inference/)。這是你拿到一個 SWE-Bench 任務和一個模型提出的解決方案,然後評估其正確性。 51 | * 在你自己的倉庫上運行 SWE-bench 的 [數據收集過程](https://github.com/princeton-nlp/SWE-bench/blob/master/collect/),以創建新的 SWE-Bench 任務。 52 | 53 | ## ⬇️ 下載 54 | | 數據集 | 模型 | 55 | |---------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------| 56 | | [🤗 SWE-bench](https://huggingface.co/datasets/princeton-nlp/SWE-bench) | [🦙 SWE-Llama 13b](https://huggingface.co/princeton-nlp/SWE-Llama-13b) | 57 | | [🤗 "Oracle" Retrieval](https://huggingface.co/datasets/princeton-nlp/SWE-bench_oracle) | [🦙 SWE-Llama 13b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-13b-peft) | 58 | | [🤗 BM25 Retrieval 13K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_13K) | [🦙 SWE-Llama 7b](https://huggingface.co/princeton-nlp/SWE-Llama-7b) | 59 | | [🤗 BM25 Retrieval 27K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_27K) | [🦙 SWE-Llama 7b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-7b-peft) | 60 | | [🤗 BM25 Retrieval 40K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_40K) | | 61 | | [🤗 BM25 Retrieval 50K (Llama tokens)](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_50k_llama) | | 62 | 63 | ## 🍎 教程 64 | 我們還撰寫了以下有關如何使用SWE-bench不同部分的博客文章。 65 | 如果您想看到有關特定主題的文章,請通過問題告訴我們。 66 | * [Nov 1. 2023] Collecting Evaluation Tasks for SWE-Bench ([🔗](https://github.com/princeton-nlp/SWE-bench/tree/main/assets/collection.md)) 67 | * [Nov 6. 2023] Evaluating on SWE-bench ([🔗](https://github.com/princeton-nlp/SWE-bench/tree/main/assets/evaluation.md)) 68 | 69 | ## 💫 貢獻 70 | 我們很樂意聽取來自更廣泛的 NLP、機器學習和軟體工程研究社區的意見,並歡迎任何貢獻、拉取請求或問題! 71 | 為此請提交新的拉取請求或問題,並根據相應的模板填寫。我們將盡快跟進! 72 | 73 | 聯繫人: [Carlos E. Jimenez](http://www.carlosejimenez.com/) 和 [John Yang](https://john-b-yang.github.io/) (Email: {carlosej, jy1682}@princeton.edu). 
74 | 75 | ## ✍️ 引用 76 | 如果你覺得我們的工作有幫助,請使用以下引用。 77 | ``` 78 | @inproceedings{ 79 | jimenez2024swebench, 80 | title={{SWE}-bench: Can Language Models Resolve Real-world Github Issues?}, 81 | author={Carlos E Jimenez and John Yang and Alexander Wettig and Shunyu Yao and Kexin Pei and Ofir Press and Karthik R Narasimhan}, 82 | booktitle={The Twelfth International Conference on Learning Representations}, 83 | year={2024}, 84 | url={https://openreview.net/forum?id=VTF8yNQM66} 85 | } 86 | ``` 87 | 88 | ## 🪪 授權 89 | MIT. 參考 `LICENSE.md`. 90 | -------------------------------------------------------------------------------- /swebench/inference/make_datasets/README.md: -------------------------------------------------------------------------------- 1 | # `make_datasets` 2 | The `make_datasets` sub-package is used to create datasets for SWE-bench with your own prompts, contexts, and tokenizers. 3 | The sub-package contains the following scripts: 4 | 5 | - `create_text_dataset.py` is used to create a text dataset from SWE-bench with a given prompt and context-source. 6 | - `tokenize_dataset.py` is used to tokenize a text dataset with a given tokenizer. 7 | - `bm25_retrieval.py` can be used to perform BM25 retrieval on the SWE-bench dataset. 8 | 9 | ## `create_text_dataset.py` 10 | This script is used to create a text dataset from SWE-bench with a given prompt and context-source. 11 | Prompts are defined as functions in `create_instance.py`. `style-2` and `style-3` are appropriate for API models, while only `style-2` can be used for SWE-Llama. 12 | `full_file_gen` is used for the full file generation ablation, and `style-2-edits-only` is used for the `oracle-collapsed` ablation. 13 | 14 | Here's an example of how to call the script to create a dataset with `style-3` prompts and `oracle` contexts: 15 | 16 | ```bash 17 | export GITHUB_TOKEN= 18 | python -m swebench.inference.make_datasets.create_text_dataset \ 19 | --dataset_name_or_path princeton-nlp/SWE-bench \ 20 | --output_dir ./base_datasets --prompt_style style-3 \ 21 | --file_source oracle 22 | ``` 23 | 24 | You can also specify further options: 25 | 26 | - `--splits`: To specify the dataset splits to process (default is all splits). If you want to process only the `test` split, you can use `--splits test`. 27 | - `--validation_ratio`: To specify the ratio of the training set to use for validation (default is 0.01). For example, you can use `--validation_ratio 0.05` to use 5% of the training set for validation. 28 | - `--max_context_len`: To specify the maximum number of tokens to use for context. For example, `--max_context_len 15000` will limit the context to 15000 tokens. 29 | - `--tokenizer_name`: To specify the tokenizer to use. You can choose from the available tokenizers defined in `tokenize_dataset.py`. If not specified, the default tokenizer will be used. 30 | - `--push_to_hub_user`: If you want to push the dataset to the Hugging Face Hub, you can specify your username with this option. If specified, make sure you have set your API key environment variable `HUGGING_FACE_HUB_TOKEN`. You do not need to specify `--output_dir` if you use this option. 31 | - `--retrieval_file`: If you want to use BM25 retrieval to create the dataset, you can specify the file containing the retrieval results with this option. The retrieval results should be in the format produced by `bm25_retrieval.py`. You should specify `--file_source bm25` if you use this option. 32 | 33 | The script will create a new dataset in the specified output directory. 
If you choose to push the dataset to the Hugging Face Hub, it will be available under your username. 34 | 35 | ## `tokenize_dataset.py` 36 | This script is used to tokenize a text dataset with a given tokenizer. You can choose from the available tokenizers defined in the script. The script will create a new tokenized dataset in the specified output directory. 37 | 38 | Here's an example of how to call the script to tokenize a dataset with the `llama` tokenizer: 39 | 40 | ```bash 41 | python -m swebench.inference.make_datasets.tokenize_dataset \ 42 | --dataset_name_or_path ./base_datasets/DATASET_NAME \ 43 | --output_dir ./tokenized_datasets \ 44 | --tokenizer_name llama \ 45 | --num_proc 20 46 | ``` 47 | 48 | - `--push_to_hub_user`: If you want to push the dataset to the Hugging Face Hub, you can specify your username with this option. If specified, make sure you have set your API key environment variable `HUGGING_FACE_HUB_TOKEN`. You do not need to specify `--output_dir` if you use this option. 49 | 50 | __NOTE:__ The `cl100k` tokenizer does not support multiprocessing. 51 | 52 | ## `bm25_retrieval.py` 53 | This script can be used to perform BM25 retrieval on the SWE-bench dataset. It creates a results file in the specified output directory that can be used in `create_text_dataset.py` with the `--retrieval_file` option and `--file_source bm25`. 54 | 55 | Here's an example of how to call the script to perform BM25 retrieval on the `test` split of the SWE-bench dataset: 56 | 57 | ```bash 58 | python -m swebench.inference.make_datasets.bm25_retrieval \ 59 | --dataset_name_or_path princeton-nlp/SWE-bench \ 60 | --output_dir ./retrieval_results \ 61 | --splits test 62 | ``` 63 | 64 | __NOTE:__ The script requires the `pyserini` package to be installed. See the pyserini [installation instructions](https://github.com/castorini/pyserini) for more details. 65 | 66 | 67 | ## `eval_retrieval.py` 68 | This script can be used to evaluate the BM25 retrieval results for a dataset created with `create_text_dataset.py` with the `--retrieval_file` option and `--file_source bm25`. 69 | __NOTE__: The script assumes that the `text` field in the dataset specifies files using the "\[start of filename\]" and "\[end of filename\]" tags used by the default DOCUMENT_ENCODING_FUNCTIONS in `bm25_retrieval.py`. If you change that format, you need to modify the `instance_file_pattern` in `eval_retrieval.py` accordingly. 70 | 71 | Here's an example of how to call the script to evaluate the BM25 retrieval results for a dataset: 72 | 73 | ```bash 74 | python -m swebench.inference.make_datasets.eval_retrieval \ 75 | --dataset_name_or_path princeton-nlp/SWE-bench_bm25_13K \ 76 | --split test 77 | ``` 78 | -------------------------------------------------------------------------------- /swebench/collect/make_lite/criteria.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | 4 | from unidiff import PatchSet 5 | 6 | 7 | def contains_git_commit_hash(text: str) -> bool: 8 | """ 9 | Returns True if the text contains a git commit hash (40 character SHA-1 hash). 10 | * Excludes commit hashes that are part of a URL. 11 | """ 12 | pattern_git_commit_hash = re.compile(r'(? bool: 22 | """ 23 | Returns True if the text contains a URL. Excludes URLs that are part of the repository. 
24 | """ 25 | if repo: 26 | repo_prefix = f"http://github.com/{repo}" 27 | pattern_repo = re.escape(repo_prefix) 28 | # Adding a negative lookahead assertion to ensure URLs starting with the repository prefix are excluded 29 | pattern_urls = r'(?:https?://(?!{}).+)|(?:www\.(?!{}).+)'.format(pattern_repo, pattern_repo) 30 | else: 31 | pattern_urls = r'https?://(?:www\.)?\S+' 32 | 33 | return bool(re.search(pattern_urls, text)) 34 | 35 | 36 | def contains_image(text: str) -> bool: 37 | """ 38 | Returns True if the text contains an image or video file extension. 39 | """ 40 | image_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.svg', '.webp', '.ico', '.heif', '.bpg', '.avif'] 41 | video_extensions = ['.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.mpeg'] 42 | 43 | pattern_image = '|'.join(re.escape(ext) for ext in image_extensions) 44 | pattern_video = '|'.join(re.escape(ext) for ext in video_extensions) 45 | 46 | image_regex = re.compile(r'\b({})\b'.format(pattern_image), flags=re.IGNORECASE) 47 | video_regex = re.compile(r'\b({})\b'.format(pattern_video), flags=re.IGNORECASE) 48 | 49 | return image_regex.search(text) is not None or video_regex.search(text) is not None 50 | 51 | 52 | def contains_issue_reference(text: str, repo: str) -> bool: 53 | """ 54 | Returns True if text (problem statement) contains a reference to another issue (e.g. #1234). 55 | """ 56 | # Look for GitHub style issue references 57 | pattern_issue_ref = re.compile(r"(\w+)\s+\#(\d+)") 58 | keywords = { 59 | "close", "closes", "closed", 60 | "fix", "fixes", "fixed", 61 | "resolve", "resolves", "resolved", 62 | } 63 | references = dict(pattern_issue_ref.findall(text)) 64 | if references: 65 | for word, _ in references.items(): 66 | if word.lower() in keywords: 67 | return True 68 | 69 | # Look for GitLab style issue references 70 | pattern_gitlab = re.compile(r"https?:\/\/gitlab.com\/(.*)\/issues") 71 | if re.search(pattern_gitlab, text): 72 | return True 73 | 74 | # Look for GitHub `#` style references + verify if the issue exists 75 | pattern_issue_ref = re.compile(r'#\d+') 76 | matches = pattern_issue_ref.findall(text) 77 | for match in matches: 78 | url = f"http://github.com/{repo}/issues/{match[1:]}" 79 | if repo == "django/django": 80 | url = f"https://code.djangoproject.com/ticket/{match[1:]}" 81 | if requests.get(url).status_code == 200: 82 | return True 83 | 84 | return False 85 | 86 | 87 | def contains_non_modified_files(patch_text: str) -> bool: 88 | """ 89 | Returns True if the patch contains files that are not modified. 90 | """ 91 | patch = PatchSet(patch_text) 92 | return len(patch.removed_files) > 0 or len(patch.added_files) > 0 93 | 94 | 95 | def contains_pytest_match_arg(patch_test_text: str) -> bool: 96 | """ 97 | Returns True if the test patch contains a pytest.raises() call with a match argument. 98 | """ 99 | if any([x in patch_test_text for x in [ 100 | 'pytest.raises', 101 | 'pytest.warns', 102 | 'pytest.deprecated_call', 103 | ]]): 104 | return 'match' in patch_test_text 105 | # Django style assertions: 106 | if any([x in patch_test_text for x in [ 107 | 'assertOutput', 108 | 'assertRaises', 109 | 'checks.Error', 110 | ]]): 111 | return True 112 | return False 113 | 114 | 115 | def leq_n_code_lines(patch_text: str, n: int = 25) -> bool: 116 | """ 117 | Returns True if the patch has at most n lines of code changed. 
118 | """ 119 | lines = 0 120 | patch = PatchSet(patch_text) 121 | for file in patch: 122 | for hunk in file: 123 | lines += hunk.added 124 | lines += hunk.removed 125 | return lines <= n 126 | 127 | 128 | def leq_n_files(patch_text: str, n: int = 1) -> bool: 129 | """ 130 | Returns True if the patch has at most n files. 131 | """ 132 | patch = PatchSet(patch_text) 133 | return len(patch.modified_files) <= n 134 | 135 | 136 | def leq_n_hunks(patch_text: str, n: int = 3) -> bool: 137 | """ 138 | Returns True if the patch has at most n hunks. 139 | """ 140 | patch = PatchSet(patch_text) 141 | num_hunks = sum([ 142 | len([h for h in f]) 143 | for f in patch.modified_files 144 | ]) 145 | return num_hunks <= n and num_hunks > 0 146 | 147 | 148 | def leq_n_words(text: str, n: int = 50) -> bool: 149 | """ 150 | Returns True if the text has at most n words. 151 | """ 152 | return len(text.split()) <= n 153 | -------------------------------------------------------------------------------- /assets/collection.md: -------------------------------------------------------------------------------- 1 | # Collecting Evaluation Tasks for SWE-Bench 2 | John Yang • November 1, 2023 3 | 4 | In this tutorial, we explain how to use the SWE-Bench repository to collect evaluation task instances from GitHub repositories. 5 | 6 | > SWE-bench's collection pipeline is currently designed to target PyPI packages. We hope to expand SWE-bench to more repositories and languages in the future. 7 | 8 |
9 | 10 |
11 | 12 | ## 🔍 Selecting a Repository 13 | 14 | SWE-bench constructs task instances from issues and pull requests. 15 | A good repository to source evaluation instances from should have many issues and pull requests. 16 | A point of reference for repositories that fit this bill would be the [Top PyPI packages](https://hugovk.github.io/top-pypi-packages/) website. 17 | 18 | Once you've selected a repository, use the `/collect/make_repo/make_repo.sh` script to create a mirror of the repository, like so: 19 | ```bash 20 | ./collect/make_repo/make_repo.sh scikit-learn/scikit-learn 21 | ``` 22 | 23 | ## ⛏️ Collecting Candidate Tasks 24 | 25 | Once you have cloned the repository, you can then use the `collect/get_tasks_pipeline.py` script to collect pull requests and convert them to candidate task instances. 26 | Supply the *repository name(s)* and *logging folders* as arguments to the `run_get_tasks_pipeline.sh` script, then run it like so: 27 | ```bash 28 | ./collect/run_get_tasks_pipeline.sh 29 | ``` 30 | 31 | At this point, for a repository, you should have... 32 | * A mirror clone of the repository under the [SWE-bench organization](https://github.com/orgs/swe-bench/repositories). 33 | * A `<repo>-prs.jsonl` file containing all the repository's PRs. 34 | * A `<repo>-task-instances.jsonl` file containing all the candidate task instances. 35 | 36 | ## 📙 Specify Execution Parameters 37 | 38 | This step is the most manual part of the process. 39 | To create an appropriate execution environment for task instances from a new repository, you must complete the following steps: 40 | * Assign a repository-specific *version* (e.g. `1.2`) to every task instance. 41 | * Specify repository+version-specific installation commands in `harness/constants.py`. 42 | 43 | ### Part A: Versioning 44 | Determining a version for each task instance can be accomplished in a number of ways, depending on the availability + feasibility of each approach for a given repository. 45 | * Scrape from code: A version is explicitly specified in the codebase (in `__init__.py` or `_version.py` for PyPI packages). 46 | * Scrape from web: Repositories with websites (e.g. [xarray.dev](https://xarray.dev/)) have a "Releases" or "What's New" page (e.g. the [release page](https://docs.xarray.dev/en/stable/whats-new.html) for xarray). This can be scraped for information. 47 | * Build from code: Sometimes, version-related files (e.g. `_version.py`) are purposely omitted by a developer (check `.gitignore` to verify). In this case, you can build the repository source code locally for each task instance and extract the version number from the built codebase. 48 | 49 | Examples and technical details for each are included in `/versioning/`. Please refer to them as needed. 50 | 51 | ### Part B: Installation Configurations 52 | Per repository, you must provide installation instructions per version. In `constants.py`... 53 | 1. In `MAP_VERSION_TO_INSTALL`, declare a `<repo>: MAP_VERSION_TO_INSTALL_<repo>` key/value pair. 54 | 2. Define a `MAP_VERSION_TO_INSTALL_<repo>`, where the key is a version as a string, and the value is a dictionary of installation fields that include the following information: 55 | ```python 56 | { 57 | "python": "3.x", # Required 58 | "packages": "numpy pandas tensorflow", 59 | "install": "pip install -e .", # Required 60 | "pip_packages": ["pytest"], 61 | } 62 | ``` 63 | These instructions can typically be inferred from the companion website or `CONTRIBUTING.md` doc that many open source repositories have.
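To make the Part B pattern above concrete, here is a minimal sketch of what such an entry in `harness/constants.py` might look like. The repository name (`owner/mypkg`), version strings, and package lists are illustrative placeholders, not real entries:

```python
# Sketch only: the repo name, versions, and packages below are placeholders.
MAP_VERSION_TO_INSTALL_MYPKG = {
    "1.1": {
        "python": "3.9",                # Required: Python version for the environment
        "packages": "numpy pandas",     # Optional: packages installed alongside the repo
        "install": "pip install -e .",  # Required: command that installs the repo itself
        "pip_packages": ["pytest"],     # Optional: extra pip packages (e.g. test deps)
    },
    "1.2": {
        "python": "3.10",
        "install": "pip install -e .",
    },
}

MAP_VERSION_TO_INSTALL = {
    # ... entries for other repositories ...
    "owner/mypkg": MAP_VERSION_TO_INSTALL_MYPKG,
}
```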
64 | 65 | ## ⚙️ Execution-based Validation 66 | Congrats, you got through the trickiest part! It's smooth sailing from here on out. 67 | 68 | We now need to check that the task instances install properly + that the problem solved by each task instance is non-trivial. 69 | This is taken care of by the `engine_validation.py` code. 70 | Run `./harness/run_validation.sh` and supply the following arguments: 71 | * `instances_path`: Path to versioned candidate task instances 72 | * `log_dir`: Path to folder to store task instance-specific execution logs 73 | * `temp_dir`: Path to directory to perform execution 74 | * `verbose`: Whether to print logging traces to standard output. 75 | 76 | > In practice, you may have to iterate between this step and **Installation Configurations** a couple of times. If your instructions are incorrect or under-specified, candidate task instances may not be installed properly. 77 | 78 | ## 🔄 Convert to Task Instances 79 | At this point, we have all the information necessary to determine whether task instances can be used for evaluation with SWE-bench, and to save the ones that can. 80 | 81 | We provide the `validation.ipynb` Jupyter notebook in this folder to make the remaining steps easier. 82 | At a high level, it enables the following: 83 | * In **Monitor Validation**, check the results of the `./run_validation.sh` step. 84 | * In **Get [FP]2[FP] Tests**, determine which task instances are non-trivial (solve at least one test). 85 | * In **Create Task Instances `.json` file**, perform some final preprocessing and save your task instances to a `.json` file. 86 | 87 | Thanks for reading! If you have any questions or comments about the details in this article, please feel free to follow up with an issue. 88 | -------------------------------------------------------------------------------- /docs/20240627_docker/README.md: -------------------------------------------------------------------------------- 1 | # Containerized Evaluation Harness 2 | June 27, 2024 3 | 4 | We’re releasing an update that improves the reliability of the SWE-bench evaluation harness using **containerized environments** based on Docker. 5 | 6 | In the original setup, we hypothesized that `conda` environments would be enough to enforce reproducible evaluation. 7 | In hindsight, this was underspecified. 8 | This past April, we put out [Bug Report 4/5/2024](docs/reports/20240405_eval_bug/README.md), which, among several upgrades, added explicit versioning for packages. 9 | 10 | However, SWE-bench evaluation remained sensitive to discrepancies originating from different platforms and user-specific configurations, leading to inconsistent results. 11 | To eliminate these irregularities, our new harness provisions **per-sample Docker images with Python virtual environments** that have been rigorously tested. 12 | 13 | In the new Docker harness, **99.78% (2289/2294) of SWE-bench** tasks and **100% (300/300) of SWE-bench Lite** tasks consistently resolve correctly with the ground truth solution. Furthermore, containers spawned from these images can be used as development environments for agents that run and develop solutions iteratively. 14 | 15 | ## Running Evaluation 16 | The main entrypoint for the evaluation harness is the `swebench.harness.run_evaluation` module. 17 | 18 | Run the following command to see the available arguments: 19 | ```bash 20 | python -m swebench.harness.run_evaluation -h 21 | ``` 22 | 23 | This module runs docker containers for each evaluation instance in parallel.
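As a point of reference, a full invocation might look like the following sketch; the dataset name, split, predictions path, worker count, and run ID here are placeholders to adapt to your own setup:

```bash
python -m swebench.harness.run_evaluation \
    --dataset_name princeton-nlp/SWE-bench_Lite \
    --split test \
    --predictions_path ./my_predictions.jsonl \
    --max_workers 8 \
    --cache_level env \
    --run_id my_eval_run
```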
24 | In the process of running the evaluation, the harness will: 25 | 1. Build a base image that installs basic dependencies for all instances 26 | 2. Build "environment" images that initialize the Python environment for various configurations that are common to multiple instances (in total there are about 60 of these - or 100GB of images) 27 | 3. Build "instance" images that install the specific dependencies and source code for each instance 28 | 29 | The harness will then run the evaluation script in each instance container and collect the results. 30 | After the evaluation is complete, the harness will clean up the containers and images depending on the `--cache_level` argument. 31 | 32 | ## Choosing the right `cache_level` 33 | Since the harness builds images for each instance, it can be time-consuming to rebuild these images every time you run an evaluation. 34 | We provide a `cache_level` argument to control how the harness caches images between runs. 35 | By default, the harness `cache_level` is set to `env`, which means that the harness will store the base and environment images, but not the instance images. 36 | In this setting, the base and environment images will be reused across runs, but take up about 100GB of disk space. 37 | At the time of release, we require about 120GB of free disk space to run the harness with any `cache_level`. 38 | For most users, this is the recommended setting, as it provides a good balance between evaluation speed and disk space usage. 39 | 40 | For users who want the fastest possible evaluation times, we recommend setting `cache_level` to `instance`. 41 | In this setting, the harness will store images for all instances, making evaluation extremely fast. 42 | However, all base, environment, and instance images will be stored, taking up about 2,000GB of disk space. 43 | While this setting is the fastest, it is also extremely disk-space intensive. 44 | 45 | For users who want to minimize disk space usage, we recommend setting `cache_level` to `base` or `none`, which will remove all the instance and environment images after each run. 46 | Note that, at this time, this setting still requires about 100GB of disk space to store the base and environment images when first building them. 47 | 48 | ## Choosing the right `max_workers` 49 | The harness runs instances in parallel using the `max_workers` argument. 50 | Since the harness uses the docker daemon to run instances, the number of workers should be chosen based on the resources available on your machine. 51 | In general, we don't recommend using a very large number of workers, as this can slow down the evaluation process. 52 | Regardless of your CPU count, we recommend using fewer than 28 workers. 53 | 54 | On a 16-core machine with `max_workers=12`, it should be possible to run evaluation on SWE-bench Lite in about 30 minutes when using the `env` cache level and under 15 minutes when using the `instance` cache level. 55 | 56 | On an 8-core machine with `max_workers=6`, it should be possible to run evaluation on SWE-bench Lite in about 50 minutes when using the `env` cache level and about 15 minutes when using the `instance` cache level. 57 | 58 | Using a much larger number of workers will likely slow down the evaluation process. 59 | 60 | ## Future Steps 61 | We'd like to soon make the harness even more user-friendly by providing pre-built docker images that include verified starting points for each instance.
62 | 63 | We're also hoping to better enable evaluation via orchestration tools like Kubernetes, which would allow users to run evaluations on larger clusters of machines. 64 | 65 | We're providing experimental support for running evaluations on `arm64` machines, but this is still in the early stages of development. 66 | Users may experience substantial speed degradation when running evaluations on `arm64` machines. 67 | 68 | ## Deliverables 69 | * Please use `swebench>=2.0` for the latest version of the benchmark - the old version is now deprecated but can still be accessed using `swebench<2.0`. 70 | 71 | ## Acknowledgements 72 | This work was done in collaboration with the Preparedness team at OpenAI (including Oliver Jaffe, Chan Jun Shern, James Aung, Giulio Starace, Dane Sherburn, and Neil Chowdhury). 73 | 74 | We'd also like to thank Cognition Labs for providing [inspiration](https://github.com/CognitionAI/devin-swebench-results/tree/main) in the design of the evaluation harness. 75 | 76 | ✍️ Carlos & John -------------------------------------------------------------------------------- /scripts/eval/eval_infer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROCESS_FILEPATH=$1 4 | if [ -z "$PROCESS_FILEPATH" ]; then 5 | echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh [instance_id] [dataset_name] [split]" 6 | exit 1 7 | fi 8 | 9 | if [ ! -f $PROCESS_FILEPATH ]; then 10 | echo "Error: $PROCESS_FILEPATH is not a file" 11 | exit 1 12 | fi 13 | 14 | # If instance_id is empty, it means we want to eval on the whole $PROCESS_FILEPATH 15 | # otherwise, we want to eval on the instance_id 16 | INSTANCE_ID=$2 17 | DATASET_NAME=${3:-"swe-train/swe-train-v0"} 18 | SPLIT=${4:-"train"} 19 | 20 | echo "INSTANCE_ID: $INSTANCE_ID" 21 | echo "DATASET_NAME: $DATASET_NAME" 22 | echo "SPLIT: $SPLIT" 23 | 24 | PROCESS_FILEPATH=$(realpath $PROCESS_FILEPATH) 25 | FILE_DIR=$(dirname $PROCESS_FILEPATH) 26 | FILE_NAME=$(basename $PROCESS_FILEPATH) 27 | 28 | echo "Evaluating $FILE_NAME @ $FILE_DIR" 29 | 30 | # ================================================ 31 | # detect whether PROCESS_FILEPATH is in OD format or in SWE-bench format 32 | echo "==============================================================" 33 | echo "Detecting whether PROCESS_FILEPATH is in OD format or in SWE-bench format" 34 | echo "==============================================================" 35 | # SWE-bench format is a JSONL where every line has three fields: model_name_or_path, instance_id, and model_patch 36 | function is_swebench_format() { 37 | # Read the first line of the file 38 | read -r first_line < "$PROCESS_FILEPATH" 39 | 40 | # Use jq to check if the first line has the required fields 41 | echo "$first_line" | jq -e '. | has("model_name_or_path") and has("instance_id") and has("model_patch")' > /dev/null 42 | 43 | if [ $? -ne 0 ]; then 44 | return 1 # Return 1 if the first line does not have the required fields 45 | fi 46 | 47 | return 0 # Return 0 if the first line has the required fields 48 | } 49 | # Call the function with the file path 50 | is_swebench_format "$PROCESS_FILEPATH" 51 | IS_SWEBENCH_FORMAT=$? 52 | # Use the result in an if-else statement 53 | if [ $IS_SWEBENCH_FORMAT -eq 0 ]; then 54 | echo "The file IS in SWE-bench format." 55 | SWEBENCH_FORMAT_JSONL=$PROCESS_FILEPATH 56 | else 57 | echo "The file IS NOT in SWE-bench format." 
58 | 59 | # ==== Convert OD format to SWE-bench format ==== 60 | echo "Merged output file with fine-grained report will be saved to $FILE_DIR" 61 | python3 scripts/eval/convert_od_output_to_swe_json.py $PROCESS_FILEPATH 62 | # replace .jsonl with .swebench.jsonl in filename 63 | SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl} 64 | echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL" 65 | # assert that the file exists 66 | if [ ! -f $SWEBENCH_FORMAT_JSONL ]; then 67 | echo "Error: $SWEBENCH_FORMAT_JSONL does not exist. There is probably an error in the conversion process." 68 | exit 1 69 | fi 70 | SWEBENCH_FORMAT_JSONL=$(realpath $SWEBENCH_FORMAT_JSONL) 71 | fi 72 | # ================================================ 73 | 74 | echo "==============================================================" 75 | echo "Running SWE-bench evaluation" 76 | echo "==============================================================" 77 | 78 | RUN_ID=$(date +"%Y%m%d_%H%M%S") 79 | N_PROCESS=16 80 | 81 | if [ -z "$INSTANCE_ID" ]; then 82 | echo "Running SWE-bench evaluation on the whole input file..." 83 | # Default to SWE-Bench-lite 84 | # change `--dataset_name` and `--split` to alter dataset 85 | 86 | python -m swebench.harness.run_evaluation \ 87 | --dataset_name "$DATASET_NAME" \ 88 | --split "$SPLIT" \ 89 | --predictions_path $SWEBENCH_FORMAT_JSONL \ 90 | --timeout 1800 \ 91 | --cache_level instance \ 92 | --max_workers $N_PROCESS \ 93 | --run_id $RUN_ID 94 | 95 | # get the "model_name_or_path" from the first line of the SWEBENCH_FORMAT_JSONL 96 | MODEL_NAME_OR_PATH=$(jq -r '.model_name_or_path' $SWEBENCH_FORMAT_JSONL | head -n 1) 97 | echo "MODEL_NAME_OR_PATH: $MODEL_NAME_OR_PATH" 98 | 99 | RESULT_OUTPUT_DIR=$(dirname $SWEBENCH_FORMAT_JSONL) 100 | echo "RESULT_OUTPUT_DIR: $RESULT_OUTPUT_DIR" 101 | 102 | # move the eval results to the target directory 103 | mkdir -p $RESULT_OUTPUT_DIR 104 | # rm eval_outputs directory if it exists 105 | if [ -d $RESULT_OUTPUT_DIR/eval_outputs ]; then 106 | rm -rf $RESULT_OUTPUT_DIR/eval_outputs 107 | fi 108 | 109 | mv run_instance_logs/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR 110 | mv $RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR/eval_outputs 111 | echo "RUN_ID: $RUN_ID" > $RESULT_OUTPUT_DIR/run_id.txt 112 | 113 | # move report file 114 | REPORT_PATH=$MODEL_NAME_OR_PATH.$RUN_ID.json 115 | if [ -f $REPORT_PATH ]; then 116 | # check if $RESULT_OUTPUT_DIR/report.json exists 117 | if [ -f $RESULT_OUTPUT_DIR/report.json ]; then 118 | echo "Report file $RESULT_OUTPUT_DIR/report.json already exists. Overwriting..." 
119 | if [ -f $RESULT_OUTPUT_DIR/report.json.bak ]; then 120 | rm $RESULT_OUTPUT_DIR/report.json.bak 121 | fi 122 | mv $RESULT_OUTPUT_DIR/report.json $RESULT_OUTPUT_DIR/report.json.bak 123 | fi 124 | 125 | mv $REPORT_PATH $RESULT_OUTPUT_DIR/report.json 126 | fi 127 | 128 | python scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH 129 | 130 | else 131 | echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID" 132 | python -m swebench.harness.run_evaluation \ 133 | --dataset_name "$DATASET_NAME" \ 134 | --split "$SPLIT" \ 135 | --predictions_path $SWEBENCH_FORMAT_JSONL \ 136 | --timeout 1800 \ 137 | --instance_ids $INSTANCE_ID \ 138 | --cache_level instance \ 139 | --max_workers $N_PROCESS \ 140 | --run_id $RUN_ID 141 | fi 142 | -------------------------------------------------------------------------------- /swebench/collect/check_validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "from collections import defaultdict\n", 11 | "import os\n", 12 | "import pandas as pd\n", 13 | "from tqdm import tqdm" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "swebench_data_path = \"\"\n", 23 | "log_root = \"\"\n", 24 | "validation_logs = defaultdict(lambda: defaultdict(set))\n", 25 | "swebench_data = pd.read_json(swebench_data_path, lines=True, orient=\"records\")\n", 26 | "swebench_data_dict = {d[\"instance_id\"]: d for d in swebench_data.to_dict(orient=\"records\")}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# parse test results\n", 36 | "for name in [\"gold\", \"empty\"]:\n", 37 | " log_path = os.path.join(log_root, name)\n", 38 | " log_dirs = os.listdir(log_path)\n", 39 | " \n", 40 | " total_logs = len(log_dirs)\n", 41 | " missing_logs = 0\n", 42 | " print(f\"Processing [{name}] logs\")\n", 43 | " pbar = tqdm(total=total_logs)\n", 44 | " \n", 45 | " for log_dir in log_dirs:\n", 46 | " log_file = os.path.join(log_path, log_dir, \"report.json\")\n", 47 | " pbar.update(1)\n", 48 | " if not os.path.exists(log_file):\n", 49 | " missing_logs += 1\n", 50 | " pbar.set_postfix({\"missing\": missing_logs, \"total\": total_logs})\n", 51 | " continue\n", 52 | " with open(log_file, \"r\") as f:\n", 53 | " log = json.load(f)\n", 54 | " instance_id = list(log.keys())[0]\n", 55 | " if \"tests_status\" in log[instance_id]:\n", 56 | " validation_logs[instance_id][f\"{name}-pass\"] = set(log[instance_id][\"tests_status\"][\"PASS\"])\n", 57 | " validation_logs[instance_id][f\"{name}-fail\"] = set(log[instance_id][\"tests_status\"][\"FAIL\"])" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# From Swe-Bench: we need \"at least one test where it changes from fail to pass\n", 67 | "validated_instances = []\n", 68 | "n_total = 0\n", 69 | "n_validated = 0\n", 70 | "pbar = tqdm(total=len(validation_logs))\n", 71 | "for k, log in validation_logs.items():\n", 72 | " fail_to_pass = log[\"gold-pass\"] & log[\"empty-fail\"]\n", 73 | " pass_to_pass = log[\"gold-pass\"] & log[\"empty-pass\"] \n", 74 | " n_total += 1\n", 75 | " if len(fail_to_pass) > 0:\n", 76 | " n_validated += 1\n", 77 | " # print(f\"{k} has test that changes from 
fail to pass\")\n", 78 | " validated_instances.append(swebench_data_dict[k])\n", 79 | " validated_instances[-1]['FAIL_TO_PASS'] = list(fail_to_pass)\n", 80 | " validated_instances[-1]['PASS_TO_PASS'] = list(pass_to_pass)\n", 81 | " pbar.update(1)\n", 82 | " pbar.set_postfix({\"validated\": n_validated, \"total\": n_total})" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "from datasets import Dataset, Value, Sequence, Features\n", 92 | "KEYS = [\n", 93 | " 'repo',\n", 94 | " 'pull_number',\n", 95 | " 'instance_id',\n", 96 | " 'issue_numbers',\n", 97 | " 'base_commit',\n", 98 | " 'patch',\n", 99 | " 'test_patch',\n", 100 | " 'problem_statement',\n", 101 | " 'hints_text',\n", 102 | " 'created_at',\n", 103 | " 'version',\n", 104 | " 'PASS_TO_PASS',\n", 105 | " 'FAIL_TO_PASS',\n", 106 | "]\n", 107 | "# We need to define feature to make sure the dataset is consistent with the huggingface dataset on the hub\n", 108 | "FEATURES = Features({\n", 109 | " 'repo': Value(dtype='string', id=None),\n", 110 | " 'pull_number': Value(dtype='int64', id=None),\n", 111 | " 'instance_id': Value(dtype='string', id=None),\n", 112 | " 'issue_numbers': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),\n", 113 | " 'base_commit': Value(dtype='string', id=None),\n", 114 | " 'patch': Value(dtype='string', id=None),\n", 115 | " 'test_patch': Value(dtype='string', id=None),\n", 116 | " 'problem_statement': Value(dtype='string', id=None),\n", 117 | " 'hints_text': Value(dtype='string', id=None),\n", 118 | " 'created_at': Value(dtype='string', id=None),\n", 119 | " 'version': Value(dtype='string', id=None),\n", 120 | " 'PASS_TO_PASS': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),\n", 121 | " 'FAIL_TO_PASS': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)\n", 122 | "})\n", 123 | "def to_hf_dataset(data_list):\n", 124 | " return Dataset.from_dict({k: [d[k] for d in data_list] for k in KEYS}, features=FEATURES)\n", 125 | "validated_dataset = to_hf_dataset(validated_instances)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "validated_dataset.push_to_hub(\"\", split=\"\", private=True)" 135 | ] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "swebench", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.9.19" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 2 159 | } 160 | -------------------------------------------------------------------------------- /swebench/collect/get_tasks_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Script to collect pull requests and convert them to candidate task instances""" 4 | 5 | import argparse, os 6 | import traceback 7 | 8 | from dotenv import load_dotenv 9 | from multiprocessing import Pool 10 | from swebench.collect.build_dataset import main as build_dataset 11 | from swebench.collect.print_pulls import main as print_pulls 12 | 13 | 14 | load_dotenv() 15 | 16 | 17 | def split_instances(input_list: list, n: int) 
-> list: 18 | """ 19 | Split a list into n approximately equal length sublists 20 | 21 | Args: 22 | input_list (list): List to split 23 | n (int): Number of sublists to split into 24 | Returns: 25 | result (list): List of sublists 26 | """ 27 | avg_length = len(input_list) // n 28 | remainder = len(input_list) % n 29 | result, start = [], 0 30 | 31 | for i in range(n): 32 | length = avg_length + 1 if i < remainder else avg_length 33 | sublist = input_list[start : start + length] 34 | result.append(sublist) 35 | start += length 36 | 37 | return result 38 | 39 | 40 | def construct_data_files(data: dict): 41 | """ 42 | Logic for combining multiple .all PR files into a single fine tuning dataset 43 | 44 | Args: 45 | data (dict): Dictionary containing the following keys: 46 | repos (list): List of repositories to retrieve instruction data for 47 | path_prs (str): Path to save PR data files to 48 | path_tasks (str): Path to save task instance data files to 49 | token (str): GitHub token to use for API requests 50 | """ 51 | repos, path_prs, path_tasks, max_pulls, cutoff_date, token = ( 52 | data["repos"], 53 | data["path_prs"], 54 | data["path_tasks"], 55 | data["max_pulls"], 56 | data["cutoff_date"], 57 | data["token"], 58 | ) 59 | for repo in repos: 60 | repo = repo.strip(",").strip() 61 | repo_name = repo.split("/")[1] 62 | try: 63 | path_pr = os.path.join(path_prs, f"{repo_name}-prs.jsonl") 64 | if cutoff_date: 65 | path_pr = path_pr.replace(".jsonl", f"-{cutoff_date}.jsonl") 66 | if not os.path.exists(path_pr): 67 | print(f"Pull request data for {repo} not found, creating...") 68 | print_pulls( 69 | repo, 70 | path_pr, 71 | token, 72 | max_pulls=max_pulls, 73 | cutoff_date=cutoff_date 74 | ) 75 | print(f"✅ Successfully saved PR data for {repo} to {path_pr}") 76 | else: 77 | print(f"📁 Pull request data for {repo} already exists at {path_pr}, skipping...") 78 | 79 | path_task = os.path.join(path_tasks, f"{repo_name}-task-instances.jsonl") 80 | if not os.path.exists(path_task): 81 | print(f"Task instance data for {repo} not found, creating...") 82 | build_dataset(path_pr, path_task, token) 83 | print(f"✅ Successfully saved task instance data for {repo} to {path_task}") 84 | else: 85 | print(f"📁 Task instance data for {repo} already exists at {path_task}, skipping...") 86 | except Exception as e: 87 | print("-"*80) 88 | print(f"Something went wrong for {repo}, skipping: {e}") 89 | print("Here is the full traceback:") 90 | traceback.print_exc() 91 | print("-"*80) 92 | 93 | 94 | def main( 95 | repos: list, 96 | path_prs: str, 97 | path_tasks: str, 98 | max_pulls: int = None, 99 | cutoff_date: str = None, 100 | ): 101 | """ 102 | Spawns multiple threads given multiple GitHub tokens for collecting fine tuning data 103 | 104 | Args: 105 | repos (list): List of repositories to retrieve instruction data for 106 | path_prs (str): Path to save PR data files to 107 | path_tasks (str): Path to save task instance data files to 108 | cutoff_date (str): Cutoff date for PRs to consider in format YYYYMMDD 109 | """ 110 | path_prs, path_tasks = os.path.abspath(path_prs), os.path.abspath(path_tasks) 111 | print(f"Will save PR data to {path_prs}") 112 | print(f"Will save task instance data to {path_tasks}") 113 | print(f"Received following repos to create task instances for: {repos}") 114 | 115 | tokens = os.getenv("GITHUB_TOKENS") 116 | if not tokens: raise Exception("Missing GITHUB_TOKENS, consider rerunning with GITHUB_TOKENS=$(gh auth token)") 117 | tokens = tokens.split(",") 118 | data_task_lists = 
split_instances(repos, len(tokens)) 119 | 120 | data_pooled = [ 121 | { 122 | "repos": repos, 123 | "path_prs": path_prs, 124 | "path_tasks": path_tasks, 125 | "max_pulls": max_pulls, 126 | "cutoff_date": cutoff_date, 127 | "token": token 128 | } 129 | for repos, token in zip(data_task_lists, tokens) 130 | ] 131 | 132 | with Pool(len(tokens)) as p: 133 | p.map(construct_data_files, data_pooled) 134 | 135 | 136 | if __name__ == "__main__": 137 | parser = argparse.ArgumentParser(description=__doc__) 138 | parser.add_argument( 139 | "--repos", nargs="+", help="List of repositories (e.g., `sqlfluff/sqlfluff`) to create task instances for" 140 | ) 141 | parser.add_argument( 142 | "--path_prs", type=str, help="Path to folder to save PR data files to" 143 | ) 144 | parser.add_argument( 145 | "--path_tasks", 146 | type=str, 147 | help="Path to folder to save task instance data files to", 148 | ) 149 | parser.add_argument( 150 | "--max_pulls", 151 | type=int, 152 | help="Maximum number of pulls to log", 153 | default=None 154 | ) 155 | parser.add_argument( 156 | "--cutoff_date", 157 | type=str, 158 | help="Cutoff date for PRs to consider in format YYYYMMDD", 159 | default=None, 160 | ) 161 | args = parser.parse_args() 162 | main(**vars(args)) 163 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to the PyPI package for SWE-bench ([`swebench`](https://pypi.org/project/swebench/)) will be documented in this file. 4 | 5 | Prior to version 1.1.0, not all deployed versions are listed, as the PyPI package was going through development and testing. The noteworthy versions and the respective changes that were introduced by that version are included. All versions 1.1.0 onwards are fully listed. 6 | 7 | ## [2.0.12] - 7/21/2024 8 | * Minor naming changes 9 | * #186 fix: correct some typings and a incorrect function call 10 | * #183 Fix timeout 11 | * #178 Add schema version to report card 12 | * #177 Fix run live scripts 13 | 14 | ## [2.0.9] - 7/10/2024 15 | * #176 Move inference to swebench.inference sub-package 16 | * #175 Fix link in collect README.md 17 | 18 | ## [2.0.8] - 7/8/2024 19 | * Add `cutoff_date`, `max_pulls` arguments to collection pipeline 20 | * Minor Django issue comment parsing logic 21 | * Rewritten `extract_patches` logic 22 | * Remove `MAP_REPO_TO_TEST_FRAMEWORK` symbol 23 | 24 | ## [2.0.4] - 7/5/2024 25 | * #173 Fix: Allow to set GH token from env var in collect/print_pulls 26 | * #171 Don't let tox install a virtualenv during evaluation 27 | * #169 Handle failures because of None/empty patches 28 | 29 | ## [2.0.3] - 7/2/2024 30 | * #149 Interface fix: run_id is required 31 | * #151 Fix: Support JSON datasets (avoid loading json twice) 32 | * #152 Add very simple CI 33 | * #153 Various nitpicks 34 | * #155 Fix link to collection tutorial 35 | * #161 Fix path to image in docs 36 | * #162 Fix evaluation hanging issue and improve patch apply 37 | * #164 Fix so it doesn't crash when no env imgs to build 38 | * #166 Fix newline outputs for django's log parser 39 | * #168 Update reporting and skip empty model patch predictions 40 | 41 | ## [2.0.0] - 6/27/2024 42 | Major release - the SWE-bench evaluation harness has been upgraded to incorporate containerized, sandboxed execution environments based on Docker. 
There are several chances to the API resulting from this: 43 | * Removal of the `swebench.metrics` module 44 | * Updates to the API of `swebench.harness` functionality 45 | * Significant modifications to underlying evaluation logic 46 | * Minor updates to installation specifications for different repos + versions. 47 | 48 | Read the full report [here](https://github.com/princeton-nlp/SWE-bench/tree/main/docs/20240627_docker) 49 | 50 | ## [1.1.5] - 5/15/2024 51 | * Add support for HumanEvalFix (Python, JS, Go, Java) ([source](https://huggingface.co/datasets/bigcode/humanevalpack)) 52 | 53 | ## [1.1.0] - 4/15/2024 54 | * Add `env_vars_test` field to allow for environment variable assignment for testing scripts. 55 | * Change `pip_packages` installation specification to be a list instead of a string. 56 | * Define PyPI package versioning explicitly for dev, test repos. 57 | * Fix versioning for `astroid` dependency in `pylint` installation script`. 58 | * Fix minor error in `parse_log_pytest_options`. 59 | * Improve clarity + succinctness of logging. 60 | * Make logging of subprocess args to log file smaller. 61 | * Remove installation specifications for `dbt-core`, `transformers`. 62 | * Remove redundant declaration of constants. 63 | * Remove unused versions from installation specifications for dev, test repos. 64 | * Rewrite `swebench.metrics.get_model_report`. 65 | 66 | ## [1.0.5] - 4/7/2024 67 | * Fix log parsing for `pydicom`, `pylint`, and `requests` libraries. [5cb448](https://github.com/princeton-nlp/SWE-bench/commit/5cb448140a8cd05490650b0671d860765180f26c) 68 | 69 | ## [1.0.4] - 4/5/2024 70 | * Fixed `env_list` parsing. [5be59d](https://github.com/princeton-nlp/SWE-bench/commit/5be59d665233ffb63b9beb30b2740cc41098e51f) 71 | * Updated `ExecWrapper`, `LogWrapper` logic for harness. [231a2b](https://github.com/princeton-nlp/SWE-bench/commit/231a2b205c5ca9ddcb126b73b22667d79e1b6108) 72 | 73 | ## [1.0.2] - 4/2/2024 74 | * Added `try/catch` around `lsof` based clean up for `run_evaluation.py`. [3fb217](https://github.com/princeton-nlp/SWE-bench/commit/3fb2179a5c69737465f916898e8708adffff9914) 75 | * Fixed `get_eval_refs` function. [12a287](https://github.com/princeton-nlp/SWE-bench/commit/12a287a9591cb4a0d65483f0c8bfaa3375285bfc) 76 | * Fixed `seaborn` log parser. [0372b6](https://github.com/princeton-nlp/SWE-bench/commit/0372b6a9ff62516067fb26f602163c231d818163) 77 | 78 | ## [1.0.1] - 3/31/2024 79 | First working version. We strongly recommend not using versions older than this one. 80 | * Added logging for failed installations. [58d24d](https://github.com/princeton-nlp/SWE-bench/commit/58d24d1b65b95ed96d57805604aca7adca49861d) 81 | * Added missing `datasets` dependency. [68e89e](https://github.com/princeton-nlp/SWE-bench/commit/68e89ef8d099ca5c23a8fd5681e3f990cf729fd6) 82 | * Reorganized repository to be directly build-able as a PyPI package. [548bdb](https://github.com/princeton-nlp/SWE-bench/commit/548bdbffb2ac5f0a09c1d7eb95bbee1bce126233) 83 | 84 | ## [0.6.9 - 0.6.9.2] - 3/31/2024 85 | > ⚠️ Do NOT use these versions. The PyPI package for these versions was under development. Specifically, some of the evaluation configurations required re-validation. A detailed report for the failures and our recovery from it are detailed in [Bug Report 4/5/2024](docs/reports/20240405_eval_bug/README.md). 86 | 87 | ## [0.6.1] - 3/14/2023 88 | * Added minor conditions to make `run_evaluation` more robust (e.g. 
exit on empty predictions) 89 | * Added logic that conditions conda link download based on which architecture/platform (e.g. x86, arm) the code is being run on. 90 | * Added classes to unify `subprocess` execution arguments + make them more consistent throughout the codebase. Also remove `shell=True` flag when not necessary. 91 | * Added deterministic hashing of model name when creating certain testbed paths, defends against https://github.com/conda/conda/issues/12250 92 | * Fixed key errors across the `metrics/` folder. 93 | * Reorganized `harness` code. Moved constants into a separate file to improve readability. 94 | 95 | ## [0.4.8] - 11/8/2023 96 | * `run_evaluation` can be imported to make running the evaluation harness of SWE-bench more accessible. 97 | * Add condition in `harness/context_manager.py` to skip installation if no instructions are provided. 98 | * Add functionality to check and remove logs with `AttributeError` or `ImportError` 99 | * Add support for HumanEval dataset. 100 | * Add support for relative paths for `log_dir` and `testbed` arguments of evaluation. 101 | * Minor renaming for `metrics/report.py` variables. 102 | 103 | ## [0.4.3] - 11/5/2023 104 | Introducing the initial release of SWE-Bench, a novel benchmark that introduces "software engineering as a task". Given a codebase and an issue, a model is tasked with writing a `.patch` file that addresses the desired changes. 105 | 106 | Please view the `README.md` for information on how to run the repository, and check out our paper, [SWE-bench: Can Language Models Resolve Real-World GitHub Issues?](https://arxiv.org/abs/2310.06770), for full details on the project. 107 | 108 | We will maintain a leaderboard on the SWE-bench public [website](http://swe-bench.github.io). We will release details soon on how to submit your generations for evaluation to be included on the leaderboard. 109 | 110 | ## [< 0.4.3] - 11/4/2023 111 | > ⚠️ Do NOT use these versions. The PyPI package was under development for these versions and will not work properly. 
-------------------------------------------------------------------------------- /swebench/collect/build_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import json 5 | import logging 6 | import os 7 | from typing import Optional 8 | 9 | from swebench.collect.utils import ( 10 | extract_patches, 11 | extract_problem_statement_and_hints, 12 | Repo, 13 | ) 14 | 15 | logging.basicConfig( 16 | level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 17 | ) 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def create_instance(repo: Repo, pull: dict) -> dict: 22 | """ 23 | Create a single task instance from a pull request, where task instance is: 24 | 25 | { 26 | repo (str): owner/repo this task instance is from, 27 | pull_number (int): number of PR this task instance is from, 28 | base_commit (str): SHA of the base commit PR is based on, 29 | patch (str): reference solution as .patch (apply to base commit), 30 | test_patch (str): test suite as .patch (apply to base commit), 31 | } 32 | """ 33 | patch, test_patch = extract_patches(pull, repo) 34 | problem_statement, hints = extract_problem_statement_and_hints(pull, repo) 35 | return { 36 | "repo": repo.repo.full_name, 37 | "pull_number": pull["number"], 38 | "instance_id": (repo.repo.full_name + "-" + str(pull["number"])).replace( 39 | "/", "__" 40 | ), 41 | "issue_numbers": pull["resolved_issues"], 42 | "base_commit": pull["base"]["sha"], 43 | "patch": patch, 44 | "test_patch": test_patch, 45 | "problem_statement": problem_statement, 46 | "hints_text": hints, 47 | "created_at": pull["created_at"], 48 | } 49 | 50 | 51 | def is_valid_pull(pull: dict) -> bool: 52 | """ 53 | Check whether PR has an associated issue and is merged 54 | 55 | Args: 56 | pull (dict): pull request object 57 | Returns: 58 | bool: whether PR is valid 59 | """ 60 | if pull["merged_at"] is None: 61 | return False 62 | if "resolved_issues" not in pull or len(pull["resolved_issues"]) < 1: 63 | return False 64 | return True 65 | 66 | 67 | def is_valid_instance(instance: dict) -> bool: 68 | """ 69 | Check whether task instance has all required fields for task instance creation 70 | 71 | Args: 72 | instance (dict): task instance object 73 | Returns: 74 | bool: whether task instance is valid 75 | """ 76 | if instance["patch"] is None or instance["patch"] == "": 77 | return False 78 | if instance["problem_statement"] is None or instance["problem_statement"] == "": 79 | return False 80 | return True 81 | 82 | 83 | def has_test_patch(instance: dict) -> bool: 84 | """ 85 | Check whether task instance has a test suite 86 | 87 | Args: 88 | instance (dict): task instance object 89 | Returns: 90 | bool: whether task instance has a test suite 91 | """ 92 | if instance["test_patch"] is None or instance["test_patch"].strip() == "": 93 | return False 94 | return True 95 | 96 | 97 | def main(pr_file: str, output: str, token: Optional[str] = None): 98 | """ 99 | Main thread for creating task instances from pull requests 100 | 101 | Args: 102 | pr_file (str): path to pull request JSONL file 103 | output (str): output file name 104 | token (str): GitHub token 105 | """ 106 | if token is None: 107 | # Get GitHub token from environment variable if not provided 108 | token = os.environ.get("GITHUB_TOKEN") 109 | 110 | def load_repo(repo_name): 111 | # Return repo object for a given repo name 112 | owner, repo = repo_name.split("/") 113 | return Repo(owner, repo, 
token=token) 114 | 115 | repos = dict() 116 | completed = 0 117 | with_tests = 0 118 | total_instances = 0 119 | all_output = output + ".all" 120 | seen_prs = set() 121 | 122 | # Continue where we left off if output file already exists 123 | if os.path.exists(all_output): 124 | with open(all_output) as f: 125 | for line in f: 126 | pr = json.loads(line) 127 | if "instance_id" not in pr: 128 | pr["instance_id"] = ( 129 | pr["repo"] + "-" + str(pr["pull_number"]) 130 | ).replace("/", "__") 131 | instance_id = pr["instance_id"] 132 | seen_prs.add(instance_id) 133 | if is_valid_instance(pr): 134 | completed += 1 135 | if has_test_patch(pr): 136 | with_tests += 1 137 | logger.info(f"Will skip {len(seen_prs)} pull requests that have already been inspected") 138 | 139 | # Write to .all file for all PRs 140 | write_mode_all = "w" if not os.path.exists(all_output) else "a" 141 | with open(all_output, write_mode_all) as all_output: 142 | # Write to output file for PRs with test suites 143 | write_mode = "w" if not os.path.exists(output) else "a" 144 | with open(output, write_mode) as output: 145 | for ix, line in enumerate(open(pr_file)): 146 | total_instances += 1 147 | pull = json.loads(line) 148 | if ix % 100 == 0: 149 | logger.info( 150 | f"[{pull['base']['repo']['full_name']}] (Up to {ix} checked) " 151 | f"{completed} valid, {with_tests} with tests." 152 | ) 153 | # Construct instance fields 154 | instance_id = ( 155 | pull["base"]["repo"]["full_name"] + "-" + str(pull["number"]) 156 | ) 157 | instance_id = instance_id.replace("/", "__") 158 | if instance_id in seen_prs: 159 | seen_prs -= {instance_id} 160 | continue 161 | if not is_valid_pull(pull): 162 | # Throw out invalid PRs 163 | continue 164 | # Create task instance 165 | repo_name = pull["base"]["repo"]["full_name"] 166 | if repo_name not in repos: 167 | repos[repo_name] = load_repo(repo_name) 168 | repo = repos[repo_name] 169 | instance = create_instance(repo, pull) 170 | if is_valid_instance(instance): 171 | # If valid, write to .all output file 172 | print( 173 | json.dumps(instance), end="\n", flush=True, file=all_output 174 | ) # write all instances to a separate file 175 | completed += 1 176 | if has_test_patch(instance): 177 | # If has test suite, write to output file 178 | print(json.dumps(instance), end="\n", flush=True, file=output) 179 | with_tests += 1 180 | logger.info(f"[{', '.join(repos.keys())}] Total instances: {total_instances}, completed: {completed}, with tests: {with_tests}") 181 | logger.info(f"[{', '.join(repos.keys())}] Skipped {len(seen_prs)} pull requests that have already been inspected") 182 | 183 | 184 | if __name__ == "__main__": 185 | parser = argparse.ArgumentParser() 186 | parser.add_argument("pr_file", type=str, help="Path to pull request JSONL file") 187 | parser.add_argument("output", type=str, help="Output file name") 188 | parser.add_argument("--token", type=str, help="GitHub token") 189 | args = parser.parse_args() 190 | main(**vars(args)) 191 | -------------------------------------------------------------------------------- /swebench/inference/make_datasets/tokenize_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Provided a source (raw) directory and the final (eval) directory, create a training split by removing all instances that are in the final directory from the source directory. 
4 | """ 5 | 6 | import os 7 | import logging 8 | from argparse import ArgumentParser 9 | from pathlib import Path 10 | 11 | import tiktoken 12 | from datasets import disable_caching, load_from_disk, load_dataset 13 | from tqdm.auto import tqdm 14 | from transformers import LlamaTokenizer 15 | 16 | logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") 17 | logger = logging.getLogger(__name__) 18 | logger.warning("Disabling caching") 19 | disable_caching() 20 | 21 | 22 | def cl100k(text, tokenizer): 23 | return tokenizer.encode(text, disallowed_special=()) 24 | 25 | 26 | def llama(text, tokenizer): 27 | return tokenizer(text, add_special_tokens=False, return_attention_mask=False)[ 28 | "input_ids" 29 | ] 30 | 31 | 32 | TOKENIZER_FUNCS = { 33 | "cl100k": (tiktoken.get_encoding("cl100k_base"), cl100k), 34 | "llama": (LlamaTokenizer.from_pretrained("togethercomputer/LLaMA-2-7B-32K"), llama), 35 | } 36 | 37 | 38 | def extract_fields(instance, tokenizer_name, tokenizer, tokenizer_func, eos_token): 39 | instance_id = instance["instance_id"] 40 | if instance["text"] is None or instance["patch"] is None: 41 | print(f"No text for {instance_id}") 42 | return {"input_ids": [], "labels": [], "text": "", "patch": ""} 43 | text_inputs = instance["text"].strip() + "\n" 44 | if text_inputs is None or instance["patch"] is None: 45 | print(f"No inputs for {instance_id}") 46 | return None 47 | patch = instance["patch"].strip() 48 | if len(eos_token) > 0: 49 | patch += f"\n{eos_token}" 50 | input_ids = tokenizer_func(text_inputs, tokenizer) 51 | if tokenizer_name in {"llama"}: 52 | label_ids = tokenizer_func( 53 | "\n" + patch, tokenizer 54 | ) # add newline to tokenize patch 55 | idx = label_ids.index(13) 56 | assert ( 57 | idx <= 2 58 | ), "Expected newline token id (13) to be one of the first three tokens" 59 | label_ids = label_ids[idx + 1 :] # remove newline tokens 60 | else: 61 | label_ids = tokenizer_func(patch, tokenizer) 62 | inputs = input_ids + label_ids[:-1] 63 | cond_len = len(input_ids) - 1 64 | labels = [-100] * cond_len + label_ids 65 | assert len(inputs) == len(labels) 66 | return {**instance, "input_ids": inputs, "labels": labels, "text": text_inputs, "patch": patch} 67 | 68 | 69 | def extract_test_fields(instance, tokenizer_name, tokenizer, tokenizer_func, eos_token): 70 | instance_id = instance["instance_id"] 71 | if instance["text"] is None or instance["patch"] is None: 72 | print(f"No text for {instance_id}") 73 | return None 74 | text_inputs = instance["text"].strip() + "\n" 75 | if text_inputs is None or instance["patch"] is None: 76 | print(f"No inputs for {instance_id}") 77 | return None 78 | patch = instance["patch"].strip() 79 | if len(eos_token) > 0: 80 | patch += f"\n{eos_token}" 81 | input_ids = tokenizer_func(text_inputs, tokenizer) 82 | label_ids = tokenizer_func(patch, tokenizer) 83 | inputs = input_ids 84 | labels = label_ids 85 | return {**instance, "input_ids": inputs, "labels": labels, "text": text_inputs, "patch": patch} 86 | 87 | 88 | def add_columns_from_dict(dataset, dict_columns): 89 | """dict_columns is a list of dicts with keys that are columns in dataset""" 90 | for column in dict_columns[0].keys(): 91 | values = [d[column] for d in dict_columns] 92 | if column in dataset.column_names: 93 | dataset = dataset.remove_columns(column) 94 | dataset = dataset.add_column(column, values) 95 | return dataset 96 | 97 | 98 | def main( 99 | dataset_name_or_path, 100 | output_dir, 101 | tokenizer_name, 102 | num_proc, 103 | push_to_hub_user, 104 
| ): 105 | if push_to_hub_user is not None: 106 | hub_token = os.environ.get("HUGGING_FACE_HUB_TOKEN", None) 107 | if hub_token is None: 108 | raise ValueError("Must provide HUGGING_FACE_HUB_TOKEN to push to the Hub") 109 | if not Path(output_dir).exists(): 110 | Path(output_dir).mkdir(parents=True) 111 | 112 | if tokenizer_name is not None: 113 | tokenizer, tokenizer_func = TOKENIZER_FUNCS[tokenizer_name] 114 | eos_token = getattr(tokenizer, "eos_token", "") 115 | if num_proc > 0 and tokenizer_name == 'cl100k': 116 | logger.warning('cl100k tokenizer does not support multiprocessing. Ignoring num_proc') 117 | num_proc = 0 118 | 119 | if Path(dataset_name_or_path).exists(): 120 | dataset = load_from_disk(dataset_name_or_path) 121 | else: 122 | dataset = load_dataset(dataset_name_or_path) 123 | dataset = dataset.filter(lambda x: len(x["text"]) <= 5_000_000) # filter out superlong instances 124 | for split in dataset.keys(): 125 | if split == "test": 126 | continue 127 | if num_proc > 0: 128 | dataset[split] = dataset[split].map( 129 | lambda instance: extract_fields( 130 | instance, 131 | tokenizer_name, 132 | tokenizer, 133 | tokenizer_func, 134 | eos_token, 135 | ), 136 | num_proc=num_proc, 137 | batched=False, 138 | desc=f"Tokenizing {split}", 139 | ) 140 | elif len(dataset[split]) > 0: 141 | new_values = list( 142 | map( 143 | lambda x: extract_fields( 144 | x, tokenizer_name, tokenizer, tokenizer_func, eos_token 145 | ), 146 | tqdm( 147 | dataset[split], 148 | total=len(dataset[split]), 149 | desc=f"Tokenizing {split}", 150 | ), 151 | ) 152 | ) 153 | dataset[split] = add_columns_from_dict(dataset[split], new_values) 154 | for split in ["test"]: 155 | if split not in dataset: 156 | logger.warning(f"Split {split} not in dataset. Skipping") 157 | continue 158 | if num_proc > 0: 159 | dataset[split] = dataset[split].map( 160 | lambda instance: extract_test_fields( 161 | instance, 162 | tokenizer_name, 163 | tokenizer, 164 | tokenizer_func, 165 | eos_token, 166 | ), 167 | num_proc=num_proc, 168 | batched=False, 169 | desc=f"Tokenizing {split}", 170 | ) 171 | elif len(dataset[split]) > 0: 172 | new_values = list( 173 | map( 174 | lambda x: extract_test_fields( 175 | x, tokenizer_name, tokenizer, tokenizer_func, eos_token 176 | ), 177 | tqdm( 178 | dataset[split], 179 | total=len(dataset[split]), 180 | desc=f"Tokenizing {split}", 181 | ), 182 | ) 183 | ) 184 | dataset[split] = add_columns_from_dict(dataset[split], new_values) 185 | output_file = Path(dataset_name_or_path).name + f"__tok-{tokenizer_name}" 186 | if push_to_hub_user is not None: 187 | output_file = f"{push_to_hub_user}/{output_file}" 188 | dataset.push_to_hub(output_file, use_auth_token=hub_token) 189 | else: 190 | output_file = Path(output_dir) / output_file 191 | dataset.save_to_disk(output_file) 192 | logger.warning(f"Saved to {output_file}") 193 | 194 | 195 | if __name__ == "__main__": 196 | parser = ArgumentParser(description=__doc__) 197 | parser.add_argument("--dataset_name_or_path", type=str, required=True) 198 | parser.add_argument("--output_dir", type=str, required=True) 199 | parser.add_argument( 200 | "--tokenizer_name", type=str, required=True, choices=TOKENIZER_FUNCS.keys() 201 | ) 202 | parser.add_argument("--num_proc", type=int, default=0) 203 | parser.add_argument( 204 | "--push_to_hub_user", 205 | type=str, 206 | default=None, 207 | help="Push the dataset to the Hub user under this name.", 208 | ) 209 | main(**vars(parser.parse_args())) 210 | 
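Note on the label construction in `extract_fields` above: the prompt tokens are masked with `-100` so that the training loss is computed only over the patch tokens, following the usual causal-LM convention. A minimal sketch of that alignment is shown below, using made-up toy token ids rather than the output of any real tokenizer:

```python
# Toy illustration of the input/label alignment built in extract_fields above.
# The token ids are invented for clarity; no real tokenizer is involved.
input_ids = [101, 7, 8, 9]      # tokenized issue/context ("text" field)
label_ids = [13, 14, 15, 2]     # tokenized patch (last id standing in for EOS)

inputs = input_ids + label_ids[:-1]      # model input: context plus all but the final patch token
cond_len = len(input_ids) - 1
labels = [-100] * cond_len + label_ids   # -100 marks positions ignored by the loss

assert len(inputs) == len(labels)        # same invariant as in extract_fields
for tok, lab in zip(inputs, labels):
    print(tok, lab)
```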
-------------------------------------------------------------------------------- /Original_README.md: -------------------------------------------------------------------------------- 1 | ## SWE-Bench-Fork for SWE-Gym 2 | 3 | This fork contains environment setup files for the 11 additional repositories used in the SWE-Gym dataset, as well as an improved version of the instance collection pipeline. 4 | 5 | We plan to upstream the changes and merge with SWE-Bench soon. 6 | 7 | 

8 | 9 | [banner image: Kawi the SWE-Llama] 10 | 11 | 

12 | 13 |
14 | 15 | | [日本語](docs/README_JP.md) | [English](https://github.com/princeton-nlp/SWE-bench) | [中文简体](docs/README_CN.md) | [中文繁體](docs/README_TW.md) | 16 | 17 |
18 | 19 | 20 | --- 21 |

22 | Code and data for our ICLR 2024 paper SWE-bench: Can Language Models Resolve Real-World GitHub Issues? 23 |
24 |
25 | 26 | [badge: Build] 27 | 28 | 29 | [badge: License] 30 | 31 | 32 | 33 | 34 | 

35 | 36 | Please refer to our [website](http://swe-bench.github.io) for the public leaderboard and the [change log](https://github.com/princeton-nlp/SWE-bench/blob/main/CHANGELOG.md) for information on the latest updates to the SWE-bench benchmark. 37 | 38 | ## 📰 News 39 | * **[Aug. 13, 2024]**: Introducing *SWE-bench Verified*! Part 2 of our collaboration with [OpenAI Preparedness](https://openai.com/preparedness/). A subset of 500 problems that real software engineers have confirmed are solvable. Check out more in the [report](https://openai.com/index/introducing-swe-bench-verified/)! 40 | * **[Jun. 27, 2024]**: We have an exciting update for SWE-bench - with support from [OpenAI's Preparedness](https://openai.com/preparedness/) team: We're moving to a fully containerized evaluation harness using Docker for more reproducible evaluations! Read more in our [report](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md). 41 | * **[Apr. 15, 2024]**: SWE-bench has gone through major improvements to resolve issues with the evaluation harness. Read more in our [report](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240415_eval_bug/README.md). 42 | * **[Apr. 2, 2024]**: We have released [SWE-agent](https://github.com/princeton-nlp/SWE-agent), which sets the state-of-the-art on the full SWE-bench test set! ([Tweet 🔗](https://twitter.com/jyangballin/status/1775114444370051582)) 43 | * **[Jan. 16, 2024]**: SWE-bench has been accepted to ICLR 2024 as an oral presentation! ([OpenReview 🔗](https://openreview.net/forum?id=VTF8yNQM66)) 44 | 45 | ## 👋 Overview 46 | SWE-bench is a benchmark for evaluating large language models on real-world software issues collected from GitHub. 47 | Given a *codebase* and an *issue*, a language model is tasked with generating a *patch* that resolves the described problem. 48 | 49 | 50 | 51 | To access SWE-bench, copy and run the following code: 52 | ```python 53 | from datasets import load_dataset 54 | swebench = load_dataset('princeton-nlp/SWE-bench', split='test') 55 | ``` 56 | 57 | ## 🚀 Set Up 58 | SWE-bench uses Docker for reproducible evaluations. 59 | Follow the instructions in the [Docker setup guide](https://docs.docker.com/engine/install/) to install Docker on your machine. 60 | If you're setting up on Linux, we recommend seeing the [post-installation steps](https://docs.docker.com/engine/install/linux-postinstall/) as well. 61 | 62 | Finally, to build SWE-bench from source, follow these steps: 63 | ```bash 64 | git clone git@github.com:princeton-nlp/SWE-bench.git 65 | cd SWE-bench 66 | pip install -e . 67 | ``` 68 | 69 | Test your installation by running: 70 | ```bash 71 | python -m swebench.harness.run_evaluation \ 72 | --predictions_path gold \ 73 | --max_workers 1 \ 74 | --instance_ids sympy__sympy-20590 \ 75 | --run_id validate-gold 76 | ``` 77 | 78 | ## 💽 Usage 79 | > [!WARNING] 80 | > Running fast evaluations on SWE-bench can be resource intensive. 81 | > We recommend running the evaluation harness on an `x86_64` machine with at least 120GB of free storage, 16GB of RAM, and 8 CPU cores. 82 | > You may need to experiment with the `--max_workers` argument to find the optimal number of workers for your machine, but we recommend using fewer than `min(0.75 * os.cpu_count(), 24)`. 83 | > 84 | > If running with Docker Desktop, make sure to increase your virtual disk space to have ~120 GB of free space available, and set `max_workers` to be consistent with the above for the CPUs available to Docker. 
85 | > 86 | > Support for `arm64` machines is experimental. 87 | 88 | Evaluate model predictions on SWE-bench Lite using the evaluation harness with the following command: 89 | ```bash 90 | python -m swebench.harness.run_evaluation \ 91 | --dataset_name princeton-nlp/SWE-bench_Lite \ 92 | --predictions_path <path_to_predictions> \ 93 | --max_workers <num_workers> \ 94 | --run_id <run_id> 95 | # use --predictions_path 'gold' to verify the gold patches 96 | # use --run_id to name the evaluation run 97 | ``` 98 | 99 | This command will generate Docker build logs (`logs/build_images`) and evaluation logs (`logs/run_evaluation`) in the current directory. 100 | 101 | The final evaluation results will be stored in the `evaluation_results` directory. 102 | 103 | To see the full list of arguments for the evaluation harness, run: 104 | ```bash 105 | python -m swebench.harness.run_evaluation --help 106 | ``` 107 | 108 | Additionally, the SWE-bench repo can help you: 109 | * Train your own models on our pre-processed datasets 110 | * Run [inference](https://github.com/princeton-nlp/SWE-bench/blob/main/swebench/inference/README.md) on existing models (either models you have on-disk like LLaMA, or models you have access to through an API like GPT-4). The inference step is where you get a repo and an issue and have the model try to generate a fix for it. 111 | * Run SWE-bench's [data collection procedure](https://github.com/princeton-nlp/SWE-bench/blob/main/swebench/collect/) on your own repositories, to make new SWE-bench tasks. 112 | 113 | ## ⬇️ Downloads 114 | | Datasets | Models | 115 | | - | - | 116 | | [🤗 SWE-bench](https://huggingface.co/datasets/princeton-nlp/SWE-bench) | [🦙 SWE-Llama 13b](https://huggingface.co/princeton-nlp/SWE-Llama-13b) | 117 | | [🤗 "Oracle" Retrieval](https://huggingface.co/datasets/princeton-nlp/SWE-bench_oracle) | [🦙 SWE-Llama 13b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-13b-peft) | 118 | | [🤗 BM25 Retrieval 13K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_13K) | [🦙 SWE-Llama 7b](https://huggingface.co/princeton-nlp/SWE-Llama-7b) | 119 | | [🤗 BM25 Retrieval 27K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_27K) | [🦙 SWE-Llama 7b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-7b-peft) | 120 | | [🤗 BM25 Retrieval 40K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_40K) | | 121 | | [🤗 BM25 Retrieval 50K (Llama tokens)](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_50k_llama) | | 122 | 123 | ## 🍎 Tutorials 124 | We've also written the following blog posts on how to use different parts of SWE-bench. 125 | If you'd like to see a post about a particular topic, please let us know via an issue. 126 | * [Nov 1. 2023] Collecting Evaluation Tasks for SWE-Bench ([🔗](https://github.com/princeton-nlp/SWE-bench/blob/main/assets/collection.md)) 127 | * [Nov 6. 2023] Evaluating on SWE-bench ([🔗](https://github.com/princeton-nlp/SWE-bench/blob/main/assets/evaluation.md)) 128 | 129 | ## 💫 Contributions 130 | We would love to hear from the broader NLP, Machine Learning, and Software Engineering research communities, and we welcome any contributions, pull requests, or issues! 131 | To do so, please either file a new pull request or issue and fill in the corresponding templates accordingly. We'll be sure to follow up shortly! 132 | 133 | Contact persons: [Carlos E. Jimenez](http://www.carlosejimenez.com/) and [John Yang](https://john-b-yang.github.io/) (Email: carlosej@princeton.edu, johnby@stanford.edu). 
134 | 135 | ## ✍️ Citation 136 | If you find our work helpful, please use the following citations. 137 | ``` 138 | @inproceedings{ 139 | jimenez2024swebench, 140 | title={{SWE}-bench: Can Language Models Resolve Real-world Github Issues?}, 141 | author={Carlos E Jimenez and John Yang and Alexander Wettig and Shunyu Yao and Kexin Pei and Ofir Press and Karthik R Narasimhan}, 142 | booktitle={The Twelfth International Conference on Learning Representations}, 143 | year={2024}, 144 | url={https://openreview.net/forum?id=VTF8yNQM66} 145 | } 146 | ``` 147 | 148 | ## 🪪 License 149 | MIT. Check `LICENSE.md`. 150 | -------------------------------------------------------------------------------- /swebench/inference/make_datasets/create_text_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Create a dataset for text-to-text training from the raw task instance outputs. 5 | """ 6 | 7 | import json 8 | import logging 9 | import os 10 | from argparse import ArgumentParser 11 | from pathlib import Path 12 | from datasets import Dataset, DatasetDict, load_dataset, load_from_disk 13 | from tqdm.auto import tqdm 14 | 15 | from swebench.inference.make_datasets.create_instance import add_text_inputs, PROMPT_FUNCTIONS 16 | from swebench.inference.make_datasets.tokenize_dataset import TOKENIZER_FUNCS 17 | from swebench.inference.make_datasets.utils import string_to_bool 18 | 19 | logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def load_jsonl_file(filename): 24 | if type(filename) == str: 25 | filename = Path(filename) 26 | if filename.name.endswith(".jsonl") or filename.name.endswith(".jsonl.all"): 27 | with open(filename) as f: 28 | return [json.loads(line) for line in f] 29 | elif filename.name.endswith(".json"): 30 | with open(filename) as f: 31 | return json.load(f) 32 | else: 33 | raise ValueError(f"Unknown file type {filename}") 34 | 35 | 36 | def instances_generator(files): 37 | all_data = list() 38 | for file in tqdm(files, desc="Loading instance files"): 39 | all_data.extend(load_jsonl_file(file)) 40 | return all_data 41 | 42 | 43 | def get_training_and_eval_instances(raw_files, test_dataset): 44 | logger.info("Loading instances") 45 | raw_instances = list(instances_generator(raw_files)) 46 | final_instances = list(test_dataset["test"]) 47 | eval_repos = {x["repo"] for x in final_instances} 48 | train_instances = [x for x in raw_instances if x["repo"] not in eval_repos] 49 | train_instances = list(sorted(train_instances, key=lambda x: x["instance_id"])) 50 | eval_instances = list(sorted(final_instances, key=lambda x: x["instance_id"])) 51 | logger.info(f"Found {len(train_instances)} training ids") 52 | logger.info(f"Found {len(eval_instances)} eval ids") 53 | return train_instances, eval_instances 54 | 55 | 56 | def extract_fields(instance): 57 | instance_id = instance["instance_id"] 58 | if instance["text_inputs"] is None or instance["patch"] is None: 59 | print(f"No text for {instance_id}") 60 | return None 61 | text_inputs = instance["text_inputs"].strip() + "\n\n" 62 | if text_inputs is None or instance["patch"] is None: 63 | print(f"No inputs for {instance_id}") 64 | return None 65 | patch = "\n".join(["<patch>", instance["patch"], "</patch>"]) 66 | return {**instance, "text": text_inputs, "patch": patch} 67 | 68 | 69 | def main( 70 | dataset_name_or_path, 71 | splits, 72 | validation_ratio, 73 | output_dir, 74 | retrieval_file, 75 
| prompt_style, 76 | file_source, 77 | k, 78 | max_context_len, 79 | tokenizer_name, 80 | push_to_hub_user, 81 | ): 82 | if push_to_hub_user is not None: 83 | hub_token = os.environ.get("HUGGING_FACE_HUB_TOKEN", None) 84 | assert hub_token is not None, "Must provide HUGGING_FACE_HUB_TOKEN to push to the Hub" 85 | assert output_dir is None, "Cannot provide output_dir if pushing to the Hub" 86 | if max_context_len is not None: 87 | assert tokenizer_name is not None 88 | if push_to_hub_user is None and not Path(output_dir).exists(): 89 | Path(output_dir).mkdir(parents=True) 90 | output_file = f"SWE-bench__{prompt_style}__fs-{file_source}" 91 | if k is not None: 92 | assert file_source not in { 93 | "all", 94 | "oracle", 95 | }, "Cannot use max_context_len with oracle or all file sources" 96 | output_file += f"__k-{k}" 97 | if max_context_len is not None: 98 | assert file_source not in { 99 | "all", 100 | "oracle", 101 | }, "Cannot use max_context_len with oracle or all file sources" 102 | assert ( 103 | tokenizer_name is not None 104 | ), "Must provide tokenizer_name if max_context_len is not None" 105 | output_file += f"__mcc-{max_context_len}-{tokenizer_name}" 106 | if push_to_hub_user is None: 107 | output_file = Path(output_dir, output_file) 108 | if output_file.exists(): 109 | logger.info(f"{output_file.absolute().as_posix()} already exists. Aborting") 110 | return 111 | if Path(dataset_name_or_path).exists(): 112 | dataset = load_from_disk(dataset_name_or_path) 113 | else: 114 | dataset = load_dataset(dataset_name_or_path) 115 | 116 | split_instances = dict() 117 | logger.info(f'Found {set(dataset.keys())} splits') 118 | if set(splits) - set(dataset.keys()) != set(): 119 | raise ValueError(f"Unknown splits {set(splits) - set(dataset.keys())}") 120 | for split in splits: 121 | split_instances[split] = {x["instance_id"]: x for x in dataset[split]} 122 | add_text_inputs( 123 | split_instances[split], 124 | retrieval_file, 125 | k, 126 | prompt_style, 127 | file_source, 128 | max_context_len=max_context_len, 129 | tokenizer_name=tokenizer_name, 130 | ) 131 | columns = [ 132 | "instance_id", 133 | "text", 134 | "repo", 135 | "base_commit", 136 | "problem_statement", 137 | "hints_text", 138 | "created_at", 139 | "patch", 140 | "test_patch", 141 | "version", 142 | "FAIL_TO_PASS", 143 | "PASS_TO_PASS", 144 | "environment_setup_commit", 145 | ] 146 | split_data = dict() 147 | for split in split_instances: 148 | split_data[split] = {key: list() for key in columns} 149 | for instance in tqdm( 150 | split_instances[split].values(), total=len(split_instances[split]), desc=f'Processing {split} instances', 151 | ): 152 | datum = extract_fields(instance) 153 | if datum is None: 154 | continue 155 | for key in columns: 156 | split_data[split][key].append(datum[key] if key in datum else "") 157 | logger.info(f"Found {len(split_data[split]['instance_id'])} {split} ids") 158 | split_data[split] = Dataset.from_dict(split_data[split]) 159 | dataset = DatasetDict(split_data) 160 | if validation_ratio > 0 and "train" in dataset: 161 | train_val = dataset["train"].train_test_split( 162 | test_size=validation_ratio, 163 | seed=42, 164 | ) 165 | dataset["train"] = train_val["train"] 166 | dataset["validation"] = train_val["test"] 167 | for split in dataset: 168 | logger.info(f"Found {len(dataset[split])} {split} instances") 169 | if push_to_hub_user is not None: 170 | dataset.push_to_hub(f'{push_to_hub_user}/{output_file}', use_auth_token=hub_token) 171 | else: 172 | dataset.save_to_disk(output_file) 173 | 
logger.info(f"Finished saving to {output_file}") 174 | 175 | 176 | if __name__ == "__main__": 177 | parser = ArgumentParser(description=__doc__) 178 | parser.add_argument( 179 | "--dataset_name_or_path", 180 | type=str, 181 | default="princeton-nlp/SWE-bench", 182 | help="Dataset to use for test set from HuggingFace Datasets or path to a save_to_disk directory.", 183 | ) 184 | parser.add_argument( 185 | "--splits", 186 | nargs="+", 187 | default=["train", "test"], 188 | help="Splits to use from the dataset.", 189 | ) 190 | parser.add_argument( 191 | "--validation_ratio", 192 | type=float, 193 | default=0.01, 194 | help="Ratio of the training set to use for validation.", 195 | ) 196 | parser.add_argument( 197 | "--output_dir", type=str, help="Path to the output directory." 198 | ) 199 | parser.add_argument( 200 | "--retrieval_file", 201 | type=str, 202 | help="Path to the file where the retrieval results are stored.", 203 | ) 204 | parser.add_argument( 205 | "--prompt_style", 206 | type=str, 207 | default="style-3", 208 | choices=PROMPT_FUNCTIONS.keys(), 209 | help="Prompt style to use. See create_instance.PROMPT_FUNCTIONS for details.", 210 | ) 211 | parser.add_argument( 212 | "--file_source", 213 | type=str, 214 | default="oracle", 215 | choices=["oracle", "bm25", "all"], 216 | help="How to select the files to use in context.", 217 | ) 218 | parser.add_argument( 219 | "--k", 220 | type=int, 221 | default=None, 222 | help="Maximum number of files to use for retrieval.", 223 | ) 224 | parser.add_argument( 225 | "--max_context_len", 226 | type=int, 227 | default=None, 228 | help="Maximum number of tokens to use for context.", 229 | ) 230 | parser.add_argument( 231 | "--tokenizer_name", 232 | type=str, 233 | default=None, 234 | choices=TOKENIZER_FUNCS.keys(), 235 | help="Tokenizer to use for max_context_len. Only needed if max_context_len is specified.", 236 | ) 237 | parser.add_argument( 238 | "--push_to_hub_user", 239 | type=str, 240 | help="Username to use for pushing to the Hub. 
If not provided, will save to disk.", 241 | ) 242 | main(**vars(parser.parse_args())) 243 | -------------------------------------------------------------------------------- /swebench/harness/grading.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any 3 | 4 | from swebench.harness.constants import ( 5 | APPLY_PATCH_FAIL, 6 | APPLY_PATCH_PASS, 7 | FAIL_TO_FAIL, 8 | FAIL_TO_PASS, 9 | KEY_INSTANCE_ID, 10 | PASS_TO_FAIL, 11 | PASS_TO_PASS, 12 | RESET_FAILED, 13 | TESTS_ERROR, 14 | TESTS_TIMEOUT, 15 | ResolvedStatus, 16 | TestStatus, 17 | ) 18 | from swebench.harness.test_spec import TestSpec 19 | from swebench.harness.log_parsers import MAP_REPO_TO_PARSER 20 | 21 | 22 | # MARK: Utility functions 23 | def test_passed(case: str, sm: dict[str, str]) -> bool: 24 | return case in sm and sm[case] in [TestStatus.PASSED.value, TestStatus.XFAIL.value] 25 | 26 | 27 | def test_failed(case: str, sm: dict[str, str]) -> bool: 28 | return case not in sm or any( 29 | sm[case] == status for status in [TestStatus.FAILED.value, TestStatus.ERROR.value] 30 | ) 31 | 32 | 33 | # MARK: Evaluation report functions 34 | def get_logs_eval(log_fp: str) -> tuple[dict[str, str], bool]: 35 | """ 36 | Retrieve evaluation results for a task instance from its corresponding log file 37 | 38 | Args: 39 | log_fp (str): path to log file 40 | Returns: 41 | bool: whether the patch applied successfully 42 | dict: status map 43 | 44 | TODO(john-b-yang): Check this is working properly... 45 | """ 46 | # Convert e.g. "logs/scikit-learn__scikit-learn-12421/test_output.txt" to "scikit-learn/scikit-learn" 47 | sample_id = str(Path(log_fp).parent.stem) # e.g. scikit-learn__scikit-learn-12421 48 | repo = "-".join(sample_id.replace("__", "/").split("-")[:-1]) # e.g. scikit-learn/scikit-learn 49 | log_parser = MAP_REPO_TO_PARSER[repo] 50 | 51 | with open(log_fp) as f: 52 | content = f.read() 53 | # TODO fix constant here 54 | if ( 55 | any( 56 | [ 57 | x in content 58 | for x in [ 59 | APPLY_PATCH_FAIL, 60 | RESET_FAILED, 61 | TESTS_ERROR, 62 | TESTS_TIMEOUT, 63 | "Failed to reset task environment", 64 | ] 65 | ] 66 | ) 67 | or "applied patch" not in content.lower() 68 | ): 69 | # Eval patch was not applied successfully 70 | return {}, False 71 | 72 | # Get status map of evaluation results 73 | content = content.split(f"{APPLY_PATCH_PASS} (pred)")[-1] 74 | return log_parser(content), True 75 | 76 | 77 | def get_eval_tests_report( 78 | eval_sm: dict[str, str], 79 | gold_results: dict[str, str], 80 | calculate_to_fail: bool = False, 81 | ) -> dict[str, dict[str, list[str]]]: 82 | """ 83 | Create a report based on failure/pass change from gold results to eval results. 
84 | 85 | Args: 86 | eval_sm (dict): evaluation status map 87 | gold_results (dict): gold results 88 | calculate_to_fail (bool): whether to calculate metrics for "x to fail" tests 89 | Returns: 90 | report (dict): report of metrics 91 | 92 | Metric Definitions (Gold Result Pair + Eval Result): 93 | - Fail-Pass (F2P) + P: Success (Resolution) 94 | - Pass-Pass (P2P) + P: Success (Maintenance) 95 | - Fail-Pass (F2P) + F: Failure 96 | - Pass-Pass (P2P) + F: Failure 97 | 98 | Miscellaneous Definitions 99 | - Fail-Fail (F2F) + F: Failure Maintenance 100 | - Pass-Fail (P2F) + F: Not considered 101 | - Fail-Fail (F2F) + P: Success (Extra Credit) 102 | - Pass-Fail (P2F) + P: Not considered 103 | """ 104 | # Calculate resolution metrics 105 | f2p_success = [] 106 | f2p_failure = [] 107 | for test_case in gold_results[FAIL_TO_PASS]: 108 | if test_passed(test_case, eval_sm): 109 | # Assume silent success for now (test case not in eval_sm) 110 | f2p_success.append(test_case) 111 | elif test_failed(test_case, eval_sm): 112 | f2p_failure.append(test_case) 113 | 114 | # Calculate maintenance metrics 115 | p2p_success = [] 116 | p2p_failure = [] 117 | for test_case in gold_results[PASS_TO_PASS]: 118 | if test_passed(test_case, eval_sm): 119 | p2p_success.append(test_case) 120 | elif test_failed(test_case, eval_sm): 121 | p2p_failure.append(test_case) 122 | 123 | results = { 124 | FAIL_TO_PASS: { 125 | "success": f2p_success, 126 | "failure": f2p_failure, 127 | }, 128 | PASS_TO_PASS: { 129 | "success": p2p_success, 130 | "failure": p2p_failure, 131 | }, 132 | } 133 | 134 | f2f_success = [] 135 | f2f_failure = [] 136 | p2f_success = [] 137 | p2f_failure = [] 138 | if calculate_to_fail: 139 | # Calculate "extra credit" metrics 140 | for test_case in gold_results[FAIL_TO_FAIL]: 141 | if test_passed(test_case, eval_sm): 142 | f2f_success.append(test_case) 143 | elif test_failed(test_case, eval_sm): 144 | f2f_failure.append(test_case) 145 | 146 | # Calculate not considered metrics 147 | for test_case in gold_results[PASS_TO_FAIL]: 148 | if test_passed(test_case, eval_sm): 149 | p2f_success.append(test_case) 150 | elif test_failed(test_case, eval_sm): 151 | p2f_failure.append(test_case) 152 | 153 | results.update( 154 | { 155 | FAIL_TO_FAIL: { 156 | "success": f2f_success, 157 | "failure": f2f_failure, 158 | }, 159 | PASS_TO_FAIL: { 160 | "success": p2f_success, 161 | "failure": p2f_failure, 162 | }, 163 | } 164 | ) 165 | return results 166 | 167 | 168 | def compute_fail_to_pass(report: dict[str, dict[str, Any]]) -> float: 169 | """ 170 | Compute fail-to-pass metric. Accepts single report as argument. 171 | """ 172 | total = len(report[FAIL_TO_PASS]["success"]) + len(report[FAIL_TO_PASS]["failure"]) 173 | if total == 0: 174 | return 1 175 | return len(report[FAIL_TO_PASS]["success"]) / total 176 | 177 | 178 | def compute_pass_to_pass(report: dict[str, dict[str, Any]]) -> float: 179 | """ 180 | Compute pass-to-pass metric. Accepts single report as argument. 
181 | """ 182 | total = len(report[PASS_TO_PASS]["success"]) + len(report[PASS_TO_PASS]["failure"]) 183 | if total == 0: 184 | # TODO: Don't factor in p2p metrics 185 | return 1 186 | return len(report[PASS_TO_PASS]["success"]) / total 187 | 188 | 189 | def get_resolution_status(report: dict[str, dict[str, Any]]) -> str: 190 | """ 191 | Determine resolved status of an evaluation instance 192 | 193 | Criteria: 194 | - If fail-to-pass (Resolution) = 1 and pass-to-pass (Maintenance) = 1 -> FULL 195 | - If (fail-to-pass (Resolution) < 1 and > 0) and pass-to-pass (Maintenance) = 1 -> PARTIAL 196 | - Otherwise -> NO 197 | """ 198 | f2p = compute_fail_to_pass(report) 199 | p2p = compute_pass_to_pass(report) 200 | 201 | if f2p == 1 and p2p == 1: 202 | return ResolvedStatus.FULL.value 203 | elif f2p < 1 and f2p > 0 and p2p == 1: 204 | return ResolvedStatus.PARTIAL.value 205 | else: 206 | return ResolvedStatus.NO.value 207 | 208 | 209 | def get_eval_report( 210 | test_spec: TestSpec, 211 | prediction: dict[str, str], 212 | log_path: str, 213 | include_tests_status: bool, 214 | ) -> dict[str, Any]: 215 | """ 216 | Generate a report of model evaluation results from a prediction, task instance, 217 | and evaluation log. 218 | 219 | Args: 220 | test_spec (dict): test spec containing keys "instance_id", "FAIL_TO_PASS", and "PASS_TO_PASS" 221 | prediction (dict): prediction containing keys "instance_id", "model_name_or_path", and "model_patch" 222 | log_path (str): path to evaluation log 223 | include_tests_status (bool): whether to include the status of each test in the returned report 224 | Returns: 225 | report (dict): report of metrics 226 | """ 227 | report_map = {} 228 | 229 | instance_id = prediction[KEY_INSTANCE_ID] 230 | if instance_id not in report_map: 231 | report_map[instance_id] = { 232 | "patch_is_None": False, 233 | "patch_exists": False, 234 | "patch_successfully_applied": False, 235 | "resolved": False, 236 | } 237 | 238 | # Check if the model patch exists 239 | if prediction["model_patch"] is None: 240 | report_map[instance_id]["none"] = True 241 | return report_map 242 | report_map[instance_id]["patch_exists"] = True 243 | 244 | # Get evaluation logs 245 | eval_sm, found = get_logs_eval(log_path) 246 | 247 | if not found: 248 | return report_map 249 | report_map[instance_id]["patch_successfully_applied"] = True 250 | 251 | eval_ref = { 252 | KEY_INSTANCE_ID: test_spec.instance_id, 253 | FAIL_TO_PASS: test_spec.FAIL_TO_PASS, 254 | PASS_TO_PASS: test_spec.PASS_TO_PASS, 255 | } 256 | 257 | report = get_eval_tests_report(eval_sm, eval_ref) 258 | if get_resolution_status(report) == ResolvedStatus.FULL.value: 259 | report_map[instance_id]["resolved"] = True 260 | 261 | if include_tests_status: 262 | report_map[instance_id]["tests_status"] = report # type: ignore 263 | 264 | return report_map 265 | -------------------------------------------------------------------------------- /swebench/inference/run_live.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | This module contains functions for running a live inference session on a GitHub issue. 5 | It clones the repository associated with the issue, builds a BM25 retrieval index, and 6 | generates a prompt for the user to interact with the model. The output is saved to a 7 | specified directory. 
8 | """ 9 | import json 10 | import subprocess 11 | from pathlib import Path 12 | from ghapi.all import GhApi 13 | import os 14 | import re 15 | import time 16 | from datetime import datetime 17 | from tqdm.auto import tqdm 18 | from swebench.inference.make_datasets.utils import ContextManager, string_to_bool, extract_diff, extract_minimal_patch 19 | from swebench.inference.make_datasets.create_instance import ( 20 | PROMPT_FUNCTIONS, 21 | TOKENIZER_FUNCS, 22 | make_code_text, 23 | ingest_files, 24 | ) 25 | from swebench.inference.make_datasets.bm25_retrieval import ( 26 | make_index, 27 | clone_repo, 28 | search, 29 | DOCUMENT_ENCODING_FUNCTIONS, 30 | ) 31 | from swebench.inference.run_api import call_chat, call_anthropic 32 | import logging 33 | from argparse import ArgumentParser 34 | 35 | logging.basicConfig( 36 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 37 | ) 38 | logger = logging.getLogger(__name__) 39 | 40 | 41 | def get_problem_statement(owner, repo, issue_num, ghapi, include_comments=False): 42 | issue = ghapi.issues.get(owner, repo, issue_num) 43 | issue_text = "\n".join([issue.title, issue.body]) 44 | # Solved issues may include comments that give answers away too much 45 | if include_comments: 46 | all_comments = list(ghapi.issues.list_comments(owner, repo, issue_num)) 47 | comments = [comment.body for comment in all_comments] 48 | comment_text = "Comment: " if comments else "" + "\nComment:".join(comments) 49 | issue_text += "\n" + comment_text 50 | return issue_text 51 | 52 | 53 | def get_readme_files(repo_path): 54 | files = list(Path(repo_path).iterdir()) 55 | files = list(filter(lambda x: x.is_file(), files)) 56 | files = list(filter(lambda x: x.name.lower().startswith("readme"), files)) 57 | if files: 58 | files = sorted(files, key=lambda x: len(x.name)) 59 | files = [files[0]] 60 | return [Path(file).relative_to(repo_path).as_posix() for file in files] 61 | 62 | 63 | def make_instance( 64 | owner, 65 | repo, 66 | query, 67 | commit, 68 | root_dir, 69 | token, 70 | document_encoding_func, 71 | python, 72 | instance_id, 73 | tokenizer, 74 | tokenizer_func, 75 | prompt_style, 76 | max_context_len, 77 | include_readmes, 78 | ): 79 | """ 80 | Creates an instance for a given query and repository. 81 | 82 | Args: 83 | owner (str): The owner of the repository. 84 | repo (str): The name of the repository. 85 | query (str): The query to search for. 86 | commit (str): The commit hash to use. 87 | root_dir (str): The root directory to clone the repository to. 88 | token (str): The GitHub token to use for authentication. 89 | document_encoding_func (function): The function to use for encoding documents. 90 | python (str): The path to the Python executable. 91 | instance_id (int): The ID of the instance. 92 | tokenizer (str): The name of the tokenizer to use. 93 | tokenizer_func (function): The function to use for tokenization. 94 | prompt_style (str): The style of prompt to use. 95 | max_context_len (int): The maximum length of the context. 96 | include_readmes (bool): Whether to include README files in the instance. 97 | 98 | Returns: 99 | dict: The instance. 
100 | """ 101 | thread_id = 0 102 | instance = {"instance_id": instance_id, "problem_statement": query} 103 | logger.info(f"Cloning repo {owner}/{repo}") 104 | repo_dir = clone_repo(f"{owner}/{repo}", root_dir, token) 105 | if commit is None: 106 | commit = subprocess.check_output( 107 | ["git", "rev-parse", "HEAD"], cwd=repo_dir 108 | ).decode("utf-8").strip() 109 | logger.info(f"Building BM25 retrieval index for {owner}/{repo}@{commit}") 110 | index_dir = make_index( 111 | repo_dir=repo_dir, 112 | root_dir=root_dir, 113 | query=query, 114 | commit=commit, 115 | document_encoding_func=document_encoding_func, 116 | python=python, 117 | instance_id=instance_id, 118 | ) 119 | results = search(instance, index_dir) 120 | hits = results["hits"] 121 | logger.info(f"Retrieved {len(hits)} documents") 122 | with ContextManager(repo_dir, commit) as cm: 123 | if include_readmes: 124 | readmes = get_readme_files(cm.repo_path) 125 | else: 126 | readmes = list() 127 | instance["readmes"] = ingest_files(readmes) 128 | for hit in hits: 129 | hit["file_contents"] = open(hit["docid"]).read() 130 | instance["file_contents"] = dict() 131 | base_text_inputs = PROMPT_FUNCTIONS[prompt_style](instance) 132 | base_text_input_length = len(tokenizer_func(base_text_inputs, tokenizer)) 133 | instance["file_contents"] = {x["docid"]: x["file_contents"] for x in hits} 134 | cur_input_len = base_text_input_length 135 | include_files = list() 136 | for filename in [x["docid"] for x in hits]: 137 | content = make_code_text({filename: instance["file_contents"][filename]}) 138 | tokens = tokenizer_func(content, tokenizer) 139 | if cur_input_len + len(tokens) < max_context_len: 140 | include_files.append(filename) 141 | cur_input_len += len(tokens) 142 | logger.info( 143 | f"Including {len(include_files)} files in context with {cur_input_len} tokens:\n" 144 | + "\n\t".join(sorted(include_files)) 145 | ) 146 | instance["file_contents"] = { 147 | filename: instance["file_contents"][filename] for filename in include_files 148 | } 149 | instance["text_inputs"] = PROMPT_FUNCTIONS[prompt_style](instance) 150 | return instance 151 | 152 | 153 | def parse_issue_url(issue_url): 154 | issue_pat = re.compile(r"github\.com\/(.+?)\/(.+?)\/issues\/(\d+)") 155 | match = issue_pat.search(issue_url) 156 | if not match: 157 | raise ValueError( 158 | f"issue_url ({issue_url}) does not seem to be a valid issue url." 
159 | + "\nPlease use url like https://github.com/owner/repo/issues/12345" 160 | ) 161 | owner, repo, issue_num = match.groups() 162 | return owner, repo, issue_num 163 | 164 | 165 | def main( 166 | model_name, 167 | prompt_style, 168 | issue_url, 169 | base_commit, 170 | max_context_length, 171 | document_encoding_func, 172 | output_dir, 173 | root_dir, 174 | include_readmes, 175 | ): 176 | if base_commit is not None and len(issue_url) != len(base_commit): 177 | raise ValueError( 178 | f"Must provide either no base commits or one base commit per issue url" 179 | ) 180 | if base_commit is None: 181 | base_commit = [None] * len(issue_url) 182 | gh_token = os.environ.get("GITHUB_TOKEN", None) 183 | if gh_token is not None: 184 | logger.warning(f'Using GitHub token: {"*" * 8}{gh_token[-4:]}') 185 | gh = GhApi(token=gh_token) 186 | tokenizer, tokenizer_func = TOKENIZER_FUNCS["cl100k"] 187 | document_encoding_func = DOCUMENT_ENCODING_FUNCTIONS[document_encoding_func] 188 | python = subprocess.check_output(["which", "python"]).decode("utf-8").strip() 189 | outputs = list() 190 | for issue, commit in tqdm(zip(issue_url, base_commit), total=len(issue_url)): 191 | owner, repo, issue_num = parse_issue_url(issue) 192 | problem_statement = get_problem_statement(owner, repo, int(issue_num), gh) 193 | instance_id = f"{owner}__{repo}-{issue_num}" 194 | logger.info(f"Creating instance {instance_id}") 195 | instance = make_instance( 196 | owner=owner, 197 | repo=repo, 198 | query=problem_statement, 199 | commit=commit, 200 | root_dir=root_dir, 201 | token=gh_token, 202 | document_encoding_func=document_encoding_func, 203 | python=python, 204 | instance_id=instance_id, 205 | tokenizer=tokenizer, 206 | tokenizer_func=tokenizer_func, 207 | prompt_style=prompt_style, 208 | max_context_len=max_context_length, 209 | include_readmes=include_readmes, 210 | ) 211 | logger.info(f"Calling model {model_name}") 212 | start = time.time() 213 | inputs = instance["text_inputs"]  # needed by both the OpenAI and Anthropic branches below 214 | if model_name.startswith("gpt"): 215 | response, _ = call_chat( 216 | model_name, inputs, use_azure=False, temperature=0, top_p=1 217 | ) 218 | completion = response.choices[0].message.content 219 | logger.info(f'Generated {response.usage.completion_tokens} tokens in {(time.time() - start):.2f} seconds') 220 | else: 221 | from anthropic import Anthropic 222 | api_key = os.environ.get("ANTHROPIC_API_KEY", None) 223 | anthropic = Anthropic(api_key=api_key) 224 | response = call_anthropic( 225 | inputs, anthropic, model_name, temperature=0, top_p=1 226 | ) 227 | completion = response.completion 228 | model_patch = extract_diff(completion) 229 | minimal_patch = extract_minimal_patch(model_patch) 230 | outputs.append( 231 | { 232 | "instance_id": instance_id, 233 | "response": completion, 234 | "problem_statement": problem_statement, 235 | "text_inputs": inputs, 236 | "model_patch": model_patch, 237 | "minimal_patch": minimal_patch, 238 | } 239 | ) 240 | os.makedirs(output_dir, exist_ok=True) 241 | output_file = Path( 242 | output_dir, 243 | f'{model_name}__{prompt_style}__{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.jsonl', 244 | ) 245 | with open(output_file, "+a") as f: 246 | for output in outputs: 247 | print(json.dumps(output), file=f, flush=True) 248 | logger.info(f"Wrote output to {output_file}") 249 | 250 | 251 | if __name__ == "__main__": 252 | parser = ArgumentParser(description=__doc__) 253 | parser.add_argument("--model_name", type=str) 254 | parser.add_argument( 255 | "--prompt_style", type=str, choices=PROMPT_FUNCTIONS.keys(), 
default="style-3" 256 | ) 257 | parser.add_argument("--issue_url", type=str, nargs="+") 258 | parser.add_argument("--base_commit", type=str, nargs="+") 259 | parser.add_argument("--max_context_length", type=int, default=16_000) 260 | parser.add_argument( 261 | "--document_encoding_func", 262 | type=str, 263 | choices=DOCUMENT_ENCODING_FUNCTIONS.keys(), 264 | default="file_name_and_contents", 265 | ) 266 | parser.add_argument("--output_dir", type=str, default="./live_outputs") 267 | parser.add_argument("--root_dir", type=str, default="./run_live_data") 268 | parser.add_argument("--include_readmes", type=string_to_bool, default=False) 269 | args = parser.parse_args() 270 | main(**vars(args)) 271 | --------------------------------------------------------------------------------
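For reference, `run_live.py` can also be driven programmatically instead of through its CLI. The sketch below mirrors the argparse defaults shown in the file; the model name and issue URL are placeholders, and it assumes the relevant credentials (an OpenAI-compatible key for `call_chat`, and optionally `GITHUB_TOKEN`) are already set in the environment:

```python
# Hypothetical programmatic invocation of swebench.inference.run_live.main().
# The issue URL and model name below are placeholders; API keys are read from
# the environment by the underlying helpers.
from swebench.inference.run_live import main

main(
    model_name="gpt-4",                   # assumed OpenAI-style name; routed through call_chat
    prompt_style="style-3",
    issue_url=["https://github.com/owner/repo/issues/12345"],  # placeholder issue
    base_commit=None,                     # None -> use HEAD of the freshly cloned repo
    max_context_length=16_000,
    document_encoding_func="file_name_and_contents",
    output_dir="./live_outputs",
    root_dir="./run_live_data",
    include_readmes=False,
)
```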