├── swebench ├── collect │ ├── __init__.py │ ├── run_build_dataset_ft.sh │ ├── run_get_tasks_pipeline.sh │ ├── make_repo │ │ ├── call_make_repo.py │ │ └── make_repo.sh │ ├── make_lite │ │ ├── README.md │ │ ├── make_lite.py │ │ └── criteria.py │ ├── cleanup │ │ ├── delete_gh_workflows.py │ │ └── remove_envs.py │ ├── print_pulls.py │ ├── build_dataset_ft.py │ ├── get_top_pypi.py │ ├── README.md │ ├── check_validation.ipynb │ ├── get_tasks_pipeline.py │ └── build_dataset.py ├── harness │ ├── __init__.py │ ├── remove_containers.py │ ├── dockerfiles.py │ ├── prepare_images.py │ └── grading.py ├── inference │ ├── __init__.py │ ├── llamao │ │ ├── __init__.py │ │ └── distributed_attention.py │ ├── make_datasets │ │ ├── __init__.py │ │ ├── eval_retrieval.py │ │ ├── README.md │ │ ├── tokenize_dataset.py │ │ └── create_text_dataset.py │ ├── README.md │ └── run_live.py ├── versioning │ ├── __init__.py │ ├── run_get_versions.sh │ ├── utils.py │ ├── extract_web │ │ ├── get_versions_pydicom.py │ │ ├── get_versions_xarray.py │ │ ├── get_versions_matplotlib.py │ │ ├── get_versions_astropy.py │ │ ├── get_versions_pvlib-python.py │ │ └── get_versions_sqlfluff.py │ ├── README.md │ └── constants.py └── __init__.py ├── setup.cfg ├── assets ├── figures │ ├── teaser.png │ ├── collection.png │ ├── evaluation.png │ ├── validation.png │ └── swellama_banner.png ├── build_deploy.sh ├── evaluation.md └── collection.md ├── pyproject.toml ├── scripts ├── run_validation.sh ├── run_get_versions.sh ├── filter_empty_version.py ├── build_images.sh ├── docker │ └── push_all_images.sh └── eval │ ├── convert_od_output_to_swe_json.py │ ├── update_output_with_eval.py │ └── eval_infer.sh ├── README.md ├── codecov.yml ├── tests ├── test_cli.py ├── test_collect_cli.py └── test_evaluation.py ├── LICENSE ├── docs ├── 20240406_devin_validate │ ├── report.md │ └── get_devin_preds.ipynb ├── 20240415_eval_bug │ ├── sweep_conda_links.py │ └── check_harness.ipynb ├── README_JP.md ├── README_CN.md ├── README_TW.md └── 20240627_docker │ └── README.md ├── setup.py ├── CHANGELOG.md └── Original_README.md /swebench/collect/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /swebench/harness/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /swebench/inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /swebench/versioning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /swebench/inference/llamao/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /swebench/inference/make_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | version = attr: swebench.__version__ 3 | license_files = LICENSE -------------------------------------------------------------------------------- 
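The `[metadata]` block above uses setuptools' `attr:` directive, so the distribution version is read from `swebench.__version__` (defined in `swebench/__init__.py`, which appears later in this listing as `2.0.13`) instead of being hard-coded in `setup.cfg`. The snippet below is a small illustrative sketch, not a file from the repository, showing how that attribute resolves once the package is importable:

```python
# Illustrative sketch (not part of the repository): resolve the same attribute
# that `version = attr: swebench.__version__` points at in setup.cfg.
import importlib
import re


def resolve_version(package: str = "swebench", attr: str = "__version__") -> str:
    module = importlib.import_module(package)  # assumes the package is installed/importable
    version = getattr(module, attr)            # e.g. "2.0.13"
    if not re.fullmatch(r"\d+(\.\d+)*", version):
        raise ValueError(f"unexpected version string: {version!r}")
    return version


if __name__ == "__main__":
    print(resolve_version())
```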
/assets/figures/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-Gym/SWE-Bench-Fork/HEAD/assets/figures/teaser.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ['setuptools>=42'] 3 | build-backend = 'setuptools.build_meta' -------------------------------------------------------------------------------- /assets/figures/collection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-Gym/SWE-Bench-Fork/HEAD/assets/figures/collection.png -------------------------------------------------------------------------------- /assets/figures/evaluation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-Gym/SWE-Bench-Fork/HEAD/assets/figures/evaluation.png -------------------------------------------------------------------------------- /assets/figures/validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-Gym/SWE-Bench-Fork/HEAD/assets/figures/validation.png -------------------------------------------------------------------------------- /assets/figures/swellama_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-Gym/SWE-Bench-Fork/HEAD/assets/figures/swellama_banner.png -------------------------------------------------------------------------------- /assets/build_deploy.sh: -------------------------------------------------------------------------------- 1 | # !bin/bash 2 | 3 | python3 -m build 4 | 5 | python3 -m twine upload --skip-existing --repository pypi dist/* 6 | # python3 -m twine upload --skip-existing --repository testpypi dist/* -------------------------------------------------------------------------------- /swebench/collect/run_build_dataset_ft.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python build_dataset_ft.py \ 4 | --instances_path "" \ 5 | --output_path "" \ 6 | --eval_path "" -------------------------------------------------------------------------------- /scripts/run_validation.sh: -------------------------------------------------------------------------------- 1 | REPO_NAME=$1 # e.g. getmoto__moto 2 | VERSION_DATA=data/interim/versioned/${REPO_NAME}_versions.non-empty.jsonl 3 | 4 | python swebench/harness/run_validation.py \ 5 | --dataset_name $VERSION_DATA \ 6 | --run_id test \ 7 | --cache_level instance \ 8 | --max_workers 8 9 | 10 | # --force_rebuild true 11 | -------------------------------------------------------------------------------- /swebench/collect/run_get_tasks_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # If you'd like to parallelize, do the following: 4 | # * Create a .env file in this folder 5 | # * Declare GITHUB_TOKENS=token1,token2,token3... 
6 | 7 | python get_tasks_pipeline.py \ 8 | --repos 'scikit-learn/scikit-learn', 'pallets/flask' \ 9 | --path_prs '' \ 10 | --path_tasks '' -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## SWE-Bench-Fork for SWE-Gym 2 | [SWE-Gym Dataset + Models](https://huggingface.co/SWE-Gym) 3 | 4 | [Project page](https://github.com/SWE-Gym/SWE-Gym) 5 | 6 | This fork contains environment setup files for the 11 additional repos used in the SWE-Gym dataset, as well as an improved version of the instance collection pipeline. 7 | 8 | We plan to upstream the changes and merge with SWE-Bench soon. 9 | 10 | The original README.md is available [here](Original_README.md). 11 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | # Configuration for codecov 2 | coverage: 3 | status: 4 | project: 5 | default: 6 | # If we get < 45% coverage, codecov is gonna mark it a failure 7 | target: 45% 8 | threshold: null 9 | patch: 10 | default: 11 | # Codecov won't mark it as a failure if a patch is not covered well 12 | informational: true 13 | github_checks: 14 | # Don't mark lines that aren't covered 15 | annotations: false 16 | 17 | -------------------------------------------------------------------------------- /swebench/collect/make_repo/call_make_repo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import subprocess 4 | 5 | repos = ["Repos here"] 6 | 7 | for repo in repos: 8 | print(f"Making mirror repo for {repo}") 9 | out_make = subprocess.run( 10 | f"./make_repo.sh {repo}", 11 | shell=True, 12 | stdout=subprocess.DEVNULL, 13 | stderr=subprocess.DEVNULL, 14 | ) 15 | if out_make.returncode != 0: 16 | print(f"Error making mirror repo for {repo}") 17 | else: 18 | print(f"Success making mirror repo for {repo}") 19 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | 4 | def test_smoke_test(): 5 | cmd = ["python", "-m", "swebench.harness.run_evaluation", "--help"] 6 | result = subprocess.run(cmd, capture_output=True) 7 | print(result.stdout) 8 | print(result.stderr) 9 | assert result.returncode == 0 10 | 11 | 12 | def test_one_instance(): 13 | cmd = ["python", "-m", "swebench.harness.run_evaluation", "--predictions_path", "gold", "--max_workers", "1", "--instance_ids", "sympy__sympy-20590", "--run_id", "validate-gold"] 14 | result = subprocess.run(cmd, capture_output=True) 15 | print(result.stdout) 16 | print(result.stderr) 17 | assert result.returncode == 0 -------------------------------------------------------------------------------- /scripts/run_get_versions.sh: -------------------------------------------------------------------------------- 1 | 2 | REPO_NAME=$1 # e.g., getmoto__moto 3 | INSTANCE_PATH=/SWE-Bench/data/raw/${REPO_NAME}.jsonl 4 | OUTPUT_DIR=/SWE-Bench/data/interim/versioned 5 | CONDA_PATH=/miniconda3/condabin/conda 6 | TESTBED_PATH=/SWE-Bench/data/testbed 7 | 8 | pushd swebench/versioning 9 | 10 | python get_versions.py \ 11 | --instances_path $INSTANCE_PATH \ 12 | --retrieval_method github \ 13 | --conda_env temp \ 14 | --num_workers 4 \ 15 | --path_conda $CONDA_PATH \ 16 | --output_dir $OUTPUT_DIR \ 17 | --testbed $TESTBED_PATH
18 | 19 | popd 20 | 21 | OUTPUT_PATH=$OUTPUT_DIR/${REPO_NAME}_versions.json 22 | python3 scripts/filter_empty_version.py $OUTPUT_PATH 23 | -------------------------------------------------------------------------------- /swebench/versioning/run_get_versions.sh: -------------------------------------------------------------------------------- 1 | # Example call for getting versions by building the repo locally 2 | python get_versions.py \ 3 | --path_tasks "" \ 4 | --retrieval_method build \ 5 | --conda_env "" \ 6 | --num_threads 10 \ 7 | --path_conda "" \ 8 | --testbed "" 9 | 10 | # Example call for getting versions from github web interface 11 | python get_versions.py \ 12 | --path_tasks "" \ 13 | --retrieval_method github \ 14 | --num_workers 25 \ 15 | --output_dir "" -------------------------------------------------------------------------------- /scripts/filter_empty_version.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("input_path", type=str) 6 | args = parser.parse_args() 7 | 8 | output_path = args.input_path.replace(".json", ".non-empty.jsonl") 9 | df = pd.read_json(args.input_path, lines=False) 10 | print(df.head()) 11 | print(f"Got {len(df)} instances") 12 | df = df[df["version"].notna()] 13 | df["version"] = df["version"].astype(str) 14 | print(f"Got {len(df)} non-empty versions instances") 15 | df.to_json(output_path, lines=True, orient="records") 16 | print(f"Filtered {args.input_path} to {output_path}") 17 | 18 | print("Version Stats:") 19 | print(df["version"].value_counts()) 20 | 21 | print("Unique versions:") 22 | print(sorted(list(df["version"].unique()))) 23 | -------------------------------------------------------------------------------- /scripts/build_images.sh: -------------------------------------------------------------------------------- 1 | DATASET_NAME=$1 2 | if [ -z "$DATASET_NAME" ]; then 3 | DATASET_NAME="princeton-nlp/SWE-bench_Lite" 4 | echo "Using default dataset name: $DATASET_NAME" 5 | fi 6 | SPLIT=$2 7 | if [ -z "$SPLIT" ]; then 8 | SPLIT="test" 9 | echo "Using default split: $SPLIT" 10 | fi 11 | 12 | MAX_WORKERS=$3 13 | if [ -z "$MAX_WORKERS" ]; then 14 | MAX_WORKERS=4 15 | echo "Using default max workers: $MAX_WORKERS" 16 | fi 17 | 18 | RUN_ID="build-images-${DATASET_NAME//\//__}-${SPLIT}" 19 | echo "Using dataset name: $DATASET_NAME" 20 | echo "Using split: $SPLIT" 21 | echo "Using max workers: $MAX_WORKERS" 22 | echo "Using run id: $RUN_ID" 23 | echo "================================================" 24 | 25 | python -m swebench.harness.run_evaluation \ 26 | --dataset_name $DATASET_NAME \ 27 | --split $SPLIT \ 28 | --predictions_path gold \ 29 | --max_workers $MAX_WORKERS \ 30 | --run_id $RUN_ID \ 31 | --cache_level instance 32 | -------------------------------------------------------------------------------- /tests/test_collect_cli.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | 4 | def test_collect_smoke_test(): 5 | cmd = ["python", "-m", "swebench.collect.print_pulls", "--help"] 6 | result = subprocess.run(cmd, capture_output=True) 7 | print(result.stdout) 8 | print(result.stderr) 9 | assert result.returncode == 0 10 | 11 | 12 | def test_collect_one(tmp_path): 13 | cmd = ["python", "-m", "swebench.collect.print_pulls", "pvlib/pvlib-python", str(tmp_path/ "out.txt"), "--max_pulls", "1"] 14 | print(" ".join(cmd)) 15 | result = 
subprocess.run(cmd, capture_output=True) 16 | print(result.stdout) 17 | print(result.stderr) 18 | assert result.returncode == 0 19 | 20 | 21 | def test_collect_ds(tmp_path): 22 | cmd = ["python", "-m", "swebench.collect.build_dataset", "tests/test_data/pvlib.jsonl", str(tmp_path/ "out.jsonl")] 23 | print(" ".join(cmd)) 24 | result = subprocess.run(cmd, capture_output=True) 25 | print(result.stdout) 26 | print(result.stderr) 27 | assert result.returncode == 0 -------------------------------------------------------------------------------- /tests/test_evaluation.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import docker 4 | 5 | from swebench.harness.constants import ( 6 | FAIL_TO_PASS, 7 | PASS_TO_PASS, 8 | KEY_INSTANCE_ID, 9 | KEY_MODEL, 10 | ) 11 | from swebench.harness.run_evaluation import make_run_report 12 | 13 | TEST_INSTANCE = collections.defaultdict(lambda: "test") 14 | TEST_INSTANCE[PASS_TO_PASS] = '[]' 15 | TEST_INSTANCE["repo"] = 'pvlib/pvlib-python' 16 | TEST_INSTANCE["version"] = '0.1' 17 | TEST_INSTANCE[FAIL_TO_PASS] = '[]' 18 | 19 | def test_make_run_report(tmpdir) -> None: 20 | client = docker.from_env() 21 | with tmpdir.as_cwd(): 22 | output_path = make_run_report( 23 | { 24 | "test": { 25 | KEY_INSTANCE_ID: "test", 26 | KEY_MODEL: "test" 27 | } 28 | }, 29 | [TEST_INSTANCE], 30 | client, 31 | "test" 32 | ) 33 | assert output_path.is_file() 34 | report = json.loads(output_path.read_text()) 35 | assert report["schema_version"] == 2 -------------------------------------------------------------------------------- /scripts/docker/push_all_images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DOCKER_NAMESPACE=$1 4 | INSTANCE_ID_FILE=$2 5 | 6 | if [ -z "$DOCKER_NAMESPACE" ] || [ -z "$INSTANCE_ID_FILE" ]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # target namespace 12 | image_list=$(docker image ls --format '{{.Repository}}:{{.Tag}}' | grep sweb | grep -v $DOCKER_NAMESPACE) 13 | instance_ids=$(cat $INSTANCE_ID_FILE) 14 | 15 | # KEEP images that are IN the instance_ids 16 | image_list=$(echo "$image_list" | grep -f <(echo "$instance_ids")) 17 | 18 | echo "# of images to push: $(echo "$image_list" | wc -l)" 19 | 20 | # There are three tiers of images 21 | # - base 22 | # - env 23 | # - eval (instance level) 24 | 25 | for image in $image_list; do 26 | echo "Tagging $image" 27 | # rename image by replace "__" with "_s_" to comply with docker naming convention 28 | new_image_name=${image//__/_s_} 29 | docker tag $image $DOCKER_NAMESPACE/$new_image_name 30 | echo "Tagged $image to $DOCKER_NAMESPACE/$new_image_name" 31 | 32 | docker push $DOCKER_NAMESPACE/$new_image_name 33 | echo "Pushed $image" 34 | done 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Carlos E Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, Karthik R Narasimhan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to 
do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/20240406_devin_validate/report.md: -------------------------------------------------------------------------------- 1 | # Validating Devin's Results 2 | April 6, 2024 3 | 4 | In this report, we briefly detail our validation of [Devin](https://www.cognition-labs.com/introducing-devin), an AI software engineer released by [Cognition Labs](https://www.cognition-labs.com/) that resolves an impressive 13.86% of issues on a random 25% subset of SWE-bench. 5 | 6 | The Cognition Labs team released their own [report on Devin's performance on SWE-bench](https://www.cognition-labs.com/post/swe-bench-technical-report), which includes a much more thorough deep dive into where Devin excels and struggles. 7 | 8 | Our report focuses solely on validating Devin's performance. To this end, we do the following: 9 | 1. Compile the open-sourced Devin predictions ([Github repository](https://github.com/CognitionAI/devin-swebench-results/tree/main)) into a SWE-bench evaluation-compatible `.jsonl` file. 10 | 2. Run evaluation on these predictions with: 11 | ```shell 12 | python evaluation.py \ 13 | --predictions_path devin_all_preds.jsonl \ 14 | --swe_bench_tasks swe-bench.json \ 15 | --log_dir ./results/ \ 16 | --testbed ./testbed/ \ 17 | --skip_existing \ 18 | --timeout 1200 \ 19 | --verbose 20 | ``` 21 | 22 | [To Do: Results] 23 | 24 | ✍️ Carlos & John -------------------------------------------------------------------------------- /swebench/collect/make_lite/README.md: -------------------------------------------------------------------------------- 1 | ## SWE-bench *Lite* 2 | This directory contains the scripts used to make the *lite* version of SWE-bench. The *lite* version is a subset of the full SWE-bench, that filters out certain types of instances to make evaluation on SWE-bench a bit cheaper and more accessible. 3 | 4 | SWE-bench lite consists of 300 test instances and 23 development instances; both subsets of the full SWE-bench splits. We filter the full SWE-bench according to the following criteria to get *lite*: 5 | - We remove instances with images, external hyperlinks, references to specific commit shas and references to other pull requests or issues. 6 | - We remove instances that have fewer than 40 words in the problem statement. 7 | - We remove instances that edit more than 1 file. 8 | - We remove instances where the gold patch has more than 3 edit hunks (see [patch](https://man7.org/linux/man-pages/man1/patch.1.html)). 9 | - We remove instances that create or remove files. 10 | - We remove instances that contain tests with error message checks. 11 | - Finally, we sample 300 test instances and 23 development instances from the remaining instances. 
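A rough sketch of how two of the checks above might be implemented is shown below; the actual filters live in `criteria.py` in this directory and may differ in detail, so treat the function bodies as illustrative only.

```python
# Illustrative sketch of two Lite filters; the real implementations are in
# swebench/collect/make_lite/criteria.py and may differ in detail.
import re


def leq_n_words(problem_statement: str, n: int = 40) -> bool:
    # True if the problem statement contains at most n whitespace-separated words.
    return len(problem_statement.split()) <= n


def leq_n_hunks(patch_text: str, n: int = 3) -> bool:
    # True if the gold patch contains at most n edit hunks, counted via
    # unified-diff hunk headers of the form "@@ -l,s +l,s @@".
    hunk_header = re.compile(r"^@@ -\d+(?:,\d+)? \+\d+(?:,\d+)? @@", re.MULTILINE)
    return len(hunk_header.findall(patch_text)) <= n
```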
12 | 13 | See `make_lite.py` for the script that makes the *lite* version of SWE-bench, or download the *lite* version from the Hugging Face datasets [princeton-nlp/SWE-bench_Lite](https://huggingface.co/datasets/princeton-nlp/SWE-bench_Lite) 14 | -------------------------------------------------------------------------------- /swebench/versioning/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def get_instances(instance_path: str) -> list: 5 | """ 6 | Get task instances from given path 7 | 8 | Args: 9 | instance_path (str): Path to task instances 10 | Returns: 11 | task_instances (list): List of task instances 12 | """ 13 | if any([instance_path.endswith(x) for x in [".jsonl", ".jsonl.all"]]): 14 | task_instances = list() 15 | with open(instance_path) as f: 16 | for line in f.readlines(): 17 | task_instances.append(json.loads(line)) 18 | return task_instances 19 | 20 | with open(instance_path) as f: 21 | task_instances = json.load(f) 22 | return task_instances 23 | 24 | 25 | def split_instances(input_list: list, n: int) -> list: 26 | """ 27 | Split a list into n approximately equal length sublists 28 | 29 | Args: 30 | input_list (list): List to split 31 | n (int): Number of sublists to split into 32 | Returns: 33 | result (list): List of sublists 34 | """ 35 | avg_length = len(input_list) // n 36 | remainder = len(input_list) % n 37 | result, start = [], 0 38 | 39 | for i in range(n): 40 | length = avg_length + 1 if i < remainder else avg_length 41 | sublist = input_list[start : start + length] 42 | result.append(sublist) 43 | start += length 44 | 45 | return result 46 | -------------------------------------------------------------------------------- /swebench/versioning/extract_web/get_versions_pydicom.py: -------------------------------------------------------------------------------- 1 | import datetime, json, requests, sys 2 | from bs4 import BeautifulSoup 3 | 4 | sys.path.append("../../harness") 5 | from utils import get_instances 6 | 7 | PATH_TASKS_PYDICOM = "" 8 | PATH_TASKS_PYDICOM_V = "" 9 | 10 | data_tasks = get_instances(PATH_TASKS_PYDICOM) 11 | resp = requests.get('https://pydicom.github.io/pydicom/dev/faq/index.html') 12 | soup = BeautifulSoup(resp.text, "html.parser") 13 | release_table = soup.find("table", {"class": "docutils align-default"}) 14 | 15 | times = [] 16 | for row in release_table.find_all('tr'): 17 | cells = row.find_all('td') 18 | if len(cells) == 3: 19 | version = cells[0].text.strip() 20 | date = cells[1].text.strip().strip('~') 21 | if date == 'Jan 2024': 22 | date = '2024-01-01' 23 | else: 24 | date = datetime.strptime(date, "%B %Y").strftime("%Y-%m-%d") 25 | python_versions = max(cells[2].text.strip().split(', ')) 26 | times.append((date, version)) 27 | 28 | times = sorted(times, key=lambda x: x[0], reverse=True) 29 | for task in data_tasks: 30 | created_at = task["created_at"].split("T")[0] 31 | found = False 32 | for t in times: 33 | if t[0] < created_at: 34 | task["version"] = t[1] 35 | found = True 36 | break 37 | if not found: 38 | task["version"] = times[-1][1] 39 | 40 | with open(PATH_TASKS_PYDICOM_V, 'w') as f: 41 | json.dump(data_tasks, fp=f) -------------------------------------------------------------------------------- /assets/evaluation.md: -------------------------------------------------------------------------------- 1 | # Evaluating with SWE-bench 2 | John Yang • November 6, 2023 3 | 4 | In this tutorial, we will explain how to evaluate models and methods using 
SWE-bench. 5 | 6 | ## 🤖 Creating Predictions 7 | For each task instance of the SWE-bench dataset, given an issue (`problem_statement`) + codebase (`repo` + `base_commit`), your model should attempt to write a diff patch prediction. For full details on the SWE-bench task, please refer to Section 2 of the main paper. 8 | 9 | Each prediction must be formatted as follows: 10 | ```json 11 | { 12 | "instance_id": "", 13 | "model_patch": "<.patch file content string>", 14 | "model_name_or_path": "", 15 | } 16 | ``` 17 | 18 | Store multiple predictions in a `.json` file formatted as `[, ,... ]`. It is not necessary to generate predictions for every task instance. 19 | 20 | If you'd like examples, the [swe-bench/experiments](https://github.com/swe-bench/experiments) GitHub repository contains many examples of well formed patches. 21 | 22 | ## 🔄 Running Evaluation 23 | Evaluate model predictions on SWE-bench Lite using the evaluation harness with the following command: 24 | ```bash 25 | python -m swebench.harness.run_evaluation \ 26 | --dataset_name princeton-nlp/SWE-bench_Lite \ 27 | --predictions_path \ 28 | --max_workers \ 29 | --run_id 30 | # use --predictions_path 'gold' to verify the gold patches 31 | # use --run_id to name the evaluation run 32 | ``` 33 | -------------------------------------------------------------------------------- /swebench/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "2.0.13" 2 | 3 | from swebench.collect.build_dataset import main as build_dataset 4 | from swebench.collect.get_tasks_pipeline import main as get_tasks_pipeline 5 | from swebench.collect.print_pulls import main as print_pulls 6 | 7 | from swebench.harness.constants import ( 8 | KEY_INSTANCE_ID, 9 | KEY_MODEL, 10 | KEY_PREDICTION, 11 | MAP_REPO_VERSION_TO_SPECS, 12 | ) 13 | 14 | from swebench.harness.docker_build import ( 15 | build_image, 16 | build_base_images, 17 | build_env_images, 18 | build_instance_images, 19 | build_instance_image, 20 | close_logger, 21 | setup_logger, 22 | ) 23 | 24 | from swebench.harness.docker_utils import ( 25 | cleanup_container, 26 | remove_image, 27 | copy_to_container, 28 | exec_run_with_timeout, 29 | list_images, 30 | ) 31 | 32 | from swebench.harness.grading import ( 33 | compute_fail_to_pass, 34 | compute_pass_to_pass, 35 | get_logs_eval, 36 | get_eval_report, 37 | get_resolution_status, 38 | ResolvedStatus, 39 | TestStatus, 40 | ) 41 | 42 | from swebench.harness.log_parsers import ( 43 | MAP_REPO_TO_PARSER, 44 | ) 45 | 46 | from swebench.harness.run_evaluation import ( 47 | main as run_evaluation, 48 | ) 49 | 50 | from swebench.harness.utils import ( 51 | get_environment_yml, 52 | get_requirements, 53 | ) 54 | 55 | from swebench.versioning.constants import ( 56 | MAP_REPO_TO_VERSION_PATHS, 57 | MAP_REPO_TO_VERSION_PATTERNS, 58 | ) 59 | 60 | from swebench.versioning.get_versions import ( 61 | get_version, 62 | map_version_to_task_instances, 63 | get_versions_from_build, 64 | get_versions_from_web, 65 | ) 66 | 67 | from swebench.versioning.utils import ( 68 | split_instances, 69 | ) -------------------------------------------------------------------------------- /swebench/harness/remove_containers.py: -------------------------------------------------------------------------------- 1 | import json 2 | from argparse import ArgumentParser 3 | 4 | import docker 5 | 6 | """ 7 | Script for removing containers associated with specified instance IDs. 
8 | """ 9 | 10 | def main(instance_ids, predictions_path): 11 | all_ids = set() 12 | if predictions_path: 13 | with open(predictions_path, "r") as f: 14 | predictions = json.loads(f.read()) 15 | for pred in predictions: 16 | all_ids.add(pred["instance_id"]) 17 | 18 | if instance_ids: 19 | all_ids |= set(instance_ids) 20 | 21 | if not all_ids: 22 | print("No instance IDs provided, exiting.") 23 | return 24 | 25 | for instance_id in all_ids: 26 | try: 27 | client = docker.from_env() 28 | container = client.containers.get(f"sweb.eval.{instance_id}") 29 | container.stop() 30 | container.remove() 31 | print(f"Removed container {instance_id}") 32 | except docker.errors.NotFound: 33 | print(f"Container {instance_id} not found, skipping.") 34 | except Exception as e: 35 | print(f"Error removing container {instance_id}: {e}") 36 | continue 37 | 38 | 39 | if __name__ == "__main__": 40 | parser = ArgumentParser(description=__doc__) 41 | parser.add_argument( 42 | "--instance_ids", 43 | help="Instance IDs to remove containers for", 44 | ) 45 | parser.add_argument( 46 | "--predictions_path", 47 | help="Path to predictions file", 48 | ) 49 | args = parser.parse_args() 50 | instance_ids = [i.strip() for i in args.instance_ids.split(",")] if args.instance_ids else [] 51 | main( 52 | instance_ids=instance_ids, 53 | predictions_path=args.predictions_path, 54 | ) 55 | -------------------------------------------------------------------------------- /swebench/versioning/extract_web/get_versions_xarray.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import requests 5 | import sys 6 | 7 | from datetime import datetime 8 | 9 | sys.path.append("../../harness") 10 | from utils import get_instances 11 | 12 | PATH_TASKS_XARRAY = "" 13 | 14 | # Get raw xarray dataset 15 | data_tasks = get_instances(PATH_TASKS_XARRAY) 16 | 17 | # Get version to date from xarray home page 18 | resp = requests.get("https://docs.xarray.dev/en/stable/whats-new.html") 19 | pattern = ( 20 | r'v(.*) \((.*)\)' 21 | ) 22 | matches = re.findall(pattern, resp.text) 23 | matches = list(set(matches)) 24 | matches = [x[1:] for x in matches] 25 | 26 | # Get (date, version) pairs 27 | date_formats = ["%B %d %Y", "%d %B %Y"] 28 | keep_major_minor = lambda x, sep: ".".join(x.strip().split(sep)[:2]) 29 | 30 | times = [] 31 | for match in matches: 32 | parts = match[0].split("-") 33 | version = keep_major_minor(".".join(parts[0:3]), ".") 34 | date_str = " ".join(parts[3:]) 35 | 36 | for f_ in date_formats: 37 | try: 38 | date_obj = datetime.strptime(date_str, f_) 39 | times.append((date_obj.strftime("%Y-%m-%d"), version)) 40 | except: 41 | continue 42 | break 43 | 44 | times = sorted(times, key=lambda x: x[0])[::-1] 45 | 46 | for task in data_tasks: 47 | created_at = task["created_at"].split("T")[0] 48 | found = False 49 | for t in times: 50 | if t[0] < created_at: 51 | task["version"] = t[1] 52 | found = True 53 | break 54 | if not found: 55 | task["version"] = None 56 | 57 | # Save xarray versioned data to repository 58 | with open( 59 | os.path.join(PATH_TASKS_XARRAY, "xarray-task-instances_versions.json"), 60 | "w", 61 | ) as f: 62 | json.dump(data_tasks, fp=f) 63 | -------------------------------------------------------------------------------- /swebench/versioning/README.md: -------------------------------------------------------------------------------- 1 | # Versioning 2 | To enable execution based evaluation, SWE-bench assigns each task instances a `version` (with 
respect to its repository), where the `version` is then a key for the installation instructions. 3 | 4 | This folder contains code for assigning the version of a task instance based on its repository. 5 | 6 | ## 🔧 General Purpose 7 | `get_versions.py` script is a general purpose tool for getting version from either A. reading the GitHub repository or B. from building the repository locally and locating the appropriate version files. 8 | Given a list of candidate task instances, the script assigns each task instance a new `version: ` key/value pair. 9 | 10 | This script can be invoked via the `./run_get_version.sh` script, where the arguments are: 11 | ``` 12 | python get_versions.py \ 13 | --instances_path [Required] [folder] Patch to candidate task instances \ 14 | --retrieval_method [Required] [choice] Method to retrieve versions ("build", "mix", or "github") \ 15 | --cleanup [Required] [bool] Remove testbed and conda environments upon task completion \ 16 | --conda_env [Required] [str] Name of conda environment to run task installation within \ 17 | --num_workers [Required] [int] Number of processes to parallelize on \ 18 | --path_conda [Required] [folder] Path to miniconda or anaconda installation \ 19 | --output_dir [Required] [folder] Path to directory to write versioned task instances to (overwrite by default) \ 20 | --testbed [Required] [folder] Path to testbed directory, for cloning GitHub repos to 21 | ``` 22 | 23 | ## 🌐 Repository Website-Based 24 | The `extract_web/get_versions_*.py` files are repository specific scripts that crawl the website of the PyPI package to find versions and their cut off dates. 25 | This script can be easily adapted to other repositories to check task instances' `creation_date` against the version dates. -------------------------------------------------------------------------------- /swebench/versioning/extract_web/get_versions_matplotlib.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import requests 5 | import sys 6 | 7 | from datetime import datetime 8 | 9 | sys.path.append("../../harness") 10 | from utils import get_instances 11 | 12 | PATH_TASKS_MATPLOTLIB = "" 13 | 14 | # Get raw matplotlib dataset 15 | data_tasks = get_instances(PATH_TASKS_MATPLOTLIB) 16 | 17 | # Get version to date from matplotlib home page 18 | resp = requests.get("https://matplotlib.org/stable/users/release_notes#past-versions") 19 | pattern = r'What\'s new in Matplotlib (.*)' 20 | matches = re.findall(pattern, resp.text) 21 | matches = list(set(matches)) 22 | 23 | # Get (date, version) pairs 24 | date_format = "%b %d, %Y" 25 | keep_major_minor = lambda x, sep: ".".join(x.strip().split(sep)[:2]) 26 | 27 | times = [] 28 | for match in matches: 29 | version, s = match[0], match[1] 30 | if "(" not in s: 31 | continue 32 | version = keep_major_minor(version, ".") 33 | date_string = s[s.find("(") + 1 : s.find(")")] 34 | date_obj = datetime.strptime(date_string, date_format) 35 | times.append((date_obj.strftime("%Y-%m-%d"), version)) 36 | times = sorted(times, key=lambda x: x[0])[::-1] 37 | 38 | for task in data_tasks: 39 | created_at = task["created_at"].split("T")[0] 40 | for t in times: 41 | if t[0] < created_at: 42 | task["version"] = t[1] 43 | break 44 | 45 | # Construct map of versions to task instances 46 | map_v_to_t = {} 47 | for t in data_tasks: 48 | if t["version"] not in map_v_to_t: 49 | map_v_to_t[t["version"]] = [] 50 | map_v_to_t[t["version"]].append(t) 51 | 52 | # Save matplotlib 
versioned data to repository 53 | with open( 54 | os.path.join(PATH_TASKS_MATPLOTLIB, "matplotlib-task-instances_versions.json"), 55 | "w", 56 | ) as f: 57 | json.dump(data_tasks, fp=f) 58 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('README.md', 'r', encoding='utf-8') as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name='swebench', 8 | author='John Yang', 9 | author_email='byjohnyang@gmail.com', 10 | description='The official SWE-bench package - a benchmark for evaluating LMs on software engineering', 11 | keywords='nlp, benchmark, code', 12 | long_description=long_description, 13 | long_description_content_type='text/markdown', 14 | url='https://swebench.com', 15 | project_urls={ 16 | 'Documentation': 'https://github.com/princeton-nlp/SWE-bench', 17 | 'Bug Reports': 'http://github.com/princeton-nlp/SWE-bench/issues', 18 | 'Source Code': 'http://github.com/princeton-nlp/SWE-bench', 19 | 'Website': 'https://swebench.com', 20 | }, 21 | packages=setuptools.find_packages(), 22 | classifiers=[ 23 | 'Programming Language :: Python :: 3.8', 24 | 'Programming Language :: Python :: 3.9', 25 | 'Programming Language :: Python :: 3 :: Only', 26 | 'License :: OSI Approved :: MIT License', 27 | 'Operating System :: OS Independent', 28 | ], 29 | python_requires='>=3.8', 30 | install_requires=[ 31 | 'beautifulsoup4', 32 | 'chardet', 33 | 'datasets', 34 | 'docker', 35 | 'ghapi', 36 | 'GitPython', 37 | 'pre-commit', 38 | 'python-dotenv', 39 | 'requests', 40 | 'rich', 41 | 'unidiff', 42 | 'tqdm', 43 | ], 44 | extras_require={ 45 | 'inference': [ 46 | 'tiktoken', 47 | 'openai', 48 | 'anthropic', 49 | 'transformers', 50 | 'peft', 51 | 'sentencepiece', 52 | 'protobuf', 53 | 'torch', 54 | 'flash_attn', 55 | 'triton', 56 | 'jedi', 57 | 'tenacity', 58 | ], 59 | }, 60 | include_package_data=True, 61 | ) -------------------------------------------------------------------------------- /swebench/collect/cleanup/delete_gh_workflows.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import subprocess 6 | 7 | 8 | def main(repo_url): 9 | """ 10 | Remove .github/workflows folder from all branches of a repo 11 | 12 | Args: 13 | repo_url (str): URL of the target repo 14 | """ 15 | # Get list of remote branches 16 | branches_command = subprocess.run( 17 | ["git", "ls-remote", "--heads", repo_url], capture_output=True, text=True 18 | ) 19 | branches = branches_command.stdout.strip().split("\n") 20 | branches = [branch.split()[1] for branch in branches] 21 | subprocess.run( 22 | ["git", "clone", repo_url, "temp_repo"], 23 | stderr=subprocess.DEVNULL, 24 | stdout=subprocess.DEVNULL, 25 | ) 26 | 27 | # Iterate through all branches 28 | os.chdir("temp_repo") 29 | for branch in branches: 30 | # Switch to branch 31 | print(f"--------------\nProcessing branch: {branch}") 32 | branch = branch.split("/")[-1] 33 | subprocess.run(["git", "checkout", branch]) 34 | 35 | workflows_path = os.path.join(".github", "workflows") 36 | if os.path.exists(workflows_path): 37 | # Remove .github/workflows folder if it exists 38 | print(f"Deleting .github/workflows folder from branch: {branch}") 39 | subprocess.run(["rm", "-rf", workflows_path]) 40 | subprocess.run(["git", "add", "-A"]) 41 | subprocess.run(["git", "commit", "-m", "Remove .github/workflows 
folder"]) 42 | subprocess.run(["git", "push"]) 43 | else: 44 | print(f".github/workflows folder not found in branch: {branch}") 45 | 46 | os.chdir("..") 47 | subprocess.run(["rm", "-rf", "temp_repo"]) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("--repo_url", type=str, required=True) 53 | args = parser.parse_args() 54 | main(**vars(args)) 55 | -------------------------------------------------------------------------------- /swebench/harness/dockerfiles.py: -------------------------------------------------------------------------------- 1 | # IF you change the base image, you need to rebuild all images (run with --force_rebuild) 2 | _DOCKERFILE_BASE = r""" 3 | FROM --platform={platform} ubuntu:22.04 4 | 5 | ARG DEBIAN_FRONTEND=noninteractive 6 | ENV TZ=Etc/UTC 7 | 8 | RUN apt update && apt install -y \ 9 | wget \ 10 | git \ 11 | build-essential \ 12 | libffi-dev \ 13 | libtiff-dev \ 14 | python3 \ 15 | python3-pip \ 16 | python-is-python3 \ 17 | jq \ 18 | curl \ 19 | locales \ 20 | locales-all \ 21 | tzdata \ 22 | && rm -rf /var/lib/apt/lists/* 23 | 24 | # Download and install conda 25 | RUN wget 'https://repo.anaconda.com/miniconda/Miniconda3-py311_24.7.1-0-Linux-x86_64.sh' -O miniconda.sh \ 26 | && bash miniconda.sh -b -p /opt/miniconda3 27 | # Add conda to PATH 28 | ENV PATH=/opt/miniconda3/bin:$PATH 29 | # Add conda to shell startup scripts like .bashrc (DO NOT REMOVE THIS) 30 | RUN conda init --all 31 | RUN conda config --append channels conda-forge 32 | 33 | RUN adduser --disabled-password --gecos 'dog' nonroot 34 | """ 35 | 36 | _DOCKERFILE_ENV = r"""FROM --platform={platform} sweb.base.{arch}:latest 37 | 38 | COPY ./setup_env.sh /root/ 39 | RUN chmod +x /root/setup_env.sh 40 | RUN /bin/bash -c "source ~/.bashrc && /root/setup_env.sh" 41 | 42 | WORKDIR /testbed/ 43 | 44 | # Automatically activate the testbed environment 45 | RUN echo "source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed" > /root/.bashrc 46 | """ 47 | 48 | _DOCKERFILE_INSTANCE = r"""FROM --platform={platform} {env_image_name} 49 | 50 | COPY ./setup_repo.sh /root/ 51 | RUN /bin/bash /root/setup_repo.sh 52 | 53 | WORKDIR /testbed/ 54 | """ 55 | 56 | 57 | def get_dockerfile_base(platform, arch): 58 | if arch == "arm64": 59 | conda_arch = "aarch64" 60 | else: 61 | conda_arch = arch 62 | return _DOCKERFILE_BASE.format(platform=platform, conda_arch=conda_arch) 63 | 64 | 65 | def get_dockerfile_env(platform, arch): 66 | return _DOCKERFILE_ENV.format(platform=platform, arch=arch) 67 | 68 | 69 | def get_dockerfile_instance(platform, env_image_name): 70 | return _DOCKERFILE_INSTANCE.format(platform=platform, env_image_name=env_image_name) 71 | -------------------------------------------------------------------------------- /swebench/collect/make_repo/make_repo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Mirror repository to https://github.com/swe-bench 4 | # Usage make_repo.sh {gh organization}/{gh repository} 5 | 6 | # Abort on error 7 | set -euo pipefail 8 | 9 | REPO_TARGET=$1 10 | 11 | # Check if the target repository exists 12 | gh repo view "$REPO_TARGET" > /dev/null || exit 1 13 | 14 | # Set the organization and repository names 15 | ORG_NAME="swe-train" 16 | NEW_REPO_NAME="${REPO_TARGET//\//__}" 17 | 18 | # Check if the new repository already exists 19 | # gh repo view "$ORG_NAME/$NEW_REPO_NAME" > /dev/null 2>&1 20 | # if [ $? 
-eq 0 ]; then 21 | # echo "The repository $ORG_NAME/$NEW_REPO_NAME already exists." 22 | # exit 1 23 | # else 24 | # # Create mirror repository 25 | gh repo create "$ORG_NAME/$NEW_REPO_NAME" --private 26 | # fi 27 | 28 | # Check if the repository creation was successful 29 | if [ $? -eq 0 ]; then 30 | echo "** Repository created successfully at $ORG_NAME/$NEW_REPO_NAME." 31 | else 32 | echo "Failed to create the repository." 33 | exit 1 34 | fi 35 | 36 | # Clone the target repository 37 | echo "** Cloning $REPO_TARGET..." 38 | TARGET_REPO_DIR="${REPO_TARGET##*/}.git" 39 | 40 | # Check if the local repository directory already exists 41 | if [ -d "$TARGET_REPO_DIR" ]; then 42 | echo "The local repository directory $TARGET_REPO_DIR already exists." 43 | exit 1 44 | fi 45 | 46 | git clone --bare git@github.com:$REPO_TARGET.git 47 | 48 | # Push files to the mirror repository 49 | echo "** Performing mirror push of files to $ORG_NAME/$NEW_REPO_NAME..." 50 | cd "$TARGET_REPO_DIR"; git push --mirror git@github.com:$ORG_NAME/$NEW_REPO_NAME 51 | 52 | # Remove the target repository 53 | cd ..; rm -rf "$TARGET_REPO_DIR" 54 | 55 | # Clone the mirror repository 56 | git clone git@github.com:$ORG_NAME/$NEW_REPO_NAME.git 57 | 58 | # Delete .github/workflows if it exists 59 | if [ -d "$NEW_REPO_NAME/.github/workflows" ]; then 60 | # Remove the directory 61 | rm -rf "$NEW_REPO_NAME/.github/workflows" 62 | 63 | # Commit and push the changes 64 | cd "$NEW_REPO_NAME"; 65 | git add -A; 66 | git commit -m "Removed .github/workflows"; 67 | git push origin main; # Change 'master' to your desired branch 68 | cd ..; 69 | else 70 | echo "$REPO_NAME/.github/workflows does not exist. No action required." 71 | fi 72 | 73 | rm -rf "$NEW_REPO_NAME" 74 | -------------------------------------------------------------------------------- /swebench/versioning/extract_web/get_versions_astropy.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import requests 5 | import sys 6 | 7 | from datetime import datetime 8 | 9 | sys.path.append("../../harness") 10 | from utils import get_instances 11 | 12 | PATH_TASKS_ASTROPY = "" 13 | 14 | # Get raw astropy dataset 15 | data_tasks = get_instances(PATH_TASKS_ASTROPY) 16 | 17 | # Get version to date from astropy homepage 18 | resp = requests.get("https://docs.astropy.org/en/latest/changelog.html") 19 | pattern = ( 20 | r'Version (.*)' 21 | ) 22 | matches = re.findall(pattern, resp.text) 23 | matches = list(set(matches)) 24 | 25 | # Get (date, version) pairs 26 | date_format = "%Y-%m-%d" 27 | keep_major_minor = lambda x, sep: ".".join(x.strip().split(sep)[:2]) 28 | 29 | # Iterate through matches, construct (version, date) pairs 30 | times = [] 31 | for match in matches: 32 | match_parts = match[1].split(" ") 33 | version, date = match_parts[0], match_parts[1].strip(")").strip("(") 34 | version = keep_major_minor(version, ".") 35 | date_obj = datetime.strptime(date, date_format) 36 | times.append((date_obj.strftime("%Y-%m-%d"), version)) 37 | 38 | # Group times by major/minor version 39 | map_version_to_times = {} 40 | for time in times: 41 | if time[1] not in map_version_to_times: 42 | map_version_to_times[time[1]] = [] 43 | map_version_to_times[time[1]].append(time[0]) 44 | 45 | # Pick the most recent time as the version cut off date 46 | version_to_time = [(k, max(v)) for k, v in map_version_to_times.items()] 47 | version_to_time = sorted(version_to_time, key=lambda x: x[0])[::-1] 48 | 49 | # Assign version 
to each task instance 50 | for task in data_tasks: 51 | created_at = task["created_at"].split("T")[0] 52 | for t in version_to_time: 53 | found = False 54 | if t[1] < created_at: 55 | task["version"] = t[0] 56 | found = True 57 | break 58 | if not found: 59 | task["version"] = version_to_time[-1][0] 60 | 61 | # Construct map of versions to task instances 62 | map_v_to_t = {} 63 | for task in data_tasks: 64 | if task["version"] not in map_v_to_t: 65 | map_v_to_t[task["version"]] = [] 66 | map_v_to_t[task["version"]].append(t) 67 | 68 | # Save matplotlib versioned data to repository 69 | with open( 70 | os.path.join(PATH_TASKS_ASTROPY, "astropy-task-instances_versions.json"), 71 | "w", 72 | ) as f: 73 | json.dump(data_tasks, fp=f) 74 | -------------------------------------------------------------------------------- /scripts/eval/convert_od_output_to_swe_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import pandas as pd 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('od_output_file', type=str) 8 | args = parser.parse_args() 9 | output_filepath = args.od_output_file.replace('.jsonl', '.swebench.jsonl') 10 | print(f'Converting {args.od_output_file} to {output_filepath}') 11 | 12 | od_format = pd.read_json(args.od_output_file, orient='records', lines=True) 13 | # model name is the folder name of od_output_file 14 | model_name = os.path.basename(os.path.dirname(args.od_output_file)) 15 | 16 | 17 | def process_git_patch(patch): 18 | if not isinstance(patch, str): 19 | return '' 20 | 21 | if not patch.strip(): 22 | # skip empty patches 23 | return '' 24 | 25 | patch = patch.replace('\r\n', '\n') 26 | # There might be some weird characters at the beginning of the patch 27 | # due to some OpenHands inference command outputs 28 | 29 | # FOR EXAMPLE: 30 | # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90 31 | # 0 32 | # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py 33 | # new file mode 100644 34 | # index 0000000000..fc13db5948 35 | 36 | # We "find" the first line that starts with "diff" and then we remove lines before it 37 | lines = patch.split('\n') 38 | for i, line in enumerate(lines): 39 | if line.startswith('diff --git'): 40 | patch = '\n'.join(lines[i:]) 41 | break 42 | 43 | patch = patch.rstrip() + '\n' # Make sure the last line ends with a newline 44 | return patch 45 | 46 | 47 | def convert_row_to_swebench_format(row): 48 | if 'git_patch' in row: 49 | model_patch = row['git_patch'] 50 | elif 'test_result' in row and 'git_patch' in row['test_result']: 51 | model_patch = row['test_result']['git_patch'] 52 | else: 53 | raise ValueError(f'Row {row} does not have a git_patch') 54 | 55 | return { 56 | 'instance_id': row['instance_id'], 57 | 'model_patch': process_git_patch(model_patch), 58 | 'model_name_or_path': model_name, 59 | } 60 | 61 | 62 | swebench_format = od_format.apply(convert_row_to_swebench_format, axis=1) 63 | swebench_format.to_json(output_filepath, lines=True, orient='records') 64 | -------------------------------------------------------------------------------- /swebench/versioning/extract_web/get_versions_pvlib-python.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import requests 4 | import sys 5 | 6 | from datetime import datetime 7 | 8 | sys.path.append("/n/fs/nlp-jy1682/swe-bench/public/harness") 9 | from utils import get_instances 10 | 
sys.path = sys.path[:-1] 11 | 12 | PATH_TASKS_PVLIB = "" 13 | PATH_TASKS_PVLIB_V = "" 14 | WEBPAGE = "https://pvlib-python.readthedocs.io/en/stable/whatsnew.html" 15 | PATTERN = r'\n\s+v(.*)\n\s+<\/a>' 16 | DATE_FORMAT = "%B %d, %Y" 17 | 18 | # Get raw astropy dataset 19 | data_tasks = get_instances(PATH_TASKS_PVLIB) 20 | 21 | # Get version to date from astropy homepage 22 | resp = requests.get(WEBPAGE) 23 | matches = re.findall(PATTERN, resp.text) 24 | matches = list(set(matches)) 25 | 26 | # Get (date, version) pairs 27 | keep_major_minor = lambda x, sep: ".".join(x.strip().split(sep)[:2]) 28 | 29 | # Iterate through matches, construct (version, date) pairs 30 | times = [] 31 | for match in matches: 32 | match_parts = match[1].split(" (") 33 | version = '.'.join(match_parts[0].split('.')[:-1]) 34 | date = match_parts[1].strip(')').strip('(') 35 | date_obj = datetime.strptime(date, DATE_FORMAT) 36 | times.append((date_obj.strftime("%Y-%m-%d"), version)) 37 | 38 | # Group times by major/minor version 39 | map_version_to_times = {} 40 | for time in times: 41 | if time[1] not in map_version_to_times: 42 | map_version_to_times[time[1]] = [] 43 | map_version_to_times[time[1]].append(time[0]) 44 | 45 | # Pick the most recent time as the version cut off date 46 | version_to_time = [(k, max(v)) for k, v in map_version_to_times.items()] 47 | version_to_time = sorted(version_to_time, key=lambda x: x[0])[::-1] 48 | 49 | # Assign version to each task instance 50 | for task in data_tasks: 51 | created_at = task["created_at"].split("T")[0] 52 | for t in version_to_time: 53 | found = False 54 | if t[1] < created_at: 55 | task["version"] = t[0] 56 | found = True 57 | break 58 | if not found: 59 | task["version"] = version_to_time[-1][0] 60 | 61 | # Construct map of versions to task instances 62 | map_v_to_t = {} 63 | for task in data_tasks: 64 | if task["version"] not in map_v_to_t: 65 | map_v_to_t[task["version"]] = [] 66 | map_v_to_t[task["version"]].append(t) 67 | 68 | # Save matplotlib versioned data to repository 69 | with open(PATH_TASKS_PVLIB_V, "w") as f: 70 | json.dump(data_tasks, fp=f) -------------------------------------------------------------------------------- /swebench/collect/make_lite/make_lite.py: -------------------------------------------------------------------------------- 1 | from criteria import ( 2 | contains_git_commit_hash, 3 | contains_hyperlinks, 4 | contains_image, 5 | contains_issue_reference, 6 | contains_non_modified_files, 7 | contains_pytest_match_arg, 8 | leq_n_code_lines, 9 | leq_n_files, 10 | leq_n_hunks, 11 | leq_n_words, 12 | ) 13 | from datasets import load_dataset, disable_caching, DatasetDict 14 | disable_caching() 15 | 16 | 17 | def filter_problem_statement(instance): 18 | problem_statement = instance["problem_statement"] 19 | repo = instance["repo"] 20 | if leq_n_words(problem_statement, 40) or \ 21 | contains_hyperlinks(problem_statement, repo) or \ 22 | contains_issue_reference(problem_statement, repo) or \ 23 | contains_git_commit_hash(problem_statement) or \ 24 | contains_image(problem_statement): 25 | return False 26 | return True 27 | 28 | 29 | def filter_patch(instance): 30 | patch_text = instance["patch"] 31 | if contains_non_modified_files(patch_text) or \ 32 | not leq_n_files(patch_text, 1) or \ 33 | not leq_n_hunks(patch_text, 3): 34 | return False 35 | return True 36 | 37 | 38 | def filter_patch_test(instance): 39 | patch_text = instance["test_patch"] 40 | if contains_pytest_match_arg(patch_text): 41 | return False 42 | return True 43 | 44 | 45 | 
def apply_filters(dset, filters, name=''): 46 | print(f'Starting with {len(dset)} instances', end='') 47 | if name: 48 | print(f' for {name}.') 49 | else: 50 | print('.') 51 | for _filter in filters: 52 | dset = dset.filter(_filter, desc=f'Applying {_filter.__name__}') 53 | print(f'After filtering {len(dset)}.') 54 | return dset 55 | 56 | 57 | def take_subset(dset, n, name=''): 58 | dset = dset.sort("instance_id") 59 | print(f'Starting with {len(dset)} instances', end='') 60 | if name: 61 | print(f' for {name}.') 62 | else: 63 | print('.') 64 | dset = dset.shuffle(seed=42).select(range(n)) 65 | print(f'Sampled {len(dset)} instances.') 66 | return dset 67 | 68 | 69 | if __name__ == "__main__": 70 | # Load the dataset 71 | dev = load_dataset("princeton-nlp/SWE-bench")['dev'] 72 | test = load_dataset("princeton-nlp/SWE-bench")['test'] 73 | 74 | test = apply_filters(test, [filter_problem_statement, filter_patch, filter_patch_test], 'test') 75 | test = take_subset(test, 300, 'test') 76 | dev = apply_filters(dev, [filter_problem_statement, filter_patch, filter_patch_test], 'dev') 77 | dset = DatasetDict({'dev': dev, 'test': test}) 78 | # Save the filtered dataset to disk 79 | dset.save_to_disk("SWE-bench_lite") 80 | print("Saved to SWE-bench_lite.") -------------------------------------------------------------------------------- /swebench/inference/llamao/distributed_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import torch 7 | 8 | from typing import Any 9 | from torch import Tensor 10 | from torch.nn import Module 11 | 12 | import torch.distributed as dist 13 | 14 | class SeqAllToAll(torch.autograd.Function): 15 | @staticmethod 16 | def forward(ctx: Any, input: Tensor, scatter_idx: int, gather_idx: int, group: Any) -> Tensor: 17 | ctx.scatter_idx = scatter_idx 18 | ctx.gather_idx = gather_idx 19 | ctx.group = group 20 | 21 | world_size = dist.get_world_size(group) 22 | 23 | input_list = [t.contiguous() for t in torch.tensor_split(input, world_size, scatter_idx)] 24 | output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)] 25 | 26 | dist.all_to_all(output_list, input_list, group=group) 27 | return torch.cat(output_list, dim=gather_idx).contiguous() 28 | 29 | @staticmethod 30 | def backward(ctx: Any, *grad_output: Tensor) -> tuple[Tensor, None, None, None]: 31 | return (SeqAllToAll.apply(*grad_output, ctx.gather_idx, ctx.scatter_idx, ctx.group), None, None, None) 32 | 33 | 34 | class DistributedAttention(torch.nn.Module): 35 | """Initialization. 
36 | 37 | Arguments: 38 | local_attention (Module): local attention with q,k,v 39 | scatter_idx (int): scatter_idx for all2all comm 40 | gather_idx (int): gather_idx for all2all comm 41 | """ 42 | 43 | def __init__( 44 | self, 45 | local_attention: Module, 46 | scatter_idx: int = -2, 47 | gather_idx: int = 1, 48 | ) -> None: 49 | 50 | super().__init__() 51 | self.local_attn = local_attention 52 | self.scatter_idx = scatter_idx # head axis 53 | self.gather_idx = gather_idx # seq axis 54 | 55 | def forward(self, query: Tensor, key_values: Tensor, group: Any = None, **kwargs) -> Tensor: 56 | """ forward 57 | 58 | Arguments: 59 | query (Tensor): query input to the layer 60 | key (Tensor): key input to the layer 61 | value (Tensor): value input to the layer 62 | args: other args 63 | 64 | Returns: 65 | * output (Tensor): context output 66 | """ 67 | #in shape : e.g., [s/p:h:] 68 | query_heads = SeqAllToAll.apply(query, self.scatter_idx, self.gather_idx, group) 69 | key_values_heads = SeqAllToAll.apply(key_values, self.scatter_idx, self.gather_idx, group) 70 | 71 | #out shape : e.g., [s:h/p:] 72 | output_heads = self.local_attn(query_heads, key_values_heads, **kwargs) 73 | 74 | #out e.g., [s/p::h] 75 | return SeqAllToAll.apply(output_heads, self.gather_idx, self.scatter_idx, group) -------------------------------------------------------------------------------- /docs/20240415_eval_bug/sweep_conda_links.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | """ 5 | This script is used to sweep through a list of conda links and run the evaluation script on each one. 6 | 7 | It was originally invoked from the swebench/harness/ folder. 8 | """ 9 | 10 | conda_links = [ 11 | "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.9.0-0-Linux-x86_64.sh", 12 | "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Linux-x86_64.sh", 13 | "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.9.0-0-Linux-x86_64.sh", 14 | "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.10.0-1-Linux-x86_64.sh", 15 | "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.10.0-1-Linux-x86_64.sh", 16 | "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.10.0-1-Linux-x86_64.sh", 17 | "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.10.0-1-Linux-x86_64.sh", 18 | "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-1-Linux-x86_64.sh", 19 | "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-1-Linux-x86_64.sh", 20 | "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-1-Linux-x86_64.sh", 21 | "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.11.0-1-Linux-x86_64.sh", 22 | "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-2-Linux-x86_64.sh", 23 | "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-Linux-x86_64.sh", 24 | "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh", 25 | "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.11.0-2-Linux-x86_64.sh", 26 | ] 27 | 28 | for conda_link in conda_links: 29 | version = conda_link.split("/")[-1]\ 30 | .split("-", 1)[1]\ 31 | .rsplit("-", 2)[0]\ 32 | .replace(".", "_")\ 33 | .replace("-", "_") 34 | os.makedirs(f"/n/fs/p-swe-bench/results/{version}/", exist_ok=True) 35 | 36 | cmd = ( 37 | "python evaluation.py " 38 | "--predictions_path /n/fs/p-swe-bench/data/original/gold_preds.jsonl " 39 | "--swe_bench_tasks /n/fs/p-swe-bench/data/original/swe-bench.json " 40 | f"--log_dir 
/n/fs/p-swe-bench/results/{version}/ " 41 | f"--conda_link {conda_link} " 42 | "--testbed /n/fs/p-swe-bench/testbed/ " 43 | "--timeout 1200 " 44 | "--verbose " 45 | ) 46 | 47 | # Run subprocess 48 | subprocess.run(cmd, shell=True) 49 | 50 | # Move results, scorecard to results/{version} log_dir 51 | subprocess.run( 52 | f"mv /n/fs/p-swe-bench/data/original/results.json /n/fs/p-swe-bench/results/{version}/results.json", 53 | shell=True 54 | ) 55 | subprocess.run( 56 | f"mv /n/fs/p-swe-bench/data/original/scorecard.json /n/fs/p-swe-bench/results/{version}/scorecard.json", 57 | shell=True 58 | ) 59 | 60 | # Clear testbed 61 | subprocess.run(f"rm -rf /n/fs/p-swe-bench/testbed/*", shell=True) 62 | -------------------------------------------------------------------------------- /swebench/inference/make_datasets/eval_retrieval.py: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env python 3 | 4 | """This script can be used to evaluate the BM25 retrieval results for a dataset created with create_text_dataset.py with the --retrieval_file option and --file_source bm25.""" 5 | 6 | import re 7 | import numpy as np 8 | from datasets import load_dataset, disable_caching, load_from_disk 9 | from argparse import ArgumentParser 10 | import logging 11 | 12 | disable_caching() 13 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def main(dataset_name_or_path, split): 18 | try: 19 | dataset = load_dataset(dataset_name_or_path, split=split) 20 | except: 21 | dataset = load_from_disk(dataset_name_or_path, split=split) 22 | print(f'Evaluating {len(dataset)} instances from {dataset_name_or_path} {split} split') 23 | instance_files_pattern = re.compile(r'\[start of ([\w\.\-\/]+)\]\n(?:.+?)\n\[end of \1\]', re.DOTALL) 24 | patch_files_pattern = re.compile(r'\-\-\- a/(.+)') 25 | patch_files = {instance['instance_id']: instance['patch'] for instance in dataset} 26 | recalls_any = list() 27 | recalls_all = list() 28 | recalls = list() 29 | for datum in dataset: 30 | instance_id = datum['instance_id'] 31 | retrieved_files = instance_files_pattern.findall(datum['text']) 32 | if retrieved_files and 'readme' in retrieved_files[0].lower(): 33 | retrieved_files = retrieved_files[1:] # first file is usually the readme, we don't want to count that 34 | retrieved_files = set(retrieved_files) 35 | gold_files = set(patch_files_pattern.findall(patch_files[instance_id])) 36 | if len(gold_files) == 0: 37 | print(f"WARNING: Instance {datum['instance_id']} has no gold files") 38 | continue 39 | if len(retrieved_files) == 0: 40 | print(f"WARNING: Instance {datum['instance_id']} has no retrieved files") 41 | continue 42 | recall = len(retrieved_files.intersection(gold_files)) / len(gold_files) 43 | recalls.append(recall) 44 | recalls_any.append(int(recall > 0)) 45 | recalls_all.append(int(recall == 1)) 46 | recalls = np.array(recalls) 47 | recalls_any = np.array(recalls_any) 48 | recalls_all = np.array(recalls_all) 49 | print(f"Avg Recall: {np.mean(recalls)*100:.2f}") 50 | print(f"All Recall: {np.mean(recalls_all)*100:.2f}") 51 | print(f"Any Recall: {np.mean(recalls_any)*100:.2f}") 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = ArgumentParser(description=__doc__) 56 | parser.add_argument('--dataset_name_or_path', type=str, default='princeton-nlp/SWE-bench_bm25_13K') 57 | parser.add_argument('--split', type=str, default='test') 58 | args = parser.parse_args() 59 | 
main(**vars(args)) 60 | -------------------------------------------------------------------------------- /swebench/collect/print_pulls.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Given the `` of a GitHub repo, this script writes the raw information for all the repo's PRs to a single `.jsonl` file.""" 4 | 5 | from __future__ import annotations 6 | 7 | import argparse 8 | import json 9 | import logging 10 | import os 11 | 12 | from datetime import datetime 13 | from fastcore.xtras import obj2dict 14 | from swebench.collect.utils import Repo 15 | from typing import Optional 16 | 17 | logging.basicConfig( 18 | level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 19 | ) 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def log_all_pulls( 24 | repo: Repo, 25 | output: str, 26 | max_pulls: int = None, 27 | cutoff_date: str = None, 28 | ) -> None: 29 | """ 30 | Iterate over all pull requests in a repository and log them to a file 31 | 32 | Args: 33 | repo (Repo): repository object 34 | output (str): output file name 35 | """ 36 | cutoff_date = datetime.strptime(cutoff_date, "%Y%m%d") \ 37 | .strftime("%Y-%m-%dT%H:%M:%SZ") \ 38 | if cutoff_date is not None else None 39 | 40 | with open(output, "w") as file: 41 | for i_pull, pull in enumerate(repo.get_all_pulls()): 42 | setattr(pull, "resolved_issues", repo.extract_resolved_issues(pull)) 43 | print(json.dumps(obj2dict(pull)), end="\n", flush=True, file=file) 44 | if max_pulls is not None and i_pull >= max_pulls: 45 | break 46 | if cutoff_date is not None and pull.created_at < cutoff_date: 47 | break 48 | 49 | def main( 50 | repo_name: str, 51 | output: str, 52 | token: Optional[str] = None, 53 | max_pulls: int = None, 54 | cutoff_date: str = None, 55 | ): 56 | """ 57 | Logic for logging all pull requests in a repository 58 | 59 | Args: 60 | repo_name (str): name of the repository 61 | output (str): output file name 62 | token (str, optional): GitHub token 63 | """ 64 | if token is None: 65 | token = os.environ.get("GITHUB_TOKEN") 66 | owner, repo = repo_name.split("/") 67 | repo = Repo(owner, repo, token=token) 68 | log_all_pulls(repo, output, max_pulls=max_pulls, cutoff_date=cutoff_date) 69 | 70 | 71 | if __name__ == "__main__": 72 | parser = argparse.ArgumentParser(description=__doc__) 73 | parser.add_argument("repo_name", type=str, help="Name of the repository") 74 | parser.add_argument("output", type=str, help="Output file name") 75 | parser.add_argument("--token", type=str, help="GitHub token") 76 | parser.add_argument("--max_pulls", type=int, help="Maximum number of pulls to log", default=None) 77 | parser.add_argument("--cutoff_date", type=str, help="Cutoff date for PRs to consider in format YYYYMMDD", default=None) 78 | args = parser.parse_args() 79 | main(**vars(args)) 80 | -------------------------------------------------------------------------------- /swebench/collect/build_dataset_ft.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import json 6 | import os 7 | import random 8 | 9 | from tqdm import tqdm 10 | from datetime import datetime 11 | 12 | 13 | def main(instances_path: str, output_path: str, eval_path: str, seed: int): 14 | """ 15 | Combine all non-eval task instances into a single fine tuning dataset 16 | 17 | Args: 18 | instances_path (str): Path to directory containing all candidate task instances 19 | 
output_path (str): Path to save output fine tuning dataset to 20 | eval_path (str): Path to directory containing all eval task instances 21 | seed (int): Random seed 22 | """ 23 | # Define output file name 24 | random.seed(seed) 25 | SWE_PRS_FT_DATASET = ( 26 | f"SWE_PRS_FT_DATASET_{datetime.now().strftime('%Y%m%d%H')}_{seed}.jsonl" 27 | ) 28 | destination = os.path.join(output_path, SWE_PRS_FT_DATASET) 29 | total_insts, total_repos = 0, 0 30 | 31 | # Gather Evaluation Set Task Instances 32 | eval_instances = [] 33 | for x in glob.glob(os.path.join(eval_path, "*-task-instances.jsonl")): 34 | with open(x) as f: 35 | eval_instances.extend(f.readlines()) 36 | eval_instances = set(eval_instances) 37 | 38 | # Create fine tuning dataset 39 | with open(destination, "w") as f_out: 40 | for dataset_path in tqdm( 41 | glob.glob(os.path.join(instances_path, "*-task-instances.jsonl.all")) 42 | ): 43 | total_repos += 1 44 | with open(dataset_path) as f: 45 | lines = f.readlines() 46 | 47 | # Remove data from evaluation dataset 48 | lines = [line for line in lines if line not in eval_instances] 49 | 50 | # Shuffle lines 51 | random.shuffle(lines) 52 | 53 | # Keep 500 lines per dataset 54 | for line in lines[:500]: 55 | line = json.loads(line) 56 | if "test_patch" in line: 57 | del line["test_patch"] 58 | f_out.write(json.dumps(line) + "\n") 59 | total_insts += 1 60 | 61 | print( 62 | f"Fine tuning dataset saved to {destination} ({total_insts} instances from {total_repos} repos)" 63 | ) 64 | 65 | 66 | if __name__ == "__main__": 67 | parser = argparse.ArgumentParser() 68 | parser.add_argument( 69 | "--instances_path", 70 | type=str, 71 | help="Path to directory containing all candidate task instances", 72 | ) 73 | parser.add_argument( 74 | "--output_path", type=str, help="Path to save output fine tuning dataset to" 75 | ) 76 | parser.add_argument( 77 | "--eval_path", 78 | type=str, 79 | help="Path to directory containing all eval task instances", 80 | ) 81 | parser.add_argument("--seed", type=int, default=42, help="Random seed") 82 | args = parser.parse_args() 83 | main(**vars(args)) 84 | -------------------------------------------------------------------------------- /swebench/versioning/extract_web/get_versions_sqlfluff.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import sys 5 | from ghapi.core import GhApi 6 | 7 | sys.path.append("../../harness") 8 | from utils import get_instances 9 | 10 | GITHUB_TOKEN = "" 11 | PATH_TASKS_SQLFLUFF = "" 12 | PATH_TO_SAVE = "" 13 | 14 | # Get raw sqlfluff dataset 15 | data_tasks = get_instances(PATH_TASKS_SQLFLUFF) 16 | 17 | # Get all GitHub releases 18 | api = GhApi(token=GITHUB_TOKEN) 19 | 20 | releases, i = [], 0 21 | while True: 22 | temp = api.repos.list_releases('sqlfluff', 'sqlfluff', 100, i + 1) 23 | releases.extend(temp) 24 | if len(temp) < 100: 25 | break 26 | i += 1 27 | pairs = [(x['name'], x['published_at']) for x in releases] 28 | 29 | def process(x): 30 | """Extract version number from name""" 31 | if x.startswith('SQLFluff '): 32 | x = x[len('SQLFluff '):] 33 | pattern = re.compile(r'\[[\d\.\w]*\] - \d*-\d*-\d*') 34 | matches = pattern.findall(x) 35 | if len(matches) > 0: 36 | parts = x.split(' - ') 37 | version = parts[0].replace('[', '').replace(']', '') 38 | version = version.rsplit('.', 1)[0] 39 | return (version, parts[1]) 40 | 41 | pattern = re.compile(r'\d\.\d\.[\d\.]*') 42 | matches = pattern.findall(x) 43 | if len(matches) > 0: 44 | version = matches[0] 45 | 
version = version.rsplit('.', 1)[0] 46 | return (version, None) 47 | 48 | return (None, None) 49 | 50 | # Collect version/date pairs 51 | version_date_map = {} 52 | for pair in pairs: 53 | pair_rv = process(pair[0]) 54 | if pair_rv[0] == None: 55 | continue 56 | version = pair_rv[0] 57 | if version.startswith('Bugfix Release '): 58 | version = version[len('Bugfix Release '):] 59 | date = pair[1] if pair_rv[1] == None else pair_rv[1] 60 | if version in version_date_map: 61 | version_date_map[version] = max( 62 | version_date_map[version], 63 | date 64 | ) 65 | else: 66 | version_date_map[version] = date 67 | 68 | # Get (date, version) pairs 69 | times = [(v, k) for k, v in version_date_map.items()] 70 | times = sorted(times, key=lambda x: x[0])[::-1] 71 | 72 | # Iterate through data_tasks and assign versions 73 | for task in data_tasks: 74 | created_at = task["created_at"].split("T")[0] 75 | set_version = False 76 | for t in times: 77 | if t[0] < created_at: 78 | task["version"] = t[1] 79 | set_version = True 80 | break 81 | if not set_version: 82 | task["version"] = None 83 | 84 | # Save sqlfluff versioned data to repository 85 | versioned_path = "sqlfluff-task-instances_versions.json" 86 | with open( 87 | os.path.join(PATH_TO_SAVE, versioned_path), 88 | "w", 89 | ) as f: 90 | json.dump(data_tasks, fp=f) 91 | 92 | # Print all versions 93 | versioned = json.load(open(os.path.join(PATH_TO_SAVE, versioned_path))) 94 | print(sorted(list({t['version'] for t in versioned if t['version'] is not None}))) -------------------------------------------------------------------------------- /swebench/versioning/constants.py: -------------------------------------------------------------------------------- 1 | # Constants - Task Instance Version File 2 | MAP_REPO_TO_VERSION_PATHS = { 3 | "dbt-labs/dbt-core": ["core/dbt/version.py", "core/dbt/__init__.py"], 4 | "django/django": ["django/__init__.py"], 5 | "huggingface/transformers": ["src/transformers/__init__.py"], 6 | "marshmallow-code/marshmallow": ["src/marshmallow/__init__.py"], 7 | "mwaskom/seaborn": ["seaborn/__init__.py"], 8 | "pallets/flask": ["src/flask/__init__.py", "flask/__init__.py"], 9 | "psf/requests": ["requests/__version__.py", "requests/__init__.py", "src/requests/__version__.py"], 10 | "pyca/cryptography": [ 11 | "src/cryptography/__about__.py", 12 | "src/cryptography/__init__.py", 13 | ], 14 | "pylint-dev/astroid": ["astroid/__pkginfo__.py", "astroid/__init__.py"], 15 | "pylint-dev/pylint": ["pylint/__pkginfo__.py", "pylint/__init__.py"], 16 | "pytest-dev/pytest": ["src/_pytest/_version.py", "_pytest/_version.py" ], 17 | "pyvista/pyvista": ["pyvista/_version.py", "pyvista/__init__.py"], 18 | "Qiskit/qiskit": ["qiskit/VERSION.txt"], 19 | "scikit-learn/scikit-learn": ["sklearn/__init__.py"], 20 | "sphinx-doc/sphinx": ["sphinx/__init__.py"], 21 | "sympy/sympy": ["sympy/release.py", "sympy/__init__.py"], 22 | "facebookresearch/hydra": ["hydra/__init__.py"], 23 | 24 | } 25 | 26 | # Cosntants - Task Instance Version Regex Pattern 27 | MAP_REPO_TO_VERSION_PATTERNS = { 28 | k: [r'__version__ = [\'"](.*)[\'"]', r"VERSION = \((.*)\)"] 29 | for k in [ 30 | "dbt-labs/dbt-core", 31 | "django/django", 32 | "huggingface/transformers", 33 | "marshmallow-code/marshmallow", 34 | "mwaskom/seaborn", 35 | "pallets/flask", 36 | "psf/requests", 37 | "pyca/cryptography", 38 | "pylint-dev/astroid", 39 | "pylint-dev/pylint", 40 | "scikit-learn/scikit-learn", 41 | "sphinx-doc/sphinx", 42 | "sympy/sympy", 43 | "modin-project/modin", 44 | "facebookresearch/hydra" 
45 | ] 46 | } 47 | MAP_REPO_TO_VERSION_PATTERNS.update( 48 | { 49 | k: [ 50 | r'__version__ = [\'"](.*)[\'"]', 51 | r'__version__ = version = [\'"](.*)[\'"]', 52 | r"VERSION = \((.*)\)", 53 | ] 54 | for k in ["pytest-dev/pytest", "matplotlib/matplotlib"] 55 | } 56 | ) 57 | MAP_REPO_TO_VERSION_PATTERNS.update({k: [r"(.*)"] for k in ["Qiskit/qiskit"]}) 58 | MAP_REPO_TO_VERSION_PATTERNS.update({k: [r"version_info = [\d]+,[\d\s]+,"] for k in ["pyvista/pyvista"]}) 59 | 60 | SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/" 61 | 62 | # python/mypy 63 | MAP_REPO_TO_VERSION_PATHS.update({"python/mypy": ["mypy/version.py"]}) 64 | MAP_REPO_TO_VERSION_PATTERNS.update({"python/mypy": [r'__version__ = [\'"](.*)[\'"]', r"VERSION = \((.*)\)"]}) 65 | 66 | # getmoto/moto 67 | MAP_REPO_TO_VERSION_PATHS.update({"getmoto/moto": ["moto/__init__.py"]}) 68 | MAP_REPO_TO_VERSION_PATTERNS.update({"getmoto/moto": [r'__version__ = [\'"](.*)[\'"]', r"VERSION = \((.*)\)"]}) 69 | 70 | # conan-io/conan 71 | MAP_REPO_TO_VERSION_PATHS.update({"conan-io/conan": ["conans/__init__.py"]}) 72 | MAP_REPO_TO_VERSION_PATTERNS.update({"conan-io/conan": [r'__version__ = [\'"](.*)[\'"]', r"VERSION = \((.*)\)"]}) 73 | -------------------------------------------------------------------------------- /docs/20240415_eval_bug/check_harness.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "a5079bf3-97cd-40f3-ba6a-35cd662f7439", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import json\n", 11 | "\n", 12 | "from datasets import load_dataset\n", 13 | "from swebench import MAP_VERSION_TO_INSTALL" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "93b467a6-a450-49e7-9283-5bcb520f7f23", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "data = load_dataset(\"princeton-nlp/SWE-bench\", split=\"test\")\n", 24 | "\n", 25 | "# NOTE: We have not released the gold predictions, so this is just a placeholder and will not work\n", 26 | "golds = [json.loads(x) for x in open(\"gold_preds.jsonl\")]\n", 27 | "golds = {x['instance_id']: x for x in golds}" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "id": "889859d0-f5e7-4670-a133-0dc8fb1cdf75", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "repo_version_pairs = [\n", 38 | " (repo, version)\n", 39 | " for repo, version_map in MAP_VERSION_TO_INSTALL.items()\n", 40 | " for version in version_map.keys()\n", 41 | "]" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "id": "b9aaa3d4-4d8f-4b8c-b516-61c98ab0ccde", 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/plain": [ 53 | "126" 54 | ] 55 | }, 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "check_harness = []\n", 63 | "for repo, version in repo_version_pairs:\n", 64 | " subset = [x for x in data if x['repo'] == repo and x['version'] == version]\n", 65 | " if len(subset) == 0:\n", 66 | " continue\n", 67 | " subset = sorted(subset, key=lambda x: x['created_at'], reverse=False)\n", 68 | " inst_id = subset[-1]['instance_id']\n", 69 | " check_harness.append(golds[inst_id])\n", 70 | "len(check_harness)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 5, 76 | "id": "70591c07-8cc3-4f7f-800a-07eec5d4e5ff", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "with 
open(\"check-harness.jsonl\", \"w\") as f:\n", 81 | " for gold_pred in check_harness:\n", 82 | " print(json.dumps(gold_pred), file=f, flush=True)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "651f2c61-ed58-403c-ac6a-71f5375e8b59", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3 (ipykernel)", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.11.7" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 5 115 | } 116 | -------------------------------------------------------------------------------- /swebench/collect/cleanup/remove_envs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | import subprocess 6 | 7 | from multiprocessing import Pool 8 | 9 | 10 | def get_conda_env_names(output: str) -> list: 11 | """ 12 | Parse conda environments (`conda env list`) created for a particular conda installation 13 | 14 | Args: 15 | output (str): Output of `conda env list` command 16 | """ 17 | lines = output.split("\n") 18 | env_names = [] 19 | for line in lines: 20 | if line.startswith("#"): 21 | continue 22 | if line.strip() == "": 23 | continue 24 | if " " in line: 25 | env_name = line.split(" ")[0] 26 | env_names.append(env_name) 27 | return [x for x in env_names if len(x) > 0] 28 | 29 | 30 | def delete_folders_with_prefix(prefix, conda_path): 31 | """ 32 | Find and rm folders with a particular prefix in the conda installation's env folder 33 | 34 | Args: 35 | prefix (str): Prefix of folders to remove 36 | conda_path (str): Path to conda installation 37 | """ 38 | envs_folder = os.path.join(conda_path, "envs") 39 | command = f'find {envs_folder} -type d -name "{prefix}*" -exec rm -rf {{}} +' 40 | subprocess.run(command.split(" ")) 41 | 42 | 43 | def remove_environment(env_name, prefix): 44 | """ 45 | Remove all conda environments with a particular prefix from a conda installation 46 | """ 47 | if env_name.startswith(prefix): 48 | print(f"Removing {env_name}") 49 | conda_cmd = "conda remove -n " + env_name + " --all -y" 50 | cmd = conda_source + " && " + conda_cmd 51 | try: 52 | conda_create_output = subprocess.run(cmd.split(), check=True, capture_output=True, text=True) 53 | except subprocess.CalledProcessError as e: 54 | print(f"Error: {e}") 55 | print(f"Error output: {e.stderr}") 56 | raise e 57 | print(f"Output: {conda_create_output.stdout}") 58 | 59 | 60 | if __name__ == "__main__": 61 | """ 62 | Logic for removing conda environments and their folders from a conda installation 63 | """ 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument("prefix", type=str, help="Prefix for environments to delete") 66 | parser.add_argument( 67 | "--conda_path", 68 | type=str, 69 | help="Path to miniconda installation", 70 | ) 71 | args = parser.parse_args() 72 | 73 | # Remove conda environments with a specific prefix 74 | conda_source = "source " + os.path.join(args.conda_path, "etc/profile.d/conda.sh") 75 | check_env = conda_source + " && " + "conda env list" 76 | try: 77 | conda_envs = subprocess.run(check_env.split(" "), check=True, capture_output=True) 78 | 
except subprocess.CalledProcessError as e: 79 | print(f"Error: {e}") 80 | print(f"Error output: {e.stderr.decode('utf-8')}") 81 | raise e 82 | conda_envs_names = get_conda_env_names(conda_envs.stdout.decode("utf-8")) 83 | 84 | # Remove conda environments in parallel 85 | num_processes = 25 86 | pool = Pool(num_processes) 87 | pool.starmap( 88 | remove_environment, zip(conda_envs_names, [args.prefix] * len(conda_envs_names)) 89 | ) 90 | 91 | # Remove env folder with the same prefix 92 | print( 93 | f"Removing miniconda folder for environments with {args.prefix} from {args.conda_path}" 94 | ) 95 | delete_folders_with_prefix(args.prefix, args.conda_path) 96 | print(f"Done!") 97 | -------------------------------------------------------------------------------- /scripts/eval/update_output_with_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | from collections import defaultdict 5 | 6 | import pandas as pd 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('input_file', type=str) 10 | args = parser.parse_args() 11 | 12 | dirname = os.path.dirname(args.input_file) 13 | report_json = os.path.join(dirname, 'report.json') 14 | 15 | df = pd.read_json(args.input_file, lines=True) 16 | 17 | output_md_filepath = os.path.join(dirname, 'README.md') 18 | instance_id_to_status = defaultdict( 19 | lambda: {'resolved': False, 'empty_generation': False} 20 | ) 21 | if os.path.exists(report_json): 22 | with open(report_json, 'r') as f: 23 | report = json.load(f) 24 | 25 | output_md = ( 26 | "# SWE-bench Report\n" 27 | "This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n" 28 | "## Summary\n" 29 | f"- total instances: {report['total_instances']}\n" 30 | f"- submitted instances: {report['submitted_instances']}\n" 31 | f"- completed instances: {report['completed_instances']}\n" 32 | f"- empty patch instances: {report['empty_patch_instances']}\n" 33 | f"- resolved instances: {report['resolved_instances']}\n" 34 | f"- unresolved instances: {report['unresolved_instances']}\n" 35 | f"- error instances: {report['error_instances']}\n" 36 | f"- unstopped instances: {report['unstopped_instances']}\n" 37 | ) 38 | 39 | output_md += '\n## Resolved Instances\n' 40 | # instance_id to status 41 | for instance_id in report['resolved_ids']: 42 | instance_id_to_status[instance_id]['resolved'] = True 43 | output_md += ( 44 | f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n' 45 | ) 46 | 47 | output_md += '\n## Unresolved Instances\n' 48 | for instance_id in report['unresolved_ids']: 49 | output_md += ( 50 | f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n' 51 | ) 52 | 53 | output_md += '\n## Error Instances\n' 54 | for instance_id in report['error_ids']: 55 | instance_id_to_status[instance_id]['error_eval'] = True 56 | output_md += ( 57 | f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n' 58 | ) 59 | 60 | output_md += '\n## Empty Patch Instances\n' 61 | for instance_id in report['empty_patch_ids']: 62 | instance_id_to_status[instance_id]['empty_generation'] = True 63 | output_md += ( 64 | f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n' 65 | ) 66 | 67 | output_md += '\n## Incomplete Instances\n' 68 | for instance_id in report['incomplete_ids']: 69 | output_md += ( 
70 | f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n' 71 | ) 72 | 73 | # Apply the status to the dataframe 74 | def apply_report(row): 75 | instance_id = row['instance_id'] 76 | if instance_id in instance_id_to_status: 77 | return dict(instance_id_to_status[instance_id]) 78 | return row.get('report', {}) 79 | 80 | df['report'] = df.apply(apply_report, axis=1) 81 | 82 | 83 | if os.path.exists(args.input_file + '.bak'): 84 | conf = input('Existing backup file found. Do you want to overwrite it? (y/n)') 85 | if conf != 'y': 86 | exit() 87 | os.remove(args.input_file + '.bak') 88 | 89 | # backup the original file 90 | os.rename(args.input_file, args.input_file + '.bak') 91 | df.to_json(args.input_file, orient='records', lines=True) 92 | 93 | with open(output_md_filepath, 'w') as f: 94 | f.write(output_md) 95 | -------------------------------------------------------------------------------- /swebench/harness/prepare_images.py: -------------------------------------------------------------------------------- 1 | import docker 2 | import resource 3 | 4 | from argparse import ArgumentParser 5 | 6 | from swebench.harness.constants import KEY_INSTANCE_ID 7 | from swebench.harness.docker_build import build_instance_images 8 | from swebench.harness.docker_utils import list_images 9 | from swebench.harness.test_spec import make_test_spec 10 | from swebench.harness.utils import load_swebench_dataset, str2bool 11 | 12 | 13 | def filter_dataset_to_build( 14 | dataset: list, 15 | instance_ids: list, 16 | client: docker.DockerClient, 17 | force_rebuild: bool 18 | ): 19 | """ 20 | Filter the dataset to only include instances that need to be built. 21 | 22 | Args: 23 | dataset (list): List of instances (usually all of SWE-bench dev/test split) 24 | instance_ids (list): List of instance IDs to build. 25 | client (docker.DockerClient): Docker client. 26 | force_rebuild (bool): Whether to force rebuild all images. 27 | """ 28 | # Get existing images 29 | existing_images = list_images(client) 30 | data_to_build = [] 31 | 32 | # Check if all instance IDs are in the dataset 33 | not_in_dataset = set(instance_ids).difference(set([instance[KEY_INSTANCE_ID] for instance in dataset])) 34 | if not_in_dataset: 35 | raise ValueError(f"Instance IDs not found in dataset: {not_in_dataset}") 36 | 37 | for instance in dataset: 38 | if instance[KEY_INSTANCE_ID] not in instance_ids: 39 | # Skip instances not in the list 40 | continue 41 | 42 | # Check if the instance needs to be built (based on force_rebuild flag and existing images) 43 | spec = make_test_spec(instance) 44 | if force_rebuild: 45 | data_to_build.append(instance) 46 | elif spec.instance_image_key not in existing_images: 47 | data_to_build.append(instance) 48 | 49 | return data_to_build 50 | 51 | 52 | def main( 53 | dataset_name, 54 | split, 55 | instance_ids, 56 | max_workers, 57 | force_rebuild, 58 | open_file_limit, 59 | ): 60 | """ 61 | Build Docker images for the specified instances. 62 | 63 | Args: 64 | instance_ids (list): List of instance IDs to build. 65 | max_workers (int): Number of workers for parallel processing. 66 | force_rebuild (bool): Whether to force rebuild all images. 67 | open_file_limit (int): Open file limit. 
68 | """ 69 | # Set open file limit 70 | resource.setrlimit(resource.RLIMIT_NOFILE, (open_file_limit, open_file_limit)) 71 | client = docker.from_env() 72 | 73 | # Filter out instances that were not specified 74 | dataset = load_swebench_dataset(dataset_name, split) 75 | dataset = filter_dataset_to_build(dataset, instance_ids, client, force_rebuild) 76 | 77 | # Build images for remaining instances 78 | successful, failed = build_instance_images( 79 | client=client, 80 | dataset=dataset, 81 | force_rebuild=force_rebuild, 82 | max_workers=max_workers, 83 | ) 84 | print(f"Successfully built {len(successful)} images") 85 | print(f"Failed to build {len(failed)} images") 86 | 87 | 88 | if __name__ == "__main__": 89 | parser = ArgumentParser() 90 | parser.add_argument("--dataset_name", type=str, default="princeton-nlp/SWE-bench_Lite", help="Name of the dataset to use") 91 | parser.add_argument("--split", type=str, default="test", help="Split to use") 92 | parser.add_argument("--instance_ids", nargs="+", type=str, help="Instance IDs to run (space separated)") 93 | parser.add_argument("--max_workers", type=int, default=4, help="Max workers for parallel processing") 94 | parser.add_argument("--force_rebuild", type=str2bool, default=False, help="Force rebuild images") 95 | parser.add_argument("--open_file_limit", type=int, default=8192, help="Open file limit") 96 | args = parser.parse_args() 97 | main(**vars(args)) 98 | -------------------------------------------------------------------------------- /swebench/inference/README.md: -------------------------------------------------------------------------------- 1 | # SWE-bench Inference 2 | In this sub-package, we provide various tools to get started on SWE-bench inference. 3 | In particular, we provide the following important scripts and sub-packages: 4 | 5 | - `make_datasets`, this sub-package contains scripts to generate new datasets for SWE-bench inference with your own prompts and issues. 6 | - `run_api.py`, this script is used to generate API model generations for a given dataset. 7 | - `run_llama.py`, this script is used to run inference using Llama models, i.e. SWE-Llama. 8 | - `run_live.py`, this script is used to generate model generations for new issues on GitHub in real time. 9 | 10 | ## Installation 11 | To install the dependencies for this sub-package, you can run the following command: 12 | ```bash 13 | pip install -e .[inference] 14 | ``` 15 | 16 | ## `make_datasets` 17 | For more information on how to use this sub-package, please refer to the [README](./make_datasets/README.md) in the `make_datasets` sub-package. 18 | 19 | ## Run API inference on test datasets 20 | 21 | This python script is designed to run inference on a dataset using either the OpenAI or Anthropic API, depending on the model specified. It sorts instances by length and continually writes the outputs to a specified file, so that the script can be stopped and restarted without losing progress. 22 | 23 | For instance, to run this script on SWE-bench with the ``Oracle`` context and Anthropic's Claude 2 model, you can run the following command: 24 | ```bash 25 | export ANTHROPIC_API_KEY= 26 | python -m swebench.inference.run_api --dataset_name_or_path princeton-nlp/SWE-bench_oracle --model_name_or_path claude-2 --output_dir ./outputs 27 | ``` 28 | 29 | You can also specify further options: 30 | 31 | - `--split`: To specify the dataset split to use (default is "test"). 32 | - `--shard_id` and `--num_shards`: To process only a shard of the data. 
33 | - `--model_args`: A string containing comma-separated key=value pairs for arguments to pass to the model. (e.g. `--model_args="temperature=0.2,top_p=0.95"`) 34 | - `--max_cost`: The maximum cost to spend on inference total. 35 | 36 | 37 | ## Run inference using Llama models (i.e. SWE-Llama) 38 | 39 | You can run inference using [SWE-Llama](https://huggingface.co/princeton-nlp/SWE-Llama-13b) with the `run_llama.py` script. 40 | This script is similar to `run_api.py`, but it is designed to run inference using Llama models. 41 | 42 | For instance, to run this script on SWE-bench with the ``Oracle`` context and SWE-Llama, you can run the following command: 43 | ```bash 44 | python -m swebench.inference.run_llama \ 45 | --dataset_path princeton-nlp/SWE-bench_oracle \ 46 | --model_name_or_path princeton-nlp/SWE-Llama-13b \ 47 | --output_dir ./outputs \ 48 | --temperature 0 49 | ``` 50 | 51 | You can also specify further options: 52 | - `--split`: To specify the dataset split to use (default is "test"). 53 | - `--shard_id` and `--num_shards`: To process only a shard of the data. 54 | - `--temperature`: The temperature to use for sampling (default is 0). 55 | - `--top_p`: The top_p to use for sampling (default is 1). 56 | - `--peft_path`: The path or hf name for the PEFT adapter. 57 | 58 | 59 | ## Run live inference on open GitHub issues 60 | 61 | Follow instructions [here](https://github.com/castorini/pyserini/blob/master/docs/installation.md) to install [Pyserini](https://github.com/castorini/pyserini), to perform BM25 retrieval, and [here](https://github.com/facebookresearch/faiss/blob/main/INSTALL.md) to install [Faiss](https://github.com/facebookresearch/faiss). 62 | 63 | Then run `run_live.py` to try solving a new issue. For example, you can try solving [this issue](https://github.com/huggingface/transformers/issues/26706 ) by running the following command: 64 | 65 | ```bash 66 | export OPENAI_API_KEY= 67 | python -m swebench.inference.run_live --model_name gpt-3.5-turbo-1106 \ 68 | --issue_url https://github.com/huggingface/transformers/issues/26706 69 | ``` 70 | -------------------------------------------------------------------------------- /swebench/collect/get_top_pypi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os, json 4 | import argparse 5 | 6 | from bs4 import BeautifulSoup 7 | from ghapi.core import GhApi 8 | from selenium import webdriver 9 | from selenium.webdriver.common.by import By 10 | 11 | 12 | gh_token = os.environ.get("GITHUB_TOKEN") 13 | if not gh_token: 14 | msg = "Please set the GITHUB_TOKEN environment variable." 
15 | raise ValueError(msg) 16 | api = GhApi(token="gh_token") 17 | 18 | 19 | def get_package_stats(data_tasks, f): 20 | """ 21 | Get package stats from pypi page 22 | 23 | Args: 24 | data_tasks (list): List of packages + HTML 25 | f (str): File to write to 26 | """ 27 | # Adjust access type if file already exists 28 | content = None 29 | access_type = "w" 30 | if os.path.exists(f): 31 | with open(f) as fp_: 32 | content = fp_.read() 33 | access_type = "a" 34 | fp_.close() 35 | 36 | # Extra package title, pypi URL, stars, pulls, and github URL 37 | with open(f, access_type) as fp_: 38 | for idx, chunk in enumerate(data_tasks): 39 | # Get package name and pypi URL 40 | package_name = chunk["title"] 41 | package_url = chunk["href"] 42 | if content is not None and package_url in content: 43 | continue 44 | 45 | # Get github URL 46 | package_github = None 47 | driver.get(package_url) 48 | soup = BeautifulSoup(driver.page_source, "html.parser") 49 | for link in soup.find_all("a", class_="vertical-tabs__tab--with-icon"): 50 | found = False 51 | for x in ["Source", "Code", "Homepage"]: 52 | if ( 53 | x.lower() in link.get_text().lower() 54 | and "github" in link["href"].lower() 55 | ): 56 | package_github = link["href"] 57 | found = True 58 | break 59 | if found: 60 | break 61 | 62 | # Get stars and pulls from github API 63 | stars_count, pulls_count = None, None 64 | if package_github is not None: 65 | repo_parts = package_github.split("/")[-2:] 66 | owner, name = repo_parts[0], repo_parts[1] 67 | 68 | try: 69 | repo = api.repos.get(owner, name) 70 | stars_count = int(repo["stargazers_count"]) 71 | issues = api.issues.list_for_repo(owner, name) 72 | pulls_count = int(issues[0]["number"]) 73 | except: 74 | pass 75 | 76 | # Write to file 77 | print( 78 | json.dumps( 79 | { 80 | "rank": idx, 81 | "name": package_name, 82 | "url": package_url, 83 | "github": package_github, 84 | "stars": stars_count, 85 | "pulls": pulls_count, 86 | } 87 | ), 88 | file=fp_, 89 | flush=True, 90 | ) 91 | 92 | 93 | if __name__ == "__main__": 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument("--max-repos", help="Maximum number of repos to get", type=int, default=5000) 96 | args = parser.parse_args() 97 | 98 | # Start selenium driver to get top 5000 pypi page 99 | url_top_pypi = "https://hugovk.github.io/top-pypi-packages/" 100 | driver = webdriver.Chrome() 101 | driver.get(url_top_pypi) 102 | button = driver.find_element(By.CSS_SELECTOR, 'button[ng-click="show(8000)"]') 103 | button.click() 104 | 105 | # Retrieve HTML for packages from page 106 | soup = BeautifulSoup(driver.page_source, "html.parser") 107 | package_list = soup.find("div", {"class": "list"}) 108 | packages = package_list.find_all("a", class_="ng-scope") 109 | 110 | get_package_stats(packages[:args.max_repos], "pypi_rankings.jsonl") 111 | -------------------------------------------------------------------------------- /swebench/collect/README.md: -------------------------------------------------------------------------------- 1 | # Data Collection 2 | This folder includes the code for the first two parts of the benchmark construction procedure as described in the paper, specifically 1. Repo selection and data scraping, and 2. Attribute-based filtering. 3 | 4 | We include a comprehensive [tutorial](https://github.com/princeton-nlp/SWE-bench/tree/main/assets/collection.md) that describes the end-to-end procedure for collecting evaluation task instances from PyPI repositories. 
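For orientation, a typical end-to-end run of the collection pipeline described below might look like the following sketch. The flag names are assumptions inferred from `get_tasks_pipeline.py` and `run_get_tasks_pipeline.sh`; check those files for the authoritative interface.

```bash
# Hypothetical invocation sketch -- flag names are assumptions, not a verified interface;
# see get_tasks_pipeline.py / run_get_tasks_pipeline.sh for the actual arguments.
export GITHUB_TOKEN=<your GitHub token>
python -m swebench.collect.get_tasks_pipeline \
    --repos "owner/name" "owner2/name2" \
    --path_prs <folder to save PR .jsonl files> \
    --path_tasks <folder to save task instance .jsonl files>
```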
5 | 6 | > SWE-bench's collection pipeline is currently designed to target PyPI packages. We hope to expand SWE-bench to more repositories and languages in the future. 7 | 8 | 9 | 10 | ## Collection Procedure 11 | To run collection on your own repositories, run the `run_get_tasks_pipeline.sh` script. Given a repository or list of repositories (formatted as `owner/name`), for each repository this command will generate... 12 | * `-prs.jsonl` file containing the [metadata for every pull request](https://docs.github.com/rest/reference/pulls#list-pull-requests) from the repository. 13 | * `-task-instances.jsonl.all` file containing all *valid* task instances (has associated issues + gold patch). 14 |     * This file's values can be used for fine tuning purposes. 15 | * `-task-instances.jsonl` file containing *valid* task instances that also have associated *tests*. 16 |     * This file's values are candidate task instances. Once validated, they can be used for evaluation purposes. 17 |     * The `.jsonl.all` file includes these task instances as well. 18 | 19 | ## Directory Overview 20 | In this section, we briefly describe each of the files in this directory and its usage details. 21 | 22 | **🧐 GitHub Repository Selection** 23 | * `get_top_pypi.py` 24 |     * Purpose: Retrieves the PyPI URL, GitHub URL, # of ⭐, and # of Issues + PRs for the [top 5000](https://hugovk.github.io/top-pypi-packages/) most downloaded PyPI packages. 25 |     * Usage: `python get_top_pypi.py` 26 | 27 | **⛏️ GitHub Data Collection** 28 | * `print_pulls.py` 29 |     * Purpose: Given the `` of a GitHub repo, this script writes the raw information for all the repo's PRs to a single `.jsonl` file 30 |     * Usage: `python print_pulls.py --token ` 31 | * `build_dataset.py` 32 |     * Purpose: Given the path to a PRs `.jsonl` file generated by `print_pulls.py`, this script attempts to convert each PR to a task instance. It creates a `.jsonl.all` file for any PRs with an issue and a `.jsonl` file for any PRs with both an issue and modifications to that repository's tests. 33 |     * Usage: `python build_dataset.py --token ` 34 | * `get_tasks_pipeline.py` 35 |     * Purpose: Automates invocation of the repo → task instance construction pipeline (`print_pulls.py` + `build_dataset.py`) for multiple repositories 36 |     * Usage: `./run_get_tasks_pipeline.sh` (Check file for arguments) 37 | 38 | **🎵 Fine Tuning Dataset Construction** 39 | * `build_dataset_ft.py` 40 |     * Purpose: Given the path to a collection of `.jsonl.all` files generated by `build_dataset.py`, this is a simple script to combine all such files into a single `.jsonl` that can be used to construct an instruction tuning dataset based on [problem statement + original code, code Δ] pairs. 41 |     * Usage: `./run_build_dataset_ft.sh` (Check file for arguments) 42 | 43 | **🪞 Mirroring Repositories** 44 | * `make_repo.sh` 45 |     * Purpose: A script for creating a [mirror repository](https://docs.github.com/en/repositories/creating-and-managing-repositories/duplicating-a-repository) of an existing repository on GitHub. Examples available under the [swe-bench organization](https://github.com/orgs/swe-bench/repositories). 46 |     * Usage: `python call_make_repo.py` (Check file for arguments) 47 | 48 | **🧹 Clean Up** 49 | * `delete_gh_workflows.py` 50 |     * Purpose: Recurring workflows from mirror repositories can clog up your inbox for the email account associated with your GitHub token. Given a repo URL, this will automate removing the `.github/workflows` folder from all branches of a repository.
51 |     * Usage: `python delete_gh_workflows.py ` 52 | * `remove_envs.py` 53 |     * Purpose: SWE Bench's evaluation + validation harnesses rely on the creation of multiple virtual environments with conda to speed up benchmark evaluation. Use this script to parallelize conda environment removal for environments named with the same prefix. 54 |     * Usage: `python remove_envs.py  --conda_path ` 55 | -------------------------------------------------------------------------------- /docs/README_JP.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

6 | 7 | Kawi the SWE-Llama 8 | 9 |

10 | 11 |
12 | 13 | | [日本語](docs/README_JP.md) | [English](https://github.com/princeton-nlp/SWE-bench) | [中文简体](docs/README_CN.md) | [中文繁體](docs/README_TW.md) | 14 | 15 |
16 | 17 | 18 | --- 19 |

20 | ICLR 2024 の論文 SWE-bench: Can Language Models Resolve Real-World GitHub Issues? のコードとデータ 21 |
22 |
23 | 24 | Build 25 | 26 | 27 | License 28 | 29 |

30 | 31 | パブリックリーダーボードは[ウェブサイト](http://swe-bench.github.io)を、SWE-bench ベンチマークの最新アップデート情報は [change log](https://github.com/princeton-nlp/SWE-bench/blob/master/CHANGELOG.md) を参照してください。 32 | 33 | ## 👋 概要 34 | SWE-bench は、GitHub から収集された実世界のソフトウェアの課題に関する大規模言語モデルを評価するためのベンチマークです。 35 | *コードベース*と*イシュー*が与えられ、言語モデルは記述された問題を解決する*パッチ*を生成するタスクを行います。 36 | 37 | 38 | 39 | ## 🚀 セットアップ 40 | SWE-bench をソースからビルドするには、以下の手順に従ってください: 41 | 1. このリポジトリをローカルにクローンする 42 | 2. リポジトリに `cd` で移動する 43 | 3. `conda env create -f environment.yml` を実行して、`swe-bench` という名前の conda 環境を作成する 44 | 4. `conda activate swe-bench` で環境をアクティベートする 45 | 46 | ## 💽 使用法 47 | SWE-bench データセットは直接ダウンロードするか ([dev](https://drive.google.com/uc?export=download&id=1SbOxHiR0eXlq2azPSSOIDZz-Hva0ETpX), [test](https://drive.google.com/uc?export=download&id=164g55i3_B78F6EphCZGtgSrd2GneFyRM) セット)、[HuggingFace](https://huggingface.co/datasets/princeton-nlp/SWE-bench) からダウンロードできます。 48 | 49 | SWE-Bench を使用するには、以下のことができます: 50 | * 前処理済みのデータセットで独自のモデルを学習する 51 | * 既存のモデル (ディスクにあるLLaMAのようなモデルやGPT-4のようなAPIでアクセスできるモデル) で[推論](https://github.com/princeton-nlp/SWE-bench/blob/master/inference/)を実行する。推論ステップでは、レポとイシューを取得し、モデルにそれを修正するためのコードを生成させます。 52 | * SWE-bench に対してモデルを[評価](https://github.com/princeton-nlp/SWE-bench/blob/master/harness/)する。これは、SWE-Benchのタスクとモデルが提案したソリューションを受け取り、その正確性を評価するためのものです。 53 | * 独自のリポジトリに対してSWE-benchの[データ収集手順](https://github.com/princeton-nlp/SWE-bench/blob/master/collect/)を実行し、新しいSWE-Benchタスクを作成する。 54 | 55 | ## ⬇️ ダウンロード 56 | | データセット | モデル | 57 | | - | - | 58 | | [🤗 SWE-bench](https://huggingface.co/datasets/princeton-nlp/SWE-bench) | [🦙 SWE-Llama 13b](https://huggingface.co/princeton-nlp/SWE-Llama-13b) | 59 | | [🤗 "Oracle" Retrieval](https://huggingface.co/datasets/princeton-nlp/SWE-bench_oracle) | [🦙 SWE-Llama 13b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-13b-peft) | 60 | | [🤗 BM25 Retrieval 13K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_13K) | [🦙 SWE-Llama 7b](https://huggingface.co/princeton-nlp/SWE-Llama-7b) | 61 | | [🤗 BM25 Retrieval 27K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_27K) | [🦙 SWE-Llama 7b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-7b-peft) | 62 | | [🤗 BM25 Retrieval 40K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_40K) | | 63 | | [🤗 BM25 Retrieval 50K (Llamaトークン)](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_50k_llama) | | 64 | 65 | ## 🍎 チュートリアル 66 | SWE-benchの様々な部分の使い方についても、以下のブログ記事を書いています。 67 | 特定のトピックについての投稿を見たい場合は、issueでお知らせください。 68 | * [2023年11月1日] SWE-Benchの評価タスクの収集について ([🔗](https://github.com/princeton-nlp/SWE-bench/tree/main/assets/collection.md)) 69 | * [2023年11月6日] SWE-benchでの評価について ([🔗](https://github.com/princeton-nlp/SWE-bench/tree/main/assets/evaluation.md)) 70 | 71 | ## 💫 貢献 72 | NLP、機械学習、ソフトウェア工学の研究コミュニティからのフィードバックを歓迎します。貢献、プルリクエスト、issueを歓迎します! 73 | そのためには、新しいプルリクエストまたはissueを提出し、それぞれのテンプレートに従って記入してください。すぐにフォローアップします! 74 | 75 | 連絡先: [Carlos E. Jimenez](http://www.carlosejimenez.com/) と [John Yang](https://john-b-yang.github.io/) (Email: {carlosej, jy1682}@princeton.edu) 76 | 77 | ## ✍️ 引用 78 | 私たちの研究が役立つと思われる場合は、以下の引用をご利用ください。 79 | ``` 80 | @inproceedings{jimenez2024swebench, 81 | title={SWE-bench: Can Language Models Resolve Real-World GitHub Issues?}, 82 | author={Carlos E. 
Jimenez and John Yang and Alexander Wettig and Shunyu Yao and Kexin Pei and Ofir Press and Karthik Narasimhan}, 83 | booktitle={The Twelfth International Conference on Learning Representations}, 84 | year={2024}, 85 | url={https://openreview.net/forum?id=VTF8yNQM66} 86 | } 87 | ``` 88 | 89 | ## 🪪 ライセンス 90 | MIT。`LICENSE.md`を確認してください。 91 | -------------------------------------------------------------------------------- /docs/20240406_devin_validate/get_devin_preds.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f7e54da5-97a7-4447-ba2b-0ad24dd3de20", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import json\n", 11 | "\n", 12 | "from glob import glob\n", 13 | "from unidiff import PatchSet" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "6383506c-3405-4344-bfdd-6008c30a8e26", 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "Cloning into 'devin-swebench-results'...\n", 27 | "remote: Enumerating objects: 582, done.\u001b[K\n", 28 | "remote: Counting objects: 100% (582/582), done.\u001b[K\n", 29 | "remote: Compressing objects: 100% (570/570), done.\u001b[K\n", 30 | "remote: Total 582 (delta 12), reused 579 (delta 9), pack-reused 0\u001b[K\n", 31 | "Receiving objects: 100% (582/582), 571.31 KiB | 6.35 MiB/s, done.\n", 32 | "Resolving deltas: 100% (12/12), done.\n", 33 | "Updating files: 100% (580/580), done.\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "!git clone git@github.com:CognitionAI/devin-swebench-results.git" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 7, 44 | "id": "0afd1c6b-88e7-4e18-b065-f035f85c34b0", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "def convert_devin_txt_to_pred(pred_file):\n", 49 | " inst_id = pred_file.split(\"/\")[-1].split(\"-diff\")[0]\n", 50 | " pred = open(pred_file).read()\n", 51 | " try:\n", 52 | " PatchSet(pred)\n", 53 | " except:\n", 54 | " print(f\"{inst_id}: Prediction patch is malformed\")\n", 55 | " return {\n", 56 | " \"model_name_or_path\": \"devin-20240406\",\n", 57 | " \"instance_id\": inst_id,\n", 58 | " \"model_patch\": pred\n", 59 | " }" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 8, 65 | "id": "f81ead15-bc56-4cc7-ba0a-de2b68473c2c", 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "570" 72 | ] 73 | }, 74 | "execution_count": 8, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "predictions = []\n", 81 | "for pred_file in \\\n", 82 | " glob(\"devin-swebench-results/output_diffs/fail/*.txt\") + \\\n", 83 | " glob(\"devin-swebench-results/output_diffs/pass/*.txt\"):\n", 84 | " predictions.append(convert_devin_txt_to_pred(pred_file))\n", 85 | "len(predictions)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 9, 91 | "id": "cf22d4d5-5ba7-4b7a-a298-676f1955da0c", 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "{'model_name_or_path': 'devin-20240406',\n", 98 | " 'instance_id': 'django__django-16745',\n", 99 | " 'model_patch': 'diff --git a/django/core/validators.py b/django/core/validators.py\\nindex 6c622f5788..7a1aff3fe5 100644\\n--- a/django/core/validators.py\\n+++ b/django/core/validators.py\\n@@ -397,8 +397,9 @@ class StepValueValidator(BaseValidator):\\n message = _(\"Ensure this 
value is a multiple of step size %(limit_value)s.\")\\n code = \"step_size\"\\n \\n- def compare(self, a, b):\\n- return not math.isclose(math.remainder(a, b), 0, abs_tol=1e-9)\\n+ def compare(self, a, b, min_value=0):\\n+ offset = a - min_value\\n+ return not math.isclose(math.remainder(offset, b), 0, abs_tol=1e-9)\\n \\n \\n @deconstructible\\n'}" 100 | ] 101 | }, 102 | "execution_count": 9, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "predictions[0]" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 10, 114 | "id": "00c2e805-cf64-4975-bd23-0b5d2be8576d", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "with open(\"devin_predictions.jsonl\", \"w\") as f:\n", 119 | " for pred in predictions:\n", 120 | " print(json.dumps(pred), file=f, flush=True)" 121 | ] 122 | } 123 | ], 124 | "metadata": { 125 | "kernelspec": { 126 | "display_name": "Python 3 (ipykernel)", 127 | "language": "python", 128 | "name": "python3" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": { 132 | "name": "ipython", 133 | "version": 3 134 | }, 135 | "file_extension": ".py", 136 | "mimetype": "text/x-python", 137 | "name": "python", 138 | "nbconvert_exporter": "python", 139 | "pygments_lexer": "ipython3", 140 | "version": "3.11.7" 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 5 145 | } 146 | -------------------------------------------------------------------------------- /docs/README_CN.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Kawi the SWE-Llama 4 | 5 |

6 | 7 |
8 | 9 | | [日本語](docs/README_JP.md) | [English](https://github.com/princeton-nlp/SWE-bench) | [中文简体](docs/README_CN.md) | [中文繁體](docs/README_TW.md) | 10 | 11 |
12 | 13 | 14 | --- 15 |

16 | 您可以在我们的ICLR 2024的论文《SWE-bench: Can Language Models Resolve Real-World GitHub Issues?》中找到我们的代码和数据 17 |
18 |
19 | 20 | Build 21 | 22 | 23 | License 24 | 25 | 26 | 27 | 28 |

29 | 30 | 请访问我们的[网站](http://swe-bench.github.io)查看公共排行榜,并查看[更改日志](https://github.com/princeton-nlp/SWE-bench/blob/master/CHANGELOG.md)以获取有关 SWE-bench 基准最新更新的信息。 31 | 32 | ## 👋 总览 33 | SWE-bench 是一个用于评估大型语言模型的基准,这些模型是从 GitHub 收集的真实软件问题。 34 | 给定一个 *代码库* 和一个 *问题*,语言模型的任务是生成一个 *补丁* 来解决所描述的问题。 35 | 36 | 37 | 38 | ## 🚀 设置 39 | 要从源代码构建 SWE-bench,请按照以下步骤操作: 40 | 1. 克隆此仓库到本地 41 | 2. `cd` 进入仓库 42 | 3. 运行 `conda env create -f environment.yml` 创建名为 `swe-bench` 的 conda 环境 43 | 4. 使用 `conda activate swe-bench` 激活环境 44 | 45 | ## 💽 使用 46 | 你可以直接下载 SWE-bench 数据集 ([开发](https://drive.google.com/uc?export=download&id=1SbOxHiR0eXlq2azPSSOIDZz-Hva0ETpX), [测试](https://drive.google.com/uc?export=download&id=164g55i3_B78F6EphCZGtgSrd2GneFyRM) 集) 或从 [HuggingFace](https://huggingface.co/datasets/princeton-nlp/SWE-bench) 下载。 47 | 要使用 SWE-Bench,你可以: 48 | * 在我们预处理的数据集上训练自己的模型 49 | * 在现有模型上运行 [推理](https://github.com/princeton-nlp/SWE-bench/blob/master/inference/) (不管是本地的模型,比如LLaMA,还是通过API访问的模型,比如GPT-4)。推理步骤是你获取一个仓库和一个问题,让模型尝试去修复它。 50 | * 对模型进行 [评估](https://github.com/princeton-nlp/SWE-bench/blob/master/harness/)。这是你拿到一个 SWE-Bench 任务和一个模型提出的解决方案,然后评估其正确性。 51 | * 在你自己的仓库上运行 SWE-bench 的 [数据收集过程](https://github.com/princeton-nlp/SWE-bench/blob/master/collect/),以创建新的 SWE-Bench 任务。 52 | 53 | ## ⬇️ 下载 54 | | 数据集 | 模型 | 55 | |----------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------| 56 | | [🤗 SWE-bench](https://huggingface.co/datasets/princeton-nlp/SWE-bench) | [🦙 SWE-Llama 13b](https://huggingface.co/princeton-nlp/SWE-Llama-13b) | 57 | | [🤗 "Oracle" Retrieval](https://huggingface.co/datasets/princeton-nlp/SWE-bench_oracle) | [🦙 SWE-Llama 13b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-13b-peft) | 58 | | [🤗 BM25 Retrieval 13K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_13K) | [🦙 SWE-Llama 7b](https://huggingface.co/princeton-nlp/SWE-Llama-7b) | 59 | | [🤗 BM25 Retrieval 27K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_27K) | [🦙 SWE-Llama 7b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-7b-peft) | 60 | | [🤗 BM25 Retrieval 40K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_40K) | | 61 | | [🤗 BM25 Retrieval 50K (Llama tokens)](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_50k_llama) | | 62 | 63 | ## 🍎 教程 64 | 我们还写了关于如何使用SWE-bench不同部分的博客文章。 65 | 如果您想看到关于特定主题的文章,请通过问题告诉我们。 66 | * [Nov 1. 2023] Collecting Evaluation Tasks for SWE-Bench ([🔗](https://github.com/princeton-nlp/SWE-bench/tree/main/assets/collection.md)) 67 | * [Nov 6. 2023] Evaluating on SWE-bench ([🔗](https://github.com/princeton-nlp/SWE-bench/tree/main/assets/evaluation.md)) 68 | 69 | ## 💫 贡献 70 | 我们欢迎来自更广泛的自然语言处理、机器学习和软件工程研究社区的反馈。我们欢迎任何贡献、PR或问题! 71 | 为此,请提交新的PR或问题,并相应地填写相应的模板。我们将尽快跟进! 72 | 73 | 联系人: [Carlos E. Jimenez](http://www.carlosejimenez.com/) 和 [John Yang](https://john-b-yang.github.io/) (Email: {carlosej, jy1682}@princeton.edu). 74 | 75 | ## ✍️ 引用 76 | 如果你觉得我们的工作有帮助,请使用以下引用。 77 | ``` 78 | @inproceedings{ 79 | jimenez2024swebench, 80 | title={{SWE}-bench: Can Language Models Resolve Real-world Github Issues?}, 81 | author={Carlos E Jimenez and John Yang and Alexander Wettig and Shunyu Yao and Kexin Pei and Ofir Press and Karthik R Narasimhan}, 82 | booktitle={The Twelfth International Conference on Learning Representations}, 83 | year={2024}, 84 | url={https://openreview.net/forum?id=VTF8yNQM66} 85 | } 86 | ``` 87 | 88 | ## 🪪 许可证 89 | MIT. 
参考 `LICENSE.md`. 90 | -------------------------------------------------------------------------------- /docs/README_TW.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Kawi the SWE-Llama 4 | 5 |

6 | 7 |
8 | 9 | | [日本語](docs/README_JP.md) | [English](https://github.com/princeton-nlp/SWE-bench) | [中文简体](docs/README_CN.md) | [中文繁體](docs/README_TW.md) | 10 | 11 |
12 | 13 | 14 | --- 15 |

16 | 你可以在我們的ICLR 2024的論文《SWE-bench: Can Language Models Resolve Real-World GitHub Issues?》中找到我們的代碼和數據 17 |
18 |
19 | 20 | Build 21 | 22 | 23 | License 24 | 25 | 26 | 27 | 28 |

29 | 30 | 請訪問我們的[網站](http://swe-bench.github.io)查看公共排行榜,並查看[更改日誌](https://github.com/princeton-nlp/SWE-bench/blob/master/CHANGELOG.md)以獲取有關 SWE-bench 基準最新更新的信息。 31 | 32 | ## 👋 縱覽 33 | SWE-bench 是一個用於評估大型語言模型的基準,這些模型是從 GitHub 收集的真實軟體問題。 34 | 給定一個 *代碼庫* 和一個 *問題*,語言模型的任務是生成一個 *修補程式* 來解決所描述的問題。 35 | 36 | 37 | 38 | ## 🚀 設置 39 | 要從源代碼構建 SWE-bench,請按照以下步驟操作: 40 | 1. 克隆此倉庫到本地 41 | 2. `cd` 進入倉庫 42 | 3. 運行 `conda env create -f environment.yml` 創建名為 `swe-bench` 的 conda 環境 43 | 4. 使用 `conda activate swe-bench` 激活環境 44 | 45 | ## 💽 使用 46 | 你可以直接下載 SWE-bench 數據集 ([開發](https://drive.google.com/uc?export=download&id=1SbOxHiR0eXlq2azPSSOIDZz-Hva0ETpX), [測試](https://drive.google.com/uc?export=download&id=164g55i3_B78F6EphCZGtgSrd2GneFyRM) 集) 或從 [HuggingFace](https://huggingface.co/datasets/princeton-nlp/SWE-bench) 下載。 47 | 要使用 SWE-Bench,你可以: 48 | * 在我們預處理的數據集上訓練自己的模型 49 | * 在現有模型上運行 [推理](https://github.com/princeton-nlp/SWE-bench/blob/master/inference/)(不管是本地的模型,比如LLaMA,還是通過API訪問的模型,比如GPT-4)。推理步驟是你獲取一個倉庫和一個問題,讓模型嘗試去修復它。 50 | * 對模型進行 [評估](https://github.com/princeton-nlp/SWE-bench/blob/master/inference/)。這是你拿到一個 SWE-Bench 任務和一個模型提出的解決方案,然後評估其正確性。 51 | * 在你自己的倉庫上運行 SWE-bench 的 [數據收集過程](https://github.com/princeton-nlp/SWE-bench/blob/master/collect/),以創建新的 SWE-Bench 任務。 52 | 53 | ## ⬇️ 下載 54 | | 數據集 | 模型 | 55 | |---------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------| 56 | | [🤗 SWE-bench](https://huggingface.co/datasets/princeton-nlp/SWE-bench) | [🦙 SWE-Llama 13b](https://huggingface.co/princeton-nlp/SWE-Llama-13b) | 57 | | [🤗 "Oracle" Retrieval](https://huggingface.co/datasets/princeton-nlp/SWE-bench_oracle) | [🦙 SWE-Llama 13b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-13b-peft) | 58 | | [🤗 BM25 Retrieval 13K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_13K) | [🦙 SWE-Llama 7b](https://huggingface.co/princeton-nlp/SWE-Llama-7b) | 59 | | [🤗 BM25 Retrieval 27K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_27K) | [🦙 SWE-Llama 7b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-7b-peft) | 60 | | [🤗 BM25 Retrieval 40K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_40K) | | 61 | | [🤗 BM25 Retrieval 50K (Llama tokens)](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_50k_llama) | | 62 | 63 | ## 🍎 教程 64 | 我們還撰寫了以下有關如何使用SWE-bench不同部分的博客文章。 65 | 如果您想看到有關特定主題的文章,請通過問題告訴我們。 66 | * [Nov 1. 2023] Collecting Evaluation Tasks for SWE-Bench ([🔗](https://github.com/princeton-nlp/SWE-bench/tree/main/assets/collection.md)) 67 | * [Nov 6. 2023] Evaluating on SWE-bench ([🔗](https://github.com/princeton-nlp/SWE-bench/tree/main/assets/evaluation.md)) 68 | 69 | ## 💫 貢獻 70 | 我們很樂意聽取來自更廣泛的 NLP、機器學習和軟體工程研究社區的意見,並歡迎任何貢獻、拉取請求或問題! 71 | 為此請提交新的拉取請求或問題,並根據相應的模板填寫。我們將盡快跟進! 72 | 73 | 聯繫人: [Carlos E. Jimenez](http://www.carlosejimenez.com/) 和 [John Yang](https://john-b-yang.github.io/) (Email: {carlosej, jy1682}@princeton.edu). 
74 | 75 | ## ✍️ 引用 76 | 如果你覺得我們的工作有幫助,請使用以下引用。 77 | ``` 78 | @inproceedings{ 79 | jimenez2024swebench, 80 | title={{SWE}-bench: Can Language Models Resolve Real-world Github Issues?}, 81 | author={Carlos E Jimenez and John Yang and Alexander Wettig and Shunyu Yao and Kexin Pei and Ofir Press and Karthik R Narasimhan}, 82 | booktitle={The Twelfth International Conference on Learning Representations}, 83 | year={2024}, 84 | url={https://openreview.net/forum?id=VTF8yNQM66} 85 | } 86 | ``` 87 | 88 | ## 🪪 授權 89 | MIT. 參考 `LICENSE.md`. 90 | -------------------------------------------------------------------------------- /swebench/inference/make_datasets/README.md: -------------------------------------------------------------------------------- 1 | # `make_datasets` 2 | The `make_datasets` sub-package is used to create datasets for SWE-bench with your own prompts, contexts, and tokenizers. 3 | The sub-package contains the following scripts: 4 | 5 | - `create_text_dataset.py` is used to create a text dataset from SWE-bench with a given prompt and context-source. 6 | - `tokenize_dataset.py` is used to tokenize a text dataset with a given tokenizer. 7 | - `bm25_retrieval.py` can be used to perform BM25 retrieval on the SWE-bench dataset. 8 | 9 | ## `create_text_dataset.py` 10 | This script is used to create a text dataset from SWE-bench with a given prompt and context-source. 11 | Prompts are defined as functions in `create_instance.py`. `style-2` and `style-3` are appropriate for API models, while only `style-2` can be used for SWE-Llama. 12 | `full_file_gen` is used for the full file generation ablation, and `style-2-edits-only` is used for the `oracle-collapsed` ablation. 13 | 14 | Here's an example of how to call the script to create a dataset with `style-3` prompts and `oracle` contexts: 15 | 16 | ```bash 17 | export GITHUB_TOKEN= 18 | python -m swebench.inference.make_datasets.create_text_dataset \ 19 | --dataset_name_or_path princeton-nlp/SWE-bench \ 20 | --output_dir ./base_datasets --prompt_style style-3 \ 21 | --file_source oracle 22 | ``` 23 | 24 | You can also specify further options: 25 | 26 | - `--splits`: To specify the dataset splits to process (default is all splits). If you want to process only the `test` split, you can use `--splits test`. 27 | - `--validation_ratio`: To specify the ratio of the training set to use for validation (default is 0.01). For example, you can use `--validation_ratio 0.05` to use 5% of the training set for validation. 28 | - `--max_context_len`: To specify the maximum number of tokens to use for context. For example, `--max_context_len 15000` will limit the context to 15000 tokens. 29 | - `--tokenizer_name`: To specify the tokenizer to use. You can choose from the available tokenizers defined in `tokenize_dataset.py`. If not specified, the default tokenizer will be used. 30 | - `--push_to_hub_user`: If you want to push the dataset to the Hugging Face Hub, you can specify your username with this option. If specified, make sure you have set your API key environment variable `HUGGING_FACE_HUB_TOKEN`. You do not need to specify `--output_dir` if you use this option. 31 | - `--retrieval_file`: If you want to use BM25 retrieval to create the dataset, you can specify the file containing the retrieval results with this option. The retrieval results should be in the format produced by `bm25_retrieval.py`. You should specify `--file_source bm25` if you use this option. 32 | 33 | The script will create a new dataset in the specified output directory. 
If you choose to push the dataset to the Hugging Face Hub, it will be available under your username. 34 | 35 | ## `tokenize_dataset.py` 36 | This script is used to tokenize a text dataset with a given tokenizer. You can choose from the available tokenizers defined in the script. The script will create a new tokenized dataset in the specified output directory. 37 | 38 | Here's an example of how to call the script to tokenize a dataset with the `llama` tokenizer: 39 | 40 | ```bash 41 | python -m swebench.inference.make_datasets.tokenize_dataset \ 42 | --dataset_name_or_path ./base_datasets/DATASET_NAME \ 43 | --output_dir ./tokenized_datasets \ 44 | --tokenizer_name llama \ 45 | --num_proc 20 46 | ``` 47 | 48 | - `--push_to_hub_user`: If you want to push the dataset to the Hugging Face Hub, you can specify your username with this option. If specified, make sure you have set your API key environment variable `HUGGING_FACE_HUB_TOKEN`. You do not need to specify `--output_dir` if you use this option. 49 | 50 | __NOTE:__ The `cl100k` tokenizer does not support multiprocessing. 51 | 52 | ## `bm25_retrieval.py` 53 | This script can be used to perform BM25 retrieval on the SWE-bench dataset. It creates a results file in the specified output directory that can be used in `create_text_dataset.py` with the `--retrieval_file` option and `--file_source bm25`. 54 | 55 | Here's an example of how to call the script to perform BM25 retrieval on the `test` split of the SWE-bench dataset: 56 | 57 | ```bash 58 | python -m swebench.inference.make_datasets.bm25_retrieval \ 59 | --dataset_name_or_path princeton-nlp/SWE-bench \ 60 | --output_dir ./retrieval_results \ 61 | --splits test 62 | ``` 63 | 64 | __NOTE:__ The script requires the `pyserini` package to be installed. See the pyserini [installation instructions](https://github.com/castorini/pyserini) for more details. 65 | 66 | 67 | ## `eval_retrieval.py` 68 | This script can be used to evaluate the BM25 retrieval results for a dataset created with `create_text_dataset.py` with the `--retrieval_file` option and `--file_source bm25`. 69 | __NOTE__: The script assumes that the `text` field in the dataset specifies files using the "\[start of filename\]" and "\[end of filename\]" tags used by the default DOCUMENT_ENCODING_FUNCTIONS in `bm25_retrieval.py`. If you change that format, you need to modify the `instance_file_pattern` in `eval_retrieval.py` accordingly. 70 | 71 | Here's an example of how to call the script to evaluate the BM25 retrieval results for a dataset: 72 | 73 | ```bash 74 | python -m swebench.inference.make_datasets.eval_retrieval \ 75 | --dataset_name_or_path princeton-nlp/SWE-bench_bm25_13K \ 76 | --split test 77 | ``` 78 | -------------------------------------------------------------------------------- /swebench/collect/make_lite/criteria.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | 4 | from unidiff import PatchSet 5 | 6 | 7 | def contains_git_commit_hash(text: str) -> bool: 8 | """ 9 | Returns True if the text contains a git commit hash (40 character SHA-1 hash). 10 | * Excludes commit hashes that are part of a URL. 11 | """ 12 | pattern_git_commit_hash = re.compile(r'(? bool: 22 | """ 23 | Returns True if the text contains a URL. Excludes URLs that are part of the repository. 
24 | """ 25 | if repo: 26 | repo_prefix = f"http://github.com/{repo}" 27 | pattern_repo = re.escape(repo_prefix) 28 | # Adding a negative lookahead assertion to ensure URLs starting with the repository prefix are excluded 29 | pattern_urls = r'(?:https?://(?!{}).+)|(?:www\.(?!{}).+)'.format(pattern_repo, pattern_repo) 30 | else: 31 | pattern_urls = r'https?://(?:www\.)?\S+' 32 | 33 | return bool(re.search(pattern_urls, text)) 34 | 35 | 36 | def contains_image(text: str) -> bool: 37 | """ 38 | Returns True if the text contains an image or video file extension. 39 | """ 40 | image_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.svg', '.webp', '.ico', '.heif', '.bpg', '.avif'] 41 | video_extensions = ['.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.mpeg'] 42 | 43 | pattern_image = '|'.join(re.escape(ext) for ext in image_extensions) 44 | pattern_video = '|'.join(re.escape(ext) for ext in video_extensions) 45 | 46 | image_regex = re.compile(r'\b({})\b'.format(pattern_image), flags=re.IGNORECASE) 47 | video_regex = re.compile(r'\b({})\b'.format(pattern_video), flags=re.IGNORECASE) 48 | 49 | return image_regex.search(text) is not None or video_regex.search(text) is not None 50 | 51 | 52 | def contains_issue_reference(text: str, repo: str) -> bool: 53 | """ 54 | Returns True if text (problem statement) contains a reference to another issue (e.g. #1234). 55 | """ 56 | # Look for GitHub style issue references 57 | pattern_issue_ref = re.compile(r"(\w+)\s+\#(\d+)") 58 | keywords = { 59 | "close", "closes", "closed", 60 | "fix", "fixes", "fixed", 61 | "resolve", "resolves", "resolved", 62 | } 63 | references = dict(pattern_issue_ref.findall(text)) 64 | if references: 65 | for word, _ in references.items(): 66 | if word.lower() in keywords: 67 | return True 68 | 69 | # Look for GitLab style issue references 70 | pattern_gitlab = re.compile(r"https?:\/\/gitlab.com\/(.*)\/issues") 71 | if re.search(pattern_gitlab, text): 72 | return True 73 | 74 | # Look for GitHub `#` style references + verify if the issue exists 75 | pattern_issue_ref = re.compile(r'#\d+') 76 | matches = pattern_issue_ref.findall(text) 77 | for match in matches: 78 | url = f"http://github.com/{repo}/issues/{match[1:]}" 79 | if repo == "django/django": 80 | url = f"https://code.djangoproject.com/ticket/{match[1:]}" 81 | if requests.get(url).status_code == 200: 82 | return True 83 | 84 | return False 85 | 86 | 87 | def contains_non_modified_files(patch_text: str) -> bool: 88 | """ 89 | Returns True if the patch contains files that are not modified. 90 | """ 91 | patch = PatchSet(patch_text) 92 | return len(patch.removed_files) > 0 or len(patch.added_files) > 0 93 | 94 | 95 | def contains_pytest_match_arg(patch_test_text: str) -> bool: 96 | """ 97 | Returns True if the test patch contains a pytest.raises() call with a match argument. 98 | """ 99 | if any([x in patch_test_text for x in [ 100 | 'pytest.raises', 101 | 'pytest.warns', 102 | 'pytest.deprecated_call', 103 | ]]): 104 | return 'match' in patch_test_text 105 | # Django style assertions: 106 | if any([x in patch_test_text for x in [ 107 | 'assertOutput', 108 | 'assertRaises', 109 | 'checks.Error', 110 | ]]): 111 | return True 112 | return False 113 | 114 | 115 | def leq_n_code_lines(patch_text: str, n: int = 25) -> bool: 116 | """ 117 | Returns True if the patch has at most n lines of code changed. 
118 | """ 119 | lines = 0 120 | patch = PatchSet(patch_text) 121 | for file in patch: 122 | for hunk in file: 123 | lines += hunk.added 124 | lines += hunk.removed 125 | return lines <= n 126 | 127 | 128 | def leq_n_files(patch_text: str, n: int = 1) -> bool: 129 | """ 130 | Returns True if the patch has at most n files. 131 | """ 132 | patch = PatchSet(patch_text) 133 | return len(patch.modified_files) <= n 134 | 135 | 136 | def leq_n_hunks(patch_text: str, n: int = 3) -> bool: 137 | """ 138 | Returns True if the patch has at most n hunks. 139 | """ 140 | patch = PatchSet(patch_text) 141 | num_hunks = sum([ 142 | len([h for h in f]) 143 | for f in patch.modified_files 144 | ]) 145 | return num_hunks <= n and num_hunks > 0 146 | 147 | 148 | def leq_n_words(text: str, n: int = 50) -> bool: 149 | """ 150 | Returns True if the text has at most n words. 151 | """ 152 | return len(text.split()) <= n 153 | -------------------------------------------------------------------------------- /assets/collection.md: -------------------------------------------------------------------------------- 1 | # Collecting Evaluation Tasks for SWE-Bench 2 | John Yang • November 1, 2023 3 | 4 | In this tutorial, we explain how to use the SWE-Bench repository to collect evaluation task instances from GitHub repositories. 5 | 6 | > SWE-bench's collection pipeline is currently designed to target PyPI packages. We hope to expand SWE-bench to more repositories and languages in the future. 7 | 8 |
9 | 10 |
11 | 12 | ## 🔍 Selecting a Repository 13 | 14 | SWE-bench constructs task instances from issues and pull requests. 15 | A good repository to source evaluation instances from should have many issues and pull requests. 16 | A point of reference for repositories that fit this bill would be the [Top PyPI packages](https://hugovk.github.io/top-pypi-packages/) website. 17 | 18 | Once you've selected a repository, use the `/collect/make_repo/make_repo.sh` script to create a mirror of the repository, like so: 19 | ```bash 20 | ./collect/make_repo/make_repo.sh scikit-learn/scikit-learn 21 | ``` 22 | 23 | ## ⛏️ Collecting Candidate Tasks 24 | 25 | Once you have cloned the repository, you can then use the `collect/get_tasks_pipeline.py` script to collect pull requests and convert them to candidate task instances. 26 | Supply the *repository name(s)* and *logging folders* as arguments to the `run_get_tasks_pipeline.sh` script, then run it like so: 27 | ```bash 28 | ./collect/run_get_tasks_pipeline.sh 29 | ``` 30 | 31 | At this point, for a repository, you should have... 32 | * A mirror clone of the repository under the [SWE-bench organization](https://github.com/orgs/swe-bench/repositories). 33 | * A `<repo>-prs.jsonl` file containing all the repository's PRs. 34 | * A `<repo>-task-instances.jsonl` file containing all the candidate task instances. 35 | 36 | ## 📙 Specify Execution Parameters 37 | 38 | This step is the most manual part of the process. 39 | To create an appropriate execution environment for task instances from a new repository, you must complete the following steps: 40 | * Assign a repository-specific *version* (e.g. `1.2`) to every task instance. 41 | * Specify repository+version-specific installation commands in `harness/constants.py`. 42 | 43 | ### Part A: Versioning 44 | Determining a version for each task instance can be accomplished in a number of ways, depending on the availability + feasibility of each approach for a given repository. 45 | * Scrape from code: A version is explicitly specified in the codebase (in `__init__.py` or `_version.py` for PyPI packages). 46 | * Scrape from web: Repositories with websites (e.g. [xarray.dev](https://xarray.dev/)) have a "Releases" or "What's New" page (e.g. the [release page](https://docs.xarray.dev/en/stable/whats-new.html) for xarray). This can be scraped for information. 47 | * Build from code: Sometimes, version-related files (e.g. `_version.py`) are purposely omitted by a developer (check `.gitignore` to verify). In this case, you can build the repository source code locally for each task instance and extract the version number from the built codebase. 48 | 49 | Examples and technical details for each are included in `/versioning/`. Please refer to them as needed. 50 | 51 | ### Part B: Installation Configurations 52 | Per repository, you must provide installation instructions per version. In `constants.py`... 53 | 1. In `MAP_VERSION_TO_INSTALL`, declare a `<repo>: MAP_VERSION_TO_INSTALL_<repo>` key/value pair. 54 | 2. Define a `MAP_VERSION_TO_INSTALL_<repo>`, where the key is a version as a string, and the value is a dictionary of installation fields that include the following information: 55 | ```python 56 | { 57 | "python": "3.x", # Required 58 | "packages": "numpy pandas tensorflow", 59 | "install": "pip install -e .", # Required 60 | "pip_packages": ["pytest"], 61 | } 62 | ``` 63 | These instructions can typically be inferred from the companion website or `CONTRIBUTING.md` doc that many open source repositories have.
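To make the Part B pattern above concrete, here is a minimal sketch of what such an entry in `harness/constants.py` might look like. The repository name (`owner/mypkg`), version strings, and package lists are illustrative placeholders, not real entries:

```python
# Sketch only: the repo name, versions, and packages below are placeholders.
MAP_VERSION_TO_INSTALL_MYPKG = {
    "1.1": {
        "python": "3.9",                # Required: Python version for the environment
        "packages": "numpy pandas",     # Optional: packages installed alongside the repo
        "install": "pip install -e .",  # Required: command that installs the repo itself
        "pip_packages": ["pytest"],     # Optional: extra pip packages (e.g. test deps)
    },
    "1.2": {
        "python": "3.10",
        "install": "pip install -e .",
    },
}

MAP_VERSION_TO_INSTALL = {
    # ... entries for other repositories ...
    "owner/mypkg": MAP_VERSION_TO_INSTALL_MYPKG,
}
```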
64 | 65 | ## ⚙️ Execution-based Validation 66 | Congrats, you got through the trickiest part! It's smooth sailing from here on out. 67 | 68 | We now need to check that the task instances install properly + that the problem solved by each task instance is non-trivial. 69 | This is taken care of by the `engine_validation.py` code. 70 | Run `./harness/run_validation.sh` and supply the following arguments: 71 | * `instances_path`: Path to versioned candidate task instances 72 | * `log_dir`: Path to folder to store task instance-specific execution logs 73 | * `temp_dir`: Path to directory to perform execution 74 | * `verbose`: Whether to print logging traces to standard output. 75 | 76 | > In practice, you may have to iterate between this step and **Installation Configurations** a couple of times. If your instructions are incorrect or under-specified, candidate task instances may not be installed properly. 77 | 78 | ## 🔄 Convert to Task Instances 79 | At this point, we have all the information necessary to determine whether task instances can be used for evaluation with SWE-bench, and to save the ones that can. 80 | 81 | We provide the `validation.ipynb` Jupyter notebook in this folder to make the remaining steps easier. 82 | At a high level, it enables the following: 83 | * In **Monitor Validation**, check the results of the `./run_validation.sh` step. 84 | * In **Get [FP]2[FP] Tests**, determine which task instances are non-trivial (solve at least one test). 85 | * In **Create Task Instances `.json` file**, perform some final preprocessing and save your task instances to a `.json` file. 86 | 87 | Thanks for reading! If you have any questions or comments about the details in this article, please feel free to follow up with an issue. 88 | -------------------------------------------------------------------------------- /docs/20240627_docker/README.md: -------------------------------------------------------------------------------- 1 | # Containerized Evaluation Harness 2 | June 27, 2024 3 | 4 | We’re releasing an update that improves the reliability of the SWE-bench evaluation harness using **containerized environments** based on Docker. 5 | 6 | In the original setup, we hypothesized that `conda` environments would be enough to enforce reproducible evaluation. 7 | In hindsight, this was underspecified. 8 | This past April, we put out [Bug Report 4/5/2024](docs/reports/20240405_eval_bug/README.md), which, among several upgrades, added explicit versioning for packages. 9 | 10 | However, SWE-bench evaluation remained sensitive to discrepancies originating from different platforms and user-specific configurations, leading to inconsistent results. 11 | To eliminate these irregularities, our new harness provisions **per-sample Docker images with Python virtual environments** that have been rigorously tested. 12 | 13 | In the new Docker harness, **99.78% (2289/2294) of SWE-bench** tasks and **100% (300/300) of SWE-bench Lite** tasks consistently resolve correctly with the ground truth solution. Furthermore, containers spawned from these images can be used as development environments for agents that run and develop solutions iteratively. 14 | 15 | ## Running Evaluation 16 | The main entrypoint for the evaluation harness is the `swebench.harness.run_evaluation` module. 17 | 18 | Run the following command to see the available arguments: 19 | ```bash 20 | python -m swebench.harness.run_evaluation -h 21 | ``` 22 | 23 | This module runs docker containers for each evaluation instance in parallel.
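As a point of reference, a full invocation might look like the following sketch; the dataset name, split, predictions path, worker count, and run ID here are placeholders to adapt to your own setup:

```bash
python -m swebench.harness.run_evaluation \
    --dataset_name princeton-nlp/SWE-bench_Lite \
    --split test \
    --predictions_path ./my_predictions.jsonl \
    --max_workers 8 \
    --cache_level env \
    --run_id my_eval_run
```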
24 | In the process of running the evaluation, the harness will: 25 | 1. Build a base image that installs basic dependencies for all instances 26 | 2. Build "environment" images that initialize the Python environment for various configurations that are common to multiple instances (in total there are about 60 of these - or 100GB of images) 27 | 3. Build "instance" images that install the specific dependencies and source code for each instance 28 | 29 | The harness will then run the evaluation script in each instance container and collect the results. 30 | After the evaluation is complete, the harness will clean up the containers and images depending on the `--cache_level` argument. 31 | 32 | ## Choosing the right `cache_level` 33 | Since the harness builds images for each instance, it can be time-consuming to rebuild these images every time you run an evaluation. 34 | We provide a `cache_level` argument to control how the harness caches images between runs. 35 | By default, the harness `cache_level` is set to `env`, which means that the harness will store the base and environment images, but not the instance images. 36 | In this setting, the base and environment images will be reused across runs, but take up about 100GB of disk space. 37 | At the time of release, we require about 120GB of free disk space to run the harness with any `cache_level`. 38 | For most users, this is the recommended setting, as it provides a good balance between evaluation speed and disk space usage. 39 | 40 | For users who want the fastest possible evaluation times, we recommend setting `cache_level` to `instance`. 41 | In this setting, the harness will store images for all instances, making evaluation extremely fast. 42 | However, all base, environment, and instance images will be stored, taking up about 2,000GB of disk space. 43 | While this setting is the fastest, it is also extremely disk-space intensive. 44 | 45 | For users who want to minimize disk space usage, we recommend setting `cache_level` to `base` or `none`, which will remove all the instance and environment images after each run. 46 | Note that, at this time, this setting still requires about 100GB of disk space to store the base and environment images when first building them. 47 | 48 | ## Choosing the right `max_workers` 49 | The harness runs instances in parallel using the `max_workers` argument. 50 | Since the harness uses the docker daemon to run instances, the number of workers should be chosen based on the resources available on your machine. 51 | In general, we don't recommend using a very large number of workers, as this can slow down the evaluation process. 52 | Regardless of your CPU count, we recommend using fewer than 28 workers. 53 | 54 | On a 16-core machine with `max_workers=12`, it should be possible to run evaluation on SWE-bench Lite in about 30 minutes when using the `env` cache level and under 15 minutes when using the `instance` cache level. 55 | 56 | On an 8-core machine with `max_workers=6`, it should be possible to run evaluation on SWE-bench Lite in about 50 minutes when using the `env` cache level and about 15 minutes when using the `instance` cache level. 57 | 58 | Using a much larger number of workers will likely slow down the evaluation process. 59 | 60 | ## Future Steps 61 | We'd like to soon make the harness even more user-friendly by providing pre-built docker images that include verified starting points for each instance.
62 | 63 | We're also hoping to better enable evaluation via orchestration tools like Kubernetes, which would allow users to run evaluations on larger clusters of machines. 64 | 65 | We're providing experimental support for running evaluations on `arm64` machines, but this is still in the early stages of development. 66 | Users may experience substantial speed degradation when running evaluations on `arm64` machines. 67 | 68 | ## Deliverables 69 | * Please use `swebench>=2.0` for the latest version of the benchmark - the old version is now deprecated but can still be accessed using `swebench<2.0`. 70 | 71 | ## Acknowledgements 72 | This work was done in collaboration with the Preparedness team at OpenAI (including Oliver Jaffe, Chan Jun Shern, James Aung, Giulio Starace, Dane Sherburn, and Neil Chowdhury). 73 | 74 | We'd also like to thank Cognition Labs for providing [inspiration](https://github.com/CognitionAI/devin-swebench-results/tree/main) in the design of the evaluation harness. 75 | 76 | ✍️ Carlos & John -------------------------------------------------------------------------------- /scripts/eval/eval_infer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROCESS_FILEPATH=$1 4 | if [ -z "$PROCESS_FILEPATH" ]; then 5 | echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh [instance_id] [dataset_name] [split]" 6 | exit 1 7 | fi 8 | 9 | if [ ! -f $PROCESS_FILEPATH ]; then 10 | echo "Error: $PROCESS_FILEPATH is not a file" 11 | exit 1 12 | fi 13 | 14 | # If instance_id is empty, it means we want to eval on the whole $PROCESS_FILEPATH 15 | # otherwise, we want to eval on the instance_id 16 | INSTANCE_ID=$2 17 | DATASET_NAME=${3:-"swe-train/swe-train-v0"} 18 | SPLIT=${4:-"train"} 19 | 20 | echo "INSTANCE_ID: $INSTANCE_ID" 21 | echo "DATASET_NAME: $DATASET_NAME" 22 | echo "SPLIT: $SPLIT" 23 | 24 | PROCESS_FILEPATH=$(realpath $PROCESS_FILEPATH) 25 | FILE_DIR=$(dirname $PROCESS_FILEPATH) 26 | FILE_NAME=$(basename $PROCESS_FILEPATH) 27 | 28 | echo "Evaluating $FILE_NAME @ $FILE_DIR" 29 | 30 | # ================================================ 31 | # detect whether PROCESS_FILEPATH is in OD format or in SWE-bench format 32 | echo "==============================================================" 33 | echo "Detecting whether PROCESS_FILEPATH is in OD format or in SWE-bench format" 34 | echo "==============================================================" 35 | # SWE-bench format is a JSONL where every line has three fields: model_name_or_path, instance_id, and model_patch 36 | function is_swebench_format() { 37 | # Read the first line of the file 38 | read -r first_line < "$PROCESS_FILEPATH" 39 | 40 | # Use jq to check if the first line has the required fields 41 | echo "$first_line" | jq -e '. | has("model_name_or_path") and has("instance_id") and has("model_patch")' > /dev/null 42 | 43 | if [ $? -ne 0 ]; then 44 | return 1 # Return 1 if the first line does not have the required fields 45 | fi 46 | 47 | return 0 # Return 0 if the first line has the required fields 48 | } 49 | # Call the function with the file path 50 | is_swebench_format "$PROCESS_FILEPATH" 51 | IS_SWEBENCH_FORMAT=$? 52 | # Use the result in an if-else statement 53 | if [ $IS_SWEBENCH_FORMAT -eq 0 ]; then 54 | echo "The file IS in SWE-bench format." 55 | SWEBENCH_FORMAT_JSONL=$PROCESS_FILEPATH 56 | else 57 | echo "The file IS NOT in SWE-bench format." 
58 | 59 | # ==== Convert OD format to SWE-bench format ==== 60 | echo "Merged output file with fine-grained report will be saved to $FILE_DIR" 61 | python3 scripts/eval/convert_od_output_to_swe_json.py $PROCESS_FILEPATH 62 | # replace .jsonl with .swebench.jsonl in filename 63 | SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl} 64 | echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL" 65 | # assert that the file exists 66 | if [ ! -f $SWEBENCH_FORMAT_JSONL ]; then 67 | echo "Error: $SWEBENCH_FORMAT_JSONL does not exist. There is probably an error in the conversion process." 68 | exit 1 69 | fi 70 | SWEBENCH_FORMAT_JSONL=$(realpath $SWEBENCH_FORMAT_JSONL) 71 | fi 72 | # ================================================ 73 | 74 | echo "==============================================================" 75 | echo "Running SWE-bench evaluation" 76 | echo "==============================================================" 77 | 78 | RUN_ID=$(date +"%Y%m%d_%H%M%S") 79 | N_PROCESS=16 80 | 81 | if [ -z "$INSTANCE_ID" ]; then 82 | echo "Running SWE-bench evaluation on the whole input file..." 83 | # Default to SWE-Bench-lite 84 | # change `--dataset_name` and `--split` to alter dataset 85 | 86 | python -m swebench.harness.run_evaluation \ 87 | --dataset_name "$DATASET_NAME" \ 88 | --split "$SPLIT" \ 89 | --predictions_path $SWEBENCH_FORMAT_JSONL \ 90 | --timeout 1800 \ 91 | --cache_level instance \ 92 | --max_workers $N_PROCESS \ 93 | --run_id $RUN_ID 94 | 95 | # get the "model_name_or_path" from the first line of the SWEBENCH_FORMAT_JSONL 96 | MODEL_NAME_OR_PATH=$(jq -r '.model_name_or_path' $SWEBENCH_FORMAT_JSONL | head -n 1) 97 | echo "MODEL_NAME_OR_PATH: $MODEL_NAME_OR_PATH" 98 | 99 | RESULT_OUTPUT_DIR=$(dirname $SWEBENCH_FORMAT_JSONL) 100 | echo "RESULT_OUTPUT_DIR: $RESULT_OUTPUT_DIR" 101 | 102 | # move the eval results to the target directory 103 | mkdir -p $RESULT_OUTPUT_DIR 104 | # rm eval_outputs directory if it exists 105 | if [ -d $RESULT_OUTPUT_DIR/eval_outputs ]; then 106 | rm -rf $RESULT_OUTPUT_DIR/eval_outputs 107 | fi 108 | 109 | mv run_instance_logs/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR 110 | mv $RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR/eval_outputs 111 | echo "RUN_ID: $RUN_ID" > $RESULT_OUTPUT_DIR/run_id.txt 112 | 113 | # move report file 114 | REPORT_PATH=$MODEL_NAME_OR_PATH.$RUN_ID.json 115 | if [ -f $REPORT_PATH ]; then 116 | # check if $RESULT_OUTPUT_DIR/report.json exists 117 | if [ -f $RESULT_OUTPUT_DIR/report.json ]; then 118 | echo "Report file $RESULT_OUTPUT_DIR/report.json already exists. Overwriting..." 
119 | if [ -f $RESULT_OUTPUT_DIR/report.json.bak ]; then 120 | rm $RESULT_OUTPUT_DIR/report.json.bak 121 | fi 122 | mv $RESULT_OUTPUT_DIR/report.json $RESULT_OUTPUT_DIR/report.json.bak 123 | fi 124 | 125 | mv $REPORT_PATH $RESULT_OUTPUT_DIR/report.json 126 | fi 127 | 128 | python scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH 129 | 130 | else 131 | echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID" 132 | python -m swebench.harness.run_evaluation \ 133 | --dataset_name "$DATASET_NAME" \ 134 | --split "$SPLIT" \ 135 | --predictions_path $SWEBENCH_FORMAT_JSONL \ 136 | --timeout 1800 \ 137 | --instance_ids $INSTANCE_ID \ 138 | --cache_level instance \ 139 | --max_workers $N_PROCESS \ 140 | --run_id $RUN_ID 141 | fi 142 | -------------------------------------------------------------------------------- /swebench/collect/check_validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "from collections import defaultdict\n", 11 | "import os\n", 12 | "import pandas as pd\n", 13 | "from tqdm import tqdm" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "swebench_data_path = \"\"\n", 23 | "log_root = \"\"\n", 24 | "validation_logs = defaultdict(lambda: defaultdict(set))\n", 25 | "swebench_data = pd.read_json(swebench_data_path, lines=True, orient=\"records\")\n", 26 | "swebench_data_dict = {d[\"instance_id\"]: d for d in swebench_data.to_dict(orient=\"records\")}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# parse test results\n", 36 | "for name in [\"gold\", \"empty\"]:\n", 37 | " log_path = os.path.join(log_root, name)\n", 38 | " log_dirs = os.listdir(log_path)\n", 39 | " \n", 40 | " total_logs = len(log_dirs)\n", 41 | " missing_logs = 0\n", 42 | " print(f\"Processing [{name}] logs\")\n", 43 | " pbar = tqdm(total=total_logs)\n", 44 | " \n", 45 | " for log_dir in log_dirs:\n", 46 | " log_file = os.path.join(log_path, log_dir, \"report.json\")\n", 47 | " pbar.update(1)\n", 48 | " if not os.path.exists(log_file):\n", 49 | " missing_logs += 1\n", 50 | " pbar.set_postfix({\"missing\": missing_logs, \"total\": total_logs})\n", 51 | " continue\n", 52 | " with open(log_file, \"r\") as f:\n", 53 | " log = json.load(f)\n", 54 | " instance_id = list(log.keys())[0]\n", 55 | " if \"tests_status\" in log[instance_id]:\n", 56 | " validation_logs[instance_id][f\"{name}-pass\"] = set(log[instance_id][\"tests_status\"][\"PASS\"])\n", 57 | " validation_logs[instance_id][f\"{name}-fail\"] = set(log[instance_id][\"tests_status\"][\"FAIL\"])" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# From Swe-Bench: we need \"at least one test where it changes from fail to pass\n", 67 | "validated_instances = []\n", 68 | "n_total = 0\n", 69 | "n_validated = 0\n", 70 | "pbar = tqdm(total=len(validation_logs))\n", 71 | "for k, log in validation_logs.items():\n", 72 | " fail_to_pass = log[\"gold-pass\"] & log[\"empty-fail\"]\n", 73 | " pass_to_pass = log[\"gold-pass\"] & log[\"empty-pass\"] \n", 74 | " n_total += 1\n", 75 | " if len(fail_to_pass) > 0:\n", 76 | " n_validated += 1\n", 77 | " # print(f\"{k} has test that changes from 
fail to pass\")\n", 78 | " validated_instances.append(swebench_data_dict[k])\n", 79 | " validated_instances[-1]['FAIL_TO_PASS'] = list(fail_to_pass)\n", 80 | " validated_instances[-1]['PASS_TO_PASS'] = list(pass_to_pass)\n", 81 | " pbar.update(1)\n", 82 | " pbar.set_postfix({\"validated\": n_validated, \"total\": n_total})" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "from datasets import Dataset, Value, Sequence, Features\n", 92 | "KEYS = [\n", 93 | " 'repo',\n", 94 | " 'pull_number',\n", 95 | " 'instance_id',\n", 96 | " 'issue_numbers',\n", 97 | " 'base_commit',\n", 98 | " 'patch',\n", 99 | " 'test_patch',\n", 100 | " 'problem_statement',\n", 101 | " 'hints_text',\n", 102 | " 'created_at',\n", 103 | " 'version',\n", 104 | " 'PASS_TO_PASS',\n", 105 | " 'FAIL_TO_PASS',\n", 106 | "]\n", 107 | "# We need to define feature to make sure the dataset is consistent with the huggingface dataset on the hub\n", 108 | "FEATURES = Features({\n", 109 | " 'repo': Value(dtype='string', id=None),\n", 110 | " 'pull_number': Value(dtype='int64', id=None),\n", 111 | " 'instance_id': Value(dtype='string', id=None),\n", 112 | " 'issue_numbers': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),\n", 113 | " 'base_commit': Value(dtype='string', id=None),\n", 114 | " 'patch': Value(dtype='string', id=None),\n", 115 | " 'test_patch': Value(dtype='string', id=None),\n", 116 | " 'problem_statement': Value(dtype='string', id=None),\n", 117 | " 'hints_text': Value(dtype='string', id=None),\n", 118 | " 'created_at': Value(dtype='string', id=None),\n", 119 | " 'version': Value(dtype='string', id=None),\n", 120 | " 'PASS_TO_PASS': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),\n", 121 | " 'FAIL_TO_PASS': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)\n", 122 | "})\n", 123 | "def to_hf_dataset(data_list):\n", 124 | " return Dataset.from_dict({k: [d[k] for d in data_list] for k in KEYS}, features=FEATURES)\n", 125 | "validated_dataset = to_hf_dataset(validated_instances)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "validated_dataset.push_to_hub(\"\", split=\"\", private=True)" 135 | ] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "swebench", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.9.19" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 2 159 | } 160 | -------------------------------------------------------------------------------- /swebench/collect/get_tasks_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Script to collect pull requests and convert them to candidate task instances""" 4 | 5 | import argparse, os 6 | import traceback 7 | 8 | from dotenv import load_dotenv 9 | from multiprocessing import Pool 10 | from swebench.collect.build_dataset import main as build_dataset 11 | from swebench.collect.print_pulls import main as print_pulls 12 | 13 | 14 | load_dotenv() 15 | 16 | 17 | def split_instances(input_list: list, n: int) 
-> list: 18 | """ 19 | Split a list into n approximately equal length sublists 20 | 21 | Args: 22 | input_list (list): List to split 23 | n (int): Number of sublists to split into 24 | Returns: 25 | result (list): List of sublists 26 | """ 27 | avg_length = len(input_list) // n 28 | remainder = len(input_list) % n 29 | result, start = [], 0 30 | 31 | for i in range(n): 32 | length = avg_length + 1 if i < remainder else avg_length 33 | sublist = input_list[start : start + length] 34 | result.append(sublist) 35 | start += length 36 | 37 | return result 38 | 39 | 40 | def construct_data_files(data: dict): 41 | """ 42 | Logic for combining multiple .all PR files into a single fine tuning dataset 43 | 44 | Args: 45 | data (dict): Dictionary containing the following keys: 46 | repos (list): List of repositories to retrieve instruction data for 47 | path_prs (str): Path to save PR data files to 48 | path_tasks (str): Path to save task instance data files to 49 | token (str): GitHub token to use for API requests 50 | """ 51 | repos, path_prs, path_tasks, max_pulls, cutoff_date, token = ( 52 | data["repos"], 53 | data["path_prs"], 54 | data["path_tasks"], 55 | data["max_pulls"], 56 | data["cutoff_date"], 57 | data["token"], 58 | ) 59 | for repo in repos: 60 | repo = repo.strip(",").strip() 61 | repo_name = repo.split("/")[1] 62 | try: 63 | path_pr = os.path.join(path_prs, f"{repo_name}-prs.jsonl") 64 | if cutoff_date: 65 | path_pr = path_pr.replace(".jsonl", f"-{cutoff_date}.jsonl") 66 | if not os.path.exists(path_pr): 67 | print(f"Pull request data for {repo} not found, creating...") 68 | print_pulls( 69 | repo, 70 | path_pr, 71 | token, 72 | max_pulls=max_pulls, 73 | cutoff_date=cutoff_date 74 | ) 75 | print(f"✅ Successfully saved PR data for {repo} to {path_pr}") 76 | else: 77 | print(f"📁 Pull request data for {repo} already exists at {path_pr}, skipping...") 78 | 79 | path_task = os.path.join(path_tasks, f"{repo_name}-task-instances.jsonl") 80 | if not os.path.exists(path_task): 81 | print(f"Task instance data for {repo} not found, creating...") 82 | build_dataset(path_pr, path_task, token) 83 | print(f"✅ Successfully saved task instance data for {repo} to {path_task}") 84 | else: 85 | print(f"📁 Task instance data for {repo} already exists at {path_task}, skipping...") 86 | except Exception as e: 87 | print("-"*80) 88 | print(f"Something went wrong for {repo}, skipping: {e}") 89 | print("Here is the full traceback:") 90 | traceback.print_exc() 91 | print("-"*80) 92 | 93 | 94 | def main( 95 | repos: list, 96 | path_prs: str, 97 | path_tasks: str, 98 | max_pulls: int = None, 99 | cutoff_date: str = None, 100 | ): 101 | """ 102 | Spawns multiple threads given multiple GitHub tokens for collecting fine tuning data 103 | 104 | Args: 105 | repos (list): List of repositories to retrieve instruction data for 106 | path_prs (str): Path to save PR data files to 107 | path_tasks (str): Path to save task instance data files to 108 | cutoff_date (str): Cutoff date for PRs to consider in format YYYYMMDD 109 | """ 110 | path_prs, path_tasks = os.path.abspath(path_prs), os.path.abspath(path_tasks) 111 | print(f"Will save PR data to {path_prs}") 112 | print(f"Will save task instance data to {path_tasks}") 113 | print(f"Received following repos to create task instances for: {repos}") 114 | 115 | tokens = os.getenv("GITHUB_TOKENS") 116 | if not tokens: raise Exception("Missing GITHUB_TOKENS, consider rerunning with GITHUB_TOKENS=$(gh auth token)") 117 | tokens = tokens.split(",") 118 | data_task_lists = 
split_instances(repos, len(tokens)) 119 | 120 | data_pooled = [ 121 | { 122 | "repos": repos, 123 | "path_prs": path_prs, 124 | "path_tasks": path_tasks, 125 | "max_pulls": max_pulls, 126 | "cutoff_date": cutoff_date, 127 | "token": token 128 | } 129 | for repos, token in zip(data_task_lists, tokens) 130 | ] 131 | 132 | with Pool(len(tokens)) as p: 133 | p.map(construct_data_files, data_pooled) 134 | 135 | 136 | if __name__ == "__main__": 137 | parser = argparse.ArgumentParser(description=__doc__) 138 | parser.add_argument( 139 | "--repos", nargs="+", help="List of repositories (e.g., `sqlfluff/sqlfluff`) to create task instances for" 140 | ) 141 | parser.add_argument( 142 | "--path_prs", type=str, help="Path to folder to save PR data files to" 143 | ) 144 | parser.add_argument( 145 | "--path_tasks", 146 | type=str, 147 | help="Path to folder to save task instance data files to", 148 | ) 149 | parser.add_argument( 150 | "--max_pulls", 151 | type=int, 152 | help="Maximum number of pulls to log", 153 | default=None 154 | ) 155 | parser.add_argument( 156 | "--cutoff_date", 157 | type=str, 158 | help="Cutoff date for PRs to consider in format YYYYMMDD", 159 | default=None, 160 | ) 161 | args = parser.parse_args() 162 | main(**vars(args)) 163 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to the PyPI package for SWE-bench ([`swebench`](https://pypi.org/project/swebench/)) will be documented in this file. 4 | 5 | Prior to version 1.1.0, not all deployed versions are listed, as the PyPI package was going through development and testing. The noteworthy versions and the respective changes that were introduced by that version are included. All versions 1.1.0 onwards are fully listed. 6 | 7 | ## [2.0.12] - 7/21/2024 8 | * Minor naming changes 9 | * #186 fix: correct some typings and a incorrect function call 10 | * #183 Fix timeout 11 | * #178 Add schema version to report card 12 | * #177 Fix run live scripts 13 | 14 | ## [2.0.9] - 7/10/2024 15 | * #176 Move inference to swebench.inference sub-package 16 | * #175 Fix link in collect README.md 17 | 18 | ## [2.0.8] - 7/8/2024 19 | * Add `cutoff_date`, `max_pulls` arguments to collection pipeline 20 | * Minor Django issue comment parsing logic 21 | * Rewritten `extract_patches` logic 22 | * Remove `MAP_REPO_TO_TEST_FRAMEWORK` symbol 23 | 24 | ## [2.0.4] - 7/5/2024 25 | * #173 Fix: Allow to set GH token from env var in collect/print_pulls 26 | * #171 Don't let tox install a virtualenv during evaluation 27 | * #169 Handle failures because of None/empty patches 28 | 29 | ## [2.0.3] - 7/2/2024 30 | * #149 Interface fix: run_id is required 31 | * #151 Fix: Support JSON datasets (avoid loading json twice) 32 | * #152 Add very simple CI 33 | * #153 Various nitpicks 34 | * #155 Fix link to collection tutorial 35 | * #161 Fix path to image in docs 36 | * #162 Fix evaluation hanging issue and improve patch apply 37 | * #164 Fix so it doesn't crash when no env imgs to build 38 | * #166 Fix newline outputs for django's log parser 39 | * #168 Update reporting and skip empty model patch predictions 40 | 41 | ## [2.0.0] - 6/27/2024 42 | Major release - the SWE-bench evaluation harness has been upgraded to incorporate containerized, sandboxed execution environments based on Docker. 
There are several chances to the API resulting from this: 43 | * Removal of the `swebench.metrics` module 44 | * Updates to the API of `swebench.harness` functionality 45 | * Significant modifications to underlying evaluation logic 46 | * Minor updates to installation specifications for different repos + versions. 47 | 48 | Read the full report [here](https://github.com/princeton-nlp/SWE-bench/tree/main/docs/20240627_docker) 49 | 50 | ## [1.1.5] - 5/15/2024 51 | * Add support for HumanEvalFix (Python, JS, Go, Java) ([source](https://huggingface.co/datasets/bigcode/humanevalpack)) 52 | 53 | ## [1.1.0] - 4/15/2024 54 | * Add `env_vars_test` field to allow for environment variable assignment for testing scripts. 55 | * Change `pip_packages` installation specification to be a list instead of a string. 56 | * Define PyPI package versioning explicitly for dev, test repos. 57 | * Fix versioning for `astroid` dependency in `pylint` installation script`. 58 | * Fix minor error in `parse_log_pytest_options`. 59 | * Improve clarity + succinctness of logging. 60 | * Make logging of subprocess args to log file smaller. 61 | * Remove installation specifications for `dbt-core`, `transformers`. 62 | * Remove redundant declaration of constants. 63 | * Remove unused versions from installation specifications for dev, test repos. 64 | * Rewrite `swebench.metrics.get_model_report`. 65 | 66 | ## [1.0.5] - 4/7/2024 67 | * Fix log parsing for `pydicom`, `pylint`, and `requests` libraries. [5cb448](https://github.com/princeton-nlp/SWE-bench/commit/5cb448140a8cd05490650b0671d860765180f26c) 68 | 69 | ## [1.0.4] - 4/5/2024 70 | * Fixed `env_list` parsing. [5be59d](https://github.com/princeton-nlp/SWE-bench/commit/5be59d665233ffb63b9beb30b2740cc41098e51f) 71 | * Updated `ExecWrapper`, `LogWrapper` logic for harness. [231a2b](https://github.com/princeton-nlp/SWE-bench/commit/231a2b205c5ca9ddcb126b73b22667d79e1b6108) 72 | 73 | ## [1.0.2] - 4/2/2024 74 | * Added `try/catch` around `lsof` based clean up for `run_evaluation.py`. [3fb217](https://github.com/princeton-nlp/SWE-bench/commit/3fb2179a5c69737465f916898e8708adffff9914) 75 | * Fixed `get_eval_refs` function. [12a287](https://github.com/princeton-nlp/SWE-bench/commit/12a287a9591cb4a0d65483f0c8bfaa3375285bfc) 76 | * Fixed `seaborn` log parser. [0372b6](https://github.com/princeton-nlp/SWE-bench/commit/0372b6a9ff62516067fb26f602163c231d818163) 77 | 78 | ## [1.0.1] - 3/31/2024 79 | First working version. We strongly recommend not using versions older than this one. 80 | * Added logging for failed installations. [58d24d](https://github.com/princeton-nlp/SWE-bench/commit/58d24d1b65b95ed96d57805604aca7adca49861d) 81 | * Added missing `datasets` dependency. [68e89e](https://github.com/princeton-nlp/SWE-bench/commit/68e89ef8d099ca5c23a8fd5681e3f990cf729fd6) 82 | * Reorganized repository to be directly build-able as a PyPI package. [548bdb](https://github.com/princeton-nlp/SWE-bench/commit/548bdbffb2ac5f0a09c1d7eb95bbee1bce126233) 83 | 84 | ## [0.6.9 - 0.6.9.2] - 3/31/2024 85 | > ⚠️ Do NOT use these versions. The PyPI package for these versions was under development. Specifically, some of the evaluation configurations required re-validation. A detailed report for the failures and our recovery from it are detailed in [Bug Report 4/5/2024](docs/reports/20240405_eval_bug/README.md). 86 | 87 | ## [0.6.1] - 3/14/2023 88 | * Added minor conditions to make `run_evaluation` more robust (e.g. 
exit on empty predictions) 89 | * Added logic that conditions conda link download based on which architecture/platform (e.g. x86, arm) the code is being run on. 90 | * Added classes to unify `subprocess` execution arguments + make them more consistent throughout the codebase. Also remove `shell=True` flag when not necessary. 91 | * Added deterministic hashing of model name when creating certain testbed paths, defends against https://github.com/conda/conda/issues/12250 92 | * Fixed key errors across the `metrics/` folder. 93 | * Reorganized `harness` code. Moved constants into a separate file to improve readability. 94 | 95 | ## [0.4.8] - 11/8/2023 96 | * `run_evaluation` can be imported to make running the evaluation harness of SWE-bench more accessible. 97 | * Add condition in `harness/context_manager.py` to skip installation if no instructions are provided. 98 | * Add functionality to check and remove logs with `AttributeError` or `ImportError` 99 | * Add support for HumanEval dataset. 100 | * Add support for relative paths for `log_dir` and `testbed` arguments of evaluation. 101 | * Minor renaming for `metrics/report.py` variables. 102 | 103 | ## [0.4.3] - 11/5/2023 104 | Introducing the initial release of SWE-Bench, a novel benchmark that introduces "software engineering as a task". Given a codebase and an issue, a model is tasked with writing a `.patch` file that addresses the desired changes. 105 | 106 | Please view the `README.md` for information on how to run the repository, and check out our paper, [SWE-bench: Can Language Models Resolve Real-World GitHub Issues?](https://arxiv.org/abs/2310.06770), for full details on the project. 107 | 108 | We will maintain a leaderboard on the SWE-bench public [website](http://swe-bench.github.io). We will release details soon on how to submit your generations for evaluation to be included on the leaderboard. 109 | 110 | ## [< 0.4.3] - 11/4/2023 111 | > ⚠️ Do NOT use these versions. The PyPI package was under development for these versions and will not work properly. 
-------------------------------------------------------------------------------- /swebench/collect/build_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import json 5 | import logging 6 | import os 7 | from typing import Optional 8 | 9 | from swebench.collect.utils import ( 10 | extract_patches, 11 | extract_problem_statement_and_hints, 12 | Repo, 13 | ) 14 | 15 | logging.basicConfig( 16 | level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 17 | ) 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def create_instance(repo: Repo, pull: dict) -> dict: 22 | """ 23 | Create a single task instance from a pull request, where task instance is: 24 | 25 | { 26 | repo (str): owner/repo this task instance is from, 27 | pull_number (int): number of PR this task instance is from, 28 | base_commit (str): SHA of the base commit PR is based on, 29 | patch (str): reference solution as .patch (apply to base commit), 30 | test_patch (str): test suite as .patch (apply to base commit), 31 | } 32 | """ 33 | patch, test_patch = extract_patches(pull, repo) 34 | problem_statement, hints = extract_problem_statement_and_hints(pull, repo) 35 | return { 36 | "repo": repo.repo.full_name, 37 | "pull_number": pull["number"], 38 | "instance_id": (repo.repo.full_name + "-" + str(pull["number"])).replace( 39 | "/", "__" 40 | ), 41 | "issue_numbers": pull["resolved_issues"], 42 | "base_commit": pull["base"]["sha"], 43 | "patch": patch, 44 | "test_patch": test_patch, 45 | "problem_statement": problem_statement, 46 | "hints_text": hints, 47 | "created_at": pull["created_at"], 48 | } 49 | 50 | 51 | def is_valid_pull(pull: dict) -> bool: 52 | """ 53 | Check whether PR has an associated issue and is merged 54 | 55 | Args: 56 | pull (dict): pull request object 57 | Returns: 58 | bool: whether PR is valid 59 | """ 60 | if pull["merged_at"] is None: 61 | return False 62 | if "resolved_issues" not in pull or len(pull["resolved_issues"]) < 1: 63 | return False 64 | return True 65 | 66 | 67 | def is_valid_instance(instance: dict) -> bool: 68 | """ 69 | Check whether task instance has all required fields for task instance creation 70 | 71 | Args: 72 | instance (dict): task instance object 73 | Returns: 74 | bool: whether task instance is valid 75 | """ 76 | if instance["patch"] is None or instance["patch"] == "": 77 | return False 78 | if instance["problem_statement"] is None or instance["problem_statement"] == "": 79 | return False 80 | return True 81 | 82 | 83 | def has_test_patch(instance: dict) -> bool: 84 | """ 85 | Check whether task instance has a test suite 86 | 87 | Args: 88 | instance (dict): task instance object 89 | Returns: 90 | bool: whether task instance has a test suite 91 | """ 92 | if instance["test_patch"] is None or instance["test_patch"].strip() == "": 93 | return False 94 | return True 95 | 96 | 97 | def main(pr_file: str, output: str, token: Optional[str] = None): 98 | """ 99 | Main thread for creating task instances from pull requests 100 | 101 | Args: 102 | pr_file (str): path to pull request JSONL file 103 | output (str): output file name 104 | token (str): GitHub token 105 | """ 106 | if token is None: 107 | # Get GitHub token from environment variable if not provided 108 | token = os.environ.get("GITHUB_TOKEN") 109 | 110 | def load_repo(repo_name): 111 | # Return repo object for a given repo name 112 | owner, repo = repo_name.split("/") 113 | return Repo(owner, repo, 
token=token) 114 | 115 | repos = dict() 116 | completed = 0 117 | with_tests = 0 118 | total_instances = 0 119 | all_output = output + ".all" 120 | seen_prs = set() 121 | 122 | # Continue where we left off if output file already exists 123 | if os.path.exists(all_output): 124 | with open(all_output) as f: 125 | for line in f: 126 | pr = json.loads(line) 127 | if "instance_id" not in pr: 128 | pr["instance_id"] = ( 129 | pr["repo"] + "-" + str(pr["pull_number"]) 130 | ).replace("/", "__") 131 | instance_id = pr["instance_id"] 132 | seen_prs.add(instance_id) 133 | if is_valid_instance(pr): 134 | completed += 1 135 | if has_test_patch(pr): 136 | with_tests += 1 137 | logger.info(f"Will skip {len(seen_prs)} pull requests that have already been inspected") 138 | 139 | # Write to .all file for all PRs 140 | write_mode_all = "w" if not os.path.exists(all_output) else "a" 141 | with open(all_output, write_mode_all) as all_output: 142 | # Write to output file for PRs with test suites 143 | write_mode = "w" if not os.path.exists(output) else "a" 144 | with open(output, write_mode) as output: 145 | for ix, line in enumerate(open(pr_file)): 146 | total_instances += 1 147 | pull = json.loads(line) 148 | if ix % 100 == 0: 149 | logger.info( 150 | f"[{pull['base']['repo']['full_name']}] (Up to {ix} checked) " 151 | f"{completed} valid, {with_tests} with tests." 152 | ) 153 | # Construct instance fields 154 | instance_id = ( 155 | pull["base"]["repo"]["full_name"] + "-" + str(pull["number"]) 156 | ) 157 | instance_id = instance_id.replace("/", "__") 158 | if instance_id in seen_prs: 159 | seen_prs -= {instance_id} 160 | continue 161 | if not is_valid_pull(pull): 162 | # Throw out invalid PRs 163 | continue 164 | # Create task instance 165 | repo_name = pull["base"]["repo"]["full_name"] 166 | if repo_name not in repos: 167 | repos[repo_name] = load_repo(repo_name) 168 | repo = repos[repo_name] 169 | instance = create_instance(repo, pull) 170 | if is_valid_instance(instance): 171 | # If valid, write to .all output file 172 | print( 173 | json.dumps(instance), end="\n", flush=True, file=all_output 174 | ) # write all instances to a separate file 175 | completed += 1 176 | if has_test_patch(instance): 177 | # If has test suite, write to output file 178 | print(json.dumps(instance), end="\n", flush=True, file=output) 179 | with_tests += 1 180 | logger.info(f"[{', '.join(repos.keys())}] Total instances: {total_instances}, completed: {completed}, with tests: {with_tests}") 181 | logger.info(f"[{', '.join(repos.keys())}] Skipped {len(seen_prs)} pull requests that have already been inspected") 182 | 183 | 184 | if __name__ == "__main__": 185 | parser = argparse.ArgumentParser() 186 | parser.add_argument("pr_file", type=str, help="Path to pull request JSONL file") 187 | parser.add_argument("output", type=str, help="Output file name") 188 | parser.add_argument("--token", type=str, help="GitHub token") 189 | args = parser.parse_args() 190 | main(**vars(args)) 191 | -------------------------------------------------------------------------------- /swebench/inference/make_datasets/tokenize_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Provided a source (raw) directory and the final (eval) directory, create a training split by removing all instances that are in the final directory from the source directory. 
4 | """ 5 | 6 | import os 7 | import logging 8 | from argparse import ArgumentParser 9 | from pathlib import Path 10 | 11 | import tiktoken 12 | from datasets import disable_caching, load_from_disk, load_dataset 13 | from tqdm.auto import tqdm 14 | from transformers import LlamaTokenizer 15 | 16 | logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") 17 | logger = logging.getLogger(__name__) 18 | logger.warning("Disabling caching") 19 | disable_caching() 20 | 21 | 22 | def cl100k(text, tokenizer): 23 | return tokenizer.encode(text, disallowed_special=()) 24 | 25 | 26 | def llama(text, tokenizer): 27 | return tokenizer(text, add_special_tokens=False, return_attention_mask=False)[ 28 | "input_ids" 29 | ] 30 | 31 | 32 | TOKENIZER_FUNCS = { 33 | "cl100k": (tiktoken.get_encoding("cl100k_base"), cl100k), 34 | "llama": (LlamaTokenizer.from_pretrained("togethercomputer/LLaMA-2-7B-32K"), llama), 35 | } 36 | 37 | 38 | def extract_fields(instance, tokenizer_name, tokenizer, tokenizer_func, eos_token): 39 | instance_id = instance["instance_id"] 40 | if instance["text"] is None or instance["patch"] is None: 41 | print(f"No text for {instance_id}") 42 | return {"input_ids": [], "labels": [], "text": "", "patch": ""} 43 | text_inputs = instance["text"].strip() + "\n" 44 | if text_inputs is None or instance["patch"] is None: 45 | print(f"No inputs for {instance_id}") 46 | return None 47 | patch = instance["patch"].strip() 48 | if len(eos_token) > 0: 49 | patch += f"\n{eos_token}" 50 | input_ids = tokenizer_func(text_inputs, tokenizer) 51 | if tokenizer_name in {"llama"}: 52 | label_ids = tokenizer_func( 53 | "\n" + patch, tokenizer 54 | ) # add newline to tokenize patch 55 | idx = label_ids.index(13) 56 | assert ( 57 | idx <= 2 58 | ), "Expected newline token id (13) to be one of the first three tokens" 59 | label_ids = label_ids[idx + 1 :] # remove newline tokens 60 | else: 61 | label_ids = tokenizer_func(patch, tokenizer) 62 | inputs = input_ids + label_ids[:-1] 63 | cond_len = len(input_ids) - 1 64 | labels = [-100] * cond_len + label_ids 65 | assert len(inputs) == len(labels) 66 | return {**instance, "input_ids": inputs, "labels": labels, "text": text_inputs, "patch": patch} 67 | 68 | 69 | def extract_test_fields(instance, tokenizer_name, tokenizer, tokenizer_func, eos_token): 70 | instance_id = instance["instance_id"] 71 | if instance["text"] is None or instance["patch"] is None: 72 | print(f"No text for {instance_id}") 73 | return None 74 | text_inputs = instance["text"].strip() + "\n" 75 | if text_inputs is None or instance["patch"] is None: 76 | print(f"No inputs for {instance_id}") 77 | return None 78 | patch = instance["patch"].strip() 79 | if len(eos_token) > 0: 80 | patch += f"\n{eos_token}" 81 | input_ids = tokenizer_func(text_inputs, tokenizer) 82 | label_ids = tokenizer_func(patch, tokenizer) 83 | inputs = input_ids 84 | labels = label_ids 85 | return {**instance, "input_ids": inputs, "labels": labels, "text": text_inputs, "patch": patch} 86 | 87 | 88 | def add_columns_from_dict(dataset, dict_columns): 89 | """dict_columns is a list of dicts with keys that are columns in dataset""" 90 | for column in dict_columns[0].keys(): 91 | values = [d[column] for d in dict_columns] 92 | if column in dataset.column_names: 93 | dataset = dataset.remove_columns(column) 94 | dataset = dataset.add_column(column, values) 95 | return dataset 96 | 97 | 98 | def main( 99 | dataset_name_or_path, 100 | output_dir, 101 | tokenizer_name, 102 | num_proc, 103 | push_to_hub_user, 104 
| ): 105 | if push_to_hub_user is not None: 106 | hub_token = os.environ.get("HUGGING_FACE_HUB_TOKEN", None) 107 | if hub_token is None: 108 | raise ValueError("Must provide HUGGING_FACE_HUB_TOKEN to push to the Hub") 109 | if not Path(output_dir).exists(): 110 | Path(output_dir).mkdir(parents=True) 111 | 112 | if tokenizer_name is not None: 113 | tokenizer, tokenizer_func = TOKENIZER_FUNCS[tokenizer_name] 114 | eos_token = getattr(tokenizer, "eos_token", "") 115 | if num_proc > 0 and tokenizer_name == 'cl100k': 116 | logger.warning('cl100k tokenizer does not support multiprocessing. Ignoring num_proc') 117 | num_proc = 0 118 | 119 | if Path(dataset_name_or_path).exists(): 120 | dataset = load_from_disk(dataset_name_or_path) 121 | else: 122 | dataset = load_dataset(dataset_name_or_path) 123 | dataset = dataset.filter(lambda x: len(x["text"]) <= 5_000_000) # filter out superlong instances 124 | for split in dataset.keys(): 125 | if split == "test": 126 | continue 127 | if num_proc > 0: 128 | dataset[split] = dataset[split].map( 129 | lambda instance: extract_fields( 130 | instance, 131 | tokenizer_name, 132 | tokenizer, 133 | tokenizer_func, 134 | eos_token, 135 | ), 136 | num_proc=num_proc, 137 | batched=False, 138 | desc=f"Tokenizing {split}", 139 | ) 140 | elif len(dataset[split]) > 0: 141 | new_values = list( 142 | map( 143 | lambda x: extract_fields( 144 | x, tokenizer_name, tokenizer, tokenizer_func, eos_token 145 | ), 146 | tqdm( 147 | dataset[split], 148 | total=len(dataset[split]), 149 | desc=f"Tokenizing {split}", 150 | ), 151 | ) 152 | ) 153 | dataset[split] = add_columns_from_dict(dataset[split], new_values) 154 | for split in ["test"]: 155 | if split not in dataset: 156 | logger.warning(f"Split {split} not in dataset. Skipping") 157 | continue 158 | if num_proc > 0: 159 | dataset[split] = dataset[split].map( 160 | lambda instance: extract_test_fields( 161 | instance, 162 | tokenizer_name, 163 | tokenizer, 164 | tokenizer_func, 165 | eos_token, 166 | ), 167 | num_proc=num_proc, 168 | batched=False, 169 | desc=f"Tokenizing {split}", 170 | ) 171 | elif len(dataset[split]) > 0: 172 | new_values = list( 173 | map( 174 | lambda x: extract_test_fields( 175 | x, tokenizer_name, tokenizer, tokenizer_func, eos_token 176 | ), 177 | tqdm( 178 | dataset[split], 179 | total=len(dataset[split]), 180 | desc=f"Tokenizing {split}", 181 | ), 182 | ) 183 | ) 184 | dataset[split] = add_columns_from_dict(dataset[split], new_values) 185 | output_file = Path(dataset_name_or_path).name + f"__tok-{tokenizer_name}" 186 | if push_to_hub_user is not None: 187 | output_file = f"{push_to_hub_user}/{output_file}" 188 | dataset.push_to_hub(output_file, use_auth_token=hub_token) 189 | else: 190 | output_file = Path(output_dir) / output_file 191 | dataset.save_to_disk(output_file) 192 | logger.warning(f"Saved to {output_file}") 193 | 194 | 195 | if __name__ == "__main__": 196 | parser = ArgumentParser(description=__doc__) 197 | parser.add_argument("--dataset_name_or_path", type=str, required=True) 198 | parser.add_argument("--output_dir", type=str, required=True) 199 | parser.add_argument( 200 | "--tokenizer_name", type=str, required=True, choices=TOKENIZER_FUNCS.keys() 201 | ) 202 | parser.add_argument("--num_proc", type=int, default=0) 203 | parser.add_argument( 204 | "--push_to_hub_user", 205 | type=str, 206 | default=None, 207 | help="Push the dataset to the Hub user under this name.", 208 | ) 209 | main(**vars(parser.parse_args())) 210 | 
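Note on the label construction in `extract_fields` above: the prompt tokens are masked with `-100` so that the training loss is computed only over the patch tokens, following the usual causal-LM convention. A minimal sketch of that alignment is shown below, using made-up toy token ids rather than the output of any real tokenizer:

```python
# Toy illustration of the input/label alignment built in extract_fields above.
# The token ids are invented for clarity; no real tokenizer is involved.
input_ids = [101, 7, 8, 9]      # tokenized issue/context ("text" field)
label_ids = [13, 14, 15, 2]     # tokenized patch (last id standing in for EOS)

inputs = input_ids + label_ids[:-1]      # model input: context plus all but the final patch token
cond_len = len(input_ids) - 1
labels = [-100] * cond_len + label_ids   # -100 marks positions ignored by the loss

assert len(inputs) == len(labels)        # same invariant as in extract_fields
for tok, lab in zip(inputs, labels):
    print(tok, lab)
```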
-------------------------------------------------------------------------------- /Original_README.md: -------------------------------------------------------------------------------- 1 | ## SWE-Bench-Fork for SWE-Gym 2 | 3 | This fork contains environment setup files for the 11 additional repositories used in the SWE-Gym dataset, as well as an improved version of the instance collection pipeline. 4 | 5 | We plan to upstream the changes and merge with SWE-Bench soon. 6 | 7 | 

8 | 9 | [banner image: Kawi the SWE-Llama] 10 | 11 | 

12 | 13 |
14 | 15 | | [日本語](docs/README_JP.md) | [English](https://github.com/princeton-nlp/SWE-bench) | [中文简体](docs/README_CN.md) | [中文繁體](docs/README_TW.md) | 16 | 17 |
18 | 19 | 20 | --- 21 |

22 | Code and data for our ICLR 2024 paper SWE-bench: Can Language Models Resolve Real-World GitHub Issues? 23 |
24 |
25 | 26 | [badge: Build] 27 | 28 | 29 | [badge: License] 30 | 31 | 32 | 33 | 34 | 

35 | 36 | Please refer to our [website](http://swe-bench.github.io) for the public leaderboard and the [change log](https://github.com/princeton-nlp/SWE-bench/blob/main/CHANGELOG.md) for information on the latest updates to the SWE-bench benchmark. 37 | 38 | ## 📰 News 39 | * **[Aug. 13, 2024]**: Introducing *SWE-bench Verified*! Part 2 of our collaboration with [OpenAI Preparedness](https://openai.com/preparedness/). A subset of 500 problems that real software engineers have confirmed are solvable. Check out more in the [report](https://openai.com/index/introducing-swe-bench-verified/)! 40 | * **[Jun. 27, 2024]**: We have an exciting update for SWE-bench - with support from [OpenAI's Preparedness](https://openai.com/preparedness/) team: We're moving to a fully containerized evaluation harness using Docker for more reproducible evaluations! Read more in our [report](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md). 41 | * **[Apr. 15, 2024]**: SWE-bench has gone through major improvements to resolve issues with the evaluation harness. Read more in our [report](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240415_eval_bug/README.md). 42 | * **[Apr. 2, 2024]**: We have released [SWE-agent](https://github.com/princeton-nlp/SWE-agent), which sets the state-of-the-art on the full SWE-bench test set! ([Tweet 🔗](https://twitter.com/jyangballin/status/1775114444370051582)) 43 | * **[Jan. 16, 2024]**: SWE-bench has been accepted to ICLR 2024 as an oral presentation! ([OpenReview 🔗](https://openreview.net/forum?id=VTF8yNQM66)) 44 | 45 | ## 👋 Overview 46 | SWE-bench is a benchmark for evaluating large language models on real-world software issues collected from GitHub. 47 | Given a *codebase* and an *issue*, a language model is tasked with generating a *patch* that resolves the described problem. 48 | 49 | 50 | 51 | To access SWE-bench, copy and run the following code: 52 | ```python 53 | from datasets import load_dataset 54 | swebench = load_dataset('princeton-nlp/SWE-bench', split='test') 55 | ``` 56 | 57 | ## 🚀 Set Up 58 | SWE-bench uses Docker for reproducible evaluations. 59 | Follow the instructions in the [Docker setup guide](https://docs.docker.com/engine/install/) to install Docker on your machine. 60 | If you're setting up on Linux, we recommend seeing the [post-installation steps](https://docs.docker.com/engine/install/linux-postinstall/) as well. 61 | 62 | Finally, to build SWE-bench from source, follow these steps: 63 | ```bash 64 | git clone git@github.com:princeton-nlp/SWE-bench.git 65 | cd SWE-bench 66 | pip install -e . 67 | ``` 68 | 69 | Test your installation by running: 70 | ```bash 71 | python -m swebench.harness.run_evaluation \ 72 | --predictions_path gold \ 73 | --max_workers 1 \ 74 | --instance_ids sympy__sympy-20590 \ 75 | --run_id validate-gold 76 | ``` 77 | 78 | ## 💽 Usage 79 | > [!WARNING] 80 | > Running fast evaluations on SWE-bench can be resource intensive. 81 | > We recommend running the evaluation harness on an `x86_64` machine with at least 120GB of free storage, 16GB of RAM, and 8 CPU cores. 82 | > You may need to experiment with the `--max_workers` argument to find the optimal number of workers for your machine, but we recommend using fewer than `min(0.75 * os.cpu_count(), 24)`. 83 | > 84 | > If running with Docker Desktop, make sure to increase your virtual disk space to have ~120 GB of free space available, and set `max_workers` to be consistent with the above for the CPUs available to Docker. 
85 | > 86 | > Support for `arm64` machines is experimental. 87 | 88 | Evaluate model predictions on SWE-bench Lite using the evaluation harness with the following command: 89 | ```bash 90 | python -m swebench.harness.run_evaluation \ 91 | --dataset_name princeton-nlp/SWE-bench_Lite \ 92 | --predictions_path <path_to_predictions> \ 93 | --max_workers <num_workers> \ 94 | --run_id <run_id> 95 | # use --predictions_path 'gold' to verify the gold patches 96 | # use --run_id to name the evaluation run 97 | ``` 98 | 99 | This command will generate Docker build logs (`logs/build_images`) and evaluation logs (`logs/run_evaluation`) in the current directory. 100 | 101 | The final evaluation results will be stored in the `evaluation_results` directory. 102 | 103 | To see the full list of arguments for the evaluation harness, run: 104 | ```bash 105 | python -m swebench.harness.run_evaluation --help 106 | ``` 107 | 108 | Additionally, the SWE-bench repo can help you: 109 | * Train your own models on our pre-processed datasets 110 | * Run [inference](https://github.com/princeton-nlp/SWE-bench/blob/main/swebench/inference/README.md) on existing models (either models you have on-disk like LLaMA, or models you have access to through an API like GPT-4). The inference step is where you get a repo and an issue and have the model try to generate a fix for it. 111 | * Run SWE-bench's [data collection procedure](https://github.com/princeton-nlp/SWE-bench/blob/main/swebench/collect/) on your own repositories, to make new SWE-bench tasks. 112 | 113 | ## ⬇️ Downloads 114 | | Datasets | Models | 115 | | - | - | 116 | | [🤗 SWE-bench](https://huggingface.co/datasets/princeton-nlp/SWE-bench) | [🦙 SWE-Llama 13b](https://huggingface.co/princeton-nlp/SWE-Llama-13b) | 117 | | [🤗 "Oracle" Retrieval](https://huggingface.co/datasets/princeton-nlp/SWE-bench_oracle) | [🦙 SWE-Llama 13b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-13b-peft) | 118 | | [🤗 BM25 Retrieval 13K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_13K) | [🦙 SWE-Llama 7b](https://huggingface.co/princeton-nlp/SWE-Llama-7b) | 119 | | [🤗 BM25 Retrieval 27K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_27K) | [🦙 SWE-Llama 7b (PEFT)](https://huggingface.co/princeton-nlp/SWE-Llama-7b-peft) | 120 | | [🤗 BM25 Retrieval 40K](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_40K) | | 121 | | [🤗 BM25 Retrieval 50K (Llama tokens)](https://huggingface.co/datasets/princeton-nlp/SWE-bench_bm25_50k_llama) | | 122 | 123 | ## 🍎 Tutorials 124 | We've also written the following blog posts on how to use different parts of SWE-bench. 125 | If you'd like to see a post about a particular topic, please let us know via an issue. 126 | * [Nov 1. 2023] Collecting Evaluation Tasks for SWE-Bench ([🔗](https://github.com/princeton-nlp/SWE-bench/blob/main/assets/collection.md)) 127 | * [Nov 6. 2023] Evaluating on SWE-bench ([🔗](https://github.com/princeton-nlp/SWE-bench/blob/main/assets/evaluation.md)) 128 | 129 | ## 💫 Contributions 130 | We would love to hear from the broader NLP, Machine Learning, and Software Engineering research communities, and we welcome any contributions, pull requests, or issues! 131 | To do so, please either file a new pull request or issue and fill in the corresponding templates accordingly. We'll be sure to follow up shortly! 132 | 133 | Contact persons: [Carlos E. Jimenez](http://www.carlosejimenez.com/) and [John Yang](https://john-b-yang.github.io/) (Email: carlosej@princeton.edu, johnby@stanford.edu). 
134 | 135 | ## ✍️ Citation 136 | If you find our work helpful, please use the following citations. 137 | ``` 138 | @inproceedings{ 139 | jimenez2024swebench, 140 | title={{SWE}-bench: Can Language Models Resolve Real-world Github Issues?}, 141 | author={Carlos E Jimenez and John Yang and Alexander Wettig and Shunyu Yao and Kexin Pei and Ofir Press and Karthik R Narasimhan}, 142 | booktitle={The Twelfth International Conference on Learning Representations}, 143 | year={2024}, 144 | url={https://openreview.net/forum?id=VTF8yNQM66} 145 | } 146 | ``` 147 | 148 | ## 🪪 License 149 | MIT. Check `LICENSE.md`. 150 | -------------------------------------------------------------------------------- /swebench/inference/make_datasets/create_text_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Create a dataset for text-to-text training from the raw task instance outputs. 5 | """ 6 | 7 | import json 8 | import logging 9 | import os 10 | from argparse import ArgumentParser 11 | from pathlib import Path 12 | from datasets import Dataset, DatasetDict, load_dataset, load_from_disk 13 | from tqdm.auto import tqdm 14 | 15 | from swebench.inference.make_datasets.create_instance import add_text_inputs, PROMPT_FUNCTIONS 16 | from swebench.inference.make_datasets.tokenize_dataset import TOKENIZER_FUNCS 17 | from swebench.inference.make_datasets.utils import string_to_bool 18 | 19 | logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def load_jsonl_file(filename): 24 | if type(filename) == str: 25 | filename = Path(filename) 26 | if filename.name.endswith(".jsonl") or filename.name.endswith(".jsonl.all"): 27 | with open(filename) as f: 28 | return [json.loads(line) for line in f] 29 | elif filename.name.endswith(".json"): 30 | with open(filename) as f: 31 | return json.load(f) 32 | else: 33 | raise ValueError(f"Unknown file type {filename}") 34 | 35 | 36 | def instances_generator(files): 37 | all_data = list() 38 | for file in tqdm(files, desc="Loading instance files"): 39 | all_data.extend(load_jsonl_file(file)) 40 | return all_data 41 | 42 | 43 | def get_training_and_eval_instances(raw_files, test_dataset): 44 | logger.info("Loading instances") 45 | raw_instances = list(instances_generator(raw_files)) 46 | final_instances = list(test_dataset["test"]) 47 | eval_repos = {x["repo"] for x in final_instances} 48 | train_instances = [x for x in raw_instances if x["repo"] not in eval_repos] 49 | train_instances = list(sorted(train_instances, key=lambda x: x["instance_id"])) 50 | eval_instances = list(sorted(final_instances, key=lambda x: x["instance_id"])) 51 | logger.info(f"Found {len(train_instances)} training ids") 52 | logger.info(f"Found {len(eval_instances)} eval ids") 53 | return train_instances, eval_instances 54 | 55 | 56 | def extract_fields(instance): 57 | instance_id = instance["instance_id"] 58 | if instance["text_inputs"] is None or instance["patch"] is None: 59 | print(f"No text for {instance_id}") 60 | return None 61 | text_inputs = instance["text_inputs"].strip() + "\n\n" 62 | if text_inputs is None or instance["patch"] is None: 63 | print(f"No inputs for {instance_id}") 64 | return None 65 | patch = "\n".join(["<patch>", instance["patch"], "</patch>"]) 66 | return {**instance, "text": text_inputs, "patch": patch} 67 | 68 | 69 | def main( 70 | dataset_name_or_path, 71 | splits, 72 | validation_ratio, 73 | output_dir, 74 | retrieval_file, 75 
| prompt_style, 76 | file_source, 77 | k, 78 | max_context_len, 79 | tokenizer_name, 80 | push_to_hub_user, 81 | ): 82 | if push_to_hub_user is not None: 83 | hub_token = os.environ.get("HUGGING_FACE_HUB_TOKEN", None) 84 | assert hub_token is not None, "Must provide HUGGING_FACE_HUB_TOKEN to push to the Hub" 85 | assert output_dir is None, "Cannot provide output_dir if pushing to the Hub" 86 | if max_context_len is not None: 87 | assert tokenizer_name is not None 88 | if push_to_hub_user is None and not Path(output_dir).exists(): 89 | Path(output_dir).mkdir(parents=True) 90 | output_file = f"SWE-bench__{prompt_style}__fs-{file_source}" 91 | if k is not None: 92 | assert file_source not in { 93 | "all", 94 | "oracle", 95 | }, "Cannot use max_context_len with oracle or all file sources" 96 | output_file += f"__k-{k}" 97 | if max_context_len is not None: 98 | assert file_source not in { 99 | "all", 100 | "oracle", 101 | }, "Cannot use max_context_len with oracle or all file sources" 102 | assert ( 103 | tokenizer_name is not None 104 | ), "Must provide tokenizer_name if max_context_len is not None" 105 | output_file += f"__mcc-{max_context_len}-{tokenizer_name}" 106 | if push_to_hub_user is None: 107 | output_file = Path(output_dir, output_file) 108 | if output_file.exists(): 109 | logger.info(f"{output_file.absolute().as_posix()} already exists. Aborting") 110 | return 111 | if Path(dataset_name_or_path).exists(): 112 | dataset = load_from_disk(dataset_name_or_path) 113 | else: 114 | dataset = load_dataset(dataset_name_or_path) 115 | 116 | split_instances = dict() 117 | logger.info(f'Found {set(dataset.keys())} splits') 118 | if set(splits) - set(dataset.keys()) != set(): 119 | raise ValueError(f"Unknown splits {set(splits) - set(dataset.keys())}") 120 | for split in splits: 121 | split_instances[split] = {x["instance_id"]: x for x in dataset[split]} 122 | add_text_inputs( 123 | split_instances[split], 124 | retrieval_file, 125 | k, 126 | prompt_style, 127 | file_source, 128 | max_context_len=max_context_len, 129 | tokenizer_name=tokenizer_name, 130 | ) 131 | columns = [ 132 | "instance_id", 133 | "text", 134 | "repo", 135 | "base_commit", 136 | "problem_statement", 137 | "hints_text", 138 | "created_at", 139 | "patch", 140 | "test_patch", 141 | "version", 142 | "FAIL_TO_PASS", 143 | "PASS_TO_PASS", 144 | "environment_setup_commit", 145 | ] 146 | split_data = dict() 147 | for split in split_instances: 148 | split_data[split] = {key: list() for key in columns} 149 | for instance in tqdm( 150 | split_instances[split].values(), total=len(split_instances[split]), desc=f'Processing {split} instances', 151 | ): 152 | datum = extract_fields(instance) 153 | if datum is None: 154 | continue 155 | for key in columns: 156 | split_data[split][key].append(datum[key] if key in datum else "") 157 | logger.info(f"Found {len(split_data[split]['instance_id'])} {split} ids") 158 | split_data[split] = Dataset.from_dict(split_data[split]) 159 | dataset = DatasetDict(split_data) 160 | if validation_ratio > 0 and "train" in dataset: 161 | train_val = dataset["train"].train_test_split( 162 | test_size=validation_ratio, 163 | seed=42, 164 | ) 165 | dataset["train"] = train_val["train"] 166 | dataset["validation"] = train_val["test"] 167 | for split in dataset: 168 | logger.info(f"Found {len(dataset[split])} {split} instances") 169 | if push_to_hub_user is not None: 170 | dataset.push_to_hub(f'{push_to_hub_user}/{output_file}', use_auth_token=hub_token) 171 | else: 172 | dataset.save_to_disk(output_file) 173 | 
logger.info(f"Finished saving to {output_file}") 174 | 175 | 176 | if __name__ == "__main__": 177 | parser = ArgumentParser(description=__doc__) 178 | parser.add_argument( 179 | "--dataset_name_or_path", 180 | type=str, 181 | default="princeton-nlp/SWE-bench", 182 | help="Dataset to use for test set from HuggingFace Datasets or path to a save_to_disk directory.", 183 | ) 184 | parser.add_argument( 185 | "--splits", 186 | nargs="+", 187 | default=["train", "test"], 188 | help="Splits to use from the dataset.", 189 | ) 190 | parser.add_argument( 191 | "--validation_ratio", 192 | type=float, 193 | default=0.01, 194 | help="Ratio of the training set to use for validation.", 195 | ) 196 | parser.add_argument( 197 | "--output_dir", type=str, help="Path to the output directory." 198 | ) 199 | parser.add_argument( 200 | "--retrieval_file", 201 | type=str, 202 | help="Path to the file where the retrieval results are stored.", 203 | ) 204 | parser.add_argument( 205 | "--prompt_style", 206 | type=str, 207 | default="style-3", 208 | choices=PROMPT_FUNCTIONS.keys(), 209 | help="Prompt style to use. See create_instance.PROMPT_FUNCTIONS for details.", 210 | ) 211 | parser.add_argument( 212 | "--file_source", 213 | type=str, 214 | default="oracle", 215 | choices=["oracle", "bm25", "all"], 216 | help="How to select the files to use in context.", 217 | ) 218 | parser.add_argument( 219 | "--k", 220 | type=int, 221 | default=None, 222 | help="Maximum number of files to use for retrieval.", 223 | ) 224 | parser.add_argument( 225 | "--max_context_len", 226 | type=int, 227 | default=None, 228 | help="Maximum number of tokens to use for context.", 229 | ) 230 | parser.add_argument( 231 | "--tokenizer_name", 232 | type=str, 233 | default=None, 234 | choices=TOKENIZER_FUNCS.keys(), 235 | help="Tokenizer to use for max_context_len. Only needed if max_context_len is specified.", 236 | ) 237 | parser.add_argument( 238 | "--push_to_hub_user", 239 | type=str, 240 | help="Username to use for pushing to the Hub. 
If not provided, will save to disk.", 241 | ) 242 | main(**vars(parser.parse_args())) 243 | -------------------------------------------------------------------------------- /swebench/harness/grading.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any 3 | 4 | from swebench.harness.constants import ( 5 | APPLY_PATCH_FAIL, 6 | APPLY_PATCH_PASS, 7 | FAIL_TO_FAIL, 8 | FAIL_TO_PASS, 9 | KEY_INSTANCE_ID, 10 | PASS_TO_FAIL, 11 | PASS_TO_PASS, 12 | RESET_FAILED, 13 | TESTS_ERROR, 14 | TESTS_TIMEOUT, 15 | ResolvedStatus, 16 | TestStatus, 17 | ) 18 | from swebench.harness.test_spec import TestSpec 19 | from swebench.harness.log_parsers import MAP_REPO_TO_PARSER 20 | 21 | 22 | # MARK: Utility functions 23 | def test_passed(case: str, sm: dict[str, str]) -> bool: 24 | return case in sm and sm[case] in [TestStatus.PASSED.value, TestStatus.XFAIL.value] 25 | 26 | 27 | def test_failed(case: str, sm: dict[str, str]) -> bool: 28 | return case not in sm or any( 29 | sm[case] == status for status in [TestStatus.FAILED.value, TestStatus.ERROR.value] 30 | ) 31 | 32 | 33 | # MARK: Evaluation report functions 34 | def get_logs_eval(log_fp: str) -> tuple[dict[str, str], bool]: 35 | """ 36 | Retrieve evaluation results for a task instance from its corresponding log file 37 | 38 | Args: 39 | log_fp (str): path to log file 40 | Returns: 41 | bool: whether the patch applied successfully 42 | dict: status map 43 | 44 | TODO(john-b-yang): Check this is working properly... 45 | """ 46 | # Convert e.g. "logs/scikit-learn__scikit-learn-12421/test_output.txt" to "scikit-learn/scikit-learn" 47 | sample_id = str(Path(log_fp).parent.stem) # e.g. scikit-learn__scikit-learn-12421 48 | repo = "-".join(sample_id.replace("__", "/").split("-")[:-1]) # e.g. scikit-learn/scikit-learn 49 | log_parser = MAP_REPO_TO_PARSER[repo] 50 | 51 | with open(log_fp) as f: 52 | content = f.read() 53 | # TODO fix constant here 54 | if ( 55 | any( 56 | [ 57 | x in content 58 | for x in [ 59 | APPLY_PATCH_FAIL, 60 | RESET_FAILED, 61 | TESTS_ERROR, 62 | TESTS_TIMEOUT, 63 | "Failed to reset task environment", 64 | ] 65 | ] 66 | ) 67 | or "applied patch" not in content.lower() 68 | ): 69 | # Eval patch was not applied successfully 70 | return {}, False 71 | 72 | # Get status map of evaluation results 73 | content = content.split(f"{APPLY_PATCH_PASS} (pred)")[-1] 74 | return log_parser(content), True 75 | 76 | 77 | def get_eval_tests_report( 78 | eval_sm: dict[str, str], 79 | gold_results: dict[str, str], 80 | calculate_to_fail: bool = False, 81 | ) -> dict[str, dict[str, list[str]]]: 82 | """ 83 | Create a report based on failure/pass change from gold results to eval results. 
84 | 85 | Args: 86 | eval_sm (dict): evaluation status map 87 | gold_results (dict): gold results 88 | calculate_to_fail (bool): whether to calculate metrics for "x to fail" tests 89 | Returns: 90 | report (dict): report of metrics 91 | 92 | Metric Definitions (Gold Result Pair + Eval Result): 93 | - Fail-Pass (F2P) + P: Success (Resolution) 94 | - Pass-Pass (P2P) + P: Success (Maintenance) 95 | - Fail-Pass (F2P) + F: Failure 96 | - Pass-Pass (P2P) + F: Failure 97 | 98 | Miscellaneous Definitions 99 | - Fail-Fail (F2F) + F: Failure Maintenance 100 | - Pass-Fail (P2F) + F: Not considered 101 | - Fail-Fail (F2F) + P: Success (Extra Credit) 102 | - Pass-Fail (P2F) + P: Not considered 103 | """ 104 | # Calculate resolution metrics 105 | f2p_success = [] 106 | f2p_failure = [] 107 | for test_case in gold_results[FAIL_TO_PASS]: 108 | if test_passed(test_case, eval_sm): 109 | # Assume silent success for now (test case not in eval_sm) 110 | f2p_success.append(test_case) 111 | elif test_failed(test_case, eval_sm): 112 | f2p_failure.append(test_case) 113 | 114 | # Calculate maintenance metrics 115 | p2p_success = [] 116 | p2p_failure = [] 117 | for test_case in gold_results[PASS_TO_PASS]: 118 | if test_passed(test_case, eval_sm): 119 | p2p_success.append(test_case) 120 | elif test_failed(test_case, eval_sm): 121 | p2p_failure.append(test_case) 122 | 123 | results = { 124 | FAIL_TO_PASS: { 125 | "success": f2p_success, 126 | "failure": f2p_failure, 127 | }, 128 | PASS_TO_PASS: { 129 | "success": p2p_success, 130 | "failure": p2p_failure, 131 | }, 132 | } 133 | 134 | f2f_success = [] 135 | f2f_failure = [] 136 | p2f_success = [] 137 | p2f_failure = [] 138 | if calculate_to_fail: 139 | # Calculate "extra credit" metrics 140 | for test_case in gold_results[FAIL_TO_FAIL]: 141 | if test_passed(test_case, eval_sm): 142 | f2f_success.append(test_case) 143 | elif test_failed(test_case, eval_sm): 144 | f2f_failure.append(test_case) 145 | 146 | # Calculate not considered metrics 147 | for test_case in gold_results[PASS_TO_FAIL]: 148 | if test_passed(test_case, eval_sm): 149 | p2f_success.append(test_case) 150 | elif test_failed(test_case, eval_sm): 151 | p2f_failure.append(test_case) 152 | 153 | results.update( 154 | { 155 | FAIL_TO_FAIL: { 156 | "success": f2f_success, 157 | "failure": f2f_failure, 158 | }, 159 | PASS_TO_FAIL: { 160 | "success": p2f_success, 161 | "failure": p2f_failure, 162 | }, 163 | } 164 | ) 165 | return results 166 | 167 | 168 | def compute_fail_to_pass(report: dict[str, dict[str, Any]]) -> float: 169 | """ 170 | Compute fail-to-pass metric. Accepts single report as argument. 171 | """ 172 | total = len(report[FAIL_TO_PASS]["success"]) + len(report[FAIL_TO_PASS]["failure"]) 173 | if total == 0: 174 | return 1 175 | return len(report[FAIL_TO_PASS]["success"]) / total 176 | 177 | 178 | def compute_pass_to_pass(report: dict[str, dict[str, Any]]) -> float: 179 | """ 180 | Compute pass-to-pass metric. Accepts single report as argument. 
181 | """ 182 | total = len(report[PASS_TO_PASS]["success"]) + len(report[PASS_TO_PASS]["failure"]) 183 | if total == 0: 184 | # TODO: Don't factor in p2p metrics 185 | return 1 186 | return len(report[PASS_TO_PASS]["success"]) / total 187 | 188 | 189 | def get_resolution_status(report: dict[str, dict[str, Any]]) -> str: 190 | """ 191 | Determine resolved status of an evaluation instance 192 | 193 | Criteria: 194 | - If fail-to-pass (Resolution) = 1 and pass-to-pass (Maintenance) = 1 -> FULL 195 | - If (fail-to-pass (Resolution) < 1 and > 0) and pass-to-pass (Maintenance) = 1 -> PARTIAL 196 | - Otherwise -> NO 197 | """ 198 | f2p = compute_fail_to_pass(report) 199 | p2p = compute_pass_to_pass(report) 200 | 201 | if f2p == 1 and p2p == 1: 202 | return ResolvedStatus.FULL.value 203 | elif f2p < 1 and f2p > 0 and p2p == 1: 204 | return ResolvedStatus.PARTIAL.value 205 | else: 206 | return ResolvedStatus.NO.value 207 | 208 | 209 | def get_eval_report( 210 | test_spec: TestSpec, 211 | prediction: dict[str, str], 212 | log_path: str, 213 | include_tests_status: bool, 214 | ) -> dict[str, Any]: 215 | """ 216 | Generate a report of model evaluation results from a prediction, task instance, 217 | and evaluation log. 218 | 219 | Args: 220 | test_spec (dict): test spec containing keys "instance_id", "FAIL_TO_PASS", and "PASS_TO_PASS" 221 | prediction (dict): prediction containing keys "instance_id", "model_name_or_path", and "model_patch" 222 | log_path (str): path to evaluation log 223 | include_tests_status (bool): whether to include the status of each test in the returned report 224 | Returns: 225 | report (dict): report of metrics 226 | """ 227 | report_map = {} 228 | 229 | instance_id = prediction[KEY_INSTANCE_ID] 230 | if instance_id not in report_map: 231 | report_map[instance_id] = { 232 | "patch_is_None": False, 233 | "patch_exists": False, 234 | "patch_successfully_applied": False, 235 | "resolved": False, 236 | } 237 | 238 | # Check if the model patch exists 239 | if prediction["model_patch"] is None: 240 | report_map[instance_id]["none"] = True 241 | return report_map 242 | report_map[instance_id]["patch_exists"] = True 243 | 244 | # Get evaluation logs 245 | eval_sm, found = get_logs_eval(log_path) 246 | 247 | if not found: 248 | return report_map 249 | report_map[instance_id]["patch_successfully_applied"] = True 250 | 251 | eval_ref = { 252 | KEY_INSTANCE_ID: test_spec.instance_id, 253 | FAIL_TO_PASS: test_spec.FAIL_TO_PASS, 254 | PASS_TO_PASS: test_spec.PASS_TO_PASS, 255 | } 256 | 257 | report = get_eval_tests_report(eval_sm, eval_ref) 258 | if get_resolution_status(report) == ResolvedStatus.FULL.value: 259 | report_map[instance_id]["resolved"] = True 260 | 261 | if include_tests_status: 262 | report_map[instance_id]["tests_status"] = report # type: ignore 263 | 264 | return report_map 265 | -------------------------------------------------------------------------------- /swebench/inference/run_live.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | This module contains functions for running a live inference session on a GitHub issue. 5 | It clones the repository associated with the issue, builds a BM25 retrieval index, and 6 | generates a prompt for the user to interact with the model. The output is saved to a 7 | specified directory. 
8 | """ 9 | import json 10 | import subprocess 11 | from pathlib import Path 12 | from ghapi.all import GhApi 13 | import os 14 | import re 15 | import time 16 | from datetime import datetime 17 | from tqdm.auto import tqdm 18 | from swebench.inference.make_datasets.utils import ContextManager, string_to_bool, extract_diff, extract_minimal_patch 19 | from swebench.inference.make_datasets.create_instance import ( 20 | PROMPT_FUNCTIONS, 21 | TOKENIZER_FUNCS, 22 | make_code_text, 23 | ingest_files, 24 | ) 25 | from swebench.inference.make_datasets.bm25_retrieval import ( 26 | make_index, 27 | clone_repo, 28 | search, 29 | DOCUMENT_ENCODING_FUNCTIONS, 30 | ) 31 | from swebench.inference.run_api import call_chat, call_anthropic 32 | import logging 33 | from argparse import ArgumentParser 34 | 35 | logging.basicConfig( 36 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 37 | ) 38 | logger = logging.getLogger(__name__) 39 | 40 | 41 | def get_problem_statement(owner, repo, issue_num, ghapi, include_comments=False): 42 | issue = ghapi.issues.get(owner, repo, issue_num) 43 | issue_text = "\n".join([issue.title, issue.body]) 44 | # Solved issues may include comments that give answers away too much 45 | if include_comments: 46 | all_comments = list(ghapi.issues.list_comments(owner, repo, issue_num)) 47 | comments = [comment.body for comment in all_comments] 48 | comment_text = "Comment: " if comments else "" + "\nComment:".join(comments) 49 | issue_text += "\n" + comment_text 50 | return issue_text 51 | 52 | 53 | def get_readme_files(repo_path): 54 | files = list(Path(repo_path).iterdir()) 55 | files = list(filter(lambda x: x.is_file(), files)) 56 | files = list(filter(lambda x: x.name.lower().startswith("readme"), files)) 57 | if files: 58 | files = sorted(files, key=lambda x: len(x.name)) 59 | files = [files[0]] 60 | return [Path(file).relative_to(repo_path).as_posix() for file in files] 61 | 62 | 63 | def make_instance( 64 | owner, 65 | repo, 66 | query, 67 | commit, 68 | root_dir, 69 | token, 70 | document_encoding_func, 71 | python, 72 | instance_id, 73 | tokenizer, 74 | tokenizer_func, 75 | prompt_style, 76 | max_context_len, 77 | include_readmes, 78 | ): 79 | """ 80 | Creates an instance for a given query and repository. 81 | 82 | Args: 83 | owner (str): The owner of the repository. 84 | repo (str): The name of the repository. 85 | query (str): The query to search for. 86 | commit (str): The commit hash to use. 87 | root_dir (str): The root directory to clone the repository to. 88 | token (str): The GitHub token to use for authentication. 89 | document_encoding_func (function): The function to use for encoding documents. 90 | python (str): The path to the Python executable. 91 | instance_id (int): The ID of the instance. 92 | tokenizer (str): The name of the tokenizer to use. 93 | tokenizer_func (function): The function to use for tokenization. 94 | prompt_style (str): The style of prompt to use. 95 | max_context_len (int): The maximum length of the context. 96 | include_readmes (bool): Whether to include README files in the instance. 97 | 98 | Returns: 99 | dict: The instance. 
100 | """ 101 | thread_id = 0 102 | instance = {"instance_id": instance_id, "problem_statement": query} 103 | logger.info(f"Cloning repo {owner}/{repo}") 104 | repo_dir = clone_repo(f"{owner}/{repo}", root_dir, token) 105 | if commit is None: 106 | commit = subprocess.check_output( 107 | ["git", "rev-parse", "HEAD"], cwd=repo_dir 108 | ).decode("utf-8").strip() 109 | logger.info(f"Building BM25 retrieval index for {owner}/{repo}@{commit}") 110 | index_dir = make_index( 111 | repo_dir=repo_dir, 112 | root_dir=root_dir, 113 | query=query, 114 | commit=commit, 115 | document_encoding_func=document_encoding_func, 116 | python=python, 117 | instance_id=instance_id, 118 | ) 119 | results = search(instance, index_dir) 120 | hits = results["hits"] 121 | logger.info(f"Retrieved {len(hits)} documents") 122 | with ContextManager(repo_dir, commit) as cm: 123 | if include_readmes: 124 | readmes = get_readme_files(cm.repo_path) 125 | else: 126 | readmes = list() 127 | instance["readmes"] = ingest_files(readmes) 128 | for hit in hits: 129 | hit["file_contents"] = open(hit["docid"]).read() 130 | instance["file_contents"] = dict() 131 | base_text_inputs = PROMPT_FUNCTIONS[prompt_style](instance) 132 | base_text_input_length = len(tokenizer_func(base_text_inputs, tokenizer)) 133 | instance["file_contents"] = {x["docid"]: x["file_contents"] for x in hits} 134 | cur_input_len = base_text_input_length 135 | include_files = list() 136 | for filename in [x["docid"] for x in hits]: 137 | content = make_code_text({filename: instance["file_contents"][filename]}) 138 | tokens = tokenizer_func(content, tokenizer) 139 | if cur_input_len + len(tokens) < max_context_len: 140 | include_files.append(filename) 141 | cur_input_len += len(tokens) 142 | logger.info( 143 | f"Including {len(include_files)} files in context with {cur_input_len} tokens:\n" 144 | + "\n\t".join(sorted(include_files)) 145 | ) 146 | instance["file_contents"] = { 147 | filename: instance["file_contents"][filename] for filename in include_files 148 | } 149 | instance["text_inputs"] = PROMPT_FUNCTIONS[prompt_style](instance) 150 | return instance 151 | 152 | 153 | def parse_issue_url(issue_url): 154 | issue_pat = re.compile(r"github\.com\/(.+?)\/(.+?)\/issues\/(\d+)") 155 | match = issue_pat.search(issue_url) 156 | if not match: 157 | raise ValueError( 158 | f"issue_url ({issue_url}) does not seem to be a valid issue url." 
159 | + "\nPlease use url like https://github.com/owner/repo/issues/12345" 160 | ) 161 | owner, repo, issue_num = match.groups() 162 | return owner, repo, issue_num 163 | 164 | 165 | def main( 166 | model_name, 167 | prompt_style, 168 | issue_url, 169 | base_commit, 170 | max_context_length, 171 | document_encoding_func, 172 | output_dir, 173 | root_dir, 174 | include_readmes, 175 | ): 176 | if base_commit is not None and len(issue_url) != len(base_commit): 177 | raise ValueError( 178 | f"Must provide either no base commits or one base commit per issue url" 179 | ) 180 | if base_commit is None: 181 | base_commit = [None] * len(issue_url) 182 | gh_token = os.environ.get("GITHUB_TOKEN", None) 183 | if gh_token is not None: 184 | logger.warning(f'Using GitHub token: {"*" * 8}{gh_token[-4:]}') 185 | gh = GhApi(token=gh_token) 186 | tokenizer, tokenizer_func = TOKENIZER_FUNCS["cl100k"] 187 | document_encoding_func = DOCUMENT_ENCODING_FUNCTIONS[document_encoding_func] 188 | python = subprocess.check_output(["which", "python"]).decode("utf-8").strip() 189 | outputs = list() 190 | for issue, commit in tqdm(zip(issue_url, base_commit), total=len(issue_url)): 191 | owner, repo, issue_num = parse_issue_url(issue) 192 | problem_statement = get_problem_statement(owner, repo, int(issue_num), gh) 193 | instance_id = f"{owner}__{repo}-{issue_num}" 194 | logger.info(f"Creating instance {instance_id}") 195 | instance = make_instance( 196 | owner=owner, 197 | repo=repo, 198 | query=problem_statement, 199 | commit=commit, 200 | root_dir=root_dir, 201 | token=gh_token, 202 | document_encoding_func=document_encoding_func, 203 | python=python, 204 | instance_id=instance_id, 205 | tokenizer=tokenizer, 206 | tokenizer_func=tokenizer_func, 207 | prompt_style=prompt_style, 208 | max_context_len=max_context_length, 209 | include_readmes=include_readmes, 210 | ) 211 | logger.info(f"Calling model {model_name}") 212 | start = time.time() 213 | inputs = instance["text_inputs"]  # needed by both the OpenAI and Anthropic branches below 214 | if model_name.startswith("gpt"): 215 | response, _ = call_chat( 216 | model_name, inputs, use_azure=False, temperature=0, top_p=1 217 | ) 218 | completion = response.choices[0].message.content 219 | logger.info(f'Generated {response.usage.completion_tokens} tokens in {(time.time() - start):.2f} seconds') 220 | else: 221 | from anthropic import Anthropic 222 | api_key = os.environ.get("ANTHROPIC_API_KEY", None) 223 | anthropic = Anthropic(api_key=api_key) 224 | response = call_anthropic( 225 | inputs, anthropic, model_name, temperature=0, top_p=1 226 | ) 227 | completion = response.completion 228 | model_patch = extract_diff(completion) 229 | minimal_patch = extract_minimal_patch(model_patch) 230 | outputs.append( 231 | { 232 | "instance_id": instance_id, 233 | "response": completion, 234 | "problem_statement": problem_statement, 235 | "text_inputs": inputs, 236 | "model_patch": model_patch, 237 | "minimal_patch": minimal_patch, 238 | } 239 | ) 240 | os.makedirs(output_dir, exist_ok=True) 241 | output_file = Path( 242 | output_dir, 243 | f'{model_name}__{prompt_style}__{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.jsonl', 244 | ) 245 | with open(output_file, "+a") as f: 246 | for output in outputs: 247 | print(json.dumps(output), file=f, flush=True) 248 | logger.info(f"Wrote output to {output_file}") 249 | 250 | 251 | if __name__ == "__main__": 252 | parser = ArgumentParser(description=__doc__) 253 | parser.add_argument("--model_name", type=str) 254 | parser.add_argument( 255 | "--prompt_style", type=str, choices=PROMPT_FUNCTIONS.keys(), 
default="style-3" 256 | ) 257 | parser.add_argument("--issue_url", type=str, nargs="+") 258 | parser.add_argument("--base_commit", type=str, nargs="+") 259 | parser.add_argument("--max_context_length", type=int, default=16_000) 260 | parser.add_argument( 261 | "--document_encoding_func", 262 | type=str, 263 | choices=DOCUMENT_ENCODING_FUNCTIONS.keys(), 264 | default="file_name_and_contents", 265 | ) 266 | parser.add_argument("--output_dir", type=str, default="./live_outputs") 267 | parser.add_argument("--root_dir", type=str, default="./run_live_data") 268 | parser.add_argument("--include_readmes", type=string_to_bool, default=False) 269 | args = parser.parse_args() 270 | main(**vars(args)) 271 | --------------------------------------------------------------------------------
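For reference, `run_live.py` can also be driven programmatically instead of through its CLI. The sketch below mirrors the argparse defaults shown in the file; the model name and issue URL are placeholders, and it assumes the relevant credentials (an OpenAI-compatible key for `call_chat`, and optionally `GITHUB_TOKEN`) are already set in the environment:

```python
# Hypothetical programmatic invocation of swebench.inference.run_live.main().
# The issue URL and model name below are placeholders; API keys are read from
# the environment by the underlying helpers.
from swebench.inference.run_live import main

main(
    model_name="gpt-4",                   # assumed OpenAI-style name; routed through call_chat
    prompt_style="style-3",
    issue_url=["https://github.com/owner/repo/issues/12345"],  # placeholder issue
    base_commit=None,                     # None -> use HEAD of the freshly cloned repo
    max_context_length=16_000,
    document_encoding_func="file_name_and_contents",
    output_dir="./live_outputs",
    root_dir="./run_live_data",
    include_readmes=False,
)
```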