├── .github ├── CODE_OF_CONDUCT.md ├── dependabot.yml ├── mlc_config.json └── workflows │ ├── build-docs.yaml │ ├── check-links.yaml │ └── pytest.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── agent ├── _gen_trajs.sh ├── _infer_model.sh ├── _traj_mgr.sh ├── swesmith_gen_claude.yaml ├── swesmith_gen_gpt.yaml ├── swesmith_infer.yaml └── swesmith_install_repo.yaml ├── configs ├── bug_gen │ ├── README.md │ ├── class_basic.yml │ ├── func_fun.yml │ ├── lm_modify.yml │ └── lm_rewrite.yml ├── install_repo.sh ├── issue_gen │ ├── ig_tests.yaml │ ├── ig_v1.yaml │ └── ig_v2.yaml └── train │ ├── dpo_qwen_32b.yml │ ├── dpo_qwen_7b.yml │ ├── full_ft_qwen_32b.yml │ └── full_ft_qwen_7b.yml ├── docs ├── CNAME ├── assets │ ├── banner.png │ ├── bug_gen_overview.png │ ├── combine.png │ ├── home │ │ ├── collection.png │ │ ├── leaderboard.png │ │ └── swesmith.png │ ├── lm_generate.png │ ├── overview-light.png │ ├── overview.png │ ├── paper.pdf.html │ ├── pr_mirror.png │ ├── procedural.png │ ├── sbcli_logo.svg │ ├── sbcli_logo_text_below.svg │ ├── sweagent_logo.svg │ ├── sweagent_logo_text_below.svg │ ├── swebench_logo.png │ ├── swebench_logo_text_below.svg │ ├── swerex_logo.svg │ ├── swerex_logo_text_below.svg │ ├── swesmith_logo.png │ └── swesmith_logo_text_below.svg ├── blog.html ├── css │ ├── TiltNeon.ttf │ ├── bubbles.css │ ├── carousel.css │ ├── custom.css │ ├── home.css │ └── mkdocstrings.css ├── docs │ └── index.md ├── getting_started │ ├── assets.md │ ├── index.md │ ├── installation.md │ └── quickstart.md ├── guides │ ├── create_instances.md │ ├── difficulty_rating.md │ ├── env_construction.md │ ├── harnesses.md │ ├── index.md │ ├── issue_gen.md │ └── train_swe_agent.md ├── index.html └── overrides │ └── main.html ├── mkdocs.yml ├── pyproject.toml ├── scripts ├── calculate_cost.py ├── cheatsheet.sh ├── train.get_difficulties.sh ├── train.run_ft_torchtune.sh ├── train.run_ft_unsloth.sh └── train.serve_sglang.sh ├── swesmith ├── 
__init__.py ├── bug_gen │ ├── collect_patches.py │ ├── combine │ │ ├── same_file.py │ │ └── same_module.py │ ├── criteria.py │ ├── get_cost.py │ ├── llm │ │ ├── modify.py │ │ ├── rewrite.py │ │ └── utils.py │ ├── mirror │ │ ├── generate.py │ │ └── prompts.py │ ├── procedural │ │ ├── __init__.py │ │ ├── classes.py │ │ ├── control_flow.py │ │ ├── generate.py │ │ ├── operations.py │ │ └── remove.py │ └── utils.py ├── build_repo │ ├── __init__.py │ ├── create_images.py │ ├── download_images.py │ └── try_install.py ├── constants.py ├── harness │ ├── __init__.py │ ├── eval.py │ ├── gather.py │ ├── grading.py │ ├── log_parsers.py │ ├── utils.py │ └── valid.py ├── issue_gen │ ├── generate.py │ ├── get_from_pr.py │ ├── get_from_tests.py │ ├── get_static.py │ ├── utils.py │ └── viewer.py ├── train │ ├── README.md │ ├── difficulty_rater │ │ ├── create_datasets.py │ │ ├── get_difficulties.py │ │ └── test_rater.py │ ├── download_checkpoint.py │ ├── run │ │ ├── ft_torchtune.py │ │ └── ft_unsloth.py │ ├── serve_sglang.py │ └── traj_mgr │ │ ├── clean_trajs.py │ │ ├── combine_trajs.py │ │ ├── transform_to_ft.py │ │ ├── transform_to_ft_list.py │ │ └── utils.py └── utils.py └── tests ├── __init__.py ├── bug_gen ├── llm │ └── test_utils_llm.py ├── procedural │ ├── test_classes.py │ ├── test_control_flow.py │ ├── test_operations.py │ └── test_remove.py └── test_utils.py ├── conftest.py └── test_utils.py /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Maintain dependencies for GitHub Actions 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "weekly" 8 | -------------------------------------------------------------------------------- /.github/mlc_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "ignorePatterns": [ 3 | { 4 | "pattern": "https://github.com/?.*" 5 | }, 6 | { 7 | "pattern": 
"https://platform.openai.com/docs/.*" 8 | }, 9 | { 10 | "pattern": "https://docs.anthropic.com/.*" 11 | }, 12 | { 13 | "pattern": ".*localhost.*" 14 | }, 15 | { 16 | "pattern": "https?://(.*\\.)?twitter\\.com/.*" 17 | }, 18 | { 19 | "pattern": "https?://(.*\\.)?x\\.com/.*" 20 | } 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /.github/workflows/build-docs.yaml: -------------------------------------------------------------------------------- 1 | name: build-docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - "build-docs-*" 8 | pull_request: 9 | branches: 10 | - main 11 | permissions: 12 | contents: write 13 | jobs: 14 | deploy: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | fetch-depth: 0 20 | - name: Configure Git Credentials 21 | run: | 22 | git config user.name github-actions[bot] 23 | git config user.email 41898282+github-actions[bot]@users.noreply.github.com 24 | - uses: actions/setup-python@v5 25 | with: 26 | python-version: 3.x 27 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 28 | - uses: actions/cache@v4 29 | with: 30 | key: mkdocs-material-${{ env.cache_id }} 31 | path: .cache 32 | restore-keys: | 33 | mkdocs-material- 34 | - name: Install uv 35 | run: | 36 | curl -LsSf https://astral.sh/uv/install.sh | sh 37 | - run: uv pip install --python ${Python_ROOT_DIR} '.[docs]' 38 | - name: Build Documentation 39 | if: github.ref != 'refs/heads/main' 40 | run: mkdocs build 41 | - name: Build + Deploy Documentation 42 | if: github.ref == 'refs/heads/main' 43 | run: mkdocs gh-deploy --force 44 | -------------------------------------------------------------------------------- /.github/workflows/check-links.yaml: -------------------------------------------------------------------------------- 1 | name: Check Markdown links 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | pull_request: 7 | schedule: 8 | - cron: "0 0 1 * *" 9 | 10 | jobs: 11 | 
markdown-link-check: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@master 15 | - uses: gaurav-nelson/github-action-markdown-link-check@v1 16 | with: 17 | config-file: '.github/mlc_config.json' 18 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yaml: -------------------------------------------------------------------------------- 1 | name: Pytest 2 | 3 | env: 4 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 5 | 6 | on: 7 | push: 8 | branches: 9 | - main 10 | - "test-ci/**" 11 | paths-ignore: 12 | - 'docs/**' 13 | - 'README.md' 14 | - 'mkdocs.yml' 15 | pull_request: 16 | branches: 17 | - main 18 | paths-ignore: 19 | - 'docs/**' 20 | - 'README.md' 21 | - 'mkdocs.yml' 22 | 23 | # Not possible to test windows capability: 24 | # https://github.com/orgs/community/discussions/25491 25 | jobs: 26 | test: 27 | runs-on: ubuntu-latest 28 | defaults: 29 | run: 30 | shell: bash -l {0} 31 | steps: 32 | - name: Checkout code 33 | uses: actions/checkout@v4 34 | - uses: actions/setup-python@v5 35 | with: 36 | python-version: '3.10' 37 | - name: Install uv 38 | run: | 39 | curl -LsSf https://astral.sh/uv/install.sh | sh 40 | - name: Install dependencies 41 | run: | 42 | uv pip install --python ${Python_ROOT_DIR} '.' 
43 | - name: Install dev dependencies 44 | run: | 45 | uv pip install --python ${Python_ROOT_DIR} pytest pytest-cov 46 | - name: Run pytest 47 | uses: sjvrijn/pytest-last-failed@v2 48 | with: 49 | pytest-args: '--exitfirst --cov' 50 | - name: Explicitly convert coverage to xml 51 | run: coverage xml 52 | - name: Upload coverage reports to Codecov 53 | uses: codecov/codecov-action@v5.4.3 54 | with: 55 | token: ${{ secrets.CODECOV_TOKEN }} 56 | slug: SWE-bench/SWE-smith -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Files 2 | .env 3 | .DS_Store 4 | .api_key 5 | 6 | # Folders 7 | hidden/ 8 | logs/ 9 | notebooks/ 10 | 11 | # General python 12 | 13 | # Created by https://www.toptal.com/developers/gitignore/api/python 14 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 15 | 16 | ### Python ### 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | share/python-wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | MANIFEST 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | .nox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | *.py,cover 66 | .hypothesis/ 67 | .pytest_cache/ 68 | cover/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | local_settings.py 77 | db.sqlite3 78 | db.sqlite3-journal 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | .pybuilder/ 92 | target/ 93 | 94 | # Jupyter Notebook 95 | .ipynb_checkpoints 96 | 97 | # IPython 98 | profile_default/ 99 | ipython_config.py 100 | 101 | # pyenv 102 | # For a library or package, you might want to ignore these files since the code is 103 | # intended to run in multiple environments; otherwise, check them in: 104 | # .python-version 105 | 106 | # pipenv 107 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 108 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 109 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 110 | # install all needed dependencies. 111 | #Pipfile.lock 112 | 113 | # poetry 114 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 115 | # This is especially recommended for binary packages to ensure reproducibility, and is more 116 | # commonly ignored for libraries. 117 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 118 | #poetry.lock 119 | 120 | # pdm 121 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
122 | #pdm.lock 123 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 124 | # in version control. 125 | # https://pdm.fming.dev/#use-with-ide 126 | .pdm.toml 127 | 128 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 129 | __pypackages__/ 130 | 131 | # Celery stuff 132 | celerybeat-schedule 133 | celerybeat.pid 134 | 135 | # SageMath parsed files 136 | *.sage.py 137 | 138 | # Environments 139 | .env 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | .spyproject 150 | 151 | # Rope project settings 152 | .ropeproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | # Pyre type checker 163 | .pyre/ 164 | 165 | # pytype static type analyzer 166 | .pytype/ 167 | 168 | # Cython debug symbols 169 | cython_debug/ 170 | 171 | # PyCharm 172 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 173 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 174 | # and can be added to the global gitignore or merged into this file. For a more nuclear 175 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
176 | #.idea/ 177 | 178 | ### Python Patch ### 179 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 180 | poetry.toml 181 | 182 | # ruff 183 | .ruff_cache/ 184 | 185 | # LSP config files 186 | pyrightconfig.json 187 | 188 | # End of https://www.toptal.com/developers/gitignore/api/python 189 | 190 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version 4 | rev: v0.11.12 5 | hooks: 6 | # Run the linter 7 | - id: ruff 8 | # Only fix newly changed lines 9 | args: [ --fix, --diff ] 10 | # Run the formatter 11 | - id: ruff-format 12 | args: [ --exclude=notebooks ] -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Thanks for your interest in contributing to SWE-smith! There are several ways to contribute. 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 John Yang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Kawhi the SWE-smith 4 | 5 |

6 | 7 |
8 | 9 |
10 | 11 | Build 12 | 13 | 14 | License 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 |
23 | 24 |
25 | 26 | SWE-smith is a toolkit for training software engineering (SWE) agents. With SWE-smith, you can: 27 | * Create an *unlimited* number of [SWE-bench](https://github.com/SWE-bench/SWE-bench) style task instances for any Python repository. 28 | * *Generate trajectories* of [SWE-agent](https://github.com/SWE-agent/SWE-agent) solving those task instances. 29 | * *Train local LMs* on these trajectories to improve their software engineering capabilities ([SWE-agent-LM-32B](https://huggingface.co/SWE-bench/SWE-agent-LM-32B)). 30 | 31 | ## 🚀 Get Started 32 | Check out the [documentation](https://swesmith.com/getting_started/) for a complete guide on how to use SWE-smith, including how to 33 | * [Install](https://swesmith.com/getting_started/installation/) the repository locally or as a PyPI package. 34 | * [Create Task Instances](https://swesmith.com/guides/create_instances/) for any Python repository with SWE-smith. 35 | * Use your task instance to [train your own SWE-agents](https://swesmith.com/guides/train_swe_agent/) 36 | 37 | ## 🏎️ Quick Start 38 | Install the repo: 39 | ```bash 40 | git clone https://github.com/SWE-bench/SWE-smith 41 | cd SWE-smith 42 | conda create -n smith python=3.10; 43 | conda activate smith; 44 | pip install -e . 45 | ``` 46 | 47 | Then, check out `scripts/cheatsheet.sh` for scripts to (1) create execution environments, (2) create task instances, and (3) train SWE-agents. 48 | 49 | > [!TIP] 50 | > SWE-smith requires Docker to create execution environments. SWE-smith was developed and tested on Ubuntu 22.04.4 LTS. 51 | > We do *not* plan on supporting Windows or MacOS. 52 | 53 | ## 💿 Resources 54 | In addition to this toolkit, we've also provided several artifacts on the [SWE-bench HuggingFace](https://huggingface.co/SWE-bench), including: 55 | * [50k Python Task Instances](https://huggingface.co/datasets/SWE-bench/SWE-smith), created using SWE-smith. 
56 | * [SWE-agent-LM-32B](https://huggingface.co/SWE-bench/SWE-agent-LM-32B), trained using SWE-smith. Achieves **41.6%** pass@1 on [SWE-bench Verified](https://huggingface.co/datasets/SWE-bench/SWE-bench_Verified)! 57 | * [5k Trajectories](https://huggingface.co/datasets/SWE-bench/SWE-smith-trajectories) that SWE-agent-LM-32B was trained on. 58 | 59 | And there's more coming! 60 | 61 | ## 💫 Contributions 62 | Excited about SWE-smith? We're actively working on several follow ups, and love meaningful collaborations! What we're thinking about... 63 | * Make SWE-smith work for non-Python languages 64 | * New bug generation techniques 65 | * Train SWE-agents with more trajectories and new methods 66 | 67 | Check out the [Contributing Guide](CONTRIBUTING.md) for more. 68 | 69 | Contact Person: [John Yang](https://john-b-yang.github.io/), [Kilian Lieret](https://lieret.net) 70 | (Email: [johnby@stanford.edu](mailto:johnby@stanford.edu)) 71 | 72 | ## 🪪 License 73 | MIT. Check `LICENSE` for more information. 74 | 75 | ## ✍️ Citation 76 | 77 | ```bibtex 78 | @misc{yang2025swesmith, 79 | title={SWE-smith: Scaling Data for Software Engineering Agents}, 80 | author={John Yang and Kilian Lieret and Carlos E. Jimenez and Alexander Wettig and Kabir Khandpur and Yanzhe Zhang and Binyuan Hui and Ofir Press and Ludwig Schmidt and Diyi Yang}, 81 | year={2025}, 82 | eprint={2504.21798}, 83 | archivePrefix={arXiv}, 84 | primaryClass={cs.SE}, 85 | url={https://arxiv.org/abs/2504.21798}, 86 | } 87 | ``` 88 | 89 | ## 📕 Related Works 90 | 
91 | SWE-bench 92 |    93 | SWE-agent 94 |    95 | SWE-ReX 96 |    97 | sb-cli 98 |
99 | -------------------------------------------------------------------------------- /agent/_gen_trajs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sweagent run-batch --num_workers 20 \ 4 | --instances.deployment.docker_args=--memory=10g \ 5 | --config agent/swesmith_gen_claude.yaml \ 6 | --instances.path /home/john-b-yang/swe-smith/logs/experiments/exp8__ig_orig.json \ 7 | --output_dir trajectories/john-b-yang/swesmith_gen__claude-3.5__t-0.00_p-1.00__c.2.00__exp8__ig_orig_run2 \ 8 | --random_delay_multiplier=1 \ 9 | --agent.model.temperature 0.0 10 | 11 | # Remember to set CLAUDE_API_KEY_ROTATION=key1:::key2:::key3 12 | -------------------------------------------------------------------------------- /agent/_infer_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sweagent run-batch --config agent/swesmith_infer.yaml \ 4 | --instances.deployment.docker_args=--memory=10g \ 5 | --agent.model.api_base https://svt25nwvnpipwz.r20.modal.host/v1 \ 6 | --random_delay_multiplier=1 \ 7 | --output_dir trajectories/john-b-yang/swesmith.ablation.bug.lm_reimplement_500 8 | -------------------------------------------------------------------------------- /agent/_traj_mgr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m swesmith.train.traj_mgr.clean_trajs trajectories/ 4 | 5 | python -m swesmith.train.traj_mgr.combine_trajs 6 | 7 | python -m swesmith.train.traj_mgr.transform_to_ft -------------------------------------------------------------------------------- /agent/swesmith_gen_claude.yaml: -------------------------------------------------------------------------------- 1 | # Heavily based on https://github.com/SWE-agent/SWE-agent/blob/main/config/anthropic_filemap.yaml 2 | instances: 3 | type: swesmith 4 | shuffle: true 5 | agent: 6 | templates: 7 | system_template: |- 8 | You are a 
helpful assistant that can interact with a computer to solve tasks. 9 | instance_template: |- 10 | 11 | {{working_dir}} 12 | 13 | I've uploaded a python code repository in the directory {{working_dir}}. Consider the following PR description: 14 | 15 | 16 | {{problem_statement}} 17 | 18 | 19 | Can you help me implement the necessary changes to the repository so that the requirements specified in the are met? 20 | I've already taken care of all changes to any of the test files described in the . This means you DON'T have to modify the testing logic or any of the tests in any way! 21 | Your task is to make the minimal changes to non-tests files in the {{working_dir}} directory to ensure the is satisfied. 22 | Follow these steps to resolve the issue: 23 | 1. As a first step, it might be a good idea to find and read code relevant to the 24 | 2. Create a script to reproduce the error and execute it with `python ` using the bash tool, to confirm the error 25 | 3. Edit the source code of the repo to resolve the issue 26 | 4. Rerun your reproduce script and confirm that the error is fixed! 27 | 5. Think about edgecases and make sure your fix handles them as well 28 | Your thinking should be thorough and so it's fine if it's very long. 29 | next_step_template: |- 30 | OBSERVATION: 31 | {{observation}} 32 | next_step_no_output_template: |- 33 | Your command ran successfully and did not produce any output. 34 | tools: 35 | bundles: 36 | - path: tools/registry 37 | - path: tools/edit_anthropic 38 | - path: tools/review_on_submit_m 39 | registry_variables: 40 | USE_FILEMAP: 'true' 41 | SUBMIT_REVIEW_MESSAGES: 42 | - | 43 | Thank you for your work on this issue. Please carefully follow the steps below to help review your changes. 44 | 45 | 1. If you made any changes to your code after running the reproduction script, please run the reproduction script again. 46 | If the reproduction script is failing, please revisit your changes and make sure they are correct. 
47 | If you have already removed your reproduction script, please ignore this step. 48 | 2. Remove your reproduction script (if you haven't done so already). 49 | 3. If you have modified any TEST files, please revert them to the state they had before you started fixing the issue. 50 | You can do this with `git checkout -- /path/to/test/file.py`. Use below to find the files you need to revert. 51 | 4. Run the submit command again to confirm. 52 | 53 | Here is a list of all of your changes: 54 | 55 | 56 | {{diff}} 57 | 58 | enable_bash_tool: true 59 | parse_function: 60 | type: function_calling 61 | execution_timeout: 300 62 | history_processors: 63 | - type: cache_control 64 | last_n_messages: 2 65 | model: 66 | # name: claude-3-5-sonnet-20241022 67 | name: claude-3-7-sonnet-20250219 68 | max_output_tokens: 64000 69 | api_key: $CLAUDE_API_KEY_ROTATION 70 | per_instance_cost_limit: 2. 71 | per_instance_call_limit: 75 72 | # delay: 1 73 | -------------------------------------------------------------------------------- /agent/swesmith_gen_gpt.yaml: -------------------------------------------------------------------------------- 1 | # Heavily based on https://github.com/SWE-agent/SWE-agent/blob/main/config/anthropic_filemap.yaml 2 | instances: 3 | type: swesmith 4 | shuffle: true 5 | agent: 6 | templates: 7 | system_template: |- 8 | You are a helpful assistant that can interact with a computer to solve tasks. 9 | instance_template: |- 10 | 11 | {{working_dir}} 12 | 13 | I've uploaded a python code repository in the directory {{working_dir}}. Consider the following PR description: 14 | 15 | 16 | {{problem_statement}} 17 | 18 | 19 | Can you help me implement the necessary changes to the repository so that the requirements specified in the are met? 20 | I've already taken care of all changes to any of the test files described in the . This means you DON'T have to modify the testing logic or any of the tests in any way! 
21 | Your task is to make the minimal changes to non-test files in the {{working_dir}} directory to ensure the <pr_description> is satisfied. 22 | Follow these steps to resolve the issue: 23 | 1. As a first step, it might be a good idea to find and read code relevant to the <pr_description> 24 | 2. Create a script to reproduce the error and execute it with `python <filename.py>` using the bash tool, to confirm the error 25 | 3. Edit the source code of the repo to resolve the issue 26 | 4. Rerun your reproduce script and confirm that the error is fixed! 27 | 5. Think about edge cases and make sure your fix handles them as well 28 | Your thinking should be thorough and so it's fine if it's very long. 29 | next_step_template: |- 30 | OBSERVATION: 31 | {{observation}} 32 | next_step_no_output_template: |- 33 | Your command ran successfully and did not produce any output. 34 | tools: 35 | execution_timeout: 300 36 | bundles: 37 | - path: tools/registry 38 | - path: tools/edit_anthropic 39 | - path: tools/submit 40 | env_variables: 41 | USE_FILEMAP: 'true' 42 | enable_bash_tool: true 43 | parse_function: 44 | type: function_calling 45 | model: 46 | name: gpt-4o-2024-08-06 47 | per_instance_cost_limit: 2. 48 | per_instance_call_limit: 75 49 | # delay: 1 50 | -------------------------------------------------------------------------------- /agent/swesmith_install_repo.yaml: -------------------------------------------------------------------------------- 1 | agent: 2 | templates: 3 | system_template: |- 4 | You are a helpful assistant that can interact with a computer to solve tasks. 5 | instance_template: |- 6 | <uploaded_files> 7 | {{working_dir}} 8 | </uploaded_files> 9 | I've uploaded a python code repository in the directory {{working_dir}}. 10 | 11 | Can you please help me install this repository? 12 | Your goal should be to configure the repository's development environment such that existing tests pass. 13 | You are currently in the root directory of the repository, and nothing has been installed yet. 14 | You are in an Ubuntu 22.04 environment. 
15 | 16 | The repository is predominantly written in Python. Here are several tips for installing it: 17 | 1. A good place to start is to look for a `CONTRIBUTING.[md|rst]` file, which will often contain instructions on how to install the repository and any dependencies it may have. Occasionally, the `README.md` file may also contain installation instructions. 18 | 2. Usually, a repository may have `setup.py` or `pyproject.toml` files which can be used to install the package. `pip install -e .` is commonly used, although many packages will also require an additional specifier that installs development packages as well (e.g. `pip install -e .[dev]`). 19 | 3. To check whether the repository was installed successfully, run tests and see if they pass. You can usually find tests in a `tests/` or `test/` directory. You can run tests using `pytest` or `unittest`, depending on the framework used by the repository. 20 | 4. Sometimes, you will need to install additional packages, often listed in a `requirements.txt` or `environment.yml` file. Also, be mindful of Ubuntu system dependencies that may need to be installed via `apt-get` (e.g. `sudo apt-get install `). 21 | 22 | Once you are finished with installing the repository, run the `submit` command to submit your changes for review 23 | next_step_template: |- 24 | OBSERVATION: 25 | {{observation}} 26 | next_step_no_output_template: |- 27 | Your command ran successfully and did not produce any output. 28 | tools: 29 | bundles: 30 | - path: tools/registry 31 | - path: tools/edit_anthropic 32 | - path: tools/submit 33 | registry_variables: 34 | USE_FILEMAP: 'true' 35 | enable_bash_tool: true 36 | parse_function: 37 | type: function_calling 38 | execution_timeout: 300 39 | history_processors: 40 | - type: cache_control 41 | last_n_messages: 2 42 | model: 43 | name: claude-3-7-sonnet-20250219 44 | api_key: $CLAUDE_API_KEY_ROTATION 45 | per_instance_cost_limit: 2. 
46 | per_instance_call_limit: 150 47 | delay: 1 48 | -------------------------------------------------------------------------------- /configs/bug_gen/README.md: -------------------------------------------------------------------------------- 1 | # Writing Config. Files for Bug Generation 2 | 3 | To create bugs using `swesmith.bug_gen.llm.modify`, the script takes in a configuration file 4 | that allows one to (1) define what kind of bug(s) the LLM should generate, and (2) identify 5 | what functions to run this generation for. 6 | 7 | Here are the steps to create a config file for creating a specific kind of bug. 8 | 9 | 1. Create a `configs/bug_gen/*.yaml` file. Typically, the naming convention is `func_<bug_type>.yaml`. 10 | 2. Within the `.yaml` file, define the following prompts / fields: 11 | ```yaml 12 | name: <bug_type> 13 | criteria: reference to criteria in swesmith/bug_gen/llm/criteria.py 14 | parameters: any additional information you'd like to include + can be referenced in the prompts 15 | system: |- 16 | prompt 17 | demonstration: |- 18 | prompt 19 | instance: |- 20 | prompt 21 | ``` 22 | 3. (Optional) You can use one of the existing criteria, or create a new one in `swesmith/bug_gen/llm/criteria.py` 23 | * The purpose of defining a criteria is to only consider functions where it would be possible to introduce such a bug. 24 | * For example, if you write a prompt for off by one bugs, but the function doesn't have loops or list indexing, then it's likely the LLM cannot generate a reasonably effective and difficult bug. 25 | 26 | > A criteria function usually follows the below form: 27 | ```python 28 | def filter_<bug_type>(code_entity: CodeEntity) -> bool: 29 | """ 30 | `code_entity` is an object representing a function. It includes several 31 | pieces of information, most notably: 32 | * `src_code`: The raw string repr. of a function 33 | * `src_node`: An AST node representation of a function. 
34 | """ 35 | node = code_entity.src_node 36 | # Logic for checking whether a function has a property has typically been 37 | # enforced by checking node properties (of course, you're not limited to this) 38 | if satisfies_criteria: 39 | return True 40 | return False 41 | ``` 42 | 43 | Once you create the `.yaml` with a specified criteria, from this repo, run: 44 | ```bash 45 | python -m swesmith.bug_gen.llm.modify \ 46 | --repo datamade/usaddress \ 47 | --model openai/gpt-4o \ 48 | --entity_type func \ 49 | --prompt_config configs/bug_gen/func_<bug_type>.yml \ 50 | --n_workers 4 # 4 parallel queries to LM etc. 51 | ``` 52 | where `--repo` should point to one of the repositories [here](https://github.com/orgs/swesmith/repositories). (Note: should just be `<owner>/<repo>`, without the `.<commit>` suffix) 53 | -------------------------------------------------------------------------------- /configs/bug_gen/func_fun.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | name: func_fun 3 | criteria: all 4 | parameters: 5 | tips: 6 | system: |- 7 | You are a simulation of a tired, deadline-pressured developer who has just worked 14 consecutive hours. 8 | 9 | Your task was to improve the provided code. 10 | Despite your best intentions, your exhausted state causes you to introduce subtle, real-world bugs that would pass code review but cause issues in production. 11 | 12 | Rewrite a function such that it introduces a logical bug that will subtly break existing unit tests in a codebase. 13 | 14 | Here's how to proceed: 15 | 16 | 1. First understand what the code is trying to achieve 17 | 2. Consider how a well-intentioned but fatigued developer might misunderstand it 18 | 3. Implement changes based on that flawed understanding 19 | 4. Ensure the bug represents a genuine cognitive error, not a contrived modification 20 | 5. The code should look like a good-faith attempt at solving the problem 21 | 6. 
The bug should be something that could genuinely ship to production 22 | 23 | Tips about the bug-introducing task: 24 | 25 | - It should not cause compilation errors. 26 | - It should not be a syntax error. 27 | - It should be subtle and challenging to detect. 28 | - It should not modify the function signature. 29 | - It should not modify the documentation significantly. 30 | - For longer functions, if there is an opportunity to introduce multiple bugs, please do! 31 | - Please DO NOT INCLUDE COMMENTS IN THE CODE indicating the bug location or the bug itself. 32 | - Your code must be included in triple backticks. 33 | 34 | Your answer should be formatted as follows: 35 | 36 | Explanation: 37 | 38 | 39 | Bugged Code: 40 | ``` 41 | 42 | ``` 43 | demonstration: "" 44 | instance: |- 45 | 46 | {{src_code}} 47 | 48 | 49 | As a reminder, Please DO NOT INCLUDE ANY COMMENTS IN THE CODE OR POINT OUT THE BUG IN ANY WAY. 50 | 51 | OUTPUT: -------------------------------------------------------------------------------- /configs/bug_gen/lm_modify.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | name: lm_modify 3 | criteria: simple_complexity10 4 | parameters: 5 | bug_examples: 6 | - "Alter calculation order for incorrect results: Rearrange the sequence of operations in a calculation to subtly change the output (e.g., change (a + b) * c to a + (b * c))." 7 | - "Introduce subtle data transformation errors: Modify data processing logic, such as flipping a sign, truncating a value, or applying the wrong transformation function." 8 | - "Change variable assignments to alter computation state: Assign a wrong or outdated value to a variable that affects subsequent logic." 9 | - "Mishandle edge cases for specific inputs: Change handling logic to ignore or improperly handle boundary cases, like an empty array or a null input." 
10 | - "Modify logic in conditionals or loops: Adjust conditions or loop boundaries (e.g., replace <= with <) to change the control flow." 11 | - "Introduce off-by-one errors in indices or loop boundaries: Shift an index or iteration boundary by one, such as starting a loop at 1 instead of 0." 12 | - "Adjust default values or constants to affect behavior: Change a hardcoded value or default parameter that alters how the function behaves under normal use." 13 | - "Reorder operations while maintaining syntax: Rearrange steps in a process so the function produces incorrect intermediate results without breaking the code." 14 | - "Swallow exceptions or return defaults silently: Introduce logic that catches an error but doesn't log or handle it properly, leading to silent failures." 15 | tips: 16 | - "It should not cause compilation errors." 17 | - "It should not be a syntax error." 18 | - "It should be subtle and challenging to detect." 19 | - "It should not modify the function signature." 20 | - "It should not modify the documentation significantly." 21 | - "For longer functions, if there is an opportunity to introduce multiple bugs, please do!" 22 | - "Please DO NOT INCLUDE COMMENTS IN THE CODE indicating the bug location or the bug itself." 23 | system: |- 24 | You are a software developer doing chaos monkey testing. 25 | Your job is to rewrite a function such that it introduces a logical bug that will break existing unit test(s) in a codebase. 
26 | 27 | To this end, some kinds of bugs you might introduce include: 28 | {% for bug in (bug_examples | shuffle)[:3] %} 29 | - {{ bug -}} 30 | {% endfor %} 31 | 32 | Tips about the bug-introducing task: 33 | {% for tip in tips | shuffle %} 34 | - {{ tip -}} 35 | {% endfor %} 36 | 37 | Your answer should be formatted as follows: 38 | 39 | Explanation: 40 | 41 | 42 | Bugged Code: 43 | ``` 44 | 45 | ``` 46 | demonstration: "" 47 | instance: |- 48 | 49 | {{src_code}} 50 | 51 | 52 | As a reminder, Please DO NOT INCLUDE ANY COMMENTS IN THE CODE OR POINT OUT THE BUG IN ANY WAY. 53 | 54 | OUTPUT: -------------------------------------------------------------------------------- /configs/bug_gen/lm_rewrite.yml: -------------------------------------------------------------------------------- 1 | name: lm_rewrite 2 | system: |- 3 | You are a software developer and you have been asked to implement a function. 4 | 5 | You will be given the contents of an entire file, with one or more functions defined in it. 6 | Please implement the function(s) that are missing. 7 | Do NOT modify the function signature, including the function name, parameters, return types, or docstring if provided. 8 | Do NOT change any other code in the file. 9 | You should not use any external libraries. 10 | instance: |- 11 | Please implement the function `{func_signature}` in the following code: 12 | 13 | ``` 14 | {file_src_code} 15 | ``` 16 | 17 | Remember, you should not modify the function signature, including the function name, parameters, return types, or docstring if provided. 18 | Do NOT change any other code in the file. 19 | Format your output as: 20 | 21 | 22 | 23 | ``` 24 | {func_to_write} 25 | ``` -------------------------------------------------------------------------------- /configs/install_repo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . 
/opt/miniconda3/bin/activate 4 | conda create -n testbed python=3.10 -yq 5 | conda activate testbed 6 | pip install -e . 7 | pip install pytest 8 | -------------------------------------------------------------------------------- /configs/issue_gen/ig_tests.yaml: -------------------------------------------------------------------------------- 1 | system: |- 2 | You are a software engineer and you have been asked to give an issue report. 3 | 4 | You will be given the following input: 5 | 1. Test Source Code: The source code for a test in a GitHub repository that is currently failing. 6 | 2. Test Execution Output: The execution output of running the test. 7 | 8 | Given this input, please write a GitHub issue report. 9 | 10 | Guidelines: 11 | - Use a natural tone, as if reported by a developer. 12 | - DO NOT mention the test that failed. 13 | - Include information about how to reproduce the issue. You can use the test source code to write reproduction code. Use the test execution output to convey the expected behavior and what the actual current behavior is. 14 | demonstration: |- 15 | Here is an example of a well written GitHub issue. Mimic the style and information of this issue in your response. 16 | ----------------------------------- 17 | {demo} 18 | instance: |- 19 | Now, write a GitHub issue that conveys the problem reflected in the failing test. 20 | 21 | Remember, 22 | - DO NOT GIVE AWAY THE TEST THAT FAILED. 23 | - DO NOT SAY THAT EXISTING TEST(s) FAILED. 24 | - DO NOT SUGGEST RUNNING ANY TESTING COMMANDS (e.g., pytest). 25 | - Mimic the style and information of the issue text from the demonstration. 26 | - Keep the length of the issue text reasonable and similar to the demonstration. 27 | - Use the test source code to write reproduction code. 28 | - Use the test execution output to convey the expected behavior and what the actual current behavior is. 
29 | 30 | {input} 31 | 32 | **Issue Text** 33 | -------------------------------------------------------------------------------- /configs/issue_gen/ig_v1.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | n_instructions: 1 # number of instructions to generate 3 | repro_code_n: 1 # number of repo tests to include in prompt 4 | repro_code_rate: 0 # % of task instances to generate repro code for 5 | add_test_output: True # whether to include test output (from validation step) 6 | system: |- 7 | **Task:** 8 | Write a realistic GitHub issue for the following **patch (diff output)** that introduces a bug. The issue should: 9 | - Clearly describe the problem observed in the original (buggy) code. 10 | - Include relevant details like which function or part of the code is affected. 11 | - Explain expected vs. actual behavior. 12 | - Suggest possible causes without explicitly stating the correct fix. 13 | - Use a natural tone, as if reported by a developer. 14 | 15 | Additional Context: 16 | - The diff shows changes to a file, where - lines represent the original (working) code that was removed. 17 | - + lines represent the new (fixed) code that was added. 18 | - The bug existed in the removed (-) lines, and the fix is in the added (+) lines. 19 | - Focus on describing the issue in the removed lines, not explaining the new fix verbatim. 20 | demonstration: |- 21 | Here is an example of a well formed GitHub issue: 22 | 23 | **Issue Text** 24 | {{problem_statement}} 25 | instance: |- 26 | Now, write a GitHub issue for the following patch (diff output). 27 | 28 | Remember to: 29 | - Clearly describe the problem observed in the original (buggy) code. 30 | - Include some relevant details like which function or part of the code is affected. BUT, don't be too specific 31 | - DO NOT GIVE AWAY THE FIX! THE SOLUTION CODE SHOULD NEVER APPEAR IN YOUR RESPONSE. 32 | - DO NOT SAY THAT EXISTING TEST(s) FAILED. 
33 | - DO NOT SUGGEST RUNNING ANY TESTING COMMANDS (e.g., pytest). 34 | - Mimic the style of the issue text from the demonstration. 35 | - Keep the length of the issue text reasonable and similar to the demonstration. 36 | 37 | **Bug Patch (Diff Output):** 38 | {{patch}} 39 | 40 | **Issue Text** 41 | -------------------------------------------------------------------------------- /configs/issue_gen/ig_v2.yaml: -------------------------------------------------------------------------------- 1 | settings: {} 2 | system: |- 3 | You are a software engineer helping to create a realistic dataset of synthetic GitHub issues. 4 | 5 | You will be given the following input: 6 | 7 | 1. Demonstration: A realistic GitHub issue to mimic (included in the tag). 8 | 2. Patch: A git diff output/pull request changes that introduces a bug (included in the tag). 9 | 3. Test output: The output of running the tests after the patch is applied (included in the tag). 10 | 4. Test source code: Source code for one or more tests that failed (included in the tag). 11 | 12 | Output: A realistic GitHub issue for the patch. 13 | 14 | Guidelines: 15 | 16 | - Mimic the style and structure of the demonstration issues. 17 | If the demonstration issues are not well structured, your output should also be not well structured. 18 | If the demonstrations use improper or no markdown, your output should also use improper or no markdown. 19 | If the demonstrations are short/long, your output should also be short/long (if possible). 20 | If the demonstrations include human "flavor text" or "fluff", your output should also include human "flavor text" or "fluff". 21 | Do this even if it conflicts with your default behavior of trying to be extremely concise and helpful. 22 | - DO NOT explain the fix/what caused the bug itself, focus on how to reproduce the issue it introduces 23 | - Do not mention pytest or what exact test failed. Instead, generate a realistic issue. 
24 | - If possible, include information about how to reproduce the issue. An ideal reproduction script should raise an error 25 | or print an unexpected output together with the expected output. 26 | However, still include this information in a style very similar to the demonstration issues. 27 | demonstration: |- 28 | Here are a few realistic GitHub issues that you can mimic. 29 | 30 | {% for problem_statement in demo_problem_statements[:2] %} 31 | 32 | {{problem_statement}} 33 | 34 | {% endfor %} 35 | instance: |- 36 | Now, write a GitHub issue for the following patch (diff output). 37 | 38 | 39 | - DO NOT GIVE AWAY THE FIX! THE SOLUTION CODE SHOULD NEVER APPEAR IN YOUR RESPONSE. 40 | - DO NOT SAY THAT EXISTING TEST(s) FAILED. 41 | - DO NOT SUGGEST RUNNING ANY TESTING COMMANDS (e.g., pytest). 42 | - Mimic the style and information of the issue text from the demonstration. 43 | - Keep the length of the issue text reasonable and similar to the demonstration. 44 | 45 | 46 | 47 | {{patch}} 48 | 49 | 50 | 51 | {{test_output}} 52 | 53 | 54 | 55 | {% for test in test_funcs[:5] %} 56 | {{test}} 57 | {% endfor %} 58 | 59 | 60 | **Issue Text** 61 | -------------------------------------------------------------------------------- /configs/train/dpo_qwen_32b.yml: -------------------------------------------------------------------------------- 1 | exp_name: qwen2p5-coder-32b-dpo-lr1e-5-warmup5___ft_xml_all_250413 2 | output_dir: /llm-weights/final/${exp_name} 3 | 4 | # Model Arguments 5 | model: 6 | _component_: torchtune.models.qwen2_5.qwen2_5_32b_instruct 7 | 8 | tokenizer: 9 | _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer 10 | path: /llm-weights/Qwen/Qwen2.5-Coder-32B-Instruct/vocab.json 11 | merges_file: /llm-weights/Qwen/Qwen2.5-Coder-32B-Instruct/merges.txt 12 | max_seq_len: 32768 13 | 14 | checkpointer: 15 | _component_: torchtune.training.FullModelHFCheckpointer 16 | checkpoint_dir: /llm-weights/Qwen/Qwen2.5-Coder-32B-Instruct 17 | checkpoint_files: [ 18 | 
model-00001-of-00014.safetensors, 19 | model-00002-of-00014.safetensors, 20 | model-00003-of-00014.safetensors, 21 | model-00004-of-00014.safetensors, 22 | model-00005-of-00014.safetensors, 23 | model-00006-of-00014.safetensors, 24 | model-00007-of-00014.safetensors, 25 | model-00008-of-00014.safetensors, 26 | model-00009-of-00014.safetensors, 27 | model-00010-of-00014.safetensors, 28 | model-00011-of-00014.safetensors, 29 | model-00012-of-00014.safetensors, 30 | model-00013-of-00014.safetensors, 31 | model-00014-of-00014.safetensors, 32 | ] 33 | recipe_checkpoint: null 34 | output_dir: ${output_dir} 35 | model_type: QWEN2 36 | safe_serialization: True 37 | resume_from_checkpoint: False 38 | 39 | # Dataset and Sampler 40 | dataset: 41 | _component_: torchtune.datasets.preference_dataset 42 | source: json 43 | data_files: /datasets/trajectories_dpo/dpo_250413.json 44 | conversation_column: messages 45 | conversation_style: openai 46 | new_system_prompt: null 47 | packed: False # True increases speed 48 | column_map: 49 | chosen: chosen_conversations 50 | rejected: rejected_conversations 51 | train_on_input: False 52 | split: train 53 | seed: 42 54 | shuffle: True 55 | batch_size: 1 56 | 57 | # Optimizer and Scheduler 58 | optimizer: 59 | _component_: torch.optim.AdamW 60 | fused: True 61 | weight_decay: 0.01 62 | lr: 1e-5 63 | lr_scheduler: 64 | _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup 65 | num_warmup_steps: 5 66 | optimizer_in_bwd: True 67 | loss: 68 | _component_: torchtune.rlhf.loss.DPOLoss 69 | beta: 0.05 70 | label_smoothing: 0 71 | 72 | # Training 73 | epochs: 3 74 | max_steps_per_epoch: null 75 | gradient_accumulation_steps: 1 # Use to increase virtual batch size 76 | compile: False # pytorch compile, set to true for better perf/memory 77 | 78 | # Logging 79 | metric_logger: 80 | _component_: torchtune.training.metric_logging.WandBLogger 81 | project: devrl-sft 82 | group: ${exp_name} 83 | job_type: full_dpo_distributed 84 
| log_every_n_steps: 1 85 | log_peak_memory_stats: True 86 | 87 | # Environment 88 | device: cuda 89 | dtype: bf16 90 | enable_activation_checkpointing: True # True reduces memory 91 | enable_activation_offloading: False # True reduces memory 92 | 93 | # Show case the usage of pytorch profiler 94 | # Set enabled to False as it's only needed for debugging training 95 | profiler: 96 | _component_: torchtune.training.setup_torch_profiler 97 | 98 | enabled: False 99 | 100 | #Output directory of trace artifacts 101 | output_dir: ${output_dir}/profiling_outputs 102 | 103 | #`torch.profiler.ProfilerActivity` types to trace 104 | cpu: True 105 | cuda: True 106 | 107 | #trace options passed to `torch.profiler.profile` 108 | profile_memory: False 109 | with_stack: False 110 | record_shapes: True 111 | with_flops: False 112 | 113 | # `torch.profiler.schedule` options: 114 | # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat 115 | wait_steps: 5 116 | warmup_steps: 5 117 | active_steps: 2 118 | num_cycles: 1 -------------------------------------------------------------------------------- /configs/train/dpo_qwen_7b.yml: -------------------------------------------------------------------------------- 1 | exp_name: qwen2p5-coder-7b-dpo-lr1e-5-warmup5___ft_xml_all_250414 2 | output_dir: /llm-weights/dpo/${exp_name} 3 | 4 | # Model Arguments 5 | model: 6 | _component_: torchtune.models.qwen2_5.qwen2_5_7b_instruct 7 | 8 | tokenizer: 9 | _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer 10 | path: /llm-weights/Qwen/Qwen2.5-Coder-7B-Instruct/vocab.json 11 | merges_file: /llm-weights/Qwen/Qwen2.5-Coder-7B-Instruct/merges.txt 12 | max_seq_len: 32768 13 | 14 | checkpointer: 15 | _component_: torchtune.training.FullModelHFCheckpointer 16 | checkpoint_dir: /llm-weights/outputs/qwen2p5-coder-7b-full-lr1e-4-warmup5___all_250331.jsonl/epoch_4 17 | checkpoint_files: [ 18 | ft-model-00001-of-00004.safetensors, 19 | 
ft-model-00002-of-00004.safetensors, 20 | ft-model-00003-of-00004.safetensors, 21 | ft-model-00004-of-00004.safetensors, 22 | ] 23 | recipe_checkpoint: null 24 | output_dir: ${output_dir} 25 | model_type: QWEN2 26 | safe_serialization: True 27 | resume_from_checkpoint: False 28 | 29 | # The ref_checkpointer should always point to the original weights. 30 | ref_checkpointer: 31 | _component_: torchtune.training.FullModelHFCheckpointer 32 | checkpoint_dir: /llm-weights/Qwen/Qwen2.5-Coder-7B-Instruct 33 | checkpoint_files: [ 34 | model-00001-of-00004.safetensors, 35 | model-00002-of-00004.safetensors, 36 | model-00003-of-00004.safetensors, 37 | model-00004-of-00004.safetensors, 38 | ] 39 | recipe_checkpoint: null 40 | output_dir: ${output_dir} 41 | model_type: QWEN2 42 | safe_serialization: True 43 | 44 | # Dataset and Sampler 45 | dataset: 46 | _component_: torchtune.datasets.preference_dataset 47 | source: json 48 | data_files: /datasets/trajectories_dpo/swesmith_dpo_250414.json 49 | column_map: 50 | chosen: chosen_conversations 51 | rejected: rejected_conversations 52 | train_on_input: False 53 | seed: 42 54 | shuffle: True 55 | batch_size: 1 56 | 57 | # Optimizer and Scheduler 58 | optimizer: 59 | _component_: torch.optim.AdamW 60 | fused: True 61 | weight_decay: 0.05 62 | lr: 2e-5 63 | lr_scheduler: 64 | _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup 65 | num_warmup_steps: 5 66 | optimizer_in_bwd: False 67 | loss: 68 | _component_: torchtune.rlhf.loss.DPOLoss 69 | beta: 0.05 70 | label_smoothing: 0 71 | 72 | # Training 73 | epochs: 2 74 | max_steps_per_epoch: null 75 | gradient_accumulation_steps: 4 # Use to increase effective batch size 76 | compile: False # torch.compile the model + loss, True increases speed + decreases memory 77 | 78 | # Logging 79 | metric_logger: 80 | _component_: torchtune.training.metric_logging.WandBLogger 81 | project: devrl-sft 82 | group: ${exp_name} 83 | job_type: full_dpo_distributed 84 | 
log_every_n_steps: 1 85 | log_peak_memory_stats: True 86 | 87 | # Environment 88 | device: cuda 89 | dtype: bf16 90 | enable_activation_checkpointing: True # True reduces memory 91 | enable_activation_offloading: False # True reduces memory 92 | 93 | # Show case the usage of pytorch profiler 94 | # Set enabled to False as it's only needed for debugging training 95 | profiler: 96 | _component_: torchtune.training.setup_torch_profiler 97 | 98 | enabled: False 99 | 100 | #Output directory of trace artifacts 101 | output_dir: ${output_dir}/profiling_outputs 102 | 103 | #`torch.profiler.ProfilerActivity` types to trace 104 | cpu: True 105 | cuda: True 106 | 107 | #trace options passed to `torch.profiler.profile` 108 | profile_memory: False 109 | with_stack: False 110 | record_shapes: True 111 | with_flops: False 112 | 113 | # `torch.profiler.schedule` options: 114 | # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat 115 | wait_steps: 5 116 | warmup_steps: 5 117 | active_steps: 2 118 | num_cycles: 1 -------------------------------------------------------------------------------- /configs/train/full_ft_qwen_32b.yml: -------------------------------------------------------------------------------- 1 | # Config for multi-device full finetuning in full_finetune_distributed.py 2 | # using a Qwen2.5 7B model 3 | # 4 | # This config assumes that you've run the following command before launching 5 | # this run: 6 | # tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2_5-7B-Instruct 7 | # 8 | # To launch on 2 devices, run the following command from root: 9 | # tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/7B_full 10 | # 11 | # You can add specific overrides through the command line. 
For example 12 | # to override the checkpointer directory while launching training 13 | # you can run: 14 | # tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/7B_full checkpointer.checkpoint_dir= 15 | # 16 | # This config works best when the model is being fine-tuned on 2+ GPUs. 17 | # Single device full finetuning requires more memory optimizations. It's 18 | # best to use 7B_full_single_device.yaml for those cases 19 | 20 | exp_name: qwen2p5-coder-32b-full-lr5e-5-warmup5___ft_xml_all_250413 21 | output_dir: /llm-weights/final/${exp_name} 22 | # Model Arguments 23 | model: 24 | _component_: torchtune.models.qwen2_5.qwen2_5_32b_instruct 25 | 26 | tokenizer: 27 | _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer 28 | path: /llm-weights/Qwen/Qwen2.5-Coder-32B-Instruct/vocab.json 29 | merges_file: /llm-weights/Qwen/Qwen2.5-Coder-32B-Instruct/merges.txt 30 | max_seq_len: 32768 31 | 32 | checkpointer: 33 | _component_: torchtune.training.FullModelHFCheckpointer 34 | checkpoint_dir: /llm-weights/Qwen/Qwen2.5-Coder-32B-Instruct 35 | checkpoint_files: [ 36 | model-00001-of-00014.safetensors, 37 | model-00002-of-00014.safetensors, 38 | model-00003-of-00014.safetensors, 39 | model-00004-of-00014.safetensors, 40 | model-00005-of-00014.safetensors, 41 | model-00006-of-00014.safetensors, 42 | model-00007-of-00014.safetensors, 43 | model-00008-of-00014.safetensors, 44 | model-00009-of-00014.safetensors, 45 | model-00010-of-00014.safetensors, 46 | model-00011-of-00014.safetensors, 47 | model-00012-of-00014.safetensors, 48 | model-00013-of-00014.safetensors, 49 | model-00014-of-00014.safetensors, 50 | ] 51 | recipe_checkpoint: null 52 | output_dir: ${output_dir} 53 | model_type: QWEN2 54 | safe_serialization: True 55 | resume_from_checkpoint: False 56 | 57 | # Dataset and Sampler 58 | dataset: 59 | _component_: torchtune.datasets.chat_dataset 60 | source: json 61 | data_files: /datasets/trajectories_sft/ft_xml_all_250413.jsonl 62 | split: 
train 63 | conversation_column: messages 64 | conversation_style: openai 65 | train_on_input: False 66 | new_system_prompt: null 67 | packed: False # True increases speed 68 | seed: 42 69 | shuffle: True 70 | batch_size: 1 71 | 72 | # Optimizer and Scheduler 73 | optimizer: 74 | _component_: torch.optim.AdamW 75 | fused: True 76 | weight_decay: 0.01 77 | lr: 5e-5 78 | lr_scheduler: 79 | _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup 80 | num_warmup_steps: 5 81 | optimizer_in_bwd: True 82 | loss: 83 | _component_: torchtune.modules.loss.CEWithChunkedOutputLoss 84 | 85 | # Training 86 | epochs: 3 87 | max_steps_per_epoch: null 88 | gradient_accumulation_steps: 1 # Use to increase virtual batch size 89 | compile: True # pytorch compile, set to true for better perf/memory 90 | 91 | # Logging 92 | metric_logger: 93 | _component_: torchtune.training.metric_logging.WandBLogger 94 | project: devrl-sft 95 | group: ${exp_name} 96 | job_type: full_finetune_distributed 97 | log_every_n_steps: 1 98 | log_peak_memory_stats: True 99 | 100 | # Environment 101 | device: cuda 102 | dtype: bf16 103 | enable_activation_checkpointing: True # True reduces memory 104 | enable_activation_offloading: False # True reduces memory 105 | # custom_sharded_layers: ['tok_embeddings'] # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed. 
106 | 107 | # Show case the usage of pytorch profiler 108 | # Set enabled to False as it's only needed for debugging training 109 | profiler: 110 | _component_: torchtune.training.setup_torch_profiler 111 | 112 | enabled: False 113 | 114 | #Output directory of trace artifacts 115 | output_dir: ${output_dir}/profiling_outputs 116 | 117 | #`torch.profiler.ProfilerActivity` types to trace 118 | cpu: True 119 | cuda: True 120 | 121 | #trace options passed to `torch.profiler.profile` 122 | profile_memory: False 123 | with_stack: False 124 | record_shapes: True 125 | with_flops: False 126 | 127 | # `torch.profiler.schedule` options: 128 | # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat 129 | wait_steps: 5 130 | warmup_steps: 5 131 | active_steps: 2 132 | num_cycles: 1 -------------------------------------------------------------------------------- /configs/train/full_ft_qwen_7b.yml: -------------------------------------------------------------------------------- 1 | exp_name: qwen2p5-coder-7b-full-lr5e-5-warmup5___ft_xml_all_250331 2 | output_dir: /llm-weights/outputs/${exp_name} 3 | 4 | # Model Arguments 5 | model: 6 | _component_: torchtune.models.qwen2_5.qwen2_5_7b_instruct 7 | 8 | tokenizer: 9 | _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer 10 | path: /llm-weights/Qwen/Qwen2.5-Coder-7B-Instruct/vocab.json 11 | merges_file: /llm-weights/Qwen/Qwen2.5-Coder-7B-Instruct/merges.txt 12 | max_seq_len: 32768 13 | 14 | checkpointer: 15 | _component_: torchtune.training.FullModelHFCheckpointer 16 | checkpoint_dir: /llm-weights/Qwen/Qwen2.5-Coder-7B-Instruct 17 | checkpoint_files: [ 18 | model-00001-of-00004.safetensors, 19 | model-00002-of-00004.safetensors, 20 | model-00003-of-00004.safetensors, 21 | model-00004-of-00004.safetensors, 22 | ] 23 | recipe_checkpoint: null 24 | output_dir: ${output_dir} 25 | model_type: QWEN2 26 | safe_serialization: True 27 | resume_from_checkpoint: False 28 | 29 | # Dataset and Sampler 30 | 
dataset: 31 | _component_: torchtune.datasets.chat_dataset 32 | source: json 33 | data_files: /datasets/trajectories_sft/ft_xml_all_250331.jsonl 34 | split: train 35 | conversation_column: messages 36 | conversation_style: openai 37 | train_on_input: False 38 | new_system_prompt: null 39 | packed: False # True increases speed 40 | seed: 42 41 | shuffle: True 42 | batch_size: 1 43 | 44 | # Optimizer and Scheduler 45 | optimizer: 46 | _component_: torch.optim.AdamW 47 | fused: True 48 | weight_decay: 0.01 49 | lr: 5e-5 50 | lr_scheduler: 51 | _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup 52 | num_warmup_steps: 5 53 | optimizer_in_bwd: False 54 | loss: 55 | _component_: torchtune.modules.loss.CEWithChunkedOutputLoss 56 | 57 | # Training 58 | epochs: 3 59 | max_steps_per_epoch: null 60 | gradient_accumulation_steps: 4 # Use to increase virtual batch size 61 | compile: True # pytorch compile, set to true for better perf/memory 62 | 63 | # Logging 64 | metric_logger: 65 | _component_: torchtune.training.metric_logging.WandBLogger 66 | project: devrl-sft 67 | group: ${exp_name} 68 | job_type: full_finetune_distributed 69 | log_every_n_steps: 1 70 | log_peak_memory_stats: True 71 | 72 | # Environment 73 | device: cuda 74 | dtype: bf16 75 | enable_activation_checkpointing: True # True reduces memory 76 | enable_activation_offloading: False # True reduces memory 77 | # custom_sharded_layers: ['tok_embeddings'] # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed. 
78 | 79 | # Show case the usage of pytorch profiler 80 | # Set enabled to False as it's only needed for debugging training 81 | profiler: 82 | _component_: torchtune.training.setup_torch_profiler 83 | 84 | enabled: False 85 | 86 | #Output directory of trace artifacts 87 | output_dir: ${output_dir}/profiling_outputs 88 | 89 | #`torch.profiler.ProfilerActivity` types to trace 90 | cpu: True 91 | cuda: True 92 | 93 | #trace options passed to `torch.profiler.profile` 94 | profile_memory: False 95 | with_stack: False 96 | record_shapes: True 97 | with_flops: False 98 | 99 | # `torch.profiler.schedule` options: 100 | # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat 101 | wait_steps: 5 102 | warmup_steps: 5 103 | active_steps: 2 104 | num_cycles: 1 -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | swesmith.com 2 | -------------------------------------------------------------------------------- /docs/assets/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/docs/assets/banner.png -------------------------------------------------------------------------------- /docs/assets/bug_gen_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/docs/assets/bug_gen_overview.png -------------------------------------------------------------------------------- /docs/assets/combine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/docs/assets/combine.png 
-------------------------------------------------------------------------------- /docs/assets/home/collection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/docs/assets/home/collection.png -------------------------------------------------------------------------------- /docs/assets/home/leaderboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/docs/assets/home/leaderboard.png -------------------------------------------------------------------------------- /docs/assets/home/swesmith.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/docs/assets/home/swesmith.png -------------------------------------------------------------------------------- /docs/assets/lm_generate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/docs/assets/lm_generate.png -------------------------------------------------------------------------------- /docs/assets/overview-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/docs/assets/overview-light.png -------------------------------------------------------------------------------- /docs/assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/docs/assets/overview.png -------------------------------------------------------------------------------- 
/docs/assets/paper.pdf.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Redirecting... 6 | 7 | 8 |

If you are not redirected automatically, follow this link.

9 | 10 | -------------------------------------------------------------------------------- /docs/assets/pr_mirror.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/docs/assets/pr_mirror.png -------------------------------------------------------------------------------- /docs/assets/procedural.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/docs/assets/procedural.png -------------------------------------------------------------------------------- /docs/assets/sbcli_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/assets/swebench_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/docs/assets/swebench_logo.png -------------------------------------------------------------------------------- /docs/assets/swesmith_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/docs/assets/swesmith_logo.png -------------------------------------------------------------------------------- /docs/css/TiltNeon.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/docs/css/TiltNeon.ttf -------------------------------------------------------------------------------- /docs/css/bubbles.css: -------------------------------------------------------------------------------- 1 | 
/* Floating bubbles styles */ 2 | .floating-bubbles { 3 | position: fixed; 4 | bottom: 20px; 5 | right: 20px; 6 | display: flex; 7 | flex-direction: column; 8 | gap: 10px; 9 | z-index: 1000; 10 | } 11 | 12 | .floating-bubbles-title { 13 | position: absolute; 14 | top: -30px; 15 | right: 0; 16 | font-size: 12px; 17 | color: #777; 18 | text-align: right; 19 | font-weight: bold; 20 | opacity: 0; 21 | visibility: hidden; 22 | transition: opacity 0.3s ease, visibility 0.3s ease; 23 | white-space: nowrap; 24 | } 25 | 26 | .floating-bubbles:hover .floating-bubbles-title { 27 | opacity: 1; 28 | visibility: visible; 29 | } 30 | 31 | .bubble { 32 | width: 40px; 33 | height: 40px; 34 | display: flex; 35 | justify-content: center; 36 | align-items: center; 37 | position: relative; 38 | transition: transform 0.3s ease; 39 | } 40 | 41 | .bubble:hover { 42 | transform: scale(1.1); 43 | } 44 | 45 | .bubble img { 46 | width: 40px; 47 | height: 40px; 48 | } 49 | 50 | .bubble-tooltip { 51 | position: absolute; 52 | right: 60px; 53 | background-color: #333; 54 | color: white; 55 | padding: 5px 10px; 56 | border-radius: 4px; 57 | font-size: 14px; 58 | white-space: nowrap; 59 | opacity: 0; 60 | visibility: hidden; 61 | transition: opacity 0.3s ease, visibility 0.3s ease; 62 | } 63 | 64 | .bubble:hover .bubble-tooltip { 65 | opacity: 1; 66 | visibility: visible; 67 | } 68 | 69 | .floating-bubbles:hover .bubble-tooltip { 70 | opacity: 1; 71 | visibility: visible; 72 | } 73 | 74 | /* Hide on mobile */ 75 | @media (max-width: 768px) { 76 | .floating-bubbles { 77 | display: none; 78 | } 79 | } -------------------------------------------------------------------------------- /docs/css/carousel.css: -------------------------------------------------------------------------------- 1 | .carousel { 2 | position: relative; 3 | max-width: 100%; 4 | overflow: hidden; 5 | border-radius: 0.5rem; 6 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); 7 | background: #fff; 8 | margin-bottom: 1rem; 9 | } 10 | 11 | 
.slides { 12 | display: flex; 13 | transition: transform 0.5s ease-in-out; 14 | } 15 | 16 | .slide { 17 | min-width: 100%; 18 | position: relative; 19 | } 20 | 21 | .slide img { 22 | width: 100%; 23 | display: block; 24 | } 25 | 26 | .caption { 27 | position: absolute; 28 | bottom: 1rem; 29 | left: 1rem; 30 | background: rgba(0, 0, 0, 0.6); 31 | color: #fff; 32 | padding: 0.5rem 1rem; 33 | border-radius: 0.25rem; 34 | } 35 | 36 | .nav-buttons { 37 | position: absolute; 38 | top: 50%; 39 | width: 100%; 40 | display: flex; 41 | justify-content: space-between; 42 | transform: translateY(-50%); 43 | } 44 | 45 | .nav-buttons button { 46 | background: rgba(0, 0, 0, 0.4); 47 | color: #fff; 48 | border: none; 49 | width: 2em; 50 | height: 2em; 51 | font-size: 1.2rem; 52 | cursor: pointer; 53 | border-radius: 50%; 54 | transition: background 0.2s ease; 55 | display: flex; 56 | align-items: center; 57 | justify-content: center; 58 | } 59 | 60 | .nav-buttons button:hover { 61 | background: rgba(0, 0, 0, 0.7); 62 | } -------------------------------------------------------------------------------- /docs/css/custom.css: -------------------------------------------------------------------------------- 1 | [data-md-color-scheme="default"] { 2 | --md-default-bg-color: #fff7ec; 3 | --md-primary-fg-color: #D49017; 4 | --md-typeset-a-color: #006caa; 5 | --md-code-bg-color: #e7e7e7; 6 | } 7 | 8 | [data-md-color-scheme="slate"] { 9 | --md-primary-fg-color: #D49017; 10 | --md-default-fg-color: #fff7ec; 11 | --md-default-bg-color: #111111; 12 | } 13 | 14 | .theme-img--light, 15 | .theme-img--dark { 16 | display: none; 17 | } 18 | 19 | body[data-md-color-scheme="default"] .theme-img--light { display: block; } 20 | body[data-md-color-scheme="slate"] .theme-img--dark { display: block; } 21 | 22 | .clickable-banner { 23 | color: #000000; 24 | } 25 | 26 | .md-main__inner.md-grid, 27 | .md-grid { 28 | max-width: 64rem; 29 | } 30 | 31 | @media screen and (min-width: 1220px) { 32 | 
.md-main__inner.md-grid, 33 | .md-grid { 34 | max-width: 64rem; 35 | } 36 | } 37 | 38 | .md-typeset h1, 39 | .md-typeset h2, 40 | .md-typeset h3 { 41 | font-weight: 400; 42 | color: var( 43 | --md-primary-fg-color-dark 44 | ); /* this actually works for both light and dark themes */ 45 | } 46 | 47 | @font-face { 48 | font-family: "TiltNeon"; 49 | src: url("/assets/TiltNeon.ttf") format('truetype'); 50 | } 51 | 52 | :root { 53 | --md-text-font: "Agbalumo"; 54 | } -------------------------------------------------------------------------------- /docs/css/home.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --bg-color: #1e1e1e; 3 | --nav-color: #2a2a2a; 4 | --off-white: #e0e0e0; 5 | } 6 | 7 | /* Base styling */ 8 | * { 9 | box-sizing: border-box; 10 | margin: 0; 11 | padding: 0; 12 | font-size: 0.9rem; 13 | } 14 | 15 | body { 16 | font-family: 'IBM Plex Mono', monospace; 17 | background-color: var(--bg-color); 18 | color: var(--off-white); 19 | display: flex; 20 | flex-direction: column; 21 | align-items: center; 22 | min-height: 100vh; 23 | } 24 | 25 | main { 26 | max-width: 520px; 27 | width: 100%; 28 | } 29 | 30 | a { 31 | color: var(--off-white); 32 | transition: color 0.3s ease; 33 | } 34 | 35 | a:hover { 36 | color: #ffffff; 37 | } 38 | 39 | /* Navigation bar */ 40 | nav { 41 | width: 100%; 42 | background-color: var(--nav-color); 43 | position: sticky; 44 | top: 0; 45 | z-index: 1000; 46 | } 47 | 48 | .nav-bar { 49 | list-style: none; 50 | display: flex; 51 | justify-content: center; 52 | padding: 1rem 0; 53 | } 54 | 55 | .nav-bar li { 56 | margin: 0 2rem; 57 | } 58 | 59 | .nav-bar a { 60 | text-decoration: none; 61 | color: #bbbbbb; 62 | font-weight: 500; 63 | font-size: 0.95rem; 64 | text-transform: uppercase; 65 | letter-spacing: 1px; 66 | transition: color 0.3s ease; 67 | } 68 | 69 | .nav-bar a:hover { 70 | color: #ffffff; 71 | } 72 | 73 | /* Title */ 74 | .title { 75 | margin: 3rem 0 0.5rem; 76 | 
font-size: 2.5rem; 77 | font-weight: 600; 78 | letter-spacing: 0.5px; 79 | text-align: center; 80 | } 81 | 82 | .subtitle { 83 | margin: 0.5em 0 2rem; 84 | font-size: 1.2rem; 85 | font-weight: 400; 86 | text-align: center; 87 | color: #ccc; 88 | } 89 | 90 | /* YouTube Embed */ 91 | .video-container { 92 | aspect-ratio: 16 / 9; 93 | border-radius: 1em; 94 | overflow: hidden; 95 | box-shadow: 0 6px 20px rgba(0, 0, 0, 0.4); 96 | display: flex; 97 | justify-content: center; 98 | } 99 | 100 | .video-container iframe { 101 | width: 100%; 102 | height: 100%; 103 | border: none; 104 | display: block; 105 | } 106 | 107 | section { 108 | margin: 1.5rem auto; 109 | } 110 | 111 | .ecosystem p { 112 | margin-bottom: 1rem; 113 | } 114 | 115 | .summary p { 116 | margin-bottom: 1rem; 117 | } 118 | 119 | .collab div { 120 | background: none; 121 | padding: 0.25rem 0; 122 | border-radius: 0; 123 | box-shadow: none; 124 | line-height: 1.6; 125 | } 126 | 127 | .cite-block { 128 | background-color: #272525; 129 | border-radius: 1em; 130 | color: white; 131 | font-family: monospace; 132 | font-size: 0.7rem; 133 | overflow-x: auto; 134 | overflow-y: hidden; 135 | padding: 1em; 136 | white-space: pre; 137 | } 138 | 139 | .fire-text { 140 | background: linear-gradient(90deg, #ff2d00, #ff6f00, #ffc300, #ff6f00, #ff2d00); 141 | background-size: 300% 100%; 142 | background-clip: text; 143 | -webkit-background-clip: text; 144 | color: transparent; 145 | -webkit-text-fill-color: transparent; 146 | animation: fireGradientShift 5s ease-in-out infinite; 147 | } 148 | 149 | @keyframes fireGradientShift { 150 | 0% { 151 | background-position: 0% 50%; 152 | } 153 | 50% { 154 | background-position: 100% 50%; 155 | } 156 | 100% { 157 | background-position: 0% 50%; 158 | } 159 | } 160 | 161 | .fire-logo { 162 | animation: fireHueShift 5s ease-in-out infinite; 163 | transform-origin: center; 164 | } 165 | 166 | .blog p { 167 | margin-bottom: 1rem; 168 | } 169 | 170 | .blog h3 { 171 | margin-top: 2.5rem; 
172 | margin-bottom: 1rem; 173 | } 174 | 175 | .blog ol { 176 | li { 177 | margin: 1rem 0; 178 | } 179 | } 180 | 181 | blockquote { 182 | border-left: 0.25em solid #b77917; /* Indigo-500 */ 183 | background: #000; 184 | padding: 1rem 1.5rem; 185 | margin: 1.5rem 0; 186 | box-shadow: 0 1px 3px rgba(0, 0, 0, 0.05); 187 | border-radius: 0.5rem; 188 | position: relative; 189 | } -------------------------------------------------------------------------------- /docs/css/mkdocstrings.css: -------------------------------------------------------------------------------- 1 | /* From https://mkdocstrings.github.io/python/usage/customization/#symbol-types */ 2 | [data-md-color-scheme="default"] { 3 | --doc-symbol-parameter-fg-color: #df50af; 4 | --doc-symbol-attribute-fg-color: #0079ff; 5 | --doc-symbol-function-fg-color: #00dfa2; 6 | --doc-symbol-method-fg-color: #00dfa2; 7 | --doc-symbol-class-fg-color: #d1b619; 8 | --doc-symbol-module-fg-color: #ff0060; 9 | 10 | --doc-symbol-parameter-bg-color: #df50af1a; 11 | --doc-symbol-attribute-bg-color: #0079ff1a; 12 | --doc-symbol-function-bg-color: #00dfa21a; 13 | --doc-symbol-method-bg-color: #00dfa21a; 14 | --doc-symbol-class-bg-color: #d1b6191a; 15 | --doc-symbol-module-bg-color: #ff00601a; 16 | } 17 | -------------------------------------------------------------------------------- /docs/docs/index.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/getting_started/assets.md: -------------------------------------------------------------------------------- 1 | # Assets 2 | 3 | In addition to the paper and codebase, we release the following assets created with SWE-smith: 4 | 5 | 1. 
**Environments for 128 GitHub repositories.** You can download the environments (Docker images) locally by running the following command from the root directory of SWE-smith: 6 | ```bash 7 | python swesmith/build_repo/download_images.py 8 | ``` 9 | 10 | 2. **SWE-smith dataset of 50k+ task instances**, made available as a [HuggingFace dataset](https://huggingface.co/datasets/SWE-bench/SWE-smith). 11 | 12 | 3. **5k expert trajectories** + **SWE-agent-LM-32B**. 13 | To create `SWE-agent-LM-32B`, we fine-tuned [Qwen 2.5 Coder Instruct 32B]() on the 5k trajectories. 14 | `SWE-agent-LM-32B` achieves 40.2% pass@1 on SWE-bench Verified. 15 | The trajectories are uploaded to a [HuggingFace dataset](https://huggingface.co/datasets/SWE-bench/SWE-smith-trajectories). 16 | We also release the [32B](https://huggingface.co/SWE-bench/SWE-agent-LM-32B) and [7B](https://huggingface.co/SWE-bench/SWE-agent-LM-7B) versions of the model. 17 | 18 | 4. **SWE-Rater-32B**, a Qwen 2.5 Coder Instruct 32B model fine-tuned on human annotated ratings of a SWE-bench task instance's difficulty. 19 | We release it as a [HuggingFace model](https://huggingface.co/SWE-bench/SWE-Rater-32B). -------------------------------------------------------------------------------- /docs/getting_started/index.md: -------------------------------------------------------------------------------- 1 | # SWE-smith 2 | 3 |
4 | SWE-smith 5 |
SWE-smith is a toolkit for training Software Engineering (SWE) agents.
10 | Light Mode Image 11 | Dark Mode Image 12 |
13 | 14 | Check out the [installation](installation.md) guide to get started, then head over to the [tutorials](../guides/index.md) to learn 15 | about how to use SWE-smith. 16 | 17 | If you use SWE-smith in your work, we'd greatly appreciate a citation: 18 | 19 | ```bibtex 20 | @misc{yang2025swesmith, 21 | title={SWE-smith: Scaling Data for Software Engineering Agents}, 22 | author={John Yang and Kilian Leret and Carlos E. Jimenez and Alexander Wettig and Kabir Khandpur and Yanzhe Zhang and Binyuan Hui and Ofir Press and Ludwig Schmidt and Diyi Yang}, 23 | year={2025}, 24 | eprint={2504.21798}, 25 | archivePrefix={arXiv}, 26 | primaryClass={cs.SE}, 27 | url={https://arxiv.org/abs/2504.21798}, 28 | } 29 | ``` 30 | -------------------------------------------------------------------------------- /docs/getting_started/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | For the latest stable release 4 | 5 | ```bash 6 | pip install swesmith 7 | ``` 8 | 9 | For the latest development version 10 | 11 | ```bash 12 | git clone https://github.com/SWE-bench/SWE-smith 13 | cd SWE-smith 14 | conda create -n swesmith python=3.10; conda activate swesmith 15 | pip install -e . 16 | ``` 17 | 18 | If you plan to contribute to SWE-smith, please also perform: 19 | 20 | ```bash 21 | pre-commit install 22 | ``` -------------------------------------------------------------------------------- /docs/getting_started/quickstart.md: -------------------------------------------------------------------------------- 1 | We recommend checking out the [tutorials](../guides/index.md) for comprehensive guidance on SWE-smith usage. 2 | 3 | However, if you learn more easily by playing with the code, here's sequences of scripts corresponding to different SWE-smith workflows. 
4 | If you run into issues, please consult the [tutorials](../guides/index.md) first, then open an [issue](https://github.com/SWE-bench/SWE-smith/issues/new/choose) if you can't find a solution. 5 | 6 | ### Creating Task Instances 7 | ```bash 8 | # Run LM rewrite strategy to produce bugs 9 | python -m swesmith.bug_gen.llm.modify pandas-dev__pandas.95280573 \ 10 | --config_file configs/bug_gen/lm_modify.yml \ 11 | --model claude-3-7-sonnet-20250219 \ 12 | --n_bugs 1 \ 13 | --n_workers=20 14 | 15 | # Collect all task instances into a single file for validation 16 | python -m swesmith.bug_gen.collect_patches logs/bug_gen/pandas-dev__pandas.95280573/ 17 | 18 | # Run validation on the collected task instances 19 | python -m swesmith.harness.valid logs/bug_gen/pandas-dev__pandas.95280573_all_patches.json \ 20 | --run_id pandas_test \ 21 | --max_workers=8 22 | 23 | # Gather valid task instances 24 | python -m swesmith.harness.gather logs/run_validation/pandas_test 25 | 26 | # Generate issues for the valid task instances 27 | python -m swesmith.issue_gen.generate \ 28 | --dataset_path logs/run_validation/basic/pandas_test.json \ 29 | --model claude-3-7-sonnet-20250219 \ 30 | --n_workers=1 \ 31 | --config_file configs/issue_gen/ig_v2.yaml \ 32 | --experiment_id ig_v2 33 | ``` 34 | 35 | !!! tip "Next steps" 36 | 37 | We provide [detailed tutorials](../guides/index.md) on each of these steps. -------------------------------------------------------------------------------- /docs/guides/difficulty_rating.md: -------------------------------------------------------------------------------- 1 | To see how SWE-smith compares against real world tasks (e.g. SWE-bench), we LoRA Fine-Tuned a [Qwen 2.5 32B Coder Instruct](https://github.com/QwenLM/Qwen2.5-Coder) model on 1.5k human ratings of the difficulty of real world bugs. 
2 | 3 | Given the issue text and patch associated with a task instance, the model will rate the task as "easy" (< 15 min), "medium" (15 min - 1 hour), or "hard" (1+ hours). 4 | 5 | ## Inference 6 | 7 | You can rate the difficulty of your own task instances by following these steps: 8 | 9 | 1. Download the [HuggingFace checkpoint](). 10 | 11 | 2. Use `sglang` to serve the checkpoint. The training scripts available in the SWE-smith repository use [Modal](https://modal.com/) as a compute service for hosting inference. 12 | 13 | ```bash 14 | N_HOURS=4 N_GPUS=4 modal run --detach swesmith/train/serve_sglang.py \ 15 | --model-path /path/to/checkpoint \ 16 | --served-model-name gpt-4o \ 17 | --tokenizer-path /path/to/Qwen2.5-Coder-32B-Instruct 18 | ``` 19 | 20 | 3. Run the following script: 21 | 22 | ```bash 23 | python swesmith/train/difficulty_rater/get_difficulties.py \ 24 | --base_url \ 25 | --dataset_path path/to/dataset.json 26 | ``` 27 | 28 | The script will generate a `.json` file containing a mapping from each task instance to a difficulty score. 29 | You can then compute the dataset's difficulty score as the average of all task instance scores. 30 | 31 | ## Prior Datasets 32 | 33 | Using our model, we've assessed the difficulty of existing datasets, assigning scores of 1/5/9 to easy/medium/hard tasks. 
34 | 35 | | Dataset | # Instances | Score | `easy` | `med` | `hard` | 36 | |------------------------|-------------|--------|--------|-------|--------| 37 | | SWE-bench | 2294 | 5.014 | 438 | 1408 | 446 | 38 | | └── Lite | 300 | 3.893 | 93 | 197 | 10 | 39 | | └── Verified | 500 | 3.960 | 173 | 284 | 43 | 40 | | SWE-bench Multimodal | 510 | 6.036 | 55 | 265 | 186 | 41 | | SWE-gym | 2438 | 5.625 | 288 | 1456 | 664 | 42 | | └── Lite | 230 | 3.890 | 67 | 156 | 4 | 43 | | SWE-smith (LM Modify) | 1000 | 3.304 | 441 | 542 | 17 | 44 | | SWE-smith (LM Rewrite) | 1000 | 5.272 | 68 | 796 | 136 | 45 | | SWE-smith (Procedural) | 1000 | 3.596 | 374 | 603 | 23 | 46 | | SWE-smith (PR Mirror) | 1000 | 4.876 | 206 | 619 | 175 | 47 | | SWE-smith (Combine) | 1000 | 5.720 | 52 | 716 | 232 | 48 | 49 | From the table, we demonstrate that SWE-smith task instances are comparable to real world tasks, and that our bug generation techniques allow for a wide range of task difficulties. -------------------------------------------------------------------------------- /docs/guides/env_construction.md: -------------------------------------------------------------------------------- 1 | SWE-smith enables automatic construction of execution environments for repositories. 2 | We'll review the two steps of this process: 3 | 4 | 1. SWE-agent + LM attempts to install a repository + run the testing suite. 5 | 2. Construct an execution environment (Docker image). 6 | 7 | For this section, we'll use the [Instagram/MonkeyType](https://github.com/Instagram/MonkeyType/) repository as a running example, 8 | specifically at commit [`70c3acf`](https://github.com/Instagram/MonkeyType/tree/70c3acf62950be5dfb28743c7a719bfdecebcd84). 9 | 10 | ## Automatically Install Repos with SWE-agent 11 | 12 | Coming soon! 13 | 14 | ## Create an Execution Environment 15 | First, create the conda environment for the target repository. 
16 | ```bash 17 | python -m swesmith.build_repo.try_install Instagram/MonkeyType install_repo.sh \ 18 | --commit 70c3acf62950be5dfb28743c7a719bfdecebcd84 19 | ``` 20 | where `install_repo.sh` is the script that installs the repository. 21 | ([Example](https://github.com/SWE-bench/SWE-smith/blob/main/configs/install_repo.sh)) 22 | 23 | If successful, two artifacts will be produced under `logs/build_repo/records/`: 24 | * `sweenv_[repo + commit].yml`: A dump of the conda environment that was created. 25 | * `sweenv_[repo + commit].sh`: A log of the installation process. 26 | 27 | Next, run the following command to create a Docker image for the repository. 28 | 29 | ```bash 30 | python -m swesmith.build_repo.create_images --repos Instagram/MonkeyType 31 | ``` 32 | 33 | This command will create two artifacts: 34 | 1. A mirror of the original repository at the specified commit, created under [`swesmith`](https://github.com/orgs/swesmith/repositories). To change the organization, you can... 35 | * Pass in an `--org` argument, or 36 | * (If built from source) Change `ORG_NAME` in `swesmith/constants.py` 37 | 2. A Docker image (`swesmith.x86_64..`) which contains the installed codebase. 38 | 39 | It's good practice to check that your Docker image works as expected. 40 | ```bash 41 | docker run -it --rm swesmith.x86_64.instagram__monkeytype.70c3acf6 42 | ``` 43 | Within the container, run the testing suite (e.g. `pytest`) to ensure that the codebase is functioning as expected. 44 | 45 | !!! note "Get existing Docker images" 46 | 47 | All repositories represented in the SWE-smith [dataset](https://huggingface.co/datasets/SWE-bench/SWE-smith) are available to download. 
Simply run: 48 | ```bash 49 | python -m swesmith.build_repo.download_images 50 | ``` 51 | -------------------------------------------------------------------------------- /docs/guides/harnesses.md: -------------------------------------------------------------------------------- 1 | # Validation & Evaluation 2 | 3 | Great! You now have an execution environment + a bunch of candidate task instances. How do we determine which ones can be used for training? 4 | 5 | We provide two harnesses for the purposes of: 6 | 7 | * Validation: To check if a candidate task instance is usable (breaks 1+ existing tests). 8 | * Evaluation: To check if the proposed solution for a task instance is correct. 9 | 10 | The purposes of these harnesses are identical to their motivations in [SWE-bench](https://swe-bench.github.io). 11 | 12 | ## Validation 13 | The validation harness is used to check if a candidate task instance is usable (breaks 1+ existing tests). 14 | 15 | Once you've generated task instance candidates, follow these steps to validate them: 16 | 17 | 1. Collect the candidates 18 | 19 | ```bash 20 | python -m swesmith.bug_gen.collect_patches logs/bug_gen/ 21 | ``` 22 | 23 | This produces a `logs/bug_gen/_all_patches.json` file with all the candidate task instances. 24 | 25 | 2. Run validation 26 | 27 | ```bash 28 | python -m swesmith.harness.valid \ 29 | logs/bug_gen/_all_patches.json \ 30 | --run_id 31 | ``` 32 | 33 | The validation harness works in two steps. 34 | First, it runs the original repository's test suite to get the passing statuses of the existing tests. 35 | Then, it applies each candidate task instance to the repository and runs the test suite again. 36 | If the candidate task instance breaks 1+ existing tests, it is considered a usable task instance. 
37 | 38 | For each task instance, the validation harness produces a `logs/run_validation//` folder containing the following information: 39 | 40 | * `eval.sh`: The sequence of test command(s) run 41 | * `patch.diff`: The candidate task instance 42 | * `report.json`: `FAIL_TO_PASS` and `PASS_TO_PASS` test cases 43 | * `run_instance.log`: The full trace of running validation 44 | * `test_output.txt`: The standard output of the test command(s) 45 | 46 | 3. Collect validated task instances 47 | 48 | ```bash 49 | python -m swesmith.harness.gather logs/run_validation/ 50 | ``` 51 | 52 | Task instances with 1+ `FAIL_TO_PASS` test cases and 1+ `PASS_TO_PASS` test cases are considered valid. 53 | 54 | This script performs two actions: 55 | 56 | * It collects all valid task instances into a `logs/task_insts/.json`. Each instance contains the following information: 57 | ```json 58 | { 59 | "instance_id": , 60 | "repo": , 61 | "patch": , 62 | "FAIL_TO_PASS": , 63 | "PASS_TO_PASS": , 64 | "created_at": , 65 | "image_name": , 66 | "base_commit": , 67 | } 68 | ``` 69 | * For each valid task instance, a branch called `` is created in the repository. The branch corresponds to the repository with the task instance's bug patch applied. 70 | 71 | ## Evaluation 72 | 73 | The evaluation harness is used to check if the proposed solution for a task instance is correct. 
74 | 75 | You can run this script to sanity check that testing for validated task instances works as expected: 76 | 77 | ```bash 78 | python -m swesmith.harness.eval \ 79 | --dataset_path bugs/task_insts/{repo}.json \ 80 | --predictions_path gold \ 81 | --run_id sanity 82 | ``` 83 | 84 | If you want to run on real predictions, simply replace `gold` with the path to your predictions, which should look like: 85 | 86 | ```json 87 | { 88 | "instance_id": , 89 | "patch": , 90 | "model_name_or_path": , 91 | } 92 | ``` 93 | -------------------------------------------------------------------------------- /docs/guides/index.md: -------------------------------------------------------------------------------- 1 | # Tutorials -------------------------------------------------------------------------------- /docs/guides/issue_gen.md: -------------------------------------------------------------------------------- 1 | You have a bunch of task instances with executable environments. 2 | You're very close to training SWE-agents on this data. 3 | There's one last step - let's generate issue text. 4 | 5 | We primarily use LM's to generate issue text. 6 | 7 | ```bash 8 | python swesmith/issue_gen/generate.py logs/task_insts/.json \ 9 | --config_file configs/issue_gen/ig_v2.yaml \ 10 | --model anthropic/claude-3-7-sonnet-20250219 \ 11 | --n_workers 4 \ 12 | --experiment_id ig_v2 \ 13 | --use_existing 14 | ``` 15 | 16 | This will generated issue text for each task instance, producing several artifacts along the way: 17 | 18 | * Under `logs/issue_gen/ig_v2/`, there will be a folder for each task instance, containing: 19 | * `messages.json`: The messages fed to the LM to generate the issue text. 20 | * `metadata.json`: Conatins the issue text + inference cost. 21 | * In the same directory as `logs/task_insts/.json`, a `logs/issue_gen/__ig_v2_n1.json` file will be created, which is a copy of the original file with issue text added to each task instance (as the `problem_statement` field). 
22 | 23 | ## Alternatives 24 | 25 | In our paper, we discuss several alternatives for generating issue text. 26 | While our experiments suggest that LM generated issue text is the best proxy for real issue text, we provide instructions for the alternatives below. 27 | 28 | **Static Issue Text** 29 | 30 | The problem statement is generated by randomly selecting one of 7 static issue text templates. 31 | 32 | ```bash 33 | python swesmith/issue_gen/get_static.py logs/task_insts/.json 34 | ``` 35 | 36 | Produces a `logs/issue_gen/__ig_static.json` file. 37 | 38 | **Random F2P Test Case** 39 | 40 | The problem statement shows a randomly selected Fail-to-Pass test case from the task instance. 41 | 42 | ```bash 43 | python swesmith/issue_gen/get_from_tests.py logs/task_insts/.json 44 | ``` 45 | 46 | **Original Issue Text** 47 | 48 | !!! note 49 | This strategy only works for some PR Mirrors, if the pull request the mirror is based on has issue(s) associated with it. 50 | 51 | ```bash 52 | python swesmith/issue_gen/get_from_pr.py logs/task_insts/.json 53 | ``` 54 | 55 | Produces a `logs/issue_gen/__ig_orig.json` file. 56 | -------------------------------------------------------------------------------- /docs/guides/train_swe_agent.md: -------------------------------------------------------------------------------- 1 | # Training SWE-agents 2 | 3 | Now the fun part - we provide details on how to operationalize SWE-smith for training SWE-agents! 4 | 5 | Specifically, we'll cover the workflow for Rejection Sampling Fine Tuning. 6 | 7 | !!! note "SWE-agent" 8 | 9 | The documentation in this section is heavily grounded in the [SWE-agent](https://github.com/SWE-agent/SWE-agent) library. 10 | We do *not* plan to explicitly support non SWE-agent scaffolds, but it should not be difficult - the main adaptations would just be how you generate expert trajectories and predictions for evaluation. 11 | 12 | There's several steps we'll cover: 13 | 14 | 1. 
Creating a subset of SWE-smith task instances. 15 | 2. Generating expert trajectories for those task instances. 16 | 3. Training a model on the expert trajectories. 17 | 4. Evaluating the model on SWE-bench (Lite/Verified/Multimodal). 18 | 19 | ## Creating SWE-smith Subset 20 | 21 | If you are using SWE-smith, the dataset of all [SWE-smith](https://huggingface.co/datasets/SWE-bench/SWE-smith) is quite large. 22 | Usually, we recommend training on a subset. 23 | To curate a subset, you might use the following logic. 24 | 25 | ```python 26 | import json 27 | 28 | from datasets import load_dataset 29 | swesmith = load_dataset("SWE-bench/SWE-smith", split="train") 30 | 31 | subset_name = "subset0" 32 | def criteria(task_instance): 33 | return ".pr_" in task_instance["instance_id"] and \ 34 | len(task_instance["FAIL_TO_PASS"]) <= 5 and \ 35 | len(task_instance["FAIL_TO_PASS"]) >= 2 36 | bugs = [x for x in swesmith if criteria(x)] 37 | print(f"Found {len(bugs)} bugs that match criteria") 38 | with open(f"logs/experiments/{subset_name}.json", "w") as f: 39 | json.dump(bugs, fp=f, indent=2) 40 | ``` 41 | 42 | ## Generate Expert Trajectories 43 | 44 | 1. Clone [SWE-agent](https://github.com/SWE-agent/SWE-agent). Make sure to follow the installation instructions [here](https://swe-agent.com/latest/installation/source/). 45 | 46 | 2. Create a soft link of the `agent/` folder to SWE-agent, meaning in SWE-agent, run: 47 | ```bash 48 | ln -s path/to/SWE-smith/agent/ . 49 | ``` 50 | 51 | 3. In SWE-agent, run exeprt trajectory generation: 52 | ```bash 53 | ./agent/_gen_trajs.sh 54 | ``` 55 | Check the file to see how the script works. You'll need to adjust the `--instances.path` argument to point to the subset you created in the previous step. 56 | 57 | ## Train Model 58 | 59 | The previous step will generate individual trajectories per task instance under the `SWE-agent/trajectories///` folder. 
60 | 61 | We'll now determine which trajectories correspond to resolved instances, convert them to a format that can be used for SFT, and then train a model with them. 62 | 63 | 1. (From SWE-smith) Run evaluation on training task instances. 64 | ```bash 65 | python -m swesmith.harness.eval \ 66 | --dataset_path path/to/subset0.json \ 67 | --predictions_path path/to/trajectories///preds.json \ 68 | --run_id \ 69 | --max_workers 10 \ 70 | --timeout 240 71 | ``` 72 | 73 | !!! tip "`preds.json`" 74 | If there is no `preds.json`, run `sweagent merge-preds trajectories///`. 75 | 76 | This evaluation will generate a `logs/run_evaluation//` 77 | folder with a `report.json` file indicating which instance IDs were successfully resolved. 78 | 79 | 2. (From SWE-smith) Convert trajectories into SFT format. 80 | 81 | ```bash 82 | python -m swesmith.train.traj_mgr.transform_to_ft \ 83 | --traj_dir path/to/trajectories/// \ 84 | --eval_dir logs/run_evaluation// \ 85 | --only_resolved 86 | ``` 87 | 88 | This will product an `ft_xml_*.jsonl` file under the `trajectories_sft/` folder. 89 | This dataset can be used directly for SFT. 90 | 91 | 3. Run training. First, upload the file to Modal 92 | ```bash 93 | modal volume put trajectories_sft/ft_xml_*.jsonl 94 | ``` 95 | 96 | Then, modify `config/train/full_ft_qwen_7b.yml` to point to the file in Modal. 97 | 98 | Finally, run the training script: 99 | ```bash 100 | ./scripts/train.run_ft_torchtune.py 101 | ``` 102 | 103 | ## Evaluation 104 | Run inference on SWE-agent + your SFT'ed model on SWE-bench (Lite/Verified/Multimodal). 105 | 106 | 1. (From SWE-smith) Update `scripts/train.serve_sglang.sh` to point at SFT'ed model, then run it. 107 | 108 | 2. (From SWE-agent) Run inference: 109 | ```bash 110 | ./agent/_infer_model.sh 111 | ``` 112 | Make sure the Modal URL is correct and change the evaluation dataset as desired. 113 | 114 | 3. When inference finishes, run evaluation on the model's predictions. 
(Check out [sb-cli](https://github.com/SWE-bench/sb-cli/tree/main) for more information on how to conveniently run evaluation for SWE-bench-* datasets.) 115 | ```bash 116 | sb-cli submit swe-bench_verified test \ 117 | --predictions_path trajectories///preds.json \ 118 | --run_id 119 | ``` -------------------------------------------------------------------------------- /docs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block content %} 4 | {{ super() }} 5 | 6 | 7 | 30 | 31 | 32 | 39 | {% endblock %} -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: SWE-smith Documentation 2 | site_url: https://swe-smith.com/ 3 | theme: 4 | name: material 5 | custom_dir: docs/overrides 6 | icon: 7 | repo: fontawesome/brands/github 8 | annotation: material/chevron-right-circle 9 | logo: assets/swesmith_logo.png 10 | favicon: assets/swesmith_logo.png 11 | palette: 12 | - media: "(prefers-color-scheme)" 13 | toggle: 14 | icon: material/brightness-auto 15 | name: Switch to light mode 16 | - scheme: default 17 | # primary: black # override in custom.css 18 | accent: deep orange 19 | media: "(prefers-color-scheme: light)" 20 | toggle: 21 | icon: material/weather-night 22 | name: Switch to dark mode 23 | - scheme: slate 24 | # primary: black # override in custom.css 25 | accent: deep orange 26 | media: "(prefers-color-scheme: dark)" 27 | toggle: 28 | icon: material/weather-sunny 29 | name: Switch to light mode 30 | features: 31 | - navigation.tabs 32 | - navigation.tabs.sticky 33 | - navigation.indexes 34 | - content.action.edit 35 | - navigation.footer 36 | - content.code.copy 37 | - content.footnote.tooltips 38 | - header.autohide 39 | - announce.dismiss 40 | - content.code.annotate 41 | markdown_extensions: 42 | - sane_lists 43 | - admonition 44 | - 
pymdownx.details 45 | - pymdownx.superfences 46 | - pymdownx.magiclink 47 | - footnotes 48 | - attr_list 49 | - md_in_html 50 | - pymdownx.snippets: 51 | check_paths: true 52 | - pymdownx.emoji: 53 | emoji_index: !!python/name:material.extensions.emoji.twemoji 54 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 55 | nav: 56 | - Home: index.html 57 | - Getting started: 58 | - getting_started/index.md 59 | - Installation: getting_started/installation.md 60 | - Assets: getting_started/assets.md 61 | - Quickstart: getting_started/quickstart.md 62 | - Tutorials: 63 | - guides/index.md 64 | - Build Environments: guides/env_construction.md 65 | - Create Instances: guides/create_instances.md 66 | - Validation & Evaluation: guides/harnesses.md 67 | - Generate Issue Text: guides/issue_gen.md 68 | - Rate Difficulty: guides/difficulty_rating.md 69 | - Train SWE-agents: guides/train_swe_agent.md 70 | plugins: 71 | - glightbox 72 | - search 73 | - include-markdown 74 | - mike: 75 | canonical_version: latest 76 | version_selector: true 77 | - mkdocstrings: 78 | default_handler: python 79 | handlers: 80 | python: 81 | paths: ['swesmith'] 82 | options: 83 | merge_init_into_class: true 84 | summary: false 85 | show_root_heading: true 86 | heading_level: 2 87 | docstring_style: google 88 | show_if_no_docstring: true 89 | show_signature: true 90 | show_signature_annotations: true 91 | signature_crossrefs: true 92 | separate_signature: true 93 | show_symbol_type_heading: true 94 | show_symbol_type_toc: true 95 | extensions: 96 | - griffe_pydantic: 97 | schema: false 98 | repo_url: https://github.com/SWE-bench/SWE-smith 99 | repo_name: SWE-bench/SWE-smith 100 | edit_uri: edit/main/docs/ 101 | extra_css: 102 | - css/custom.css 103 | - css/mkdocstrings.css 104 | - css/bubbles.css 105 | extra: 106 | version: 107 | provider: mike 108 | analytics: 109 | provider: google 110 | property: G-T5P2NYGJYR 
-------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ['setuptools>=42'] 3 | build-backend = 'setuptools.build_meta' 4 | 5 | [project] 6 | name = "swesmith" 7 | dynamic = ["version"] 8 | authors = [ 9 | {name = "John Yang", email = "byjohnyang@gmail.com"} 10 | ] 11 | description = "The official SWE-smith package - A toolkit for generating software engineering training data at scale." 12 | readme = "README.md" 13 | requires-python = ">=3.10" 14 | keywords = ["nlp", "benchmark", "code"] 15 | license = {file = "LICENSE"} 16 | classifiers = [ 17 | "Programming Language :: Python :: 3.10", 18 | "Programming Language :: Python :: 3.11", 19 | "Programming Language :: Python :: 3 :: Only", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | ] 23 | dependencies = [ 24 | "astor", 25 | "datasets", 26 | "docker", 27 | "ghapi", 28 | "jupyter", 29 | "libcst", 30 | "litellm", 31 | "matplotlib", 32 | "modal", 33 | "openai", 34 | "pre-commit", 35 | "python-dotenv", 36 | "rich", 37 | "sglang", 38 | "sparklines", 39 | "swebench", 40 | "tiktoken", 41 | "tqdm", 42 | "unidiff", 43 | "textual", 44 | ] 45 | 46 | [project.optional-dependencies] 47 | docs = [ 48 | "mkdocs", 49 | "mkdocs-material", 50 | "mkdocs-glightbox", 51 | "mkdocs-include-markdown-plugin", 52 | "mkdocstrings[python]>=0.18", 53 | "mike", 54 | ] 55 | test = [ 56 | "pytest", 57 | "pytest-cov", 58 | ] 59 | 60 | [tool.pytest.ini_options] 61 | testpaths = ["tests"] 62 | python_files = "test_*.py" 63 | python_classes = "Test*" 64 | python_functions = "test_*" 65 | 66 | [tool.setuptools] 67 | include-package-data = true 68 | 69 | [tool.setuptools.dynamic] 70 | version = {attr = "swesmith.__version__"} 71 | 72 | [tool.setuptools.packages.find] 73 | where = ["."] 74 | namespaces = false 75 | 76 | [project.urls] 77 | "Documentation" 
= "https://github.com/SWE-bench/SWE-smith" 78 | "Bug Reports" = "https://github.com/SWE-bench/SWE-smith/issues" 79 | "Source Code" = "https://github.com/SWE-bench/SWE-smith" 80 | "Website" = "https://swesmith.com" 81 | 82 | [tool.ruff] 83 | exclude = ["notebooks"] 84 | -------------------------------------------------------------------------------- /scripts/calculate_cost.py: -------------------------------------------------------------------------------- 1 | """ 2 | Purpose: Calculate the cost of generating bugs across all repositories 3 | 4 | Usage: python scripts/calculate_cost.py 5 | """ 6 | 7 | import argparse 8 | import os 9 | 10 | from swesmith.bug_gen.get_cost import main as get_cost 11 | from swesmith.constants import LOG_DIR_BUG_GEN 12 | 13 | 14 | def main(bug_type: str) -> None: 15 | folders = [ 16 | x 17 | for x in os.listdir(LOG_DIR_BUG_GEN) 18 | if os.path.isdir(os.path.join(LOG_DIR_BUG_GEN, x)) 19 | ] 20 | total_cost, total_bugs = 0, 0 21 | print("Repo | Cost | Bugs | Cost/Instance") 22 | for folder in folders: 23 | cost, bugs, per_instance = get_cost( 24 | os.path.join(LOG_DIR_BUG_GEN, folder), bug_type 25 | ) 26 | if cost == 0: 27 | continue 28 | print(f"- {folder}: {cost} | {bugs} | {per_instance}") 29 | total_cost += cost 30 | total_bugs += bugs 31 | print( 32 | f"Total: {round(total_cost, 2)} | {total_bugs} | {round(total_cost / total_bugs, 6)}" 33 | ) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser( 38 | description="Determine the total cost of generating bugs across all repositories" 39 | ) 40 | parser.add_argument( 41 | dest="bug_type", 42 | type=str, 43 | help="Type of patches to collect. 
(default: all)", 44 | default="all", 45 | ) 46 | args = parser.parse_args() 47 | main(**vars(args)) 48 | -------------------------------------------------------------------------------- /scripts/cheatsheet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file contains examples for how to call all scripts and functionalities provided by the SWE-smith toolkit. 4 | 5 | ## The scripts are written such that you do *not* need to have the repository installed locally (run `pip install swesmith`). 6 | ## *Although*, some scripts require config files (you can download them from the repo). 7 | 8 | ## NOTE: If you want to create repositories + task instances under your own account, 9 | ## change swesmith/constants.py:29 (the `ORG_NAME` variable) to your own account. 10 | 11 | 12 | ###### MARK: Create Environment for Repository ###### 13 | 14 | # Attempts to create a conda environment for the repo. If successfully, a 15 | # dump of the conda environment is saved to `logs/build_images/records`` 16 | python -m swesmith.build_repo.try_install Instagram/MonkeyType configs/install_repo.sh --commit 70c3acf62950be5dfb28743c7a719bfdecebcd84 17 | 18 | # Download all existing SWE-smith environments 19 | # (All images downloaded by default, but you can specify a specific repo 20 | # from https://github.com/orgs/swesmith/repositories using `--repo`) 21 | python -m swesmith.build_repo.download_images 22 | 23 | # Create execution environment (Docker images) for all repositories 24 | python -m swesmith.build_repo.create_images --repos Instagram/MonkeyType 25 | 26 | 27 | ###### MARK: Generate Candidate Task Instances ###### 28 | 29 | # This would point at "https://github.com/swesmith/Instagram__MonkeyType.70c3acf6" 30 | repo="Instagram__MonkeyType.70c3acf6" 31 | 32 | # LM Rewrite 33 | python -m swesmith.bug_gen.llm.rewrite $repo \ 34 | --model anthropic/claude-3-7-sonnet-20250219 \ 35 | --type func \ 36 | --config_file 
configs/bug_gen/lm_rewrite.yml \ 37 | --n_workers 1 38 | 39 | # LM Modify 40 | python -m swesmith.bug_gen.llm.modify $repo \ 41 | --n_bugs 1 \ 42 | --model openai/gpt-4o \ 43 | --entity_type func \ 44 | --prompt_config configs/bug_gen/lm_modify.yml 45 | 46 | # Procedural Modifications 47 | python -m swesmith.bug_gen.procedural.generate $repo \ 48 | --type func \ 49 | --max_bugs 10 50 | 51 | # Combine (Same File) - Must have validated task instances to run this script 52 | python -m swesmith.bug_gen.combine.same_file logs/bug_gen/$repo \ 53 | --num_patches 3 \ 54 | --limit_per_file 15 \ 55 | --max_combos 100 56 | 57 | # Combine (Same Module) - Must have validated task instances to run this script 58 | python -m swesmith.bug_gen.combine.same_module logs/bug_gen/$repo \ 59 | --num_patches 2 \ 60 | --limit_per_module 20 \ 61 | --max_combos 200 \ 62 | --depth 2 63 | 64 | # PR Mirroring 65 | ## NOTE: `path/to/task_candidates.jsonl` is the output of running this 66 | ## the SWE-bench task candidate collection script: 67 | ## https://github.com/SWE-bench/SWE-bench/blob/main/swebench/collect/run_get_tasks_pipeline.sh 68 | python -m swesmith.bug_gen.mirror.generate path/to/task_candidates.jsonl --model openai/o3-mini 69 | 70 | 71 | ###### MARK: Validate + Evaluate Task Instances ###### 72 | ## NOTE: Before running the below, make sure 73 | ## - You have created task instances 74 | ## - The repository you're creating task instances for has an environment (Docker image) 75 | ## - (If testing is not pytest) You've specified a log parser in swesmith/harness/log_parsers.py 76 | 77 | # Collect all patches 78 | python -m swesmith.bug_gen.collect_patches logs/bug_gen/$repo 79 | 80 | # Run validation 81 | python -m swesmith.harness.valid logs/bug_gen/$repo_all_patches.json \ 82 | --run_id $repo 83 | 84 | # Collect task instances with 1+ F2P 85 | python -m swesmith.harness.gather logs/run_validation/$repo 86 | 87 | # Run evaluation 88 | python -m swesmith.harness.eval \ 89 | 
--dataset_path logs/task_insts/$repo.json \ 90 | --predictions_path gold \ 91 | --run_id $repo 92 | 93 | 94 | ####### MARK: Generate Issues ###### 95 | python -m swesmith.issue_gen.generate logs/task_insts/$repo.json \ 96 | --config_file configs/issue_gen/ig_v2.yaml \ 97 | --model anthropic/claude-3-7-sonnet-20250219 \ 98 | --n_workers 4 \ 99 | --experiment_id ig_v2 \ 100 | --use_existing 101 | 102 | # Alternatives: 103 | # python -m swesmith.issue_gen.get_from_pr logs/task_insts/$repo.json 104 | # python -m swesmith.issue_gen.get_from_tests logs/task_insts/$repo.json 105 | # python -m swesmith.issue_gen.get_static logs/task_insts/$repo.json -------------------------------------------------------------------------------- /scripts/train.get_difficulties.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python swesmith/train/difficulty_rater/get_difficulties.py \ 4 | --base_url https://ylok22798a8ebw.r15.modal.host \ 5 | --dataset_path logs/experiments/exp32__ig_v2_n1.json 6 | # --dataset_path ../swe_gym_instances_solved.json 7 | -------------------------------------------------------------------------------- /scripts/train.run_ft_torchtune.sh: -------------------------------------------------------------------------------- 1 | N_GPUS=8 modal run --detach swesmith/train/run/ft_torchtune.py --config configs/train/full_ft_qwen_32b.yml 2 | 3 | # N_GPUS=2 modal run --detach swesmith/train/run/ft_torchtune.py --config configs/train/full_ft_qwen_7b.yml -------------------------------------------------------------------------------- /scripts/train.run_ft_unsloth.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | N_GPUS=1 modal run --detach swesmith/train/run/ft_unsloth.py -------------------------------------------------------------------------------- /scripts/train.serve_sglang.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # N_HOURS=4 N_GPUS=4 modal run --detach swesmith/train/serve_sglang.py \ 4 | # --model-path /llm-weights/outputs/qwen2p5-coder-32b-lora-lr1e-4-warmup5___difficulty/qwen2p5-coder-32b-lora-lr1e-4-warmup5___difficulty/merged \ 5 | # --served-model-name gpt-4o \ 6 | # --tokenizer-path /llm-weights/Qwen/Qwen2.5-Coder-32B-Instruct 7 | 8 | N_HOURS=3 N_GPUS=8 modal run --detach swesmith/train/serve_sglang.py \ 9 | --model-path /llm-weights/final/qwen2p5-coder-32b-full-lr5e-5-warmup5___ft_xml_all_250413_run3/epoch_2 \ 10 | --served-model-name gpt-4o \ 11 | --tokenizer-path /llm-weights/Qwen/Qwen2.5-Coder-32B-Instruct 12 | -------------------------------------------------------------------------------- /swesmith/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.2" 2 | -------------------------------------------------------------------------------- /swesmith/bug_gen/collect_patches.py: -------------------------------------------------------------------------------- 1 | """ 2 | Purpose: Collect all the patches into a single json file that can be fed into swesmith.harness.valid 3 | 4 | Usage: python -m swesmith.bug_gen.collect_patches logs/bug_gen/ 5 | 6 | NOTE: Must be with respect to a logs/bug_gen/<...>/ directory 7 | """ 8 | 9 | import argparse 10 | import os 11 | import json 12 | from pathlib import Path 13 | 14 | from swebench.harness.constants import KEY_INSTANCE_ID 15 | from swesmith.constants import LOG_DIR_BUG_GEN, KEY_IMAGE_NAME, KEY_PATCH, PREFIX_BUG 16 | from swesmith.utils import get_image_name 17 | 18 | 19 | def main(bug_gen_path: str | Path, bug_type: str = "all", num_bugs: int = -1): 20 | """ 21 | Collect all the patches into a single json file that can be fed into swebench.harness.valid 22 | :param repo_path: Path to the bug_gen logs. 23 | :param bug_type: Type of patches to collect. 
(default: all) 24 | :param num_bugs: Number of bugs to collect. (default: all) 25 | """ 26 | bug_gen_path = Path(bug_gen_path) 27 | if not bug_gen_path.resolve().is_relative_to((Path() / LOG_DIR_BUG_GEN).resolve()): 28 | print( 29 | f"Warning: {bug_gen_path} may not point to a bug_gen log directory (should be in {(Path() / LOG_DIR_BUG_GEN).resolve()})." 30 | ) 31 | 32 | repo = bug_gen_path.name 33 | commit = repo.rsplit(".", 1)[-1] 34 | repo = repo.rsplit(".", 1)[0] 35 | image_name = get_image_name(repo, commit) 36 | 37 | patches = [] 38 | prefix = f"{PREFIX_BUG}__" 39 | if bug_type != "all": 40 | prefix += bug_type + "_" 41 | exit_loop = False 42 | for root, _, files in os.walk(bug_gen_path): 43 | for file in files: 44 | if file.startswith(prefix) and file.endswith(".diff"): 45 | bug_type_and_uuid = file.split(f"{PREFIX_BUG}__")[-1].split(".diff")[0] 46 | instance_id = f"{repo}.{commit}.{bug_type_and_uuid}" 47 | patch = {} 48 | 49 | # Add metadata if it exists 50 | metadata_file = f"metadata__{bug_type_and_uuid}.json" 51 | if os.path.exists(os.path.join(root, metadata_file)): 52 | patch.update(json.load(open(os.path.join(root, metadata_file)))) 53 | 54 | # Add necessary bug patch information 55 | patch.update( 56 | { 57 | KEY_INSTANCE_ID: instance_id, 58 | KEY_PATCH: open(os.path.join(root, file), "r").read(), 59 | KEY_IMAGE_NAME: image_name, 60 | } 61 | ) 62 | patches.append(patch) 63 | if num_bugs != -1 and len(patches) >= num_bugs: 64 | exit_loop = True 65 | break 66 | if exit_loop: 67 | break 68 | 69 | bug_patches_file = ( 70 | bug_gen_path.parent / f"{bug_gen_path.name}_{bug_type}_patches.json" 71 | ) 72 | if num_bugs != -1: 73 | bug_patches_file = bug_patches_file.with_name( 74 | bug_patches_file.stem + f"_n{num_bugs}" + bug_patches_file.suffix 75 | ) 76 | if len(patches) > 0: 77 | with open(bug_patches_file, "w") as f: 78 | f.write(json.dumps(patches, indent=4)) 79 | print(f"Saved {len(patches)} patches to {bug_patches_file}") 80 | else: 81 | print(f"No 
def main(repo_path: str, bug_type: str) -> tuple[float, int, float]:
    """Sum the LM cost recorded in ``metadata__*.json`` files under *repo_path*.

    Args:
        repo_path: Path to a ``logs/bug_gen/<repo>`` directory.
        bug_type: Restrict the tally to one bug type prefix (e.g. ``func``);
            ``"all"`` matches every metadata file.

    Returns:
        ``(total_cost, total_bugs, cost_per_bug)``. ``cost_per_bug`` is 0 when
        no priced metadata files were found.

    Note:
        BUGFIX: the return annotation previously claimed ``-> float`` even
        though a 3-tuple is returned (and unpacked by scripts/calculate_cost.py).
    """
    total_cost = 0.0
    total_bugs = 0
    prefix = "metadata__"
    if bug_type != "all":
        prefix += f"{bug_type}"
    for root, _, files in os.walk(repo_path):
        for file in files:
            if file.startswith(prefix) and file.endswith(".json"):
                with open(os.path.join(root, file), "r") as f:
                    data = json.load(f)
                # Only LM-generated bugs carry a "cost" field; metadata
                # without one is excluded from both the sum and the count.
                if "cost" in data:
                    total_cost += data["cost"]
                    total_bugs += 1
    per_instance = total_cost / total_bugs if total_bugs > 0 else 0
    return total_cost, total_bugs, per_instance
def strip_function_body(source_code):
    """Return *source_code* with every function body emptied out.

    Each ``def`` / ``async def`` keeps its signature, decorators, and
    docstring (if any); the rest of the body is replaced with a
    ``'TODO: Implement this function'`` marker and a ``pass`` statement.

    Fixes vs. the previous version:
    - Uses ``ast.Constant`` instead of ``ast.Str`` (deprecated since 3.8,
      slated for removal).
    - Mutates the node in place rather than building a new ``ast.FunctionDef``
      with ``type_params=None``, which is invalid on Python 3.12+.
    - Renders with stdlib ``ast.unparse`` (project requires >=3.10), dropping
      the third-party ``astor`` dependency.
    - Also handles ``async def`` functions.
    """
    tree = ast.parse(source_code)

    class FunctionBodyStripper(ast.NodeTransformer):
        def _strip(self, node):
            new_body = []
            # Preserve a leading docstring if present.
            if (
                node.body
                and isinstance(node.body[0], ast.Expr)
                and isinstance(node.body[0].value, ast.Constant)
                and isinstance(node.body[0].value.value, str)
            ):
                new_body.append(node.body[0])
            # Marker telling the model what to do, then a 'pass' so the
            # stripped function remains syntactically valid.
            new_body.append(ast.Expr(ast.Constant("TODO: Implement this function")))
            new_body.append(ast.Pass())
            node.body = new_body
            return node

        def visit_FunctionDef(self, node):
            return self._strip(node)

        def visit_AsyncFunctionDef(self, node):
            return self._strip(node)

    stripped_tree = FunctionBodyStripper().visit(tree)
    ast.fix_missing_locations(stripped_tree)

    return ast.unparse(stripped_tree)
How are you?") 51 | print("Even though this line is not in the diff, it should remain unchanged.") 52 | 53 | def farewell(name): 54 | print(f"Goodbye, {name}!") 55 | """ 56 | 57 | TASK_PROMPT = """Task: 58 | 59 | INPUT: 60 | 61 | {} 62 | 63 | 64 | 65 | {} 66 | 67 | 68 | 69 | NOTES: 70 | - As a reminder, DO NOT MAKE ANY OTHER CHANGES TO THE SOURCE CODE. If a line was not explicitly added or removed in the diff, it should remain unchanged in the output. 71 | - Only make changes based on lines that were: 72 | * Added (have a + in front of them) 73 | * Removed (have a - in front of them) 74 | - DO NOT PROVIDE ANY TEXT ASIDE FROM THE REWRITTEN FILE. ANSWER WITH ONLY THE REWRITTEN CODE. 75 | 76 | OUTPUT:""" 77 | -------------------------------------------------------------------------------- /swesmith/bug_gen/procedural/__init__.py: -------------------------------------------------------------------------------- 1 | import libcst 2 | import random 3 | 4 | from swesmith.constants import DEFAULT_PM_LIKELIHOOD 5 | 6 | 7 | class BaseProceduralModifier(libcst.CSTTransformer): 8 | def __init__(self, likelihood: float = DEFAULT_PM_LIKELIHOOD, seed: float = 24): 9 | super().__init__() 10 | assert 0 <= likelihood <= 1, "Likelihood must be between 0 and 1." 
# Map from a libcst operator node type to an "opposite" operator used to
# inject bugs by flipping an expression's semantics.
# BUGFIX: libcst.BitAnd was previously listed twice (-> BitOr, then -> BitXor);
# dict literals keep only the last entry, so the BitOr mapping was silently
# discarded. Each key now appears exactly once.
FLIPPED_OPERATORS = {
    libcst.Add: libcst.Subtract,
    libcst.And: libcst.Or,
    libcst.BitAnd: libcst.BitOr,
    libcst.BitOr: libcst.BitAnd,
    libcst.BitXor: libcst.BitAnd,
    libcst.Divide: libcst.Multiply,
    libcst.Equal: libcst.NotEqual,
    libcst.FloorDivide: libcst.Modulo,
    libcst.GreaterThan: libcst.LessThan,
    libcst.GreaterThanEqual: libcst.LessThanEqual,
    libcst.In: libcst.NotIn,
    libcst.Is: libcst.IsNot,
    libcst.IsNot: libcst.Is,
    libcst.LeftShift: libcst.RightShift,
    libcst.LessThan: libcst.GreaterThan,
    libcst.LessThanEqual: libcst.GreaterThanEqual,
    libcst.Modulo: libcst.FloorDivide,
    libcst.Multiply: libcst.Divide,
    libcst.NotEqual: libcst.Equal,
    libcst.NotIn: libcst.In,
    libcst.Or: libcst.And,
    libcst.Power: libcst.Multiply,
    libcst.RightShift: libcst.LeftShift,
    libcst.Subtract: libcst.Add,
}
9 | name: str = "func_pm_class_rm_base" 10 | conditions: list = [filter_classes_has_parents, filter_min_simple_complexity] 11 | 12 | def leave_ClassDef(self, original_node, updated_node): 13 | bases = list(updated_node.bases) 14 | if len(bases) > 0 and self.flip(): 15 | if len(bases) == 1: 16 | bases = [] 17 | else: 18 | to_remove = self.rand.randint(0, len(bases) - 1) 19 | bases.pop(to_remove) 20 | 21 | return updated_node.with_changes(bases=tuple(bases)) 22 | 23 | 24 | class ClassShuffleMethodsModifier(BaseProceduralModifier): 25 | explanation: str = "The methods in a class have been shuffled." 26 | name: str = "func_pm_class_shuffle_funcs" 27 | conditions: list = [filter_classes, filter_min_simple_complexity] 28 | 29 | def leave_ClassDef(self, original_node, updated_node): 30 | methods = [ 31 | n for n in updated_node.body.body if isinstance(n, libcst.FunctionDef) 32 | ] 33 | non_methods = [ 34 | n for n in updated_node.body.body if not isinstance(n, libcst.FunctionDef) 35 | ] 36 | self.rand.shuffle(methods) 37 | new_body = non_methods + methods 38 | return updated_node.with_changes( 39 | body=updated_node.body.with_changes(body=tuple(new_body)) 40 | ) 41 | 42 | 43 | class ClassRemoveFuncsModifier(BaseProceduralModifier): 44 | explanation: str = ( 45 | "Method(s) and their reference(s) have been removed from the class." 
class FunctionReferenceRemover(libcst.CSTTransformer):
    """Helper transformer that rewrites references to deleted methods.

    Any call of the form ``self.<removed_method>(...)`` is replaced with the
    expression ``None`` so the class still parses and runs (to the extent
    possible) after the method definition itself has been removed.
    """

    def __init__(self, removed_functions):
        super().__init__()
        # Names of the methods that were removed from the class body.
        self.removed_functions = removed_functions

    def _is_removed_self_attr(self, node) -> bool:
        # True for expressions shaped like `self.<removed_method>`.
        return (
            isinstance(node, libcst.Attribute)
            and isinstance(node.value, libcst.Name)
            and node.value.value == "self"
            and node.attr.value in self.removed_functions
        )

    def leave_Attribute(
        self, original_node: libcst.Attribute, updated_node: libcst.Attribute
    ) -> libcst.BaseExpression:
        # BUGFIX: the previous version only returned a node on the matching
        # branch; falling off the end returned None, which is not a valid
        # result for a libcst leave_* hook and crashed on every other
        # attribute in the class.
        return updated_node

    def leave_Call(
        self, original_node: libcst.Call, updated_node: libcst.Call
    ) -> libcst.BaseExpression:
        # BUGFIX: the old implementation tracked a boolean set in
        # visit_Attribute and cleared in leave_Attribute. Children are left
        # before their parent Call, so the flag was always False by the time
        # leave_Call ran and calls to removed methods were never rewritten.
        # Inspect the callee directly instead of relying on shared state.
        if self._is_removed_self_attr(updated_node.func):
            # Replace with a harmless placeholder expression.
            return libcst.Name(value="None")
        return updated_node
11 | ) 12 | name: str = "func_pm_ctrl_invert_if" 13 | conditions: list = [ 14 | filter_functions, 15 | filter_if_else, 16 | partial(filter_min_simple_complexity, threshold=5), 17 | ] 18 | 19 | def leave_If(self, original_node: libcst.If, updated_node: libcst.If) -> libcst.If: 20 | if not self.flip(): 21 | return updated_node 22 | 23 | # Only proceed if there's an else branch to swap with 24 | if not updated_node.orelse: 25 | return updated_node 26 | 27 | # We need to handle standard else blocks 28 | if isinstance(updated_node.orelse, libcst.Else): 29 | # Store the original bodies 30 | if_body = updated_node.body 31 | else_body = updated_node.orelse.body 32 | 33 | # Create a new else clause with the original if body 34 | new_else = libcst.Else( 35 | body=if_body, 36 | whitespace_before_colon=updated_node.orelse.whitespace_before_colon, 37 | ) 38 | 39 | # Return a new If statement with swapped bodies 40 | return updated_node.with_changes(body=else_body, orelse=new_else) 41 | 42 | # Skip elif cases for now 43 | return updated_node 44 | 45 | 46 | class ControlShuffleLinesModifier(BaseProceduralModifier): 47 | explanation: str = "The lines inside a function may be out of order." 
48 | name: str = "func_pm_ctrl_shuffle" 49 | conditions: list = [ 50 | filter_functions, 51 | partial(filter_min_simple_complexity, threshold=3), 52 | partial(filter_max_simple_complexity, threshold=10), 53 | ] 54 | 55 | def leave_FunctionDef( 56 | self, original_node: libcst.FunctionDef, updated_node: libcst.FunctionDef 57 | ) -> libcst.FunctionDef: 58 | # Skip modification if random check fails 59 | if not self.flip(): 60 | return updated_node 61 | 62 | # Make sure we're working with an indented block 63 | if not isinstance(updated_node.body, libcst.IndentedBlock): 64 | return updated_node 65 | 66 | # Get the body statements 67 | body = list(updated_node.body.body) 68 | 69 | # Don't shuffle if there are fewer than 2 statements 70 | if len(body) < 2: 71 | return updated_node 72 | 73 | # Create a shuffled copy of the statements 74 | shuffled_body = body.copy() 75 | self.rand.shuffle(shuffled_body) 76 | 77 | # Create a new indented block with the shuffled statements 78 | new_body = libcst.IndentedBlock( 79 | body=tuple(shuffled_body), 80 | indent=updated_node.body.indent, 81 | header=updated_node.body.header, 82 | footer=updated_node.body.footer, 83 | ) 84 | 85 | # Return the updated function with the new body 86 | return updated_node.with_changes(body=new_body) 87 | -------------------------------------------------------------------------------- /swesmith/bug_gen/procedural/remove.py: -------------------------------------------------------------------------------- 1 | import libcst 2 | 3 | from swesmith.bug_gen.procedural import BaseProceduralModifier 4 | from swesmith.bug_gen.criteria import * 5 | 6 | 7 | class RemoveLoopModifier(BaseProceduralModifier): 8 | explanation: str = "There is one or more missing loops that is causing the bug." 
class RemoveConditionalModifier(BaseProceduralModifier):
    """Introduces bugs by randomly deleting whole ``if`` statements."""

    explanation: str = "There is one or more missing conditionals that causes the bug."
    name: str = "func_pm_remove_cond"
    conditions: list = [
        filter_functions,
        filter_conditionals,
        filter_min_simple_complexity,
    ]

    def leave_If(self, original_node, updated_node):
        # Randomly drop the whole conditional; otherwise leave it untouched.
        if self.flip():
            return libcst.RemoveFromParent()
        return updated_node


class RemoveAssignModifier(BaseProceduralModifier):
    """Introduces bugs by randomly deleting assignment statements."""

    explanation: str = "There is likely a missing assignment in the code."
    name: str = "func_pm_remove_assign"
    conditions: list = [
        filter_functions,
        filter_assignments,
        filter_min_simple_complexity,
    ]

    def leave_Assign(self, original_node, updated_node):
        # Randomly drop a plain `a = b` assignment.
        if self.flip():
            return libcst.RemoveFromParent()
        return updated_node

    def leave_AugAssign(self, original_node, updated_node):
        # Randomly drop an augmented assignment such as `a += b`.
        if self.flip():
            return libcst.RemoveFromParent()
        return updated_node
54 | name: str = "func_pm_remove_wrapper" 55 | conditions: list = [filter_functions, filter_wrappers, filter_min_simple_complexity] 56 | 57 | def leave_With(self, original_node, updated_node): 58 | return libcst.RemoveFromParent() if self.flip() else updated_node 59 | 60 | def leave_AsyncWith(self, original_node, updated_node): 61 | return libcst.RemoveFromParent() if self.flip() else updated_node 62 | 63 | def leave_Try(self, original_node, updated_node): 64 | return libcst.RemoveFromParent() if self.flip() else updated_node 65 | -------------------------------------------------------------------------------- /swesmith/build_repo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/swesmith/build_repo/__init__.py -------------------------------------------------------------------------------- /swesmith/build_repo/download_images.py: -------------------------------------------------------------------------------- 1 | """ 2 | Purpose: Standalone script to download all SWEFT images 3 | 4 | Usage: python -m swesmith.build_repo.download_images 5 | """ 6 | 7 | import argparse 8 | import docker 9 | import os 10 | import json 11 | import requests 12 | 13 | DOCKER_ORG = "jyangballin" 14 | TAG = "latest" 15 | 16 | 17 | def get_docker_hub_login(): 18 | docker_config_path = os.path.expanduser("~/.docker/config.json") 19 | 20 | try: 21 | with open(docker_config_path, "r") as config_file: 22 | docker_config = json.load(config_file) 23 | 24 | auths = docker_config.get("auths", {}) 25 | docker_hub = auths.get("https://index.docker.io/v1/") 26 | 27 | if not docker_hub: 28 | raise Exception( 29 | "Docker Hub credentials not found. Please log in using 'docker login'." 
def get_dockerhub_token(username, password):
    """Authenticate against Docker Hub and return a bearer token."""
    login_endpoint = "https://hub.docker.com/v2/users/login"
    credentials = {"username": username, "password": password}
    response = requests.post(login_endpoint, json=credentials)
    response.raise_for_status()
    return response.json()["token"]


def get_docker_repositories(username, token):
    """Return every repository owned by *username*, following pagination.

    Raises a generic Exception on any non-200 response from the registry API.
    """
    headers = {"Authorization": f"Bearer {token}"}
    repositories = []
    next_page = f"https://hub.docker.com/v2/repositories/{username}/"
    while next_page:
        response = requests.get(next_page, headers=headers)
        if response.status_code != 200:
            raise Exception(
                f"Failed to fetch repositories: {response.status_code}, {response.text}"
            )
        payload = response.json()
        repositories.extend(payload.get("results", []))
        # "next" is None once the last page has been consumed.
        next_page = payload.get("next")
    return repositories
repo.replace("__", "_1776_") in r["name"] 92 | or repo in r["name"] 93 | or repo.replace("/", "_1776_") in r["name"] 94 | ] 95 | if len(repos) == 0: 96 | print(f"Could not find image for {repo}, exiting...") 97 | return 98 | 99 | print(f"Found {len(repos)} environments:") 100 | for idx, r in enumerate(repos): 101 | print("-", r["name"]) 102 | if idx == 4: 103 | print(f"(+ {len(repos) - 5} more...)") 104 | break 105 | if not proceed and input("Proceed with downloading images? (y/n): ").lower() != "y": 106 | return 107 | 108 | # Download images 109 | for r in repos: 110 | print(f"Downloading {r['name']}...") 111 | client.images.pull(f"{DOCKER_ORG}/{r['name']}:{TAG}") 112 | # Rename images via tagging 113 | new_name = f"{r['name'].replace('_1776_', '__')}:{TAG}" 114 | client.images.get(f"{DOCKER_ORG}/{r['name']}:{TAG}").tag(new_name) 115 | 116 | 117 | if __name__ == "__main__": 118 | parser = argparse.ArgumentParser() 119 | parser.add_argument("--repo", type=str, help="Repository name", default=None) 120 | parser.add_argument( 121 | "-y", 122 | "--proceed", 123 | action="store_true", 124 | help="Proceed with downloading images", 125 | ) 126 | args = parser.parse_args() 127 | main(**vars(args)) 128 | -------------------------------------------------------------------------------- /swesmith/harness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWE-bench/SWE-smith/cfb6cf29568d0841ce15620c96ec795243b229fa/swesmith/harness/__init__.py -------------------------------------------------------------------------------- /swesmith/harness/log_parsers.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from swebench.harness.constants import TestStatus 4 | 5 | 6 | def parse_log_pytest(log: str) -> dict[str, str]: 7 | """ 8 | Parser for test logs generated with PyTest framework 9 | 10 | Args: 11 | log (str): log content 12 | Returns: 13 | dict: 
def parse_log_mypy(log: str) -> dict[str, str]:
    """Parser for test logs generated by mypy.

    Returns a mapping from test case name to test status.
    """
    test_status_map = {}
    for line in log.split("\n"):
        for status in [
            TestStatus.PASSED.value,
            TestStatus.FAILED.value,
        ]:
            if status in line:
                # The test name is the last whitespace-separated token.
                test_case = line.split()[-1]
                test_status_map[test_case] = status
                break
    return test_status_map


def parse_log_python_slugify(log: str) -> dict[str, str]:
    """Parser for test logs generated by un33k/python-slugify.

    Returns a mapping from test case name to test status.
    """
    test_status_map = {}
    # BUGFIX: use a raw string.  The original non-raw pattern relied on
    # invalid escape sequences (e.g. "\-", "\.", "\s") which emit
    # DeprecationWarning/SyntaxWarning and are slated to become errors.
    pattern = r"^([a-zA-Z0-9_\-,\.\s\(\)']+)\s\.{3}\s"
    for line in log.split("\n"):
        is_match = re.match(f"{pattern}ok$", line)
        if is_match:
            test_status_map[is_match.group(1)] = TestStatus.PASSED.value
            continue
        for keyword, status in {
            "FAIL": TestStatus.FAILED,
            "ERROR": TestStatus.ERROR,
        }.items():
            is_match = re.match(f"{pattern}{keyword}$", line)
            if is_match:
                test_status_map[is_match.group(1)] = status.value
                # `break` (not the original no-op `continue`): the line has
                # been classified, no need to try the remaining keyword.
                break
    return test_status_map
")[0] 69 | test_status_map[test_case] = TestStatus.SKIPPED.value 70 | elif any([line.startswith(x) for x in ["ERROR:", "FAIL:"]]): 71 | test_case = " ".join(line.split()[1:3]) 72 | test_status_map[test_case] = TestStatus.FAILED.value 73 | return test_status_map 74 | 75 | 76 | def parse_log_paramiko(log: str) -> dict[str, str]: 77 | """Parser for test logs generated by paramiko/paramiko""" 78 | test_status_map = {} 79 | for line in log.split("\n"): 80 | for status in TestStatus: 81 | is_match = re.match(f"^{status.value}\s(\S+)", line) 82 | if is_match: 83 | test_status_map[is_match.group(1)] = status.value 84 | continue 85 | return test_status_map 86 | 87 | 88 | def parse_log_autograd(log: str) -> dict[str, str]: 89 | """Parser for test logs generated by pytorch/pytorch""" 90 | test_status_map = {} 91 | for line in log.split("\n"): 92 | for status in TestStatus: 93 | is_match = re.match(f"^\[gw\d\]\s{status.value}\s(\S+)", line) 94 | if is_match: 95 | test_status_map[is_match.group(1)] = status.value 96 | continue 97 | return test_status_map 98 | 99 | 100 | MAP_REPO_TO_PARSER = { 101 | "HIPS/autograd": parse_log_autograd, 102 | "paramiko/paramiko": parse_log_paramiko, 103 | "python/mypy": parse_log_mypy, 104 | "tornadoweb/tornado": parse_log_tornado, 105 | "un33k/python-slugify": parse_log_python_slugify, 106 | } 107 | -------------------------------------------------------------------------------- /swesmith/issue_gen/get_from_pr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Purpose: Given a bug patch, retrieve the issue text from the PR that the bug was created from. 
def transform_to_sweb_inst_id(inst):
    """Map a SWE-smith instance to its SWE-bench style instance id.

    The "repo" field looks like "owner/name.<suffix>" and the instance id
    ends with "_<pr_number>"; the result is "<name>-<pr_number>".
    """
    repo = inst["repo"].split("/", 1)[-1].rsplit(".", 1)[0]
    pr_num = inst["instance_id"].rsplit("_", 1)[-1]
    return f"{repo}-{pr_num}"


def get_original_ps_from_pr(instance, log_dir_bug_gen=None):
    """Return the original PR problem statement for *instance*, or "".

    Looks up the metadata file written by the PR-mirroring bug generator.

    Args:
        instance: task instance dict with "repo" and "instance_id" keys.
        log_dir_bug_gen: root of the bug-gen logs; defaults to
            LOG_DIR_BUG_GEN.  The default is resolved lazily at call time
            instead of being frozen into the signature at import time.
    """
    if log_dir_bug_gen is None:
        log_dir_bug_gen = LOG_DIR_BUG_GEN
    log_dir_bug_gen = Path(log_dir_bug_gen)
    sweb_inst_id = transform_to_sweb_inst_id(instance)
    pr_num = sweb_inst_id.rsplit("-", 1)[-1]
    metadata_path = (
        log_dir_bug_gen
        / instance["repo"].split("/")[-1]
        / MIRROR_PR
        / sweb_inst_id
        / f"metadata__pr_{pr_num}.json"
    )
    if not metadata_path.exists():
        return ""
    # BUGFIX: context manager instead of json.load(open(...)), which leaked
    # the file handle.
    with open(metadata_path, "r") as f:
        metadata = json.load(f)
    if INSTANCE_REF not in metadata:
        return ""
    return metadata[INSTANCE_REF]["problem_statement"]
print("No instances found with original problem statements.") 66 | 67 | 68 | if __name__ == "__main__": 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument( 71 | "dataset_path", 72 | type=str, 73 | help="Path to the dataset", 74 | ) 75 | args = parser.parse_args() 76 | main(**vars(args)) 77 | -------------------------------------------------------------------------------- /swesmith/issue_gen/get_static.py: -------------------------------------------------------------------------------- 1 | """ 2 | Purpose: Given a task instance, attached a fixed problem statement to the issue text. 3 | 4 | python swesmith/issue_gen/get_fixed.py logs/experiments/*.json 5 | """ 6 | 7 | import argparse 8 | import json 9 | import random 10 | from typing import Set 11 | 12 | from pathlib import Path 13 | from swebench.harness.constants import FAIL_TO_PASS, KEY_INSTANCE_ID 14 | from swesmith.bug_gen.procedural.generate import ( 15 | PM_TECHNIQUES_CLASSES, 16 | PM_TECHNIQUES_FUNCS, 17 | ) 18 | from tqdm.auto import tqdm 19 | from unidiff import PatchSet 20 | 21 | BUG_TYPE_TO_PROMPT = { 22 | x.name: x.explanation for x in PM_TECHNIQUES_CLASSES + PM_TECHNIQUES_FUNCS 23 | } 24 | 25 | # MARK: Basic says-nothing prompt 26 | PROMPT_BASIC = ( 27 | """There is a bug in this codebase. Please look into it and resolve the issue.""" 28 | ) 29 | 30 | # MARK: Prompts that mention file names 31 | PROMPT_FILES = """There are bug(s) in this codebase, likely located in the following file(s): 32 | {gold_files} 33 | 34 | Please look into them and fix any bugs that you find.""" 35 | 36 | # MARK: Prompts that mention file + function names 37 | PROMPT_FILES_FUNCS = """There are bug(s) in this codebase, likely located in the following file(s). 
# MARK: Prompts that mention test cases
PROMPT_TESTS_BASIC = (
    """Several tests in the codebase are breaking. Please find the bugs and fix them."""
)
PROMPT_TESTS_F2P = """Several tests in the codebase are breaking.

The tests that are failing are:
{f2p_list}

Please fix the codebase such that the tests pass."""

# MARK: Prompts that mention the type of bug
PROMPT_BUG_TYPE_BASIC = """There is a bug in this codebase. {bug_type}Please look into it and resolve the issue."""
# BUGFIX: the original template contained a stray `" \` editing artifact after
# "following files:", which leaked verbatim into every generated problem
# statement built from this prompt.
PROMPT_BUG_TYPE_FILES = """There is a bug in this codebase. {bug_type}It seems to be related to the following files:
{gold_files}
Please look into these files and resolve the issue."""
PROMPT_BUG_TYPE_FILES_TESTS = """There is a bug in this codebase. {bug_type}It seems to be related to the following files:
{gold_files}

Please look into these files and resolve the issue. I believe a test case is also failing because of this bug:
{f2p_single}"""
def print_list(x):
    """Format an iterable of strings as a Markdown-style bulleted list."""
    bullet = "- "
    return bullet + ("\n" + bullet).join(x)


def get_bug_exp(instance) -> str:
    """Return the procedural-bug explanation whose name appears in the
    instance id, or "" when no known bug type matches."""
    inst_id = instance[KEY_INSTANCE_ID]
    for bug_type, prompt in BUG_TYPE_TO_PROMPT.items():
        if bug_type in inst_id:
            return prompt
    return ""


def get_changed_functions(patch_text) -> Set[str]:
    """Collect the hunk section headers (function context) that a patch
    actually modifies (adds or removes at least one line in)."""
    patch = PatchSet(patch_text.splitlines())
    changed_funcs = set()

    for file in patch:
        for hunk in file:
            # A hunk counts only if it really changes something; the set
            # dedupes repeated headers exactly as the per-line version did.
            if any(line.is_added or line.is_removed for line in hunk):
                header = hunk.section_header
                if header:
                    changed_funcs.add(header.strip())

    return changed_funcs
random.choice(instance[FAIL_TO_PASS]) 137 | instance["f2p_list"] = print_list(instance[FAIL_TO_PASS]) 138 | instance["gold_files"] = print_list( 139 | [x.path for x in PatchSet(instance["patch"])] 140 | ) 141 | instance["gold_funcs"] = print_list(get_changed_functions(instance["patch"])) 142 | 143 | prompt = random.choices(prompt_pool, weights=prompt_weights, k=1)[0] 144 | instance["problem_statement"] = prompt.format(**instance) 145 | out_path = dataset_path.parent / f"{dataset_path.stem}__ig_static.json" 146 | with open(out_path, "w") as f: 147 | json.dump(dataset, f, indent=2) 148 | print(f"Wrote dataset with static instructions to {out_path}") 149 | 150 | 151 | if __name__ == "__main__": 152 | parser = argparse.ArgumentParser() 153 | parser.add_argument("dataset_path", type=str, help="Path to the dataset file") 154 | args = parser.parse_args() 155 | main(**vars(args)) 156 | -------------------------------------------------------------------------------- /swesmith/issue_gen/utils.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | import random 4 | from typing import Any 5 | 6 | from pathlib import Path 7 | from swebench.harness.constants import FAIL_TO_PASS 8 | from swesmith.utils import clone_repo 9 | 10 | 11 | def extract_pytest_test( 12 | file_path: str | Path, test_name: str, class_name: str | None = None 13 | ) -> str | None: 14 | try: 15 | with open(file_path, "r", encoding="utf-8") as f: 16 | tree = ast.parse(f.read()) 17 | except Exception: 18 | return None 19 | 20 | # If class_name is provided, look inside the class 21 | if class_name: 22 | for node in tree.body: 23 | if isinstance(node, ast.ClassDef) and node.name == class_name: 24 | for method in node.body: 25 | if isinstance(method, ast.FunctionDef) and method.name == test_name: 26 | return ast.unparse(method) # Extract function from class 27 | else: 28 | # Look for a top-level function 29 | for node in tree.body: 30 | if 
isinstance(node, ast.FunctionDef) and node.name == test_name: 31 | return ast.unparse(node) # Extract function 32 | 33 | return None 34 | 35 | 36 | def get_test_function(instance: dict, idx: int | None = None) -> dict[str, Any]: 37 | # test names are in pytest format (e.g., test_file::test_name) 38 | test = ( 39 | random.choice(instance[FAIL_TO_PASS]) 40 | if idx is None 41 | else instance[FAIL_TO_PASS][idx] 42 | if idx < len(instance[FAIL_TO_PASS]) 43 | else instance[FAIL_TO_PASS][-1] 44 | ) 45 | class_name = None 46 | if "::" not in test: 47 | test_file = "test.py" 48 | test_name = test.split()[0] 49 | else: 50 | test_file, test_name = test.split("::", 1) 51 | if "::" in test_name: 52 | class_name, test_name = test_name.split("::", 1) 53 | # Remove any parameters from the test name 54 | test_name = test_name.split("[")[0] 55 | 56 | # Clone repo for instance 57 | repo = instance["repo"] 58 | repo_name = repo.split("/")[-1] 59 | cloned = clone_repo(repo_name) 60 | 61 | # Update test_file to be relative to the repo 62 | test_file = os.path.join(repo_name, test_file) 63 | 64 | return { 65 | "test_src": extract_pytest_test(test_file, test_name, class_name), 66 | "test_file": test_file, 67 | "test_name": test_name, 68 | "class_name": class_name, 69 | "repo_name": repo_name, 70 | "cloned": cloned, 71 | } 72 | -------------------------------------------------------------------------------- /swesmith/train/README.md: -------------------------------------------------------------------------------- 1 | # SWE-smith Training Code 2 | This folder contains the training scripts for fine-tuning on SWE-smith trajectories. 3 | 4 | The code is heavily inspired by the [SWE-gym](https://github.com/SWE-Gym/SWE-Gym) team. We thank them for open-sourcing their codebase, allowing for easy reproduction of the fine-tuning procedure they used. 
5 | If you found this part of the codebase useful, please make sure to [cite the SWE-gym](https://github.com/SWE-Gym/SWE-Gym?tab=readme-ov-file#-citation) team as well. 6 | 7 | ### Notes 8 | All fine-tuning + model serving is carried out with [Modal](https://modal.com/). 9 | 10 | To fine tune a model, follow this procedure: 11 | 1. Download a model checkpoint from HuggingFace 12 | ```bash 13 | modal run download_checkpoint.py --source-repo Qwen/Qwen2.5-7B-Instruct --target-dir /llm-weights/Qwen/Qwen2.5-7B-Instruct 14 | ``` 15 | 16 | 2. Run fine tuning with a SWE-smith dataset 17 | ```bash 18 | NGPUS=8 modal run train/run_ft_torchtune.py --config train/config/torchtune.yml 19 | ``` 20 | 21 | 3. Host model with `sglang` and run inference with SWE-agent 22 | ```bash 23 | N_HOURS=4 N_GPUS=4 modal run --detach serve_sglang.py --model-path /llm-weights/my-oss-model --served-model-name gpt-4o --tokenizer-path /llm-weights/Qwen/Qwen2.5-Coder-32B-Instruct 24 | ``` 25 | 26 | From the SWE-agent local repository, run the following command to start running inference with the local model 27 | ```bash 28 | #!/bin/bash 29 | 30 | sweagent run-batch \ 31 | --agent.model.api_base /v1 \ 32 | --agent.model.api_key swesmith \ # This is set in serve_sglang.py 33 | --agent.model.name gpt-4o \ # TODO: Change this when SWE-agent is fixed 34 | --instances.type swe_bench \ 35 | --instances.dataset_name jyang20/swebv-mini \ 36 | --instances.split test \ 37 | --config config/anthropic_no_fcalls.yaml 38 | # --instances.evaluate True # Install sb-cli for this 39 | ``` 40 | -------------------------------------------------------------------------------- /swesmith/train/difficulty_rater/create_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Purpose: Create difficulty train / test datasets from SWE-bench Verified annotations of task difficulty. 
3 | 4 | Usage: 5 | python train/difficulty_rater/create_datasets.py 6 | 7 | NOTE: Please include the follwing files in the same directory when running this script: 8 | - ensembled_annotations_public.csv 9 | - samples_with_3_annotations_public.csv 10 | """ 11 | 12 | import json 13 | import pandas as pd 14 | 15 | from collections import Counter 16 | from datasets import load_dataset 17 | from swebench.harness.constants import KEY_INSTANCE_ID 18 | 19 | PROMPT_SYSTEM = """Below I have given you information about a GitHub pull request. The information includes 20 | the problem statement describing the bug and the patch representing the changes made that 21 | successfully resolves the issue. Please categorize the difficulty of the original task based 22 | on this information. There are 4 levels of difficulty you can choose from: 23 | 24 | * <15 min fix 25 | * 15 min - 1 hour 26 | * 1-4 hours 27 | * >4 hours""" 28 | 29 | PROMPT_INSTANCE = """### Input: 30 | **Problem Statement** 31 | {problem_statement} 32 | 33 | **Solution Patch** 34 | {patch} 35 | 36 | **Response** 37 | """ 38 | 39 | if __name__ == "__main__": 40 | sweb = load_dataset("SWE-bench/SWE-bench") 41 | sweb_map = {x[KEY_INSTANCE_ID]: x for x in sweb["test"]} 42 | ensembled = pd.read_csv("ensembled_annotations_public.csv") 43 | samplesw3 = pd.read_csv("samples_with_3_annotations_public.csv") 44 | 45 | df = ensembled[[KEY_INSTANCE_ID, "difficulty"]] 46 | test_df = df.sample(frac=0.2, random_state=42) 47 | train_df = df.drop(test_df.index) 48 | print(f"Train size: {len(train_df)}, Test size: {len(test_df)}") 49 | 50 | for pair in [ 51 | ("difficulty_train.jsonl", train_df), 52 | ("difficulty_test.jsonl", test_df), 53 | ]: 54 | distribution = [] 55 | with open(pair[0], "w") as f: 56 | for row in pair[1].itertuples(index=False, name=None): 57 | inst = sweb_map[row[0]] 58 | label = row[1] 59 | if label == ">4 hours": 60 | label = "1-4 hours" 61 | messages = { 62 | "messages": [ 63 | {"role": "system", "content": 
PROMPT_SYSTEM}, 64 | {"role": "user", "content": PROMPT_INSTANCE.format(**inst)}, 65 | {"role": "assistant", "content": label}, 66 | ] 67 | } 68 | distribution.append(label) 69 | f.write(json.dumps(messages) + "\n") 70 | 71 | print(f"{pair[0]} distribution:") 72 | for k, v in Counter(distribution).items(): 73 | print(f"* {k}: {v} ({round(v * 100 / len(distribution), 2)}%)") 74 | 75 | check = [json.loads(x) for x in open("difficulty_train.jsonl").readlines()] 76 | print(len(check)) 77 | check = [json.loads(x) for x in open("difficulty_test.jsonl").readlines()] 78 | print(len(check)) 79 | -------------------------------------------------------------------------------- /swesmith/train/difficulty_rater/get_difficulties.py: -------------------------------------------------------------------------------- 1 | """ 2 | Purpose: Get difficulty ratings for different bugs 3 | 4 | Usage: 5 | python train/difficulty_rater/get_difficulties.py --base_url --dataset_path 6 | 7 | NOTE: 8 | Make sure the sglang server for the difficulty rating model is running. 
9 | """ 10 | 11 | import argparse 12 | import json 13 | import openai 14 | import os 15 | 16 | from collections import Counter 17 | from concurrent.futures import ThreadPoolExecutor, as_completed 18 | from swebench.harness.constants import KEY_INSTANCE_ID 19 | from swesmith.constants import SGLANG_API_KEY 20 | from swesmith.train.difficulty_rater.create_datasets import ( 21 | PROMPT_SYSTEM, 22 | PROMPT_INSTANCE, 23 | ) 24 | from tqdm.auto import tqdm 25 | 26 | DIFFICULTY_SCORE = {"15 min - 1 hour": 5, "1-4 hours": 9, "<15 min fix": 1} 27 | 28 | 29 | def process_instance(client, instance): 30 | try: 31 | response = client.chat.completions.create( 32 | model="gpt-4o", 33 | messages=[ 34 | {"role": "system", "content": PROMPT_SYSTEM}, 35 | {"role": "user", "content": PROMPT_INSTANCE.format(**instance)}, 36 | ], 37 | temperature=0, 38 | max_tokens=64, 39 | ) 40 | difficulty = response.choices[0].message.content.strip() 41 | return { 42 | KEY_INSTANCE_ID: instance[KEY_INSTANCE_ID], 43 | "difficulty": difficulty, 44 | } 45 | except: 46 | return { 47 | KEY_INSTANCE_ID: instance[KEY_INSTANCE_ID], 48 | "difficulty": "error", 49 | } 50 | 51 | 52 | def main(base_url, dataset_path, overwrite=False): 53 | client = openai.Client(base_url=f"{base_url}/v1", api_key=SGLANG_API_KEY) 54 | 55 | dataset = None 56 | if dataset_path.endswith(".json"): 57 | dataset = json.load(open(dataset_path)) 58 | elif dataset_path.endswith(".jsonl"): 59 | dataset = [json.loads(line) for line in open(dataset_path).readlines()] 60 | 61 | ext = ".json" if dataset_path.endswith(".json") else ".jsonl" 62 | difficulties_path = dataset_path.replace(ext, "_difficulties.jsonl") 63 | 64 | id_to_diff = {} 65 | completed = [] 66 | mode = "w" 67 | if os.path.exists(difficulties_path) and not overwrite: 68 | for line in open(difficulties_path).readlines(): 69 | line = json.loads(line) 70 | id_to_diff[line[KEY_INSTANCE_ID]] = line["difficulty"] 71 | completed.append(line[KEY_INSTANCE_ID]) 72 | print(f"Skipping 
{len(completed)} completed instances") 73 | dataset = [x for x in dataset if x[KEY_INSTANCE_ID] not in completed] 74 | mode = "a" 75 | 76 | print(f"Rating {len(dataset)} instances (will write to {difficulties_path})") 77 | num_threads = 4 # Adjust based on API rate limits 78 | with ( 79 | ThreadPoolExecutor(max_workers=num_threads) as executor, 80 | open(difficulties_path, mode) as f, 81 | ): 82 | future_to_instance = { 83 | executor.submit(process_instance, client, instance): instance 84 | for instance in dataset 85 | } 86 | 87 | for future in tqdm(as_completed(future_to_instance), total=len(dataset)): 88 | result = future.result() 89 | if result: # Skip None values 90 | f.write(json.dumps(result) + "\n") 91 | id_to_diff[result[KEY_INSTANCE_ID]] = result["difficulty"] 92 | 93 | print(f"Assessed difficulty for {len(id_to_diff)} instances") 94 | difficulty_dist = Counter(id_to_diff.values()) 95 | print(difficulty_dist) 96 | for k in list(difficulty_dist.keys()): 97 | if k not in DIFFICULTY_SCORE: 98 | del difficulty_dist[k] 99 | difficulty_rating = round( 100 | sum( 101 | DIFFICULTY_SCORE[rating] * count 102 | for rating, count in difficulty_dist.items() 103 | ) 104 | / sum(difficulty_dist.values()), 105 | 3, 106 | ) 107 | print(f"Difficulty score: {difficulty_rating}") 108 | print(f"Saved to {difficulties_path}") 109 | 110 | 111 | if __name__ == "__main__": 112 | parser = argparse.ArgumentParser( 113 | description="Get difficulty ratings for different bugs" 114 | ) 115 | parser.add_argument( 116 | "--base_url", type=str, required=True, help="Base URL of the Model API" 117 | ) 118 | parser.add_argument( 119 | "--dataset_path", type=str, required=True, help="Path to the dataset" 120 | ) 121 | parser.add_argument( 122 | "--overwrite", 123 | action="store_true", 124 | help="Whether to overwrite existing difficulties", 125 | ) 126 | args = parser.parse_args() 127 | main(**vars(args)) 128 | -------------------------------------------------------------------------------- 
def main(base_url: str):
    """Evaluate the difficulty-rater model against difficulty_test.jsonl."""
    test_insts = [json.loads(x) for x in open("difficulty_test.jsonl")]
    client = openai.Client(base_url=f"{base_url}/v1", api_key="swesmith")
    responses = []

    for inst in tqdm(test_insts):
        # The last message holds the gold label; the rest form the prompt.
        answer = inst["messages"][-1]["content"]
        messages = inst["messages"][:-1]

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            temperature=0,
            max_tokens=64,
        )
        resp = response.choices[0].message.content.strip()
        # Keep only the first line of the reply as the prediction.
        pred = resp.split("\n")[0] if "\n" in resp else resp
        responses.append([pred, answer, resp])

    accuracy = sum(p == a for p, a, _ in responses) / len(responses) * 100
    print(f"Accuracy: {round(accuracy, 4)}%")

    # Count predictions at the opposite extreme from the gold label
    # (<15 min vs 1-4 hours in either direction).
    extremes = {"1-4 hours", "<15 min fix"}
    sig_diff = sum(1 for p, a, _ in responses if {p, a} == extremes)
    print(f"# of significantly different preds: {sig_diff}")
"""
From: https://github.com/SWE-Gym/SWE-Gym/blob/main/scripts/modal_misc/download_checkpoint.py

Download a checkpoint from Hugging Face.

modal run download_checkpoint.py --source-repo /path/to/source_repo --target-dir /path/to/target_dir

Example:
modal run download_checkpoint.py --source-repo meta-llama/Llama-3.3-70B-Instruct --target-dir /llm-weights/meta-llama/Llama-3.3-70B-Instruct
"""

import modal
import os

from swesmith.constants import VOLUME_NAME_MODEL

app = modal.App("download-hf-ckpts")
# Persistent Modal volume where downloaded model weights are stored.
model_volume = modal.Volume.from_name(VOLUME_NAME_MODEL, create_if_missing=True)

# Image with git-lfs and the Hugging Face CLI, needed to fetch large weights.
image = (
    modal.Image.debian_slim(python_version="3.12")
    .apt_install(["git", "git-lfs"])
    .pip_install("huggingface_hub[cli]")
)


MINUTES = 60  # seconds
HOURS = 60 * MINUTES


@app.function(
    volumes={f"/{VOLUME_NAME_MODEL}": model_volume},
    image=image,
    timeout=1 * HOURS,
    secrets=[modal.Secret.from_name("john-hf-secret")],
)
def download_ckpts(source_repo: str, target_dir: str):
    """Download `source_repo` from Hugging Face into `target_dir` on the Modal volume.

    Runs remotely on Modal; the HF token comes from the attached secret.
    """
    # make sure target_dir exists
    os.makedirs(target_dir, exist_ok=True)

    import subprocess
    import sys

    # git-lfs must be initialized so large weight files are fetched correctly.
    command = "git lfs install"
    subprocess.run(
        command.split(),
        stdout=sys.stdout,
        stderr=sys.stderr,
        check=True,
    )

    command = f"huggingface-cli download {source_repo} --local-dir {target_dir}"
    subprocess.run(
        command.split(),
        stdout=sys.stdout,
        stderr=sys.stderr,
        check=True,
    )
    # Persist writes so other Modal functions see the downloaded files.
    model_volume.commit()


@app.local_entrypoint()
def main(source_repo: str, target_dir: str):
    # Thin local wrapper that dispatches the download to run remotely on Modal.
    download_ckpts.remote(source_repo=source_repo, target_dir=target_dir)
@app.function(
    image=torchtune_image,
    # gpu=modal.gpu.A100(count=N_GPU, size="80GB"),
    gpu=f"H100:{N_GPUS}",
    volumes={
        f"/{VOLUME_NAME_MODEL}": trained_model_volume,
        f"/{VOLUME_NAME_DATASET}": dataset_volume,
    },
    timeout=N_HOURS * HOURS,
    secrets=[
        modal.Secret.from_name("john-wandb-secret"),
        modal.Secret.from_name("john-hf-secret"),
    ],
)
def run_train(config_name: str, config: dict, n_gpus: int):
    """Materialize the config as YAML and launch distributed full fine-tuning.

    Runs remotely on Modal with the model/dataset volumes mounted; commits the
    model volume afterwards so the fine-tuned weights are persisted.
    """
    import subprocess
    import sys

    # Write the config dict to a file that `tune run` can consume.
    dest = f"/tmp/{config_name}.yaml"
    with open(dest, "w") as fh:
        yaml.dump(config, fh)

    cmd = (
        f"tune run --nnodes 1 --nproc_per_node {n_gpus} "
        f"full_finetune_distributed --config {dest}"
    )
    subprocess.run(
        cmd.split(),
        stdout=sys.stdout,
        stderr=sys.stderr,
        check=True,
    )
    # Persist the fine-tuned checkpoint to the Modal volume.
    trained_model_volume.commit()
@app.local_entrypoint()
def main(config: str):
    """Load a training config YAML and launch `run_train` remotely on Modal.

    Args:
        config: Local path to a torchtune YAML config file.
    """
    # Use the file stem (no extension) so run_train's f"/tmp/{name}.yaml"
    # does not produce a double extension like "foo.yml.yaml".
    config_name = os.path.splitext(os.path.basename(config))[0]
    with open(config, "r") as f:
        # Load into a separate name instead of shadowing the `config` path arg.
        cfg = yaml.safe_load(f)
    run_train.remote(config_name=config_name, config=cfg, n_gpus=N_GPUS)
@app.function(
    image=sglang_image,
    gpu=modal.gpu.A100(count=N_GPUS, size="80GB"),
    # gpu=modal.gpu.H100(count=N_GPUS),
    container_idle_timeout=5 * MINUTES,
    timeout=int(N_HOURS * HOURS),
    allow_concurrent_inputs=1000,
    volumes={f"/{VOLUME_NAME_MODEL}": volume},
)
def run_server(
    model_path: str,
    served_model_name: str,
    tokenizer_path: str,
    context_length: int,
    n_gpus: int,
):
    """Serve a model with SGLang on Modal, exposed through a TCP tunnel.

    The weights volume is mounted at /<VOLUME_NAME_MODEL>; `model_path` and
    `tokenizer_path` are expected to be paths on that volume.
    """
    # first check if model_path has config.json, if not copy it from tokenizer_path
    # (fine-tuned checkpoints may lack config.json; the base model's works.)
    if not os.path.exists(os.path.join(model_path, "config.json")):
        print(f"Copying config.json from {tokenizer_path} to {model_path}")
        shutil.copy(
            os.path.join(tokenizer_path, "config.json"),
            os.path.join(model_path, "config.json"),
        )
        # print the content of the config.json
        print("Content of the config.json:")
        with open(os.path.join(model_path, "config.json"), "r") as f:
            print(f.read())
    assert os.path.exists(os.path.join(model_path, "config.json")), (
        f"config.json not found in {model_path}. os.listdir(model_path): {os.listdir(model_path)}"
    )

    # Forward port 3000 so the server is reachable from outside Modal.
    with modal.forward(3000, unencrypted=True) as tunnel:
        command = f"python -m sglang.launch_server --model-path {model_path} --tokenizer-path {tokenizer_path} --tp-size {n_gpus} --port 3000 --host 0.0.0.0 --served-model-name {served_model_name} --context-length {context_length} --api-key {SGLANG_API_KEY}"
        print("Server listening at", tunnel.url)
        # Blocks until the server exits (or the Modal timeout fires).
        subprocess.run(
            command.split(),
            stdout=sys.stdout,
            stderr=sys.stderr,
            check=True,
        )


@app.local_entrypoint()
def main(
    model_path: str,
    served_model_name: str,
    tokenizer_path: str = "/llm-weights/Qwen/Qwen2.5-Coder-7B-Instruct",
    context_length: int = 32768,
):
    # Launch the remote server; GPU count / duration come from env vars.
    print(f"Serving {model_path} on {served_model_name} with {N_GPUS} GPUs")
    print(f"Timeout: {N_HOURS} hours")
    run_server.remote(
        model_path, served_model_name, tokenizer_path, context_length, N_GPUS
    )
"""
Remove unnecessary files from the trajectories directory.

Usage: python swesmith/train/traj_mgr/clean_trajs.py <traj_dir>
"""

import argparse
import os

# Per-instance log/config artifacts that are safe to delete.
_REMOVABLE_SUFFIXES = (
    ".config.yaml",
    ".debug.log",
    ".info.log",
    ".trace.log",
)


def main(traj_dir):
    """Delete log/config artifacts under every subfolder of `traj_dir`.

    Args:
        traj_dir: Path to a SWE-agent trajectories directory (must start
            with "trajectories" as a safety check).
    """
    assert traj_dir.startswith("trajectories"), (
        "This script can only be run on SWE-agent trajectories."
    )
    for folder in sorted(
        x for x in os.listdir(traj_dir) if os.path.isdir(os.path.join(traj_dir, x))
    ):
        folder = os.path.join(traj_dir, folder)
        removed = 0
        for root, _, files in os.walk(folder):
            for file in files:
                # str.endswith accepts a tuple of suffixes — one check.
                if file.endswith(_REMOVABLE_SUFFIXES):
                    if file == "run_batch.config.yaml":
                        # Keep the batch-level config describing the whole run.
                        continue
                    # Delete this file
                    os.remove(os.path.join(root, file))
                    removed += 1
        print(f"{folder}: Removed {removed} files.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "traj_dir",
        type=str,
        help="Path to the directory containing the trajectories.",
    )
    args = parser.parse_args()
    main(**vars(args))
"""
Purpose: Combine multiple .jsonl files together and shuffle the lines, where the .jsonl files correspond to
SFT datasets of SWE-agent expert trajectories.

Usage: You should run this script in the root directory of the SWE-agent repository.

python -m swesmith.train.traj_mgr.combine_trajs
"""

import argparse
import json
import os
import random
import rich
import sys

from pathlib import Path
from sparklines import sparklines
from swebench.harness.constants import KEY_INSTANCE_ID

SFT_DIR = Path("trajectories_sft/")


def _expand_indices(part: str) -> list[int]:
    """Expand one token ('7' or '11-13') into a list of integer indices."""
    if "-" in part:
        start, end = part.split("-")
        return list(range(int(start), int(end) + 1))
    return [int(part.strip())]


def merge_and_shuffle_jsonl(
    max_per_inst: int = 3,
    output_file: str | None = None,
    seed: int = 24,
):
    """Interactively merge selected SFT .jsonl files into one shuffled dataset.

    Prompts the user to pick files under trajectories_sft/, keeps at most
    `max_per_inst` trajectories per instance id, shuffles the result with
    `seed`, writes it to `output_file` (prompted for if not given), and writes
    a sibling metadata__<name>.json with distribution stats.
    """
    # List all .jsonl files in trajectories_sft/ and let the user pick.
    try:
        all_jsonl = sorted([f for f in os.listdir(SFT_DIR) if f.endswith(".jsonl")])
        print("Select 2+ files to merge:")
        print("Index | Filename | # Trajectories")
        for idx, file in enumerate(all_jsonl):
            # Count lines via a context manager (no leaked file handles).
            with open(os.path.join(SFT_DIR, file), "r", encoding="utf-8") as fh:
                num_trajs = sum(1 for _ in fh)
            print(f"{idx}: {file} ({num_trajs})")
        selected_indices = input(
            "Enter the indices of the files to merge (specify indices or range of indices, e.g. `7 11-13`): "
        )
        indices = [
            idx for part in selected_indices.split() for idx in _expand_indices(part)
        ]
        files = [os.path.join(SFT_DIR, all_jsonl[idx]) for idx in indices]

        if not output_file:
            output_file = input("Name of output file (without extension): ") + ".jsonl"
            output_file = Path(os.path.join(SFT_DIR, output_file))
        else:
            output_file = Path(output_file)
    except KeyboardInterrupt:
        print("\nExiting...")
        return

    # Group trajectories by instance id across all selected files.
    inst_to_trajs = {}
    for file in files:
        try:
            with open(file, "r", encoding="utf-8") as f:
                for line in f:
                    traj = json.loads(line)
                    inst_to_trajs.setdefault(traj[KEY_INSTANCE_ID], []).append(traj)
        except FileNotFoundError:
            print(f"Warning: File not found - {file}", file=sys.stderr)
        except Exception as e:
            print(f"Error reading {file}: {e}", file=sys.stderr)

    # Subsample at most `max_per_inst` per instance; track bug-type and repo
    # distributions for the metadata file.
    merged_trajs = []
    random.seed(seed)
    bug_types, repo_count = {}, {}
    for inst_id, trajs in inst_to_trajs.items():
        s = min(len(trajs), max_per_inst)
        merged_trajs.extend(random.sample(trajs, s))

        # Instance ids look like `<repo>.<bug_type>_<hash>`.
        bug_type = inst_id.rsplit(".", 1)[-1].rsplit("_", 1)[0]
        if bug_type.startswith("func_pm"):
            # Collapse procedural-modification variants into one bucket.
            bug_type = "func_pm"
        bug_types[bug_type] = bug_types.get(bug_type, 0) + s

        repo = inst_id.rsplit(".", 1)[0]
        repo_count[repo] = repo_count.get(repo, 0) + s
    random.shuffle(merged_trajs)
    rich.print(bug_types)
    rich.print(sparklines(bug_types.values())[0])

    # Write to the output file
    with open(output_file, "w", encoding="utf-8") as f:
        for traj in merged_trajs:
            f.write(json.dumps(traj) + "\n")

    print(
        f"Merged and shuffled content written to {output_file} ({len(merged_trajs)} lines)"
    )

    metadata_file = str(output_file.parent / f"metadata__{output_file.stem}.json")
    print(f"Writing metadata to {metadata_file}")
    with open(metadata_file, "w") as f:
        json.dump(
            {
                "output_file": str(output_file),
                "num_files": len(files),
                "num_trajs": len(merged_trajs),
                "max_per_inst": max_per_inst,
                "bug_types_dist": bug_types,
                "seed": seed,
                "files": files,
                "repo_count": [
                    f"{repo} | {count}"
                    for repo, count in sorted(
                        repo_count.items(), key=lambda x: x[1], reverse=True
                    )
                ],
            },
            f,
            indent=4,
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Merge and shuffle multiple JSONL files."
    )
    parser.add_argument(
        "-m",
        "--max_per_inst",
        type=int,
        default=3,
        help="Max number of trajectories per instance.",
    )
    parser.add_argument("-o", "--output_file", help="Name of the output file.")
    parser.add_argument(
        "-s", "--seed", type=int, default=24, help="Random seed for shuffling."
    )

    args = parser.parse_args()
    merge_and_shuffle_jsonl(**vars(args))
def main(
    traj_dir: str,
    eval_dir: str,
    style: str,
    only_resolved: bool = False,
    out_dir: str = ".",
):
    """Convert SWE-agent .traj files into a fine-tuning .jsonl dataset.

    Args:
        traj_dir: Folder containing one subfolder per instance, each with
            an `<instance_id>.traj` file.
        eval_dir: Folder of evaluation results (one subfolder per instance
            containing a report.json).
        style: Transformation style; must be a key of MAP_STYLE_TO_FUNC.
        only_resolved: If True, keep only instances whose report marks them
            as resolved.
        out_dir: Directory to write `ft_<style>_<run>.jsonl` into.

    Raises:
        ValueError: If `style` is not a supported transformation.
    """
    if style not in MAP_STYLE_TO_FUNC:
        raise ValueError(
            f"Style {style} not supported. Options: {list(MAP_STYLE_TO_FUNC.keys())}"
        )
    transform_traj = MAP_STYLE_TO_FUNC[style]

    folders = [
        x for x in os.listdir(traj_dir) if os.path.isdir(os.path.join(traj_dir, x))
    ]
    print(f"Found {len(folders)} trajectory folders in {traj_dir}")

    if only_resolved and os.path.exists(eval_dir):
        print("Only keeping trajectories for resolved instances")

    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"ft_{style}_{os.path.basename(eval_dir)}.jsonl")

    # Snapshot eval_dir once; previously os.listdir ran per folder (O(n^2)).
    eval_folders = set(os.listdir(eval_dir))

    num_trajs = 0
    with open(out_path, "w") as f:
        for folder in tqdm(folders):
            if folder not in eval_folders:
                continue
            report_path = os.path.join(eval_dir, folder, "report.json")
            if not os.path.exists(report_path):
                continue

            if only_resolved:
                with open(report_path, "r") as rf:
                    report = json.load(rf)
                # Reports are either flat or keyed by instance id.
                is_resolved = (
                    report.get("resolved", False)
                    if folder not in report
                    else report[folder].get("resolved", False)
                )
                if not is_resolved:
                    continue

            traj_path = os.path.join(traj_dir, folder, f"{folder}.traj")
            with open(traj_path, "r") as tf:
                traj = transform_traj(json.load(tf))
            traj["instance_id"] = folder
            f.write(json.dumps(traj) + "\n")
            num_trajs += 1

    print(f"Found {num_trajs} valid trajectories")
    print(f"Wrote trajectories to {out_path}")
def main(traj_list_file: Path, out_path: Path) -> None:
    """Transform every trajectory listed in `traj_list_file` to XML format.

    `traj_list_file` is a JSON array of paths to .traj files; each is
    transformed with `transform_traj_xml` and appended as one JSON line
    to `out_path`.
    """
    paths = json.loads(Path(traj_list_file).read_text())
    print(f"Transforming {len(paths)} trajectories")

    # Make sure the destination directory exists before opening the sink.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w") as sink:
        for p in tqdm(paths):
            raw = json.loads(Path(p).read_text())
            sink.write(json.dumps(transform_traj_xml(raw)) + "\n")

    print(f"Wrote {len(paths)} trajectories to {out_path}")
def transform_traj_backticks(traj: dict) -> dict:
    """Convert a SWE-agent trajectory into chat messages with ``` fenced actions.

    The last trajectory entry carries the full message history; its final
    message is the post-submit patch output, so it is dropped. Tool messages
    are relabeled as user messages.
    """
    history = traj["trajectory"][-1]["messages"][:-1]
    converted = []
    for msg in history:
        role = "user" if msg["role"] == "tool" else msg["role"]
        if msg["role"] == "assistant":
            # Thought followed by the action wrapped in a fenced code block.
            content = f"{msg['thought']}\n\n```\n{msg['action']}\n```"
        elif msg["role"] == "system":
            content = msg["content"]
        else:
            # User/tool content arrives as a single-element list of text parts.
            assert len(msg["content"]) == 1
            content = msg["content"][0]["text"]
        converted.append({"role": role, "content": content})
    return {"messages": converted}
def get_arch_and_platform() -> tuple[str, str]:
    """
    Get the architecture and platform for the current machine.

    Returns a pair (arch, docker platform string), where arch is "arm64"
    for aarch64/arm64 machines and "x86_64" otherwise.
    """
    machine = platform.machine()
    arch = "arm64" if machine in {"aarch64", "arm64"} else "x86_64"
    # Map the normalized arch onto the docker platform identifier.
    platform_by_arch = {"x86_64": "linux/x86_64", "arm64": "linux/arm64/v8"}
    if arch not in platform_by_arch:
        raise ValueError(f"Invalid architecture: {arch}")
    return arch, platform_by_arch[arch]
30 | """ 31 | arch = arch or get_arch_and_platform()[0] 32 | return f"swesmith.{arch}.{repo.replace('/', '__').lower()}.{commit[:8]}" 33 | 34 | 35 | def get_repo_commit_from_image_name(image_name: str) -> tuple[str, str]: 36 | """ 37 | Get the repository and commit from a docker image ID. 38 | """ 39 | # Parsing supports repos with '.' in their name 40 | image_name = image_name.split(".", 2)[-1] 41 | repo = image_name.rsplit(".", 1)[0].replace("__", "/") 42 | partial_commit = image_name.rsplit(".", 1)[-1] 43 | for repo_name in MAP_REPO_TO_SPECS: 44 | # Hack because docker image_name must be lowercase 45 | if repo_name.lower() == repo: 46 | repo = repo_name 47 | break 48 | commit = get_full_commit(repo, partial_commit) 49 | return repo, commit 50 | 51 | 52 | def get_env_yml_path(repo: str, commit: str) -> str: 53 | """ 54 | Get the path to the environment.yml file for a repository at a specific commit. 55 | """ 56 | if len(commit) != 40: 57 | raise ValueError( 58 | f"Must provide full commit hash, not partial commit ({commit})" 59 | ) 60 | return f"{LOG_DIR_ENV_RECORDS}/sweenv_{repo.replace('/', '__')}_{commit}.yml" 61 | 62 | 63 | def get_full_commit(repo, partial_commit) -> str: 64 | """ 65 | Get the full commit hash for a repository at a specific commit. 66 | """ 67 | for commit in MAP_REPO_TO_SPECS[repo]: 68 | if commit.startswith(partial_commit): 69 | return commit 70 | 71 | raise ValueError(f"Commit {partial_commit} not found for repository {repo}.") 72 | 73 | 74 | def get_repo_name(repo, commit) -> str: 75 | """ 76 | Get the SWE-smith GitHub repository name for a repository at a specific commit. 
def get_test_paths(dir_path: str, ext: str = ".py") -> list[Path]:
    """
    Get all testing file paths relative to the given directory.

    A file counts as a test if it lives under a tests/test/specs directory,
    its name starts with "test", or its stem ends with "test"; it must also
    match `ext` (pass ext=None to accept any extension).
    """
    test_dir_names = {"tests", "test", "specs"}
    paths: list[Path] = []
    for root, _, files in os.walk(Path(dir_path).resolve()):
        # Use Path.parts instead of splitting on "/": os.walk joins with
        # os.sep, so the old root.split("/") check broke on Windows.
        in_test_dir = bool(test_dir_names & set(Path(root).parts))
        for file in files:
            if ext is not None and not file.endswith(ext):
                continue
            stem = file.rsplit(".", 1)[0]
            if in_test_dir or file.lower().startswith("test") or stem.endswith("test"):
                paths.append(Path(os.path.relpath(os.path.join(root, file), dir_path)))
    return paths
def does_repo_exist(repo: str) -> bool:
    """
    Check if a repository exists in project organization.

    Pages through the organization's repository list via the GitHub API
    until a short page signals the end, instead of assuming the org has
    at most 200 repositories.
    """
    GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
    api = GhApi(token=GITHUB_TOKEN)
    page = 1
    while True:
        batch = api.repos.list_for_org(ORG_NAME, per_page=100, page=page)  # type: ignore
        if any(x["name"] == repo for x in batch):
            return True
        # A page shorter than per_page means there are no more results.
        if len(batch) < 100:
            return False
        page += 1
def test_get_function_signature_with_defaults():
    # Default values are dropped: the rendered signature keeps only the
    # parameter names, so `b=2` appears as plain `b`.
    node = parse_func("def baz(a, b=2): pass")
    assert get_function_signature(node) == "def baz(a, b)"
def test_class_shuffle_methods(src, expected_variants):
    # Parametrized: `src` is class source code and `expected_variants` lists
    # every acceptable method ordering, since the shuffle outcome depends on
    # the modifier's RNG (presumably fixed by seed=42 — verify against impl).
    module = libcst.parse_module(src)
    modifier = ClassShuffleMethodsModifier(likelihood=1.0, seed=42)
    modified = module.visit(modifier)
    assert any(
        modified.code.strip() == variant.strip() for variant in expected_variants
    )
Bar:\n\n", 115 | ], 116 | ), 117 | # No removal if no methods 118 | ( 119 | """class Baz: 120 | x = 1 121 | """, 122 | [ 123 | "class Baz:\n x = 1\n", 124 | ], 125 | ), 126 | ], 127 | ) 128 | def test_class_remove_funcs(src, expected_variants): 129 | module = libcst.parse_module(src) 130 | modifier = ClassRemoveFuncsModifier(likelihood=1.0, seed=42) 131 | modified = module.visit(modifier) 132 | assert any( 133 | modified.code.strip() == variant.strip() for variant in expected_variants 134 | ) 135 | -------------------------------------------------------------------------------- /tests/bug_gen/procedural/test_control_flow.py: -------------------------------------------------------------------------------- 1 | import libcst 2 | import pytest 3 | from swesmith.bug_gen.procedural.control_flow import ( 4 | ControlIfElseInvertModifier, 5 | ControlShuffleLinesModifier, 6 | ) 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "src,expected", 11 | [ 12 | # Simple if-else inversion 13 | ( 14 | """ 15 | def foo(x): 16 | if x > 0: 17 | return 1 18 | else: 19 | return -1 20 | """, 21 | """def foo(x): 22 | if x > 0: 23 | return -1 24 | else: 25 | return 1 26 | """, 27 | ), 28 | # No else branch, should not change 29 | ( 30 | """ 31 | def bar(x): 32 | if x == 0: 33 | return 0 34 | """, 35 | """def bar(x): 36 | if x == 0: 37 | return 0 38 | """, 39 | ), 40 | ], 41 | ) 42 | def test_control_if_else_invert(src, expected): 43 | module = libcst.parse_module(src) 44 | modifier = ControlIfElseInvertModifier(likelihood=1.0, seed=42) 45 | modified = module.visit(modifier) 46 | assert modified.code.strip() == expected.strip() 47 | 48 | 49 | @pytest.mark.parametrize( 50 | "src,expected_variants", 51 | [ 52 | # Function with two statements to shuffle 53 | ( 54 | """ 55 | def foo(): 56 | a = 1 57 | b = 2 58 | """, 59 | [ 60 | "def foo():\n a = 1\n b = 2\n", 61 | "def foo():\n b = 2\n a = 1\n", 62 | ], 63 | ), 64 | # Function with only one statement, should not change 65 | ( 66 | """ 67 | def bar(): 
68 | x = 42 69 | """, 70 | [ 71 | "def bar():\n x = 42\n", 72 | ], 73 | ), 74 | ], 75 | ) 76 | def test_control_shuffle_lines(src, expected_variants): 77 | module = libcst.parse_module(src) 78 | modifier = ControlShuffleLinesModifier(likelihood=1.0, seed=42) 79 | modified = module.visit(modifier) 80 | assert any( 81 | modified.code.strip() == variant.strip() for variant in expected_variants 82 | ) 83 | -------------------------------------------------------------------------------- /tests/bug_gen/procedural/test_operations.py: -------------------------------------------------------------------------------- 1 | import libcst 2 | import pytest 3 | from swesmith.bug_gen.procedural.operations import ( 4 | OperationBreakChainsModifier, 5 | OperationChangeConstantsModifier, 6 | ) 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "src,expected_variants", 11 | [ 12 | # Case 1: left is a BinaryOperation 13 | ( 14 | """ 15 | def foo(a, b, c): 16 | return a + b + c 17 | """, 18 | [ 19 | "def foo(a, b, c):\n return a + c\n", 20 | "def foo(a, b, c):\n return a + b\n", 21 | ], 22 | ), 23 | # Case 2: right is a BinaryOperation 24 | ( 25 | """ 26 | def bar(x, y, z): 27 | return x * (y * z) 28 | """, 29 | [ 30 | "def bar(x, y, z):\n return x * z\n", 31 | ], 32 | ), 33 | # Case 3: no BinaryOperation, should not change 34 | ( 35 | """ 36 | def baz(x): 37 | return x + 1 38 | """, 39 | [ 40 | "def baz(x):\n return x + 1\n", 41 | ], 42 | ), 43 | # Case 4: multiple BinaryOperations, should break one chain 44 | ( 45 | """ 46 | def qux(a, b, c, d): 47 | return a + b + c * d 48 | """, 49 | [ 50 | "def qux(a, b, c, d):\n return a + (c * d)\n", 51 | "def qux(a, b, c, d):\n return (a + b) + c\n", 52 | "def qux(a, b, c, d):\n return a + c * d\n", 53 | "def qux(a, b, c, d):\n return (a + b) + d\n", 54 | ], 55 | ), 56 | ], 57 | ) 58 | def test_operation_break_chains(src, expected_variants): 59 | module = libcst.parse_module(src) 60 | modifier = OperationBreakChainsModifier(likelihood=0.5, seed=42) # 
deterministic 61 | modified = module.visit(modifier) 62 | result = modified.code 63 | assert any(result.strip() == variant.strip() for variant in expected_variants), ( 64 | f"Got: {result!r}, expected one of: {expected_variants!r}" 65 | ) 66 | 67 | 68 | @pytest.mark.parametrize( 69 | "src,expected_variants", 70 | [ 71 | # Case 1: left is an integer constant 72 | ( 73 | """ 74 | def foo(): 75 | return 2 + x 76 | """, 77 | [ 78 | "def foo():\n return 1 + x\n", 79 | "def foo():\n return 3 + x\n", 80 | ], 81 | ), 82 | # Case 2: right is an integer constant 83 | ( 84 | """ 85 | def bar(): 86 | return y - 5 87 | """, 88 | [ 89 | "def bar():\n return y - 4\n", 90 | "def bar():\n return y - 6\n", 91 | ], 92 | ), 93 | # Case 3: both sides are integer constants 94 | ( 95 | """ 96 | def baz(): 97 | return 10 * 20 98 | """, 99 | [ 100 | "def baz():\n return 9 * 19\n", 101 | "def baz():\n return 9 * 21\n", 102 | "def baz():\n return 11 * 19\n", 103 | "def baz():\n return 11 * 21\n", 104 | ], 105 | ), 106 | # Case 4: no integer constants, should not change 107 | ( 108 | """ 109 | def qux(a, b): 110 | return a / b 111 | """, 112 | [ 113 | "def qux(a, b):\n return a / b\n", 114 | ], 115 | ), 116 | ], 117 | ) 118 | def test_operation_change_constants(src, expected_variants): 119 | module = libcst.parse_module(src) 120 | modifier = OperationChangeConstantsModifier(likelihood=1.0, seed=42) # always flip 121 | modified = module.visit(modifier) 122 | result = modified.code 123 | assert any(result.strip() == variant.strip() for variant in expected_variants), ( 124 | f"Got: {result!r}, expected one of: {expected_variants!r}" 125 | ) 126 | -------------------------------------------------------------------------------- /tests/bug_gen/procedural/test_remove.py: -------------------------------------------------------------------------------- 1 | import libcst 2 | import pytest 3 | from swesmith.bug_gen.procedural.remove import ( 4 | RemoveLoopModifier, 5 | RemoveConditionalModifier, 6 | 
RemoveAssignModifier, 7 | RemoveWrapperModifier, 8 | ) 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "src,expected", 13 | [ 14 | # Remove for loop 15 | ( 16 | """ 17 | def foo(): 18 | for i in range(3): 19 | print(i) 20 | return 1 21 | """, 22 | """def foo(): 23 | return 1 24 | """, 25 | ), 26 | # Remove while loop 27 | ( 28 | """ 29 | def bar(): 30 | while True: 31 | break 32 | return 2 33 | """, 34 | """def bar(): 35 | return 2 36 | """, 37 | ), 38 | ], 39 | ) 40 | def test_remove_loop(src, expected): 41 | module = libcst.parse_module(src) 42 | modifier = RemoveLoopModifier(likelihood=1.0, seed=42) 43 | modified = module.visit(modifier) 44 | assert modified.code.strip() == expected.strip() 45 | 46 | 47 | @pytest.mark.parametrize( 48 | "src,expected", 49 | [ 50 | # Remove if statement 51 | ( 52 | """ 53 | def foo(x): 54 | if x > 0: 55 | return x 56 | return 0 57 | """, 58 | """def foo(x): 59 | return 0 60 | """, 61 | ), 62 | # If with else, remove whole if 63 | ( 64 | """ 65 | def bar(x): 66 | if x < 0: 67 | return -1 68 | else: 69 | return 1 70 | """, 71 | """def bar(x): 72 | pass 73 | """, 74 | ), 75 | ], 76 | ) 77 | def test_remove_conditional(src, expected): 78 | module = libcst.parse_module(src) 79 | modifier = RemoveConditionalModifier(likelihood=1.0, seed=42) 80 | modified = module.visit(modifier) 81 | assert modified.code.strip() == expected.strip() 82 | 83 | 84 | @pytest.mark.parametrize( 85 | "src,expected", 86 | [ 87 | # Remove assignment 88 | ( 89 | """ 90 | def foo(): 91 | x = 1 92 | return x 93 | """, 94 | """def foo(): 95 | return x 96 | """, 97 | ), 98 | # Remove augmented assignment 99 | ( 100 | """ 101 | def bar(): 102 | y = 2 103 | y += 3 104 | return y 105 | """, 106 | """def bar(): 107 | return y 108 | """, 109 | ), 110 | ], 111 | ) 112 | def test_remove_assign(src, expected): 113 | module = libcst.parse_module(src) 114 | modifier = RemoveAssignModifier(likelihood=1.0, seed=42) 115 | modified = module.visit(modifier) 116 | assert 
modified.code.strip() == expected.strip() 117 | 118 | 119 | @pytest.mark.parametrize( 120 | "src,expected", 121 | [ 122 | # Remove with block 123 | ( 124 | """ 125 | def foo(): 126 | with open('f') as f: 127 | data = f.read() 128 | return 1 129 | """, 130 | """def foo(): 131 | return 1 132 | """, 133 | ), 134 | # Remove try block 135 | ( 136 | """ 137 | def bar(): 138 | try: 139 | x = 1 140 | except Exception: 141 | x = 2 142 | return x 143 | """, 144 | """def bar(): 145 | return x 146 | """, 147 | ), 148 | ], 149 | ) 150 | def test_remove_wrapper(src, expected): 151 | module = libcst.parse_module(src) 152 | modifier = RemoveWrapperModifier(likelihood=1.0, seed=42) 153 | modified = module.visit(modifier) 154 | assert modified.code.strip() == expected.strip() 155 | -------------------------------------------------------------------------------- /tests/bug_gen/test_utils.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | import shutil 4 | import tempfile 5 | import unittest 6 | 7 | from swesmith.bug_gen import utils 8 | 9 | 10 | class TestUtils(unittest.TestCase): 11 | def setUp(self): 12 | self.test_dir = tempfile.mkdtemp() 13 | self.test_file = os.path.join(self.test_dir, "test.py") 14 | with open(self.test_file, "w") as f: 15 | f.write(""" 16 | def foo(): 17 | return 1 18 | 19 | class Bar: 20 | def baz(self): 21 | return 2 22 | """) 23 | 24 | def tearDown(self): 25 | shutil.rmtree(self.test_dir) 26 | 27 | def test_apply_code_change(self): 28 | # Setup CodeEntity and BugRewrite 29 | node = ast.parse(open(self.test_file).read()).body[0] 30 | entity = utils.get_entity_from_node( 31 | node, open(self.test_file).read(), self.test_file 32 | ) 33 | bug = utils.BugRewrite( 34 | rewrite="def foo():\n return 42\n", 35 | explanation="change return", 36 | strategy="test", 37 | ) 38 | utils.apply_code_change(entity, bug) 39 | with open(self.test_file) as f: 40 | content = f.read() 41 | self.assertIn("return 
42", content) 42 | 43 | def test_apply_patches(self): 44 | # Create a git repo and patch file 45 | repo = tempfile.mkdtemp() 46 | subprocess = __import__("subprocess") 47 | subprocess.run(["git", "init"], cwd=repo, check=True, stdout=subprocess.DEVNULL) 48 | test_file = os.path.join(repo, "a.py") 49 | with open(test_file, "w") as f: 50 | f.write("print('hi')\n") 51 | for cmd in [ 52 | "git branch -m main", 53 | "git add a.py", 54 | 'git config user.email "you@example.com"', 55 | 'git config user.name "Your Name"', 56 | "git commit -m init", 57 | ]: 58 | subprocess.run( 59 | cmd.split(), 60 | cwd=repo, 61 | check=True, 62 | stdout=subprocess.DEVNULL, 63 | ) 64 | with open(test_file, "w") as f: 65 | f.write("print('bye')\n") 66 | patch = utils.get_patch(repo) 67 | patch_file = os.path.join(self.test_dir, "patch.diff") 68 | print(patch) 69 | with open(patch_file, "w") as f: 70 | f.write(patch) 71 | # Reset rep o before applying patch 72 | subprocess.run( 73 | ["git", "reset", "--hard"], cwd=repo, check=True, stdout=subprocess.DEVNULL 74 | ) 75 | subprocess.run( 76 | ["git", "clean", "-fd"], cwd=repo, check=True, stdout=subprocess.DEVNULL 77 | ) 78 | # Apply the patch 79 | result = utils.apply_patches(repo, [patch_file]) 80 | self.assertIsInstance(result, str) 81 | shutil.rmtree(repo) 82 | 83 | def test_extract_entities_from_directory(self): 84 | entities = utils.extract_entities_from_directory( 85 | self.test_dir, "func", exclude_tests=False 86 | ) 87 | self.assertTrue(any(e.src_code.startswith("def foo") for e in entities)) 88 | entities = utils.extract_entities_from_directory( 89 | self.test_dir, "class", exclude_tests=False 90 | ) 91 | self.assertTrue(any("class Bar" in e.src_code for e in entities)) 92 | 93 | def test_get_combos(self): 94 | items = [1, 2, 3] 95 | combos = utils.get_combos(items, 2, 2) 96 | self.assertEqual(len(combos), 2) 97 | self.assertTrue(all(len(c) >= 2 for c in combos)) 98 | 99 | def test_get_entity_from_node(self): 100 | with 
open(self.test_file) as f: 101 | content = f.read() 102 | tree = ast.parse(content) 103 | node = tree.body[0] 104 | entity = utils.get_entity_from_node(node, content, self.test_file) 105 | self.assertEqual(entity.line_start, 2) 106 | self.assertIn("def foo", entity.src_code) 107 | 108 | def test_get_patch(self): 109 | repo = tempfile.mkdtemp() 110 | subprocess = __import__("subprocess") 111 | subprocess.run(["git", "init"], cwd=repo, check=True, stdout=subprocess.DEVNULL) 112 | test_file = os.path.join(repo, "b.py") 113 | with open(test_file, "w") as f: 114 | f.write("print('hi')\n") 115 | for cmd in [ 116 | "git add b.py", 117 | 'git config user.email "you@example.com"', 118 | 'git config user.name "Your Name"', 119 | "git commit -m init", 120 | ]: 121 | subprocess.run( 122 | cmd.split(), 123 | cwd=repo, 124 | check=True, 125 | stdout=subprocess.DEVNULL, 126 | ) 127 | with open(test_file, "w") as f: 128 | f.write("print('bye')\n") 129 | patch = utils.get_patch(repo) 130 | self.assertIsInstance(patch, str) 131 | shutil.rmtree(repo) 132 | 133 | 134 | if __name__ == "__main__": 135 | unittest.main() 136 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common pytest fixtures and configuration for SWE-smith tests. 
3 | """ 4 | 5 | import os 6 | import sys 7 | 8 | # Add the repository root to the Python path to ensure imports work correctly 9 | repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 10 | if repo_root not in sys.path: 11 | sys.path.insert(0, repo_root) 12 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from swesmith.utils import * 2 | from unittest.mock import patch 3 | 4 | 5 | def test_get_repo_commit_from_image_name(): 6 | image_name = "swesmith.x86_64.instagram__monkeytype.70c3acf6" 7 | repo, commit = get_repo_commit_from_image_name(image_name) 8 | assert repo == "Instagram/MonkeyType" 9 | assert commit == "70c3acf62950be5dfb28743c7a719bfdecebcd84" 10 | 11 | 12 | def test_get_repo_name(): 13 | repo = "Instagram/MonkeyType" 14 | commit = "70c3acf62950be5dfb28743c7a719bfdecebcd84" 15 | image_name = get_image_name(repo, commit) 16 | assert image_name == "swesmith.x86_64.instagram__monkeytype.70c3acf6" 17 | 18 | 19 | def test_get_full_commit(): 20 | repo = "Instagram/MonkeyType" 21 | partial_commit = "70c3acf6" 22 | full_commit = get_full_commit(repo, partial_commit) 23 | assert full_commit == "70c3acf62950be5dfb28743c7a719bfdecebcd84" 24 | 25 | 26 | def test_clone_repo(): 27 | repo = "TestRepo" 28 | dest = None 29 | org = "TestOrg" 30 | expected_cmd = f"git clone git@github.com:{org}/{repo}.git" 31 | with ( 32 | patch("os.path.exists", return_value=False) as mock_exists, 33 | patch("subprocess.run") as mock_run, 34 | ): 35 | result = clone_repo(repo, dest, org) 36 | mock_exists.assert_called_once_with(repo) 37 | mock_run.assert_called_once_with( 38 | expected_cmd, 39 | check=True, 40 | shell=True, 41 | stdout=subprocess.DEVNULL, 42 | stderr=subprocess.DEVNULL, 43 | ) 44 | assert result == repo 45 | 46 | # Test with dest specified 47 | dest = "some_dir" 48 | expected_cmd = f"git clone 
git@github.com:{org}/{repo}.git {dest}" 49 | with ( 50 | patch("os.path.exists", return_value=False) as mock_exists, 51 | patch("subprocess.run") as mock_run, 52 | ): 53 | result = clone_repo(repo, dest, org) 54 | mock_exists.assert_called_once_with(dest) 55 | mock_run.assert_called_once_with( 56 | expected_cmd, 57 | check=True, 58 | shell=True, 59 | stdout=subprocess.DEVNULL, 60 | stderr=subprocess.DEVNULL, 61 | ) 62 | assert result == dest 63 | 64 | # Test when repo already exists 65 | with ( 66 | patch("os.path.exists", return_value=True) as mock_exists, 67 | patch("subprocess.run") as mock_run, 68 | ): 69 | result = clone_repo(repo, dest, org) 70 | mock_exists.assert_called_once_with(dest) 71 | mock_run.assert_not_called() 72 | assert result is None 73 | 74 | 75 | def test_get_test_paths(tmp_path): 76 | # Create directory structure 77 | (tmp_path / "tests").mkdir() 78 | (tmp_path / "src").mkdir() 79 | (tmp_path / "specs").mkdir() 80 | # Test files 81 | test_files = [ 82 | tmp_path / "tests" / "test_foo.py", 83 | tmp_path / "tests" / "foo_test.py", 84 | tmp_path / "specs" / "bar_test.py", 85 | tmp_path / "src" / "test_bar.py", 86 | tmp_path / "src" / "baz_test.py", 87 | ] 88 | # Non-test files 89 | non_test_files = [ 90 | tmp_path / "src" / "foo.py", 91 | tmp_path / "src" / "bar.txt", 92 | tmp_path / "src" / "gin.py", 93 | ] 94 | for f in test_files + non_test_files: 95 | f.parent.mkdir(parents=True, exist_ok=True) 96 | f.write_text("# test file" if f in test_files else "# not a test file") 97 | 98 | # Call get_test_paths 99 | result = get_test_paths(str(tmp_path)) 100 | result_set = set(str(p) for p in result) 101 | # Expected: all test_files, relative to tmp_path 102 | expected = set(str(f.relative_to(tmp_path)) for f in test_files) 103 | assert result_set == expected 104 | 105 | 106 | def test_does_repo_exist(): 107 | repo_name = "TestRepo" 108 | # Mock environment variable and GhApi 109 | with ( 110 | patch("os.getenv", return_value="dummy_token") as 
mock_getenv, 111 | patch("swesmith.utils.GhApi") as mock_GhApi, 112 | ): 113 | mock_api_instance = mock_GhApi.return_value 114 | # Simulate repo exists in first page 115 | mock_api_instance.repos.list_for_org.side_effect = [ 116 | [{"name": repo_name}], # page 1 117 | [], # page 2 118 | ] 119 | assert does_repo_exist(repo_name) is True 120 | # Simulate repo does not exist 121 | mock_api_instance.repos.list_for_org.side_effect = [[{"name": "OtherRepo"}], []] 122 | assert does_repo_exist(repo_name) is False 123 | --------------------------------------------------------------------------------