├── .github
└── workflows
│ └── deploy-docs.yml
├── .gitignore
├── LICENSE
├── README.md
├── assets
└── sb-cli-logo.png
├── docs
├── assets
│ ├── icon.svg
│ ├── logo.svg
│ ├── mini_logo.svg
│ ├── mini_logo_text_below.svg
│ ├── sbcli_logo_text_below.svg
│ ├── sweagent_logo.svg
│ ├── sweagent_logo_text_below.svg
│ ├── swebench_logo.png
│ ├── swebench_logo_text_below.svg
│ ├── swerex_logo.svg
│ ├── swerex_logo_text_below.svg
│ ├── swesmith_logo.png
│ └── swesmith_logo_text_below.svg
├── authentication.md
├── css
│ ├── bubbles.css
│ └── custom.css
├── index.md
├── installation.md
├── overrides
│ └── main.html
├── quick-start.md
├── submit-to-leaderboard.md
└── user-guide
│ ├── delete-run.md
│ ├── get-quotas.md
│ ├── get-report.md
│ ├── index.md
│ ├── list-runs.md
│ └── submit.md
├── mkdocs.yaml
├── pyproject.toml
└── sb_cli
├── __init__.py
├── config.py
├── delete_run.py
├── gen_api_key.py
├── get_quotas.py
├── get_report.py
├── list_runs.py
├── submit.py
├── utils.py
└── verify_api_key.py
/.github/workflows/deploy-docs.yml:
--------------------------------------------------------------------------------
1 | name: Deploy MkDocs
2 | on:
3 | push:
4 | branches:
5 | - main
6 | paths:
7 | - 'docs/**'
8 | - 'mkdocs.yaml'
9 | - '.github/workflows/deploy-docs.yml'
10 |
11 |
12 | # Prevent concurrent runs that could conflict when pushing to gh-pages
13 | concurrency:
14 | group: build-docs-${{ github.ref }}
15 | cancel-in-progress: false
16 |
17 | permissions:
18 | contents: write
19 |
20 | jobs:
21 | deploy:
22 | runs-on: ubuntu-latest
23 | steps:
24 | - uses: actions/checkout@v4
25 |
26 | - uses: actions/setup-python@v4
27 | with:
28 | python-version: '3.x'
29 |
30 | - name: Install dependencies
31 | run: |
32 | pip install mkdocs-material
33 | pip install pillow cairosvg # for social cards
34 | pip install mkdocs-glightbox # for your glightbox plugin
35 |
36 | - name: Deploy docs
37 | run: mkdocs gh-deploy --force
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | site/
2 | __pycache__/
3 | *.log
4 | *.json
5 | *.csv
6 | *.jsonl
7 | *.json-[0-9]*
8 |
9 | # Python
10 | *.py[cod]
11 | *.so
12 | .Python
13 | env/
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 |
29 | # IDEs and editors
30 | .idea/
31 | .vscode/
32 | .cursor/
33 | *.swp
34 | *.swo
35 | .DS_Store
36 | Thumbs.db
37 |
38 | # Virtual Environment
39 | venv/
40 | ENV/
41 | .env
42 | .venv
43 |
44 | # Testing
45 | .coverage
46 | htmlcov/
47 | .tox/
48 | .pytest_cache/
49 | nosetests.xml
50 | coverage.xml
51 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 SWE-bench
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # SWE-bench CLI
6 |
7 | A command-line interface for interacting with the SWE-bench API. Use this tool to submit predictions, manage runs, and retrieve evaluation reports.
8 |
9 | Read the full documentation [here](https://www.swebench.com/sb-cli/). For submission guidelines, see [here](https://swebench.com/sb-cli/submit-to-leaderboard).
10 |
11 | ## Installation
12 |
13 | ```bash
14 | pip install sb-cli
15 | ```
16 |
17 | ## Authentication
18 |
19 | Before using the CLI, you'll need to get an API key:
20 |
21 | 1. Generate an API key:
22 | ```bash
23 | sb-cli gen-api-key your.email@example.com
24 | ```
25 |
26 | 2. Set your API key as an environment variable - and store it somewhere safe!
27 | ```bash
28 | export SWEBENCH_API_KEY=your_api_key
29 | # or add export SWEBENCH_API_KEY=your_api_key to your .*rc file
30 | ```
31 |
32 | 3. You'll receive an email with a verification code. Verify your API key:
33 | ```bash
34 | sb-cli verify-api-key YOUR_VERIFICATION_CODE
35 | ```
36 |
37 | ## Subsets and Splits
38 |
39 | SWE-bench has different subsets and splits available:
40 |
41 | ### Subsets
42 | - `swe-bench-m`: The SWE-bench Multimodal dataset
43 | - `swe-bench_verified`: 500 verified problems from SWE-bench [Learn more](https://openai.com/index/introducing-swe-bench-verified/)
44 | - `swe-bench_lite`: A subset of the original SWE-bench for testing
45 |
46 |
47 | ### Splits
48 | - `dev`: Development/validation split
49 | - `test`: Test split (currently only available for `swe-bench_lite` and `swe-bench_verified`)
50 |
51 | You'll need to specify both a subset and split for most commands.
52 |
53 | ## Usage
54 |
55 | ### Submit Predictions
56 |
57 | Submit your model's predictions to SWE-bench:
58 |
59 | ```bash
60 | sb-cli submit swe-bench-m test \
61 | --predictions_path predictions.json \
62 | --run_id my_run_id
63 | ```
64 |
65 | Options:
66 | - `--run_id`: ID of the run to submit predictions for (optional, defaults to the name of the parent directory of the predictions file)
67 | - `--instance_ids`: Comma-separated list of specific instance IDs to submit (optional)
68 | - `--output_dir`: Directory to save report files (default: sb-cli-reports)
69 | - `--overwrite`: Overwrite existing report (default: 0)
70 | - `--gen_report`: Generate a report after evaluation is complete (default: 1)
71 |
72 | ### Get Report
73 |
74 | Retrieve evaluation results for a specific run:
75 |
76 | ```bash
77 | sb-cli get-report swe-bench-m dev my_run_id -o ./reports
78 | ```
79 |
80 | ### List Runs
81 |
82 | View all your existing run IDs for a specific subset and split:
83 |
84 | ```bash
85 | sb-cli list-runs swe-bench-m dev
86 | ```
87 |
88 | ## Predictions File Format
89 |
90 | Your predictions file should be a JSON file in one of these formats:
91 |
92 | ```json
93 | {
94 | "instance_id_1": {
95 | "model_patch": "...",
96 | "model_name_or_path": "..."
97 | },
98 | "instance_id_2": {
99 | "model_patch": "...",
100 | "model_name_or_path": "..."
101 | }
102 | }
103 | ```
104 |
105 | Or as a list:
106 |
107 | ```json
108 | [
109 | {
110 | "instance_id": "instance_id_1",
111 | "model_patch": "...",
112 | "model_name_or_path": "..."
113 | },
114 | {
115 | "instance_id": "instance_id_2",
116 | "model_patch": "...",
117 | "model_name_or_path": "..."
118 | }
119 | ]
120 | ```
121 |
122 | ## Submitting to the Multimodal Leaderboard
123 |
124 | To submit your system to the [SWE-bench Multimodal](https://www.swebench.com/multimodal) leaderboard:
125 |
126 | 1. Submit your predictions for the `swe-bench-m` / `test` split using the CLI
127 | 2. Fork the [experiments repository](https://github.com/swe-bench/experiments)
128 | 3. Add your submission files under `experiments/multimodal/YOUR_MODEL_NAME/`
129 | 4. Create a PR with your submission
130 |
131 | See the detailed guide in our [submission documentation](https://swebench.com/sb-cli/submit-to-leaderboard).
132 |
133 | Note: Check your test split quota using `sb-cli get-quotas` before submitting.
134 |
135 | ## Related projects
136 |
137 |
138 |

139 |
140 |

141 |
142 |

143 |
144 |

145 |
146 |

147 |
148 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/docs/assets/mini_logo_text_below.svg:
--------------------------------------------------------------------------------
1 |
8 |
--------------------------------------------------------------------------------
/docs/assets/sbcli_logo_text_below.svg:
--------------------------------------------------------------------------------
1 |
54 |
--------------------------------------------------------------------------------
/docs/assets/sweagent_logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
185 |
--------------------------------------------------------------------------------
/docs/assets/sweagent_logo_text_below.svg:
--------------------------------------------------------------------------------
1 |
13 |
--------------------------------------------------------------------------------
/docs/assets/swebench_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SWE-bench/sb-cli/b679692b8b7e274a6c89fd0842f25b02da4b9256/docs/assets/swebench_logo.png
--------------------------------------------------------------------------------
/docs/assets/swerex_logo.svg:
--------------------------------------------------------------------------------
1 |
39 |
--------------------------------------------------------------------------------
/docs/assets/swerex_logo_text_below.svg:
--------------------------------------------------------------------------------
1 |
27 |
--------------------------------------------------------------------------------
/docs/assets/swesmith_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SWE-bench/sb-cli/b679692b8b7e274a6c89fd0842f25b02da4b9256/docs/assets/swesmith_logo.png
--------------------------------------------------------------------------------
/docs/authentication.md:
--------------------------------------------------------------------------------
1 | # Authentication
2 |
3 | Before using the SWE-bench CLI, you'll need to set up authentication using an API key.
4 |
5 | ## Getting an API Key
6 |
7 | 1. Generate a new API key using your email:
8 |
9 | ```bash
10 | sb-cli gen-api-key your.email@example.com
11 | ```
12 |
13 | 2. The CLI will output your API key. Save this key somewhere safe - you'll need it for all future operations.
14 |
15 | ## Setting Up Your API Key
16 |
17 | There are two ways to use your API key:
18 |
19 | ### 1. Environment Setup (Recommended)
20 |
21 | Set your API key as an environment variable:
22 |
23 | ```bash
24 | export SWEBENCH_API_KEY=your_api_key
25 | ```
26 |
27 | For permanent setup, add this line to your shell's configuration file (`.bashrc`, `.zshrc`, etc.).
28 |
29 | !!! note
30 | You can test that your key is working with `sb-cli get-quotas`
31 |
32 | ### 2. Command Line Option
33 |
34 | Alternatively, you can pass your API key directly with each command:
35 |
36 | ```bash
37 | sb-cli submit --api_key your_api_key ...
38 | ```
39 |
40 | ## Verifying Your API Key
41 |
42 | After receiving your API key, you'll get an email with a verification code. Verify your API key using:
43 |
44 | ```bash
45 | sb-cli verify-api-key YOUR_VERIFICATION_CODE
46 | ```
47 |
--------------------------------------------------------------------------------
/docs/css/bubbles.css:
--------------------------------------------------------------------------------
1 | /* Floating bubbles styles */
2 | .floating-bubbles {
3 | position: fixed;
4 | bottom: 20px;
5 | right: 20px;
6 | display: flex;
7 | flex-direction: column;
8 | gap: 10px;
9 | z-index: 1000;
10 | }
11 |
12 | .floating-bubbles-title {
13 | position: absolute;
14 | top: -30px;
15 | right: 0;
16 | font-size: 12px;
17 | color: #777;
18 | text-align: right;
19 | font-weight: bold;
20 | opacity: 0;
21 | visibility: hidden;
22 | transition:
23 | opacity 0.3s ease,
24 | visibility 0.3s ease;
25 | white-space: nowrap;
26 | }
27 |
28 | .floating-bubbles:hover .floating-bubbles-title {
29 | opacity: 1;
30 | visibility: visible;
31 | }
32 |
33 | .bubble {
34 | width: 40px;
35 | height: 40px;
36 | display: flex;
37 | justify-content: center;
38 | align-items: center;
39 | position: relative;
40 | transition: transform 0.3s ease;
41 | }
42 |
43 | .bubble:hover {
44 | transform: scale(1.1);
45 | }
46 |
47 | .bubble img {
48 | width: 40px;
49 | height: 40px;
50 | }
51 |
52 | .bubble-tooltip {
53 | position: absolute;
54 | right: 60px;
55 | background-color: #333;
56 | color: white;
57 | padding: 5px 10px;
58 | border-radius: 4px;
59 | font-size: 14px;
60 | white-space: nowrap;
61 | opacity: 0;
62 | visibility: hidden;
63 | transition:
64 | opacity 0.3s ease,
65 | visibility 0.3s ease;
66 | }
67 |
68 | .bubble:hover .bubble-tooltip {
69 | opacity: 1;
70 | visibility: visible;
71 | }
72 |
73 | .floating-bubbles:hover .bubble-tooltip {
74 | opacity: 1;
75 | visibility: visible;
76 | }
77 |
78 | /* Hide on mobile */
79 | @media (max-width: 768px) {
80 | .floating-bubbles {
81 | display: none;
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/docs/css/custom.css:
--------------------------------------------------------------------------------
1 | [data-md-color-scheme="default"] {
2 | --md-default-bg-color: #fff7ec;
3 | --md-primary-fg-color: #e79925;
4 | --md-typeset-a-color: #0099ff;
5 | --md-code-bg-color: #e7e7e7;
6 | }
7 |
8 | [data-md-color-scheme="slate"] {
9 | --md-primary-fg-color: #e79925;
10 | --md-default-fg-color: #fff7ec;
11 | --md-default-bg-color: #111111;
12 | --md-typeset-a-color: #0099ff;
13 | }
14 |
15 | .clickable-banner {
16 | color: #ff0000;
17 | }
18 |
19 | .md-main__inner.md-grid,
20 | .md-grid {
21 | max-width: 64rem;
22 | }
23 |
24 | @media screen and (min-width: 1220px) {
25 | .md-main__inner.md-grid,
26 | .md-grid {
27 | max-width: 64rem;
28 | }
29 | }
30 |
31 | .md-typeset h1,
32 | .md-typeset h2,
33 | .md-typeset h3 {
34 | font-weight: 400;
35 | color: var(--md-primary-fg-color-dark); /* this actually works for both light and dark themes */
36 | }
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 |
4 |

5 |
6 |
7 | SWE-bench CLI is a command-line tool for interacting with the SWE-bench API. This tool allows you to:
8 |
9 | - Submit to the [SWE-bench Multimodal](https://www.swebench.com/multimodal) leaderboard
10 | - Submit model predictions for evaluation
11 | - Retrieve evaluation reports
12 | - Manage your evaluation runs
13 | - Track your model's performance
14 |
15 | All on the cloud!
16 |
17 | ## Key Features
18 |
19 | - **Easy Submission**: Submit your model's predictions with a single command
20 | - **Real-time Tracking**: Monitor evaluation progress in real-time
21 | - **Run Management**: Access and delete runs as needed
22 |
23 | ## Quick Links
24 |
25 | - [Installation](installation.md): Get started with installing the CLI
26 | - [Authentication](authentication.md): Set up your API key
27 | - [Quick Start](quick-start.md): Submit your first predictions
28 | - [User Guide](user-guide/index.md): Detailed guide on using the CLI
29 | - [Submit to Leaderboard](submit-to-leaderboard.md): Submit your results to the SWE-bench Multimodal leaderboard
--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | Installing the SWE-bench CLI is straightforward using pip.
4 |
5 | ## Using pip
6 |
7 | ```bash
8 | pip install sb-cli
9 | ```
10 |
11 | ## Verifying Installation
12 |
13 | After installation, verify that the CLI is working correctly:
14 |
15 | ```bash
16 | sb-cli --help
17 | ```
18 |
19 | You should see output listing all available commands.
20 |
21 | ## Development Installation
22 |
23 | If you want to contribute or install from source:
24 |
25 | ```bash
26 | git clone https://github.com/swe-bench/sb-cli.git
27 | cd sb-cli
28 | pip install -e .
29 | ```
30 |
31 | ## Upgrading
32 |
33 | To upgrade to the latest version:
34 |
35 | ```bash
36 | pip install --upgrade sb-cli
37 | ```
38 |
--------------------------------------------------------------------------------
/docs/overrides/main.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block announce %}
3 | 📣 Check out Mini-SWE-Agent, an AI agent implemented in 100 lines that achieves 65% on SWE-bench verified!
4 | {% endblock %}
5 |
6 | {% block content %}
7 | {{ super() }}
8 |
9 |
10 |
32 | {% endblock %}
33 |
--------------------------------------------------------------------------------
/docs/quick-start.md:
--------------------------------------------------------------------------------
1 | # Quick Start
2 |
3 | This guide will help you submit your first predictions to SWE-bench and get an evaluation report.
4 |
5 | ## Prerequisites
6 |
7 | Before starting, ensure you have:
8 | - Installed the CLI (`pip install sb-cli`)
9 | - Generated and verified your API key (see [Authentication](authentication.md))
10 | - Set up your `SWEBENCH_API_KEY` environment variable
11 |
12 | ## 1. Prepare Your Predictions
13 |
14 | Create a JSON file (`preds.json`) with your model's predictions for the [SWE-bench M](https://arxiv.org/abs/2410.03859) `dev` split:
15 |
16 | ```json
17 | {
18 | "instance_id_1": {
19 | "model_patch": "your code changes here",
20 | "model_name_or_path": "your-model-name"
21 | },
22 | "instance_id_2": {
23 | "model_patch": "more code changes",
24 | "model_name_or_path": "your-model-name"
25 | }
26 | }
27 | ```
28 |
29 | ## 2. Submit Predictions
30 |
31 | Submit your predictions to SWE-bench:
32 |
33 | ```bash
34 | sb-cli submit swe-bench-m dev \
35 | --predictions_path preds.json \
36 | --run_id my_first_run
37 | ```
38 |
39 | The CLI will:
40 | 1. Upload your predictions
41 | 2. Monitor the evaluation progress
42 | 3. Generate a report when complete
43 |
44 | ## 3. Check Results
45 |
46 | You can access your evaluation report again by running:
47 |
48 | ```bash
49 | sb-cli get-report swe-bench-m dev my_first_run
50 | ```
51 |
52 | ## 4. View All Runs
53 |
54 | You can view all your submitted runs for `swe-bench-m` / `dev` by running:
55 |
56 | ```bash
57 | sb-cli list-runs swe-bench-m dev
58 | ```
59 |
--------------------------------------------------------------------------------
/docs/submit-to-leaderboard.md:
--------------------------------------------------------------------------------
1 | # Submit to Multimodal Leaderboard
2 |
3 | This guide explains how to submit your system's results to the [SWE-bench Multimodal](https://www.swebench.com/multimodal) leaderboard.
4 |
5 | ## Prerequisites
6 |
7 | Before submitting, ensure you have:
8 |
9 | - Completed the [Quick Start](quick-start.md) guide
10 | - Generated predictions for the `swe-bench-m` / `test` split
11 |
12 | ## Submission Steps
13 |
14 | 1. **Check your API Quotas**
15 |
16 | You can check your quotas using `sb-cli get-quotas`.
17 |
18 | If you don't have enough quota for the `swe-bench-m` / `test` split, email the SWE-bench team at `support@swebench.com` and we can increase it for you.
19 |
20 | We're currently limiting submissions to `swe-bench-m` / `test` to prevent abuse.
21 |
22 | 2. **Submit Predictions to API**
23 |
24 | Then, submit your predictions to the SWE-bench API:
25 |
26 | ```bash
27 | sb-cli submit swe-bench-m test \
28 | --predictions_path ./path/to/preds.json \
29 | --run_id your-run-id
30 | ```
31 |
32 | 3. **Prepare GitHub Submission**
33 |
34 | Fork and clone the [experiments repository](https://github.com/swe-bench/experiments):
35 |
36 | ```bash
37 | git clone https://github.com/YOUR_USERNAME/experiments.git
38 | cd experiments
39 | ```
40 |
41 | 4. **Create Submission Files**
42 |
43 | Create a new directory for your submission and add the following files:
44 |
45 | ```
46 | experiments/
47 | └── multimodal/
48 | └── YOUR_MODEL_NAME/
49 | ├── README.md # Description of your model/approach
50 | └── metadata.yaml # Submission metadata
51 | ```
52 |
53 | Example `metadata.yaml`:
54 | ```yaml
55 | name: "Your System's Name"
56 | oss: true # Whether your system is open source
57 | site: "https://github.com/..." # URL to link to from the leaderboard
58 | ```
59 |
60 | 5. **Submit Pull Request**
61 |
62 | Create a pull request to the [experiments repository](https://github.com/swe-bench/experiments) with your submission files.
63 |
64 | In addition to the files above, you should include the following details in the PR's description:
65 | 1. Email used for sb-cli submission
66 | 2. The `run_id` used for sb-cli submission
67 |
68 | The SWE-bench team will:
69 | 1. Add your predictions and results to the PR
70 | 2. Review and merge your submission
71 | 3. Update the leaderboard with your results
72 |
73 | ## Notes
74 |
75 | - Make sure your `run_id` matches between the API submission and metadata file
76 | - We maintain this leaderboard for free, we try to update it as soon as possible after submission
77 | - Test split submissions are limited - please check your quota using `sb-cli get-quotas` before submitting so you don't run out. Quotas are reloaded every 30 days.
78 |
79 | For questions about the submission process, please open an issue in the [experiments repository](https://github.com/swe-bench/experiments/issues).
80 |
--------------------------------------------------------------------------------
/docs/user-guide/delete-run.md:
--------------------------------------------------------------------------------
1 | # Delete Run Command
2 |
3 | !!! warning
4 | This command is currently disabled.
5 |
6 | The `delete-run` command removes a specific run id and its associated data.
7 |
8 | ## Usage
9 |
10 | ```bash
11 | sb-cli delete-run <subset> <split> <run_id>
12 | ```
13 |
14 | ## Arguments
15 |
16 | - `subset`: Dataset subset (`swe-bench-m`, `swe-bench_lite`, `swe-bench_verified`)
17 | - `split`: Dataset split (`dev` or `test`)
18 | - `run_id`: ID of the run to delete
19 |
20 | ## Important Notes
21 |
22 | - Deletion is permanent and cannot be undone
23 | - Only runs associated with your API key can be deleted
24 | - Running evaluations will be cancelled
25 | - Associated reports will be removed
26 |
27 | ## Examples
28 |
29 | 1. Delete a specific run:
30 | ```bash
31 | sb-cli delete-run swe-bench-m dev my_run_id
32 | ```
33 |
34 | ## Best Practices
35 |
36 | 1. Always verify the run ID before deletion:
37 | ```bash
38 | sb-cli list-runs swe-bench-m dev
39 | ```
40 |
41 | 2. Save important reports before deletion:
42 | ```bash
43 | sb-cli get-report swe-bench-m dev my_run_id -o ./backup
44 | sb-cli delete-run swe-bench-m dev my_run_id
45 | ```
46 |
47 | 3. Consider keeping a local backup of important results before deletion
48 |
--------------------------------------------------------------------------------
/docs/user-guide/get-quotas.md:
--------------------------------------------------------------------------------
1 | # Get Quotas Command
2 |
3 | The `get-quotas` command displays your remaining submission quotas for each dataset subset and split combination.
4 |
5 | ## Usage
6 |
7 | ```bash
8 | sb-cli get-quotas
9 | ```
10 |
11 | Example output:
12 | ```bash
13 | > sb-cli get-quotas
14 | Remaining Submission Quotas
15 | ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━━━━┓
16 | ┃ Subset ┃ Split ┃ Remaining Runs ┃
17 | ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━━━━┩
18 | │ swe-bench-m │ test │ 1 │
19 | │ swe-bench-m │ dev │ 997 │
20 | │ swe-bench_lite │ test │ 1 │
21 | │ swe-bench_lite │ dev │ 976 │
22 | │ swe-bench_verified │ test │ 1 │
23 | └────────────────────┴───────┴────────────────┘
24 | ```
25 |
26 | ## Options
27 |
28 | - `--api_key`: API key to use (defaults to `SWEBENCH_API_KEY` environment variable)
29 |
30 | ## Output Format
31 |
32 | The command displays a table with three columns:
33 | - **Subset**: Dataset subset (`swe-bench-m`,`swe-bench_lite`,`swe-bench_verified`)
34 | - **Split**: Dataset split (`dev` or `test`)
35 | - **Remaining Runs**: Number of submissions remaining for this subset/split combination
36 |
37 | This command will display the remaining submissions you can make for each dataset subset and split combination.
38 |
39 | ## Notes
40 | - Quotas are refreshed periodically according to your subscription level
41 |
--------------------------------------------------------------------------------
/docs/user-guide/get-report.md:
--------------------------------------------------------------------------------
1 | # Get Report Command
2 |
3 | The `get-report` command retrieves evaluation results for a specific run.
4 |
5 | ## Usage
6 |
7 | ```bash
8 | sb-cli get-report <subset> <split> <run_id> [options]
9 | ```
10 |
11 | ## Arguments
12 |
13 | - `subset`: Dataset subset (`swe-bench-m`, `swe-bench_lite`, `swe-bench_verified`)
14 | - `split`: Dataset split (`dev` or `test`)
15 | - `run_id`: ID of the run to get results for
16 |
17 | ## Options
18 |
19 | - `--output_dir`, `-o`: Directory to save report files (default: sb-cli-reports)
20 | - `--overwrite`: Overwrite existing report files (0/1, default: 0)
21 | - `--extra_arg`, `-e`: Additional arguments in KEY=VALUE format
22 |
23 | ## Report Format
24 |
25 | The command outputs a summary to the console and saves two JSON files:
26 |
27 | 1. `{subset}__{split}__{run_id}.json`: The full evaluation report
28 | 2. `{subset}__{split}__{run_id}.response.json`: Additional response data
29 |
30 | The console summary includes:
31 | - Resolved instances (total and submitted)
32 | - Submission statistics
33 | - Error counts
34 | - Pending evaluations
35 |
36 | ## Examples
37 |
38 | 1. Basic usage:
39 | ```bash
40 | sb-cli get-report swe-bench-m dev my_run_id
41 | ```
42 |
43 | 2. Custom output directory:
44 | ```bash
45 | sb-cli get-report swe-bench-m dev my_run_id -o ./reports
46 | ```
47 |
48 | 3. Overwrite existing report:
49 | ```bash
50 | sb-cli get-report swe-bench-m dev my_run_id --overwrite 1
51 | ```
52 |
--------------------------------------------------------------------------------
/docs/user-guide/index.md:
--------------------------------------------------------------------------------
1 | # User Guide Overview
2 |
3 | This guide provides detailed information about using the SWE-bench CLI. Each command is documented with examples and common use cases.
4 |
5 | ## Available Commands
6 |
7 | - **[submit](submit.md)**: Submit model predictions for evaluation
8 | - **[get-report](get-report.md)**: Retrieve evaluation reports
9 | - **[list-runs](list-runs.md)**: View all your submitted runs
10 | - **[delete-run](delete-run.md)**: Remove a specific run
11 |
12 | ## Dataset Information
13 |
14 | SWE-bench has different subsets and splits available:
15 |
16 | ### Subsets
17 | - `swe-bench-m`: The SWE-bench Multimodal dataset
18 | - `swe-bench_lite`: A smaller subset for testing and development
19 | - `swe-bench_verified`: 500 verified problems from SWE-bench [Learn more](https://openai.com/index/introducing-swe-bench-verified/)
20 |
21 | ### Splits
22 | - `dev`: Development/validation split
23 | - `test`: Test split (currently only available for `swe-bench_lite` and `swe-bench_verified`)
24 |
25 | ## Common Workflows
26 |
27 | 1. **Basic Evaluation**:
28 | ```bash
29 | sb-cli submit swe-bench-m dev --predictions_path preds.json --run_id my_run
30 | sb-cli get-report swe-bench-m dev my_run
31 | ```
32 |
33 | 2. **Development Testing**:
34 | ```bash
35 | sb-cli submit swe-bench_lite dev --predictions_path test.json --run_id test_run
36 | ```
37 |
38 | 3. **Managing Runs**:
39 | ```bash
40 | sb-cli list-runs swe-bench-m dev
41 | sb-cli delete-run swe-bench-m dev old_run_id
42 | ```
43 |
--------------------------------------------------------------------------------
/docs/user-guide/list-runs.md:
--------------------------------------------------------------------------------
1 | # List Runs Command
2 |
3 | The `list-runs` command shows all your submitted runs for a specific subset and split.
4 |
5 | ## Usage
6 |
7 | ```bash
8 | sb-cli list-runs <subset> <split>
9 | ```
10 |
11 | ## Arguments
12 |
13 | - `subset`: Dataset subset (`swe-bench-m`,`swe-bench_lite`,`swe-bench_verified`)
14 | - `split`: Dataset split (`dev` or `test`)
15 |
16 | ## Output
17 |
18 | The command displays a list of all run IDs associated with your API key for the specified subset and split. If no runs are found, it will indicate this.
19 |
20 | ## Examples
21 |
22 | 1. List runs for main dataset:
23 | ```bash
24 | sb-cli list-runs swe-bench-m dev
25 | ```
26 |
27 | 2. List runs for lite dataset:
28 | ```bash
29 | sb-cli list-runs swe-bench_lite dev
30 | ```
31 |
32 | ## Common Use Cases
33 |
34 | - Finding old run IDs before deletion
35 | - Checking submission history
36 | - Verifying successful submissions
37 | - Managing multiple experiments
38 |
--------------------------------------------------------------------------------
/docs/user-guide/submit.md:
--------------------------------------------------------------------------------
1 | # Submit Command
2 |
3 | The `submit` command uploads your model's predictions for evaluation.
4 |
5 | ## Usage
6 |
7 | ```bash
8 | sb-cli submit <subset> <split> --predictions_path <path> [options]
9 | ```
10 |
11 | ## Arguments
12 |
13 | - `subset`: Dataset subset (`swe-bench-m`, `swe-bench_lite`, `swe-bench_verified`)
14 | - `split`: Dataset split (`dev` or `test`)
15 |
16 | ## Options
17 |
18 | - `--predictions_path`: Path to your predictions file (required)
19 | - `--run_id`: Unique identifier for this submission. You can use the values PARENT or STEM to use the parent directory name or the stem of the predictions file name. (default: PARENT)
20 | - `--instance_ids`: Comma-separated list of specific instances to submit
21 | - `--output_dir`: Directory to save report files (default: sb-cli-reports)
22 | - `--overwrite`: Overwrite existing report (0/1, default: 0)
23 | - `--gen_report`: Generate report after completion (0/1, default: 1)
24 | - `--verify_submission`: Verify submission before waiting (0/1, default: 1)
25 | - `--wait_for_evaluation`: Wait for evaluation to complete (0/1, default: 1)
26 |
27 | ## Predictions File Format
28 |
29 | Your predictions file should be a JSON file in one of these formats:
30 |
31 | ### Dictionary Format
32 | ```json
33 | {
34 | "instance_id_1": {
35 | "model_patch": "...",
36 | "model_name_or_path": "..."
37 | },
38 | "instance_id_2": {
39 | "model_patch": "...",
40 | "model_name_or_path": "..."
41 | }
42 | }
43 | ```
44 |
45 | ### List Format
46 | ```json
47 | [
48 | {
49 | "instance_id": "instance_id_1",
50 | "model_patch": "...",
51 | "model_name_or_path": "..."
52 | },
53 | {
54 | "instance_id": "instance_id_2",
55 | "model_patch": "...",
56 | "model_name_or_path": "..."
57 | }
58 | ]
59 | ```
60 |
61 | ## Examples
62 |
63 | 1. Basic submission:
64 | ```bash
65 | sb-cli submit swe-bench-m dev --predictions_path preds.json
66 | ```
67 |
68 | 2. Custom run ID and output directory:
69 | ```bash
70 | sb-cli submit swe-bench-m dev \
71 | --predictions_path preds.json \
72 | --run_id custom_run \
73 | --output_dir ./reports
74 | ```
75 |
76 | 3. Submit specific instances:
77 | ```bash
78 | sb-cli submit swe-bench-m dev \
79 | --predictions_path preds.json \
80 | --instance_ids id1,id2,id3
81 | ```
--------------------------------------------------------------------------------
/mkdocs.yaml:
--------------------------------------------------------------------------------
1 | site_name: sb-cli
2 | theme:
3 | name: material
4 | custom_dir: docs/overrides
5 | logo: assets/icon.svg
6 | favicon: assets/icon.svg
7 | icon:
8 | repo: fontawesome/brands/github
9 | annotation: material/chevron-right-circle
10 | palette:
11 | - media: "(prefers-color-scheme)"
12 | toggle:
13 | icon: material/brightness-auto
14 | name: Switch to system theme
15 | - scheme: default
16 | # primary: black # override in custom.css
17 | accent: dark orange
18 | media: "(prefers-color-scheme: light)"
19 | toggle:
20 | icon: material/weather-night
21 | name: Switch to dark mode
22 | - scheme: slate
23 | # primary: black # override in custom.css
24 | accent: dark orange
25 | media: "(prefers-color-scheme: dark)"
26 | toggle:
27 | icon: material/weather-sunny
28 | name: Switch to light mode
29 | features:
30 | - navigation.tabs
31 | - navigation.tabs.sticky
32 | - navigation.sections
33 | - navigation.indexes
34 | - navigation.footer
35 | - header.autohide
36 | - announce.dismiss
37 | - toc.follow
38 | - search.suggest
39 | - search.highlight
40 | - content.tabs.link
41 |     - content.code.annotate
42 | - content.code.copy
43 | repo_url: https://github.com/swe-bench/sb-cli
44 | repo_name: SWE-bench/sb-cli
45 | edit_uri: edit/main/docs/
46 | nav:
47 | - "Getting Started":
48 | - index.md
49 | - "Installation": installation.md
50 | - "Authentication": authentication.md
51 | - "Quick Start": quick-start.md
52 | - "Submit to Leaderboard": submit-to-leaderboard.md
53 | - User Guide:
54 | - Overview: user-guide/index.md
55 | - Get Quotas: user-guide/get-quotas.md
56 | - Submit: user-guide/submit.md
57 | - Get Report: user-guide/get-report.md
58 | - List Runs: user-guide/list-runs.md
59 | - Delete Run: user-guide/delete-run.md
60 | markdown_extensions:
61 | - sane_lists
62 | - admonition
63 | - pymdownx.details
64 | - pymdownx.superfences
65 | - pymdownx.magiclink
66 | - footnotes
67 | - attr_list
68 | - md_in_html
69 | - pymdownx.highlight:
70 | anchor_linenums: true
71 | - pymdownx.inlinehilite
72 | - pymdownx.snippets:
73 | check_paths: true
74 | - pymdownx.emoji:
75 | emoji_index: !!python/name:material.extensions.emoji.twemoji
76 | emoji_generator: !!python/name:material.extensions.emoji.to_svg
77 | - pymdownx.tabbed:
78 | alternate_style: true
79 |
80 | plugins:
81 | - search
82 | - glightbox
83 |
84 | extra_css:
85 | - css/custom.css
86 | - css/bubbles.css
87 | extra:
88 | analytics:
89 | provider: google
90 | property: G-T5P2NYGJYR
91 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "sb-cli"
3 | version = "0.1.5"
4 | description = "Submit predictions to the SWE-bench API and manage your runs"
5 | authors = [
6 | { name = "SWE-bench team", email = "support@swebench.com" }
7 | ]
8 | dependencies = [
9 | "typer>=0.9.0",
10 | "click<8.2.0", # see https://github.com/fastapi/typer/discussions/1215
11 | "requests",
12 | "rich",
13 | ]
14 | requires-python = ">=3.10"
15 | readme = "README.md"
16 | license = { text = "MIT" }
17 | classifiers = [
18 | "Programming Language :: Python :: 3",
19 | "Programming Language :: Python :: 3.10",
20 | "Programming Language :: Python :: 3.11",
21 | "License :: OSI Approved :: MIT License",
22 | "Operating System :: OS Independent",
23 | ]
24 |
25 | [project.urls]
26 | Homepage = "https://github.com/swe-bench/sb-cli"
27 | Issues = "https://github.com/swe-bench/sb-cli/issues"
28 |
29 | [project.scripts]
30 | sb-cli = "sb_cli:main"
31 |
32 | [build-system]
33 | requires = ["hatchling"]
34 | build-backend = "hatchling.build"
35 |
36 | [tool.hatch.build.targets.wheel]
37 | packages = ["sb_cli"]
38 |
39 | [tool.ruff]
40 | line-length = 88
41 | src = ["sb_cli"]
42 |
43 | [project.optional-dependencies]
44 | dev = [
45 | "mkdocs>=1.5.0",
46 | "mkdocs-material>=9.0.0",
47 | ]
48 |
--------------------------------------------------------------------------------
/sb_cli/__init__.py:
--------------------------------------------------------------------------------
1 | import typer
2 |
3 | app = typer.Typer(help="CLI tool for interacting with the SWE-bench M API")
4 |
5 | from . import (
6 | gen_api_key,
7 | get_report,
8 | list_runs,
9 | submit,
10 | verify_api_key,
11 | delete_run,
12 | get_quotas
13 | )
14 |
15 | app.command(name="get-report")(get_report.get_report)
16 | app.command(name="list-runs")(list_runs.list_runs)
17 | app.command(name="submit")(submit.submit)
18 | app.command(name="verify-api-key")(verify_api_key.verify)
19 | app.command(name="gen-api-key")(gen_api_key.gen_api_key)
20 | app.command(name="delete-run")(delete_run.delete_run)
21 | app.command(name="get-quotas")(get_quotas.get_quotas)
22 | def main():
23 | """Run the SWE-bench CLI application"""
24 | import sys
25 | if len(sys.argv) == 1:
26 | app(['--help'])
27 | else:
28 | app()
29 |
--------------------------------------------------------------------------------
/sb_cli/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | from enum import Enum
3 |
4 | API_BASE_URL = os.getenv("SWEBENCH_API_URL", "https://api.swebench.com")
5 |
6 | class Subset(str, Enum):
7 | swe_bench_m = 'swe-bench-m'
8 | swe_bench_lite = 'swe-bench_lite'
9 | swe_bench_verified = 'swe-bench_verified'
10 |
--------------------------------------------------------------------------------
/sb_cli/delete_run.py:
--------------------------------------------------------------------------------
1 | import typer
2 | import requests
3 | from typing import Optional
4 | from rich.console import Console
5 | from sb_cli.config import API_BASE_URL, Subset
6 | from sb_cli.utils import verify_response
7 |
8 | app = typer.Typer(help="Delete a specific run by its ID")
9 |
10 | def delete_run(
11 | subset: Subset = typer.Argument(..., help="Subset of the run to delete"),
12 | split: str = typer.Argument(..., help="Split of the run to delete"),
13 | run_id: str = typer.Argument(..., help="Run ID to delete"),
14 | api_key: Optional[str] = typer.Option(None, help="API key to use", envvar="SWEBENCH_API_KEY"),
15 | ):
16 | """Delete a specific run by its ID"""
17 | console = Console()
18 | headers = {
19 | "x-api-key": api_key
20 | }
21 | payload = {
22 | "run_id": run_id,
23 | "split": split,
24 | "subset": subset.value
25 | }
26 |
27 | with console.status(f"[blue]Deleting run {run_id}..."):
28 | response = requests.delete(
29 | f"{API_BASE_URL}/delete-run",
30 | headers=headers,
31 | json=payload
32 | )
33 | verify_response(response)
34 | result = response.json()
35 |
36 | if response.status_code == 200:
37 | typer.echo(f"Run {run_id} successfully deleted for subset {subset.value} and split {split}")
38 | else:
39 | typer.echo(f"Failed to delete run {run_id}: {result.get('message', 'Unknown error')}")
40 |
41 | if __name__ == "__main__":
42 | app()
43 |
--------------------------------------------------------------------------------
/sb_cli/gen_api_key.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import typer
3 | from sb_cli.config import API_BASE_URL
4 | from sb_cli.utils import verify_response
5 |
6 | app = typer.Typer(help="Get an API key for accessing the SWE-bench M API")
7 |
8 | def gen_api_key(
9 | email: str = typer.Argument(
10 | ...,
11 | help="Email address to generate an API key for",
12 | show_default=False
13 | )
14 | ):
15 | """
16 | Generate a new API key for accessing the SWE-bench API.
17 |
18 | The API key will be sent to the provided email address along with a verification code.
19 | You will need to verify the API key using the 'verify-api-key' command before it can be used.
20 | """
21 | payload = {
22 | 'email': email,
23 | }
24 | response = requests.post(f'{API_BASE_URL}/gen-api-key', json=payload)
25 | verify_response(response)
26 | result = response.json()
27 | message = result['message']
28 | api_key = result['api_key']
29 | typer.echo(message)
30 | typer.echo(api_key)
31 | typer.echo(f"To save your API key, place the following line in your ~/.bashrc or ~/.zshrc file:")
32 | typer.echo(f"export SWEBENCH_API_KEY={api_key}")
33 | typer.echo(typer.style("Before using this API key you must verify the code sent to your email, using the 'verify-api-key' command.", fg="red"))
--------------------------------------------------------------------------------
/sb_cli/get_quotas.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import typer
3 | from typing import Optional
4 | from rich.console import Console
5 | from rich.table import Table
6 | from sb_cli.config import API_BASE_URL
7 | from sb_cli.utils import verify_response
8 |
9 | app = typer.Typer(help="Get remaining quota counts for your API key")
10 |
11 | def get_quotas(
12 | api_key: Optional[str] = typer.Option(
13 | None,
14 | '--api_key',
15 | help="API key to use",
16 | envvar="SWEBENCH_API_KEY"
17 | ),
18 | ):
19 | """Get remaining quota counts for all authorized subsets and splits."""
20 | console = Console()
21 | headers = {"x-api-key": api_key}
22 |
23 | with console.status("[blue]Fetching quota information..."):
24 | response = requests.get(f"{API_BASE_URL}/get-quotas", headers=headers)
25 | verify_response(response)
26 | result = response.json()
27 |
28 | # Create a rich table to display the quotas
29 | table = Table(title="Remaining Submission Quotas")
30 | table.add_column("Subset", style="cyan")
31 | table.add_column("Split", style="magenta")
32 | table.add_column("Remaining Runs", style="green", justify="right")
33 |
34 | quotas = result["remaining_quotas"]
35 | if not quotas:
36 | console.print("[yellow]No remaining quotas found for any subset/split combination[/]")
37 | return
38 |
39 | # Add rows to the table
40 | for subset, splits in quotas.items():
41 | for split, remaining in splits.items():
42 | table.add_row(subset, split, str(remaining))
43 |
44 | console.print(table)
45 |
--------------------------------------------------------------------------------
/sb_cli/get_report.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import requests
4 | import typer
5 | from pathlib import Path
6 | from typing import Optional
7 | from rich.console import Console
8 | from sb_cli.config import API_BASE_URL, Subset
9 | from sb_cli.utils import verify_response
10 |
11 | app = typer.Typer(help="Get the evaluation report for a specific run")
12 |
13 | def safe_save_json(data: dict, file_path: Path, overwrite: bool = False):
14 | if file_path.exists() and not overwrite:
15 | ext = 1
16 | base_stem = file_path.stem
17 | while (file_path.parent / f"{base_stem}-{ext}.json").exists():
18 | ext += 1
19 | file_path = file_path.parent / f"{base_stem}-{ext}.json"
20 | with open(file_path, 'w') as f:
21 | json.dump(data, f, indent=4)
22 | return file_path
23 |
24 |
25 | def get_str_report(report: dict) -> dict:
26 | resolved_total = report['resolved_instances'] / report['total_instances']
27 | resolved_submitted = (report['resolved_instances'] / report['submitted_instances']) if report['submitted_instances'] > 0 else 0
28 | submitted = report['submitted_instances'] / report['total_instances']
29 | return (
30 | f"Resolved (total): {resolved_total:.2%} ({report['resolved_instances']} / {report['total_instances']})\n"
31 | f"Resolved (submitted): {resolved_submitted:.2%} ({report['resolved_instances']} / {report['submitted_instances']})\n"
32 | f"Submitted: {submitted:.2%} ({report['submitted_instances']})\n"
33 | f"Errors: {report['error_instances']}\n"
34 | f"Pending: {report['pending_instances']}\n"
35 | f"Successful runs: {report['completed_instances']}\n"
36 | f"Failed runs: {report['failed_instances']}"
37 | )
38 |
39 |
40 | def get_report(
41 | subset: Subset = typer.Argument(
42 | help="Subset to evaluate",
43 | callback=lambda x: x.value if isinstance(x, Subset) else x
44 | ),
45 | split: str = typer.Argument(
46 | ...,
47 | help="Split to evaluate"
48 | ),
49 | run_id: str = typer.Argument(..., help="Run ID"),
50 | api_key: Optional[str] = typer.Option(
51 | None,
52 | '--api_key',
53 | help="API key to use",
54 | envvar="SWEBENCH_API_KEY"
55 | ),
56 | overwrite: int = typer.Option(0, '--overwrite', help="Overwrite existing report"),
57 | output_dir: Optional[str] = typer.Option(
58 | 'sb-cli-reports',
59 | '--output_dir',
60 | '-o',
61 | help="Directory to save report files"
62 | ),
63 | extra_args: Optional[str] = typer.Option(
64 | '',
65 | '--extra_arg',
66 | '-e',
67 | help="Additional argument in the format KEY=VALUE",
68 | )
69 | ):
70 | """Get report for a run from the run ID"""
71 | kwargs = {}
72 | if extra_args and isinstance(extra_args, str):
73 | kwargs = {arg.split('=')[0]: arg.split('=')[1] for arg in extra_args.split(',')}
74 | elif extra_args and not isinstance(extra_args, typer.models.OptionInfo):
75 | raise ValueError(f"Invalid extra arguments: has type {type(extra_args)}")
76 | payload = {
77 | 'run_id': run_id,
78 | 'subset': subset,
79 | 'split': split,
80 | **kwargs
81 | }
82 | headers = {'x-api-key': api_key} if api_key else {}
83 | console = Console()
84 | with console.status(f"[blue]Creating report for run {run_id}...", spinner="dots"):
85 | response = requests.post(f"{API_BASE_URL}/get-report", json=payload, headers=headers)
86 | verify_response(response)
87 | response = response.json()
88 | report = response.pop('report')
89 | typer.echo(get_str_report(report))
90 | report_name = f"{subset}__{split}__{run_id}"
91 |
92 | if output_dir:
93 | output_path = Path(output_dir)
94 | output_path.mkdir(parents=True, exist_ok=True)
95 | report_path = output_path / f"{report_name}.json"
96 | response_path = output_path / f"{report_name}.response.json"
97 | else:
98 | report_path = Path(f"{report_name}.json")
99 | response_path = Path(f"{report_name}.response.json")
100 |
101 | report_path = safe_save_json(report, report_path, overwrite)
102 | typer.echo(f"Saved full report to {report_path}!")
103 | if response:
104 | response_path = safe_save_json(response, response_path, False)
105 | typer.echo(f"Saved response to {response_path}")
--------------------------------------------------------------------------------
/sb_cli/list_runs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import requests
3 | import typer
4 | from typing import Optional
5 | from rich.console import Console
6 | from sb_cli.config import API_BASE_URL, Subset
7 | from sb_cli.utils import verify_response
8 |
9 | app = typer.Typer(help="List all existing run IDs", name="list-runs")
10 |
11 | def list_runs(
12 | subset: Subset = typer.Argument(..., help="Subset to list runs for"),
13 | split: str = typer.Argument(..., help="Split to list runs for"),
14 | api_key: Optional[str] = typer.Option(None, help="API key to use", envvar="SWEBENCH_API_KEY"),
15 | ):
16 | """List all existing run IDs in your account"""
17 | console = Console()
18 | headers = {
19 | "x-api-key": api_key
20 | }
21 | with console.status("[blue]Fetching runs..."):
22 | response = requests.post(
23 | f"{API_BASE_URL}/list-runs",
24 | headers=headers,
25 | json={"split": split, "subset": subset.value}
26 | )
27 | verify_response(response)
28 | result = response.json()
29 |
30 | if len(result['run_ids']) == 0:
31 | typer.echo(f"No runs found for subset {subset.value} and split {split}")
32 | else:
33 | typer.echo(f"Run IDs ({subset.value} - {split}):")
34 | for run_id in result['run_ids']:
35 | typer.echo(run_id)
36 |
--------------------------------------------------------------------------------
/sb_cli/submit.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 | import requests
4 | import typer
5 | import sys
6 | from typing import Optional
7 | from typing_extensions import Annotated
8 | from concurrent.futures import ThreadPoolExecutor, as_completed
9 | from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeElapsedColumn
10 | from rich.console import Console
11 | from sb_cli.config import API_BASE_URL, Subset
12 | from sb_cli.get_report import get_report
13 | from sb_cli.utils import verify_response
14 | from pathlib import Path
15 |
16 | app = typer.Typer(help="Submit predictions to the SBM API")
17 |
18 | def submit_prediction(prediction: dict, headers: dict, payload_base: dict):
19 | """Submit a single prediction."""
20 | payload = payload_base.copy()
21 | payload["prediction"] = prediction
22 | response = requests.post(f'{API_BASE_URL}/submit', json=payload, headers=headers)
23 | verify_response(response)
24 | return response.json()
25 |
26 | # Prediction Processing
27 | def process_predictions(predictions_path: str, instance_ids: list[str]):
28 | """Load and validate predictions from file."""
29 | with open(predictions_path, 'r') as f:
30 | if predictions_path.endswith('.json'):
31 | predictions = json.load(f)
32 | else:
33 | predictions = [json.loads(line) for line in f]
34 | preds = []
35 | if isinstance(predictions, list):
36 | for p in predictions:
37 | instance_id = p['instance_id']
38 | if instance_ids and instance_id not in instance_ids:
39 | continue
40 | preds.append({
41 | 'instance_id': instance_id,
42 | 'model_patch': p['model_patch'],
43 | 'model_name_or_path': p['model_name_or_path']
44 | })
45 | else:
46 | for instance_id, p in predictions.items():
47 | if instance_ids and instance_id not in instance_ids:
48 | continue
49 | preds.append({
50 | 'instance_id': instance_id,
51 | 'model_patch': p['model_patch'],
52 | 'model_name_or_path': p['model_name_or_path']
53 | })
54 | if len(set([p['model_name_or_path'] for p in preds])) > 1:
55 | raise ValueError("All predictions must be for the same model")
56 | if len(set([p['instance_id'] for p in preds])) != len(preds):
57 | raise ValueError("Duplicate instance IDs found in predictions - please remove duplicates before submitting")
58 | return preds
59 |
60 | def process_poll_response(results: dict, all_ids: list[str]):
61 | """Process polling response and categorize instance IDs."""
62 | running_ids = set(results['running']) & set(all_ids)
63 | completed_ids = set(results['completed']) & set(all_ids)
64 | pending_ids = set(all_ids) - running_ids - completed_ids
65 | return {
66 | 'running': list(running_ids),
67 | 'completed': list(completed_ids),
68 | 'pending': list(pending_ids)
69 | }
70 |
71 | # Progress Tracking Functions
72 | def run_progress_task(
73 | console: Console,
74 | task_name: str,
75 | total: int,
76 | task_func,
77 | timeout: Optional[int] = None,
78 | *args,
79 | **kwargs
80 | ):
81 | """Run a task with a progress bar and a default timeout."""
82 | progress = Progress(
83 | SpinnerColumn(),
84 | TextColumn(f"[blue]{task_name}..."),
85 | BarColumn(),
86 | TaskProgressColumn(text_format="[progress.percentage]{task.percentage:>3.1f}%"),
87 | TimeElapsedColumn(),
88 | console=console,
89 | )
90 | start_time = time.time()
91 | completed = 0
92 | exception = None
93 | with progress:
94 | task = progress.add_task("", total=total)
95 | try:
96 | # Run the task function with a timeout
97 | result = task_func(progress, task, *args, **kwargs)
98 | except Exception as e:
99 | exception = e
100 | finally:
101 | elapsed_time = time.time() - start_time
102 | progress.stop()
103 | if exception:
104 | console.print(f"[red]Error during task: {str(exception)}[/]")
105 | raise exception
106 | final_percentage = progress.tasks[task].completed / progress.tasks[task].total * 100
107 | completed = progress.tasks[task].completed
108 | total = progress.tasks[task].total
109 | progress.remove_task(task)
110 | if completed == total:
111 | console.print(f"[green]✓ {task_name} complete![/]")
112 | elif timeout and elapsed_time > timeout:
113 | # don't print the timeout message if the task completed
114 | console.print(f"[red]✗ {task_name} timed out after {timeout} seconds. Try re-running submit to continue.[/]")
115 | sys.exit(1)
116 | else:
117 | console.print(f"[yellow]✓ {task_name} completed with {completed}/{total} instances[/]")
118 | return {
119 | "result": result,
120 | "elapsed_time": elapsed_time,
121 | "final_percentage": final_percentage,
122 | "completed": completed,
123 | "total": total,
124 | "timeout": timeout and elapsed_time > timeout,
125 | }
126 |
127 | def submit_predictions_with_progress(
128 | predictions: list[dict],
129 | headers: dict,
130 | payload_base: dict,
131 | console: Console,
132 | ) -> tuple[list[str], list[str]]:
133 | """Submit predictions with a progress bar and return new and completed IDs."""
134 | def task_func(progress, task):
135 | all_new_ids = []
136 | all_completed_ids = []
137 | failed_ids = []
138 | with ThreadPoolExecutor(max_workers=min(24, len(predictions))) as executor:
139 | future_to_prediction = {
140 | executor.submit(submit_prediction, pred, headers, payload_base): pred
141 | for pred in predictions
142 | }
143 | for future in as_completed(future_to_prediction):
144 | try:
145 | launch_data = future.result()
146 | if launch_data["launched"]:
147 | all_new_ids.append(launch_data['instance_id'])
148 | else:
149 | all_completed_ids.append(launch_data['instance_id'])
150 | except Exception as e:
151 | # Retrieve the prediction associated with the failed future
152 | pred = future_to_prediction[future]
153 | failed_ids.append(pred['instance_id'])
154 | console.print(f"[red]Error submitting prediction for instance {pred['instance_id']}: {str(e)}")
155 | finally:
156 | progress.update(task, advance=1)
157 | return {
158 | "new_ids": all_new_ids,
159 | "all_completed_ids": all_completed_ids,
160 | "failed_ids": failed_ids
161 | }
162 | console = Console()
163 | result = run_progress_task(
164 | console,
165 | "Submitting predictions",
166 | len(predictions),
167 | task_func,
168 | )["result"]
169 | new_ids = result["new_ids"]
170 | all_completed_ids = result["all_completed_ids"]
171 | failed_ids = result["failed_ids"]
172 | if len(all_completed_ids) > 0:
173 | console.print((
174 | f'[yellow] Warning: {len(all_completed_ids)} predictions already submitted. '
175 | 'These will not be re-evaluated[/]'
176 | ))
177 | if len(new_ids) > 0:
178 | console.print(
179 | f'[green] {len(new_ids)} new predictions uploaded[/][yellow] - these cannot be changed[/]'
180 | )
181 | if len(failed_ids) > 0:
182 | console.print(
183 | f'[red]✗ {len(failed_ids)} predictions failed to submit[/]'
184 | )
185 | return new_ids, all_completed_ids
186 |
187 | def wait_for_running(
188 | *,
189 | all_ids: list[str],
190 | api_key: str,
191 | subset: str,
192 | split: str,
193 | run_id: str,
194 | timeout: int
195 | ):
196 | """Spin a progress bar until no predictions are pending."""
197 | def task_func(progress, task):
198 | headers = {"x-api-key": api_key}
199 | poll_payload = {'run_id': run_id, 'subset': subset, 'split': split}
200 | start_time = time.time()
201 | while True:
202 | poll_response = requests.get(f'{API_BASE_URL}/poll-jobs', json=poll_payload, headers=headers)
203 | verify_response(poll_response)
204 | poll_results = process_poll_response(poll_response.json(), all_ids)
205 | progress.update(task, completed=len(poll_results['running']) + len(poll_results['completed']))
206 | if len(poll_results['pending']) == 0:
207 | break
208 |
209 | if (time.time() - start_time) > timeout:
210 | break
211 | else:
212 | time.sleep(8)
213 | result = run_progress_task(
214 | Console(),
215 | "Processing submission",
216 | len(all_ids),
217 | task_func,
218 | timeout=timeout,
219 | )
220 | if result["timeout"] and result["completed"] == 0:
221 | raise ValueError((
222 | "Submission waiter timed out without making progress - this is probably a bug.\n"
223 | "Please submit a bug report at https://github.com/swe-bench/sb-cli/issues"
224 | ))
225 |
226 | def wait_for_evaluation(
227 | *,
228 | all_ids: list[str],
229 | api_key: str,
230 | subset: str,
231 | split: str,
232 | run_id: str,
233 | timeout: int
234 | ):
235 | """Spin a progress bar until all predictions are complete."""
236 | def task_func(progress, task):
237 | headers = {"x-api-key": api_key}
238 | poll_payload = {'run_id': run_id, 'subset': subset, 'split': split}
239 | start_time = time.time()
240 | while True:
241 | poll_response = requests.get(f'{API_BASE_URL}/poll-jobs', json=poll_payload, headers=headers)
242 | verify_response(poll_response)
243 | poll_results = process_poll_response(poll_response.json(), all_ids)
244 | progress.update(task, completed=len(poll_results['completed']))
245 | if len(poll_results['completed']) == len(all_ids):
246 | break
247 |
248 | if (time.time() - start_time) > timeout:
249 | break
250 | else:
251 | time.sleep(15)
252 |
253 | run_progress_task(
254 | Console(),
255 | "Evaluating predictions",
256 | len(all_ids),
257 | task_func,
258 | timeout=timeout,
259 | )
260 |
261 | # Main Submission Function
262 | def submit(
263 | subset: Subset = typer.Argument(..., help="Subset to submit predictions for"),
264 | split: str = typer.Argument(..., help="Split to submit predictions for"),
265 | predictions_path: str = typer.Option(..., '--predictions_path', help="Path to the predictions file"),
266 | run_id: str = typer.Option("PARENT", '--run_id', help="Run ID for the predictions"),
267 | instance_ids: Optional[str] = typer.Option(
268 | None,
269 | '--instance_ids',
270 | help="Instance ID subset to submit predictions - (defaults to all submitted instances)",
271 | callback=lambda x: x.split(',') if x else None
272 | ),
273 | output_dir: Optional[str] = typer.Option('sb-cli-reports', '--output_dir', '-o', help="Directory to save report files"),
274 | overwrite: int = typer.Option(0, '--overwrite', help="Overwrite existing report"),
275 | gen_report: int = typer.Option(1, '--gen_report', help="Generate a report after evaluation is complete"),
276 | verify_submission: int = typer.Option(1, '--verify_submission', help="Verify submission before waiting for completion"),
277 | should_wait_for_evaluation: int = typer.Option(1, '--wait_for_evaluation', help="Wait for evaluation to complete before generating a report"),
278 | api_key: Optional[str] = typer.Option(
279 | None,
280 | '--api_key',
281 | help="API key to use - (defaults to SWEBENCH_API_KEY)",
282 | envvar="SWEBENCH_API_KEY"
283 | ),
284 | ):
285 | """Submit predictions to the SWE-bench M API."""
286 | console = Console()
287 |
288 | # Convert predictions_path to a Path object
289 | predictions_path = Path(predictions_path)
290 |
291 | # Determine run_id based on special options
292 | if run_id == "PARENT":
293 | run_id = predictions_path.parent.name
294 | elif run_id == "STEM":
295 | run_id = predictions_path.stem
296 |
297 | predictions = process_predictions(str(predictions_path), instance_ids)
298 | headers = {
299 | "x-api-key": api_key
300 | }
301 | payload_base = {
302 | "split": split,
303 | "subset": subset,
304 | "instance_ids": instance_ids,
305 | "run_id": run_id
306 | }
307 |
308 | console.print(f"[yellow] Submitting predictions for {run_id} - ({subset.value} {split})[/]")
309 |
310 | new_ids, all_completed_ids = submit_predictions_with_progress(predictions, headers, payload_base, console=console)
311 | all_ids = new_ids + all_completed_ids
312 |
313 | run_metadata = {
314 | 'run_id': run_id,
315 | 'subset': subset.value,
316 | 'split': split,
317 | 'api_key': api_key
318 | }
319 | if verify_submission:
320 | wait_for_running(
321 | all_ids=all_ids,
322 | timeout=60 * 5,
323 | **run_metadata
324 | )
325 | if should_wait_for_evaluation:
326 | wait_for_evaluation(
327 | all_ids=all_ids,
328 | timeout=60 * 30,
329 | **run_metadata
330 | )
331 | if gen_report:
332 | get_report(
333 | output_dir=output_dir,
334 | overwrite=overwrite,
335 | **run_metadata,
336 | )
337 |
--------------------------------------------------------------------------------
/sb_cli/utils.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | def verify_response(response):
4 | if response.status_code != 200:
5 | message = response.json().get("message", "No message provided")
6 | raise requests.RequestException(f"API request failed with status code {response.status_code}: {message}")
--------------------------------------------------------------------------------
/sb_cli/verify_api_key.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import typer
3 | from typing import Optional
4 | from sb_cli.config import API_BASE_URL
5 | from sb_cli.utils import verify_response
6 |
7 | app = typer.Typer()
8 |
9 | def verify(
10 | verification_code: str = typer.Argument(..., help="Verification code to verify"),
11 | api_key: Optional[str] = typer.Option(None, '--api_key', help="API key to verify", envvar="SWEBENCH_API_KEY"),
12 | ):
13 | """Verify API key against the SWE-bench M API."""
14 | headers = {
15 | "x-api-key": api_key
16 | }
17 | try:
18 | payload = {
19 | 'verification_code': verification_code
20 | }
21 | response = requests.post(f"{API_BASE_URL}/verify-api-key", json=payload, headers=headers)
22 | verify_response(response)
23 | message = response.json()['message']
24 | typer.echo(message)
25 | except requests.RequestException as e:
26 | typer.secho(f"API request failed: {str(e)}", fg="red", err=True)
27 | raise typer.Exit(1)
28 |
--------------------------------------------------------------------------------