├── it_bench_arxiv.pdf
├── images
├── select-org.png
├── select-repo.png
├── go-to-github-app.png
├── sample_it_tasks.png
├── agent-issue-selection.png
├── benchmark-registration.png
├── agent-registration-done.png
├── agent-registration-email.png
├── agent-registration-fill.png
├── benchmark-registration-done.png
├── benchmark-registration-fill.png
└── benchmark-registration-email.png
├── CONTRIBUTORS.md
├── .github
├── workflows
│ ├── parse_issue.py
│ ├── leaderboard_update.yaml
│ ├── update_benchmark_status.yaml
│ ├── update_agent_manifest.yaml
│ ├── benchmark_registration.yaml
│ ├── agent_registration.yaml
│ ├── leaderboard.py
│ └── update_benchmark_helper.py
├── ISSUE_TEMPLATE
│ ├── benchmark.yaml
│ ├── onboarding-sre.yaml
│ └── onboarding.yaml
└── GH_ACTIONS_DOCS.md
├── LEADERBOARD_CISO.md
├── .pre-commit-config.yaml
├── .secrets.baseline
├── LEADERBOARD_SRE.md
├── docs
├── leaderboard.md
└── how-to-launch-benchmark-ciso.md
├── README.md
└── LICENSE
/it_bench_arxiv.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/ITBench/HEAD/it_bench_arxiv.pdf
--------------------------------------------------------------------------------
/images/select-org.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/select-org.png
--------------------------------------------------------------------------------
/images/select-repo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/select-repo.png
--------------------------------------------------------------------------------
/images/go-to-github-app.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/go-to-github-app.png
--------------------------------------------------------------------------------
/images/sample_it_tasks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/sample_it_tasks.png
--------------------------------------------------------------------------------
/images/agent-issue-selection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/agent-issue-selection.png
--------------------------------------------------------------------------------
/images/benchmark-registration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/benchmark-registration.png
--------------------------------------------------------------------------------
/images/agent-registration-done.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/agent-registration-done.png
--------------------------------------------------------------------------------
/images/agent-registration-email.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/agent-registration-email.png
--------------------------------------------------------------------------------
/images/agent-registration-fill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/agent-registration-fill.png
--------------------------------------------------------------------------------
/images/benchmark-registration-done.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/benchmark-registration-done.png
--------------------------------------------------------------------------------
/images/benchmark-registration-fill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/benchmark-registration-fill.png
--------------------------------------------------------------------------------
/images/benchmark-registration-email.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/benchmark-registration-email.png
--------------------------------------------------------------------------------
/CONTRIBUTORS.md:
--------------------------------------------------------------------------------
1 | # Contributors
2 | - Saurabh Jha
3 | - Rohan Arora
4 | - Yuji Watanabe
5 | - Takumi Yanagawa
6 | - Yinfang Chen (UIUC - University of Illinois at Urbana-Champaign)
7 | - Jackson Clark (UIUC - University of Illinois at Urbana-Champaign)
8 | - Bhavya Bhavya
9 | - Mudit Verma
10 | - Harshit Kumar
11 | - Hirokuni Kitahara
12 | - Noah Zheutlin
13 | - Saki Takano
14 | - Divya Pathak
15 | - Felix George
16 | - Xinbo Wu (UIUC - University of Illinois at Urbana-Champaign)
17 | - Bekir O Turkkan
18 | - Gerard Vanloo
19 | - Michael Nidd
20 | - Ting Dai
21 | - Oishik Chatterjee
22 | - Pranjal Gupta
23 | - Suranjana Samanta
24 | - Pooja Aggarwal
25 | - Rong Lee
26 | - Pavankumar Murali
27 | - Jae-wook Ahn
28 | - Debanjana Kar
29 | - Ameet Rahane
30 | - Carlos Fonseca
31 | - Amit Paradkar
32 | - Yu Deng
33 | - Pratibha Moogi
34 | - Prateeti Mohapatra
35 | - Naoki Abe
36 | - Chandrasekhar Narayanaswami
37 | - Tianyin Xu (UIUC - University of Illinois at Urbana-Champaign)
38 | - Lav R. Varshney (UIUC - University of Illinois at Urbana-Champaign)
39 | - Ruchi Mahindru
40 | - Anca Sailer
41 | - Laura Shwartz
42 | - Daby Sow
43 | - Nicholas C. M. Fuller
44 | - Ruchir Puri
45 |
--------------------------------------------------------------------------------
/.github/workflows/parse_issue.py:
--------------------------------------------------------------------------------
import json
import re
import sys

# Compiled once: matches a GitHub issue-form checkbox line, e.g. "- [x] Change",
# capturing the checked state and the label. Used for both detection and
# extraction so the two can never disagree.
_CHECKBOX_RE = re.compile(r"^- \[([ xX])\] (.+)")


def parse_issue_body(issue_body: str) -> dict:
    """Parse a GitHub issue-form body into a {section heading: value} mapping.

    Sections are delimited by "### " headings (one per form field, as GitHub
    renders issue-form submissions). A section whose non-blank lines are all
    checkboxes becomes a {label: bool} dict; any other section becomes its
    text value, with the empty string for a blank section.

    The mapping is printed to stdout as a single JSON object (consumed by the
    GitHub Actions workflows in this repo) and also returned for callers/tests.

    Args:
        issue_body: Raw markdown body of the issue.

    Returns:
        dict mapping each section heading to a string or a {label: bool} dict.
    """
    result = {}
    # re.MULTILINE anchors ^ at every line start, so each "### " heading
    # opens a new section; text before the first heading lands in sections[0].
    sections = re.split(r"^###\s+", issue_body, flags=re.MULTILINE)

    for section in sections:
        lines = section.strip().splitlines()
        if not lines:
            # Empty preamble before the first heading, or an empty section.
            continue
        key = lines[0].strip()
        # Drop blank lines between the heading and its value.
        value_lines = [line for line in lines[1:] if line != ""]

        if not value_lines:
            # No content under the heading (covers both "no lines at all" and
            # "only blank lines" — previously the latter produced {}).
            result[key] = ""
        elif all(_CHECKBOX_RE.match(line) for line in value_lines):
            # Checkbox group: map each label to whether it was checked.
            options = {}
            for line in value_lines:
                match = _CHECKBOX_RE.match(line)
                options[match.group(2).strip()] = match.group(1).lower() == "x"
            result[key] = options
        else:
            result[key] = "\n".join(value_lines).strip()

    print(json.dumps(result))
    return result


if __name__ == "__main__":
    parse_issue_body(sys.stdin.read())
--------------------------------------------------------------------------------
/LEADERBOARD_CISO.md:
--------------------------------------------------------------------------------
1 | ## 📊 IT Bench Leaderboard (CISO)
2 | This leaderboard shows the performance of agents on CISO-related IT automation scenarios.
3 | For details on how to participate or interpret results, see the [README](../main/README.md).
4 |
5 | **Column Descriptions:**
6 | - *Score*: Average benchmark score across scenarios (1.0 = perfect)
7 | - *#Passed*: Number of scenarios successfully passed
8 | - *Mean Agent Execution Duration*: Average time taken across scenarios
9 | - *Scenario Category*: Categories of evaluated tasks (e.g., RHEL, Kyverno, etc.)
10 |
11 | Updated on: 02/05/2025 18:06:54
12 |
13 | ---
14 |
15 | | Agent Name | Agent Submitter | Organization | Scenario Category | Score ⬆️ | #Passed | Mean Agent Execution Duration | Date (UTC) | Issue Link |
16 | |--------------|-----------------|--------------|-------------------|----------|------------------|----------------------------|------------|------------|
17 | | ciso-agent-expert-rhel9-opa | [xinbowu2](https://github.com/xinbowu2) | University of Illinois at Urbana-Champaign (UIUC) | Gen-CIS-b-RHEL9-Ansible-OPA | 0.30 | 3 | 134s | 02/05/2025 05:51:40 | [#30](https://github.com/itbench-hub/ITBench/issues/30) |
18 | | pre-release-agent-2025-0428 | [yana1205](https://github.com/yana1205) | IBM Research - Tokyo | Gen-CIS-b-K8s-Kyverno | 0.20 | 2 | 109s | 28/04/2025 23:08:42 | [#28](https://github.com/itbench-hub/ITBench/issues/28) |
19 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # This is an example configuration to enable detect-secrets in the pre-commit hook.
2 | # Add this file to the root folder of your repository.
3 | #
4 | # Read pre-commit hook framework https://pre-commit.com/ for more details about the structure of config yaml file and how git pre-commit would invoke each hook.
5 | #
6 | # This line indicates we will use the hook from ibm/detect-secrets to run scan during committing phase.
7 | repos:
8 | - repo: https://github.com/ibm/detect-secrets
9 | # If you desire to use a specific version of detect-secrets, you can replace `master` with other git revisions such as branch, tag or commit sha.
10 | # You are encouraged to use static refs such as tags, instead of branch name
11 | #
12 | # Running "pre-commit autoupdate" automatically updates rev to latest tag
13 | rev: 0.13.1+ibm.62.dss
14 | hooks:
15 | - id: detect-secrets # pragma: whitelist secret
16 | # Add options for detect-secrets-hook binary. You can run `detect-secrets-hook --help` to list out all possible options.
17 | # You may also run `pre-commit run detect-secrets` to preview the scan result.
18 | # when "--baseline" without "--use-all-plugins", pre-commit scan with just plugins in baseline file
19 | # when "--baseline" with "--use-all-plugins", pre-commit scan with all available plugins
20 | # add "--fail-on-unaudited" to fail pre-commit for unaudited potential secrets
21 | args: [--baseline, .secrets.baseline, --use-all-plugins]
22 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/benchmark.yaml:
--------------------------------------------------------------------------------
1 | name: IT Bench new benchmark request
2 | description: Request for a new benchmark to be run.
3 | title: "[Registration]: < agent and benchmark name here >"
4 | labels: ["benchmark"]
5 | body:
6 | - type: markdown
7 | attributes:
8 | value: |
9 | Thank you for your interest in benchmarking an IT Bench Agent.
 10 |         Thank you for your interest in benchmarking an IT Bench Agent.
11 |
12 | ## Important!
13 |
14 | Before you submit this form, you need to have already registered your agent using the agent registration issue template.
15 | Currently, please use the same GitHub account to open the benchmark issue as the one used for the agent registration.
16 |
17 | You can register your agent using either of the following links:
18 | - [SRE Agent Registration](../itbench/issues/new?template=onboarding-sre.yaml)
19 | - [CISO Agent Registration](../itbench/issues/new?template=onboarding.yaml)
20 |
21 | - type: input
22 | id: repo_url
23 | attributes:
24 | label: "Config Repo"
25 | description: |
26 | Provide the GitHub Repository URL where your agent configuration is stored (this is the same repo used in the registration step.)
27 | placeholder: "e.g. https://github.com/your_org/repo_name"
28 | validations:
29 | required: true
30 |
31 | - type: input
32 | id: benchmark-name
33 | attributes:
34 | label: Benchmark Name
35 | placeholder: my-new-benchmark
36 | validations:
37 | required: true
38 | - type: dropdown
39 | id: schedule-now
40 | attributes:
41 | label: Schedule Now
42 | description: Do you want this benchmark to be immediately scheduled?
43 | options:
44 | - 'true'
45 | - 'false'
46 | default: 0
47 | validations:
48 | required: true
49 |
50 |
51 |
52 | - type: markdown
53 | attributes:
54 | value: Thank you for completing this form, we will review your request shortly.
--------------------------------------------------------------------------------
/.secrets.baseline:
--------------------------------------------------------------------------------
1 | {
2 | "exclude": {
3 | "files": null,
4 | "lines": null
5 | },
6 | "generated_at": "2025-05-01T17:44:35Z",
7 | "plugins_used": [
8 | {
9 | "name": "AWSKeyDetector"
10 | },
11 | {
12 | "name": "ArtifactoryDetector"
13 | },
14 | {
15 | "name": "AzureStorageKeyDetector"
16 | },
17 | {
18 | "base64_limit": 4.5,
19 | "name": "Base64HighEntropyString"
20 | },
21 | {
22 | "name": "BasicAuthDetector"
23 | },
24 | {
25 | "name": "BoxDetector"
26 | },
27 | {
28 | "name": "CloudantDetector"
29 | },
30 | {
31 | "ghe_instance": "github.ibm.com",
32 | "name": "GheDetector"
33 | },
34 | {
35 | "name": "GitHubTokenDetector"
36 | },
37 | {
38 | "hex_limit": 3,
39 | "name": "HexHighEntropyString"
40 | },
41 | {
42 | "name": "IbmCloudIamDetector"
43 | },
44 | {
45 | "name": "IbmCosHmacDetector"
46 | },
47 | {
48 | "name": "JwtTokenDetector"
49 | },
50 | {
51 | "keyword_exclude": null,
52 | "name": "KeywordDetector"
53 | },
54 | {
55 | "name": "MailchimpDetector"
56 | },
57 | {
58 | "name": "NpmDetector"
59 | },
60 | {
61 | "name": "PrivateKeyDetector"
62 | },
63 | {
64 | "name": "SlackDetector"
65 | },
66 | {
67 | "name": "SoftlayerDetector"
68 | },
69 | {
70 | "name": "SquareOAuthDetector"
71 | },
72 | {
73 | "name": "StripeDetector"
74 | },
75 | {
76 | "name": "TwilioKeyDetector"
77 | }
78 | ],
79 | "results": {
80 | "docs/how-to-launch-benchmark-ciso.md": [
81 | {
82 | "hashed_secret": "d1da57683505716a1a8716658c4432742355360a",
83 | "is_verified": false,
84 | "line_number": 15,
85 | "type": "Secret Keyword",
86 | "verified_result": null
87 | }
88 | ]
89 | },
90 | "version": "0.13.1+ibm.62.dss",
91 | "word_list": {
92 | "file": null,
93 | "hash": null
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/onboarding-sre.yaml:
--------------------------------------------------------------------------------
1 | name: IT Bench Agent Registration (SRE)
2 | description: Register your SRE ITBench agent for benchmarking
3 | title: "[Registration - SRE]: < agent name here >"
4 | labels: ["registration"]
5 | body:
6 | - type: markdown
7 | attributes:
8 | value: |
  9 |         Thank you for your interest in benchmarking an SRE ITBench Agent.
10 | Please fill out this form to request connection details for the IT Bench service.
11 |
12 | ## Important!
13 |
14 | Before you submit this form, you need to have completed the following tasks
15 | (See also [Getting Started](../itbench-leaderboard?tab=readme-ov-file#prerequisites)):
16 |
17 | 1. Create an empty repository in GitHub with visibility set to private
18 | 2. Install the [ibm-itbench](https://github.com/apps/ibm-itbench-github-app) app into that repository.
19 | 3. Make sure that the person submitting this issue is added as a collaborator to that repository.
20 |
 21 |         Once those three steps have been completed, please complete this form and provide the URL for the
22 | repository you created in the relevant section.
23 |
24 | ### Reference
25 |
26 | You can find examples of expected agent actions and outputs in the sample scenario repository:
27 | https://github.com/IBM/ITBench-Scenarios/blob/main/sre/docs/incident_scenarios.md
28 |
29 | ---
30 |
31 | - type: input
32 | id: agent-name
33 | attributes:
34 | label: Agent Name
35 | description: Please avoid using spaces in the name.
36 | placeholder: my-itbench-agent
37 | validations:
38 | required: true
39 | - type: dropdown
40 | id: agent-type
41 | attributes:
42 | label: Agent Type
43 | description: What type of agent is this?
44 | options:
45 | - SRE
46 | default: 0
47 | validations:
48 | required: true
49 | - type: dropdown
50 | id: agent-level
51 | attributes:
52 | label: Agent Level
53 | description: What level of agent is this?
54 | options:
55 | - Beginner
56 | - Intermediate
57 | - Expert
58 | default: 0
59 | validations:
60 | required: true
61 | - type: checkboxes
62 | id: scenario-categories
63 | attributes:
64 | label: Agent Scenarios
65 | description: You may select more than one, options not applicable to the agent type will be ignored.
66 | options:
67 | - label: Change
68 | - label: Configuration Setting
69 | - label: Latency
70 | - label: Resource Unavailable
71 | - label: Other
72 | - type: input
73 | id: repo_url
74 | attributes:
75 | label: "Config Repo"
76 | description: |
 77 |         Provide the URL of the GitHub repository in which we will create the data required for the benchmark.
 78 |         Please install the [ibm-itbench](https://github.com/apps/ibm-itbench-github-app) GitHub App in the repository before submitting this form!
79 | placeholder: "e.g. https://github.com/your_org/repo_name"
80 | validations:
81 | required: true
82 |
83 | - type: markdown
84 | attributes:
85 | value: Thank you for completing this form, we will review your request shortly.
86 |
--------------------------------------------------------------------------------
/LEADERBOARD_SRE.md:
--------------------------------------------------------------------------------
1 | ## 📊 IT Bench Leaderboard (SRE)
2 | This leaderboard shows the performance of agents on SRE-related IT automation scenarios.
3 |
4 | **Column Descriptions:**
  5 | - *Diagnosis - NTAM Fault Localization*: Normalized Topology Aware Metric (NTAM) Average Fault Localization
  6 | - *Diagnosis - NTAM Fault Propagation*: NTAM Average Fault Propagation Chain
7 | - *% Resolved*: Percentage of incidents repaired (mitigation efficiency)
8 |
9 | Updated on: 02/05/2025 18:06:54
10 |
11 | ### Single Trial
12 | For details on how to participate or interpret results, see the [README](/README.md).
13 |
14 | ---
15 |
16 | | Agent (Name) | Agent Submitter | Organization | Scenario Category | Trials across incidents | Diagnosis - NTAM Fault Localization | Diagnosis - NTAM Fault Propagation | Diagnosis - Time to Diagnosis | Diagnosis - Duration agent tried for Diagnosis | Repair - Time to Repair | % Resolved | Date (UTC) | Issue Link |
17 | |--------------|-----------------|--------------|-------------------|-------------------------|-------------------------------------|------------------------------------|-------------------------------|------------------------------------------------|-------------------------|------------|------------|------------|
18 | | ITBench-SRE-Agent-GPT-4o | [ITBench-SRE-Agent](https://github.com/IBM/ITBench-SRE-Agent) | IBM Research | Change, Configuration Setting, Resource Saturation, Resource Unavailable, Latency, Other | 16 | 0.33 ± 0.08 (σ=0.31) | 0.29 ± 0.06 (σ=0.23) | 69.82 ± 11.30 (σ=15.98) | 70.38 ± 4.98 (σ=19.91) | 220.15 ± 27.25 (σ=54.51) | 25.00 |
19 | | ITBench-SRE-Agent-Granite-3-2 | [ITBench-SRE-Agent](https://github.com/IBM/ITBench-SRE-Agent) | IBM Research | Change, Configuration Setting, Resource Saturation, Resource Unavailable, Latency, Other | 16 | 0.19 ± 0.06 (σ=0.26) | 0.21 ± 0.05 (σ=0.21) | 96.47 ± NaN (σ=NaN) | 93.75 ± 15.90 (σ=63.59) | ∞ ± 0.00 (σ=0.00) | 0.00 |
20 | | ITBench-SRE-Agent-LLama-3-3-70B | [ITBench-SRE-Agent](https://github.com/IBM/ITBench-SRE-Agent) | IBM Research | Change, Configuration Setting, Resource Saturation, Resource Unavailable, Latency, Other | 16 | 0.14 ± 0.04 (σ=0.15) | 0.21 ± 0.04 (σ=0.16) | ∞ ± 0.00 (σ=0.00) | 63.36 ± 3.43 (σ=13.71) | 193.19 ± 1.25 (σ=1.76) | 12.50 |
21 |
22 | ### Multiple Trials (Limited availability; expected general availability (GA) in July, 2025)
23 |
24 | ---
25 |
26 | | Agent (Name) | Agent Submitter | Organization | Scenario Category | Trials across incidents | Diagnosis - NTAM Fault Localization | Diagnosis - NTAM Fault Propagation | Diagnosis - Time to Diagnosis | Diagnosis - Duration agent tried for Diagnosis | Repair - Time to Repair | % Resolved | Date (UTC) | Issue Link |
27 | |--------------|-----------------|--------------|-------------------|-------------------------|-------------------------------------|------------------------------------|-------------------------------|------------------------------------------------|-------------------------|------------|------------|------------|
28 | | ITBench-SRE-Agent-GPT-4o | [ITBench-SRE-Agent](https://github.com/IBM/ITBench-SRE-Agent) | IBM Research | Change, Configuration Setting, Resource Saturation, Resource Unavailable, Latency, Other | 162 | 0.36 ± 0.07 (σ=0.29) | 0.29 ± 0.03 (σ=0.13) | 117.27 ± 36.62 (σ=73.25) | 86.49 ± 8.88 (σ=36.60) | 204.81 ± 9.88 (σ=31.24) | 24.79 |
29 |
--------------------------------------------------------------------------------
/.github/workflows/leaderboard_update.yaml:
--------------------------------------------------------------------------------
1 | name: Leaderboard Update
2 | on:
3 | workflow_dispatch:
4 | inputs:
5 | use-sample:
6 | type: boolean
7 | required: false
8 | description: If set, display leaderboard with sample data
9 | benchmark-id:
10 | type: string
11 | required: false
12 | description: If set, display leaderboard of the provided benchmark id
13 | github-username:
14 | type: string
15 | required: false
16 | description: If set, display leaderboard of the provided github username
17 | jobs:
18 | update_leaderboard:
19 | runs-on: ubuntu-latest
20 | environment: onboarding
21 | name: Update the Leaderboard
22 | steps:
23 | - name: Checkout Repository
24 | uses: actions/checkout@v2
25 |
26 | - name: List Issues of Finished Benchmark
27 | env:
28 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
29 | GH_REPO: ${{ github.repository }}
30 | run: |
31 | gh issue list --label "benchmark" --state "closed" --json number,author,comments > issues.json
32 | jq -c '[.[].number]' issues.json
33 |
34 | usernames=($(jq -r '.[].author.login' issues.json | sort -u))
35 | query='{'$'\n'
36 | for i in "${!usernames[@]}"; do
37 | login="${usernames[$i]}"
38 | query+=" u$((i+1)): user(login: \"${login}\") { login company }"$'\n'
39 | done
40 | query+='}'
41 |
42 | gh api graphql -f query="$query" | jq -r '
43 | .data |
44 | to_entries |
45 | map({ key: .value.login, value: { company: .value.company } }) |
46 | from_entries
47 | ' > users.json
48 |
49 | - name: Pull Leaderboard data
50 | env:
51 | ITBENCH_API: ${{vars.ITBENCH_API}}
52 | ITBENCH_API_TOKEN: ${{ secrets.ITBENCH_API_TOKEN }}
53 | GH_REPO: ${{ github.repository }}
54 | USE_SAMPLE: ${{ github.event.inputs.use-sample }}
55 | BENCHMARK_ID: ${{ github.event.inputs.benchmark-id }}
56 | GITHUB_USERNAME: ${{ github.event.inputs.github-username }}
57 | run: |
58 |
59 | echo "Parse gh issues"
60 | python ./.github/workflows/update_benchmark_helper.py parse -i issues.json -o updated_issues.json
61 |
62 | echo "Requesting Leaderboard data from API"
63 |
64 | if [ "$USE_SAMPLE" == "true" ]; then
65 | python ./.github/workflows/leaderboard.py global --sample -b $BENCHMARK_ID -u $GITHUB_USERNAME --issues updated_issues.json --users users.json --out-overall LEADERBOARD.md --out-ciso LEADERBOARD_CISO.md --out-sre LEADERBOARD_SRE.md
66 | else
67 | python ./.github/workflows/leaderboard.py global --issues updated_issues.json --users users.json --out-overall LEADERBOARD.md --out-ciso LEADERBOARD_CISO.md --out-sre LEADERBOARD_SRE.md
68 | fi
69 |
70 | - name: Open PR
71 | env:
72 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
73 | run: |
74 | git config --global user.name "GitHub Actions"
75 | git config --global user.email "actions@github.com"
76 |
77 | git checkout -b leaderboard
78 |
79 | git add LEADERBOARD_CISO.md LEADERBOARD_SRE.md
80 |
81 | git commit -m "chore: update leaderboard data"
82 |
83 | git push origin leaderboard -f
84 |
85 | gh pr create \
86 | --base main \
87 | --head leaderboard \
88 | --title "chore: update leaderboard data" \
89 | --body "This PR updates the leaderboard automatically via GitHub Actions."
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/onboarding.yaml:
--------------------------------------------------------------------------------
1 | name: IT Bench Agent Registration (CISO)
2 | description: Register your CISO ITBench agent for benchmarking
3 | title: "[Registration - CISO]: < agent name here >"
4 | labels: ["registration"]
5 | body:
6 | - type: markdown
7 | attributes:
8 | value: |
9 | Thank you for your interest in benchmarking a CISO ITBench Agent.
10 | Please fill out this form to request connection details for the IT Bench service.
11 |
12 | ## Important!
13 |
14 | Before you submit this form, you need to have completed the following tasks
15 | (See also [Getting Started](../itbench-leaderboard?tab=readme-ov-file#prerequisites)):
16 |
17 | 1. Create an empty repository in GitHub with visibility set to private
18 | 2. Install the [ibm-itbench](https://github.com/apps/ibm-itbench-github-app) app into that repository.
19 | 3. Make sure that the person submitting this issue is added as a collaborator to that repository.
20 |
 21 |         Once those three steps have been completed, please complete this form and provide the URL for the
22 | repository you created in the relevant section.
23 |
24 | ### Reference
25 |
26 | You can find examples of expected agent actions and outputs in the sample scenario repository:
27 | https://github.com/IBM/ITBench-Scenarios/blob/main/ciso/README.md#scenarios
28 |
29 | ---
30 |
31 | - type: input
32 | id: agent-name
33 | attributes:
34 | label: Agent Name
35 | description: Please avoid using spaces in the name.
36 | placeholder: my-itbench-agent
37 | validations:
38 | required: true
39 | - type: dropdown
40 | id: agent-type
41 | attributes:
42 | label: Agent Type
43 | description: What type of agent is this?
44 | options:
45 | - CISO
46 | default: 0
47 | validations:
48 | required: true
49 | - type: dropdown
50 | id: agent-level
51 | attributes:
52 | label: Agent Level
53 | description: |
54 | Select the level of scenarios you want your Agent to participate in.
55 | **Important:** Categories depend on the Level. Please follow these rules:
56 | - Beginner: only "Kubernetes in Kyverno"
57 | - Intermediate: only "Kubernetes in OPA"
58 | - Expert: "Kubernetes in Kyverno Update" and "RHEL9 in OPA"
59 | options:
60 | - Beginner
61 | - Intermediate
62 | - Expert
63 | default: 0
64 | validations:
65 | required: true
66 | - type: checkboxes
67 | id: scenario-categories
68 | attributes:
69 | label: Agent Scenarios
70 | description: |
71 | Choose the scenario categories for your Agent.
72 | **Please select only the categories that match your Level above.**
73 | - Beginner → Kubernetes in Kyverno
74 | - Intermediate → Kubernetes in OPA
75 | - Expert → Kubernetes in Kyverno Update, RHEL9 in OPA
76 | options:
77 | - label: Kubernetes in Kyverno
78 | - label: Kubernetes in OPA
79 | - label: Kubernetes in Kyverno Update
80 | - label: RHEL9 in OPA
81 | - type: input
82 | id: repo_url
83 | attributes:
84 | label: "Config Repo"
85 | description: |
 86 |         Provide the URL of the GitHub repository in which we will create the data required for the benchmark.
 87 |         Please install the [ibm-itbench](https://github.com/apps/ibm-itbench-github-app) GitHub App in the repository before submitting this form!
88 | placeholder: "e.g. https://github.com/your_org/repo_name"
89 | validations:
90 | required: true
91 |
92 | - type: markdown
93 | attributes:
94 | value: Thank you for completing this form, we will review your request shortly.
95 |
--------------------------------------------------------------------------------
/.github/workflows/update_benchmark_status.yaml:
--------------------------------------------------------------------------------
1 | name: Update Benchmark Status
2 | on:
3 | schedule:
4 | - cron: "*/10 * * * *"
5 | issue_comment:
6 | types: [created]
7 | workflow_dispatch:
8 |
9 | env:
10 | REQUEST_TIMEOUT: ${{ vars.REQUEST_TIMEOUT }}
11 |
12 | jobs:
13 | update_status:
14 | runs-on: ubuntu-latest
15 | environment: onboarding
16 | name: Update the Benchmark Progress
17 | steps:
18 | - name: Determine Trigger Type
19 | id: check_trigger
20 | run: |
21 | if [[ "${{ github.event_name }}" == "issue_comment" ]]; then
22 | COMMENT_BODY=$(jq -r '.comment.body' "$GITHUB_EVENT_PATH")
23 | if [[ "$COMMENT_BODY" == "/refresh" ]]; then
24 | echo "TRIGGER=issue_comment" >> $GITHUB_ENV
25 | ISSUE_NUMBER=$(jq -r '.issue.number' "$GITHUB_EVENT_PATH")
26 | echo "ISSUE_NUMBER=$ISSUE_NUMBER" >> $GITHUB_ENV
27 | else
28 | echo "Not a /refresh command, skipping."
29 | exit 0
30 | fi
31 | else
32 | echo "TRIGGER=schedule" >> $GITHUB_ENV
33 | fi
34 | - name: Checkout Repository
35 | uses: actions/checkout@v2
36 | - name: List Issues with 'track-progress' Label
37 | env:
38 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
39 | GH_REPO: ${{ github.repository }}
40 | run: |
41 | if [[ "$TRIGGER" == "schedule" ]]; then
42 | echo "Scheduled task: List all issues with the 'track-progress' label"
43 | gh issue list --label "track-progress" --state "open" --json number,author,comments > issues.json
44 | elif [[ "$TRIGGER" == "issue_comment" ]]; then
45 | echo "Issue comment trigger: Store only the commented issue"
46 | gh issue view "$ISSUE_NUMBER" --json number,author,comments | jq '[.]' > issues.json
47 | fi
48 | echo "Tracked issues"
49 | jq -c '[.[].number]' issues.json
50 | - name: Process and Update Status
51 | env:
52 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
53 | GH_REPO: ${{ github.repository }}
54 | ITBENCH_API: ${{vars.ITBENCH_API}}
55 | ITBENCH_API_TOKEN: ${{ secrets.ITBENCH_API_TOKEN }}
56 | run: |
57 | if [ ! -s issues.json ]; then
58 | echo "No issues found."
59 | exit 0
60 | fi
61 | echo "Parse gh issues"
62 | python ./.github/workflows/update_benchmark_helper.py parse -i issues.json -o updated_issues.json
63 | echo "Fetch benchmark status"
64 | python ./.github/workflows/update_benchmark_helper.py status -i updated_issues.json -o benchmark_statuses.json
65 | echo "Generate benchmark status comment"
66 | python .github/workflows/update_benchmark_helper.py comment -i benchmark_statuses.json -o benchmark_status_comments.jsonl
67 |
68 | echo "Update each issues"
69 | cat benchmark_status_comments.jsonl | while IFS= read -r line
70 | do
71 | number=$(printf "%s" "$line" | jq -r '.number')
72 | if [[ -z "$number" ]]; then
73 | continue
74 | fi
75 |
76 | status_comment_id=$(printf "%s" "$line" | jq -r '.status_comment_id')
77 | closed=$(printf "%s" "$line" | jq -r '.closed')
78 | body=$(printf "%s" "$line" | jq -r '.comment')
79 |
80 | if [[ "$status_comment_id" == "null" ]]; then
81 | echo " Creating new comment for issue #$number"
82 | gh issue comment "$number" --body "$body"
83 | else
84 | echo " Editing comment $status_comment_id for issue #$number"
85 | gh api --silent -X PATCH /repos/${GH_REPO}/issues/comments/${status_comment_id} -F "body=${body}"
86 | fi
87 |
88 | if [[ "$closed" == "true" ]]; then
89 | echo " Close the issue #$number"
90 | gh issue close $number
91 | fi
92 | done
93 |
94 |
--------------------------------------------------------------------------------
/.github/workflows/update_agent_manifest.yaml:
--------------------------------------------------------------------------------
1 | name: Update Agent Manifest
2 |
3 | on:
4 | workflow_dispatch:
5 | inputs:
6 | benchmark-issue:
7 | type: number
8 | required: true
9 | description: Please input the benchmark issue number
10 |
11 | jobs:
12 | update_agent_manifest:
13 | runs-on: ubuntu-latest
14 | environment: onboarding
15 | name: Update Agent Manifest
16 | steps:
17 | - name: Get Agent Repo
18 | id: get-agent-repo
19 | env:
20 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
21 | GH_REPO: ${{ github.repository }}
22 | BENCHMARK_ISSUE_NUMBER: ${{ github.event.inputs.benchmark-issue }}
23 | run: |
24 | config_repo_url="$(gh issue view $BENCHMARK_ISSUE_NUMBER --json body -q .body | grep -A2 '### Config Repo' | tail -n1)"
25 | echo "Agent Repo: $config_repo_url"
26 | github_username="$(gh issue view $BENCHMARK_ISSUE_NUMBER --json author -q .author.login)"
27 | echo "GitHub Username: $github_username"
28 | agent_repo_owner="$(echo $config_repo_url | awk -F/ '{print $4}')"
29 | agent_repo_name="$(echo $config_repo_url | awk -F/ '{print $5}')"
30 | echo "Agent Repo Owner: $agent_repo_owner"
31 | echo "Agent Repo Name: $agent_repo_name"
32 | echo "agent_repo_owner=$agent_repo_owner" >> "$GITHUB_OUTPUT"
33 | echo "agent_repo_name=$agent_repo_name" >> "$GITHUB_OUTPUT"
34 | echo "github_username=$github_username" >> "$GITHUB_OUTPUT"
35 |
36 | - name: Generate GitHub token on behalf of repo
37 | id: generate-token
38 | uses: actions/create-github-app-token@v1
39 | with:
40 | app-id: ${{ vars.ITBENCH_APP_ID }}
41 | private-key: ${{ secrets.ITBENCH_APP_KEY }}
42 | owner: ${{ steps.get-agent-repo.outputs.agent_repo_owner}}
43 | repositories: ${{ steps.get-agent-repo.outputs.agent_repo_name}}
44 |
45 | - name: Update agent-manifest.json
46 | env:
47 | GH_TOKEN: ${{ steps.generate-token.outputs.token }}
48 | run: |
49 | cleanup() {
50 | echo "Cleaning up agent-manifest.json/agent-manifest.raw.json"
51 | rm -f agent-manifest.json agent-manifest.raw.json agent-manifest.new.json
52 | }
53 | trap cleanup EXIT
54 | trap cleanup SIGINT
55 | trap cleanup SIGTERM
56 |
57 | repo_full_path="repos/${{ steps.get-agent-repo.outputs.agent_repo_owner}}/${{ steps.get-agent-repo.outputs.agent_repo_name}}"
58 | gh api $repo_full_path/contents/agent-manifest.json -q '.content' | base64 -d > agent-manifest.json
59 | agent_id=$(jq -r .metadata.id agent-manifest.json)
60 |
61 | status_code=$(curl -s -X GET \
62 | -H "Authorization: Bearer ${{ secrets.ITBENCH_API_TOKEN }}" \
63 | -H "Content-type: application/json" \
64 | "${{vars.ITBENCH_API}}/gitops/agents/$agent_id?github_username=${{ steps.get-agent-repo.outputs.github_username}}" \
65 | --output agent-manifest.raw.json \
66 | --write-out "%{http_code}")
67 |
68 | if [ "$status_code" -ne 200 ]; then
69 | echo "❌ API request failed with status $status_code"
70 | exit 1
71 | fi
72 |
73 | new_agent_token=$(jq -r '.spec.agent_manifest.token' agent-manifest.raw.json)
74 | if [ -z "$new_agent_token" ] || [ "$new_agent_token" = "null" ]; then
75 | echo "❌ Failed to extract agent token from response"
76 | exit 1
77 | fi
78 |
79 | jq --arg new_agent_token "$new_agent_token" -r '.token=$new_agent_token' agent-manifest.json > agent-manifest.new.json
80 | current_sha=$(gh api $repo_full_path/contents/agent-manifest.json -q '.sha' || echo "")
81 | gh api -X PUT \
82 | -H "Accept: application/vnd.github.v3+json" \
83 | $repo_full_path/contents/agent-manifest.json \
84 | -f message="Update agent-manifest.json via API" \
85 | -f content="$(cat agent-manifest.new.json | base64)" \
86 | -f sha="$current_sha"
--------------------------------------------------------------------------------
/docs/leaderboard.md:
--------------------------------------------------------------------------------
1 | # ITBench-Leaderboard
2 |
3 | ## 🌟 Explore the Leaderboards
4 |
5 | | Domain | Leaderboard |
6 | |--------|-------------|
7 | | 🔐 **CISO** | 👉 [View CISO Leaderboard](../LEADERBOARD_CISO.md) |
8 | | ⚙️ **SRE** | 👉 [View SRE Leaderboard](../LEADERBOARD_SRE.md) |
9 |
10 | ## Getting Started
11 | ### Prerequisites
12 | - **A private GitHub repository**
13 | - A file facilitating the agent and leaderboard handshake is pushed to this private repository.
14 | - The file(s) may be created or deleted automatically during the benchmark lifecycle.
15 | - **A Kubernetes sandbox cluster (KinD recommended)** -- Only needed for CISO
16 | - Do not use a production cluster, because the benchmark process will create and delete resources dynamically.
17 | - Please refer to [prepare-kubeconfig-kind.md](https://github.com/itbench-hub/ITBench-Scenarios/blob/main/ciso/prepare-kubeconfig-kind.md)
18 | - **An agent to benchmark**
19 | - A base agent is available from IBM for immediate use. The base agent for the CISO use case can be found [here](https://github.com/itbench-hub/ITBench-CISO-CAA-Agent), and one for SRE and FinOps use cases can be found [here](https://github.com/itbench-hub/ITBench-SRE-Agent). This allows you to leverage your methodologies and make improvements without having to worry about interactions between the agent and leaderboard service.
20 |
21 | ### Setup
22 |
23 | #### Step 1. Install the ITBench GitHub App
24 | Install the ibm-itbench GitHub app into the private GitHub repository (see Prerequisites).
25 |
26 | 1. Go to the installation page [here](https://github.com/apps/ibm-itbench-github-app).
27 |
28 |
29 | 2. Select your GitHub Organization.
30 |
31 |
32 | 3. Select your Agent configuration repo.
33 |
34 |
35 |
36 | > ⚠️ **Note**: If the repository was created by someone else (e.g., a teammate), ensure that the GitHub account submitting the agent registration issue is added as a **collaborator**.
37 |
38 | #### Step 2. Register your agent
39 | In this step, you will register your agent information with ITBench.
40 |
41 | 1. Create a new registration issue.
42 | - Go to [Agent Registration Form](https://github.com/itbench-hub/ITBench/issues/new/choose) and create a new issue.
43 | 
44 | 2. Fill in the issue template with the following information:
45 | - Agent Name: Your agent name
46 | - Agent Level: "Beginner"
47 | - Agent Scenarios: "Kubernetes in Kyverno"
48 | - Config Repo: URL for your agent configuration repo
49 | (You may adjust the settings depending on the scenarios or agent level.)
50 |
51 |
52 | 3. Submit the issue.
53 | - Click "Create" to submit your registration request.
54 | - Once your request is approved:
55 | - An approved label will be attached to your issue.
56 | - A comment will be added with a link to the generated agent configuration file stored in the specified configuration repository.
57 | Download the linked configuration file to proceed.
58 |
59 |
60 | - If you subscribe to the issue, you will also receive email notifications.
61 |
62 |
63 |
64 | If there are any problems with your submission, we will respond directly on the issue.
65 | If you do not receive any response within a couple of days, please reach out to the [maintainers](../README.md#contacts).
66 |
67 | #### Step 3. Create a benchmark request
68 | In this step, you will register your benchmark entry.
69 | 1. Create a new benchmark issue.
70 | - Go to [Benchmark Registration Form](https://github.com/itbench-hub/ITBench/issues/new/choose) and create a new issue.
71 | - Currently, please use the **same GitHub account** that you used for the agent registration issue.
72 | (This is currently required for the system to correctly associate your benchmark request.)
73 |
74 |
75 | 2. Fill in the issue template.
76 | - The name for the Config Repo must match the repository you used during agent registration.
77 |
78 |
79 | 3. Submit the issue.
80 | - Click "Create" to submit your registration request. Once your request is approved:
81 | - An approved label will be attached to your issue.
82 | - The issue comment will be updated with your Benchmark ID.
83 |
84 |
85 | - If you subscribe to the issue, you will also receive email notifications.
86 |
87 |
88 |
89 | If there are any problems with your submission, we will respond directly on the issue.
90 | If you do not receive any response within a couple of days, please reach out to the [maintainers](../README.md#contacts).
91 |
92 | ### Running your agent or our base agent against the benchmark
93 | You can run either your own custom agent or one of our built-in agents against the ITBench benchmark.
94 |
95 | The following guides and videos demonstrate how to run the benchmark using our built-in agents. These may also serve as helpful references when setting up your own agent:
96 |
97 | - **CISO Agent** – [Documentation](../docs/how-to-launch-benchmark-ciso.md) ・ [Demo Video](https://ibm.box.com/s/3i7mapxyit7ugnbldigqunzs6bkvv4cy)
98 | - **SRE Agent** – [Documentation](https://github.com/itbench-hub/ITBench-SRE-Agent/blob/main/Leaderboard.md)
99 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ITBench
2 |
3 | **[Paper](./it_bench_arxiv.pdf) | [Leaderboard](#leaderboard) | [Scenarios](#scenarios) | [Agents](#agents) | [How to Cite](#how-to-cite) | [Contributors](./CONTRIBUTORS.md) | [Contacts](#contacts)**
4 |
5 | ---
6 |
7 | ## 📢 Announcements
8 |
9 | ### Latest Updates
10 | - **[June 13, 2025]** Identified 25+ additional scenarios to be developed over the summer.
11 | - **[May 2, 2025]** 🚀 ITBench now provides **fully-managed scenario environments** for everyone! Our platform handles the complete workflow—from scenario deployment to agent evaluation and leaderboard updates. Visit our GitHub repository [here](https://github.com/ibm/ITBench-Leaderboard) for guidelines and get started today.
12 | - **[February 28, 2025]** 🏆 **Limited Access Beta**: Invite-only access to the ITBench hosted scenario environments. ITBench handles scenario deployment, agent evaluation, and leaderboard updates. To request access, e-mail us [here](mailto:agent-bench-automation@ibm.com).
13 | - **[February 7, 2025]** 🎉 **Initial release!** Includes research paper, self-hosted environment setup tooling, sample scenarios, and baseline agents.
14 |
15 | ---
16 |
17 | ## Overview
18 |
19 | ITBench measures the performance of AI agents across a wide variety of **complex and real-world inspired IT automation tasks** targeting three key use cases:
20 |
21 | | Use Case | Focus Area |
22 | |----------|------------|
23 | | **SRE** (Site Reliability Engineering) | Availability and resiliency |
24 | | **CISO** (Compliance & Security Operations) | Compliance and security enforcement |
25 | | **FinOps** (Financial Operations) | Cost efficiencies and ROI optimization |
26 |
27 | 
28 |
29 | ### Key Features
30 |
31 | - **Real-world representation** of IT environments and incident scenarios
32 | - **Open, extensible framework** with comprehensive IT coverage
33 | - **Push-button workflows** and interpretable metrics
34 | - **Kubernetes-based** scenario environments
35 |
36 | ### What's Included
37 |
38 | ITBench enables researchers and developers to replicate real-world incidents in Kubernetes environments and develop AI agents to address them.
39 |
40 | **We provide:**
41 | 1. **Push-button deployment tooling** for environment setup *(open-source)*
42 | 2. **Framework for recreating realistic IT scenarios using the deployment tooling:**
43 | - **6 SRE scenarios** and **21 mechanisms** *(open-source)*
44 | - **4 categories of CISO scenarios** *(open-source)*
45 | - **1 FinOps scenario** *(open-source)*
46 | 3. **Two reference AI agents:**
47 | - SRE (Site Reliability Engineering) Agent *(open-source)*
48 | - CISO (Chief Information Security Officer) Agent *(open-source)*
49 | 4. **Fully-managed leaderboard** for agent evaluation and comparison
50 |
51 | ---
52 |
53 | ## Roadmap
54 |
55 | | Timeline | Key Deliverables |
56 | |----------|------------------|
57 | | **July 2025** | • Refactor leading to a scenario specification generator and runner allowing for most (if not all) mechanisms to be re-used across diverse applications and microservices
• Implementation of 10 of the additional scenarios identified |
58 | | **August 2025** | • **SRE-Agent-Lite**: Lightweight agent to assist non-systems personnel with environment debugging
• **Snapshot & Replay**: Data capture and replay capabilities
• Implementation of 15 of the additional scenarios to be developed over the summer|
59 | | **Fall 2025** | **BYOA (Bring Your Own Application)**: Support for custom application integration |
60 |
61 | ---
62 |
63 | ## Leaderboard
64 |
65 | The ITBench Leaderboard tracks agent performance across SRE, FinOps, and CISO scenarios. We provide fully managed scenario environments while researchers/developers run their agents on their own systems and submit their outputs for evaluation.
66 |
67 | | Domain | Leaderboard |
68 | |--------|-------------|
69 | | **SRE** | [View SRE Leaderboard](https://github.com/itbench-hub/ITBench/blob/main/LEADERBOARD_SRE.md) |
70 | | **CISO** | [View CISO Leaderboard](https://github.com/itbench-hub/ITBench/blob/main/LEADERBOARD_CISO.md) |
71 |
72 | > **Get Started**: Visit [docs/leaderboard.md](docs/leaderboard.md) for access and evaluation guidelines.
73 |
74 | ---
75 |
76 | ## Scenarios
77 |
78 | ITBench incorporates a collection of problems that we call **scenarios**. Each scenario is deployed in an operational environment where specific problems occur.
79 |
80 | ### Examples of Scenarios
81 | - **SRE**: Resolve "High error rate on service checkout" in a Kubernetes environment
82 | - **CISO**: Assess compliance posture for "new control rule detected for RHEL 9"
83 | - **FinOps**: Identify and resolve cost overruns and anomalies
84 |
85 | **Find all scenarios**: [Scenarios repository](https://github.com/IBM/ITBench-Scenarios)
86 |
87 | ---
88 |
89 | ## Agents
90 |
91 | Two baseline agents are being open-sourced with ITBench, built using the **CrewAI framework**.
92 |
93 | ### Agent Features
94 | - **Configurable LLMs**: watsonx, Azure, or vLLM support
95 | - **Natural language tools**: Interactions with the environment for information gathering
96 |
97 | ### Available Agents
98 |
99 | | Agent | Repository |
100 | |-------|------------|
101 | | **SRE Agent** | [itbench-sre-agent](https://github.com/IBM/itbench-sre-agent) |
102 | | **CISO Agent** | [itbench-ciso-caa-agent](https://github.com/IBM/itbench-ciso-caa-agent) |
103 |
104 | ---
105 |
106 | ## How to Cite
107 |
108 | ```bibtex
109 | @misc{jha2025itbench,
110 | title={ITBench: Evaluating AI Agents across Diverse Real-World IT Automation Tasks},
111 | author={Jha, Saurabh and Arora, Rohan and Watanabe, Yuji and others},
112 | year={2025},
113 | url={https://github.com/IBM/itbench-sample-scenarios/blob/main/it_bench_arxiv.pdf}
114 | }
115 | ```
116 |
117 | ---
118 |
119 | ## Join the Discussion
120 |
121 | Have questions or need help getting started with ITBench?
122 |
123 | - [**Create a GitHub issue**](https://github.com/IBM/ITBench/issues/new) for bug reports or feature requests
124 | - [**Join our Discord community**](https://discord.gg/6fzy3JRHmt) for real-time discussions
125 | - For formal inquiries, please see the [contacts section](#contacts)
126 |
127 | ---
128 |
129 | ## Contacts
130 |
131 | - **General inquiries**: agent-bench-automation@ibm.com
132 | - **Saurabh Jha**: saurabh.jha@ibm.com
133 | - **Yuji Watanabe**: muew@jp.ibm.com
134 |
--------------------------------------------------------------------------------
/.github/GH_ACTIONS_DOCS.md:
--------------------------------------------------------------------------------
1 | # GitHub Actions Documentation for IT Bench
2 |
3 | ## General
4 |
5 | To support the IT Bench user experience, this repository contains a number of GitHub Actions workflows that automate required tasks.
6 |
7 | These include:
8 | - [Agent Registration](#agent-registration)
9 | - [Public Leaderboard Updates](#public-leaderboard-updates)
10 |
11 |
12 |
13 |
14 | ## Agent Registration
15 |
16 |
17 |
18 | ```mermaid
19 | flowchart TD
20 | A[User Creates Config Repository] --> B
21 | B[User Installs IT Bench App into Config Repository] --> C
22 | C[User Opens registration ticket in IT Bench Repository] --> D{Ticket Approved?}
23 |
24 | D -->|Yes| F
25 | D -->|No| E
26 | E[User makes required changes to registration Ticket] -->D
27 | subgraph Registration Workflow
28 | F[Parse Registration Issue]-->G
29 | G[Verify Config Repo Private and App Installed]--> H
30 | G -->|Failed Check| FAIL
31 | FAIL[Comment on issue with error message and return to user]
32 | H[Register Agent details with IT Bench API]-->I
33 | I[Commit agent_manifest.json to config repo]-->J
34 | J[Reply to issue and close]
35 | end
36 |
37 | ```
38 |
39 |
40 | ### Simple onboarding instructions for users
41 |
42 |
43 | To onboard your agent and get started benchmarking, please follow the following steps:
44 |
45 | 1. Create an empty repository (or use a repository of your choice) on GitHub:
46 | - The repository must be set to private.
47 | - The onboarding process will create a file called `agent-manifest.json` at the root of the repository, so if using an existing repository make sure that there will not be a clash.
48 | 2. Install the [`ibm-itbench`](https://github.com/apps/ibm-itbench) app into the repository that you created in step 1.
49 | 3. Fill out and submit [this issue template](https://github.com/jpwsutton/itbenchautomation/issues/new?template=onboarding.yaml) with the details of the agent you are developing and provide the URL to the GitHub Repo you created in step 1 e.g. https://github.com/jpwsutton/my-test-agent
50 | 4. Once the registration issue has been approved, an automated process will generate a manifest for your agent to access the IT Bench Server and will save it to the root of your repository from step 1. You can now download this file and use it with the agent harness to initiate a benchmark.
51 |
52 |
 53 | ## Public Leaderboard Updates
54 |
55 | ```mermaid
56 | flowchart TD
57 | A[Workflow Initiated] --> B
58 | B[agent-bench-automation repo cloned] --> C
59 | C[leaderboard.py pulls leaderboard data from IT Bench API and generates markdown table] --> D
60 | D[Markdown table is pulled into a larger markdown file with a header] --> E
61 | E[Updated leaderboard markdown file committed and pushed to agent-bench-automation repository]
62 |
63 | ```
64 |
65 |
66 |
67 | ## Setup
68 |
69 |
70 | ### Creating the GitHub Application and configuring for GH Actions
71 |
72 | 1. Go to your [GitHub Apps page](https://github.com/settings/apps) in your developer settings and click "New GitHub App"
73 | 2. Populate the following Settings:
74 | - GitHub App name
75 | - Homepage URL (Set to the IT Bench Repo)
76 | - Disable Webhook
77 | - Permissions:
78 | - Repository Permissions:
79 | - Metadata: Read-Only (Default)
80 | - Single file: Read and Write
81 | - Path: agent-manifest.json
82 | - Organisation Permissions: None
83 | - Account Permissions: None
84 | - Where can this GitHub App be installed? - Any Account
 85 | 3. Within the App settings, generate a private key and back it up in 1Password.
86 | 4. In the GitHub Repo Settings:
87 | - Environments - Create a new environment called `onboarding`
88 | - Environment Secrets:
89 | - `ITBENCH_APP_KEY` - Set to the generated Private Key from step 3
90 | - `ITBENCH_API_TOKEN` - Set to the JWT token for the IT Bench API
91 | - Environment Variables:
92 | - `ITBENCH_APP_ID` - Set to the App ID number
93 | - `ITBENCH_API` - The IT Bench Server API Endpoint
94 | - Actions:
95 | - Allow all actions and reusable workflows
96 | - Require approval for all external contributors
97 | - Read and write permissions
98 | - Runners (If using an external runner)
99 | - Create the runner using default options.
100 | 5. In the issues view create the following labels:
101 | - `approved`
102 | - `benchmark`
103 | - `error`
104 | - `registering`
105 | - `registration`
106 | - `track-progress`
107 |
108 |
109 | ### Self hosted runners
110 |
111 | Because the IT Bench Service is currently running on an internal IBM server, the workflow actions require a self hosted runner with access to the IBM internal network in order to run. The one extra step is to set up the certificate for the internal IT bench server as it is not trusted.
112 |
113 | You can download the certificate (in case the server changes) by running the following command:
114 |
115 | ```bash
116 | openssl s_client -showcerts -connect tokyo-itbench-1.sl.cloud9.ibm.com:443 2>/dev/null | openssl x509 -outform PEM
117 | ```
118 |
119 | To set up the self hosted runner:
120 |
121 | 1. In the GitHub repository settings, go to Actions > Runners and follow the process to create and install a new self hosted runner either on your local machine or an appropriate server on the IBM Network.
121 | 2. Create a file on the same machine where the self hosted runner will run called `itbench_res.pem` and enter the ca certificate below:
122 |
123 | ```
124 | -----BEGIN CERTIFICATE-----
125 | MIIDmDCCAoCgAwIBAgIUFwWsO8VVH739Qa87EH+Y8mPtm40wDQYJKoZIhvcNAQEL
126 | BQAwbTELMAkGA1UEBhMCVVMxETAPBgNVBAgMCE5ldyBZb3JrMREwDwYDVQQHDAhZ
127 | ...
128 | hB6eiBJigoWYTIFryyPIH5KaMTqyDNCKLbqEMgyRlo0D0ZnHGWMI9FkF3r2bMb7p
129 | PAJ1xNviYcUUdVcPQ81H2hHejnFPTtRnnjBwLf6DV4EulVLEOmutbwuxvAvwkpM3
130 | IsI+erZxjtK7paPl
131 | -----END CERTIFICATE-----
132 | ```
133 | 3. Go into the `actions-runner` directory and edit the `.env` file, adding the following line:
134 | `ITBENCH_CERT=/path/to/itbench_res.pem`.
135 | 4. Start the runner with `./run.sh`
136 |
137 |
138 | ### Switching to a public runner
139 |
140 | At some point, the IT bench server will be set up on the public internet. This will require a number of small changes, which are listed here:
141 |
142 | 1. For all workflows, ensure that the `runs-on` field has been changed from `self-hosted` to `ubuntu-latest`.
143 | 2. Remove the `ITBENCH_CERT` references:
144 | - `agent_registration.yaml`: Remove `--cacert $ITBENCH_CERT \` from the curl command in the `generate-manifest` step.
145 | - `leaderboard.py`:
146 | - Remove the `ITBENCH_CERT` environment variable from the top of the file.
147 | - Remove the ssl context from the `get_leaderboard` function.
148 | - Remove the `context` argument from the request call in the `get_leaderboard` function.
--------------------------------------------------------------------------------
/.github/workflows/benchmark_registration.yaml:
--------------------------------------------------------------------------------
1 | name: Register a new agent benchmark
2 |
3 | on:
4 | issues:
5 | types: [labeled]
6 |
7 | jobs:
8 | register_agent:
9 | if: github.event.label.name == 'approved' && contains(github.event.issue.labels.*.name, 'benchmark')
10 | # The type of runner that the job will run on
11 | runs-on: ubuntu-latest
12 | environment: onboarding
13 | name: Registers a benchmark
14 | steps:
15 | - name: Checkout Repository
16 | uses: actions/checkout@v2
17 | - name: Parse issue
18 | id: parse
19 | run: |
20 | echo "${{ github.event.issue.body }}" > issue_body.txt
21 | python ./.github/workflows/parse_issue.py < issue_body.txt > parsed_output.json
22 | echo "payload=$(cat parsed_output.json)" >> $GITHUB_OUTPUT
23 | # Examples on how to use the output
24 | - name: Show parsed payload data and store variables
25 | id: extract-parsed-data
26 | run: |
27 | echo '${{ steps.parse.outputs.payload }}'
28 | agent_repo="${{ fromJson(steps.parse.outputs.payload)['Config Repo']}}"
29 | agent_repo_owner="$(echo $agent_repo | awk -F/ '{print $4}')"
30 | agent_repo_name="$(echo $agent_repo | awk -F/ '{print $5}')"
31 | echo $agent_repo_owner
32 | echo $agent_repo_name
33 | echo "agent_repo_owner=$agent_repo_owner" >> "$GITHUB_OUTPUT"
34 | echo "agent_repo_name=$agent_repo_name" >> "$GITHUB_OUTPUT"
35 | - name: Comment on issue
36 | uses: actions/github-script@v7
37 | env:
38 | COMMENT_BODY: |
39 | 👋 ${{ github.event.issue.user.login }}
40 |
41 | Thank you for submitting your benchmark registration details, we are currently processing your request and will
42 | comment back once the registration has been completed.
43 |
44 | ## Benchmark Details:
45 |
46 | Name: ${{ fromJson(steps.parse.outputs.payload)['Benchmark Name'] }}
47 | Schedule now? ${{ fromJson(steps.parse.outputs.payload)['Schedule Now'] }}
48 |
49 | Target Config Repo: ${{ fromJson(steps.parse.outputs.payload)['Config Repo']}}
50 |
51 | with:
52 | script: |
53 | github.rest.issues.createComment({
54 | issue_number: context.issue.number,
55 | owner: context.repo.owner,
56 | repo: context.repo.repo,
57 | body: process.env.COMMENT_BODY
58 | })
59 | github.rest.issues.addLabels({
60 | issue_number: context.issue.number,
61 | owner: context.repo.owner,
62 | repo: context.repo.repo,
63 | labels: ['registering']
64 | })
65 |
66 |
67 | - name: Generate GitHub token on behalf of repo
68 | id: generate-token
69 | uses: actions/create-github-app-token@v1
70 | with:
71 | app-id: ${{ vars.ITBENCH_APP_ID }}
72 | private-key: ${{ secrets.ITBENCH_APP_KEY }}
73 | owner: ${{ steps.extract-parsed-data.outputs.agent_repo_owner}}
74 | repositories: ${{ steps.extract-parsed-data.outputs.agent_repo_name}}
75 |
76 | - name: Check repository is private
77 | id: check-repo-private
78 | env:
79 | GH_TOKEN: ${{ steps.generate-token.outputs.token }}
80 | run: |
81 | repo_full_path="repos/${{ steps.extract-parsed-data.outputs.agent_repo_owner}}/${{ steps.extract-parsed-data.outputs.agent_repo_name}}"
82 | repo_private=$(gh api $repo_full_path -q '.private')
83 |
84 | echo "Repo Private: $repo_private"
85 |
86 | if [ "$repo_private" = "true" ]; then
87 | echo "Target repository is set to private."
88 | else
89 | echo "Target repository is not set to private. Failing!"
90 | echo "error_public_repo=1" >> "$GITHUB_OUTPUT"
91 | exit 1
92 | fi
93 |
94 | - name: Check Issue opened by repo collaborator
95 | id: check-repo-collaborator
96 | env:
97 | GH_TOKEN: ${{ steps.generate-token.outputs.token }}
98 | run : |
99 | repo_full_path="repos/${{ steps.extract-parsed-data.outputs.agent_repo_owner}}/${{ steps.extract-parsed-data.outputs.agent_repo_name}}/collaborators"
100 | repo_collaborators=$(gh api $repo_full_path -q '[.[].login] | contains(["${{ github.event.issue.user.login }}"])')
101 |
102 | echo "Issue creator is collaborator: $repo_collaborators"
103 |
104 | if [ "$repo_collaborators" = "true" ]; then
105 | echo "Issue creator is collaborator."
106 | else
107 | echo "Issue creator is not a collaborator. Failing!"
108 | exit 1
109 | fi
110 |
111 | - name: Get Agent Details
112 | id: get-agent-config
113 | env:
114 | GH_TOKEN: ${{ steps.generate-token.outputs.token }}
115 | run : |
116 | repo_full_path="repos/${{ steps.extract-parsed-data.outputs.agent_repo_owner}}/${{ steps.extract-parsed-data.outputs.agent_repo_name}}/contents/agent-manifest.json"
117 | agent_id=$(gh api $repo_full_path -q '.content' | base64 -d | jq '.metadata.id')
118 |
119 | echo "Agent ID: $agent_id"
120 |
121 | echo "agent_id=$agent_id" >> "$GITHUB_OUTPUT"
122 |
123 | - name: register-benchmark
124 | id: register-benchmark
125 | run: |
126 |
127 | echo "Registering Benchmark request with IT Bench API"
128 |
129 | response_json='${{steps.parse.outputs.payload}}'
130 |
131 | benchmark_body=$(echo $response_json | jq '{ "name" : ."Benchmark Name", "immediate" : ."Schedule Now"}' | jq --arg AGENT_ID ${{steps.get-agent-config.outputs.agent_id}} '. += {"agent_id": $AGENT_ID}')
132 |
133 | echo $benchmark_body | jq
134 |
135 |
136 | reg_resp=$(curl \
137 | --url ${{vars.ITBENCH_API}}/gitops/create-benchmark?github_username=${{ github.event.issue.user.login }} \
138 | --header "authorization: Bearer ${{ secrets.ITBENCH_API_TOKEN }}" \
139 | --header 'content-type: application/json' \
140 | --data "$benchmark_body")
141 |
142 | echo $reg_resp
143 |
144 | if [[ $? -eq 0 ]]; then
145 |
146 | echo "Request was successful"
147 |
148 | # Check that the spec is in the response body
149 | echo $reg_resp | jq -e '.id?'
150 |
151 |
152 | if [[ $? -eq 0 ]]; then
153 |
154 | echo "benchmark_id=$(echo $reg_resp | jq -r '.id')" >> "$GITHUB_OUTPUT"
155 | echo "benchmark_name=$(echo $reg_resp | jq -r '.name')" >> "$GITHUB_OUTPUT"
156 |
157 |
158 |
159 | else
160 | echo "Body recieved from IT bench was invalid."
161 | echo $reg_resp
162 | exit 1
163 | fi
164 |
165 | else
166 | echo "Request failed."
167 | echo $reg_resp
168 | exit 1
169 | fi
170 |
171 |
172 | - name: Comment on issue
173 | uses: actions/github-script@v7
174 | env:
175 | COMMENT_BODY: |
176 | 👋 ${{ github.event.issue.user.login }}
177 |
178 | The registration of your benchmark is now complete.
179 |
180 | Here are the Details:
181 |
182 |
183 | Name: ${{ steps.register-benchmark.outputs.benchmark_name }}
184 | Type: ${{ steps.register-benchmark.outputs.benchmark_id }}
185 |
186 |
187 |
188 |
189 | with:
190 | script: |
191 | github.rest.issues.createComment({
192 | issue_number: context.issue.number,
193 | owner: context.repo.owner,
194 | repo: context.repo.repo,
195 | body: process.env.COMMENT_BODY
196 | })
197 |
198 |
199 | - name: Report Failure
200 | if: failure()
201 | uses: actions/github-script@v7
202 | env:
203 | PRIVATE_REPO: ${{ steps.check-repo-private.outputs.error_public_repo == 1}}
204 | COMMENT_BODY: |
205 | 👋 ${{ github.event.issue.user.login }}
206 |
207 | Unfortunately there was an unknown issue with registering the benchmark.
208 |
209 | This issue has been marked for manual intervention and the team has been notified.
210 |
211 | ----
212 |
213 | Run link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
214 |
215 |
216 |
217 | with:
218 | script: |
219 |
220 | console.log("Responding with generic error message.")
221 | github.rest.issues.createComment({
222 | issue_number: context.issue.number,
223 | owner: context.repo.owner,
224 | repo: context.repo.repo,
225 | body: process.env.COMMENT_BODY
226 | })
227 | github.rest.issues.addLabels({
228 | issue_number: context.issue.number,
229 | owner: context.repo.owner,
230 | repo: context.repo.repo,
231 | labels: ['error']
232 | })
233 |
234 |
235 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright contributors to the ITBench project.
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/.github/workflows/agent_registration.yaml:
--------------------------------------------------------------------------------
1 | name: Register a new agent
2 |
3 | on:
4 | issues:
5 | types: [labeled]
6 |
7 | jobs:
8 | register_agent:
9 | if: github.event.label.name == 'approved' && contains(github.event.issue.labels.*.name, 'registration')
10 | # The type of runner that the job will run on
11 | runs-on: ubuntu-latest
12 | environment: onboarding
13 | name: Registers an Agent
14 | steps:
15 | - name: Checkout Repository
16 | uses: actions/checkout@v2
17 | - name: Parse issue
18 | id: parse
19 | run: |
20 | echo "${{ github.event.issue.body }}" > issue_body.txt
21 | python ./.github/workflows/parse_issue.py < issue_body.txt > parsed_output.json
22 | echo "payload=$(cat parsed_output.json)" >> $GITHUB_OUTPUT
23 | # Examples on how to use the output
24 | - name: Show parsed payload data and store variables
25 | id: extract-parsed-data
26 | run: |
27 | echo '${{ steps.parse.outputs.payload }}'
28 | agent_repo="${{ fromJson(steps.parse.outputs.payload)['Config Repo']}}"
29 | agent_repo_owner="$(echo $agent_repo | awk -F/ '{print $4}')"
30 | agent_repo_name="$(echo $agent_repo | awk -F/ '{print $5}')"
31 | echo $agent_repo_owner
32 | echo $agent_repo_name
33 | echo "agent_repo_owner=$agent_repo_owner" >> "$GITHUB_OUTPUT"
34 | echo "agent_repo_name=$agent_repo_name" >> "$GITHUB_OUTPUT"
35 | - name: Comment on issue
36 | uses: actions/github-script@v7
37 | env:
38 | COMMENT_BODY: |
39 | 👋 ${{ github.event.issue.user.login }}
40 |
41 | Thank you for submitting your agent registration details, we are currently processing your request and will
42 | be in contact shortly with connection details for your agent harness to use to connect to the IT Bench service.
43 |
44 | ## Agent Details:
45 |
46 | Name: ${{ fromJson(steps.parse.outputs.payload)['Agent Name'] }}
47 | Type: ${{ fromJson(steps.parse.outputs.payload)['Agent Type'] }}
48 | Level: ${{ fromJson(steps.parse.outputs.payload)['Agent Level'] }}
49 |
50 | Target Config Repo: ${{ fromJson(steps.parse.outputs.payload)['Config Repo']}}
51 |
52 | with:
53 | script: |
54 | github.rest.issues.createComment({
55 | issue_number: context.issue.number,
56 | owner: context.repo.owner,
57 | repo: context.repo.repo,
58 | body: process.env.COMMENT_BODY
59 | })
60 | github.rest.issues.addLabels({
61 | issue_number: context.issue.number,
62 | owner: context.repo.owner,
63 | repo: context.repo.repo,
64 | labels: ['registering']
65 | })
66 |
67 |
68 | - name: Generate GitHub token on behalf of repo
69 | id: generate-token
70 | uses: actions/create-github-app-token@v1
71 | with:
72 | app-id: ${{ vars.ITBENCH_APP_ID }}
73 | private-key: ${{ secrets.ITBENCH_APP_KEY }}
74 | owner: ${{ steps.extract-parsed-data.outputs.agent_repo_owner}}
75 | repositories: ${{ steps.extract-parsed-data.outputs.agent_repo_name}}
76 |
77 | - name: Check repository is private
78 | id: check-repo-private
79 | env:
80 | GH_TOKEN: ${{ steps.generate-token.outputs.token }}
81 | run: |
82 | repo_full_path="repos/${{ steps.extract-parsed-data.outputs.agent_repo_owner}}/${{ steps.extract-parsed-data.outputs.agent_repo_name}}"
83 | repo_private=$(gh api $repo_full_path -q '.private')
84 |
85 | echo "Repo Private: $repo_private"
86 |
87 | if [ "$repo_private" = "true" ]; then
88 | echo "Target repository is set to private."
89 | else
90 | echo "Target repository is not set to private. Failing!"
91 | echo "error_public_repo=1" >> "$GITHUB_OUTPUT"
92 | exit 1
93 | fi
94 |
95 | - name: Check Issue opened by repo collaborator
96 | id: check-repo-collaborator
97 | env:
98 | GH_TOKEN: ${{ steps.generate-token.outputs.token }}
99 | run : |
100 | repo_full_path="repos/${{ steps.extract-parsed-data.outputs.agent_repo_owner}}/${{ steps.extract-parsed-data.outputs.agent_repo_name}}/collaborators"
101 | repo_collaborators=$(gh api $repo_full_path -q '[.[].login] | contains(["${{ github.event.issue.user.login }}"])')
102 |
103 | echo "Issue creator is collaborator: $repo_collaborators"
104 |
105 | if [ "$repo_collaborators" = "true" ]; then
106 | echo "Issue creator is collaborator."
107 | else
108 | echo "Issue creator is not a collaborator. Failing!"
109 | exit 1
110 | fi
111 |
112 | - name: generate-manifest
113 | id: generate-manifest
114 | run: |
115 |
116 | echo "Registering Agent with IT Bench API"
117 |
118 | response_json='${{steps.parse.outputs.payload}}'
119 |
120 | agent_body=$(echo $response_json | jq '{"name": ."Agent Name", "type" : ."Agent Type", "level" : ."Agent Level", "scenario_categories" : [."Agent Scenarios" | to_entries[] | select(.value).key]}')
121 |
122 | echo $agent_body | jq
123 |
124 | response_file=$(mktemp)
125 | trap 'echo "Cleaning up $response_file"; rm -f "$response_file"' EXIT
126 | status_code=$(curl \
127 | --url ${{vars.ITBENCH_API}}/gitops/agents?github_username=${{ github.event.issue.user.login }} \
128 | --header "authorization: Bearer ${{ secrets.ITBENCH_API_TOKEN }}" \
129 | --header 'content-type: application/json' \
130 | --data "$agent_body" \
131 | --output "$response_file" \
132 | --write-out "%{http_code}")
133 |
134 | if [[ $? -eq 0 ]]; then
135 |
136 | echo "Curl execution was successful"
137 |
138 | echo "::debug:: $(cat $response_file)"
139 | # Check that the spec is in the response body
140 |
141 | if [[ "$status_code" == "200" || "$status_code" == "201" ]]; then
142 |
143 | echo "manifest=$( cat $response_file | jq '.spec.agent_manifest + {metadata: {id: .metadata.id}}' | base64 -w 0)" >> "$GITHUB_OUTPUT"
144 |
145 | else
146 | msg="Body received from IT Bench was invalid."
147 | echo "$msg"
148 | echo "error=1" >> "$GITHUB_OUTPUT"
149 | error_detail=$(jq -r '.detail // "No detail message in response."' "$response_file")
150 | echo "error_detail=${error_detail}" >> "$GITHUB_OUTPUT"
151 | exit 1
152 | fi
153 |
154 | else
155 | echo "Request failed."
156 | msg="curl execution failed with status code $status_code."
157 | echo "$msg"
158 | echo "error=1" >> "$GITHUB_OUTPUT"
159 | echo "error_detail=$msg" >> "$GITHUB_OUTPUT"
160 | exit 1
161 | fi
162 |
163 | - name: Push manifest to config repository
164 | id: file-push
165 | env:
166 | GH_TOKEN: ${{ steps.generate-token.outputs.token }}
167 | run: |
168 | gh api octocat
169 |
170 | repo_full_path="repos/${{ steps.extract-parsed-data.outputs.agent_repo_owner}}/${{ steps.extract-parsed-data.outputs.agent_repo_name}}/contents/agent-manifest.json"
171 |
172 | echo "Repo Path: $repo_full_path"
173 |
174 | current_sha=$(gh api $repo_full_path -q '.sha' || echo "")
175 |
176 | echo "Current SHA: $current_sha"
177 |
178 | ghout=$(gh api -X PUT \
179 | -H "Accept: application/vnd.github.v3+json" \
180 | $repo_full_path \
181 | -f message="Add agent-manifest.json via API" \
182 | -f content="${{ steps.generate-manifest.outputs.manifest}}" \
183 | -f branch="main" \
184 | -f sha="$current_sha")
185 |
186 | if [[ $? -eq 0 ]]; then
187 | echo $ghout | jq
188 |
189 |
190 | file_path=$(echo $ghout | jq .content.html_url)
191 | echo "File path: $file_path"
192 |
193 | echo "manifest_path=$file_path" >> "$GITHUB_OUTPUT"
194 | fi
195 |
196 | - name: Comment on issue
197 | uses: actions/github-script@v7
198 | env:
199 | COMMENT_BODY: |
200 | 👋 ${{ github.event.issue.user.login }}
201 |
202 | The registration of your agent is now complete.
203 |
204 | Your agent manifest is located at: ${{ steps.file-push.outputs.manifest_path}}
205 |
206 |
207 | ## Agent Details:
208 |
209 | Name: ${{ fromJson(steps.parse.outputs.payload)['Agent Name'] }}
210 | Type: ${{ fromJson(steps.parse.outputs.payload)['Agent Type'] }}
211 | Level: ${{ fromJson(steps.parse.outputs.payload)['Agent Level'] }}
212 |
213 | Target Config Repo: ${{ fromJson(steps.parse.outputs.payload)['Config Repo']}}
214 |
215 | with:
216 | script: |
217 | github.rest.issues.createComment({
218 | issue_number: context.issue.number,
219 | owner: context.repo.owner,
220 | repo: context.repo.repo,
221 | body: process.env.COMMENT_BODY
222 | })
223 |
224 | github.rest.issues.update({
225 | issue_number: context.issue.number,
226 | owner: context.repo.owner,
227 | repo: context.repo.repo,
228 | state: 'closed'
229 | })
230 |
231 |
232 | - name: Report Failure
233 | if: failure()
234 | uses: actions/github-script@v7
235 | env:
236 | PRIVATE_REPO: ${{ steps.check-repo-private.outputs.error_public_repo == 1}}
237 | ERROR_ON_GENERATE_MANIFEST: ${{ steps.generate-manifest.outputs.error == 1 }}
238 | COMMENT_BODY: |
239 | 👋 ${{ github.event.issue.user.login }}
240 |
241 | Unfortunately there was an unknown issue with registering the agent.
242 |
243 | This issue has been marked for manual intervention and the team has been notified.
244 |
245 | ----
246 |
247 | Run link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
248 |
249 | PRIV_REPO_COMMENT_BODY: |
250 | 👋 ${{ github.event.issue.user.login }}
251 |
252 | It looks like the repository you've provided to us is not set to private.
253 | As we will be committing a token to your repository, it needs to be set to private before we can continue.
254 |
255 | Please make the necessary changes and reply back to this issue; our team will then restart the registration process.
256 |
257 | ----
258 |
259 | Run link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
260 |
261 | ERROR_ON_GENERATE_MANIFEST_COMMENT_BODY: |
262 | 👋 ${{ github.event.issue.user.login }}
263 |
264 | There was an issue while registering the agent.
265 |
266 | Error Detail:
267 | ${{ steps.generate-manifest.outputs.error_detail }}
268 |
269 | ----
270 |
271 | Run link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
272 |
273 | with:
274 | script: |
275 | console.log(`Private Repo: ${process.env.PRIVATE_REPO}`)
276 |
277 | if (process.env.PRIVATE_REPO == 'true'){
278 | console.log("Responding with non private repo message.")
279 | github.rest.issues.createComment({
280 | issue_number: context.issue.number,
281 | owner: context.repo.owner,
282 | repo: context.repo.repo,
283 | body: process.env.PRIV_REPO_COMMENT_BODY
284 | })
285 | } else if (process.env.ERROR_ON_GENERATE_MANIFEST == 'true') {
286 | console.log("Responding with manifest error message.")
287 | github.rest.issues.createComment({
288 | issue_number: context.issue.number,
289 | owner: context.repo.owner,
290 | repo: context.repo.repo,
291 | body: process.env.ERROR_ON_GENERATE_MANIFEST_COMMENT_BODY
292 | })
293 | } else {
294 | console.log("Responding with generic error message.")
295 | github.rest.issues.createComment({
296 | issue_number: context.issue.number,
297 | owner: context.repo.owner,
298 | repo: context.repo.repo,
299 | body: process.env.COMMENT_BODY
300 | })
301 | }
302 | github.rest.issues.addLabels({
303 | issue_number: context.issue.number,
304 | owner: context.repo.owner,
305 | repo: context.repo.repo,
306 | labels: ['error']
307 | })
308 |
309 |
310 |
--------------------------------------------------------------------------------
/docs/how-to-launch-benchmark-ciso.md:
--------------------------------------------------------------------------------
1 | # How to Launch the Benchmark (CISO Agent)
2 |
3 | This guide walks you through launching the ITBench benchmark for a CISO agent.
4 | You will run two Docker containers—**Agent Harness** and **Bench Runner**—and keep them running during the evaluation.
5 |
6 | > ⚠️ **Note:** You must not run multiple Agent Harnesses or Bench Runners at the same time.
7 |
8 | ## Option 1: Use the CISO CAA Agent (Prebuilt)
9 |
10 | If you would like to benchmark using the [official CISO CAA Agent](https://github.com/itbench-hub/itbench-ciso-caa-agent), follow these steps:
11 |
12 | 1. Create a .env File
13 | Create a .env file with the following contents:
14 | ```
15 | OPENAI_API_KEY =
16 | OPENAI_MODEL_NAME = gpt-4o-mini
17 | CODE_GEN_MODEL = gpt-4o-mini
18 | ```
19 | If you want to use other models, refer to [this section](https://github.com/itbench-hub/itbench-ciso-caa-agent?tab=readme-ov-file#3-create-env-file-and-set-llm-api-credentials)
20 |
21 | 1. Run CISO Agent Harness Docker container
22 | Run the container, replacing `` and `` with your own paths.
23 | ```
24 | docker run --rm -it --name ciso-agent-harness \
25 | --mount type=bind,src=,dst=/tmp/agent-manifest.json \
26 | --mount type=bind,src=,dst=/etc/ciso-agent/.env \
27 | quay.io/it-bench/ciso-agent-harness:latest \
28 | --host itbench.apps.prod.itbench.res.ibm.com \
29 | --benchmark_timeout 3600
30 | ```
31 |
32 |
33 | 1. Run the CISO DEF Runner Docker Container
34 | Open a new terminal window and run the container, replacing `` and `` with your own paths.
35 |
36 | (If you are benchmarking a RHEL scenario, please refer to [the full specification.](#full-specification-of-bench-runner))
37 | ```
38 | docker run --rm -it --name ciso-bench-runner \
39 | --mount type=bind,src=,dst=/tmp/agent-manifest.json \
40 | --mount type=bind,src=,dst=/tmp/kubeconfig.yaml \
41 | quay.io/it-bench/ciso-bench-runner:latest \
42 | --host itbench.apps.prod.itbench.res.ibm.com \
43 | --runner_id my-ciso-runner-1
44 | ```
45 |
46 |
47 | 1. Benchmark Progress and Status Updates
48 | - The benchmark will proceed automatically after starting:
49 | - The benchmark will typically complete within about one hour, after which both Docker containers will exit automatically.
50 | - Once completed, you can safely close both terminal windows.
51 | - During the benchmark:
52 | - The original registration issue will be updated approximately every 10 minutes.
53 | - A table summarizing the results will appear, showing the status of each scenario.
54 |
55 |
56 |
57 | Table Fields:
58 | | Field | Description |
59 | |:------------------|:----------------------------------------------------|
60 | | Scenario Name | The name of the scenario |
61 | | Description | A short description of the control being assessed |
62 | | Passed | Whether the agent passed the scenario (True/False) |
63 | | Time To Resolve | Time taken to complete |
64 | | Error | Any unexpected error encountered |
65 | | Message | Additional information or status |
66 | | Date | Completion timestamp |
67 |
68 | 5. Once all scenarios are completed:
69 | - The Docker commands will automatically stop.
70 |
71 |
72 |
73 | - The registration issue comment will update its status to **Finished**, and the issue will automatically close.
74 |
75 |
76 |
77 | 6. Troubleshooting
78 |
79 | - If the benchmark fails to start:
80 | - Add a comment to the issue with the text abort.
81 | - Optionally, include additional notes about the problem.
82 |
83 | - If the containers keep running without completing:
84 | - Check if the "Date" field in the table is not updating.
85 | - If it is stuck, terminate the container processes manually (Ctrl+C) and add abort to the issue comment.
86 |
87 | 7. Leaderboard Update:
88 | - The benchmark results will be manually reflected on the leaderboard within a few days.
89 |
90 |
91 |
92 | - If you do not see updates after a few days, please reach out to [Contact Support](#contact-support).
93 |
94 |
95 | ## Option 2: Use Your Own Agent
96 |
97 | If you are submitting your own custom agent, follow these steps:
98 |
99 | 1. Create Agent Harness config
100 | ```yaml
101 | # This field defines the path where the scenario's environment information is stored.
102 | # When the agent harness runs the command below, the scenario data is fetched from the server and saved at this location.
103 | path_to_data_provided_by_scenario: /tmp/agent/scenario_data.json
104 |
105 | # This field defines the path where the agent's output results should be stored.
106 | # The agent harness uploads this file back to the server for evaluation.
107 | path_to_data_pushed_to_scenario: /tmp/agent/agent_data.txt
108 |
109 | # Command to be run by the agent harness
110 | run:
111 | command: ["/bin/bash"]
112 | args:
113 | - -c
114 | - |
115 |
116 | ```
117 |
118 | The `command` is executed with `args` inside a docker container that is built from a Dockerfile you create (we will instruct in the later section).
119 |
120 | For example, the following is [the Agent Harness config](https://github.com/itbench-hub/ITBench-CISO-CAA-Agent/blob/main/agent-harness.yaml) of the sample CISO CAA Agent. It appears complex because it includes error handling. When creating your own harness config, it doesn’t need to be this complicated. However, make sure to include proper termination handling to avoid infinite loops.
121 |
122 | ```yaml
123 | path_to_data_provided_by_scenario: /tmp/agent/scenario_data.json
124 | path_to_data_pushed_to_scenario: /tmp/agent/agent_data.tar
125 | run:
126 | command: ["/bin/bash"]
127 | args:
128 | - -c
129 | - |
130 |
131 | timestamp=$(date +%Y%m%d%H%M%S)
132 | tmpdir=/tmp/agent/${timestamp}
133 | mkdir -p ${tmpdir}
134 |
135 | cat /tmp/agent/scenario_data.json > ${tmpdir}/scenario_data.json
136 |
137 | jq -r .goal_template ${tmpdir}/scenario_data.json > ${tmpdir}/goal_template.txt
138 | jq -r .vars.kubeconfig ${tmpdir}/scenario_data.json > ${tmpdir}/kubeconfig.yaml
139 | jq -r .vars.ansible_ini ${tmpdir}/scenario_data.json > ${tmpdir}/ansible.ini
140 | jq -r .vars.ansible_user_key ${tmpdir}/scenario_data.json > ${tmpdir}/user_key
141 | chmod 600 ${tmpdir}/user_key
142 | sed -i.bak -E "s|(ansible_ssh_private_key_file=\")[^\"]*|\1${tmpdir}/user_key|" ${tmpdir}/ansible.ini
143 |
144 | sed "s|{{ kubeconfig }}|${tmpdir}/kubeconfig.yaml|g" ${tmpdir}/goal_template.txt > ${tmpdir}/goal.txt
145 | sed -i.bak -E "s|\{\{ path_to_inventory \}\}|${tmpdir}/ansible.ini|g" ${tmpdir}/goal.txt
146 |
147 | echo "You can use \`${tmpdir}\` as your workdir." >> ${tmpdir}/goal.txt
148 |
149 | source .venv/bin/activate
150 | timeout 200 python src/ciso_agent/main.py --goal "`cat ${tmpdir}/goal.txt`" --auto-approve -o ${tmpdir}/agent-result.json || true
151 |
152 | tar -C ${tmpdir} -cf /tmp/agent/agent_data.tar .
153 | ```
154 |
155 | 1. Timestamped Temporary Directory Creation
156 | ```
157 | timestamp=$(date +%Y%m%d%H%M%S)
158 | tmpdir=/tmp/agent/${timestamp}
159 | mkdir -p ${tmpdir}
160 | ```
161 | 2. Scenario Data Processing
162 | ```
163 | cat /tmp/agent/scenario_data.json > ${tmpdir}/scenario_data.json
164 | ```
165 | Copies the downloaded scenario data from IT Bench, which is specified in `path_to_data_provided_by_scenario`, into the temporary directory.
166 | 3. Extracting Key Variables to be passed to python command arguments to run the CISO CAA Agent
167 | ```
168 | jq -r .goal_template ${tmpdir}/scenario_data.json > ${tmpdir}/goal_template.txt
169 | jq -r .vars.kubeconfig ${tmpdir}/scenario_data.json > ${tmpdir}/kubeconfig.yaml
170 | jq -r .vars.ansible_ini ${tmpdir}/scenario_data.json > ${tmpdir}/ansible.ini
171 | jq -r .vars.ansible_user_key ${tmpdir}/scenario_data.json > ${tmpdir}/user_key
172 | chmod 600 ${tmpdir}/user_key
173 | ```
174 |
175 | 4. Updating ansible.ini with User Key for RHEL scenario cases.
176 | ```
177 | sed -i.bak -E "s|(ansible_ssh_private_key_file=\")[^\"]*|\1${tmpdir}/user_key|" ${tmpdir}/ansible.ini
178 | ```
179 | 5. Preparing the Goal File to be passed to python command arguments to run the CISO CAA Agent
180 | ```
181 | sed "s|{{ kubeconfig }}|${tmpdir}/kubeconfig.yaml|g" ${tmpdir}/goal_template.txt > ${tmpdir}/goal.txt
182 | sed -i.bak -E "s|\{\{ path_to_inventory \}\}|${tmpdir}/ansible.ini|g" ${tmpdir}/goal.txt
183 | echo "You can use \`${tmpdir}\` as your workdir." >> ${tmpdir}/goal.txt
184 | ```
185 | 6. Running the Agent (Automated or Manual)
186 | ```
187 | source .venv/bin/activate
188 | timeout 200 python src/ciso_agent/main.py --goal "`cat ${tmpdir}/goal.txt`" --auto-approve -o ${tmpdir}/agent-result.json || true
189 | ```
190 | - Enable python virtual env
191 | - Runs main.py with the goal extracted from goal.txt.
192 | - Enforces a timeout of 200 seconds to avoid infinite running.
193 | - Saves the result as agent-result.json in `${tmpdir}` directory.
194 | 7. Archiving the Execution Data by the agent
195 | The CISO CAA Agent generates compliance policy programs and stores them in the designated working directory. The script ensures that all relevant execution data is archived for further analysis.
196 | ```
197 | tar -C ${tmpdir} -cf /tmp/agent/agent_data.tar .
198 | ```
199 | 1. Create a Docker image
200 | The docker image is built from Agent Harness base image and is expected to contain your Agent (e.g. crewai python program).
201 |
202 | For example, the Dockerfile is as follows in the case of CISO Agent:
203 | ```
204 | FROM icr.io/agent-bench/ciso-agent-harness-base:0.0.3 AS base
205 | RUN ln -sf /bin/bash /bin/sh
206 | RUN apt update -y && apt install -y curl gnupg2 unzip ssh
207 |
208 | # install dependencies here to avoid too much build time
209 | COPY itbench-ciso-caa-agent /etc/ciso-agent
210 | WORKDIR /etc/ciso-agent
211 | RUN python -m venv .venv && source .venv/bin/activate && pip install -r requirements-dev.txt --no-cache-dir
212 |
213 | # install `ansible-playbook`
214 | RUN pip install --upgrade ansible-core jmespath kubernetes==31.0.0 setuptools==70.0.0 --no-cache-dir
215 | RUN ansible-galaxy collection install kubernetes.core community.crypto
216 | RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
217 | # install `jq`
218 | RUN apt update -y && apt install -y jq
219 | # install `kubectl`
220 | RUN curl -LO https://dl.k8s.io/release/v1.31.0/bin/linux/$(dpkg --print-architecture)/kubectl && \
221 | chmod +x ./kubectl && \
222 | mv ./kubectl /usr/local/bin/kubectl
223 | # install `aws` (need this for using kubectl against AWS cluster)
224 | RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" && \
225 | unzip awscliv2.zip && \
226 | ./aws/install
227 | # install `opa`
228 | RUN curl -L -o opa https://github.com/open-policy-agent/opa/releases/download/v1.0.0/opa_linux_$(dpkg --print-architecture)_static && \
229 | chmod +x ./opa && \
230 | mv ./opa /usr/local/bin/opa
231 |
232 | RUN python -m venv .venv && source .venv/bin/activate && pip install -e /etc/ciso-agent --no-cache-dir
233 |
234 | COPY agent-bench-automation.wiki/.gist/agent-harness/entrypoint.sh /etc/entrypoint.sh
235 | RUN chmod +x /etc/entrypoint.sh
236 | WORKDIR /etc/agent-benchmark
237 |
238 | ENTRYPOINT ["/etc/entrypoint.sh"]
239 | ```
240 |
241 | ## Conclusion
242 |
243 | Congratulations! You’ve successfully completed the ITBench benchmarking process.
244 |
245 | ## Contact Support
246 |
247 | If you do not receive any response within a couple of days, please leave a comment in your original registration issue and mention our support team.
248 | - Mention: @yana, @rohanarora
249 | - Add Label: `need help`
250 |
251 | Example Comment:
252 | ```
253 | @yana, @rohanarora
254 | Hi, I have not received a response regarding my registration request.
255 | Adding the "need help" label for visibility.
256 | ```
257 |
258 | ## Misc
259 |
260 | #### Full Specification of Bench Runner
261 |
262 | ```
263 | docker run --rm -it --name ciso-bench-runner \
  --mount type=bind,src=<path/to/agent-manifest.json>,dst=/tmp/agent-manifest.json \
  --mount type=bind,src=<path/to/kubeconfig.yaml>,dst=/tmp/kubeconfig.yaml \
  --mount type=bind,src=<path/to/ssh_key>,dst=/tmp/rhel-bundle-config/ssh_key \
  quay.io/it-bench/ciso-bench-runner:latest \
  --host itbench.apps.prod.itbench.res.ibm.com \
  --runner_id my-ciso-runner-1 \
  --rhel_address <rhel-host-address> \
  --rhel_username <rhel-username>
272 | ```
--------------------------------------------------------------------------------
/.github/workflows/leaderboard.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | import re
5 | import urllib.request
6 | from datetime import datetime, timedelta, timezone
7 | from typing import Optional
8 | from urllib.parse import urlencode
9 |
10 | ITBENCH_API = os.getenv("ITBENCH_API")
11 | ITBENCH_API_TOKEN = os.getenv("ITBENCH_API_TOKEN")
12 | GH_REPO = os.getenv("GH_REPO")
13 | REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "10"))
14 |
15 |
16 | def get_leaderboard(benchmark_id: str = None, github_username: str = None):
17 | url = f"{ITBENCH_API}/gitops/aggregate-results"
18 | query_params = {}
19 | if benchmark_id is not None:
20 | query_params["benchmark_id"] = benchmark_id
21 | if github_username is not None:
22 | query_params["github_username"] = github_username
23 | if query_params:
24 | url += "?" + urlencode(query_params)
25 | headers = {"Authorization": f"Bearer {ITBENCH_API_TOKEN}"}
26 | req = urllib.request.Request(url=url, headers=headers, method="GET")
27 | res = urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT)
28 |
29 | if res.getcode() != 200:
30 | print(f"Error requesting leaderboard JSON: {res.status_code}. {res.content}")
31 | exit(1)
32 |
33 | res_body = res.read()
34 | res_dict = json.loads(res_body.decode("utf-8"))
35 | return res_dict
36 |
37 |
38 | def parse_json_timedelta(delta):
39 | if not delta:
40 | return "N/A"
41 |
42 | match = re.match(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+(?:\.\d+)?)S)?", delta)
43 | if not match:
44 | return "Invalid"
45 |
46 | hours = int(match.group(1)) if match.group(1) else 0
47 | minutes = int(match.group(2)) if match.group(2) else 0
48 | seconds = float(match.group(3)) if match.group(3) else 0.0
49 | return str(int(timedelta(hours=hours, minutes=minutes, seconds=seconds).total_seconds())) + "s"
50 |
51 |
52 | def get_timestamp(dt: Optional[datetime] = None) -> str:
53 | if not dt:
54 | dt = datetime.now(timezone.utc)
55 | return dt.strftime("%d/%m/%Y %H:%M:%S")
56 |
57 |
58 | def to_datetime(timestamp: str) -> datetime:
59 | return datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
60 |
61 |
62 | def build_overall_table(leaderboard):
63 | bench_summary = []
64 | prev_score = None
65 | rank = 0
66 | count = 0
67 | for benchmark in leaderboard:
68 | count += 1
69 | if benchmark["score"] != prev_score:
70 | rank = count
71 | name = benchmark["agent"]
72 | github_username_link = benchmark["github_username_link"]
73 | github_username_org = benchmark["github_username_org"]
74 | score = f'{int(benchmark["score"] * 100)}%'
75 | agent_type = benchmark["agent_type"]
76 | checkmarks = "✅" * benchmark["num_of_passed"] if benchmark["num_of_passed"] >= 0 else "N/A"
77 | notes = f'Related to {benchmark["incident_type"]} scenarios'
78 | issue_link = benchmark["issue_link"]
79 |
80 | sre = finops = ciso = "N/A"
81 | if agent_type == "SRE":
82 | sre = checkmarks
83 | elif agent_type == "FinOps":
84 | finops = checkmarks
85 | elif agent_type == "CISO":
86 | ciso = checkmarks
87 | bench_line = [
88 | rank,
89 | name,
90 | github_username_link,
91 | github_username_org,
92 | score,
93 | sre,
94 | finops,
95 | ciso,
96 | issue_link,
97 | notes,
98 | ]
99 | prev_score = benchmark["score"]
100 | bench_summary.append(bench_line)
101 |
102 | header_str = ['Rank', 'Agent Name', 'Agent Submitter', 'Organization', 'Overall Score', 'SRE', 'FinOps', 'CISO', 'Issue Link', 'Notes']
103 | line_fmt = '| {:^4} | {:^20} | {:^13} | {:^13} | {:^13} | {:^13} | {:^13} | {:^13} | {:^13} | {:<30} |'
104 | headers = line_fmt.format(*header_str)
105 | header_len = len(headers)
106 |
107 | texts = []
108 | texts.append("## 📊 IT Bench Leaderboard")
109 | header = """\
110 | This table shows a consolidated view of all agent submissions across different domains (SRE, FinOps, CISO).
111 |
112 | For details on how to participate, see the [README](../README.md).
113 |
114 | **Column Descriptions:**
115 | - *Overall Score*: Combined performance across available domains
116 | - *SRE / FinOps / CISO*: ✅ if benchmarks in that domain were completed
117 | - *Notes*: Additional context on the evaluated scenarios
118 | """
119 | texts.append(header)
120 | texts.append(f"\n\nUpdated on: {get_timestamp()}\n\n")
121 | texts.append("-" * header_len)
122 | texts.append(headers)
123 | texts.append(line_fmt.format(*("---" * 7)))
124 | for bench_line in bench_summary:
125 | texts.append(line_fmt.format(*bench_line))
126 |
127 | return "\n".join(texts)
128 |
129 | def build_ciso_table(leaderboard) -> str:
130 | column_mapping = {
131 | "id": "Benchmark (ID)",
132 | "github_username_link": "Agent Submitter",
133 | "github_username_org": "Organization",
134 | "agent": "Agent Name",
135 | "incident_type": "Scenario Category",
136 | "score": "Score ⬆️",
137 | "mttr": "Mean Agent Execution Duration",
138 | "num_of_passed": "#Passed",
139 | "issue_link": "Issue Link",
140 | "date": "Date (UTC)",
141 | }
142 | columns = ["agent", "github_username_link", "github_username_org", "incident_type", "score", "num_of_passed", "mttr", "date", "issue_link"]
143 | headers = [column_mapping[col] for col in columns]
144 |
145 | texts = []
146 | texts.append("## 📊 IT Bench Leaderboard (CISO)")
147 | header = """\
148 | This leaderboard shows the performance of agents on CISO-related IT automation scenarios.
149 | For details on how to participate or interpret results, see the [README](../main/README.md).
150 |
151 | **Column Descriptions:**
152 | - *Score*: Average benchmark score across scenarios (1.0 = perfect)
153 | - *#Passed*: Number of scenarios successfully passed
154 | - *Mean Agent Execution Duration*: Average time taken across scenarios
155 | - *Scenario Category*: Categories of evaluated tasks (e.g., RHEL, Kyverno, etc.)
156 | """
157 | texts.append(header)
158 | texts.append(f"\n\nUpdated on: {get_timestamp()}\n\n")
159 | texts.append("---")
160 | texts.append("| " + " | ".join(headers) + " |")
161 | texts.append("|" + "|".join(["-" * (len(h) + 2) for h in headers]) + "|")
162 |
163 | for row in leaderboard:
164 | values = []
165 | for col in columns:
166 | val = row.get(col, "")
167 | if col == "mttr":
168 | val = parse_json_timedelta(val)
169 | elif col == "date":
170 | val = get_timestamp(to_datetime(val))
171 | elif isinstance(val, float):
172 | val = f"{val:.2f}"
173 | values.append(str(val))
174 | texts.append("| " + " | ".join(values) + " |")
175 | return "\n".join(texts)
176 |
177 | def get_nested_value(metric_name, content) -> dict:
178 | metric_parent, metric = metric_name.split("__")
179 | nested_dict = content[metric_parent][metric]
180 |
181 | formatted_dict = {k: (lambda v: f"{v:.2f}" if isinstance(v, float) else v)(val)
182 | for k, val in nested_dict.items()}
183 | return json.dumps(formatted_dict)
184 |
185 | def build_sre_table(leaderboard) -> str:
186 | column_mapping = {
187 | "id": "Benchmark (ID)",
188 | "github_username_link": "Agent Submitter",
189 | "github_username_org": "Organization",
190 | "name_decorated": "Benchmark (Name)",
191 | "agent": "Agent (Name)",
192 | "incident_type": "Scenario Category",
193 | "trials": "Trials across incidents",
194 | "percent_agent_submitted_diagnosis_results": "Diagnosis received - % of Trials",
195 | "diagnosis__ntam_fault_localization": "Diagnosis - NTAM Fault Localization",
196 | "diagnosis__ntam_fault_propagation": "Diagnosis - NTAM Fault Propagation",
197 | "diagnosis__time_to_diagnosis": "Diagnosis - Time to Diagnosis",
198 | "diagnosis__duration_agent_tried_for_diagnosis": "Diagnosis - Duration agent tried for Diagnosis",
199 | "repair__time_to_repair": "Repair - Time to Repair",
200 | "percent_resolved": "% Resolved",
201 | "issue_link": "Issue Link",
202 | "date": "Date (UTC)",
203 | }
204 | columns = ["agent", "github_username_link", "github_username_org",
205 | "incident_type", "trials",
206 | "diagnosis__ntam_fault_localization",
207 | "diagnosis__ntam_fault_propagation",
208 | "diagnosis__time_to_diagnosis",
209 | "diagnosis__duration_agent_tried_for_diagnosis",
210 | "repair__time_to_repair",
211 | "percent_resolved",
212 | "date", "issue_link"]
213 | headers = [column_mapping[col] for col in columns]
214 |
215 | texts = []
216 | texts.append("## 📊 IT Bench Leaderboard (SRE)")
217 | header = f"""\
218 | This leaderboard shows the performance of agents on SRE-related IT automation scenarios.
219 | For details on how to participate or interpret results, see the [README](../main/README.md).
220 |
221 | **Column Descriptions:**
222 | - *Diagnosis - NTAM Fault Localization*: Normalized Topology Aware Metric (NTAM) Average Fault Propagation Chain
223 | - *Diagnosis - NTAM Fault Propagation*: NTAM Average Fault Localisation
224 | - *% Resolved*: Percentage of incidents repaired (mitigation efficiency)
225 | """
226 | texts.append(header)
227 | texts.append(f"\n\nUpdated on: {get_timestamp()}\n\n")
228 | texts.append("---")
229 | texts.append("| " + " | ".join(headers) + " |")
230 | texts.append("|" + "|".join(["-" * (len(h) + 2) for h in headers]) + "|")
231 |
232 | for row in leaderboard:
233 | values = []
234 | for col in columns:
235 | val = row.get(col, "")
236 | if col == "mttr":
237 | val = parse_json_timedelta(val)
238 | elif col == "date":
239 | val = get_timestamp(to_datetime(val))
240 | elif (col == "diagnosis__ntam_fault_localization" or
241 | col == "diagnosis__ntam_fault_propagation" or
242 | col == "diagnosis__time_to_diagnosis" or
243 | col == "diagnosis__duration_agent_tried_for_diagnosis" or
244 | col == "repair__time_to_repair"):
245 | val = get_nested_value(col, row)
246 | elif col == "percent_resolved":
247 | val = row.get("repair", {}).get(col, 0.0)
248 | elif isinstance(val, float):
249 | val = f"{val:.2f}"
250 | values.append(str(val))
251 | texts.append("| " + " | ".join(values) + " |")
252 | return "\n".join(texts)
253 |
254 | SAMPLE_DATA = [
255 | {
256 | 'name': 'Run-2',
257 | 'incident_type': 'SRE',
258 | 'agent': 'Agent-104',
259 | 'results': [{}] * 10,
260 | 'mttr': 'PT0S',
261 | 'num_of_passed': 3,
262 | 'score': 0.3,
263 | 'date': '2025-03-11T13:54:23.576999Z',
264 | 'id': 'f324b0ca-5065-435e-a140-1db3f409926d',
265 | 'agent_type': 'SRE',
266 | 'github_username': 'Rohan-Arora',
267 | },
268 | {
269 | 'name': 'My CISO Agent Benchmark',
270 | 'incident_type': 'Gen-CIS-b-K8s-Kyverno',
271 | 'agent': 'My CISO Agent (Yana)',
272 | 'results': [{}] * 10,
273 | 'mttr': 'PT1M5.70376S',
274 | 'num_of_passed': 3,
275 | 'score': 0.3,
276 | 'date': '2025-03-17T00:36:52.334468Z',
277 | 'id': '337e85bf-f29d-4b60-b159-6f66c9d6febe',
278 | 'agent_type': 'CISO',
279 | 'github_username': 'yana1205',
280 | },
281 | {
282 | 'name': 'Top SRE Benchmark',
283 | 'incident_type': 'SRE',
284 | 'agent': 'Baseline SRE Agent',
285 | 'results': [{}] * 10,
286 | 'mttr': 'PT30S',
287 | 'num_of_passed': 7,
288 | 'score': 0.70,
289 | 'date': '2025-03-20T12:00:00Z',
290 | 'id': 'aaa-bbb',
291 | 'agent_type': 'SRE',
292 | 'github_username': 'sre_star',
293 | },
294 | {
295 | 'name': 'Top CISO Benchmark',
296 | 'incident_type': 'Gen-CIS-b-RHEL9-Ansible-OPA',
297 | 'agent': 'Baseline CISO Agentp',
298 | 'results': [{}] * 10,
299 | 'mttr': 'PT1M',
300 | 'num_of_passed': 6,
301 | 'score': 0.6,
302 | 'date': '2025-03-20T12:10:00Z',
303 | 'id': 'ccc-ddd',
304 | 'agent_type': 'CISO',
305 | 'github_username': 'ciso_champ',
306 | },
307 | ]
308 |
309 |
310 | if __name__ == "__main__":
311 |
312 | parser = argparse.ArgumentParser(description="Print IT Bench leaderboard")
313 | parser.add_argument("leaderboard")
314 | parser.add_argument("-u", "--github_username", type=str)
315 | parser.add_argument("-b", "--benchmark_id", type=str)
316 | parser.add_argument("--issues", type=str, required=True)
317 | parser.add_argument("--users", type=str, required=True)
318 | parser.add_argument("--out-ciso", type=str, required=True)
319 | parser.add_argument("--out-sre", type=str, required=True)
320 | parser.add_argument("--out-overall", type=str, required=True)
321 | parser.add_argument("--sample", action="store_true", help="Use sample data")
322 | args = parser.parse_args()
323 | if args.sample:
324 | leaderboard = SAMPLE_DATA
325 | # leaderboard_real = get_leaderboard(args.benchmark_id, args.github_username)
326 | leaderboard_real = []
327 | leaderboard = leaderboard + leaderboard_real
328 | else:
329 | if args.leaderboard == "global":
330 | leaderboard = get_leaderboard()
331 | else:
332 | leaderboard = get_leaderboard(args.benchmark_id, args.github_username)
333 |
334 | with open(args.issues, "r") as f:
335 | issues = json.load(f)
336 |
337 | with open(args.users, "r") as f:
338 | users = json.load(f)
339 |
340 | benchmark_issue_mapping = {issue["benchmark_id"]: issue["number"] for issue in issues}
341 | for item in leaderboard:
342 | number = benchmark_issue_mapping.get(item["id"])
343 | item["issue_link"] = f"[#{number}](https://github.com/{GH_REPO}/issues/{number})" if number else "Not Found"
344 | username = item.get("github_username")
345 | item["github_username_link"] = f"[{username}](https://github.com/{username})" if username else "N/A"
346 | company = users.get(username, {}).get("company")
347 | item["github_username_org"] = company if company else ""
348 | # temporal solution for SRE metrics
349 | if "score" not in item:
350 | item["score"] = item.get("percent_agent_submitted_diagnosis_results", 0.0) / 100
351 | if "num_of_passed" not in item:
352 | item["num_of_passed"] = int(item["score"] * 10) # treate number of pass as decile of score
353 |
354 | leaderboard = sorted(leaderboard, key=lambda x: x["score"], reverse=True)
355 | leaderboard_ciso = [x for x in leaderboard if x["agent_type"] == "CISO"]
356 | leaderboard_sre = [x for x in leaderboard if x["agent_type"] == "SRE"]
357 |
358 | overall_table = build_overall_table(leaderboard)
359 | with open(args.out_overall, "w") as f:
360 | f.write(overall_table)
361 |
362 | ciso_table = build_ciso_table(leaderboard_ciso)
363 | with open(args.out_ciso, "w") as f:
364 | f.write(ciso_table)
365 |
366 | sre_table = build_sre_table(leaderboard_sre)
367 | with open(args.out_sre, "w") as f:
368 | f.write(sre_table)
369 |
--------------------------------------------------------------------------------
/.github/workflows/update_benchmark_helper.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import logging
4 | import os
5 | import re
6 | import textwrap
7 | import urllib.request
8 | from dataclasses import asdict, dataclass, field
9 | from datetime import datetime, timedelta, timezone
10 | from typing import Any, Dict, List, Optional
11 | from urllib.parse import urlparse
12 |
13 | ITBENCH_API = os.getenv("ITBENCH_API")
14 | ITBENCH_API_TOKEN = os.getenv("ITBENCH_API_TOKEN")
15 | LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
16 |
17 | logger = logging.getLogger(__name__)
18 | loglevel = logging.getLevelNamesMapping().get(LOG_LEVEL, logging.INFO)
19 | logging.basicConfig(level=loglevel, format="%(asctime)s - %(levelname)s - %(message)s")
20 | logger.setLevel(loglevel)
21 |
22 |
23 | @dataclass
24 | class UpdatedIssue:
25 | number: int
26 | github_username: str
27 | benchmark_id: str
28 | comments: List[Dict[str, Any]] = field(default_factory=list)
29 |
30 |
31 | @dataclass
32 | class BenchmarkStatus:
33 | number: int
34 | github_username: str
35 | benchmark_id: str
36 | agent_type: str
37 | status: str
38 | error_message: Optional[str] = None
39 | results: List[Dict[str, Any]] = field(default_factory=list)
40 | scenario_name_description_map: Optional[Dict[str, str]] = None
41 | status_comment_id: Optional[str] = None
42 |
43 |
44 | @dataclass
45 | class BenchmarkStatusComment:
46 | number: int
47 | comment: str
48 | closed: bool
49 | status_comment_id: Optional[str] = None
50 |
51 |
52 | def output(args, data):
53 | if args.output:
54 | with open(args.output, "w") as f:
55 | f.write(data)
56 | else:
57 | print(data)
58 |
59 |
60 | class ParseCommand:
61 |
62 | def exec(self, args):
63 | with open(args.input, "r") as f:
64 | issues = json.load(f)
65 |
66 | updated_issues: List[UpdatedIssue] = []
67 | for issue in issues:
68 | number = issue.get("number")
69 | author = issue.get("author", {})
70 | comments = issue.get("comments", [])
71 | benchmark_id_comments = [{"comment": x, "benchmark_id": self.extract_benchmark_id(x)} for x in comments]
72 | benchmark_id_comment = [x for x in benchmark_id_comments if x.get("benchmark_id")]
73 | if len(benchmark_id_comment) == 0:
74 | logger.warning(f"No Benchmark ID comment found for issue {number}, skipping.")
75 | continue
76 | benchmark_id_comment = benchmark_id_comment[0]
77 | updated_issue = UpdatedIssue(
78 | number=number,
79 | github_username=author.get("login"),
80 | benchmark_id=benchmark_id_comment["benchmark_id"],
81 | comments=comments,
82 | )
83 | updated_issues.append(updated_issue)
84 |
85 | data = json.dumps([asdict(x) for x in updated_issues], indent=2)
86 | output(args, data)
87 |
88 | def extract_benchmark_id(self, issue):
89 | pattern = r""
90 | match = re.search(pattern, issue.get("body", ""))
91 | if match:
92 | return match.group("id")
93 | else:
94 | return None
95 |
96 |
97 | class StatusCommand:
98 |
99 | def exec(self, args):
100 | with open(args.input, "r") as f:
101 | updated_issues = json.load(f)
102 |
103 | updated_issues = [UpdatedIssue(**x) for x in updated_issues]
104 | benchmark_statuses: List[BenchmarkStatus] = []
105 | for upd in updated_issues:
106 | github_username = upd.github_username
107 | benchmark_id = upd.benchmark_id
108 |
109 | # find existing status comment
110 | status_comment = [x for x in upd.comments if re.match(r"^### Status", x.get("body", ""))]
111 | if len(status_comment) == 0:
112 | status_comment_id = None
113 | else:
114 | # Example GitHub issue comment URL:
115 | # e.g., https://github.com/yana1205/gitops-bench-0310/issues/10#issuecomment-2726194238
116 | url = status_comment[0].get("url") # Retrieve the comment URL from the status data
117 |
118 | # Parse the URL and extract the fragment part (everything after "#")
119 | # The fragment contains the comment ID, formatted as "issuecomment-"
120 | parsed_url = urlparse(url)
121 | status_comment_id = parsed_url.fragment.replace("issuecomment-", "") # Extract only the numeric comment ID
122 |
123 | # get results of finished scenarios
124 | bench_results, error = self.request(
125 | f"{ITBENCH_API}/gitops/retrieve-results?benchmark_id={benchmark_id}&github_username={github_username}"
126 | )
127 | if error:
128 | bs = self.to_benchmark_status(
129 | upd, error_message="Failed to get benchmark progress.", status="Unkown", status_comment_id=status_comment_id
130 | )
131 | benchmark_statuses.append(bs)
132 | continue
133 | bench_result = bench_results[0] # benchmark_id is specified in query param so the response should contain only 1 item.
134 | benchmark = bench_result.get("benchmark", {})
135 | spec = benchmark.get("spec", {})
136 | agent_type = spec.get("agent_type", None)
137 | status = benchmark.get("status", {})
138 | phase = status.get("phase", "Errored")
139 | results = bench_result.get("results", {})
140 | bs = self.to_benchmark_status(
141 | upd,
142 | agent_type=agent_type,
143 | status=phase,
144 | status_comment_id=status_comment_id,
145 | results=results,
146 | benchmark=benchmark,
147 | )
148 | benchmark_statuses.append(bs)
149 |
150 | data = json.dumps([asdict(x) for x in benchmark_statuses], indent=2)
151 | output(args, data)
152 |
153 | def request(self, url):
154 | headers = {"Authorization": f"Bearer {ITBENCH_API_TOKEN}"}
155 | req = urllib.request.Request(url=url, headers=headers, method="GET")
156 | res = urllib.request.urlopen(req, timeout=10)
157 | if res.getcode() != 200:
158 | logger.error(f"Error requesting benchmark JSON: {res.status_code}. {res.content}")
159 | return None, True
160 | res_body = res.read()
161 | res_dict = json.loads(res_body.decode("utf-8"))
162 | return res_dict, False
163 |
164 | def to_benchmark_status(
165 | self,
166 | upd: UpdatedIssue,
167 | agent_type: str,
168 | status: str,
169 | status_comment_id,
170 | error_message: Optional[str] = None,
171 | results: List[Dict[str, Any]] = [],
172 | benchmark: Optional[Dict[str, Any]] = None,
173 | ):
174 | spec = benchmark.get("spec", {})
175 | scenario_name_description_map = {x["spec"]["name"]: x["spec"]["description"] for x in spec.get("scenarios", [])}
176 | return BenchmarkStatus(
177 | number=upd.number,
178 | github_username=upd.github_username,
179 | benchmark_id=upd.benchmark_id,
180 | agent_type=agent_type,
181 | error_message=error_message,
182 | status=status,
183 | status_comment_id=status_comment_id,
184 | results=results,
185 | scenario_name_description_map=scenario_name_description_map,
186 | )
187 |
188 |
189 | class CommentCommand:
190 |
191 | def exec(self, args):
192 | with open(args.input, "r") as f:
193 | benchmark_statuses = json.load(f)
194 | benchmark_statuses = [BenchmarkStatus(**x) for x in benchmark_statuses]
195 |
196 | benchmark_status_comments: List[BenchmarkStatusComment] = []
197 | for benchmark_status in benchmark_statuses:
198 | if benchmark_status.error_message:
199 | comment = self.to_error_comment(benchmark_status)
200 | else:
201 | comment = self.to_comment(benchmark_status)
202 | closed = benchmark_status.status in ["Finished", "Errored"]
203 | bsc = BenchmarkStatusComment(
204 | number=benchmark_status.number,
205 | status_comment_id=benchmark_status.status_comment_id,
206 | comment=comment,
207 | closed=closed,
208 | )
209 | benchmark_status_comments.append(bsc)
210 |
211 | data = "\n".join([json.dumps(asdict(x)) for x in benchmark_status_comments])
212 | data += "\n"
213 | output(args, data)
214 |
215 | def to_comment(self, benchmark_status: BenchmarkStatus):
216 | if benchmark_status.agent_type == "CISO":
217 | table = self.to_table(benchmark_status)
218 | elif benchmark_status.agent_type == "SRE":
219 | table = self.to_table_sre(benchmark_status)
220 | else:
221 | table = "TBD"
222 | timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
223 | return f"""\
224 | ### Status
225 |
226 | #### Benchmark Status
227 | - **Benchmark ID**: {benchmark_status.benchmark_id}
228 | - **Status**: {benchmark_status.status}
229 |
230 | #### Results of Finished Scenarios
231 | {table}
232 |
233 | #### Last Updated: {timestamp}
234 | """
235 |
236 | def to_error_comment(self, benchmark_status: BenchmarkStatus):
237 | return f"""
238 | ### Status
239 |
240 | #### Benchmark Status
241 | - **Benchmark ID**: {benchmark_status.benchmark_id}
242 | - **Status**: {benchmark_status.status}
243 | - **Message**: {benchmark_status.message}
244 | """
245 |
246 | def parse_ttr(self, ttr):
247 | if not ttr:
248 | return "N/A"
249 |
250 | match = re.match(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+(?:\.\d+)?)S)?", ttr)
251 | if not match:
252 | return "Invalid"
253 |
254 | hours = int(match.group(1)) if match.group(1) else 0
255 | minutes = int(match.group(2)) if match.group(2) else 0
256 | seconds = float(match.group(3)) if match.group(3) else 0.0
257 | return str(int(timedelta(hours=hours, minutes=minutes, seconds=seconds).total_seconds())) + "s"
258 |
259 | def to_table(self, benchmark_status: BenchmarkStatus):
260 | results = benchmark_status.results
261 | table = []
262 |
263 | table.append("| Scenario Name | Description | Passed | Time To Resolve | Error | Message | Date |")
264 | table.append("|---------------|-------------|--------|-----------------|-------|---------|------|")
265 |
266 | for result in results:
267 | spec = result["spec"]
268 | name = spec["name"]
269 | description = spec["description"]
270 | passed = "✅" if spec["passed"] else "❌"
271 | errored = "Error" if spec["errored"] else "No error"
272 | ttr = self.parse_ttr(spec["ttr"])
273 | date = spec["date"]
274 | message_text = textwrap.shorten(spec["message"], width=50, placeholder="...")
275 | table.append(f"| {name} | {description} | {passed} | {ttr} | {errored} | {message_text} | {date} |")
276 |
277 | return "\n".join(table)
278 |
279 |
280 | def to_table_sre(self, benchmark_status: BenchmarkStatus):
281 | results = benchmark_status.results
282 | table = []
283 |
284 | table.append(
285 | "| Passed | Error | Trials | Date |"
286 | )
287 | table.append(
288 | "|--------|-------|--------|------|"
289 | )
290 |
291 | for result in results:
292 | spec = result["spec"]
293 | name = spec["name"]
294 | description = spec["description"]
295 | if not description or description == "":
296 | description = benchmark_status.scenario_name_description_map.get(name)
297 | passed = "✅" if spec["passed"] else "❌"
298 | errored = "Error" if spec["errored"] else "No error"
299 | date = spec["date"]
300 |
301 | try:
302 | message_data = json.loads(spec["message"])
303 |
304 | trials = message_data.get("trials", "N/A")
305 |
306 | # Extract diagnosis data (commented out from table but kept for potential future use)
307 | diagnosis = message_data.get("diagnosis", {})
308 | ntam_fault_localization = diagnosis.get("ntam_fault_localization", {}).get("mean", "N/A")
309 | ntam_fault_propagation = diagnosis.get("ntam_fault_propagation", {}).get("mean", "N/A")
310 | time_to_diagnosis = diagnosis.get("time_to_diagnosis", {}).get("mean", "N/A")
311 | duration_agent_tried = diagnosis.get("duration_agent_tried_for_diagnosis", {}).get("mean", "N/A")
312 |
313 | # Extract repair data (commented out from table but kept for potential future use)
314 | repair = message_data.get("repair", {})
315 | time_to_repair = repair.get("time_to_repair", {}).get("mean", "N/A")
316 | percent_resolved = repair.get("percent_resolved", "N/A")
317 |
318 | def format_value(value):
319 | if value == "N/A" or value is None:
320 | return "N/A"
321 | elif value == float('inf') or str(value) == "Infinity":
322 | return "∞"
323 | elif isinstance(value, (int, float)):
324 | return f"{value:.2f}"
325 | else:
326 | return str(value)
327 |
328 | trials_str = str(trials) if trials != "N/A" else "N/A"
329 | # Format diagnostic and repair values (commented out from table but kept for potential future use)
330 | # ntam_fault_localization_str = format_value(ntam_fault_localization)
331 | # ntam_fault_propagation_str = format_value(ntam_fault_propagation)
332 | # time_to_diagnosis_str = format_value(time_to_diagnosis)
333 | # duration_agent_tried_str = format_value(duration_agent_tried)
334 | # time_to_repair_str = format_value(time_to_repair)
335 | # percent_resolved_str = format_value(percent_resolved)
336 |
337 | except (json.JSONDecodeError, KeyError, TypeError) as e:
338 | # If JSON parsing fails or data is missing, use N/A for all fields
339 | trials_str = "N/A"
340 | # Commented out diagnostic and repair fields (kept for potential future use)
341 | # ntam_fault_localization_str = "N/A"
342 | # ntam_fault_propagation_str = "N/A"
343 | # time_to_diagnosis_str = "N/A"
344 | # duration_agent_tried_str = "N/A"
345 | # time_to_repair_str = "N/A"
346 | # percent_resolved_str = "N/A"
347 |
348 | table.append(
349 | f"| {passed} | {errored} | {trials_str} | {date} |"
350 | )
351 |
352 | return "\n".join(table)
353 |
354 | def main():
355 | parser = argparse.ArgumentParser()
356 | subparsers = parser.add_subparsers(dest="command", required=True)
357 |
358 | parser_parse = subparsers.add_parser("parse", help="Parse issues.json, filter by track flag, extract benchmark id")
359 | parser_parse.add_argument("-i", "--input", required=True, help="Input file (issues.json)")
360 | parser_parse.add_argument("-o", "--output", help="Output file (Default. stdout)")
361 | parser_parse.set_defaults(func=ParseCommand().exec)
362 |
363 | parser_status = subparsers.add_parser("status", help="Get progress and current results of the benchmark")
364 | parser_status.add_argument("-i", "--input", required=True, help="Input file (parsed issues)")
365 | parser_status.add_argument("-o", "--output", help="Output file (Default. stdout)")
366 | parser_status.set_defaults(func=StatusCommand().exec)
367 |
368 | parser_status = subparsers.add_parser("comment", help="Create comment from benchmark statuses")
369 | parser_status.add_argument("-i", "--input", required=True, help="Input file (benchmark_statuses.json)")
370 | parser_status.add_argument("-o", "--output", help="Output file (Default. stdout)")
371 | parser_status.set_defaults(func=CommentCommand().exec)
372 |
373 | args = parser.parse_args()
374 | args.func(args)
375 |
376 |
377 | if __name__ == "__main__":
378 | main()
379 |
--------------------------------------------------------------------------------