├── it_bench_arxiv.pdf ├── images ├── select-org.png ├── select-repo.png ├── go-to-github-app.png ├── sample_it_tasks.png ├── agent-issue-selection.png ├── benchmark-registration.png ├── agent-registration-done.png ├── agent-registration-email.png ├── agent-registration-fill.png ├── benchmark-registration-done.png ├── benchmark-registration-fill.png └── benchmark-registration-email.png ├── CONTRIBUTORS.md ├── .github ├── workflows │ ├── parse_issue.py │ ├── leaderboard_update.yaml │ ├── update_benchmark_status.yaml │ ├── update_agent_manifest.yaml │ ├── benchmark_registration.yaml │ ├── agent_registration.yaml │ ├── leaderboard.py │ └── update_benchmark_helper.py ├── ISSUE_TEMPLATE │ ├── benchmark.yaml │ ├── onboarding-sre.yaml │ └── onboarding.yaml └── GH_ACTIONS_DOCS.md ├── LEADERBOARD_CISO.md ├── .pre-commit-config.yaml ├── .secrets.baseline ├── LEADERBOARD_SRE.md ├── docs ├── leaderboard.md └── how-to-launch-benchmark-ciso.md ├── README.md └── LICENSE /it_bench_arxiv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/ITBench/HEAD/it_bench_arxiv.pdf -------------------------------------------------------------------------------- /images/select-org.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/select-org.png -------------------------------------------------------------------------------- /images/select-repo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/select-repo.png -------------------------------------------------------------------------------- /images/go-to-github-app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/go-to-github-app.png 
-------------------------------------------------------------------------------- /images/sample_it_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/sample_it_tasks.png -------------------------------------------------------------------------------- /images/agent-issue-selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/agent-issue-selection.png -------------------------------------------------------------------------------- /images/benchmark-registration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/benchmark-registration.png -------------------------------------------------------------------------------- /images/agent-registration-done.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/agent-registration-done.png -------------------------------------------------------------------------------- /images/agent-registration-email.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/agent-registration-email.png -------------------------------------------------------------------------------- /images/agent-registration-fill.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/agent-registration-fill.png -------------------------------------------------------------------------------- /images/benchmark-registration-done.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/benchmark-registration-done.png 
-------------------------------------------------------------------------------- /images/benchmark-registration-fill.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/benchmark-registration-fill.png -------------------------------------------------------------------------------- /images/benchmark-registration-email.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/ITBench/HEAD/images/benchmark-registration-email.png -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | - Saurabh Jha 3 | - Rohan Arora 4 | - Yuji Watanabe 5 | - Takumi Yanagawa 6 | - Yinfang Chen (UIUC - University of Illinois at Urbana-Champaign) 7 | - Jackson Clark (UIUC - University of Illinois at Urbana-Champaign) 8 | - Bhavya Bhavya 9 | - Mudit Verma 10 | - Harshit Kumar 11 | - Hirokuni Kitahara 12 | - Noah Zheutlin 13 | - Saki Takano 14 | - Divya Pathak 15 | - Felix George 16 | - Xinbo Wu (UIUC - University of Illinois at Urbana-Champaign) 17 | - Bekir O Turkkan 18 | - Gerard Vanloo 19 | - Michael Nidd 20 | - Ting Dai 21 | - Oishik Chatterjee 22 | - Pranjal Gupta 23 | - Suranjana Samanta 24 | - Pooja Aggarwal 25 | - Rong Lee 26 | - Pavankumar Murali 27 | - Jae-wook Ahn 28 | - Debanjana Kar 29 | - Ameet Rahane 30 | - Carlos Fonseca 31 | - Amit Paradkar 32 | - Yu Deng 33 | - Pratibha Moogi 34 | - Prateeti Mohapatra 35 | - Naoki Abe 36 | - Chandrasekhar Narayanaswami 37 | - Tianyin Xu (UIUC - University of Illinois at Urbana-Champaign) 38 | - Lav R. Varshney (UIUC - University of Illinois at Urbana-Champaign) 39 | - Ruchi Mahindru 40 | - Anca Sailer 41 | - Laura Shwartz 42 | - Daby Sow 43 | - Nicholas C. M. 
Fuller 44 | - Ruchir Puri 45 | -------------------------------------------------------------------------------- /.github/workflows/parse_issue.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import sys 4 | 5 | 6 | def parse_issue_body(issue_body: str): 7 | result = {} 8 | sections = re.split(r"^###\s+", issue_body, flags=re.MULTILINE) 9 | 10 | for section in sections: 11 | if not section.strip(): 12 | continue 13 | lines = section.strip().splitlines() 14 | if not lines: 15 | continue 16 | key = lines[0].strip() 17 | value_lines = lines[1:] 18 | 19 | if not value_lines: 20 | result[key] = "" 21 | continue 22 | 23 | value_lines = [x for x in value_lines if x != ''] 24 | if all(re.match(r"^- \[[ xX]\] ", line) for line in value_lines): 25 | options = {} 26 | for line in value_lines: 27 | match = re.match(r"^- \[([ xX])\] (.+)", line) 28 | if match: 29 | checked = match.group(1).lower() == 'x' 30 | label = match.group(2).strip() 31 | options[label] = checked 32 | result[key] = options 33 | else: 34 | value = "\n".join(value_lines).strip() 35 | result[key] = value 36 | 37 | print(json.dumps(result)) 38 | 39 | if __name__ == "__main__": 40 | issue_body = sys.stdin.read() 41 | parse_issue_body(issue_body) 42 | -------------------------------------------------------------------------------- /LEADERBOARD_CISO.md: -------------------------------------------------------------------------------- 1 | ## 📊 IT Bench Leaderboard (CISO) 2 | This leaderboard shows the performance of agents on CISO-related IT automation scenarios. 3 | For details on how to participate or interpret results, see the [README](../main/README.md). 
4 | 5 | **Column Descriptions:** 6 | - *Score*: Average benchmark score across scenarios (1.0 = perfect) 7 | - *#Passed*: Number of scenarios successfully passed 8 | - *Mean Agent Execution Duration*: Average time taken across scenarios 9 | - *Scenario Category*: Categories of evaluated tasks (e.g., RHEL, Kyverno, etc.) 10 | 11 | Updated on: 02/05/2025 18:06:54 12 | 13 | --- 14 | 15 | | Agent Name | Agent Submitter | Organization | Scenario Category | Score ⬆️ | #Passed | Mean Agent Execution Duration | Date (UTC) | Issue Link | 16 | |--------------|-----------------|--------------|-------------------|----------|------------------|----------------------------|------------|------------| 17 | | ciso-agent-expert-rhel9-opa | [xinbowu2](https://github.com/xinbowu2) | University of Illinois at Urbana-Champaign (UIUC) | Gen-CIS-b-RHEL9-Ansible-OPA | 0.30 | 3 | 134s | 02/05/2025 05:51:40 | [#30](https://github.com/itbench-hub/ITBench/issues/30) | 18 | | pre-release-agent-2025-0428 | [yana1205](https://github.com/yana1205) | IBM Research - Tokyo | Gen-CIS-b-K8s-Kyverno | 0.20 | 2 | 109s | 28/04/2025 23:08:42 | [#28](https://github.com/itbench-hub/ITBench/issues/28) | 19 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # This is an example configuration to enable detect-secrets in the pre-commit hook. 2 | # Add this file to the root folder of your repository. 3 | # 4 | # Read pre-commit hook framework https://pre-commit.com/ for more details about the structure of config yaml file and how git pre-commit would invoke each hook. 5 | # 6 | # This line indicates we will use the hook from ibm/detect-secrets to run scan during committing phase. 
7 | repos: 8 | - repo: https://github.com/ibm/detect-secrets 9 | # If you desire to use a specific version of detect-secrets, you can replace `master` with other git revisions such as branch, tag or commit sha. 10 | # You are encouraged to use static refs such as tags, instead of branch name 11 | # 12 | # Running "pre-commit autoupdate" automatically updates rev to latest tag 13 | rev: 0.13.1+ibm.62.dss 14 | hooks: 15 | - id: detect-secrets # pragma: whitelist secret 16 | # Add options for detect-secrets-hook binary. You can run `detect-secrets-hook --help` to list out all possible options. 17 | # You may also run `pre-commit run detect-secrets` to preview the scan result. 18 | # when "--baseline" without "--use-all-plugins", pre-commit scan with just plugins in baseline file 19 | # when "--baseline" with "--use-all-plugins", pre-commit scan with all available plugins 20 | # add "--fail-on-unaudited" to fail pre-commit for unaudited potential secrets 21 | args: [--baseline, .secrets.baseline, --use-all-plugins] 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/benchmark.yaml: -------------------------------------------------------------------------------- 1 | name: IT Bench new benchmark request 2 | description: Request for a new benchmark to be run. 3 | title: "[Registration]: < agent and benchmark name here >" 4 | labels: ["benchmark"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thank you for your interest in benchmarking an IT Bench Agent. 10 | Please fill out this form to request for a new benchmark to be set up for your agent 11 | 12 | ## Important! 13 | 14 | Before you submit this form, you need to have already registered your agent using the agent registration issue template. 15 | Currently, please use the same GitHub account to open the benchmark issue as the one used for the agent registration. 
16 | 17 | You can register your agent using either of the following links: 18 | - [SRE Agent Registration](../itbench/issues/new?template=onboarding-sre.yaml) 19 | - [CISO Agent Registration](../itbench/issues/new?template=onboarding.yaml) 20 | 21 | - type: input 22 | id: repo_url 23 | attributes: 24 | label: "Config Repo" 25 | description: | 26 | Provide the GitHub Repository URL where your agent configuration is stored (this is the same repo used in the registration step.) 27 | placeholder: "e.g. https://github.com/your_org/repo_name" 28 | validations: 29 | required: true 30 | 31 | - type: input 32 | id: benchmark-name 33 | attributes: 34 | label: Benchmark Name 35 | placeholder: my-new-benchmark 36 | validations: 37 | required: true 38 | - type: dropdown 39 | id: schedule-now 40 | attributes: 41 | label: Schedule Now 42 | description: Do you want this benchmark to be immediately scheduled? 43 | options: 44 | - 'true' 45 | - 'false' 46 | default: 0 47 | validations: 48 | required: true 49 | 50 | 51 | 52 | - type: markdown 53 | attributes: 54 | value: Thank you for completing this form, we will review your request shortly. 
-------------------------------------------------------------------------------- /.secrets.baseline: -------------------------------------------------------------------------------- 1 | { 2 | "exclude": { 3 | "files": null, 4 | "lines": null 5 | }, 6 | "generated_at": "2025-05-01T17:44:35Z", 7 | "plugins_used": [ 8 | { 9 | "name": "AWSKeyDetector" 10 | }, 11 | { 12 | "name": "ArtifactoryDetector" 13 | }, 14 | { 15 | "name": "AzureStorageKeyDetector" 16 | }, 17 | { 18 | "base64_limit": 4.5, 19 | "name": "Base64HighEntropyString" 20 | }, 21 | { 22 | "name": "BasicAuthDetector" 23 | }, 24 | { 25 | "name": "BoxDetector" 26 | }, 27 | { 28 | "name": "CloudantDetector" 29 | }, 30 | { 31 | "ghe_instance": "github.ibm.com", 32 | "name": "GheDetector" 33 | }, 34 | { 35 | "name": "GitHubTokenDetector" 36 | }, 37 | { 38 | "hex_limit": 3, 39 | "name": "HexHighEntropyString" 40 | }, 41 | { 42 | "name": "IbmCloudIamDetector" 43 | }, 44 | { 45 | "name": "IbmCosHmacDetector" 46 | }, 47 | { 48 | "name": "JwtTokenDetector" 49 | }, 50 | { 51 | "keyword_exclude": null, 52 | "name": "KeywordDetector" 53 | }, 54 | { 55 | "name": "MailchimpDetector" 56 | }, 57 | { 58 | "name": "NpmDetector" 59 | }, 60 | { 61 | "name": "PrivateKeyDetector" 62 | }, 63 | { 64 | "name": "SlackDetector" 65 | }, 66 | { 67 | "name": "SoftlayerDetector" 68 | }, 69 | { 70 | "name": "SquareOAuthDetector" 71 | }, 72 | { 73 | "name": "StripeDetector" 74 | }, 75 | { 76 | "name": "TwilioKeyDetector" 77 | } 78 | ], 79 | "results": { 80 | "docs/how-to-launch-benchmark-ciso.md": [ 81 | { 82 | "hashed_secret": "d1da57683505716a1a8716658c4432742355360a", 83 | "is_verified": false, 84 | "line_number": 15, 85 | "type": "Secret Keyword", 86 | "verified_result": null 87 | } 88 | ] 89 | }, 90 | "version": "0.13.1+ibm.62.dss", 91 | "word_list": { 92 | "file": null, 93 | "hash": null 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/onboarding-sre.yaml: 
-------------------------------------------------------------------------------- 1 | name: IT Bench Agent Registration (SRE) 2 | description: Register your SRE ITBench agent for benchmarking 3 | title: "[Registration - SRE]: < agent name here >" 4 | labels: ["registration"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thank you for your interest in benchmarking a SRE ITBench Agent. 10 | Please fill out this form to request connection details for the IT Bench service. 11 | 12 | ## Important! 13 | 14 | Before you submit this form, you need to have completed the following tasks 15 | (See also [Getting Started](../itbench-leaderboard?tab=readme-ov-file#prerequisites)): 16 | 17 | 1. Create an empty repository in GitHub with visibility set to private 18 | 2. Install the [ibm-itbench](https://github.com/apps/ibm-itbench-github-app) app into that repository. 19 | 3. Make sure that the person submitting this issue is added as a collaborator to that repository. 20 | 21 | Once those two steps have been completed, please complete this form and provide the URL for the 22 | repository you created in the relevant section. 23 | 24 | ### Reference 25 | 26 | You can find examples of expected agent actions and outputs in the sample scenario repository: 27 | https://github.com/IBM/ITBench-Scenarios/blob/main/sre/docs/incident_scenarios.md 28 | 29 | --- 30 | 31 | - type: input 32 | id: agent-name 33 | attributes: 34 | label: Agent Name 35 | description: Please avoid using spaces in the name. 36 | placeholder: my-itbench-agent 37 | validations: 38 | required: true 39 | - type: dropdown 40 | id: agent-type 41 | attributes: 42 | label: Agent Type 43 | description: What type of agent is this? 44 | options: 45 | - SRE 46 | default: 0 47 | validations: 48 | required: true 49 | - type: dropdown 50 | id: agent-level 51 | attributes: 52 | label: Agent Level 53 | description: What level of agent is this? 
54 | options: 55 | - Beginner 56 | - Intermediate 57 | - Expert 58 | default: 0 59 | validations: 60 | required: true 61 | - type: checkboxes 62 | id: scenario-categories 63 | attributes: 64 | label: Agent Scenarios 65 | description: You may select more than one, options not applicable to the agent type will be ignored. 66 | options: 67 | - label: Change 68 | - label: Configuration Setting 69 | - label: Latency 70 | - label: Resource Unavailable 71 | - label: Other 72 | - type: input 73 | id: repo_url 74 | attributes: 75 | label: "Config Repo" 76 | description: | 77 | Provide the GitHub Repository URL that we will create data required for benchmark. 78 | Please install the [ibm-itbench](https://github.com/apps/ibm-itbench-github-app) GitHub App in the repository before submitting this form!. 79 | placeholder: "e.g. https://github.com/your_org/repo_name" 80 | validations: 81 | required: true 82 | 83 | - type: markdown 84 | attributes: 85 | value: Thank you for completing this form, we will review your request shortly. 86 | -------------------------------------------------------------------------------- /LEADERBOARD_SRE.md: -------------------------------------------------------------------------------- 1 | ## 📊 IT Bench Leaderboard (SRE) 2 | This leaderboard shows the performance of agents on SRE-related IT automation scenarios. 3 | 4 | **Column Descriptions:** 5 | - *Diagnosis - NTAM Fault Localization*: Normalized Topology Aware Metric (NTAM) Average Fault Propagation Chain 6 | - *Diagnosis - NTAM Fault Propagation*: NTAM Average Fault Localization 7 | - *% Resolved*: Percentage of incidents repaired (mitigation efficiency) 8 | 9 | Updated on: 02/05/2025 18:06:54 10 | 11 | ### Single Trial 12 | For details on how to participate or interpret results, see the [README](/README.md). 
13 | 14 | --- 15 | 16 | | Agent (Name) | Agent Submitter | Organization | Scenario Category | Trials across incidents | Diagnosis - NTAM Fault Localization | Diagnosis - NTAM Fault Propagation | Diagnosis - Time to Diagnosis | Diagnosis - Duration agent tried for Diagnosis | Repair - Time to Repair | % Resolved | Date (UTC) | Issue Link | 17 | |--------------|-----------------|--------------|-------------------|-------------------------|-------------------------------------|------------------------------------|-------------------------------|------------------------------------------------|-------------------------|------------|------------|------------| 18 | | ITBench-SRE-Agent-GPT-4o | [ITBench-SRE-Agent](https://github.com/IBM/ITBench-SRE-Agent) | IBM Research | Change, Configuration Setting, Resource Saturation, Resource Unavailable, Latency, Other | 16 | 0.33 ± 0.08 (σ=0.31) | 0.29 ± 0.06 (σ=0.23) | 69.82 ± 11.30 (σ=15.98) | 70.38 ± 4.98 (σ=19.91) | 220.15 ± 27.25 (σ=54.51) | 25.00 | 19 | | ITBench-SRE-Agent-Granite-3-2 | [ITBench-SRE-Agent](https://github.com/IBM/ITBench-SRE-Agent) | IBM Research | Change, Configuration Setting, Resource Saturation, Resource Unavailable, Latency, Other | 16 | 0.19 ± 0.06 (σ=0.26) | 0.21 ± 0.05 (σ=0.21) | 96.47 ± NaN (σ=NaN) | 93.75 ± 15.90 (σ=63.59) | ∞ ± 0.00 (σ=0.00) | 0.00 | 20 | | ITBench-SRE-Agent-LLama-3-3-70B | [ITBench-SRE-Agent](https://github.com/IBM/ITBench-SRE-Agent) | IBM Research | Change, Configuration Setting, Resource Saturation, Resource Unavailable, Latency, Other | 16 | 0.14 ± 0.04 (σ=0.15) | 0.21 ± 0.04 (σ=0.16) | ∞ ± 0.00 (σ=0.00) | 63.36 ± 3.43 (σ=13.71) | 193.19 ± 1.25 (σ=1.76) | 12.50 | 21 | 22 | ### Multiple Trials (Limited availability; expected general availability (GA) in July, 2025) 23 | 24 | --- 25 | 26 | | Agent (Name) | Agent Submitter | Organization | Scenario Category | Trials across incidents | Diagnosis - NTAM Fault Localization | Diagnosis - NTAM Fault Propagation | Diagnosis - Time to 
Diagnosis | Diagnosis - Duration agent tried for Diagnosis | Repair - Time to Repair | % Resolved | Date (UTC) | Issue Link | 27 | |--------------|-----------------|--------------|-------------------|-------------------------|-------------------------------------|------------------------------------|-------------------------------|------------------------------------------------|-------------------------|------------|------------|------------| 28 | | ITBench-SRE-Agent-GPT-4o | [ITBench-SRE-Agent](https://github.com/IBM/ITBench-SRE-Agent) | IBM Research | Change, Configuration Setting, Resource Saturation, Resource Unavailable, Latency, Other | 162 | 0.36 ± 0.07 (σ=0.29) | 0.29 ± 0.03 (σ=0.13) | 117.27 ± 36.62 (σ=73.25) | 86.49 ± 8.88 (σ=36.60) | 204.81 ± 9.88 (σ=31.24) | 24.79 | 29 | -------------------------------------------------------------------------------- /.github/workflows/leaderboard_update.yaml: -------------------------------------------------------------------------------- 1 | name: Leaderboard Update 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | use-sample: 6 | type: boolean 7 | required: false 8 | description: If set, display leaderboard with sample data 9 | benchmark-id: 10 | type: string 11 | required: false 12 | description: If set, display leaderboard of the provided benchmark id 13 | github-username: 14 | type: string 15 | required: false 16 | description: If set, display leaderboard of the provided github username 17 | jobs: 18 | update_leaderboard: 19 | runs-on: ubuntu-latest 20 | environment: onboarding 21 | name: Update the Leaderboard 22 | steps: 23 | - name: Checkout Repository 24 | uses: actions/checkout@v2 25 | 26 | - name: List Issues of Finished Benchmark 27 | env: 28 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 29 | GH_REPO: ${{ github.repository }} 30 | run: | 31 | gh issue list --label "benchmark" --state "closed" --json number,author,comments > issues.json 32 | jq -c '[.[].number]' issues.json 33 | 34 | usernames=($(jq -r 
'.[].author.login' issues.json | sort -u)) 35 | query='{'$'\n' 36 | for i in "${!usernames[@]}"; do 37 | login="${usernames[$i]}" 38 | query+=" u$((i+1)): user(login: \"${login}\") { login company }"$'\n' 39 | done 40 | query+='}' 41 | 42 | gh api graphql -f query="$query" | jq -r ' 43 | .data | 44 | to_entries | 45 | map({ key: .value.login, value: { company: .value.company } }) | 46 | from_entries 47 | ' > users.json 48 | 49 | - name: Pull Leaderboard data 50 | env: 51 | ITBENCH_API: ${{vars.ITBENCH_API}} 52 | ITBENCH_API_TOKEN: ${{ secrets.ITBENCH_API_TOKEN }} 53 | GH_REPO: ${{ github.repository }} 54 | USE_SAMPLE: ${{ github.event.inputs.use-sample }} 55 | BENCHMARK_ID: ${{ github.event.inputs.benchmark-id }} 56 | GITHUB_USERNAME: ${{ github.event.inputs.github-username }} 57 | run: | 58 | 59 | echo "Parse gh issues" 60 | python ./.github/workflows/update_benchmark_helper.py parse -i issues.json -o updated_issues.json 61 | 62 | echo "Requesting Leaderboard data from API" 63 | 64 | if [ "$USE_SAMPLE" == "true" ]; then 65 | python ./.github/workflows/leaderboard.py global --sample -b $BENCHMARK_ID -u $GITHUB_USERNAME --issues updated_issues.json --users users.json --out-overall LEADERBOARD.md --out-ciso LEADERBOARD_CISO.md --out-sre LEADERBOARD_SRE.md 66 | else 67 | python ./.github/workflows/leaderboard.py global --issues updated_issues.json --users users.json --out-overall LEADERBOARD.md --out-ciso LEADERBOARD_CISO.md --out-sre LEADERBOARD_SRE.md 68 | fi 69 | 70 | - name: Open PR 71 | env: 72 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 73 | run: | 74 | git config --global user.name "GitHub Actions" 75 | git config --global user.email "actions@github.com" 76 | 77 | git checkout -b leaderboard 78 | 79 | git add LEADERBOARD_CISO.md LEADERBOARD_SRE.md 80 | 81 | git commit -m "chore: update leaderboard data" 82 | 83 | git push origin leaderboard -f 84 | 85 | gh pr create \ 86 | --base main \ 87 | --head leaderboard \ 88 | --title "chore: update leaderboard data" \ 
89 | --body "This PR updates the leaderboard automatically via GitHub Actions." -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/onboarding.yaml: -------------------------------------------------------------------------------- 1 | name: IT Bench Agent Registration (CISO) 2 | description: Register your CISO ITBench agent for benchmarking 3 | title: "[Registration - CISO]: < agent name here >" 4 | labels: ["registration"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thank you for your interest in benchmarking a CISO ITBench Agent. 10 | Please fill out this form to request connection details for the IT Bench service. 11 | 12 | ## Important! 13 | 14 | Before you submit this form, you need to have completed the following tasks 15 | (See also [Getting Started](../itbench-leaderboard?tab=readme-ov-file#prerequisites)): 16 | 17 | 1. Create an empty repository in GitHub with visibility set to private 18 | 2. Install the [ibm-itbench](https://github.com/apps/ibm-itbench-github-app) app into that repository. 19 | 3. Make sure that the person submitting this issue is added as a collaborator to that repository. 20 | 21 | Once those two steps have been completed, please complete this form and provide the URL for the 22 | repository you created in the relevant section. 23 | 24 | ### Reference 25 | 26 | You can find examples of expected agent actions and outputs in the sample scenario repository: 27 | https://github.com/IBM/ITBench-Scenarios/blob/main/ciso/README.md#scenarios 28 | 29 | --- 30 | 31 | - type: input 32 | id: agent-name 33 | attributes: 34 | label: Agent Name 35 | description: Please avoid using spaces in the name. 36 | placeholder: my-itbench-agent 37 | validations: 38 | required: true 39 | - type: dropdown 40 | id: agent-type 41 | attributes: 42 | label: Agent Type 43 | description: What type of agent is this? 
44 | options: 45 | - CISO 46 | default: 0 47 | validations: 48 | required: true 49 | - type: dropdown 50 | id: agent-level 51 | attributes: 52 | label: Agent Level 53 | description: | 54 | Select the level of scenarios you want your Agent to participate in. 55 | **Important:** Categories depend on the Level. Please follow these rules: 56 | - Beginner: only "Kubernetes in Kyverno" 57 | - Intermediate: only "Kubernetes in OPA" 58 | - Expert: "Kubernetes in Kyverno Update" and "RHEL9 in OPA" 59 | options: 60 | - Beginner 61 | - Intermediate 62 | - Expert 63 | default: 0 64 | validations: 65 | required: true 66 | - type: checkboxes 67 | id: scenario-categories 68 | attributes: 69 | label: Agent Scenarios 70 | description: | 71 | Choose the scenario categories for your Agent. 72 | **Please select only the categories that match your Level above.** 73 | - Beginner → Kubernetes in Kyverno 74 | - Intermediate → Kubernetes in OPA 75 | - Expert → Kubernetes in Kyverno Update, RHEL9 in OPA 76 | options: 77 | - label: Kubernetes in Kyverno 78 | - label: Kubernetes in OPA 79 | - label: Kubernetes in Kyverno Update 80 | - label: RHEL9 in OPA 81 | - type: input 82 | id: repo_url 83 | attributes: 84 | label: "Config Repo" 85 | description: | 86 | Provide the GitHub Repository URL that we will create data required for benchmark. 87 | Please install the [ibm-itbench](https://github.com/apps/ibm-itbench-github-app) GitHub App in the repository before submitting this form!. 88 | placeholder: "e.g. https://github.com/your_org/repo_name" 89 | validations: 90 | required: true 91 | 92 | - type: markdown 93 | attributes: 94 | value: Thank you for completing this form, we will review your request shortly. 
95 | -------------------------------------------------------------------------------- /.github/workflows/update_benchmark_status.yaml: -------------------------------------------------------------------------------- 1 | name: Update Benchmark Status 2 | on: 3 | schedule: 4 | - cron: "*/10 * * * *" 5 | issue_comment: 6 | types: [created] 7 | workflow_dispatch: 8 | 9 | env: 10 | REQUEST_TIMEOUT: ${{ vars.REQUEST_TIMEOUT }} 11 | 12 | jobs: 13 | update_status: 14 | runs-on: ubuntu-latest 15 | environment: onboarding 16 | name: Update the Benchmark Progress 17 | steps: 18 | - name: Determine Trigger Type 19 | id: check_trigger 20 | run: | 21 | if [[ "${{ github.event_name }}" == "issue_comment" ]]; then 22 | COMMENT_BODY=$(jq -r '.comment.body' "$GITHUB_EVENT_PATH") 23 | if [[ "$COMMENT_BODY" == "/refresh" ]]; then 24 | echo "TRIGGER=issue_comment" >> $GITHUB_ENV 25 | ISSUE_NUMBER=$(jq -r '.issue.number' "$GITHUB_EVENT_PATH") 26 | echo "ISSUE_NUMBER=$ISSUE_NUMBER" >> $GITHUB_ENV 27 | else 28 | echo "Not a /refresh command, skipping." 
29 | exit 0 30 | fi 31 | else 32 | echo "TRIGGER=schedule" >> $GITHUB_ENV 33 | fi 34 | - name: Checkout Repository 35 | uses: actions/checkout@v2 36 | - name: List Issues with 'track-progress' Label 37 | env: 38 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 39 | GH_REPO: ${{ github.repository }} 40 | run: | 41 | if [[ "$TRIGGER" == "schedule" ]]; then 42 | echo "Scheduled task: List all issues with the 'track-progress' label" 43 | gh issue list --label "track-progress" --state "open" --json number,author,comments > issues.json 44 | elif [[ "$TRIGGER" == "issue_comment" ]]; then 45 | echo "Issue comment trigger: Store only the commented issue" 46 | gh issue view "$ISSUE_NUMBER" --json number,author,comments | jq '[.]' > issues.json 47 | fi 48 | echo "Tracked issues" 49 | jq -c '[.[].number]' issues.json 50 | - name: Process and Update Status 51 | env: 52 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 53 | GH_REPO: ${{ github.repository }} 54 | ITBENCH_API: ${{vars.ITBENCH_API}} 55 | ITBENCH_API_TOKEN: ${{ secrets.ITBENCH_API_TOKEN }} 56 | run: | 57 | if [ ! -s issues.json ]; then 58 | echo "No issues found." 
59 | exit 0 60 | fi 61 | echo "Parse gh issues" 62 | python ./.github/workflows/update_benchmark_helper.py parse -i issues.json -o updated_issues.json 63 | echo "Fetch benchmark status" 64 | python ./.github/workflows/update_benchmark_helper.py status -i updated_issues.json -o benchmark_statuses.json 65 | echo "Generate benchmark status comment" 66 | python .github/workflows/update_benchmark_helper.py comment -i benchmark_statuses.json -o benchmark_status_comments.jsonl 67 | 68 | echo "Update each issues" 69 | cat benchmark_status_comments.jsonl | while IFS= read -r line 70 | do 71 | number=$(printf "%s" "$line" | jq -r '.number') 72 | if [[ -z "$number" ]]; then 73 | continue 74 | fi 75 | 76 | status_comment_id=$(printf "%s" "$line" | jq -r '.status_comment_id') 77 | closed=$(printf "%s" "$line" | jq -r '.closed') 78 | body=$(printf "%s" "$line" | jq -r '.comment') 79 | 80 | if [[ "$status_comment_id" == "null" ]]; then 81 | echo " Creating new comment for issue #$number" 82 | gh issue comment "$number" --body "$body" 83 | else 84 | echo " Editing comment $status_comment_id for issue #$number" 85 | gh api --silent -X PATCH /repos/${GH_REPO}/issues/comments/${status_comment_id} -F "body=${body}" 86 | fi 87 | 88 | if [[ "$closed" == "true" ]]; then 89 | echo " Close the issue #$number" 90 | gh issue close $number 91 | fi 92 | done 93 | 94 | -------------------------------------------------------------------------------- /.github/workflows/update_agent_manifest.yaml: -------------------------------------------------------------------------------- 1 | name: Update Agent Manifest 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | benchmark-issue: 7 | type: number 8 | required: true 9 | description: Please input the benchmark issue number 10 | 11 | jobs: 12 | update_agent_manifest: 13 | runs-on: ubuntu-latest 14 | environment: onboarding 15 | name: Update Agent Manifest 16 | steps: 17 | - name: Get Agent Repo 18 | id: get-agent-repo 19 | env: 20 | GITHUB_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} 21 | GH_REPO: ${{ github.repository }} 22 | BENCHMARK_ISSUE_NUMBER: ${{ github.event.inputs.benchmark-issue }} 23 | run: | 24 | config_repo_url="$(gh issue view $BENCHMARK_ISSUE_NUMBER --json body -q .body | grep -A2 '### Config Repo' | tail -n1)" 25 | echo "Agent Repo: $config_repo_url" 26 | github_username="$(gh issue view $BENCHMARK_ISSUE_NUMBER --json author -q .author.login)" 27 | echo "GitHub Username: $github_username" 28 | agent_repo_owner="$(echo $config_repo_url | awk -F/ '{print $4}')" 29 | agent_repo_name="$(echo $config_repo_url | awk -F/ '{print $5}')" 30 | echo "Agent Repo Owner: $agent_repo_owner" 31 | echo "Agent Repo Name: $agent_repo_name" 32 | echo "agent_repo_owner=$agent_repo_owner" >> "$GITHUB_OUTPUT" 33 | echo "agent_repo_name=$agent_repo_name" >> "$GITHUB_OUTPUT" 34 | echo "github_username=$github_username" >> "$GITHUB_OUTPUT" 35 | 36 | - name: Generate GitHub token on behalf of repo 37 | id: generate-token 38 | uses: actions/create-github-app-token@v1 39 | with: 40 | app-id: ${{ vars.ITBENCH_APP_ID }} 41 | private-key: ${{ secrets.ITBENCH_APP_KEY }} 42 | owner: ${{ steps.get-agent-repo.outputs.agent_repo_owner}} 43 | repositories: ${{ steps.get-agent-repo.outputs.agent_repo_name}} 44 | 45 | - name: Update agent-manifest.json 46 | env: 47 | GH_TOKEN: ${{ steps.generate-token.outputs.token }} 48 | run: | 49 | cleanup() { 50 | echo "Cleaning up agent-manifest.json/agent-manifest.raw.json" 51 | rm -f agent-manifest.json agent-manifest.raw.json agent-manifest.new.json 52 | } 53 | trap cleanup EXIT 54 | trap cleanup SIGINT 55 | trap cleanup SIGTERM 56 | 57 | repo_full_path="repos/${{ steps.get-agent-repo.outputs.agent_repo_owner}}/${{ steps.get-agent-repo.outputs.agent_repo_name}}" 58 | gh api $repo_full_path/contents/agent-manifest.json -q '.content' | base64 -d > agent-manifest.json 59 | agent_id=$(jq -r .metadata.id agent-manifest.json) 60 | 61 | status_code=$(curl -s -X GET \ 62 | -H "Authorization: Bearer ${{ 
secrets.ITBENCH_API_TOKEN }}" \ 63 | -H "Content-type: application/json" \ 64 | "${{vars.ITBENCH_API}}/gitops/agents/$agent_id?github_username=${{ steps.get-agent-repo.outputs.github_username}}" \ 65 | --output agent-manifest.raw.json \ 66 | --write-out "%{http_code}") 67 | 68 | if [ "$status_code" -ne 200 ]; then 69 | echo "❌ API request failed with status $status_code" 70 | exit 1 71 | fi 72 | 73 | new_agent_token=$(jq -r '.spec.agent_manifest.token' agent-manifest.raw.json) 74 | if [ -z "$new_agent_token" ] || [ "$new_agent_token" = "null" ]; then 75 | echo "❌ Failed to extract agent token from response" 76 | exit 1 77 | fi 78 | 79 | jq --arg new_agent_token "$new_agent_token" -r '.token=$new_agent_token' agent-manifest.json > agent-manifest.new.json 80 | current_sha=$(gh api $repo_full_path/contents/agent-manifest.json -q '.sha' || echo "") 81 | gh api -X PUT \ 82 | -H "Accept: application/vnd.github.v3+json" \ 83 | $repo_full_path/contents/agent-manifest.json \ 84 | -f message="Update agent-manifest.json via API" \ 85 | -f content="$(cat agent-manifest.new.json | base64)" \ 86 | -f sha="$current_sha" -------------------------------------------------------------------------------- /docs/leaderboard.md: -------------------------------------------------------------------------------- 1 | # ITBench-Leaderboard 2 | 3 | ## 🌟 Explore the Leaderboards 4 | 5 | | Domain | Leaderboard | 6 | |--------|-------------| 7 | | 🔐 **CISO** | 👉 [View CISO Leaderboard](../LEADERBOARD_CISO.md) | 8 | | ⚙️ **SRE** | 👉 [View SRE Leaderboard](../LEADERBOARD_SRE.md) | 9 | 10 | ## Getting Started 11 | ### Prerequisites 12 | - **A private GitHub repository** 13 | - A file facilitating the agent and leaderboard handshake is pushed to this private repository. 14 | - The file(s) may be created or deleted automatically during the benchmark lifecycle. 
15 | - **A Kubernetes sandbox cluster (KinD recommended)** -- Only needed for CISO 16 | - Do not use a production cluster, because the benchmark process will create and delete resources dynamically. 17 | - Please refer to [prepare-kubeconfig-kind.md](https://github.com/itbench-hub/ITBench-Scenarios/blob/main/ciso/prepare-kubeconfig-kind.md) 18 | - **An agent to benchmark** 19 | - A base agent is available from IBM for immediate use. The base agent for the CISO use case can be found [here](https://github.com/itbench-hub/ITBench-CISO-CAA-Agent), and one for SRE and FinOps use cases can be found [here](https://github.com/itbench-hub/ITBench-SRE-Agent). This allows you to leverage your methodologies and make improvements without having to worry about interactions between the agent and leaderboard service. 20 | 21 | ### Setup 22 | 23 | #### Step 1. Install the ITBench GitHub App 24 | Install the ibm-itbench GitHub app into the private GitHub repository (see Prerequisites). 25 | 26 | 1. Go to the installation page [here](https://github.com/apps/ibm-itbench-github-app). 27 | 28 | go-to-github-app 29 | 2. Select your GitHub Organization. 30 | 31 | select-org 32 | 3. Select your Agent configuration repo. 33 | 34 | select-repo 35 | 36 | > ⚠️ **Note**: If the repository was created by someone else (e.g., a teammate), ensure that the GitHub account submitting the agent registration issue is added as a **collaborator**. 37 | 38 | #### Step 2. Register your agent 39 | In this step, you will register your agent information with ITBench. 40 | 41 | 1. Create a new registration issue. 42 | - Go to [Agent Registration Form](https://github.com/itbench-hub/ITBench/issues/new/choose) and create a new issue. 43 | ![agent-issue-selection](../images/agent-issue-selection.png) 44 | 2. 
Fill in the issue template with the following information: 45 | - Agent Name: Your agent name 46 | - Agent Level: "Beginner" 47 | - Agent Scenarios: "Kubernetes in Kyverno" 48 | - Config Repo: URL for your agent configuration repo 49 | (You may adjust the settings depending on the scenarios or agent level.) 50 | 51 | agent-registration-fill 52 | 3. Submit the issue. 53 | - Click "Create" to submit your registration request. 54 | - Once your request is approved: 55 | - An approved label will be attached to your issue. 56 | - A comment will be added with a link to the generated agent configuration file stored in the specified configuration repository. 57 | Download the linked configuration file to proceed. 58 | 59 | agent-registration-done 60 | - If you subscribe to the issue, you will also receive email notifications. 61 | 62 | agent-registration-email 63 | 64 | If there are any problems with your submission, we will respond directly on the issue. 65 | If you do not receive any response within a couple of days, please reach out to the [maintainers](../README.md#contacts). 66 | 67 | #### Step 3. Create a benchmark request 68 | In this step, you will register your benchmark entry. 69 | 1. Create a new benchmark issue. 70 | - Go to [Benchmark Registration Form](https://github.com/itbench-hub/ITBench/issues/new/choose) and create a new issue. 71 | - Currently, please use the **same GitHub account** that you used for the agent registration issue. 72 | (This is currently required for the system to correctly associate your benchmark request.) 73 | 74 | benchmark-registration 75 | 2. Fill in the issue template. 76 | - The name for the Config Repo must match the repository you used during agent registration. 77 | 78 | benchmark-registration-fill 79 | 3. Submit the issue. 80 | - Click "Create" to submit your registration request. Once your request is approved: 81 | - An approved label will be attached to your issue. 
82 | - The issue comment will be updated with your Benchmark ID. 83 | 84 | benchmark-registration-done 85 | - If you subscribe to the issue, you will also receive email notifications. 86 | 87 | benchmark-registration-email 88 | 89 | If there are any problems with your submission, we will respond directly on the issue. 90 | If you do not receive any response within a couple of days, please reach out to the [maintainers](../README.md#contacts). 91 | 92 | ### Running your agent or our base agent against the benchmark 93 | You can run either your own custom agent or one of our built-in agents against the ITBench benchmark. 94 | 95 | The following guides and videos demonstrate how to run the benchmark using our built-in agents. These may also serve as helpful references when setting up your own agent: 96 | 97 | - **CISO Agent** – [Documentation](../docs/how-to-launch-benchmark-ciso.md) ・ [Demo Video](https://ibm.box.com/s/3i7mapxyit7ugnbldigqunzs6bkvv4cy) 98 | - **SRE Agent** – [Documentation](https://github.com/itbench-hub/ITBench-SRE-Agent/blob/main/Leaderboard.md) 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ITBench 2 | 3 | **[Paper](./it_bench_arxiv.pdf) | [Leaderboard](#leaderboard) | [Scenarios](#scenarios) | [Agents](#agents) | [How to Cite](#how-to-cite) | [Contributors](./CONTRIBUTORS.md) | [Contacts](#contacts)** 4 | 5 | --- 6 | 7 | ## 📢 Announcements 8 | 9 | ### Latest Updates 10 | - **[June 13, 2025]** Identified 25+ additional scenarios to be developed over the summer. 11 | - **[May 2, 2025]** 🚀 ITBench now provides **fully-managed scenario environments** for everyone! Our platform handles the complete workflow—from scenario deployment to agent evaluation and leaderboard updates. Visit our GitHub repository [here](https://github.com/ibm/ITBench-Leaderboard) for guidelines and get started today. 
12 | - **[February 28, 2025]** 🏆 **Limited Access Beta**: Invite-only access to the ITBench hosted scenario environments. ITBench handles scenario deployment, agent evaluation, and leaderboard updates. To request access, e-mail us [here](mailto:agent-bench-automation@ibm.com). 13 | - **[February 7, 2025]** 🎉 **Initial release!** Includes research paper, self-hosted environment setup tooling, sample scenarios, and baseline agents. 14 | 15 | --- 16 | 17 | ## Overview 18 | 19 | ITBench measures the performance of AI agents across a wide variety of **complex and real-world inspired IT automation tasks** targeting three key use cases: 20 | 21 | | Use Case | Focus Area | 22 | |----------|------------| 23 | | **SRE** (Site Reliability Engineering) | Availability and resiliency | 24 | | **CISO** (Compliance & Security Operations) | Compliance and security enforcement | 25 | | **FinOps** (Financial Operations) | Cost efficiencies and ROI optimization | 26 | 27 | ![sample_tasks](./images/sample_it_tasks.png) 28 | 29 | ### Key Features 30 | 31 | - **Real-world representation** of IT environments and incident scenarios 32 | - **Open, extensible framework** with comprehensive IT coverage 33 | - **Push-button workflows** and interpretable metrics 34 | - **Kubernetes-based** scenario environments 35 | 36 | ### What's Included 37 | 38 | ITBench enables researchers and developers to replicate real-world incidents in Kubernetes environments and develop AI agents to address them. 39 | 40 | **We provide:** 41 | 1. **Push-button deployment tooling** for environment setup *(open-source)* 42 | 2. **Framework for recreating realistic IT scenarios using the deployment tooling:** 43 | - **6 SRE scenarios** and **21 mechanisms** *(open-source)* 44 | - **4 categories of CISO scenarios** *(open-source)* 45 | - **1 FinOps scenario** *(open-source)* 46 | 3. 
**Two reference AI agents:** 47 | - SRE (Site Reliability Engineering) Agent *(open-source)* 48 | - CISO (Chief Information Security Officer) Agent *(open-source)* 49 | 4. **Fully-managed leaderboard** for agent evaluation and comparison 50 | 51 | --- 52 | 53 | ## Roadmap 54 | 55 | | Timeline | Key Deliverables | 56 | |----------|------------------| 57 | | **July 2025** | • Refactor leading to a scenario specification generator and runner allowing for most (if not all) mechanisms to be re-used across diverse applications and microservices
• Implementation of 10 of the additional scenarios identified | 58 | | **August 2025** | • **SRE-Agent-Lite**: Lightweight agent to assist non-systems personnel with environment debugging
• **Snapshot & Replay**: Data capture and replay capabilities
• Implementation of 15 of the additional scenarios to be developed over the summer| 59 | | **Fall 2025** | **BYOA (Bring Your Own Application)**: Support for custom application integration | 60 | 61 | --- 62 | 63 | ## Leaderboard 64 | 65 | The ITBench Leaderboard tracks agent performance across SRE, FinOps, and CISO scenarios. We provide fully managed scenario environments while researchers/developers run their agents on their own systems and submit their outputs for evaluation. 66 | 67 | | Domain | Leaderboard | 68 | |--------|-------------| 69 | | **SRE** | [View SRE Leaderboard](https://github.com/itbench-hub/ITBench/blob/main/LEADERBOARD_SRE.md) | 70 | | **CISO** | [View CISO Leaderboard](https://github.com/itbench-hub/ITBench/blob/main/LEADERBOARD_CISO.md) | 71 | 72 | > **Get Started**: Visit [docs/leaderboard.md](docs/leaderboard.md) for access and evaluation guidelines. 73 | 74 | --- 75 | 76 | ## Scenarios 77 | 78 | ITBench incorporates a collection of problems that we call **scenarios**. Each scenario is deployed in an operational environment where specific problems occur. 79 | 80 | ### Examples of Scenarios 81 | - **SRE**: Resolve "High error rate on service checkout" in a Kubernetes environment 82 | - **CISO**: Assess compliance posture for "new control rule detected for RHEL 9" 83 | - **FinOps**: Identify and resolve cost overruns and anomalies 84 | 85 | **Find all scenarios**: [Scenarios repository](https://github.com/IBM/ITBench-Scenarios) 86 | 87 | --- 88 | 89 | ## Agents 90 | 91 | Two baseline agents are being open-sourced with ITBench, built using the **CrewAI framework**. 
92 | 93 | ### Agent Features 94 | - **Configurable LLMs**: watsonx, Azure, or vLLM support 95 | - **Natural language tools**: Interactions with the environment for information gathering 96 | 97 | ### Available Agents 98 | 99 | | Agent | Repository | 100 | |-------|------------| 101 | | **SRE Agent** | [itbench-sre-agent](https://github.com/IBM/itbench-sre-agent) | 102 | | **CISO Agent** | [itbench-ciso-caa-agent](https://github.com/IBM/itbench-ciso-caa-agent) | 103 | 104 | --- 105 | 106 | ## How to Cite 107 | 108 | ```bibtex 109 | @misc{jha2025itbench, 110 | title={ITBench: Evaluating AI Agents across Diverse Real-World IT Automation Tasks}, 111 | author={Jha, Saurabh and Arora, Rohan and Watanabe, Yuji and others}, 112 | year={2025}, 113 | url={https://github.com/IBM/itbench-sample-scenarios/blob/main/it_bench_arxiv.pdf} 114 | } 115 | ``` 116 | 117 | --- 118 | 119 | ## Join the Discussion 120 | 121 | Have questions or need help getting started with ITBench? 122 | 123 | - [**Create a GitHub issue**](https://github.com/IBM/ITBench/issues/new) for bug reports or feature requests 124 | - [**Join our Discord community**](https://discord.gg/6fzy3JRHmt) for real-time discussions 125 | - For formal inquiries, please see the [contacts section](#contacts) 126 | 127 | --- 128 | 129 | ## Contacts 130 | 131 | - **General inquiries**: agent-bench-automation@ibm.com 132 | - **Saurabh Jha**: saurabh.jha@ibm.com 133 | - **Yuji Watanabe**: muew@jp.ibm.com 134 | -------------------------------------------------------------------------------- /.github/GH_ACTIONS_DOCS.md: -------------------------------------------------------------------------------- 1 | # GitHub Actions Documentation for IT Bench 2 | 3 | ## General 4 | 5 | To support the IT Bench user experience, this repository contains a number of GitHub Actions workflows that automate required tasks. 
6 | 7 | These include: 8 | - [Agent Registration](#agent-registration) 9 | - [Public Leaderboard Updates](#public-leaderboard-updates) 10 | 11 | 12 | 13 | 14 | ## Agent Registration 15 | 16 | 17 | 18 | ```mermaid 19 | flowchart TD 20 | A[User Creates Config Repository] --> B 21 | B[User Installs IT Bench App into Config Repository] --> C 22 | C[User Opens registration ticket in IT Bench Repository] --> D{Ticket Approved?} 23 | 24 | D -->|Yes| F 25 | D -->|No| E 26 | E[User makes required changes to registration Ticket] -->D 27 | subgraph Registration Workflow 28 | F[Parse Registration Issue]-->G 29 | G[Verify Config Repo Private and App Installed]--> H 30 | G -->|Failed Check| FAIL 31 | FAIL[Comment on issue with error message and return to user] 32 | H[Register Agent details with IT Bench API]-->I 33 | I[Commit agent_manifest.json to config repo]-->J 34 | J[Reply to issue and close] 35 | end 36 | 37 | ``` 38 | 39 | 40 | ### Simple onboarding instructions for users 41 | 42 | 43 | To onboard your agent and get started benchmarking, please follow the following steps: 44 | 45 | 1. Create an empty repository (or use a repository of your choice) on GitHub: 46 | - The repository must be set to private. 47 | - The onboarding process will create a file called `agent-manifest.json` at the root of the repository, so if using an existing repository make sure that there will not be a clash. 48 | 2. Install the [`ibm-itbench`](https://github.com/apps/ibm-itbench) app into the repository that you created in step 1. 49 | 3. Fill out and submit [this issue template](https://github.com/jpwsutton/itbenchautomation/issues/new?template=onboarding.yaml) with the details of the agent you are developing and provide the URL to the GitHub Repo you created in step 1 e.g. https://github.com/jpwsutton/my-test-agent 50 | 4. 
Once the registration issue has been approved, an automated process will generate a manifest for your agent to access the IT Bench Server and will save it to the root of your repository from step 1. You can now download this file and use it with the agent harness to initiate a benchmark. 51 | 52 | 53 | ## Public Leaderboard updates 54 | 55 | ```mermaid 56 | flowchart TD 57 | A[Workflow Initiated] --> B 58 | B[agent-bench-automation repo cloned] --> C 59 | C[leaderboard.py pulls leaderboard data from IT Bench API and generates markdown table] --> D 60 | D[Markdown table is pulled into a larger markdown file with a header] --> E 61 | E[Updated leaderboard markdown file committed and pushed to agent-bench-automation repository] 62 | 63 | ``` 64 | 65 | 66 | 67 | ## Setup 68 | 69 | 70 | ### Creating the GitHub Application and configuring for GH Actions 71 | 72 | 1. Go to your [GitHub Apps page](https://github.com/settings/apps) in your developer settings and click "New GitHub App" 73 | 2. Populate the following Settings: 74 | - GitHub App name 75 | - Homepage URL (Set to the IT Bench Repo) 76 | - Disable Webhook 77 | - Permissions: 78 | - Repository Permissions: 79 | - Metadata: Read-Only (Default) 80 | - Single file: Read and Write 81 | - Path: agent-manifest.json 82 | - Organisation Permissions: None 83 | - Account Permissions: None 84 | - Where can this GitHub App be installed? - Any Account 85 | 3. Within the App settings, generate a private key and backup in 1password. 86 | 4. 
In the GitHub Repo Settings: 87 | - Environments - Create a new environment called `onboarding` 88 | - Environment Secrets: 89 | - `ITBENCH_APP_KEY` - Set to the generated Private Key from step 3 90 | - `ITBENCH_API_TOKEN` - Set to the JWT token for the IT Bench API 91 | - Environment Variables: 92 | - `ITBENCH_APP_ID` - Set to the App ID number 93 | - `ITBENCH_API` - The IT Bench Server API Endpoint 94 | - Actions: 95 | - Allow all actions and reusable workflows 96 | - Require approval for all external contributors 97 | - Read and write permissions 98 | - Runners (If using an external runner) 99 | - Create the runner using default options. 100 | 5. In the issues view create the following labels: 101 | - `approved` 102 | - `benchmark` 103 | - `error` 104 | - `registering` 105 | - `registration` 106 | - `track-progress` 107 | 108 | 109 | ### Self hosted runners 110 | 111 | Because the IT Bench Service is currently running on an internal IBM server, the workflow actions require a self hosted runner with access to the IBM internal network in order to run. The one extra step is to set up the certificate for the internal IT bench server as it is not trusted. 112 | 113 | You can download the certificate (in case the server changes) by running the following command: 114 | 115 | ```bash 116 | openssl s_client -showcerts -connect tokyo-itbench-1.sl.cloud9.ibm.com:443 2>/dev/null </dev/null | openssl x509 -outform PEM 117 | ``` 118 | 119 | To set up the runner: 120 | 1. Navigate to Settings > Actions > Runners and follow the process to create and install a new self hosted runner either on your local machine or an appropriate server on the IBM Network. 121 | 2. Create a file on the same machine where the self hosted runner will run called `itbench_res.pem` and enter the ca certificate below: 122 | 123 | ``` 124 | -----BEGIN CERTIFICATE----- 125 | MIIDmDCCAoCgAwIBAgIUFwWsO8VVH739Qa87EH+Y8mPtm40wDQYJKoZIhvcNAQEL 126 | BQAwbTELMAkGA1UEBhMCVVMxETAPBgNVBAgMCE5ldyBZb3JrMREwDwYDVQQHDAhZ 127 | ...
128 | hB6eiBJigoWYTIFryyPIH5KaMTqyDNCKLbqEMgyRlo0D0ZnHGWMI9FkF3r2bMb7p 129 | PAJ1xNviYcUUdVcPQ81H2hHejnFPTtRnnjBwLf6DV4EulVLEOmutbwuxvAvwkpM3 130 | IsI+erZxjtK7paPl 131 | -----END CERTIFICATE----- 132 | ``` 133 | 3. Go into the `actions-runner` directory and edit the `.env` file, adding the following line: 134 | `ITBENCH_CERT=/path/to/itbench_res.pem`. 135 | 4. Start the runner with `./run.sh` 136 | 137 | 138 | ### Switching to a public runner 139 | 140 | At some point, the IT bench server will be set up on the public internet. This will require a number of small changes, which are listed here: 141 | 142 | 1. For all workflows, ensure that the `runs-on` field has been changed from `self-hosted` to `ubuntu-latest`. 143 | 2. Remove the `ITBENCH_CERT` references: 144 | - `agent_registration.yaml`: Remove `--cacert $ITBENCH_CERT \` from the curl command in the `generate-manifest` step. 145 | - `leaderboard.py`: 146 | - Remove the `ITBENCH_CERT` environment variable from the top of the file. 147 | - Remove the ssl context from the `get_leaderboard` function. 148 | - Remove the `context` argument from the request call in the `get_leaderboard` function. 
-------------------------------------------------------------------------------- /.github/workflows/benchmark_registration.yaml: -------------------------------------------------------------------------------- 1 | name: Register a new agent benchmark 2 | 3 | on: 4 | issues: 5 | types: [labeled] 6 | 7 | jobs: 8 | register_agent: 9 | if: github.event.label.name == 'approved' && contains(github.event.issue.labels.*.name, 'benchmark') 10 | # The type of runner that the job will run on 11 | runs-on: ubuntu-latest 12 | environment: onboarding 13 | name: Registers a benchmark 14 | steps: 15 | - name: Checkout Repository 16 | uses: actions/checkout@v2 17 | - name: Parse issue 18 | id: parse 19 | run: | 20 | echo "${{ github.event.issue.body }}" > issue_body.txt 21 | python ./.github/workflows/parse_issue.py < issue_body.txt > parsed_output.json 22 | echo "payload=$(cat parsed_output.json)" >> $GITHUB_OUTPUT 23 | # Examples on how to use the output 24 | - name: Show parsed payload data and store variables 25 | id: extract-parsed-data 26 | run: | 27 | echo '${{ steps.parse.outputs.payload }}' 28 | agent_repo="${{ fromJson(steps.parse.outputs.payload)['Config Repo']}}" 29 | agent_repo_owner="$(echo $agent_repo | awk -F/ '{print $4}')" 30 | agent_repo_name="$(echo $agent_repo | awk -F/ '{print $5}')" 31 | echo $agent_repo_owner 32 | echo $agent_repo_name 33 | echo "agent_repo_owner=$agent_repo_owner" >> "$GITHUB_OUTPUT" 34 | echo "agent_repo_name=$agent_repo_name" >> "$GITHUB_OUTPUT" 35 | - name: Comment on issue 36 | uses: actions/github-script@v7 37 | env: 38 | COMMENT_BODY: | 39 | 👋 ${{ github.event.issue.user.login }} 40 | 41 | Thank you for submitting your benchmark registration details, we are currently processing your request and will 42 | comment back once the registration has been completed. 43 | 44 | ## Benchmark Details: 45 | 46 | Name: ${{ fromJson(steps.parse.outputs.payload)['Benchmark Name'] }} 47 | Schedule now? 
${{ fromJson(steps.parse.outputs.payload)['Schedule Now'] }} 48 | 49 | Target Config Repo: ${{ fromJson(steps.parse.outputs.payload)['Config Repo']}} 50 | 51 | with: 52 | script: | 53 | github.rest.issues.createComment({ 54 | issue_number: context.issue.number, 55 | owner: context.repo.owner, 56 | repo: context.repo.repo, 57 | body: process.env.COMMENT_BODY 58 | }) 59 | github.rest.issues.addLabels({ 60 | issue_number: context.issue.number, 61 | owner: context.repo.owner, 62 | repo: context.repo.repo, 63 | labels: ['registering'] 64 | }) 65 | 66 | 67 | - name: Generate GitHub token on behalf of repo 68 | id: generate-token 69 | uses: actions/create-github-app-token@v1 70 | with: 71 | app-id: ${{ vars.ITBENCH_APP_ID }} 72 | private-key: ${{ secrets.ITBENCH_APP_KEY }} 73 | owner: ${{ steps.extract-parsed-data.outputs.agent_repo_owner}} 74 | repositories: ${{ steps.extract-parsed-data.outputs.agent_repo_name}} 75 | 76 | - name: Check repository is private 77 | id: check-repo-private 78 | env: 79 | GH_TOKEN: ${{ steps.generate-token.outputs.token }} 80 | run: | 81 | repo_full_path="repos/${{ steps.extract-parsed-data.outputs.agent_repo_owner}}/${{ steps.extract-parsed-data.outputs.agent_repo_name}}" 82 | repo_private=$(gh api $repo_full_path -q '.private') 83 | 84 | echo "Repo Private: $repo_private" 85 | 86 | if [ "$repo_private" = "true" ]; then 87 | echo "Target repository is set to private." 88 | else 89 | echo "Target repository is not set to private. Failing!" 
90 | echo "error_public_repo=1" >> "$GITHUB_OUTPUT" 91 | exit 1 92 | fi 93 | 94 | - name: Check Issue opened by repo collaborator 95 | id: check-repo-collaborator 96 | env: 97 | GH_TOKEN: ${{ steps.generate-token.outputs.token }} 98 | run : | 99 | repo_full_path="repos/${{ steps.extract-parsed-data.outputs.agent_repo_owner}}/${{ steps.extract-parsed-data.outputs.agent_repo_name}}/collaborators" 100 | repo_collaborators=$(gh api $repo_full_path -q '[.[].login] | contains(["${{ github.event.issue.user.login }}"])') 101 | 102 | echo "Issue creator is collaborator: $repo_collaborators" 103 | 104 | if [ "$repo_collaborators" = "true" ]; then 105 | echo "Issue creator is collaborator." 106 | else 107 | echo "Issue creator is not a collaborator. Failing!" 108 | exit 1 109 | fi 110 | 111 | - name: Get Agent Details 112 | id: get-agent-config 113 | env: 114 | GH_TOKEN: ${{ steps.generate-token.outputs.token }} 115 | run : | 116 | repo_full_path="repos/${{ steps.extract-parsed-data.outputs.agent_repo_owner}}/${{ steps.extract-parsed-data.outputs.agent_repo_name}}/contents/agent-manifest.json" 117 | agent_id=$(gh api $repo_full_path -q '.content' | base64 -d | jq '.metadata.id') 118 | 119 | echo "Agent ID: $agent_id" 120 | 121 | echo "agent_id=$agent_id" >> "$GITHUB_OUTPUT" 122 | 123 | - name: register-benchmark 124 | id: register-benchmark 125 | run: | 126 | 127 | echo "Registering Benchmark request with IT Bench API" 128 | 129 | response_json='${{steps.parse.outputs.payload}}' 130 | 131 | benchmark_body=$(echo $response_json | jq '{ "name" : ."Benchmark Name", "immediate" : ."Schedule Now"}' | jq --arg AGENT_ID ${{steps.get-agent-config.outputs.agent_id}} '. 
+= {"agent_id": $AGENT_ID}') 132 | 133 | echo $benchmark_body | jq 134 | 135 | 136 | reg_resp=$(curl \ 137 | --url ${{vars.ITBENCH_API}}/gitops/create-benchmark?github_username=${{ github.event.issue.user.login }} \ 138 | --header "authorization: Bearer ${{ secrets.ITBENCH_API_TOKEN }}" \ 139 | --header 'content-type: application/json' \ 140 | --data "$benchmark_body") 141 | 142 | echo $reg_resp 143 | 144 | if [[ $? -eq 0 ]]; then 145 | 146 | echo "Request was successful" 147 | 148 | # Check that the spec is in the response body 149 | echo $reg_resp | jq -e '.id?' 150 | 151 | 152 | if [[ $? -eq 0 ]]; then 153 | 154 | echo "benchmark_id=$(echo $reg_resp | jq -r '.id')" >> "$GITHUB_OUTPUT" 155 | echo "benchmark_name=$(echo $reg_resp | jq -r '.name')" >> "$GITHUB_OUTPUT" 156 | 157 | 158 | 159 | else 160 | echo "Body received from IT bench was invalid." 161 | echo $reg_resp 162 | exit 1 163 | fi 164 | 165 | else 166 | echo "Request failed." 167 | echo $reg_resp 168 | exit 1 169 | fi 170 | 171 | 172 | - name: Comment on issue 173 | uses: actions/github-script@v7 174 | env: 175 | COMMENT_BODY: | 176 | 👋 ${{ github.event.issue.user.login }} 177 | 178 | The registration of your benchmark is now complete. 179 | 180 | Here are the Details: 181 | 182 | 183 | Name: ${{ steps.register-benchmark.outputs.benchmark_name }} 184 | Benchmark ID: ${{ steps.register-benchmark.outputs.benchmark_id }} 185 | 186 | 187 | 188 | 189 | with: 190 | script: | 191 | github.rest.issues.createComment({ 192 | issue_number: context.issue.number, 193 | owner: context.repo.owner, 194 | repo: context.repo.repo, 195 | body: process.env.COMMENT_BODY 196 | }) 197 | 198 | 199 | - name: Report Failure 200 | if: failure() 201 | uses: actions/github-script@v7 202 | env: 203 | PRIVATE_REPO: ${{ steps.check-repo-private.outputs.error_public_repo == 1}} 204 | COMMENT_BODY: | 205 | 👋 ${{ github.event.issue.user.login }} 206 | 207 | Unfortunately there was an unknown issue with registering the benchmark.
208 | 209 | This issue has been marked for manual intervention and the team has been notified. 210 | 211 | ---- 212 | 213 | Run link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} 214 | 215 | 216 | 217 | with: 218 | script: | 219 | 220 | console.log("Responding with generic error message.") 221 | github.rest.issues.createComment({ 222 | issue_number: context.issue.number, 223 | owner: context.repo.owner, 224 | repo: context.repo.repo, 225 | body: process.env.COMMENT_BODY 226 | }) 227 | github.rest.issues.addLabels({ 228 | issue_number: context.issue.number, 229 | owner: context.repo.owner, 230 | repo: context.repo.repo, 231 | labels: ['error'] 232 | }) 233 | 234 | 235 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright contributors to the ITBench project. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /.github/workflows/agent_registration.yaml: -------------------------------------------------------------------------------- 1 | name: Register a new agent 2 | 3 | on: 4 | issues: 5 | types: [labeled] 6 | 7 | jobs: 8 | register_agent: 9 | if: github.event.label.name == 'approved' && contains(github.event.issue.labels.*.name, 'registration') 10 | # The type of runner that the job will run on 11 | runs-on: ubuntu-latest 12 | environment: onboarding 13 | name: Registers an Agent 14 | steps: 15 | - name: Checkout Repository 16 | uses: actions/checkout@v2 17 | - name: Parse issue 18 | id: parse 19 | run: | 20 | echo "${{ github.event.issue.body }}" > issue_body.txt 21 | python ./.github/workflows/parse_issue.py < issue_body.txt > parsed_output.json 22 | echo "payload=$(cat parsed_output.json)" >> $GITHUB_OUTPUT 23 | # Examples on how to use the output 24 | - name: Show parsed payload data and store variables 25 | id: 
extract-parsed-data 26 | run: | 27 | echo '${{ steps.parse.outputs.payload }}' 28 | agent_repo="${{ fromJson(steps.parse.outputs.payload)['Config Repo']}}" 29 | agent_repo_owner="$(echo $agent_repo | awk -F/ '{print $4}')" 30 | agent_repo_name="$(echo $agent_repo | awk -F/ '{print $5}')" 31 | echo $agent_repo_owner 32 | echo $agent_repo_name 33 | echo "agent_repo_owner=$agent_repo_owner" >> "$GITHUB_OUTPUT" 34 | echo "agent_repo_name=$agent_repo_name" >> "$GITHUB_OUTPUT" 35 | - name: Comment on issue 36 | uses: actions/github-script@v7 37 | env: 38 | COMMENT_BODY: | 39 | 👋 ${{ github.event.issue.user.login }} 40 | 41 | Thank you for submitting your agent registration details, we are currently processing your request and will 42 | be in contact shortly with connection details for your agent harness to use to connect to the IT Bench service. 43 | 44 | ## Agent Details: 45 | 46 | Name: ${{ fromJson(steps.parse.outputs.payload)['Agent Name'] }} 47 | Type: ${{ fromJson(steps.parse.outputs.payload)['Agent Type'] }} 48 | Level: ${{ fromJson(steps.parse.outputs.payload)['Agent Level'] }} 49 | 50 | Target Config Repo: ${{ fromJson(steps.parse.outputs.payload)['Config Repo']}} 51 | 52 | with: 53 | script: | 54 | github.rest.issues.createComment({ 55 | issue_number: context.issue.number, 56 | owner: context.repo.owner, 57 | repo: context.repo.repo, 58 | body: process.env.COMMENT_BODY 59 | }) 60 | github.rest.issues.addLabels({ 61 | issue_number: context.issue.number, 62 | owner: context.repo.owner, 63 | repo: context.repo.repo, 64 | labels: ['registering'] 65 | }) 66 | 67 | 68 | - name: Generate GitHub token on behalf of repo 69 | id: generate-token 70 | uses: actions/create-github-app-token@v1 71 | with: 72 | app-id: ${{ vars.ITBENCH_APP_ID }} 73 | private-key: ${{ secrets.ITBENCH_APP_KEY }} 74 | owner: ${{ steps.extract-parsed-data.outputs.agent_repo_owner}} 75 | repositories: ${{ steps.extract-parsed-data.outputs.agent_repo_name}} 76 | 77 | - name: Check repository is 
private 78 | id: check-repo-private 79 | env: 80 | GH_TOKEN: ${{ steps.generate-token.outputs.token }} 81 | run: | 82 | repo_full_path="repos/${{ steps.extract-parsed-data.outputs.agent_repo_owner}}/${{ steps.extract-parsed-data.outputs.agent_repo_name}}" 83 | repo_private=$(gh api $repo_full_path -q '.private') 84 | 85 | echo "Repo Private: $repo_private" 86 | 87 | if [ "$repo_private" = "true" ]; then 88 | echo "Target repository is set to private." 89 | else 90 | echo "Target repository is not set to private. Failing!" 91 | echo "error_public_repo=1" >> "$GITHUB_OUTPUT" 92 | exit 1 93 | fi 94 | 95 | - name: Check Issue opened by repo collaborator 96 | id: check-repo-collaborator 97 | env: 98 | GH_TOKEN: ${{ steps.generate-token.outputs.token }} 99 | run : | 100 | repo_full_path="repos/${{ steps.extract-parsed-data.outputs.agent_repo_owner}}/${{ steps.extract-parsed-data.outputs.agent_repo_name}}/collaborators" 101 | repo_collaborators=$(gh api $repo_full_path -q '[.[].login] | contains(["${{ github.event.issue.user.login }}"])') 102 | 103 | echo "Issue creator is collaborator: $repo_collaborators" 104 | 105 | if [ "$repo_collaborators" = "true" ]; then 106 | echo "Issue creator is collaborator." 107 | else 108 | echo "Issue creator is not a collaborator. Failing!" 
109 | exit 1 110 | fi 111 | 112 | - name: generate-manifest 113 | id: generate-manifest 114 | run: | 115 | 116 | echo "Registering Agent with IT Bench API" 117 | 118 | response_json='${{steps.parse.outputs.payload}}' 119 | 120 | agent_body=$(echo $response_json | jq '{"name": ."Agent Name", "type" : ."Agent Type", "level" : ."Agent Level", "scenario_categories" : [."Agent Scenarios" | to_entries[] | select(.value).key]}') 121 | 122 | echo $agent_body | jq 123 | 124 | response_file=$(mktemp) 125 | trap 'echo "Cleaning up $response_file"; rm -f "$response_file"' EXIT 126 | status_code=$(curl \ 127 | --url ${{vars.ITBENCH_API}}/gitops/agents?github_username=${{ github.event.issue.user.login }} \ 128 | --header "authorization: Bearer ${{ secrets.ITBENCH_API_TOKEN }}" \ 129 | --header 'content-type: application/json' \ 130 | --data "$agent_body" \ 131 | --output "$response_file" \ 132 | --write-out "%{http_code}") 133 | 134 | if [[ $? -eq 0 ]]; then 135 | 136 | echo "Curl execution was successful" 137 | 138 | echo "::debug:: $(cat $response_file)" 139 | # Check that the spec is in the response body 140 | 141 | if [[ "$status_code" == "200" || "$status_code" == "201" ]]; then 142 | 143 | echo "manifest=$( cat $response_file | jq '.spec.agent_manifest + {metadata: {id: .metadata.id}}' | base64 -w 0)" >> "$GITHUB_OUTPUT" 144 | 145 | else 146 | msg="Body received from IT Bench was invalid." 147 | echo "$msg" 148 | echo "error=1" >> "$GITHUB_OUTPUT" 149 | error_detail=$(jq -r '.detail // "No detail message in response."' "$response_file") 150 | echo "error_detail=${error_detail}" >> "$GITHUB_OUTPUT" 151 | exit 1 152 | fi 153 | 154 | else 155 | echo "Request failed." 156 | msg="CURL execution failed with status code $status_code."
157 | echo "$msg" 158 | echo "error=1" >> "$GITHUB_OUTPUT" 159 | echo "error_detail=$msg" >> "$GITHUB_OUTPUT" 160 | exit 1 161 | fi 162 | 163 | - name: Push manifest to config repository 164 | id: file-push 165 | env: 166 | GH_TOKEN: ${{ steps.generate-token.outputs.token }} 167 | run: | 168 | gh api octocat 169 | 170 | repo_full_path="repos/${{ steps.extract-parsed-data.outputs.agent_repo_owner}}/${{ steps.extract-parsed-data.outputs.agent_repo_name}}/contents/agent-manifest.json" 171 | 172 | echo "Repo Path: $repo_full_path" 173 | 174 | current_sha=$(gh api $repo_full_path -q '.sha' || echo "") 175 | 176 | echo "Current SHA: $current_sha" 177 | 178 | ghout=$(gh api -X PUT \ 179 | -H "Accept: application/vnd.github.v3+json" \ 180 | $repo_full_path \ 181 | -f message="Add agent-manifest.json via API" \ 182 | -f content="${{ steps.generate-manifest.outputs.manifest}}" \ 183 | -f branch="main" \ 184 | -f sha="$current_sha") 185 | 186 | if [[ $? -eq 0 ]]; then 187 | echo $ghout | jq 188 | 189 | 190 | file_path=$(echo $ghout | jq .content.html_url) 191 | echo "File path: $file_path" 192 | 193 | echo "manifest_path=$file_path" >> "$GITHUB_OUTPUT" 194 | fi 195 | 196 | - name: Comment on issue 197 | uses: actions/github-script@v7 198 | env: 199 | COMMENT_BODY: | 200 | 👋 ${{ github.event.issue.user.login }} 201 | 202 | The registration of your agent is now complete. 
203 | 204 | Your agent manifest is located at: ${{ steps.file-push.outputs.manifest_path}} 205 | 206 | 207 | ## Agent Details: 208 | 209 | Name: ${{ fromJson(steps.parse.outputs.payload)['Agent Name'] }} 210 | Type: ${{ fromJson(steps.parse.outputs.payload)['Agent Type'] }} 211 | Level: ${{ fromJson(steps.parse.outputs.payload)['Agent Level'] }} 212 | 213 | Target Config Repo: ${{ fromJson(steps.parse.outputs.payload)['Config Repo']}} 214 | 215 | with: 216 | script: | 217 | github.rest.issues.createComment({ 218 | issue_number: context.issue.number, 219 | owner: context.repo.owner, 220 | repo: context.repo.repo, 221 | body: process.env.COMMENT_BODY 222 | }) 223 | 224 | github.rest.issues.update({ 225 | issue_number: context.issue.number, 226 | owner: context.repo.owner, 227 | repo: context.repo.repo, 228 | state: 'closed' 229 | }) 230 | 231 | 232 | - name: Report Failure 233 | if: failure() 234 | uses: actions/github-script@v7 235 | env: 236 | PRIVATE_REPO: ${{ steps.check-repo-private.outputs.error_public_repo == 1}} 237 | ERROR_ON_GENERATE_MANIFEST: ${{ steps.generate-manifest.outputs.error == 1 }} 238 | COMMENT_BODY: | 239 | 👋 ${{ github.event.issue.user.login }} 240 | 241 | Unfortunately there was an unknown issue with registering the agent. 242 | 243 | This issue has been marked for manual intervention and the team has been notified. 244 | 245 | ---- 246 | 247 | Run link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} 248 | 249 | PRIV_REPO_COMMENT_BODY: | 250 | 👋 ${{ github.event.issue.user.login }} 251 | 252 | It looks like the repository you've provided to us is not set to private. 253 | As we will be committing a token to your repository, it needs to be set to private before we can continue. 254 | 255 | Please make the necessary changes and reply back to this issue, our team will then re-start the registration process.
256 | 257 | ---- 258 | 259 | Run link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} 260 | 261 | ERROR_ON_GENERATE_MANIFEST_COMMENT_BODY: | 262 | 👋 ${{ github.event.issue.user.login }} 263 | 264 | There was an issue while registering the agent. 265 | 266 | Error Detail: 267 | ${{ steps.generate-manifest.outputs.error_detail }} 268 | 269 | ---- 270 | 271 | Run link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} 272 | 273 | with: 274 | script: | 275 | console.log(`Private Repo: ${process.env.PRIVATE_REPO}`) 276 | 277 | if (process.env.PRIVATE_REPO == 'true'){ 278 | console.log("Responding with non private repo message.") 279 | github.rest.issues.createComment({ 280 | issue_number: context.issue.number, 281 | owner: context.repo.owner, 282 | repo: context.repo.repo, 283 | body: process.env.PRIV_REPO_COMMENT_BODY 284 | }) 285 | } else if (process.env.ERROR_ON_GENERATE_MANIFEST == 'true') { 286 | console.log("Responding with manifest error message.") 287 | github.rest.issues.createComment({ 288 | issue_number: context.issue.number, 289 | owner: context.repo.owner, 290 | repo: context.repo.repo, 291 | body: process.env.ERROR_ON_GENERATE_MANIFEST_COMMENT_BODY 292 | }) 293 | } else { 294 | console.log("Responding with generic error message.") 295 | github.rest.issues.createComment({ 296 | issue_number: context.issue.number, 297 | owner: context.repo.owner, 298 | repo: context.repo.repo, 299 | body: process.env.COMMENT_BODY 300 | }) 301 | } 302 | github.rest.issues.addLabels({ 303 | issue_number: context.issue.number, 304 | owner: context.repo.owner, 305 | repo: context.repo.repo, 306 | labels: ['error'] 307 | }) 308 | 309 | 310 | -------------------------------------------------------------------------------- /docs/how-to-launch-benchmark-ciso.md: -------------------------------------------------------------------------------- 1 | # How to Launch the Benchmark (CISO Agent) 2 | 3 | This guide walks 
you through launching the ITBench benchmark for a CISO agent. 4 | You will run two Docker containers—**Agent Harness** and **Bench Runner**—and keep them running during the evaluation. 5 | 6 | > ⚠️ **Note:** You must not run multiple Agent Harnesses or Bench Runners at the same time. 7 | 8 | ## Option 1: Use the CISO CAA Agent (Prebuilt) 9 | 10 | If you would like to benchmark using the [official CISO CAA Agent](https://github.com/itbench-hub/itbench-ciso-caa-agent), follow these steps: 11 | 12 | 1. Create a .env File 13 | Create a .env file with the following contents: 14 | ``` 15 | OPENAI_API_KEY = 16 | OPENAI_MODEL_NAME = gpt-4o-mini 17 | CODE_GEN_MODEL = gpt-4o-mini 18 | ``` 19 | If you want to use other models, refer to [this section](https://github.com/itbench-hub/itbench-ciso-caa-agent?tab=readme-ov-file#3-create-env-file-and-set-llm-api-credentials) 20 | 21 | 1. Run CISO Agent Harness Docker container 22 | Run the container, replacing `` and `` with your own paths. 23 | ``` 24 | docker run --rm -it --name ciso-agent-harness \ 25 | --mount type=bind,src=,dst=/tmp/agent-manifest.json \ 26 | --mount type=bind,src=,dst=/etc/ciso-agent/.env \ 27 | quay.io/it-bench/ciso-agent-harness:latest \ 28 | --host itbench.apps.prod.itbench.res.ibm.com \ 29 | --benchmark_timeout 3600 30 | ``` 31 | image 32 | 33 | 1. Run the CISO DEF Runner Docker Container 34 | Open a new terminal window and run the container, replacing `` and `` with your own paths. 35 | 36 | (If you are benchmarking a RHEL scenario, please refer to [the full specification.](#full-specification-of-bench-runner)) 37 | ``` 38 | docker run --rm -it --name ciso-bench-runner \ 39 | --mount type=bind,src=,dst=/tmp/agent-manifest.json \ 40 | --mount type=bind,src=,dst=/tmp/kubeconfig.yaml \ 41 | quay.io/it-bench/ciso-bench-runner:latest \ 42 | --host itbench.apps.prod.itbench.res.ibm.com \ 43 | --runner_id my-ciso-runner-1 44 | ``` 45 | image 46 | 47 | 1.
Benchmark Progress and Status Updates 48 | - The benchmark will proceed automatically after starting: 49 | - The benchmark will typically complete within about one hour, after which both Docker containers will exit automatically. 50 | - Once completed, you can safely close both terminal windows. 51 | - During the benchmark: 52 | - The original registration issue will be updated approximately every 10 minutes. 53 | - A table summarizing the results will appear, showing the status of each scenario. 54 | 55 | image 56 | 57 | Table Fields: 58 | | Field | Description | 59 | |:------------------|:----------------------------------------------------| 60 | | Scenario Name | The name of the scenario | 61 | | Description | A short description of the control being assessed | 62 | | Passed | Whether the agent passed the scenario (True/False) | 63 | | Time To Resolve | Time taken to complete | 64 | | Error | Any unexpected error encountered | 65 | | Message | Additional information or status | 66 | | Date | Completion timestamp | 67 | 68 | 5. Once all scenarios are completed: 69 | - The Docker commands will automatically stop. 70 | 71 | command-done 72 | 73 | - The registration issue comment will update its status to **Finished**, and the issue will automatically close. 74 | 75 | issue-close 76 | 77 | 6. Troubleshooting 78 | 79 | - If the benchmark fails to start: 80 | - Add a comment to the issue with the text abort. 81 | - Optionally, include additional notes about the problem. 82 | 83 | - If the containers keep running without completing: 84 | - Check if the "Date" field in the table is not updating. 85 | - If it is stuck, terminate the container processes manually (Ctrl+C) and add abort to the issue comment. 86 | 87 | 7. Leaderboard Update: 88 | - The benchmark results will be manually reflected on the leaderboard within a few days. 89 | 90 | image 91 | 92 | - If you do not see updates after a few days, please reach out to [Contact Support](#contact-support). 
93 | 94 | 95 | ## Option 2: Use Your Own Agent 96 | 97 | If you are submitting your own custom agent, follow these steps: 98 | 99 | 1. Create Agent Harness config 100 | ```yaml 101 | # This field defines the path where the scenario's environment information is stored. 102 | # When the agent harness runs the command below, the scenario data is fetched from the server and saved at this location. 103 | path_to_data_provided_by_scenario: /tmp/agent/scenario_data.json 104 | 105 | # This field defines the path where the agent's output results should be stored. 106 | # The agent harness uploads this file back to the server for evaluation. 107 | path_to_data_pushed_to_scenario: /tmp/agent/agent_data.txt 108 | 109 | # Command to be run by the agent harness 110 | run: 111 | command: ["/bin/bash"] 112 | args: 113 | - -c 114 | - | 115 | 116 | ``` 117 | 118 | The `command` is executed with `args` inside a docker container that is built from a Dockerfile you create (we will instruct in the later section). 119 | 120 | For example, the following is [the Agent Harness config](https://github.com/itbench-hub/ITBench-CISO-CAA-Agent/blob/main/agent-harness.yaml) of the sample CISO CAA Agent. It appears complex because it includes error handling. When creating your own harness config, it doesn’t need to be this complicated. However, make sure to include proper termination handling to avoid infinite loops. 
121 | 122 | ```yaml 123 | path_to_data_provided_by_scenario: /tmp/agent/scenario_data.json 124 | path_to_data_pushed_to_scenario: /tmp/agent/agent_data.tar 125 | run: 126 | command: ["/bin/bash"] 127 | args: 128 | - -c 129 | - | 130 | 131 | timestamp=$(date +%Y%m%d%H%M%S) 132 | tmpdir=/tmp/agent/${timestamp} 133 | mkdir -p ${tmpdir} 134 | 135 | cat /tmp/agent/scenario_data.json > ${tmpdir}/scenario_data.json 136 | 137 | jq -r .goal_template ${tmpdir}/scenario_data.json > ${tmpdir}/goal_template.txt 138 | jq -r .vars.kubeconfig ${tmpdir}/scenario_data.json > ${tmpdir}/kubeconfig.yaml 139 | jq -r .vars.ansible_ini ${tmpdir}/scenario_data.json > ${tmpdir}/ansible.ini 140 | jq -r .vars.ansible_user_key ${tmpdir}/scenario_data.json > ${tmpdir}/user_key 141 | chmod 600 ${tmpdir}/user_key 142 | sed -i.bak -E "s|(ansible_ssh_private_key_file=\")[^\"]*|\1${tmpdir}/user_key|" ${tmpdir}/ansible.ini 143 | 144 | sed "s|{{ kubeconfig }}|${tmpdir}/kubeconfig.yaml|g" ${tmpdir}/goal_template.txt > ${tmpdir}/goal.txt 145 | sed -i.bak -E "s|\{\{ path_to_inventory \}\}|${tmpdir}/ansible.ini|g" ${tmpdir}/goal.txt 146 | 147 | echo "You can use \`${tmpdir}\` as your workdir." >> ${tmpdir}/goal.txt 148 | 149 | source .venv/bin/activate 150 | timeout 200 python src/ciso_agent/main.py --goal "`cat ${tmpdir}/goal.txt`" --auto-approve -o ${tmpdir}/agent-result.json || true 151 | 152 | tar -C ${tmpdir} -cf /tmp/agent/agent_data.tar . 153 | ``` 154 | 155 | 1. Timestamped Temporary Directory Creation 156 | ``` 157 | timestamp=$(date +%Y%m%d%H%M%S) 158 | tmpdir=/tmp/agent/${timestamp} 159 | mkdir -p ${tmpdir} 160 | ``` 161 | 2. Scenario Data Processing 162 | ``` 163 | cat /tmp/agent/scenario_data.json > ${tmpdir}/scenario_data.json 164 | ``` 165 | Copies the downloaded scenario data from IT Bench, which is specified in `path_to_data_provided_by_scenario`, into the temporary directory. 166 | 3. 
Extracting Key Variables to be passed to python command arguments to run the CISO CAA Agent 167 | ``` 168 | jq -r .goal_template ${tmpdir}/scenario_data.json > ${tmpdir}/goal_template.txt 169 | jq -r .vars.kubeconfig ${tmpdir}/scenario_data.json > ${tmpdir}/kubeconfig.yaml 170 | jq -r .vars.ansible_ini ${tmpdir}/scenario_data.json > ${tmpdir}/ansible.ini 171 | jq -r .vars.ansible_user_key ${tmpdir}/scenario_data.json > ${tmpdir}/user_key 172 | chmod 600 ${tmpdir}/user_key 173 | ``` 174 | 175 | 4. Updating ansible.ini with User Key for RHEL scenario cases. 176 | ``` 177 | sed -i.bak -E "s|(ansible_ssh_private_key_file=\")[^\"]*|\1${tmpdir}/user_key|" ${tmpdir}/ansible.ini 178 | ``` 179 | 5. Preparing the Goal File to be passed to python command arguments to run the CISO CAA Agent 180 | ``` 181 | sed "s|{{ kubeconfig }}|${tmpdir}/kubeconfig.yaml|g" ${tmpdir}/goal_template.txt > ${tmpdir}/goal.txt 182 | sed -i.bak -E "s|\{\{ path_to_inventory \}\}|${tmpdir}/ansible.ini|g" ${tmpdir}/goal.txt 183 | echo "You can use \`${tmpdir}\` as your workdir." >> ${tmpdir}/goal.txt 184 | ``` 185 | 6. Running the Agent (Automated or Manual) 186 | ``` 187 | source .venv/bin/activate 188 | timeout 200 python src/ciso_agent/main.py --goal "`cat ${tmpdir}/goal.txt`" --auto-approve -o ${tmpdir}/agent-result.json || true 189 | ``` 190 | - Enable python virtual env 191 | - Runs main.py with the goal extracted from goal.txt. 192 | - Enforces a timeout of 200 seconds to avoid infinite running. 193 | - Saves the result as agent-result.json in `${tmpdir}` directory. 194 | 7. Archiving the Execution Data by the agent 195 | The CISO CAA Agent generates compliance policy programs and stores them in the designated working directory. The script ensures that all relevant execution data is archived for further analysis. 196 | ``` 197 | tar -C ${tmpdir} -cf /tmp/agent/agent_data.tar . 198 | ``` 199 | 1. 
Create a Docker image 200 | The docker image is built from Agent Harness base image and is expected to contain your Agent (e.g. crewai python program). 201 | 202 | For example, the Dockerfile is as follows in the case of CISO Agent: 203 | ``` 204 | FROM icr.io/agent-bench/ciso-agent-harness-base:0.0.3 AS base 205 | RUN ln -sf /bin/bash /bin/sh 206 | RUN apt update -y && apt install -y curl gnupg2 unzip ssh 207 | 208 | # install dependencies here to avoid too much build time 209 | COPY itbench-ciso-caa-agent /etc/ciso-agent 210 | WORKDIR /etc/ciso-agent 211 | RUN python -m venv .venv && source .venv/bin/activate && pip install -r requirements-dev.txt --no-cache-dir 212 | 213 | # install `ansible-playbook` 214 | RUN pip install --upgrade ansible-core jmespath kubernetes==31.0.0 setuptools==70.0.0 --no-cache-dir 215 | RUN ansible-galaxy collection install kubernetes.core community.crypto 216 | RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config 217 | # install `jq` 218 | RUN apt update -y && apt install -y jq 219 | # install `kubectl` 220 | RUN curl -LO https://dl.k8s.io/release/v1.31.0/bin/linux/$(dpkg --print-architecture)/kubectl && \ 221 | chmod +x ./kubectl && \ 222 | mv ./kubectl /usr/local/bin/kubectl 223 | # install `aws` (need this for using kubectl against AWS cluster) 224 | RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" && \ 225 | unzip awscliv2.zip && \ 226 | ./aws/install 227 | # install `opa` 228 | RUN curl -L -o opa https://github.com/open-policy-agent/opa/releases/download/v1.0.0/opa_linux_$(dpkg --print-architecture)_static && \ 229 | chmod +x ./opa && \ 230 | mv ./opa /usr/local/bin/opa 231 | 232 | RUN python -m venv .venv && source .venv/bin/activate && pip install -e /etc/ciso-agent --no-cache-dir 233 | 234 | COPY agent-bench-automation.wiki/.gist/agent-harness/entrypoint.sh /etc/entrypoint.sh 235 | RUN chmod +x /etc/entrypoint.sh 236 | WORKDIR /etc/agent-benchmark 237 | 238 | ENTRYPOINT 
["/etc/entrypoint.sh"] 239 | ``` 240 | 241 | ## Conclusion 242 | 243 | Congratulations! You’ve successfully completed the ITBench benchmarking process. 244 | 245 | ## Contact Support 246 | 247 | If you do not receive any response within a couple of days, please leave a comment in your original registration issue and mention our support team. 248 | - Mention: @yana, @rohanarora 249 | - Add Label: `need help` 250 | 251 | Example Comment: 252 | ``` 253 | @yana, @rohanarora 254 | Hi, I have not received a response regarding my registration request. 255 | Adding the "need help" label for visibility. 256 | ``` 257 | 258 | ## Misc 259 | 260 | #### Full Specification of Bench Runner 261 | 262 | ``` 263 | docker run --rm -it --name ciso-bench-runner \ 264 | --mount type=bind,src=,dst=/tmp/agent-manifest.json \ 265 | --mount type=bind,src=,dst=/tmp/kubeconfig.yaml \ 266 | --mount type=bind,src=,dst=/tmp/rhel-bundle-config/ssh_key \ 267 | quay.io/it-bench/ciso-bench-runner:latest \ 268 | --host itbench.apps.prod.itbench.res.ibm.com \ 269 | --runner_id my-ciso-runner-1 \ 270 | --rhel_address \ 271 | --rhel_username 272 | ``` -------------------------------------------------------------------------------- /.github/workflows/leaderboard.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import urllib.request 6 | from datetime import datetime, timedelta, timezone 7 | from typing import Optional 8 | from urllib.parse import urlencode 9 | 10 | ITBENCH_API = os.getenv("ITBENCH_API") 11 | ITBENCH_API_TOKEN = os.getenv("ITBENCH_API_TOKEN") 12 | GH_REPO = os.getenv("GH_REPO") 13 | REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "10")) 14 | 15 | 16 | def get_leaderboard(benchmark_id: str = None, github_username: str = None): 17 | url = f"{ITBENCH_API}/gitops/aggregate-results" 18 | query_params = {} 19 | if benchmark_id is not None: 20 | query_params["benchmark_id"] = benchmark_id 21 
| if github_username is not None: 22 | query_params["github_username"] = github_username 23 | if query_params: 24 | url += "?" + urlencode(query_params) 25 | headers = {"Authorization": f"Bearer {ITBENCH_API_TOKEN}"} 26 | req = urllib.request.Request(url=url, headers=headers, method="GET") 27 | res = urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) 28 | 29 | if res.getcode() != 200: 30 | print(f"Error requesting leaderboard JSON: {res.status_code}. {res.content}") 31 | exit(1) 32 | 33 | res_body = res.read() 34 | res_dict = json.loads(res_body.decode("utf-8")) 35 | return res_dict 36 | 37 | 38 | def parse_json_timedelta(delta): 39 | if not delta: 40 | return "N/A" 41 | 42 | match = re.match(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+(?:\.\d+)?)S)?", delta) 43 | if not match: 44 | return "Invalid" 45 | 46 | hours = int(match.group(1)) if match.group(1) else 0 47 | minutes = int(match.group(2)) if match.group(2) else 0 48 | seconds = float(match.group(3)) if match.group(3) else 0.0 49 | return str(int(timedelta(hours=hours, minutes=minutes, seconds=seconds).total_seconds())) + "s" 50 | 51 | 52 | def get_timestamp(dt: Optional[datetime] = None) -> str: 53 | if not dt: 54 | dt = datetime.now(timezone.utc) 55 | return dt.strftime("%d/%m/%Y %H:%M:%S") 56 | 57 | 58 | def to_datetime(timestamp: str) -> datetime: 59 | return datetime.fromisoformat(timestamp.replace("Z", "+00:00")) 60 | 61 | 62 | def build_overall_table(leaderboard): 63 | bench_summary = [] 64 | prev_score = None 65 | rank = 0 66 | count = 0 67 | for benchmark in leaderboard: 68 | count += 1 69 | if benchmark["score"] != prev_score: 70 | rank = count 71 | name = benchmark["agent"] 72 | github_username_link = benchmark["github_username_link"] 73 | github_username_org = benchmark["github_username_org"] 74 | score = f'{int(benchmark["score"] * 100)}%' 75 | agent_type = benchmark["agent_type"] 76 | checkmarks = "✅" * benchmark["num_of_passed"] if benchmark["num_of_passed"] >= 0 else "N/A" 77 | notes = f'Related to 
{benchmark["incident_type"]} scenarios' 78 | issue_link = benchmark["issue_link"] 79 | 80 | sre = finops = ciso = "N/A" 81 | if agent_type == "SRE": 82 | sre = checkmarks 83 | elif agent_type == "FinOps": 84 | finops = checkmarks 85 | elif agent_type == "CISO": 86 | ciso = checkmarks 87 | bench_line = [ 88 | rank, 89 | name, 90 | github_username_link, 91 | github_username_org, 92 | score, 93 | sre, 94 | finops, 95 | ciso, 96 | issue_link, 97 | notes, 98 | ] 99 | prev_score = benchmark["score"] 100 | bench_summary.append(bench_line) 101 | 102 | header_str = ['Rank', 'Agent Name', 'Agent Submitter', 'Organization', 'Overall Score', 'SRE', 'FinOps', 'CISO', 'Issue Link', 'Notes'] 103 | line_fmt = '| {:^4} | {:^20} | {:^13} | {:^13} | {:^13} | {:^13} | {:^13} | {:^13} | {:^13} | {:<30} |' 104 | headers = line_fmt.format(*header_str) 105 | header_len = len(headers) 106 | 107 | texts = [] 108 | texts.append("## 📊 IT Bench Leaderboard") 109 | header = """\ 110 | This table shows a consolidated view of all agent submissions across different domains (SRE, FinOps, CISO). 111 | 112 | For details on how to participate, see the [README](../README.md). 
113 | 114 | **Column Descriptions:** 115 | - *Overall Score*: Combined performance across available domains 116 | - *SRE / FinOps / CISO*: ✅ if benchmarks in that domain were completed 117 | - *Notes*: Additional context on the evaluated scenarios 118 | """ 119 | texts.append(header) 120 | texts.append(f"\n\nUpdated on: {get_timestamp()}\n\n") 121 | texts.append("-" * header_len) 122 | texts.append(headers) 123 | texts.append(line_fmt.format(*("---" * 7))) 124 | for bench_line in bench_summary: 125 | texts.append(line_fmt.format(*bench_line)) 126 | 127 | return "\n".join(texts) 128 | 129 | def build_ciso_table(leaderboard) -> str: 130 | column_mapping = { 131 | "id": "Benchmark (ID)", 132 | "github_username_link": "Agent Submitter", 133 | "github_username_org": "Organization", 134 | "agent": "Agent Name", 135 | "incident_type": "Scenario Category", 136 | "score": "Score ⬆️", 137 | "mttr": "Mean Agent Execution Duration", 138 | "num_of_passed": "#Passed", 139 | "issue_link": "Issue Link", 140 | "date": "Date (UTC)", 141 | } 142 | columns = ["agent", "github_username_link", "github_username_org", "incident_type", "score", "num_of_passed", "mttr", "date", "issue_link"] 143 | headers = [column_mapping[col] for col in columns] 144 | 145 | texts = [] 146 | texts.append("## 📊 IT Bench Leaderboard (CISO)") 147 | header = """\ 148 | This leaderboard shows the performance of agents on CISO-related IT automation scenarios. 149 | For details on how to participate or interpret results, see the [README](../main/README.md). 150 | 151 | **Column Descriptions:** 152 | - *Score*: Average benchmark score across scenarios (1.0 = perfect) 153 | - *#Passed*: Number of scenarios successfully passed 154 | - *Mean Agent Execution Duration*: Average time taken across scenarios 155 | - *Scenario Category*: Categories of evaluated tasks (e.g., RHEL, Kyverno, etc.) 
156 | """ 157 | texts.append(header) 158 | texts.append(f"\n\nUpdated on: {get_timestamp()}\n\n") 159 | texts.append("---") 160 | texts.append("| " + " | ".join(headers) + " |") 161 | texts.append("|" + "|".join(["-" * (len(h) + 2) for h in headers]) + "|") 162 | 163 | for row in leaderboard: 164 | values = [] 165 | for col in columns: 166 | val = row.get(col, "") 167 | if col == "mttr": 168 | val = parse_json_timedelta(val) 169 | elif col == "date": 170 | val = get_timestamp(to_datetime(val)) 171 | elif isinstance(val, float): 172 | val = f"{val:.2f}" 173 | values.append(str(val)) 174 | texts.append("| " + " | ".join(values) + " |") 175 | return "\n".join(texts) 176 | 177 | def get_nested_value(metric_name, content) -> dict: 178 | metric_parent, metric = metric_name.split("__") 179 | nested_dict = content[metric_parent][metric] 180 | 181 | formatted_dict = {k: (lambda v: f"{v:.2f}" if isinstance(v, float) else v)(val) 182 | for k, val in nested_dict.items()} 183 | return json.dumps(formatted_dict) 184 | 185 | def build_sre_table(leaderboard) -> str: 186 | column_mapping = { 187 | "id": "Benchmark (ID)", 188 | "github_username_link": "Agent Submitter", 189 | "github_username_org": "Organization", 190 | "name_decorated": "Benchmark (Name)", 191 | "agent": "Agent (Name)", 192 | "incident_type": "Scenario Category", 193 | "trials": "Trials across incidents", 194 | "percent_agent_submitted_diagnosis_results": "Diagnosis received - % of Trials", 195 | "diagnosis__ntam_fault_localization": "Diagnosis - NTAM Fault Localization", 196 | "diagnosis__ntam_fault_propagation": "Diagnosis - NTAM Fault Propagation", 197 | "diagnosis__time_to_diagnosis": "Diagnosis - Time to Diagnosis", 198 | "diagnosis__duration_agent_tried_for_diagnosis": "Diagnosis - Duration agent tried for Diagnosis", 199 | "repair__time_to_repair": "Repair - Time to Repair", 200 | "percent_resolved": "% Resolved", 201 | "issue_link": "Issue Link", 202 | "date": "Date (UTC)", 203 | } 204 | columns = ["agent", 
"github_username_link", "github_username_org", 205 | "incident_type", "trials", 206 | "diagnosis__ntam_fault_localization", 207 | "diagnosis__ntam_fault_propagation", 208 | "diagnosis__time_to_diagnosis", 209 | "diagnosis__duration_agent_tried_for_diagnosis", 210 | "repair__time_to_repair", 211 | "percent_resolved", 212 | "date", "issue_link"] 213 | headers = [column_mapping[col] for col in columns] 214 | 215 | texts = [] 216 | texts.append("## 📊 IT Bench Leaderboard (SRE)") 217 | header = f"""\ 218 | This leaderboard shows the performance of agents on SRE-related IT automation scenarios. 219 | For details on how to participate or interpret results, see the [README](../main/README.md). 220 | 221 | **Column Descriptions:** 222 | - *Diagnosis - NTAM Fault Localization*: Normalized Topology Aware Metric (NTAM) Average Fault Propagation Chain 223 | - *Diagnosis - NTAM Fault Propagation*: NTAM Average Fault Localisation 224 | - *% Resolved*: Percentage of incidents repaired (mitigation efficiency) 225 | """ 226 | texts.append(header) 227 | texts.append(f"\n\nUpdated on: {get_timestamp()}\n\n") 228 | texts.append("---") 229 | texts.append("| " + " | ".join(headers) + " |") 230 | texts.append("|" + "|".join(["-" * (len(h) + 2) for h in headers]) + "|") 231 | 232 | for row in leaderboard: 233 | values = [] 234 | for col in columns: 235 | val = row.get(col, "") 236 | if col == "mttr": 237 | val = parse_json_timedelta(val) 238 | elif col == "date": 239 | val = get_timestamp(to_datetime(val)) 240 | elif (col == "diagnosis__ntam_fault_localization" or 241 | col == "diagnosis__ntam_fault_propagation" or 242 | col == "diagnosis__time_to_diagnosis" or 243 | col == "diagnosis__duration_agent_tried_for_diagnosis" or 244 | col == "repair__time_to_repair"): 245 | val = get_nested_value(col, row) 246 | elif col == "percent_resolved": 247 | val = row.get("repair", {}).get(col, 0.0) 248 | elif isinstance(val, float): 249 | val = f"{val:.2f}" 250 | values.append(str(val)) 251 | 
texts.append("| " + " | ".join(values) + " |") 252 | return "\n".join(texts) 253 | 254 | SAMPLE_DATA = [ 255 | { 256 | 'name': 'Run-2', 257 | 'incident_type': 'SRE', 258 | 'agent': 'Agent-104', 259 | 'results': [{}] * 10, 260 | 'mttr': 'PT0S', 261 | 'num_of_passed': 3, 262 | 'score': 0.3, 263 | 'date': '2025-03-11T13:54:23.576999Z', 264 | 'id': 'f324b0ca-5065-435e-a140-1db3f409926d', 265 | 'agent_type': 'SRE', 266 | 'github_username': 'Rohan-Arora', 267 | }, 268 | { 269 | 'name': 'My CISO Agent Benchmark', 270 | 'incident_type': 'Gen-CIS-b-K8s-Kyverno', 271 | 'agent': 'My CISO Agent (Yana)', 272 | 'results': [{}] * 10, 273 | 'mttr': 'PT1M5.70376S', 274 | 'num_of_passed': 3, 275 | 'score': 0.3, 276 | 'date': '2025-03-17T00:36:52.334468Z', 277 | 'id': '337e85bf-f29d-4b60-b159-6f66c9d6febe', 278 | 'agent_type': 'CISO', 279 | 'github_username': 'yana1205', 280 | }, 281 | { 282 | 'name': 'Top SRE Benchmark', 283 | 'incident_type': 'SRE', 284 | 'agent': 'Baseline SRE Agent', 285 | 'results': [{}] * 10, 286 | 'mttr': 'PT30S', 287 | 'num_of_passed': 7, 288 | 'score': 0.70, 289 | 'date': '2025-03-20T12:00:00Z', 290 | 'id': 'aaa-bbb', 291 | 'agent_type': 'SRE', 292 | 'github_username': 'sre_star', 293 | }, 294 | { 295 | 'name': 'Top CISO Benchmark', 296 | 'incident_type': 'Gen-CIS-b-RHEL9-Ansible-OPA', 297 | 'agent': 'Baseline CISO Agentp', 298 | 'results': [{}] * 10, 299 | 'mttr': 'PT1M', 300 | 'num_of_passed': 6, 301 | 'score': 0.6, 302 | 'date': '2025-03-20T12:10:00Z', 303 | 'id': 'ccc-ddd', 304 | 'agent_type': 'CISO', 305 | 'github_username': 'ciso_champ', 306 | }, 307 | ] 308 | 309 | 310 | if __name__ == "__main__": 311 | 312 | parser = argparse.ArgumentParser(description="Print IT Bench leaderboard") 313 | parser.add_argument("leaderboard") 314 | parser.add_argument("-u", "--github_username", type=str) 315 | parser.add_argument("-b", "--benchmark_id", type=str) 316 | parser.add_argument("--issues", type=str, required=True) 317 | parser.add_argument("--users", type=str, 
required=True) 318 | parser.add_argument("--out-ciso", type=str, required=True) 319 | parser.add_argument("--out-sre", type=str, required=True) 320 | parser.add_argument("--out-overall", type=str, required=True) 321 | parser.add_argument("--sample", action="store_true", help="Use sample data") 322 | args = parser.parse_args() 323 | if args.sample: 324 | leaderboard = SAMPLE_DATA 325 | # leaderboard_real = get_leaderboard(args.benchmark_id, args.github_username) 326 | leaderboard_real = [] 327 | leaderboard = leaderboard + leaderboard_real 328 | else: 329 | if args.leaderboard == "global": 330 | leaderboard = get_leaderboard() 331 | else: 332 | leaderboard = get_leaderboard(args.benchmark_id, args.github_username) 333 | 334 | with open(args.issues, "r") as f: 335 | issues = json.load(f) 336 | 337 | with open(args.users, "r") as f: 338 | users = json.load(f) 339 | 340 | benchmark_issue_mapping = {issue["benchmark_id"]: issue["number"] for issue in issues} 341 | for item in leaderboard: 342 | number = benchmark_issue_mapping.get(item["id"]) 343 | item["issue_link"] = f"[#{number}](https://github.com/{GH_REPO}/issues/{number})" if number else "Not Found" 344 | username = item.get("github_username") 345 | item["github_username_link"] = f"[{username}](https://github.com/{username})" if username else "N/A" 346 | company = users.get(username, {}).get("company") 347 | item["github_username_org"] = company if company else "" 348 | # temporal solution for SRE metrics 349 | if "score" not in item: 350 | item["score"] = item.get("percent_agent_submitted_diagnosis_results", 0.0) / 100 351 | if "num_of_passed" not in item: 352 | item["num_of_passed"] = int(item["score"] * 10) # treate number of pass as decile of score 353 | 354 | leaderboard = sorted(leaderboard, key=lambda x: x["score"], reverse=True) 355 | leaderboard_ciso = [x for x in leaderboard if x["agent_type"] == "CISO"] 356 | leaderboard_sre = [x for x in leaderboard if x["agent_type"] == "SRE"] 357 | 358 | 
overall_table = build_overall_table(leaderboard) 359 | with open(args.out_overall, "w") as f: 360 | f.write(overall_table) 361 | 362 | ciso_table = build_ciso_table(leaderboard_ciso) 363 | with open(args.out_ciso, "w") as f: 364 | f.write(ciso_table) 365 | 366 | sre_table = build_sre_table(leaderboard_sre) 367 | with open(args.out_sre, "w") as f: 368 | f.write(sre_table) 369 | -------------------------------------------------------------------------------- /.github/workflows/update_benchmark_helper.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import re 6 | import textwrap 7 | import urllib.request 8 | from dataclasses import asdict, dataclass, field 9 | from datetime import datetime, timedelta, timezone 10 | from typing import Any, Dict, List, Optional 11 | from urllib.parse import urlparse 12 | 13 | ITBENCH_API = os.getenv("ITBENCH_API") 14 | ITBENCH_API_TOKEN = os.getenv("ITBENCH_API_TOKEN") 15 | LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO") 16 | 17 | logger = logging.getLogger(__name__) 18 | loglevel = logging.getLevelNamesMapping().get(LOG_LEVEL, logging.INFO) 19 | logging.basicConfig(level=loglevel, format="%(asctime)s - %(levelname)s - %(message)s") 20 | logger.setLevel(loglevel) 21 | 22 | 23 | @dataclass 24 | class UpdatedIssue: 25 | number: int 26 | github_username: str 27 | benchmark_id: str 28 | comments: List[Dict[str, Any]] = field(default_factory=list) 29 | 30 | 31 | @dataclass 32 | class BenchmarkStatus: 33 | number: int 34 | github_username: str 35 | benchmark_id: str 36 | agent_type: str 37 | status: str 38 | error_message: Optional[str] = None 39 | results: List[Dict[str, Any]] = field(default_factory=list) 40 | scenario_name_description_map: Optional[Dict[str, str]] = None 41 | status_comment_id: Optional[str] = None 42 | 43 | 44 | @dataclass 45 | class BenchmarkStatusComment: 46 | number: int 47 | comment: str 48 | closed: bool 49 | 
status_comment_id: Optional[str] = None 50 | 51 | 52 | def output(args, data): 53 | if args.output: 54 | with open(args.output, "w") as f: 55 | f.write(data) 56 | else: 57 | print(data) 58 | 59 | 60 | class ParseCommand: 61 | 62 | def exec(self, args): 63 | with open(args.input, "r") as f: 64 | issues = json.load(f) 65 | 66 | updated_issues: List[UpdatedIssue] = [] 67 | for issue in issues: 68 | number = issue.get("number") 69 | author = issue.get("author", {}) 70 | comments = issue.get("comments", []) 71 | benchmark_id_comments = [{"comment": x, "benchmark_id": self.extract_benchmark_id(x)} for x in comments] 72 | benchmark_id_comment = [x for x in benchmark_id_comments if x.get("benchmark_id")] 73 | if len(benchmark_id_comment) == 0: 74 | logger.warning(f"No Benchmark ID comment found for issue {number}, skipping.") 75 | continue 76 | benchmark_id_comment = benchmark_id_comment[0] 77 | updated_issue = UpdatedIssue( 78 | number=number, 79 | github_username=author.get("login"), 80 | benchmark_id=benchmark_id_comment["benchmark_id"], 81 | comments=comments, 82 | ) 83 | updated_issues.append(updated_issue) 84 | 85 | data = json.dumps([asdict(x) for x in updated_issues], indent=2) 86 | output(args, data) 87 | 88 | def extract_benchmark_id(self, issue): 89 | pattern = r"" 90 | match = re.search(pattern, issue.get("body", "")) 91 | if match: 92 | return match.group("id") 93 | else: 94 | return None 95 | 96 | 97 | class StatusCommand: 98 | 99 | def exec(self, args): 100 | with open(args.input, "r") as f: 101 | updated_issues = json.load(f) 102 | 103 | updated_issues = [UpdatedIssue(**x) for x in updated_issues] 104 | benchmark_statuses: List[BenchmarkStatus] = [] 105 | for upd in updated_issues: 106 | github_username = upd.github_username 107 | benchmark_id = upd.benchmark_id 108 | 109 | # find existing status comment 110 | status_comment = [x for x in upd.comments if re.match(r"^### Status", x.get("body", ""))] 111 | if len(status_comment) == 0: 112 | status_comment_id 
= None 113 | else: 114 | # Example GitHub issue comment URL: 115 | # e.g., https://github.com/yana1205/gitops-bench-0310/issues/10#issuecomment-2726194238 116 | url = status_comment[0].get("url") # Retrieve the comment URL from the status data 117 | 118 | # Parse the URL and extract the fragment part (everything after "#") 119 | # The fragment contains the comment ID, formatted as "issuecomment-" 120 | parsed_url = urlparse(url) 121 | status_comment_id = parsed_url.fragment.replace("issuecomment-", "") # Extract only the numeric comment ID 122 | 123 | # get results of finished scenarios 124 | bench_results, error = self.request( 125 | f"{ITBENCH_API}/gitops/retrieve-results?benchmark_id={benchmark_id}&github_username={github_username}" 126 | ) 127 | if error: 128 | bs = self.to_benchmark_status( 129 | upd, error_message="Failed to get benchmark progress.", status="Unkown", status_comment_id=status_comment_id 130 | ) 131 | benchmark_statuses.append(bs) 132 | continue 133 | bench_result = bench_results[0] # benchmark_id is specified in query param so the response should contain only 1 item. 
134 | benchmark = bench_result.get("benchmark", {}) 135 | spec = benchmark.get("spec", {}) 136 | agent_type = spec.get("agent_type", None) 137 | status = benchmark.get("status", {}) 138 | phase = status.get("phase", "Errored") 139 | results = bench_result.get("results", {}) 140 | bs = self.to_benchmark_status( 141 | upd, 142 | agent_type=agent_type, 143 | status=phase, 144 | status_comment_id=status_comment_id, 145 | results=results, 146 | benchmark=benchmark, 147 | ) 148 | benchmark_statuses.append(bs) 149 | 150 | data = json.dumps([asdict(x) for x in benchmark_statuses], indent=2) 151 | output(args, data) 152 | 153 | def request(self, url): 154 | headers = {"Authorization": f"Bearer {ITBENCH_API_TOKEN}"} 155 | req = urllib.request.Request(url=url, headers=headers, method="GET") 156 | res = urllib.request.urlopen(req, timeout=10) 157 | if res.getcode() != 200: 158 | logger.error(f"Error requesting benchmark JSON: {res.status_code}. {res.content}") 159 | return None, True 160 | res_body = res.read() 161 | res_dict = json.loads(res_body.decode("utf-8")) 162 | return res_dict, False 163 | 164 | def to_benchmark_status( 165 | self, 166 | upd: UpdatedIssue, 167 | agent_type: str, 168 | status: str, 169 | status_comment_id, 170 | error_message: Optional[str] = None, 171 | results: List[Dict[str, Any]] = [], 172 | benchmark: Optional[Dict[str, Any]] = None, 173 | ): 174 | spec = benchmark.get("spec", {}) 175 | scenario_name_description_map = {x["spec"]["name"]: x["spec"]["description"] for x in spec.get("scenarios", [])} 176 | return BenchmarkStatus( 177 | number=upd.number, 178 | github_username=upd.github_username, 179 | benchmark_id=upd.benchmark_id, 180 | agent_type=agent_type, 181 | error_message=error_message, 182 | status=status, 183 | status_comment_id=status_comment_id, 184 | results=results, 185 | scenario_name_description_map=scenario_name_description_map, 186 | ) 187 | 188 | 189 | class CommentCommand: 190 | 191 | def exec(self, args): 192 | with 
open(args.input, "r") as f: 193 | benchmark_statuses = json.load(f) 194 | benchmark_statuses = [BenchmarkStatus(**x) for x in benchmark_statuses] 195 | 196 | benchmark_status_comments: List[BenchmarkStatusComment] = [] 197 | for benchmark_status in benchmark_statuses: 198 | if benchmark_status.error_message: 199 | comment = self.to_error_comment(benchmark_status) 200 | else: 201 | comment = self.to_comment(benchmark_status) 202 | closed = benchmark_status.status in ["Finished", "Errored"] 203 | bsc = BenchmarkStatusComment( 204 | number=benchmark_status.number, 205 | status_comment_id=benchmark_status.status_comment_id, 206 | comment=comment, 207 | closed=closed, 208 | ) 209 | benchmark_status_comments.append(bsc) 210 | 211 | data = "\n".join([json.dumps(asdict(x)) for x in benchmark_status_comments]) 212 | data += "\n" 213 | output(args, data) 214 | 215 | def to_comment(self, benchmark_status: BenchmarkStatus): 216 | if benchmark_status.agent_type == "CISO": 217 | table = self.to_table(benchmark_status) 218 | elif benchmark_status.agent_type == "SRE": 219 | table = self.to_table_sre(benchmark_status) 220 | else: 221 | table = "TBD" 222 | timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") 223 | return f"""\ 224 | ### Status 225 | 226 | #### Benchmark Status 227 | - **Benchmark ID**: {benchmark_status.benchmark_id} 228 | - **Status**: {benchmark_status.status} 229 | 230 | #### Results of Finished Scenarios 231 | {table} 232 | 233 | #### Last Updated: {timestamp} 234 | """ 235 | 236 | def to_error_comment(self, benchmark_status: BenchmarkStatus): 237 | return f""" 238 | ### Status 239 | 240 | #### Benchmark Status 241 | - **Benchmark ID**: {benchmark_status.benchmark_id} 242 | - **Status**: {benchmark_status.status} 243 | - **Message**: {benchmark_status.message} 244 | """ 245 | 246 | def parse_ttr(self, ttr): 247 | if not ttr: 248 | return "N/A" 249 | 250 | match = re.match(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+(?:\.\d+)?)S)?", ttr) 251 | if not match: 
252 | return "Invalid" 253 | 254 | hours = int(match.group(1)) if match.group(1) else 0 255 | minutes = int(match.group(2)) if match.group(2) else 0 256 | seconds = float(match.group(3)) if match.group(3) else 0.0 257 | return str(int(timedelta(hours=hours, minutes=minutes, seconds=seconds).total_seconds())) + "s" 258 | 259 | def to_table(self, benchmark_status: BenchmarkStatus): 260 | results = benchmark_status.results 261 | table = [] 262 | 263 | table.append("| Scenario Name | Description | Passed | Time To Resolve | Error | Message | Date |") 264 | table.append("|---------------|-------------|--------|-----------------|-------|---------|------|") 265 | 266 | for result in results: 267 | spec = result["spec"] 268 | name = spec["name"] 269 | description = spec["description"] 270 | passed = "✅" if spec["passed"] else "❌" 271 | errored = "Error" if spec["errored"] else "No error" 272 | ttr = self.parse_ttr(spec["ttr"]) 273 | date = spec["date"] 274 | message_text = textwrap.shorten(spec["message"], width=50, placeholder="...") 275 | table.append(f"| {name} | {description} | {passed} | {ttr} | {errored} | {message_text} | {date} |") 276 | 277 | return "\n".join(table) 278 | 279 | 280 | def to_table_sre(self, benchmark_status: BenchmarkStatus): 281 | results = benchmark_status.results 282 | table = [] 283 | 284 | table.append( 285 | "| Passed | Error | Trials | Date |" 286 | ) 287 | table.append( 288 | "|--------|-------|--------|------|" 289 | ) 290 | 291 | for result in results: 292 | spec = result["spec"] 293 | name = spec["name"] 294 | description = spec["description"] 295 | if not description or description == "": 296 | description = benchmark_status.scenario_name_description_map.get(name) 297 | passed = "✅" if spec["passed"] else "❌" 298 | errored = "Error" if spec["errored"] else "No error" 299 | date = spec["date"] 300 | 301 | try: 302 | message_data = json.loads(spec["message"]) 303 | 304 | trials = message_data.get("trials", "N/A") 305 | 306 | # Extract 
diagnosis data (commented out from table but kept for potential future use) 307 | diagnosis = message_data.get("diagnosis", {}) 308 | ntam_fault_localization = diagnosis.get("ntam_fault_localization", {}).get("mean", "N/A") 309 | ntam_fault_propagation = diagnosis.get("ntam_fault_propagation", {}).get("mean", "N/A") 310 | time_to_diagnosis = diagnosis.get("time_to_diagnosis", {}).get("mean", "N/A") 311 | duration_agent_tried = diagnosis.get("duration_agent_tried_for_diagnosis", {}).get("mean", "N/A") 312 | 313 | # Extract repair data (commented out from table but kept for potential future use) 314 | repair = message_data.get("repair", {}) 315 | time_to_repair = repair.get("time_to_repair", {}).get("mean", "N/A") 316 | percent_resolved = repair.get("percent_resolved", "N/A") 317 | 318 | def format_value(value): 319 | if value == "N/A" or value is None: 320 | return "N/A" 321 | elif value == float('inf') or str(value) == "Infinity": 322 | return "∞" 323 | elif isinstance(value, (int, float)): 324 | return f"{value:.2f}" 325 | else: 326 | return str(value) 327 | 328 | trials_str = str(trials) if trials != "N/A" else "N/A" 329 | # Format diagnostic and repair values (commented out from table but kept for potential future use) 330 | # ntam_fault_localization_str = format_value(ntam_fault_localization) 331 | # ntam_fault_propagation_str = format_value(ntam_fault_propagation) 332 | # time_to_diagnosis_str = format_value(time_to_diagnosis) 333 | # duration_agent_tried_str = format_value(duration_agent_tried) 334 | # time_to_repair_str = format_value(time_to_repair) 335 | # percent_resolved_str = format_value(percent_resolved) 336 | 337 | except (json.JSONDecodeError, KeyError, TypeError) as e: 338 | # If JSON parsing fails or data is missing, use N/A for all fields 339 | trials_str = "N/A" 340 | # Commented out diagnostic and repair fields (kept for potential future use) 341 | # ntam_fault_localization_str = "N/A" 342 | # ntam_fault_propagation_str = "N/A" 343 | # 
time_to_diagnosis_str = "N/A" 344 | # duration_agent_tried_str = "N/A" 345 | # time_to_repair_str = "N/A" 346 | # percent_resolved_str = "N/A" 347 | 348 | table.append( 349 | f"| {passed} | {errored} | {trials_str} | {date} |" 350 | ) 351 | 352 | return "\n".join(table) 353 | 354 | def main(): 355 | parser = argparse.ArgumentParser() 356 | subparsers = parser.add_subparsers(dest="command", required=True) 357 | 358 | parser_parse = subparsers.add_parser("parse", help="Parse issues.json, filter by track flag, extract benchmark id") 359 | parser_parse.add_argument("-i", "--input", required=True, help="Input file (issues.json)") 360 | parser_parse.add_argument("-o", "--output", help="Output file (Default. stdout)") 361 | parser_parse.set_defaults(func=ParseCommand().exec) 362 | 363 | parser_status = subparsers.add_parser("status", help="Get progress and current results of the benchmark") 364 | parser_status.add_argument("-i", "--input", required=True, help="Input file (parsed issues)") 365 | parser_status.add_argument("-o", "--output", help="Output file (Default. stdout)") 366 | parser_status.set_defaults(func=StatusCommand().exec) 367 | 368 | parser_status = subparsers.add_parser("comment", help="Create comment from benchmark statuses") 369 | parser_status.add_argument("-i", "--input", required=True, help="Input file (benchmark_statuses.json)") 370 | parser_status.add_argument("-o", "--output", help="Output file (Default. stdout)") 371 | parser_status.set_defaults(func=CommentCommand().exec) 372 | 373 | args = parser.parse_args() 374 | args.func(args) 375 | 376 | 377 | if __name__ == "__main__": 378 | main() 379 | --------------------------------------------------------------------------------