├── .asset
│   ├── openrca.png
│   └── rcaagent.png
├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
├── SECURITY.md
├── SUPPORT.md
├── dataset
│   └── README.md
├── docs
│   ├── .gitignore
│   ├── eslint.config.js
│   ├── index.html
│   ├── package-lock.json
│   ├── package.json
│   ├── public
│   │   ├── cuhksz.png
│   │   ├── microsoft.jpg
│   │   ├── ms_logo.svg
│   │   ├── overview.pdf
│   │   ├── overview.png
│   │   ├── overview.svg
│   │   ├── thu.jpg
│   │   └── vite.svg
│   ├── src
│   │   ├── App.tsx
│   │   ├── components
│   │   │   └── Layout.tsx
│   │   ├── data
│   │   │   └── modelData.ts
│   │   ├── index.css
│   │   ├── main.tsx
│   │   └── pages
│   │       └── Home.tsx
│   ├── tsconfig.app.json
│   ├── tsconfig.json
│   ├── tsconfig.node.json
│   └── vite.config.ts
├── main
│   ├── evaluate.py
│   ├── generate.py
│   ├── prompt.py
│   └── task_specification.json
├── rca
│   ├── api_config.yaml
│   ├── api_router.py
│   ├── archive
│   │   ├── agent-Bank.csv
│   │   ├── agent-Market-cloudbed-1.csv
│   │   ├── agent-Market-cloudbed-2.csv
│   │   └── agent-Telecom.csv
│   ├── baseline
│   │   ├── cot_lm.py
│   │   ├── direct_lm.py
│   │   ├── oracle_kpis.py
│   │   └── rca_agent
│   │       ├── controller.py
│   │       ├── executor.py
│   │       ├── prompt
│   │       │   ├── agent_prompt.py
│   │       │   ├── basic_prompt_Bank.py
│   │       │   ├── basic_prompt_Market.py
│   │       │   └── basic_prompt_Telecom.py
│   │       └── rca_agent.py
│   ├── run_agent_standard.py
│   ├── run_sampling_balanced.py
│   ├── run_sampling_oracle.py
│   └── test.sh
└── requirements.txt
/.asset/openrca.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/OpenRCA/bc7aba3a01513f59ea6f2ff7788df1c2be7bf0b8/.asset/openrca.png
--------------------------------------------------------------------------------
/.asset/rcaagent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/OpenRCA/bc7aba3a01513f59ea6f2ff7788df1c2be7bf0b8/.asset/rcaagent.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # OpenRCA
2 | .temp/
3 | dataset/
4 | test/
5 | api_config.yaml
6 |
7 | # Mac
8 | .DS_Store
9 |
10 | # Byte-compiled / optimized / DLL files
11 | __pycache__/
12 | *.py[cod]
13 | *$py.class
14 |
15 | # C extensions
16 | *.so
17 |
18 | # Distribution / packaging
19 | .Python
20 | build/
21 | develop-eggs/
22 | dist/
23 | downloads/
24 | eggs/
25 | .eggs/
26 | lib/
27 | lib64/
28 | parts/
29 | sdist/
30 | var/
31 | wheels/
32 | share/python-wheels/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 | MANIFEST
37 |
38 | # PyInstaller
39 | # Usually these files are written by a python script from a template
40 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
41 | *.manifest
42 | *.spec
43 |
44 | # Installer logs
45 | pip-log.txt
46 | pip-delete-this-directory.txt
47 |
48 | # Unit test / coverage reports
49 | htmlcov/
50 | .tox/
51 | .nox/
52 | .coverage
53 | .coverage.*
54 | .cache
55 | nosetests.xml
56 | coverage.xml
57 | *.cover
58 | *.py,cover
59 | .hypothesis/
60 | .pytest_cache/
61 | cover/
62 |
63 | # Translations
64 | *.mo
65 | *.pot
66 |
67 | # Django stuff:
68 | *.log
69 | local_settings.py
70 | db.sqlite3
71 | db.sqlite3-journal
72 |
73 | # Flask stuff:
74 | instance/
75 | .webassets-cache
76 |
77 | # Scrapy stuff:
78 | .scrapy
79 |
80 | # Sphinx documentation
81 | docs/_build/
82 |
83 | # PyBuilder
84 | .pybuilder/
85 | target/
86 |
87 | # Jupyter Notebook
88 | .ipynb_checkpoints
89 |
90 | # IPython
91 | profile_default/
92 | ipython_config.py
93 |
94 | # pyenv
95 | # For a library or package, you might want to ignore these files since the code is
96 | # intended to run in multiple environments; otherwise, check them in:
97 | # .python-version
98 |
99 | # pipenv
100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
103 | # install all needed dependencies.
104 | #Pipfile.lock
105 |
106 | # poetry
107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108 | # This is especially recommended for binary packages to ensure reproducibility, and is more
109 | # commonly ignored for libraries.
110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111 | #poetry.lock
112 |
113 | # pdm
114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115 | #pdm.lock
116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117 | # in version control.
118 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
119 | .pdm.toml
120 | .pdm-python
121 | .pdm-build/
122 |
123 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
124 | __pypackages__/
125 |
126 | # Celery stuff
127 | celerybeat-schedule
128 | celerybeat.pid
129 |
130 | # SageMath parsed files
131 | *.sage.py
132 |
133 | # Environments
134 | .env
135 | .venv
136 | env/
137 | venv/
138 | ENV/
139 | env.bak/
140 | venv.bak/
141 |
142 | # Spyder project settings
143 | .spyderproject
144 | .spyproject
145 |
146 | # Rope project settings
147 | .ropeproject
148 |
149 | # mkdocs documentation
150 | /site
151 |
152 | # mypy
153 | .mypy_cache/
154 | .dmypy.json
155 | dmypy.json
156 |
157 | # Pyre type checker
158 | .pyre/
159 |
160 | # pytype static type analyzer
161 | .pytype/
162 |
163 | # Cython debug symbols
164 | cython_debug/
165 |
166 | # PyCharm
167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
169 | # and can be added to the global gitignore or merged into this file. For a more nuclear
170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
171 | #.idea/
172 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OpenRCA
2 |
3 | 
4 | [](https://opensource.org/licenses/MIT)
5 | 
6 |
7 |
8 |
9 | OpenRCA is a benchmark for assessing LLMs' root cause analysis ability in a software operating scenario. When given a natural language query, LLMs need to analyze large volumes of telemetry data to identify the relevant root cause elements. This process requires the models to understand complex system dependencies and perform comprehensive reasoning across various types of telemetry data, including KPI time series, dependency trace graphs, and semi-structured log text.
10 |
11 | ![OpenRCA](.asset/openrca.png)
12 |
13 |
14 |
15 | We also introduce RCA-agent as a baseline for OpenRCA. By using Python for data retrieval and analysis, the model avoids processing overly long contexts, enabling it to focus on reasoning while remaining scalable to extensive telemetry.
16 |
17 | ![RCA-agent](.asset/rcaagent.png)
18 |
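The core idea can be sketched in a few lines (an illustration only, not the actual implementation in `rca/baseline/rca_agent/`): the agent emits small snippets of analysis code, and only their printed output is fed back to the LLM.

```python
import contextlib
import io

def execute_python(code: str) -> str:
    """Run a generated analysis snippet and capture its printed output as the observation."""
    buffer = io.StringIO()
    with contextlib.redirect_stdout(buffer):
        exec(code, {})  # the real executor would sandbox this and persist state across steps
    return buffer.getvalue()

# Rather than pasting raw telemetry into the prompt, the controller LLM generates
# snippets like this one and sees only the short printed summary:
print(execute_python("print('anomalous minutes:', 3)"))
```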
19 | ## ✨ Quick Start
20 |
21 | > ⚠️ Since the OpenRCA dataset includes a large amount of telemetry and RCA-agent requires extensive memory operations, we recommend using a device with at least 80GB of storage space and 32GB of memory.
22 |
23 | ### 🛠️ Installation
24 |
25 | OpenRCA requires **Python >= 3.10**. It can be installed by running the following command:
26 | ```bash
27 | # [optional to create conda environment]
28 | # conda create -n openrca python=3.10
29 | # conda activate openrca
30 |
31 | # clone the repository
32 | git clone https://github.com/microsoft/OpenRCA.git
33 | cd OpenRCA
34 | # install the requirements
35 | pip install -r requirements.txt
36 | ```
37 |
38 | The telemetry data can be downloaded from [Google Drive](https://drive.google.com/drive/folders/1wGiEnu4OkWrjPxfx5ZTROnU37-5UDoPM?usp=drive_link). Once you have downloaded the telemetry dataset, please put it into the `dataset/` directory (which is currently empty).
39 |
40 | The directory structure of the data is:
41 |
42 | ```
43 | .
44 | ├── {SYSTEM}
45 | │ ├── query.csv
46 | │ ├── record.csv
47 | │ └── telemetry
48 | │ ├── {DATE}
49 | │ │ ├── log
50 | │ │ ├── metric
51 | │ │ └── trace
52 | │ └── ...
53 | └── ...
54 | ```
55 |
56 | where the `{SYSTEM}` can be `Telecom`, `Bank`, or `Market`, and the `{DATE}` format is `{YYYY_MM_DD}`.
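For a quick sanity check after downloading, a snippet along these lines (a minimal sketch assuming the layout above) lists the telemetry dates available for each system:

```python
from pathlib import Path

# Print the telemetry dates available for each (sub)system under dataset/.
# Note that Market is further split into cloudbed-1 and cloudbed-2.
for telemetry in sorted(Path("dataset").glob("**/telemetry")):
    dates = sorted(d.name for d in telemetry.iterdir() if d.is_dir())
    print(telemetry.parent.relative_to("dataset"), dates)
```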
57 |
58 | ### 🖊️ Evaluation
59 |
60 | Use the following command to evaluate:
61 |
62 | ```bash
63 | python -m main.evaluate \
64 | -p [prediction csv files to evaluate] \
65 | -q [groundtruth csv files to evaluate] \
66 | -r [report csv file to save]
67 | ```
68 |
69 | Note that the prediction CSV file must include at least a "prediction" field for valid evaluation (extra fields are allowed). Each prediction should be a JSON-like string containing all required elements for each query (extra elements are allowed). If there are multiple failures, list them in chronological order (e.g., 1, 2, 3, ...):
70 |
71 |
72 | ```json
73 | {
74 | "1": {
75 | "root cause occurrence datetime": "[%Y-%m-%d %H:%M:%S]",
76 | "root cause component": "[COMPONENT]",
77 | "root cause reason": "[REASON]"
78 | },
79 | ...
80 | }
81 | ```
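Concretely, a minimal prediction CSV of this shape could be assembled like so (a sketch; the output file name and the single hard-coded answer are purely illustrative):

```python
import json
import pandas as pd

# One JSON-like prediction string per row, in the same order as the rows of query.csv.
predictions = [
    json.dumps({
        "1": {
            "root cause occurrence datetime": "2022-03-20 09:45:00",
            "root cause component": "apache01",
            "root cause reason": "high CPU usage",
        }
    }),
]
pd.DataFrame({"prediction": predictions}).to_csv("my_predictions.csv", index=False)
```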
82 |
83 | For example, to evaluate the archived predictions of RCA-agent (Claude ver.), you can use the following command:
84 |
85 | ```bash
86 | python -m main.evaluate \
87 | -p \
88 | rca/archive/agent-Bank.csv \
89 | rca/archive/agent-Market-cloudbed-1.csv \
90 | rca/archive/agent-Market-cloudbed-2.csv \
91 | rca/archive/agent-Telecom.csv \
92 | -q \
93 | dataset/Bank/query.csv \
94 | dataset/Market/cloudbed-1/query.csv \
95 | dataset/Market/cloudbed-2/query.csv \
96 | dataset/Telecom/query.csv \
97 | -r \
98 | test/agent_claude.csv
99 | ```
100 |
101 | ### 🚩 Reproduction
102 |
103 | To reproduce the results in the paper, please first set up your API configuration before running OpenRCA's baselines. Taking OpenAI as an example, you can configure the `rca/api_config.yaml` file as follows:
104 |
105 | ```yaml
106 | SOURCE: "OpenAI"
107 | MODEL: "gpt-4o-2024-05-13"
108 | API_KEY: "sk-xxxxxxxxxxxxxx"
109 | ```
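`rca/api_router.py` also accepts `Google` and `Anthropic` as `SOURCE` values, as well as `AI` for any third-party endpoint compatible with the OpenAI API; in that case the endpoint URL goes into `API_BASE`, for example (all values below are placeholders):

```yaml
SOURCE: "AI"
MODEL: "your-model-name"
API_KEY: "your-api-key"
API_BASE: "https://your-openai-compatible-endpoint/v1"
```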
110 |
111 | Then, run the following commands for result reproduction:
112 |
113 | ```bash
114 | python -m rca.{TESTS} --dataset {DATASET_NAME}
115 | # Optional tests: run_agent_standard, run_sampling_balanced, run_sampling_oracle
116 | # Optional datasets: Telecom, Bank, Market/cloudbed-1, Market/cloudbed-2
117 | ```
118 |
119 | For example, if you want to evaluate RCA-agent on the Bank dataset, you should use the following command:
120 |
121 | ```bash
122 | python -m rca.run_agent_standard --dataset Bank
123 | ```
124 |
125 | Note that the telemetry of the two Market cloudbed service groups is collected separately. For example, to evaluate RCA-agent on the whole Market dataset, you should run both of the following commands:
126 |
127 | ```bash
128 | python -m rca.run_agent_standard --dataset Market/cloudbed-1
129 | python -m rca.run_agent_standard --dataset Market/cloudbed-2
130 | ```
131 |
132 | The generated results and monitor files can be found in a new `test` directory created after running any test script.
133 |
134 | ### 💽 Reconstruction
135 |
136 | You can generate new tasks for OpenRCA telemetry, or for your own private telemetry, by modifying `main/task_specification.json` and running the following command:
137 |
138 | ```bash
139 | python -m main.generate \
140 | -s [your specification config file] \
141 | -r [record file to generate query] \
142 | -q [query file to save] \
143 | -t [timezone of telemetry]
144 | ```
145 |
146 | Note that the record schema should be consistent with the `record.csv` of OpenRCA.
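Judging from the fields read by `main/generate.py`, a compatible record file needs at least `timestamp` (Unix seconds), `component`, and `reason` columns; a hypothetical row might look like:

```csv
timestamp,component,reason
1647768300,apache01,high CPU usage
```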
147 |
148 | You can also re-generate random queries of OpenRCA with the following command:
149 |
150 | ```bash
151 | python -m main.generate -d True
152 | ```
153 |
154 | ## 📚 Citation
155 |
156 | If you use OpenRCA in your research, please cite our paper:
157 |
158 | ```bibtex
159 | @inproceedings{
160 | xu2025openrca,
161 | title={OpenRCA: Can Large Language Models Locate the Root Cause of Software Failures?},
162 | author={Xu, Junjielong and Zhang, Qinan and Zhong, Zhiqing and He, Shilin and Zhang, Chaoyun and Lin, Qingwei and Pei, Dan and He, Pinjia and Zhang, Dongmei and Zhang, Qi},
163 | booktitle={The Thirteenth International Conference on Learning Representations},
164 | year={2025},
165 | url={https://openreview.net/forum?id=M4qNIzQYpd}
166 | }
167 | ```
168 |
169 | ## Trademarks
170 |
171 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
172 | trademarks or logos is subject to and must follow
173 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
174 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
175 | Any use of third-party trademarks or logos is subject to those third parties' policies.
176 |
177 | ## Disclaimer
178 | The recommended models in this Repo are just examples, used to explore the potential of agent systems with the paper at ICLR 2025. Users can replace the models in this Repo according to their needs. When using the recommended models in this Repo, you need to comply with the licenses of these models respectively. Microsoft shall not be held liable for any infringement of third-party rights resulting from your usage of this Repo. Users agree to defend, indemnify and hold Microsoft harmless from and against all damages, costs, and attorneys' fees in connection with any claims arising from this Repo. If anyone believes that this Repo infringes on your rights, please notify the project owner by email.
179 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
1 | # TODO: The maintainer of this repo has not yet edited this file
2 |
3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
4 |
5 | - **No CSS support:** Fill out this template with information about how to file issues and get help.
6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
8 |
9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10 |
11 | # Support
12 |
13 | ## How to file issues and get help
14 |
15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing
16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or
17 | feature request as a new Issue.
18 |
19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22 |
23 | ## Microsoft Support Policy
24 |
25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
26 |
--------------------------------------------------------------------------------
/dataset/README.md:
--------------------------------------------------------------------------------
1 | # Dataset
2 |
3 | Please download all datasets from [Google Drive](https://drive.google.com/drive/folders/1wGiEnu4OkWrjPxfx5ZTROnU37-5UDoPM?usp=drive_link).
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | pnpm-debug.log*
8 | lerna-debug.log*
9 |
10 | node_modules
11 | dist
12 | dist-ssr
13 | *.local
14 |
15 | # Editor directories and files
16 | .vscode/*
17 | !.vscode/extensions.json
18 | .idea
19 | .DS_Store
20 | *.suo
21 | *.ntvs*
22 | *.njsproj
23 | *.sln
24 | *.sw?
25 |
--------------------------------------------------------------------------------
/docs/eslint.config.js:
--------------------------------------------------------------------------------
1 | import js from '@eslint/js'
2 | import globals from 'globals'
3 | import reactHooks from 'eslint-plugin-react-hooks'
4 | import reactRefresh from 'eslint-plugin-react-refresh'
5 | import tseslint from 'typescript-eslint'
6 |
7 | export default tseslint.config(
8 | { ignores: ['dist'] },
9 | {
10 | extends: [js.configs.recommended, ...tseslint.configs.recommended],
11 | files: ['**/*.{ts,tsx}'],
12 | languageOptions: {
13 | ecmaVersion: 2020,
14 | globals: globals.browser,
15 | },
16 | plugins: {
17 | 'react-hooks': reactHooks,
18 | 'react-refresh': reactRefresh,
19 | },
20 | rules: {
21 | ...reactHooks.configs.recommended.rules,
22 | 'react-refresh/only-export-components': [
23 | 'warn',
24 | { allowConstantExport: true },
25 | ],
26 | },
27 | },
28 | )
29 |
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 | <!doctype html>
2 | <html lang="en">
3 |   <head>
4 |     <meta charset="UTF-8" />
5 |     <link rel="icon" type="image/svg+xml" href="/vite.svg" />
6 |     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7 |     <title>OpenRCA</title>
8 |   </head>
9 |   <body>
10 |     <div id="root"></div>
11 |     <script type="module" src="/src/main.tsx"></script>
12 |   </body>
13 | </html>
14 |
--------------------------------------------------------------------------------
/docs/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "leadboard",
3 | "private": true,
4 | "version": "0.0.0",
5 | "type": "module",
6 | "homepage": "https://github.com/shuaijiumei/openrca_leadboard",
7 | "scripts": {
8 | "dev": "vite",
9 | "build": "tsc && vite build",
10 | "lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0",
11 | "preview": "vite preview",
12 | "predeploy": "npm run build",
13 | "deploy": "gh-pages -d dist"
14 | },
15 | "dependencies": {
16 | "@emotion/react": "^11.11.3",
17 | "@emotion/styled": "^11.11.0",
18 | "@mui/icons-material": "^5.15.10",
19 | "@mui/material": "^5.15.10",
20 | "react": "^18.2.0",
21 | "react-dom": "^18.2.0",
22 | "react-router-dom": "^6.22.1"
23 | },
24 | "devDependencies": {
25 | "@types/react": "^18.2.55",
26 | "@types/react-dom": "^18.2.19",
27 | "@typescript-eslint/eslint-plugin": "^6.21.0",
28 | "@typescript-eslint/parser": "^6.21.0",
29 | "@vitejs/plugin-react": "^4.2.1",
30 | "eslint": "^8.56.0",
31 | "eslint-plugin-react-hooks": "^4.6.0",
32 | "eslint-plugin-react-refresh": "^0.4.5",
33 | "gh-pages": "^6.3.0",
34 | "typescript": "^5.2.2",
35 | "vite": "^6.2.5"
36 | }
37 | }
--------------------------------------------------------------------------------
/docs/public/cuhksz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/OpenRCA/bc7aba3a01513f59ea6f2ff7788df1c2be7bf0b8/docs/public/cuhksz.png
--------------------------------------------------------------------------------
/docs/public/microsoft.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/OpenRCA/bc7aba3a01513f59ea6f2ff7788df1c2be7bf0b8/docs/public/microsoft.jpg
--------------------------------------------------------------------------------
/docs/public/ms_logo.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/public/overview.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/OpenRCA/bc7aba3a01513f59ea6f2ff7788df1c2be7bf0b8/docs/public/overview.pdf
--------------------------------------------------------------------------------
/docs/public/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/OpenRCA/bc7aba3a01513f59ea6f2ff7788df1c2be7bf0b8/docs/public/overview.png
--------------------------------------------------------------------------------
/docs/public/thu.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/OpenRCA/bc7aba3a01513f59ea6f2ff7788df1c2be7bf0b8/docs/public/thu.jpg
--------------------------------------------------------------------------------
/docs/public/vite.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/src/App.tsx:
--------------------------------------------------------------------------------
1 | import { BrowserRouter as Router, Routes, Route } from 'react-router-dom';
2 | import Layout from './components/Layout';
3 | import Home from './pages/Home';
4 | import { ThemeProvider, createTheme } from '@mui/material';
5 |
6 | const theme = createTheme({
7 | typography: {
8 | fontFamily: '"Inter", "Helvetica", "Arial", sans-serif',
9 | },
10 | palette: {
11 | primary: {
12 | main: '#1976d2',
13 | },
14 | },
15 | });
16 |
17 | function App() {
18 |   return (
19 |     <ThemeProvider theme={theme}>
20 |       <Router>
21 |         <Layout>
22 |           <Routes>
23 |             <Route path="/" element={<Home />} />
24 |             {/* Add more routes as needed */}
25 |           </Routes>
26 |         </Layout>
27 |       </Router>
28 |     </ThemeProvider>
29 |   );
30 | }
31 |
32 | export default App;
--------------------------------------------------------------------------------
/docs/src/components/Layout.tsx:
--------------------------------------------------------------------------------
1 | import { AppBar, Toolbar, Typography, Button, Box } from '@mui/material';
2 | import { Link } from 'react-router-dom';
3 |
4 | interface LayoutProps {
5 | children: React.ReactNode;
6 | }
7 |
8 | const Layout: React.FC<LayoutProps> = ({ children }) => {
9 |   return (
10 |     <Box sx={{ flexGrow: 1 }}>
11 |       <AppBar position="static">
12 |         <Toolbar>
13 |           <Typography variant="h6" component="div" sx={{ flexGrow: 1 }}>
14 |             OpenRCA
15 |           </Typography>
16 |           {/* */}
17 |           <Button color="inherit" component={Link} to="/">
18 |             Home
19 |           </Button>
20 |         </Toolbar>
21 |       </AppBar>
22 |       <Box component="main">
23 |         {children}
24 |       </Box>
25 |     </Box>
26 |   );
27 | };
28 |
29 | export default Layout;
--------------------------------------------------------------------------------
/docs/src/data/modelData.ts:
--------------------------------------------------------------------------------
1 | export interface Data {
2 | name: string;
3 | model: string;
4 | org: string;
5 | correct: string;
6 | partial: string;
7 | date: string;
8 | }
9 |
10 | // Model color mapping
11 | export const modelColorMap: { [key: string]: { color: string, backgroundColor: string } } = {
12 | 'Claude 3.5 Sonnet': { color: '#1a237e', backgroundColor: '#e8eaf6' },
13 | 'GPT-4o': { color: '#004d40', backgroundColor: '#e0f2f1' },
14 | 'Gemini 1.5 Pro': { color: '#b71c1c', backgroundColor: '#ffebee' },
15 | 'Mistral Large 2': { color: '#0d47a1', backgroundColor: '#bbdefb' },
16 | 'Command R+': { color: '#4a148c', backgroundColor: '#e1bee7' },
17 | 'Llama 3.1 Instruct': { color: '#e65100', backgroundColor: '#ffe0b2' }
18 | };
19 |
20 | // Organization logo mapping
21 | export const orgLogoMap: { [key: string]: string } = {
22 | 'Microsoft': '/OpenRCA/ms_logo.svg',
23 | 'Google': '/OpenRCA/google_logo.svg',
24 | 'OpenAI': '/OpenRCA/openai_logo.svg',
25 | 'Anthropic': '/OpenRCA/anthropic_logo.svg',
26 | 'Meta': '/OpenRCA/meta_logo.svg'
27 | };
28 |
29 | // News data
30 | export const news = [
31 | { date: '2025/1/23', content: 'Our paper has been accepted by ICLR 2025.' },
32 | { date: '2025/1/23', content: 'Released OpenRCA dataset with 335 failure cases.' }
33 | ];
34 |
35 | // Model data
36 | export const modelData: Data[] = [
37 | // Closed Models - RCA-Agent
38 | { name: 'RCA-Agent', model: 'Claude 3.5 Sonnet', org: 'Microsoft', correct: '11.34%', partial: '17.31%', date: '2025/1/23' },
39 | { name: 'RCA-Agent', model: 'GPT-4o', org: 'Microsoft', correct: '8.96%', partial: '17.91%', date: '2025/1/23' },
40 | { name: 'RCA-Agent', model: 'Gemini 1.5 Pro', org: 'Microsoft', correct: '2.69%', partial: '6.87%', date: '2025/1/23' },
41 |
42 | // Closed Models - Balanced
43 | { name: 'Prompting (Balanced)', model: 'Claude 3.5 Sonnet', org: 'Microsoft', correct: '3.88%', partial: '18.81%', date: '2025/1/23' },
44 | { name: 'Prompting (Balanced)', model: 'GPT-4o', org: 'Microsoft', correct: '3.28%', partial: '14.33%', date: '2025/1/23' },
45 | { name: 'Prompting (Balanced)', model: 'Gemini 1.5 Pro', org: 'Microsoft', correct: '6.27%', partial: '24.18%', date: '2025/1/23' },
46 |
47 | // Closed Models - Oracle
48 | { name: 'Prompting (Oracle)', model: 'Claude 3.5 Sonnet', org: 'Microsoft', correct: '5.37%', partial: '17.61%', date: '2025/1/23' },
49 | { name: 'Prompting (Oracle)', model: 'GPT-4o', org: 'Microsoft', correct: '6.27%', partial: '15.82%', date: '2025/1/23' },
50 | { name: 'Prompting (Oracle)', model: 'Gemini 1.5 Pro', org: 'Microsoft', correct: '7.16%', partial: '23.58%', date: '2025/1/23' },
51 |
52 | // Open Source Models - Balanced
53 | { name: 'Prompting (Balanced)', model: 'Mistral Large 2', org: 'Microsoft', correct: '3.58%', partial: '6.40%', date: '2025/1/23' },
54 | { name: 'Prompting (Balanced)', model: 'Command R+', org: 'Microsoft', correct: '4.18%', partial: '8.96%', date: '2025/1/23' },
55 | { name: 'Prompting (Balanced)', model: 'Llama 3.1 Instruct', org: 'Microsoft', correct: '2.99%', partial: '14.63%', date: '2025/1/23' },
56 |
57 | // Open Source Models - Oracle
58 | { name: 'Prompting (Oracle)', model: 'Mistral Large 2', org: 'Microsoft', correct: '4.48%', partial: '10.45%', date: '2025/1/23' },
59 | { name: 'Prompting (Oracle)', model: 'Command R+', org: 'Microsoft', correct: '4.78%', partial: '7.46%', date: '2025/1/23' },
60 | { name: 'Prompting (Oracle)', model: 'Llama 3.1 Instruct', org: 'Microsoft', correct: '3.88%', partial: '14.93%', date: '2025/1/23' },
61 |
62 | // Open Source Models - RCA-Agent
63 | { name: 'RCA-Agent', model: 'Llama 3.1 Instruct', org: 'Microsoft', correct: '3.28%', partial: '5.67%', date: '2025/1/23' }
64 | ];
--------------------------------------------------------------------------------
/docs/src/index.css:
--------------------------------------------------------------------------------
1 | :root {
2 | font-family: Inter, system-ui, Avenir, Helvetica, Arial, sans-serif;
3 | line-height: 1.5;
4 | font-weight: 400;
5 |
6 | color-scheme: light dark;
7 | color: #213547;
8 | background-color: #ffffff;
9 |
10 | font-synthesis: none;
11 | text-rendering: optimizeLegibility;
12 | -webkit-font-smoothing: antialiased;
13 | -moz-osx-font-smoothing: grayscale;
14 | }
15 |
16 | body {
17 | margin: 0;
18 | min-width: 320px;
19 | min-height: 100vh;
20 | }
21 |
22 | a {
23 | font-weight: 500;
24 | color: #646cff;
25 | text-decoration: inherit;
26 | }
27 | a:hover {
28 | color: #747bff;
29 | }
30 |
31 | @media (prefers-color-scheme: dark) {
32 | :root {
33 | color: #213547;
34 | background-color: #ffffff;
35 | }
36 | }
--------------------------------------------------------------------------------
/docs/src/main.tsx:
--------------------------------------------------------------------------------
1 | import React from 'react'
2 | import ReactDOM from 'react-dom/client'
3 | import App from './App.tsx'
4 | import './index.css'
5 |
6 | ReactDOM.createRoot(document.getElementById('root')!).render(
7 |   <React.StrictMode>
8 |     <App />
9 |   </React.StrictMode>,
10 | )
--------------------------------------------------------------------------------
/docs/tsconfig.app.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
4 | "target": "ES2020",
5 | "useDefineForClassFields": true,
6 | "lib": ["ES2020", "DOM", "DOM.Iterable"],
7 | "module": "ESNext",
8 | "skipLibCheck": true,
9 |
10 | /* Bundler mode */
11 | "moduleResolution": "bundler",
12 | "allowImportingTsExtensions": true,
13 | "isolatedModules": true,
14 | "moduleDetection": "force",
15 | "noEmit": true,
16 | "jsx": "react-jsx",
17 |
18 | /* Linting */
19 | "strict": true,
20 | "noUnusedLocals": true,
21 | "noUnusedParameters": true,
22 | "noFallthroughCasesInSwitch": true,
23 | "noUncheckedSideEffectImports": true
24 | },
25 | "include": ["src"]
26 | }
27 |
--------------------------------------------------------------------------------
/docs/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "files": [],
3 | "references": [
4 | { "path": "./tsconfig.app.json" },
5 | { "path": "./tsconfig.node.json" }
6 | ]
7 | }
8 |
--------------------------------------------------------------------------------
/docs/tsconfig.node.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo",
4 | "target": "ES2022",
5 | "lib": ["ES2023"],
6 | "module": "ESNext",
7 | "skipLibCheck": true,
8 |
9 | /* Bundler mode */
10 | "moduleResolution": "bundler",
11 | "allowImportingTsExtensions": true,
12 | "isolatedModules": true,
13 | "moduleDetection": "force",
14 | "noEmit": true,
15 |
16 | /* Linting */
17 | "strict": true,
18 | "noUnusedLocals": true,
19 | "noUnusedParameters": true,
20 | "noFallthroughCasesInSwitch": true,
21 | "noUncheckedSideEffectImports": true
22 | },
23 | "include": ["vite.config.ts"]
24 | }
25 |
--------------------------------------------------------------------------------
/docs/vite.config.ts:
--------------------------------------------------------------------------------
1 | import { defineConfig } from 'vite'
2 | import react from '@vitejs/plugin-react'
3 |
4 | // https://vite.dev/config/
5 | export default defineConfig({
6 | base: '/OpenRCA/',
7 | plugins: [react()],
8 | })
9 |
--------------------------------------------------------------------------------
/main/evaluate.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import re
4 | import argparse
5 |
6 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
7 | sys.path.insert(0, parent_dir)
8 |
9 | def evaluate(prediction:str, scoring_points:str):
10 | """
11 | Evaluate single JSON-like prediction with corresponding scoring points
12 | args:
13 | prediction: str, the prediction (JSON-like string)
14 | scoring_points: str, the scoring points string
15 | """
16 |
17 | import itertools
18 |
19 | predict_pattern = (
20 | r'{\s*'
21 | r'(?:"root cause occurrence datetime":\s*"(.*?)")?,?\s*'
22 | r'(?:"root cause component":\s*"(.*?)")?,?\s*'
23 | r'(?:"root cause reason":\s*"(.*?)")?\s*}'
24 | )
25 |
26 | predict_matches = re.findall(predict_pattern, prediction)
27 |
28 |
29 | predict_results = []
30 |
31 | for match in predict_matches:
32 | datetime_str, component, reason = match
33 | predict_results.append({
34 | "root cause occurrence datetime": datetime_str,
35 | "root cause component": component,
36 | "root cause reason": reason
37 | })
38 |
39 |
40 |
41 | prediction_length = len(predict_results)
42 |
43 | component_pattern = r"The (?:\d+-th|only) predicted root cause component is ([^\n]+)"
44 | reason_pattern = r"The (?:\d+-th|only) predicted root cause reason is ([^\n]+)"
45 | time_pattern = r"The (?:\d+-th|only) root cause occurrence time is within 1 minutes \(i.e., <=1min\) of ([^\n]+)"
46 |
47 | components = re.findall(component_pattern, scoring_points)
48 | reasons = re.findall(reason_pattern, scoring_points)
49 | times = re.findall(time_pattern, scoring_points)
50 |
51 |     scoringpoints_length = max(len(components), len(reasons), len(times))
52 |     scores_num = len(components) + len(reasons) + len(times)
53 |
54 | def time_difference(time1_str,time2_str):
55 | from datetime import datetime
56 | time_format = "%Y-%m-%d %H:%M:%S"
57 |
58 | try:
59 | time1 = datetime.strptime(time1_str, time_format)
60 | time2 = datetime.strptime(time2_str, time_format)
61 | except ValueError:
62 | return False
63 |
64 | time_difference = abs(time1 - time2)
65 | if time_difference.total_seconds() <= 60:
66 | return True
67 | else:
68 | return False
69 |
70 | scores_get = 0
71 | passing_criteria = []
72 | failing_criteria = []
73 |
74 |     if scoringpoints_length == prediction_length:
75 |         best_score = -1
76 |         for perm in itertools.permutations(predict_results):
77 |             current_score = 0
78 |             current_passing = []
79 |             for i in range(scoringpoints_length):
80 |                 if len(components) == scoringpoints_length:
81 |                     if perm[i]['root cause component'] == components[i]:
82 |                         current_score += 1
83 |                         current_passing.append(components[i])
84 |                 if len(reasons) == scoringpoints_length:
85 |                     if perm[i]['root cause reason'] == reasons[i]:
86 |                         current_score += 1
87 |                         current_passing.append(reasons[i])
88 |                 if len(times) == scoringpoints_length:
89 |                     if time_difference(times[i], perm[i]['root cause occurrence datetime']):
90 |                         current_score += 1
91 |                         current_passing.append(times[i])
92 |             if current_score > best_score:
93 |                 best_score = current_score
94 |                 passing_criteria = current_passing
95 |         scores_get = best_score
96 |
97 | failing_criteria = list(set(components+reasons+times)-set(passing_criteria))
98 |
99 |     final_score = scores_get / scores_num
100 |     bin_score = round(final_score, 2)
101 | return passing_criteria, failing_criteria, bin_score
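
# Example with hypothetical values: a single-failure prediction checked against its
# scoring points; evaluate() returns the matched criteria, the missed criteria, and a 0-1 score.
#
#   pred = ('{"1": {"root cause occurrence datetime": "2022-03-20 09:45:00", '
#           '"root cause component": "apache01", "root cause reason": "high CPU usage"}}')
#   sp = ("The only predicted root cause component is apache01\n"
#         "The only predicted root cause reason is high CPU usage\n"
#         "The only root cause occurrence time is within 1 minutes (i.e., <=1min) of 2022-03-20 09:45:00")
#   passed, failed, score = evaluate(pred, sp)  # score == 1.0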
102 |
103 |
104 | def file_evaluate(prediction_file:str, query_file:str, report_file:str):
105 | """
106 |     Evaluate a prediction file of a certain dataset with the corresponding query file and save the evaluation results to a csv file
107 | args:
108 | prediction_file: str, the path of the prediction file (csv, with at least one fields: 'prediction')
109 | query_file: str, the path of a specific dataset recorded labels (csv)
110 | report_file: str, the path of the evaluation file (csv)
111 | """
112 | import pandas as pd
113 |
114 | pred_df = pd.read_csv(prediction_file)
115 | query_df = pd.read_csv(query_file)
116 | eval_df = pd.DataFrame(columns=["query", "answer", "groundtruth", "passed", "failed", "score", "task_index"])
117 |
118 | if len(pred_df) != len(query_df):
119 |         raise ValueError("The prediction file and the query file must have the same number of rows")
120 |
121 | for idx in range(len(pred_df)):
122 | prediction = pred_df.loc[idx, "prediction"]
123 | scoring_points = query_df.loc[idx, "scoring_points"]
124 | passing_criteria, failing_criteria, score = evaluate(prediction, scoring_points)
125 | instruction = query_df.loc[idx, "instruction"]
126 | task_index = query_df.loc[idx, "task_index"]
127 | new_row = pd.DataFrame({
128 | "query": [instruction],
129 | "answer": [prediction],
130 | "groundtruth": [scoring_points],
131 | "passed": [passing_criteria],
132 | "failed": [failing_criteria],
133 | "score": [score],
134 | "task_index": [task_index]
135 | })
136 | eval_df = pd.concat([eval_df, new_row], ignore_index=True)
137 |
138 |
139 | if os.path.exists(report_file):
140 | eval_df.to_csv(report_file, mode='a', header=False, index=False)
141 | else:
142 | if not os.path.exists(os.path.dirname(report_file)):
143 | os.makedirs(os.path.dirname(report_file))
144 | eval_df.to_csv(report_file, index=False)
145 |
146 |
147 | def report(report_file):
148 | """
149 | Visualize the final result of a report after evaluation
150 | args:
151 | report_file: str, report after evaluation
152 | """
153 | import pandas as pd
154 |
155 | scores = {
156 | "easy": 0,
157 | "middle": 0,
158 | "hard": 0,
159 | }
160 | nums = {
161 | "easy": 0,
162 | "middle": 0,
163 | "hard": 0,
164 | }
165 |
166 | df = pd.read_csv(report_file)
167 | # By default, task_1-3 is easy, task_4-6 is middle, task_7 is hard. For DIY task specifications, you should change this line to modify the difficulty:
168 | df["difficulty"] = df["task_index"].apply(lambda x: "easy" if int(x.split('_')[1]) <= 3 else "middle" if int(x.split('_')[1]) <= 6 else "hard")
169 | scores['easy'] += len(df[(df["score"]==1.0) & (df["difficulty"]=="easy")])
170 | scores['middle'] += len(df[(df["score"]==1.0) & (df["difficulty"]=="middle")])
171 | scores['hard'] += len(df[(df["score"]==1.0) & (df["difficulty"]=="hard")])
172 | nums['easy'] += len(df[df["difficulty"]=="easy"])
173 | nums['middle'] += len(df[df["difficulty"]=="middle"])
174 | nums['hard'] += len(df[df["difficulty"]=="hard"])
175 |
176 | print(f"{'-'*12:<12}{'-'*12:<12}{'-'*12:<12}{'-'*12}")
177 | print(f"{'Class':<12}{'Total(#)':<12}{'Correct(#)':<12}{'Accuracy(%)':<12}")
178 | print(f"{'-'*12:<12}{'-'*12:<12}{'-'*12:<12}{'-'*12}")
179 | for key in scores.keys():
180 | accuracy = scores[key] / nums[key] if nums[key] > 0 else 0
181 | print(f"{key:<12}{nums[key]:<12}{scores[key]:<12}{accuracy:.2%}")
182 | print(f"{'-'*12:<12}{'-'*12:<12}{'-'*12:<12}{'-'*12}")
183 | total_accuracy = sum(scores.values()) / sum(nums.values()) if sum(nums.values()) > 0 else 0
184 | print(f"{'Total':<12}{sum(nums.values()):<12}{sum(scores.values()):<12}{total_accuracy:.2%}")
185 | print(f"{'-'*12:<12}{'-'*12:<12}{'-'*12:<12}{'-'*12}")
186 |
187 |
188 |
189 |
190 | if __name__ == '__main__':
191 | """
192 |     Evaluate a list of prediction files with corresponding query files, save the evaluation results, and display the statistics.
193 | args:
194 | p: list, a list of prediction files to evaluate
195 | q: list, a list of query files to evaluate
196 | r: str, report file to save
197 | """
198 | parser = argparse.ArgumentParser()
199 | parser.add_argument("-p", type=str, nargs='+', help="a list of prediction files to evaluate")
200 | parser.add_argument("-q", type=str, nargs='+', help="a list of query files to evaluate")
201 | parser.add_argument("-r", type=str, help="evaluation file to save")
202 | args = parser.parse_args()
203 |
204 | if len(args.p) != len(args.q):
205 |         raise ValueError("The number of prediction files and query files should be the same")
206 | if os.path.exists(args.r):
207 | os.remove(args.r)
208 |
209 | for i in range(len(args.p)):
210 | try:
211 | file_evaluate(args.p[i], args.q[i], args.r)
212 | except Exception as e:
213 | print(f"Error when evaluating the file {args.p[i]}: {e}")
214 | continue
215 |
216 | report(args.r)
--------------------------------------------------------------------------------
/main/generate.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from datetime import datetime, timedelta
3 | import random
4 | import json
5 | import sys
6 | import os
7 | import pytz
8 | import argparse
9 | from main.prompt import system, user
10 |
11 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
12 | sys.path.insert(0, parent_dir)
13 |
14 | from rca.api_router import get_chat_completion
15 |
16 | random.seed(42)
17 |
18 | def timestamp2timeperiod(timestamp, timezone) -> str:
19 | time = datetime.fromtimestamp(timestamp, timezone)
20 | minute = time.minute
21 | start_time = time.replace(minute=minute - (minute % 30), second=0, microsecond=0)
22 | end_time = start_time + timedelta(minutes=30)
23 | start_time_str = start_time.strftime('%Y-%m-%d %H:%M:%S')
24 | end_time_str = end_time.strftime('%Y-%m-%d %H:%M:%S')
25 | return f"{start_time_str} to {end_time_str}"
26 |
27 | def timestamp2datetime(timestamp, timezone) -> str:
28 | time = datetime.fromtimestamp(timestamp, timezone)
29 | utc_plus_8_time = time.strftime('%Y-%m-%d %H:%M:%S')
30 | return utc_plus_8_time
31 |
32 | def get_half_hour_conflict_failure_flag(meta_data) -> dict:
33 | sorted_time = sorted(meta_data['timestamp'])
34 | half_hour_conflict_failure_flag = {}
35 | previous_failure_timestamp = 0
36 | for i in range(len(sorted_time)):
37 | timestamp = sorted_time[i]
38 | current_failure_timestamp_left = timestamp // 1800
39 | if current_failure_timestamp_left > previous_failure_timestamp:
40 | previous_failure_timestamp = current_failure_timestamp_left
41 | half_hour_conflict_failure_flag[timestamp] = False
42 | else:
43 | half_hour_conflict_failure_flag[timestamp] = True
44 | half_hour_conflict_failure_flag[sorted_time[i - 1]] = True
45 | return half_hour_conflict_failure_flag
46 |
47 | def get_multi_response_dict(row, meta_data):
48 | num = 0
49 | multi_dict = {
50 | "datetime": [],
51 | "component": [],
52 | "reason": [],
53 | }
54 | cand_df = meta_data[meta_data['timestamp']//1800 == row['timestamp']//1800]
55 | for idx, cand in cand_df.iterrows():
56 | num += 1
57 | for key in multi_dict:
58 | multi_dict[key].append(cand[key])
59 |
60 | return num, multi_dict
61 |
62 | def query_generate(gt_path, spec_path, extra_spec, query_path, timezone):
63 |
64 | meta_data = pd.read_csv(gt_path)
65 | with open(spec_path, "r", encoding="utf8") as f:
66 | task_templates = json.load(f)
67 |
68 | half_hour_conflict_failure_flag = get_half_hour_conflict_failure_flag(meta_data)
69 |
70 | full_task_ID_list = list(task_templates.keys())
71 | df = pd.DataFrame(columns=["task_index", "instruction", "scoring_points"])
72 | for idx, row in meta_data.iterrows():
73 | print(f"processing: {idx}")
74 |
75 | timestamp = row['timestamp']
76 | reason = row['reason']
77 | component = row['component']
78 | datetime = timestamp2datetime(timestamp, timezone)
79 | time_period = timestamp2timeperiod(timestamp, timezone)
80 | task_index = random.choice(full_task_ID_list)
81 |
82 | if half_hour_conflict_failure_flag[timestamp]:
83 | num, ans = get_multi_response_dict(row, meta_data)
84 | scoring_points = ""
85 | for i in range(num):
86 | scoring_points_template = task_templates[task_index]['scoring_points'].copy()
87 |
88 | scoring_points_filled = [points.format(
89 | idx = f'{i+1}-th',
90 | datetime = ans['datetime'][i],
91 | reason = ans['reason'][i],
92 | component = ans['component'][i],
93 | ) for points in scoring_points_template]
94 | scoring_points += "\n".join(scoring_points_filled)
95 | scoring_points += "\n"
96 | print(f"The {idx}-th is a multi-response task, containing {num} root cause.")
97 | else:
98 | num = 1
99 | scoring_points = ""
100 | for point in task_templates[task_index]['scoring_points']:
101 | scoring_points += point.format(
102 | idx='only',
103 | time_period=time_period,
104 | datetime=datetime,
105 | component=component,
106 | reason=reason
107 | )
108 | scoring_points += "\n"
109 |
110 | input_specification = "```known\n"
111 | for spec in task_templates[task_index]['input']:
112 | input_specification += f"- "
113 | input_specification += spec.format(
114 | num=num,
115 | time_period=time_period
116 | )
117 | input_specification += "\n"
118 | if extra_spec:
119 | input_specification += f"- {extra_spec}\n"
120 | input_specification = input_specification.strip() + "\n```"
121 |
122 | output_specification = "```query\n"
123 | for spec in task_templates[task_index]['output']:
124 | output_specification += f"- "
125 | output_specification += spec.format(
126 | time_period="**UNKNOWN**",
127 | datetime="**UNKNOWN**",
128 | component="**UNKNOWN**",
129 | reason="**UNKNOWN**",
130 | )
131 | output_specification += "\n"
132 | output_specification = output_specification.strip() + "\n```"
133 |
134 | prompt = [
135 | {'role': 'system', 'content': system},
136 | {'role': 'user', 'content': user.format(input_specification=input_specification,
137 | output_specification=output_specification)},
138 | ]
139 |
140 | print(scoring_points)
141 |
142 |         for i in range(3):
143 |             try:
144 |                 instruction = get_chat_completion(
145 |                     messages=prompt,
146 |                     temperature=1.0
147 |                 )
148 |                 instruction = json.loads(instruction)['issue']
149 |                 break
150 |             except Exception as e:
151 |                 print(e)
152 |                 continue
153 |
154 |
155 | new_df = pd.DataFrame([{"task_index": task_index,
156 | "instruction": instruction,
157 | "scoring_points": scoring_points}])
158 | df = pd.concat([df, new_df],
159 | ignore_index=True)
160 |
161 | df.to_csv(query_path, index=False)
162 | print(f"genereated: {task_index}")
163 |
164 | template = """\t"{idx}": {{
165 | "root cause occurrence datetime": {datetime},
166 | "root cause component": {component},
167 | "root cause reason": {reason},
168 | }},\n"""
169 |
170 | key_field = ["root cause occurrence datetime", "root cause component", "root cause reason"]
171 |
172 |
173 | if __name__ == '__main__':
174 | """
175 | Generate the query based on the task specification and save it to the corresponding file
176 | args:
177 | d: bool, whether to use default setting or not
178 | s: str, the path of the task specification config
179 | r: list, a list of record files to generate query
180 | q: list, a list of query files to save
181 | e: list, a list of extra spec you want to add in your query (in addition to the spec in json config). If you don't want to add extra spec, just leave it None.
182 | t: str, timezone of the location where telemetry is collected
183 | """
184 |
185 | parser = argparse.ArgumentParser()
186 | parser.add_argument("-d", type=bool, default=False, help="default setting or not")
187 | parser.add_argument("-s", type=str, help="the path of the task specification config")
188 | parser.add_argument("-r", type=str, nargs='+', help="a list of record files to generate query")
189 | parser.add_argument("-q", type=str, nargs='+', help="a list of query files to save")
190 | parser.add_argument("-e", type=str, nargs='+', help="a list of extra spec you want to add in your query")
191 | parser.add_argument("-t", type=str, help="timezone of the location where telemetry is collected")
192 | args = parser.parse_args()
193 |
194 | if args.d:
195 | spec_path = 'main/task_specification.json'
196 | record_path_list = [
197 | 'dataset/Market/cloudbed-1/record.csv',
198 | 'dataset/Market/cloudbed-2/record.csv',
199 | 'dataset/Bank/record.csv',
200 | 'dataset/Telecom/record.csv',
201 | ]
202 | extra_spec_list = [
203 | "system: cloudbed-1",
204 | "system: cloudbed-2",
205 | None,
206 | None,
207 | ]
208 | query_path_list = [
209 | 'dataset/Market/cloudbed-1/query.csv',
210 | 'dataset/Market/cloudbed-2/query.csv',
211 | 'dataset/Bank/query.csv',
212 | 'dataset/Telecom/query.csv',
213 | ]
214 | timezone = pytz.timezone('Asia/Shanghai')
215 |
216 | else:
217 | spec_path = args.s
218 | record_path_list = args.r
219 | extra_spec_list = args.e if args.e else [None] * len(args.r)
220 | query_path_list = args.q
221 | timezone = pytz.timezone(args.t)
222 |
223 | data_list = list(zip(record_path_list, extra_spec_list, query_path_list))
224 |
225 | for record_path, extra_spec, query_path in data_list:
226 | print("processing: ", record_path)
227 | query_generate(record_path, spec_path, extra_spec, query_path, timezone)
--------------------------------------------------------------------------------
/main/prompt.py:
--------------------------------------------------------------------------------
1 | system = """Let's play a game. In this game, your task is to generate a issue related to DevOps failure diagnosis based on a given set of specifications. The goal is to make the issue realistic enough that even top human experts might believe it reflects a genuine issue an engineer could encounter at work. They should not be able to tell that the issue was generated by an AI Assistant based on specifications.
2 |
3 | The specifications provided to you include the following components:
4 |
5 | ```known
6 | (The known information explicitly provided in the issue.)
7 | ```
8 |
9 | ```query
10 | (The target query that the user is required to answer.)
11 | ```
12 |
13 | Your response should follow the JSON format below:
14 |
15 | {
16 | "issue": (Your generated issue based on the specifications.)
17 | }
18 | (DO NOT include the "```json" and "```" tags. Include ONLY the JSON object with the brackets "{}".)
19 |
20 | For example, if the following specifications are given:
21 |
22 | ```known
23 | - number of failures: 1
24 | - time range: 2022-03-21 11:30:00 to 2022-03-21 12:00:00
25 | - system: None
26 | ```
27 |
28 | ```query
29 | - root cause occurrence time: **UNKNOWN**
30 | ```
31 |
32 | Then, you could generate an issue like:
33 |
34 | {
35 | "issue": "During the specified time range of March 21, 2022, from 11:30 to 12:00, the cloud service system experience a failure. The exact time of the root cause occurrence is unknown, which complicates the diagnosis process. Please pinpoint the root cause occurrence datetime."
36 | }
37 |
38 | There is another example:
39 |
40 | ```known
41 | - number of failures: 2
42 | - time range: 2022-03-20 09:30:00 to 2022-03-20 10:00:00
43 | - system: cloudbed-1
44 | ```
45 |
46 | ```query
47 | - root cause occurrence time: **UNKNOWN**
48 | - root cause component: **UNKNOWN**
49 | - root cause reason: **UNKNOWN**
50 | ```
51 |
52 | The generated issue could look like:
53 |
54 | {
55 | "issue": "The cloud service system, cloudbed-1, may have experienced two failures within the time range of March 20, 2022, from 09:30 to 10:00. The exact number of failures, the time of occurrence, the affected components, and the underlying reasons for these failures are currently unknown. You are tasked with identifying the root cause occurrence datetime, the root cause component, and the root cause reason."
56 | }
57 |
58 | Some rules to follow:
59 |
60 | 1. Do not tell the user "how to solve the issue" (e.g., retrieve the telemetry data like metrics/logs/traces).
61 | 2. Do not involve human interaction in the issue (e.g., "ask the engineer for more information").
62 | 3. Do not include any specific values that are not mentioned in the specification (e.g., "the CPU usage was 80%").
63 |
64 | Now, let's get started!"""
65 |
66 | user = """Please generate a issue related to DevOps failure diagnosis based on the following specifications:
67 |
68 | ```known
69 | {input_specification}
70 | ```
71 |
72 | ```query
73 | {output_specification}
74 | ```"""
--------------------------------------------------------------------------------
/main/task_specification.json:
--------------------------------------------------------------------------------
1 | {
2 | "task_1": {
3 | "input": [
4 | "time range: {time_period}",
5 | "number of failures: {num}"
6 | ],
7 | "output": [
8 | "root cause occurrence time: {datetime}"
9 | ],
10 | "scoring_points": [
11 | "The {idx} root cause occurrence time is within 1 minutes (i.e., <=1min) of {datetime}"
12 | ]
13 | },
14 | "task_2": {
15 | "input": [
16 | "time range: {time_period}",
17 | "number of failures: {num}"
18 | ],
19 | "output": [
20 | "root cause reason: {reason}"
21 | ],
22 | "scoring_points": [
23 | "The {idx} predicted root cause reason is {reason}"
24 | ]
25 | },
26 | "task_3": {
27 | "input": [
28 | "time range: {time_period}",
29 | "number of failures: {num}"
30 | ],
31 | "output": [
32 | "root cause component: {component}"
33 | ],
34 | "scoring_points": [
35 | "The {idx} predicted root cause component is {component}"
36 | ]
37 | },
38 | "task_4": {
39 | "input": [
40 | "time range: {time_period}",
41 | "number of failures: {num}"
42 | ],
43 | "output": [
44 | "root cause occurrence time: {datetime}",
45 | "root cause reason: {reason}"
46 | ],
47 | "scoring_points": [
48 | "The {idx} root cause occurrence time is within 1 minutes (i.e., <=1min) of {datetime}",
49 | "The {idx} predicted root cause reason is {reason}"
50 | ]
51 | },
52 | "task_5": {
53 | "input": [
54 | "time range: {time_period}",
55 | "number of failures: {num}"
56 | ],
57 | "output": [
58 | "root cause occurrence time: {datetime}",
59 | "root cause component: {component}"
60 | ],
61 | "scoring_points": [
62 | "The {idx} root cause occurrence time is within 1 minutes (i.e., <=1min) of {datetime}",
63 | "The {idx} predicted root cause component is {component}"
64 | ]
65 | },
66 | "task_6": {
67 | "input": [
68 | "time range: {time_period}",
69 | "number of failures: {num}"
70 | ],
71 | "output": [
72 | "root cause component: {component}",
73 | "root cause reason: {reason}"
74 | ],
75 | "scoring_points": [
76 | "The {idx} predicted root cause component is {component}",
77 | "The {idx} predicted root cause reason is {reason}"
78 | ]
79 | },
80 | "task_7": {
81 | "input": [
82 | "time range: {time_period}",
83 | "number of failures: {num}"
84 | ],
85 | "output": [
86 | "root cause component: {component}",
87 | "root cause occurrence time: {datetime}",
88 | "root cause reason: {reason}"
89 | ],
90 | "scoring_points": [
91 | "The {idx} root cause occurrence time is within 1 minutes (i.e., <=1min) of {datetime}",
92 | "The {idx} predicted root cause component is {component}",
93 | "The {idx} predicted root cause reason is {reason}"
94 | ]
95 | }
96 | }
--------------------------------------------------------------------------------
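A minimal sketch of instantiating one of these templates into a concrete issue specification; it assumes only the repository layout above, and the time range, datetime, and reason values below are invented for illustration:

```python
import json

with open("main/task_specification.json") as f:
    spec = json.load(f)["task_4"]

values = {
    "time_period": "March 20, 2022, from 09:30 to 10:00",
    "num": 1,
    "idx": "1st",
    "datetime": "2022-03-20 09:42:00",
    "reason": "CPU fault",
}

# fill the placeholders in the input/output/scoring templates
input_spec = [s.format(**values) for s in spec["input"]]
output_spec = [s.format(**values) for s in spec["output"]]
scoring_points = [s.format(**values) for s in spec["scoring_points"]]
print(input_spec, output_spec, scoring_points, sep="\n")
```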
/rca/api_config.yaml:
--------------------------------------------------------------------------------
1 | SOURCE: "OpenAI"
2 | MODEL: "gpt-4o-2024-05-13"
3 | API_KEY: "sk-xxxxxxxxxxxxxx"
4 | API_BASE: ""
--------------------------------------------------------------------------------
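Since the committed file ships a placeholder key, a fail-fast check like the following can help; this helper and the `OPENAI_API_KEY` fallback are assumptions for illustration, not part of the repo:

```python
import os
import yaml

with open("rca/api_config.yaml") as f:
    cfg = yaml.safe_load(f)

# treat the shipped "sk-xxxxxxxxxxxxxx" placeholder as unset
if cfg.get("API_KEY", "").startswith("sk-xxx"):
    cfg["API_KEY"] = os.environ.get("OPENAI_API_KEY", "")
assert cfg["API_KEY"], "Set API_KEY in rca/api_config.yaml or export OPENAI_API_KEY."
```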
/rca/api_router.py:
--------------------------------------------------------------------------------
1 | import os
2 | import yaml
3 | import time
4 |
5 | def load_config(config_path="rca/api_config.yaml"):
6 | configs = dict(os.environ)
7 | with open(config_path, "r") as file:
8 | yaml_data = yaml.safe_load(file)
9 | configs.update(yaml_data)
10 | return configs
11 |
12 | configs = load_config()
13 |
14 | def OpenAI_chat_completion(messages, temperature):
15 | from openai import OpenAI
16 | client = OpenAI(
17 | api_key=configs["API_KEY"]
18 | )
19 | return client.chat.completions.create(
20 | model = configs["MODEL"],
21 | messages = messages,
22 | temperature = temperature,
23 | ).choices[0].message.content
24 |
25 | def Google_chat_completion(messages, temperature):
26 | import google.generativeai as genai
27 | genai.configure(
28 | api_key=configs["API_KEY"]
29 | )
30 | generation_config = genai.GenerationConfig(temperature=temperature)
31 | system_instruction = messages[0]["content"] if messages[0]["role"] == "system" else None
32 | messages = [item for item in messages if item["role"] != "system"]
33 | messages = [{"role": "model" if item["role"] == "assistant" else item["role"], "parts": item["content"]} for item in messages]
34 | history = messages[:-1]
35 | message = messages[-1]
36 | return genai.GenerativeModel(
37 | model_name=configs["MODEL"],
38 | system_instruction=system_instruction, generation_config=generation_config
39 | ).start_chat(
40 | history=history if history != [] else None
41 | ).send_message(message).text
42 |
43 | def Anthropic_chat_completion(messages, temperature):
44 | import anthropic
45 | client = anthropic.Anthropic(api_key=configs["API_KEY"])
46 | # the Anthropic Messages API takes the system prompt separately and requires max_tokens
47 | system = messages[0]["content"] if messages[0]["role"] == "system" else ""
48 | return client.messages.create(
49 | model=configs["MODEL"], system=system,
50 | messages=[m for m in messages if m["role"] != "system"],
51 | temperature=temperature, max_tokens=4096,
52 | ).content[0].text
53 |
54 | # for third-party APIs compatible with the OpenAI API (configured via a different 'API_BASE')
55 | def AI_chat_completion(messages, temperature):
56 | from openai import OpenAI
57 | client = OpenAI(
58 | api_key=configs["API_KEY"],
59 | base_url=configs["API_BASE"]
60 | )
61 | return client.chat.completions.create(
62 | model = configs["MODEL"],
63 | messages = messages,
64 | temperature = temperature,
65 | ).choices[0].message.content
66 |
67 | def get_chat_completion(messages, temperature=0.0):
68 |
69 | def send_request():
70 | if configs["SOURCE"] == "AI":
71 | return AI_chat_completion(messages, temperature)
72 | elif configs["SOURCE"] == "OpenAI":
73 | return OpenAI_chat_completion(messages, temperature)
74 | elif configs["SOURCE"] == "Google":
75 | return Google_chat_completion(messages, temperature)
76 | elif configs["SOURCE"] == "Anthropic":
77 | return Anthropic_chat_completion(messages, temperature)
78 | else:
79 | raise ValueError("Invalid SOURCE in api_config file.")
80 |
81 | for i in range(3):
82 | try:
83 | return send_request()
84 | except Exception as e:
85 | print(e)
86 | if '429' in str(e):
87 | print("Rate limit exceeded. Waiting for 1 second.")
88 | time.sleep(1)
89 | continue
90 | else:
91 | raise e
92 | raise RuntimeError("Chat completion failed after 3 rate-limited attempts.")
--------------------------------------------------------------------------------
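A minimal usage sketch for the router above; which provider actually serves the call is decided entirely by `SOURCE` in `rca/api_config.yaml`:

```python
from rca.api_router import get_chat_completion

messages = [
    {"role": "system", "content": "You are a DevOps assistant."},
    {"role": "user", "content": "Reply with the single word: pong"},
]
# temperature defaults to 0.0; retries up to 3 times on rate limits
print(get_chat_completion(messages))
```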
/rca/archive/agent-Telecom.csv:
--------------------------------------------------------------------------------
1 | instruction,prediction,groundtruth,passed,failed,score,row_id,task_index
2 | "During the specified time range of April 11, 2020, from 00:00 to 00:30, there was one failure reported. The exact reason for this failure is currently unknown. Your task is to identify the root cause reason for the failure during this period.","{
3 | ""1"": {
4 | ""root cause occurrence datetime"": ""2020-04-11 00:01:22"",
5 | ""root cause component"": ""os_015"",
6 | ""root cause reason"": ""CPU fault""
7 | }
8 | }","level: pod
9 | reason: CPU fault
10 | component: docker_003
11 | timestamp: 1586534700
12 | datetime: 2020-04-11 00:05:00",CPU fault,,1.0,0.0,task_2
13 | "Within the time range of April 11, 2020, from 01:30 to 02:00, there was one failure in the system. The exact time of the root cause occurrence and the reason behind this failure are currently unknown. You are required to identify the root cause occurrence datetime and the root cause reason.","{
14 | ""1"": {
15 | ""root cause occurrence datetime"": ""2020-04-11 01:44:00"",
16 | ""root cause component"": ""os_021"",
17 | ""root cause reason"": ""network delay""
18 | }
19 | }","level: pod
20 | reason: CPU fault
21 | component: docker_004
22 | timestamp: 1586540400
23 | datetime: 2020-04-11 01:40:00",,"2020-04-11 01:40:00
24 | CPU fault",0.0,1.0,task_4
25 | "On April 11, 2020, between 02:00 and 02:30, a failure occurred in the system. The number of failures recorded within this time range is 1. The reason for the root cause of this failure is currently unknown and needs to be determined.","{
26 | ""1"": {
27 | ""root cause occurrence datetime"": ""2020-04-11 02:00:00"",
28 | ""root cause component"": ""db_007"",
29 | ""root cause reason"": ""db connection limit""
30 | }
31 | }","level: service
32 | reason: db connection limit
33 | component: db_007
34 | timestamp: 1586542500
35 | datetime: 2020-04-11 02:15:00",db connection limit,,1.0,2.0,task_2
36 | "A single failure was detected during the time range of April 11, 2020, from 02:30 to 03:00. The specific reason for this failure is currently unknown. Your task is to determine the root cause reason.","{
37 | ""1"": {
38 | ""root cause component"": ""os_014"",
39 | ""root cause reason"": ""CPU fault""
40 | }
41 | }","level: pod
42 | reason: CPU fault
43 | component: docker_002
44 | timestamp: 1586544600
45 | datetime: 2020-04-11 02:50:00",CPU fault,,1.0,3.0,task_2
46 | "On April 11, 2020, between 04:30 and 05:00, there was a reported failure in the system. The exact time of the root cause occurrence, the affected component, and the underlying reason for this failure are currently unknown. You are tasked with identifying the root cause occurrence datetime, the root cause component, and the root cause reason.","{
47 | ""1"": {
48 | ""root cause occurrence datetime"": ""2020-04-11 04:29:00"",
49 | ""root cause component"": ""db_004"",
50 | ""root cause reason"": ""db connection limit""
51 | }
52 | }","level: pod
53 | reason: CPU fault
54 | component: docker_008
55 | timestamp: 1586551200
56 | datetime: 2020-04-11 04:40:00",,"docker_008
57 | 2020-04-11 04:40:00
58 | CPU fault",0.0,4.0,task_7
59 | "Within the specified time range of April 11, 2020, from 05:30 to 06:00, a failure was detected in the system. The precise time at which the root cause of the failure occurred is currently unknown. Please determine the exact root cause occurrence datetime to assist in diagnosing the issue.","{
60 | ""1"": {
61 | ""root cause occurrence datetime"": ""2020-04-11 05:45:58"",
62 | ""root cause component"": ""db_003"",
63 | ""root cause reason"": ""db connection limit""
64 | }
65 | }","level: service
66 | reason: db close
67 | component: db_003
68 | timestamp: 1586555100
69 | datetime: 2020-04-11 05:45:00",2020-04-11 05:45:00,,1.0,5.0,task_1
70 | "There was a single failure in the system within the time range of May 22, 2020, from 00:00 to 00:30. The precise time when the root cause of the failure occurred is unknown, and the reason behind the failure also remains unidentified. You are tasked with determining the exact root cause occurrence time and the underlying reason for the failure.","{
71 | ""1"": {
72 | ""root cause occurrence datetime"": ""2020-05-22 00:11:38"",
73 | ""root cause component"": ""db_011"",
74 | ""root cause reason"": ""db connection limit""
75 | }
76 | }","level: service
77 | reason: db close
78 | component: db_003
79 | timestamp: 1590077220
80 | datetime: 2020-05-22 00:07:00",,"2020-05-22 00:07:00
81 | db close",0.0,6.0,task_4
82 | "During the specified time range of May 22, 2020, from 00:30 to 01:00, the system experienced a single failure. The exact time of occurrence, affected component, and the underlying reason for this failure are currently unknown. Please pinpoint the root cause occurrence datetime, the root cause component, and the root cause reason.","{
83 | ""1"": {
84 | ""root cause occurrence datetime"": ""2020-05-22 00:59:00"",
85 | ""root cause component"": ""os_022"",
86 | ""root cause reason"": ""CPU fault""
87 | }
88 | }","level: pod
89 | reason: CPU fault
90 | component: docker_001
91 | timestamp: 1590079680
92 | datetime: 2020-05-22 00:48:00",CPU fault,"2020-05-22 00:48:00
93 | docker_001",0.3333333333333333,7.0,task_7
94 | "During the time range of May 22, 2020, from 01:30 to 02:00, there was one failure reported. The exact time of the root cause occurrence and the affected component are currently unknown. Please identify the root cause occurrence datetime and the root cause component.","{
95 | ""1"": {
96 | ""root cause occurrence datetime"": ""2020-05-22 01:32:00"",
97 | ""root cause component"": ""db_003"",
98 | ""root cause reason"": ""db connection limit""
99 | }
100 | }","level: node
101 | reason: network delay
102 | component: os_018
103 | timestamp: 1590083280
104 | datetime: 2020-05-22 01:48:00",,"2020-05-22 01:48:00
105 | os_018",0.0,8.0,task_5
106 | "On May 22, 2020, between 02:00 and 02:30, the system encountered a failure. The exact time of the root cause occurrence is currently unknown. Please determine the root cause occurrence datetime.","{
107 | ""1"": {
108 | ""root cause occurrence datetime"": ""2020-05-22 02:22:31"",
109 | ""root cause component"": ""docker_003"",
110 | ""root cause reason"": ""CPU fault""
111 | }
112 | }","level: pod
113 | reason: CPU fault
114 | component: docker_005
115 | timestamp: 1590085080
116 | datetime: 2020-05-22 02:18:00",,2020-05-22 02:18:00,0.0,9.0,task_1
117 | "During the specified time range of May 23, 2020, from 00:00 to 00:30, the system experienced a failure. The exact time of the root cause occurrence is unknown, complicating the diagnosis process. Please pinpoint the root cause occurrence datetime.","{
118 | ""1"": {
119 | ""root cause occurrence datetime"": ""2020-05-23 00:05:00"",
120 | ""root cause component"": ""os_021"",
121 | ""root cause reason"": ""CPU fault""
122 | }
123 | }","level: pod
124 | reason: CPU fault
125 | component: docker_004
126 | timestamp: 1590163500
127 | datetime: 2020-05-23 00:05:00",2020-05-23 00:05:00,,1.0,10.0,task_1
128 | "During the specified time range of May 23, 2020, from 00:30 to 01:00, the system experienced one failure. The exact component affected and the reason for this failure are unknown. Please identify the root cause component and the root cause reason.","{
129 | ""1"": {
130 | ""root cause component"": ""db_007"",
131 | ""root cause reason"": ""db connection limit""
132 | }
133 | }","level: service
134 | reason: db connection limit
135 | component: db_003
136 | timestamp: 1590165600
137 | datetime: 2020-05-23 00:40:00",db connection limit,db_003,0.5,11.0,task_6
138 | "There was one failure reported within the specified time range of May 23, 2020, from 01:00 to 01:30. The component responsible for the failure, the exact time of the root cause occurrence, and the underlying reason for this failure are currently unknown. Please identify the root cause component, the root cause occurrence datetime, and the root cause reason.","{
139 | ""1"": {
140 | ""root cause occurrence datetime"": ""2020-05-23 00:54:13"",
141 | ""root cause component"": ""os_021"",
142 | ""root cause reason"": ""network delay""
143 | }
144 | }","level: node
145 | reason: network loss
146 | component: os_021
147 | timestamp: 1590167760
148 | datetime: 2020-05-23 01:16:00",os_021,"2020-05-23 01:16:00
149 | network loss",0.3333333333333333,12.0,task_7
150 | "Within the specified time range of May 23, 2020, from 01:30 to 02:00, there was one failure. The exact time of the root cause occurrence is unknown, creating a challenge for failure diagnosis. Please identify the root cause occurrence datetime.","{
151 | ""1"": {
152 | ""root cause occurrence datetime"": ""2020-05-23 01:45:25"",
153 | ""root cause component"": ""db_002"",
154 | ""root cause reason"": ""db connection limit""
155 | }
156 | }","level: node
157 | reason: network loss
158 | component: os_009
159 | timestamp: 1590169920
160 | datetime: 2020-05-23 01:52:00",,2020-05-23 01:52:00,0.0,13.0,task_1
161 | "On May 23, 2020, between 03:30 and 04:00, there was a single failure observed in the system. The exact time when the root cause occurred is currently unknown. Please investigate and determine the root cause occurrence datetime.","{
162 | ""1"": {
163 | ""root cause occurrence datetime"": ""2020-05-23 03:33:26"",
164 | ""root cause component"": ""db_011"",
165 | ""root cause reason"": ""db connection limit""
166 | }
167 | }","level: node
168 | reason: network delay
169 | component: os_021
170 | timestamp: 1590176160
171 | datetime: 2020-05-23 03:36:00",,2020-05-23 03:36:00,0.0,14.0,task_1
172 | "Within the time range of May 23, 2020, from 04:30 to 05:00, there was one failure reported. The component responsible for this failure, the exact time of occurrence, and the underlying reason are currently unknown. You are tasked with identifying the root cause occurrence datetime, the root cause component, and the root cause reason.","{
173 | ""1"": {
174 | ""root cause occurrence datetime"": ""2020-05-23 04:44:23"",
175 | ""root cause component"": ""os_021"",
176 | ""root cause reason"": ""network delay""
177 | }
178 | }","level: pod
179 | reason: CPU fault
180 | component: docker_006
181 | timestamp: 1590180420
182 | datetime: 2020-05-23 04:47:00",,"2020-05-23 04:47:00
183 | docker_006
184 | CPU fault",0.0,15.0,task_7
185 | "During the specified time range of May 24, 2020, from 01:30 to 02:00, the system experienced a failure. The exact time of the root cause occurrence, the affected component, and the underlying reason for this failure are currently unknown. You are tasked with identifying the root cause occurrence datetime, the root cause component, and the root cause reason.","{
186 | ""1"": {
187 | ""root cause occurrence datetime"": ""2020-05-24 01:30:00"",
188 | ""root cause component"": ""os_022"",
189 | ""root cause reason"": ""CPU fault""
190 | }
191 | }","level: service
192 | reason: db close
193 | component: db_007
194 | timestamp: 1590256020
195 | datetime: 2020-05-24 01:47:00",,"db_007
196 | 2020-05-24 01:47:00
197 | db close",0.0,16.0,task_7
198 | "A failure occurred during the time range of May 24, 2020, from 03:00 to 03:30. The specific reason for this failure is currently unknown, and further diagnosis is required. Please identify the root cause reason.","{
199 | ""1"": {
200 | ""root cause component"": ""os_012"",
201 | ""root cause reason"": ""CPU fault""
202 | }
203 | }","level: pod
204 | reason: CPU fault
205 | component: docker_003
206 | timestamp: 1590261420
207 | datetime: 2020-05-24 03:17:00",CPU fault,,1.0,17.0,task_2
208 | "Within the specified time range of May 24, 2020, from 04:00 to 04:30, there was a single failure detected in the system. The specific reason behind this failure is currently unknown. Your task is to determine the root cause reason for this failure.","{
209 | ""1"": {
210 | ""root cause component"": ""db_013"",
211 | ""root cause reason"": ""CPU fault""
212 | }
213 | }","level: pod
214 | reason: CPU fault
215 | component: docker_006
216 | timestamp: 1590265020
217 | datetime: 2020-05-24 04:17:00",CPU fault,,1.0,18.0,task_2
218 | "On May 25, 2020, between 01:30 and 02:00, a failure occurred within the system. The exact time at which the root cause occurred and the reason for this failure are currently unknown. Please identify the root cause occurrence datetime and the root cause reason.","{
219 | ""1"": {
220 | ""root cause occurrence datetime"": ""2020-05-25 01:47:00"",
221 | ""root cause reason"": ""network delay""
222 | }
223 | }","level: node
224 | reason: network delay
225 | component: os_020
226 | timestamp: 1590342420
227 | datetime: 2020-05-25 01:47:00","network delay
228 | 2020-05-25 01:47:00",,1.0,19.0,task_4
229 | "There was a failure that occurred within the specified time range of May 25, 2020, from 03:30 to 04:00. The exact reason for this failure is unknown, complicating the failure diagnosis process. You are tasked with identifying the root cause reason.","{
230 | ""1"": {
231 | ""root cause occurrence datetime"": ""2020-05-25 03:23:57"",
232 | ""root cause component"": ""os_009"",
233 | ""root cause reason"": ""CPU fault""
234 | }
235 | }","level: node
236 | reason: network loss
237 | component: os_017
238 | timestamp: 1590349620
239 | datetime: 2020-05-25 03:47:00",,network loss,0.0,20.0,task_2
240 | "During the specified time range of May 25, 2020, from 04:30 to 05:00, one failure was detected in the system. However, the exact component involved, the time of occurrence, and the underlying reason for this failure are currently unknown. Please identify the root cause component, the root cause occurrence time, and the root cause reason.","{
241 | ""1"": {
242 | ""root cause occurrence datetime"": ""2020-05-25 04:44:05"",
243 | ""root cause component"": ""db_008"",
244 | ""root cause reason"": ""db connection limit""
245 | }
246 | }","level: service
247 | reason: db close
248 | component: db_003
249 | timestamp: 1590353220
250 | datetime: 2020-05-25 04:47:00",,"2020-05-25 04:47:00
251 | db_003
252 | db close",0.0,21.0,task_7
253 | "During the time range of May 26, 2020, from 02:00 to 02:30, the system experienced one failure. The exact time when the root cause occurred and the reason for this failure are currently unknown. Please identify the root cause occurrence datetime and the root cause reason.","{
254 | ""1"": {
255 | ""root cause occurrence datetime"": ""2020-05-26 01:54:25"",
256 | ""root cause component"": ""db_011"",
257 | ""root cause reason"": ""db connection limit""
258 | }
259 | }","level: service
260 | reason: db connection limit
261 | component: db_007
262 | timestamp: 1590430140
263 | datetime: 2020-05-26 02:09:00",db connection limit,2020-05-26 02:09:00,0.5,22.0,task_4
264 | "During the specified time range of May 26, 2020, from 02:30 to 03:00, there was one failure observed in the system. The exact time of the root cause occurrence is unknown. Please determine the root cause occurrence datetime.","{
265 | ""1"": {
266 | ""root cause occurrence datetime"": ""2020-05-26 02:20:01"",
267 | ""root cause component"": ""os_021"",
268 | ""root cause reason"": ""network delay""
269 | }
270 | }","level: pod
271 | reason: CPU fault
272 | component: docker_008
273 | timestamp: 1590431940
274 | datetime: 2020-05-26 02:39:00",,2020-05-26 02:39:00,0.0,23.0,task_1
275 | "There was a single system failure during the specified time range of May 26, 2020, from 03:00 to 03:30. The root cause of this failure remains unknown. Identifying the underlying reason for this failure is necessary to proceed with troubleshooting.","{
276 | ""1"": {
277 | ""root cause occurrence datetime"": ""2020-05-26 03:10:00"",
278 | ""root cause component"": ""os_021"",
279 | ""root cause reason"": ""network loss""
280 | }
281 | }","level: node
282 | reason: network delay
283 | component: os_021
284 | timestamp: 1590433740
285 | datetime: 2020-05-26 03:09:00",,network delay,0.0,24.0,task_2
286 | "During the time range of May 26, 2020, from 03:30 to 04:00, the system experienced a failure. The exact occurrence time and reason behind the root cause of this failure are unknown, which hinders the troubleshooting process. Your task is to identify the root cause occurrence datetime and the reason behind the failure.","{
287 | ""1"": {
288 | ""root cause occurrence datetime"": ""2020-05-26 03:40:32"",
289 | ""root cause component"": ""os_018"",
290 | ""root cause reason"": ""network delay""
291 | }
292 | }","level: node
293 | reason: network loss
294 | component: os_018
295 | timestamp: 1590435540
296 | datetime: 2020-05-26 03:39:00",,"network loss
297 | 2020-05-26 03:39:00",0.0,25.0,task_4
298 | "During the specified time range of May 26, 2020, from 04:00 to 04:30, the system experienced a failure. The root cause component responsible for this failure is currently unknown. Please identify the root cause component.","{
299 | ""1"": {
300 | ""root cause component"": ""os_020"",
301 | ""root cause reason"": ""network delay""
302 | }
303 | }","level: node
304 | reason: network delay
305 | component: os_020
306 | timestamp: 1590437700
307 | datetime: 2020-05-26 04:15:00",os_020,,1.0,26.0,task_3
308 | "During the specified time range of May 26, 2020, from 04:30 to 05:00, a single failure occurred. The exact time of the root cause occurrence, the affected component, and the underlying reason for this failure are currently unknown. You are tasked with identifying the root cause occurrence datetime, the root cause component, and the root cause reason.","{
309 | ""1"": {
310 | ""root cause occurrence datetime"": ""2020-05-26 04:45:00"",
311 | ""root cause component"": ""os_021"",
312 | ""root cause reason"": ""CPU fault""
313 | }
314 | }","level: node
315 | reason: network delay
316 | component: os_001
317 | timestamp: 1590439500
318 | datetime: 2020-05-26 04:45:00",2020-05-26 04:45:00,"network delay
319 | os_001",0.3333333333333333,27.0,task_7
320 | "Within the specified time range of May 26, 2020, from 05:00 to 05:30, there was a single failure detected in the system. However, the details about the root cause component, the exact time of the root cause occurrence, and the underlying reason for the failure are currently unknown. You need to identify the root cause component, the occurrence time, and the reason for this failure.","{
321 | ""1"": {
322 | ""root cause occurrence datetime"": ""2020-05-26 05:00:00"",
323 | ""root cause component"": ""db_002"",
324 | ""root cause reason"": ""db connection limit""
325 | }
326 | }","level: pod
327 | reason: CPU fault
328 | component: docker_002
329 | timestamp: 1590441300
330 | datetime: 2020-05-26 05:15:00",,"docker_002
331 | CPU fault
332 | 2020-05-26 05:15:00",0.0,28.0,task_7
333 | "A single failure was detected within the given time range of May 27, 2020, from 00:00 to 00:30. The specific time of the root cause occurrence and the underlying reason for this failure are currently unknown. Your task is to determine the root cause occurrence datetime and identify the root cause reason.","{
334 | ""1"": {
335 | ""root cause occurrence datetime"": ""2020-05-27 00:00:00"",
336 | ""root cause component"": ""os_012"",
337 | ""root cause reason"": ""network loss""
338 | }
339 | }","level: node
340 | reason: network delay
341 | component: os_018
342 | timestamp: 1590510180
343 | datetime: 2020-05-27 00:23:00",,"network delay
344 | 2020-05-27 00:23:00",0.0,29.0,task_4
345 | "There was one failure within the specified time range of May 27, 2020, from 00:30 to 01:00. The exact component responsible for this failure is currently unknown. Please identify the root cause component.","{
346 | ""1"": {
347 | ""root cause component"": ""os_003"",
348 | ""root cause reason"": ""network delay""
349 | }
350 | }","level: node
351 | reason: network delay
352 | component: os_017
353 | timestamp: 1590511980
354 | datetime: 2020-05-27 00:53:00",,os_017,0.0,30.0,task_3
355 | "During the specified time range of May 27, 2020, from 01:00 to 01:30, there was a failure reported in the system. The exact occurrence time of the root cause and the underlying reason for the failure are currently unknown. Please identify the root cause occurrence datetime and the root cause reason.","{
356 | ""1"": {
357 | ""root cause occurrence datetime"": ""2020-05-27 00:52:38"",
358 | ""root cause component"": ""db_009"",
359 | ""root cause reason"": ""db connection limit""
360 | }
361 | }","level: pod
362 | reason: CPU fault
363 | component: docker_006
364 | timestamp: 1590513780
365 | datetime: 2020-05-27 01:23:00",,"2020-05-27 01:23:00
366 | CPU fault",0.0,31.0,task_4
367 | "During the specified time range of May 27, 2020, from 01:30 to 02:00, the system experienced a failure. The exact root cause component and the underlying reason for this failure are currently unknown. Please identify the root cause component and the root cause reason.","{
368 | ""1"": {
369 | ""root cause occurrence datetime"": ""2020-05-27 01:24:00"",
370 | ""root cause component"": ""os_022"",
371 | ""root cause reason"": ""CPU fault""
372 | }
373 | }","level: service
374 | reason: db connection limit
375 | component: db_003
376 | timestamp: 1590515580
377 | datetime: 2020-05-27 01:53:00",,"db_003
378 | db connection limit",0.0,32.0,task_6
379 | "During the specified time range of May 27, 2020, from 02:00 to 02:30, the system experienced one failure. The exact component where the failure originated and the underlying reason for this failure are currently unknown. You are tasked with identifying the root cause component and the root cause reason.","{
380 | ""1"": {
381 | ""root cause component"": ""os_021"",
382 | ""root cause reason"": ""CPU fault""
383 | }
384 | }","level: service
385 | reason: db close
386 | component: db_007
387 | timestamp: 1590517380
388 | datetime: 2020-05-27 02:23:00",,"db_007
389 | db close",0.0,33.0,task_6
390 | "Within the time range of May 27, 2020, from 02:30 to 03:00, there was one failure detected in the system. The specific component responsible for this failure, the exact occurrence time, and the underlying reason for the failure are all currently unknown. You are tasked with identifying the root cause component, the root cause occurrence datetime, and the root cause reason.","{
391 | ""1"": {
392 | ""root cause occurrence datetime"": ""2020-05-27 02:30:00"",
393 | ""root cause component"": ""db_003"",
394 | ""root cause reason"": ""db connection limit""
395 | }
396 | }","level: service
397 | reason: db connection limit
398 | component: db_007
399 | timestamp: 1590519180
400 | datetime: 2020-05-27 02:53:00",db connection limit,"db_007
401 | 2020-05-27 02:53:00",0.3333333333333333,34.0,task_7
402 | "During the time range of May 27, 2020, from 05:00 to 05:30, a single failure occurred within the system. The specific component responsible for this failure, as well as the underlying reason, are currently unknown. Your task is to identify the root cause component and determine the root cause reason.","{
403 | ""1"": {
404 | ""root cause component"": ""db_007"",
405 | ""root cause reason"": ""db connection limit""
406 | }
407 | }","level: pod
408 | reason: CPU fault
409 | component: docker_001
410 | timestamp: 1590527340
411 | datetime: 2020-05-27 05:09:00",,"CPU fault
412 | docker_001",0.0,35.0,task_6
413 | "There was one failure recorded during the time range of May 27, 2020, from 05:30 to 06:00. The specific time when the root cause occurred and the reason for this failure are currently unknown. You need to identify the root cause occurrence datetime and the root cause reason.","{
414 | ""1"": {
415 | ""root cause occurrence datetime"": ""2020-05-27 05:24:03"",
416 | ""root cause component"": ""os_009"",
417 | ""root cause reason"": ""CPU fault""
418 | }
419 | }","level: node
420 | reason: network delay
421 | component: os_021
422 | timestamp: 1590529140
423 | datetime: 2020-05-27 05:39:00",,"2020-05-27 05:39:00
424 | network delay",0.0,36.0,task_4
425 | "During the specified time range of May 28, 2020, from 00:30 to 01:00, the system experienced a single failure. The root cause of this failure, however, is currently unknown. Please identify the reason behind this failure.","{
426 | ""1"": {
427 | ""root cause occurrence datetime"": ""2020-05-28 00:30:00"",
428 | ""root cause component"": ""os_021"",
429 | ""root cause reason"": ""CPU fault""
430 | }
431 | }","level: pod
432 | reason: CPU fault
433 | component: docker_001
434 | timestamp: 1590598020
435 | datetime: 2020-05-28 00:47:00",CPU fault,,1.0,37.0,task_2
436 | "Between the time range of May 28, 2020, from 03:30 to 04:00, a failure was detected in the system. The root cause of this failure is currently unknown. Your task is to identify the root cause reason for this failure.","{
437 | ""1"": {
438 | ""root cause component"": ""os_004"",
439 | ""root cause reason"": ""CPU fault""
440 | }
441 | }","level: node
442 | reason: network loss
443 | component: os_018
444 | timestamp: 1590608820
445 | datetime: 2020-05-28 03:47:00",,network loss,0.0,38.0,task_2
446 | "During the time range of May 29, 2020, from 02:00 to 02:30, the system experienced a failure. The exact component that caused this failure remains unknown. Please identify the root cause component responsible for this failure.","{
447 | ""1"": {
448 | ""root cause component"": ""db_003"",
449 | ""root cause reason"": ""db connection limit""
450 | }
451 | }","level: service
452 | reason: db connection limit
453 | component: db_003
454 | timestamp: 1590689460
455 | datetime: 2020-05-29 02:11:00",db_003,,1.0,39.0,task_3
456 | "On May 29, 2020, between 02:30 and 03:00, there was one failure reported. The specific reason for this failure is currently unknown. Your task is to identify the root cause reason behind this failure.","{
457 | ""1"": {
458 | ""root cause component"": ""docker_002"",
459 | ""root cause reason"": ""CPU fault""
460 | }
461 | }","level: pod
462 | reason: CPU fault
463 | component: docker_008
464 | timestamp: 1590691260
465 | datetime: 2020-05-29 02:41:00",CPU fault,,1.0,40.0,task_2
466 | "On May 29, 2020, between 03:30 and 04:00, a failure was identified in the system. However, the exact time when the root cause occurred is unknown. Please determine the root cause occurrence datetime.","{
467 | ""1"": {
468 | ""root cause occurrence datetime"": ""2020-05-29 03:20:00"",
469 | ""root cause component"": ""os_018"",
470 | ""root cause reason"": ""network loss""
471 | }
472 | }","level: pod
473 | reason: CPU fault
474 | component: docker_001
475 | timestamp: 1590694860
476 | datetime: 2020-05-29 03:41:00",,2020-05-29 03:41:00,0.0,41.0,task_1
477 | "On May 30, 2020, between 00:00 to 00:30, a single failure was observed. However, both the exact time of the root cause occurrence and the component involved in this failure are currently unknown. You are tasked with identifying the root cause occurrence datetime and the root cause component.","{
478 | ""1"": {
479 | ""root cause occurrence datetime"": ""2020-05-30 00:07:00"",
480 | ""root cause component"": ""os_002"",
481 | ""root cause reason"": ""CPU fault""
482 | }
483 | }","level: node
484 | reason: network loss
485 | component: os_021
486 | timestamp: 1590768780
487 | datetime: 2020-05-30 00:13:00",,"os_021
488 | 2020-05-30 00:13:00",0.0,42.0,task_5
489 | "There was a failure within the time range of May 30, 2020, from 02:30 to 03:00. The exact component responsible for the failure and the underlying reason for this failure are currently unknown. Please identify the root cause component and reason.","{
490 | ""1"": {
491 | ""root cause component"": ""os_022"",
492 | ""root cause reason"": ""CPU fault""
493 | }
494 | }","level: node
495 | reason: network delay
496 | component: os_009
497 | timestamp: 1590777780
498 | datetime: 2020-05-30 02:43:00",,"os_009
499 | network delay",0.0,43.0,task_6
500 | "Within the time range of May 30, 2020, from 03:30 to 04:00, there was an occurrence of a single system failure. However, the specific time at which the root cause occurred and the component responsible for this failure remain unidentified. Further investigation is needed to determine the root cause occurrence time and the root cause component.","{
501 | ""1"": {
502 | ""root cause occurrence datetime"": ""2020-05-30 03:23:43"",
503 | ""root cause component"": ""os_008"",
504 | ""root cause reason"": ""CPU fault""
505 | }
506 | }","level: node
507 | reason: network delay
508 | component: os_020
509 | timestamp: 1590781380
510 | datetime: 2020-05-30 03:43:00",,"os_020
511 | 2020-05-30 03:43:00",0.0,44.0,task_5
512 | "Within the specified time range of May 30, 2020, from 04:00 to 04:30, there was one failure reported. The exact root cause occurrence time is currently unknown. Please identify the root cause occurrence datetime.","{
513 | ""1"": {
514 | ""root cause occurrence datetime"": ""2020-05-30 04:06:48"",
515 | ""root cause component"": ""os_021"",
516 | ""root cause reason"": ""network delay""
517 | }
518 | }","level: pod
519 | reason: CPU fault
520 | component: docker_002
521 | timestamp: 1590783180
522 | datetime: 2020-05-30 04:13:00",,2020-05-30 04:13:00,0.0,45.0,task_1
523 | "Within the time range of May 30, 2020, from 05:00 to 05:30, the system encountered a single failure. The specific component responsible for this failure and the underlying reason remain unknown. Please identify the root cause component and the reason for this failure.","{
524 | ""1"": {
525 | ""root cause component"": ""db_002"",
526 | ""root cause reason"": ""db connection limit""
527 | }
528 | }","level: node
529 | reason: network loss
530 | component: os_018
531 | timestamp: 1590786780
532 | datetime: 2020-05-30 05:13:00",,"os_018
533 | network loss",0.0,46.0,task_6
534 | "During the time range of May 31, 2020, from 02:30 to 03:00, the system experienced a single failure. The exact component responsible for this failure is currently unknown. Please identify the component that caused the failure during this period.","{
535 | ""1"": {
536 | ""root cause component"": ""db_003"",
537 | ""root cause reason"": ""db connection limit""
538 | }
539 | }","level: node
540 | reason: network delay
541 | component: os_021
542 | timestamp: 1590864420
543 | datetime: 2020-05-31 02:47:00",,os_021,0.0,47.0,task_3
544 | "On May 31, 2020, during the time range from 03:00 to 03:30, there was a single failure in the system. The exact time when the root cause occurred is currently unknown. Please identify the root cause occurrence datetime.","{
545 | ""1"": {
546 | ""root cause occurrence datetime"": ""2020-05-31 02:58:13"",
547 | ""root cause component"": ""db_001"",
548 | ""root cause reason"": ""CPU fault""
549 | }
550 | }","level: node
551 | reason: network delay
552 | component: os_017
553 | timestamp: 1590866220
554 | datetime: 2020-05-31 03:17:00",,2020-05-31 03:17:00,0.0,48.0,task_1
555 | "During the specified time range of May 31, 2020, from 03:30 to 04:00, there was one failure reported. The exact time when the root cause occurred is unknown, making it challenging to diagnose the issue. Please determine the root cause occurrence datetime.","{
556 | ""1"": {
557 | ""root cause occurrence datetime"": ""2020-05-31 03:36:15"",
558 | ""root cause component"": ""os_018"",
559 | ""root cause reason"": ""CPU fault""
560 | }
561 | }","level: service
562 | reason: db connection limit
563 | component: db_003
564 | timestamp: 1590868020
565 | datetime: 2020-05-31 03:47:00",,2020-05-31 03:47:00,0.0,49.0,task_1
566 | "During the specified time range of May 31, 2020, from 04:00 to 04:30, the system experienced a failure. The exact occurrence time and component responsible for the root cause are currently unknown. Please identify the root cause occurrence datetime and the root cause component.","{
567 | ""1"": {
568 | ""root cause occurrence datetime"": ""2020-05-31 04:19:00"",
569 | ""root cause component"": ""db_001""
570 | }
571 | }","level: pod
572 | reason: CPU fault
573 | component: docker_004
574 | timestamp: 1590869820
575 | datetime: 2020-05-31 04:17:00",,"docker_004
576 | 2020-05-31 04:17:00",0.0,50.0,task_5
577 |
--------------------------------------------------------------------------------
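A minimal sketch, assuming only pandas and the column layout shown above, for aggregating the archived Telecom run (e.g., mean score per task type):

```python
import pandas as pd

# Columns per the CSV header: instruction, prediction, groundtruth,
# passed, failed, score, row_id, task_index.
df = pd.read_csv("rca/archive/agent-Telecom.csv")
print(df.groupby("task_index")["score"].agg(["mean", "count"]))
print("overall mean score:", round(df["score"].mean(), 3))
```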
/rca/baseline/cot_lm.py:
--------------------------------------------------------------------------------
1 | import tiktoken
2 | from rca.api_router import get_chat_completion
3 |
4 | system = """You will be provided with some telemetry data and an issue statement explaining a root cause analysis problem to resolve.
5 |
6 | {info}
7 |
8 | {data}"""
9 |
10 | user = """Now, I need you to provide an root cause analysis to the following question:
11 |
12 | ```issue
13 | {objective}
14 | ```
15 |
16 | Note: A root cause is the fundamental factor that triggers a service system failure, causing other system components to exhibit various anomalous behaviors. It consists of three elements: the root cause component, the start time of the root cause occurrence, and the reason for its occurrence. The objective of root cause analysis may vary, aiming to identify one or more of these elements based on the issue. Each failure has only one root cause. However, sometimes a system's abnormal state may be due to multiple simultaneous failures, each with its own root cause. If you find that there is a call relationship between multiple components exhibiting abnormal behavior, these anomalies originate from the same failure, with the component at the downstream end of the call chain being the root cause component. The anomalies in the other components are caused by the failure. If there is no call relationship between the abnormal components, each component may be the root cause of a different failure. Typically, the number of failures occurring within half an hour does not exceed three.
17 |
18 | Please first conduct a comprehensive analysis on the given telemetry data step-by-step in your response. Then, summarize your findings using the following JSON format to provide a concise answer to the given issue at the end of your response. In the summarized answer, you only need to provide the elements asked by the issue, and omit the other fields in the JSON. The overall format is as follows:
19 |
20 | {{
21 | "1": {{
22 | "root cause occurrence datetime": (if asked by the issue, format: '%Y-%m-%d %H:%M:%S', otherwise ommited),
23 | "root cause component": (if asked by the issue, one selected from the possible root cause component list, otherwise ommited),
24 | "root cause reason": (if asked by the issue, one selected from the possible root cause reason list, otherwise ommited),
25 | }}, (mandatory)
26 | "2": {{
27 | "root cause occurrence datetime": (if asked by the issue, format: '%Y-%m-%d %H:%M:%S', otherwise ommited),
28 | "root cause component": (if asked by the issue, one selected from the possible root cause component list, otherwise ommited),
29 | "root cause reason": (if asked by the issue, one selected from the possible root cause reason list, otherwise ommited),
30 | }}, (only if the failure number is "unknown" or "more than one" in the issue)
31 | ... (only if the failure number is "unknown" or "more than one" in the issue)
32 | }}
33 | (DO NOT include "```json" and "```" tags. DO include only the JSON object with the brackets "{{}}".)
34 |
35 | Please follow the format above to provide your response to the current issue.
36 |
37 | Response below:"""
38 |
39 | class CoTLM:
40 | def __init__(self, oracle, schema) -> None:
41 | self.tokenizer = tiktoken.encoding_for_model("gpt-4o")
42 | self.oracle = oracle
43 | self.schema = schema
44 |
45 |
46 | def run(self, instruction, period_data, sample_interval, logger, max_try=3):
47 | logger.info(f"Objective: {instruction}")
48 |
49 | data = f"""## TELEMETRY DATA (Sampled every {sample_interval/60} min):"""
50 | for key in sorted(period_data.keys()):
51 | value = period_data[key]
52 | data += "\n\n" + "".join([f"### {str(key).upper()} DATA", value])
53 | logger.debug(f"{str(key).upper()} DATA tokens: {len(self.tokenizer.encode(value))}")
54 | info = self.schema
55 | prompt = [
56 | {'role': 'system', 'content': system.format(info=info, data=data)},
57 | {'role': 'user', 'content': user.format(objective=instruction)}
58 | ]
59 |
60 | logger.debug(f"prompt tokens: {len(self.tokenizer.encode(prompt[0]['content']))}")
61 |
62 | for i in range(max_try):
63 | try:
64 | response = get_chat_completion(
65 | messages=prompt,
66 | )
67 | logger.debug(f"Raw Response:\n{response}")
68 | return response, prompt
69 | except Exception as e:
70 | logger.error(e)
71 | if 'context_length_exceeded' in str(e):
72 | logger.error("Token length exceeds the limit.")
73 | return "EXCEED!", prompt
74 | logger.warning("Max steps reached. Please check the history.")
75 | return "Max steps reached. Please check the history.", prompt
76 |
77 |
78 |
--------------------------------------------------------------------------------
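A hypothetical invocation sketch for the baseline above; the schema text, telemetry strings, and objective are stand-ins rather than real OpenRCA data, and a valid `rca/api_config.yaml` is assumed since `run` issues a live API call:

```python
import logging
from rca.baseline.cot_lm import CoTLM

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("cot_lm_demo")

# `oracle` is stored but unused by run(); `schema` is injected into the system prompt.
baseline = CoTLM(oracle=None, schema="## SCHEMA\n(component and KPI descriptions go here)")
response, prompt = baseline.run(
    instruction="Identify the root cause reason for the failure between 02:00 and 02:30.",
    period_data={"metric": "\n(pre-serialized metric rows)", "log": "\n(pre-serialized log rows)"},
    sample_interval=60,  # seconds; rendered as 1.0 min in the prompt header
    logger=logger,
)
```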
/rca/baseline/direct_lm.py:
--------------------------------------------------------------------------------
1 | import tiktoken
2 | from rca.api_router import get_chat_completion
3 |
4 | system = """You will be provided with some telemetry data and an issue statement explaining a root cause analysis problem to resolve.
5 |
6 | {info}
7 |
8 | {data}"""
9 |
10 | user = """Now, I need you to provide an root cause analysis to the following question:
11 |
12 | ```issue
13 | {objective}
14 | ```
15 |
16 | Note: A root cause is the fundamental factor that triggers a service system failure, causing other system components to exhibit various anomalous behaviors. It consists of three elements: the root cause component, the start time of the root cause occurrence, and the reason for its occurrence. The objective of root cause analysis may vary, aiming to identify one or more of these elements based on the issue. Each failure has only one root cause. However, sometimes a system's abnormal state may be due to multiple simultaneous failures, each with its own root cause. If you find that there is a call relationship between multiple components exhibiting abnormal behavior, these anomalies originate from the same failure, with the component at the downstream end of the call chain being the root cause component. The anomalies in the other components are caused by the failure. If there is no call relationship between the abnormal components, each component may be the root cause of a different failure. Typically, the number of failures occurring within half an hour does not exceed three.
17 |
18 | Your response should be structured into a JSON format, itemising the relevant root cause information you find. You only need to provide the elements asked by the issue, and omit the other fields in the JSON. The overall format is as follows:
19 |
20 | {{
21 | "1": {{
22 | "root cause occurrence datetime": (if asked by the issue, format: '%Y-%m-%d %H:%M:%S', otherwise ommited),
23 | "root cause component": (if asked by the issue, one selected from the possible root cause component list, otherwise ommited),
24 | "root cause reason": (if asked by the issue, one selected from the possible root cause reason list, otherwise ommited),
25 | }}, (mandatory)
26 | "2": {{
27 | "root cause occurrence datetime": (if asked by the issue, format: '%Y-%m-%d %H:%M:%S', otherwise ommited),
28 | "root cause component": (if asked by the issue, one selected from the possible root cause component list, otherwise ommited),
29 | "root cause reason": (if asked by the issue, one selected from the possible root cause reason list, otherwise ommited),
30 | }}, (only if the failure number is "unknown" or "more than one" in the issue)
31 | ... (only if the failure number is "unknown" or "more than one" in the issue)
32 | }}
33 | (DO NOT include "```json" and "```" tags. DO include only the JSON object with the brackets "{{}}".)
34 |
35 | Please follow the format above to provide your response to the current issue.
36 |
37 | Response below:"""
38 |
39 | class DirectLM:
40 | def __init__(self, oracle, schema) -> None:
41 | self.tokenizer = tiktoken.encoding_for_model("gpt-4o")
42 | self.oracle = oracle
43 | self.schema = schema
44 |
45 |
46 | def run(self, instruction, period_data, sample_interval, logger, max_try=3):
47 | logger.info(f"Objective: {instruction}")
48 |
49 | data = f"""## TELEMETRY DATA (Sampled every {sample_interval/60} min):"""
50 | for key in sorted(period_data.keys()):
51 | value = period_data[key]
52 | data += "\n\n" + "".join([f"### {str(key).upper()} DATA", value])
53 | logger.debug(f"{str(key).upper()} DATA tokens: {len(self.tokenizer.encode(value))}")
54 | info = self.schema
55 | prompt = [
56 | {'role': 'system', 'content': system.format(info=info, data=data)},
57 | {'role': 'user', 'content': user.format(objective=instruction)}
58 | ]
59 |
60 | logger.debug(f"prompt tokens: {len(self.tokenizer.encode(prompt[0]['content']))}")
61 |
62 | for i in range(max_try):
63 | try:
64 | response = get_chat_completion(
65 | messages=prompt,
66 | )
67 | logger.debug(f"Raw Response:\n{response}")
68 | return response, prompt
69 | except Exception as e:
70 | logger.error(e)
71 | if 'context_length_exceeded' in str(e):
72 | logger.error("Token length exceeds the limit.")
73 | return "EXCEED!", prompt
74 | logger.warning("Max steps reached. Please check the history.")
75 | return "Max steps reached. Please check the history.", prompt
76 |
77 |
78 |
--------------------------------------------------------------------------------
/rca/baseline/oracle_kpis.py:
--------------------------------------------------------------------------------
1 | kpi_Telecom = {
2 | "db": ["Proc_Used_Pct",
3 | "Sess_Connect",
4 | "Proc_User_Used_Pct",
5 | "On_Off_State",
6 | "tnsping_result_time",
7 | ],
8 | "cpu": ["container_cpu_used",
9 | ],
10 | "mem": ["Memory_used_pct",
11 | ],
12 | "io": ["Disk_io_util"
13 | ],
14 | "net": ["Sent_queue",
15 | "Received_queue",
16 | ]
17 |
18 | }
19 |
20 | kpi_Telecom_len = sum(len(v) for v in kpi_Telecom.values())
21 |
22 |
23 | kpi_Bank = {
24 | "jvm": ["JVM-Operating System_7779_JVM_JVM_CPULoad",
25 | "JVM-Operating System_7778_JVM_JVM_CPULoad",
26 | "JVM-Memory_7778_JVM_Memory_NoHeapMemoryUsed",
27 | "JVM-Memory_7779_JVM_Memory_NoHeapMemoryUsed",
28 | "JVM-Memory_7779_JVM_Memory_HeapMemoryUsage",
29 | "JVM-Memory_7778_JVM_Memory_HeapMemoryUsage",
30 | "JVM-Memory_7778_JVM_Memory_HeapMemoryUsed",
31 | "JVM-Memory_7779_JVM_Memory_HeapMemoryUsed",
32 | ],
33 |
34 | "cpu": ["OSLinux-CPU_CPU_CPUCpuUtil",
35 | "OSLinux-CPU_CPU-0_SingleCpuUtil",
36 | ],
37 |
38 | "mem": ["OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc",
39 | "OSLinux-OSLinux_MEMORY_MEMORY_NoCacheMemPerc",
40 | "OSLinux-OSLinux_MEMORY_MEMORY_MEMFreeMem",
41 | ],
42 |
43 | "io": ["OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKReadWrite",
44 | "OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead",
45 | "OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKReadWrite",
46 | "OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRTps",
47 | "OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKRTps",
48 | "OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKRead",
49 | "OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKBps",
50 | "OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKPercentBusy",
51 | ],
52 |
53 | "net": ["OSLinux-OSLinux_NETWORK_NETWORK_TCP-FIN-WAIT",
54 | "OSLinux-OSLinux_NETWORK_ens160_NETBandwidthUtil",
55 | "OSLinux-OSLinux_NETWORK_NETWORK_TotalTcpConnNum",
56 | "OSLinux-OSLinux_NETWORK_ens160_NETPacketsOut",
57 | "OSLinux-OSLinux_NETWORK_ens160_NETPacketsIn",
58 | "OSLinux-OSLinux_NETWORK_ens160_NETKBTotalPerSec",
59 | "OSLinux-OSLinux_NETWORK_NETWORK_TCP-CLOSE-WAIT"
60 | ]
61 | }
62 |
63 |
64 | kpi_Bank_len = sum([len(v) for v in kpi_Bank.values()])
65 |
66 |
67 | kpi_Market = {
68 | "process": ["container_threads",
69 | ],
70 |
71 | "cpu": ["container_cpu_usage_seconds",
72 | "system.cpu.pct_usage",
73 | ],
74 |
75 | "mem": ["system.mem.used",
76 | "container_memory_usage_MB",
77 | ],
78 |
79 | "io": ["container_fs_reads./dev/vda",
80 | "container_fs_writes./dev/vda",
81 | "system.io.r_s",
82 | "system.io.w_s",
83 | "container_fs_writes./dev/vda1",
84 | "system.disk.used",
85 | "system.disk.pct_usage",
86 | ],
87 |
88 | "net": ["container_network_receive_packets.eth0",
89 | "container_network_receive_MB.eth0",
90 | "recommendationservice-grpc",
91 | "frontend-http",
92 | "cartservice-grpc",
93 | "checkoutservice-grpc",
94 | "productcatalogservice-grpc",
95 | "emailservice-grpc",
96 | "adservice-grpc",
97 | ],
98 | }
99 |
100 | kpi_Market_len = sum([len(v) for v in kpi_Market.values()])
--------------------------------------------------------------------------------
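A small sketch showing one way these whitelists could be consumed, e.g., flattening them into (resource, KPI) pairs before filtering a metrics table; the `flatten` helper is illustrative, not part of the repo:

```python
from rca.baseline.oracle_kpis import kpi_Bank, kpi_Market, kpi_Telecom

def flatten(kpis):
    # one (resource, kpi) pair per whitelisted KPI name
    return [(resource, kpi) for resource, names in kpis.items() for kpi in names]

for name, kpis in [("Telecom", kpi_Telecom), ("Bank", kpi_Bank), ("Market", kpi_Market)]:
    print(name, len(flatten(kpis)), "oracle KPIs")
```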
/rca/baseline/rca_agent/controller.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | from IPython.terminal.embed import InteractiveShellEmbed
4 |
5 | from rca.baseline.rca_agent.executor import execute_act
6 |
7 | from rca.api_router import get_chat_completion
8 |
9 | system = """You are the Administrator of a DevOps Assistant system for failure diagnosis. To solve each given issue, you should iteratively instruct an Executor to write and execute Python code for data analysis on telemetry files of target system. By analyzing the execution results, you should approximate the answer step-by-step.
10 |
11 | There is some domain knowledge for you:
12 |
13 | {background}
14 |
15 | {agent}
16 |
17 | The issue you are going to solve is:
18 |
19 | {objective}
20 |
21 | Solve the issue step-by-step. In each step, your response should follow the JSON format below:
22 |
23 | {format}
24 |
25 | Let's begin."""
26 |
27 | format = """{
28 | "analysis": (Your analysis of the code execution result from Executor in the last step, with detailed reasoning of 'what have been done' and 'what can be derived'. Respond 'None' if it is the first step.),
29 | "completed": ("True" if you believe the issue is resolved, and an answer can be derived in the 'instruction' field. Otherwise "False"),
30 | "instruction": (Your instruction for the Executor to perform via code execution in the next step. Do not involve complex multi-step instruction. Keep your instruction atomic, with clear request of 'what to do' and 'how to do'. Respond a summary by yourself if you believe the issue is resolved. Respond a summary by yourself if you believe the issue is resolved. Respond a summary by yourself if you believe the issue is resolved.)
31 | }
32 | (DO NOT include "```json" and "```" tags. DO include only the JSON object with the brackets "{}". Use '\\n' instead of an actual newline character to ensure JSON compatibility when you want to insert a line break within a string.)"""
33 |
34 | summary = """Now, you have decided to finish your reasoning process. You should now provide the final answer to the issue. The candidates of possible root cause components and reasons are provided to you. The root cause components and reasons must be selected from the provided candidates.
35 |
36 | {cand}
37 |
38 | Recall the issue is: {objective}
39 |
40 | Please first review your previous reasoning process to infer an exact answer to the issue. Then, summarize your final answer of the root causes using the following JSON format at the end of your response:
41 |
42 | ```json
43 | {{
44 | "1": {{
45 | "root cause occurrence datetime": (if asked by the issue, format: '%Y-%m-%d %H:%M:%S', otherwise ommited),
46 | "root cause component": (if asked by the issue, one selected from the possible root cause component list, otherwise ommited),
47 | "root cause reason": (if asked by the issue, one selected from the possible root cause reason list, otherwise ommited),
48 | }}, (mandatory)
49 | "2": {{
50 | "root cause occurrence datetime": (if asked by the issue, format: '%Y-%m-%d %H:%M:%S', otherwise ommited),
51 | "root cause component": (if asked by the issue, one selected from the possible root cause component list, otherwise ommited),
52 | "root cause reason": (if asked by the issue, one selected from the possible root cause reason list, otherwise ommited),
53 | }}, (only if the failure number is "unknown" or "more than one" in the issue)
54 | ... (only if the failure number is "unknown" or "more than one" in the issue)
55 | }}
56 | ```
57 | (Please use "```json" and "```" tags to wrap the JSON object. You only need to provide the elements asked by the issue, and omit the other fields in the JSON.)
58 | Note that all the root cause components and reasons must be selected from the provided candidates. Do not reply 'unknown' or 'null' or 'not found' in the JSON. Do not be too conservative in selecting the root cause components and reasons. Be decisive and infer a plausible answer based on your current observations."""
59 |
60 | def control_loop(objective:str, plan:str, ap, bp, logger, max_step = 15, max_turn = 3) -> str:
61 |
62 | prompt = [
63 | {'role': 'system', 'content': system.format(objective=objective,
64 | format=format,
65 | agent=ap.rules,
66 | background=bp.schema)},
67 | {'role': 'user', 'content': "Let's begin."}
68 | ]
69 |
70 | history = []
71 | trajectory = []
72 | observation = "Let's begin."
73 | status = False
74 | kernel = InteractiveShellEmbed()
75 | init_code = "import pandas as pd\n"+ \
76 | "pd.set_option('display.width', 427)\n"+ \
77 | "pd.set_option('display.max_columns', 10)\n"
78 | kernel.run_cell(init_code)
79 |
80 | for step in range(max_step):
81 |
82 | note = [{'role': 'user', 'content': f"Continue your reasoning process for the target issue:\n\n{objective}\n\nFollow the rules during issue solving:\n\n{ap.rules}.\n\nResponse format:\n\n{format}"}]
83 | attempt_actor = []
84 | try:
85 | response_raw = get_chat_completion(
86 | messages=prompt + note,
87 | )
88 | if "```json" in response_raw:
89 | response_raw = re.search(r"```json\n(.*)\n```", response_raw, re.S).group(1).strip()
90 | logger.debug(f"Raw Response:\n{response_raw}")
91 | if '"analysis":' not in response_raw or '"instruction":' not in response_raw or '"completed":' not in response_raw:
92 | logger.warning("Invalid response format. Please provide a valid JSON response.")
93 | prompt.append({'role': 'assistant', 'content': response_raw})
94 | prompt.append({'role': 'user', 'content': "Please provide your analysis in requested JSON format."})
95 | continue
96 | response = json.loads(response_raw)
97 | analysis = response['analysis']
98 | instruction = response['instruction']
99 | completed = response['completed']
100 | logger.info('-'*80 + '\n' + f"### Step[{step+1}]\nAnalysis: {analysis}\nInstruction: {instruction}" + '\n' + '-'*80)
101 |
102 | if completed == "True":
103 | kernel.reset()
104 | prompt.append({'role': 'assistant', 'content': response_raw})
105 | prompt.append({'role': 'user', 'content': summary.format(objective=objective,
106 | cand=bp.cand)})
107 | answer = get_chat_completion(
108 | messages=prompt,
109 | )
110 | logger.debug(f"Raw Final Answer:\n{answer}")
111 | prompt.append({'role': 'assistant', 'content': answer})
112 | if "```json" in answer:
113 | answer = re.search(r"```json\n(.*)\n```", answer, re.S).group(1).strip()
114 | return answer, trajectory, prompt
115 |
116 | code, result, status, new_history = execute_act(instruction, bp.schema, history, attempt_actor, kernel, logger)
117 | if not status:
118 | logger.warning('Self-Correction failed.')
119 | observation = "The Executor failed to execute the instruction. Please provide a new instruction."
120 | else: observation = f"{result}"  # only overwrite the failure notice when execution succeeded
121 | history = new_history
122 | trajectory.append({'code': f"# In[{step+1}]:\n\n{code}", 'result': f"Out[{step+1}]:\n```\n{result}```"})
123 | logger.info('-'*80 + '\n' + f"Step[{step+1}]\n### Observation:\n{result}" + '\n' + '-'*80)
124 | prompt.append({'role': 'assistant', 'content': response_raw})
125 | prompt.append({'role': 'user', 'content': observation})
126 |
127 | except Exception as e:
128 | logger.error(e)
129 | prompt.append({'role': 'assistant', 'content': response_raw})
130 | prompt.append({'role': 'user', 'content': f"{str(e)}\nPlease provide your analysis in requested JSON format."})
131 | if 'context_length_exceeded' in str(e):
132 | logger.warning("Token length exceeds the limit.")
133 | kernel.reset()
134 | return "Token length exceeds. No root cause found.", trajectory, prompt
135 |
136 | logger.warning("Max steps reached. Please check the history.")
137 | kernel.reset()
138 | final_prompt = {'role': 'user', 'content': summary.format(objective=objective,
139 | cand=bp.cand).replace('Now, you have decided to finish your reasoning process. ', 'Now, the maximum steps of your reasoning have been reached. ')}
140 | if prompt[-1]['role'] == 'user':
141 | prompt[-1]['content'] = final_prompt['content']
142 | else:
143 | prompt.append({'role': 'user', 'content': final_prompt['content']})
144 | answer = get_chat_completion(
145 | messages=prompt,
146 | )
147 | logger.debug(f"Raw Final Answer:\n{answer}")
148 | prompt.append({'role': 'assistant', 'content': answer})
149 | if "```json" in answer:
150 | answer = re.search(r"```json\n(.*)\n```", answer, re.S).group(1).strip()
151 | return answer, trajectory, prompt
152 |
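153 | # Illustrative note (hedged, not part of the pipeline): the JSON object the control
154 | # loop above parses at each step carries exactly the three keys it checks for before
155 | # `json.loads`. The values below are made-up examples, not output from a real run:
156 | #
157 | # {
158 | #     "analysis": "sr of Tomcat02 drops below the global P5 around the failure start.",
159 | #     "instruction": "Filter the Tomcat02 rows of metric_container.csv within the failure window.",
160 | #     "completed": "False"
161 | # }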
--------------------------------------------------------------------------------
/rca/baseline/rca_agent/executor.py:
--------------------------------------------------------------------------------
1 | import re
2 | import time
3 | from datetime import datetime
4 | from rca.api_router import get_chat_completion
5 | import tiktoken
6 | import traceback
7 |
8 | system = """You are a DevOps assistant for writing Python code to answer DevOps questions. For each question, you need to write Python code to solve it by retrieving and processing telemetry data of the target system. Your generated Python code will be automatically submitted to an IPython Kernel, and the execution output of the IPython Kernel will be used as the answer to the question.
9 |
10 | {rule}
11 |
12 | There is some domain knowledge for you:
13 |
14 | {background}
15 |
16 | Your response should follow the Python block format below:
17 |
18 | {format}"""
19 |
20 | format = """```python
21 | (YOUR CODE HERE)
22 | ```"""
23 |
24 | summary = """The code execution is successful. The execution result is shown below:
25 |
26 | {result}
27 |
28 | Please summarize a straightforward answer to the question based on the execution results. Use plain English."""
29 |
30 | conclusion = """{answer}
31 |
32 | The original code execution output of IPython Kernel is also provided below for reference:
33 |
34 | {result}"""
35 |
36 | rule = """## RULES OF PYTHON CODE WRITING:
37 |
38 | 1. Reuse variables as much as possible for execution efficiency, since the IPython Kernel is stateful, i.e., variables defined in previous steps can be used in subsequent steps.
39 | 2. Use variable names rather than `print()` to display execution results, since your environment is an IPython Kernel rather than a plain Python interpreter. If you want to display multiple variables, use commas to separate them, e.g., `var1, var2`.
40 | 3. Use pandas DataFrames to process and display tabular data for efficiency and brevity. Avoid transforming a DataFrame into a list or dict for display.
41 | 4. If you encounter an error or unexpected result, rewrite the code by referring to the given IPython Kernel error message.
42 | 5. Do not simulate any virtual situation or assume anything unknown. Solve the real problem.
43 | 6. Do not store any data as files in the disk. Only cache the data as variables in the memory.
44 | 7. Do not visualize the data or draw pictures or graphs via Python. You can only provide text-based results. Never include the `matplotlib` or `seaborn` library in the code.
45 | 8. Do not generate anything except the Python code block, unless the instruction tells you to 'Use plain English'. If the input instruction is a summarization task (which typically happens in the last step), comprehensively summarize the conclusion as a string in your code and display it directly.
46 | 9. Do not calculate threshold AFTER filtering data within the given time duration. Always calculate global thresholds using the entire KPI series of a specific component within a metric file BEFORE filtering data within the given time duration.
47 | 10. All issues use **UTC+8** time. However, the local machine's default timezone is unknown. Please use `pytz.timezone('Asia/Shanghai')` to explicitly set the timezone to UTC+8.
48 | """
49 |
50 | def execute_act(instruction:str, background:str, history, attempt, kernel, logger) -> tuple:
51 |
52 | logger.debug("Start execution")
53 | t1 = datetime.now()
54 | if history == []:
55 | history = [
56 | {'role': 'system', 'content': system.format(rule=rule, background=background, format=format)},
57 | ]
58 | code_pattern = re.compile(r"```python\n(.*?)\n```", re.DOTALL)
59 | code = ""
60 | result = ""
61 | retry_flag = False
62 | status = False
63 | history.extend([{'role': 'user', 'content': instruction}])
64 | prompt = history.copy()
65 | note = [{'role': 'user', 'content': f"Continue your code writing process following the rules:\n\n{rule}\n\nResponse format:\n\n{format}"}]
66 | tokenizer = tiktoken.encoding_for_model("gpt-4")
67 | for i in range(2):
68 | try:
69 | if not retry_flag:
70 | response = get_chat_completion(
71 | messages=prompt + note,
72 | )
73 | else:
74 | response = get_chat_completion(
75 | messages=prompt,
76 | )
77 | retry_flag = False
78 | if re.search(code_pattern, response):
79 | code = re.search(code_pattern, response).group(1).strip()
80 | else:
81 | code = response.strip()
82 | logger.debug(f"Raw Code:\n{code}")
83 | if "import matplotlib" in code or "import seaborn" in code:
84 |                 logger.warning("Generated visualization code detected.")
85 | prompt.append({'role': 'assistant', 'content': code})
86 | prompt.append({'role': 'user', 'content': "You are not permitted to generate visualizations. If the instruction requires visualization, please provide the text-based results."})
87 | continue
88 | exec = kernel.run_cell(code)
89 | status = exec.success
90 | if status:
91 | result = str(exec.result).strip()
92 | tokens_len = len(tokenizer.encode(result))
93 | if tokens_len > 16384:
94 | logger.warning(f"Token length exceeds the limit: {tokens_len}")
95 | continue
96 | t2 = datetime.now()
97 | row_pattern = r"\[(\d+)\s+rows\s+x\s+\d+\s+columns\]"
98 | match = re.search(row_pattern, result)
99 | if match:
100 | rows = int(match.group(1))
101 | if rows > 10:
102 |                         result += f"\n\n**Note**: The printed pandas DataFrame is truncated due to its size. Only **10 rows** are displayed, which may introduce observation bias due to the incomplete table. If you want to comprehensively understand the details without bias, please ask the Executor to use `df.head(X)` to display more rows."
103 | logger.debug(f"Execution Result:\n{result}")
104 | logger.debug(f"Execution finished. Time cost: {t2-t1}")
105 | history.extend([
106 | {'role': 'assistant', 'content': code},
107 | {'role': 'user', 'content': summary.format(result=result)},
108 | ])
109 | answer = get_chat_completion(
110 | messages=history,
111 | )
112 | logger.debug(f"Brief Answer:\n{answer}")
113 | history.extend([
114 | {'role': 'assistant', 'content': answer},
115 | ])
116 | result = conclusion.format(answer=answer, result=result)
117 |
118 | return code, result, status, history
119 | else:
120 | result = ''.join(traceback.format_exception(type(exec.error_in_exec), exec.error_in_exec, exec.error_in_exec.__traceback__))
121 | t2 = datetime.now()
122 | logger.warning(f"Execution failed. Error message: {result}")
123 | logger.debug(f"Execution finished. Time cost: {t2-t1}")
124 | prompt.append({'role': 'assistant', 'content': code})
125 | prompt.append({'role': 'user', 'content': f"Execution failed:\n{result}\nPlease revise your code and retry."})
126 | retry_flag = True
127 |
128 | except Exception as e:
129 | logger.error(e)
130 | time.sleep(1)
131 |
132 | t2 = datetime.now()
133 | logger.error(f"Max try reached. Please check the history. Time cost: {t2-t1}")
134 | err = "The Executor failed to complete the instruction, please re-write a new instruction for Executor."
135 | history.extend([{'role': 'assistant', 'content': err}])
136 | return err, err, True, history
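137 |
138 | # Hedged usage sketch (not invoked by the pipeline): drives `execute_act` against a
139 | # local InteractiveShell, which provides the `run_cell`/`reset` interface used above.
140 | # Running it requires a configured `rca/api_config.yaml`; the instruction string and
141 | # dataset path are illustrative assumptions.
142 | if __name__ == "__main__":
143 |     from IPython.core.interactiveshell import InteractiveShell
144 |     from loguru import logger as demo_logger
145 |     demo_kernel = InteractiveShell.instance()  # stateful: variables persist across calls
146 |     code, result, status, history = execute_act(
147 |         instruction="Load `dataset/Bank/telemetry/2021_03_05/metric/metric_app.csv` with pandas and display its first 5 rows.",
148 |         background="", history=[], attempt=[], kernel=demo_kernel, logger=demo_logger)
149 |     demo_logger.info(f"status={status}\n{result}")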
--------------------------------------------------------------------------------
/rca/baseline/rca_agent/prompt/agent_prompt.py:
--------------------------------------------------------------------------------
1 | rules = """## RULES OF FAILURE DIAGNOSIS:
2 |
3 | What you SHOULD do:
4 |
5 | 1. **Follow the workflow of `preprocess -> anomaly detection -> fault identification -> root cause localization` for failure diagnosis.**
6 | 1.1. Preprocess:
7 | - Aggregate each KPI of each component that could be the root cause component to obtain multiple time series classified by 'component-KPI' (e.g., service_A-cpu_usage_pct).
8 | - Then, calculate global thresholds (e.g., global P95, where 'global' means the threshold is computed over the entire 'component-KPI' time series within a whole metric file) for each 'component-KPI' time series. Finally, filter data within the given time duration for all time series to perform further analysis.
9 | - Since the root cause component must be selected from the provided possible root cause components, components at all other levels (e.g., service mesh components, middleware components, etc.) should be ignored.
10 | 1.2. Anomaly detection:
11 | - An anomaly is typically a data point that exceeds the global threshold.
12 | - Look for anomalies below a certain threshold (e.g., <=P95, <=P15, or <=P5) in traffic KPIs or business KPIs (e.g., success rate (sr)) since some network failures can cause a sudden drop in them due to packet loss.
13 | - Loosen the global threshold (e.g., from >=P95 to >=P90, or from <=P95 to <=P15 or <=P5) if you really cannot find any anomalies.
14 | 1.3. Fault identification:
15 | - A 'fault' is a consecutive sub-series of a specific component-KPI time series. Thus, fault identification is the process of identifying which components experienced faults, on which resources, and at what occurrence time points.
16 | - Filter out isolated noise spikes to locate faults.
17 | - If the maximum (or minimum) value in a fault's sub-series only slightly exceeds (or falls below) the threshold (e.g., the threshold breach is <= 50% of the extremal value), it is likely a false positive caused by random KPI fluctuations and should be excluded.
18 | 1.4. Root cause localization:
19 | - The objective of root cause localization is to determine which identified 'fault' is the root cause of the failure. The root cause occurrence time, component, and reason can be derived from the first data point of that fault.
20 | - If multiple faulty components are identified at **different levels** (e.g., some being containers and others nodes), and all of them are potential root cause candidates, while the issue itself describes a **single failure**, the root cause level should be determined by the fault that shows the most significant deviation from the threshold (i.e., >> 50%). However, this method is only applicable to identifying the root cause level, not the root cause component. If there are multiple faulty components at the same level, you should use traces and logs to identify the root cause component.
21 | - If multiple service-level faulty components are identified, the root cause component is typically the last (the most downstream in a call chain) **faulty** service within a trace. Use traces to identify the root cause component among multiple faulty services.
22 | - If multiple container-level faulty components are identified, the root cause component is typically the last (the most downstream in a call chain) **faulty** container within a trace. Use traces to identify the root cause component among multiple faulty containers.
23 | - If multiple node-level faulty components are identified and the issue doesn't specify **a single failure**, each of these nodes might be the root cause of a separate failure. Otherwise, the predominant node with the most faults is the root cause component. Node-level failures do not propagate, and traces only capture communication between containers or between services.
24 | - If only one resource KPI of a single component has a fault within the given time, that fault is the root cause. Otherwise, you should use traces and logs to identify the root cause component and reason.
25 | 2. **Follow the order of `threshold calculation -> data extraction -> metric analysis -> trace analysis -> log analysis` for failure diagnosis.**
26 | 2.0. Before analysis: Extract and filter the data down to the failure duration only after the global threshold has been calculated. After these two steps, you can perform metric analysis, trace analysis, and log analysis.
27 | 2.1. Metric analysis: Checking whether each KPI of each component has consecutive anomalies beyond the global threshold is the fastest way to find faults. Since there are a large number of traces and logs, metric analysis should be used first to narrow down the search space of duration and components.
28 | 2.2. Trace analysis: Traces can further localize which container-level or service-level faulty component is the root cause component when metric analysis has identified multiple faulty components at the same level (container or service).
29 | 2.3. Log analysis: Logs can further localize which resource is the root cause reason when metric analysis has identified multiple faulty resource KPIs of a component. Logs can also help identify the root cause component among multiple faulty components at the same level.
30 | 2.4. Always confirm that the target key or field is valid (e.g., component name, KPI name, trace ID, log ID, etc.) when the Executor's retrieval result is empty.
31 |
32 | What you SHOULD NOT do:
33 |
34 | 1. **DO NOT include any programming language (Python) in your response.** Instead, provide an ordered list of steps with concrete descriptions in natural language (English).
35 | 2. **DO NOT convert the timestamp to datetime or the datetime to timestamp by yourself.** These detailed processes will be handled by the Executor.
36 | 3. **DO NOT use the local data (filtered/cached series in a specific time duration) to calculate the global threshold of aggregated 'component-KPI' time series.** Always use the entire KPI series of a specific component within a metric file (typically one day's KPIs) to calculate the threshold. To obtain global thresholds, first aggregate each KPI of each component to calculate its threshold, and then retrieve the objective time duration of the aggregated 'component-KPI' to perform anomaly detection and spike filtering.
37 | 4. **DO NOT visualize the data or draw pictures or graphs via Python.** The Executor can only provide text-based results. Never include the `matplotlib` or `seaborn` library in the code.
38 | 5. **DO NOT save anything in the local file system.** Cache the intermediate results in the IPython Kernel. Never use the bash command in the code cell.
39 | 6. **DO NOT calculate threshold AFTER filtering data within the given time duration.** Always calculate global thresholds using the entire KPI series of a specific component within a metric file BEFORE filtering data within the given time duration.
40 | 7. **DO NOT query a specific KPI without knowing which KPIs are available.** Different systems may have completely different KPI naming conventions. If you want to query a specific KPI, first ensure that you are aware of all the available KPIs.
41 | 8. **DO NOT mistakenly identify a healthy (non-faulty) service at the downstream end of a trace that includes faulty components as the root cause.** The root cause component should be the most downstream **faulty** service to appear within the trace call chain, which must first and foremost be a FAULTY component identified by metrics analysis.
42 | 9. **DO NOT focus solely on warning or error logs during log analysis. Many info logs contain critical information about service operations and interactions between services, which can be valuable for root cause analysis.**"""
43 |
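44 | # Hedged illustration of rules 1.1 and SHOULD-NOT 3/6 (global threshold BEFORE time
45 | # filtering). `df`, `t0`, and `t1` are assumed names for a loaded metric DataFrame
46 | # (columns: timestamp, cmdb_id, kpi_name, value) and the failure window bounds:
47 | #
48 | #     p95 = df.groupby(['cmdb_id', 'kpi_name'])['value'].transform(lambda s: s.quantile(0.95))
49 | #     df['is_anomaly'] = df['value'] >= p95                            # global, per component-KPI
50 | #     window = df[(df['timestamp'] >= t0) & (df['timestamp'] <= t1)]   # filter only afterwards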
--------------------------------------------------------------------------------
/rca/baseline/rca_agent/prompt/basic_prompt_Bank.py:
--------------------------------------------------------------------------------
1 | cand = """## POSSIBLE ROOT CAUSE REASONS:
2 |
3 | - high CPU usage
4 | - high memory usage
5 | - network latency
6 | - network packet loss
7 | - high disk I/O read usage
8 | - high disk space usage
9 | - high JVM CPU load
10 | - JVM Out of Memory (OOM) Heap
11 |
12 | ## POSSIBLE ROOT CAUSE COMPONENTS:
13 |
14 | - apache01
15 | - apache02
16 | - Tomcat01
17 | - Tomcat02
18 | - Tomcat04
19 | - Tomcat03
20 | - MG01
21 | - MG02
22 | - IG01
23 | - IG02
24 | - Mysql01
25 | - Mysql02
26 | - Redis01
27 | - Redis02"""
28 |
29 | schema = f"""## TELEMETRY DIRECTORY STRUCTURE:
30 |
31 | - You can access the telemetry directory in our microservices system: `dataset/Bank/telemetry/`.
32 |
33 | - This directory contains subdirectories organized by a date (e.g., `dataset/Bank/telemetry/2021_03_05/`).
34 |
35 | - Within each date-specific directory, you’ll find these subdirectories: `metric`, `trace`, and `log` (e.g., `dataset/Bank/telemetry/2021_03_05/metric/`).
36 |
37 | - The telemetry data in those subdirectories is stored in CSV format (e.g., `dataset/Bank/telemetry/2021_03_05/metric/metric_container.csv`).
38 |
39 | ## DATA SCHEMA
40 |
41 | 1. **Metric Files**:
42 |
43 | 1. `metric_app.csv`:
44 |
45 | ```csv
46 | timestamp,rr,sr,cnt,mrt,tc
47 | 1614787440,100.0,100.0,22,53.27,ServiceTest1
48 | ```
49 |
50 | 2. `metric_container.csv`:
51 |
52 | ```csv
53 | timestamp,cmdb_id,kpi_name,value
54 | 1614787200,Tomcat04,OSLinux-CPU_CPU_CPUCpuUtil,26.2957
55 | ```
56 |
57 | 2. **Trace Files**:
58 |
59 | 1. `trace_span.csv`:
60 |
61 | ```csv
62 | timestamp,cmdb_id,parent_id,span_id,trace_id,duration
63 | 1614787199628,dockerA2,369-bcou-dle-way1-c514cf30-43410@0824-2f0e47a816-17492,21030300016145905763,gw0120210304000517192504,19
64 | ```
65 |
66 | 3. **Log Files**:
67 |
68 | 1. `log_service.csv`:
69 |
70 | ```csv
71 | log_id,timestamp,cmdb_id,log_name,value
72 | 8c7f5908ed126abdd0de6dbdd739715c,1614787201,Tomcat01,gc,"3748789.580: [GC (CMS Initial Mark) [1 CMS-initial-mark: 2462269K(3145728K)] 3160896K(4089472K), 0.1985754 secs] [Times: user=0.59 sys=0.00, real=0.20 secs] "
73 | ```
74 |
75 | {cand}
76 |
77 | ## CLARIFICATION OF TELEMETRY DATA:
78 |
79 | 1. This microservice system is a banking platform.
80 |
81 | 2. The `metric_app.csv` file only contains four KPIs: rr, sr, cnt, and mrt. In contrast, `metric_container.csv` records a variety of KPIs, such as CPU usage and memory usage. The specific names of these KPIs can be found in the `kpi_name` field.
82 |
83 | 3. In different telemetry files, the timestamp units and cmdb_id formats may vary:
84 |
85 | - Metric: Timestamp units are in seconds (e.g., 1614787440).
86 |
87 | - Trace: Timestamp units are in milliseconds (e.g., 1614787199628).
88 |
89 | - Log: Timestamp units are in seconds (e.g., 1614787201).
90 |
91 | 4. Please use the UTC+8 time zone in all analysis steps since the system is deployed in China/Hong Kong/Singapore."""
92 |
93 |
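94 | # Hedged example for clarifications 3-4: normalizing the mixed timestamp units to
95 | # UTC+8 datetimes with pandas (metric/log timestamps in seconds, trace in milliseconds);
96 | # the literal values come from the sample rows above:
97 | #
98 | #     import pandas as pd
99 | #     metric_ts = pd.to_datetime(1614787440, unit='s', utc=True).tz_convert('Asia/Shanghai')
100 | #     trace_ts = pd.to_datetime(1614787199628, unit='ms', utc=True).tz_convert('Asia/Shanghai')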
--------------------------------------------------------------------------------
/rca/baseline/rca_agent/prompt/basic_prompt_Market.py:
--------------------------------------------------------------------------------
1 | cand = """## POSSIBLE ROOT CAUSE COMPONENTS:
2 |
3 | (if the root cause is at the node level, i.e., the root cause is a specific node)
4 | - node-1
5 | - node-2
6 | - node-3
7 | - node-4
8 | - node-5
9 | - node-6
10 |
11 | (if the root cause is at the pod level, i.e., the root cause is a specific container)
12 |
13 | - frontend-0
14 | - frontend-1
15 | - frontend-2
16 | - frontend2-0
17 | - shippingservice-0
18 | - shippingservice-1
19 | - shippingservice-2
20 | - shippingservice2-0
21 | - checkoutservice-0
22 | - checkoutservice-1
23 | - checkoutservice-2
24 | - checkoutservice2-0
25 | - currencyservice-0
26 | - currencyservice-1
27 | - currencyservice-2
28 | - currencyservice2-0
29 | - adservice-0
30 | - adservice-1
31 | - adservice-2
32 | - adservice2-0
33 | - emailservice-0
34 | - emailservice-1
35 | - emailservice-2
36 | - emailservice2-0
37 | - cartservice-0
38 | - cartservice-1
39 | - cartservice-2
40 | - cartservice2-0
41 | - productcatalogservice-0
42 | - productcatalogservice-1
43 | - productcatalogservice-2
44 | - productcatalogservice2-0
45 | - recommendationservice-0
46 | - recommendationservice-1
47 | - recommendationservice-2
48 | - recommendationservice2-0
49 | - paymentservice-0
50 | - paymentservice-1
51 | - paymentservice-2
52 | - paymentservice2-0
53 |
54 | (if the root cause is at the service level, i.e., if all pods of a specific service are faulty, the root cause is the service itself)
55 |
56 | - frontend
57 | - shippingservice
58 | - checkoutservice
59 | - currencyservice
60 | - adservice
61 | - emailservice
62 | - cartservice
63 | - productcatalogservice
64 | - recommendationservice
65 | - paymentservice
66 |
67 | ## POSSIBLE ROOT CAUSE REASONS:
68 |
69 | - container CPU load
70 | - container memory load
71 | - container network packet retransmission
72 | - container network packet corruption
73 | - container network latency
74 | - container packet loss
75 | - container process termination
76 | - container read I/O load
77 | - container write I/O load
78 | - node CPU load
79 | - node CPU spike
80 | - node memory consumption
81 | - node disk read I/O consumption
82 | - node disk write I/O consumption
83 | - node disk space consumption"""
84 |
85 | schema = f"""## TELEMETRY DIRECTORY STRUCTURE:
86 |
87 | - You can access the telemetry directories of two cloudbed (i.e., `cloudbed-1` and `cloudbed-2`) in our microservices system: `dataset/Market/cloudbed-1/telemetry/` and `dataset/Market/cloudbed-2/telemetry/`.
88 |
89 | - This directory contains subdirectories organized by a date (e.g., `dataset/Market/cloudbed-1/telemetry/2022_03_20/`).
90 |
91 | - Within each date-specific directory, you’ll find these subdirectories: `metric`, `trace`, and `log` (e.g., `dataset/Market/cloudbed-1/telemetry/2022_03_20/metric/`).
92 |
93 | - The telemetry data in those subdirectories is stored in CSV format (e.g., `dataset/Market/cloudbed-1/telemetry/2022_03_20/metric/metric_container.csv`).
94 |
95 | ## DATA SCHEMA
96 |
97 | 1. **Metric Files**:
98 |
99 | 1. `metric_container.csv`:
100 |
101 | ```csv
102 | timestamp,cmdb_id,kpi_name,value
103 | 1647781200,node-6.adservice2-0,container_fs_writes_MB./dev/vda,0.0
104 | ```
105 |
106 | 2. `metric_mesh.csv`:
107 |
108 | ```csv
109 | timestamp,cmdb_id,kpi_name,value
110 | 1647790380,cartservice-1.source.cartservice.redis-cart,istio_tcp_sent_bytes.-,1255.0
111 | ```
112 |
113 | 3. `metric_node.csv`:
114 |
115 | ```csv
116 | timestamp,cmdb_id,kpi_name,value
117 | 1647705600,node-1,system.cpu.iowait,0.31
118 | ```
119 |
120 | 4. `metric_runtime.csv`:
121 |
122 | ```csv
123 | timestamp,cmdb_id,kpi_name,value
124 | 1647730800,adservice.ts:8088,java_nio_BufferPool_TotalCapacity.direct,57343.0
125 | ```
126 |
127 | 5. `metric_service.csv`:
128 |
129 | ```csv
130 | service,timestamp,rr,sr,mrt,count
131 | adservice-grpc,1647716400,100.0,100.0,2.429508196728182,61
132 | ```
133 |
134 | 2. **Trace Files**:
135 |
136 | 1. `trace_span.csv`:
137 |
138 | ```csv
139 | timestamp,cmdb_id,span_id,trace_id,duration,type,status_code,operation_name,parent_span
140 | 1647705600361,frontend-0,a652d4d10e9478fc,9451fd8fdf746a80687451dae4c4e984,49877,rpc,0,hipstershop.CheckoutService/PlaceOrder,952754a738a11675
141 | ```
142 |
143 | 3. **Log Files**:
144 |
145 | 1. `log_proxy.csv`:
146 |
147 | ```csv
148 | log_id,timestamp,cmdb_id,log_name,value
149 | KN43pn8BmS57GQLkQUdP,1647761110,cartservice-1,log_cartservice-service_application,etCartAsync called with userId=3af80013-c2c1-4ae6-86d0-1d9d308e6f5b
150 | ```
151 |
152 | 2. `log_service.csv`:
153 |
154 | ```csv
155 | log_id,timestamp,cmdb_id,log_name,value
156 | GIvpon8BDiVcQfZwJ5a9,1647705660,currencyservice-0,log_currencyservice-service_application,"severity: info, message: Getting supported currencies..."
157 | ```
158 |
159 | {cand}
160 |
161 | ## CLARIFICATION OF TELEMETRY DATA:
162 |
163 | 1. This microservice system is an e-commerce platform which includes a failover mechanism, with each service deployed across four pods. In this system, a container (pod) can be deployed on different nodes. If the root cause component is a single pod of a specific service (e.g., node-1.adservice-0), the failure may not significantly impact the corresponding service metrics. In contrast, if the root cause component is a service itself (e.g., adservice), which means all pods of this service are faulty, the corresponding service metrics will be significantly impacted. Moreover, such a fault can propagate through the call chain, making other services' metrics faulty as well. Note that `Pod` is equivalent to `Container` in this system.
164 |
165 | 2. The `metric_service.csv` file only contains four KPIs: rr, sr, mrt, and count. In contrast, other metric files record a variety of KPIs, such as CPU usage and memory usage. The specific names of these KPIs can be found in the `kpi_name` field.
166 |
167 | 3. Note that the `cmdb_id` is the name of specific components, including nodes, pods, services, etc.
168 |
169 | - Metrics:
170 | - Runtime: The application name and port, e.g., `adservice.ts:8088`
171 |      - Service: The service name and protocol, e.g., `adservice-grpc`
172 | - Container: The pod name combined with a node name, e.g., `node-1.adservice-0`
173 | - Node: The node name, e.g., `node-1`
174 |      - Mesh: The service-to-service connection identifier within the mesh, e.g., `cartservice-1.source.cartservice.redis-cart`
175 |
176 | - Traces: The pod name, e.g., `adservice-0`
177 |
178 | - Logs: The pod name, e.g., `adservice-0`
179 |
180 | 4. In different telemetry files, the timestamp units and cmdb_id formats may vary:
181 |
182 | - Metric: Timestamp units are in seconds (e.g., 1647781200). cmdb_id varies by metric file:
183 | - In container metrics: `-x.-x` (e.g., `node-1.adservice-0`)
184 | - In node metrics: `-x` (e.g., `node-1`)
185 | - In service metrics: `-grpc` (e.g., `adservice-grpc`)
186 |
187 | - Trace: Timestamp units are in milliseconds (e.g., 1647705600361). cmdb_id is consistently `-x` (e.g., frontend-0).
188 |
189 | - Log: Timestamp units are in seconds (e.g., 1647705660). cmdb_id is consistently `-x` (e.g., currencyservice-0).
190 |
191 | 5. Please use the UTC+8 time zone in all analysis steps since the system is deployed in China/Hong Kong/Singapore."""
192 |
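193 | # Hedged example for clarification 3: splitting a container-metric `cmdb_id` into its
194 | # node, pod, and service parts (values taken from the sample rows above):
195 | #
196 | #     node, pod = 'node-6.adservice2-0'.split('.', 1)   # -> 'node-6', 'adservice2-0'
197 | #     service = pod.rsplit('-', 1)[0]                   # -> 'adservice2'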
--------------------------------------------------------------------------------
/rca/baseline/rca_agent/prompt/basic_prompt_Telecom.py:
--------------------------------------------------------------------------------
1 | cand = """## POSSIBLE ROOT CAUSE REASONS:
2 |
3 | - CPU fault
4 | - network delay
5 | - network loss
6 | - db connection limit
7 | - db close
8 |
9 | ## POSSIBLE ROOT CAUSE COMPONENTS:
10 |
11 | (if the root cause is at the node level, i.e., the root cause is a specific node)
12 |
13 | - os_001
14 | - os_002
15 | - os_003
16 | - os_004
17 | - os_005
18 | - os_006
19 | - os_007
20 | - os_008
21 | - os_009
22 | - os_010
23 | - os_011
24 | - os_012
25 | - os_013
26 | - os_014
27 | - os_015
28 | - os_016
29 | - os_017
30 | - os_018
31 | - os_019
32 | - os_020
33 | - os_021
34 | - os_022
35 |
36 | (if the root cause is at the pod level, i.e., the root cause is a specific container)
37 |
38 | - docker_001
39 | - docker_002
40 | - docker_003
41 | - docker_004
42 | - docker_005
43 | - docker_006
44 | - docker_007
45 | - docker_008
46 |
47 | (if the root cause is at the service level, i.e., if all pods of a specific service are faulty, the root cause is the service itself)
48 |
49 | - db_001
50 | - db_002
51 | - db_003
52 | - db_004
53 | - db_005
54 | - db_006
55 | - db_007
56 | - db_008
57 | - db_009
58 | - db_010
59 | - db_011
60 | - db_012
61 | - db_013"""
62 |
63 | schema = f"""## TELEMETRY DIRECTORY STRUCTURE:
64 |
65 | - You can access the telemetry directory in our microservices system: `dataset/Telecom/telemetry/`
66 |
67 | - This directory contains subdirectories organized by a date (e.g., `dataset/Telecom/telemetry/2020_04_11/`).
68 |
69 | - Within each date-specific directory, you’ll find these subdirectories: `metric` and `trace` (e.g., `dataset/Telecom/telemetry/2020_04_11/metric/`).
70 |
71 | - The telemetry data in those subdirectories is stored in CSV format (e.g., `dataset/Telecom/telemetry/2020_04_11/metric/metric_container.csv`).
72 |
73 | ## DATA SCHEMA
74 |
75 | 1. **Metric Files**:
76 |
77 | 1. `metric_app.csv`:
78 |
79 | ```csv
80 | serviceName,startTime,avg_time,num,succee_num,succee_rate
81 | osb_001,1586534400000,0.333,1,1,1.0
82 | ```
83 |
84 | 2. `metric_container.csv`:
85 |
86 | ```csv
87 | itemid,name,bomc_id,timestamp,value,cmdb_id
88 | 999999996381330,container_mem_used,ZJ-004-060,1586534423000,59.000000,docker_008
89 | ```
90 |
91 | 3. `metric_middleware.csv`:
92 |
93 | ```csv
94 | itemid,name,bomc_id,timestamp,value,cmdb_id
95 | 999999996508323,connected_clients,ZJ-005-024,1586534672000,25,redis_003
96 | ```
97 |
98 | 4. `metric_node.csv`:
99 |
100 | ```csv
101 | itemid,name,bomc_id,timestamp,value,cmdb_id
102 | 999999996487783,CPU_iowait_time,ZJ-001-010,1586534683000,0.022954,os_017
103 | ```
104 |
105 | 5. `metric_service.csv`:
106 |
107 | ```csv
108 | itemid,name,bomc_id,timestamp,value,cmdb_id
109 | 999999998650974,MEM_Total,ZJ-002-055,1586534694000,381.902264,db_003
110 | ```
111 |
112 | 2. **Trace Files**:
113 |
114 | 1. `trace_span.csv`:
115 |
116 | ```csv
117 | callType,startTime,elapsedTime,success,traceId,id,pid,cmdb_id,dsName,serviceName
118 | JDBC,1586534400335,2.0,True,01df517164d1c0365586,407d617164d1c14f2613,6e02217164d1c14b2607,docker_006,db_003,
119 | LOCAL,1586534400331,6.0,True,01df517164d1c0365586,6e02217164d1c14b2607,8432217164d1c1442597,docker_006,db_003,local_method_017
120 | RemoteProcess,1586534400324,55.0,True,01df517164d1c0365586,8432217164d1c1442597,b755e17164d1c13f5066,docker_006,,csf_005
121 | FlyRemote,1586534400149,7.0,TRUE,fa1e817164d1c0375444,da74117164d1c0955052,b959f17164d1c08c5050,docker_003,,fly_remote_001
122 | OSB,1586534660846,376.0,True,d9c4817164d5baee6924,77d1117164d5baee6925,None,os_021,,osb_001
123 | ```
124 |
125 | {cand}
126 |
127 | ## CLARIFICATION OF TELEMETRY DATA:
128 |
129 | 1. This service system is a telecom database system.
130 |
131 | 2. The `metric_app.csv` file only contains five KPIs: startTime, avg_time, num, succee_num, succee_rate. In contrast, other metric files record a variety of KPIs, such as CPU usage and memory usage. The specific names of these KPIs can be found in the `name` field.
132 |
133 | 3. In all telemetry files, the timestamp units and cmdb_id formats remain consistent:
134 |
135 | - Metric: Timestamp units are in milliseconds (e.g., 1586534423000).
136 |
137 | - Trace: Timestamp units are in milliseconds (e.g., 1586534400335).
138 |
139 | 4. Please use the UTC+8 time zone in all analysis steps since the system is deployed in China/Hong Kong/Singapore."""
140 |
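141 | # Hedged example of walking the trace call chain via the `id`/`pid` columns; `spans`
142 | # is an assumed DataFrame loaded from `trace_span.csv`:
143 | #
144 | #     children = spans.merge(spans, left_on='pid', right_on='id', suffixes=('', '_parent'))
145 | #     # each row of `children` pairs a span with its parent span on the same trace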
--------------------------------------------------------------------------------
/rca/baseline/rca_agent/rca_agent.py:
--------------------------------------------------------------------------------
1 | from rca.baseline.rca_agent.controller import control_loop
2 |
3 | class RCA_Agent:
4 | def __init__(self, agent_prompt, basic_prompt) -> None:
5 |
6 | self.ap = agent_prompt
7 | self.bp = basic_prompt
8 |
9 | def run(self, instruction, logger, max_step=25, max_turn=5):
10 |
11 | logger.info(f"Objective: {instruction}")
12 | prediction, trajectory, prompt = control_loop(instruction, "", self.ap, self.bp, logger=logger, max_step=max_step, max_turn=max_turn)
13 | logger.info(f"Result: {prediction}")
14 |
15 | return prediction, trajectory, prompt
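16 |
17 | # Hedged usage sketch: wiring the agent with the Bank prompts. A real run needs the
18 | # downloaded dataset and a configured `rca/api_config.yaml`, so the `run` call is
19 | # left commented out.
20 | if __name__ == "__main__":
21 |     import sys
22 |     from loguru import logger
23 |     import rca.baseline.rca_agent.prompt.agent_prompt as ap
24 |     import rca.baseline.rca_agent.prompt.basic_prompt_Bank as bp
25 |     logger.remove()
26 |     logger.add(sys.stdout, colorize=True, level="INFO")
27 |     agent = RCA_Agent(ap, bp)
28 |     # prediction, trajectory, prompt = agent.run("Identify the root cause of ...", logger)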
--------------------------------------------------------------------------------
/rca/run_agent_standard.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import json
4 | import argparse
5 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
6 | sys.path.insert(0, project_root)
7 | from main.evaluate import evaluate
8 | from rca.api_router import configs
9 |
10 | from datetime import datetime
11 | from loguru import logger
12 | from nbformat import v4 as nbf
13 | import pandas as pd
14 | import signal
15 |
16 | def handler(signum, frame):
17 | raise TimeoutError("Loop execution exceeded the time limit")
18 |
19 | def main(args, uid, dataset):
20 |
21 | from rca.baseline.rca_agent.rca_agent import RCA_Agent
22 | import rca.baseline.rca_agent.prompt.agent_prompt as ap
23 | if dataset == "Telecom":
24 | import rca.baseline.rca_agent.prompt.basic_prompt_Telecom as bp
25 | elif dataset == "Bank":
26 | import rca.baseline.rca_agent.prompt.basic_prompt_Bank as bp
27 | elif dataset == "Market/cloudbed-1" or dataset == "Market/cloudbed-2":
28 | import rca.baseline.rca_agent.prompt.basic_prompt_Market as bp
29 |
30 | inst_file = f"dataset/{dataset}/query.csv"
31 | gt_file = f"dataset/{dataset}/record.csv"
32 | eval_file = f"test/result/{dataset}/agent-{args.tag}-{configs['MODEL'].split('/')[-1]}.csv"
33 | obs_path = f"test/monitor/{dataset}/agent-{args.tag}-{configs['MODEL'].split('/')[-1]}"
34 | unique_obs_path = f"{obs_path}/{uid}"
35 |
36 |     if not os.path.exists(inst_file) or not os.path.exists(gt_file):
37 |         raise FileNotFoundError("Please download the dataset first.")
38 |     instruct_data = pd.read_csv(inst_file)
39 |     gt_data = pd.read_csv(gt_file)
40 |
41 | if not os.path.exists(f"{unique_obs_path}/history"):
42 | os.makedirs(f"{unique_obs_path}/history")
43 | if not os.path.exists(f"{unique_obs_path}/trajectory"):
44 | os.makedirs(f"{unique_obs_path}/trajectory")
45 | if not os.path.exists(f"{unique_obs_path}/prompt"):
46 | os.makedirs(f"{unique_obs_path}/prompt")
47 | if not os.path.exists(eval_file):
48 | if not os.path.exists(f"test/result/{dataset}"):
49 | os.makedirs(f"test/result/{dataset}")
50 | eval_df = pd.DataFrame(columns=["instruction", "prediction", "groundtruth", "passed", "failed", "score"])
51 | else:
52 | eval_df = pd.read_csv(eval_file)
53 |
54 | scores = {
55 | "total": 0,
56 | "easy": 0,
57 | "middle": 0,
58 | "hard": 0,
59 | }
60 | nums = {
61 | "total": 0,
62 | "easy": 0,
63 | "middle": 0,
64 | "hard": 0,
65 | }
66 |
67 | signal.signal(signal.SIGALRM, handler)
68 | logger.info(f"Using dataset: {dataset}")
69 | logger.info(f"Using model: {configs['MODEL'].split('/')[-1]}")
70 |
71 | for idx, row in instruct_data.iterrows():
72 |
73 | if idx < args.start_idx:
74 | continue
75 | if idx > args.end_idx:
76 | break
77 |
78 | instruction = row["instruction"]
79 | task_index = row["task_index"]
80 | scoring_points = row["scoring_points"]
81 | task_id = int(task_index.split('_')[1])
82 | best_score = 0
83 |
84 | if task_id <= 3:
85 | catalog = "easy"
86 | elif task_id <= 6:
87 | catalog = "middle"
88 | elif task_id <= 7:
89 | catalog = "hard"
90 |         temp_scores, temp_nums = scores.copy(), nums.copy()  # guard: defined even if every sample times out
91 | for i in range(args.sample_num):
92 | uuid = uid + f"_#{idx}-{i}"
93 | nb = nbf.new_notebook()
94 | nbfile = f"{unique_obs_path}/trajectory/{uuid}.ipynb"
95 | promptfile = f"{unique_obs_path}/prompt/{uuid}.json"
96 | logfile = f"{unique_obs_path}/history/{uuid}.log"
97 | logger.remove()
98 | logger.add(sys.stdout, colorize=True, enqueue=True, level="INFO")
99 | logger.add(logfile, colorize=True, enqueue=True, level="INFO")
100 | logger.debug('\n' + "#"*80 + f"\n{uuid}: {task_index}\n" + "#"*80)
101 | try:
102 | signal.alarm(args.timeout)
103 |
104 | agent = RCA_Agent(ap, bp)
105 | prediction, trajectory, prompt = agent.run(instruction,
106 | logger,
107 | max_step=args.controller_max_step,
108 | max_turn=args.controller_max_turn)
109 |
110 | signal.alarm(0)
111 |
112 | for step in trajectory:
113 | code_cell = nbf.new_code_cell(step['code'])
114 | result_cell = nbf.new_markdown_cell(f"```\n{step['result']}\n```")
115 | nb.cells.append(code_cell)
116 | nb.cells.append(result_cell)
117 | with open(nbfile, 'w', encoding='utf-8') as f:
118 | json.dump(nb, f, ensure_ascii=False, indent=4)
119 | logger.info(f"Trajectory has been saved to {nbfile}")
120 |
121 | with open(promptfile, 'w', encoding='utf-8') as f:
122 | json.dump({"messages": prompt}, f, ensure_ascii=False, indent=4)
123 | logger.info(f"Prompt has been saved to {promptfile}")
124 |
125 | new_eval_df = pd.DataFrame([{"row_id": idx,
126 | "task_index": task_index,
127 | "instruction": instruction,
128 | "prediction": prediction,
129 | "groundtruth": '\n'.join([f'{col}: {gt_data.iloc[idx][col]}' for col in gt_data.columns if col != 'description']),
130 | "passed": "N/A",
131 | "failed": "N/A",
132 | "score": "N/A"}])
133 | eval_df = pd.concat([eval_df, new_eval_df],
134 | ignore_index=True)
135 | eval_df.to_csv(eval_file,
136 | index=False)
137 |
138 | passed_criteria, failed_criteria, score = evaluate(prediction, scoring_points)
139 |
140 | logger.info(f"Prediction: {prediction}")
141 | logger.info(f"Scoring Points: {scoring_points}")
142 | logger.info(f"Passed Criteria: {passed_criteria}")
143 | logger.info(f"Failed Criteria: {failed_criteria}")
144 | logger.info(f"Score: {score}")
145 | best_score = max(best_score, score)
146 |
147 | eval_df.loc[eval_df.index[-1], "passed"] = '\n'.join(passed_criteria)
148 | eval_df.loc[eval_df.index[-1], "failed"] = '\n'.join(failed_criteria)
149 | eval_df.loc[eval_df.index[-1], "score"] = score
150 | eval_df.to_csv(eval_file,
151 | index=False)
152 |
153 | temp_scores = scores.copy()
154 | temp_scores[catalog] += best_score
155 | temp_scores["total"] += best_score
156 | temp_nums = nums.copy()
157 | temp_nums[catalog] += 1
158 | temp_nums["total"] += 1
159 |
160 | except TimeoutError:
161 | logger.error(f"Loop {i} exceeded the time limit and was skipped")
162 | continue
163 |
164 | scores = temp_scores
165 | nums = temp_nums
166 |
167 |
168 | if __name__ == "__main__":
169 |
170 | uid = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
171 | parser = argparse.ArgumentParser()
172 | parser.add_argument("--dataset", type=str, default="Market/cloudbed-1")
173 | parser.add_argument("--sample_num", type=int, default=1)
174 | parser.add_argument("--start_idx", type=int, default=0)
175 | parser.add_argument("--end_idx", type=int, default=150)
176 | parser.add_argument("--controller_max_step", type=int, default=25)
177 | parser.add_argument("--controller_max_turn", type=int, default=5)
178 | parser.add_argument("--timeout", type=int, default=600)
179 | parser.add_argument("--tag", type=str, default='rca')
180 |     parser.add_argument("--auto", action="store_true")
181 |
182 | args = parser.parse_args()
183 |
184 | if args.auto:
185 | print(f"Auto mode is on. Model is fixed to {configs['MODEL']}")
186 | datasets = ["Market/cloudbed-1", "Market/cloudbed-2", "Bank", "Telecom"]
187 | for dataset in datasets:
188 | main(args, uid, dataset)
189 | else:
190 | dataset = args.dataset
191 | main(args, uid, dataset)
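192 |
193 | # Example invocation (hedged; assumes the dataset has been downloaded into `dataset/`
194 | # and `rca/api_config.yaml` is configured):
195 | #
196 | #     python rca/run_agent_standard.py --dataset Bank --start_idx 0 --end_idx 9 --tag rca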
--------------------------------------------------------------------------------
/rca/run_sampling_balanced.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
4 | sys.path.insert(0, project_root)
5 | import pandas as pd
6 | import argparse
7 | from datetime import datetime
8 | from loguru import logger
9 | from copy import deepcopy
10 |
11 | from rca.baseline.direct_lm import DirectLM
12 | from rca.baseline.cot_lm import CoTLM
13 | from main.evaluate import evaluate
14 | from time import time
15 | from rca.api_router import configs
16 |
17 | import random
18 |
19 | import tiktoken
20 | tokenizer = tiktoken.encoding_for_model("gpt-4o")
21 |
22 | def cache_df_dict(dataset_name:str):
23 |
24 | df_dict = dict()
25 |
26 | if dataset_name == "Telecom":
27 | from rca.baseline.oracle_kpis import kpi_Telecom_len
28 | selected_kpi_len = kpi_Telecom_len
29 |
30 | example_df_dict = {
31 | "metric": [],
32 | "trace": [],
33 | }
34 | dataset_path = "Telecom"
35 |
36 | import rca.baseline.rca_agent.prompt.basic_prompt_Telecom as bp
37 | cand = bp.cand
38 |
39 | elif dataset_name == "Bank":
40 | from rca.baseline.oracle_kpis import kpi_Bank_len
41 | selected_kpi_len = kpi_Bank_len
42 |
43 | example_df_dict = {
44 | "log": [],
45 | "metric": [],
46 | "trace": [],
47 | }
48 | dataset_path = "Bank"
49 |
50 | import rca.baseline.rca_agent.prompt.basic_prompt_Bank as bp
51 | cand = bp.cand
52 |
53 | elif dataset_name == "Market/cloudbed-1":
54 | from rca.baseline.oracle_kpis import kpi_Market_len
55 | selected_kpi_len = kpi_Market_len
56 |
57 | example_df_dict = {
58 | "log": [],
59 | "metric": [],
60 | "trace": [],
61 | }
62 | dataset_path = "Market/cloudbed-1"
63 |
64 | import rca.baseline.rca_agent.prompt.basic_prompt_Market as bp
65 | cand = bp.cand
66 |
67 | elif dataset_name == "Market/cloudbed-2":
68 | from rca.baseline.oracle_kpis import kpi_Market_len
69 | selected_kpi_len = kpi_Market_len
70 |
71 | example_df_dict = {
72 | "log": [],
73 | "metric": [],
74 | "trace": [],
75 | }
76 | dataset_path = "Market/cloudbed-2"
77 |
78 | import rca.baseline.rca_agent.prompt.basic_prompt_Market as bp
79 | cand = bp.cand
80 |
81 | for day_time in os.listdir(f"dataset/{dataset_path}/telemetry/"):
82 | if day_time == '.DS_Store':
83 | continue
84 | if day_time not in df_dict:
85 | df_dict[day_time] = deepcopy(example_df_dict)
86 |
87 | for data_type in os.listdir(f"dataset/{dataset_path}/telemetry/{day_time}"):
88 | if data_type == '.DS_Store':
89 | continue
90 | for fname in os.listdir(f"dataset/{dataset_path}/telemetry/{day_time}/{data_type}"):
91 | t0 = time()
92 | cur_df = pd.read_csv(f"dataset/{dataset_path}/telemetry/{day_time}/{data_type}/{fname}")
93 | t1 = time()
94 | logger.debug(f"{round(t1-t0,1)} seconds for reading {fname}")
95 |
96 | #preprocess
97 | cur_df = cur_df.reset_index()
98 | if "timestamp" in cur_df.columns:
99 | col = "timestamp"
100 | elif "startTime" in cur_df.columns:
101 | col = "startTime"
102 | else:
103 | logger.error("There is no 'startTime' or 'timestamp' indicating the timestamp of the data entries")
104 | raise IndexError
105 | cur_df[col] = cur_df[col].apply(lambda x: int(x // 1000) if len(str(x)) == 13 else x)
106 | t2 = time()
107 |                 logger.debug(f"{round(t2-t1, 1)} seconds for preprocessing DataFrame")
108 | if cur_df.empty:
109 | logger.warning(f"{fname} is empty")
110 | else:
111 | df_dict[day_time][data_type].append((fname, cur_df))
112 |
113 | return df_dict, selected_kpi_len, cand
114 |
115 |
116 | def extract_period_data(df_list:list, data_type:str, target_timestamp:int, sample_interval=60, selected_kpi=None, selected_kpi_len=None) -> tuple:
117 |
118 | logger.debug(f"Extracting {data_type} data ...")
119 |
120 | extracted_data = ""
121 | for fname, df_file in df_list:
122 |
123 | if data_type == "metric" and len(selected_kpi) >= selected_kpi_len:
124 |             logger.info(f"Selected KPI number ({len(selected_kpi)}) has reached the limit: {selected_kpi_len}")
125 | break
126 |
127 | if "timestamp" in df_file.columns:
128 | col = "timestamp"
129 | elif "startTime" in df_file.columns:
130 | col = "startTime"
131 | else:
132 | logger.error("There is no 'startTime' or 'timestamp' indicating the timestamp of the data entries")
133 | raise IndexError
134 |
135 | t1 = time()
136 | start_timestamp = target_timestamp - target_timestamp % 1800
137 | end_timestamp = start_timestamp + 1800
138 | filtered_df = df_file[(df_file[col] >= start_timestamp) & (df_file[col] <= end_timestamp)]
139 | filtered_df = filtered_df.drop(columns=["index"])
140 |
141 | t2 = time()
142 | logger.debug(f"{round(t2-t1,1)} seconds for filtering 30 min data")
143 |
144 | if data_type == "log":
145 | filtered_df = filtered_df.drop(columns=["log_id"])
146 | filtered_df = filtered_df.drop(columns=["cmdb_id"])
147 | filtered_df = filtered_df.drop(columns=["log_name"])
148 | schema = filtered_df.columns
149 | extracted_data = extracted_data + f'\n\n#### {fname}'
150 | extracted_data = extracted_data + f' Schema: ' + ','.join(schema) + '\n'
151 | resampled_df = filtered_df.groupby(filtered_df[col] // (sample_interval/5)).first()
152 | if resampled_df.empty:
153 | extracted_data = extracted_data + "DATA NOT AVAILABLE\n"
154 | else:
155 | data = resampled_df.astype(str).agg(','.join, axis=1)
156 | extracted_data = extracted_data + '\n'.join(data) + '\n'
157 | elif data_type == "trace":
158 | opt_traceid_field_name = ["traceId", "trace_id"]
159 | traceid_field_name = None
160 | for opt_name in opt_traceid_field_name:
161 | if opt_name in df_file.columns:
162 | traceid_field_name = opt_name
163 |             if traceid_field_name is None:
164 | logger.error("There is no 'traceId' or 'trace_id' indicating the trace_id of the data entries")
165 | raise IndexError
166 | opt_spanid_field_name = ["id", "span_id"]
167 | spanid_field_name = None
168 | for opt_name in opt_spanid_field_name:
169 | if opt_name in df_file.columns:
170 | spanid_field_name = opt_name
171 |             if spanid_field_name is None:
172 | logger.error("There is no 'id' or 'span_id' indicating the span_id of the data entries")
173 | raise IndexError
174 | opt_parent_field_name = ["pid", "parent_id", "parent_span"]
175 | parent_field_name = None
176 | for opt_name in opt_parent_field_name:
177 | if opt_name in df_file.columns:
178 | parent_field_name = opt_name
179 |             if parent_field_name is None:
180 |                 logger.error("There is no 'pid', 'parent_id', or 'parent_span' indicating the parent_id of the data entries")
181 | raise IndexError
182 | opt_duration_field_name = ["elapsedTime", "duration"]
183 | duration_field_name = None
184 | for opt_name in opt_duration_field_name:
185 | if opt_name in df_file.columns:
186 | duration_field_name = opt_name
187 |
188 | filtered_df = filtered_df[[col, traceid_field_name, spanid_field_name, parent_field_name, duration_field_name, "cmdb_id"]]
189 | schema = filtered_df.columns
190 | schema = schema.drop([traceid_field_name])
191 | extracted_data = extracted_data + f'\n\n#### {fname}'
192 | extracted_data = extracted_data + f' Schema: ' + ','.join([s for s in schema if s != col]) + '\n'
193 | resampled_df = filtered_df.groupby(filtered_df[col] // sample_interval).first()
194 | trace_ids = resampled_df[traceid_field_name]
195 | trace_dfs = filtered_df[filtered_df[traceid_field_name].isin(trace_ids)]
196 | trace_grouped_df = trace_dfs.groupby(traceid_field_name)
197 | for trace_id, trace_df in trace_grouped_df:
198 | resource_name = f'{trace_id}'
199 | for field in trace_df.columns:
200 | if trace_df[field].dtype in [float, "float64"]:
201 | trace_df[field] = trace_df[field].apply(lambda x: round(x, 2))
202 | trace_df = trace_df.drop(columns=[traceid_field_name, col])
203 | if 'group' in trace_df.columns:
204 | trace_df = trace_df.drop(columns=["group"])
205 | if trace_df.empty:
206 | extracted_data = extracted_data + f'Trace ID: {resource_name}\n```\n' + "DATA NOT AVAILABLE\n```\n"
207 | else:
208 | data = trace_df.astype(str).agg(','.join, axis=1)
209 | extracted_data = extracted_data + f'Trace ID: {resource_name}\n```\n' + '\n'.join(data) + '\n```\n'
210 |             t3 = time()
211 |             logger.debug(f"{round(t3-t2,1)} seconds for extracting trace data")
212 | elif data_type == "metric":
213 | opt_kpi_field_name = ["name", "kpi_name", "serviceName", "tc", "service"]
214 | kpi_field_name = None
215 | for opt_name in opt_kpi_field_name:
216 | if opt_name in df_file.columns:
217 | kpi_field_name = opt_name
218 | break
219 |             if kpi_field_name is None:
220 |                 logger.error("There is no 'name', 'kpi_name', 'serviceName', 'tc', or 'service' indicating the kpi_name of the data entries")
221 | raise IndexError
222 |
223 | if kpi_field_name == 'name' or kpi_field_name == 'kpi_name':
224 |                 unseen = [k for k in filtered_df[kpi_field_name].unique() if k not in selected_kpi]
225 |                 if len(unseen) > 0:
226 |                     # pick a not-yet-selected KPI; avoids an infinite loop once every KPI of this file has been selected
227 |                     kpi = random.choice(unseen)
228 | selected_kpi.add(kpi)
229 | filtered_df = filtered_df[filtered_df[kpi_field_name] == kpi]
230 | else:
231 | continue
232 | elif kpi_field_name == 'serviceName' or kpi_field_name == 'tc' or kpi_field_name == 'service':
233 | if kpi_field_name not in selected_kpi:
234 | selected_kpi.add(kpi_field_name)
235 | else:
236 | continue
237 | filtered_df["group"] = filtered_df[col].apply(lambda x: x // sample_interval)
238 | if 'cmdb_id' not in filtered_df.columns:
239 | filtered_df["resource_name"] = filtered_df[kpi_field_name]
240 | filtered_df = filtered_df.drop(columns=[kpi_field_name])
241 | else:
242 | filtered_df["resource_name"] = filtered_df["cmdb_id"] + "_" + filtered_df[kpi_field_name]
243 | filtered_df = filtered_df.drop(columns=["cmdb_id", kpi_field_name])
244 | if "itemid" in filtered_df.columns:
245 | filtered_df = filtered_df.drop(columns=["itemid"])
246 | if "bomc_id" in filtered_df.columns:
247 | filtered_df = filtered_df.drop(columns=["bomc_id"])
248 | schema = filtered_df.columns
249 | schema = schema.drop("resource_name")
250 | schema = schema.drop('group')
251 | extracted_data = extracted_data + f'\n\n#### {fname}'
252 | extracted_data = extracted_data + f' Schema: ' + ','.join([s for s in schema if s != col]) + '\n'
253 | resource_grouped_df = filtered_df.groupby("resource_name")
254 | for resource_name, resource_df in resource_grouped_df:
255 | resampled_df = resource_df.groupby(resource_df[col] // sample_interval).first()
256 | for field in resampled_df.columns:
257 | if resampled_df[field].dtype in [float, "float64"]:
258 | resampled_df[field] = resampled_df[field].apply(lambda x: round(x, 2))
259 | resampled_df = resampled_df.drop(columns=["resource_name"])
260 | if 'group' in resampled_df.columns:
261 | resampled_df = resampled_df.drop(columns=["group"])
262 |                 if resampled_df.empty:
263 |                     extracted_data = extracted_data + f'{resource_name}\n```\n' + "DATA NOT AVAILABLE\n```\n"
264 |                 else:
265 |                     data = resampled_df.drop(columns=[col]).astype(str).agg(','.join, axis=1)
266 |                     extracted_data = extracted_data + f'{resource_name}\n```\n' + '\n'.join(data) + '\n```\n'
267 |
268 | t3 = time()
269 | logger.debug(f"{round(t3-t2,1)} seconds for selecting metric data")
270 | return extracted_data, selected_kpi
271 |
272 | def main(args):
273 | import rca.baseline.rca_agent.prompt.agent_prompt as ap
274 | if args.dataset == "Telecom":
275 | import rca.baseline.rca_agent.prompt.basic_prompt_Telecom as bp
276 | elif args.dataset == "Bank":
277 | import rca.baseline.rca_agent.prompt.basic_prompt_Bank as bp
278 | elif args.dataset == "Market/cloudbed-1" or args.dataset == "Market/cloudbed-2":
279 | import rca.baseline.rca_agent.prompt.basic_prompt_Market as bp
280 |
281 | inst_file = f"dataset/{args.dataset}/query.csv"
282 | gt_file = f"dataset/{args.dataset}/record.csv"
283 | eval_file = f"test/result/{args.dataset}/balanced_{args.tag}_{args.mode}-{configs['MODEL'].split('/')[-1]}.csv"
284 | obs_path = f"test/monitor/{args.dataset}/balanced_{args.tag}_{args.mode}-{configs['MODEL'].split('/')[-1]}"
285 | unique_obs_path = f"{obs_path}/{uid}"
286 |
287 |     if not os.path.exists(inst_file) or not os.path.exists(gt_file):
288 |         raise FileNotFoundError("Please download the dataset first.")
289 |     instruct_data = pd.read_csv(inst_file)
290 |     gt_data = pd.read_csv(gt_file)
291 |
292 | if not os.path.exists(f"{unique_obs_path}/prompt"):
293 | os.makedirs(f"{unique_obs_path}/prompt")
294 | if not os.path.exists(eval_file):
295 | if not os.path.exists(f"test/result/{args.dataset}"):
296 | os.makedirs(f"test/result/{args.dataset}")
297 | eval_df = pd.DataFrame(columns=["instruction", "prediction", "groundtruth", "passed", "failed", "score"])
298 | else:
299 | eval_df = pd.read_csv(eval_file)
300 |
301 | logfile = f"{unique_obs_path}/batch.log"
302 | logger.remove()
303 | logger.add(sys.stdout, colorize=True, enqueue=True, level="INFO")
304 | logger.add(logfile, colorize=True, enqueue=True, level="INFO")
305 |
306 | scores = {
307 | "total": 0,
308 | "easy": 0,
309 | "middle": 0,
310 | "hard": 0,
311 | }
312 | nums = {
313 | "total": 0,
314 | "easy": 0,
315 | "middle": 0,
316 | "hard": 0,
317 | }
318 |
319 | logger.info(f"Using dataset: {args.dataset}")
320 | logger.info(f"Using model: {configs['MODEL'].split('/')[-1]}")
321 | logger.info("Start caching dataframes ...")
322 | df_dict, selected_kpi_len, cand = cache_df_dict(args.dataset)
323 |
324 | for idx, row in instruct_data.iterrows():
325 |
326 | if idx < args.start_idx:
327 | continue
328 | if idx > args.end_idx:
329 | break
330 |
331 | instruction = row["instruction"]
332 | timestamp = gt_data.iloc[idx]["timestamp"].astype(int)
333 | date_time = gt_data.iloc[idx]["datetime"].split(" ")[0].replace("-","_")
334 | task_index = row["task_index"]
335 | scoring_points = row["scoring_points"]
336 |         logger.debug(scoring_points)
337 | task_id = int(task_index.split('_')[1])
338 | best_score = 0
339 |
340 | if task_id <= 3:
341 | catalog = "easy"
342 | elif task_id <= 6:
343 | catalog = "middle"
344 | elif task_id <= 7:
345 | catalog = "hard"
346 |
347 |
348 |         temp_scores, temp_nums = scores.copy(), nums.copy()  # guard: defined even if every sample fails
349 | for i in range(args.sample_num):
350 | uuid = uid + f"_#{idx}-{i}"
351 | promptfile = f"{unique_obs_path}/prompt/{uuid}.txt"
352 | logger.debug('\n' + "#"*80 + f"\n{uuid}: {task_index}\n" + "#"*80)
353 |
354 | period_data = dict()
355 |
356 | if args.dataset != "Telecom":
357 | period_data["log"], _ = extract_period_data(deepcopy(df_dict[date_time]["log"]),
358 | "log", timestamp,
359 | sample_interval=args.sample_interval,
360 | )
361 |
362 | period_data["trace"], _ = extract_period_data(deepcopy(df_dict[date_time]["trace"]),
363 | "trace",
364 | timestamp,
365 | sample_interval=args.sample_interval,
366 | )
367 |
368 | selected_kpi = set()
369 | new_kpi = ""
370 | period_data['metric'] = ""
371 | logger.info(f"Sampling Started.")
372 | while len(selected_kpi) < selected_kpi_len:
373 | new_kpi, selected_kpi = extract_period_data(deepcopy(df_dict[date_time]["metric"]),
374 | "metric",
375 | timestamp,
376 | sample_interval=args.sample_interval,
377 | selected_kpi=selected_kpi,
378 | selected_kpi_len=selected_kpi_len
379 | )
380 | period_data['metric'] += new_kpi
381 | logger.info(f"Selected KPI number: {len(selected_kpi)}\tLimit: {selected_kpi_len}")
382 |
383 | logger.info(f"Sampling Finished. Total tokens: {sum([len(tokenizer.encode(data)) for data in period_data.values()])}")
384 |
385 | try:
386 | if args.mode == "direct":
387 | model = DirectLM(gt_data, cand)
388 | elif args.mode == "cot":
389 | model = CoTLM(gt_data, cand)
390 |
391 | prediction, prompt = model.run(instruction, period_data, args.sample_interval, logger)
392 | with open (promptfile, 'w') as f:
393 | for p in prompt:
394 | f.write(str(p['content']))
395 | f.write('\n\n')
396 | f.write(str(prediction))
397 |
398 | new_eval_df = pd.DataFrame([{"row_id": idx,
399 | "task_index": task_index,
400 | "instruction": instruction,
401 | "prediction": prediction,
402 | "groundtruth": '\n'.join([f'{col}: {gt_data.iloc[idx][col]}' for col in gt_data.columns if col != 'description']),
403 | "passed": "N/A",
404 | "failed": "N/A",
405 | "score": "N/A"}])
406 | eval_df = pd.concat([eval_df, new_eval_df],
407 | ignore_index=True)
408 | eval_df.to_csv(eval_file,
409 | index=False)
410 |
411 | if prediction == "EXCEED!":
412 | passed_criteria = ["EXCEED!"]
413 | failed_criteria = ["EXCEED!"]
414 | score = 0.0
415 | else:
416 | passed_criteria, failed_criteria, score = evaluate(prediction, scoring_points)
417 | logger.info(f"Prediction: {prediction}")
418 | logger.info(f"Scoring Points: {scoring_points}")
419 | logger.info(f"Passed Criteria: {passed_criteria}")
420 | logger.info(f"Failed Criteria: {failed_criteria}")
421 | logger.info(f"Score: {score}")
422 | best_score = max(best_score, score)
423 |
424 | eval_df.loc[eval_df.index[-1], "passed"] = '\n'.join(passed_criteria)
425 | eval_df.loc[eval_df.index[-1], "failed"] = '\n'.join(failed_criteria)
426 | eval_df.loc[eval_df.index[-1], "score"] = score
427 | eval_df.to_csv(eval_file,
428 | index=False)
429 |
430 | temp_scores = scores.copy()
431 | temp_scores[catalog] += best_score
432 | temp_scores["total"] += best_score
433 | temp_nums = nums.copy()
434 | temp_nums[catalog] += 1
435 | temp_nums["total"] += 1
436 |
437 | except Exception as e:
438 | logger.error(e)
439 | continue
440 |
441 | scores = temp_scores
442 | nums = temp_nums
443 |
444 |
445 |
446 | if __name__ == "__main__":
447 | uid = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
448 | parser = argparse.ArgumentParser()
449 | parser.add_argument("--dataset", type=str, default="Market/cloudbed-1")
450 | parser.add_argument("--sample_num", type=int, default=1)
451 | parser.add_argument("--start_idx", type=int, default=0)
452 | parser.add_argument("--end_idx", type=int, default=150)
453 | parser.add_argument("--sample_interval", type=int, default=60)
454 | parser.add_argument("--mode", type=str, default="direct")
455 | parser.add_argument("--tag", type=str, default='lm')
456 |
457 | args = parser.parse_args()
458 |
459 | main(args)
460 |
461 |
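462 | # Example invocation (hedged; assumes the dataset has been downloaded into `dataset/`
463 | # and `rca/api_config.yaml` is configured):
464 | #
465 | #     python rca/run_sampling_balanced.py --dataset Telecom --mode cot --sample_interval 60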
--------------------------------------------------------------------------------
/rca/run_sampling_oracle.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
4 | sys.path.insert(0, project_root)
5 | import pandas as pd
6 | import argparse
7 | from datetime import datetime
8 | from loguru import logger
9 | from copy import deepcopy
10 |
11 | from rca.baseline.direct_lm import DirectLM
12 | from rca.baseline.cot_lm import CoTLM
13 | from main.evaluate import evaluate
14 | from time import time
15 | from rca.api_router import configs
16 |
17 | def cache_df_dict(dataset_name:str):
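    |     """Load every telemetry CSV of `dataset_name` into memory.
    |
    |     Returns a dict mapping day -> {data_type: [(filename, DataFrame), ...]},
    |     the per-type oracle KPI lists for the dataset, and the `cand` prompt
    |     text imported from the dataset's basic_prompt module.
    |     """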
18 |
19 | df_dict = dict()
20 |
21 | if dataset_name == "Telecom":
22 | from rca.baseline.oracle_kpis import kpi_Telecom
23 | selected_kpi_dict = kpi_Telecom
24 |
25 | example_df_dict = {
26 | "metric": [],
27 | "trace": [],
28 | }
29 | dataset_path = "Telecom"
30 |
31 | import rca.baseline.rca_agent.prompt.basic_prompt_Telecom as bp
32 | cand = bp.cand
33 |
34 | elif dataset_name == "Bank":
35 | from rca.baseline.oracle_kpis import kpi_Bank
36 | selected_kpi_dict = kpi_Bank
37 |
38 | example_df_dict = {
39 | "log": [],
40 | "metric": [],
41 | "trace": [],
42 | }
43 | dataset_path = "Bank"
44 |
45 | import rca.baseline.rca_agent.prompt.basic_prompt_Bank as bp
46 | cand = bp.cand
47 |
48 | elif dataset_name == "Market/cloudbed-1":
49 | from rca.baseline.oracle_kpis import kpi_Market
50 | selected_kpi_dict = kpi_Market
51 |
52 | example_df_dict = {
53 | "log": [],
54 | "metric": [],
55 | "trace": [],
56 | }
57 | dataset_path = "Market/cloudbed-1"
58 |
59 | import rca.baseline.rca_agent.prompt.basic_prompt_Market as bp
60 | cand = bp.cand
61 |
62 | elif dataset_name == "Market/cloudbed-2":
63 | from rca.baseline.oracle_kpis import kpi_Market
64 | selected_kpi_dict = kpi_Market
65 |
66 | example_df_dict = {
67 | "log": [],
68 | "metric": [],
69 | "trace": [],
70 | }
71 | dataset_path = "Market/cloudbed-2"
72 |
73 | import rca.baseline.rca_agent.prompt.basic_prompt_Market as bp
74 | cand = bp.cand
75 |
76 | for day_time in os.listdir(f"dataset/{dataset_path}/telemetry/"):
77 | if day_time == '.DS_Store':
78 | continue
79 | if day_time not in df_dict:
80 | df_dict[day_time] = deepcopy(example_df_dict)
81 |
82 | for data_type in os.listdir(f"dataset/{dataset_path}/telemetry/{day_time}"):
83 | if data_type == '.DS_Store':
84 | continue
85 | for fname in os.listdir(f"dataset/{dataset_path}/telemetry/{day_time}/{data_type}"):
86 | t0 = time()
87 | cur_df = pd.read_csv(f"dataset/{dataset_path}/telemetry/{day_time}/{data_type}/{fname}")
88 | t1 = time()
89 | logger.debug(f"{round(t1-t0,1)} seconds for reading {fname}")
90 |
91 | cur_df = cur_df.reset_index()
92 | if "timestamp" in cur_df.columns:
93 | col = "timestamp"
94 | elif "startTime" in cur_df.columns:
95 | col = "startTime"
96 | else:
97 | logger.error("There is no 'startTime' or 'timestamp' indicating the timestamp of the data entries")
98 | raise IndexError
99 |                 cur_df[col] = cur_df[col].apply(lambda x: int(x // 1000) if len(str(x)) == 13 else x)  # normalize 13-digit (millisecond) timestamps to seconds
100 | t2 = time()
101 |                 logger.debug(f"{round(t2-t1, 1)} seconds for preprocessing DataFrame")
102 | if cur_df.empty:
103 | logger.warning(f"{fname} is empty")
104 | else:
105 | df_dict[day_time][data_type].append((fname, cur_df))
106 |
107 | return df_dict, selected_kpi_dict, cand
108 |
109 |
110 | def extract_period_data(df_list: list, data_type: str, target_timestamp: int, sample_interval=60, selected_kpi=None) -> str:
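    |     """Render a 30-minute telemetry window around `target_timestamp` as text.
    |
    |     The window is aligned to 1800-second boundaries. Rows are downsampled to
    |     roughly one entry per `sample_interval` seconds (denser for logs), and
    |     metric rows are filtered to `selected_kpi` when a KPI-name column exists.
    |     """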
111 |
112 | logger.debug(f"Extracting {data_type} data ...")
113 |
114 | extracted_data = ""
115 | for fname, df_file in df_list:
116 |
117 | if "timestamp" in df_file.columns:
118 | col = "timestamp"
119 | elif "startTime" in df_file.columns:
120 | col = "startTime"
121 | else:
122 | logger.error("There is no 'startTime' or 'timestamp' indicating the timestamp of the data entries")
123 | raise IndexError
124 |
125 | t1 = time()
126 | start_timestamp = target_timestamp - target_timestamp % 1800
127 | end_timestamp = start_timestamp + 1800
128 | filtered_df = df_file[(df_file[col] >= start_timestamp) & (df_file[col] <= end_timestamp)]
129 | filtered_df = filtered_df.drop(columns=["index"])
130 |
131 | t2 = time()
132 | logger.debug(f"{round(t2-t1,1)} seconds for filtering 30 min data")
133 |
134 | if data_type == "log":
135 | filtered_df = filtered_df.drop(columns=["log_id"])
136 | filtered_df = filtered_df.drop(columns=["cmdb_id"])
137 | filtered_df = filtered_df.drop(columns=["log_name"])
138 | schema = filtered_df.columns
139 | extracted_data = extracted_data + f'\n\n#### {fname}'
140 | extracted_data = extracted_data + f' Schema: ' + ','.join(schema) + '\n'
141 | resampled_df = filtered_df.groupby(filtered_df[col] // (sample_interval/5)).first()
142 | if resampled_df.empty:
143 | extracted_data = extracted_data + "DATA NOT AVAILABLE\n"
144 | else:
145 | data = resampled_df.astype(str).agg(','.join, axis=1)
146 | extracted_data = extracted_data + '\n'.join(data) + '\n'
147 | elif data_type == "trace":
148 | opt_traceid_field_name = ["traceId", "trace_id"]
149 | traceid_field_name = None
150 | for opt_name in opt_traceid_field_name:
151 | if opt_name in df_file.columns:
152 |                         traceid_field_name = opt_name
    |                         break
153 |                 if traceid_field_name is None:
154 | logger.error("There is no 'traceId' or 'trace_id' indicating the trace_id of the data entries")
155 | raise IndexError
156 | opt_spanid_field_name = ["id", "span_id"]
157 | spanid_field_name = None
158 | for opt_name in opt_spanid_field_name:
159 | if opt_name in df_file.columns:
160 |                         spanid_field_name = opt_name
    |                         break
161 |                 if spanid_field_name is None:
162 | logger.error("There is no 'id' or 'span_id' indicating the span_id of the data entries")
163 | raise IndexError
164 | opt_parent_field_name = ["pid", "parent_id", "parent_span"]
165 | parent_field_name = None
166 | for opt_name in opt_parent_field_name:
167 | if opt_name in df_file.columns:
168 |                         parent_field_name = opt_name
    |                         break
169 |                 if parent_field_name is None:
170 | logger.error("There is no 'pid' or 'parent_id' indicating the parent_id of the data entries")
171 | raise IndexError
172 | opt_duration_field_name = ["elapsedTime", "duration"]
173 | duration_field_name = None
174 | for opt_name in opt_duration_field_name:
175 | if opt_name in df_file.columns:
176 |                         duration_field_name = opt_name
    |                         break
177 |                 if duration_field_name is None:
178 | logger.error("There is no 'elapsedTime' or 'duration' indicating the duration of the data entries")
179 | raise IndexError
180 |
181 | filtered_df = filtered_df[[col, traceid_field_name, spanid_field_name, parent_field_name, duration_field_name, "cmdb_id"]]
182 | schema = filtered_df.columns
183 | schema = schema.drop([traceid_field_name])
184 | extracted_data = extracted_data + f'\n\n#### {fname}'
185 | extracted_data = extracted_data + f' Schema: ' + ','.join([s for s in schema if s != col]) + '\n'
186 | resampled_df = filtered_df.groupby(filtered_df[col] // sample_interval).first()
187 | trace_ids = resampled_df[traceid_field_name]
188 | trace_dfs = filtered_df[filtered_df[traceid_field_name].isin(trace_ids)]
189 | trace_grouped_df = trace_dfs.groupby(traceid_field_name)
190 | for trace_id, trace_df in trace_grouped_df:
191 | resource_name = f'{trace_id}'
192 | for field in trace_df.columns:
193 | if trace_df[field].dtype in [float, "float64"]:
194 | trace_df[field] = trace_df[field].apply(lambda x: round(x, 2))
195 | trace_df = trace_df.drop(columns=[traceid_field_name, col])
196 | if 'group' in trace_df.columns:
197 | trace_df = trace_df.drop(columns=["group"])
198 | if trace_df.empty:
199 | extracted_data = extracted_data + f'Trace ID: {resource_name}\n```\n' + "DATA NOT AVAILABLE\n```\n"
200 | else:
201 | data = trace_df.astype(str).agg(','.join, axis=1)
202 | extracted_data = extracted_data + f'Trace ID: {resource_name}\n```\n' + '\n'.join(data) + '\n```\n'
203 | t3 = time()
204 |             logger.debug(f"{round(t3-t2,1)} seconds for extracting trace data")
205 | elif data_type == "metric":
206 | opt_kpi_field_name = ["name", "kpi_name", "serviceName", "tc", "service"]
207 | kpi_field_name = None
208 | for opt_name in opt_kpi_field_name:
209 | if opt_name in df_file.columns:
210 | kpi_field_name = opt_name
211 | break
212 |             if kpi_field_name is None:
213 | logger.error("There is no 'name' or 'serviceName' indicating the kpi_name of the data entries")
214 | raise IndexError
215 |
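    |             # Oracle setting: keep only KPIs named in the per-dataset
    |             # allow-list (see rca/baseline/oracle_kpis.py).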
216 | if kpi_field_name == 'name' or kpi_field_name == 'kpi_name':
217 | filtered_df = filtered_df[filtered_df[kpi_field_name].isin(selected_kpi)]
218 |
219 | filtered_df["group"] = filtered_df[col].apply(lambda x: x // sample_interval)
220 | if 'cmdb_id' not in filtered_df.columns:
221 | filtered_df["resource_name"] = filtered_df[kpi_field_name]
222 | filtered_df = filtered_df.drop(columns=[kpi_field_name])
223 | else:
224 | filtered_df["resource_name"] = filtered_df["cmdb_id"] + "_" + filtered_df[kpi_field_name]
225 | filtered_df = filtered_df.drop(columns=["cmdb_id", kpi_field_name])
226 | if "itemid" in filtered_df.columns:
227 | filtered_df = filtered_df.drop(columns=["itemid"])
228 | if "bomc_id" in filtered_df.columns:
229 | filtered_df = filtered_df.drop(columns=["bomc_id"])
230 | schema = filtered_df.columns
231 | schema = schema.drop("resource_name")
232 | schema = schema.drop('group')
233 | extracted_data = extracted_data + f'\n\n#### {fname}'
234 | extracted_data = extracted_data + f' Schema: ' + ','.join([s for s in schema if s != col]) + '\n'
235 | resource_grouped_df = filtered_df.groupby("resource_name")
236 | for resource_name, resource_df in resource_grouped_df:
237 | resampled_df = resource_df.groupby(resource_df[col] // sample_interval).first()
238 | for field in resampled_df.columns:
239 | if resampled_df[field].dtype in [float, "float64"]:
240 | resampled_df[field] = resampled_df[field].apply(lambda x: round(x, 2))
241 | resampled_df = resampled_df.drop(columns=["resource_name"])
242 | if 'group' in resampled_df.columns:
243 | resampled_df = resampled_df.drop(columns=["group"])
244 |                     if resampled_df.empty:
245 |                         extracted_data = extracted_data + f'{resource_name}\n```\n' + "DATA NOT AVAILABLE\n```\n"
    |                     else:
246 |                         resampled_df = resampled_df.drop(columns=[col])
247 |                         data = resampled_df.astype(str).agg(','.join, axis=1)
248 |                         extracted_data = extracted_data + f'{resource_name}\n```\n' + '\n'.join(data) + '\n```\n'
249 |
250 | t3 = time()
251 | logger.debug(f"{round(t3-t2,1)} seconds for selecting metric data")
252 | return extracted_data
253 |
254 | def main(args):
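    |     # Oracle sampling baseline: for each query, extract the telemetry window,
    |     # prompt the LM (direct or CoT), and score the prediction against the
    |     # query's scoring points.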
255 | import rca.baseline.rca_agent.prompt.agent_prompt as ap
256 | if args.dataset == "Telecom":
257 | import rca.baseline.rca_agent.prompt.basic_prompt_Telecom as bp
258 | elif args.dataset == "Bank":
259 | import rca.baseline.rca_agent.prompt.basic_prompt_Bank as bp
260 | elif args.dataset == "Market/cloudbed-1" or args.dataset == "Market/cloudbed-2":
261 | import rca.baseline.rca_agent.prompt.basic_prompt_Market as bp
262 |
263 | inst_file = f"dataset/{args.dataset}/query.csv"
264 | gt_file = f"dataset/{args.dataset}/record.csv"
265 | eval_file = f"test/result/{args.dataset}/oracle_{args.tag}_{args.mode}-{configs['MODEL'].split('/')[-1]}.csv"
266 | obs_path = f"test/monitor/{args.dataset}/oracle_{args.tag}_{args.mode}-{configs['MODEL'].split('/')[-1]}"
267 | unique_obs_path = f"{obs_path}/{uid}"
268 |
269 |     if not os.path.exists(inst_file) or not os.path.exists(gt_file):
270 |         raise FileNotFoundError("Please download the dataset first.")
271 |     instruct_data = pd.read_csv(inst_file)
272 |     gt_data = pd.read_csv(gt_file)
273 |
274 | if not os.path.exists(f"{unique_obs_path}/prompt"):
275 | os.makedirs(f"{unique_obs_path}/prompt")
276 | if not os.path.exists(eval_file):
277 | if not os.path.exists(f"test/result/{args.dataset}"):
278 | os.makedirs(f"test/result/{args.dataset}")
279 |         eval_df = pd.DataFrame(columns=["row_id", "task_index", "instruction", "prediction", "groundtruth", "passed", "failed", "score"])
280 | else:
281 | eval_df = pd.read_csv(eval_file)
282 |
283 | logfile = f"{unique_obs_path}/batch.log"
284 | logger.remove()
285 | logger.add(sys.stdout, colorize=True, enqueue=True, level="INFO")
286 |     logger.add(logfile, colorize=False, enqueue=True, level="INFO")  # no ANSI color codes in the log file
287 |
288 | scores = {
289 | "total": 0,
290 | "easy": 0,
291 | "middle": 0,
292 | "hard": 0,
293 | }
294 | nums = {
295 | "total": 0,
296 | "easy": 0,
297 | "middle": 0,
298 | "hard": 0,
299 | }
300 |
301 | logger.info(f"Using dataset: {args.dataset}")
302 | logger.info(f"Using model: {configs['MODEL'].split('/')[-1]}")
303 | logger.info("Start caching dataframes ...")
304 | df_dict, selected_kpi_dict, cand = cache_df_dict(args.dataset)
305 |
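    |     # Flatten the per-type KPI lists into a single membership set.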
306 | selected_kpi = []
307 | for typ, ls in selected_kpi_dict.items():
308 | selected_kpi.extend(ls)
309 | selected_kpi = set(selected_kpi)
310 |
311 | for idx, row in instruct_data.iterrows():
312 |
313 | if idx < args.start_idx:
314 | continue
315 | if idx > args.end_idx:
316 | break
317 |
318 | instruction = row["instruction"]
319 | timestamp = gt_data.iloc[idx]["timestamp"].astype(int)
320 | date_time = gt_data.iloc[idx]["datetime"].split(" ")[0].replace("-","_")
321 | task_index = row["task_index"]
322 | scoring_points = row["scoring_points"]
323 | task_id = int(task_index.split('_')[1])
324 |         best_score = 0
    |         # Pre-seed the per-query tallies so they still exist if every sampling
    |         # attempt below fails before assigning them inside the try-block.
    |         temp_scores = scores.copy()
    |         temp_nums = nums.copy()
325 |
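    |         # Difficulty buckets by task id: <=3 easy, <=6 middle, <=7 hard.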
326 | if task_id <= 3:
327 | catalog = "easy"
328 | elif task_id <= 6:
329 | catalog = "middle"
330 | elif task_id <= 7:
331 | catalog = "hard"
332 |
333 | for i in range(args.sample_num):
334 | uuid = uid + f"_#{idx}-{i}"
335 | promptfile = f"{unique_obs_path}/prompt/{uuid}.txt"
336 | logger.debug('\n' + "#"*80 + f"\n{uuid}: {task_index}\n" + "#"*80)
337 |
338 | period_data = dict()
339 |
340 | if args.dataset != "Telecom":
341 | period_data["log"] = extract_period_data(deepcopy(df_dict[date_time]["log"]),
342 | "log", timestamp,
343 | sample_interval=args.sample_interval,
344 | )
345 |
346 |
347 | period_data["metric"] = extract_period_data(deepcopy(df_dict[date_time]["metric"]),
348 | "metric",
349 | timestamp,
350 | sample_interval=args.sample_interval,
351 | selected_kpi=selected_kpi,
352 | )
353 |
354 | period_data["trace"] = extract_period_data(deepcopy(df_dict[date_time]["trace"]),
355 | "trace",
356 | timestamp,
357 | sample_interval=args.sample_interval,
358 | )
359 |
360 | try:
361 | if args.mode == "direct":
362 | model = DirectLM(gt_data, cand)
363 | elif args.mode == "cot":
364 | model = CoTLM(gt_data, cand)
365 |
366 | prediction, prompt = model.run(instruction, period_data, args.sample_interval, logger)
367 |             with open(promptfile, 'w') as f:
368 | for p in prompt:
369 | f.write(str(p['content']))
370 | f.write('\n\n')
371 | f.write(str(prediction))
372 |
373 | new_eval_df = pd.DataFrame([{"row_id": idx,
374 | "task_index": task_index,
375 | "instruction": instruction,
376 | "prediction": prediction,
377 | "groundtruth": '\n'.join([f'{col}: {gt_data.iloc[idx][col]}' for col in gt_data.columns if col != 'description']),
378 | "passed": "N/A",
379 | "failed": "N/A",
380 | "score": "N/A"}])
381 | eval_df = pd.concat([eval_df, new_eval_df],
382 | ignore_index=True)
383 | eval_df.to_csv(eval_file,
384 | index=False)
385 |
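    |             # "EXCEED!" is the sentinel returned by model.run, presumably when
    |             # the assembled prompt exceeds the context budget; such attempts
    |             # are recorded with a zero score.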
386 | if prediction == "EXCEED!":
387 | passed_criteria = ["EXCEED!"]
388 | failed_criteria = ["EXCEED!"]
389 | score = 0.0
390 | else:
391 | passed_criteria, failed_criteria, score = evaluate(prediction, scoring_points)
392 | logger.info(f"Prediction: {prediction}")
393 | logger.info(f"Scoring Points: {scoring_points}")
394 | logger.info(f"Passed Criteria: {passed_criteria}")
395 | logger.info(f"Failed Criteria: {failed_criteria}")
396 | logger.info(f"Score: {score}")
397 | best_score = max(best_score, score)
398 |
399 | eval_df.loc[eval_df.index[-1], "passed"] = '\n'.join(passed_criteria)
400 | eval_df.loc[eval_df.index[-1], "failed"] = '\n'.join(failed_criteria)
401 | eval_df.loc[eval_df.index[-1], "score"] = score
402 | eval_df.to_csv(eval_file,
403 | index=False)
404 |
405 | temp_scores = scores.copy()
406 | temp_scores[catalog] += best_score
407 | temp_scores["total"] += best_score
408 | temp_nums = nums.copy()
409 | temp_nums[catalog] += 1
410 | temp_nums["total"] += 1
411 |
412 | except Exception as e:
413 | logger.error(e)
414 | continue
415 |
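    |         # Commit the tallies only after the sampling loop; `temp_*` were last
    |         # set by a successful attempt, so failures leave the totals unchanged.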
416 | scores = temp_scores
417 | nums = temp_nums
418 |
419 |
420 |
421 | if __name__ == "__main__":
422 | uid = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
423 | parser = argparse.ArgumentParser()
424 | parser.add_argument("--dataset", type=str, default="Market/cloudbed-1")
425 | parser.add_argument("--sample_num", type=int, default=1)
426 | parser.add_argument("--start_idx", type=int, default=0)
427 | parser.add_argument("--end_idx", type=int, default=150)
428 | parser.add_argument("--sample_interval", type=int, default=60)
429 | parser.add_argument("--mode", type=str, default="direct")
430 | parser.add_argument("--tag", type=str, default='lm')
431 |
432 | args = parser.parse_args()
433 |
434 | main(args)
435 |
436 |
--------------------------------------------------------------------------------
/rca/test.sh:
--------------------------------------------------------------------------------
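    | # Score the archived agent predictions (-p) against the matching query files
    | # (-q), writing combined results to test/agent_claude.csv (-r); flag roles
    | # inferred from the file lists passed to main.evaluate.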
1 | python -m main.evaluate \
2 | -p \
3 | rca/archive/agent-Bank.csv \
4 | rca/archive/agent-Market-cloudbed-1.csv \
5 | rca/archive/agent-Market-cloudbed-2.csv \
6 | rca/archive/agent-Telecom.csv \
7 | -q \
8 | dataset/Bank/query.csv \
9 | dataset/Market/cloudbed-1/query.csv \
10 | dataset/Market/cloudbed-2/query.csv \
11 | dataset/Telecom/query.csv \
12 | -r \
13 | test/agent_claude.csv
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | anthropic==0.39.0
2 | ipython==8.17.2
3 | loguru==0.7.2
4 | nbformat==5.10.4
5 | openai==1.54.3
6 | pandas==1.5.3
7 | protobuf==5.28.3
8 | pytz==2022.7
9 | PyYAML==6.0.2
10 | tiktoken==0.7.0
11 |
--------------------------------------------------------------------------------