├── .asset
│   ├── openrca.png
│   └── rcaagent.png
├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
├── SECURITY.md
├── SUPPORT.md
├── dataset
│   └── README.md
├── docs
│   ├── .gitignore
│   ├── eslint.config.js
│   ├── index.html
│   ├── package-lock.json
│   ├── package.json
│   ├── public
│   │   ├── cuhksz.png
│   │   ├── microsoft.jpg
│   │   ├── ms_logo.svg
│   │   ├── overview.pdf
│   │   ├── overview.png
│   │   ├── overview.svg
│   │   ├── thu.jpg
│   │   └── vite.svg
│   ├── src
│   │   ├── App.tsx
│   │   ├── components
│   │   │   └── Layout.tsx
│   │   ├── data
│   │   │   └── modelData.ts
│   │   ├── index.css
│   │   ├── main.tsx
│   │   └── pages
│   │       └── Home.tsx
│   ├── tsconfig.app.json
│   ├── tsconfig.json
│   ├── tsconfig.node.json
│   └── vite.config.ts
├── main
│   ├── evaluate.py
│   ├── generate.py
│   ├── prompt.py
│   └── task_specification.json
├── rca
│   ├── api_config.yaml
│   ├── api_router.py
│   ├── archive
│   │   ├── agent-Bank.csv
│   │   ├── agent-Market-cloudbed-1.csv
│   │   ├── agent-Market-cloudbed-2.csv
│   │   └── agent-Telecom.csv
│   ├── baseline
│   │   ├── cot_lm.py
│   │   ├── direct_lm.py
│   │   ├── oracle_kpis.py
│   │   └── rca_agent
│   │       ├── controller.py
│   │       ├── executor.py
│   │       ├── prompt
│   │       │   ├── agent_prompt.py
│   │       │   ├── basic_prompt_Bank.py
│   │       │   ├── basic_prompt_Market.py
│   │       │   └── basic_prompt_Telecom.py
│   │       └── rca_agent.py
│   ├── run_agent_standard.py
│   ├── run_sampling_balanced.py
│   ├── run_sampling_oracle.py
│   └── test.sh
└── requirements.txt

/.asset/openrca.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/OpenRCA/bc7aba3a01513f59ea6f2ff7788df1c2be7bf0b8/.asset/openrca.png

--------------------------------------------------------------------------------
/.asset/rcaagent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/OpenRCA/bc7aba3a01513f59ea6f2ff7788df1c2be7bf0b8/.asset/rcaagent.png

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # OpenRCA
2 | .temp/
3 | dataset/
4 | test/
5 | api_config.yaml
6 |
7 | # Mac
8 | .DS_Store
9 |
10 | # Byte-compiled / optimized / DLL files
11 | __pycache__/
12 | *.py[cod]
13 | *$py.class
14 |
15 | # C extensions
16 | *.so
17 |
18 | # Distribution / packaging
19 | .Python
20 | build/
21 | develop-eggs/
22 | dist/
23 | downloads/
24 | eggs/
25 | .eggs/
26 | lib/
27 | lib64/
28 | parts/
29 | sdist/
30 | var/
31 | wheels/
32 | share/python-wheels/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 | MANIFEST
37 |
38 | # PyInstaller
39 | # Usually these files are written by a python script from a template
40 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
41 | *.manifest
42 | *.spec
43 |
44 | # Installer logs
45 | pip-log.txt
46 | pip-delete-this-directory.txt
47 |
48 | # Unit test / coverage reports
49 | htmlcov/
50 | .tox/
51 | .nox/
52 | .coverage
53 | .coverage.*
54 | .cache
55 | nosetests.xml
56 | coverage.xml
57 | *.cover
58 | *.py,cover
59 | .hypothesis/
60 | .pytest_cache/
61 | cover/
62 |
63 | # Translations
64 | *.mo
65 | *.pot
66 |
67 | # Django stuff:
68 | *.log
69 | local_settings.py
70 | db.sqlite3
71 | db.sqlite3-journal
72 |
73 | # Flask stuff:
74 | instance/
75 | .webassets-cache
76 |
77 | # Scrapy stuff:
78 | .scrapy
79 |
80 | # Sphinx documentation
81 | docs/_build/
82 |
83 | # PyBuilder
84 | .pybuilder/
85 | target/
86 |
87 | # Jupyter Notebook
88 | .ipynb_checkpoints
89 |
90 | # IPython
91 | profile_default/
92 | ipython_config.py
93 |
94 | # pyenv
95 | # For a library or package, you might want to ignore these files since the code is
96 | # intended to run in multiple environments; otherwise, check them in:
97 | # .python-version
98 |
99 | # pipenv
100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
103 | # install all needed dependencies.
104 | #Pipfile.lock
105 |
106 | # poetry
107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108 | # This is especially recommended for binary packages to ensure reproducibility, and is more
109 | # commonly ignored for libraries.
110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111 | #poetry.lock
112 |
113 | # pdm
114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115 | #pdm.lock
116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117 | # in version control.
118 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
119 | .pdm.toml
120 | .pdm-python
121 | .pdm-build/
122 |
123 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
124 | __pypackages__/
125 |
126 | # Celery stuff
127 | celerybeat-schedule
128 | celerybeat.pid
129 |
130 | # SageMath parsed files
131 | *.sage.py
132 |
133 | # Environments
134 | .env
135 | .venv
136 | env/
137 | venv/
138 | ENV/
139 | env.bak/
140 | venv.bak/
141 |
142 | # Spyder project settings
143 | .spyderproject
144 | .spyproject
145 |
146 | # Rope project settings
147 | .ropeproject
148 |
149 | # mkdocs documentation
150 | /site
151 |
152 | # mypy
153 | .mypy_cache/
154 | .dmypy.json
155 | dmypy.json
156 |
157 | # Pyre type checker
158 | .pyre/
159 |
160 | # pytype static type analyzer
161 | .pytype/
162 |
163 | # Cython debug symbols
164 | cython_debug/
165 |
166 | # PyCharm
167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
169 | # and can be added to the global gitignore or merged into this file. For a more nuclear
170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
171 | #.idea/
172 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OpenRCA
2 |
3 | ![Python Version](https://img.shields.io/badge/Python-3776AB?&logo=python&logoColor=white-blue&label=3.10%20%7C%203.11)
4 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5 | ![Welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)
6 |
7 |
8 |
9 | OpenRCA is a benchmark for assessing LLMs' root cause analysis ability in software operation scenarios. Given a natural language query, LLMs need to analyze large volumes of telemetry data to identify the relevant root cause elements. This process requires the models to understand complex system dependencies and perform comprehensive reasoning across various types of telemetry data, including KPI time series, dependency trace graphs, and semi-structured log text.
10 |
11 |
12 |
13 |
14 |
15 | We also introduce RCA-agent as a baseline for OpenRCA. By using Python for data retrieval and analysis, the model avoids processing overly long contexts, enabling it to focus on reasoning while remaining scalable to extensive telemetry.
16 |
17 |
18 |
19 | ## ✨ Quick Start
20 |
21 | > ⚠️ Since the OpenRCA dataset includes a large amount of telemetry and RCA-agent performs extensive in-memory operations, we recommend using a device with at least 80GB of storage space and 32GB of memory.
22 |
23 | ### 🛠️ Installation
24 |
25 | OpenRCA requires **Python >= 3.10**. It can be installed by running the following commands:
26 | ```bash
27 | # [optional] create a conda environment
28 | # conda create -n openrca python=3.10
29 | # conda activate openrca
30 |
31 | # clone the repository
32 | git clone https://github.com/microsoft/OpenRCA.git
33 | cd OpenRCA
34 | # install the requirements
35 | pip install -r requirements.txt
36 | ```
37 |
38 | The telemetry data can be downloaded from [Google Drive](https://drive.google.com/drive/folders/1wGiEnu4OkWrjPxfx5ZTROnU37-5UDoPM?usp=drive_link). Once you have downloaded the telemetry dataset, please place it under the `dataset/` path (which is currently empty).
39 |
40 | The directory structure of the data is:
41 |
42 | ```
43 | .
44 | ├── {SYSTEM}
45 | │   ├── query.csv
46 | │   ├── record.csv
47 | │   └── telemetry
48 | │       ├── {DATE}
49 | │       │   ├── log
50 | │       │   ├── metric
51 | │       │   └── trace
52 | │       └── ...
53 | └── ...
54 | ```
55 |
56 | where the `{SYSTEM}` can be `Telecom`, `Bank`, or `Market`, and the `{DATE}` format is `{YYYY_MM_DD}`.
57 |
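As a quick sanity check of the downloaded data, the following minimal sketch (an editorial illustration, not part of the OpenRCA tooling; it assumes the `dataset/` layout shown above) lists the telemetry days found for each system:

```python
import os

DATASET_ROOT = "dataset"  # assumed download location, as described above

for system in sorted(os.listdir(DATASET_ROOT)):
    telemetry_dir = os.path.join(DATASET_ROOT, system, "telemetry")
    if not os.path.isdir(telemetry_dir):
        # e.g., Market nests its telemetry under cloudbed-1/ and cloudbed-2/
        continue
    dates = sorted(os.listdir(telemetry_dir))
    print(f"{system}: {len(dates)} day(s) of telemetry, e.g., {dates[:3]}")
```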
58 | ### 🖊️ Evaluation
59 |
60 | Use the following command to evaluate:
61 |
62 | ```bash
63 | python -m main.evaluate \
64 |     -p [prediction csv files to evaluate] \
65 |     -q [groundtruth csv files to evaluate] \
66 |     -r [report csv file to save]
67 | ```
68 |
69 | Note that the prediction CSV file must include at least a "prediction" field for valid evaluation (extra fields are allowed). Each prediction should be a JSON-like string containing all required elements for each query (extra elements are allowed). If there are multiple failures, list them in chronological order (e.g., 1, 2, 3, ...):
70 |
71 |
72 | ```json
73 | {
74 |     "1": {
75 |         "root cause occurrence datetime": "[%Y-%m-%d %H:%M:%S]",
76 |         "root cause component": "[COMPONENT]",
77 |         "root cause reason": "[REASON]"
78 |     },
79 |     ...
80 | }
81 | ```
82 |
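If you produce predictions with your own pipeline, a prediction file can be assembled as in the following minimal sketch (an editorial illustration; the values are copied from the archived Telecom example, and only the `prediction` column is required):

```python
import json
import pandas as pd

# One JSON-like string per query, in the same row order as the query file.
prediction = json.dumps({
    "1": {
        "root cause occurrence datetime": "2020-04-11 00:05:00",
        "root cause component": "docker_003",
        "root cause reason": "CPU fault",
    }
})

pd.DataFrame({"prediction": [prediction]}).to_csv("my_prediction.csv", index=False)
```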
83 | For example, to evaluate the archived predictions of RCA-agent (Claude ver.), you can use the following command:
84 |
85 | ```bash
86 | python -m main.evaluate \
87 |     -p \
88 |     rca/archive/agent-Bank.csv \
89 |     rca/archive/agent-Market-cloudbed-1.csv \
90 |     rca/archive/agent-Market-cloudbed-2.csv \
91 |     rca/archive/agent-Telecom.csv \
92 |     -q \
93 |     dataset/Bank/query.csv \
94 |     dataset/Market/cloudbed-1/query.csv \
95 |     dataset/Market/cloudbed-2/query.csv \
96 |     dataset/Telecom/query.csv \
97 |     -r \
98 |     test/agent_claude.csv
99 | ```
100 |
101 | ### 🚩 Reproduction
102 |
103 | To reproduce the results in the paper, please first set up your API configuration before running OpenRCA's baselines. Taking OpenAI as an example, you can configure the `rca/api_config.yaml` file as follows:
104 |
105 | ```yaml
106 | SOURCE: "OpenAI"
107 | MODEL: "gpt-4o-2024-05-13"
108 | API_KEY: "sk-xxxxxxxxxxxxxx"
109 | ```
110 |
111 | Then, run the following commands for result reproduction:
112 |
113 | ```bash
114 | python -m rca.{TESTS} --dataset {DATASET_NAME}
115 | # Optional tests: run_agent_standard, run_sampling_balanced, run_sampling_oracle
116 | # Optional datasets: Telecom, Bank, Market/cloudbed-1, Market/cloudbed-2
117 | ```
118 |
119 | For example, if you want to evaluate RCA-agent on the Bank dataset, you should use the following command:
120 |
121 | ```bash
122 | python -m rca.run_agent_standard --dataset Bank
123 | ```
124 |
125 | Note that the telemetry of the two Market cloudbed service groups is collected separately. For example, if you want to evaluate RCA-agent on the whole Market dataset, you should use the following commands:
126 |
127 | ```bash
128 | python -m rca.run_agent_standard --dataset Market/cloudbed-1
129 | python -m rca.run_agent_standard --dataset Market/cloudbed-2
130 | ```
131 |
132 | The generated results and monitor files can be found in a new `test` directory created after running any test script.
133 |
134 | ### 💽 Reconstruction
135 |
136 | You can generate new tasks for OpenRCA telemetry, or for your own private telemetry, by modifying `main/task_specification.json` and running the following command:
137 |
138 | ```bash
139 | python -m main.generate \
140 |     -s [your specification config file] \
141 |     -r [record file to generate query] \
142 |     -q [query file to save] \
143 |     -t [timezone of telemetry]
144 | ```
145 |
146 | Note that the record schema should be consistent with the `record.csv` of OpenRCA (a minimal record-construction sketch is shown at the end of this README).
147 |
148 | You can also re-generate random queries of OpenRCA with the following command:
149 |
150 | ```bash
151 | python -m main.generate -d True
152 | ```
153 |
154 | ## 📚 Citation
155 |
156 | If you use OpenRCA in your research, please cite our paper:
157 |
158 | ```bibtex
159 | @inproceedings{
160 | xu2025openrca,
161 | title={OpenRCA: Can Large Language Models Locate the Root Cause of Software Failures?},
162 | author={Xu, Junjielong and Zhang, Qinan and Zhong, Zhiqing and He, Shilin and Zhang, Chaoyun and Lin, Qingwei and Pei, Dan and He, Pinjia and Zhang, Dongmei and Zhang, Qi},
163 | booktitle={The Thirteenth International Conference on Learning Representations},
164 | year={2025},
165 | url={https://openreview.net/forum?id=M4qNIzQYpd}
166 | }
167 | ```
168 |
169 | ## Trademarks
170 |
171 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
172 | trademarks or logos is subject to and must follow
173 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
174 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
175 | Any use of third-party trademarks or logos is subject to those third parties' policies.
176 |
177 | ## Disclaimer
178 | The recommended models in this Repo are just examples, used to explore the potential of agent systems in the accompanying ICLR 2025 paper. Users can replace the models in this Repo according to their needs. When using the recommended models in this Repo, you need to comply with the licenses of these models respectively. Microsoft shall not be held liable for any infringement of third-party rights resulting from your usage of this Repo. Users agree to defend, indemnify and hold Microsoft harmless from and against all damages, costs, and attorneys' fees in connection with any claims arising from this Repo. If anyone believes that this Repo infringes on your rights, please notify the project owner via email.
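As referenced in the Reconstruction section above, the following minimal sketch (an editorial illustration; all field values are placeholders modeled on the archived ground-truth records) builds a `record.csv` that `main.generate` can consume via `-r`:

```python
import pandas as pd

# Columns read by main/generate.py: timestamp (epoch seconds), datetime,
# component, and reason. The archived records also carry a `level` field.
record = pd.DataFrame([{
    "timestamp": 1586534700,
    "datetime": "2020-04-11 00:05:00",
    "component": "docker_003",
    "reason": "CPU fault",
    "level": "pod",
}])
record.to_csv("my_record.csv", index=False)
```

You could then run, for example, `python -m main.generate -s main/task_specification.json -r my_record.csv -q my_query.csv -t Asia/Shanghai`.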
179 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 
7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | 3 | Please download all datasets from [Google Drive](https://drive.google.com/drive/folders/1wGiEnu4OkWrjPxfx5ZTROnU37-5UDoPM?usp=drive_link). -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | pnpm-debug.log* 8 | lerna-debug.log* 9 | 10 | node_modules 11 | dist 12 | dist-ssr 13 | *.local 14 | 15 | # Editor directories and files 16 | .vscode/* 17 | !.vscode/extensions.json 18 | .idea 19 | .DS_Store 20 | *.suo 21 | *.ntvs* 22 | *.njsproj 23 | *.sln 24 | *.sw? 25 | -------------------------------------------------------------------------------- /docs/eslint.config.js: -------------------------------------------------------------------------------- 1 | import js from '@eslint/js' 2 | import globals from 'globals' 3 | import reactHooks from 'eslint-plugin-react-hooks' 4 | import reactRefresh from 'eslint-plugin-react-refresh' 5 | import tseslint from 'typescript-eslint' 6 | 7 | export default tseslint.config( 8 | { ignores: ['dist'] }, 9 | { 10 | extends: [js.configs.recommended, ...tseslint.configs.recommended], 11 | files: ['**/*.{ts,tsx}'], 12 | languageOptions: { 13 | ecmaVersion: 2020, 14 | globals: globals.browser, 15 | }, 16 | plugins: { 17 | 'react-hooks': reactHooks, 18 | 'react-refresh': reactRefresh, 19 | }, 20 | rules: { 21 | ...reactHooks.configs.recommended.rules, 22 | 'react-refresh/only-export-components': [ 23 | 'warn', 24 | { allowConstantExport: true }, 25 | ], 26 | }, 27 | }, 28 | ) 29 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | OpenRCA 8 | 9 | 10 |
11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /docs/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "leadboard", 3 | "private": true, 4 | "version": "0.0.0", 5 | "type": "module", 6 | "homepage": "https://github.com/shuaijiumei/openrca_leadboard", 7 | "scripts": { 8 | "dev": "vite", 9 | "build": "tsc && vite build", 10 | "lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0", 11 | "preview": "vite preview", 12 | "predeploy": "npm run build", 13 | "deploy": "gh-pages -d dist" 14 | }, 15 | "dependencies": { 16 | "@emotion/react": "^11.11.3", 17 | "@emotion/styled": "^11.11.0", 18 | "@mui/icons-material": "^5.15.10", 19 | "@mui/material": "^5.15.10", 20 | "react": "^18.2.0", 21 | "react-dom": "^18.2.0", 22 | "react-router-dom": "^6.22.1" 23 | }, 24 | "devDependencies": { 25 | "@types/react": "^18.2.55", 26 | "@types/react-dom": "^18.2.19", 27 | "@typescript-eslint/eslint-plugin": "^6.21.0", 28 | "@typescript-eslint/parser": "^6.21.0", 29 | "@vitejs/plugin-react": "^4.2.1", 30 | "eslint": "^8.56.0", 31 | "eslint-plugin-react-hooks": "^4.6.0", 32 | "eslint-plugin-react-refresh": "^0.4.5", 33 | "gh-pages": "^6.3.0", 34 | "typescript": "^5.2.2", 35 | "vite": "^6.2.5" 36 | } 37 | } -------------------------------------------------------------------------------- /docs/public/cuhksz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/OpenRCA/bc7aba3a01513f59ea6f2ff7788df1c2be7bf0b8/docs/public/cuhksz.png -------------------------------------------------------------------------------- /docs/public/microsoft.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/OpenRCA/bc7aba3a01513f59ea6f2ff7788df1c2be7bf0b8/docs/public/microsoft.jpg -------------------------------------------------------------------------------- /docs/public/ms_logo.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/public/overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/OpenRCA/bc7aba3a01513f59ea6f2ff7788df1c2be7bf0b8/docs/public/overview.pdf -------------------------------------------------------------------------------- /docs/public/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/OpenRCA/bc7aba3a01513f59ea6f2ff7788df1c2be7bf0b8/docs/public/overview.png -------------------------------------------------------------------------------- /docs/public/thu.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/OpenRCA/bc7aba3a01513f59ea6f2ff7788df1c2be7bf0b8/docs/public/thu.jpg -------------------------------------------------------------------------------- /docs/public/vite.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/src/App.tsx: -------------------------------------------------------------------------------- 1 | import { BrowserRouter as Router, Routes, Route } from 'react-router-dom'; 2 | import Layout from './components/Layout'; 3 
| import Home from './pages/Home';
4 | import { ThemeProvider, createTheme } from '@mui/material';
5 |
6 | const theme = createTheme({
7 |   typography: {
8 |     fontFamily: '"Inter", "Helvetica", "Arial", sans-serif',
9 |   },
10 |   palette: {
11 |     primary: {
12 |       main: '#1976d2',
13 |     },
14 |   },
15 | });
16 |
17 | function App() {
18 |   return (
19 |
20 |
21 |
22 |
23 | } />
24 | {/* Add more routes as needed */}
25 |
26 |
27 |
28 |
29 |   );
30 | }
31 |
32 | export default App;
--------------------------------------------------------------------------------
/docs/src/components/Layout.tsx:
--------------------------------------------------------------------------------
1 | import { AppBar, Toolbar, Typography, Button, Box } from '@mui/material';
2 | import { Link } from 'react-router-dom';
3 |
4 | interface LayoutProps {
5 |   children: React.ReactNode;
6 | }
7 |
8 | const Layout: React.FC = ({ children }) => {
9 |   return (
10 |
11 |
12 |
13 |
14 |
15 | OpenRCA
16 |
17 |
18 | {/* */}
21 |
24 |
27 |
28 |
29 |
30 | {children}
31 |
32 |
33 |   );
34 | };
35 |
36 | export default Layout;
--------------------------------------------------------------------------------
/docs/src/data/modelData.ts:
--------------------------------------------------------------------------------
1 | export interface Data {
2 |   name: string;
3 |   model: string;
4 |   org: string;
5 |   correct: string;
6 |   partial: string;
7 |   date: string;
8 | }
9 |
10 | // Model color mapping
11 | export const modelColorMap: { [key: string]: { color: string, backgroundColor: string } } = {
12 |   'Claude 3.5 Sonnet': { color: '#1a237e', backgroundColor: '#e8eaf6' },
13 |   'GPT-4o': { color: '#004d40', backgroundColor: '#e0f2f1' },
14 |   'Gemini 1.5 Pro': { color: '#b71c1c', backgroundColor: '#ffebee' },
15 |   'Mistral Large 2': { color: '#0d47a1', backgroundColor: '#bbdefb' },
16 |   'Command R+': { color: '#4a148c', backgroundColor: '#e1bee7' },
17 |   'Llama 3.1 Instruct': { color: '#e65100', backgroundColor: '#ffe0b2' }
18 | };
19 |
20 | // Organization logo mapping
21 | export const orgLogoMap: { [key: string]: string } = {
22 |   'Microsoft': '/OpenRCA/ms_logo.svg',
23 |   'Google': '/OpenRCA/google_logo.svg',
24 |   'OpenAI': '/OpenRCA/openai_logo.svg',
25 |   'Anthropic': '/OpenRCA/anthropic_logo.svg',
26 |   'Meta': '/OpenRCA/meta_logo.svg'
27 | };
28 |
29 | // News data
30 | export const news = [
31 |   { date: '2025/1/23', content: 'Our paper has been accepted by ICLR 2025.' },
32 |   { date: '2025/1/23', content: 'Released OpenRCA dataset with 335 failure cases.' }
33 | ];
34 |
35 | // Model data
36 | export const modelData: Data[] = [
37 |   // Closed Models - RCA-Agent
38 |   { name: 'RCA-Agent', model: 'Claude 3.5 Sonnet', org: 'Microsoft', correct: '11.34%', partial: '17.31%', date: '2025/1/23' },
39 |   { name: 'RCA-Agent', model: 'GPT-4o', org: 'Microsoft', correct: '8.96%', partial: '17.91%', date: '2025/1/23' },
40 |   { name: 'RCA-Agent', model: 'Gemini 1.5 Pro', org: 'Microsoft', correct: '2.69%', partial: '6.87%', date: '2025/1/23' },
41 |
42 |   // Closed Models - Balanced
43 |   { name: 'Prompting (Balanced)', model: 'Claude 3.5 Sonnet', org: 'Microsoft', correct: '3.88%', partial: '18.81%', date: '2025/1/23' },
44 |   { name: 'Prompting (Balanced)', model: 'GPT-4o', org: 'Microsoft', correct: '3.28%', partial: '14.33%', date: '2025/1/23' },
45 |   { name: 'Prompting (Balanced)', model: 'Gemini 1.5 Pro', org: 'Microsoft', correct: '6.27%', partial: '24.18%', date: '2025/1/23' },
46 |
47 |   // Closed Models - Oracle
48 |   { name: 'Prompting (Oracle)', model: 'Claude 3.5 Sonnet', org: 'Microsoft', correct: '5.37%', partial: '17.61%', date: '2025/1/23' },
49 |   { name: 'Prompting (Oracle)', model: 'GPT-4o', org: 'Microsoft', correct: '6.27%', partial: '15.82%', date: '2025/1/23' },
50 |   { name: 'Prompting (Oracle)', model: 'Gemini 1.5 Pro', org: 'Microsoft', correct: '7.16%', partial: '23.58%', date: '2025/1/23' },
51 |
52 |   // Open Source Models - Balanced
53 |   { name: 'Prompting (Balanced)', model: 'Mistral Large 2', org: 'Microsoft', correct: '3.58%', partial: '6.40%', date: '2025/1/23' },
54 |   { name: 'Prompting (Balanced)', model: 'Command R+', org: 'Microsoft', correct: '4.18%', partial: '8.96%', date: '2025/1/23' },
55 |   { name: 'Prompting (Balanced)', model: 'Llama 3.1 Instruct', org: 'Microsoft', correct: '2.99%', partial: '14.63%', date: '2025/1/23' },
56 |
57 |   // Open Source Models - Oracle
58 |   { name: 'Prompting (Oracle)', model: 'Mistral Large 2', org: 'Microsoft', correct: '4.48%', partial: '10.45%', date: '2025/1/23' },
59 |   { name: 'Prompting (Oracle)', model: 'Command R+', org: 'Microsoft', correct: '4.78%', partial: '7.46%', date: '2025/1/23' },
60 |   { name: 'Prompting (Oracle)', model: 'Llama 3.1 Instruct', org: 'Microsoft', correct: '3.88%', partial: '14.93%', date: '2025/1/23' },
61 |
62 |   // Open Source Models - RCA-Agent
63 |   { name: 'RCA-Agent', model: 'Llama 3.1 Instruct', org: 'Microsoft', correct: '3.28%', partial: '5.67%', date: '2025/1/23' }
64 | ];
--------------------------------------------------------------------------------
/docs/src/index.css:
--------------------------------------------------------------------------------
1 | :root {
2 |   font-family: Inter, system-ui, Avenir, Helvetica, Arial, sans-serif;
3 |   line-height: 1.5;
4 |   font-weight: 400;
5 |
6 |   color-scheme: light dark;
7 |   color: #213547;
8 |   background-color: #ffffff;
9 |
10 |   font-synthesis: none;
11 |   text-rendering: optimizeLegibility;
12 |   -webkit-font-smoothing: antialiased;
13 |   -moz-osx-font-smoothing: grayscale;
14 | }
15 |
16 | body {
17 |   margin: 0;
18 |   min-width: 320px;
19 |   min-height: 100vh;
20 | }
21 |
22 | a {
23 |   font-weight: 500;
24 |   color: #646cff;
25 |   text-decoration: inherit;
26 | }
27 | a:hover {
28 |   color: #747bff;
29 | }
30 |
31 | @media (prefers-color-scheme: dark) {
32 |   :root {
33 |     color: #213547;
34 |     background-color: #ffffff;
35 |   }
36 | }
--------------------------------------------------------------------------------
/docs/src/main.tsx:
--------------------------------------------------------------------------------
1 |
import React from 'react' 2 | import ReactDOM from 'react-dom/client' 3 | import App from './App.tsx' 4 | import './index.css' 5 | 6 | ReactDOM.createRoot(document.getElementById('root')!).render( 7 | 8 | 9 | , 10 | ) -------------------------------------------------------------------------------- /docs/tsconfig.app.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo", 4 | "target": "ES2020", 5 | "useDefineForClassFields": true, 6 | "lib": ["ES2020", "DOM", "DOM.Iterable"], 7 | "module": "ESNext", 8 | "skipLibCheck": true, 9 | 10 | /* Bundler mode */ 11 | "moduleResolution": "bundler", 12 | "allowImportingTsExtensions": true, 13 | "isolatedModules": true, 14 | "moduleDetection": "force", 15 | "noEmit": true, 16 | "jsx": "react-jsx", 17 | 18 | /* Linting */ 19 | "strict": true, 20 | "noUnusedLocals": true, 21 | "noUnusedParameters": true, 22 | "noFallthroughCasesInSwitch": true, 23 | "noUncheckedSideEffectImports": true 24 | }, 25 | "include": ["src"] 26 | } 27 | -------------------------------------------------------------------------------- /docs/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": [], 3 | "references": [ 4 | { "path": "./tsconfig.app.json" }, 5 | { "path": "./tsconfig.node.json" } 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /docs/tsconfig.node.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo", 4 | "target": "ES2022", 5 | "lib": ["ES2023"], 6 | "module": "ESNext", 7 | "skipLibCheck": true, 8 | 9 | /* Bundler mode */ 10 | "moduleResolution": "bundler", 11 | "allowImportingTsExtensions": true, 12 | "isolatedModules": true, 13 | "moduleDetection": "force", 14 | "noEmit": true, 15 | 16 | /* Linting */ 17 | "strict": true, 18 | "noUnusedLocals": true, 19 | "noUnusedParameters": true, 20 | "noFallthroughCasesInSwitch": true, 21 | "noUncheckedSideEffectImports": true 22 | }, 23 | "include": ["vite.config.ts"] 24 | } 25 | -------------------------------------------------------------------------------- /docs/vite.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from 'vite' 2 | import react from '@vitejs/plugin-react' 3 | 4 | // https://vite.dev/config/ 5 | export default defineConfig({ 6 | base: '/OpenRCA/', 7 | plugins: [react()], 8 | }) 9 | -------------------------------------------------------------------------------- /main/evaluate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import re 4 | import argparse 5 | 6 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) 7 | sys.path.insert(0, parent_dir) 8 | 9 | def evaluate(prediction:str, scoring_points:str): 10 | """ 11 | Evaluate single JSON-like prediction with corresponding scoring points 12 | args: 13 | prediction: str, the prediction (JSON-like string) 14 | scoring_points: str, the scoring points string 15 | """ 16 | 17 | import itertools 18 | 19 | predict_pattern = ( 20 | r'{\s*' 21 | r'(?:"root cause occurrence datetime":\s*"(.*?)")?,?\s*' 22 | r'(?:"root cause component":\s*"(.*?)")?,?\s*' 23 | r'(?:"root cause reason":\s*"(.*?)")?\s*}' 24 | ) 25 | 26 | predict_matches 
= re.findall(predict_pattern, prediction)
27 |
28 |
29 |     predict_results = []
30 |
31 |     for match in predict_matches:
32 |         datetime_str, component, reason = match
33 |         predict_results.append({
34 |             "root cause occurrence datetime": datetime_str,
35 |             "root cause component": component,
36 |             "root cause reason": reason
37 |         })
38 |
39 |
40 |
41 |     prediction_length = len(predict_results)
42 |
43 |     component_pattern = r"The (?:\d+-th|only) predicted root cause component is ([^\n]+)"
44 |     reason_pattern = r"The (?:\d+-th|only) predicted root cause reason is ([^\n]+)"
45 |     time_pattern = r"The (?:\d+-th|only) root cause occurrence time is within 1 minutes \(i.e., <=1min\) of ([^\n]+)"
46 |
47 |     components = re.findall(component_pattern, scoring_points)
48 |     reasons = re.findall(reason_pattern, scoring_points)
49 |     times = re.findall(time_pattern, scoring_points)
50 |
51 |     scoringpoints_length = max(len(components), len(reasons), len(times))
52 |     scores_num = len(components) + len(reasons) + len(times)
53 |
54 |     def time_difference(time1_str, time2_str):
55 |         from datetime import datetime
56 |         time_format = "%Y-%m-%d %H:%M:%S"
57 |
58 |         try:
59 |             time1 = datetime.strptime(time1_str, time_format)
60 |             time2 = datetime.strptime(time2_str, time_format)
61 |         except ValueError:
62 |             return False
63 |
64 |         time_difference = abs(time1 - time2)
65 |         if time_difference.total_seconds() <= 60:
66 |             return True
67 |         else:
68 |             return False
69 |
70 |     scores_get = 0
71 |     passing_criteria = []
72 |     failing_criteria = []
73 |
74 |     if scoringpoints_length == prediction_length:
75 |         best_score = -1
76 |         for perm in itertools.permutations(predict_results):
77 |             current_score = 0
78 |             current_passing = []
79 |             for i in range(scoringpoints_length):
80 |                 if len(components) == scoringpoints_length:
81 |                     if perm[i]['root cause component'] == components[i]:
82 |                         current_score += 1
83 |                         current_passing.append(components[i])
84 |                 if len(reasons) == scoringpoints_length:
85 |                     if perm[i]['root cause reason'] == reasons[i]:
86 |                         current_score += 1
87 |                         current_passing.append(reasons[i])
88 |                 if len(times) == scoringpoints_length:
89 |                     if time_difference(times[i], perm[i]['root cause occurrence datetime']):
90 |                         current_score += 1
91 |                         current_passing.append(times[i])
92 |             if current_score > best_score:
93 |                 best_score = current_score
94 |                 passing_criteria = current_passing
95 |         scores_get = best_score
96 |
97 |     failing_criteria = list(set(components + reasons + times) - set(passing_criteria))
98 |
99 |     final_score = scores_get / scores_num
100 |     bin_score = round(final_score, 2)
101 |     return passing_criteria, failing_criteria, bin_score
102 |
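# NOTE (illustrative annotation, not part of the original file): a worked
# example of evaluate() above. With one predicted failure and a single
# scoring point:
#
#   prediction = '{"1": {"root cause occurrence datetime": "2020-04-11 00:05:00", "root cause component": "docker_003", "root cause reason": "CPU fault"}}'
#   scoring_points = "The only predicted root cause reason is CPU fault"
#
#   evaluate(prediction, scoring_points)  # -> (["CPU fault"], [], 1.0)
#
# The permutation search matches the single predicted reason against the
# single scoring point, so every criterion passes and the score is 1.0.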
103 |
104 | def file_evaluate(prediction_file:str, query_file:str, report_file:str):
105 |     """
106 |     Evaluate a prediction file of a certain dataset with the corresponding query file and save the evaluation results to a CSV file
107 |     args:
108 |         prediction_file: str, the path of the prediction file (csv, with at least one field: 'prediction')
109 |         query_file: str, the path of a specific dataset's recorded labels (csv)
110 |         report_file: str, the path of the evaluation file (csv)
111 |     """
112 |     import pandas as pd
113 |
114 |     pred_df = pd.read_csv(prediction_file)
115 |     query_df = pd.read_csv(query_file)
116 |     eval_df = pd.DataFrame(columns=["query", "answer", "groundtruth", "passed", "failed", "score", "task_index"])
117 |
118 |     if len(pred_df) != len(query_df):
119 |         raise ValueError("The length of the prediction file and the query file should be the same")
120 |
121 |     for idx in range(len(pred_df)):
122 |         prediction = pred_df.loc[idx, "prediction"]
123 |         scoring_points = query_df.loc[idx, "scoring_points"]
124 |         passing_criteria, failing_criteria, score = evaluate(prediction, scoring_points)
125 |         instruction = query_df.loc[idx, "instruction"]
126 |         task_index = query_df.loc[idx, "task_index"]
127 |         new_row = pd.DataFrame({
128 |             "query": [instruction],
129 |             "answer": [prediction],
130 |             "groundtruth": [scoring_points],
131 |             "passed": [passing_criteria],
132 |             "failed": [failing_criteria],
133 |             "score": [score],
134 |             "task_index": [task_index]
135 |         })
136 |         eval_df = pd.concat([eval_df, new_row], ignore_index=True)
137 |
138 |
139 |     if os.path.exists(report_file):
140 |         eval_df.to_csv(report_file, mode='a', header=False, index=False)
141 |     else:
142 |         if not os.path.exists(os.path.dirname(report_file)):
143 |             os.makedirs(os.path.dirname(report_file))
144 |         eval_df.to_csv(report_file, index=False)
145 |
146 |
147 | def report(report_file):
148 |     """
149 |     Visualize the final result of a report after evaluation
150 |     args:
151 |         report_file: str, report after evaluation
152 |     """
153 |     import pandas as pd
154 |
155 |     scores = {
156 |         "easy": 0,
157 |         "middle": 0,
158 |         "hard": 0,
159 |     }
160 |     nums = {
161 |         "easy": 0,
162 |         "middle": 0,
163 |         "hard": 0,
164 |     }
165 |
166 |     df = pd.read_csv(report_file)
167 |     # By default, task_1-3 is easy, task_4-6 is middle, task_7 is hard. For DIY task specifications, you should change this line to modify the difficulty:
168 |     df["difficulty"] = df["task_index"].apply(lambda x: "easy" if int(x.split('_')[1]) <= 3 else "middle" if int(x.split('_')[1]) <= 6 else "hard")
169 |     scores['easy'] += len(df[(df["score"]==1.0) & (df["difficulty"]=="easy")])
170 |     scores['middle'] += len(df[(df["score"]==1.0) & (df["difficulty"]=="middle")])
171 |     scores['hard'] += len(df[(df["score"]==1.0) & (df["difficulty"]=="hard")])
172 |     nums['easy'] += len(df[df["difficulty"]=="easy"])
173 |     nums['middle'] += len(df[df["difficulty"]=="middle"])
174 |     nums['hard'] += len(df[df["difficulty"]=="hard"])
175 |
176 |     print(f"{'-'*12:<12}{'-'*12:<12}{'-'*12:<12}{'-'*12}")
177 |     print(f"{'Class':<12}{'Total(#)':<12}{'Correct(#)':<12}{'Accuracy(%)':<12}")
178 |     print(f"{'-'*12:<12}{'-'*12:<12}{'-'*12:<12}{'-'*12}")
179 |     for key in scores.keys():
180 |         accuracy = scores[key] / nums[key] if nums[key] > 0 else 0
181 |         print(f"{key:<12}{nums[key]:<12}{scores[key]:<12}{accuracy:.2%}")
182 |     print(f"{'-'*12:<12}{'-'*12:<12}{'-'*12:<12}{'-'*12}")
183 |     total_accuracy = sum(scores.values()) / sum(nums.values()) if sum(nums.values()) > 0 else 0
184 |     print(f"{'Total':<12}{sum(nums.values()):<12}{sum(scores.values()):<12}{total_accuracy:.2%}")
185 |     print(f"{'-'*12:<12}{'-'*12:<12}{'-'*12:<12}{'-'*12}")
186 |
187 |
188 |
189 |
190 | if __name__ == '__main__':
191 |     """
192 |     Evaluate a list of prediction files with corresponding query files, save the evaluation results, and display the statistical results.
193 |     args:
194 |         p: list, a list of prediction files to evaluate
195 |         q: list, a list of query files to evaluate
196 |         r: str, report file to save
197 |     """
198 |     parser = argparse.ArgumentParser()
199 |     parser.add_argument("-p", type=str, nargs='+', help="a list of prediction files to evaluate")
200 |     parser.add_argument("-q", type=str, nargs='+', help="a list of query files to evaluate")
201 |     parser.add_argument("-r", type=str, help="evaluation file to save")
202 |     args = parser.parse_args()
203 |
204 |     if len(args.p) != len(args.q):
205 |         raise ValueError("The length of the prediction file list and the query file list should be the same")
206 |     if os.path.exists(args.r):
207 |         os.remove(args.r)
208 |
209 |     for i in range(len(args.p)):
210 |         try:
211 |             file_evaluate(args.p[i], args.q[i], args.r)
212 |         except Exception as e:
213 |             print(f"Error when evaluating the file {args.p[i]}: {e}")
214 |             continue
215 |
216 |     report(args.r)
--------------------------------------------------------------------------------
/main/generate.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from datetime import datetime, timedelta
3 | import random
4 | import json
5 | import sys
6 | import os
7 | import pytz
8 | import argparse
9 | from main.prompt import system, user
10 |
11 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
12 | sys.path.insert(0, parent_dir)
13 |
14 | from scripts.utils import get_chat_completion
15 |
16 | random.seed(42)
17 |
18 | def timestamp2timeperiod(timestamp, timezone) -> str:
19 |     time = datetime.fromtimestamp(timestamp, timezone)
20 |     minute = time.minute
21 |     start_time = time.replace(minute=minute - (minute % 30), second=0, microsecond=0)
22 |     end_time = start_time + timedelta(minutes=30)
23 |     start_time_str = start_time.strftime('%Y-%m-%d %H:%M:%S')
24 |     end_time_str = end_time.strftime('%Y-%m-%d %H:%M:%S')
25 |     return f"{start_time_str} to {end_time_str}"
26 |
27 | def timestamp2datetime(timestamp, timezone) -> str:
28 |     time = datetime.fromtimestamp(timestamp, timezone)
29 |     time_str = time.strftime('%Y-%m-%d %H:%M:%S')  # local to the given timezone
30 |     return time_str
31 |
32 | def get_half_hour_conflict_failure_flag(meta_data) -> dict:
33 |     sorted_time = sorted(meta_data['timestamp'])
34 |     half_hour_conflict_failure_flag = {}
35 |     previous_failure_timestamp = 0
36 |     for i in range(len(sorted_time)):
37 |         timestamp = sorted_time[i]
38 |         current_failure_timestamp_left = timestamp // 1800
39 |         if current_failure_timestamp_left > previous_failure_timestamp:
40 |             previous_failure_timestamp = current_failure_timestamp_left
41 |             half_hour_conflict_failure_flag[timestamp] = False
42 |         else:
43 |             half_hour_conflict_failure_flag[timestamp] = True
44 |             half_hour_conflict_failure_flag[sorted_time[i - 1]] = True
45 |     return half_hour_conflict_failure_flag
46 |
47 | def get_multi_response_dict(row, meta_data):
48 |     num = 0
49 |     multi_dict = {
50 |         "datetime": [],
51 |         "component": [],
52 |         "reason": [],
53 |     }
54 |     cand_df = meta_data[meta_data['timestamp']//1800 == row['timestamp']//1800]
55 |     for idx, cand in cand_df.iterrows():
56 |         num += 1
57 |         for key in multi_dict:
58 |             multi_dict[key].append(cand[key])
59 |
60 |     return num, multi_dict
61 |
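# NOTE (illustrative annotation, not part of the original file): failures
# whose epoch timestamps fall into the same 1800-second bucket are flagged as
# conflicting by get_half_hour_conflict_failure_flag(). For example,
# 1586534700 and 1586535000 both satisfy timestamp // 1800 == 881408, so both
# are flagged True, and query_generate() below then builds multi-failure
# scoring points covering every failure in that half-hour bucket.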
62 | def query_generate(gt_path, spec_path, extra_spec, query_path, timezone):
63 |
64 |     meta_data = pd.read_csv(gt_path)
65 |     with open(spec_path, "r", encoding="utf8") as f:
66 |         task_templates = json.load(f)
67 |
68 |     half_hour_conflict_failure_flag = get_half_hour_conflict_failure_flag(meta_data)
69 |
70 |     full_task_ID_list = list(task_templates.keys())
71 |     df = pd.DataFrame(columns=["task_index", "instruction", "scoring_points"])
72 |     for idx, row in meta_data.iterrows():
73 |         print(f"processing: {idx}")
74 |
75 |         timestamp = row['timestamp']
76 |         reason = row['reason']
77 |         component = row['component']
78 |         datetime_str = timestamp2datetime(timestamp, timezone)
79 |         time_period = timestamp2timeperiod(timestamp, timezone)
80 |         task_index = random.choice(full_task_ID_list)
81 |
82 |         if half_hour_conflict_failure_flag[timestamp]:
83 |             num, ans = get_multi_response_dict(row, meta_data)
84 |             scoring_points = ""
85 |             for i in range(num):
86 |                 scoring_points_template = task_templates[task_index]['scoring_points'].copy()
87 |
88 |                 scoring_points_filled = [points.format(
89 |                     idx = f'{i+1}-th',
90 |                     datetime = ans['datetime'][i],
91 |                     reason = ans['reason'][i],
92 |                     component = ans['component'][i],
93 |                 ) for points in scoring_points_template]
94 |                 scoring_points += "\n".join(scoring_points_filled)
95 |                 scoring_points += "\n"
96 |             print(f"The {idx}-th is a multi-response task, containing {num} root causes.")
97 |         else:
98 |             num = 1
99 |             scoring_points = ""
100 |             for point in task_templates[task_index]['scoring_points']:
101 |                 scoring_points += point.format(
102 |                     idx='only',
103 |                     time_period=time_period,
104 |                     datetime=datetime_str,
105 |                     component=component,
106 |                     reason=reason
107 |                 )
108 |                 scoring_points += "\n"
109 |
110 |         input_specification = "```known\n"
111 |         for spec in task_templates[task_index]['input']:
112 |             input_specification += f"- "
113 |             input_specification += spec.format(
114 |                 num=num,
115 |                 time_period=time_period
116 |             )
117 |             input_specification += "\n"
118 |         if extra_spec:
119 |             input_specification += f"- {extra_spec}\n"
120 |         input_specification = input_specification.strip() + "\n```"
121 |
122 |         output_specification = "```query\n"
123 |         for spec in task_templates[task_index]['output']:
124 |             output_specification += f"- "
125 |             output_specification += spec.format(
126 |                 time_period="**UNKNOWN**",
127 |                 datetime="**UNKNOWN**",
128 |                 component="**UNKNOWN**",
129 |                 reason="**UNKNOWN**",
130 |             )
131 |             output_specification += "\n"
132 |         output_specification = output_specification.strip() + "\n```"
133 |
134 |         prompt = [
135 |             {'role': 'system', 'content': system},
136 |             {'role': 'user', 'content': user.format(input_specification=input_specification,
137 |                                                     output_specification=output_specification)},
138 |         ]
139 |
140 |         print(scoring_points)
141 |
142 |         for i in range(3):
143 |             try:
144 |                 instruction = get_chat_completion(
145 |                     messages=prompt,
146 |                     temperature=1.0
147 |                 )
148 |                 instruction = json.loads(instruction)['issue']
149 |                 break
150 |             except Exception as e:
151 |                 print(e)
152 |                 continue
153 |
154 |
155 |         new_df = pd.DataFrame([{"task_index": task_index,
156 |                                 "instruction": instruction,
157 |                                 "scoring_points": scoring_points}])
158 |         df = pd.concat([df, new_df],
159 |                        ignore_index=True)
160 |
161 |         df.to_csv(query_path, index=False)
162 |         print(f"generated: {task_index}")
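# NOTE (illustrative annotation, not part of the original file): for a single
# failure with reason "CPU fault" and task_2 from main/task_specification.json,
# the scoring_points string built above is exactly:
#
#   "The only predicted root cause reason is CPU fault\n"
#
# which is the format the regexes in main/evaluate.py parse back out.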
163 |
164 | template = """\t"{idx}": {{
165 |     "root cause occurrence datetime": {datetime},
166 |     "root cause component": {component},
167 |     "root cause reason": {reason},
168 | }},\n"""
169 |
170 | key_field = ["root cause occurrence datetime", "root cause component", "root cause reason"]
171 |
172 |
173 | if __name__ == '__main__':
174 |     """
175 |     Generate the query based on the task specification and save it to the corresponding file
176 |     args:
177 |         d: bool, whether to use the default setting or not
178 |         s: str, the path of the task specification config
179 |         r: list, a list of record files to generate queries from
180 |         q: list, a list of query files to save
181 |         e: list, a list of extra specs you want to add in your query (in addition to the spec in the json config). If you don't want to add an extra spec, just leave it None.
182 |         t: str, timezone of the location where telemetry is collected
183 |     """
184 |
185 |     parser = argparse.ArgumentParser()
186 |     parser.add_argument("-d", type=bool, default=False, help="default setting or not")
187 |     parser.add_argument("-s", type=str, help="the path of the task specification config")
188 |     parser.add_argument("-r", type=str, nargs='+', help="a list of record files to generate query")
189 |     parser.add_argument("-q", type=str, nargs='+', help="a list of query files to save")
190 |     parser.add_argument("-e", type=str, nargs='+', help="a list of extra spec you want to add in your query")
191 |     parser.add_argument("-t", type=str, help="timezone of the location where telemetry is collected")
192 |     args = parser.parse_args()
193 |
194 |     if args.d:
195 |         spec_path = 'main/task_specification.json'
196 |         record_path_list = [
197 |             'dataset/Market/cloudbed-1/record.csv',
198 |             'dataset/Market/cloudbed-2/record.csv',
199 |             'dataset/Bank/record.csv',
200 |             'dataset/Telecom/record.csv',
201 |         ]
202 |         extra_spec_list = [
203 |             "system: cloudbed-1",
204 |             "system: cloudbed-2",
205 |             None,
206 |             None,
207 |         ]
208 |         query_path_list = [
209 |             'dataset/Market/cloudbed-1/query.csv',
210 |             'dataset/Market/cloudbed-2/query.csv',
211 |             'dataset/Bank/query.csv',
212 |             'dataset/Telecom/query.csv',
213 |         ]
214 |         timezone = pytz.timezone('Asia/Shanghai')
215 |
216 |     else:
217 |         spec_path = args.s
218 |         record_path_list = args.r
219 |         extra_spec_list = args.e if args.e else [None] * len(args.r)
220 |         query_path_list = args.q
221 |         timezone = pytz.timezone(args.t)
222 |
223 |     data_list = list(zip(record_path_list, extra_spec_list, query_path_list))
224 |
225 |     for record_path, extra_spec, query_path in data_list:
226 |         print("processing: ", record_path)
227 |         query_generate(record_path, spec_path, extra_spec, query_path, timezone)
--------------------------------------------------------------------------------
/main/prompt.py:
--------------------------------------------------------------------------------
1 | system = """Let's play a game. In this game, your task is to generate an issue related to DevOps failure diagnosis based on a given set of specifications. The goal is to make the issue realistic enough that even top human experts might believe it reflects a genuine issue an engineer could encounter at work. They should not be able to tell that the issue was generated by an AI Assistant based on specifications.
2 |
3 | The specifications provided to you include the following components:
4 |
5 | ```known
6 | (The known information explicitly provided in the issue.)
7 | ```
8 |
9 | ```query
10 | (The target query that the user is required to answer.)
11 | ```
12 |
13 | Your response should follow the JSON format below:
14 |
15 | {
16 |     "issue": (Your generated issue based on the specifications.)
17 | }
18 | (DO NOT contain "```json" and "```" tags. DO contain the JSON object with the brackets "{}" only.)
19 |
20 | For example, if the following specifications are given:
21 |
22 | ```known
23 | - number of failures: 1
24 | - time range: 2022-03-21 11:30:00 to 2022-03-21 12:00:00
25 | - system: None
26 | ```
27 |
28 | ```query
29 | - root cause occurrence time: **UNKNOWN**
30 | ```
31 |
32 | Then, you could generate an issue like:
33 |
34 | {
35 |     "issue": "During the specified time range of March 21, 2022, from 11:30 to 12:00, the cloud service system experienced a failure. The exact time of the root cause occurrence is unknown, which complicates the diagnosis process. Please pinpoint the root cause occurrence datetime."
36 | }
37 |
38 | Here is another example:
39 |
40 | ```known
41 | - number of failures: 2
42 | - time range: 2022-03-20 09:30:00 to 2022-03-20 10:00:00
43 | - system: cloudbed-1
44 | ```
45 |
46 | ```query
47 | - root cause occurrence time: **UNKNOWN**
48 | - root cause component: **UNKNOWN**
49 | - root cause reason: **UNKNOWN**
50 | ```
51 |
52 | The generated issue would look like:
53 |
54 | {
55 |     "issue": "The cloud service system, cloudbed-1, may have experienced two failures within the time range of March 20, 2022, from 09:30 to 10:00. The exact number of failures, the time of occurrence, the affected components, and the underlying reasons for these failures are currently unknown. You are tasked with identifying the root cause occurrence datetime, the root cause component, and the root cause reason."
56 | }
57 |
58 | Some rules to follow:
59 |
60 | 1. Do not tell the user "how to solve the issue" (e.g., retrieve the telemetry data like metrics/logs/traces).
61 | 2. Do not involve human interaction in the issue (e.g., "ask the engineer for more information").
62 | 3. Do not include any specific values that are not mentioned in the specification (e.g., "the CPU usage was 80%").
63 |
64 | Now, let's get started!"""
65 |
66 | user = """Please generate an issue related to DevOps failure diagnosis based on the following specifications:
67 |
68 | ```known
69 | {input_specification}
70 | ```
71 |
72 | ```query
73 | {output_specification}
74 | ```"""
--------------------------------------------------------------------------------
/main/task_specification.json:
--------------------------------------------------------------------------------
1 | {
2 |     "task_1": {
3 |         "input": [
4 |             "time range: {time_period}",
5 |             "number of failures: {num}"
6 |         ],
7 |         "output": [
8 |             "root cause occurrence time: {datetime}"
9 |         ],
10 |         "scoring_points": [
11 |             "The {idx} root cause occurrence time is within 1 minutes (i.e., <=1min) of {datetime}"
12 |         ]
13 |     },
14 |     "task_2": {
15 |         "input": [
16 |             "time range: {time_period}",
17 |             "number of failures: {num}"
18 |         ],
19 |         "output": [
20 |             "root cause reason: {reason}"
21 |         ],
22 |         "scoring_points": [
23 |             "The {idx} predicted root cause reason is {reason}"
24 |         ]
25 |     },
26 |     "task_3": {
27 |         "input": [
28 |             "time range: {time_period}",
29 |             "number of failures: {num}"
30 |         ],
31 |         "output": [
32 |             "root cause component: {component}"
33 |         ],
34 |         "scoring_points": [
35 |             "The {idx} predicted root cause component is {component}"
36 |         ]
37 |     },
38 |     "task_4": {
39 |         "input": [
40 |             "time range: {time_period}",
41 |             "number of failures: {num}"
42 |         ],
43 |         "output": [
44 |             "root cause occurrence time: {datetime}",
45 |             "root cause reason: {reason}"
46 |         ],
47 |         "scoring_points": [
48 |             "The {idx} root cause occurrence time is within 1 minutes (i.e., <=1min) of {datetime}",
49 |             "The {idx} predicted root cause reason is {reason}"
50 |         ]
51 |     },
52 |     "task_5": {
53 |         "input": [
54 |             "time range: {time_period}",
55 |             "number of failures: {num}"
56 |         ],
57 |         "output": [
58 |             "root cause occurrence time: {datetime}",
59 |             "root cause component: {component}"
60 |         ],
61 |         "scoring_points": [
62 |             "The {idx} root cause occurrence time is within 1 minutes (i.e., <=1min) of {datetime}",
63 |             "The {idx} predicted root cause component is {component}"
64 |         ]
65 |     },
66 |     "task_6": {
67 |         "input": [
68 |             "time range: {time_period}",
69 |             "number of failures: {num}"
70 |         ],
71 |         "output": [
72 |             "root cause component: {component}",
73 |             "root cause reason: {reason}"
74 |         ],
75 |         "scoring_points": [
76 |             "The {idx} predicted root cause component is {component}",
77 |             "The {idx} predicted root cause reason is {reason}"
78 |         ]
79 |     },
80 |     "task_7": {
81 |         "input": [
82 |             "time range: {time_period}",
83 |             "number of failures: {num}"
84 |         ],
85 |         "output": [
86 |             "root cause component: {component}",
87 |             "root cause occurrence time: {datetime}",
88 |             "root cause reason: {reason}"
89 |         ],
90 |         "scoring_points": [
91 |             "The {idx} root cause occurrence time is within 1 minutes (i.e., <=1min) of {datetime}",
92 |             "The {idx} predicted root cause component is {component}",
93 |             "The {idx} predicted root cause reason is {reason}"
94 |         ]
95 |     }
96 | }
--------------------------------------------------------------------------------
/rca/api_config.yaml:
--------------------------------------------------------------------------------
1 | SOURCE: "OpenAI"
2 | MODEL: "gpt-4o-2024-05-13"
3 | API_KEY: "sk-xxxxxxxxxxxxxx"
4 | API_BASE: ""
--------------------------------------------------------------------------------
/rca/api_router.py:
--------------------------------------------------------------------------------
1 | import os
2 | import yaml
3 | import time
4 |
-------------------------------------------------------------------------------- /rca/archive/agent-Telecom.csv: -------------------------------------------------------------------------------- 1 | instruction,prediction,groundtruth,passed,failed,score,row_id,task_index 2 | "During the specified time range of April 11, 2020, from 00:00 to 00:30, there was one failure reported. The exact reason for this failure is currently unknown. 
Your task is to identify the root cause reason for the failure during this period.","{ 3 | ""1"": { 4 | ""root cause occurrence datetime"": ""2020-04-11 00:01:22"", 5 | ""root cause component"": ""os_015"", 6 | ""root cause reason"": ""CPU fault"" 7 | } 8 | }","level: pod 9 | reason: CPU fault 10 | component: docker_003 11 | timestamp: 1586534700 12 | datetime: 2020-04-11 00:05:00",CPU fault,,1.0,0.0,task_2 13 | "Within the time range of April 11, 2020, from 01:30 to 02:00, there was one failure in the system. The exact time of the root cause occurrence and the reason behind this failure are currently unknown. You are required to identify the root cause occurrence datetime and the root cause reason.","{ 14 | ""1"": { 15 | ""root cause occurrence datetime"": ""2020-04-11 01:44:00"", 16 | ""root cause component"": ""os_021"", 17 | ""root cause reason"": ""network delay"" 18 | } 19 | }","level: pod 20 | reason: CPU fault 21 | component: docker_004 22 | timestamp: 1586540400 23 | datetime: 2020-04-11 01:40:00",,"2020-04-11 01:40:00 24 | CPU fault",0.0,1.0,task_4 25 | "On April 11, 2020, between 02:00 and 02:30, a failure occurred in the system. The number of failures recorded within this time range is 1. The reason for the root cause of this failure is currently unknown and needs to be determined.","{ 26 | ""1"": { 27 | ""root cause occurrence datetime"": ""2020-04-11 02:00:00"", 28 | ""root cause component"": ""db_007"", 29 | ""root cause reason"": ""db connection limit"" 30 | } 31 | }","level: service 32 | reason: db connection limit 33 | component: db_007 34 | timestamp: 1586542500 35 | datetime: 2020-04-11 02:15:00",db connection limit,,1.0,2.0,task_2 36 | "A single failure was detected during the time range of April 11, 2020, from 02:30 to 03:00. The specific reason for this failure is currently unknown. Your task is to determine the root cause reason.","{ 37 | ""1"": { 38 | ""root cause component"": ""os_014"", 39 | ""root cause reason"": ""CPU fault"" 40 | } 41 | }","level: pod 42 | reason: CPU fault 43 | component: docker_002 44 | timestamp: 1586544600 45 | datetime: 2020-04-11 02:50:00",CPU fault,,1.0,3.0,task_2 46 | "On April 11, 2020, between 04:30 and 05:00, there was a reported failure in the system. The exact time of the root cause occurrence, the affected component, and the underlying reason for this failure are currently unknown. You are tasked with identifying the root cause occurrence datetime, the root cause component, and the root cause reason.","{ 47 | ""1"": { 48 | ""root cause occurrence datetime"": ""2020-04-11 04:29:00"", 49 | ""root cause component"": ""db_004"", 50 | ""root cause reason"": ""db connection limit"" 51 | } 52 | }","level: pod 53 | reason: CPU fault 54 | component: docker_008 55 | timestamp: 1586551200 56 | datetime: 2020-04-11 04:40:00",,"docker_008 57 | 2020-04-11 04:40:00 58 | CPU fault",0.0,4.0,task_7 59 | "Within the specified time range of April 11, 2020, from 05:30 to 06:00, a failure was detected in the system. The precise time at which the root cause of the failure occurred is currently unknown. 
Please determine the exact root cause occurrence datetime to assist in diagnosing the issue.","{ 60 | ""1"": { 61 | ""root cause occurrence datetime"": ""2020-04-11 05:45:58"", 62 | ""root cause component"": ""db_003"", 63 | ""root cause reason"": ""db connection limit"" 64 | } 65 | }","level: service 66 | reason: db close 67 | component: db_003 68 | timestamp: 1586555100 69 | datetime: 2020-04-11 05:45:00",2020-04-11 05:45:00,,1.0,5.0,task_1 70 | "There was a single failure in the system within the time range of May 22, 2020, from 00:00 to 00:30. The precise time when the root cause of the failure occurred is unknown, and the reason behind the failure also remains unidentified. You are tasked with determining the exact root cause occurrence time and the underlying reason for the failure.","{ 71 | ""1"": { 72 | ""root cause occurrence datetime"": ""2020-05-22 00:11:38"", 73 | ""root cause component"": ""db_011"", 74 | ""root cause reason"": ""db connection limit"" 75 | } 76 | }","level: service 77 | reason: db close 78 | component: db_003 79 | timestamp: 1590077220 80 | datetime: 2020-05-22 00:07:00",,"2020-05-22 00:07:00 81 | db close",0.0,6.0,task_4 82 | "During the specified time range of May 22, 2020, from 00:30 to 01:00, the system experienced a single failure. The exact time of occurrence, affected component, and the underlying reason for this failure are currently unknown. Please pinpoint the root cause occurrence datetime, the root cause component, and the root cause reason.","{ 83 | ""1"": { 84 | ""root cause occurrence datetime"": ""2020-05-22 00:59:00"", 85 | ""root cause component"": ""os_022"", 86 | ""root cause reason"": ""CPU fault"" 87 | } 88 | }","level: pod 89 | reason: CPU fault 90 | component: docker_001 91 | timestamp: 1590079680 92 | datetime: 2020-05-22 00:48:00",CPU fault,"2020-05-22 00:48:00 93 | docker_001",0.3333333333333333,7.0,task_7 94 | "During the time range of May 22, 2020, from 01:30 to 02:00, there was one failure reported. The exact time of the root cause occurrence and the affected component are currently unknown. Please identify the root cause occurrence datetime and the root cause component.","{ 95 | ""1"": { 96 | ""root cause occurrence datetime"": ""2020-05-22 01:32:00"", 97 | ""root cause component"": ""db_003"", 98 | ""root cause reason"": ""db connection limit"" 99 | } 100 | }","level: node 101 | reason: network delay 102 | component: os_018 103 | timestamp: 1590083280 104 | datetime: 2020-05-22 01:48:00",,"2020-05-22 01:48:00 105 | os_018",0.0,8.0,task_5 106 | "On May 22, 2020, between 02:00 and 02:30, the system encountered a failure. The exact time of the root cause occurrence is currently unknown. Please determine the root cause occurrence datetime.","{ 107 | ""1"": { 108 | ""root cause occurrence datetime"": ""2020-05-22 02:22:31"", 109 | ""root cause component"": ""docker_003"", 110 | ""root cause reason"": ""CPU fault"" 111 | } 112 | }","level: pod 113 | reason: CPU fault 114 | component: docker_005 115 | timestamp: 1590085080 116 | datetime: 2020-05-22 02:18:00",,2020-05-22 02:18:00,0.0,9.0,task_1 117 | "During the specified time range of May 23, 2020, from 00:00 to 00:30, the system experienced a failure. The exact time of the root cause occurrence is unknown, complicating the diagnosis process. 
Please pinpoint the root cause occurrence datetime.","{ 118 | ""1"": { 119 | ""root cause occurrence datetime"": ""2020-05-23 00:05:00"", 120 | ""root cause component"": ""os_021"", 121 | ""root cause reason"": ""CPU fault"" 122 | } 123 | }","level: pod 124 | reason: CPU fault 125 | component: docker_004 126 | timestamp: 1590163500 127 | datetime: 2020-05-23 00:05:00",2020-05-23 00:05:00,,1.0,10.0,task_1 128 | "During the specified time range of May 23, 2020, from 00:30 to 01:00, the system experienced one failure. The exact component affected and the reason for this failure are unknown. Please identify the root cause component and the root cause reason.","{ 129 | ""1"": { 130 | ""root cause component"": ""db_007"", 131 | ""root cause reason"": ""db connection limit"" 132 | } 133 | }","level: service 134 | reason: db connection limit 135 | component: db_003 136 | timestamp: 1590165600 137 | datetime: 2020-05-23 00:40:00",db connection limit,db_003,0.5,11.0,task_6 138 | "There was one failure reported within the specified time range of May 23, 2020, from 01:00 to 01:30. The component responsible for the failure, the exact time of the root cause occurrence, and the underlying reason for this failure are currently unknown. Please identify the root cause component, the root cause occurrence datetime, and the root cause reason.","{ 139 | ""1"": { 140 | ""root cause occurrence datetime"": ""2020-05-23 00:54:13"", 141 | ""root cause component"": ""os_021"", 142 | ""root cause reason"": ""network delay"" 143 | } 144 | }","level: node 145 | reason: network loss 146 | component: os_021 147 | timestamp: 1590167760 148 | datetime: 2020-05-23 01:16:00",os_021,"2020-05-23 01:16:00 149 | network loss",0.3333333333333333,12.0,task_7 150 | "Within the specified time range of May 23, 2020, from 01:30 to 02:00, there was one failure. The exact time of the root cause occurrence is unknown, creating a challenge for failure diagnosis. Please identify the root cause occurrence datetime.","{ 151 | ""1"": { 152 | ""root cause occurrence datetime"": ""2020-05-23 01:45:25"", 153 | ""root cause component"": ""db_002"", 154 | ""root cause reason"": ""db connection limit"" 155 | } 156 | }","level: node 157 | reason: network loss 158 | component: os_009 159 | timestamp: 1590169920 160 | datetime: 2020-05-23 01:52:00",,2020-05-23 01:52:00,0.0,13.0,task_1 161 | "On May 23, 2020, between 03:30 and 04:00, there was a single failure observed in the system. The exact time when the root cause occurred is currently unknown. Please investigate and determine the root cause occurrence datetime.","{ 162 | ""1"": { 163 | ""root cause occurrence datetime"": ""2020-05-23 03:33:26"", 164 | ""root cause component"": ""db_011"", 165 | ""root cause reason"": ""db connection limit"" 166 | } 167 | }","level: node 168 | reason: network delay 169 | component: os_021 170 | timestamp: 1590176160 171 | datetime: 2020-05-23 03:36:00",,2020-05-23 03:36:00,0.0,14.0,task_1 172 | "Within the time range of May 23, 2020, from 04:30 to 05:00, there was one failure reported. The component responsible for this failure, the exact time of occurrence, and the underlying reason are currently unknown. 
You are tasked with identifying the root cause occurrence datetime, the root cause component, and the root cause reason.","{ 173 | ""1"": { 174 | ""root cause occurrence datetime"": ""2020-05-23 04:44:23"", 175 | ""root cause component"": ""os_021"", 176 | ""root cause reason"": ""network delay"" 177 | } 178 | }","level: pod 179 | reason: CPU fault 180 | component: docker_006 181 | timestamp: 1590180420 182 | datetime: 2020-05-23 04:47:00",,"2020-05-23 04:47:00 183 | docker_006 184 | CPU fault",0.0,15.0,task_7 185 | "During the specified time range of May 24, 2020, from 01:30 to 02:00, the system experienced a failure. The exact time of the root cause occurrence, the affected component, and the underlying reason for this failure are currently unknown. You are tasked with identifying the root cause occurrence datetime, the root cause component, and the root cause reason.","{ 186 | ""1"": { 187 | ""root cause occurrence datetime"": ""2020-05-24 01:30:00"", 188 | ""root cause component"": ""os_022"", 189 | ""root cause reason"": ""CPU fault"" 190 | } 191 | }","level: service 192 | reason: db close 193 | component: db_007 194 | timestamp: 1590256020 195 | datetime: 2020-05-24 01:47:00",,"db_007 196 | 2020-05-24 01:47:00 197 | db close",0.0,16.0,task_7 198 | "A failure occurred during the time range of May 24, 2020, from 03:00 to 03:30. The specific reason for this failure is currently unknown, and further diagnosis is required. Please identify the root cause reason.","{ 199 | ""1"": { 200 | ""root cause component"": ""os_012"", 201 | ""root cause reason"": ""CPU fault"" 202 | } 203 | }","level: pod 204 | reason: CPU fault 205 | component: docker_003 206 | timestamp: 1590261420 207 | datetime: 2020-05-24 03:17:00",CPU fault,,1.0,17.0,task_2 208 | "Within the specified time range of May 24, 2020, from 04:00 to 04:30, there was a single failure detected in the system. The specific reason behind this failure is currently unknown. Your task is to determine the root cause reason for this failure.","{ 209 | ""1"": { 210 | ""root cause component"": ""db_013"", 211 | ""root cause reason"": ""CPU fault"" 212 | } 213 | }","level: pod 214 | reason: CPU fault 215 | component: docker_006 216 | timestamp: 1590265020 217 | datetime: 2020-05-24 04:17:00",CPU fault,,1.0,18.0,task_2 218 | "On May 25, 2020, between 01:30 and 02:00, a failure occurred within the system. The exact time at which the root cause occurred and the reason for this failure are currently unknown. Please identify the root cause occurrence datetime and the root cause reason.","{ 219 | ""1"": { 220 | ""root cause occurrence datetime"": ""2020-05-25 01:47:00"", 221 | ""root cause reason"": ""network delay"" 222 | } 223 | }","level: node 224 | reason: network delay 225 | component: os_020 226 | timestamp: 1590342420 227 | datetime: 2020-05-25 01:47:00","network delay 228 | 2020-05-25 01:47:00",,1.0,19.0,task_4 229 | "There was a failure that occurred within the specified time range of May 25, 2020, from 03:30 to 04:00. The exact reason for this failure is unknown, complicating the failure diagnosis process. 
You are tasked with identifying the root cause reason.","{ 230 | ""1"": { 231 | ""root cause occurrence datetime"": ""2020-05-25 03:23:57"", 232 | ""root cause component"": ""os_009"", 233 | ""root cause reason"": ""CPU fault"" 234 | } 235 | }","level: node 236 | reason: network loss 237 | component: os_017 238 | timestamp: 1590349620 239 | datetime: 2020-05-25 03:47:00",,network loss,0.0,20.0,task_2 240 | "During the specified time range of May 25, 2020, from 04:30 to 05:00, one failure was detected in the system. However, the exact component involved, the time of occurrence, and the underlying reason for this failure are currently unknown. Please identify the root cause component, the root cause occurrence time, and the root cause reason.","{ 241 | ""1"": { 242 | ""root cause occurrence datetime"": ""2020-05-25 04:44:05"", 243 | ""root cause component"": ""db_008"", 244 | ""root cause reason"": ""db connection limit"" 245 | } 246 | }","level: service 247 | reason: db close 248 | component: db_003 249 | timestamp: 1590353220 250 | datetime: 2020-05-25 04:47:00",,"2020-05-25 04:47:00 251 | db_003 252 | db close",0.0,21.0,task_7 253 | "During the time range of May 26, 2020, from 02:00 to 02:30, the system experienced one failure. The exact time when the root cause occurred and the reason for this failure are currently unknown. Please identify the root cause occurrence datetime and the root cause reason.","{ 254 | ""1"": { 255 | ""root cause occurrence datetime"": ""2020-05-26 01:54:25"", 256 | ""root cause component"": ""db_011"", 257 | ""root cause reason"": ""db connection limit"" 258 | } 259 | }","level: service 260 | reason: db connection limit 261 | component: db_007 262 | timestamp: 1590430140 263 | datetime: 2020-05-26 02:09:00",db connection limit,2020-05-26 02:09:00,0.5,22.0,task_4 264 | "During the specified time range of May 26, 2020, from 02:30 to 03:00, there was one failure observed in the system. The exact time of the root cause occurrence is unknown. Please determine the root cause occurrence datetime.","{ 265 | ""1"": { 266 | ""root cause occurrence datetime"": ""2020-05-26 02:20:01"", 267 | ""root cause component"": ""os_021"", 268 | ""root cause reason"": ""network delay"" 269 | } 270 | }","level: pod 271 | reason: CPU fault 272 | component: docker_008 273 | timestamp: 1590431940 274 | datetime: 2020-05-26 02:39:00",,2020-05-26 02:39:00,0.0,23.0,task_1 275 | "There was a single system failure during the specified time range of May 26, 2020, from 03:00 to 03:30. The root cause of this failure remains unknown. Identifying the underlying reason for this failure is necessary to proceed with troubleshooting.","{ 276 | ""1"": { 277 | ""root cause occurrence datetime"": ""2020-05-26 03:10:00"", 278 | ""root cause component"": ""os_021"", 279 | ""root cause reason"": ""network loss"" 280 | } 281 | }","level: node 282 | reason: network delay 283 | component: os_021 284 | timestamp: 1590433740 285 | datetime: 2020-05-26 03:09:00",,network delay,0.0,24.0,task_2 286 | "During the time range of May 26, 2020, from 03:30 to 04:00, the system experienced a failure. The exact occurrence time and reason behind the root cause of this failure are unknown, which hinders the troubleshooting process. 
Your task is to identify the root cause occurrence datetime and the reason behind the failure.","{ 287 | ""1"": { 288 | ""root cause occurrence datetime"": ""2020-05-26 03:40:32"", 289 | ""root cause component"": ""os_018"", 290 | ""root cause reason"": ""network delay"" 291 | } 292 | }","level: node 293 | reason: network loss 294 | component: os_018 295 | timestamp: 1590435540 296 | datetime: 2020-05-26 03:39:00",,"network loss 297 | 2020-05-26 03:39:00",0.0,25.0,task_4 298 | "During the specified time range of May 26, 2020, from 04:00 to 04:30, the system experienced a failure. The root cause component responsible for this failure is currently unknown. Please identify the root cause component.","{ 299 | ""1"": { 300 | ""root cause component"": ""os_020"", 301 | ""root cause reason"": ""network delay"" 302 | } 303 | }","level: node 304 | reason: network delay 305 | component: os_020 306 | timestamp: 1590437700 307 | datetime: 2020-05-26 04:15:00",os_020,,1.0,26.0,task_3 308 | "During the specified time range of May 26, 2020, from 04:30 to 05:00, a single failure occurred. The exact time of the root cause occurrence, the affected component, and the underlying reason for this failure are currently unknown. You are tasked with identifying the root cause occurrence datetime, the root cause component, and the root cause reason.","{ 309 | ""1"": { 310 | ""root cause occurrence datetime"": ""2020-05-26 04:45:00"", 311 | ""root cause component"": ""os_021"", 312 | ""root cause reason"": ""CPU fault"" 313 | } 314 | }","level: node 315 | reason: network delay 316 | component: os_001 317 | timestamp: 1590439500 318 | datetime: 2020-05-26 04:45:00",2020-05-26 04:45:00,"network delay 319 | os_001",0.3333333333333333,27.0,task_7 320 | "Within the specified time range of May 26, 2020, from 05:00 to 05:30, there was a single failure detected in the system. However, the details about the root cause component, the exact time of the root cause occurrence, and the underlying reason for the failure are currently unknown. You need to identify the root cause component, the occurrence time, and the reason for this failure.","{ 321 | ""1"": { 322 | ""root cause occurrence datetime"": ""2020-05-26 05:00:00"", 323 | ""root cause component"": ""db_002"", 324 | ""root cause reason"": ""db connection limit"" 325 | } 326 | }","level: pod 327 | reason: CPU fault 328 | component: docker_002 329 | timestamp: 1590441300 330 | datetime: 2020-05-26 05:15:00",,"docker_002 331 | CPU fault 332 | 2020-05-26 05:15:00",0.0,28.0,task_7 333 | "A single failure was detected within the given time range of May 27, 2020, from 00:00 to 00:30. The specific time of the root cause occurrence and the underlying reason for this failure are currently unknown. Your task is to determine the root cause occurrence datetime and identify the root cause reason.","{ 334 | ""1"": { 335 | ""root cause occurrence datetime"": ""2020-05-27 00:00:00"", 336 | ""root cause component"": ""os_012"", 337 | ""root cause reason"": ""network loss"" 338 | } 339 | }","level: node 340 | reason: network delay 341 | component: os_018 342 | timestamp: 1590510180 343 | datetime: 2020-05-27 00:23:00",,"network delay 344 | 2020-05-27 00:23:00",0.0,29.0,task_4 345 | "There was one failure within the specified time range of May 27, 2020, from 00:30 to 01:00. The exact component responsible for this failure is currently unknown. 
Please identify the root cause component.","{ 346 | ""1"": { 347 | ""root cause component"": ""os_003"", 348 | ""root cause reason"": ""network delay"" 349 | } 350 | }","level: node 351 | reason: network delay 352 | component: os_017 353 | timestamp: 1590511980 354 | datetime: 2020-05-27 00:53:00",,os_017,0.0,30.0,task_3 355 | "During the specified time range of May 27, 2020, from 01:00 to 01:30, there was a failure reported in the system. The exact occurrence time of the root cause and the underlying reason for the failure are currently unknown. Please identify the root cause occurrence datetime and the root cause reason.","{ 356 | ""1"": { 357 | ""root cause occurrence datetime"": ""2020-05-27 00:52:38"", 358 | ""root cause component"": ""db_009"", 359 | ""root cause reason"": ""db connection limit"" 360 | } 361 | }","level: pod 362 | reason: CPU fault 363 | component: docker_006 364 | timestamp: 1590513780 365 | datetime: 2020-05-27 01:23:00",,"2020-05-27 01:23:00 366 | CPU fault",0.0,31.0,task_4 367 | "During the specified time range of May 27, 2020, from 01:30 to 02:00, the system experienced a failure. The exact root cause component and the underlying reason for this failure are currently unknown. Please identify the root cause component and the root cause reason.","{ 368 | ""1"": { 369 | ""root cause occurrence datetime"": ""2020-05-27 01:24:00"", 370 | ""root cause component"": ""os_022"", 371 | ""root cause reason"": ""CPU fault"" 372 | } 373 | }","level: service 374 | reason: db connection limit 375 | component: db_003 376 | timestamp: 1590515580 377 | datetime: 2020-05-27 01:53:00",,"db_003 378 | db connection limit",0.0,32.0,task_6 379 | "During the specified time range of May 27, 2020, from 02:00 to 02:30, the system experienced one failure. The exact component where the failure originated and the underlying reason for this failure are currently unknown. You are tasked with identifying the root cause component and the root cause reason.","{ 380 | ""1"": { 381 | ""root cause component"": ""os_021"", 382 | ""root cause reason"": ""CPU fault"" 383 | } 384 | }","level: service 385 | reason: db close 386 | component: db_007 387 | timestamp: 1590517380 388 | datetime: 2020-05-27 02:23:00",,"db_007 389 | db close",0.0,33.0,task_6 390 | "Within the time range of May 27, 2020, from 02:30 to 03:00, there was one failure detected in the system. The specific component responsible for this failure, the exact occurrence time, and the underlying reason for the failure are all currently unknown. You are tasked with identifying the root cause component, the root cause occurrence datetime, and the root cause reason.","{ 391 | ""1"": { 392 | ""root cause occurrence datetime"": ""2020-05-27 02:30:00"", 393 | ""root cause component"": ""db_003"", 394 | ""root cause reason"": ""db connection limit"" 395 | } 396 | }","level: service 397 | reason: db connection limit 398 | component: db_007 399 | timestamp: 1590519180 400 | datetime: 2020-05-27 02:53:00",db connection limit,"db_007 401 | 2020-05-27 02:53:00",0.3333333333333333,34.0,task_7 402 | "During the time range of May 27, 2020, from 05:00 to 05:30, a single failure occurred within the system. The specific component responsible for this failure, as well as the underlying reason, are currently unknown. 
Your task is to identify the root cause component and determine the root cause reason.","{ 403 | ""1"": { 404 | ""root cause component"": ""db_007"", 405 | ""root cause reason"": ""db connection limit"" 406 | } 407 | }","level: pod 408 | reason: CPU fault 409 | component: docker_001 410 | timestamp: 1590527340 411 | datetime: 2020-05-27 05:09:00",,"CPU fault 412 | docker_001",0.0,35.0,task_6 413 | "There was one failure recorded during the time range of May 27, 2020, from 05:30 to 06:00. The specific time when the root cause occurred and the reason for this failure are currently unknown. You need to identify the root cause occurrence datetime and the root cause reason.","{ 414 | ""1"": { 415 | ""root cause occurrence datetime"": ""2020-05-27 05:24:03"", 416 | ""root cause component"": ""os_009"", 417 | ""root cause reason"": ""CPU fault"" 418 | } 419 | }","level: node 420 | reason: network delay 421 | component: os_021 422 | timestamp: 1590529140 423 | datetime: 2020-05-27 05:39:00",,"2020-05-27 05:39:00 424 | network delay",0.0,36.0,task_4 425 | "During the specified time range of May 28, 2020, from 00:30 to 01:00, the system experienced a single failure. The root cause of this failure, however, is currently unknown. Please identify the reason behind this failure.","{ 426 | ""1"": { 427 | ""root cause occurrence datetime"": ""2020-05-28 00:30:00"", 428 | ""root cause component"": ""os_021"", 429 | ""root cause reason"": ""CPU fault"" 430 | } 431 | }","level: pod 432 | reason: CPU fault 433 | component: docker_001 434 | timestamp: 1590598020 435 | datetime: 2020-05-28 00:47:00",CPU fault,,1.0,37.0,task_2 436 | "Between the time range of May 28, 2020, from 03:30 to 04:00, a failure was detected in the system. The root cause of this failure is currently unknown. Your task is to identify the root cause reason for this failure.","{ 437 | ""1"": { 438 | ""root cause component"": ""os_004"", 439 | ""root cause reason"": ""CPU fault"" 440 | } 441 | }","level: node 442 | reason: network loss 443 | component: os_018 444 | timestamp: 1590608820 445 | datetime: 2020-05-28 03:47:00",,network loss,0.0,38.0,task_2 446 | "During the time range of May 29, 2020, from 02:00 to 02:30, the system experienced a failure. The exact component that caused this failure remains unknown. Please identify the root cause component responsible for this failure.","{ 447 | ""1"": { 448 | ""root cause component"": ""db_003"", 449 | ""root cause reason"": ""db connection limit"" 450 | } 451 | }","level: service 452 | reason: db connection limit 453 | component: db_003 454 | timestamp: 1590689460 455 | datetime: 2020-05-29 02:11:00",db_003,,1.0,39.0,task_3 456 | "On May 29, 2020, between 02:30 and 03:00, there was one failure reported. The specific reason for this failure is currently unknown. Your task is to identify the root cause reason behind this failure.","{ 457 | ""1"": { 458 | ""root cause component"": ""docker_002"", 459 | ""root cause reason"": ""CPU fault"" 460 | } 461 | }","level: pod 462 | reason: CPU fault 463 | component: docker_008 464 | timestamp: 1590691260 465 | datetime: 2020-05-29 02:41:00",CPU fault,,1.0,40.0,task_2 466 | "On May 29, 2020, between 03:30 and 04:00, a failure was identified in the system. However, the exact time when the root cause occurred is unknown. 
Please determine the root cause occurrence datetime.","{ 467 | ""1"": { 468 | ""root cause occurrence datetime"": ""2020-05-29 03:20:00"", 469 | ""root cause component"": ""os_018"", 470 | ""root cause reason"": ""network loss"" 471 | } 472 | }","level: pod 473 | reason: CPU fault 474 | component: docker_001 475 | timestamp: 1590694860 476 | datetime: 2020-05-29 03:41:00",,2020-05-29 03:41:00,0.0,41.0,task_1 477 | "On May 30, 2020, between 00:00 to 00:30, a single failure was observed. However, both the exact time of the root cause occurrence and the component involved in this failure are currently unknown. You are tasked with identifying the root cause occurrence datetime and the root cause component.","{ 478 | ""1"": { 479 | ""root cause occurrence datetime"": ""2020-05-30 00:07:00"", 480 | ""root cause component"": ""os_002"", 481 | ""root cause reason"": ""CPU fault"" 482 | } 483 | }","level: node 484 | reason: network loss 485 | component: os_021 486 | timestamp: 1590768780 487 | datetime: 2020-05-30 00:13:00",,"os_021 488 | 2020-05-30 00:13:00",0.0,42.0,task_5 489 | "There was a failure within the time range of May 30, 2020, from 02:30 to 03:00. The exact component responsible for the failure and the underlying reason for this failure are currently unknown. Please identify the root cause component and reason.","{ 490 | ""1"": { 491 | ""root cause component"": ""os_022"", 492 | ""root cause reason"": ""CPU fault"" 493 | } 494 | }","level: node 495 | reason: network delay 496 | component: os_009 497 | timestamp: 1590777780 498 | datetime: 2020-05-30 02:43:00",,"os_009 499 | network delay",0.0,43.0,task_6 500 | "Within the time range of May 30, 2020, from 03:30 to 04:00, there was an occurrence of a single system failure. However, the specific time at which the root cause occurred and the component responsible for this failure remain unidentified. Further investigation is needed to determine the root cause occurrence time and the root cause component.","{ 501 | ""1"": { 502 | ""root cause occurrence datetime"": ""2020-05-30 03:23:43"", 503 | ""root cause component"": ""os_008"", 504 | ""root cause reason"": ""CPU fault"" 505 | } 506 | }","level: node 507 | reason: network delay 508 | component: os_020 509 | timestamp: 1590781380 510 | datetime: 2020-05-30 03:43:00",,"os_020 511 | 2020-05-30 03:43:00",0.0,44.0,task_5 512 | "Within the specified time range of May 30, 2020, from 04:00 to 04:30, there was one failure reported. The exact root cause occurrence time is currently unknown. Please identify the root cause occurrence datetime.","{ 513 | ""1"": { 514 | ""root cause occurrence datetime"": ""2020-05-30 04:06:48"", 515 | ""root cause component"": ""os_021"", 516 | ""root cause reason"": ""network delay"" 517 | } 518 | }","level: pod 519 | reason: CPU fault 520 | component: docker_002 521 | timestamp: 1590783180 522 | datetime: 2020-05-30 04:13:00",,2020-05-30 04:13:00,0.0,45.0,task_1 523 | "Within the time range of May 30, 2020, from 05:00 to 05:30, the system encountered a single failure. The specific component responsible for this failure and the underlying reason remain unknown. 
Please identify the root cause component and the reason for this failure.","{ 524 | ""1"": { 525 | ""root cause component"": ""db_002"", 526 | ""root cause reason"": ""db connection limit"" 527 | } 528 | }","level: node 529 | reason: network loss 530 | component: os_018 531 | timestamp: 1590786780 532 | datetime: 2020-05-30 05:13:00",,"os_018 533 | network loss",0.0,46.0,task_6 534 | "During the time range of May 31, 2020, from 02:30 to 03:00, the system experienced a single failure. The exact component responsible for this failure is currently unknown. Please identify the component that caused the failure during this period.","{ 535 | ""1"": { 536 | ""root cause component"": ""db_003"", 537 | ""root cause reason"": ""db connection limit"" 538 | } 539 | }","level: node 540 | reason: network delay 541 | component: os_021 542 | timestamp: 1590864420 543 | datetime: 2020-05-31 02:47:00",,os_021,0.0,47.0,task_3 544 | "On May 31, 2020, during the time range from 03:00 to 03:30, there was a single failure in the system. The exact time when the root cause occurred is currently unknown. Please identify the root cause occurrence datetime.","{ 545 | ""1"": { 546 | ""root cause occurrence datetime"": ""2020-05-31 02:58:13"", 547 | ""root cause component"": ""db_001"", 548 | ""root cause reason"": ""CPU fault"" 549 | } 550 | }","level: node 551 | reason: network delay 552 | component: os_017 553 | timestamp: 1590866220 554 | datetime: 2020-05-31 03:17:00",,2020-05-31 03:17:00,0.0,48.0,task_1 555 | "During the specified time range of May 31, 2020, from 03:30 to 04:00, there was one failure reported. The exact time when the root cause occurred is unknown, making it challenging to diagnose the issue. Please determine the root cause occurrence datetime.","{ 556 | ""1"": { 557 | ""root cause occurrence datetime"": ""2020-05-31 03:36:15"", 558 | ""root cause component"": ""os_018"", 559 | ""root cause reason"": ""CPU fault"" 560 | } 561 | }","level: service 562 | reason: db connection limit 563 | component: db_003 564 | timestamp: 1590868020 565 | datetime: 2020-05-31 03:47:00",,2020-05-31 03:47:00,0.0,49.0,task_1 566 | "During the specified time range of May 31, 2020, from 04:00 to 04:30, the system experienced a failure. The exact occurrence time and component responsible for the root cause are currently unknown. Please identify the root cause occurrence datetime and the root cause component.","{ 567 | ""1"": { 568 | ""root cause occurrence datetime"": ""2020-05-31 04:19:00"", 569 | ""root cause component"": ""db_001"" 570 | } 571 | }","level: pod 572 | reason: CPU fault 573 | component: docker_004 574 | timestamp: 1590869820 575 | datetime: 2020-05-31 04:17:00",,"docker_004 576 | 2020-05-31 04:17:00",0.0,50.0,task_5 577 |
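Editor's note: the archive above stores graded predictions, one row per issue, with per-element scoring. A small sketch, not part of the repository, for summarizing it with pandas, assuming the repository root as the working directory:

import pandas as pd

df = pd.read_csv("rca/archive/agent-Telecom.csv")
# Mean score and row count per task type, then the overall average.
print(df.groupby("task_index")["score"].agg(["mean", "count"]))
print(f"overall mean score: {df['score'].mean():.3f}")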
-------------------------------------------------------------------------------- /rca/baseline/cot_lm.py: -------------------------------------------------------------------------------- 1 | import tiktoken 2 | from rca.api_router import get_chat_completion 3 | 4 | system = """You will be provided with some telemetry data and an issue statement explaining a root cause analysis problem to resolve. 5 | 6 | {info} 7 | 8 | {data}""" 9 | 10 | user = """Now, I need you to provide a root cause analysis for the following question: 11 | 12 | ```issue 13 | {objective} 14 | ``` 15 | 16 | Note: A root cause is the fundamental factor that triggers a service system failure, causing other system components to exhibit various anomalous behaviors. It consists of three elements: the root cause component, the start time of the root cause occurrence, and the reason for its occurrence. The objective of root cause analysis may vary, aiming to identify one or more of these elements based on the issue. Each failure has only one root cause. However, sometimes a system's abnormal state may be due to multiple simultaneous failures, each with its own root cause. If you find that there is a call relationship between multiple components exhibiting abnormal behavior, these anomalies originate from the same failure, with the component at the downstream end of the call chain being the root cause component. The anomalies in the other components are caused by the failure. If there is no call relationship between the abnormal components, each component may be the root cause of a different failure. Typically, the number of failures occurring within half an hour does not exceed three. 17 | 18 | Please first conduct a comprehensive analysis of the given telemetry data step-by-step in your response. Then, summarize your findings using the following JSON format to provide a concise answer to the given issue at the end of your response. In the summarized answer, you only need to provide the elements asked by the issue and omit the other fields in the JSON. The overall format is as follows: 19 | 20 | {{ 21 | "1": {{ 22 | "root cause occurrence datetime": (if asked by the issue, format: '%Y-%m-%d %H:%M:%S', otherwise omitted), 23 | "root cause component": (if asked by the issue, one selected from the possible root cause component list, otherwise omitted), 24 | "root cause reason": (if asked by the issue, one selected from the possible root cause reason list, otherwise omitted), 25 | }}, (mandatory) 26 | "2": {{ 27 | "root cause occurrence datetime": (if asked by the issue, format: '%Y-%m-%d %H:%M:%S', otherwise omitted), 28 | "root cause component": (if asked by the issue, one selected from the possible root cause component list, otherwise omitted), 29 | "root cause reason": (if asked by the issue, one selected from the possible root cause reason list, otherwise omitted), 30 | }}, (only if the failure number is "unknown" or "more than one" in the issue) 31 | ... (only if the failure number is "unknown" or "more than one" in the issue) 32 | }} 33 | (DO NOT include "```json" and "```" tags. DO include the JSON object with the brackets "{{}}" only.) 34 | 35 | Please follow the format above to provide your response for the current issue. 
36 | 37 | Response below:""" 38 | 39 | class CoTLM: 40 | def __init__(self, oracle, schema) -> None: 41 | self.tokenizer = tiktoken.encoding_for_model("gpt-4o") 42 | self.oracle = oracle 43 | self.schema = schema 44 | 45 | 46 | def run(self, instruction, period_data, sample_interval, logger, max_try=3): 47 | logger.info(f"Objective: {instruction}") 48 | 49 | data = f"""## TELEMETRY DATA (Sampled every {sample_interval/60} min):""" 50 | for key in sorted(period_data.keys()): 51 | value = period_data[key] 52 | data += "\n\n" + "".join([f"### {str(key).upper()} DATA", value]) 53 | logger.debug(f"{str(key).upper()} DATA tokens: {len(self.tokenizer.encode(value))}") 54 | info = self.schema 55 | prompt = [ 56 | {'role': 'system', 'content': system.format(info=info, data=data)}, 57 | {'role': 'user', 'content': user.format(objective=instruction)} 58 | ] 59 | 60 | logger.debug(f"prompt tokens: {len(self.tokenizer.encode(prompt[0]['content']))}") 61 | 62 | for i in range(max_try): 63 | try: 64 | response = get_chat_completion( 65 | messages=prompt, 66 | ) 67 | logger.debug(f"Raw Response:\n{response}") 68 | return response, prompt 69 | except Exception as e: 70 | logger.error(e) 71 | if 'context_length_exceeded' in str(e): 72 | logger.error("Token length exceeds the limit.") 73 | return "EXCEED!", prompt 74 | logger.warning("Max steps reached. Please check the history.") 75 | return "Max steps reached. Please check the history.", prompt 76 | 77 | 78 |
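Editor's note: a sketch of how the CoT baseline above might be invoked; not part of the repository. The oracle argument is unused by run(), and the schema and telemetry strings are placeholders.

import logging
from rca.baseline.cot_lm import CoTLM

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("cot_demo")

cot = CoTLM(oracle=None, schema="(schema text describing the target system's telemetry)")
response, prompt = cot.run(
    instruction="Identify the root cause component between 02:00 and 02:30.",
    period_data={"metric": "\n(pre-serialized KPI table for the queried window)"},
    sample_interval=60,  # seconds; rendered as 'Sampled every 1.0 min' in the prompt
    logger=logger,
)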
-------------------------------------------------------------------------------- /rca/baseline/direct_lm.py: -------------------------------------------------------------------------------- 1 | import tiktoken 2 | from rca.api_router import get_chat_completion 3 | 4 | system = """You will be provided with some telemetry data and an issue statement explaining a root cause analysis problem to resolve. 5 | 6 | {info} 7 | 8 | {data}""" 9 | 10 | user = """Now, I need you to provide a root cause analysis for the following question: 11 | 12 | ```issue 13 | {objective} 14 | ``` 15 | 16 | Note: A root cause is the fundamental factor that triggers a service system failure, causing other system components to exhibit various anomalous behaviors. It consists of three elements: the root cause component, the start time of the root cause occurrence, and the reason for its occurrence. The objective of root cause analysis may vary, aiming to identify one or more of these elements based on the issue. Each failure has only one root cause. However, sometimes a system's abnormal state may be due to multiple simultaneous failures, each with its own root cause. If you find that there is a call relationship between multiple components exhibiting abnormal behavior, these anomalies originate from the same failure, with the component at the downstream end of the call chain being the root cause component. The anomalies in the other components are caused by the failure. If there is no call relationship between the abnormal components, each component may be the root cause of a different failure. Typically, the number of failures occurring within half an hour does not exceed three. 17 | 18 | Your response should be structured into a JSON format, itemising the relevant root cause information you find. You only need to provide the elements asked by the issue and omit the other fields in the JSON. The overall format is as follows: 19 | 20 | {{ 21 | "1": {{ 22 | "root cause occurrence datetime": (if asked by the issue, format: '%Y-%m-%d %H:%M:%S', otherwise omitted), 23 | "root cause component": (if asked by the issue, one selected from the possible root cause component list, otherwise omitted), 24 | "root cause reason": (if asked by the issue, one selected from the possible root cause reason list, otherwise omitted), 25 | }}, (mandatory) 26 | "2": {{ 27 | "root cause occurrence datetime": (if asked by the issue, format: '%Y-%m-%d %H:%M:%S', otherwise omitted), 28 | "root cause component": (if asked by the issue, one selected from the possible root cause component list, otherwise omitted), 29 | "root cause reason": (if asked by the issue, one selected from the possible root cause reason list, otherwise omitted), 30 | }}, (only if the failure number is "unknown" or "more than one" in the issue) 31 | ... (only if the failure number is "unknown" or "more than one" in the issue) 32 | }} 33 | (DO NOT include "```json" and "```" tags. DO include the JSON object with the brackets "{{}}" only.) 34 | 35 | Please follow the format above to provide your response for the current issue. 36 | 37 | Response below:""" 38 | 39 | class DirectLM: 40 | def __init__(self, oracle, schema) -> None: 41 | self.tokenizer = tiktoken.encoding_for_model("gpt-4o") 42 | self.oracle = oracle 43 | self.schema = schema 44 | 45 | 46 | def run(self, instruction, period_data, sample_interval, logger, max_try=3): 47 | logger.info(f"Objective: {instruction}") 48 | 49 | data = f"""## TELEMETRY DATA (Sampled every {sample_interval/60} min):""" 50 | for key in sorted(period_data.keys()): 51 | value = period_data[key] 52 | data += "\n\n" + "".join([f"### {str(key).upper()} DATA", value]) 53 | logger.debug(f"{str(key).upper()} DATA tokens: {len(self.tokenizer.encode(value))}") 54 | info = self.schema 55 | prompt = [ 56 | {'role': 'system', 'content': system.format(info=info, data=data)}, 57 | {'role': 'user', 'content': user.format(objective=instruction)} 58 | ] 59 | 60 | logger.debug(f"prompt tokens: {len(self.tokenizer.encode(prompt[0]['content']))}") 61 | 62 | for i in range(max_try): 63 | try: 64 | response = get_chat_completion( 65 | messages=prompt, 66 | ) 67 | logger.debug(f"Raw Response:\n{response}") 68 | return response, prompt 69 | except Exception as e: 70 | logger.error(e) 71 | if 'context_length_exceeded' in str(e): 72 | logger.error("Token length exceeds the limit.") 73 | return "EXCEED!", prompt 74 | logger.warning("Max steps reached. Please check the history.") 75 | return "Max steps reached. 
Please check the history.", prompt 76 | 77 | 78 | -------------------------------------------------------------------------------- /rca/baseline/oracle_kpis.py: -------------------------------------------------------------------------------- 1 | kpi_Telecom = { 2 | "db": ["Proc_Used_Pct", 3 | "Sess_Connect", 4 | "Proc_User_Used_Pct", 5 | "On_Off_State", 6 | "tnsping_result_time", 7 | ], 8 | "cpu": ["container_cpu_used", 9 | ], 10 | "mem": ["Memory_used_pct", 11 | ], 12 | "io": ["Disk_io_util" 13 | ], 14 | "net": ["Sent_queue", 15 | "Received_queue", 16 | ] 17 | 18 | } 19 | 20 | kpi_Telecom_len = sum(len(v) for v in kpi_Telecom.values()) 21 | 22 | 23 | kpi_Bank = { 24 | "jvm": ["JVM-Operating System_7779_JVM_JVM_CPULoad", 25 | "JVM-Operating System_7778_JVM_JVM_CPULoad", 26 | "JVM-Memory_7778_JVM_Memory_NoHeapMemoryUsed", 27 | "JVM-Memory_7779_JVM_Memory_NoHeapMemoryUsed", 28 | "JVM-Memory_7779_JVM_Memory_HeapMemoryUsage", 29 | "JVM-Memory_7778_JVM_Memory_HeapMemoryUsage", 30 | "JVM-Memory_7778_JVM_Memory_HeapMemoryUsed", 31 | "JVM-Memory_7779_JVM_Memory_HeapMemoryUsed", 32 | ], 33 | 34 | "cpu": ["OSLinux-CPU_CPU_CPUCpuUtil", 35 | "OSLinux-CPU_CPU-0_SingleCpuUtil", 36 | ], 37 | 38 | "mem": ["OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc", 39 | "OSLinux-OSLinux_MEMORY_MEMORY_NoCacheMemPerc", 40 | "OSLinux-OSLinux_MEMORY_MEMORY_MEMFreeMem", 41 | ], 42 | 43 | "io": ["OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKReadWrite", 44 | "OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead", 45 | "OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKReadWrite", 46 | "OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRTps", 47 | "OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKRTps", 48 | "OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKRead", 49 | "OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKBps", 50 | "OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKPercentBusy", 51 | ], 52 | 53 | "net": ["OSLinux-OSLinux_NETWORK_NETWORK_TCP-FIN-WAIT", 54 | "OSLinux-OSLinux_NETWORK_ens160_NETBandwidthUtil", 55 | "OSLinux-OSLinux_NETWORK_NETWORK_TotalTcpConnNum", 56 | "OSLinux-OSLinux_NETWORK_ens160_NETPacketsOut", 57 | "OSLinux-OSLinux_NETWORK_ens160_NETPacketsIn", 58 | "OSLinux-OSLinux_NETWORK_ens160_NETKBTotalPerSec", 59 | "OSLinux-OSLinux_NETWORK_NETWORK_TCP-CLOSE-WAIT" 60 | ] 61 | } 62 | 63 | 64 | kpi_Bank_len = sum([len(v) for v in kpi_Bank.values()]) 65 | 66 | 67 | kpi_Market = { 68 | "process": ["container_threads", 69 | ], 70 | 71 | "cpu": ["container_cpu_usage_seconds", 72 | "system.cpu.pct_usage", 73 | ], 74 | 75 | "mem": ["system.mem.used", 76 | "container_memory_usage_MB", 77 | ], 78 | 79 | "io": ["container_fs_reads./dev/vda", 80 | "container_fs_writes./dev/vda", 81 | "system.io.r_s", 82 | "system.io.w_s", 83 | "container_fs_writes./dev/vda1", 84 | "system.disk.used", 85 | "system.disk.pct_usage", 86 | ], 87 | 88 | "net": ["container_network_receive_packets.eth0", 89 | "container_network_receive_MB.eth0", 90 | "recommendationservice-grpc", 91 | "frontend-http", 92 | "cartservice-grpc", 93 | "checkoutservice-grpc", 94 | "productcatalogservice-grpc", 95 | "emailservice-grpc", 96 | "adservice-grpc", 97 | ], 98 | } 99 | 100 | kpi_Market_len = sum([len(v) for v in kpi_Market.values()]) -------------------------------------------------------------------------------- /rca/baseline/rca_agent/controller.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from IPython.terminal.embed import InteractiveShellEmbed 4 | 5 | from rca.baseline.rca_agent.executor import execute_act 6 | 7 | from 
rca.api_router import get_chat_completion 8 | 9 | system = """You are the Administrator of a DevOps Assistant system for failure diagnosis. To solve each given issue, you should iteratively instruct an Executor to write and execute Python code for data analysis on telemetry files of the target system. By analyzing the execution results, you should approximate the answer step-by-step. 10 | 11 | There is some domain knowledge for you: 12 | 13 | {background} 14 | 15 | {agent} 16 | 17 | The issue you are going to solve is: 18 | 19 | {objective} 20 | 21 | Solve the issue step-by-step. In each step, your response should follow the JSON format below: 22 | 23 | {format} 24 | 25 | Let's begin.""" 26 | 27 | format = """{ 28 | "analysis": (Your analysis of the code execution result from the Executor in the last step, with detailed reasoning about 'what has been done' and 'what can be derived'. Respond 'None' if it is the first step.), 29 | "completed": ("True" if you believe the issue is resolved, and an answer can be derived in the 'instruction' field. Otherwise "False"), 30 | "instruction": (Your instruction for the Executor to perform via code execution in the next step. Do not involve complex multi-step instructions. Keep your instruction atomic, with a clear request of 'what to do' and 'how to do it'. Respond with a summary by yourself if you believe the issue is resolved. Respond with a summary by yourself if you believe the issue is resolved. Respond with a summary by yourself if you believe the issue is resolved.) 31 | } 32 | (DO NOT include "```json" and "```" tags. DO include the JSON object with the brackets "{}" only. Use '\\n' instead of an actual newline character to ensure JSON compatibility when you want to insert a line break within a string.)""" 33 | 34 | summary = """Now, you have decided to finish your reasoning process. You should now provide the final answer to the issue. The candidates of possible root cause components and reasons are provided to you. The root cause components and reasons must be selected from the provided candidates. 35 | 36 | {cand} 37 | 38 | Recall the issue is: {objective} 39 | 40 | Please first review your previous reasoning process to infer an exact answer to the issue. Then, summarize your final answer of the root causes using the following JSON format at the end of your response: 41 | 42 | ```json 43 | {{ 44 | "1": {{ 45 | "root cause occurrence datetime": (if asked by the issue, format: '%Y-%m-%d %H:%M:%S', otherwise omitted), 46 | "root cause component": (if asked by the issue, one selected from the possible root cause component list, otherwise omitted), 47 | "root cause reason": (if asked by the issue, one selected from the possible root cause reason list, otherwise omitted), 48 | }}, (mandatory) 49 | "2": {{ 50 | "root cause occurrence datetime": (if asked by the issue, format: '%Y-%m-%d %H:%M:%S', otherwise omitted), 51 | "root cause component": (if asked by the issue, one selected from the possible root cause component list, otherwise omitted), 52 | "root cause reason": (if asked by the issue, one selected from the possible root cause reason list, otherwise omitted), 53 | }}, (only if the failure number is "unknown" or "more than one" in the issue) 54 | ... (only if the failure number is "unknown" or "more than one" in the issue) 55 | }} 56 | ``` 57 | (Please use "```json" and "```" tags to wrap the JSON object. You only need to provide the elements asked by the issue and omit the other fields in the JSON.) 
58 | Note that all the root cause components and reasons must be selected from the provided candidates. Do not reply 'unknown' or 'null' or 'not found' in the JSON. Do not be too conservative in selecting the root cause components and reasons. Be decisive and infer a possible answer based on your current observations.""" 59 | 60 | def control_loop(objective:str, plan:str, ap, bp, logger, max_step = 15, max_turn = 3) -> str: 61 | 62 | prompt = [ 63 | {'role': 'system', 'content': system.format(objective=objective, 64 | format=format, 65 | agent=ap.rules, 66 | background=bp.schema)}, 67 | {'role': 'user', 'content': "Let's begin."} 68 | ] 69 | 70 | history = [] 71 | trajectory = [] 72 | observation = "Let's begin." 73 | status = False 74 | kernel = InteractiveShellEmbed() 75 | init_code = "import pandas as pd\n"+ \ 76 | "pd.set_option('display.width', 427)\n"+ \ 77 | "pd.set_option('display.max_columns', 10)\n" 78 | kernel.run_cell(init_code) 79 | 80 | for step in range(max_step): 81 | 82 | note = [{'role': 'user', 'content': f"Continue your reasoning process for the target issue:\n\n{objective}\n\nFollow the rules during issue solving:\n\n{ap.rules}.\n\nResponse format:\n\n{format}"}] 83 | attempt_actor = [] 84 | try: 85 | response_raw = get_chat_completion( 86 | messages=prompt + note, 87 | ) 88 | if "```json" in response_raw: 89 | response_raw = re.search(r"```json\n(.*)\n```", response_raw, re.S).group(1).strip() 90 | logger.debug(f"Raw Response:\n{response_raw}") 91 | if '"analysis":' not in response_raw or '"instruction":' not in response_raw or '"completed":' not in response_raw: 92 | logger.warning("Invalid response format. Please provide a valid JSON response.") 93 | prompt.append({'role': 'assistant', 'content': response_raw}) 94 | prompt.append({'role': 'user', 'content': "Please provide your analysis in the requested JSON format."}) 95 | continue 96 | response = json.loads(response_raw) 97 | analysis = response['analysis'] 98 | instruction = response['instruction'] 99 | completed = response['completed'] 100 | logger.info('-'*80 + '\n' + f"### Step[{step+1}]\nAnalysis: {analysis}\nInstruction: {instruction}" + '\n' + '-'*80) 101 | 102 | if completed == "True": 103 | kernel.reset() 104 | prompt.append({'role': 'assistant', 'content': response_raw}) 105 | prompt.append({'role': 'user', 'content': summary.format(objective=objective, 106 | cand=bp.cand)}) 107 | answer = get_chat_completion( 108 | messages=prompt, 109 | ) 110 | logger.debug(f"Raw Final Answer:\n{answer}") 111 | prompt.append({'role': 'assistant', 'content': answer}) 112 | if "```json" in answer: 113 | answer = re.search(r"```json\n(.*)\n```", answer, re.S).group(1).strip() 114 | return answer, trajectory, prompt 115 | 116 | code, result, status, new_history = execute_act(instruction, bp.schema, history, attempt_actor, kernel, logger) 117 | if not status: 118 | logger.warning('Self-correction failed.') 119 | result = f"The Executor failed to execute the instruction. Please provide a new instruction.\n{result}" 
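# NOTE (editorial): when execution fails, the failure notice is folded into 'result' above, so the unconditional 'observation = f"{result}"' on the next line forwards it to the Administrator as the next observation instead of discarding it.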
120 | observation = f"{result}" 121 | history = new_history 122 | trajectory.append({'code': f"# In[{step+1}]:\n\n{code}", 'result': f"Out[{step+1}]:\n```\n{result}```"}) 123 | logger.info('-'*80 + '\n' + f"Step[{step+1}]\n### Observation:\n{result}" + '\n' + '-'*80) 124 | prompt.append({'role': 'assistant', 'content': response_raw}) 125 | prompt.append({'role': 'user', 'content': observation}) 126 | 127 | except Exception as e: 128 | logger.error(e) 129 | prompt.append({'role': 'assistant', 'content': response_raw}) 130 | prompt.append({'role': 'user', 'content': f"{str(e)}\nPlease provide your analysis in the requested JSON format."}) 131 | if 'context_length_exceeded' in str(e): 132 | logger.warning("Token length exceeds the limit.") 133 | kernel.reset() 134 | return "Token length exceeds. No root cause found.", trajectory, prompt 135 | 136 | logger.warning("Max steps reached. Please check the history.") 137 | kernel.reset() 138 | final_prompt = {'role': 'user', 'content': summary.format(objective=objective, 139 | cand=bp.cand).replace('Now, you have decided to finish your reasoning process. ', 'Now, the maximum steps of your reasoning have been reached. ')} 140 | if prompt[-1]['role'] == 'user': 141 | prompt[-1]['content'] = final_prompt['content'] 142 | else: 143 | prompt.append({'role': 'user', 'content': final_prompt['content']}) 144 | answer = get_chat_completion( 145 | messages=prompt, 146 | ) 147 | logger.debug(f"Raw Final Answer:\n{answer}") 148 | prompt.append({'role': 'assistant', 'content': answer}) 149 | if "```json" in answer: 150 | answer = re.search(r"```json\n(.*)\n```", answer, re.S).group(1).strip() 151 | return answer, trajectory, prompt 152 | -------------------------------------------------------------------------------- /rca/baseline/rca_agent/executor.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | from datetime import datetime 4 | from rca.api_router import get_chat_completion 5 | import tiktoken 6 | import traceback 7 | 8 | system = """You are a DevOps assistant for writing Python code to answer DevOps questions. For each question, you need to write Python code to solve it by retrieving and processing telemetry data of the target system. Your generated Python code will be automatically submitted to an IPython Kernel. The execution output of the IPython Kernel will be used as the answer to the question. 9 | 10 | {rule} 11 | 12 | There is some domain knowledge for you: 13 | 14 | {background} 15 | 16 | Your response should follow the Python block format below: 17 | 18 | {format}""" 19 | 20 | format = """```python 21 | (YOUR CODE HERE) 22 | ```""" 23 | 24 | summary = """The code execution is successful. The execution result is shown below: 25 | 26 | {result} 27 | 28 | Please summarize a straightforward answer to the question based on the execution results. Use plain English.""" 29 | 30 | conclusion = """{answer} 31 | 32 | The original code execution output of the IPython Kernel is also provided below for reference: 33 | 34 | {result}""" 35 | 36 | rule = """## RULES OF PYTHON CODE WRITING: 37 | 38 | 1. Reuse variables as much as possible for execution efficiency, since the IPython Kernel is stateful, i.e., variables defined in previous steps can be used in subsequent steps. 39 | 2. Use variable names rather than `print()` to display the execution results, since your Python environment is an IPython Kernel rather than Python.exe. If you want to display multiple variables, use commas to separate them, e.g. 
40 | 3. Use pandas DataFrames to process and display tabular data for efficiency and brevity. Avoid transforming a DataFrame into a list or dict for display.
41 | 4. If you encounter an error or unexpected result, rewrite the code by referring to the given IPython Kernel error message.
42 | 5. Do not simulate any virtual situation or assume anything unknown. Solve the real problem.
43 | 6. Do not store any data as files on disk. Only cache the data as variables in memory.
44 | 7. Do not visualize the data or draw pictures or graphs via Python. You can only provide text-based results. Never include the `matplotlib` or `seaborn` library in the code.
45 | 8. Do not generate anything other than the Python code block, unless the instruction tells you to 'Use plain English'. If you find the input instruction is a summarization task (which typically happens in the last step), you should comprehensively summarize the conclusion as a string in your code and display it directly.
46 | 9. Do not calculate thresholds AFTER filtering data within the given time duration. Always calculate global thresholds using the entire KPI series of a specific component within a metric file BEFORE filtering data within the given time duration.
47 | 10. All issues use **UTC+8** time. However, the local machine's default timezone is unknown. Please use `pytz.timezone('Asia/Shanghai')` to explicitly set the timezone to UTC+8.
48 | """
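Rule 10 deserves a concrete illustration, since timezone handling is a common failure mode for generated code. A minimal sketch, with a hypothetical failure window not taken from any dataset, of converting a UTC+8 wall-clock window into the epoch-second bounds used to filter the metric and log CSVs:

```python
import pandas as pd
import pytz

# Hypothetical failure window given in UTC+8 wall-clock time (rule 10).
tz = pytz.timezone('Asia/Shanghai')
start = pd.Timestamp('2021-03-05 10:00:00', tz=tz)
end = pd.Timestamp('2021-03-05 10:30:00', tz=tz)

# Epoch seconds are timezone-independent, so these bounds can be compared
# directly against second-resolution `timestamp` columns.
start_ts, end_ts = int(start.timestamp()), int(end.timestamp())
start_ts, end_ts  # displayed as variables, per rule 2
```

For trace files, whose timestamps are recorded in milliseconds, the same bounds apply after multiplying by 1000.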
49 | 
50 | def execute_act(instruction:str, background:str, history, attempt, kernel, logger) -> tuple:
51 | 
52 | logger.debug("Start execution")
53 | t1 = datetime.now()
54 | if history == []:
55 | history = [
56 | {'role': 'system', 'content': system.format(rule=rule, background=background, format=format)},
57 | ]
58 | code_pattern = re.compile(r"```python\n(.*?)\n```", re.DOTALL)
59 | code = ""
60 | result = ""
61 | retry_flag = False
62 | status = False
63 | history.extend([{'role': 'user', 'content': instruction}])
64 | prompt = history.copy()
65 | note = [{'role': 'user', 'content': f"Continue your code writing process following the rules:\n\n{rule}\n\nResponse format:\n\n{format}"}]
66 | tokenizer = tiktoken.encoding_for_model("gpt-4")
67 | for i in range(2):
68 | try:
69 | if not retry_flag:
70 | response = get_chat_completion(
71 | messages=prompt + note,
72 | )
73 | else:
74 | response = get_chat_completion(
75 | messages=prompt,
76 | )
77 | retry_flag = False
78 | if re.search(code_pattern, response):
79 | code = re.search(code_pattern, response).group(1).strip()
80 | else:
81 | code = response.strip()
82 | logger.debug(f"Raw Code:\n{code}")
83 | if "import matplotlib" in code or "import seaborn" in code:
84 | logger.warning("Generated visualization code detected.")
85 | prompt.append({'role': 'assistant', 'content': code})
86 | prompt.append({'role': 'user', 'content': "You are not permitted to generate visualizations. If the instruction requires visualization, please provide the text-based results."})
87 | continue
88 | exec = kernel.run_cell(code)
89 | status = exec.success
90 | if status:
91 | result = str(exec.result).strip()
92 | tokens_len = len(tokenizer.encode(result))
93 | if tokens_len > 16384:
94 | logger.warning(f"Token length exceeds the limit: {tokens_len}")
95 | continue
96 | t2 = datetime.now()
97 | row_pattern = r"\[(\d+)\s+rows\s+x\s+\d+\s+columns\]"
98 | match = re.search(row_pattern, result)
99 | if match:
100 | rows = int(match.group(1))
101 | if rows > 10:
102 | result += f"\n\n**Note**: The printed pandas DataFrame is truncated due to its size. Only **10 rows** are displayed, which may introduce observation bias due to the incomplete table. If you want to comprehensively understand the details without bias, please ask Executor using `df.head(X)` to display more rows."
103 | logger.debug(f"Execution Result:\n{result}")
104 | logger.debug(f"Execution finished. Time cost: {t2-t1}")
105 | history.extend([
106 | {'role': 'assistant', 'content': code},
107 | {'role': 'user', 'content': summary.format(result=result)},
108 | ])
109 | answer = get_chat_completion(
110 | messages=history,
111 | )
112 | logger.debug(f"Brief Answer:\n{answer}")
113 | history.extend([
114 | {'role': 'assistant', 'content': answer},
115 | ])
116 | result = conclusion.format(answer=answer, result=result)
117 | 
118 | return code, result, status, history
119 | else:
120 | result = ''.join(traceback.format_exception(type(exec.error_in_exec), exec.error_in_exec, exec.error_in_exec.__traceback__))
121 | t2 = datetime.now()
122 | logger.warning(f"Execution failed. Error message: {result}")
123 | logger.debug(f"Execution finished. Time cost: {t2-t1}")
124 | prompt.append({'role': 'assistant', 'content': code})
125 | prompt.append({'role': 'user', 'content': f"Execution failed:\n{result}\nPlease revise your code and retry."})
126 | retry_flag = True
127 | 
128 | except Exception as e:
129 | logger.error(e)
130 | time.sleep(1)
131 | 
132 | t2 = datetime.now()
133 | logger.error(f"Max retries reached. Please check the history. Time cost: {t2-t1}")
134 | err = "The Executor failed to complete the instruction, please rewrite a new instruction for Executor."
135 | history.extend([{'role': 'assistant', 'content': err}])
136 | return err, err, True, history
--------------------------------------------------------------------------------
/rca/baseline/rca_agent/prompt/agent_prompt.py:
--------------------------------------------------------------------------------
1 | rules = """## RULES OF FAILURE DIAGNOSIS:
2 | 
3 | What you SHOULD do:
4 | 
5 | 1. **Follow the workflow of `preprocess -> anomaly detection -> fault identification -> root cause localization` for failure diagnosis.**
6 | 1.1. Preprocess:
7 | - Aggregate each KPI of each component that could be the root cause to obtain multiple time series classified by 'component-KPI' (e.g., service_A-cpu_usage_pct).
8 | - Then, calculate global thresholds (e.g., global P95, where 'global' means the threshold of a 'component-KPI' time series is computed over the whole metric file) for each 'component-KPI' time series. - Finally, filter data within the given time duration for all time series to perform further analysis.
9 | - Since the root cause component must be selected from the provided possible root cause components, components at all other levels (e.g., service mesh components, middleware components, etc.) should be ignored.
10 | 1.2. Anomaly detection:
11 | - An anomaly is typically a data point that exceeds the global threshold.
12 | - Look for anomalies below a certain threshold (e.g., <=P95, <=P15, or <=P5) in traffic KPIs or business KPIs (e.g., success rate (sr)) since some network failures can cause a sudden drop in them due to packet loss.
13 | - Loosen the global threshold (e.g., from >=P95 to >=P90, or from <=P95 to <=P15 or <=P5) if you really cannot find any anomalies.
14 | 1.3. Fault identification:
15 | - A 'fault' is a consecutive sub-series of a specific component-KPI time series. Thus, fault identification is the process of identifying which components experienced faults, on which resources, and at what occurrence time points.
16 | - Filter out isolated noise spikes to locate faults.
17 | - If the maximum (or minimum) value in a fault's sub-series only slightly exceeds (or falls below) the threshold (e.g., the threshold breach is <= 50% of the extremal value), it is likely a false positive caused by random KPI fluctuations and should be excluded.
18 | 1.4. Root cause localization:
19 | - The objective of root cause localization is to determine which identified 'fault' is the root cause of the failure. The root cause occurrence time, component, and reason can be derived from the first data point of that fault.
20 | - If multiple faulty components are identified at **different levels** (e.g., some being containers and others nodes), and all of them are potential root cause candidates, while the issue itself describes a **single failure**, the root cause level should be determined by the fault that shows the most significant deviation from the threshold (i.e., >> 50%). However, this method is only applicable to identifying the root cause level, not the root cause component. If there are multiple faulty components at the same level, you should use traces and logs to identify the root cause component.
21 | - If multiple service-level faulty components are identified, the root cause component is typically the last (the most downstream in a call chain) **faulty** service within a trace. Use traces to identify the root cause component among multiple faulty services.
22 | - If multiple container-level faulty components are identified, the root cause component is typically the last (the most downstream in a call chain) **faulty** container within a trace. Use traces to identify the root cause component among multiple faulty containers.
23 | - If multiple node-level faulty components are identified and the issue doesn't specify **a single failure**, each of these nodes might be the root cause of a separate failure. Otherwise, the predominant node with the most faults is the root cause component. Node-level failures do not propagate, and traces only capture communication between containers or between services.
24 | - If only one resource KPI of one component has a single fault occurring at a specific time, that fault is the root cause. Otherwise, you should use traces and logs to identify the root cause component and reason.
25 | 2. **Follow the order of `threshold calculation -> data extraction -> metric analysis -> trace analysis -> log analysis` for failure diagnosis.**
26 | 2.0. Before analysis: You should extract and filter the data down to the failure duration only after the global threshold has been calculated. After these two steps, you can perform metric analysis, trace analysis, and log analysis.
27 | 2.1. Metric analysis: Using metrics to check whether each KPI of each component has consecutive anomalies beyond the global threshold is the fastest way to find faults. Since there are a large number of traces and logs, metric analysis should first be used to narrow down the search space of duration and components.
28 | 2.2. Trace analysis: Traces can further localize which container-level or service-level faulty component is the root cause when there are multiple faulty components at the same level (container or service) identified by metric analysis.
29 | 2.3. Log analysis: Logs can further localize which resource is the root cause reason when there are multiple faulty resource KPIs of a component identified by metric analysis. Logs can also help to identify the root cause component among multiple faulty components at the same level.
30 | 2.4. Always confirm whether the target key or field is valid (e.g., component's name, KPI's name, trace ID, log ID, etc.) when the Executor's retrieval result is empty.
31 | 
32 | What you SHOULD NOT do:
33 | 
34 | 1. **DO NOT include any programming language (Python) in your response.** Instead, you should provide an ordered list of steps with concrete descriptions in natural language (English).
35 | 2. **DO NOT convert the timestamp to datetime or the datetime to timestamp by yourself.** These detailed processes will be handled by the Executor.
36 | 3. **DO NOT use the local data (filtered/cached series in a specific time duration) to calculate the global threshold of an aggregated 'component-KPI' time series.** Always use the entire KPI series of a specific component within a metric file (typically one day's KPIs) to calculate the threshold. To obtain a global threshold, you can first aggregate each KPI of each component to calculate its threshold, and then retrieve the objective time duration of the aggregated 'component-KPI' to perform anomaly detection and spike filtering.
37 | 4. **DO NOT visualize the data or draw pictures or graphs via Python.** The Executor can only provide text-based results. Never include the `matplotlib` or `seaborn` library in the code.
38 | 5. **DO NOT save anything in the local file system.** Cache the intermediate results in the IPython Kernel. Never use bash commands in the code cell.
39 | 6. **DO NOT calculate thresholds AFTER filtering data within the given time duration.** Always calculate global thresholds using the entire KPI series of a specific component within a metric file BEFORE filtering data within the given time duration.
40 | 7. **DO NOT query a specific KPI without knowing which KPIs are available.** Different systems may have completely different KPI naming conventions. If you want to query a specific KPI, first ensure that you are aware of all the available KPIs.
41 | 8. **DO NOT mistakenly identify a healthy (non-faulty) service at the downstream end of a trace that includes faulty components as the root cause.** The root cause component should be the most downstream **faulty** service to appear within the trace call chain, which must first and foremost be a FAULTY component identified by metric analysis.
42 | 9. **DO NOT focus solely on warning or error logs during log analysis. Many info logs contain critical information about service operations and interactions between services, which can be valuable for root cause analysis.**"""
43 | 
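To make rules 1.1-1.3 and SHOULD-NOT rules 3/6 concrete, the sketch below shows the prescribed order of operations in pandas: global threshold from the whole file first, time filtering second, then consecutive-anomaly fault identification. The file path and window bounds are hypothetical placeholders; the columns follow the `metric_container.csv` schema described in the basic prompts that follow.

```python
import pandas as pd

# Hypothetical path and failure window (epoch seconds); columns follow the
# metric_container.csv schema: timestamp,cmdb_id,kpi_name,value
df = pd.read_csv('dataset/Bank/telemetry/2021_03_05/metric/metric_container.csv')
start_ts, end_ts = 1614909600, 1614911400

# 1.1 Preprocess: aggregate into 'component-KPI' series and compute the GLOBAL
#     P95 over the whole file BEFORE any time filtering (SHOULD-NOT rules 3/6).
df['series'] = df['cmdb_id'] + '-' + df['kpi_name']
p95 = df.groupby('series')['value'].quantile(0.95)

# Only then restrict to the failure duration.
window = df[(df['timestamp'] >= start_ts) & (df['timestamp'] <= end_ts)].copy()

# 1.2 Anomaly detection: points beyond their own series' global threshold.
window['anomaly'] = window['value'] > window['series'].map(p95)

# 1.3 Fault identification: keep consecutive breaches, drop isolated spikes.
window = window.sort_values(['series', 'timestamp'])
run_id = (window['anomaly'] != window.groupby('series')['anomaly'].shift()).cumsum()
runs = window[window['anomaly']].groupby(['series', run_id]).size()
faults = runs[runs >= 2]  # series with at least two consecutive anomalous points
faults
```

Runs of length one are exactly the 'isolated noise spikes' that rule 1.3 says to discard; the surviving runs, together with their first timestamps, are the fault candidates handed to root cause localization.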
--------------------------------------------------------------------------------
/rca/baseline/rca_agent/prompt/basic_prompt_Bank.py:
--------------------------------------------------------------------------------
1 | cand = """## POSSIBLE ROOT CAUSE REASONS:
2 | 
3 | - high CPU usage
4 | - high memory usage
5 | - network latency
6 | - network packet loss
7 | - high disk I/O read usage
8 | - high disk space usage
9 | - high JVM CPU load
10 | - JVM Out of Memory (OOM) Heap
11 | 
12 | ## POSSIBLE ROOT CAUSE COMPONENTS:
13 | 
14 | - apache01
15 | - apache02
16 | - Tomcat01
17 | - Tomcat02
18 | - Tomcat04
19 | - Tomcat03
20 | - MG01
21 | - MG02
22 | - IG01
23 | - IG02
24 | - Mysql01
25 | - Mysql02
26 | - Redis01
27 | - Redis02"""
28 | 
29 | schema = f"""## TELEMETRY DIRECTORY STRUCTURE:
30 | 
31 | - You can access the telemetry directory in our microservices system: `dataset/Bank/telemetry/`.
32 | 
33 | - This directory contains subdirectories organized by a date (e.g., `dataset/Bank/telemetry/2021_03_05/`).
34 | 
35 | - Within each date-specific directory, you’ll find these subdirectories: `metric`, `trace`, and `log` (e.g., `dataset/Bank/telemetry/2021_03_05/metric/`).
36 | 
37 | - The telemetry data in those subdirectories is stored in CSV format (e.g., `dataset/Bank/telemetry/2021_03_05/metric/metric_container.csv`).
38 | 
39 | ## DATA SCHEMA
40 | 
41 | 1. **Metric Files**:
42 | 
43 | 1. `metric_app.csv`:
44 | 
45 | ```csv
46 | timestamp,rr,sr,cnt,mrt,tc
47 | 1614787440,100.0,100.0,22,53.27,ServiceTest1
48 | ```
49 | 
50 | 2. `metric_container.csv`:
51 | 
52 | ```csv
53 | timestamp,cmdb_id,kpi_name,value
54 | 1614787200,Tomcat04,OSLinux-CPU_CPU_CPUCpuUtil,26.2957
55 | ```
56 | 
57 | 2. **Trace Files**:
58 | 
59 | 1. `trace_span.csv`:
60 | 
61 | ```csv
62 | timestamp,cmdb_id,parent_id,span_id,trace_id,duration
63 | 1614787199628,dockerA2,369-bcou-dle-way1-c514cf30-43410@0824-2f0e47a816-17492,21030300016145905763,gw0120210304000517192504,19
64 | ```
65 | 
66 | 3. **Log Files**:
67 | 
68 | 1. `log_service.csv`:
69 | 
70 | ```csv
71 | log_id,timestamp,cmdb_id,log_name,value
72 | 8c7f5908ed126abdd0de6dbdd739715c,1614787201,Tomcat01,gc,"3748789.580: [GC (CMS Initial Mark) [1 CMS-initial-mark: 2462269K(3145728K)] 3160896K(4089472K), 0.1985754 secs] [Times: user=0.59 sys=0.00, real=0.20 secs] "
73 | ```
74 | 
75 | {cand}
76 | 
77 | ## CLARIFICATION OF TELEMETRY DATA:
78 | 
79 | 1. This microservice system is a banking platform.
80 | 
81 | 2. The `metric_app.csv` file only contains four KPIs: rr, sr, cnt, and mrt. In contrast, `metric_container.csv` records a variety of KPIs, such as CPU usage and memory usage. The specific names of these KPIs can be found in the `kpi_name` field.
82 | 
83 | 3. In different telemetry files, the timestamp units and cmdb_id formats may vary:
84 | 
85 | - Metric: Timestamp units are in seconds (e.g., 1614787440).
86 | 
87 | - Trace: Timestamp units are in milliseconds (e.g., 1614787199628).
88 | 
89 | - Log: Timestamp units are in seconds (e.g., 1614787201).
90 | 
91 | 4. 
Please use the UTC+8 time zone in all analysis steps since system is deployed in China/Hong Kong/Singapore.""" 92 | 93 | -------------------------------------------------------------------------------- /rca/baseline/rca_agent/prompt/basic_prompt_Market.py: -------------------------------------------------------------------------------- 1 | cand = """## POSSIBLE ROOT CAUSE COMPONENTS: 2 | 3 | (if the root cause is at the node level, i.e., the root cause is a specific node) 4 | - node-1 5 | - node-2 6 | - node-3 7 | - node-4 8 | - node-5 9 | - node-6 10 | 11 | (if the root cause is at the pod level, i.e., the root cause is a specific container) 12 | 13 | - frontend-0 14 | - frontend-1 15 | - frontend-2 16 | - frontend2-0 17 | - shippingservice-0 18 | - shippingservice-1 19 | - shippingservice-2 20 | - shippingservice2-0 21 | - checkoutservice-0 22 | - checkoutservice-1 23 | - checkoutservice-2 24 | - checkoutservice2-0 25 | - currencyservice-0 26 | - currencyservice-1 27 | - currencyservice-2 28 | - currencyservice2-0 29 | - adservice-0 30 | - adservice-1 31 | - adservice-2 32 | - adservice2-0 33 | - emailservice-0 34 | - emailservice-1 35 | - emailservice-2 36 | - emailservice2-0 37 | - cartservice-0 38 | - cartservice-1 39 | - cartservice-2 40 | - cartservice2-0 41 | - productcatalogservice-0 42 | - productcatalogservice-1 43 | - productcatalogservice-2 44 | - productcatalogservice2-0 45 | - recommendationservice-0 46 | - recommendationservice-1 47 | - recommendationservice-2 48 | - recommendationservice2-0 49 | - paymentservice-0 50 | - paymentservice-1 51 | - paymentservice-2 52 | - paymentservice2-0 53 | 54 | (if the root cause is at the service level, i.e., if all pods of a specific service are faulty, the root cause is the service itself) 55 | 56 | - frontend 57 | - shippingservice 58 | - checkoutservice 59 | - currencyservice 60 | - adservice 61 | - emailservice 62 | - cartservice 63 | - productcatalogservice 64 | - recommendationservice 65 | - paymentservice 66 | 67 | ## POSSIBLE ROOT CAUSE REASONS: 68 | 69 | - container CPU load 70 | - container memory load 71 | - container network packet retransmission 72 | - container network packet corruption 73 | - container network latency 74 | - container packet loss 75 | - container process termination 76 | - container read I/O load 77 | - container write I/O load 78 | - node CPU load 79 | - node CPU spike 80 | - node memory consumption 81 | - node disk read I/O consumption 82 | - node disk write I/O consumption 83 | - node disk space consumption""" 84 | 85 | schema = f"""## TELEMETRY DIRECTORY STRUCTURE: 86 | 87 | - You can access the telemetry directories of two cloudbed (i.e., `cloudbed-1` and `cloudbed-2`) in our microservices system: `dataset/Market/cloudbed-1/telemetry/` and `dataset/Market/cloudbed-2/telemetry/`. 88 | 89 | - This directory contains subdirectories organized by a date (e.g., `dataset/Market/cloudbed-1/telemetry/2022_03_20/`). 90 | 91 | - Within each date-specific directory, you’ll find these subdirectories: `metric`, `trace`, and `log` (e.g., `dataset/Market/cloudbed-1/telemetry/2022_03_20/metric/`). 92 | 93 | - The telemetry data in those subdirectories is stored in CSV format (e.g., `dataset/Market/cloudbed-1/telemetry/2022_03_20/metric/metric_container.csv`). 94 | 95 | ## DATA SCHEMA 96 | 97 | 1. **Metric Files**: 98 | 99 | 1. `metric_container.csv`: 100 | 101 | ```csv 102 | timestamp,cmdb_id,kpi_name,value 103 | 1647781200,node-6.adservice2-0,container_fs_writes_MB./dev/vda,0.0 104 | ``` 105 | 106 | 2. 
`metric_mesh.csv`:
107 | 
108 | ```csv
109 | timestamp,cmdb_id,kpi_name,value
110 | 1647790380,cartservice-1.source.cartservice.redis-cart,istio_tcp_sent_bytes.-,1255.0
111 | ```
112 | 
113 | 3. `metric_node.csv`:
114 | 
115 | ```csv
116 | timestamp,cmdb_id,kpi_name,value
117 | 1647705600,node-1,system.cpu.iowait,0.31
118 | ```
119 | 
120 | 4. `metric_runtime.csv`:
121 | 
122 | ```csv
123 | timestamp,cmdb_id,kpi_name,value
124 | 1647730800,adservice.ts:8088,java_nio_BufferPool_TotalCapacity.direct,57343.0
125 | ```
126 | 
127 | 5. `metric_service.csv`:
128 | 
129 | ```csv
130 | service,timestamp,rr,sr,mrt,count
131 | adservice-grpc,1647716400,100.0,100.0,2.429508196728182,61
132 | ```
133 | 
134 | 2. **Trace Files**:
135 | 
136 | 1. `trace_span.csv`:
137 | 
138 | ```csv
139 | timestamp,cmdb_id,span_id,trace_id,duration,type,status_code,operation_name,parent_span
140 | 1647705600361,frontend-0,a652d4d10e9478fc,9451fd8fdf746a80687451dae4c4e984,49877,rpc,0,hipstershop.CheckoutService/PlaceOrder,952754a738a11675
141 | ```
142 | 
143 | 3. **Log Files**:
144 | 
145 | 1. `log_proxy.csv`:
146 | 
147 | ```csv
148 | log_id,timestamp,cmdb_id,log_name,value
149 | KN43pn8BmS57GQLkQUdP,1647761110,cartservice-1,log_cartservice-service_application,etCartAsync called with userId=3af80013-c2c1-4ae6-86d0-1d9d308e6f5b
150 | ```
151 | 
152 | 2. `log_service.csv`:
153 | 
154 | ```csv
155 | log_id,timestamp,cmdb_id,log_name,value
156 | GIvpon8BDiVcQfZwJ5a9,1647705660,currencyservice-0,log_currencyservice-service_application,"severity: info, message: Getting supported currencies..."
157 | ```
158 | 
159 | {cand}
160 | 
161 | ## CLARIFICATION OF TELEMETRY DATA:
162 | 
163 | 1. This microservice system is an e-commerce platform which includes a failover mechanism, with each service deployed across four pods. In this system, a container (pod) can be deployed on different nodes. If the root cause component is a single pod of a specific service (e.g., node-1.adservice-0), the failure may not significantly impact the corresponding service metrics. In contrast, if the root cause component is a service itself (e.g., adservice), which means all pods of this service are faulty, the corresponding service metrics will be significantly impacted. Moreover, such faults can propagate through the call chain, causing other services' metrics to become faulty. Note that `Pod` is equivalent to `Container` in this system.
164 | 
165 | 2. The `metric_service.csv` file only contains four KPIs: rr, sr, mrt, and count. In contrast, other metric files record a variety of KPIs, such as CPU usage and memory usage. The specific names of these KPIs can be found in the `kpi_name` field.
166 | 
167 | 3. Note that the `cmdb_id` is the name of a specific component, including nodes, pods, services, etc.
168 | 
169 | - Metrics:
170 | - Runtime: The application name and port, e.g., `adservice.ts:8088`
171 | - Service: The service name and protocol, e.g., `adservice-grpc`
172 | - Container: The pod name combined with a node name, e.g., `node-1.adservice-0`
173 | - Node: The node name, e.g., `node-1`
174 | - Mesh: The service-to-service connection identifier within the mesh, e.g., `cartservice-1.source.cartservice.redis-cart`
175 | 
176 | - Traces: The pod name, e.g., `adservice-0`
177 | 
178 | - Logs: The pod name, e.g., `adservice-0`
179 | 
180 | 4. In different telemetry files, the timestamp units and cmdb_id formats may vary:
181 | 
182 | - Metric: Timestamp units are in seconds (e.g., 1647781200). 
cmdb_id varies by metric file: 183 | - In container metrics: `-x.-x` (e.g., `node-1.adservice-0`) 184 | - In node metrics: `-x` (e.g., `node-1`) 185 | - In service metrics: `-grpc` (e.g., `adservice-grpc`) 186 | 187 | - Trace: Timestamp units are in milliseconds (e.g., 1647705600361). cmdb_id is consistently `-x` (e.g., frontend-0). 188 | 189 | - Log: Timestamp units are in seconds (e.g., 1647705660). cmdb_id is consistently `-x` (e.g., currencyservice-0). 190 | 191 | 5. Please use the UTC+8 time zone in all analysis steps since system is deployed in China/Hong Kong/Singapore.""" 192 | -------------------------------------------------------------------------------- /rca/baseline/rca_agent/prompt/basic_prompt_Telecom.py: -------------------------------------------------------------------------------- 1 | cand = """## POSSIBLE ROOT CAUSE REASONS: 2 | 3 | - CPU fault 4 | - network delay 5 | - network loss 6 | - db connection limit 7 | - db close 8 | 9 | ## POSSIBLE ROOT CAUSE COMPONENTS: 10 | 11 | (if the root cause is at the node level, i.e., the root cause is a specific node) 12 | 13 | - os_001 14 | - os_002 15 | - os_003 16 | - os_004 17 | - os_005 18 | - os_006 19 | - os_007 20 | - os_008 21 | - os_009 22 | - os_010 23 | - os_011 24 | - os_012 25 | - os_013 26 | - os_014 27 | - os_015 28 | - os_016 29 | - os_017 30 | - os_018 31 | - os_019 32 | - os_020 33 | - os_021 34 | - os_022 35 | 36 | (if the root cause is at the pod level, i.e., the root cause is a specific container) 37 | 38 | - docker_001 39 | - docker_002 40 | - docker_003 41 | - docker_004 42 | - docker_005 43 | - docker_006 44 | - docker_007 45 | - docker_008 46 | 47 | (if the root cause is at the service level, i.e., if all pods of a specific service are faulty, the root cause is the service itself) 48 | 49 | - db_001 50 | - db_002 51 | - db_003 52 | - db_004 53 | - db_005 54 | - db_006 55 | - db_007 56 | - db_008 57 | - db_009 58 | - db_010 59 | - db_011 60 | - db_012 61 | - db_013""" 62 | 63 | schema = f"""## TELEMETRY DIRECTORY STRUCTURE: 64 | 65 | - You can access the telemetry directory in our microservices system: `dataset/Telecom/telemetry/` 66 | 67 | - This directory contains subdirectories organized by a date (e.g., `dataset/Telecom/telemetry/2020_04_11/`). 68 | 69 | - Within each date-specific directory, you’ll find these subdirectories: `metric` and `trace` (e.g., `dataset/Telecom/telemetry/2020_04_11/metric/`). 70 | 71 | - The telemetry data in those subdirectories is stored in CSV format (e.g., `dataset/Telecom/telemetry/2020_04_11/metric/metric_container.csv`). 72 | 73 | ## DATA SCHEMA 74 | 75 | 1. **Metric Files**: 76 | 77 | 1. `metric_app.csv`: 78 | 79 | ```csv 80 | serviceName,startTime,avg_time,num,succee_num,succee_rate 81 | osb_001,1586534400000,0.333,1,1,1.0 82 | ``` 83 | 84 | 2. `metric_container.csv`: 85 | 86 | ```csv 87 | itemid,name,bomc_id,timestamp,value,cmdb_id 88 | 999999996381330,container_mem_used,ZJ-004-060,1586534423000,59.000000,docker_008 89 | ``` 90 | 91 | 3. `metric_middleware.csv`: 92 | 93 | ```csv 94 | itemid,name,bomc_id,timestamp,value,cmdb_id 95 | 999999996508323,connected_clients,ZJ-005-024,1586534672000,25,redis_003 96 | ``` 97 | 98 | 4. `metric_node.csv`: 99 | 100 | ```csv 101 | itemid,name,bomc_id,timestamp,value,cmdb_id 102 | 999999996487783,CPU_iowait_time,ZJ-001-010,1586534683000,0.022954,os_017 103 | ``` 104 | 105 | 5. 
`metric_service.csv`: 106 | 107 | ```csv 108 | itemid,name,bomc_id,timestamp,value,cmdb_id 109 | 999999998650974,MEM_Total,ZJ-002-055,1586534694000,381.902264,db_003 110 | ``` 111 | 112 | 2. **Trace Files**: 113 | 114 | 1. `trace_span.csv`: 115 | 116 | ```csv 117 | callType,startTime,elapsedTime,success,traceId,id,pid,cmdb_id,dsName,serviceName 118 | JDBC,1586534400335,2.0,True,01df517164d1c0365586,407d617164d1c14f2613,6e02217164d1c14b2607,docker_006,db_003, 119 | LOCAL,1586534400331,6.0,True,01df517164d1c0365586,6e02217164d1c14b2607,8432217164d1c1442597,docker_006,db_003,local_method_017 120 | RemoteProcess,1586534400324,55.0,True,01df517164d1c0365586,8432217164d1c1442597,b755e17164d1c13f5066,docker_006,,csf_005 121 | FlyRemote,1586534400149,7.0,TRUE,fa1e817164d1c0375444,da74117164d1c0955052,b959f17164d1c08c5050,docker_003,,fly_remote_001 122 | OSB,1586534660846,376.0,True,d9c4817164d5baee6924,77d1117164d5baee6925,None,os_021,,osb_001 123 | ``` 124 | 125 | {cand} 126 | 127 | ## CLARIFICATION OF TELEMETRY DATA: 128 | 129 | 1. This service system is a telecom database system. 130 | 131 | 2. The `metric_app.csv` file only contains five KPIs: startTime, avg_time, num, succee_num, succee_rate. In contrast, other metric files record a variety of KPIs, such as CPU usage and memory usage. The specific names of these KPIs can be found in the `name` field. 132 | 133 | 3. In all telemetry files, the timestamp units and cmdb_id formats remain consistent: 134 | 135 | - Metric: Timestamp units are in milliseconds (e.g., 1586534423000). 136 | 137 | - Trace: Timestamp units are in milliseconds (e.g., 1586534400335). 138 | 139 | 4. Please use the UTC+8 time zone in all analysis steps since system is deployed in China/Hong Kong/Singapore.""" 140 | -------------------------------------------------------------------------------- /rca/baseline/rca_agent/rca_agent.py: -------------------------------------------------------------------------------- 1 | from rca.baseline.rca_agent.controller import control_loop 2 | 3 | class RCA_Agent: 4 | def __init__(self, agent_prompt, basic_prompt) -> None: 5 | 6 | self.ap = agent_prompt 7 | self.bp = basic_prompt 8 | 9 | def run(self, instruction, logger, max_step=25, max_turn=5): 10 | 11 | logger.info(f"Objective: {instruction}") 12 | prediction, trajectory, prompt = control_loop(instruction, "", self.ap, self.bp, logger=logger, max_step=max_step, max_turn=max_turn) 13 | logger.info(f"Result: {prediction}") 14 | 15 | return prediction, trajectory, prompt -------------------------------------------------------------------------------- /rca/run_agent_standard.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 6 | sys.path.insert(0, project_root) 7 | from main.evaluate import evaluate 8 | from rca.api_router import configs 9 | 10 | from datetime import datetime 11 | from loguru import logger 12 | from nbformat import v4 as nbf 13 | import pandas as pd 14 | import signal 15 | 16 | def handler(signum, frame): 17 | raise TimeoutError("Loop execution exceeded the time limit") 18 | 19 | def main(args, uid, dataset): 20 | 21 | from rca.baseline.rca_agent.rca_agent import RCA_Agent 22 | import rca.baseline.rca_agent.prompt.agent_prompt as ap 23 | if dataset == "Telecom": 24 | import rca.baseline.rca_agent.prompt.basic_prompt_Telecom as bp 25 | elif dataset == "Bank": 26 | import 
rca.baseline.rca_agent.prompt.basic_prompt_Bank as bp
27 | elif dataset == "Market/cloudbed-1" or dataset == "Market/cloudbed-2":
28 | import rca.baseline.rca_agent.prompt.basic_prompt_Market as bp
29 | 
30 | inst_file = f"dataset/{dataset}/query.csv"
31 | gt_file = f"dataset/{dataset}/record.csv"
32 | eval_file = f"test/result/{dataset}/agent-{args.tag}-{configs['MODEL'].split('/')[-1]}.csv"
33 | obs_path = f"test/monitor/{dataset}/agent-{args.tag}-{configs['MODEL'].split('/')[-1]}"
34 | unique_obs_path = f"{obs_path}/{uid}"
35 | 
36 | if not os.path.exists(inst_file) or not os.path.exists(gt_file):
37 | raise FileNotFoundError("Please download the dataset first.")
38 | instruct_data = pd.read_csv(inst_file)
39 | gt_data = pd.read_csv(gt_file)
40 | 
41 | if not os.path.exists(f"{unique_obs_path}/history"):
42 | os.makedirs(f"{unique_obs_path}/history")
43 | if not os.path.exists(f"{unique_obs_path}/trajectory"):
44 | os.makedirs(f"{unique_obs_path}/trajectory")
45 | if not os.path.exists(f"{unique_obs_path}/prompt"):
46 | os.makedirs(f"{unique_obs_path}/prompt")
47 | if not os.path.exists(eval_file):
48 | if not os.path.exists(f"test/result/{dataset}"):
49 | os.makedirs(f"test/result/{dataset}")
50 | eval_df = pd.DataFrame(columns=["instruction", "prediction", "groundtruth", "passed", "failed", "score"])
51 | else:
52 | eval_df = pd.read_csv(eval_file)
53 | 
54 | scores = {
55 | "total": 0,
56 | "easy": 0,
57 | "middle": 0,
58 | "hard": 0,
59 | }
60 | nums = {
61 | "total": 0,
62 | "easy": 0,
63 | "middle": 0,
64 | "hard": 0,
65 | }
66 | 
67 | signal.signal(signal.SIGALRM, handler)
68 | logger.info(f"Using dataset: {dataset}")
69 | logger.info(f"Using model: {configs['MODEL'].split('/')[-1]}")
70 | 
71 | for idx, row in instruct_data.iterrows():
72 | 
73 | if idx < args.start_idx:
74 | continue
75 | if idx > args.end_idx:
76 | break
77 | 
78 | instruction = row["instruction"]
79 | task_index = row["task_index"]
80 | scoring_points = row["scoring_points"]
81 | task_id = int(task_index.split('_')[1])
82 | best_score, temp_scores, temp_nums = 0, scores.copy(), nums.copy()  # initialize per task so a fully skipped task leaves the totals unchanged
83 | 
84 | if task_id <= 3:
85 | catalog = "easy"
86 | elif task_id <= 6:
87 | catalog = "middle"
88 | elif task_id <= 7:
89 | catalog = "hard"
90 | 
91 | for i in range(args.sample_num):
92 | uuid = uid + f"_#{idx}-{i}"
93 | nb = nbf.new_notebook()
94 | nbfile = f"{unique_obs_path}/trajectory/{uuid}.ipynb"
95 | promptfile = f"{unique_obs_path}/prompt/{uuid}.json"
96 | logfile = f"{unique_obs_path}/history/{uuid}.log"
97 | logger.remove()
98 | logger.add(sys.stdout, colorize=True, enqueue=True, level="INFO")
99 | logger.add(logfile, colorize=True, enqueue=True, level="INFO")
100 | logger.debug('\n' + "#"*80 + f"\n{uuid}: {task_index}\n" + "#"*80)
101 | try:
102 | signal.alarm(args.timeout)
103 | 
104 | agent = RCA_Agent(ap, bp)
105 | prediction, trajectory, prompt = agent.run(instruction,
106 | logger,
107 | max_step=args.controller_max_step,
108 | max_turn=args.controller_max_turn)
109 | 
110 | signal.alarm(0)
111 | 
112 | for step in trajectory:
113 | code_cell = nbf.new_code_cell(step['code'])
114 | result_cell = nbf.new_markdown_cell(f"```\n{step['result']}\n```")
115 | nb.cells.append(code_cell)
116 | nb.cells.append(result_cell)
117 | with open(nbfile, 'w', encoding='utf-8') as f:
118 | json.dump(nb, f, ensure_ascii=False, indent=4)
119 | logger.info(f"Trajectory has been saved to {nbfile}")
120 | 
121 | with open(promptfile, 'w', encoding='utf-8') as f:
122 | json.dump({"messages": prompt}, f, ensure_ascii=False, indent=4)
123 | logger.info(f"Prompt has been saved to {promptfile}")
{promptfile}") 124 | 125 | new_eval_df = pd.DataFrame([{"row_id": idx, 126 | "task_index": task_index, 127 | "instruction": instruction, 128 | "prediction": prediction, 129 | "groundtruth": '\n'.join([f'{col}: {gt_data.iloc[idx][col]}' for col in gt_data.columns if col != 'description']), 130 | "passed": "N/A", 131 | "failed": "N/A", 132 | "score": "N/A"}]) 133 | eval_df = pd.concat([eval_df, new_eval_df], 134 | ignore_index=True) 135 | eval_df.to_csv(eval_file, 136 | index=False) 137 | 138 | passed_criteria, failed_criteria, score = evaluate(prediction, scoring_points) 139 | 140 | logger.info(f"Prediction: {prediction}") 141 | logger.info(f"Scoring Points: {scoring_points}") 142 | logger.info(f"Passed Criteria: {passed_criteria}") 143 | logger.info(f"Failed Criteria: {failed_criteria}") 144 | logger.info(f"Score: {score}") 145 | best_score = max(best_score, score) 146 | 147 | eval_df.loc[eval_df.index[-1], "passed"] = '\n'.join(passed_criteria) 148 | eval_df.loc[eval_df.index[-1], "failed"] = '\n'.join(failed_criteria) 149 | eval_df.loc[eval_df.index[-1], "score"] = score 150 | eval_df.to_csv(eval_file, 151 | index=False) 152 | 153 | temp_scores = scores.copy() 154 | temp_scores[catalog] += best_score 155 | temp_scores["total"] += best_score 156 | temp_nums = nums.copy() 157 | temp_nums[catalog] += 1 158 | temp_nums["total"] += 1 159 | 160 | except TimeoutError: 161 | logger.error(f"Loop {i} exceeded the time limit and was skipped") 162 | continue 163 | 164 | scores = temp_scores 165 | nums = temp_nums 166 | 167 | 168 | if __name__ == "__main__": 169 | 170 | uid = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 171 | parser = argparse.ArgumentParser() 172 | parser.add_argument("--dataset", type=str, default="Market/cloudbed-1") 173 | parser.add_argument("--sample_num", type=int, default=1) 174 | parser.add_argument("--start_idx", type=int, default=0) 175 | parser.add_argument("--end_idx", type=int, default=150) 176 | parser.add_argument("--controller_max_step", type=int, default=25) 177 | parser.add_argument("--controller_max_turn", type=int, default=5) 178 | parser.add_argument("--timeout", type=int, default=600) 179 | parser.add_argument("--tag", type=str, default='rca') 180 | parser.add_argument("--auto", type=bool, default=False) 181 | 182 | args = parser.parse_args() 183 | 184 | if args.auto: 185 | print(f"Auto mode is on. 
Model is fixed to {configs['MODEL']}") 186 | datasets = ["Market/cloudbed-1", "Market/cloudbed-2", "Bank", "Telecom"] 187 | for dataset in datasets: 188 | main(args, uid, dataset) 189 | else: 190 | dataset = args.dataset 191 | main(args, uid, dataset) -------------------------------------------------------------------------------- /rca/run_sampling_balanced.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 4 | sys.path.insert(0, project_root) 5 | import pandas as pd 6 | import argparse 7 | from datetime import datetime 8 | from loguru import logger 9 | from copy import deepcopy 10 | 11 | from rca.baseline.direct_lm import DirectLM 12 | from rca.baseline.cot_lm import CoTLM 13 | from main.evaluate import evaluate 14 | from time import time 15 | from rca.api_router import configs 16 | 17 | import random 18 | 19 | import tiktoken 20 | tokenizer = tiktoken.encoding_for_model("gpt-4o") 21 | 22 | def cache_df_dict(dataset_name:str): 23 | 24 | df_dict = dict() 25 | 26 | if dataset_name == "Telecom": 27 | from rca.baseline.oracle_kpis import kpi_Telecom_len 28 | selected_kpi_len = kpi_Telecom_len 29 | 30 | example_df_dict = { 31 | "metric": [], 32 | "trace": [], 33 | } 34 | dataset_path = "Telecom" 35 | 36 | import rca.baseline.rca_agent.prompt.basic_prompt_Telecom as bp 37 | cand = bp.cand 38 | 39 | elif dataset_name == "Bank": 40 | from rca.baseline.oracle_kpis import kpi_Bank_len 41 | selected_kpi_len = kpi_Bank_len 42 | 43 | example_df_dict = { 44 | "log": [], 45 | "metric": [], 46 | "trace": [], 47 | } 48 | dataset_path = "Bank" 49 | 50 | import rca.baseline.rca_agent.prompt.basic_prompt_Bank as bp 51 | cand = bp.cand 52 | 53 | elif dataset_name == "Market/cloudbed-1": 54 | from rca.baseline.oracle_kpis import kpi_Market_len 55 | selected_kpi_len = kpi_Market_len 56 | 57 | example_df_dict = { 58 | "log": [], 59 | "metric": [], 60 | "trace": [], 61 | } 62 | dataset_path = "Market/cloudbed-1" 63 | 64 | import rca.baseline.rca_agent.prompt.basic_prompt_Market as bp 65 | cand = bp.cand 66 | 67 | elif dataset_name == "Market/cloudbed-2": 68 | from rca.baseline.oracle_kpis import kpi_Market_len 69 | selected_kpi_len = kpi_Market_len 70 | 71 | example_df_dict = { 72 | "log": [], 73 | "metric": [], 74 | "trace": [], 75 | } 76 | dataset_path = "Market/cloudbed-2" 77 | 78 | import rca.baseline.rca_agent.prompt.basic_prompt_Market as bp 79 | cand = bp.cand 80 | 81 | for day_time in os.listdir(f"dataset/{dataset_path}/telemetry/"): 82 | if day_time == '.DS_Store': 83 | continue 84 | if day_time not in df_dict: 85 | df_dict[day_time] = deepcopy(example_df_dict) 86 | 87 | for data_type in os.listdir(f"dataset/{dataset_path}/telemetry/{day_time}"): 88 | if data_type == '.DS_Store': 89 | continue 90 | for fname in os.listdir(f"dataset/{dataset_path}/telemetry/{day_time}/{data_type}"): 91 | t0 = time() 92 | cur_df = pd.read_csv(f"dataset/{dataset_path}/telemetry/{day_time}/{data_type}/{fname}") 93 | t1 = time() 94 | logger.debug(f"{round(t1-t0,1)} seconds for reading {fname}") 95 | 96 | #preprocess 97 | cur_df = cur_df.reset_index() 98 | if "timestamp" in cur_df.columns: 99 | col = "timestamp" 100 | elif "startTime" in cur_df.columns: 101 | col = "startTime" 102 | else: 103 | logger.error("There is no 'startTime' or 'timestamp' indicating the timestamp of the data entries") 104 | raise IndexError 105 | cur_df[col] = cur_df[col].apply(lambda x: int(x // 1000) if 
len(str(x)) == 13 else x)
106 | t2 = time()
107 | logger.debug(f"{round(t2-t1, 1)} seconds for preprocessing DataFrame")
108 | if cur_df.empty:
109 | logger.warning(f"{fname} is empty")
110 | else:
111 | df_dict[day_time][data_type].append((fname, cur_df))
112 | 
113 | return df_dict, selected_kpi_len, cand
114 | 
115 | 
116 | def extract_period_data(df_list:list, data_type:str, target_timestamp:int, sample_interval=60, selected_kpi=None, selected_kpi_len=None) -> list:
117 | 
118 | logger.debug(f"Extracting {data_type} data ...")
119 | 
120 | extracted_data = ""
121 | for fname, df_file in df_list:
122 | 
123 | if data_type == "metric" and len(selected_kpi) >= selected_kpi_len:
124 | logger.info(f"Selected KPI number ({len(selected_kpi)}) has reached the limit: {selected_kpi_len}")
125 | break
126 | 
127 | if "timestamp" in df_file.columns:
128 | col = "timestamp"
129 | elif "startTime" in df_file.columns:
130 | col = "startTime"
131 | else:
132 | logger.error("There is no 'startTime' or 'timestamp' indicating the timestamp of the data entries")
133 | raise IndexError
134 | 
135 | t1 = time()
136 | start_timestamp = target_timestamp - target_timestamp % 1800
137 | end_timestamp = start_timestamp + 1800
138 | filtered_df = df_file[(df_file[col] >= start_timestamp) & (df_file[col] <= end_timestamp)]
139 | filtered_df = filtered_df.drop(columns=["index"])
140 | 
141 | t2 = time()
142 | logger.debug(f"{round(t2-t1,1)} seconds for filtering 30 min data")
143 | 
144 | if data_type == "log":
145 | filtered_df = filtered_df.drop(columns=["log_id"])
146 | filtered_df = filtered_df.drop(columns=["cmdb_id"])
147 | filtered_df = filtered_df.drop(columns=["log_name"])
148 | schema = filtered_df.columns
149 | extracted_data = extracted_data + f'\n\n#### {fname}'
150 | extracted_data = extracted_data + f' Schema: ' + ','.join(schema) + '\n'
151 | resampled_df = filtered_df.groupby(filtered_df[col] // (sample_interval/5)).first()
152 | if resampled_df.empty:
153 | extracted_data = extracted_data + "DATA NOT AVAILABLE\n"
154 | else:
155 | data = resampled_df.astype(str).agg(','.join, axis=1)
156 | extracted_data = extracted_data + '\n'.join(data) + '\n'
157 | elif data_type == "trace":
158 | opt_traceid_field_name = ["traceId", "trace_id"]
159 | traceid_field_name = None
160 | for opt_name in opt_traceid_field_name:
161 | if opt_name in df_file.columns:
162 | traceid_field_name = opt_name
163 | if traceid_field_name is None:
164 | logger.error("There is no 'traceId' or 'trace_id' indicating the trace_id of the data entries")
165 | raise IndexError
166 | opt_spanid_field_name = ["id", "span_id"]
167 | spanid_field_name = None
168 | for opt_name in opt_spanid_field_name:
169 | if opt_name in df_file.columns:
170 | spanid_field_name = opt_name
171 | if spanid_field_name is None:
172 | logger.error("There is no 'id' or 'span_id' indicating the span_id of the data entries")
173 | raise IndexError
174 | opt_parent_field_name = ["pid", "parent_id", "parent_span"]
175 | parent_field_name = None
176 | for opt_name in opt_parent_field_name:
177 | if opt_name in df_file.columns:
178 | parent_field_name = opt_name
179 | if parent_field_name is None:
180 | logger.error("There is no 'pid' or 'parent_id' indicating the parent_id of the data entries")
181 | raise IndexError
182 | opt_duration_field_name = ["elapsedTime", "duration"]
183 | duration_field_name = None
184 | for opt_name in opt_duration_field_name:
185 | if opt_name in df_file.columns:
186 | duration_field_name = opt_name
187 | 
188 | filtered_df = filtered_df[[col, traceid_field_name, spanid_field_name, parent_field_name, duration_field_name, "cmdb_id"]]
189 | schema = filtered_df.columns
190 | schema = schema.drop([traceid_field_name])
191 | extracted_data = extracted_data + f'\n\n#### {fname}'
192 | extracted_data = extracted_data + f' Schema: ' + ','.join([s for s in schema if s != col]) + '\n'
193 | resampled_df = filtered_df.groupby(filtered_df[col] // sample_interval).first()
194 | trace_ids = resampled_df[traceid_field_name]
195 | trace_dfs = filtered_df[filtered_df[traceid_field_name].isin(trace_ids)]
196 | trace_grouped_df = trace_dfs.groupby(traceid_field_name)
197 | for trace_id, trace_df in trace_grouped_df:
198 | resource_name = f'{trace_id}'
199 | for field in trace_df.columns:
200 | if trace_df[field].dtype in [float, "float64"]:
201 | trace_df[field] = trace_df[field].apply(lambda x: round(x, 2))
202 | trace_df = trace_df.drop(columns=[traceid_field_name, col])
203 | if 'group' in trace_df.columns:
204 | trace_df = trace_df.drop(columns=["group"])
205 | if trace_df.empty:
206 | extracted_data = extracted_data + f'Trace ID: {resource_name}\n```\n' + "DATA NOT AVAILABLE\n```\n"
207 | else:
208 | data = trace_df.astype(str).agg(','.join, axis=1)
209 | extracted_data = extracted_data + f'Trace ID: {resource_name}\n```\n' + '\n'.join(data) + '\n```\n'
210 | t3 = time()
211 | logger.debug(f"{round(t3-t2,1)} seconds for extracting trace data")
212 | elif data_type == "metric":
213 | opt_kpi_field_name = ["name", "kpi_name", "serviceName", "tc", "service"]
214 | kpi_field_name = None
215 | for opt_name in opt_kpi_field_name:
216 | if opt_name in df_file.columns:
217 | kpi_field_name = opt_name
218 | break
219 | if kpi_field_name is None:
220 | logger.error("There is no 'name' or 'serviceName' indicating the kpi_name of the data entries")
221 | raise IndexError
222 | 
223 | if kpi_field_name == 'name' or kpi_field_name == 'kpi_name':
224 | # Sample only from KPIs not selected yet; retrying random.choice could loop forever once all KPIs are taken.
225 | remaining = [k for k in filtered_df[kpi_field_name].unique() if k not in selected_kpi]
226 | if len(remaining) > 0:
227 | kpi = random.choice(remaining)
228 | selected_kpi.add(kpi)
229 | filtered_df = filtered_df[filtered_df[kpi_field_name] == kpi]
230 | else:
231 | continue
232 | elif kpi_field_name == 'serviceName' or kpi_field_name == 'tc' or kpi_field_name == 'service':
233 | if kpi_field_name not in selected_kpi:
234 | selected_kpi.add(kpi_field_name)
235 | else:
236 | continue
237 | filtered_df["group"] = filtered_df[col].apply(lambda x: x // sample_interval)
238 | if 'cmdb_id' not in filtered_df.columns:
239 | filtered_df["resource_name"] = filtered_df[kpi_field_name]
240 | filtered_df = filtered_df.drop(columns=[kpi_field_name])
241 | else:
242 | filtered_df["resource_name"] = filtered_df["cmdb_id"] + "_" + filtered_df[kpi_field_name]
243 | filtered_df = filtered_df.drop(columns=["cmdb_id", kpi_field_name])
244 | if "itemid" in filtered_df.columns:
245 | filtered_df = filtered_df.drop(columns=["itemid"])
246 | if "bomc_id" in filtered_df.columns:
247 | filtered_df = filtered_df.drop(columns=["bomc_id"])
248 | schema = filtered_df.columns
249 | schema = schema.drop("resource_name")
250 | schema = schema.drop('group')
251 | extracted_data = extracted_data + f'\n\n#### {fname}'
252 | extracted_data = extracted_data + f' Schema: ' + ','.join([s for s in schema if s != col]) + '\n'
253 | resource_grouped_df = filtered_df.groupby("resource_name")
254 | for resource_name, resource_df in resource_grouped_df:
255 | resampled_df = resource_df.groupby(resource_df[col] // sample_interval).first()
256 | for field in resampled_df.columns:
257 | if resampled_df[field].dtype in [float, "float64"]:
258 | resampled_df[field] = resampled_df[field].apply(lambda x: round(x, 2))
259 | resampled_df = resampled_df.drop(columns=["resource_name"])
260 | if 'group' in resampled_df.columns:
261 | resampled_df = resampled_df.drop(columns=["group"])
262 | if resampled_df.empty:
263 | extracted_data = extracted_data + f'{resource_name}\n```\n' + "DATA NOT AVAILABLE\n```\n"
264 | else:  # only emit data rows when the resampled series is non-empty
265 | data = resampled_df.drop(columns=[col]).astype(str).agg(','.join, axis=1)
266 | extracted_data = extracted_data + f'{resource_name}\n```\n' + '\n'.join(data) + '\n```\n'
267 | 
268 | t3 = time()
269 | logger.debug(f"{round(t3-t2,1)} seconds for selecting metric data")
270 | return extracted_data, selected_kpi
271 | 
272 | def main(args):
273 | import rca.baseline.rca_agent.prompt.agent_prompt as ap
274 | if args.dataset == "Telecom":
275 | import rca.baseline.rca_agent.prompt.basic_prompt_Telecom as bp
276 | elif args.dataset == "Bank":
277 | import rca.baseline.rca_agent.prompt.basic_prompt_Bank as bp
278 | elif args.dataset == "Market/cloudbed-1" or args.dataset == "Market/cloudbed-2":
279 | import rca.baseline.rca_agent.prompt.basic_prompt_Market as bp
280 | 
281 | inst_file = f"dataset/{args.dataset}/query.csv"
282 | gt_file = f"dataset/{args.dataset}/record.csv"
283 | eval_file = f"test/result/{args.dataset}/balanced_{args.tag}_{args.mode}-{configs['MODEL'].split('/')[-1]}.csv"
284 | obs_path = f"test/monitor/{args.dataset}/balanced_{args.tag}_{args.mode}-{configs['MODEL'].split('/')[-1]}"
285 | unique_obs_path = f"{obs_path}/{uid}"
286 | 
287 | if not os.path.exists(inst_file) or not os.path.exists(gt_file):
288 | raise FileNotFoundError("Please download the dataset first.")
289 | instruct_data = pd.read_csv(inst_file)
290 | gt_data = pd.read_csv(gt_file)
291 | 
292 | if not os.path.exists(f"{unique_obs_path}/prompt"):
293 | os.makedirs(f"{unique_obs_path}/prompt")
294 | if not os.path.exists(eval_file):
295 | if not os.path.exists(f"test/result/{args.dataset}"):
296 | os.makedirs(f"test/result/{args.dataset}")
297 | eval_df = pd.DataFrame(columns=["instruction", "prediction", "groundtruth", "passed", "failed", "score"])
298 | else:
299 | eval_df = pd.read_csv(eval_file)
300 | 
301 | logfile = f"{unique_obs_path}/batch.log"
302 | logger.remove()
303 | logger.add(sys.stdout, colorize=True, enqueue=True, level="INFO")
304 | logger.add(logfile, colorize=True, enqueue=True, level="INFO")
305 | 
306 | scores = {
307 | "total": 0,
308 | "easy": 0,
309 | "middle": 0,
310 | "hard": 0,
311 | }
312 | nums = {
313 | "total": 0,
314 | "easy": 0,
315 | "middle": 0,
316 | "hard": 0,
317 | }
318 | 
319 | logger.info(f"Using dataset: {args.dataset}")
320 | logger.info(f"Using model: {configs['MODEL'].split('/')[-1]}")
321 | logger.info("Start caching dataframes ...")
322 | df_dict, selected_kpi_len, cand = cache_df_dict(args.dataset)
323 | 
324 | for idx, row in instruct_data.iterrows():
325 | 
326 | if idx < args.start_idx:
327 | continue
328 | if idx > args.end_idx:
329 | break
330 | 
331 | instruction = row["instruction"]
332 | timestamp = gt_data.iloc[idx]["timestamp"].astype(int)
333 | date_time = gt_data.iloc[idx]["datetime"].split(" ")[0].replace("-","_")
334 | task_index = row["task_index"]
335 | scoring_points = row["scoring_points"]
336 | logger.debug(scoring_points)
337 | task_id = int(task_index.split('_')[1])
338 | best_score, temp_scores, temp_nums = 0, scores.copy(), nums.copy()  # initialize per task so a fully skipped task leaves the totals unchanged
339 | 
340 | 
if task_id <= 3: 341 | catalog = "easy" 342 | elif task_id <= 6: 343 | catalog = "middle" 344 | elif task_id <= 7: 345 | catalog = "hard" 346 | 347 | 348 | 349 | for i in range(args.sample_num): 350 | uuid = uid + f"_#{idx}-{i}" 351 | promptfile = f"{unique_obs_path}/prompt/{uuid}.txt" 352 | logger.debug('\n' + "#"*80 + f"\n{uuid}: {task_index}\n" + "#"*80) 353 | 354 | period_data = dict() 355 | 356 | if args.dataset != "Telecom": 357 | period_data["log"], _ = extract_period_data(deepcopy(df_dict[date_time]["log"]), 358 | "log", timestamp, 359 | sample_interval=args.sample_interval, 360 | ) 361 | 362 | period_data["trace"], _ = extract_period_data(deepcopy(df_dict[date_time]["trace"]), 363 | "trace", 364 | timestamp, 365 | sample_interval=args.sample_interval, 366 | ) 367 | 368 | selected_kpi = set() 369 | new_kpi = "" 370 | period_data['metric'] = "" 371 | logger.info(f"Sampling Started.") 372 | while len(selected_kpi) < selected_kpi_len: 373 | new_kpi, selected_kpi = extract_period_data(deepcopy(df_dict[date_time]["metric"]), 374 | "metric", 375 | timestamp, 376 | sample_interval=args.sample_interval, 377 | selected_kpi=selected_kpi, 378 | selected_kpi_len=selected_kpi_len 379 | ) 380 | period_data['metric'] += new_kpi 381 | logger.info(f"Selected KPI number: {len(selected_kpi)}\tLimit: {selected_kpi_len}") 382 | 383 | logger.info(f"Sampling Finished. Total tokens: {sum([len(tokenizer.encode(data)) for data in period_data.values()])}") 384 | 385 | try: 386 | if args.mode == "direct": 387 | model = DirectLM(gt_data, cand) 388 | elif args.mode == "cot": 389 | model = CoTLM(gt_data, cand) 390 | 391 | prediction, prompt = model.run(instruction, period_data, args.sample_interval, logger) 392 | with open (promptfile, 'w') as f: 393 | for p in prompt: 394 | f.write(str(p['content'])) 395 | f.write('\n\n') 396 | f.write(str(prediction)) 397 | 398 | new_eval_df = pd.DataFrame([{"row_id": idx, 399 | "task_index": task_index, 400 | "instruction": instruction, 401 | "prediction": prediction, 402 | "groundtruth": '\n'.join([f'{col}: {gt_data.iloc[idx][col]}' for col in gt_data.columns if col != 'description']), 403 | "passed": "N/A", 404 | "failed": "N/A", 405 | "score": "N/A"}]) 406 | eval_df = pd.concat([eval_df, new_eval_df], 407 | ignore_index=True) 408 | eval_df.to_csv(eval_file, 409 | index=False) 410 | 411 | if prediction == "EXCEED!": 412 | passed_criteria = ["EXCEED!"] 413 | failed_criteria = ["EXCEED!"] 414 | score = 0.0 415 | else: 416 | passed_criteria, failed_criteria, score = evaluate(prediction, scoring_points) 417 | logger.info(f"Prediction: {prediction}") 418 | logger.info(f"Scoring Points: {scoring_points}") 419 | logger.info(f"Passed Criteria: {passed_criteria}") 420 | logger.info(f"Failed Criteria: {failed_criteria}") 421 | logger.info(f"Score: {score}") 422 | best_score = max(best_score, score) 423 | 424 | eval_df.loc[eval_df.index[-1], "passed"] = '\n'.join(passed_criteria) 425 | eval_df.loc[eval_df.index[-1], "failed"] = '\n'.join(failed_criteria) 426 | eval_df.loc[eval_df.index[-1], "score"] = score 427 | eval_df.to_csv(eval_file, 428 | index=False) 429 | 430 | temp_scores = scores.copy() 431 | temp_scores[catalog] += best_score 432 | temp_scores["total"] += best_score 433 | temp_nums = nums.copy() 434 | temp_nums[catalog] += 1 435 | temp_nums["total"] += 1 436 | 437 | except Exception as e: 438 | logger.error(e) 439 | continue 440 | 441 | scores = temp_scores 442 | nums = temp_nums 443 | 444 | 445 | 446 | if __name__ == "__main__": 447 | uid = 
datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 448 | parser = argparse.ArgumentParser() 449 | parser.add_argument("--dataset", type=str, default="Market/cloudbed-1") 450 | parser.add_argument("--sample_num", type=int, default=1) 451 | parser.add_argument("--start_idx", type=int, default=0) 452 | parser.add_argument("--end_idx", type=int, default=150) 453 | parser.add_argument("--sample_interval", type=int, default=60) 454 | parser.add_argument("--mode", type=str, default="direct") 455 | parser.add_argument("--tag", type=str, default='lm') 456 | 457 | args = parser.parse_args() 458 | 459 | main(args) 460 | 461 | -------------------------------------------------------------------------------- /rca/run_sampling_oracle.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 4 | sys.path.insert(0, project_root) 5 | import pandas as pd 6 | import argparse 7 | from datetime import datetime 8 | from loguru import logger 9 | from copy import deepcopy 10 | 11 | from rca.baseline.direct_lm import DirectLM 12 | from rca.baseline.cot_lm import CoTLM 13 | from main.evaluate import evaluate 14 | from time import time 15 | from rca.api_router import configs 16 | 17 | def cache_df_dict(dataset_name:str): 18 | 19 | df_dict = dict() 20 | 21 | if dataset_name == "Telecom": 22 | from rca.baseline.oracle_kpis import kpi_Telecom 23 | selected_kpi_dict = kpi_Telecom 24 | 25 | example_df_dict = { 26 | "metric": [], 27 | "trace": [], 28 | } 29 | dataset_path = "Telecom" 30 | 31 | import rca.baseline.rca_agent.prompt.basic_prompt_Telecom as bp 32 | cand = bp.cand 33 | 34 | elif dataset_name == "Bank": 35 | from rca.baseline.oracle_kpis import kpi_Bank 36 | selected_kpi_dict = kpi_Bank 37 | 38 | example_df_dict = { 39 | "log": [], 40 | "metric": [], 41 | "trace": [], 42 | } 43 | dataset_path = "Bank" 44 | 45 | import rca.baseline.rca_agent.prompt.basic_prompt_Bank as bp 46 | cand = bp.cand 47 | 48 | elif dataset_name == "Market/cloudbed-1": 49 | from rca.baseline.oracle_kpis import kpi_Market 50 | selected_kpi_dict = kpi_Market 51 | 52 | example_df_dict = { 53 | "log": [], 54 | "metric": [], 55 | "trace": [], 56 | } 57 | dataset_path = "Market/cloudbed-1" 58 | 59 | import rca.baseline.rca_agent.prompt.basic_prompt_Market as bp 60 | cand = bp.cand 61 | 62 | elif dataset_name == "Market/cloudbed-2": 63 | from rca.baseline.oracle_kpis import kpi_Market 64 | selected_kpi_dict = kpi_Market 65 | 66 | example_df_dict = { 67 | "log": [], 68 | "metric": [], 69 | "trace": [], 70 | } 71 | dataset_path = "Market/cloudbed-2" 72 | 73 | import rca.baseline.rca_agent.prompt.basic_prompt_Market as bp 74 | cand = bp.cand 75 | 76 | for day_time in os.listdir(f"dataset/{dataset_path}/telemetry/"): 77 | if day_time == '.DS_Store': 78 | continue 79 | if day_time not in df_dict: 80 | df_dict[day_time] = deepcopy(example_df_dict) 81 | 82 | for data_type in os.listdir(f"dataset/{dataset_path}/telemetry/{day_time}"): 83 | if data_type == '.DS_Store': 84 | continue 85 | for fname in os.listdir(f"dataset/{dataset_path}/telemetry/{day_time}/{data_type}"): 86 | t0 = time() 87 | cur_df = pd.read_csv(f"dataset/{dataset_path}/telemetry/{day_time}/{data_type}/{fname}") 88 | t1 = time() 89 | logger.debug(f"{round(t1-t0,1)} seconds for reading {fname}") 90 | 91 | cur_df = cur_df.reset_index() 92 | if "timestamp" in cur_df.columns: 93 | col = "timestamp" 94 | elif "startTime" in cur_df.columns: 95 | col = "startTime" 
96 | else: 97 | logger.error("There is no 'startTime' or 'timestamp' indicating the timestamp of the data entries") 98 | raise IndexError 99 | cur_df[col] = cur_df[col].apply(lambda x: int(x // 1000) if len(str(x)) == 13 else x) # 13-digit timestamps are in milliseconds; normalize them to seconds 100 | t2 = time() 101 | logger.debug(f"{round(t2-t1, 1)} seconds for preprocessing DataFrame") 102 | if cur_df.empty: 103 | logger.warning(f"{fname} is empty") 104 | else: 105 | df_dict[day_time][data_type].append((fname, cur_df)) 106 | 107 | return df_dict, selected_kpi_dict, cand 108 | 109 | 110 | def extract_period_data(df_list:list, data_type:str, target_timestamp:int, sample_interval=60, selected_kpi=None) -> str: 111 | 112 | logger.debug(f"Extracting {data_type} data ...") 113 | 114 | extracted_data = "" 115 | for fname, df_file in df_list: 116 | 117 | if "timestamp" in df_file.columns: 118 | col = "timestamp" 119 | elif "startTime" in df_file.columns: 120 | col = "startTime" 121 | else: 122 | logger.error("There is no 'startTime' or 'timestamp' indicating the timestamp of the data entries") 123 | raise IndexError 124 | 125 | t1 = time() 126 | start_timestamp = target_timestamp - target_timestamp % 1800 # snap to the enclosing 30-minute window 127 | end_timestamp = start_timestamp + 1800 128 | filtered_df = df_file[(df_file[col] >= start_timestamp) & (df_file[col] <= end_timestamp)] 129 | filtered_df = filtered_df.drop(columns=["index"]) 130 | 131 | t2 = time() 132 | logger.debug(f"{round(t2-t1,1)} seconds for filtering 30 min data") 133 | 134 | if data_type == "log": 135 | filtered_df = filtered_df.drop(columns=["log_id"]) 136 | filtered_df = filtered_df.drop(columns=["cmdb_id"]) 137 | filtered_df = filtered_df.drop(columns=["log_name"]) 138 | schema = filtered_df.columns 139 | extracted_data = extracted_data + f'\n\n#### {fname}' 140 | extracted_data = extracted_data + ' Schema: ' + ','.join(schema) + '\n' 141 | resampled_df = filtered_df.groupby(filtered_df[col] // (sample_interval/5)).first() # sample logs 5x as densely: one row per sample_interval/5 seconds 142 | if resampled_df.empty: 143 | extracted_data = extracted_data + "DATA NOT AVAILABLE\n" 144 | else: 145 | data = resampled_df.astype(str).agg(','.join, axis=1) 146 | extracted_data = extracted_data + '\n'.join(data) + '\n' 147 | elif data_type == "trace": 148 | opt_traceid_field_name = ["traceId", "trace_id"] 149 | traceid_field_name = None 150 | for opt_name in opt_traceid_field_name: 151 | if opt_name in df_file.columns: 152 | traceid_field_name = opt_name 153 | if traceid_field_name is None: 154 | logger.error("There is no 'traceId' or 'trace_id' indicating the trace_id of the data entries") 155 | raise IndexError 156 | opt_spanid_field_name = ["id", "span_id"] 157 | spanid_field_name = None 158 | for opt_name in opt_spanid_field_name: 159 | if opt_name in df_file.columns: 160 | spanid_field_name = opt_name 161 | if spanid_field_name is None: 162 | logger.error("There is no 'id' or 'span_id' indicating the span_id of the data entries") 163 | raise IndexError 164 | opt_parent_field_name = ["pid", "parent_id", "parent_span"] 165 | parent_field_name = None 166 | for opt_name in opt_parent_field_name: 167 | if opt_name in df_file.columns: 168 | parent_field_name = opt_name 169 | if parent_field_name is None: 170 | logger.error("There is no 'pid', 'parent_id', or 'parent_span' indicating the parent_id of the data entries") 171 | raise IndexError 172 | opt_duration_field_name = ["elapsedTime", "duration"] 173 | duration_field_name = None 174 | for opt_name in opt_duration_field_name: 175 | if opt_name in df_file.columns: 176 | duration_field_name = opt_name 177 | if duration_field_name is None: 178 | logger.error("There is no 
'elapsedTime' or 'duration' indicating the duration of the data entries") 179 | raise IndexError 180 | 181 | filtered_df = filtered_df[[col, traceid_field_name, spanid_field_name, parent_field_name, duration_field_name, "cmdb_id"]] 182 | schema = filtered_df.columns 183 | schema = schema.drop([traceid_field_name]) 184 | extracted_data = extracted_data + f'\n\n#### {fname}' 185 | extracted_data = extracted_data + ' Schema: ' + ','.join([s for s in schema if s != col]) + '\n' 186 | resampled_df = filtered_df.groupby(filtered_df[col] // sample_interval).first() # keep one representative span per sample_interval bucket 187 | trace_ids = resampled_df[traceid_field_name] 188 | trace_dfs = filtered_df[filtered_df[traceid_field_name].isin(trace_ids)] # pull every span belonging to the sampled traces 189 | trace_grouped_df = trace_dfs.groupby(traceid_field_name) 190 | for trace_id, trace_df in trace_grouped_df: 191 | resource_name = f'{trace_id}' 192 | for field in trace_df.columns: 193 | if trace_df[field].dtype in [float, "float64"]: 194 | trace_df[field] = trace_df[field].apply(lambda x: round(x, 2)) # round floats to 2 decimals to keep the prompt compact 195 | trace_df = trace_df.drop(columns=[traceid_field_name, col]) 196 | if 'group' in trace_df.columns: 197 | trace_df = trace_df.drop(columns=["group"]) 198 | if trace_df.empty: 199 | extracted_data = extracted_data + f'Trace ID: {resource_name}\n```\n' + "DATA NOT AVAILABLE\n```\n" 200 | else: 201 | data = trace_df.astype(str).agg(','.join, axis=1) 202 | extracted_data = extracted_data + f'Trace ID: {resource_name}\n```\n' + '\n'.join(data) + '\n```\n' 203 | t3 = time() 204 | logger.debug(f"{round(t3-t2,1)} seconds for extracting trace data") 205 | elif data_type == "metric": 206 | opt_kpi_field_name = ["name", "kpi_name", "serviceName", "tc", "service"] 207 | kpi_field_name = None 208 | for opt_name in opt_kpi_field_name: 209 | if opt_name in df_file.columns: 210 | kpi_field_name = opt_name 211 | break 212 | if kpi_field_name is None: 213 | logger.error("There is no 'name', 'kpi_name', 'serviceName', 'tc', or 'service' indicating the kpi_name of the data entries") 214 | raise IndexError 215 | 216 | if kpi_field_name == 'name' or kpi_field_name == 'kpi_name': 217 | filtered_df = filtered_df[filtered_df[kpi_field_name].isin(selected_kpi)] # restrict to the oracle-selected KPIs 218 | 219 | filtered_df["group"] = filtered_df[col].apply(lambda x: x // sample_interval) 220 | if 'cmdb_id' not in filtered_df.columns: 221 | filtered_df["resource_name"] = filtered_df[kpi_field_name] 222 | filtered_df = filtered_df.drop(columns=[kpi_field_name]) 223 | else: 224 | filtered_df["resource_name"] = filtered_df["cmdb_id"] + "_" + filtered_df[kpi_field_name] 225 | filtered_df = filtered_df.drop(columns=["cmdb_id", kpi_field_name]) 226 | if "itemid" in filtered_df.columns: 227 | filtered_df = filtered_df.drop(columns=["itemid"]) 228 | if "bomc_id" in filtered_df.columns: 229 | filtered_df = filtered_df.drop(columns=["bomc_id"]) 230 | schema = filtered_df.columns 231 | schema = schema.drop("resource_name") 232 | schema = schema.drop('group') 233 | extracted_data = extracted_data + f'\n\n#### {fname}' 234 | extracted_data = extracted_data + ' Schema: ' + ','.join([s for s in schema if s != col]) + '\n' 235 | resource_grouped_df = filtered_df.groupby("resource_name") 236 | for resource_name, resource_df in resource_grouped_df: 237 | resampled_df = resource_df.groupby(resource_df[col] // sample_interval).first() 238 | for field in resampled_df.columns: 239 | if resampled_df[field].dtype in [float, "float64"]: 240 | resampled_df[field] = resampled_df[field].apply(lambda x: round(x, 2)) 241 | resampled_df = resampled_df.drop(columns=["resource_name"]) 242 | if 'group' in resampled_df.columns: 
243 | resampled_df = resampled_df.drop(columns=["group"]) 244 | if resampled_df.empty: 245 | extracted_data = extracted_data + f'{resource_name}\n```\n' + "DATA NOT AVAILABLE\n```\n" 246 | else: 247 | resampled_df = resampled_df.drop(columns=[col]) 248 | extracted_data = extracted_data + f'{resource_name}\n```\n' + '\n'.join(resampled_df.astype(str).agg(','.join, axis=1)) + '\n```\n' 249 | 250 | t3 = time() 251 | logger.debug(f"{round(t3-t2,1)} seconds for selecting metric data") 252 | return extracted_data 253 | 254 | def main(args): 255 | import rca.baseline.rca_agent.prompt.agent_prompt as ap 256 | if args.dataset == "Telecom": 257 | import rca.baseline.rca_agent.prompt.basic_prompt_Telecom as bp 258 | elif args.dataset == "Bank": 259 | import rca.baseline.rca_agent.prompt.basic_prompt_Bank as bp 260 | elif args.dataset == "Market/cloudbed-1" or args.dataset == "Market/cloudbed-2": 261 | import rca.baseline.rca_agent.prompt.basic_prompt_Market as bp 262 | 263 | inst_file = f"dataset/{args.dataset}/query.csv" 264 | gt_file = f"dataset/{args.dataset}/record.csv" 265 | eval_file = f"test/result/{args.dataset}/oracle_{args.tag}_{args.mode}-{configs['MODEL'].split('/')[-1]}.csv" 266 | obs_path = f"test/monitor/{args.dataset}/oracle_{args.tag}_{args.mode}-{configs['MODEL'].split('/')[-1]}" 267 | unique_obs_path = f"{obs_path}/{uid}" 268 | 269 | if not os.path.exists(inst_file) or not os.path.exists(gt_file): 270 | raise FileNotFoundError("Dataset files not found. Please download the dataset first.") 271 | instruct_data = pd.read_csv(inst_file) 272 | gt_data = pd.read_csv(gt_file) 273 | 274 | if not os.path.exists(f"{unique_obs_path}/prompt"): 275 | os.makedirs(f"{unique_obs_path}/prompt") 276 | if not os.path.exists(eval_file): 277 | if not os.path.exists(f"test/result/{args.dataset}"): 278 | os.makedirs(f"test/result/{args.dataset}") 279 | eval_df = pd.DataFrame(columns=["row_id", "task_index", "instruction", "prediction", "groundtruth", "passed", "failed", "score"]) 280 | else: 281 | eval_df = pd.read_csv(eval_file) 282 | 283 | logfile = f"{unique_obs_path}/batch.log" 284 | logger.remove() 285 | logger.add(sys.stdout, colorize=True, enqueue=True, level="INFO") 286 | logger.add(logfile, colorize=True, enqueue=True, level="INFO") 287 | 288 | scores = { 289 | "total": 0, 290 | "easy": 0, 291 | "middle": 0, 292 | "hard": 0, 293 | } 294 | nums = { 295 | "total": 0, 296 | "easy": 0, 297 | "middle": 0, 298 | "hard": 0, 299 | } 300 | 301 | logger.info(f"Using dataset: {args.dataset}") 302 | logger.info(f"Using model: {configs['MODEL'].split('/')[-1]}") 303 | logger.info("Start caching dataframes ...") 304 | df_dict, selected_kpi_dict, cand = cache_df_dict(args.dataset) 305 | 306 | selected_kpi = [] 307 | for typ, ls in selected_kpi_dict.items(): 308 | selected_kpi.extend(ls) 309 | selected_kpi = set(selected_kpi) 310 | 311 | for idx, row in instruct_data.iterrows(): 312 | 313 | if idx < args.start_idx: 314 | continue 315 | if idx > args.end_idx: 316 | break 317 | 318 | instruction = row["instruction"] 319 | timestamp = gt_data.iloc[idx]["timestamp"].astype(int) 320 | date_time = gt_data.iloc[idx]["datetime"].split(" ")[0].replace("-","_") 321 | task_index = row["task_index"] 322 | scoring_points = row["scoring_points"] 323 | task_id = int(task_index.split('_')[1]) 324 | best_score = 0 325 | 326 | if task_id <= 3: 327 | catalog = "easy" 328 | elif task_id <= 6: 329 | catalog = "middle" 330 | else: # task ids above 6 are all treated as hard 331 | catalog = "hard" 332 | temp_scores, temp_nums = scores.copy(), nums.copy() # safe defaults for the commit after the sample loop in case every sample raises 333 | for i in range(args.sample_num): 334 | uuid = uid + f"_#{idx}-{i}" 335 | promptfile = 
f"{unique_obs_path}/prompt/{uuid}.txt" 336 | logger.debug('\n' + "#"*80 + f"\n{uuid}: {task_index}\n" + "#"*80) 337 | 338 | period_data = dict() 339 | 340 | if args.dataset != "Telecom": 341 | period_data["log"] = extract_period_data(deepcopy(df_dict[date_time]["log"]), 342 | "log", timestamp, 343 | sample_interval=args.sample_interval, 344 | ) 345 | 346 | 347 | period_data["metric"] = extract_period_data(deepcopy(df_dict[date_time]["metric"]), 348 | "metric", 349 | timestamp, 350 | sample_interval=args.sample_interval, 351 | selected_kpi=selected_kpi, 352 | ) 353 | 354 | period_data["trace"] = extract_period_data(deepcopy(df_dict[date_time]["trace"]), 355 | "trace", 356 | timestamp, 357 | sample_interval=args.sample_interval, 358 | ) 359 | 360 | try: 361 | if args.mode == "direct": 362 | model = DirectLM(gt_data, cand) 363 | elif args.mode == "cot": 364 | model = CoTLM(gt_data, cand) 365 | 366 | prediction, prompt = model.run(instruction, period_data, args.sample_interval, logger) 367 | with open (promptfile, 'w') as f: 368 | for p in prompt: 369 | f.write(str(p['content'])) 370 | f.write('\n\n') 371 | f.write(str(prediction)) 372 | 373 | new_eval_df = pd.DataFrame([{"row_id": idx, 374 | "task_index": task_index, 375 | "instruction": instruction, 376 | "prediction": prediction, 377 | "groundtruth": '\n'.join([f'{col}: {gt_data.iloc[idx][col]}' for col in gt_data.columns if col != 'description']), 378 | "passed": "N/A", 379 | "failed": "N/A", 380 | "score": "N/A"}]) 381 | eval_df = pd.concat([eval_df, new_eval_df], 382 | ignore_index=True) 383 | eval_df.to_csv(eval_file, 384 | index=False) 385 | 386 | if prediction == "EXCEED!": 387 | passed_criteria = ["EXCEED!"] 388 | failed_criteria = ["EXCEED!"] 389 | score = 0.0 390 | else: 391 | passed_criteria, failed_criteria, score = evaluate(prediction, scoring_points) 392 | logger.info(f"Prediction: {prediction}") 393 | logger.info(f"Scoring Points: {scoring_points}") 394 | logger.info(f"Passed Criteria: {passed_criteria}") 395 | logger.info(f"Failed Criteria: {failed_criteria}") 396 | logger.info(f"Score: {score}") 397 | best_score = max(best_score, score) 398 | 399 | eval_df.loc[eval_df.index[-1], "passed"] = '\n'.join(passed_criteria) 400 | eval_df.loc[eval_df.index[-1], "failed"] = '\n'.join(failed_criteria) 401 | eval_df.loc[eval_df.index[-1], "score"] = score 402 | eval_df.to_csv(eval_file, 403 | index=False) 404 | 405 | temp_scores = scores.copy() 406 | temp_scores[catalog] += best_score 407 | temp_scores["total"] += best_score 408 | temp_nums = nums.copy() 409 | temp_nums[catalog] += 1 410 | temp_nums["total"] += 1 411 | 412 | except Exception as e: 413 | logger.error(e) 414 | continue 415 | 416 | scores = temp_scores 417 | nums = temp_nums 418 | 419 | 420 | 421 | if __name__ == "__main__": 422 | uid = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 423 | parser = argparse.ArgumentParser() 424 | parser.add_argument("--dataset", type=str, default="Market/cloudbed-1") 425 | parser.add_argument("--sample_num", type=int, default=1) 426 | parser.add_argument("--start_idx", type=int, default=0) 427 | parser.add_argument("--end_idx", type=int, default=150) 428 | parser.add_argument("--sample_interval", type=int, default=60) 429 | parser.add_argument("--mode", type=str, default="direct") 430 | parser.add_argument("--tag", type=str, default='lm') 431 | 432 | args = parser.parse_args() 433 | 434 | main(args) 435 | 436 | -------------------------------------------------------------------------------- /rca/test.sh: 
-------------------------------------------------------------------------------- 1 | python -m main.evaluate \ 2 | -p \ 3 | rca/archive/agent-Bank.csv \ 4 | rca/archive/agent-Market-cloudbed-1.csv \ 5 | rca/archive/agent-Market-cloudbed-2.csv \ 6 | rca/archive/agent-Telecom.csv \ 7 | -q \ 8 | dataset/Bank/query.csv \ 9 | dataset/Market/cloudbed-1/query.csv \ 10 | dataset/Market/cloudbed-2/query.csv \ 11 | dataset/Telecom/query.csv \ 12 | -r \ 13 | test/agent_claude.csv -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | anthropic==0.39.0 2 | ipython==8.17.2 3 | loguru==0.7.2 4 | nbformat==5.10.4 5 | openai==1.54.3 6 | pandas==1.5.3 7 | protobuf==5.28.3 8 | pytz==2022.7 9 | PyYAML==6.0.2 10 | tiktoken==0.7.0 11 | --------------------------------------------------------------------------------