├── .gitignore ├── .python-version ├── CHANGELOG.md ├── CONTRIBUTION.md ├── Dockerfile ├── FAQ.md ├── README.md ├── README_EN.md ├── images ├── chatwise_demo.png ├── chatwise_inter.png ├── cherry_studio_demo.png ├── cherry_studio_inter.png ├── cursor_demo.png ├── cursor_inter.png ├── cursor_tools.png ├── find_slowest_trace.png ├── fuzzy_search_and_get_logs.png ├── npx_debug.png └── search_log_store.png ├── license ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── requirements-dev.txt ├── requirements.txt ├── sample └── config │ └── knowledge_config.json ├── src └── mcp_server_aliyun_observability │ ├── __init__.py │ ├── __main__.py │ ├── api_error.py │ ├── server.py │ ├── toolkit │ ├── arms_toolkit.py │ ├── cms_toolkit.py │ ├── sls_toolkit.py │ └── util_toolkit.py │ └── utils.py ├── tests ├── __init__.py ├── conftest.py ├── test_arms_toolkit.py ├── test_cms_toolkit.py └── test_sls_toolkit.py └── uv.lock /.gitignore: -------------------------------------------------------------------------------- 1 | .pycache 2 | __pycache__ 3 | *.pyc 4 | *.pyo 5 | *.pyd 6 | *.pyw 7 | *.pyz 8 | *.pywz 9 | *.pyzw 10 | .env 11 | .vscode 12 | .idea 13 | dist/* 14 | build 15 | .venv 16 | **/*.egg-info 17 | **/*.egg 18 | **/*.dist-info 19 | **/*.whl 20 | **/*.tar.gz 21 | **/*.zip 22 | .cursor 23 | .pytest_cache 24 | **/*.tar.bz2 25 | uv.lock -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10 -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 版本更新 2 | 3 | ## 0.2.6 4 | - 增加 用户私有知识库 RAG 支持,在启动 MCP Server 时,设置可选参数--knowledge-config ./knowledge_config.json,配置文件样例请参见sample/config/knowledge_config.json 5 | 6 | ## 0.2.5 7 | - 增加 ARMS 慢 Trace 分析工具 8 | 9 | ## 0.2.4 10 | - 增加 ARMS 火焰图工具,支持单火焰图分析以及差分火焰图 11 | 12 | ## 0.2.3 13 | - 增加 ARMS 应用详情工具 14 | - 优化一些tool 的命名,更加规范,提升模型解析成功率 15 | 16 | ## 0.2.2 17 | - 优化 SLS 查询工具,时间范围不显示传入,由SQL 生成工具直接返回判定 18 | - sls_list_projects 工具增加个数限制,并且做出提示 19 | 20 | ## 0.2.1 21 | - 优化 SLS 查询工具,增加 from_timestamp 和 to_timestamp 参数,确保查询语句的正确性 22 | - 增加 SLS 日志查询的 prompts 23 | 24 | ## 0.2.0 25 | - 增加 cms_translate_natural_language_to_promql 工具,根据自然语言生成 promql 查询语句 26 | 27 | ## 0.1.9 28 | - 支持 STS Token 方式登录,可通过环境变量ALIBABA_CLOUD_SECURITY_TOKEN 指定 29 | - 修改 README.md 文档,增加 Cursor,Cline 等集成说明以及 UV 命令等说明 30 | 31 | ## 0.1.8 32 | - 优化 SLS 列出日志库工具,添加日志库类型验证,确保参数符合规范 33 | 34 | 35 | ## 0.1.7 36 | - 优化错误处理机制,简化错误代码,提高系统稳定性 37 | - 改进 SLS 日志服务相关工具 38 | - 增强 sls_list_logstores 工具,添加日志库类型验证,确保参数符合规范 39 | - 完善日志库类型描述,明确区分日志类型(logs)和指标类型(metrics) 40 | - 优化指标类型日志库筛选逻辑,仅当用户明确需要时才返回指标类型 41 | 42 | ## 0.1.6 43 | ### 工具列表 44 | - 增加 SQL 诊断工具, 当 SLS 查询语句执行失败时,可以调用该工具,根据错误信息,生成诊断结果。诊断结果会包含查询语句的正确性、性能分析、优化建议等信息。 45 | 46 | 47 | ## 0.1.0 48 | 本次发布版本为 0.1.0,以新增工具为主,主要包含 SLS 日志服务和 ARMS 应用实时监控服务相关工具。 49 | 50 | 51 | ### 工具列表 52 | 53 | - 增加 SLS 日志服务相关工具 54 | - `sls_describe_logstore` 55 | - 获取 SLS Logstore 的索引信息 56 | - `sls_list_projects` 57 | - 获取 SLS 项目列表 58 | - `sls_list_logstores` 59 | - 获取 SLS Logstore 列表 60 | - `sls_describe_logstore` 61 | - 获取 SLS Logstore 的索引信息 62 | - `sls_execute_query` 63 | - 执行SLS 日志查询 64 | - `sls_translate_natural_language_to_query` 65 | - 翻译自然语言为SLS 查询语句 66 | 67 | - 增加 ARMS 应用实时监控服务相关工具 68 | - `arms_search_apps` 69 | - 搜索 ARMS 应用 70 | - `arms_generate_trace_query` 71 | - 根据自然语言生成 trace 查询语句 72 | 73 | ### 场景举例 74 | 75 | - 场景一: 快速查询某个 
logstore 相关结构 76 | - 使用工具: 77 | - `sls_list_logstores` 78 | - `sls_describe_logstore` 79 | ![image](./images/search_log_store.png) 80 | 81 | 82 | - 场景二: 模糊查询最近一天某个 logstore下面访问量最高的应用是什么 83 | - 分析: 84 | - 需要判断 logstore 是否存在 85 | - 获取 logstore 相关结构 86 | - 根据要求生成查询语句(对于语句用户可确认修改) 87 | - 执行查询语句 88 | - 根据查询结果生成响应 89 | - 使用工具: 90 | - `sls_list_logstores` 91 | - `sls_describe_logstore` 92 | - `sls_translate_natural_language_to_query` 93 | - `sls_execute_query` 94 | ![image](./images/fuzzy_search_and_get_logs.png) 95 | 96 | 97 | - 场景三: 查询 ARMS 某个应用下面响应最慢的几条 Trace 98 | - 分析: 99 | - 需要判断应用是否存在 100 | - 获取应用相关结构 101 | - 根据要求生成查询语句(对于语句用户可确认修改) 102 | - 执行查询语句 103 | - 根据查询结果生成响应 104 | - 使用工具: 105 | - `arms_search_apps` 106 | - `arms_generate_trace_query` 107 | - `sls_translate_natural_language_to_query` 108 | - `sls_execute_query` 109 | ![image](./images/find_slowest_trace.png) 110 | 111 | -------------------------------------------------------------------------------- /CONTRIBUTION.md: -------------------------------------------------------------------------------- 1 | # MCP 贡献指南 2 | 3 | ## 步骤 4 | 1. 从 master 分支创建一个分支 5 | 2. 在分支上进行开发测试 6 | 3. 测试完毕之后提交PR 7 | 4. 合并PR到Release分支 8 | 5. 基于 Release 分支发布新版本 9 | 6. 更新 master 分支 10 | 7. 生成版本 tag 11 | 12 | ## 项目结构 13 | 14 | ``` 15 | mcp_server_aliyun_observability/ 16 | ├── src/ 17 | │ ├── mcp_server_aliyun_observability/ 18 | │ │ ├── server.py 19 | │ │ ├── toolkit/ 20 | │ │ │ ├── sls_toolkit.py 21 | │ │ │ ├── arms_toolkit.py 22 | │ │ │ └── util_toolkit.py 23 | │ │ └── utils.py 24 | │ │ └── api_error.py 25 | │ └── tests/ 26 | │ │ ├── test_sls_toolkit.py 27 | │ │ └── test_arms_toolkit.py 28 | │ └── conftest.py 29 | ``` 30 | 1. server.py 是 MCP 服务端代码,负责处理 MCP 请求 31 | 2. toolkit 目录下是 MCP 工具所在,按照产品来组织文件,比如 `src/mcp_server_aliyun_observability/toolkit/sls_toolkit.py` 来定义SLS相关的工具,`src/mcp_server_aliyun_observability/toolkit/arms_toolkit.py` 来定义ARMS相关的工具。 32 | 3. api_error.py 是一些OpenApi 错误码的定义,如果你的工具实现直接调用了阿里云OpenApi,可以在这里定义错误码来给出友好提示 33 | 4. utils.py 是一些工具类 34 | 5. tests 目录下是测试用例 35 | 36 | ## 如何增加一个 MCP 工具 37 | 38 | Python 版本要求 >=3.10(MCP SDK 的版本要求),建议通过venv或者 conda 来创建虚拟环境 39 | 40 | ## 任务拆解 41 | 42 | 1. 首先需要明确提供什么样的场景,然后再根据场景拆解需要提供什么功能 43 | 2. 对于复杂的场景不建议提供一个工具,而是拆分成多个工具,然后由 LLM 来组合完成任务 44 | - 好处:提升工具的执行成功率 45 | - 如果其中一步失败,模型也可以尝试纠正 46 | - 示例:查询 APM 一个应用的慢调用可拆解为查询应用信息、生成查询慢调用 SQL、执行查询慢调用 SQL 等步骤 47 | 3. 尽量复用已有工具,不要新增相同含义的工具 48 | 49 | ## 工具定义 50 | 1. 新增的工具位于 `src/mcp_server_aliyun_observability/toolkit` 目录下,通过增加 `@self.server.tool()` 注解来定义一个工具。 51 | 2. 当前可按照产品来组织文件,比如 `src/mcp_server_aliyun_observability/toolkit/sls_toolkit.py` 来定义SLS相关的工具,`src/mcp_server_aliyun_observability/toolkit/arms_toolkit.py` 来定义ARMS相关的工具。 52 | 3. 工具上需要增加@tool 注解 53 | 54 | ### 1. 工具命名 55 | 56 | * 格式为 `{product_name}_{function_name}` 57 | * 示例:`sls_describe_logstore`、`arms_search_apps` 等 58 | * 优势:方便模型识别,当用户集成的工具较多时不会造成歧义和冲突 59 | 60 | ### 2. 
参数描述 61 | 62 | * 需要尽可能详细,包括输入输出明确定义、示例、使用指导 63 | * 参数使用 pydantic 的模型来定义,示例: 64 | 65 | ```python 66 | @self.server.tool() 67 | def sls_list_projects( 68 | ctx: Context, 69 | project_name_query: str = Field( 70 | None, description="project name,fuzzy search" 71 | ), 72 | limit: int = Field( 73 | default=10, description="limit,max is 100", ge=1, le=100 74 | ), 75 | region_id: str = Field(default=..., description="aliyun region id"), 76 | ) -> list[dict[str, Any]]: 77 | ``` 78 | 79 | * 参数注意事项: 80 | - 参数个数尽量控制在五个以内,超过需考虑拆分工具 81 | - 相同含义字段定义保持一致(避免一会叫 `project_name`,一会叫 `project`) 82 | - 参数类型使用基础类型(str, int, list, dict 等),不使用自定义类型 83 | - 如果参数可选值是固定枚举类,在字段描述中要说明可选择的值,同时在代码方法里面也要增加可选值的校验 84 | 85 | ### 3. 返回值设计 86 | 87 | * 优先使用基础类型,不使用自定义类型 88 | * 控制返回内容长度,特别是数据查询类场景考虑分页返回,防止用户上下文占用过大 89 | * 返回内容字段清晰,数据类最好转换为明确的 key-value 形式 90 | * 针对无返回值的情况,比如数据查询为空,不要直接返回空列表,可以返回文本提示比如 `"没有找到相关数据"`供大模型使用 91 | 92 | ### 4. 异常处理 93 | 94 | * 直接调用 API 且异常信息清晰的情况下可不做处理,直接抛出原始错误日志有助于模型识别 95 | * 如遇 SYSTEM_ERROR 等模糊不清的异常,应处理后返回友好提示 96 | * 做好重试机制,比如网络抖动、服务端限流等,避免模型因此类问题而重复调用 97 | 98 | ### 5. 工具描述 99 | 100 | * 添加工具描述有两种方法: 101 | - 在 `@self.server.tool()` 中增加 description 参数 102 | - 使用 Python 的 docstring 描述 103 | * 描述内容应包括:功能概述、使用场景、返回数据结构、查询示例、参数说明等,示例: 104 | 105 | ``` 106 | 列出阿里云日志服务中的所有项目。 107 | 108 | ## 功能概述 109 | 110 | 该工具可以列出指定区域中的所有SLS项目,支持通过项目名进行模糊搜索。如果不提供项目名称,则返回该区域的所有项目。 111 | 112 | ## 使用场景 113 | 114 | - 当需要查找特定项目是否存在时 115 | - 当需要获取某个区域下所有可用的SLS项目列表时 116 | - 当需要根据项目名称的部分内容查找相关项目时 117 | 118 | ## 返回数据结构 119 | 120 | 返回的项目信息包含: 121 | - project_name: 项目名称 122 | - description: 项目描述 123 | - region_id: 项目所在区域 124 | 125 | ## 查询示例 126 | 127 | - "有没有叫 XXX 的 project" 128 | - "列出所有SLS项目" 129 | 130 | Args: 131 | ctx: MCP上下文,用于访问SLS客户端 132 | project_name_query: 项目名称查询字符串,支持模糊搜索 133 | limit: 返回结果的最大数量,范围1-100,默认10 134 | region_id: 阿里云区域ID 135 | 136 | Returns: 137 | 包含项目信息的字典列表,每个字典包含project_name、description和region_id 138 | ``` 139 | * 可以使用 LLM 生成初步描述,然后根据需要进行调整完善 140 | 141 | ### 如何测试 142 | 143 | #### [阶段1] 不基于 LLM,使用测试用例测试 144 | 145 | 1. 补充下测试用例,在 tests目录下,可有参考 test_sls_toolkit.py 的实现 146 | 2. 使用 `pytest` 运行测试用例,保证功能是正确可用 147 | 148 | #### [阶段2] 基于 LLM,使用测试用例测试 149 | 1. 通过 Cursor,Client 等客户端来测试和大模型集成后的最终效果 -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim 2 | 3 | WORKDIR /app 4 | 5 | # 复制项目文件 6 | COPY README.md pyproject.toml ./ 7 | COPY src/ ./src/ 8 | 9 | # 安装依赖 10 | RUN pip install --no-cache-dir -e . 11 | 12 | # 暴露 MCP 服务器使用的任何端口 13 | # EXPOSE 8080 14 | 15 | # 设置环境变量 16 | ENV PYTHONUNBUFFERED=1 17 | 18 | # 运行服务器 19 | ENTRYPOINT ["python", "-m", "mcp_server_sls"] -------------------------------------------------------------------------------- /FAQ.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## CherryStudio 问题 4 | ### 使用 SSE 访问时候,提示 "启动失败" Error invoking remote method 'mcp::list-tools':Error: SSE error: Non-200 status code (404)" 5 | 6 | 这个一般是端口被其他服务占用,可以检查下端口是否被占用,或者更换端口。 7 | 8 | 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 阿里云可观测MCP服务 2 |

3 | 中文自述文件 4 | 英文自述文件 6 | 7 | ### 简介 8 | 9 | 阿里云可观测 MCP服务,提供了一系列访问阿里云可观测各产品的工具能力,覆盖产品包含阿里云日志服务SLS、阿里云应用实时监控服务ARMS、阿里云云监控等,任意支持 MCP 协议的智能体助手都可快速接入。支持的产品如下: 10 | 11 | - [阿里云日志服务SLS](https://help.aliyun.com/zh/sls/product-overview/what-is-log-service) 12 | - [阿里云应用实时监控服务ARMS](https://help.aliyun.com/zh/arms/?scm=20140722.S_help@@%E6%96%87%E6%A1%A3@@34364._.RL_arms-LOC_2024NSHelpLink-OR_ser-PAR1_215042f917434789732438827e4665-V_4-P0_0-P1_0) 13 | 14 | 目前提供的 MCP 工具以阿里云日志服务为主,其他产品会陆续支持,工具详细如下: 15 | 16 | ### 版本记录 17 | 可以查看 [CHANGELOG.md](./CHANGELOG.md) 18 | 19 | ### 常见问题 20 | 可以查看 [FAQ.md](./FAQ.md) 21 | 22 | ### 工具列表 23 | #### 日志相关 24 | | 工具名称 | 用途 | 关键参数 | 最佳实践 | 25 | |---------|------|---------|---------| 26 | | `sls_list_projects` | 列出SLS项目,支持模糊搜索和分页 | `projectName`:项目名称(可选,模糊搜索)
`limit`:返回项目数量上限(默认50,范围1-100)
`regionId`:阿里云区域ID | - 在不确定可用项目时,首先使用此工具
- 使用合理的`limit`值避免返回过多结果 | 27 | | `sls_list_logstores` | 列出项目内的日志存储,支持名称模糊搜索 | `project`:SLS项目名称(必需)
`logStore`:日志存储名称(可选,模糊搜索)
`limit`:返回结果数量上限(默认10)
`isMetricStore`:是否筛选指标存储
`logStoreType`:日志存储类型
`regionId`:阿里云区域ID | - 确定项目后使用此工具查找相关日志存储
- 可通过`logStoreType`筛选特定类型日志存储 | 28 | | `sls_describe_logstore` | 检索日志存储的结构和索引信息 | `project`:SLS项目名称(必需)
`logStore`:SLS日志存储名称(必需)
`regionId`:阿里云区域ID | - 在查询前使用此工具了解可用字段及其类型
- 检查所需字段是否启用了索引 | 29 | | `sls_execute_sql_query` | 在指定时间范围内对日志存储执行SQL查询 | `project`:SLS项目名称(必需)
`logStore`:SLS日志存储名称(必需)
`query`:SQL查询语句(必需)
`fromTimestampInSeconds`:查询开始时间戳(必需)
`toTimestampInSeconds`:查询结束时间戳(必需)
`limit`:返回结果数量上限(默认10)
`regionId`:阿里云区域ID | - 使用适当的时间范围优化查询性能
- 限制返回结果数量避免获取过多数据 | 30 | | `sls_translate_text_to_sql_query` | 将自然语言描述转换为SLS SQL查询语句 | `text`:查询的自然语言描述(必需)
`project`:SLS项目名称(必需)
`logStore`:SLS日志存储名称(必需)
`regionId`:阿里云区域ID | - 适用于不熟悉SQL语法的用户
- 对于复杂查询,可能需要优化生成的SQL | 31 | | `sls_diagnose_query` | 诊断SLS查询问题,提供失败原因分析 | `query`:待诊断的SLS查询(必需)
`errorMessage`:查询失败的错误信息(必需)
`project`:SLS项目名称(必需)
`logStore`:SLS日志存储名称(必需)
`regionId`:阿里云区域ID | - 查询失败时使用此工具了解根本原因
- 根据诊断建议修改查询语句 | 32 | 33 | ##### 应用相关 34 | | 工具名称 | 用途 | 关键参数 | 最佳实践 | 35 | |---------|------|---------|---------| 36 | | `arms_search_apps` | 根据应用名称搜索ARMS应用 | `appNameQuery`: 应用名称查询字符串(必需)
`regionId`: 阿里云区域ID(必需,格式:'cn-hangzhou')
`pageSize`: 每页结果数量(默认:20,范围:1-100)
`pageNumber`: 页码(默认:1) | - 用于查找特定名称的应用
- 用于获取其他ARMS操作所需的应用PID
- 使用合理的分页参数优化查询结果
- 查看用户拥有的应用列表 | 37 | | `arms_generate_trace_query` | 根据自然语言问题生成ARMS追踪数据的SLS查询 | `user_id`: 阿里云账户ID(必需)
`pid`: 应用PID(必需)
`region_id`: 阿里云区域ID(必需)
`question`: 关于追踪的自然语言问题(必需) | - 用于查询应用的追踪信息
- 分析应用性能问题
- 跟踪特定请求的执行路径
- 分析服务调用关系
- 集成了自动重试机制处理瞬态错误 | 38 | | `arms_get_application_info` | 获取特定ARMS应用的详细信息 | `pid`: 应用PID(必需)
`regionId`: 阿里云区域ID(必需) | - 当用户明确请求应用信息时使用
- 确定应用的开发语言
- 在执行其他操作前先获取应用基本信息 | 39 | | `arms_profile_flame_analysis` | 分析ARMS应用火焰图性能热点 | `pid`: 应用PID(必需)
`startMs`: 分析开始时间戳(必需)
`endMs`: 分析结束时间戳(必需)
`profileType`: 分析类型,如'cpu'、'memory'(默认:'cpu')
`ip`: 服务主机IP(可选)
`thread`: 线程ID(可选)
`threadGroup`: 线程组(可选)
`regionId`: 阿里云区域ID(必需) | - 用于分析应用性能热点问题
- 支持CPU和内存类型的性能分析
- 可筛选特定IP、线程或线程组
- 适用于Java和Go应用 | 40 | | `arms_diff_profile_flame_analysis` | 对比不同时间段的火焰图性能变化 | `pid`: 应用PID(必需)
`currentStartMs`: 当前时间段开始时间戳(必需)
`currentEndMs`: 当前时间段结束时间戳(必需)
`referenceStartMs`: 参考时间段开始时间戳(必需)
`referenceEndMs`: 参考时间段结束时间戳(必需)
`profileType`: 分析类型,如'cpu'、'memory'(默认:'cpu')
`ip`: 服务主机IP(可选)
`thread`: 线程ID(可选)
`threadGroup`: 线程组(可选)
`regionId`: 阿里云区域ID(必需) | - 用于发布前后性能对比
- 分析性能优化效果
- 识别性能退化点
- 支持CPU和内存类型的性能对比
- 适用于Java和Go应用 | 41 | 42 | ##### 指标相关 43 | 44 | | 工具名称 | 用途 | 关键参数 | 最佳实践 | 45 | |---------|------|---------|---------| 46 | | `cms_translate_text_to_promql` | 将自然语言描述转换为PromQL查询语句 | `text`: 要转换的自然语言文本(必需)
`project`: SLS项目名称(必需)
`metricStore`: SLS指标存储名称(必需)
`regionId`: 阿里云区域ID(必需) | - 提供清晰、具体的指标描述
- 如已知,可在描述中提及特定的指标名称、标签或操作
- 排除项目或指标存储名称本身
- 检查并优化生成的查询以提高准确性和性能 | 47 | 48 | 49 | ### 权限要求 50 | 51 | 为了确保 MCP Server 能够成功访问和操作您的阿里云可观测性资源,您需要配置以下权限: 52 | 53 | 1. **阿里云访问密钥 (AccessKey)**: 54 | * 服务运行需要有效的阿里云 AccessKey ID 和 AccessKey Secret。 55 | * 获取和管理 AccessKey,请参考 [阿里云 AccessKey 管理官方文档](https://help.aliyun.com/document_detail/53045.html)。 56 | 57 | 2. 当你初始化时候不传入 AccessKey 和 AccessKey Secret 时,会使用[默认凭据链进行登录](https://www.alibabacloud.com/help/zh/sdk/developer-reference/v2-manage-python-access-credentials#62bf90d04dztq) 58 | 1. 如果环境变量 中的ALIBABA_CLOUD_ACCESS_KEY_ID 和 ALIBABA_CLOUD_ACCESS_KEY_SECRET均存在且非空,则使用它们作为默认凭据。 59 | 2. 如果同时设置了ALIBABA_CLOUD_ACCESS_KEY_ID、ALIBABA_CLOUD_ACCESS_KEY_SECRET和ALIBABA_CLOUD_SECURITY_TOKEN,则使用STS Token作为默认凭据。 60 | 61 | 3. **RAM 授权 (重要)**: 62 | * 与 AccessKey 关联的 RAM 用户或角色**必须**被授予访问相关云服务所需的权限。 63 | * **强烈建议遵循"最小权限原则"**:仅授予运行您计划使用的 MCP 工具所必需的最小权限集,以降低安全风险。 64 | * 根据您需要使用的工具,参考以下文档进行权限配置: 65 | * **日志服务 (SLS)**:如果您需要使用 `sls_*` 相关工具,请参考 [日志服务权限说明](https://help.aliyun.com/zh/sls/overview-8),并授予必要的读取、查询等权限。 66 | * **应用实时监控服务 (ARMS)**:如果您需要使用 `arms_*` 相关工具,请参考 [ARMS 权限说明](https://help.aliyun.com/zh/arms/security-and-compliance/overview-8?scm=20140722.H_74783._.OR_help-T_cn~zh-V_1),并授予必要的查询权限。 67 | * 请根据您的实际应用场景,精细化配置所需权限。 68 | 69 | ### 安全与部署建议 70 | 71 | 请务必关注以下安全事项和部署最佳实践: 72 | 73 | 1. **密钥安全**: 74 | * 本 MCP Server 在运行时会使用您提供的 AccessKey 调用阿里云 OpenAPI,但**不会以任何形式存储您的 AccessKey**,也不会将其用于设计功能之外的任何其他用途。 75 | 76 | 2. **访问控制 (关键)**: 77 | * 当您选择通过 **SSE (Server-Sent Events) 协议** 访问 MCP Server 时,**您必须自行负责该服务接入点的访问控制和安全防护**。 78 | * **强烈建议**将 MCP Server 部署在**内部网络或受信环境**中,例如您的私有 VPC (Virtual Private Cloud) 内,避免直接暴露于公共互联网。 79 | * 推荐的部署方式是使用**阿里云函数计算 (FC)**,并配置其网络设置为**仅 VPC 内访问**,以实现网络层面的隔离和安全。 80 | * **注意**:**切勿**在没有任何身份验证或访问控制机制的情况下,将配置了您 AccessKey 的 MCP Server SSE 端点暴露在公共互联网上,这会带来极高的安全风险。 81 | 82 | ### 使用说明 83 | 84 | 85 | 在使用 MCP Server 之前,需要先获取阿里云的 AccessKeyId 和 AccessKeySecret,请参考 [阿里云 AccessKey 管理](https://help.aliyun.com/document_detail/53045.html) 86 | 87 | 88 | #### 使用 pip 安装 89 | > ⚠️ 需要 Python 3.10 及以上版本。 90 | 91 | 直接使用 pip 安装即可,安装命令如下: 92 | 93 | ```bash 94 | pip install mcp-server-aliyun-observability 95 | ``` 96 | 1. 安装之后,直接运行即可,运行命令如下: 97 | 98 | ```bash 99 | python -m mcp_server_aliyun_observability --transport sse --access-key-id --access-key-secret 100 | ``` 101 | 可通过命令行传递指定参数: 102 | - `--transport` 指定传输方式,可选值为 `sse` 或 `stdio`,默认值为 `stdio` 103 | - `--access-key-id` 指定阿里云 AccessKeyId,不指定时会使用环境变量中的ALIBABA_CLOUD_ACCESS_KEY_ID 104 | - `--access-key-secret` 指定阿里云 AccessKeySecret,不指定时会使用环境变量中的ALIBABA_CLOUD_ACCESS_KEY_SECRET 105 | - `--log-level` 指定日志级别,可选值为 `DEBUG`、`INFO`、`WARNING`、`ERROR`,默认值为 `INFO` 106 | - `--transport-port` 指定传输端口,默认值为 `8000`,仅当 `--transport` 为 `sse` 时有效 107 | 108 | 2. 使用uv 命令启动 109 | 可以指定下版本号,会自动拉取对应依赖,默认是 studio 方式启动 110 | ```bash 111 | uvx --from 'mcp-server-aliyun-observability==0.2.1' mcp-server-aliyun-observability 112 | ``` 113 | 114 | 3. 使用 uvx 命令启动 115 | 116 | ```bash 117 | uvx run mcp-server-aliyun-observability 118 | ``` 119 | 120 | ### 从源码安装 121 | 122 | ```bash 123 | 124 | # clone 源码 125 | git clone git@github.com:aliyun/alibabacloud-observability-mcp-server.git 126 | # 进入源码目录 127 | cd alibabacloud-observability-mcp-server 128 | # 安装 129 | pip install -e . 130 | # 运行 131 | python -m mcp_server_aliyun_observability --transport sse --access-key-id --access-key-secret 132 | ``` 133 | 134 | 135 | ### AI 工具集成 136 | 137 | > 以 SSE 启动方式为例,transport 端口为 8888,实际使用时需要根据实际情况修改 138 | 139 | #### Cursor,Cline 等集成 140 | 1. 
使用 SSE 启动方式 141 | ```json 142 | { 143 | "mcpServers": { 144 | "alibaba_cloud_observability": { 145 | "url": "http://localhost:7897/sse" 146 | } 147 | } 148 | } 149 | ``` 150 | 2. 使用 stdio 启动方式 151 | 直接从源码目录启动,注意 152 | 1. 需要指定 `--directory` 参数,指定源码目录,最好是绝对路径 153 | 2. uv命令 最好也使用绝对路径,如果使用了虚拟环境,则需要使用虚拟环境的绝对路径 154 | ```json 155 | { 156 | "mcpServers": { 157 | "alibaba_cloud_observability": { 158 | "command": "uv", 159 | "args": [ 160 | "--directory", 161 | "/path/to/your/alibabacloud-observability-mcp-server", 162 | "run", 163 | "mcp-server-aliyun-observability" 164 | ], 165 | "env": { 166 | "ALIBABA_CLOUD_ACCESS_KEY_ID": "", 167 | "ALIBABA_CLOUD_ACCESS_KEY_SECRET": "" 168 | } 169 | } 170 | } 171 | } 172 | ``` 173 | 1. 使用 stdio 启动方式-从 module 启动 174 | ```json 175 | { 176 | "mcpServers": { 177 | "alibaba_cloud_observability": { 178 | "command": "uv", 179 | "args": [ 180 | "run", 181 | "mcp-server-aliyun-observability" 182 | ], 183 | "env": { 184 | "ALIBABA_CLOUD_ACCESS_KEY_ID": "", 185 | "ALIBABA_CLOUD_ACCESS_KEY_SECRET": "" 186 | } 187 | } 188 | } 189 | } 190 | ``` 191 | 192 | #### Cherry Studio集成 193 | 194 | ![image](./images/cherry_studio_inter.png) 195 | 196 | ![image](./images/cherry_studio_demo.png) 197 | 198 | 199 | #### Cursor集成 200 | 201 | ![image](./images/cursor_inter.png) 202 | 203 | ![image](./images/cursor_tools.png) 204 | 205 | ![image](./images/cursor_demo.png) 206 | 207 | 208 | #### ChatWise集成 209 | 210 | ![image](./images/chatwise_inter.png) 211 | 212 | ![image](./images/chatwise_demo.png) 213 | 214 | -------------------------------------------------------------------------------- /README_EN.md: -------------------------------------------------------------------------------- 1 | ## Alibaba Cloud Observability MCP Server 2 | 3 | ### Introduction 4 | 5 | Alibaba Cloud Observability MCP Server provides a set of tools for accessing various products in Alibaba Cloud's observability suite. It covers products including Alibaba Cloud Log Service (SLS), Alibaba Cloud Application Real-Time Monitoring Service (ARMS), and Alibaba Cloud CloudMonitor. Any intelligent agent that supports the MCP protocol can quickly integrate with it. Supported products include: 6 | 7 | - [Alibaba Cloud Log Service (SLS)](https://www.alibabacloud.com/help/en/sls/product-overview/what-is-log-service) 8 | - [Alibaba Cloud Application Real-Time Monitoring Service (ARMS)](https://www.alibabacloud.com/help/en/arms) 9 | 10 | Currently, the MCP tools primarily focus on Alibaba Cloud Log Service, with support for other products being added incrementally. 
The detailed tools are as follows: 11 | 12 | ### Version History 13 | You can check the [CHANGELOG.md](./CHANGELOG.md) 14 | 15 | ### FAQ 16 | You can check the [FAQ.md](./FAQ.md) 17 | 18 | ##### Example Scenarios 19 | 20 | - Scenario 1: Quickly query the structure of a specific logstore 21 | - Tools used: 22 | - `sls_list_logstores` 23 | - `sls_describe_logstore` 24 | ![image](./images/search_log_store.png) 25 | 26 | 27 | - Scenario 2: Fuzzy query to find the application with the highest traffic in a logstore over the past day 28 | - Analysis: 29 | - Need to verify if the logstore exists 30 | - Get the logstore structure 31 | - Generate query statements based on requirements (users can confirm and modify statements) 32 | - Execute query statements 33 | - Generate responses based on query results 34 | - Tools used: 35 | - `sls_list_logstores` 36 | - `sls_describe_logstore` 37 | - `sls_translate_natural_language_to_query` 38 | - `sls_execute_query` 39 | ![image](./images/fuzzy_search_and_get_logs.png) 40 | 41 | 42 | - Scenario 3: Query the slowest traces in an ARMS application 43 | - Analysis: 44 | - Need to verify if the application exists 45 | - Get the application structure 46 | - Generate query statements based on requirements (users can confirm and modify statements) 47 | - Execute query statements 48 | - Generate responses based on query results 49 | - Tools used: 50 | - `arms_search_apps` 51 | - `arms_generate_trace_query` 52 | - `sls_translate_natural_language_to_query` 53 | - `sls_execute_query` 54 | ![image](./images/find_slowest_trace.png) 55 | 56 | 57 | ### Permission Requirements 58 | 59 | To ensure MCP Server can successfully access and operate your Alibaba Cloud observability resources, you need to configure the following permissions: 60 | 61 | 1. **Alibaba Cloud Access Key (AccessKey)**: 62 | * Service requires a valid Alibaba Cloud AccessKey ID and AccessKey Secret. 63 | * To obtain and manage AccessKeys, refer to the [Alibaba Cloud AccessKey Management official documentation](https://www.alibabacloud.com/help/en/basics-for-beginners/latest/obtain-an-accesskey-pair). 64 | 65 | 2. When you initialize without providing AccessKey and AccessKey Secret, it will use the [Default Credential Chain for login](https://www.alibabacloud.com/help/en/sdk/developer-reference/v2-manage-python-access-credentials#62bf90d04dztq) 66 | 1. If environment variables ALIBABA_CLOUD_ACCESS_KEY_ID and ALIBABA_CLOUD_ACCESS_KEY_SECRET exist and are non-empty, they will be used as default credentials. 67 | 2. If ALIBABA_CLOUD_ACCESS_KEY_ID, ALIBABA_CLOUD_ACCESS_KEY_SECRET, and ALIBABA_CLOUD_SECURITY_TOKEN are all set, STS Token will be used as default credentials. 68 | 69 | 3. **RAM Authorization (Important)**: 70 | * The RAM user or role associated with the AccessKey **must** be granted the necessary permissions to access the relevant cloud services. 71 | * **Strongly recommended to follow the "Principle of Least Privilege"**: Only grant the minimum set of permissions necessary to run the MCP tools you plan to use, to reduce security risks. 72 | * Based on the tools you need to use, refer to the following documentation for permission configuration: 73 | * **Log Service (SLS)**: If you need to use `sls_*` related tools, refer to [Log Service Permissions](https://www.alibabacloud.com/help/en/sls/user-guide/overview-2) and grant necessary read, query, and other permissions. 
74 | * **Application Real-Time Monitoring Service (ARMS)**: If you need to use `arms_*` related tools, refer to [ARMS Permissions](https://www.alibabacloud.com/help/en/arms/user-guide/manage-ram-permissions) and grant necessary query permissions. 75 | * Please configure the required permissions in detail according to your actual application scenario. 76 | 77 | ### Security and Deployment Recommendations 78 | 79 | Please pay attention to the following security issues and deployment best practices: 80 | 81 | 1. **Key Security**: 82 | * This MCP Server will use the AccessKey you provide to call Alibaba Cloud OpenAPI when running, but **will not store your AccessKey in any form**, nor will it be used for any other purpose beyond the designed functionality. 83 | 84 | 2. **Access Control (Critical)**: 85 | * When you choose to access the MCP Server through the **SSE (Server-Sent Events) protocol**, **you must take responsibility for access control and security protection of the service access point**. 86 | * **Strongly recommended** to deploy the MCP Server in an **internal network or trusted environment**, such as your private VPC (Virtual Private Cloud), avoiding direct exposure to the public internet. 87 | * The recommended deployment method is to use **Alibaba Cloud Function Compute (FC)** and configure its network settings to **VPC-only access** to achieve network-level isolation and security. 88 | * **Note**: **Never** expose the MCP Server SSE endpoint configured with your AccessKey on the public internet without any identity verification or access control mechanisms, as this poses a high security risk. 89 | 90 | ### Usage Instructions 91 | 92 | 93 | Before using the MCP Server, you need to obtain Alibaba Cloud's AccessKeyId and AccessKeySecret. Please refer to [Alibaba Cloud AccessKey Management](https://www.alibabacloud.com/help/en/basics-for-beginners/latest/obtain-an-accesskey-pair) 94 | 95 | 96 | #### Install using pip 97 | > ⚠️ Requires Python 3.10 or higher. 98 | 99 | Simply install using pip: 100 | 101 | ```bash 102 | pip install mcp-server-aliyun-observability 103 | ``` 104 | 1. After installation, run directly with the following command: 105 | 106 | ```bash 107 | python -m mcp_server_aliyun_observability --transport sse --access-key-id --access-key-secret 108 | ``` 109 | You can pass specific parameters through the command line: 110 | - `--transport` Specify the transport method, options are `sse` or `stdio`, default is `stdio` 111 | - `--access-key-id` Specify Alibaba Cloud AccessKeyId, if not specified, ALIBABA_CLOUD_ACCESS_KEY_ID from environment variables will be used 112 | - `--access-key-secret` Specify Alibaba Cloud AccessKeySecret, if not specified, ALIBABA_CLOUD_ACCESS_KEY_SECRET from environment variables will be used 113 | - `--log-level` Specify log level, options are `DEBUG`, `INFO`, `WARNING`, `ERROR`, default is `INFO` 114 | - `--transport-port` Specify transport port, default is `8000`, only effective when `--transport` is `sse` 115 | 116 | 2. Start using uv command 117 | 118 | ```bash 119 | uv run mcp-server-aliyun-observability 120 | ``` 121 | ### Install from source code 122 | 123 | ```bash 124 | 125 | # clone the source code 126 | git clone git@github.com:aliyun/alibabacloud-observability-mcp-server.git 127 | # enter the source directory 128 | cd alibabacloud-observability-mcp-server 129 | # install 130 | pip install -e . 
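# optional: instead of passing --access-key-id/--access-key-secret to the run
# command below, you can export the credentials read by the default credential
# chain described in "Permission Requirements" above; the <...> values here are
# placeholders, not real keys
export ALIBABA_CLOUD_ACCESS_KEY_ID=<your_access_key_id>
export ALIBABA_CLOUD_ACCESS_KEY_SECRET=<your_access_key_secret>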
131 | # run 132 | python -m mcp_server_aliyun_observability --transport sse --access-key-id --access-key-secret 133 | ``` 134 | 135 | 136 | ### AI Tool Integration 137 | 138 | > Taking SSE startup mode as an example, with transport port 8888. In actual use, you need to modify according to your specific situation. 139 | 140 | #### Integration with Cursor, Cline, etc. 141 | 1. Using SSE startup method 142 | ```json 143 | { 144 | "mcpServers": { 145 | "alibaba_cloud_observability": { 146 | "url": "http://localhost:7897/sse" 147 | } 148 | } 149 | } 150 | ``` 151 | 2. Using stdio startup method 152 | Start directly from the source code directory, note: 153 | 1. Need to specify the `--directory` parameter to indicate the source code directory, preferably an absolute path 154 | 2. The uv command should also use an absolute path; if using a virtual environment, you need to use the absolute path of the virtual environment 155 | ```json 156 | { 157 | "mcpServers": { 158 | "alibaba_cloud_observability": { 159 | "command": "uv", 160 | "args": [ 161 | "--directory", 162 | "/path/to/your/alibabacloud-observability-mcp-server", 163 | "run", 164 | "mcp-server-aliyun-observability" 165 | ], 166 | "env": { 167 | "ALIBABA_CLOUD_ACCESS_KEY_ID": "", 168 | "ALIBABA_CLOUD_ACCESS_KEY_SECRET": "" 169 | } 170 | } 171 | } 172 | } 173 | ``` 174 | 3. Using stdio startup method - start from module 175 | ```json 176 | { 177 | "mcpServers": { 178 | "alibaba_cloud_observability": { 179 | "command": "uv", 180 | "args": [ 181 | "run", 182 | "mcp-server-aliyun-observability" 183 | ], 184 | "env": { 185 | "ALIBABA_CLOUD_ACCESS_KEY_ID": "", 186 | "ALIBABA_CLOUD_ACCESS_KEY_SECRET": "" 187 | } 188 | } 189 | } 190 | } 191 | ``` 192 | 193 | #### Cherry Studio Integration 194 | 195 | ![image](./images/cherry_studio_inter.png) 196 | 197 | ![image](./images/cherry_studio_demo.png) 198 | 199 | 200 | #### Cursor Integration 201 | 202 | ![image](./images/cursor_inter.png) 203 | 204 | ![image](./images/cursor_tools.png) 205 | 206 | ![image](./images/cursor_demo.png) 207 | 208 | 209 | #### ChatWise Integration 210 | 211 | ![image](./images/chatwise_inter.png) 212 | 213 | ![image](./images/chatwise_demo.png) -------------------------------------------------------------------------------- /images/chatwise_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-observability-mcp-server/020856ba14df3e0aa610ba4159fe428926c0030b/images/chatwise_demo.png -------------------------------------------------------------------------------- /images/chatwise_inter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-observability-mcp-server/020856ba14df3e0aa610ba4159fe428926c0030b/images/chatwise_inter.png -------------------------------------------------------------------------------- /images/cherry_studio_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-observability-mcp-server/020856ba14df3e0aa610ba4159fe428926c0030b/images/cherry_studio_demo.png -------------------------------------------------------------------------------- /images/cherry_studio_inter.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aliyun/alibabacloud-observability-mcp-server/020856ba14df3e0aa610ba4159fe428926c0030b/images/cherry_studio_inter.png -------------------------------------------------------------------------------- /images/cursor_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-observability-mcp-server/020856ba14df3e0aa610ba4159fe428926c0030b/images/cursor_demo.png -------------------------------------------------------------------------------- /images/cursor_inter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-observability-mcp-server/020856ba14df3e0aa610ba4159fe428926c0030b/images/cursor_inter.png -------------------------------------------------------------------------------- /images/cursor_tools.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-observability-mcp-server/020856ba14df3e0aa610ba4159fe428926c0030b/images/cursor_tools.png -------------------------------------------------------------------------------- /images/find_slowest_trace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-observability-mcp-server/020856ba14df3e0aa610ba4159fe428926c0030b/images/find_slowest_trace.png -------------------------------------------------------------------------------- /images/fuzzy_search_and_get_logs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-observability-mcp-server/020856ba14df3e0aa610ba4159fe428926c0030b/images/fuzzy_search_and_get_logs.png -------------------------------------------------------------------------------- /images/npx_debug.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-observability-mcp-server/020856ba14df3e0aa610ba4159fe428926c0030b/images/npx_debug.png -------------------------------------------------------------------------------- /images/search_log_store.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-observability-mcp-server/020856ba14df3e0aa610ba4159fe428926c0030b/images/search_log_store.png -------------------------------------------------------------------------------- /license: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Alibaba Cloud 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "mcp-server-aliyun-observability" 3 | version = "0.2.6" 4 | description = "aliyun observability mcp server" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "mcp>=1.3.0", 9 | "pydantic>=2.10.0", 10 | "alibabacloud_arms20190808==8.0.0", 11 | "alibabacloud_sls20201230==5.7.0", 12 | "alibabacloud_credentials>=1.0.1", 13 | "tenacity>=8.0.0", 14 | ] 15 | 16 | [build-system] 17 | requires = ["hatchling", "wheel", "setuptools"] 18 | build-backend = "hatchling.build" 19 | 20 | [tool.uv] 21 | dev-dependencies = ["pyright>=1.1.389"] 22 | 23 | [tool.hatch.build.targets.wheel] 24 | packages = ["src/mcp_server_aliyun_observability"] 25 | 26 | [tool.hatch.build] 27 | include = [ 28 | "src/**/*.py", 29 | "README.md", 30 | "LICENSE", 31 | "pyproject.toml", 32 | ] 33 | 34 | exclude = [ 35 | "**/*.pyc", 36 | "**/__pycache__", 37 | "**/*.pyo", 38 | "**/*.pyd", 39 | "**/*.png", 40 | ".git", 41 | ".env", 42 | ".gitignore", 43 | "*.so", 44 | "*.dylib", 45 | "*.dll", 46 | ] 47 | 48 | [tool.hatch.metadata] 49 | allow-direct-references = true 50 | 51 | [project.optional-dependencies] 52 | dev = ["pytest", "pytest-mock", "pytest-cov"] 53 | 54 | [project.urls] 55 | 56 | 57 | [project.scripts] 58 | mcp-server-aliyun-observability = "mcp_server_aliyun_observability:main" -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | python_files = test_*.py 4 | python_classes = Test* 5 | python_functions = test_* 6 | addopts = -v --tb=short 7 | asyncio_default_fixture_loop_scope = function -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest>=7.0.0 2 | pytest-asyncio>=0.21.0 3 | pytest-cov>=4.0.0 4 | pytest-mock>=3.10.0 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mcp>=1.3.0 2 | pydantic>=2.10.0 3 | alibabacloud_arms20190808>=8.0.0 4 | alibabacloud_credentials>=1.0.1 5 | tenacity>=8.0.0 6 | pytest>=7.0.0 7 | pytest-asyncio>=0.21.0 8 | pytest-cov>=4.0.0 9 | pytest-mock>=3.10.0 10 | -------------------------------------------------------------------------------- /sample/config/knowledge_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "default_endpoint": { 3 | "uri": "https://api.default.com", 4 | "key": "Bearer dataset-***" 5 | }, 6 | "projects": { 7 | "project1": { 8 | "default_endpoint": { 9 | "uri": "https://api.project1.com", 10 | "key": "Bearer dataset-***" 11 | }, 12 | "logstore1": { 13 | "uri": 
"https://api.project1.logstore1.com", 14 | "key": "Bearer dataset-***" 15 | }, 16 | "logstore2": { 17 | "uri": "https://api.project1.logstore2.com", 18 | "key": "Bearer dataset-***" 19 | } 20 | }, 21 | "project2": { 22 | "logstore3": { 23 | "uri": "https://api.project2.logstore3.com", 24 | "key": "Bearer dataset-***" 25 | } 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/mcp_server_aliyun_observability/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import click 5 | import dotenv 6 | 7 | from mcp_server_aliyun_observability.server import server 8 | from mcp_server_aliyun_observability.utils import CredentialWrapper 9 | 10 | dotenv.load_dotenv() 11 | 12 | 13 | @click.command() 14 | @click.option( 15 | "--access-key-id", 16 | type=str, 17 | help="aliyun access key id", 18 | required=False, 19 | ) 20 | @click.option( 21 | "--access-key-secret", 22 | type=str, 23 | help="aliyun access key secret", 24 | required=False, 25 | ) 26 | @click.option( 27 | "--knowledge-config", 28 | type=str, 29 | help="knowledge config file path", 30 | required=False, 31 | ) 32 | @click.option( 33 | "--transport", type=str, help="transport type. stdio or sse", default="stdio" 34 | ) 35 | @click.option("--log-level", type=str, help="log level", default="INFO") 36 | @click.option("--transport-port", type=int, help="transport port", default=8000) 37 | def main(access_key_id, access_key_secret, knowledge_config, transport, log_level, transport_port): 38 | if access_key_id and access_key_secret: 39 | credential = CredentialWrapper(access_key_id, access_key_secret, knowledge_config) 40 | else: 41 | credential = None 42 | server(credential, transport, log_level, transport_port) 43 | -------------------------------------------------------------------------------- /src/mcp_server_aliyun_observability/__main__.py: -------------------------------------------------------------------------------- 1 | from mcp_server_aliyun_observability import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /src/mcp_server_aliyun_observability/api_error.py: -------------------------------------------------------------------------------- 1 | TEQ_EXCEPTION_ERROR = [ 2 | { 3 | "httpStatusCode": 400, 4 | "errorCode": "RequestTimeExpired", 5 | "errorMessage": "Request time _requestTime_ has been expired while server time is _server time_.", 6 | "description": "请求时间和服务端时间差别超过15分钟。", 7 | "solution": "请您检查请求端时间,稍后重试。", 8 | }, 9 | { 10 | "httpStatusCode": 400, 11 | "errorCode": "ProjectAlreadyExist", 12 | "errorMessage": "Project _ProjectName_ already exist.", 13 | "description": "Project名称已存在。Project名称在阿里云地域内全局唯一。", 14 | "solution": "请您更换Project名称后重试。", 15 | }, 16 | { 17 | "httpStatusCode": 401, 18 | "errorCode": "SignatureNotMatch", 19 | "errorMessage": "Signature _signature_ not matched.", 20 | "description": "请求的数字签名不匹配。", 21 | "solution": "请您重试或更换AccessKey后重试。", 22 | }, 23 | { 24 | "httpStatusCode": 401, 25 | "errorCode": "Unauthorized", 26 | "errorMessage": "The AccessKeyId is unauthorized.", 27 | "description": "提供的AccessKey ID值未授权。", 28 | "solution": "请确认您的AccessKey ID有访问日志服务权限。", 29 | }, 30 | { 31 | "httpStatusCode": 401, 32 | "errorCode": "Unauthorized", 33 | "errorMessage": "The security token you provided is invalid.", 34 | "description": "STS Token不合法。", 35 | "solution": "请检查您的STS接口请求,确认STS Token是合法有效的。", 36 | 
}, 37 | { 38 | "httpStatusCode": 401, 39 | "errorCode": "Unauthorized", 40 | "errorMessage": "The security token you provided has expired.", 41 | "description": "STS Token已经过期。", 42 | "solution": "请重新申请STS Token后发起请求。", 43 | }, 44 | { 45 | "httpStatusCode": 401, 46 | "errorCode": "Unauthorized", 47 | "errorMessage": "AccessKeyId not found: _AccessKey ID_", 48 | "description": "AccessKey ID不存在。", 49 | "solution": "请检查您的AccessKey ID,重新获取后再发起请求。", 50 | }, 51 | { 52 | "httpStatusCode": 401, 53 | "errorCode": "Unauthorized", 54 | "errorMessage": "AccessKeyId is disabled: _AccessKey ID_", 55 | "description": "AccessKey ID是禁用状态。", 56 | "solution": "请检查您的AccessKey ID,确认为已启用状态后重新发起请求。", 57 | }, 58 | { 59 | "httpStatusCode": 401, 60 | "errorCode": "Unauthorized", 61 | "errorMessage": "Your SLS service has been forbidden.", 62 | "description": "日志服务已经被禁用。", 63 | "solution": "请检查您的日志服务状态,例如是否已欠费。", 64 | }, 65 | { 66 | "httpStatusCode": 401, 67 | "errorCode": "Unauthorized", 68 | "errorMessage": "The project does not belong to you.", 69 | "description": "Project不属于当前访问用户。", 70 | "solution": "请更换Project或者访问用户后重试。", 71 | }, 72 | { 73 | "httpStatusCode": 401, 74 | "errorCode": "InvalidAccessKeyId", 75 | "errorMessage": "The access key id you provided is invalid: _AccessKey ID_.", 76 | "description": "AccessKey ID不合法。", 77 | "solution": "请检查您的AccessKey ID,确认AccessKey ID是合法有效的。", 78 | }, 79 | { 80 | "httpStatusCode": 401, 81 | "errorCode": "InvalidAccessKeyId", 82 | "errorMessage": "Your SLS service has not opened.", 83 | "description": "日志服务没有开通。", 84 | "solution": "请登录日志服务控制台或者通过API开通日志服务后,重新发起请求。", 85 | }, 86 | { 87 | "httpStatusCode": 403, 88 | "errorCode": "WriteQuotaExceed", 89 | "errorMessage": "Write quota is exceeded.", 90 | "description": "超过写入日志限额。", 91 | "solution": "请您优化写入日志请求,减少写入日志数量。", 92 | }, 93 | { 94 | "httpStatusCode": 403, 95 | "errorCode": "ReadQuotaExceed", 96 | "errorMessage": "Read quota is exceeded.", 97 | "description": "超过读取日志限额。", 98 | "solution": "请您优化读取日志请求,减少读取日志数量。", 99 | }, 100 | { 101 | "httpStatusCode": 403, 102 | "errorCode": "MetaOperationQpsLimitExceeded", 103 | "errorMessage": "Qps limit for the meta operation is exceeded.", 104 | "description": "超出默认设置的QPS阈值。", 105 | "solution": "请您优化资源操作请求,减少资源操作次数。建议您延迟几秒后重试。", 106 | }, 107 | { 108 | "httpStatusCode": 403, 109 | "errorCode": "ProjectForbidden", 110 | "errorMessage": "Project _ProjectName_ has been forbidden.", 111 | "description": "Project已经被禁用。", 112 | "solution": "请检查Project状态,您的Project当前可能已经欠费。", 113 | }, 114 | { 115 | "httpStatusCode": 404, 116 | "errorCode": "ProjectNotExist", 117 | "errorMessage": "The Project does not exist : _name_", 118 | "description": "日志项目(Project)不存在。", 119 | "solution": "请您检查Project名称,确认已存在该Project或者地域是否正确。", 120 | }, 121 | { 122 | "httpStatusCode": 413, 123 | "errorCode": "PostBodyTooLarge", 124 | "errorMessage": "Body size _bodysize_ must little than 10485760.", 125 | "description": "请求消息体body不能超过10M。", 126 | "solution": "请您调整请求消息体的大小后重试。", 127 | }, 128 | { 129 | "httpStatusCode": 500, 130 | "errorCode": "InternalServerError", 131 | "errorMessage": "Internal server error message.", 132 | "description": "服务器内部错误。", 133 | "solution": "请您稍后重试。", 134 | }, 135 | { 136 | "httpStatusCode": 500, 137 | "errorCode": "RequestTimeout", 138 | "errorMessage": "The request is timeout. 
Please try again later.", 139 | "description": "请求处理超时。", 140 | "solution": "请您稍后重试。", 141 | }, 142 | ] 143 | -------------------------------------------------------------------------------- /src/mcp_server_aliyun_observability/server.py: -------------------------------------------------------------------------------- 1 | from contextlib import asynccontextmanager 2 | from typing import AsyncIterator, Optional 3 | 4 | from alibabacloud_credentials.client import Client as CredClient 5 | from mcp.server import FastMCP 6 | from mcp.server.fastmcp import FastMCP 7 | 8 | from mcp_server_aliyun_observability.toolkit.arms_toolkit import ArmsToolkit 9 | from mcp_server_aliyun_observability.toolkit.sls_toolkit import SLSToolkit 10 | from mcp_server_aliyun_observability.toolkit.cms_toolkit import CMSToolkit 11 | from mcp_server_aliyun_observability.toolkit.util_toolkit import UtilToolkit 12 | from mcp_server_aliyun_observability.utils import ( 13 | ArmsClientWrapper, 14 | CredentialWrapper, 15 | SLSClientWrapper, 16 | ) 17 | 18 | 19 | def create_lifespan(credential: Optional[CredentialWrapper] = None): 20 | @asynccontextmanager 21 | async def lifespan(fastmcp: FastMCP) -> AsyncIterator[dict]: 22 | sls_client = SLSClientWrapper(credential) 23 | arms_client = ArmsClientWrapper(credential) 24 | cms_client = SLSClientWrapper(credential) 25 | yield { 26 | "sls_client": sls_client, 27 | "arms_client": arms_client, 28 | "cms_client": cms_client, 29 | } 30 | 31 | return lifespan 32 | 33 | 34 | def init_server( 35 | credential: Optional[CredentialWrapper] = None, 36 | log_level: str = "INFO", 37 | transport_port: int = 8000, 38 | ): 39 | """initialize the global mcp server instance""" 40 | mcp_server = FastMCP( 41 | name="mcp_aliyun_observability_server", 42 | lifespan=create_lifespan(credential), 43 | log_level=log_level, 44 | port=transport_port, 45 | ) 46 | SLSToolkit(mcp_server) 47 | UtilToolkit(mcp_server) 48 | ArmsToolkit(mcp_server) 49 | CMSToolkit(mcp_server) 50 | return mcp_server 51 | 52 | 53 | def server( 54 | credential: Optional[CredentialWrapper] = None, 55 | transport: str = "stdio", 56 | log_level: str = "INFO", 57 | transport_port: int = 8000, 58 | ): 59 | server: FastMCP = init_server(credential, log_level, transport_port) 60 | server.run(transport) 61 | -------------------------------------------------------------------------------- /src/mcp_server_aliyun_observability/toolkit/arms_toolkit.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any 3 | 4 | from alibabacloud_arms20190808.client import Client as ArmsClient 5 | from alibabacloud_sls20201230.client import Client 6 | from alibabacloud_arms20190808.models import ( 7 | SearchTraceAppByPageRequest, 8 | SearchTraceAppByPageResponse, 9 | SearchTraceAppByPageResponseBodyPageBean, GetTraceAppRequest, GetTraceAppResponse, 10 | GetTraceAppResponseBodyTraceApp, 11 | ) 12 | from alibabacloud_tea_util import models as util_models 13 | from alibabacloud_sls20201230.models import CallAiToolsRequest, CallAiToolsResponse 14 | from mcp.server.fastmcp import Context, FastMCP 15 | from pydantic import Field 16 | from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed 17 | 18 | from mcp_server_aliyun_observability.utils import ( 19 | get_arms_user_trace_log_store, 20 | text_to_sql, 21 | ) 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class ArmsToolkit: 27 | def __init__(self, server: FastMCP): 28 | self.server = server 29 | 
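        # Each method defined inside _register_tools() is decorated with
        # @self.server.tool(), which registers it on the shared FastMCP server
        # instance so it is exposed to connected MCP clients as an ARMS tool.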
self._register_tools() 30 | 31 | def _register_tools(self): 32 | """register arms related tools functions""" 33 | 34 | @self.server.tool() 35 | def arms_search_apps( 36 | ctx: Context, 37 | appNameQuery: str = Field(..., description="app name query"), 38 | regionId: str = Field( 39 | ..., 40 | description="region id,region id format like 'xx-xxx',like 'cn-hangzhou'", 41 | ), 42 | pageSize: int = Field( 43 | 20, description="page size,max is 100", ge=1, le=100 44 | ), 45 | pageNumber: int = Field(1, description="page number,default is 1", ge=1), 46 | ) -> list[dict[str, Any]]: 47 | """搜索ARMS应用。 48 | 49 | ## 功能概述 50 | 51 | 该工具用于根据应用名称搜索ARMS应用,返回应用的基本信息,包括应用名称、PID、用户ID和类型。 52 | 53 | ## 使用场景 54 | 55 | - 当需要查找特定名称的应用时 56 | - 当需要获取应用的PID以便进行其他ARMS操作时 57 | - 当需要检查用户拥有的应用列表时 58 | 59 | ## 搜索条件 60 | 61 | - app_name_query必须是应用名称的一部分,而非自然语言 62 | - 搜索结果将分页返回,可以指定页码和每页大小 63 | 64 | ## 返回数据结构 65 | 66 | 返回一个字典,包含以下信息: 67 | - total: 符合条件的应用总数 68 | - page_size: 每页大小 69 | - page_number: 当前页码 70 | - trace_apps: 应用列表,每个应用包含app_name、pid、user_id和type 71 | 72 | ## 查询示例 73 | 74 | - "帮我查询下 XXX 的应用" 75 | - "找出名称包含'service'的应用" 76 | 77 | Args: 78 | ctx: MCP上下文,用于访问ARMS客户端 79 | app_name_query: 应用名称查询字符串 80 | region_id: 阿里云区域ID 81 | page_size: 每页大小,范围1-100,默认20 82 | page_number: 页码,默认1 83 | 84 | Returns: 85 | 包含应用信息的字典 86 | """ 87 | arms_client: ArmsClient = ctx.request_context.lifespan_context[ 88 | "arms_client" 89 | ].with_region(regionId) 90 | request: SearchTraceAppByPageRequest = SearchTraceAppByPageRequest( 91 | trace_app_name=appNameQuery, 92 | region_id=regionId, 93 | page_size=pageSize, 94 | page_number=pageNumber, 95 | ) 96 | response: SearchTraceAppByPageResponse = ( 97 | arms_client.search_trace_app_by_page(request) 98 | ) 99 | page_bean: SearchTraceAppByPageResponseBodyPageBean = ( 100 | response.body.page_bean 101 | ) 102 | result = { 103 | "total": page_bean.total_count, 104 | "page_size": page_bean.page_size, 105 | "page_number": page_bean.page_number, 106 | "trace_apps": [], 107 | } 108 | if page_bean: 109 | result["trace_apps"] = [ 110 | { 111 | "app_name": app.app_name, 112 | "pid": app.pid, 113 | "user_id": app.user_id, 114 | "type": app.type, 115 | } 116 | for app in page_bean.trace_apps 117 | ] 118 | 119 | return result 120 | 121 | @self.server.tool() 122 | @retry( 123 | stop=stop_after_attempt(2), 124 | wait=wait_fixed(1), 125 | retry=retry_if_exception_type(Exception), 126 | reraise=True, 127 | ) 128 | def arms_generate_trace_query( 129 | ctx: Context, 130 | user_id: int = Field(..., description="user aliyun account id"), 131 | pid: str = Field(..., description="pid,the pid of the app"), 132 | region_id: str = Field( 133 | ..., 134 | description="region id,region id format like 'xx-xxx',like 'cn-hangzhou'", 135 | ), 136 | question: str = Field( 137 | ..., description="question,the question to query the trace" 138 | ), 139 | ) -> dict: 140 | """生成ARMS应用的调用链查询语句。 141 | 142 | ## 功能概述 143 | 144 | 该工具用于将自然语言描述转换为ARMS调用链查询语句,便于分析应用性能和问题。 145 | 146 | ## 使用场景 147 | 148 | - 当需要查询应用的调用链信息时 149 | - 当需要分析应用性能问题时 150 | - 当需要跟踪特定请求的执行路径时 151 | - 当需要分析服务间调用关系时 152 | 153 | ## 查询处理 154 | 155 | 工具会将自然语言问题转换为SLS查询,并返回: 156 | - 生成的SLS查询语句 157 | - 存储调用链数据的项目名 158 | - 存储调用链数据的日志库名 159 | 160 | ## 查询上下文 161 | 162 | 查询会考虑以下信息: 163 | - 应用的PID 164 | - 响应时间以纳秒存储,需转换为毫秒 165 | - 数据以span记录存储,查询耗时需要对符合条件的span进行求和 166 | - 服务相关信息使用serviceName字段 167 | - 如果用户明确提出要查询 trace信息,则需要在查询问题上question 上添加说明返回trace信息 168 | 169 | ## 查询示例 170 | 171 | - "帮我查询下 XXX 的 trace 信息" 172 | - "分析最近一小时内响应时间超过1秒的调用链" 173 | 174 | Args: 175 | ctx: 
MCP上下文,用于访问ARMS和SLS客户端 176 | user_id: 用户阿里云账号ID 177 | pid: 应用的PID 178 | region_id: 阿里云区域ID 179 | question: 查询调用链的自然语言问题 180 | 181 | Returns: 182 | 包含查询信息的字典,包括sls_query、project和log_store 183 | """ 184 | 185 | data: dict[str, str] = get_arms_user_trace_log_store(user_id, region_id) 186 | instructions = [ 187 | "1. pid为" + pid, 188 | "2. 响应时间字段为 duration,单位为纳秒,转换成毫秒", 189 | "3. 注意因为保存的是每个 span 记录,如果是耗时,需要对所有符合条件的span 耗时做求和", 190 | "4. 涉及到接口服务等字段,使用 serviceName字段", 191 | "5. 如果用户明确提出要查询 trace信息,则需要返回 trace_id", 192 | ] 193 | instructions_str = "\n".join(instructions) 194 | prompt = f""" 195 | 问题: 196 | {question} 197 | 补充信息: 198 | {instructions_str} 199 | 请根据以上信息生成sls查询语句 200 | """ 201 | sls_text_to_query = text_to_sql( 202 | ctx, prompt, data["project"], data["log_store"], region_id 203 | ) 204 | return { 205 | "sls_query": sls_text_to_query["data"], 206 | "requestId": sls_text_to_query["requestId"], 207 | "project": data["project"], 208 | "log_store": data["log_store"], 209 | } 210 | 211 | @self.server.tool() 212 | def arms_profile_flame_analysis( 213 | ctx: Context, 214 | pid: str = Field(..., description="arms application id"), 215 | startMs: str = Field(..., description="profile start ms"), 216 | endMs: str = Field(..., description="profile end ms"), 217 | profileType: str = Field(default="cpu", description="profile type, like 'cpu' 'memory'"), 218 | ip: str = Field(None, description="arms service host ip"), 219 | thread: str = Field(None, description="arms service thread id"), 220 | threadGroup: str = Field(None, description="arms service thread group"), 221 | regionId: str = Field(default=..., 222 | description="aliyun region id,region id format like 'xx-xxx',like 'cn-hangzhou'", 223 | ), 224 | ) -> dict: 225 | """分析ARMS应用火焰图性能热点。 226 | 227 | ## 功能概述 228 | 229 | 当应用存在性能问题且开启持续剖析时,可以调用该工具对ARMS应用火焰图性能热点进行分析,生成分析结果。分析结果会包含火焰图的性能热点问题、优化建议等信息。 230 | 231 | ## 使用场景 232 | 233 | - 当需要分析ARMS应用火焰图性能问题时 234 | 235 | ## 查询示例 236 | 237 | - "帮我分析下ARMS应用 XXX 的火焰图性能热点" 238 | 239 | Args: 240 | ctx: MCP上下文,用于访问SLS客户端 241 | pid: ARMS应用监控服务PID 242 | startMs: 分析的开始时间,通过get_current_time工具获取毫秒级时间戳 243 | endMs: 分析的结束时间,通过get_current_time工具获取毫秒级时间戳 244 | profileType: Profile类型,用于选择需要分析的Profile指标,支持CPU热点和内存热点,如'cpu'、'memory' 245 | ip: ARMS应用服务主机地址,非必要参数,用于选择所在的服务机器,如有多个填写时以英文逗号","分隔,如'192.168.0.1,192.168.0.2',不填写默认查询服务所在的所有IP 246 | thread: 服务线程名称,非必要参数,用于选择对应线程,如有多个填写时以英文逗号","分隔,如'C1 CompilerThre,C2 CompilerThre',不填写默认查询服务所有线程 247 | threadGroup: 服务聚合线程组名称,非必要参数,用于选择对应线程组,如有多个填写时以英文逗号","分隔,如'http-nio-*-exec-*,http-nio-*-ClientPoller-*',不填写默认查询服务所有聚合线程组 248 | regionId: 阿里云区域ID,如'cn-hangzhou'、'cn-shanghai'等 249 | """ 250 | try: 251 | valid_types = ['cpu', 'memory'] 252 | profileType = profileType.lower() 253 | if profileType not in valid_types: 254 | raise ValueError(f"无效的profileType: {profileType}, 仅支持: {', '.join(valid_types)}") 255 | 256 | # Connect to ARMS client 257 | arms_client: ArmsClient = ctx.request_context.lifespan_context["arms_client"].with_region(regionId) 258 | request: GetTraceAppRequest = GetTraceAppRequest(pid=pid, region_id=regionId) 259 | response: GetTraceAppResponse = arms_client.get_trace_app(request) 260 | trace_app: GetTraceAppResponseBodyTraceApp = response.body.trace_app 261 | 262 | if not trace_app: 263 | raise ValueError("无法找到应用信息") 264 | 265 | # Extract application details 266 | service_name = trace_app.app_name 267 | language = trace_app.language 268 | 269 | # Validate language parameter 270 | if language not in ['java', 'go']: 271 | raise ValueError(f"暂不支持的语言类型: {language}. 
当前仅支持 'java' 和 'go'") 272 | 273 | # Prepare SLS client for Flame analysis 274 | sls_client: Client = ctx.request_context.lifespan_context["sls_client"].with_region("cn-shanghai") 275 | ai_request: CallAiToolsRequest = CallAiToolsRequest( 276 | tool_name="profile_flame_analysis", 277 | region_id=regionId 278 | ) 279 | 280 | params: dict[str, Any] = { 281 | "serviceName": service_name, 282 | "startMs": startMs, 283 | "endMs": endMs, 284 | "profileType": profileType, 285 | "ip": ip, 286 | "language": language, 287 | "thread": thread, 288 | "threadGroup": threadGroup, 289 | "sys.query": f"帮我分析下应用 {service_name} 的火焰图性能热点问题", 290 | } 291 | 292 | ai_request.params = params 293 | runtime: util_models.RuntimeOptions = util_models.RuntimeOptions(read_timeout=60000, 294 | connect_timeout=60000) 295 | 296 | tool_response: CallAiToolsResponse = sls_client.call_ai_tools_with_options(request=ai_request, 297 | headers={}, 298 | runtime=runtime) 299 | data = tool_response.body 300 | 301 | if "------answer------\n" in data: 302 | data = data.split("------answer------\n")[1] 303 | 304 | return { 305 | "data": data 306 | } 307 | 308 | except Exception as e: 309 | logger.error(f"调用火焰图数据性能热点AI工具失败: {str(e)}") 310 | raise 311 | 312 | @self.server.tool() 313 | def arms_diff_profile_flame_analysis( 314 | ctx: Context, 315 | pid: str = Field(..., description="arms application id"), 316 | currentStartMs: str = Field(..., description="current profile start ms"), 317 | currentEndMs: str = Field(..., description="current profile end ms"), 318 | referenceStartMs: str = Field(..., description="reference profile start ms (for comparison)"), 319 | referenceEndMs: str = Field(..., description="reference profile end ms (for comparison)"), 320 | profileType: str = Field(default="cpu", description="profile type, like 'cpu' 'memory'"), 321 | ip: str = Field(None, description="arms service host ip"), 322 | thread: str = Field(None, description="arms service thread id"), 323 | threadGroup: str = Field(None, description="arms service thread group"), 324 | regionId: str = Field(default=..., 325 | description="aliyun region id,region id format like 'xx-xxx',like 'cn-hangzhou'") 326 | ) -> dict: 327 | """对比两个时间段火焰图的性能变化。 328 | 329 | ## 功能概述 330 | 331 | 对应用在两个不同时间段内的性能进行分析,生成差分火焰图。通常用于发布前后或性能优化前后性能对比,帮助识别性能提升或退化。 332 | 333 | ## 使用场景 334 | 335 | - 发布前后、性能优化前后不同时间段火焰图性能对比 336 | 337 | ## 查询示例 338 | 339 | - "帮我分析应用 XXX 在发布前后的性能变化情况" 340 | 341 | Args: 342 | ctx: MCP上下文,用于访问SLS客户端 343 | pid: ARMS应用监控服务PID 344 | currentStartMs: 火焰图当前(基准)时间段的开始时间戳,通过get_current_time工具获取毫秒级时间戳 345 | currentEndMs: 火焰图当前(基准)时间段的结束时间戳,通过get_current_time工具获取毫秒级时间戳 346 | referenceStartMs: 火焰图对比时间段(参考时间段)的开始时间戳,通过get_current_time工具获取毫秒级时间戳 347 | referenceEndMs: 火焰图对比时间段(参考时间段)的结束时间戳,通过get_current_time工具获取毫秒级时间戳 348 | profileType: Profile类型,如'cpu'、'memory' 349 | ip: ARMS应用服务主机地址,非必要参数,用于选择所在的服务机器,如有多个填写时以英文逗号","分隔,如'192.168.0.1,192.168.0.2',不填写默认查询服务所在的所有IP 350 | thread: 服务线程名称,非必要参数,用于选择对应线程,如有多个填写时以英文逗号","分隔,如'C1 CompilerThre,C2 CompilerThre',不填写默认查询服务所有线程 351 | threadGroup: 服务聚合线程组名称,非必要参数,用于选择对应线程组,如有多个填写时以英文逗号","分隔,如'http-nio-*-exec-*,http-nio-*-ClientPoller-*',不填写默认查询服务所有聚合线程组 352 | regionId: 阿里云区域ID,如'cn-hangzhou'、'cn-shanghai'等 353 | """ 354 | try: 355 | valid_types = ['cpu', 'memory'] 356 | profileType = profileType.lower() 357 | if profileType not in valid_types: 358 | raise ValueError(f"无效的profileType: {profileType}, 仅支持: {', '.join(valid_types)}") 359 | 360 | arms_client: ArmsClient = 
ctx.request_context.lifespan_context["arms_client"].with_region(regionId) 361 | request: GetTraceAppRequest = GetTraceAppRequest( 362 | pid=pid, 363 | region_id=regionId, 364 | ) 365 | response: GetTraceAppResponse = arms_client.get_trace_app(request) 366 | trace_app: GetTraceAppResponseBodyTraceApp = response.body.trace_app 367 | 368 | if not trace_app: 369 | raise ValueError("无法找到应用信息") 370 | 371 | service_name = trace_app.app_name 372 | language = trace_app.language 373 | 374 | if language not in ['java', 'go']: 375 | raise ValueError(f"暂不支持的语言类型: {language}. 当前仅支持 'java' 和 'go'") 376 | 377 | sls_client: Client = ctx.request_context.lifespan_context["sls_client"].with_region("cn-shanghai") 378 | ai_request: CallAiToolsRequest = CallAiToolsRequest( 379 | tool_name="diff_profile_flame_analysis", 380 | region_id=regionId 381 | ) 382 | 383 | params: dict[str, Any] = { 384 | "serviceName": service_name, 385 | "startMs": currentStartMs, 386 | "endMs": currentEndMs, 387 | "baseStartMs": referenceStartMs, 388 | "baseEndMs": referenceEndMs, 389 | "profileType": profileType, 390 | "ip": ip, 391 | "language": language, 392 | "thread": thread, 393 | "threadGroup": threadGroup, 394 | "sys.query": f"帮我分析应用 {service_name} 在两个时间段前后的性能变化情况", 395 | } 396 | 397 | ai_request.params = params 398 | runtime: util_models.RuntimeOptions = util_models.RuntimeOptions(read_timeout=60000, connect_timeout=60000) 399 | 400 | tool_response: CallAiToolsResponse = sls_client.call_ai_tools_with_options(request=ai_request, headers={}, runtime=runtime) 401 | data = tool_response.body 402 | 403 | if "------answer------\n" in data: 404 | data = data.split("------answer------\n")[1] 405 | 406 | return { 407 | "data": data 408 | } 409 | 410 | except Exception as e: 411 | logger.error(f"调用差分火焰图性能变化分析工具失败: {str(e)}") 412 | raise 413 | 414 | @self.server.tool() 415 | def arms_get_application_info(ctx: Context, 416 | pid: str = Field(..., description="pid,the pid of the app"), 417 | regionId: str = Field(..., 418 | description="aliyun region id,region id format like 'xx-xxx',like 'cn-hangzhou'", 419 | ), 420 | ) -> dict: 421 | """ 422 | 根据 PID获取具体某个应用的信息, 423 | ## 功能概述 424 | 1. 获取ARMS应用信息,会返回应用的 PID,AppName,开发语言类型比如 java,python 等 425 | 426 | ## 使用场景 427 | 1. 当用户明确提出要查询某个应用的信息时,可以调用该工具 428 | 2. 有场景需要获取应用的开发语言类型,可以调用该工具 429 | """ 430 | arms_client: ArmsClient = ctx.request_context.lifespan_context[ 431 | "arms_client" 432 | ].with_region(regionId) 433 | request: GetTraceAppRequest = GetTraceAppRequest( 434 | pid=pid, 435 | region_id=regionId, 436 | ) 437 | response: GetTraceAppResponse = arms_client.get_trace_app(request) 438 | if response.body: 439 | trace_app: GetTraceAppResponseBodyTraceApp = response.body.trace_app 440 | return { 441 | "pid": trace_app.pid, 442 | "app_name": trace_app.app_name, 443 | "language": trace_app.language, 444 | } 445 | else: 446 | return "没有找到应用信息" 447 | 448 | @self.server.tool() 449 | def arms_trace_quality_analysis(ctx: Context, 450 | traceId: str = Field(..., description="traceId"), 451 | startMs: int = Field(..., description="start time (ms) for trace query. unit is millisecond, should be unix timestamp, only number, no other characters"), 452 | endMs: int = Field(..., description="end time (ms) for trace query. 
unit is millisecond, should be unix timestamp, only number, no other characters"), 453 | regionId: str = Field(default=..., 454 | description="aliyun region id,region id format like 'xx-xxx',like 'cn-hangzhou'") 455 | ) -> dict: 456 | """Trace 质量检测 457 | 458 | ## 功能概述 459 | 识别指定 traceId 的 Trace 是否存在完整性问题(断链)和性能问题(错慢调用) 460 | 461 | ## 使用场景 462 | 463 | - 检测调用链是否存在问题 464 | 465 | ## 查询示例 466 | 467 | - "帮我分析调用链" 468 | 469 | Args: 470 | ctx: MCP上下文,用于访问SLS客户端 471 | traceId: 待分析的 Trace 的 traceId,必要参数 472 | startMs: 分析的开始时间,通过get_current_time工具获取毫秒级时间戳 473 | endMs: 分析的结束时间,通过get_current_time工具获取毫秒级时间戳 474 | regionId: 阿里云区域ID,如'cn-hangzhou'、'cn-shanghai'等 475 | """ 476 | try: 477 | 478 | sls_client: Client = ctx.request_context.lifespan_context["sls_client"].with_region("cn-shanghai") 479 | ai_request: CallAiToolsRequest = CallAiToolsRequest( 480 | tool_name="trace_struct_analysis", 481 | region_id=regionId 482 | ) 483 | 484 | params: dict[str, Any] = { 485 | "startMs": startMs, 486 | "endMs": endMs, 487 | "traceId": traceId, 488 | "sys.query": f"分析这个trace", 489 | } 490 | 491 | ai_request.params = params 492 | runtime: util_models.RuntimeOptions = util_models.RuntimeOptions(read_timeout=60000, connect_timeout=60000) 493 | 494 | tool_response: CallAiToolsResponse = sls_client.call_ai_tools_with_options(request=ai_request, headers={}, runtime=runtime) 495 | data = tool_response.body 496 | 497 | if "------answer------\n" in data: 498 | data = data.split("------answer------\n")[1] 499 | 500 | return { 501 | "data": data 502 | } 503 | 504 | except Exception as e: 505 | logger.error(f"调用Trace质量检测工具失败: {str(e)}") 506 | raise 507 | 508 | @self.server.tool() 509 | def arms_slow_trace_analysis(ctx: Context, 510 | traceId: str = Field(..., description="traceId"), 511 | startMs: int = Field(..., description="start time (ms) for trace query. unit is millisecond, should be unix timestamp, only number, no other characters"), 512 | endMs: int = Field(..., description="end time (ms) for trace query. 
unit is millisecond, should be unix timestamp, only number, no other characters"), 513 | regionId: str = Field(default=..., 514 | description="aliyun region id,region id format like 'xx-xxx',like 'cn-hangzhou'") 515 | ) -> dict: 516 | """深入分析 Trace 慢调用根因 517 | 518 | ## 功能概述 519 | 520 | 针对 Trace 中的慢调用进行诊断分析,输出包含概述、根因、影响范围及解决方案的诊断报告。 521 | 522 | ## 使用场景 523 | 524 | - 性能问题定位和修复 525 | 526 | ## 查询示例 527 | 528 | - "请分析 ${traceId} 这个 trace 慢的原因" 529 | 530 | Args: 531 | ctx: MCP上下文,用于访问SLS客户端 532 | traceId: 待分析的Trace的 traceId,必要参数 533 | startMs: 分析的开始时间,通过get_current_time工具获取毫秒级时间戳 534 | endMs: 分析的结束时间,通过get_current_time工具获取毫秒级时间戳 535 | regionId: 阿里云区域ID,如'cn-hangzhou'、'cn-shanghai'等 536 | """ 537 | try: 538 | 539 | sls_client: Client = ctx.request_context.lifespan_context["sls_client"].with_region("cn-shanghai") 540 | ai_request: CallAiToolsRequest = CallAiToolsRequest( 541 | tool_name="trace_slow_analysis", 542 | region_id=regionId 543 | ) 544 | 545 | params: dict[str, Any] = { 546 | "startMs": startMs, 547 | "endMs": endMs, 548 | "traceId": traceId, 549 | "sys.query": f"深入分析慢调用根因", 550 | } 551 | 552 | ai_request.params = params 553 | runtime: util_models.RuntimeOptions = util_models.RuntimeOptions(read_timeout=60000, 554 | connect_timeout=60000) 555 | 556 | tool_response: CallAiToolsResponse = sls_client.call_ai_tools_with_options(request=ai_request, 557 | headers={}, runtime=runtime) 558 | data = tool_response.body 559 | 560 | if "------answer------\n" in data: 561 | data = data.split("------answer------\n")[1] 562 | 563 | return { 564 | "data": data 565 | } 566 | 567 | except Exception as e: 568 | logger.error(f"调用Trace慢调用分析工具失败: {str(e)}") 569 | raise 570 | 571 | @self.server.tool() 572 | def arms_error_trace_analysis(ctx: Context, 573 | traceId: str = Field(..., description="traceId"), 574 | startMs: int = Field(..., description="start time (ms) for trace query. unit is millisecond, should be unix timestamp, only number, no other characters"), 575 | endMs: int = Field(..., description="end time (ms) for trace query. 
unit is millisecond, should be unix timestamp, only number, no other characters"), 576 | regionId: str = Field(default=..., 577 | description="aliyun region id,region id format like 'xx-xxx',like 'cn-hangzhou'") 578 | ) -> dict: 579 | """深入分析 Trace 错误根因 580 | 581 | ## 功能概述 582 | 583 | 针对 Trace 中的错误调用进行深入诊断分析,输出包含概述、根因、影响范围及解决方案的错误诊断报告。 584 | 585 | ## 使用场景 586 | 587 | - 性能问题定位和修复 588 | 589 | ## 查询示例 590 | 591 | - "请分析 ${traceId} 这个 trace 发生错误的原因" 592 | 593 | Args: 594 | ctx: MCP上下文,用于访问SLS客户端 595 | traceId: 待分析的Trace的 traceId,必要参数 596 | startMs: 分析的开始时间,通过get_current_time工具获取毫秒级时间戳 597 | endMs: 分析的结束时间,通过get_current_time工具获取毫秒级时间戳 598 | regionId: 阿里云区域ID,如'cn-hangzhou'、'cn-shanghai'等 599 | """ 600 | try: 601 | 602 | sls_client: Client = ctx.request_context.lifespan_context["sls_client"].with_region("cn-shanghai") 603 | ai_request: CallAiToolsRequest = CallAiToolsRequest( 604 | tool_name="trace_error_analysis", 605 | region_id=regionId 606 | ) 607 | 608 | params: dict[str, Any] = { 609 | "startMs": startMs, 610 | "endMs": endMs, 611 | "traceId": traceId, 612 | "sys.query": f"深入分析错误根因", 613 | } 614 | 615 | ai_request.params = params 616 | runtime: util_models.RuntimeOptions = util_models.RuntimeOptions(read_timeout=60000, 617 | connect_timeout=60000) 618 | 619 | tool_response: CallAiToolsResponse = sls_client.call_ai_tools_with_options(request=ai_request, 620 | headers={}, runtime=runtime) 621 | data = tool_response.body 622 | 623 | if "------answer------\n" in data: 624 | data = data.split("------answer------\n")[1] 625 | 626 | return { 627 | "data": data 628 | } 629 | 630 | except Exception as e: 631 | logger.error(f"调用Trace错误分析工具失败: {str(e)}") 632 | raise 633 | -------------------------------------------------------------------------------- /src/mcp_server_aliyun_observability/toolkit/cms_toolkit.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Callable, Dict, List, Optional, TypeVar, cast 3 | from functools import wraps 4 | 5 | from alibabacloud_sls20201230.client import Client as SLSClient 6 | from alibabacloud_sls20201230.models import ( 7 | CallAiToolsRequest, 8 | CallAiToolsResponse, 9 | GetLogsRequest, 10 | GetLogsResponse, 11 | ListLogStoresRequest, 12 | ListLogStoresResponse, 13 | ListProjectRequest, 14 | ListProjectResponse, 15 | ) 16 | from alibabacloud_tea_util import models as util_models 17 | from mcp.server.fastmcp import Context, FastMCP 18 | from pydantic import Field 19 | from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed 20 | 21 | from mcp_server_aliyun_observability.utils import handle_tea_exception 22 | 23 | # 配置日志 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | class CMSToolkit: 28 | """aliyun observability tools manager""" 29 | 30 | def __init__(self, server: FastMCP): 31 | """ 32 | initialize the tools manager 33 | 34 | Args: 35 | server: FastMCP server instance 36 | """ 37 | self.server = server 38 | self._register_tools() 39 | 40 | def _register_tools(self): 41 | """register cms and prometheus related tools functions""" 42 | 43 | @self.server.tool() 44 | @retry( 45 | stop=stop_after_attempt(2), 46 | wait=wait_fixed(1), 47 | retry=retry_if_exception_type(Exception), 48 | reraise=True, 49 | ) 50 | @handle_tea_exception 51 | def cms_translate_text_to_promql( 52 | ctx: Context, 53 | text: str = Field( 54 | ..., 55 | description="the natural language text to generate promql", 56 | ), 57 | project: str = Field(..., description="sls project name"), 
58 | metricStore: str = Field(..., description="sls metric store name"), 59 | regionId: str = Field( 60 | default=..., 61 | description="aliyun region id,region id format like 'xx-xxx',like 'cn-hangzhou'", 62 | ), 63 | ) -> str: 64 | """将自然语言转换为Prometheus PromQL查询语句。 65 | 66 | ## 功能概述 67 | 68 | 该工具可以将自然语言描述转换为有效的PromQL查询语句,便于用户使用自然语言表达查询需求。 69 | 70 | ## 使用场景 71 | 72 | - 当用户不熟悉PromQL查询语法时 73 | - 当需要快速构建复杂查询时 74 | - 当需要从自然语言描述中提取查询意图时 75 | 76 | ## 使用限制 77 | 78 | - 仅支持生成PromQL查询 79 | - 生成的是查询语句,而非查询结果 80 | - 禁止使用sls_execute_query工具执行,两者接口不兼容 81 | 82 | ## 最佳实践 83 | 84 | - 提供清晰简洁的自然语言描述 85 | - 不要在描述中包含项目或时序库名称 86 | - 首次生成的查询可能不完全符合要求,可能需要多次尝试 87 | 88 | ## 查询示例 89 | 90 | - "帮我生成 XXX 的PromQL查询语句" 91 | - "查询每个namespace下的Pod数量" 92 | 93 | Args: 94 | ctx: MCP上下文,用于访问SLS客户端 95 | text: 用于生成查询的自然语言文本 96 | project: SLS项目名称 97 | metricStore: SLS时序库名称 98 | regionId: 阿里云区域ID 99 | 100 | Returns: 101 | 生成的PromQL查询语句 102 | """ 103 | try: 104 | sls_client: SLSClient = ctx.request_context.lifespan_context[ 105 | "sls_client" 106 | ].with_region("cn-shanghai") 107 | request: CallAiToolsRequest = CallAiToolsRequest() 108 | request.tool_name = "text_to_promql" 109 | request.region_id = regionId 110 | params: dict[str, Any] = { 111 | "project": project, 112 | "metricstore": metricStore, 113 | "sys.query": text, 114 | } 115 | request.params = params 116 | runtime: util_models.RuntimeOptions = util_models.RuntimeOptions() 117 | runtime.read_timeout = 60000 118 | runtime.connect_timeout = 60000 119 | tool_response: CallAiToolsResponse = ( 120 | sls_client.call_ai_tools_with_options( 121 | request=request, headers={}, runtime=runtime 122 | ) 123 | ) 124 | data = tool_response.body 125 | if "------answer------\n" in data: 126 | data = data.split("------answer------\n")[1] 127 | return data 128 | except Exception as e: 129 | logger.error(f"调用CMS AI工具失败: {str(e)}") 130 | raise 131 | 132 | @self.server.tool() 133 | @retry( 134 | stop=stop_after_attempt(2), 135 | wait=wait_fixed(1), 136 | retry=retry_if_exception_type(Exception), 137 | reraise=True, 138 | ) 139 | @handle_tea_exception 140 | def cms_execute_promql_query( 141 | ctx: Context, 142 | project: str = Field(..., description="sls project name"), 143 | metricStore: str = Field(..., description="sls metric store name"), 144 | query: str = Field(..., description="query"), 145 | fromTimestampInSeconds: int = Field( 146 | ..., 147 | description="from timestamp,unit is second,should be unix timestamp, only number,no other characters", 148 | ), 149 | toTimestampInSeconds: int = Field( 150 | ..., 151 | description="to timestamp,unit is second,should be unix timestamp, only number,no other characters", 152 | ), 153 | regionId: str = Field( 154 | default=..., 155 | description="aliyun region id,region id format like 'xx-xxx',like 'cn-hangzhou'", 156 | ), 157 | ) -> dict: 158 | """执行Prometheus指标查询。 159 | 160 | ## 功能概述 161 | 162 | 该工具用于在指定的SLS项目和时序库上执行查询语句,并返回查询结果。查询将在指定的时间范围内执行。 163 | 如果上下文没有提到具体的 SQL 语句,必须优先使用 cms_translate_text_to_promql 工具生成查询语句,无论问题有多简单。 164 | 如果上下文没有提到具体的时间戳,必须优先使用 sls_get_current_time 工具生成时间戳参数,默认为最近15分钟 165 | 166 | ## 使用场景 167 | 168 | - 当需要根据特定条件查询日志数据时 169 | - 当需要分析特定时间范围内的日志信息时 170 | - 当需要检索日志中的特定事件或错误时 171 | - 当需要统计日志数据的聚合信息时 172 | 173 | 174 | ## 查询语法 175 | 176 | 查询必须使用PromQL有效的查询语法,而非自然语言。 177 | 178 | ## 时间范围 179 | 180 | 查询必须指定时间范围: 181 | - fromTimestampInSeconds: 开始时间戳(秒) 182 | - toTimestampInSeconds: 结束时间戳(秒) 183 | 默认为最近15分钟,需要调用 sls_get_current_time 工具获取当前时间 184 | 185 | ## 查询示例 186 | 187 | - "帮我查询下 job xxx 的采集状态" 188 | - "查一下当前有多少个 Pod" 189 | 190 
| ## 输出 191 | 查询结果为:xxxxx 192 | 对应的图示:将 image 中的 URL 连接到图示中,并展示在图示中。 193 | 194 | Args: 195 | ctx: MCP上下文,用于访问CMS客户端 196 | project: SLS项目名称 197 | metricStore: SLS日志库名称 198 | query: PromQL查询语句 199 | fromTimestampInSeconds: 查询开始时间戳(秒) 200 | toTimestampInSeconds: 查询结束时间戳(秒) 201 | regionId: 阿里云区域ID 202 | 203 | Returns: 204 | 查询结果列表,每个元素为一条日志记录 205 | """ 206 | spls = CMSSPLContainer() 207 | sls_client: SLSClient = ctx.request_context.lifespan_context[ 208 | "sls_client" 209 | ].with_region(regionId) 210 | query = spls.get_spl("raw-promql-template").replace("", query) 211 | print(query) 212 | 213 | request: GetLogsRequest = GetLogsRequest( 214 | query=query, 215 | from_=fromTimestampInSeconds, 216 | to=toTimestampInSeconds, 217 | ) 218 | runtime: util_models.RuntimeOptions = util_models.RuntimeOptions() 219 | runtime.read_timeout = 60000 220 | runtime.connect_timeout = 60000 221 | response: GetLogsResponse = sls_client.get_logs_with_options( 222 | project, metricStore, request, headers={}, runtime=runtime 223 | ) 224 | response_body: List[Dict[str, Any]] = response.body 225 | 226 | result = { 227 | "data": response_body, 228 | "message": ( 229 | "success" 230 | if response_body 231 | else "Not found data by query,you can try to change the query or time range" 232 | ), 233 | } 234 | print(result) 235 | return result 236 | 237 | 238 | class CMSSPLContainer: 239 | def __init__(self): 240 | self.spls = {} 241 | self.spls[ 242 | "raw-promql-template" 243 | ] = r""" 244 | .set "sql.session.velox_support_row_constructor_enabled" = 'true'; 245 | .set "sql.session.presto_velox_mix_run_not_check_linked_agg_enabled" = 'true'; 246 | .set "sql.session.presto_velox_mix_run_support_complex_type_enabled" = 'true'; 247 | .set "sql.session.velox_sanity_limit_enabled" = 'false'; 248 | .metricstore with(promql_query='',range='1m')| extend latest_ts = element_at(__ts__,cardinality(__ts__)), latest_val = element_at(__value__,cardinality(__value__)) 249 | | stats arr_ts = array_agg(__ts__), arr_val = array_agg(__value__), title_agg = array_agg(json_format(cast(__labels__ as json))), anomalies_score_series = array_agg(array[0.0]), anomalies_type_series = array_agg(array['']), cnt = count(*), latest_ts = array_agg(latest_ts), latest_val = array_agg(latest_val) 250 | | extend cluster_res = cluster(arr_val,'kmeans') | extend params = concat('{"n_col": ', cast(cnt as varchar), ',"subplot":true}') 251 | | extend image = series_anomalies_plot(arr_ts, arr_val, anomalies_score_series, anomalies_type_series, title_agg, params)| project title_agg,cnt,latest_ts,latest_val,image 252 | """ 253 | 254 | def get_spl(self, key) -> str: 255 | return self.spls.get(key, "Key not found") 256 | -------------------------------------------------------------------------------- /src/mcp_server_aliyun_observability/toolkit/sls_toolkit.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Dict, List 3 | 4 | from alibabacloud_sls20201230.client import Client 5 | from alibabacloud_sls20201230.models import (CallAiToolsRequest, 6 | CallAiToolsResponse, 7 | GetIndexResponse, 8 | GetIndexResponseBody, 9 | GetLogsRequest, GetLogsResponse, 10 | IndexJsonKey, IndexKey, 11 | ListLogStoresRequest, 12 | ListLogStoresResponse, 13 | ListProjectRequest, 14 | ListProjectResponse) 15 | from alibabacloud_tea_util import models as util_models 16 | from mcp.server.fastmcp import Context, FastMCP 17 | from mcp.server.fastmcp.prompts import base 18 | from pydantic import Field 19 | from 
tenacity import (retry, retry_if_exception_type, stop_after_attempt, 20 | wait_fixed) 21 | 22 | from mcp_server_aliyun_observability.utils import (append_current_time, 23 | get_current_time, 24 | handle_tea_exception, 25 | parse_json_keys, 26 | text_to_sql) 27 | 28 | # 配置日志 29 | logger = logging.getLogger(__name__) 30 | 31 | 32 | class SLSToolkit: 33 | """aliyun observability tools manager""" 34 | 35 | def __init__(self, server: FastMCP): 36 | """ 37 | initialize the tools manager 38 | 39 | Args: 40 | server: FastMCP server instance 41 | """ 42 | self.server = server 43 | self._register_sls_tools() 44 | self._register_prompts() 45 | 46 | 47 | def _register_prompts(self): 48 | """register sls related prompts functions""" 49 | 50 | @self.server.prompt(name="sls 日志查询 prompt",description="当用户需要查询 sls 日志时,可以调用该 prompt来获取过程") 51 | def query_sls_logs(question: str) -> str: 52 | """当用户需要查询 sls 日志时,可以调用该 prompt来获取过程""" 53 | return [ 54 | base.UserMessage("基于以下问题查询下对应的 sls日志:"), 55 | base.UserMessage( 56 | f"问题: {question}" 57 | ), 58 | base.UserMessage("过程如下:"), 59 | base.UserMessage(content="1.首先尝试从上下文提取有效的 project 和 logstore 信息,如果上下文没有提供,请使用 sls_list_projects 和 sls_list_logstores 工具获取"), 60 | base.UserMessage(content="2.如果问题里面已经明确包含了查询语句,则直接使用,如果问题里面没有明确包含查询语句,则需要使用 sls_translate_natural_language_to_log_query 工具生成查询语句"), 61 | base.UserMessage( 62 | "3. 最后使用 sls_execute_query 工具执行查询语句,获取查询结果" 63 | ), 64 | base.UserMessage("3. 返回查询到的日志"), 65 | ] 66 | 67 | def _register_sls_tools(self): 68 | """register sls related tools functions""" 69 | 70 | @self.server.tool() 71 | def sls_list_projects( 72 | ctx: Context, 73 | projectName: str = Field( 74 | None, description="project name,fuzzy search" 75 | ), 76 | limit: int = Field( 77 | default=50, description="limit,max is 100", ge=1, le=100 78 | ), 79 | regionId: str = Field(default=..., description="aliyun region id"), 80 | ) -> list[dict[str, Any]]: 81 | """列出阿里云日志服务中的所有项目。 82 | 83 | ## 功能概述 84 | 85 | 该工具可以列出指定区域中的所有SLS项目,支持通过项目名进行模糊搜索。如果不提供项目名称,则返回该区域的所有项目。 86 | 87 | ## 使用场景 88 | 89 | - 当需要查找特定项目是否存在时 90 | - 当需要获取某个区域下所有可用的SLS项目列表时 91 | - 当需要根据项目名称的部分内容查找相关项目时 92 | 93 | ## 返回数据结构 94 | 95 | 返回的项目信息包含: 96 | - project_name: 项目名称 97 | - description: 项目描述 98 | - region_id: 项目所在区域 99 | 100 | ## 查询示例 101 | 102 | - "有没有叫 XXX 的 project" 103 | - "列出所有SLS项目" 104 | 105 | Args: 106 | ctx: MCP上下文,用于访问SLS客户端 107 | projectName: 项目名称查询字符串,支持模糊搜索 108 | limit: 返回结果的最大数量,范围1-100,默认10 109 | regionId: 阿里云区域ID,region id format like "xx-xxx",like "cn-hangzhou" 110 | 111 | Returns: 112 | 包含项目信息的字典列表,每个字典包含project_name、description和region_id 113 | """ 114 | sls_client: Client = ctx.request_context.lifespan_context[ 115 | "sls_client" 116 | ].with_region(regionId) 117 | request: ListProjectRequest = ListProjectRequest( 118 | project_name=projectName, 119 | size=limit, 120 | ) 121 | response: ListProjectResponse = sls_client.list_project(request) 122 | 123 | return{ 124 | "projects": [ 125 | { 126 | "project_name": project.project_name, 127 | "description": project.description, 128 | "region_id": project.region, 129 | } 130 | for project in response.body.projects 131 | ], 132 | "message": f"当前最多支持查询{limit}个项目,未防止返回数据过长,如果需要查询更多项目,您可以提供 project 的关键词来模糊查询" 133 | } 134 | 135 | @self.server.tool() 136 | @retry( 137 | stop=stop_after_attempt(2), 138 | wait=wait_fixed(1), 139 | retry=retry_if_exception_type(Exception), 140 | reraise=True, 141 | ) 142 | @handle_tea_exception 143 | def sls_list_logstores( 144 | ctx: Context, 145 | project: str = Field(..., description="sls project name,must 
exact match,should not contain chinese characters"), 146 | logStore: str = Field(None, description="log store name,fuzzy search"), 147 | limit: int = Field(10, description="limit,max is 100", ge=1, le=100), 148 | isMetricStore: bool = Field( 149 | False, 150 | description="is metric store,default is False,only use want to find metric store", 151 | ), 152 | logStoreType: str = Field( 153 | None, 154 | description="log store type,default is logs,should be logs,metrics", 155 | ), 156 | regionId: str = Field( 157 | default=..., 158 | description="aliyun region id,region id format like 'xx-xxx',like 'cn-hangzhou'", 159 | ), 160 | ) -> list[str]: 161 | """列出SLS项目中的日志库。 162 | 163 | ## 功能概述 164 | 165 | 该工具可以列出指定SLS项目中的所有日志库,如果不选,则默认为日志库类型 166 | 支持通过日志库名称进行模糊搜索。如果不提供日志库名称,则返回项目中的所有日志库。 167 | 168 | ## 使用场景 169 | 170 | - 当需要查找特定项目下是否存在某个日志库时 171 | - 当需要获取项目中所有可用的日志库列表时 172 | - 当需要根据日志库名称的部分内容查找相关日志库时 173 | - 如果从上下文未指定 project参数,除非用户说了遍历,则可使用 sls_list_projects 工具获取项目列表 174 | 175 | ## 是否指标库 176 | 177 | 如果需要查找指标或者时序相关的库,请将is_metric_store参数设置为True 178 | 179 | ## 查询示例 180 | 181 | - "我想查询有没有 XXX 的日志库" 182 | - "某个 project 有哪些 log store" 183 | 184 | Args: 185 | ctx: MCP上下文,用于访问SLS客户端 186 | project: SLS项目名称,必须精确匹配 187 | log_store: 日志库名称,支持模糊搜索 188 | limit: 返回结果的最大数量,范围1-100,默认10 189 | is_metric_store: 是否指标库,可选值为True或False,默认为False 190 | region_id: 阿里云区域ID 191 | 192 | Returns: 193 | 日志库名称的字符串列表 194 | """ 195 | if isMetricStore: 196 | logStoreType = "Metrics" 197 | 198 | if project == "": 199 | return { 200 | "total": 0, 201 | "logstores": [], 202 | "messager": "Please specify the project name,if you want to list all projects,please use sls_list_projects tool", 203 | } 204 | sls_client: Client = ctx.request_context.lifespan_context[ 205 | "sls_client" 206 | ].with_region(regionId) 207 | request: ListLogStoresRequest = ListLogStoresRequest( 208 | logstore_name=logStore, 209 | size=limit, 210 | telemetry_type=logStoreType, 211 | ) 212 | response: ListLogStoresResponse = sls_client.list_log_stores( 213 | project, request 214 | ) 215 | log_store_count = response.body.total 216 | log_store_list = response.body.logstores 217 | return { 218 | "total": log_store_count, 219 | "logstores": log_store_list, 220 | "message": ( 221 | "Sorry not found logstore,please make sure your project and region or logstore name is correct, if you want to find metric store,please check is_metric_store parameter" 222 | if log_store_count == 0 223 | else f"当前最多支持查询{limit}个日志库,未防止返回数据过长,如果需要查询更多日志库,您可以提供 logstore 的关键词来模糊查询" 224 | ), 225 | } 226 | 227 | @self.server.tool() 228 | @retry( 229 | stop=stop_after_attempt(2), 230 | wait=wait_fixed(1), 231 | retry=retry_if_exception_type(Exception), 232 | reraise=True, 233 | ) 234 | @handle_tea_exception 235 | def sls_describe_logstore( 236 | ctx: Context, 237 | project: str = Field( 238 | ..., description="sls project name,must exact match,not fuzzy search" 239 | ), 240 | logStore: str = Field( 241 | ..., description="sls log store name,must exact match,not fuzzy search" 242 | ), 243 | regionId: str = Field( 244 | default=..., 245 | description="aliyun region id,region id format like 'xx-xxx',like 'cn-hangzhou'", 246 | ), 247 | ) -> dict: 248 | """获取SLS日志库的结构信息。 249 | 250 | ## 功能概述 251 | 252 | 该工具用于获取指定SLS项目中日志库的索引信息和结构定义,包括字段类型、别名、是否大小写敏感等信息。 253 | 254 | ## 使用场景 255 | 256 | - 当需要了解日志库的字段结构时 257 | - 当需要获取日志库的索引配置信息时 258 | - 当构建查询语句前需要了解可用字段时 259 | - 当需要分析日志数据结构时 260 | 261 | ## 返回数据结构 262 | 263 | 返回一个字典,键为字段名,值包含以下信息: 264 | - alias: 字段别名 265 | - sensitive: 是否大小写敏感 266 | - type: 字段类型 267 | - 
json_keys: JSON字段的子字段信息 268 | 269 | ## 查询示例 270 | 271 | - "我想查询 XXX 的日志库的 schema" 272 | - "我想查询 XXX 的日志库的 index" 273 | - "我想查询 XXX 的日志库的结构信息" 274 | 275 | Args: 276 | ctx: MCP上下文,用于访问SLS客户端 277 | project: SLS项目名称,必须精确匹配 278 | log_store: SLS日志库名称,必须精确匹配 279 | region_id: 阿里云区域ID 280 | 281 | Returns: 282 | 包含日志库结构信息的字典 283 | """ 284 | sls_client: Client = ctx.request_context.lifespan_context[ 285 | "sls_client" 286 | ].with_region(regionId) 287 | response: GetIndexResponse = sls_client.get_index(project, logStore) 288 | response_body: GetIndexResponseBody = response.body 289 | keys: dict[str, IndexKey] = response_body.keys 290 | index_dict: dict[str, dict[str, str]] = {} 291 | for key, value in keys.items(): 292 | index_dict[key] = { 293 | "alias": value.alias, 294 | "sensitive": value.case_sensitive, 295 | "type": value.type, 296 | "json_keys": parse_json_keys(value.json_keys), 297 | } 298 | return index_dict 299 | 300 | @self.server.tool() 301 | @retry( 302 | stop=stop_after_attempt(2), 303 | wait=wait_fixed(1), 304 | retry=retry_if_exception_type(Exception), 305 | reraise=True, 306 | ) 307 | @handle_tea_exception 308 | def sls_execute_sql_query( 309 | ctx: Context, 310 | project: str = Field(..., description="sls project name"), 311 | logStore: str = Field(..., description="sls log store name"), 312 | query: str = Field(..., description="query"), 313 | fromTimestampInSeconds: int = Field( 314 | ..., description="from timestamp,unit is second,should be unix timestamp, only number,no other characters" 315 | ), 316 | toTimestampInSeconds: int = Field(..., description="to timestamp,unit is second,should be unix timestamp, only number,no other characters"), 317 | limit: int = Field(10, description="limit,max is 100", ge=1, le=100), 318 | regionId: str = Field( 319 | default=..., 320 | description="aliyun region id,region id format like 'xx-xxx',like 'cn-hangzhou'", 321 | ), 322 | ) -> dict: 323 | """执行SLS日志查询。 324 | 325 | ## 功能概述 326 | 327 | 该工具用于在指定的SLS项目和日志库上执行查询语句,并返回查询结果。查询将在指定的时间范围内执行。 如果上下文没有提到具体的 SQL 语句,必须优先使用 sls_translate_text_to_sql_query 工具生成查询语句,无论问题有多简单 328 | 329 | ## 使用场景 330 | 331 | - 当需要根据特定条件查询日志数据时 332 | - 当需要分析特定时间范围内的日志信息时 333 | - 当需要检索日志中的特定事件或错误时 334 | - 当需要统计日志数据的聚合信息时 335 | 336 | 337 | ## 查询语法 338 | 339 | 查询必须使用SLS有效的查询语法,而非自然语言。如果不了解日志库的结构,可以先使用sls_describe_logstore工具获取索引信息。 340 | 341 | ## 时间范围 342 | 343 | 查询必须指定时间范围: if the query is generated by sls_translate_text_to_sql_query tool, should use the fromTimestampInSeconds and toTimestampInSeconds in the sls_translate_text_to_sql_query response 344 | - fromTimestampInSeconds: 开始时间戳(秒) 345 | - toTimestampInSeconds: 结束时间戳(秒) 346 | 347 | ## 查询示例 348 | 349 | - "帮我查询下 XXX 的日志信息" 350 | - "查找最近一小时内的错误日志" 351 | 352 | ## 错误处理 353 | - Column xxx can not be resolved 如果是 sls_translate_text_to_sql_query 工具生成的查询语句 可能存在查询列未开启统计,可以提示用户增加相对应的信息,或者调用 sls_describe_logstore 工具获取索引信息之后,要用户选择正确的字段或者提示用户对列开启统计。当确定列开启统计之后,可以再次调用sls_translate_text_to_sql_query 工具生成查询语句 354 | 355 | Args: 356 | ctx: MCP上下文,用于访问SLS客户端 357 | project: SLS项目名称 358 | logStore: SLS日志库名称 359 | query: SLS查询语句 360 | fromTimestamp: 查询开始时间戳(秒) 361 | toTimestamp: 查询结束时间戳(秒) 362 | limit: 返回结果的最大数量,范围1-100,默认10 363 | regionId: 阿里云区域ID 364 | 365 | Returns: 366 | 查询结果列表,每个元素为一条日志记录 367 | """ 368 | sls_client: Client = ctx.request_context.lifespan_context[ 369 | "sls_client" 370 | ].with_region(regionId) 371 | request: GetLogsRequest = GetLogsRequest( 372 | query=query, 373 | from_=fromTimestampInSeconds, 374 | to=toTimestampInSeconds, 375 | line=limit, 376 | ) 377 | runtime: 
util_models.RuntimeOptions = util_models.RuntimeOptions() 378 | runtime.read_timeout = 60000 379 | runtime.connect_timeout = 60000 380 | response: GetLogsResponse = sls_client.get_logs_with_options( 381 | project, logStore, request, headers={}, runtime=runtime 382 | ) 383 | response_body: List[Dict[str, Any]] = response.body 384 | result = { 385 | "data": response_body, 386 | "message": "success" 387 | if response_body 388 | else "Not found data by query,you can try to change the query or time range", 389 | } 390 | return result 391 | 392 | @self.server.tool() 393 | @retry( 394 | stop=stop_after_attempt(2), 395 | wait=wait_fixed(1), 396 | retry=retry_if_exception_type(Exception), 397 | reraise=True, 398 | ) 399 | @handle_tea_exception 400 | def sls_translate_text_to_sql_query( 401 | ctx: Context, 402 | text: str = Field( 403 | ..., 404 | description="the natural language text to generate sls log store query", 405 | ), 406 | project: str = Field(..., description="sls project name"), 407 | logStore: str = Field(..., description="sls log store name"), 408 | regionId: str = Field( 409 | default=..., 410 | description="aliyun region id,region id format like 'xx-xxx',like 'cn-hangzhou'", 411 | ), 412 | ) -> dict[str, Any]: 413 | """将自然语言转换为SLS查询语句。当用户有明确的 logstore 查询需求,必须优先使用该工具来生成查询语句 414 | 415 | ## 功能概述 416 | 417 | 该工具可以将自然语言描述转换为有效的SLS查询语句,便于用户使用自然语言表达查询需求。用户有任何 SLS 日志查询需求时,都需要优先使用该工具。 418 | 419 | ## 使用场景 420 | 421 | - 当用户不熟悉SLS查询语法时 422 | - 当需要快速构建复杂查询时 423 | - 当需要从自然语言描述中提取查询意图时 424 | 425 | ## 使用限制 426 | 427 | - 仅支持生成SLS查询,不支持其他数据库的SQL如MySQL、PostgreSQL等 428 | - 生成的是查询语句,而非查询结果,需要配合sls_execute_query工具使用 429 | - 如果查询涉及ARMS应用,应优先使用arms_generate_trace_query工具 430 | - 需要对应的 log_sotre 已经设定了索引信息,如果生成的结果里面有字段没有索引或者开启统计,可能会导致查询失败,需要友好的提示用户增加相对应的索引信息 431 | 432 | ## 最佳实践 433 | 434 | - 提供清晰简洁的自然语言描述 435 | - 不要在描述中包含项目或日志库名称 436 | - 如有需要,指定查询的时间范围 437 | - 首次生成的查询可能不完全符合要求,可能需要多次尝试 438 | 439 | ## 查询示例 440 | 441 | - "帮我生成下 XXX 的日志查询语句" 442 | - "查找最近一小时内的错误日志" 443 | 444 | Args: 445 | ctx: MCP上下文,用于访问SLS客户端 446 | text: 用于生成查询的自然语言文本 447 | project: SLS项目名称 448 | log_store: SLS日志库名称 449 | region_id: 阿里云区域ID 450 | 451 | Returns: 452 | 生成的SLS查询语句 453 | """ 454 | 455 | return text_to_sql(ctx, text, project, logStore, regionId) 456 | 457 | @self.server.tool() 458 | def sls_diagnose_query( 459 | ctx: Context, 460 | query: str = Field(..., description="sls query"), 461 | errorMessage: str = Field(..., description="error message"), 462 | project: str = Field(..., description="sls project name"), 463 | logStore: str = Field(..., description="sls log store name"), 464 | regionId: str = Field( 465 | default=..., 466 | description="aliyun region id,region id format like 'xx-xxx',like 'cn-hangzhou'", 467 | ), 468 | ) -> dict: 469 | """诊断SLS查询语句。 470 | 471 | ## 功能概述 472 | 473 | 当 SLS 查询语句执行失败时,可以调用该工具,根据错误信息,生成诊断结果。诊断结果会包含查询语句的正确性、性能分析、优化建议等信息。 474 | 475 | ## 使用场景 476 | 477 | - 当需要诊断SLS查询语句的正确性时 478 | - 当 SQL 执行错误需要查找原因时 479 | 480 | ## 查询示例 481 | 482 | - "帮我诊断下 XXX 的日志查询语句" 483 | - "帮我分析下 XXX 的日志查询语句" 484 | 485 | Args: 486 | ctx: MCP上下文,用于访问SLS客户端 487 | query: SLS查询语句 488 | error_message: 错误信息 489 | project: SLS项目名称 490 | log_store: SLS日志库名称 491 | region_id: 阿里云区域ID 492 | """ 493 | try: 494 | sls_client_wrapper = ctx.request_context.lifespan_context["sls_client"] 495 | sls_client: Client = sls_client_wrapper.with_region("cn-shanghai") 496 | knowledge_config = sls_client_wrapper.get_knowledge_config(project, logStore) 497 | request: CallAiToolsRequest = CallAiToolsRequest() 498 | request.tool_name = "diagnosis_sql" 499 | 
request.region_id = regionId 500 | params: dict[str, Any] = { 501 | "project": project, 502 | "logstore": logStore, 503 | "sys.query": append_current_time(f"帮我诊断下 {query} 的日志查询语句,错误信息为 {errorMessage}"), 504 | "external_knowledge_uri": knowledge_config["uri"] if knowledge_config else "", 505 | "external_knowledge_key": knowledge_config["key"] if knowledge_config else "", 506 | } 507 | request.params = params 508 | runtime: util_models.RuntimeOptions = util_models.RuntimeOptions() 509 | runtime.read_timeout = 60000 510 | runtime.connect_timeout = 60000 511 | tool_response: CallAiToolsResponse = ( 512 | sls_client.call_ai_tools_with_options( 513 | request=request, headers={}, runtime=runtime 514 | ) 515 | ) 516 | data = tool_response.body 517 | if "------answer------\n" in data: 518 | data = data.split("------answer------\n")[1] 519 | return data 520 | except Exception as e: 521 | logger.error(f"调用SLS AI工具失败: {str(e)}") 522 | raise -------------------------------------------------------------------------------- /src/mcp_server_aliyun_observability/toolkit/util_toolkit.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | 4 | from mcp.server.fastmcp import Context, FastMCP 5 | 6 | from mcp_server_aliyun_observability import utils 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class UtilToolkit: 12 | def __init__(self, server: FastMCP): 13 | self.server = server 14 | self._register_common_tools() 15 | 16 | def _register_common_tools(self): 17 | """register common tools functions""" 18 | 19 | @self.server.tool() 20 | def sls_get_regions(ctx: Context) -> dict: 21 | """获取阿里云的部分区域列表。 22 | 23 | ## 功能概述 24 | 25 | 该工具用于获取阿里云的部分区域列表,便于在执行SLS查询时指定区域。 26 | 27 | ## 使用场景 28 | 29 | - 当需要获取阿里云的部分区域列表时 30 | - 当需要根据区域进行SLS查询时 31 | - 当用户没有明确指定区域ID 时,可以调用该工具获取区域列表,并要求用户进行选择 32 | 33 | ## 返回数据格式 34 | 35 | 返回包含区域列表的字典,每个字典包含region_id和region_name。 36 | 37 | ## 查询示例 38 | 39 | - "获取阿里云的部分区域列表" 40 | """ 41 | return [ 42 | {"RegionName": "华北1(青岛)", "RegionId": "cn-qingdao"}, 43 | {"RegionName": "华北2(北京)", "RegionId": "cn-beijing"}, 44 | {"RegionName": "华北3(张家口)", "RegionId": "cn-zhangjiakou"}, 45 | {"RegionName": "华北5(呼和浩特)", "RegionId": "cn-huhehaote"}, 46 | {"RegionName": "华北6(乌兰察布)", "RegionId": "cn-wulanchabu"}, 47 | {"RegionName": "华东1(杭州)", "RegionId": "cn-hangzhou"}, 48 | {"RegionName": "华东2(上海)", "RegionId": "cn-shanghai"}, 49 | {"RegionName": "华东5(南京-本地地域)", "RegionId": "cn-nanjing"}, 50 | {"RegionName": "华东6(福州-本地地域)", "RegionId": "cn-fuzhou"}, 51 | {"RegionName": "华南1(深圳)", "RegionId": "cn-shenzhen"}, 52 | {"RegionName": "华南2(河源)", "RegionId": "cn-heyuan"}, 53 | {"RegionName": "华南3(广州)", "RegionId": "cn-guangzhou"}, 54 | {"RegionName": "西南1(成都)", "RegionId": "cn-chengdu"}, 55 | ] 56 | 57 | @self.server.tool() 58 | def sls_get_current_time(ctx: Context) -> dict: 59 | """获取当前时间。 60 | 61 | ## 功能概述 62 | 1. 获取当前时间,会返回当前时间字符串和当前时间戳(毫秒) 63 | 64 | ## 使用场景 65 | 1. 
只有当无法从聊天记录里面获取到当前时间时候才可以调用该工具 66 | """ 67 | return utils.get_current_time() 68 | -------------------------------------------------------------------------------- /src/mcp_server_aliyun_observability/utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import logging 3 | import json 4 | import os.path 5 | from pathlib import Path 6 | from datetime import datetime 7 | from functools import wraps 8 | from typing import Any, Callable, Optional, TypeVar, cast 9 | 10 | from alibabacloud_arms20190808.client import Client as ArmsClient 11 | from alibabacloud_credentials.client import Client as CredClient 12 | from alibabacloud_sls20201230.client import Client 13 | from alibabacloud_sls20201230.client import Client as SLSClient 14 | from alibabacloud_sls20201230.models import (CallAiToolsRequest, 15 | CallAiToolsResponse, IndexJsonKey) 16 | from alibabacloud_tea_openapi import models as open_api_models 17 | from alibabacloud_tea_util import models as util_models 18 | from mcp.server.fastmcp import Context 19 | from Tea.exceptions import TeaException 20 | 21 | from mcp_server_aliyun_observability.api_error import TEQ_EXCEPTION_ERROR 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class KnowledgeEndpoint: 27 | """外部知识库配置 28 | 该类用于加载和管理外部知识库的配置,包括全局/Project/Logstore级别的外部知识库 endpoint 配置。 29 | 其配置优先级:Logstore > Project default > Global default 30 | 配置文件示例如下: 31 | ```json 32 | { 33 | "default_endpoint": {"uri": "https://api.default.com", "key": "Bearer dataset-***"}, 34 | "projects": { 35 | "project1": { 36 | "default_endpoint": {"uri": "https://api.project1.com", "key": "Bearer dataset-***"}, 37 | "logstore1": {"uri": "https://api.project1.logstore1.com","key": "Bearer dataset-***"}, 38 | "logstore2": {"uri": "https://api.project1.logstore2.com","key": "Bearer dataset-***"} 39 | }, 40 | "project2": { 41 | "logstore3": {"uri": "https://api.project2.logstore3.com","key": "Bearer dataset-***"} 42 | } 43 | } 44 | ``` 45 | } 46 | """ 47 | def __init__(self, file_path): 48 | try: 49 | # 将路径转换为绝对路径,支持用户目录(~)和环境变量(如 $HOME) 50 | expanded_path = os.path.expandvars(file_path) 51 | self.file_path = Path(expanded_path).expanduser().resolve() 52 | with open(self.file_path, 'r', encoding='utf-8') as file: 53 | self.config = json.load(file) 54 | logger.warning(f"已加载外部知识库配置文件 {self.file_path}") 55 | except FileNotFoundError: 56 | logger.warning(f"外部知识库配置文件 {self.file_path} 不存在") 57 | except json.JSONDecodeError as e: 58 | logger.warning(f"外部知识库配置 JSON 格式错误: {e}") 59 | 60 | # 全局默认 endpoint 61 | self.global_default = self.config.get("default_endpoint", None) 62 | 63 | # 项目配置 64 | self.projects = self.config.get("projects", {}) 65 | 66 | def get_config(self, project:str, logstore:str) -> str: 67 | """获取指定项目和日志仓库的外部知识库 endpoint 配置 68 | 优先级:logstore > project default > global default 69 | :param project: 项目名称 70 | :param logstore: 日志仓库名称 71 | :return: 外部知识库 endpoint 72 | """ 73 | project_config = self.projects.get(project, None) 74 | if project_config is None: 75 | return self.global_default 76 | 77 | logstore_config = project_config.get(logstore) 78 | if logstore_config is None: 79 | return self.project_config.get("default_endpoint", None) 80 | 81 | return logstore_config 82 | 83 | class CredentialWrapper: 84 | """ 85 | A wrapper for aliyun credentials 86 | """ 87 | 88 | access_key_id: str 89 | access_key_secret: str 90 | knowledge_config: KnowledgeEndpoint 91 | 92 | def __init__(self, access_key_id: str, access_key_secret: str, knowledge_config: 
str): 93 | self.access_key_id = access_key_id 94 | self.access_key_secret = access_key_secret 95 | self.knowledge_config = KnowledgeEndpoint(knowledge_config) if knowledge_config else None 96 | 97 | 98 | class SLSClientWrapper: 99 | """ 100 | A wrapper for aliyun client 101 | """ 102 | 103 | def __init__(self, credential: Optional[CredentialWrapper] = None): 104 | self.credential = credential 105 | 106 | def with_region( 107 | self, region: Optional[str] = None, endpoint: Optional[str] = None 108 | ) -> SLSClient: 109 | if self.credential: 110 | config = open_api_models.Config( 111 | access_key_id=self.credential.access_key_id, 112 | access_key_secret=self.credential.access_key_secret, 113 | ) 114 | else: 115 | credentialsClient = CredClient() 116 | config = open_api_models.Config(credential=credentialsClient) 117 | config.endpoint = f"{region}.log.aliyuncs.com" 118 | return SLSClient(config) 119 | 120 | def get_knowledge_config(self, project: str, logstore: str) -> Optional[dict]: 121 | if self.credential and self.credential.knowledge_config: 122 | res = self.credential.knowledge_config.get_config(project, logstore) 123 | if res and "uri" in res and "key" in res: 124 | return res 125 | return None 126 | 127 | 128 | class ArmsClientWrapper: 129 | """ 130 | A wrapper for aliyun arms client 131 | """ 132 | 133 | def __init__(self, credential: Optional[CredentialWrapper] = None): 134 | self.credential = credential 135 | 136 | def with_region(self, region: str, endpoint: Optional[str] = None) -> ArmsClient: 137 | if self.credential: 138 | config = open_api_models.Config( 139 | access_key_id=self.credential.access_key_id, 140 | access_key_secret=self.credential.access_key_secret, 141 | ) 142 | else: 143 | credentialsClient = CredClient() 144 | config = open_api_models.Config(credential=credentialsClient) 145 | config.endpoint = endpoint or f"arms.{region}.aliyuncs.com" 146 | return ArmsClient(config) 147 | 148 | 149 | def parse_json_keys(json_keys: dict[str, IndexJsonKey]) -> dict[str, dict[str, str]]: 150 | result: dict[str, dict[str, str]] = {} 151 | for key, value in json_keys.items(): 152 | result[key] = { 153 | "alias": value.alias, 154 | "sensitive": value.case_sensitive, 155 | "type": value.type, 156 | } 157 | return result 158 | 159 | 160 | def get_arms_user_trace_log_store(user_id: int, region: str) -> dict[str, str]: 161 | """ 162 | get the log store name of the user's trace 163 | """ 164 | # project是基于 user_id md5,proj-xtrace-xxx-cn-hangzhou 165 | if "finance" in region: 166 | text = str(user_id) + region 167 | project = f"proj-xtrace-{md5_string(text)}" 168 | else: 169 | text = str(user_id) 170 | project = f"proj-xtrace-{md5_string(text)}-{region}" 171 | # logstore-xtrace-1277589232893727-cn-hangzhou 172 | log_store = "logstore-tracing" 173 | return {"project": project, "log_store": log_store} 174 | 175 | 176 | 177 | 178 | 179 | 180 | def get_current_time() -> dict: 181 | """ 182 | 获取当前时间 183 | """ 184 | return { 185 | "current_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 186 | "current_timestamp": int(datetime.now().timestamp()), 187 | } 188 | 189 | 190 | def md5_string(origin: str) -> str: 191 | """ 192 | 计算字符串的MD5值,与Java实现对应 193 | 194 | Args: 195 | origin: 要计算MD5的字符串 196 | 197 | Returns: 198 | MD5值的十六进制字符串 199 | """ 200 | buf = origin.encode() 201 | 202 | md5 = hashlib.md5() 203 | 204 | md5.update(buf) 205 | 206 | tmp = md5.digest() 207 | 208 | sb = [] 209 | for b in tmp: 210 | hex_str = format(b & 0xFF, "x") 211 | sb.append(hex_str) 212 | 213 | return "".join(sb) 214 | 215 | 216 | T = TypeVar("T")
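# --- Editor's illustrative note (added comments, not part of the original repository source) ---
# A minimal sketch of how the helpers above fit together: get_arms_user_trace_log_store()
# derives the SLS project that stores ARMS trace data from the user id (hashed with md5_string)
# plus the region, and always pairs it with the fixed logstore "logstore-tracing", e.g.:
#   get_arms_user_trace_log_store(1277589232893727, "cn-hangzhou")
#   == {"project": f"proj-xtrace-{md5_string('1277589232893727')}-cn-hangzhou",
#       "log_store": "logstore-tracing"}
# For finance-cloud regions (region contains "finance") the md5 input is user_id + region and the
# region suffix is omitted from the project name.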
217 | 218 | 219 | def handle_tea_exception(func: Callable[..., T]) -> Callable[..., T]: 220 | """ 221 | 装饰器:处理阿里云 SDK 的 TeaException 异常 222 | 223 | Args: 224 | func: 被装饰的函数 225 | 226 | Returns: 227 | 装饰后的函数,会自动处理 TeaException 异常 228 | """ 229 | 230 | @wraps(func) 231 | def wrapper(*args, **kwargs) -> T: 232 | try: 233 | return func(*args, **kwargs) 234 | except TeaException as e: 235 | for error in TEQ_EXCEPTION_ERROR: 236 | if e.code == error["errorCode"]: 237 | return cast( 238 | T, 239 | { 240 | "solution": error["solution"], 241 | "message": error["errorMessage"], 242 | }, 243 | ) 244 | message=e.message 245 | if "Max retries exceeded with url" in message: 246 | return cast( 247 | T, 248 | { 249 | "solution": """ 250 | 可能原因: 251 | 1. 当前网络不具备访问内网域名的权限(如从公网或不通阿里云 VPC 访问); 252 | 2. 指定 region 错误或不可用; 253 | 3. 工具或网络中存在代理、防火墙限制; 254 | 如果你需要排查,可以从: 255 | • 尝试 ping 下域名是否可联通 256 | • 查看是否有 VPC endpoint 配置错误等,如果是非VPC 环境,请配置公网入口端点,一般公网端点不会包含-intranet 等字样 257 | """, 258 | "message": e.message, 259 | }, 260 | ) 261 | raise e 262 | 263 | return wrapper 264 | 265 | 266 | def text_to_sql( 267 | ctx: Context, text: str, project: str, log_store: str, region_id: str 268 | ) -> dict[str, Any]: 269 | try: 270 | sls_client_wrapper = ctx.request_context.lifespan_context["sls_client"] 271 | sls_client: Client = sls_client_wrapper.with_region("cn-shanghai") 272 | knowledge_config = sls_client_wrapper.get_knowledge_config(project, log_store) 273 | request: CallAiToolsRequest = CallAiToolsRequest() 274 | request.tool_name = "text_to_sql" 275 | request.region_id = region_id 276 | params: dict[str, Any] = { 277 | "project": project, 278 | "logstore": log_store, 279 | "sys.query": append_current_time(text), 280 | "external_knowledge_uri": knowledge_config["uri"] if knowledge_config else "", 281 | "external_knowledge_key": knowledge_config["key"] if knowledge_config else "", 282 | } 283 | request.params = params 284 | runtime: util_models.RuntimeOptions = util_models.RuntimeOptions() 285 | runtime.read_timeout = 60000 286 | runtime.connect_timeout = 60000 287 | tool_response: CallAiToolsResponse = sls_client.call_ai_tools_with_options( 288 | request=request, headers={}, runtime=runtime 289 | ) 290 | data = tool_response.body 291 | if "------answer------\n" in data: 292 | data = data.split("------answer------\n")[1] 293 | return { 294 | "data": data, 295 | "requestId": tool_response.headers.get("x-log-requestid", ""), 296 | } 297 | except Exception as e: 298 | logger.error(f"调用SLS AI工具失败: {str(e)}") 299 | raise 300 | 301 | def append_current_time(text: str) -> str: 302 | """ 303 | 添加当前时间 304 | """ 305 | return f"当前时间: {get_current_time()},问题:{text}" -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 测试包初始化文件 3 | """ 4 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | import pytest 4 | from mcp.server.fastmcp import Context, FastMCP 5 | 6 | 7 | @pytest.fixture(scope="session") 8 | def mock_sls_client(): 9 | """创建模拟的SLS客户端""" 10 | return Mock() 11 | 12 | @pytest.fixture(scope="session") 13 | def mock_arms_client(): 14 | """创建模拟的ARMS客户端""" 15 | return Mock() 16 | 17 | @pytest.fixture(scope="session") 18 | def mock_context(mock_sls_client, mock_arms_client): 19 | """创建模拟的Context实例""" 20 | context = 
Mock(spec=Context) 21 | context.request_context = Mock() 22 | context.request_context.lifespan_context = { 23 | "sls_client": mock_sls_client, 24 | "arms_client": mock_arms_client 25 | } 26 | return context 27 | 28 | @pytest.fixture(scope="session") 29 | def mock_server(): 30 | """创建模拟的FastMCP服务器实例""" 31 | return Mock(spec=FastMCP) -------------------------------------------------------------------------------- /tests/test_arms_toolkit.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | import dotenv 5 | import pytest 6 | from mcp.server.fastmcp import Context, FastMCP 7 | from mcp.shared.context import RequestContext 8 | 9 | from mcp_server_aliyun_observability.server import create_lifespan 10 | from mcp_server_aliyun_observability.toolkit.arms_toolkit import ArmsToolkit 11 | from mcp_server_aliyun_observability.utils import (ArmsClientWrapper, 12 | CredentialWrapper, 13 | SLSClientWrapper) 14 | 15 | dotenv.load_dotenv() 16 | 17 | import logging 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | @pytest.fixture 23 | def mcp_server(): 24 | """创建模拟的FastMCP服务器实例""" 25 | mcp_server = FastMCP( 26 | name="mcp_aliyun_observability_server", 27 | lifespan=create_lifespan( 28 | credential=CredentialWrapper( 29 | access_key_id=os.getenv("ALIYUN_ACCESS_KEY_ID"), 30 | access_key_secret=os.getenv("ALIYUN_ACCESS_KEY_SECRET"), 31 | ), 32 | ), 33 | ) 34 | return mcp_server 35 | 36 | 37 | @pytest.fixture 38 | def mock_request_context(): 39 | """创建模拟的RequestContext实例""" 40 | context = Context( 41 | request_context=RequestContext( 42 | request_id="test_request_id", 43 | meta=None, 44 | session=None, 45 | lifespan_context={ 46 | "arms_client": ArmsClientWrapper( 47 | credential=CredentialWrapper( 48 | access_key_id=os.getenv("ALIYUN_ACCESS_KEY_ID"), 49 | access_key_secret=os.getenv("ALIYUN_ACCESS_KEY_SECRET"), 50 | ), 51 | ), 52 | "sls_client": SLSClientWrapper( 53 | credential=CredentialWrapper( 54 | access_key_id=os.getenv("ALIYUN_ACCESS_KEY_ID"), 55 | access_key_secret=os.getenv("ALIYUN_ACCESS_KEY_SECRET"), 56 | ), 57 | ), 58 | }, 59 | ) 60 | ) 61 | return context 62 | 63 | 64 | @pytest.fixture 65 | def tool_manager(mcp_server): 66 | """创建ToolManager实例""" 67 | return ArmsToolkit(mcp_server) 68 | 69 | @pytest.mark.asyncio 70 | async def test_arms_profile_flame_analysis_success( 71 | tool_manager: ArmsToolkit, 72 | mcp_server: FastMCP, 73 | mock_request_context: Context, 74 | ): 75 | """测试arms_profile_flame_analysis成功的情况""" 76 | tool = mcp_server._tool_manager.get_tool("arms_profile_flame_analysis") 77 | result_data = await tool.run( 78 | { 79 | "pid": "test_pid", 80 | "startMs": "1609459200000", 81 | "endMs": "1609545600000", 82 | "profileType": "cpu", 83 | "ip": "127.0.0.1", 84 | "thread": "main-thread", 85 | "threadGroup": "default-group", 86 | "regionId": "cn-hangzhou", 87 | }, 88 | context=mock_request_context, 89 | ) 90 | assert result_data is not None 91 | 92 | @pytest.mark.asyncio 93 | async def test_arms_diff_flame_analysis_success( 94 | tool_manager: ArmsToolkit, 95 | mcp_server: FastMCP, 96 | mock_request_context: Context, 97 | ): 98 | """测试arms_diff_flame_analysis成功的情况""" 99 | tool = mcp_server._tool_manager.get_tool("arms_diff_profile_flame_analysis") 100 | result_data = await tool.run( 101 | { 102 | "pid": "test_pid", 103 | "startMs": "1609459200000", 104 | "endMs": "1609462800000", 105 | "baseStartMs": "1609545600000", 106 | "baseEndMs": "1609549200000", 107 | "profileType": "cpu", 108 | "ip": 
"127.0.0.1", 109 | "thread": "main-thread", 110 | "threadGroup": "default-group", 111 | "regionId": "cn-hangzhou", 112 | }, 113 | context=mock_request_context, 114 | ) 115 | assert result_data is not None 116 | 117 | @pytest.mark.asyncio 118 | async def test_arms_trace_quality_analysis( 119 | tool_manager: ArmsToolkit, 120 | mcp_server: FastMCP, 121 | mock_request_context: Context, 122 | ): 123 | """测试arms_trace_quality_analysis成功的情况""" 124 | tool = mcp_server._tool_manager.get_tool("arms_trace_quality_analysis") 125 | result_data = await tool.run( 126 | { 127 | "traceId": "test_trace_id", 128 | "startMs": 1746686989000, 129 | "endMs": 1746690589507, 130 | "regionId": "cn-hangzhou", 131 | }, 132 | context=mock_request_context, 133 | ) 134 | assert result_data is not None 135 | 136 | @pytest.mark.asyncio 137 | async def test_arms_slow_trace_analysis( 138 | tool_manager: ArmsToolkit, 139 | mcp_server: FastMCP, 140 | mock_request_context: Context, 141 | ): 142 | """测试arms_slow_trace_analysis成功的情况""" 143 | tool = mcp_server._tool_manager.get_tool("arms_slow_trace_analysis") 144 | result_data = await tool.run( 145 | { 146 | "traceId": "test_trace_id", 147 | "startMs": 1746686989000, 148 | "endMs": 1746690589507, 149 | "regionId": "cn-hangzhou", 150 | }, 151 | context=mock_request_context, 152 | ) 153 | assert result_data is not None 154 | 155 | @pytest.mark.asyncio 156 | async def test_arms_error_trace_analysis( 157 | tool_manager: ArmsToolkit, 158 | mcp_server: FastMCP, 159 | mock_request_context: Context, 160 | ): 161 | """测试arms_error_trace_analysis成功的情况""" 162 | tool = mcp_server._tool_manager.get_tool("arms_error_trace_analysis") 163 | result_data = await tool.run( 164 | { 165 | "traceId": "test_trace_id", 166 | "startMs": 1746686989000, 167 | "endMs": 1746690589507, 168 | "regionId": "cn-hangzhou", 169 | }, 170 | context=mock_request_context, 171 | ) 172 | assert result_data is not None -------------------------------------------------------------------------------- /tests/test_cms_toolkit.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | import dotenv 5 | import pytest 6 | from mcp.server.fastmcp import Context, FastMCP 7 | from mcp.shared.context import RequestContext 8 | 9 | from mcp_server_aliyun_observability.server import create_lifespan 10 | from mcp_server_aliyun_observability.toolkit.cms_toolkit import CMSToolkit 11 | from mcp_server_aliyun_observability.utils import CredentialWrapper, SLSClientWrapper 12 | 13 | dotenv.load_dotenv() 14 | 15 | import logging 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | @pytest.fixture 21 | def mcp_server(): 22 | """创建模拟的FastMCP服务器实例""" 23 | mcp_server = FastMCP( 24 | name="mcp_aliyun_observability_server", 25 | lifespan=create_lifespan( 26 | CredentialWrapper( 27 | os.getenv("ALIYUN_ACCESS_KEY_ID"), os.getenv("ALIYUN_ACCESS_KEY_SECRET") 28 | ) 29 | ), 30 | ) 31 | return mcp_server 32 | 33 | 34 | @pytest.fixture 35 | def mock_request_context(): 36 | """创建模拟的RequestContext实例""" 37 | context = Context( 38 | request_context=RequestContext( 39 | request_id="test_request_id", 40 | meta=None, 41 | session=None, 42 | lifespan_context={ 43 | "cms_client": SLSClientWrapper( 44 | CredentialWrapper( 45 | os.getenv("ALIYUN_ACCESS_KEY_ID"), 46 | os.getenv("ALIYUN_ACCESS_KEY_SECRET"), 47 | ) 48 | ), 49 | }, 50 | ) 51 | ) 52 | return context 53 | 54 | 55 | @pytest.fixture 56 | def tool_manager(mcp_server): 57 | """创建ToolManager实例""" 58 | return 
CMSToolkit(mcp_server) 59 | 60 | 61 | @pytest.mark.asyncio 62 | async def test_cms_summarize_alert_events_success( 63 | tool_manager: CMSToolkit, 64 | mcp_server: FastMCP, 65 | mock_request_context: Context, 66 | ): 67 | """测试CMS 告警总结成功的情况""" 68 | tool = mcp_server._tool_manager.get_tool("cms_summarize_alert_events") 69 | text = await tool.run( 70 | { 71 | "fromTimestampInSeconds": int(datetime.now().timestamp()) - 3600, 72 | "toTimestampInSeconds": int(datetime.now().timestamp()), 73 | "regionId": os.getenv("TEST_REGION"), 74 | }, 75 | context=mock_request_context, 76 | ) 77 | assert text is not None 78 | # """ 79 | # response_body: List[Dict[str, Any]] = response.body 80 | # result = { 81 | # "data": response_body, 82 | # """ 83 | # item = text["data"][0] 84 | # assert item["total"] is not None 85 | # assert text["message"] == "success" 86 | 87 | 88 | @pytest.mark.asyncio 89 | async def test_cms_execute_promql_query_success( 90 | tool_manager: CMSToolkit, 91 | mcp_server: FastMCP, 92 | mock_request_context: Context, 93 | ): 94 | """测试PromQL查询执行成功的情况""" 95 | tool = mcp_server._tool_manager.get_tool("cms_execute_promql_query") 96 | text = await tool.run( 97 | { 98 | "project": os.getenv("TEST_PROJECT"), 99 | "metricStore": os.getenv("TEST_METRICSTORE"), 100 | "query": "sum(kube_pod_info) by (namespace)", 101 | "fromTimestampInSeconds": int(datetime.now().timestamp()) - 3600, 102 | "toTimestampInSeconds": int(datetime.now().timestamp()), 103 | "regionId": os.getenv("TEST_REGION"), 104 | }, 105 | context=mock_request_context, 106 | ) 107 | assert text is not None 108 | # """ 109 | # response_body: List[Dict[str, Any]] = response.body 110 | # result = { 111 | # "data": response_body, 112 | # """ 113 | # item = text["data"][0] 114 | # assert item["total"] is not None 115 | # assert text["message"] == "success" 116 | -------------------------------------------------------------------------------- /tests/test_sls_toolkit.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | import dotenv 5 | import pytest 6 | from mcp.server.fastmcp import Context, FastMCP 7 | from mcp.shared.context import RequestContext 8 | 9 | from mcp_server_aliyun_observability.server import create_lifespan 10 | from mcp_server_aliyun_observability.toolkit.sls_toolkit import SLSToolkit 11 | from mcp_server_aliyun_observability.utils import (CredentialWrapper, 12 | SLSClientWrapper) 13 | 14 | dotenv.load_dotenv() 15 | 16 | import logging 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | @pytest.fixture 22 | def mcp_server(): 23 | """创建模拟的FastMCP服务器实例""" 24 | mcp_server = FastMCP( 25 | name="mcp_aliyun_observability_server", 26 | lifespan=create_lifespan( 27 | credential=CredentialWrapper( 28 | access_key_id=os.getenv("ALIYUN_ACCESS_KEY_ID"), 29 | access_key_secret=os.getenv("ALIYUN_ACCESS_KEY_SECRET"), 30 | ), 31 | ), 32 | ) 33 | return mcp_server 34 | 35 | 36 | @pytest.fixture 37 | def mock_request_context(): 38 | """创建模拟的RequestContext实例""" 39 | context = Context( 40 | request_context=RequestContext( 41 | request_id="test_request_id", 42 | meta=None, 43 | session=None, 44 | lifespan_context={ 45 | "sls_client": SLSClientWrapper( 46 | credential=CredentialWrapper( 47 | access_key_id=os.getenv("ALIYUN_ACCESS_KEY_ID"), 48 | access_key_secret=os.getenv("ALIYUN_ACCESS_KEY_SECRET"), 49 | ), 50 | ), 51 | }, 52 | ) 53 | ) 54 | return context 55 | 56 | 57 | @pytest.fixture 58 | def tool_manager(mcp_server): 59 | """创建ToolManager实例""" 
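    # Editor's note (added comment, not in the original source): constructing SLSToolkit here
    # registers all sls_* tools and prompts on the FastMCP server via @self.server.tool(), which is
    # what lets the tests below look them up with mcp_server._tool_manager.get_tool(...).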
60 | return SLSToolkit(mcp_server) 61 | 62 | 63 | @pytest.mark.asyncio 64 | async def test_sls_execute_query_success( 65 | tool_manager: SLSToolkit, 66 | mcp_server: FastMCP, 67 | mock_request_context: Context, 68 | ): 69 | """测试SLS查询执行成功的情况""" 70 | tool = mcp_server._tool_manager.get_tool("sls_execute_sql_query") 71 | text = await tool.run( 72 | { 73 | "project": os.getenv("TEST_PROJECT"), 74 | "logStore": os.getenv("TEST_LOG_STORE"), 75 | "query": "* | select count(*) as total", 76 | "fromTimestampInSeconds": int(datetime.now().timestamp()) - 3600, 77 | "toTimestampInSeconds": int(datetime.now().timestamp()), 78 | "limit": 10, 79 | "regionId": os.getenv("TEST_REGION"), 80 | }, 81 | context=mock_request_context, 82 | ) 83 | assert text["data"] is not None 84 | """ 85 | response_body: List[Dict[str, Any]] = response.body 86 | result = { 87 | "data": response_body, 88 | """ 89 | item = text["data"][0] 90 | assert item["total"] is not None 91 | assert text["message"] == "success" 92 | 93 | 94 | @pytest.mark.asyncio 95 | async def test_sls_list_projects_success( 96 | tool_manager: SLSToolkit, 97 | mcp_server: FastMCP, 98 | mock_request_context: Context, 99 | ): 100 | """测试SLS列出项目成功的情况""" 101 | tool = mcp_server._tool_manager.get_tool("sls_list_projects") 102 | text = await tool.run( 103 | { 104 | "projectName": "", 105 | "limit": 10, 106 | "regionId": os.getenv("TEST_REGION"), 107 | }, 108 | context=mock_request_context, 109 | ) 110 | assert len(text) > 0 111 | 112 | @pytest.mark.asyncio 113 | async def test_sls_list_logstores_success( 114 | tool_manager: SLSToolkit, 115 | mcp_server: FastMCP, 116 | mock_request_context: Context, 117 | ): 118 | """测试SLS列出日志库成功的情况""" 119 | tool = mcp_server._tool_manager.get_tool("sls_list_logstores") 120 | text = await tool.run( 121 | { 122 | "project": os.getenv("TEST_PROJECT"), 123 | "regionId": os.getenv("TEST_REGION"), 124 | "limit": 10, 125 | }, 126 | context=mock_request_context, 127 | ) 128 | assert len(text["logstores"]) > 0 129 | 130 | 131 | @pytest.mark.asyncio 132 | async def test_sls_list_metric_store_success( 133 | tool_manager: SLSToolkit, 134 | mcp_server: FastMCP, 135 | mock_request_context: Context, 136 | ): 137 | """测试SLS列出日志库成功的情况""" 138 | tool = mcp_server._tool_manager.get_tool("sls_list_logstores") 139 | text = await tool.run( 140 | { 141 | "project": os.getenv("TEST_PROJECT"), 142 | "logStore": "", 143 | "limit": 10, 144 | "isMetricStore": True, 145 | "regionId": os.getenv("TEST_REGION"), 146 | }, 147 | context=mock_request_context, 148 | ) 149 | assert len(text["logstores"]) >= 0 150 | 151 | @pytest.mark.asyncio 152 | async def test_sls_describe_logstore_success( 153 | tool_manager: SLSToolkit, 154 | mcp_server: FastMCP, 155 | mock_request_context: Context, 156 | ): 157 | """测试SLS描述日志库成功的情况""" 158 | tool = mcp_server._tool_manager.get_tool("sls_describe_logstore") 159 | text = await tool.run( 160 | { 161 | "project": os.getenv("TEST_PROJECT"), 162 | "logStore": os.getenv("TEST_LOG_STORE"), 163 | "regionId": os.getenv("TEST_REGION"), 164 | }, 165 | context=mock_request_context, 166 | ) 167 | assert text is not None 168 | 169 | 170 | @pytest.mark.asyncio 171 | async def test_sls_translate_text_to_sql_query_success( 172 | tool_manager: SLSToolkit, 173 | mcp_server: FastMCP, 174 | mock_request_context: Context, 175 | ): 176 | """测试SLS自然语言转换为查询语句成功的情况""" 177 | tool = mcp_server._tool_manager.get_tool("sls_translate_text_to_sql_query") 178 | text = await tool.run( 179 | { 180 | "project": os.getenv("TEST_PROJECT"), 181 | "logStore": 
os.getenv("TEST_LOG_STORE"), 182 | "text": "我想查询最近10分钟内,所有日志库的日志数量", 183 | "regionId": os.getenv("TEST_REGION"), 184 | }, 185 | context=mock_request_context, 186 | ) 187 | assert text is not None 188 | assert "select" in text["data"] or "SELECT" in text["data"] 189 | --------------------------------------------------------------------------------