├── .idea
│   ├── MCPBench.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── README.md
├── README_zh.md
├── assets
│   └── figure1.png
├── configs
│   ├── mcp_config_db.json
│   └── mcp_config_template.json
├── evaluation_db.sh
├── evaluation_gaia.sh
├── evaluation_websearch.sh
├── langProBe
│   ├── DB
│   │   ├── DB_utils
│   │   │   ├── __init__.py
│   │   │   └── schema.py
│   │   ├── __init__.py
│   │   ├── data
│   │   │   └── car_bi.jsonl
│   │   └── db_program.py
│   ├── GAIA
│   │   ├── __init__.py
│   │   ├── data
│   │   │   ├── 2023
│   │   │   │   ├── __init__.py
│   │   │   │   └── validation
│   │   │   │       ├── 076c8171-9b3b-49b9-a477-244d2a532826.xlsx
│   │   │   │       ├── 1f975693-876d-457b-a649-393859e79bf3.mp3
│   │   │   │       ├── 2b3ef98c-cc05-450b-a719-711aee40ac65.mp3
│   │   │   │       ├── 32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx
│   │   │   │       ├── 366e2f2b-8632-4ef2-81eb-bc3877489217.pdf
│   │   │   │       ├── 389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt
│   │   │   │       ├── 3da89939-209c-4086-8520-7eb734e6b4ef.xlsx
│   │   │   │       ├── 4d0aa727-86b1-406b-9b33-f870dd14a4a5.xlsx
│   │   │   │       ├── 4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.xlsx
│   │   │   │       ├── 54612da3-fd56-4941-80f4-5eb82330de25.xlsx
│   │   │   │       ├── 5b2a14e8-6e59-479c-80e3-4696e8980152.jpg
│   │   │   │       ├── 5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx
│   │   │   │       ├── 6359a0b1-8f7b-499b-9336-840f9ab90688.png
│   │   │   │       ├── 65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx
│   │   │   │       ├── 67e8878b-5cef-4375-804e-e6291fdbe78a.pdf
│   │   │   │       ├── 7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx
│   │   │   │       ├── 7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.xlsx
│   │   │   │       ├── 7dd30055-0198-452e-8c25-f73dbe27dcb8.pdb
│   │   │   │       ├── 8d46b8d6-b38a-47ff-ac74-cda14cf2d19b.csv
│   │   │   │       ├── 8f80e01c-1296-4371-9486-bb3d68651a60.png
│   │   │   │       ├── 9318445f-fe6a-4e1b-acbf-c68228c9906a.png
│   │   │   │       ├── 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3
│   │   │   │       ├── 9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip
│   │   │   │       ├── __init__.py
│   │   │   │       ├── a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx
│   │   │   │       ├── b2c257e0-3ad7-4f05-b8e3-d9da973be36e.jpg
│   │   │   │       ├── b7f857e4-d8aa-4387-af2a-0e844df5b9d8.png
│   │   │   │       ├── bec74516-02fc-48dc-b202-55e78d0e17cf.jsonld
│   │   │   │       ├── bfcd99e1-0690-4b53-a85c-0174a8629083.zip
│   │   │   │       ├── c526d8d6-5987-4da9-b24c-83466fa172f3.xlsx
│   │   │   │       ├── cca530fc-4052-43b2-b130-b30968d8aa44.png
│   │   │   │       ├── cca70ce6-1952-45d2-acd4-80c903b0bc49.png
│   │   │   │       ├── cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx
│   │   │   │       ├── d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de.png
│   │   │   │       ├── da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx
│   │   │   │       ├── df6561b2-7ee5-4540-baab-5095f742716a.png
│   │   │   │       ├── e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf
│   │   │   │       ├── edd4d4f2-1a58-45c4-b038-67337af4e029.xlsx
│   │   │   │       ├── f918266a-b3e0-4914-865d-4faa564f1aef.py
│   │   │   │       └── metadata.jsonl
│   │   │   ├── GAIA.py
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── gaia_dev_part.jsonl
│   │   │   └── statics.py
│   │   └── gaia_program.py
│   ├── WebSearch
│   │   ├── __init__.py
│   │   └── data
│   │       ├── websearch_300.jsonl
│   │       └── websearch_600.jsonl
│   ├── __init__.py
│   ├── analysis.py
│   ├── async_mcp_client.py
│   ├── benchmark.py
│   ├── config_utils.py
│   ├── constants.py
│   ├── dspy_program.py
│   ├── evaluation.py
│   ├── evaluation_utils.py
│   ├── langchain_program.py
│   ├── mcp_program.py
│   ├── optimizers.py
│   ├── program_utils.py
│   ├── register_benchmark.py
│   └── synced_mcp_client.py
├── launch_mcps_as_sse.sh
├── mcpbench.pdf
└── requirements.txt

-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------

1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 🦊 MCPBench: A Benchmark for Evaluating MCP Servers
9 | [![Documentation][docs-image]][docs-url]
10 | [![Package License][package-license-image]][package-license-url]
17 | [中文](https://github.com/modelscope/MCPBench/blob/main/README_zh.md) |
18 | [English](https://github.com/modelscope/MCPBench/blob/main/README.md)
31 | 32 | 33 | 34 | # 📋 Table of Contents 35 | 36 | - [🔥 News](#news) 37 | - [🛠️ Installation](#installation) 38 | - [🚀 Quick Start](#quick-start) 39 | - [Launch MCP Server](#launch-mcp-server) 40 | - [Launch Evaluation](#launch-evaluation) 41 | - [🧂 Datasets and Experiments](#datasets-and-experiments) 42 | - [🚰 Cite](#cite) 43 | 44 | # 🔥 News 45 | + `Apr. 29, 2025` 🌟 Update the code for evaluating the MCP Server Package within GAIA. 46 | + `Apr. 14, 2025` 🌟 We are proud to announce that MCPBench is now open-sourced. 47 | 48 | # 🛠️ Installation 49 | The framework requires Python version >= 3.11, nodejs and jq. 50 | 51 | ```bash 52 | conda create -n mcpbench python=3.11 -y 53 | conda activate mcpbench 54 | pip install -r requirements.txt 55 | ``` 56 | # 🚀 Quick Start 57 | Please first determine the type of MCP server you want to use: 58 | - If it is a remote host (accessed via **SSE**, such as [ModelScope](https://modelscope.cn/mcp), [Smithery](https://smithery.ai), or localhost), you can directly conduct the [evaluation](#launch-evaluation). 59 | - If it is started locally (accessed via npx using **STDIO**), you need to launch it. 60 | 61 | ## Launch MCP Server (optional for stdio) 62 | First, you need to write the following configuration: 63 | ```json 64 | { 65 | "mcp_pool": [ 66 | { 67 | "name": "firecrawl", 68 | "run_config": [ 69 | { 70 | "command": "npx -y firecrawl-mcp", 71 | "args": "FIRECRAWL_API_KEY=xxx", 72 | "port": 8005 73 | } 74 | ] 75 | } 76 | ] 77 | } 78 | ``` 79 | Save this config file in the `configs` folder and launch it using: 80 | 81 | ```bash 82 | sh launch_mcps_as_sse.sh YOUR_CONFIG_FILE 83 | ``` 84 | 85 | For example, save the above configuration in the `configs/firecrawl.json` file and launch it using: 86 | 87 | ```bash 88 | sh launch_mcps_as_sse.sh firecrawl.json 89 | ``` 90 | 91 | ## Launch Evaluation 92 | To evaluate the MCP Server's performance, you need to set up the necessary MCP Server information. the code will automatically detect the tools and parameters in the Server, so you don't need to configure them manually, like: 93 | ```json 94 | { 95 | "mcp_pool": [ 96 | { 97 | "name": "Remote MCP example", 98 | "url": "url from https://modelscope.cn/mcp or https://smithery.ai" 99 | }, 100 | { 101 | "name": "firecrawl (Local run example)", 102 | "run_config": [ 103 | { 104 | "command": "npx -y firecrawl-mcp", 105 | "args": "FIRECRAWL_API_KEY=xxx", 106 | "port": 8005 107 | } 108 | ] 109 | } 110 | ] 111 | } 112 | ``` 113 | 114 | To evaluate the MCP Server's performance on WebSearch tasks: 115 | ```bash 116 | sh evaluation_websearch.sh YOUR_CONFIG_FILE 117 | ``` 118 | 119 | To evaluate the MCP Server's performance on Database Query tasks: 120 | ```bash 121 | sh evaluation_db.sh YOUR_CONFIG_FILE 122 | ``` 123 | 124 | To evaluate the MCP Server's performance on GAIA tasks: 125 | ```bash 126 | sh evaluation_gaia.sh YOUR_CONFIG_FILE 127 | ``` 128 | 129 | For example, save the above configuration in the `configs/firecrawl.json` file and launch it using: 130 | 131 | ```bash 132 | sh evaluation_websearch.sh firecrawl.json 133 | ``` 134 | 135 | # Datasets and Experimental Results 136 | Our framework provides two datasets for evaluation. For the WebSearch task, the dataset is located at `MCPBench/langProBe/WebSearch/data/websearch_600.jsonl`, containing 200 QA pairs each from [Frames](https://arxiv.org/abs/2409.12941), news, and technology domains. Our framework for automatically constructing evaluation datasets will be open-sourced later. 
137 |
138 | For the Database Query task, the dataset is located at `MCPBench/langProBe/DB/data/car_bi.jsonl`. You can add your own dataset in the following format:
139 |
140 | ```json
141 | {
142 |   "unique_id": "",
143 |   "Prompt": "",
144 |   "Answer": ""
145 | }
146 | ```
147 |
148 | We have evaluated mainstream MCP Servers on both tasks. For detailed experimental results, please refer to the [documentation](https://arxiv.org/abs/2504.11094).
149 |
150 | # 🚰 Cite
151 | If you find this work useful, please consider citing our project or giving us a 🌟:
152 |
153 | ```bibtex
154 | @misc{mcpbench,
155 |   title={MCPBench: A Benchmark for Evaluating MCP Servers},
156 |   author={Zhiling Luo and Xiaorong Shi and Xuanrui Lin and Jinyang Gao},
157 |   howpublished = {\url{https://github.com/modelscope/MCPBench}},
158 |   year={2025}
159 | }
160 | ```
161 |
162 | Alternatively, you may cite our report:
163 | ```bibtex
164 | @article{mcpbench_report,
165 |   title={Evaluation Report on MCP Servers},
166 |   author={Zhiling Luo and Xiaorong Shi and Xuanrui Lin and Jinyang Gao},
167 |   year={2025},
168 |   journal={arXiv preprint arXiv:2504.11094},
169 |   url={https://arxiv.org/abs/2504.11094},
170 |   primaryClass={cs.AI}
171 | }
172 | ```
173 |
174 | [docs-image]: https://img.shields.io/badge/Documentation-EB3ECC
175 | [docs-url]: https://arxiv.org/abs/2504.11094
176 | [package-license-image]: https://img.shields.io/badge/License-Apache_2.0-blue.svg
177 | [package-license-url]: https://github.com/modelscope/MCPBench/blob/main/LICENSE
178 |
179 |
-------------------------------------------------------------------------------- /README_zh.md: --------------------------------------------------------------------------------

1 |

2 | 🦊 MCPBench: A Benchmark for Evaluating MCP Servers
7 | [![文档][docs-image]][docs-url]
8 | [![软件包许可证][package-license-image]][package-license-url]
15 | [中文](https://github.com/modelscope/MCPBench/blob/main/README_zh.md) |
16 | [English](https://github.com/modelscope/MCPBench/blob/main/README.md)
20 |
21 | MCPBench 是一个用于评估 MCP Server 的基准测试框架。它支持评估三种类型的任务:网络搜索、数据库查询和 GAIA 任务,并且兼容本地和远程 MCP 服务器。该框架主要在相同的 LLM 和 Agent 配置下,从任务完成准确性、延迟和 Token 消耗等方面评估不同的 MCP 服务器(如 Brave Search、DuckDuckGo 等)。详见[评估报告](https://arxiv.org/abs/2504.11094)。
22 |
23 | MCPBench 概览(assets/figure1.png)
24 |
25 | > 实现参考了 [LangProBe: a Language Programs Benchmark](https://arxiv.org/abs/2502.20315)。\
26 | > 特别感谢 Qingxu Fu 的初始实现!
27 |
30 | # 📋 目录
31 |
32 | - [🔥 最新动态](#news)
33 | - [🛠️ 安装](#installation)
34 | - [🚀 快速开始](#quick-start)
35 |   - [启动 MCP 服务器](#launch-mcp-server)
36 |   - [启动评测](#launch-evaluation)
37 | - [🧂 数据集与实验](#datasets-and-experiments)
38 | - [🚰 引用](#cite)
39 |
40 | # 🔥 最新动态
41 | + `2025年4月29日` 🌟 更新了 GAIA 内 MCP Server Package 的评测代码。
42 | + `2025年4月14日` 🌟 MCPBench 正式开源。
43 |
44 | # 🛠️ 安装
45 | 本框架需要 Python >= 3.11、Node.js 和 jq。
46 |
47 | ```bash
48 | conda create -n mcpbench python=3.11 -y
49 | conda activate mcpbench
50 | pip install -r requirements.txt
51 | ```
52 | # 🚀 快速开始
53 | 请先确定你要使用的 MCP 服务器类型:
54 | - 若为远程主机(通过 **SSE** 访问,如 [ModelScope](https://modelscope.cn/mcp)、[Smithery](https://smithery.ai) 或 localhost),可直接进行[评测](#launch-evaluation)。
55 | - 若为本地启动(通过 npx 以 **STDIO** 访问),你需要先启动 MCP 服务器。
56 | ## 启动 MCP 服务器
57 | 首先,需要编写如下配置:
58 | ```json
59 | {
60 |   "mcp_pool": [
61 |     {
62 |       "name": "firecrawl",
63 |       "run_config": [
64 |         {
65 |           "command": "npx -y firecrawl-mcp",
66 |           "args": "FIRECRAWL_API_KEY=xxx",
67 |           "port": 8005
68 |         }
69 |       ]
70 |     }
71 |   ]
72 | }
73 | ```
74 | 将该配置文件保存至 `configs` 文件夹,并通过如下命令启动:
75 |
76 | ```bash
77 | sh launch_mcps_as_sse.sh YOUR_CONFIG_FILE
78 | ```
79 |
80 | 例如,将上述配置保存为 `configs/firecrawl.json`,并通过如下命令启动:
81 |
82 | ```bash
83 | sh launch_mcps_as_sse.sh firecrawl.json
84 | ```
85 |
86 | ## 启动评测
87 | 要评测 MCP 服务器性能,需设置相关信息。代码会自动检测服务器中的工具和参数,无需手动配置。例如:
88 |
89 | ```json
90 | {
91 |   "mcp_pool": [
92 |     {
93 |       "name": "Remote MCP example",
94 |       "url": "url from https://modelscope.cn/mcp or https://smithery.ai"
95 |     },
96 |     {
97 |       "name": "firecrawl (Local run example)",
98 |       "run_config": [
99 |         {
100 |           "command": "npx -y firecrawl-mcp",
101 |           "args": "FIRECRAWL_API_KEY=xxx",
102 |           "port": 8005
103 |         }
104 |       ]
105 |     }
106 |   ]
107 | }
108 | ```
109 |
110 | 评测 MCP 服务器在网页搜索任务上的表现:
111 | ```bash
112 | sh evaluation_websearch.sh YOUR_CONFIG_FILE
113 | ```
114 |
115 | 评测 MCP 服务器在数据库查询任务上的表现:
116 | ```bash
117 | sh evaluation_db.sh YOUR_CONFIG_FILE
118 | ```
119 |
120 | 评测 MCP 服务器在 GAIA 任务上的表现:
121 | ```bash
122 | sh evaluation_gaia.sh YOUR_CONFIG_FILE
123 | ```
124 |
125 | 例如,将上述配置保存为 `configs/firecrawl.json`,并通过如下命令启动:
126 |
127 | ```bash
128 | sh evaluation_websearch.sh firecrawl.json
129 | ```
130 |
131 | # 🧂 数据集与实验
132 | 本框架提供了两类评测数据集:
133 | - 网页搜索任务数据集位于 `MCPBench/langProBe/WebSearch/data/websearch_600.jsonl`,包含来自 [Frames](https://arxiv.org/abs/2409.12941)、新闻、科技领域的各200组问答对。自动化构建评测数据集的工具后续也将开源。
134 | - 数据库查询任务数据集位于 `MCPBench/langProBe/DB/data/car_bi.jsonl`。你也可以按如下格式自定义数据集:
135 |
136 | ```json
137 | {
138 |   "unique_id": "",
139 |   "Prompt": "",
140 |   "Answer": ""
141 | }
142 | ```
143 |
144 | 我们已在主流 MCP 服务器上完成了上述任务的评测。详细实验结果请参考[文档](https://arxiv.org/abs/2504.11094)。
145 |
146 | # 🚰 引用
147 | 如果本项目对你有帮助,请引用我们的工作或是给我们一个🌟:
148 |
149 | ```bibtex
150 | @misc{mcpbench,
151 |   title={MCPBench: A Benchmark for Evaluating MCP Servers},
152 |   author={Zhiling Luo and Xiaorong Shi and Xuanrui Lin and Jinyang Gao},
153 |   howpublished = {\url{https://github.com/modelscope/MCPBench}},
154 |   year={2025}
155 | }
156 | ```
157 |
158 | 或引用我们的报告:
159 | ```bibtex
160 | @article{mcpbench_report,
161 |   title={Evaluation Report on MCP Servers},
162 |   author={Zhiling Luo and Xiaorong Shi and Xuanrui Lin and Jinyang Gao},
163 |   year={2025},
164 |   journal={arXiv preprint arXiv:2504.11094},
165 |   url={https://arxiv.org/abs/2504.11094},
166 |   primaryClass={cs.AI}
167 | }
168 | ```
169 |
170 | [docs-image]: https://img.shields.io/badge/Documentation-EB3ECC
171 | [docs-url]: https://arxiv.org/abs/2504.11094
172 | [package-license-image]:
https://img.shields.io/badge/License-Apache_2.0-blue.svg
173 | [package-license-url]: https://github.com/modelscope/MCPBench/blob/main/LICENSE
174 |
175 |
-------------------------------------------------------------------------------- /assets/figure1.png: --------------------------------------------------------------------------------

https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/assets/figure1.png

-------------------------------------------------------------------------------- /configs/mcp_config_db.json: --------------------------------------------------------------------------------

1 | {
2 |   "mcp_pool": [
3 |     {
4 |       "name": "mysql",
5 |       "run_config": [
6 |         {
7 |           "command": "uvx --from mysql-mcp-server mysql_mcp_server",
8 |           "args": "MYSQL_HOST=localhost MYSQL_PORT=3306 MYSQL_USER=root MYSQL_PASSWORD=xxx MYSQL_DATABASE=car_bi",
9 |           "port": 8005
10 |         }
11 |       ]
12 |     }
13 |   ],
14 |   "query_type": "SQL"
15 | }
16 |

-------------------------------------------------------------------------------- /configs/mcp_config_template.json: --------------------------------------------------------------------------------

1 | {
2 |   "mcp_pool": [
3 |     {
4 |       "name": "Remote MCP example",
5 |       "url": "url from https://modelscope.cn/mcp or https://smithery.ai"
6 |     },
7 |     {
8 |       "name": "Local run example",
9 |       "run_config": [
10 |         {
11 |           "command": "npx -y firecrawl-mcp",
12 |           "args": "FIRECRAWL_API_KEY=xxx",
13 |           "port": 8005
14 |         }
15 |       ]
16 |     }
17 |   ]
18 | }
19 |

-------------------------------------------------------------------------------- /evaluation_db.sh: --------------------------------------------------------------------------------

1 | #!/bin/bash
2 | # Check that a config file path argument was provided
3 | if [ -z "$1" ]; then
4 |     echo "Usage: $0 <config_file>"
5 |     exit 1
6 | fi
7 |
8 | # Build the full path under configs/ unless an absolute path was given
9 | CONFIG_FILE="$1"
10 | if [[ ! "$CONFIG_FILE" == /* ]]; then
11 |     CONFIG_FILE="configs/$CONFIG_FILE"
12 | fi
13 |
14 |
15 |
16 | # Launch the evaluator via python -c so that multiprocessing initializes correctly
17 | DSPY_CACHEDIR=evaluation_mcp/.dspy_cache \
18 | python -c "
19 | import multiprocessing as mp
20 | mp.set_start_method('spawn', True)
21 | from langProBe.evaluation import main
22 | main()
23 | " \
24 | --benchmark=DB \
25 | --dataset_mode=test \
26 | --dataset_path=langProBe/DB/data/car_bi.jsonl \
27 | --file_path=evaluation_db \
28 | --lm=openai/qwen-max-2025-01-25 \
29 | --lm_api_base=https://dashscope.aliyuncs.com/compatible-mode/v1 \
30 | --lm_api_key=xxx \
31 | --missing_mode_file=path/to/logs/task_messages.jsonl \
32 | --num_threads=1 \
33 | --config=$CONFIG_FILE
34 |

-------------------------------------------------------------------------------- /evaluation_gaia.sh: --------------------------------------------------------------------------------

1 | #!/bin/bash
2 | # Check that a config file path argument was provided
3 | if [ -z "$1" ]; then
4 |     echo "Usage: $0 <config_file>"
5 |     exit 1
6 | fi
7 |
8 | # Build the full path under configs/ unless an absolute path was given
9 | CONFIG_FILE="$1"
10 | if [[ ! "$CONFIG_FILE" == /* ]]; then
11 |     CONFIG_FILE="configs/$CONFIG_FILE"
12 | fi
13 |
14 |
15 |
16 | # Launch the evaluator via python -c so that multiprocessing initializes correctly
17 | DSPY_CACHEDIR=evaluation_mcp/.dspy_cache \
18 | python -c "
19 | import multiprocessing as mp
20 | mp.set_start_method('spawn', True)
21 | from langProBe.evaluation import main
22 | main()
23 | " \
24 | --benchmark=GAIA \
25 | --dataset_mode=full \
26 | --dataset_path=langProBe/GAIA/data/gaia_rest.jsonl \
27 | --file_path=evaluation_gaia \
28 | --lm=openai/qwen-max-2025-01-25 \
29 | --lm_api_base=https://dashscope.aliyuncs.com/compatible-mode/v1 \
30 | --missing_mode_file=path/to/logs/task_messages.jsonl \
31 | --num_threads=1 \
32 | --config=$CONFIG_FILE
33 |

-------------------------------------------------------------------------------- /evaluation_websearch.sh: --------------------------------------------------------------------------------

1 | #!/bin/bash
2 | # Check that a config file path argument was provided
3 | if [ -z "$1" ]; then
4 |     echo "Usage: $0 <config_file>"
5 |     exit 1
6 | fi
7 |
8 | # Build the full path under configs/ unless an absolute path was given
9 | CONFIG_FILE="$1"
10 | if [[ ! "$CONFIG_FILE" == /* ]]; then
11 |     CONFIG_FILE="configs/$CONFIG_FILE"
12 | fi
13 |
14 |
15 |
16 | # Launch the evaluator via python -c so that multiprocessing initializes correctly
17 | DSPY_CACHEDIR=evaluation_mcp/.dspy_cache \
18 | python -c "
19 | import multiprocessing as mp
20 | mp.set_start_method('spawn', True)
21 | from langProBe.evaluation import main
22 | main()
23 | " \
24 | --benchmark=WebSearch \
25 | --dataset_mode=full \
26 | --dataset_path=langProBe/WebSearch/data/websearch_test.jsonl \
27 | --file_path=evaluation_websearch_test \
28 | --lm=openai/deepseek-v3 \
29 | --lm_api_base=https://dashscope.aliyuncs.com/compatible-mode/v1 \
30 | --lm_api_key=xxx \
31 | --missing_mode_file=path/to/logs/task_messages.jsonl \
32 | --num_threads=1 \
33 | --config=$CONFIG_FILE

-------------------------------------------------------------------------------- /langProBe/DB/DB_utils/__init__.py: --------------------------------------------------------------------------------

https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/DB/DB_utils/__init__.py

-------------------------------------------------------------------------------- /langProBe/DB/DB_utils/schema.py: --------------------------------------------------------------------------------

1 | SCHEMA = """
2 | create table competitors
3 | (
4 |     id int unsigned auto_increment comment '唯一标识符'
5 |         primary key,
6 |     competitor_name varchar(50) not null comment '竞品名称',
7 |     car_series varchar(50) not null comment '车系名称',
8 |     sales int not null comment '竞品销量',
9 |     market_share_percentage decimal(5, 2) not null comment '竞品市场占有率百分比',
10 |     record_date date not null comment '记录日期'
11 | )
12 |     comment '存储竞品销量和市场占有率' collate = utf8mb4_unicode_520_ci;
13 |
14 | create table customer_flow
15 | (
16 |     id int unsigned auto_increment comment '唯一标识符'
17 |         primary key,
18 |     region varchar(50) not null comment '大区',
19 |     store varchar(50) not null comment '门店名称',
20 |     first_visit_flow int not null comment '首次到店客流量',
21 |     total_visit_flow int not null comment '总客流量',
22 |     visit_datetime datetime not null comment '访问时间',
23 |     conversion_rate decimal(5, 2) not null comment '成交率'
24 | )
25 |     comment '存储大区、门店、客流量和成交率信息' collate = utf8mb4_unicode_520_ci;
26 |
27 | create index idx_region_store
28 |     on customer_flow (region, store);
29 |
30 | create table inventory
31 | (
32 |     id int unsigned auto_increment comment '唯一标识符'
33 |         primary key,
34 |     car_series varchar(50) not null comment '车系名称',
35 |     region varchar(50) not null comment '大区',
36 |
warehouse varchar(100) not null comment '仓库名称', 37 | quantity int not null comment '库存数量', 38 | last_checked datetime not null comment '最后盘点时间', 39 | series_type varchar(50) not null comment '车系类型' 40 | ) 41 | comment '存储库存信息' collate = utf8mb4_unicode_520_ci; 42 | 43 | create table market_sales 44 | ( 45 | id int unsigned auto_increment comment '唯一标识符' 46 | primary key, 47 | total_market_sales int not null comment '总体市场销量', 48 | car_series_market_sales int not null comment '车系市场销量', 49 | record_date date not null comment '记录日期' 50 | ) 51 | comment '存储市场销量信息' collate = utf8mb4_unicode_520_ci; 52 | 53 | create table market_share 54 | ( 55 | id int unsigned auto_increment comment '唯一标识符' 56 | primary key, 57 | car_series varchar(50) not null comment '车系名称', 58 | market_share_percentage decimal(5, 2) not null comment '市场占有率百分比', 59 | record_date date not null comment '记录日期' 60 | ) 61 | comment '存储车系市场占有率变化' collate = utf8mb4_unicode_520_ci; 62 | 63 | create table order_stats 64 | ( 65 | id int unsigned auto_increment comment '唯一标识符' 66 | primary key, 67 | car_series varchar(50) not null comment '车系名称', 68 | region varchar(50) not null comment '大区', 69 | order_quantity int not null comment '订单数量', 70 | large_order_quantity int not null comment '大定数量', 71 | locked_order_quantity int not null comment '锁单数量', 72 | retained_large_order_quantity int not null comment '留存大定数量' 73 | ) 74 | comment '存储订单统计数据' collate = utf8mb4_unicode_520_ci; 75 | 76 | create table policies 77 | ( 78 | id int unsigned auto_increment comment '唯一标识符' 79 | primary key, 80 | policy_name varchar(100) not null comment '政策名称', 81 | description text null comment '政策描述', 82 | type varchar(50) not null comment '车系类型', 83 | effective_date date not null comment '生效日期', 84 | expiry_date date null comment '失效日期' 85 | ) 86 | comment '存储国家及地方汽车产业政策' collate = utf8mb4_unicode_520_ci; 87 | 88 | create table sales 89 | ( 90 | id int unsigned auto_increment comment '唯一标识符' 91 | primary key, 92 | car_series varchar(50) not null comment '车系名称', 93 | region varchar(50) not null comment '大区', 94 | quantity int not null comment '销量数量', 95 | sale_date date not null comment '销售日期', 96 | series_type varchar(50) not null comment '车系类型' 97 | ) 98 | comment '存储实际销量数据' collate = utf8mb4_unicode_520_ci; 99 | 100 | create table sales_targets 101 | ( 102 | id int unsigned auto_increment comment '唯一标识符' 103 | primary key, 104 | car_series varchar(50) not null comment '车系名称', 105 | region varchar(50) not null comment '大区', 106 | monthly_target int not null comment '月度销量目标', 107 | yearly_target int not null comment '年度销量目标' 108 | ) 109 | comment '存储各车系在各大区的销量目标' collate = utf8mb4_unicode_520_ci; 110 | """ -------------------------------------------------------------------------------- /langProBe/DB/__init__.py: -------------------------------------------------------------------------------- 1 | from langProBe.benchmark import BenchmarkMeta, MCPBench 2 | from .db_program import DBPredict 3 | from langProBe.evaluation_utils import mcp_metric 4 | 5 | MCP_SAMPLE_SYSTEM_PROMPT = """ 6 | You are a helpful assistant. You are able to answer questions using different tools. 7 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools. 8 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server. 9 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool. 
10 | The tool description includes: 11 | A brief text description outlining the functionality of the tool. 12 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter. 13 | """ 14 | 15 | def get_mcp_sample_benchmark(): 16 | mcp_sample_baseline = DBPredict( 17 | max_steps=5, 18 | system_prompt=MCP_SAMPLE_SYSTEM_PROMPT, 19 | task_name="database_search") 20 | 21 | return [ 22 | BenchmarkMeta( 23 | MCPBench, 24 | [mcp_sample_baseline], 25 | mcp_metric, 26 | optimizers=[], 27 | name="MCP_DB" 28 | ) 29 | ] 30 | 31 | benchmark = get_mcp_sample_benchmark() -------------------------------------------------------------------------------- /langProBe/DB/data/car_bi.jsonl: -------------------------------------------------------------------------------- 1 | {"unique_id": 2, "Prompt": "2025年2月19日记录的竞品名称是什么?", "Answer": "飞海科技科技有限公司"} 2 | {"unique_id": 4, "Prompt": "华泰通安网络有限公司的销量是多少?", "Answer": "30"} 3 | {"unique_id": 6, "Prompt": "诺依曼软件科技有限公司的记录日期是什么时候?", "Answer": "2025-01-05"} 4 | {"unique_id": 9, "Prompt": "东方峻景网络有限公司的市场占有率是多少?", "Answer": "9.06"} 5 | {"unique_id": 11, "Prompt": "西南区域中,系列D的订单数量是多少?", "Answer": "60"} 6 | {"unique_id": 12, "Prompt": "华北区域中,所有车系的大定数量总和是多少?", "Answer": "98"} 7 | {"unique_id": 13, "Prompt": "华南区域中,首次到店客流量最高的门店是哪个?", "Answer": "帅县店"} 8 | {"unique_id": 14, "Prompt": "华东区域中,成交率最低的门店是哪个?", "Answer": "强市店"} 9 | {"unique_id": 15, "Prompt": "西北区域中,总体市场销量最高的日期是哪一天?", "Answer": "2024-01-16"} 10 | {"unique_id": 16, "Prompt": "2024年12月,华南区域的总客流量是多少?", "Answer": "1168"} 11 | {"unique_id": 17, "Prompt": "锁单数量大于10的车系有哪些?", "Answer": "['系列C']"} 12 | {"unique_id": 18, "Prompt": "在2025年2月,华南区域的总订单数量是多少?", "Answer": "0"} 13 | {"unique_id": 19, "Prompt": "留存大定数量最多的车系是哪个?", "Answer": "系列C"} 14 | {"unique_id": 20, "Prompt": "系列A在华东区域的市场占有率是多少?", "Answer": "21.41%"} 15 | {"unique_id": 22, "Prompt": "系列B在华东区域的月度销量目标是多少?", "Answer": "58"} 16 | {"unique_id": 23, "Prompt": "系列D在2025年2月19日的市场占有率是多少?", "Answer": "19.99%"} 17 | {"unique_id": 25, "Prompt": "系列D在华北区域的年度销量目标是多少?", "Answer": "1320"} 18 | {"unique_id": 28, "Prompt": "飞海科技科技有限公司在2025年2月19日的竞品销量是多少?", "Answer": "23"} 19 | {"unique_id": 31, "Prompt": "万迅电脑传媒有限公司的竞品市场占有率百分比是多少?", "Answer": "6.92"} 20 | {"unique_id": 33, "Prompt": "2024年12月30日,系列C在华南区域的销量是多少?", "Answer": "19"} 21 | {"unique_id": 36, "Prompt": "华东区域中燃油车的库存总数是多少?", "Answer": "700"} 22 | {"unique_id": 38, "Prompt": "华南区域中系列B的库存总数是多少?", "Answer": "533"} 23 | {"unique_id": 39, "Prompt": "仓库名称为'梧州市仓库'的库存总数是多少?", "Answer": "330"} 24 | {"unique_id": 40, "Prompt": "系列C在西南区域的库存总数是多少?", "Answer": "177"} 25 | {"unique_id": 44, "Prompt": "所有政策中,哪些政策的类型是‘燃油车’?", "Answer": "['燃油车新购补贴', '燃油车置换补贴']"} 26 | {"unique_id": 45, "Prompt": "最早生效的政策名称是什么?", "Answer": "新能源置换补贴"} 27 | {"unique_id": 46, "Prompt": "失效日期在2024年12月30日之后的政策有哪些?", "Answer": "['燃油车新购补贴', '燃油车置换补贴']"} 28 | {"unique_id": 47, "Prompt": "描述为‘新能源新购补贴’的政策的生效日期是什么时候?", "Answer": "2024-08-16"} 29 | {"unique_id": 48, "Prompt": "名称包含‘置换’的政策有哪些?", "Answer": "['燃油车置换补贴', '新能源置换补贴']"} 30 | {"unique_id": 49, "Prompt": "政策类型为‘新能源’且在2024年内生效的政策有哪些?", "Answer": "['新能源新购补贴', '新能源置换补贴']"} 31 | {"unique_id": 50, "Prompt": "哪条政策的有效期最长?", "Answer": "燃油车新购补贴"} 32 | {"unique_id": 51, "Prompt": "政策‘新能源新购补贴’是否已经失效?", "Answer": "True"} 33 | {"unique_id": 54, "Prompt": "车系市场销量最高的记录日期是哪一天?", "Answer": "2025-01-27"} 34 | {"unique_id": 55, "Prompt": "西南区域系列D的年度销量目标是多少?", "Answer": "1032"} 35 | {"unique_id": 56, "Prompt": 
"所有政策中,生效日期最早的是哪个政策?", "Answer": "新能源置换补贴"} 36 | {"unique_id": 58, "Prompt": "华东区域系列C的月度销量目标是多少?", "Answer": "97"} 37 | {"unique_id": 59, "Prompt": "哪些政策在2025年仍然有效?", "Answer": "燃油车新购补贴, 燃油车置换补贴"} 38 | {"unique_id": 60, "Prompt": "华北区域系列B的年度销量目标是多少?", "Answer": "2244"} 39 | {"unique_id": 61, "Prompt": "总体市场销量最低的记录日期是哪一天?", "Answer": "2025-02-11"} 40 | {"unique_id": 62, "Prompt": "华南区域系列A的月度销量目标是多少?", "Answer": "184"} 41 | {"unique_id": 63, "Prompt": "系列D在西南区域的库存总数是多少?", "Answer": "253"} 42 | {"unique_id": 64, "Prompt": "系列B在华北区域的总库存量是多少?", "Answer": "396"} 43 | {"unique_id": 65, "Prompt": "华东区域系列A的库存总量是多少?", "Answer": "374"} 44 | {"unique_id": 66, "Prompt": "华南区域系列C的库存总量是多少?", "Answer": "278"} 45 | {"unique_id": 68, "Prompt": "系列B的竞品市场占有率总和是多少?", "Answer": "23.17"} 46 | {"unique_id": 69, "Prompt": "系列A在西南区域的月度销量目标是多少?", "Answer": "57"} 47 | {"unique_id": 70, "Prompt": "系列C在华东区域的年度销量目标是多少?", "Answer": "1164"} 48 | {"unique_id": 71, "Prompt": "系列B在华南区域的库存总量是多少?", "Answer": "533"} 49 | {"unique_id": 72, "Prompt": "记录日期为2025-02-12的竞品销量总和是多少?", "Answer": "61"} 50 | {"unique_id": 74, "Prompt": "车系市场销量最高的记录日期是哪一天?", "Answer": "2025-01-27"} 51 | {"unique_id": 76, "Prompt": "政策‘燃油车新购补贴’的生效日期是什么时候?", "Answer": "2024-02-02"} 52 | {"unique_id": 77, "Prompt": "哪些政策在2025年仍然有效?", "Answer": "['燃油车新购补贴', '燃油车置换补贴']"} 53 | {"unique_id": 78, "Prompt": "总体市场销量最低的记录日期是哪一天?", "Answer": "2025-02-11"} 54 | {"unique_id": 79, "Prompt": "新能源相关的政策有哪些?", "Answer": "['新能源新购补贴', '新能源置换补贴']"} 55 | {"unique_id": 80, "Prompt": "2025年1月11日的车系市场销量是多少?", "Answer": "91"} 56 | {"unique_id": 81, "Prompt": "政策‘新能源新购补贴’的失效日期是什么时候?", "Answer": "2024-12-23"} 57 | {"unique_id": 84, "Prompt": "系列A在2025年2月7日的竞品销量是多少?", "Answer": "87"} 58 | {"unique_id": 85, "Prompt": "华东区域系列C的库存总数是多少?", "Answer": "355"} 59 | {"unique_id": 86, "Prompt": "系列B的竞品市场占有率最高的公司名称是什么?", "Answer": "华泰通安网络有限公司"} 60 | {"unique_id": 87, "Prompt": "2025年2月27日的车系市场销量是多少?", "Answer": "88"} 61 | {"unique_id": 88, "Prompt": "系列D在华北区域的库存总数是多少?", "Answer": "344"} 62 | {"unique_id": 90, "Prompt": "2025年1月20日的总体市场销量是多少?", "Answer": "742"} 63 | {"unique_id": 91, "Prompt": "系列B在华南区域的库存总数是多少?", "Answer": "533"} 64 | {"unique_id": 94, "Prompt": "系列A在华南区域的库存总数是多少?", "Answer": "562"} 65 | {"unique_id": 95, "Prompt": "新能源车的总库存数量是多少?", "Answer": "2385"} 66 | {"unique_id": 96, "Prompt": "哪个仓库的库存数量最多,数量是多少?", "Answer": "梧州县仓库, 297"} 67 | {"unique_id": 97, "Prompt": "华北区域中燃油车的库存总数是多少?", "Answer": "616"} 68 | {"unique_id": 98, "Prompt": "最后盘点时间在2025年1月的库存总数是多少?", "Answer": "1518"} 69 | {"unique_id": 99, "Prompt": "系列B在西南区域的库存总数是多少?", "Answer": "489"} 70 | {"unique_id": 100, "Prompt": "华东区域中新能源车的库存总数是多少?", "Answer": "959"} 71 | {"unique_id": 101, "Prompt": "系列C在华南区域的库存总数是多少?", "Answer": "278"} 72 | {"unique_id": 102, "Prompt": "2025年2月盘点的库存总数是多少?", "Answer": "2133"} -------------------------------------------------------------------------------- /langProBe/DB/db_program.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import re 5 | import time 6 | import traceback 7 | from datetime import datetime 8 | from typing import List, Tuple, Optional 9 | 10 | import dspy 11 | from openai import OpenAI 12 | 13 | from langProBe.dspy_program import LangProBeDSPyMetaProgram 14 | import langProBe.constants as constants 15 | 16 | from langProBe.mcp_program import MCPPredict 17 | from langProBe.program_utils import ( 18 | call_lm, 19 | build_init_messages, 20 | build_messages, 21 | response_parsing, 22 | 
mcp_calling,
23 |     ProcessManager
24 | )
25 |
26 | MCP_SAMPLE_SYSTEM_PROMPT = """
27 | You are a helpful assistant. You are able to answer questions using different tools.
28 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools.
29 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server.
30 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool.
31 | The tool description includes:
32 | A brief text description outlining the functionality of the tool.
33 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter.
34 | """
35 |
36 | USER_PROMPT_SQL = """
37 | Here is the database schema
38 | {schema}
39 |
40 | Question:
41 | {question}
42 | """
43 |
44 | USER_PROMPT_NL = """
45 | Question:
46 | {question}
47 | """
48 |
49 | class DBPredict(MCPPredict):
50 |     def __init__(self, max_steps=5, system_prompt=MCP_SAMPLE_SYSTEM_PROMPT, task_name="database_search"):
51 |         super().__init__(max_steps, system_prompt, task_name)
52 |
53 |     def forward(self, **kwargs) -> dspy.Prediction:
54 |         unique_id = kwargs.get('id')
55 |         question = kwargs.get('question')
56 |         gt = kwargs.get('answer')
57 |
58 |         manager = ProcessManager()
59 |         manager.lm_api_key = self.lm.api_key
60 |         manager.lm_api_base = self.lm.api_base
61 |         manager.model = self.lm.model
62 |         manager.id = unique_id
63 |
64 |         self.run_logger.info(f"ID: {manager.id}, Starting forward pass for question: {question}")
65 |
66 |         from langProBe.evaluation import global_config
67 |         mcps = global_config['mcp_pool']
68 |
69 |         # Build the user prompt; include the database schema when querying via raw SQL
70 |         if global_config.get('query_type', 'NL') == 'SQL':
71 |             from .DB_utils.schema import SCHEMA
72 |             user_prompt = USER_PROMPT_SQL.format(schema=SCHEMA, question=question)
73 |         else:
74 |             user_prompt = USER_PROMPT_NL.format(question=question)
75 |
76 |         messages = build_init_messages(self.system_prompt, mcps, user_prompt)
77 |         steps = 0
78 |         all_completion_tokens = 0
79 |         all_prompt_tokens = 0
80 |         start_time = time.time()
81 |
82 |         while messages[-1][constants.ROLE] != constants.ASSISTANT and steps < self.max_steps:
83 |             response, completion_tokens, prompt_tokens = call_lm(messages, manager, self.run_logger)
84 |             all_completion_tokens += completion_tokens
85 |             all_prompt_tokens += prompt_tokens
86 |             mcp_calls = response_parsing(response)
87 |
88 |             new_messages = mcp_calling(mcp_calls, manager, self.run_logger)
89 |             messages = build_messages(messages, new_messages)
90 |             steps += 1
91 |
92 |         end_time = time.time()
93 |
94 |         if messages[-1][constants.ROLE] != constants.ASSISTANT:
95 |             self.run_logger.warning("Maximum steps reached without getting an answer")
96 |             messages.append({
97 |                 constants.ROLE: constants.ASSISTANT,
98 |                 constants.CONTENT: "超过最长次数限制,该问题无法解决",
99 |             })
100 |
101 |         self.run_logger.info(f"ID: {manager.id}, Forward pass completed successfully")
102 |         success = self.evaluate_prediction(question, gt, messages[-1][constants.CONTENT])
103 |         self.log_messages(messages, question, success, (end_time - start_time), all_prompt_tokens,
104 |                           all_completion_tokens)
105 |         self.run_logger.info(f"ID: {manager.id}, Evaluation completed successfully")
106 |
107 |         return dspy.Prediction(
108 |             success=success,
109 |             question=question,
110 |             ground_truth=gt,
111 |
answer=messages[-1][constants.CONTENT],
112 |             trace=messages,
113 |             process_report=manager
114 |         )
115 |

-------------------------------------------------------------------------------- /langProBe/GAIA/__init__.py: --------------------------------------------------------------------------------

1 | from langProBe.benchmark import BenchmarkMeta, MCPBench
2 | from langProBe.mcp_program import MCPPredict
3 | from langProBe.evaluation_utils import mcp_metric
4 | from .gaia_program import GAIAPredict
5 |
6 | MCP_SAMPLE_SYSTEM_PROMPT = """
7 | You are a helpful assistant. You are able to answer questions using different tools.
8 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools.
9 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server.
10 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool.
11 | The tool description includes:
12 | A brief text description outlining the functionality of the tool.
13 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter.
14 | If you have obtained the final result, please provide your final answer enclosed within tags. Ensure that only the final answer is included, without any additional explanations or commentary.
15 | """
16 | def get_mcp_sample_benchmark():
17 |     mcp_sample_baseline = GAIAPredict(
18 |         max_steps=50,
19 |         system_prompt=MCP_SAMPLE_SYSTEM_PROMPT,
20 |         task_name="gaia")
21 |
22 |     return [
23 |         BenchmarkMeta(
24 |             MCPBench,
25 |             [mcp_sample_baseline],
26 |             mcp_metric,
27 |             optimizers=[],
28 |             name="MCP_GAIA"
29 |         )
30 |     ]
31 |
32 | benchmark = get_mcp_sample_benchmark()

-------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/__init__.py: --------------------------------------------------------------------------------

https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/__init__.py

-------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/076c8171-9b3b-49b9-a477-244d2a532826.xlsx: --------------------------------------------------------------------------------

https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/076c8171-9b3b-49b9-a477-244d2a532826.xlsx

-------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/1f975693-876d-457b-a649-393859e79bf3.mp3: --------------------------------------------------------------------------------

https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/1f975693-876d-457b-a649-393859e79bf3.mp3

-------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/2b3ef98c-cc05-450b-a719-711aee40ac65.mp3: --------------------------------------------------------------------------------

https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/2b3ef98c-cc05-450b-a719-711aee40ac65.mp3

--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/366e2f2b-8632-4ef2-81eb-bc3877489217.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/366e2f2b-8632-4ef2-81eb-bc3877489217.pdf -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt: -------------------------------------------------------------------------------- 1 | H H H 2 | -------------------------------- 3 | H H H H -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/3da89939-209c-4086-8520-7eb734e6b4ef.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/3da89939-209c-4086-8520-7eb734e6b4ef.xlsx -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/4d0aa727-86b1-406b-9b33-f870dd14a4a5.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/4d0aa727-86b1-406b-9b33-f870dd14a4a5.xlsx -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.xlsx -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/54612da3-fd56-4941-80f4-5eb82330de25.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/54612da3-fd56-4941-80f4-5eb82330de25.xlsx -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/5b2a14e8-6e59-479c-80e3-4696e8980152.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/5b2a14e8-6e59-479c-80e3-4696e8980152.jpg -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx 
-------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/6359a0b1-8f7b-499b-9336-840f9ab90688.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/6359a0b1-8f7b-499b-9336-840f9ab90688.png -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/67e8878b-5cef-4375-804e-e6291fdbe78a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/67e8878b-5cef-4375-804e-e6291fdbe78a.pdf -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.xlsx -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/8d46b8d6-b38a-47ff-ac74-cda14cf2d19b.csv: -------------------------------------------------------------------------------- 1 | species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex 2 | Adelie,Torgersen,39.1,18.7,181,3750,MALE 3 | Adelie,Torgersen,39.5,17.4,186,3800,FEMALE 4 | Adelie,Torgersen,40.3,18,195,3250,FEMALE 5 | Adelie,Torgersen,,,,, 6 | Adelie,Torgersen,36.7,19.3,193,3450,FEMALE 7 | Adelie,Torgersen,39.3,20.6,190,3650,MALE 8 | Adelie,Torgersen,38.9,17.8,181,3625,FEMALE 9 | Adelie,Torgersen,39.2,19.6,195,4675,MALE 10 | Adelie,Torgersen,34.1,18.1,193,3475, 11 | Adelie,Torgersen,42,20.2,190,4250, 12 | Adelie,Torgersen,37.8,17.1,186,3300, 13 | Adelie,Torgersen,37.8,17.3,180,3700, 14 | Adelie,Torgersen,41.1,17.6,182,3200,FEMALE 15 | Adelie,Torgersen,38.6,21.2,191,3800,MALE 16 | Adelie,Torgersen,34.6,21.1,198,4400,MALE 17 | Adelie,Torgersen,36.6,17.8,185,3700,FEMALE 18 | Adelie,Torgersen,38.7,19,195,3450,FEMALE 19 | Adelie,Torgersen,42.5,20.7,197,4500,MALE 20 | Adelie,Torgersen,34.4,18.4,184,3325,FEMALE 21 | Adelie,Torgersen,46,21.5,194,4200,MALE 22 | Adelie,Biscoe,37.8,18.3,174,3400,FEMALE 23 | Adelie,Biscoe,37.7,18.7,180,3600,MALE 24 | Adelie,Biscoe,35.9,19.2,189,3800,FEMALE 25 | Adelie,Biscoe,38.2,18.1,185,3950,MALE 26 | Adelie,Biscoe,38.8,17.2,180,3800,MALE 27 | Adelie,Biscoe,35.3,18.9,187,3800,FEMALE 28 | 
Adelie,Biscoe,40.6,18.6,183,3550,MALE 29 | Adelie,Biscoe,40.5,17.9,187,3200,FEMALE 30 | Adelie,Biscoe,37.9,18.6,172,3150,FEMALE 31 | Adelie,Biscoe,40.5,18.9,180,3950,MALE 32 | Adelie,Dream,39.5,16.7,178,3250,FEMALE 33 | Adelie,Dream,37.2,18.1,178,3900,MALE 34 | Adelie,Dream,39.5,17.8,188,3300,FEMALE 35 | Adelie,Dream,40.9,18.9,184,3900,MALE 36 | Adelie,Dream,36.4,17,195,3325,FEMALE 37 | Adelie,Dream,39.2,21.1,196,4150,MALE 38 | Adelie,Dream,38.8,20,190,3950,MALE 39 | Adelie,Dream,42.2,18.5,180,3550,FEMALE 40 | Adelie,Dream,37.6,19.3,181,3300,FEMALE 41 | Adelie,Dream,39.8,19.1,184,4650,MALE 42 | Adelie,Dream,36.5,18,182,3150,FEMALE 43 | Adelie,Dream,40.8,18.4,195,3900,MALE 44 | Adelie,Dream,36,18.5,186,3100,FEMALE 45 | Adelie,Dream,44.1,19.7,196,4400,MALE 46 | Adelie,Dream,37,16.9,185,3000,FEMALE 47 | Adelie,Dream,39.6,18.8,190,4600,MALE 48 | Adelie,Dream,41.1,19,182,3425,MALE 49 | Adelie,Dream,37.5,18.9,179,2975, 50 | Adelie,Dream,36,17.9,190,3450,FEMALE 51 | Adelie,Dream,42.3,21.2,191,4150,MALE 52 | Adelie,Biscoe,39.6,17.7,186,3500,FEMALE 53 | Adelie,Biscoe,40.1,18.9,188,4300,MALE 54 | Adelie,Biscoe,35,17.9,190,3450,FEMALE 55 | Adelie,Biscoe,42,19.5,200,4050,MALE 56 | Adelie,Biscoe,34.5,18.1,187,2900,FEMALE 57 | Adelie,Biscoe,41.4,18.6,191,3700,MALE 58 | Adelie,Biscoe,39,17.5,186,3550,FEMALE 59 | Adelie,Biscoe,40.6,18.8,193,3800,MALE 60 | Adelie,Biscoe,36.5,16.6,181,2850,FEMALE 61 | Adelie,Biscoe,37.6,19.1,194,3750,MALE 62 | Adelie,Biscoe,35.7,16.9,185,3150,FEMALE 63 | Adelie,Biscoe,41.3,21.1,195,4400,MALE 64 | Adelie,Biscoe,37.6,17,185,3600,FEMALE 65 | Adelie,Biscoe,41.1,18.2,192,4050,MALE 66 | Adelie,Biscoe,36.4,17.1,184,2850,FEMALE 67 | Adelie,Biscoe,41.6,18,192,3950,MALE 68 | Adelie,Biscoe,35.5,16.2,195,3350,FEMALE 69 | Adelie,Biscoe,41.1,19.1,188,4100,MALE 70 | Adelie,Torgersen,35.9,16.6,190,3050,FEMALE 71 | Adelie,Torgersen,41.8,19.4,198,4450,MALE 72 | Adelie,Torgersen,33.5,19,190,3600,FEMALE 73 | Adelie,Torgersen,39.7,18.4,190,3900,MALE 74 | Adelie,Torgersen,39.6,17.2,196,3550,FEMALE 75 | Adelie,Torgersen,45.8,18.9,197,4150,MALE 76 | Adelie,Torgersen,35.5,17.5,190,3700,FEMALE 77 | Adelie,Torgersen,42.8,18.5,195,4250,MALE 78 | Adelie,Torgersen,40.9,16.8,191,3700,FEMALE 79 | Adelie,Torgersen,37.2,19.4,184,3900,MALE 80 | Adelie,Torgersen,36.2,16.1,187,3550,FEMALE 81 | Adelie,Torgersen,42.1,19.1,195,4000,MALE 82 | Adelie,Torgersen,34.6,17.2,189,3200,FEMALE 83 | Adelie,Torgersen,42.9,17.6,196,4700,MALE 84 | Adelie,Torgersen,36.7,18.8,187,3800,FEMALE 85 | Adelie,Torgersen,35.1,19.4,193,4200,MALE 86 | Adelie,Dream,37.3,17.8,191,3350,FEMALE 87 | Adelie,Dream,41.3,20.3,194,3550,MALE 88 | Adelie,Dream,36.3,19.5,190,3800,MALE 89 | Adelie,Dream,36.9,18.6,189,3500,FEMALE 90 | Adelie,Dream,38.3,19.2,189,3950,MALE 91 | Adelie,Dream,38.9,18.8,190,3600,FEMALE 92 | Adelie,Dream,35.7,18,202,3550,FEMALE 93 | Adelie,Dream,41.1,18.1,205,4300,MALE 94 | Adelie,Dream,34,17.1,185,3400,FEMALE 95 | Adelie,Dream,39.6,18.1,186,4450,MALE 96 | Adelie,Dream,36.2,17.3,187,3300,FEMALE 97 | Adelie,Dream,40.8,18.9,208,4300,MALE 98 | Adelie,Dream,38.1,18.6,190,3700,FEMALE 99 | Adelie,Dream,40.3,18.5,196,4350,MALE 100 | Adelie,Dream,33.1,16.1,178,2900,FEMALE 101 | Adelie,Dream,43.2,18.5,192,4100,MALE 102 | Adelie,Biscoe,35,17.9,192,3725,FEMALE 103 | Adelie,Biscoe,41,20,203,4725,MALE 104 | Adelie,Biscoe,37.7,16,183,3075,FEMALE 105 | Adelie,Biscoe,37.8,20,190,4250,MALE 106 | Adelie,Biscoe,37.9,18.6,193,2925,FEMALE 107 | Adelie,Biscoe,39.7,18.9,184,3550,MALE 108 | Adelie,Biscoe,38.6,17.2,199,3750,FEMALE 109 | 
Adelie,Biscoe,38.2,20,190,3900,MALE 110 | Adelie,Biscoe,38.1,17,181,3175,FEMALE 111 | Adelie,Biscoe,43.2,19,197,4775,MALE 112 | Adelie,Biscoe,38.1,16.5,198,3825,FEMALE 113 | Adelie,Biscoe,45.6,20.3,191,4600,MALE 114 | Adelie,Biscoe,39.7,17.7,193,3200,FEMALE 115 | Adelie,Biscoe,42.2,19.5,197,4275,MALE 116 | Adelie,Biscoe,39.6,20.7,191,3900,FEMALE 117 | Adelie,Biscoe,42.7,18.3,196,4075,MALE 118 | Adelie,Torgersen,38.6,17,188,2900,FEMALE 119 | Adelie,Torgersen,37.3,20.5,199,3775,MALE 120 | Adelie,Torgersen,35.7,17,189,3350,FEMALE 121 | Adelie,Torgersen,41.1,18.6,189,3325,MALE 122 | Adelie,Torgersen,36.2,17.2,187,3150,FEMALE 123 | Adelie,Torgersen,37.7,19.8,198,3500,MALE 124 | Adelie,Torgersen,40.2,17,176,3450,FEMALE 125 | Adelie,Torgersen,41.4,18.5,202,3875,MALE 126 | Adelie,Torgersen,35.2,15.9,186,3050,FEMALE 127 | Adelie,Torgersen,40.6,19,199,4000,MALE 128 | Adelie,Torgersen,38.8,17.6,191,3275,FEMALE 129 | Adelie,Torgersen,41.5,18.3,195,4300,MALE 130 | Adelie,Torgersen,39,17.1,191,3050,FEMALE 131 | Adelie,Torgersen,44.1,18,210,4000,MALE 132 | Adelie,Torgersen,38.5,17.9,190,3325,FEMALE 133 | Adelie,Torgersen,43.1,19.2,197,3500,MALE 134 | Adelie,Dream,36.8,18.5,193,3500,FEMALE 135 | Adelie,Dream,37.5,18.5,199,4475,MALE 136 | Adelie,Dream,38.1,17.6,187,3425,FEMALE 137 | Adelie,Dream,41.1,17.5,190,3900,MALE 138 | Adelie,Dream,35.6,17.5,191,3175,FEMALE 139 | Adelie,Dream,40.2,20.1,200,3975,MALE 140 | Adelie,Dream,37,16.5,185,3400,FEMALE 141 | Adelie,Dream,39.7,17.9,193,4250,MALE 142 | Adelie,Dream,40.2,17.1,193,3400,FEMALE 143 | Adelie,Dream,40.6,17.2,187,3475,MALE 144 | Adelie,Dream,32.1,15.5,188,3050,FEMALE 145 | Adelie,Dream,40.7,17,190,3725,MALE 146 | Adelie,Dream,37.3,16.8,192,3000,FEMALE 147 | Adelie,Dream,39,18.7,185,3650,MALE 148 | Adelie,Dream,39.2,18.6,190,4250,MALE 149 | Adelie,Dream,36.6,18.4,184,3475,FEMALE 150 | Adelie,Dream,36,17.8,195,3450,FEMALE 151 | Adelie,Dream,37.8,18.1,193,3750,MALE 152 | Adelie,Dream,36,17.1,187,3700,FEMALE 153 | Adelie,Dream,41.5,18.5,201,4000,MALE 154 | Chinstrap,Dream,46.5,17.9,192,3500,FEMALE 155 | Chinstrap,Dream,50,19.5,196,3900,MALE 156 | Chinstrap,Dream,51.3,19.2,193,3650,MALE 157 | Chinstrap,Dream,45.4,18.7,188,3525,FEMALE 158 | Chinstrap,Dream,52.7,19.8,197,3725,MALE 159 | Chinstrap,Dream,45.2,17.8,198,3950,FEMALE 160 | Chinstrap,Dream,46.1,18.2,178,3250,FEMALE 161 | Chinstrap,Dream,51.3,18.2,197,3750,MALE 162 | Chinstrap,Dream,46,18.9,195,4150,FEMALE 163 | Chinstrap,Dream,51.3,19.9,198,3700,MALE 164 | Chinstrap,Dream,46.6,17.8,193,3800,FEMALE 165 | Chinstrap,Dream,51.7,20.3,194,3775,MALE 166 | Chinstrap,Dream,47,17.3,185,3700,FEMALE 167 | Chinstrap,Dream,52,18.1,201,4050,MALE 168 | Chinstrap,Dream,45.9,17.1,190,3575,FEMALE 169 | Chinstrap,Dream,50.5,19.6,201,4050,MALE 170 | Chinstrap,Dream,50.3,20,197,3300,MALE 171 | Chinstrap,Dream,58,17.8,181,3700,FEMALE 172 | Chinstrap,Dream,46.4,18.6,190,3450,FEMALE 173 | Chinstrap,Dream,49.2,18.2,195,4400,MALE 174 | Chinstrap,Dream,42.4,17.3,181,3600,FEMALE 175 | Chinstrap,Dream,48.5,17.5,191,3400,MALE 176 | Chinstrap,Dream,43.2,16.6,187,2900,FEMALE 177 | Chinstrap,Dream,50.6,19.4,193,3800,MALE 178 | Chinstrap,Dream,46.7,17.9,195,3300,FEMALE 179 | Chinstrap,Dream,52,19,197,4150,MALE 180 | Chinstrap,Dream,50.5,18.4,200,3400,FEMALE 181 | Chinstrap,Dream,49.5,19,200,3800,MALE 182 | Chinstrap,Dream,46.4,17.8,191,3700,FEMALE 183 | Chinstrap,Dream,52.8,20,205,4550,MALE 184 | Chinstrap,Dream,40.9,16.6,187,3200,FEMALE 185 | Chinstrap,Dream,54.2,20.8,201,4300,MALE 186 | Chinstrap,Dream,42.5,16.7,187,3350,FEMALE 187 
| Chinstrap,Dream,51,18.8,203,4100,MALE 188 | Chinstrap,Dream,49.7,18.6,195,3600,MALE 189 | Chinstrap,Dream,47.5,16.8,199,3900,FEMALE 190 | Chinstrap,Dream,47.6,18.3,195,3850,FEMALE 191 | Chinstrap,Dream,52,20.7,210,4800,MALE 192 | Chinstrap,Dream,46.9,16.6,192,2700,FEMALE 193 | Chinstrap,Dream,53.5,19.9,205,4500,MALE 194 | Chinstrap,Dream,49,19.5,210,3950,MALE 195 | Chinstrap,Dream,46.2,17.5,187,3650,FEMALE 196 | Chinstrap,Dream,50.9,19.1,196,3550,MALE 197 | Chinstrap,Dream,45.5,17,196,3500,FEMALE 198 | Chinstrap,Dream,50.9,17.9,196,3675,FEMALE 199 | Chinstrap,Dream,50.8,18.5,201,4450,MALE 200 | Chinstrap,Dream,50.1,17.9,190,3400,FEMALE 201 | Chinstrap,Dream,49,19.6,212,4300,MALE 202 | Chinstrap,Dream,51.5,18.7,187,3250,MALE 203 | Chinstrap,Dream,49.8,17.3,198,3675,FEMALE 204 | Chinstrap,Dream,48.1,16.4,199,3325,FEMALE 205 | Chinstrap,Dream,51.4,19,201,3950,MALE 206 | Chinstrap,Dream,45.7,17.3,193,3600,FEMALE 207 | Chinstrap,Dream,50.7,19.7,203,4050,MALE 208 | Chinstrap,Dream,42.5,17.3,187,3350,FEMALE 209 | Chinstrap,Dream,52.2,18.8,197,3450,MALE 210 | Chinstrap,Dream,45.2,16.6,191,3250,FEMALE 211 | Chinstrap,Dream,49.3,19.9,203,4050,MALE 212 | Chinstrap,Dream,50.2,18.8,202,3800,MALE 213 | Chinstrap,Dream,45.6,19.4,194,3525,FEMALE 214 | Chinstrap,Dream,51.9,19.5,206,3950,MALE 215 | Chinstrap,Dream,46.8,16.5,189,3650,FEMALE 216 | Chinstrap,Dream,45.7,17,195,3650,FEMALE 217 | Chinstrap,Dream,55.8,19.8,207,4000,MALE 218 | Chinstrap,Dream,43.5,18.1,202,3400,FEMALE 219 | Chinstrap,Dream,49.6,18.2,193,3775,MALE 220 | Chinstrap,Dream,50.8,19,210,4100,MALE 221 | Chinstrap,Dream,50.2,18.7,198,3775,FEMALE 222 | Gentoo,Biscoe,46.1,13.2,211,4500,FEMALE 223 | Gentoo,Biscoe,50,16.3,230,5700,MALE 224 | Gentoo,Biscoe,48.7,14.1,210,4450,FEMALE 225 | Gentoo,Biscoe,50,15.2,218,5700,MALE 226 | Gentoo,Biscoe,47.6,14.5,215,5400,MALE 227 | Gentoo,Biscoe,46.5,13.5,210,4550,FEMALE 228 | Gentoo,Biscoe,45.4,14.6,211,4800,FEMALE 229 | Gentoo,Biscoe,46.7,15.3,219,5200,MALE 230 | Gentoo,Biscoe,43.3,13.4,209,4400,FEMALE 231 | Gentoo,Biscoe,46.8,15.4,215,5150,MALE 232 | Gentoo,Biscoe,40.9,13.7,214,4650,FEMALE 233 | Gentoo,Biscoe,49,16.1,216,5550,MALE 234 | Gentoo,Biscoe,45.5,13.7,214,4650,FEMALE 235 | Gentoo,Biscoe,48.4,14.6,213,5850,MALE 236 | Gentoo,Biscoe,45.8,14.6,210,4200,FEMALE 237 | Gentoo,Biscoe,49.3,15.7,217,5850,MALE 238 | Gentoo,Biscoe,42,13.5,210,4150,FEMALE 239 | Gentoo,Biscoe,49.2,15.2,221,6300,MALE 240 | Gentoo,Biscoe,46.2,14.5,209,4800,FEMALE 241 | Gentoo,Biscoe,48.7,15.1,222,5350,MALE 242 | Gentoo,Biscoe,50.2,14.3,218,5700,MALE 243 | Gentoo,Biscoe,45.1,14.5,215,5000,FEMALE 244 | Gentoo,Biscoe,46.5,14.5,213,4400,FEMALE 245 | Gentoo,Biscoe,46.3,15.8,215,5050,MALE 246 | Gentoo,Biscoe,42.9,13.1,215,5000,FEMALE 247 | Gentoo,Biscoe,46.1,15.1,215,5100,MALE 248 | Gentoo,Biscoe,44.5,14.3,216,4100, 249 | Gentoo,Biscoe,47.8,15,215,5650,MALE 250 | Gentoo,Biscoe,48.2,14.3,210,4600,FEMALE 251 | Gentoo,Biscoe,50,15.3,220,5550,MALE 252 | Gentoo,Biscoe,47.3,15.3,222,5250,MALE 253 | Gentoo,Biscoe,42.8,14.2,209,4700,FEMALE 254 | Gentoo,Biscoe,45.1,14.5,207,5050,FEMALE 255 | Gentoo,Biscoe,59.6,17,230,6050,MALE 256 | Gentoo,Biscoe,49.1,14.8,220,5150,FEMALE 257 | Gentoo,Biscoe,48.4,16.3,220,5400,MALE 258 | Gentoo,Biscoe,42.6,13.7,213,4950,FEMALE 259 | Gentoo,Biscoe,44.4,17.3,219,5250,MALE 260 | Gentoo,Biscoe,44,13.6,208,4350,FEMALE 261 | Gentoo,Biscoe,48.7,15.7,208,5350,MALE 262 | Gentoo,Biscoe,42.7,13.7,208,3950,FEMALE 263 | Gentoo,Biscoe,49.6,16,225,5700,MALE 264 | Gentoo,Biscoe,45.3,13.7,210,4300,FEMALE 265 | 
Gentoo,Biscoe,49.6,15,216,4750,MALE 266 | Gentoo,Biscoe,50.5,15.9,222,5550,MALE 267 | Gentoo,Biscoe,43.6,13.9,217,4900,FEMALE 268 | Gentoo,Biscoe,45.5,13.9,210,4200,FEMALE 269 | Gentoo,Biscoe,50.5,15.9,225,5400,MALE 270 | Gentoo,Biscoe,44.9,13.3,213,5100,FEMALE 271 | Gentoo,Biscoe,45.2,15.8,215,5300,MALE 272 | Gentoo,Biscoe,46.6,14.2,210,4850,FEMALE 273 | Gentoo,Biscoe,48.5,14.1,220,5300,MALE 274 | Gentoo,Biscoe,45.1,14.4,210,4400,FEMALE 275 | Gentoo,Biscoe,50.1,15,225,5000,MALE 276 | Gentoo,Biscoe,46.5,14.4,217,4900,FEMALE 277 | Gentoo,Biscoe,45,15.4,220,5050,MALE 278 | Gentoo,Biscoe,43.8,13.9,208,4300,FEMALE 279 | Gentoo,Biscoe,45.5,15,220,5000,MALE 280 | Gentoo,Biscoe,43.2,14.5,208,4450,FEMALE 281 | Gentoo,Biscoe,50.4,15.3,224,5550,MALE 282 | Gentoo,Biscoe,45.3,13.8,208,4200,FEMALE 283 | Gentoo,Biscoe,46.2,14.9,221,5300,MALE 284 | Gentoo,Biscoe,45.7,13.9,214,4400,FEMALE 285 | Gentoo,Biscoe,54.3,15.7,231,5650,MALE 286 | Gentoo,Biscoe,45.8,14.2,219,4700,FEMALE 287 | Gentoo,Biscoe,49.8,16.8,230,5700,MALE 288 | Gentoo,Biscoe,46.2,14.4,214,4650, 289 | Gentoo,Biscoe,49.5,16.2,229,5800,MALE 290 | Gentoo,Biscoe,43.5,14.2,220,4700,FEMALE 291 | Gentoo,Biscoe,50.7,15,223,5550,MALE 292 | Gentoo,Biscoe,47.7,15,216,4750,FEMALE 293 | Gentoo,Biscoe,46.4,15.6,221,5000,MALE 294 | Gentoo,Biscoe,48.2,15.6,221,5100,MALE 295 | Gentoo,Biscoe,46.5,14.8,217,5200,FEMALE 296 | Gentoo,Biscoe,46.4,15,216,4700,FEMALE 297 | Gentoo,Biscoe,48.6,16,230,5800,MALE 298 | Gentoo,Biscoe,47.5,14.2,209,4600,FEMALE 299 | Gentoo,Biscoe,51.1,16.3,220,6000,MALE 300 | Gentoo,Biscoe,45.2,13.8,215,4750,FEMALE 301 | Gentoo,Biscoe,45.2,16.4,223,5950,MALE 302 | Gentoo,Biscoe,49.1,14.5,212,4625,FEMALE 303 | Gentoo,Biscoe,52.5,15.6,221,5450,MALE 304 | Gentoo,Biscoe,47.4,14.6,212,4725,FEMALE 305 | Gentoo,Biscoe,50,15.9,224,5350,MALE 306 | Gentoo,Biscoe,44.9,13.8,212,4750,FEMALE 307 | Gentoo,Biscoe,50.8,17.3,228,5600,MALE 308 | Gentoo,Biscoe,43.4,14.4,218,4600,FEMALE 309 | Gentoo,Biscoe,51.3,14.2,218,5300,MALE 310 | Gentoo,Biscoe,47.5,14,212,4875,FEMALE 311 | Gentoo,Biscoe,52.1,17,230,5550,MALE 312 | Gentoo,Biscoe,47.5,15,218,4950,FEMALE 313 | Gentoo,Biscoe,52.2,17.1,228,5400,MALE 314 | Gentoo,Biscoe,45.5,14.5,212,4750,FEMALE 315 | Gentoo,Biscoe,49.5,16.1,224,5650,MALE 316 | Gentoo,Biscoe,44.5,14.7,214,4850,FEMALE 317 | Gentoo,Biscoe,50.8,15.7,226,5200,MALE 318 | Gentoo,Biscoe,49.4,15.8,216,4925,MALE 319 | Gentoo,Biscoe,46.9,14.6,222,4875,FEMALE 320 | Gentoo,Biscoe,48.4,14.4,203,4625,FEMALE 321 | Gentoo,Biscoe,51.1,16.5,225,5250,MALE 322 | Gentoo,Biscoe,48.5,15,219,4850,FEMALE 323 | Gentoo,Biscoe,55.9,17,228,5600,MALE 324 | Gentoo,Biscoe,47.2,15.5,215,4975,FEMALE 325 | Gentoo,Biscoe,49.1,15,228,5500,MALE 326 | Gentoo,Biscoe,47.3,13.8,216,4725, 327 | Gentoo,Biscoe,46.8,16.1,215,5500,MALE 328 | Gentoo,Biscoe,41.7,14.7,210,4700,FEMALE 329 | Gentoo,Biscoe,53.4,15.8,219,5500,MALE 330 | Gentoo,Biscoe,43.3,14,208,4575,FEMALE 331 | Gentoo,Biscoe,48.1,15.1,209,5500,MALE 332 | Gentoo,Biscoe,50.5,15.2,216,5000,FEMALE 333 | Gentoo,Biscoe,49.8,15.9,229,5950,MALE 334 | Gentoo,Biscoe,43.5,15.2,213,4650,FEMALE 335 | Gentoo,Biscoe,51.5,16.3,230,5500,MALE 336 | Gentoo,Biscoe,46.2,14.1,217,4375,FEMALE 337 | Gentoo,Biscoe,55.1,16,230,5850,MALE 338 | Gentoo,Biscoe,44.5,15.7,217,4875, 339 | Gentoo,Biscoe,48.8,16.2,222,6000,MALE 340 | Gentoo,Biscoe,47.2,13.7,214,4925,FEMALE 341 | Gentoo,Biscoe,,,,, 342 | Gentoo,Biscoe,46.8,14.3,215,4850,FEMALE 343 | Gentoo,Biscoe,50.4,15.7,222,5750,MALE 344 | Gentoo,Biscoe,45.2,14.8,212,5200,FEMALE 345 | 
Gentoo,Biscoe,49.9,16.1,213,5400,MALE 346 | -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/8f80e01c-1296-4371-9486-bb3d68651a60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/8f80e01c-1296-4371-9486-bb3d68651a60.png -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/9318445f-fe6a-4e1b-acbf-c68228c9906a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/9318445f-fe6a-4e1b-acbf-c68228c9906a.png -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3 -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/__init__.py -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/b2c257e0-3ad7-4f05-b8e3-d9da973be36e.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/b2c257e0-3ad7-4f05-b8e3-d9da973be36e.jpg -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/b7f857e4-d8aa-4387-af2a-0e844df5b9d8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/b7f857e4-d8aa-4387-af2a-0e844df5b9d8.png -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/bec74516-02fc-48dc-b202-55e78d0e17cf.jsonld: -------------------------------------------------------------------------------- 1 | { 2 | 
"@context": "http://schema.org", 3 | "@type": "Collection", 4 | "@id": "https://doi.org/10.5447/ipk/2022/29", 5 | "url": "https://doi.ipk-gatersleben.de:443/DOI/64fb788c-7495-4800-8568-fd562b07017e/fbda7260-8307-485e-a9b7-d84292e3eb04/2", 6 | "additionalType": "directory", 7 | "name": "GLOBAL STRATEGY FOR THE CONSERVATION OF POTATO", 8 | "author": { 9 | "name": "Manuela Nagel", 10 | "givenName": "Manuela", 11 | "familyName": "Nagel", 12 | "affiliation": { 13 | "@type": "Organization", 14 | "name": "Leibniz Institute of Plant Genetics and Crop Plant Research (IPK), Seeland OT Gatersleben, Corrensstraße 3, 06466, Germany" 15 | }, 16 | "@id": "https://orcid.org/0000-0003-0396-0333" 17 | }, 18 | "editor": [ 19 | { 20 | "name": "Ehsan Dulloo", 21 | "givenName": "Ehsan", 22 | "familyName": "Dulloo", 23 | "affiliation": { 24 | "@type": "Organization", 25 | "name": "International Consultant, ," 26 | }, 27 | "contributorType": "Researcher" 28 | }, 29 | { 30 | "name": "Prishnee Bissessur", 31 | "givenName": "Prishnee", 32 | "familyName": "Bissessur", 33 | "affiliation": { 34 | "@type": "Organization", 35 | "name": "International Consultant, ," 36 | }, 37 | "contributorType": "Researcher" 38 | }, 39 | { 40 | "name": "Tatjana Gavrilenko", 41 | "givenName": "Tatjana", 42 | "familyName": "Gavrilenko", 43 | "affiliation": { 44 | "@type": "Organization", 45 | "name": "N.I. Vavilov All-Russian Institute of Plant Genetic Resources, , Russia" 46 | }, 47 | "contributorType": "Researcher", 48 | "@id": "https://orcid.org/0000-0002-2605-6569" 49 | }, 50 | { 51 | "name": "John Bamberg", 52 | "givenName": "John", 53 | "familyName": "Bamberg", 54 | "affiliation": { 55 | "@type": "Organization", 56 | "name": "U. S. Potato Genebank, , USA" 57 | }, 58 | "contributorType": "Researcher", 59 | "@id": "https://orcid.org/0000-0001-6102-7846" 60 | }, 61 | { 62 | "name": "David Ellis", 63 | "givenName": "David", 64 | "familyName": "Ellis", 65 | "affiliation": { 66 | "@type": "Organization", 67 | "name": "International Potato Center (CIP), , Peru" 68 | }, 69 | "contributorType": "Researcher", 70 | "@id": "https://orcid.org/0000-0002-0209-2784" 71 | }, 72 | { 73 | "name": "Peter Giovannini", 74 | "givenName": "Peter", 75 | "familyName": "Giovannini", 76 | "affiliation": { 77 | "@type": "Organization", 78 | "name": "Global Crop Diversity Trust, ," 79 | }, 80 | "contributorType": "Researcher", 81 | "@id": "https://orcid.org/0000-0002-1053-2030" 82 | } 83 | ], 84 | "description": "Cultivated potato, Solanum tuberosum ssp. tuberosum, is the third most consumed crop globally and important not only for food but also for for the animal feed, pharmaceutical, textile and paper industries. To gain an overview on the current state of the conservation and use of potato genetic resources, the Global Crop Diversity Trust (Crop Trust), commissioned an update of the ‘Global conservation strategy for potato genetic resources’. 
This updated strategy aims to support the efficiency and effectiveness of potato diversity conservation at national, regional and international levels, and to identify priorities for strengthening the conservation and use of potato genetic resources.", 85 | "keywords": "ex situ conservation, plant genetic resources, potato, Solanum tuberosum, global strategy, conservation strategy, wild potato, Andigenum group, Chilotanum group, native potato variety, genebank, accession, true potato seed, potato tuber, late blight", 86 | "inLanguage": "en", 87 | "contentSize": "0 B", 88 | "datePublished": "2022", 89 | "schemaVersion": "http://datacite.org/schema/kernel-4", 90 | "publisher": { 91 | "@type": "Organization", 92 | "name": "e!DAL - Plant Genomics and Phenomics Research Data Repository (PGP), IPK Gatersleben, Seeland OT Gatersleben, Corrensstraße 3, 06466, Germany" 93 | }, 94 | "provider": { 95 | "@type": "Organization", 96 | "name": "datacite" 97 | } 98 | } -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/bfcd99e1-0690-4b53-a85c-0174a8629083.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/bfcd99e1-0690-4b53-a85c-0174a8629083.zip -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/c526d8d6-5987-4da9-b24c-83466fa172f3.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/c526d8d6-5987-4da9-b24c-83466fa172f3.xlsx -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/cca70ce6-1952-45d2-acd4-80c903b0bc49.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/cca70ce6-1952-45d2-acd4-80c903b0bc49.png -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de.png -------------------------------------------------------------------------------- 
/langProBe/GAIA/data/2023/validation/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/df6561b2-7ee5-4540-baab-5095f742716a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/df6561b2-7ee5-4540-baab-5095f742716a.png -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/edd4d4f2-1a58-45c4-b038-67337af4e029.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/edd4d4f2-1a58-45c4-b038-67337af4e029.xlsx -------------------------------------------------------------------------------- /langProBe/GAIA/data/2023/validation/f918266a-b3e0-4914-865d-4faa564f1aef.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | import time 3 | 4 | class UhOh(Exception): 5 | pass 6 | 7 | class Hmm: 8 | def __init__(self): 9 | self.value = randint(-100, 100) 10 | 11 | def Yeah(self): 12 | if self.value == 0: 13 | return True 14 | else: 15 | raise UhOh() 16 | 17 | def Okay(): 18 | while True: 19 | yield Hmm() 20 | 21 | def keep_trying(go, first_try=True): 22 | maybe = next(go) 23 | try: 24 | if maybe.Yeah(): 25 | return maybe.value 26 | except UhOh: 27 | if first_try: 28 | print("Working...") 29 | print("Please wait patiently...") 30 | time.sleep(0.1) 31 | return keep_trying(go, first_try=False) 32 | 33 | if __name__ == "__main__": 34 | go = Okay() 35 | print(f"{keep_trying(go)}") 36 | -------------------------------------------------------------------------------- /langProBe/GAIA/data/GAIA.py: -------------------------------------------------------------------------------- 1 | """GAIA 2023 dataset.""" 2 | 3 | 4 | import json 5 | import os 6 | 7 | import datasets 8 | 9 | 10 | _CITATION = """ """ 11 | 12 | _DESCRIPTION = """ """ 13 | 14 | _HOMEPAGE = "" 15 | 16 | _LICENSE = "" 17 | 18 | _NAMES = [ 19 | "2023_all", 20 | "2023_level1", 21 | "2023_level2", 22 | "2023_level3", 23 | ] 24 | 25 | YEAR_TO_LEVELS = {"2023": [1, 2, 3]} 26 | 27 | separator = "_" 28 | 29 | 30 | class GAIA_dataset(datasets.GeneratorBasedBuilder): 31 | VERSION = datasets.Version("0.0.1") 32 | 33 | BUILDER_CONFIGS = [ 34 | datasets.BuilderConfig(name=name, version=version, description=name) 35 | for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) 36 | ] 37 | 38 | def _info(self): 39 | features = datasets.Features( 40 | { 41 | "task_id": datasets.Value("string"), 42 | "Question": datasets.Value("string"), 43 | "Level": 
datasets.Value("string"), 44 | "Final answer": datasets.Value("string"), # ? for test values 45 | "file_name": datasets.Value("string"), 46 | "file_path": datasets.Value("string"), # generated here 47 | "Annotator Metadata": {k: datasets.Value("string") for k in ["Steps", "Number of steps", "How long did this take?", "Tools", "Number of tools"]} # "", 48 | } 49 | ) 50 | return datasets.DatasetInfo( 51 | description=_DESCRIPTION, 52 | features=features, 53 | homepage=_HOMEPAGE, 54 | license=_LICENSE, 55 | citation=_CITATION, 56 | ) 57 | 58 | def _split_generators(self, dl_manager): 59 | year, level_name = self.config.name.split(separator) 60 | if level_name == "all": 61 | levels = YEAR_TO_LEVELS[year] 62 | else: 63 | level_name = int(level_name.split("level")[1]) 64 | levels = [level_name] 65 | print(year, level_name) 66 | 67 | output = [] 68 | for split in ["test", "validation"]: 69 | root_file = dl_manager.download(os.path.join(year, split, "metadata.jsonl")) 70 | test_attached_files = {"": ""} 71 | with open(root_file, "r", encoding="utf-8") as f: 72 | for line in f: 73 | cur_line = json.loads(line) 74 | if cur_line["Level"] in levels and cur_line["file_name"] != "": 75 | attached_file_name = cur_line["file_name"] 76 | attached_file = dl_manager.download(os.path.join(year, split, attached_file_name)) 77 | test_attached_files[attached_file_name] = attached_file 78 | 79 | output.append( 80 | datasets.SplitGenerator( 81 | name=getattr(datasets.Split, split.upper()), 82 | gen_kwargs={"root_file": root_file, "attached_files": test_attached_files, "levels": levels}, 83 | ) 84 | ) 85 | return output 86 | 87 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 88 | def _generate_examples(self, root_file: str, attached_files: dict, levels: list[int]): 89 | with open(root_file, "r", encoding="utf-8") as f: 90 | for key, line in enumerate(f): 91 | cur_line = json.loads(line) 92 | if cur_line["Level"] in levels: 93 | cur_line["file_path"] = attached_files[cur_line["file_name"]] 94 | yield key, cur_line 95 | 96 | 97 | -------------------------------------------------------------------------------- /langProBe/GAIA/data/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - en 4 | pretty_name: General AI Assistants Benchmark 5 | extra_gated_prompt: "To avoid contamination and data leakage, you agree to not reshare this dataset outside of a gated or private repository on the HF hub." 6 | extra_gated_fields: 7 | I agree to not reshare the GAIA submissions set according to the above conditions: checkbox 8 | --- 9 | # GAIA dataset 10 | 11 | GAIA is a benchmark which aims at evaluating next-generation LLMs (LLMs with augmented capabilities due to added tooling, efficient prompting, access to search, etc). 12 | 13 | We added gating to prevent bots from scraping the dataset. Please do not reshare the validation or test set in a crawlable format. 14 | 15 | ## Data and leaderboard 16 | GAIA is made of more than 450 non-trivial question with an unambiguous answer, requiring different levels of tooling and autonomy to solve. It is therefore divided in 3 levels, where level 1 should be breakable by very good LLMs, and level 3 indicate a strong jump in model capabilities. Each level is divided into a fully public dev set for validation, and a test set with private answers and metadata. 17 | 18 | GAIA leaderboard can be found in this space (https://huggingface.co/spaces/gaia-benchmark/leaderboard). 
19 | 20 | Questions are contained in metadata.jsonl. Some questions come with an additional file, which can be found in the same folder; its id is given in the field file_name. 21 | 22 | More details are in [the paper](https://arxiv.org/abs/2311.12983) for now, and will soon be here as well. -------------------------------------------------------------------------------- /langProBe/GAIA/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/__init__.py -------------------------------------------------------------------------------- /langProBe/GAIA/data/statics.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from collections import defaultdict 4 | 5 | def parse_tools(tools_str): 6 | """ 7 | Parse the Tools string and split it into a list of individual tools. 8 | Assumes the Tools field lists one tool per line, each starting with a number and a period, e.g.: 9 | "1. Web browser 10 | 2. Image recognition tools (to identify and parse a figure with three axes)" 11 | """ 12 | tools = [] 13 | # Match each tool entry with a regular expression 14 | pattern = re.compile(r'\d+\.\s*(.*)') 15 | for line in tools_str.split('\n'): 16 | match = pattern.match(line.strip()) 17 | if match: 18 | tool = match.group(1).strip() 19 | # Strip any parenthesized notes 20 | tool = re.sub(r'\s*\(.*\)', '', tool) 21 | tools.append(tool) 22 | return tools 23 | 24 | def process_jsonl(file_path): 25 | tool_counts = defaultdict(int) 26 | total_tools = 0 27 | tool_numbers = [] 28 | processed_tasks = 0 29 | 30 | with open(file_path, 'r', encoding='utf-8') as f: 31 | for line_number, line in enumerate(f, 1): 32 | line = line.strip() 33 | if not line: 34 | continue # skip empty lines 35 | # Debug info: confirm which line is being processed 36 | print(f"Processing line {line_number}") 37 | 38 | try: 39 | data = json.loads(line) 40 | except json.JSONDecodeError as e: 41 | print(f"Line {line_number}: JSON decode error: {e}") 42 | continue 43 | 44 | # Extract the Annotator Metadata 45 | annotator_metadata = data.get("Annotator Metadata", {}) 46 | if not annotator_metadata: 47 | print(f"Line {line_number}: 'Annotator Metadata' field not found.") 48 | continue 49 | 50 | number_of_tools = annotator_metadata.get("Number of tools") 51 | tools_str = annotator_metadata.get("Tools", "") 52 | 53 | if number_of_tools is None: 54 | print(f"Line {line_number}: 'Number of tools' field not found.") 55 | else: 56 | try: 57 | num_tools = int(number_of_tools) 58 | tool_numbers.append(num_tools) 59 | except ValueError: 60 | print(f"Line {line_number}: 'Number of tools' is not a valid integer.") 61 | 62 | if not tools_str: 63 | print(f"Line {line_number}: 'Tools' field is empty.") 64 | continue 65 | 66 | tools = parse_tools(tools_str) 67 | print(f"Line {line_number}: parsed tools: {tools}") 68 | print(f"Line {line_number}: number of tools: {len(tools)}") 69 | 70 | # Verify that Number of tools matches the number of parsed tools 71 | if number_of_tools: 72 | try: 73 | num_tools = int(number_of_tools) 74 | if num_tools != len(tools): 75 | print(f"Line {line_number}: Number of tools ({num_tools}) does not match the parsed tool count ({len(tools)}).") 76 | except ValueError: 77 | pass # already handled above 78 | 79 | # Count occurrences of each tool 80 | for tool in tools: 81 | tool_counts[tool] += 1 82 | total_tools += 1 83 | 84 | processed_tasks += 1 85 | 86 | return tool_counts, tool_numbers, total_tools, processed_tasks 87 | 88 | def main(): 89 | jsonl_file = '2023/validation/metadata.jsonl' # replace with the path to your JSONL file 90 | tool_counts, tool_numbers, total_tools, processed_tasks = process_jsonl(jsonl_file) 91 | 92 | print("\nTotal occurrences of each tool:") 93 | if not tool_counts: 94 | print("No tools were counted. Check the file contents and the parsing logic.") 95 | else: 96 | for tool,
count in sorted(tool_counts.items(), key=lambda x: x[1], reverse=True): 97 | print(f"{tool}: {count}") 98 | 99 | # Compute and print the average number of tools 100 | if tool_numbers: 101 | average_tools = sum(tool_numbers) / len(tool_numbers) 102 | print(f"\nAverage number of tools per question: {average_tools:.2f}") 103 | else: 104 | print("\nNo 'Number of tools' data was counted.") 105 | 106 | print(f"\nTotal questions processed: {processed_tasks}") 107 | print(f"Total number of tools: {total_tools}") 108 | 109 | if __name__ == "__main__": 110 | main() 111 | -------------------------------------------------------------------------------- /langProBe/GAIA/gaia_program.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import re 5 | import time 6 | import traceback 7 | from datetime import datetime 8 | from typing import List, Tuple, Optional 9 | from langProBe.evaluation_utils import question_scorer 10 | 11 | from langProBe.mcp_program import MCPPredict 12 | 13 | import dspy 14 | from openai import OpenAI 15 | 16 | from langProBe.dspy_program import LangProBeDSPyMetaProgram 17 | import langProBe.constants as constants 18 | 19 | 20 | from langProBe.program_utils import ( 21 | call_lm, 22 | build_init_messages, 23 | build_messages, 24 | response_parsing, 25 | mcp_calling, 26 | ProcessManager 27 | ) 28 | 29 | MCP_SAMPLE_SYSTEM_PROMPT = """ 30 | You are a helpful assistant. You are able to answer questions using different tools. 31 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools. 32 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server. 33 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool. 34 | The tool description includes: 35 | A brief text description outlining the functionality of the tool. 36 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter. 37 | If you have obtained the final result, please provide your final answer enclosed within <answer></answer> tags. Ensure that only the final answer is included, without any additional explanations or commentary.
38 | """ 39 | 40 | class GAIAPredict(MCPPredict): 41 | def __init__(self, max_steps=5, system_prompt=MCP_SAMPLE_SYSTEM_PROMPT, task_name="gaia"): 42 | super().__init__(max_steps, system_prompt, task_name) 43 | 44 | def evaluate_prediction(self, question: str, ground_truth: str, prediction: str) -> Tuple[bool, Optional[str]]: 45 | return question_scorer(prediction, ground_truth, self.run_logger) 46 | 47 | def extract_last_answer(self, text): 48 | pattern = re.compile(r'(.*?)', re.DOTALL) 49 | matches = pattern.findall(text) 50 | 51 | if matches: 52 | return matches[-1] 53 | else: 54 | return None 55 | 56 | def forward(self, **kwargs) -> dspy.Prediction: 57 | unique_id = kwargs.get('id') 58 | question = kwargs.get('question') 59 | gt = kwargs.get('answer') 60 | 61 | manager = ProcessManager() 62 | manager.lm_api_key = self.lm.api_key 63 | manager.lm_api_base = self.lm.api_base 64 | manager.model = self.lm.model 65 | manager.id = unique_id 66 | 67 | self.run_logger.info(f"ID: {manager.id}, Starting forward pass for question: {question}") 68 | 69 | from langProBe.evaluation import global_config 70 | mcps = global_config['mcp_pool'] 71 | 72 | messages = build_init_messages(self.system_prompt, mcps, question) 73 | steps = 0 74 | all_completion_tokens = 0 75 | all_prompt_tokens = 0 76 | start_time = time.time() 77 | 78 | while not messages[-1][constants.ROLE] == constants.ASSISTANT and steps < self.max_steps: 79 | response, completion_tokens, prompt_tokens = call_lm(messages, manager, self.run_logger) 80 | all_completion_tokens += completion_tokens 81 | all_prompt_tokens += prompt_tokens 82 | mcp_calls = response_parsing(response) 83 | 84 | new_messages = mcp_calling(mcp_calls, manager, self.run_logger) 85 | messages = build_messages(messages, new_messages) 86 | steps += 1 87 | 88 | end_time = time.time() 89 | 90 | if messages[-1][constants.ROLE] != constants.ASSISTANT: 91 | self.run_logger.warning("Maximum steps reached without getting an answer") 92 | messages.append({ 93 | constants.ROLE: constants.ASSISTANT, 94 | constants.CONTENT: "超过最长次数限制,该问题无法解决", 95 | }) 96 | 97 | self.run_logger.info(f"ID: {manager.id}, Forward pass completed successfully") 98 | success = self.evaluate_prediction(question, gt, self.extract_last_answer(messages[-1][constants.CONTENT])) 99 | self.log_messages(messages, question, success, (end_time - start_time), all_prompt_tokens, 100 | all_completion_tokens) 101 | self.run_logger.info(f"ID: {manager.id}, Evaluation completed successfully") 102 | 103 | return dspy.Prediction( 104 | success=success, 105 | question=question, 106 | ground_truth=gt, 107 | answer=messages[-1][constants.CONTENT], 108 | trace=messages, 109 | process_report=manager 110 | ) -------------------------------------------------------------------------------- /langProBe/WebSearch/__init__.py: -------------------------------------------------------------------------------- 1 | from langProBe.benchmark import BenchmarkMeta, MCPBench 2 | from langProBe.mcp_program import MCPPredict 3 | from langProBe.evaluation_utils import mcp_metric 4 | 5 | MCP_SAMPLE_SYSTEM_PROMPT = """ 6 | You are a helpful assistant. You are able to answer questions using different tools. 7 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools. 8 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server. 
9 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool. 10 | The tool description includes: 11 | A brief text description outlining the functionality of the tool. 12 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter. 13 | """ 14 | 15 | def get_mcp_sample_benchmark(): 16 | mcp_sample_baseline = MCPPredict( 17 | max_steps=5, 18 | system_prompt=MCP_SAMPLE_SYSTEM_PROMPT, 19 | task_name="websearch") 20 | 21 | return [ 22 | BenchmarkMeta( 23 | MCPBench, 24 | [mcp_sample_baseline], 25 | mcp_metric, 26 | optimizers=[], 27 | name="MCP_WEBSEARCH" # add an explicit benchmark name 28 | ) 29 | ] 30 | 31 | benchmark = get_mcp_sample_benchmark() -------------------------------------------------------------------------------- /langProBe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/__init__.py -------------------------------------------------------------------------------- /langProBe/analysis.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import pandas as pd 4 | 5 | 6 | def read_evaluation_results(dir: str): 7 | # Define the path to the directory 8 | file_path = pathlib.Path(dir) 9 | 10 | # List all .txt files in the directory 11 | all_result_files = list(file_path.rglob("*.txt")) 12 | 13 | # Initialize a list to store the extracted data 14 | extracted_data = [] 15 | 16 | # Process each file 17 | for file in all_result_files: 18 | # Split the filename to get the benchmark and program names 19 | file_name_parts = file.stem.split("_") 20 | if len(file_name_parts) >= 3: 21 | benchmark = ''.join(file_name_parts[:-1]) 22 | program = file_name_parts[-1] 23 | else: 24 | raise ValueError(f"Invalid file name: {file.name}") 25 | 26 | with open(file, "r") as f: 27 | lines = f.readlines() 28 | 29 | # Extract information from the lines 30 | if len(lines) == 2: # Checking if we have 2 lines 31 | header = lines[0].strip() 32 | values = lines[1].strip().split(",") 33 | 34 | # Check if optimizer is present in the file content 35 | if "optimizer" in header: 36 | # Extract values for a file with an optimizer 37 | data = { 38 | "file_name": file.name, 39 | "benchmark": benchmark, 40 | "program": program, 41 | "score": float(values[0]), 42 | "cost": float(values[1]), 43 | "input_tokens": int(values[2]), 44 | "output_tokens": int(values[3]), 45 | } 46 | else: 47 | # Extract values for a file without an optimizer 48 | data = { 49 | "file_name": file.name, 50 | "benchmark": benchmark, 51 | "program": program, 52 | "score": float(values[0]), 53 | "cost": float(values[1]), 54 | "input_tokens": int(values[2]), 55 | "output_tokens": int(values[3]), 56 | } 57 | 58 | # Append the extracted data to the list 59 | extracted_data.append(data) 60 | 61 | # Convert the list of dictionaries to a pandas DataFrame 62 | # import pdb; pdb.set_trace() 63 | df = pd.DataFrame(extracted_data) 64 | df = canonicalize_program(df) 65 | return df 66 | 67 | 68 | program_mapping = { 69 | "AppWorldReact": "ReActBaseline", 70 | "AppWorldReactAugumented": "ReActAugumented", 71 | "Predict": "Predict", 72 | "ChainOfThought": "CoT", 73 | "GeneratorCriticRanker": "GeneratorCriticRanker", 74 | "GeneratorCriticFuser": "GeneratorCriticFuser", 75 | "RAG": "RAG", 76 |
"EvaluationValidityPredict": "Predict", 77 | "EvaluationValidityModule": "CoT", 78 | "CoT": "CoT", 79 | "Classify": "CoTBasedVote", 80 | "HeartDiseaseClassify": "CoTBasedVote", 81 | "RetrieveMultiHop": "RetrieveMultiHop", 82 | "SimplifiedBaleen": "SimplifiedBaleen", 83 | "SimplifiedBaleenWithHandwrittenInstructions": "SimplifiedBaleenWithInst", 84 | "UnderspecifiedAnnotationCoT": "CoT", 85 | "UnderspecifiedAnnotationGeneratorCriticFuser": "GeneratorCriticFuser", 86 | "UnderspecifiedAnnotationGeneratorCriticRanker": "GeneratorCriticRanker", 87 | "EvaluationValidityGeneratorCriticRanker": "GeneratorCriticRanker", 88 | "EvaluationValidityGeneratorCriticFuser": "GeneratorCriticFuser", 89 | "UnderspecifiedAnnotationPredict": "Predict", 90 | "EvaluationValidityCoT": "CoT", 91 | "EvaluationValidityPredict": "Predict", 92 | # Relook at the following programs 93 | "IReRaCOT": "CoT", 94 | "IReRaPredict": "Predict", 95 | "Infer": "CoT", 96 | "InferRetrieve": "RAG", 97 | "IReRaRetrieve": "RAG", 98 | "IReRaRetrieveRank": "RAGBasedRank", 99 | "InferRetrieveRank": "RAGBasedRank", 100 | "HoverMultiHopPredict": "Predict", 101 | "HoverMultiHop": "MultiHopSummarize", 102 | } 103 | 104 | 105 | def canonicalize_program(data_df): 106 | # Update the benchmark names based on the program 107 | data_df.loc[ 108 | data_df["program"].isin( 109 | [ 110 | "UnderspecifiedAnnotationCoT", 111 | "UnderspecifiedAnnotationPredict", 112 | "UnderspecifiedAnnotationGeneratorCriticFuser", 113 | "UnderspecifiedAnnotationGeneratorCriticRanker", 114 | ] 115 | ), 116 | "benchmark", 117 | ] = "SWEBenchUnderspecified" 118 | 119 | data_df.loc[ 120 | data_df["program"].isin( 121 | [ 122 | "EvaluationValidityCoT", 123 | "EvaluationValidityPredict", 124 | "EvaluationValidityGeneratorCriticFuser", 125 | "EvaluationValidityGeneratorCriticRanker", 126 | ] 127 | ), 128 | "benchmark", 129 | ] = "SWEBenchValidity" 130 | data_df["program"] = data_df["program"].replace(program_mapping) 131 | data_df["benchmark"] = data_df["benchmark"].apply(lambda x: x.replace("Bench", "")) 132 | return data_df 133 | -------------------------------------------------------------------------------- /langProBe/async_mcp_client.py: -------------------------------------------------------------------------------- 1 | from contextlib import AsyncExitStack 2 | from typing import Optional 3 | 4 | from anthropic import Anthropic 5 | from mcp import ClientSession 6 | from mcp.client.sse import sse_client 7 | 8 | 9 | class AsyncMCPClient: 10 | 11 | def __init__(self): 12 | # Initialize session and client objects 13 | self.session: Optional[ClientSession] = None 14 | self.exit_stack = AsyncExitStack() 15 | self.anthropic = Anthropic() 16 | 17 | async def connect_to_sse_server(self, server_url: str): 18 | """Connect to an MCP server running with SSE transport""" 19 | # Store the context managers so they stay alive 20 | self._streams_context = sse_client(url=server_url) 21 | streams = await self._streams_context.__aenter__() 22 | 23 | self._session_context = ClientSession(*streams) 24 | self.session: ClientSession = await self._session_context.__aenter__() 25 | 26 | # Initialize 27 | await self.session.initialize() 28 | 29 | # List available tools to verify connection 30 | # print("Initialized SSE client...") 31 | # print("Listing tools...") 32 | response = await self.session.list_tools() 33 | tools = response.tools 34 | # print("\nConnected to server with tools:", [tool.name for tool in tools]) 35 | 36 | async def cleanup(self): 37 | """Properly clean up the session and 
streams""" 38 | if self._session_context: 39 | await self._session_context.__aexit__(None, None, None) 40 | if self._streams_context: 41 | await self._streams_context.__aexit__(None, None, None) 42 | 43 | async def call_tool(self, tool_name: str, tool_args: dict) -> dict: 44 | """Call a tool with the given arguments""" 45 | result = await self.session.call_tool(tool_name, tool_args) 46 | return result 47 | 48 | async def list_tools(self): 49 | """List available tools""" 50 | response = await self.session.list_tools() 51 | return response 52 | 53 | async def get_prompt(self, *args, **kwargs): 54 | response = await self.session.get_prompt(*args, **kwargs) 55 | return response 56 | 57 | async def list_prompts(self): 58 | response = await self.session.list_prompts() 59 | return response 60 | 61 | async def list_resources(self): 62 | response = await self.session.list_resources() 63 | return response 64 | 65 | async def read_resource(self, *args, **kwargs): 66 | response = await self.session.read_resource(*args, **kwargs) 67 | return response 68 | 69 | async def process_query(self, query: str) -> str: 70 | """Process a query using Claude and available tools""" 71 | messages = [ 72 | { 73 | "role": "user", 74 | "content": query 75 | } 76 | ] 77 | 78 | response = await self.session.list_tools() 79 | available_tools = [{ 80 | "name": tool.name, 81 | "description": tool.description, 82 | "input_schema": tool.inputSchema 83 | } for tool in response.tools] 84 | 85 | # Initial Claude API call 86 | response = self.anthropic.messages.create( 87 | model="claude-3-5-sonnet-20241022", 88 | max_tokens=1000, 89 | messages=messages, 90 | tools=available_tools 91 | ) 92 | 93 | # Process response and handle tool calls 94 | tool_results = [] 95 | final_text = [] 96 | 97 | for content in response.content: 98 | if content.type == 'text': 99 | final_text.append(content.text) 100 | elif content.type == 'tool_use': 101 | tool_name = content.name 102 | tool_args = content.input 103 | 104 | # Execute tool call 105 | result = await self.session.call_tool(tool_name, tool_args) 106 | tool_results.append({"call": tool_name, "result": result}) 107 | final_text.append(f"[Calling tool {tool_name} with args {tool_args}]") 108 | 109 | # Continue conversation with tool results 110 | if hasattr(content, 'text') and content.text: 111 | messages.append({ 112 | "role": "assistant", 113 | "content": content.text 114 | }) 115 | messages.append({ 116 | "role": "user", 117 | "content": result.content 118 | }) 119 | 120 | # Get next response from Claude 121 | response = self.anthropic.messages.create( 122 | model="claude-3-5-sonnet-20241022", 123 | max_tokens=1000, 124 | messages=messages, 125 | ) 126 | 127 | final_text.append(response.content[0].text) 128 | 129 | return "\n".join(final_text) 130 | 131 | async def chat_loop(self): 132 | """Run an interactive chat loop""" 133 | # print("\nMCP Client Started!") 134 | # print("Type your queries or 'quit' to exit.") 135 | 136 | while True: 137 | try: 138 | query = input("\nQuery: ").strip() 139 | 140 | if query.lower() == 'quit': 141 | break 142 | 143 | response = await self.process_query(query) 144 | print("\n" + response) 145 | 146 | except Exception as e: 147 | print(f"\nError: {str(e)}") 148 | 149 | # async def main(): 150 | # client = AsyncMCPClient() 151 | # try: 152 | # await client.connect_to_sse_server(server_url="http://localhost:8080/sse") 153 | # result = await client.call_tool("get_alerts", {"state": "CA"}) 154 | # print(result) 155 | # finally: 156 | # await client.cleanup() 
157 | 158 | 159 | # result = asyncio.run(main()) -------------------------------------------------------------------------------- /langProBe/benchmark.py: -------------------------------------------------------------------------------- 1 | import random, os 2 | from abc import ABC, abstractmethod 3 | from dataclasses import dataclass, field 4 | from enum import Enum 5 | from typing import Callable, List, Type 6 | 7 | import dspy 8 | from dspy.evaluate import Evaluate 9 | from dspy.teleprompt import Teleprompter 10 | 11 | import langProBe.optimizers as langprobe_optimizers 12 | from langProBe.dspy_program import LangProBeDSPyMetaProgram 13 | from langProBe.config_utils import read_json, read_jsonl 14 | from langProBe.program_utils import ProcessManager 15 | 16 | 17 | 18 | 19 | dataset_size = {"full": None, "lite": 500, "tiny": 200, "test": 2} 20 | 21 | 22 | class Benchmark(ABC): 23 | def __init__(self, dataset_mode="lite"): 24 | # dataset for training and validation 25 | self.dataset = None 26 | # dataset for the actual benchmarking 27 | self.test_set = None 28 | self.train_set = None 29 | self.dev_set = None 30 | self.val_set = None 31 | 32 | self.init_dataset() 33 | assert self.dataset is not None, "Dataset not initialized" 34 | assert self.test_set is not None, "Test set not initialized" 35 | self.max_testset_size = dataset_size[dataset_mode] 36 | 37 | self.test_set = self.trim_dataset(self.test_set, self.max_testset_size) 38 | 39 | # TODO: FIXME: "test" option is for debugging purposes only, should be removed for final release 40 | if dataset_mode == "test": 41 | self.dataset = self.trim_dataset(self.dataset, 60) 42 | self.create_splits() 43 | self.test_set = self.trim_dataset(self.test_set, 50) 44 | 45 | if not self.train_set or not self.dev_set or not self.val_set: 46 | self.create_splits() 47 | 48 | self.train_set = self.trim_dataset(self.train_set, 150) 49 | self.dev_set = self.trim_dataset(self.dev_set, 300) 50 | self.val_set = self.trim_dataset(self.val_set, 300) 51 | 52 | assert self.train_set is not None, "Train set not initialized" 53 | assert self.dev_set is not None, "Dev set not initialized" 54 | assert self.val_set is not None, "Val set not initialized" 55 | 56 | @abstractmethod 57 | def init_dataset(self) -> None: 58 | """ 59 | Initializes the dataset for the benchmark, and sets it to self.dataset. 60 | Each element in the dataset should be an instance of dspy.Example. 61 | """ 62 | return 63 | 64 | def trim_dataset(self, dataset, size: int) -> None: 65 | if size is None or size >= len(dataset): 66 | return dataset 67 | rng = random.Random() 68 | rng.seed(1) 69 | return rng.sample(dataset, size) 70 | 71 | def create_splits(self) -> None: 72 | """ 73 | Creates the splits for the dataset (not including test). 74 | Upon completion, self.train_set, self.dev_set, and self.val_set should be set. 
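With the default slicing implemented below, a 100-example dataset would give dev = examples [0, 40), val = examples [40, 80), and train = examples [80, 100), i.e. a 40/40/20 split.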
75 | """ 76 | 77 | total_len = len(self.dataset) 78 | self.dev_set = self.dataset[: int(0.4 * total_len)] 79 | self.val_set = self.dataset[int(0.4 * total_len) : int(0.8 * total_len)] 80 | self.train_set = self.dataset[int(0.8 * total_len) :] 81 | 82 | def get_dataset(self): 83 | return self.dataset 84 | 85 | def get_train_set(self): 86 | return self.train_set 87 | 88 | def get_dev_set(self): 89 | return self.dev_set 90 | 91 | def get_test_set(self): 92 | return self.test_set 93 | 94 | 95 | class MCPBench(Benchmark): 96 | def __init__(self, dataset_mode="lite", dataset_path=None, missing_data=[]): 97 | self.dataset_path = dataset_path 98 | self.missing_data = missing_data 99 | super().__init__(dataset_mode=dataset_mode) 100 | 101 | def init_dataset(self): 102 | self.dataset = [] 103 | self.test_set = [] 104 | if self.missing_data: 105 | test_raw_data = self.missing_data 106 | else: 107 | test_raw_data = read_jsonl(self.dataset_path) 108 | 109 | for test_data in test_raw_data: 110 | self.test_set.append( 111 | dspy.Example( 112 | id=test_data["unique_id"], 113 | question=test_data["Prompt"], 114 | answer=test_data["Answer"], 115 | ).with_inputs("id", "question", "answer", "config") 116 | ) 117 | 118 | 119 | 120 | 121 | @dataclass 122 | class EvaluationResult: 123 | benchmark: str 124 | program: str 125 | 126 | score: float 127 | cost: float 128 | input_tokens: int 129 | output_tokens: int 130 | 131 | outputs_raw_data: List|None = None 132 | 133 | # optimizer: str = None 134 | # optimized_program: dspy.Module = None 135 | # optimizer_input_tokens: int = None 136 | # optimizer_output_tokens: int = None 137 | # optimizer_cost: float = None 138 | 139 | # optimizer_program_scores: list[float] = None 140 | 141 | 142 | @dataclass 143 | class BenchmarkMeta: 144 | benchmark: Type[Benchmark] 145 | program: List[dspy.Module] 146 | metric: Callable 147 | dataset_mode: str = "lite" 148 | 149 | optimizers: List[langprobe_optimizers.OptimizerConfig] = field( 150 | default_factory=lambda: langprobe_optimizers.DEFAULT_OPTIMIZERS 151 | ) 152 | 153 | # BenchmarkMeta.num_threads has higher priority than run time argument of num_threads 154 | # use this as an upper bound for the number of threads to use 155 | num_threads: int = None 156 | name: str = None 157 | 158 | 159 | def setup_lm(dspy_config=None): 160 | lm: dspy.LM = dspy_config.get("lm", dspy.settings.lm) 161 | assert lm is not None, "dspy language model not set" 162 | 163 | lm = lm.copy() 164 | assert len(lm.history) == 0, "language model history not empty" 165 | return lm 166 | 167 | 168 | # def calculate_stats(lm: dspy.LM) -> tuple[float, int, int]: 169 | # cost = 0 170 | # input_tokens = 0 171 | # output_tokens = 0 172 | # for i, trace in enumerate(lm.history): 173 | # cost += trace.get("cost", None) or 0 174 | # input_tokens += trace.get("usage", 0).get("prompt_tokens", 0) 175 | # output_tokens += trace.get("usage", 0).get("completion_tokens", 0) 176 | 177 | # return cost, input_tokens, output_tokens 178 | 179 | def calculate_stats(manager: List[ProcessManager]) -> tuple[float, float, float]: 180 | input_tokens = sum(usage["prompt_tokens"] for trace in manager for usage in trace.lm_usages) 181 | output_tokens = sum(usage["completion_tokens"] for trace in manager for usage in trace.lm_usages) 182 | 183 | avg_input = input_tokens // len(manager) 184 | avg_output = output_tokens // len(manager) 185 | 186 | return 0, avg_input, avg_output 187 | 188 | 189 | 190 | class EvaluateBench(ABC): 191 | def __init__( 192 | self, 193 | benchmark: Benchmark, 194 | 
program: dspy.Module, 195 | metric: Callable, 196 | lm: str, 197 | benchmark_name: str = None, 198 | num_threads: int = 1, 199 | api_key: str = None, 200 | api_base: str = None, 201 | ): 202 | self.benchmark = benchmark 203 | self.program = program 204 | 205 | self.program.setup_lm(lm, api_key=api_key, api_base=api_base) 206 | self.metric = metric 207 | self.num_threads = num_threads 208 | devset = benchmark.get_test_set() 209 | self.evaluate_prog = Evaluate( 210 | devset=devset, 211 | metric=self.metric, 212 | num_threads=self.num_threads, 213 | display_progress=True, 214 | max_errors=5000, 215 | return_outputs=True, 216 | provide_traceback=True, 217 | ) 218 | 219 | self.program_name = getattr( 220 | self.program, "_name", self.program.__class__.__name__ 221 | ) 222 | self.benchmark_name = benchmark_name or self.benchmark.__class__.__name__ 223 | self.results: list[EvaluationResult] = [] 224 | 225 | def get_empty_results(self): 226 | return EvaluationResult( 227 | benchmark=self.benchmark_name, 228 | program=self.program_name, 229 | score=0, 230 | cost=0, 231 | input_tokens=0, 232 | output_tokens=0, 233 | ) 234 | 235 | 236 | def evaluate_baseline(self, dspy_config=None) -> EvaluationResult: 237 | with dspy.context(**dspy_config): 238 | score, info = self.evaluate_prog(self.program) 239 | result = self.get_empty_results() 240 | datasets, outputs, _ = zip(*info) 241 | managers = [one.process_report for one in outputs] 242 | 243 | result.score = score 244 | result.outputs_raw_data = outputs 245 | result.cost, result.input_tokens, result.output_tokens = calculate_stats(managers) 246 | 247 | return result 248 | 249 | def evaluate(self, dspy_config=None) -> EvaluationResult: 250 | """ 251 | Args: 252 | dspy_config: A dictionary of configurations for dspy.context 253 | Returns: 254 | A list of EvaluationResult objects. 255 | """ 256 | if dspy_config is None: 257 | dspy_config = {} 258 | 259 | result = self.evaluate_baseline(dspy_config) 260 | self.results = result 261 | return result 262 | -------------------------------------------------------------------------------- /langProBe/config_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | def read_json(file_path): 3 | """ 4 | Read a JSON file and return the content as a dictionary. 5 | """ 6 | with open(file_path, 'r') as file: 7 | data = json.load(file) 8 | return data 9 | 10 | def read_jsonl(file_path): 11 | """ 12 | Read a JSONL file and return the content as a list of dictionaries. 
13 | """ 14 | data = [] 15 | with open(file_path, 'r', encoding='utf-8') as f: 16 | for line in f: 17 | test_data = json.loads(line) 18 | data.append(test_data) 19 | return data -------------------------------------------------------------------------------- /langProBe/constants.py: -------------------------------------------------------------------------------- 1 | ROLE = 'role' 2 | CONTENT = 'content' 3 | SYSTEM = 'system' 4 | USER = 'user' 5 | ASSISTANT = 'assistant' 6 | TOOL = 'tool' 7 | TOOL_CALLS = 'tool_calls' -------------------------------------------------------------------------------- /langProBe/dspy_program.py: -------------------------------------------------------------------------------- 1 | import dspy 2 | 3 | 4 | #################################### Common Programs #################################### 5 | 6 | 7 | def deduplicate(seq: list[str]) -> list[str]: 8 | """ 9 | Source: https://stackoverflow.com/a/480227/1493011 10 | """ 11 | 12 | seen = set() 13 | return [x for x in seq if not (x in seen or seen.add(x))] 14 | 15 | 16 | class LangProBeDSPyMetaProgram(dspy.Module): 17 | def setup_lm(self, lm, api_key=None, api_base=None): 18 | dspy.settings.experimental = True 19 | self.lm = dspy.LM(lm, api_key=api_key, api_base=api_base) 20 | self.set_lm(self.lm) 21 | 22 | def program_type(self): 23 | return "dspy" 24 | 25 | 26 | class Predict(dspy.Predict, LangProBeDSPyMetaProgram): 27 | pass 28 | 29 | 30 | class CoT(dspy.ChainOfThought, LangProBeDSPyMetaProgram): 31 | pass 32 | 33 | 34 | def default_input_to_query(**kwargs): 35 | if len(kwargs) == 1: 36 | return list(kwargs.values())[0] 37 | else: 38 | raise ValueError( 39 | "Cannot convert multiple inputs to a query, please specify input_to_query." 40 | ) 41 | 42 | 43 | class RAG(LangProBeDSPyMetaProgram, dspy.Module): 44 | def __init__( 45 | self, 46 | signature, 47 | retriever=dspy.Retrieve(k=3), 48 | input_to_query=default_input_to_query, 49 | ): 50 | self.retriver = retriever 51 | verified_signature = dspy.ensure_signature(signature) 52 | verified_signature = verified_signature.prepend( 53 | "context", dspy.InputField(desc="may contain relevant facts") 54 | ) 55 | self.prog = dspy.ChainOfThought(verified_signature) 56 | self.input_to_query = input_to_query 57 | 58 | def forward(self, **kwargs): 59 | context = self.retriver(self.input_to_query(**kwargs)).passages 60 | pred = self.prog(context=context, **kwargs) 61 | return pred 62 | 63 | 64 | class SimplifiedBaleen(LangProBeDSPyMetaProgram, dspy.Module): 65 | def __init__( 66 | self, signature, query_gen_input=None, retriever=dspy.Retrieve(k=2), max_hops=2 67 | ): 68 | """ 69 | args: 70 | signature: The signature to the final generate module 71 | query_gen_input: a list of keywords to be used as input to the query generation module 72 | retriever: a retriever module to be used to retrieve relevant facts 73 | max_hops: the number of hops to be used in the simplified 74 | FIXME (shangyin) correctly handle query_gen_input 75 | """ 76 | 77 | self.max_hops = max_hops 78 | self.retriever = retriever 79 | verified_signature = dspy.ensure_signature(signature) 80 | verified_signature = verified_signature.prepend( 81 | "context", dspy.InputField(desc="may contain relevant facts") 82 | ) 83 | 84 | # remove the output field from the generate query signature 85 | # generate_query should use a default instruction rather than instruction from the original signature 86 | # FIXME (shangyin) fix the default signature.instructions 87 | input_fields = verified_signature.input_fields 88 | 
generate_query_signature = dspy.Signature(input_fields) 89 | generate_query_signature = generate_query_signature.append( 90 | "search_query", dspy.OutputField() 91 | ) 92 | 93 | self.generate_query = [ 94 | dspy.ChainOfThought(generate_query_signature) for _ in range(self.max_hops) 95 | ] 96 | self.generate_answer = dspy.ChainOfThought(verified_signature) 97 | 98 | def forward(self, **kwargs): 99 | context = [] 100 | 101 | for hop in range(self.max_hops): 102 | query = self.generate_query[hop](context=context, **kwargs).search_query 103 | passages = self.retriever(query).passages 104 | context = deduplicate(context + passages) 105 | 106 | pred = self.generate_answer(context=context, **kwargs) 107 | return pred 108 | 109 | 110 | #################################### Archon Programs #################################### 111 | 112 | # Note Ranker and Fuser are equipped with self.get_prediction() method to return a Prediction object 113 | # in the original signature 114 | 115 | 116 | class ArchonGenerator(LangProBeDSPyMetaProgram, dspy.Module): 117 | # https://github.com/ScalingIntelligence/Archon/blob/main/src/archon/completions/components/Generator.py 118 | 119 | def __init__(self, signature, n=5): 120 | # For dspy, n responses are generated with a single model now. 121 | # If desired, we can create a new module in dspy that uses multiple models to generate n responses. 122 | verified_signature = dspy.ensure_signature(signature) 123 | assert ( 124 | len(verified_signature.output_fields) == 1 125 | ), "ArchonGenerator only supports a single output field" 126 | 127 | self.prog = dspy.ChainOfThought(verified_signature, n=n) 128 | self.output_field = list(verified_signature.output_fields.keys())[0] 129 | 130 | def forward(self, **kwargs) -> dspy.Prediction: 131 | return self.prog(**kwargs) 132 | 133 | def get_responses(self, **kwargs) -> list[str]: 134 | responses = self.prog(**kwargs).completions.__getattr__(self.output_field) 135 | return responses 136 | 137 | def get_formatted_responses(self, **kwargs) -> str: 138 | responses = self.get_responses(**kwargs) 139 | return responses_formatter(responses) 140 | 141 | 142 | def responses_formatter(responses): 143 | if not isinstance(responses, list): 144 | dspy.logger.warning( 145 | "Responses of CriticGenerator should be a list of responses. " 146 | ) 147 | responses = [responses] 148 | formatted_responses = [] 149 | for i, response in enumerate(responses): 150 | formatted_responses.append(f"[{i+1}] {response}") 151 | return "\n".join(formatted_responses) 152 | 153 | 154 | class FeedbackGeneratorSignature(dspy.Signature): 155 | """ 156 | Evaluate all responses based on their relevance to the instructions. 157 | All the responses should be included and evaluated using identifiers. 158 | You must include both strengths and weaknesses, even if there are more of one than the other. 159 | Start with the analysis for the first response and end with the analysis for the last response. 160 | """ 161 | 162 | task_instructions = dspy.InputField( 163 | desc="The instructions on how the responses are generated." 164 | ) 165 | responses = dspy.InputField( 166 | desc="The generated responses to critize. Each response will start with a numerical identifier in [], like [1].", 167 | ) 168 | feedback: list[str] = dspy.OutputField( 169 | desc="The feedback for each response. Discuss the strengths and weaknesses of each response." 
170 | ) 171 | 172 | 173 | class ArchonCritic(LangProBeDSPyMetaProgram, dspy.Module): 174 | # https://github.com/ScalingIntelligence/Archon/blob/main/src/archon/completions/components/Critic.py 175 | 176 | def __init__(self, signature, n=5): 177 | # signature should be the signature to the original generator module 178 | verified_signature = dspy.ensure_signature(signature) 179 | assert ( 180 | len(verified_signature.output_fields) == 1 181 | ), "ArchonCritic only supports a single output field" 182 | self.signature = verified_signature 183 | 184 | self.instructions = verified_signature.instructions 185 | feedback_gen_signature = FeedbackGeneratorSignature 186 | # add all inputfields from the original signature to the feedback_gen_signature 187 | for name, field in reversed(verified_signature.input_fields.items()): 188 | feedback_gen_signature = feedback_gen_signature.prepend(name, field) 189 | 190 | self.feedback_gen = dspy.ChainOfThought(feedback_gen_signature) 191 | 192 | def forward(self, formatted_responses, **kwargs) -> dspy.Prediction: 193 | return self.feedback_gen( 194 | task_instructions=self.instructions, responses=formatted_responses, **kwargs 195 | ) 196 | 197 | def get_feedback(self, formatted_responses: str, **kwargs) -> list[str]: 198 | return self.forward(formatted_responses, **kwargs).feedback 199 | 200 | 201 | class RankerGeneratorSignature(dspy.Signature): 202 | """ 203 | Rank the responses based on their relevance to the instruction, in descending order (from most relevant to least relevant). 204 | """ 205 | 206 | task_instructions = dspy.InputField( 207 | desc="The instructions on how the responses are generated." 208 | ) 209 | 210 | responses = dspy.InputField( 211 | desc="The responses to rank. Each response will start with a numerical identifier in [], like [1].", 212 | ) 213 | 214 | ranking: list[int] = dspy.OutputField( 215 | desc="The ranking of the responses. List the responses in descending order of relevance to the instructions." 216 | ) 217 | 218 | 219 | class ArchonRanker(LangProBeDSPyMetaProgram, dspy.Module): 220 | # https://github.com/ScalingIntelligence/Archon/blob/main/src/archon/completions/components/prompts.py#L68 221 | def __init__(self, signature, n=5, use_critic=False): 222 | verified_signature = dspy.ensure_signature(signature) 223 | assert ( 224 | len(verified_signature.output_fields) == 1 225 | ), "ArchonRanker only supports a single output field" 226 | self.signature = verified_signature 227 | self.instructions = verified_signature.instructions 228 | 229 | ranker_signature = RankerGeneratorSignature 230 | if use_critic: 231 | ranker_signature = ranker_signature.append( 232 | "feedback", 233 | dspy.InputField( 234 | desc="The feedback (strength/weakness) for each response." 235 | ), 236 | ) 237 | ranker_signature.instructions += ( 238 | "and their provided critiques of strengths and weaknesses." 
239 | ) 240 | 241 | # add all inputfields from the original signature to the feedback_gen_signature 242 | for name, field in reversed(verified_signature.input_fields.items()): 243 | ranker_signature = ranker_signature.prepend(name, field) 244 | 245 | self.ranker = dspy.ChainOfThought(ranker_signature) 246 | 247 | def forward(self, formatted_responses: str, **kwargs): 248 | return self.ranker( 249 | task_instructions=self.instructions, responses=formatted_responses, **kwargs 250 | ) 251 | 252 | def get_ranking(self, formatted_responses: str, **kwargs) -> list[int]: 253 | return self.forward(formatted_responses, **kwargs).ranking 254 | 255 | def get_prediction(self, responses: list[str], **kwargs) -> dspy.Prediction: 256 | formatted_responses = responses_formatter(responses) 257 | ranking = self.get_ranking(formatted_responses, **kwargs) 258 | top_response = responses[ranking[0]] 259 | pred = dspy.Prediction() 260 | pred.__setattr__(list(self.signature.output_fields.keys())[0], top_response) 261 | return pred 262 | 263 | 264 | class FuserGeneratorSignature(dspy.Signature): 265 | """ 266 | Your task is to synthesize a list of responses to a task into a single, high-quality response of the same format. Do not include explanations. 267 | """ 268 | 269 | task_instructions = dspy.InputField( 270 | desc="The instructions on how the responses are generated. Your final response should FOLLOW these instructions." 271 | ) 272 | 273 | responses = dspy.InputField( 274 | desc="The responses to synthesize.", 275 | ) 276 | 277 | final_response = dspy.OutputField( 278 | desc="""The final response, compiled from the input responses. 279 | Please provide a single response with the same format as all previous responses, excluding the number identifier. 280 | Ensure your response is well-structured, coherent, and adheres to the highest standards of accuracy and reliability. """ 281 | ) 282 | 283 | 284 | class ArchonFuser(LangProBeDSPyMetaProgram, dspy.Module): 285 | def __init__(self, signature, use_critic=False): 286 | verified_signature = dspy.ensure_signature(signature) 287 | assert ( 288 | len(verified_signature.output_fields) == 1 289 | ), "ArchonFuser only supports a single output field" 290 | self.signature = verified_signature 291 | self.instructions = verified_signature.instructions 292 | 293 | fuser_signature = FuserGeneratorSignature 294 | if use_critic: 295 | fuser_signature = fuser_signature.append( 296 | "feedback", 297 | dspy.InputField( 298 | desc="The feedback (strength/weakness) for each response." 299 | ), 300 | ) 301 | fuser_signature.instructions += "For each response, we also provide critiques of strengths and weaknesses." 
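        # Illustrative aside (not from the original source): the "numerical
        # identifier" convention that the Critic/Ranker/Fuser signatures rely on
        # is produced by responses_formatter() above, e.g.
        #
        #     responses_formatter(["Paris", "Lyon"])   # -> "[1] Paris\n[2] Lyon"
        #
        # so feedback and rankings can refer back to candidates as [1], [2], ...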
302 | output_field_desc = list(verified_signature.output_fields.values())[ 303 | 0 304 | ].json_schema_extra["desc"] 305 | fuser_signature.output_fields["final_response"].json_schema_extra[ 306 | "desc" 307 | ] += f"{output_field_desc}" 308 | 309 | # add all inputfields from the original signature to the feedback_gen_signature 310 | for name, field in reversed(verified_signature.input_fields.items()): 311 | fuser_signature = fuser_signature.prepend(name, field) 312 | 313 | self.fuser = dspy.ChainOfThought(fuser_signature) 314 | 315 | def forward(self, formatted_responses: str, **kwargs): 316 | return self.fuser( 317 | task_instructions=self.instructions, responses=formatted_responses, **kwargs 318 | ) 319 | 320 | def get_response(self, formatted_responses: str, **kwargs) -> str: 321 | return self.forward(formatted_responses, **kwargs).final_response 322 | 323 | def get_prediction(self, formatted_responses: str, **kwargs) -> dspy.Prediction: 324 | final_response = self.get_response(formatted_responses, **kwargs) 325 | pred = dspy.Prediction() 326 | pred.__setattr__(list(self.signature.output_fields.keys())[0], final_response) 327 | return pred 328 | 329 | 330 | # TODO(shangyin) new adapters from Archon to be added: Verifier 331 | 332 | #################################### Archon Example Programs #################################### 333 | 334 | 335 | class GeneratorCriticRanker(LangProBeDSPyMetaProgram, dspy.Module): 336 | def __init__(self, signature, n=5): 337 | verified_signature = dspy.ensure_signature(signature) 338 | assert ( 339 | len(verified_signature.output_fields) == 1 340 | ), "ArchonExample only supports a single output field" 341 | self.signature = verified_signature 342 | 343 | self.generator = ArchonGenerator(self.signature, n) 344 | self.critic = ArchonCritic(self.signature, n) 345 | self.ranker = ArchonRanker(self.signature, n, use_critic=True) 346 | 347 | if n != 5: # override default name 348 | self._name = f"GeneratorCriticRanker{n}" 349 | 350 | def forward(self, **kwargs): 351 | responses = self.generator.get_responses(**kwargs) 352 | formatted_responses = responses_formatter(responses) 353 | feedback = self.critic.get_feedback(formatted_responses, **kwargs) 354 | return self.ranker.get_prediction(responses, feedback=feedback, **kwargs) 355 | 356 | 357 | class GeneratorCriticFuser(LangProBeDSPyMetaProgram, dspy.Module): 358 | def __init__(self, signature, n=5): 359 | verified_signature = dspy.ensure_signature(signature) 360 | assert ( 361 | len(verified_signature.output_fields) == 1 362 | ), "GeneratorCriticFuser only supports a single output field" 363 | self.signature = verified_signature 364 | 365 | self.generator = ArchonGenerator(self.signature, n) 366 | self.critic = ArchonCritic(self.signature, n) 367 | self.fuser = ArchonFuser(self.signature, use_critic=True) 368 | 369 | if n != 5: # override default name 370 | self._name = f"GeneratorCriticFuser{n}" 371 | 372 | def forward(self, **kwargs): 373 | formatted_responses = self.generator.get_formatted_responses(**kwargs) 374 | feedback = self.critic.get_feedback(formatted_responses, **kwargs) 375 | return self.fuser.get_prediction( 376 | formatted_responses, feedback=feedback, **kwargs 377 | ) 378 | 379 | 380 | class GeneratorRanker(LangProBeDSPyMetaProgram, dspy.Module): 381 | def __init__(self, signature, n=5): 382 | verified_signature = dspy.ensure_signature(signature) 383 | assert ( 384 | len(verified_signature.output_fields) == 1 385 | ), "GeneratorRanker only supports a single output field" 386 | self.signature 
= verified_signature 387 | 388 | self.generator = ArchonGenerator(self.signature, n) 389 | self.ranker = ArchonRanker(self.signature, use_critic=False) 390 | 391 | def forward(self, **kwargs): 392 | responses = self.generator.get_responses(**kwargs) 393 | return self.ranker.get_prediction(responses) 394 | 395 | 396 | class GeneratorFuser(LangProBeDSPyMetaProgram, dspy.Module): 397 | def __init__(self, signature, n=5): 398 | verified_signature = dspy.ensure_signature(signature) 399 | assert ( 400 | len(verified_signature.output_fields) == 1 401 | ), "GeneratorFuser only supports a single output field" 402 | self.signature = verified_signature 403 | 404 | self.generator = ArchonGenerator(self.signature, n) 405 | self.fuser = ArchonFuser(self.signature, use_critic=False) 406 | 407 | def forward(self, **kwargs): 408 | formatted_responses = self.generator.get_formatted_responses(**kwargs) 409 | return self.fuser.get_prediction(formatted_responses) 410 | 411 | 412 | if __name__ == "__main__": 413 | # Example usage 414 | dspy.configure( 415 | lm=dspy.LM("openai/gpt-4o-mini"), 416 | # example rm for RAG w. passages from wikipedia dump 417 | rm=dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts"), 418 | ) 419 | 420 | question = "What is the capital of France?" 421 | context = "France is a country in Europe." 422 | 423 | # CoT 424 | print("======== CoT =========") 425 | cot = CoT("question, context -> answer") 426 | cot(question=question, context=context) 427 | dspy.settings.lm.inspect_history() 428 | 429 | # RAG 430 | print("======== RAG =========") 431 | rag = RAG("question -> answer") 432 | rag(question=question) 433 | dspy.settings.lm.inspect_history() 434 | 435 | # SimplifiedBaleen 436 | print("======== SimplifiedBaleen =========") 437 | simplified_baleen = SimplifiedBaleen("question -> answer") 438 | simplified_baleen(question=question) 439 | dspy.settings.lm.inspect_history(n=3) 440 | 441 | # GeneratorCriticRanker 442 | print("======== GeneratorCriticRanker =========") 443 | archon_example = GeneratorCriticRanker("question -> answer") 444 | archon_example(question=question) 445 | dspy.settings.lm.inspect_history(n=3) 446 | 447 | # GeneratorRanker 448 | print("======== GeneratorRanker =========") 449 | generator_ranker = GeneratorRanker("question -> answer") 450 | generator_ranker(question=question) 451 | dspy.settings.lm.inspect_history(n=3) 452 | 453 | # GeneratorCriticFuser 454 | print("======== GeneratorCriticFuser =========") 455 | generator_critic_fuser = GeneratorCriticFuser("question -> answer") 456 | generator_critic_fuser(question=question) 457 | dspy.settings.lm.inspect_history(n=3) 458 | 459 | # GeneratorFuser 460 | print("======== GeneratorFuser =========") 461 | generator_fuser = GeneratorFuser("question -> answer") 462 | generator_fuser(question=question) 463 | dspy.settings.lm.inspect_history(n=3) 464 | -------------------------------------------------------------------------------- /langProBe/evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import os 4 | import pathlib 5 | import sys 6 | import time 7 | from contextlib import contextmanager 8 | from pathlib import Path 9 | 10 | import dspy 11 | 12 | from langProBe.analysis import read_evaluation_results 13 | from langProBe.benchmark import BenchmarkMeta, EvaluateBench, EvaluationResult 14 | from langProBe.config_utils import read_json, read_jsonl 15 | from langProBe.dspy_program import ( 16 | GeneratorCriticFuser, 17 | 
GeneratorCriticRanker, 18 | LangProBeDSPyMetaProgram, 19 | ) 20 | from langProBe.optimizers import create_optimizer, DEFAULT_OPTIMIZERS 21 | from langProBe.register_benchmark import register_all_benchmarks, registered_benchmarks 22 | from langProBe.evaluation_utils import find_missing_entries, replace_logger_filehandler 23 | 24 | 25 | class CompareAnswerSignature(dspy.Signature): 26 | """ 27 | Compare the answer to the ground truth answer. 28 | """ 29 | 30 | answer = dspy.InputField(desc="The answer to a problem") 31 | ground_truth = dspy.InputField(desc="The ground truth answer to the same problem") 32 | is_correct = dspy.OutputField( 33 | desc="Whether the answer is correct, either True or False." 34 | ) 35 | 36 | 37 | class CompareAnswer(dspy.Module): 38 | def __init__(self): 39 | self.compare_answer = dspy.ChainOfThought(CompareAnswerSignature) 40 | 41 | def forward(self, ground_truth, answer): 42 | pred = self.compare_answer(answer=answer, ground_truth=ground_truth) 43 | return pred 44 | 45 | 46 | def llm_as_judge_evaluate(gold, pred, extract_answer_fun=lambda x: x.answer): 47 | compare_answer = CompareAnswer() 48 | answer_raw = compare_answer( 49 | ground_truth=extract_answer_fun(gold), answer=extract_answer_fun(pred) 50 | ).is_correct 51 | if answer_raw.lower().startswith("true"): 52 | return True 53 | else: 54 | return False 55 | 56 | 57 | @contextmanager 58 | def suppress_output(suppress=True): 59 | if suppress: 60 | # Save the original streams 61 | original_stderr = sys.stderr 62 | original_stdout = sys.stdout 63 | 64 | # Redirect stderr and stdout to devnull 65 | sys.stderr = open(os.devnull, "w") 66 | sys.stdout = open(os.devnull, "w") 67 | 68 | try: 69 | yield 70 | finally: 71 | if suppress: 72 | # Restore the original streams 73 | sys.stderr.close() 74 | sys.stdout.close() 75 | sys.stderr = original_stderr 76 | sys.stdout = original_stdout 77 | 78 | 79 | def generate_evaluation_records(file_path): 80 | file_path = pathlib.Path(file_path) 81 | 82 | # if the records file already exists, do not overwrite it 83 | if (file_path / "evaluation_records.csv").exists(): 84 | return 85 | 86 | # List all .txt files in the directory 87 | all_result_files = list(file_path.rglob("*.txt")) 88 | 89 | records = [] 90 | 91 | # Process each file 92 | for file in all_result_files: 93 | # Split the filename to get benchmark, program, and optimizer 94 | file_name_parts = file.stem.split("_") 95 | if len(file_name_parts) >= 3: 96 | benchmark = file_name_parts[0] 97 | program = file_name_parts[1] 98 | optimizer = file_name_parts[2] 99 | records.append((benchmark, program, optimizer)) 100 | else: 101 | raise ValueError(f"Invalid file name: {file.name}") 102 | 103 | with open(f"{file_path}/evaluation_records.csv", "w") as f: 104 | f.write("benchmark,program,optimizer\n") 105 | for record in records: 106 | f.write(",".join(record) + "\n") 107 | 108 | 109 | def add_to_evaluation_records(file_path, evaluation_results: list[EvaluationResult]): 110 | file_path = pathlib.Path(file_path) 111 | 112 | with open(f"{file_path}/evaluation_records.csv", "a") as f: 113 | for evaluation_result in evaluation_results: 114 | f.write( 115 | f"{evaluation_result.benchmark},{evaluation_result.program},{evaluation_result.optimizer}\n" 116 | ) 117 | 118 | 119 | def read_evaluation_records(file_path): 120 | file_path = pathlib.Path(file_path) 121 | records = [] 122 | 123 | # create the records file if it does not exist 124 | if not (file_path / "evaluation_records.csv").exists(): 125 | # create empty records file without 
header 126 | with open(f"{file_path}/evaluation_records.csv", "w") as f: 127 | f.write("") 128 | with open(f"{file_path}/evaluation_records.csv", "r") as f: 129 | lines = f.readlines() 130 | for line in lines[1:]: 131 | records.append(tuple(line.strip().split(","))) 132 | 133 | return records 134 | 135 | 136 | def evaluate( 137 | benchmark_meta: BenchmarkMeta, 138 | lm, 139 | file_path, 140 | num_threads=8, 141 | suppress_dspy_output=True, 142 | dataset_mode=None, 143 | dataset_path=None, 144 | missing_mode_file="", 145 | api_key=None, 146 | api_base=None, 147 | ): 148 | """ 149 | benchmark_meta: BenchmarkMeta object to evaluate 150 | lm: Language model to use, should be an instance of dspy.LM 151 | missing_mode: only evaluate experiments without a result file 152 | """ 153 | dataset_mode = dataset_mode or benchmark_meta.dataset_mode 154 | 155 | if missing_mode_file: 156 | origin_data = read_jsonl(dataset_path) 157 | runed_data = read_jsonl(missing_mode_file) 158 | missing_data = find_missing_entries(origin_data, runed_data) 159 | benchmark = benchmark_meta.benchmark(dataset_mode=dataset_mode, missing_data=missing_data) 160 | replace_logger_filehandler(os.path.splitext(missing_mode_file)[0]) 161 | else: 162 | benchmark = benchmark_meta.benchmark(dataset_mode=dataset_mode, dataset_path=dataset_path) 163 | # Canonicalize optimizers to (optimizer, compile_kwargs) tuples 164 | benchmark_name = benchmark_meta.name or benchmark.__class__.__name__ 165 | 166 | num_threads = benchmark_meta.num_threads or num_threads 167 | print(f"Evaluating {benchmark_name}") 168 | print(f"num_threads: {num_threads}") 169 | print(f"Test set size: {len(benchmark.test_set)}") 170 | 171 | 172 | Path(file_path).mkdir(parents=True, exist_ok=True) 173 | 174 | evaluation_records = read_evaluation_records(file_path) 175 | 176 | # create a stats file for each experiment 177 | stats_file = os.path.join(file_path, f"{benchmark_name}.stat") 178 | with open(stats_file, "w") as f: 179 | f.write( 180 | f"benchmark: {benchmark_name}\n" 181 | f"lm: {lm}\n" 182 | f"test_set_size: {len(benchmark.test_set)}\n" 183 | ) 184 | 185 | for program in benchmark_meta.program: 186 | program_name = getattr(program, "_name", program.__class__.__name__) 187 | 188 | print(f"Program: {program_name}") 189 | 190 | with suppress_output(suppress=suppress_dspy_output): 191 | evaluate_bench = EvaluateBench( 192 | benchmark=benchmark, 193 | program=program, 194 | metric=benchmark_meta.metric, 195 | lm=lm, 196 | benchmark_name=benchmark_meta.name, 197 | num_threads=num_threads, 198 | api_key=api_key if api_key else os.getenv("OPENAI_API_KEY", ""), 199 | api_base=api_base if api_base else os.getenv("OPENAI_API_BASE", ""), 200 | ) 201 | evaluate_bench.evaluate() 202 | # print(f"Results: {evaluate_bench.results}") 203 | 204 | # if missing_mode: 205 | # add_to_evaluation_records(file_path, evaluate_bench.results) 206 | evaluation_result = evaluate_bench.results 207 | 208 | file_name = f"{evaluation_result.benchmark}_{evaluation_result.program}" 209 | with open(os.path.join(file_path, f"{file_name}.txt"), "w") as f: 210 | f.write(f"score,cost,input_tokens,output_tokens\n") 211 | f.write( 212 | f"{evaluation_result.score},{evaluation_result.cost},{evaluation_result.input_tokens}," 213 | f"{evaluation_result.output_tokens}\n" 214 | ) 215 | 216 | 217 | def evaluate_all( 218 | benchmarks, 219 | lm, 220 | file_path, 221 | num_threads=8, 222 | suppress_dspy_output=False, 223 | dataset_mode=None, 224 | dataset_path=None, 225 | missing_mode_file="", 226 | api_key=None, 
227 |     api_base=None,
228 | ):
229 |     # Only register when benchmarks is a list of strings
230 |     if benchmarks and isinstance(benchmarks[0], str):
231 |         benchmarks = register_all_benchmarks(benchmarks)
232 | 
233 |     for benchmark_meta in benchmarks:
234 |         evaluate(
235 |             benchmark_meta,
236 |             lm,
237 |             file_path,
238 |             num_threads,
239 |             suppress_dspy_output,
240 |             dataset_mode,
241 |             dataset_path,
242 |             missing_mode_file,
243 |             api_key=api_key,
244 |             api_base=api_base,
245 |         )
246 | 
247 |     df = read_evaluation_results(file_path)
248 |     df["model"] = lm  # record the model before writing, so the column is saved
249 |     df.to_csv(f"{file_path}/evaluation_results.csv", index=False)
250 | 
251 |     # generate evaluation records
252 |     generate_evaluation_records(file_path)
253 | 
254 | global_config = None
255 | def main():
256 |     import multiprocessing
257 |     multiprocessing.freeze_support()
258 | 
259 |     parser = argparse.ArgumentParser(description="LangProbe benchmark evaluation")
260 |     parser.add_argument("--benchmark", type=str, required=True, help="Benchmark to evaluate")
261 |     parser.add_argument("--lm", type=str, required=True, help="Language model to use")
262 |     parser.add_argument("--lm_api_key", type=str, help="API key for language model")
263 |     parser.add_argument(
264 |         "--lm_api_base", type=str, help="API base for language model"
265 |     )
266 |     parser.add_argument(
267 |         "--dataset_mode", type=str, help="Dataset mode (train, val, test)"
268 |     )
269 |     parser.add_argument(
270 |         "--dataset_path", type=str, help="Dataset path"
271 |     )
272 |     parser.add_argument(
273 |         "--num_threads", type=int, default=8, help="Number of threads to use"
274 |     )
275 |     parser.add_argument(
276 |         "--file_path", type=str, default="evaluation", help="File path for evaluation results"
277 |     )
278 |     parser.add_argument(
279 |         "--suppress_dspy_output",
280 |         action="store_true",
281 |         help="Suppress dspy output",
282 |     )
283 |     parser.add_argument(
284 |         "--missing_mode_file",
285 |         type=str,
286 |         default="",
287 |         help="Only run missing experiments (skip experiments that already have results), value = path to log/jsonl",
288 |     )
289 |     parser.add_argument(
290 |         "--config",
291 |         type=str,
292 |         default='ddgo.json',
293 |         help="Configuration file for the benchmark",
294 |     )
295 | 
296 |     args = parser.parse_args()
297 | 
298 |     global global_config
299 |     global_config = read_json(args.config)
300 |     # Process the benchmark argument
301 |     benchmark_path = args.benchmark
302 |     if not benchmark_path.startswith("langProBe."):
303 |         benchmark_path = f"langProBe.{benchmark_path}"
304 | 
305 |     # Register all benchmarks
306 |     register_all_benchmarks([benchmark_path])
307 | 
308 |     benchmarks = [benchmark for benchmark in registered_benchmarks]
309 |     if not benchmarks:
310 |         print(f"No benchmark registered with name {args.benchmark}")
311 |         sys.exit(1)
312 | 
313 |     evaluate_all(
314 |         benchmarks,
315 |         args.lm,
316 |         args.file_path,
317 |         num_threads=args.num_threads,
318 |         suppress_dspy_output=args.suppress_dspy_output,
319 |         dataset_mode=args.dataset_mode,
320 |         dataset_path=args.dataset_path,
321 |         missing_mode_file=args.missing_mode_file,
322 |         api_key=args.lm_api_key,
323 |         api_base=args.lm_api_base,
324 |     )
325 | 
326 | if __name__ == "__main__":
327 |     main()
328 | 
--------------------------------------------------------------------------------
/langProBe/evaluation_utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import dspy
3 | from typing import List, Tuple, Optional
4 | from langProBe.program_utils import call_lm, ProcessManager
5 | import langProBe.constants as constants
6 | import logging
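# Illustrative aside (not from the original source): EVALUATE_PROMPT below is a
# strict LLM-as-judge template; evaluate_final_answer() fills it in and treats
# any reply containing "true" (case-insensitive) as a pass, e.g.
#
#     prompt = EVALUATE_PROMPT.format(
#         question="Who wrote Faust?",
#         prediction="Johann Wolfgang von Goethe",
#         ground_truth="Goethe",
#     )
#     # sent via call_lm(..., temperature=0.01) to keep the verdict near-deterministic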
7 | import re
8 | import string
9 | import warnings
10 | import os
11 | import logging
12 | import numpy as np
13 | 
14 | 
15 | EVALUATE_PROMPT = """For the following question: {question}
16 | 
17 | Judge whether the predicted answer is correct; it counts as correct if it gets the key information right:
18 | 
19 | Predicted answer: {prediction}
20 | Ground-truth answer: {ground_truth}
21 | 
22 | Return only True or False."""
23 | 
24 | def evaluate_final_answer(
25 |     question: str,
26 |     ground_truth: str,
27 |     prediction: str,
28 |     manager: ProcessManager,
29 |     logger: logging.Logger,
30 | ) -> bool:
31 |     prompt = EVALUATE_PROMPT.format(question=question, prediction=prediction, ground_truth=ground_truth)
32 |     messages = [
33 |         {
34 |             constants.ROLE: constants.USER,
35 |             constants.CONTENT: prompt
36 |         }
37 |     ]
38 |     logger.info("Starting evaluation of the final answer")
39 |     logger.info(f"question: {question}")
40 |     logger.info(f"ground_truth: {ground_truth}")
41 |     logger.info(f"prediction: {prediction}")
42 |     response_content, _, _ = call_lm(messages, manager, logger, temperature=0.01)
43 |     return "true" in response_content.lower()
44 | 
45 | 
46 | def normalize_number_str(number_str: str) -> float:
47 |     # we replace these common units and commas to allow
48 |     # conversion to float
49 |     for char in ["$", "%", ","]:
50 |         number_str = number_str.replace(char, "")
51 |     try:
52 |         return float(number_str)
53 |     except ValueError:
54 |         print(f"String {number_str} cannot be normalized to number str.")
55 |         return float("inf")
56 | 
57 | 
58 | def split_string(
59 |     s: str,
60 |     char_list: list[str] = [",", ";"],
61 | ) -> list[str]:
62 |     pattern = f"[{''.join(char_list)}]"
63 |     return re.split(pattern, s)
64 | 
65 | def normalize_str(input_str, remove_punct=True) -> str:
66 |     """
67 |     Normalize a string by:
68 |     - Removing all white spaces
69 |     - Optionally removing punctuation (if remove_punct is True)
70 |     - Converting to lowercase
71 |     Parameters:
72 |     - input_str: str, the string to normalize
73 |     - remove_punct: bool, whether to remove punctuation (default: True)
74 |     Returns:
75 |     - str, the normalized string
76 |     """
77 |     # Remove all white spaces. Required e.g. for seagull vs. sea gull
78 |     no_spaces = re.sub(r"\s", "", input_str)
79 | 
80 |     # Remove punctuation, if specified.
81 |     if remove_punct:
82 |         translator = str.maketrans("", "", string.punctuation)
83 |         return no_spaces.lower().translate(translator)
84 |     else:
85 |         return no_spaces.lower()
86 | 
87 | 
88 | def question_scorer(
89 |     model_answer: str,
90 |     ground_truth: str,
91 |     logger: logging.Logger
92 | ) -> bool:
93 |     def is_float(element: any) -> bool:
94 |         try:
95 |             float(element)
96 |             return True
97 |         except ValueError:
98 |             return False
99 | 
100 |     if model_answer is None:
101 |         model_answer = "None"
102 |         logger.debug("Model answer is None. Converted to string 'None'.")
103 | 
104 |     # If ground truth is a number
105 |     if is_float(ground_truth):
106 |         info = f"Evaluating '{model_answer}' as a number."
107 |         logger.info(info)
108 |         normalized_answer = normalize_number_str(model_answer)
109 |         try:
110 |             result = normalized_answer == float(ground_truth)
111 |             logger.debug(f"Normalized model answer: {normalized_answer}, Ground truth: {ground_truth}, Result: {result}")
112 |             return result
113 |         except ValueError as e:
114 |             error_msg = f"Normalization error: {e}"
115 |             logger.error(error_msg)
116 |             return False
117 | 
118 |     # If ground truth is a list
119 |     elif any(char in ground_truth for char in [",", ";"]):
120 |         info = f"Evaluating '{model_answer}' as a comma/semi-colon separated list."
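        # Worked example (illustrative, not from the original source): for ground
        # truth "2, apple, 3.5" and model answer "2.0; Apple; 3.5", both sides split
        # into three elements; "2.0" matches 2.0 numerically, "Apple" matches
        # "apple" after whitespace/lowercase normalization, and "3.5" matches 3.5,
        # so this branch scores the answer as correct.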
121 |         logger.info(info)
122 | 
123 |         gt_elems = split_string(ground_truth)
124 |         ma_elems = split_string(model_answer)
125 |         logger.debug(f"Ground truth elements: {gt_elems}")
126 |         logger.debug(f"Model answer elements: {ma_elems}")
127 | 
128 |         # Check if lengths are the same
129 |         if len(gt_elems) != len(ma_elems):
130 |             warning_msg = "Answer lists have different lengths."
131 |             logger.warning(warning_msg)
132 |             return False
133 | 
134 |         # Compare each element as float or string
135 |         comparisons = []
136 |         for idx, (ma_elem, gt_elem) in enumerate(zip(ma_elems, gt_elems), start=1):
137 |             if is_float(gt_elem):
138 |                 try:
139 |                     normalized_ma_elem = normalize_number_str(ma_elem)
140 |                     comparison = normalized_ma_elem == float(gt_elem)
141 |                     logger.debug(f"Element {idx}: Normalized model answer element '{normalized_ma_elem}' == Ground truth element '{float(gt_elem)}': {comparison}")
142 |                 except ValueError as e:
143 |                     error_msg = f"Normalization error at element {idx}: {e}"
144 |                     logger.error(error_msg)
145 |                     return False
146 |             else:
147 |                 normalized_ma = normalize_str(ma_elem, remove_punct=False)
148 |                 normalized_gt = normalize_str(gt_elem, remove_punct=False)
149 |                 comparison = normalized_ma == normalized_gt
150 |                 logger.debug(f"Element {idx}: Normalized model answer element '{normalized_ma}' == Ground truth element '{normalized_gt}': {comparison}")
151 |             comparisons.append(comparison)
152 | 
153 |         all_correct = all(comparisons)
154 |         if not all_correct:
155 |             detail_msg = "Mismatch found in list elements."
156 |             logger.info(detail_msg)
157 |             return all_correct
158 |         logger.debug("All list elements match the ground truth.")
159 |         return all_correct
160 | 
161 |     # If ground truth is a string
162 |     else:
163 |         info = f"Evaluating '{model_answer}' as a string."
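        # Worked example (illustrative, not from the original source): "Sea gull!"
        # and "seagull" both normalize to "seagull" (whitespace stripped,
        # punctuation removed, lowercased), so this branch treats them as equal.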
164 |         logger.info(info)
165 |         normalized_ma = normalize_str(model_answer)
166 |         normalized_gt = normalize_str(ground_truth)
167 |         result = normalized_ma == normalized_gt
168 |         logger.debug(f"Normalized model answer: '{normalized_ma}' == Normalized ground truth: '{normalized_gt}': {result}")
169 |         return result
170 | 
171 | def mcp_metric(example: dspy.Example, pred: dspy.Prediction):
172 |     return pred.success
173 | 
174 | 
175 | 
176 | def extract_questions(data, key):
177 |     """Extract the given field (e.g. Prompt or question) from the data for comparison."""
178 |     questions = set()
179 |     for item in data:
180 |         questions.add(item[key])
181 |     return questions
182 | 
183 | def find_missing_entries(data_a, data_b):
184 |     # data_a is the original data; data_b is the data that has already been run
185 | 
186 |     questions_in_b = extract_questions(data_b, 'question')
187 | 
188 |     # Find the entries in A that do not appear in B
189 |     missing_entries = [item for item in data_a if item['Prompt'] not in questions_in_b]
190 | 
191 |     return missing_entries
192 | 
193 | 
194 | 
195 | 
196 | 
197 | 
198 | def replace_logger_filehandler(new_log_name):
199 |     """
200 |     Replace the existing FileHandlers on the loggers, keeping each logger's original formatter.
201 |     Also delete the old log files.
202 | 
203 |     :param new_log_name: the new log file name (without extension)
204 |     """
205 | 
206 |     def update_handler(logger, file_suffix):
207 |         old_log_paths = []
208 |         formatter = None
209 |         for handler in logger.handlers:
210 |             if isinstance(handler, logging.FileHandler):
211 |                 if formatter is None:
212 |                     formatter = handler.formatter
213 |                 old_log_paths.append(handler.baseFilename)
214 | 
215 |         for handler in list(logger.handlers):
216 |             if isinstance(handler, logging.FileHandler):
217 |                 handler.close()
218 |                 logger.removeHandler(handler)
219 | 
220 |         for log_path in old_log_paths:
221 |             if os.path.exists(log_path):
222 |                 try:
223 |                     os.remove(log_path)
224 |                 except Exception:
225 |                     pass
226 | 
227 |         if logger.name == 'MCPPredictRunLogger':
228 |             new_name = new_log_name.replace("message", "run")
229 |         else:
230 |             new_name = new_log_name
231 | 
232 |         new_handler = logging.FileHandler(f"{new_name}.{file_suffix}", mode='a', encoding='utf-8')
233 |         if formatter:
234 |             new_handler.setFormatter(formatter)
235 |         logger.addHandler(new_handler)
236 | 
237 |     run_logger = logging.getLogger('MCPPredictRunLogger')
238 |     update_handler(run_logger, 'log')
239 | 
240 |     message_logger = logging.getLogger('MCPPredictMessageLogger')
241 |     update_handler(message_logger, 'jsonl')
242 | 
243 | 
244 | 
245 | if __name__ == "__main__":
246 |     print(question_scorer("123", "123", logging.getLogger(__name__)))
--------------------------------------------------------------------------------
/langProBe/langchain_program.py:
--------------------------------------------------------------------------------
1 | from langchain.chains import LLMChain
2 | from langchain.prompts import PromptTemplate
3 | from langchain_community.chat_models import ChatLiteLLM
4 | 
5 | from langProBe.program_utils import DotDict
6 | 
7 | 
8 | class LangProBeLangChainMetaProgram:
9 |     def __init__(self, input_kwargs, output_kwargs):
10 |         self.lm = None
11 |         self.input_kwargs = input_kwargs
12 |         self.out_kwargs = output_kwargs
13 | 
14 |     def setup_lm(self, lm: str, api_key: str = None, api_base: str = None):
15 |         self.lm = ChatLiteLLM(model=lm, api_key=api_key, api_base=api_base)
16 | 
17 | 
18 | class NaiveLangChainProgram(LangProBeLangChainMetaProgram):
19 |     def __call__(self, **kwargs):
20 |         if not self.lm:
21 |             raise ValueError("Language model not initialized.
Call setup_lm() first.") 22 | 23 | # Validate input keys 24 | missing_keys = [key for key in self.input_kwargs if key not in kwargs] 25 | if missing_keys: 26 | raise ValueError(f"Missing required inputs: {missing_keys}") 27 | 28 | # Dynamically generate prompt template 29 | prompt_text = "Given the following inputs:\n" 30 | for key in self.input_kwargs: 31 | prompt_text += f"- {key}: {{{key}}}\n" 32 | prompt_text += f"Output the following field: {self.out_kwargs[0]}. Your response should be this output field only, with no explanation and formatting.\n Your response:" 33 | 34 | prompt_template = PromptTemplate( 35 | input_variables=self.input_kwargs, template=prompt_text 36 | ) 37 | 38 | # Create LLM chain 39 | chain = LLMChain(llm=self.lm, prompt=prompt_template) 40 | 41 | # Run the chain 42 | response = chain.run(kwargs) 43 | 44 | # Format output 45 | return DotDict({self.out_kwargs[0]: response}) 46 | -------------------------------------------------------------------------------- /langProBe/mcp_program.py: -------------------------------------------------------------------------------- 1 | import dspy 2 | from pydantic import BaseModel, Field 3 | from langProBe.program_utils import ( 4 | call_lm, 5 | build_init_messages, 6 | build_messages, 7 | response_parsing, 8 | mcp_calling, 9 | ProcessManager 10 | ) 11 | import time 12 | from langProBe.evaluation_utils import evaluate_final_answer 13 | import langProBe.constants as constants 14 | import logging 15 | import os 16 | from datetime import datetime 17 | import json 18 | from typing import List, Dict, Optional, Tuple 19 | 20 | 21 | MCP_SAMPLE_SYSTEM_PROMPT = """ 22 | You are a helpful assistant. You are able to answer questions using different tools. 23 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools. 24 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server. 25 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool. 26 | The tool description includes: 27 | A brief text description outlining the functionality of the tool. 28 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter. 
29 | """ 30 | 31 | class MCP_LM(BaseModel): 32 | model: str = Field( 33 | default=None, 34 | description="The model to use for the MCP program.", 35 | ) 36 | api_key: str = Field( 37 | default=None, 38 | description="The API key for the model.", 39 | ) 40 | api_base: str = Field( 41 | default=None, 42 | description="The API base URL for the model.", 43 | ) 44 | 45 | class LangProBeMCPMetaProgram(dspy.Module): 46 | def __init__(self): 47 | super().__init__() 48 | self.lm = MCP_LM() 49 | def setup_lm(self, lm, api_key=None, api_base=None): 50 | self.lm.model = lm 51 | self.lm.api_key = api_key 52 | self.lm.api_base = api_base 53 | 54 | def program_type(self): 55 | return "mcp" 56 | 57 | 58 | class MCPPredict(LangProBeMCPMetaProgram, dspy.Module): 59 | def __init__(self, max_steps=5, system_prompt=MCP_SAMPLE_SYSTEM_PROMPT, task_name="mcp_sample"): 60 | super().__init__() 61 | self.system_prompt = system_prompt 62 | self.task_name = task_name 63 | self.max_steps = max_steps 64 | self.max_length = 30000 65 | 66 | # 配置运行日志记录器 67 | self.run_logger = logging.getLogger('MCPPredictRunLogger') 68 | self.run_logger.setLevel(logging.INFO) 69 | 70 | # 配置消息日志记录器 71 | self.message_logger = logging.getLogger('MCPPredictMessageLogger') 72 | self.message_logger.setLevel(logging.INFO) 73 | 74 | # 创建日志目录 75 | os.makedirs('logs', exist_ok=True) 76 | self.setup_loggers() 77 | 78 | def setup_loggers(self): 79 | timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') 80 | 81 | # 设置运行日志 82 | run_log_file = f'logs/{self.task_name}_run_{timestamp}.log' 83 | run_handler = logging.FileHandler(run_log_file, encoding='utf-8') 84 | run_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 85 | run_handler.setFormatter(run_formatter) 86 | self.run_logger.addHandler(run_handler) 87 | 88 | # 设置消息日志 89 | message_log_file = f'logs/{self.task_name}_messages_{timestamp}.jsonl' 90 | message_handler = logging.FileHandler(message_log_file, encoding='utf-8') 91 | self.message_logger.addHandler(message_handler) 92 | 93 | 94 | def update_log_paths(self, new_log_dir): 95 | # 确保新的日志目录存在 96 | os.makedirs(new_log_dir, exist_ok=True) 97 | 98 | # 更新运行日志记录器 99 | for handler in self.run_logger.handlers[:]: 100 | self.run_logger.removeHandler(handler) 101 | 102 | run_log_file = f'{new_log_dir}/{self.task_name}_run_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log' 103 | run_handler = logging.FileHandler(run_log_file, encoding='utf-8') 104 | run_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 105 | run_handler.setFormatter(run_formatter) 106 | self.run_logger.addHandler(run_handler) 107 | 108 | # 更新消息日志记录器 109 | for handler in self.message_logger.handlers[:]: 110 | self.message_logger.removeHandler(handler) 111 | 112 | message_log_file = f'{new_log_dir}/{self.task_name}_messages_{datetime.now().strftime("%Y%m%d_%H%M%S")}.jsonl' 113 | message_handler = logging.FileHandler(message_log_file, encoding='utf-8') 114 | self.message_logger.addHandler(message_handler) 115 | 116 | def evaluate_prediction(self, question: str, ground_truth: str, prediction: str) -> Tuple[bool, Optional[str]]: 117 | answer_eval_manager = ProcessManager() 118 | answer_eval_manager.lm_api_key = self.lm.api_key 119 | answer_eval_manager.lm_api_base = self.lm.api_base 120 | answer_eval_manager.model = "openai/deepseek-v3" 121 | return evaluate_final_answer(question, ground_truth, prediction, answer_eval_manager, self.run_logger) 122 | 123 | def log_messages(self, messages, question, success, time_cost, 
prompt_tokens_cost, completion_tokens_cost):
124 |         log_entry = {
125 |             "question": question,
126 |             "messages": messages,
127 |             "success": success,
128 |             "time_cost": time_cost,
129 |             "prompt_tokens_cost": prompt_tokens_cost,
130 |             "completion_tokens_cost": completion_tokens_cost
131 |         }
132 |         self.message_logger.info(json.dumps(log_entry, ensure_ascii=False))
133 | 
134 | 
135 |     def forward(self, **kwargs) -> dspy.Prediction:
136 |         unique_id = kwargs.get('id')
137 |         question = kwargs.get('question')
138 |         gt = kwargs.get('answer')
139 | 
140 |         manager = ProcessManager()
141 |         manager.lm_api_key = self.lm.api_key
142 |         manager.lm_api_base = self.lm.api_base
143 |         manager.model = self.lm.model
144 |         manager.id = unique_id
145 | 
146 |         self.run_logger.info(f"ID: {manager.id}, Starting forward pass for question: {question}")
147 | 
148 | 
149 |         from langProBe.evaluation import global_config
150 |         mcps = global_config['mcp_pool']
151 | 
152 |         messages = build_init_messages(self.system_prompt, mcps, question)
153 |         steps = 0
154 |         all_completion_tokens = 0
155 |         all_prompt_tokens = 0
156 |         start_time = time.time()
157 | 
158 |         while messages[-1][constants.ROLE] != constants.ASSISTANT and steps < self.max_steps:
159 |             response, completion_tokens, prompt_tokens = call_lm(messages, manager, self.run_logger)
160 |             all_completion_tokens += completion_tokens
161 |             all_prompt_tokens += prompt_tokens
162 |             mcp_calls = response_parsing(response)
163 | 
164 |             new_messages = mcp_calling(mcp_calls, manager, self.run_logger)
165 | 
166 |             messages = build_messages(messages, new_messages)
167 |             steps += 1
168 | 
169 |         end_time = time.time()
170 | 
171 |         # If the maximum number of steps was reached without getting an answer
172 |         if messages[-1][constants.ROLE] != constants.ASSISTANT:
173 |             self.run_logger.warning("Maximum steps reached without getting an answer")
174 |             messages.append({
175 |                 constants.ROLE: constants.ASSISTANT,
176 |                 constants.CONTENT: "Exceeded the maximum number of steps; this question could not be solved.",
177 |             })
178 | 
179 | 
180 |         self.run_logger.info(f"ID: {manager.id}, Forward pass completed successfully")
181 |         success = self.evaluate_prediction(question, gt, messages[-1][constants.CONTENT])
182 |         self.log_messages(messages, question, success, (end_time - start_time), all_prompt_tokens, all_completion_tokens)
183 |         self.run_logger.info(f"ID: {manager.id}, Evaluation completed successfully")
184 |         # self.run_logger.info("==" * 50)
185 | 
186 |         return dspy.Prediction(
187 |             success=success,
188 |             question=question,
189 |             ground_truth=gt,
190 |             answer=messages[-1][constants.CONTENT],
191 |             trace=messages,
192 |             process_report=manager
193 |         )
--------------------------------------------------------------------------------
/langProBe/optimizers.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import random
3 | from dataclasses import dataclass
4 | from functools import partial
5 | from typing import Callable, Type
6 | 
7 | import dspy
8 | import dspy.teleprompt
9 | import numpy as np
10 | from dspy.evaluate.evaluate import Evaluate
11 | from dspy.teleprompt import BootstrapFewShot
12 | 
13 | 
14 | class BootstrapFewShotInfer(BootstrapFewShot):
15 |     def __init__(
16 |         self,
17 |         num_candidates=5,
18 |         num_rules=5,
19 |         num_threads=8,
20 |         teacher_settings=None,
21 |         **kwargs,
22 |     ):
23 |         super().__init__(teacher_settings=teacher_settings, **kwargs)
24 |         self.num_candidates = num_candidates
25 |         self.num_rules = num_rules
26 |         self.num_threads = num_threads
27 |         self.rules_induction_program = RulesInductionProgramINFER(
28 |             num_rules,
teacher_settings=teacher_settings 29 | ) 30 | self.metric = kwargs.get("metric") 31 | self.max_errors = kwargs.get("max_errors", 5) 32 | 33 | def compile(self, student, *, teacher=None, trainset, valset=None): 34 | super().compile(student, teacher=teacher, trainset=trainset) 35 | if valset is None: 36 | train_size = int(0.8 * len(trainset)) 37 | trainset, valset = trainset[:train_size], trainset[train_size:] 38 | original_program = copy.deepcopy(self.student) 39 | all_predictors = [ 40 | p for p in original_program.predictors() if hasattr(p, "signature") 41 | ] 42 | instructions_list = [p.signature.instructions for p in all_predictors] 43 | 44 | best_score = -np.inf 45 | best_program = None 46 | 47 | for candidate_idx in range(self.num_candidates): 48 | candidate_program = copy.deepcopy(original_program) 49 | candidate_predictors = [ 50 | p for p in candidate_program.predictors() if hasattr(p, "signature") 51 | ] 52 | for i, predictor in enumerate(candidate_predictors): 53 | predictor.signature.instructions = instructions_list[i] 54 | for i, predictor in enumerate(candidate_predictors): 55 | rules = self.induce_natural_language_rules(predictor, trainset) 56 | predictor.signature.instructions = instructions_list[i] 57 | self.update_program_instructions(predictor, rules) 58 | score = self.evaluate_program(candidate_program, valset) 59 | if score > best_score: 60 | best_score = score 61 | best_program = candidate_program 62 | print( 63 | f"New best candidate (Candidate {candidate_idx+1}) with score {best_score}" 64 | ) 65 | print("Final best score:", best_score) 66 | self.student = best_program 67 | return best_program 68 | 69 | def induce_natural_language_rules(self, predictor, trainset): 70 | demos = self.get_predictor_demos(trainset, predictor) 71 | signature = predictor.signature 72 | while True: 73 | examples_text = self.format_examples(demos, signature) 74 | try: 75 | natural_language_rules = self.rules_induction_program(examples_text) 76 | break 77 | except Exception as e: 78 | print("entereing here") 79 | print(len(demos)) 80 | 81 | if ( 82 | isinstance(e, ValueError) 83 | or e.__class__.__name__ == "BadRequestError" 84 | or "ContextWindowExceededError" in str(e) 85 | ): 86 | if len(demos) > 1: 87 | demos = demos[:-1] 88 | else: 89 | natural_language_rules = "" 90 | raise RuntimeError( 91 | "Failed to generate natural language rules: A single example could not fit in context." 
92 | ) from e 93 | return natural_language_rules 94 | 95 | def update_program_instructions(self, predictor, natural_language_rules): 96 | predictor.signature.instructions = ( 97 | f"{predictor.signature.instructions}\n\n" 98 | f"Please apply the following rules when making your prediction:\n{natural_language_rules}" 99 | ) 100 | 101 | def format_examples(self, demos, signature): 102 | examples_text = "" 103 | for demo in demos: 104 | input_fields = { 105 | k: v for k, v in demo.items() if k in signature.input_fields 106 | } 107 | output_fields = { 108 | k: v for k, v in demo.items() if k in signature.output_fields 109 | } 110 | input_text = "\n".join(f"{k}: {v}" for k, v in input_fields.items()) 111 | output_text = "\n".join(f"{k}: {v}" for k, v in output_fields.items()) 112 | examples_text += f"Example:\n{input_text}\n{output_text}\n\n" 113 | return examples_text 114 | 115 | def get_predictor_demos(self, trainset, predictor): 116 | signature = predictor.signature 117 | return [ 118 | { 119 | key: value 120 | for key, value in example.items() 121 | if key in signature.input_fields or key in signature.output_fields 122 | } 123 | for example in trainset 124 | ] 125 | 126 | def evaluate_program(self, program, dataset): 127 | evaluate = Evaluate( 128 | devset=dataset, 129 | metric=self.metric, 130 | num_threads=self.num_threads, 131 | max_errors=self.max_errors, 132 | display_table=False, 133 | display_progress=True, 134 | return_all_scores=True, 135 | ) 136 | score, _ = evaluate(program, metric=self.metric) 137 | return score 138 | 139 | 140 | class RulesInductionProgramINFER(dspy.Module): 141 | def __init__(self, num_rules, teacher_settings=None, verbose=False): 142 | super().__init__() 143 | docstring = f"""Given a set of examples, extract a set of {num_rules} concise and non-redundant natural language rules that explain the patterns in the data. 
These rules should be specific and actionable, providing clear guidance for performing the task.""" 144 | 145 | class CustomRulesInduction(dspy.Signature): 146 | __doc__ = docstring 147 | examples_text = dspy.InputField(desc="Text containing examples") 148 | natural_language_rules = dspy.OutputField( 149 | desc="Induced natural language rules" 150 | ) 151 | 152 | self.rules_induction = dspy.ChainOfThought(CustomRulesInduction) 153 | self.verbose = verbose 154 | self.teacher_settings = teacher_settings or {} 155 | 156 | def forward(self, examples_text): 157 | original_temp = dspy.settings.lm.kwargs.get("temperature", 0.7) 158 | if self.teacher_settings: 159 | with dspy.settings.context(**self.teacher_settings): 160 | print("Using teacher settings") 161 | print(dspy.settings.lm.model) 162 | dspy.settings.lm.kwargs["temperature"] = random.uniform(0.9, 1.0) 163 | print(dspy.settings.lm.kwargs["temperature"]) 164 | prediction = self.rules_induction(examples_text=examples_text) 165 | else: 166 | # print('Using default DSPy settings') 167 | # print(dspy.settings.lm.model) 168 | dspy.settings.lm.kwargs["temperature"] = random.uniform(0.9, 1.0) 169 | prediction = self.rules_induction(examples_text=examples_text) 170 | dspy.settings.lm.kwargs["temperature"] = original_temp 171 | natural_language_rules = prediction.natural_language_rules.strip() 172 | if self.verbose: 173 | print(natural_language_rules) 174 | return natural_language_rules 175 | 176 | 177 | @dataclass 178 | class OptimizerConfig: 179 | optimizer: Type[dspy.teleprompt.Teleprompter] 180 | init_args: dict 181 | compile_args: dict 182 | langProBe_configs: dict 183 | name: str 184 | 185 | def __str__(self): 186 | return f""" 187 | [[ 188 | Optimizer: {self.name} ({self.optimizer}) 189 | init_args: {self.init_args} 190 | compile_args: {self.compile_args} 191 | langProBe_configs: {self.langProBe_configs} 192 | ]] 193 | """ 194 | 195 | def __repr__(self): 196 | return self.__str__() 197 | 198 | 199 | # Optimizer configuration formats: 200 | DEFAULT_OPTIMIZERS = [ 201 | OptimizerConfig( 202 | optimizer=dspy.teleprompt.BootstrapFewShot, 203 | init_args=dict(max_errors=5000, max_labeled_demos=2), 204 | compile_args=dict(), 205 | langProBe_configs=dict(use_valset=False, save_candidate_score=False), 206 | name="BootstrapFewShot", 207 | ), 208 | OptimizerConfig( 209 | optimizer=dspy.teleprompt.BootstrapFewShotWithRandomSearch, 210 | init_args=dict(max_errors=5000, max_labeled_demos=2, num_threads=16), 211 | compile_args=dict(), 212 | langProBe_configs=dict(use_valset=True, save_candidate_score=True), 213 | name="BootstrapFewShotWithRandomSearch", 214 | ), 215 | OptimizerConfig( 216 | optimizer=dspy.teleprompt.MIPROv2, 217 | init_args=dict(max_errors=5000, auto="medium", num_threads=16), 218 | compile_args=dict( 219 | requires_permission_to_run=False, 220 | num_trials=20, 221 | max_bootstrapped_demos=4, 222 | max_labeled_demos=2, 223 | ), 224 | langProBe_configs=dict( 225 | use_valset=True, 226 | save_candidate_score=True, 227 | ), 228 | name="MIPROv2-lite", 229 | ), 230 | OptimizerConfig( 231 | optimizer=dspy.teleprompt.MIPROv2, 232 | init_args=dict(max_errors=5000, num_threads=16, num_candidates=12), 233 | compile_args=dict( 234 | requires_permission_to_run=False, 235 | num_trials=50, 236 | max_bootstrapped_demos=4, 237 | max_labeled_demos=2, 238 | minibatch_size=35, 239 | minibatch_full_eval_steps=5, 240 | ), 241 | langProBe_configs=dict( 242 | use_valset=True, 243 | save_candidate_score=True, 244 | ), 245 | name="MIPROv2", 246 | ), 247 | 
@dataclass
class OptimizerConfig:
    optimizer: Type[dspy.teleprompt.Teleprompter]
    init_args: dict
    compile_args: dict
    langProBe_configs: dict
    name: str

    def __str__(self):
        return f"""
[[
Optimizer: {self.name} ({self.optimizer})
init_args: {self.init_args}
compile_args: {self.compile_args}
langProBe_configs: {self.langProBe_configs}
]]
"""

    def __repr__(self):
        return self.__str__()


# Default optimizer configurations:
DEFAULT_OPTIMIZERS = [
    OptimizerConfig(
        optimizer=dspy.teleprompt.BootstrapFewShot,
        init_args=dict(max_errors=5000, max_labeled_demos=2),
        compile_args=dict(),
        langProBe_configs=dict(use_valset=False, save_candidate_score=False),
        name="BootstrapFewShot",
    ),
    OptimizerConfig(
        optimizer=dspy.teleprompt.BootstrapFewShotWithRandomSearch,
        init_args=dict(max_errors=5000, max_labeled_demos=2, num_threads=16),
        compile_args=dict(),
        langProBe_configs=dict(use_valset=True, save_candidate_score=True),
        name="BootstrapFewShotWithRandomSearch",
    ),
    OptimizerConfig(
        optimizer=dspy.teleprompt.MIPROv2,
        init_args=dict(max_errors=5000, auto="medium", num_threads=16),
        compile_args=dict(
            requires_permission_to_run=False,
            num_trials=20,
            max_bootstrapped_demos=4,
            max_labeled_demos=2,
        ),
        langProBe_configs=dict(
            use_valset=True,
            save_candidate_score=True,
        ),
        name="MIPROv2-lite",
    ),
    OptimizerConfig(
        optimizer=dspy.teleprompt.MIPROv2,
        init_args=dict(max_errors=5000, num_threads=16, num_candidates=12),
        compile_args=dict(
            requires_permission_to_run=False,
            num_trials=50,
            max_bootstrapped_demos=4,
            max_labeled_demos=2,
            minibatch_size=35,
            minibatch_full_eval_steps=5,
        ),
        langProBe_configs=dict(
            use_valset=True,
            save_candidate_score=True,
        ),
        name="MIPROv2",
    ),
    OptimizerConfig(
        optimizer=BootstrapFewShotInfer,
        init_args=dict(max_errors=5000, num_candidates=10, num_rules=10, num_threads=8),
        compile_args=dict(),
        langProBe_configs=dict(use_valset=True),
        name="RuleInfer-lite",
    ),
    OptimizerConfig(
        optimizer=BootstrapFewShotInfer,
        init_args=dict(max_errors=5000, num_candidates=10, num_rules=20, num_threads=8),
        compile_args=dict(),
        langProBe_configs=dict(use_valset=True),
        name="RuleInfer",
    ),
]


def update_optimizer_from_list(
    optimizer_list: list[OptimizerConfig], optimizer: OptimizerConfig
) -> list[OptimizerConfig]:
    new_optimizer_list = []
    for optimizer_config in optimizer_list:
        if optimizer.optimizer == optimizer_config.optimizer:
            new_optimizer_list.append(optimizer)
        else:
            new_optimizer_list.append(optimizer_config)
    return new_optimizer_list


def create_optimizer(
    optimizer_config: OptimizerConfig, metric, num_threads=None
) -> tuple[Callable, dict]:
    name = optimizer_config.name
    optimizer = optimizer_config.optimizer
    init_args = optimizer_config.init_args
    if num_threads and "num_threads" in init_args:
        init_args["num_threads"] = num_threads
    compile_args = optimizer_config.compile_args
    langProBe_configs = optimizer_config.langProBe_configs | {"name": name}
    optimizer = optimizer(metric=metric, **init_args)
    return partial(optimizer.compile, **compile_args), langProBe_configs
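# Sketch: how a harness might consume these configs. The metric below is a
# placeholder; langProBe supplies real per-benchmark metrics.
def _compile_with_default_optimizer(student, trainset):
    def _exact_match(example, prediction, trace=None):  # hypothetical metric
        return example.answer == prediction.answer

    compile_fn, configs = create_optimizer(
        DEFAULT_OPTIMIZERS[0], metric=_exact_match, num_threads=4
    )
    return compile_fn(student=student, trainset=trainset), configs["name"]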
28 | 29 | """ 30 | 31 | class ProcessManager(BaseModel): 32 | id: str = Field( 33 | default=None, 34 | description="The ID of the process.", 35 | ) 36 | lm_api_key: str = Field( 37 | default=os.getenv("OPENAI_API_KEY"), 38 | description="OpenAI API Key" 39 | ) 40 | lm_api_base: str = Field( 41 | default=os.getenv("OPENAI_API_BASE"), 42 | description="OpenAI API Base URL" 43 | ) 44 | model: str = Field( 45 | default=None, 46 | description="OpenAI Model Name, with prefix 'openai/'" 47 | ) 48 | lm_usages: List[Dict] = Field( 49 | default=[], 50 | description="Usage statistics for the model" 51 | ) 52 | mcp_rts: List[Dict] = Field( 53 | default=[], 54 | description="Usage statistics for the MCPs" 55 | ) 56 | mcp_retry_times: List[Dict] = Field( 57 | default=[], 58 | description="Statistics for the MCP retries" 59 | ) 60 | 61 | 62 | class MCPCall(BaseModel): 63 | mcp_server_name: Optional[str] = None 64 | mcp_tool_name: Optional[str] = None 65 | mcp_args: Optional[Dict] = None 66 | 67 | 68 | class MCPCallList(BaseModel): 69 | shutdown: bool = False 70 | mcps: Optional[List[MCPCall]] = None 71 | raw_content: Optional[str] = None 72 | 73 | @retry( 74 | stop=stop_after_attempt(5), 75 | wait=wait_exponential(multiplier=1, min=2, max=10), 76 | reraise=True, 77 | ) 78 | def call_lm( 79 | messages: List, 80 | manager: ProcessManager, 81 | logger: logging.Logger, 82 | temperature: float|None=None, 83 | ) -> tuple[str | None, int, int]: 84 | 85 | try: 86 | oai = OpenAI( 87 | api_key=manager.lm_api_key, 88 | base_url=manager.lm_api_base, 89 | ) 90 | prefix, model_name = manager.model.split('/') 91 | assert prefix == 'openai' 92 | 93 | if model_name in ['deepseek-r1', 'qwq-plus', 'qwq-32b']: # qwen reasoning模型仅支持流式输出 94 | reasoning_content = "" # 定义完整思考过程 95 | answer_content = "" # 定义完整回复 96 | is_answering = False # 判断是否结束思考过程并开始回复 97 | 98 | completion = oai.chat.completions.create( 99 | model=model_name, 100 | messages=messages, 101 | stream=True, 102 | stream_options={ 103 | "include_usage": True 104 | } 105 | ) 106 | for chunk in completion: 107 | # 如果chunk.choices为空,则打印usage 108 | if not chunk.choices: 109 | usage = chunk.usage 110 | else: 111 | delta = chunk.choices[0].delta 112 | if hasattr(delta, 'reasoning_content') and delta.reasoning_content != None: 113 | reasoning_content += delta.reasoning_content 114 | else: 115 | # 开始回复 116 | if delta.content != "" and is_answering is False: 117 | is_answering = True 118 | answer_content += delta.content 119 | completion_tokens = usage.completion_tokens 120 | prompt_tokens = usage.prompt_tokens 121 | manager.lm_usages.append({ 122 | "completion_tokens": completion_tokens, 123 | "prompt_tokens": prompt_tokens, 124 | }) 125 | return '' + reasoning_content + '' + answer_content, completion_tokens, prompt_tokens 126 | 127 | 128 | if temperature is not None: 129 | response = oai.beta.chat.completions.parse( 130 | messages=messages, 131 | model=model_name, 132 | temperature = temperature 133 | ) 134 | else: 135 | response = oai.beta.chat.completions.parse( 136 | messages=messages, 137 | model=model_name, 138 | ) 139 | # print("Response is " + str(response)) 140 | response_text = response.choices[0].message.content 141 | completion_tokens = response.usage.completion_tokens 142 | prompt_tokens = response.usage.prompt_tokens 143 | manager.lm_usages.append({ 144 | "completion_tokens": completion_tokens, 145 | "prompt_tokens": prompt_tokens, 146 | }) 147 | return response_text, completion_tokens, prompt_tokens 148 | 149 | except Exception as e: 150 | 
logger.error(f"ID: {manager.id}, Error in call_lm: {str(e)}") 151 | if response: 152 | logger.error(f"ID: {manager.id}, Response: {response}") 153 | raise 154 | 155 | def build_system_content(base_system: str, 156 | mcps: List, 157 | ) -> str: 158 | tools_section = "## Available Tools\n" 159 | for mcp in mcps: 160 | tools_section += f"### Server '{mcp['name']}' include following tools\n" 161 | if mcp['name'] in ['wuying-agentbay-mcp-server', 'Playwright']: 162 | tools_section += f"当使用本server来执行搜索任务时,请以https://www.baidu.com为初始网站进行搜索。" 163 | url = mcp.get("url") 164 | if not url: 165 | try: 166 | port = mcp.get('run_config')[0]["port"] 167 | url = f"http://localhost:{port}/sse" 168 | except: 169 | raise Exception("No url found") 170 | client = SyncedMcpClient(server_url=url) 171 | try: 172 | result = client.list_tools() 173 | tools = result.tools 174 | except Exception as e: 175 | raise Exception(f"Fail access to server: {mcp['name']}, error: {e}") 176 | 177 | for t in tools: 178 | tools_section += f"- {t.name}: {t.description}\n" 179 | input_schema = t.inputSchema 180 | required_params = input_schema.get("required", []) 181 | params_desc = [] 182 | 183 | if "properties" in input_schema: 184 | for param_name, param_info in input_schema["properties"].items(): 185 | is_required = param_name in required_params 186 | param_type = param_info.get("type", "") 187 | param_desc = param_info.get("description", "") 188 | 189 | req_tag = "必填" if is_required else "可选" 190 | params_desc.append( 191 | f"- {param_name} ({param_type}, {req_tag}): {param_desc}" 192 | ) 193 | 194 | # 使用更丰富的描述 195 | params_text = "\n".join(params_desc) if params_desc else "无参数" 196 | tools_section += f" 参数:\n{params_text}\n\n" 197 | 198 | prompt = base_system + f"""{tools_section}""" + TOOL_PROMPT 199 | 200 | return prompt 201 | 202 | 203 | def build_init_messages( 204 | base_system: str, 205 | mcps: List, 206 | user_question: str, 207 | ) -> List[Dict]: 208 | system_content = build_system_content(base_system, mcps) 209 | messages = [ 210 | { 211 | constants.ROLE: constants.SYSTEM, 212 | constants.CONTENT: system_content 213 | }, 214 | { 215 | constants.ROLE: constants.USER, 216 | constants.CONTENT: user_question 217 | } 218 | ] 219 | return messages 220 | 221 | 222 | 223 | def build_messages( 224 | messages: List[Dict], 225 | message_to_append: List[Dict], 226 | ) -> List[Dict]: 227 | assert messages[0][constants.ROLE] == constants.SYSTEM 228 | 229 | final_message = copy.deepcopy(messages) 230 | 231 | if message_to_append: 232 | if message_to_append[-1][constants.ROLE] == constants.USER: 233 | assert len(message_to_append) == 1 234 | assert final_message[-1][constants.ROLE] in {constants.ASSISTANT, constants.TOOL, constants.SYSTEM} 235 | final_message.extend(message_to_append) 236 | elif message_to_append[-1][constants.ROLE] == constants.ASSISTANT: 237 | assert len(message_to_append) == 1 238 | assert final_message[-1][constants.ROLE] in {constants.USER, constants.TOOL} 239 | final_message.extend(message_to_append) 240 | elif message_to_append[-1][constants.ROLE] == constants.TOOL: 241 | assert len(message_to_append) == 2 242 | assert final_message[-1][constants.ROLE] in {constants.USER, constants.TOOL} 243 | final_message.extend(message_to_append) 244 | 245 | # TODO: 超过最长上下文长度处理 246 | 247 | return final_message 248 | 249 | 250 | 251 | def response_parsing(content: str) -> MCPCallList: 252 | pattern = r'(.*?)<\/tool>' 253 | matches = re.findall(pattern, content, re.DOTALL) 254 | mcps = [] 255 | for match in matches: 256 | # 
def response_parsing(content: str) -> MCPCallList:
    pattern = r'<tool>(.*?)<\/tool>'
    matches = re.findall(pattern, content, re.DOTALL)
    mcps = []
    for match in matches:
        # Tolerate malformed tool blocks: skip anything that is not valid JSON
        # with the expected keys instead of aborting the whole turn.
        try:
            data = json.loads(match)
            mcps.append(MCPCall(
                mcp_server_name=data['server_name'].strip(),
                mcp_tool_name=data['tool_name'].strip(),
                mcp_args=data['inputs']
            ))
        except (json.JSONDecodeError, KeyError, AttributeError):
            continue

    if mcps:
        return MCPCallList(shutdown=False, mcps=mcps, raw_content=content)
    else:
        return MCPCallList(shutdown=True, mcps=None, raw_content=content)
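# Worked example: a reply containing one <tool> block parses into a single
# MCPCall, while a reply with no block sets shutdown=True. The server and tool
# names here are hypothetical.
def _response_parsing_demo():
    reply = (
        'Let me look that up.\n'
        '<tool>{"server_name": "websearch", "tool_name": "search", '
        '"inputs": {"query": "MCPBench"}}</tool>'
    )
    call_list = response_parsing(reply)
    assert call_list.shutdown is False
    assert call_list.mcps[0].mcp_tool_name == "search"
    assert response_parsing("The answer is 4.").shutdown is True
    return call_list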
{mcp_args}") 355 | result = client.call_tool(mcp_tool_name, mcp_args) 356 | texts = [item.text for item in result.content] 357 | result_str_segment = ''.join(texts) 358 | logger.debug(f"ID:{manager.id}, Received result from tool '{mcp_tool_name}': {result_str_segment}") 359 | 360 | logger.info(f"ID:{manager.id}, MCP Server '{mcp_server_name}' returned: {result_str_segment[:5000]}") 361 | 362 | result_str += result_str_segment 363 | except Exception as e: 364 | logger.error(f"ID:{manager.id}, Error calling tool '{mcp_tool_name}' on MCP Server '{mcp_server_name}': {str(e)}") 365 | else: 366 | logger.warning(f"ID:{manager.id}, Skipping tool call for '{mcp_tool_name}' due to client initialization failure.") 367 | 368 | messages.append({ 369 | constants.ROLE: constants.TOOL, 370 | constants.CONTENT: result_str[:5000], 371 | }) 372 | logger.debug(f"ID:{manager.id}, Final messages prepared: {messages}") 373 | logger.info(f"ID:{manager.id}, mcp_calling completed successfully.") 374 | return messages 375 | 376 | class DotDict(dict): 377 | def __getattr__(self, key): 378 | try: 379 | return self[key] 380 | except KeyError: 381 | raise AttributeError( 382 | f"'{type(self).__name__}' object has no attribute '{key}'" 383 | ) 384 | 385 | def __setattr__(self, key, value): 386 | self[key] = value 387 | 388 | def __delattr__(self, key): 389 | try: 390 | del self[key] 391 | except KeyError: 392 | raise AttributeError( 393 | f"'{type(self).__name__}' object has no attribute '{key}'" 394 | ) 395 | -------------------------------------------------------------------------------- /langProBe/register_benchmark.py: -------------------------------------------------------------------------------- 1 | ########################## Benchmarks ########################## 2 | import importlib 3 | 4 | 5 | # To use registered benchmarks, do 6 | # `benchmark.benchmark, benchmark.programs, benchmark.metric` 7 | registered_benchmarks = [] 8 | 9 | 10 | def check_benchmark(benchmark): 11 | try: 12 | assert hasattr(benchmark, "benchmark") 13 | except AssertionError: 14 | return False 15 | return True 16 | 17 | 18 | def register_benchmark(benchmark: str): 19 | try: 20 | # 尝试直接导入模块 21 | benchmark_metas = importlib.import_module(benchmark, package="langProBe") 22 | except ModuleNotFoundError: 23 | # 如果直接导入失败,尝试使用完整路径导入 24 | benchmark_metas = importlib.import_module(f"langProBe.{benchmark}", package=None) 25 | 26 | if check_benchmark(benchmark_metas): 27 | registered_benchmarks.extend(benchmark_metas.benchmark) 28 | else: 29 | raise AssertionError(f"{benchmark} does not have the required attributes") 30 | return benchmark_metas.benchmark 31 | 32 | 33 | def register_all_benchmarks(benchmarks): 34 | for benchmark in benchmarks: 35 | register_benchmark(benchmark) 36 | return registered_benchmarks 37 | -------------------------------------------------------------------------------- /langProBe/synced_mcp_client.py: -------------------------------------------------------------------------------- 1 | # teamwork_mcp/synced_mcp_client.py 2 | import asyncio 3 | import atexit 4 | import logging 5 | import pickle 6 | from multiprocessing import Process, Queue, Lock 7 | from typing import Any, Tuple, Dict 8 | 9 | # 全局客户端实例和锁,确保全局唯一的客户端实例 10 | _CLIENT_INSTANCE = None 11 | _CLIENT_LOCK = Lock() 12 | 13 | 14 | class SyncedMcpClient(Process): 15 | """ 16 | A synchronous MCP client that runs the AsyncMCPClient in a separate process 17 | and communicates with it using multiprocessing Queues and pickle. 
18 | """ 19 | 20 | def __init__(self, server_url: str = None): 21 | super().__init__() 22 | # turn off logging from the logger of 'httpx' 23 | httpx_logger = logging.getLogger("httpx") 24 | httpx_logger.setLevel(logging.WARNING) 25 | 26 | self.server_url = server_url 27 | self.request_queue = Queue() 28 | self.response_queue = Queue() 29 | self.is_running = False 30 | self.daemon = True 31 | atexit.register(self.cleanup) 32 | 33 | # begin new process 34 | self.start() 35 | 36 | def run(self): 37 | """ 38 | The main process function that runs the AsyncMCPClient in a separate process. 39 | """ 40 | self.is_running = True 41 | asyncio.run(self._run_async_client()) 42 | 43 | async def _run_async_client(self): 44 | """ 45 | Runs the AsyncMCPClient and handles communication with the main process. 46 | """ 47 | from .async_mcp_client import AsyncMCPClient 48 | 49 | client = AsyncMCPClient() 50 | await client.connect_to_sse_server(server_url=self.server_url) 51 | 52 | try: 53 | while self.is_running: 54 | if not self.request_queue.empty(): 55 | request = self.request_queue.get() 56 | if request == 'terminate': 57 | break 58 | try: 59 | func_name, args, kwargs = pickle.loads(request) 60 | func = getattr(client, func_name) 61 | result = await func(*args, **kwargs) 62 | self.response_queue.put(pickle.dumps(('success', result))) 63 | except Exception as e: 64 | self.response_queue.put(pickle.dumps(('error', str(e)))) 65 | await asyncio.sleep(0.01) 66 | 67 | except Exception as e: 68 | self.httpx_logger.exception(e) 69 | self.response_queue.put(pickle.dumps(('error', f"Client initialization error: {str(e)}"))) 70 | 71 | finally: 72 | await client.cleanup() 73 | 74 | def _send_request(self, func_name: str, args: Tuple = (), kwargs: Dict = None) -> Any: 75 | """ 76 | Sends a request to the async process and waits for the response. 77 | """ 78 | if kwargs is None: 79 | kwargs = {} 80 | self.request_queue.put(pickle.dumps((func_name, args, kwargs))) 81 | response = self.response_queue.get(timeout=900) 82 | status, result = pickle.loads(response) 83 | if status == 'error': 84 | raise Exception(result) 85 | return result 86 | 87 | def call_tool(self, tool_name: str, tool_args: Dict = None) -> Any: 88 | """ 89 | Calls a tool synchronously by sending a request to the async process. 90 | """ 91 | return self._send_request('call_tool', args=(tool_name,), kwargs={'tool_args': tool_args}) 92 | 93 | def get_prompt(self, name: str, arguments: dict[str, str] | None = None) -> Any: 94 | """ 95 | Calls a tool synchronously by sending a request to the async process. 96 | """ 97 | return self._send_request('get_prompt', args=(), kwargs={'name': name, 'arguments': arguments}) 98 | 99 | def read_resource(self, uri) -> Any: 100 | """ 101 | Calls a tool synchronously by sending a request to the async process. 102 | """ 103 | return self._send_request('read_resource', args=(), kwargs={'uri': uri}) 104 | 105 | def list_resources(self) -> Any: 106 | return self._send_request('list_resources', args=(), kwargs={}) 107 | 108 | def list_prompts(self) -> Any: 109 | return self._send_request('list_prompts', args=(), kwargs={}) 110 | 111 | 112 | 113 | def list_tools(self) -> Any: 114 | """ 115 | Lists all available tools synchronously. 116 | """ 117 | return self._send_request('list_tools', args=(), kwargs={}) 118 | 119 | def process_query(self, query: str) -> Any: 120 | """ 121 | Processes a query synchronously. 
122 | """ 123 | return self._send_request('process_query', args=(query,)) 124 | 125 | 126 | def cleanup(self): 127 | """ 128 | Cleans up resources and terminates the process. 129 | """ 130 | if self.is_running: 131 | self.is_running = False 132 | self.request_queue.put('terminate') 133 | self.join(timeout=5) 134 | if self.is_alive(): 135 | self.terminate() 136 | # def synced_main(): 137 | # import time 138 | # client = SyncedMcpClient(server_url="http://0.0.0.0:8080/sse") 139 | # client.start() 140 | # result = client.call_tool("get_alerts", {"state": "CA"}) 141 | # print(result) 142 | # time.sleep(5) 143 | # 144 | # 145 | # if __name__ == "__main__": 146 | # synced_main() -------------------------------------------------------------------------------- /launch_mcps_as_sse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 检查是否提供了配置文件路径参数 4 | if [ -z "$1" ]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # 构造完整路径 10 | CONFIG_FILE="$1" 11 | if [[ ! "$CONFIG_FILE" == /* ]]; then 12 | CONFIG_FILE="configs/$CONFIG_FILE" 13 | fi 14 | 15 | # 检查配置文件是否存在 16 | if [[ ! -f "$CONFIG_FILE" ]]; then 17 | echo "配置文件 '$CONFIG_FILE' 不存在。" 18 | exit 1 19 | fi 20 | 21 | # 读取 mcp_pool 数组的长度 22 | SERVER_COUNT=$(jq '.mcp_pool | length' "$CONFIG_FILE") 23 | 24 | if [[ "$SERVER_COUNT" -eq 0 ]]; then 25 | echo "mcp_pool 中未定义服务器。" 26 | exit 1 27 | fi 28 | 29 | # 遍历 mcp_pool 数组,启动每个服务器 30 | for (( i=0; i=2.6 2 | mcp 3 | uv 4 | dashscope 5 | shortuuid 6 | anthropic --------------------------------------------------------------------------------