├── .idea
│   ├── MCPBench.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── README.md
├── README_zh.md
├── assets
│   └── figure1.png
├── configs
│   ├── mcp_config_db.json
│   └── mcp_config_template.json
├── evaluation_db.sh
├── evaluation_gaia.sh
├── evaluation_websearch.sh
├── langProBe
│   ├── DB
│   │   ├── DB_utils
│   │   │   ├── __init__.py
│   │   │   └── schema.py
│   │   ├── __init__.py
│   │   ├── data
│   │   │   └── car_bi.jsonl
│   │   └── db_program.py
│   ├── GAIA
│   │   ├── __init__.py
│   │   ├── data
│   │   │   ├── 2023
│   │   │   │   ├── __init__.py
│   │   │   │   └── validation
│   │   │   │       ├── 076c8171-9b3b-49b9-a477-244d2a532826.xlsx
│   │   │   │       ├── 1f975693-876d-457b-a649-393859e79bf3.mp3
│   │   │   │       ├── 2b3ef98c-cc05-450b-a719-711aee40ac65.mp3
│   │   │   │       ├── 32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx
│   │   │   │       ├── 366e2f2b-8632-4ef2-81eb-bc3877489217.pdf
│   │   │   │       ├── 389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt
│   │   │   │       ├── 3da89939-209c-4086-8520-7eb734e6b4ef.xlsx
│   │   │   │       ├── 4d0aa727-86b1-406b-9b33-f870dd14a4a5.xlsx
│   │   │   │       ├── 4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.xlsx
│   │   │   │       ├── 54612da3-fd56-4941-80f4-5eb82330de25.xlsx
│   │   │   │       ├── 5b2a14e8-6e59-479c-80e3-4696e8980152.jpg
│   │   │   │       ├── 5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx
│   │   │   │       ├── 6359a0b1-8f7b-499b-9336-840f9ab90688.png
│   │   │   │       ├── 65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx
│   │   │   │       ├── 67e8878b-5cef-4375-804e-e6291fdbe78a.pdf
│   │   │   │       ├── 7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx
│   │   │   │       ├── 7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.xlsx
│   │   │   │       ├── 7dd30055-0198-452e-8c25-f73dbe27dcb8.pdb
│   │   │   │       ├── 8d46b8d6-b38a-47ff-ac74-cda14cf2d19b.csv
│   │   │   │       ├── 8f80e01c-1296-4371-9486-bb3d68651a60.png
│   │   │   │       ├── 9318445f-fe6a-4e1b-acbf-c68228c9906a.png
│   │   │   │       ├── 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3
│   │   │   │       ├── 9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip
│   │   │   │       ├── __init__.py
│   │   │   │       ├── a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx
│   │   │   │       ├── b2c257e0-3ad7-4f05-b8e3-d9da973be36e.jpg
│   │   │   │       ├── b7f857e4-d8aa-4387-af2a-0e844df5b9d8.png
│   │   │   │       ├── bec74516-02fc-48dc-b202-55e78d0e17cf.jsonld
│   │   │   │       ├── bfcd99e1-0690-4b53-a85c-0174a8629083.zip
│   │   │   │       ├── c526d8d6-5987-4da9-b24c-83466fa172f3.xlsx
│   │   │   │       ├── cca530fc-4052-43b2-b130-b30968d8aa44.png
│   │   │   │       ├── cca70ce6-1952-45d2-acd4-80c903b0bc49.png
│   │   │   │       ├── cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx
│   │   │   │       ├── d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de.png
│   │   │   │       ├── da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx
│   │   │   │       ├── df6561b2-7ee5-4540-baab-5095f742716a.png
│   │   │   │       ├── e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf
│   │   │   │       ├── edd4d4f2-1a58-45c4-b038-67337af4e029.xlsx
│   │   │   │       ├── f918266a-b3e0-4914-865d-4faa564f1aef.py
│   │   │   │       └── metadata.jsonl
│   │   │   ├── GAIA.py
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── gaia_dev_part.jsonl
│   │   │   └── statics.py
│   │   └── gaia_program.py
│   ├── WebSearch
│   │   ├── __init__.py
│   │   └── data
│   │       ├── websearch_300.jsonl
│   │       └── websearch_600.jsonl
│   ├── __init__.py
│   ├── analysis.py
│   ├── async_mcp_client.py
│   ├── benchmark.py
│   ├── config_utils.py
│   ├── constants.py
│   ├── dspy_program.py
│   ├── evaluation.py
│   ├── evaluation_utils.py
│   ├── langchain_program.py
│   ├── mcp_program.py
│   ├── optimizers.py
│   ├── program_utils.py
│   ├── register_benchmark.py
│   └── synced_mcp_client.py
├── launch_mcps_as_sse.sh
├── mcpbench.pdf
└── requirements.txt
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # 🦊 MCPBench: A Benchmark for Evaluating MCP Servers
3 |
4 |
5 |
6 |
7 |
8 |
9 | [![Documentation][docs-image]][docs-url]
10 | [![Package License][package-license-image]][package-license-url]
11 |
12 |
13 |
14 |
15 |
16 |
17 | [中文](https://github.com/modelscope/MCPBench/blob/main/README_zh.md) |
18 | [English](https://github.com/modelscope/MCPBench/blob/main/README.md)
19 |
20 |
21 |
22 |
23 | MCPBench is an evaluation framework for MCP Servers. It supports three task types: Web Search, Database Query, and GAIA, and is compatible with both local and remote MCP Servers. The framework primarily evaluates different MCP Servers (such as Brave Search, DuckDuckGo, etc.) in terms of task-completion accuracy, latency, and token consumption under the same LLM and Agent configurations. Here is the [evaluation report](https://arxiv.org/abs/2504.11094).
24 |
25 |
26 |
27 | > The implementation refers to [LangProBe: a Language Programs Benchmark](https://arxiv.org/abs/2502.20315).\
28 | > Big thanks to Qingxu Fu for the initial implementation!
29 |
30 |
31 |
32 |
33 |
34 | # 📋 Table of Contents
35 |
36 | - [🔥 News](#news)
37 | - [🛠️ Installation](#installation)
38 | - [🚀 Quick Start](#quick-start)
39 |   - [Launch MCP Server](#launch-mcp-server)
40 |   - [Launch Evaluation](#launch-evaluation)
41 | - [🧂 Datasets and Experiments](#datasets-and-experiments)
42 | - [🚰 Cite](#cite)
43 |
44 | # 🔥 News
45 | + `Apr. 29, 2025` 🌟 Update the code for evaluating the MCP Server Package within GAIA.
46 | + `Apr. 14, 2025` 🌟 We are proud to announce that MCPBench is now open-sourced.
47 |
48 | # 🛠️ Installation
49 | The framework requires Python >= 3.11, Node.js, and jq.
50 |
51 | ```bash
52 | conda create -n mcpbench python=3.11 -y
53 | conda activate mcpbench
54 | pip install -r requirements.txt
55 | ```
56 | # 🚀 Quick Start
57 | Please first determine the type of MCP server you want to use:
58 | - If it is remotely hosted (accessed via **SSE**, such as [ModelScope](https://modelscope.cn/mcp), [Smithery](https://smithery.ai), or localhost), you can proceed directly to the [evaluation](#launch-evaluation).
59 | - If it is started locally (accessed via npx using **STDIO**), you need to launch it first, as described below.
60 |
61 | ## Launch MCP Server
62 | This step is only needed for locally launched (STDIO) servers. First, write a configuration like the following:
63 | ```json
64 | {
65 | "mcp_pool": [
66 | {
67 | "name": "firecrawl",
68 | "run_config": [
69 | {
70 | "command": "npx -y firecrawl-mcp",
71 | "args": "FIRECRAWL_API_KEY=xxx",
72 | "port": 8005
73 | }
74 | ]
75 | }
76 | ]
77 | }
78 | ```
79 | Save this config file in the `configs` folder and launch it using:
80 |
81 | ```bash
82 | sh launch_mcps_as_sse.sh YOUR_CONFIG_FILE
83 | ```
84 |
85 | For example, save the above configuration as `configs/firecrawl.json` and launch it with:
86 |
87 | ```bash
88 | sh launch_mcps_as_sse.sh firecrawl.json
89 | ```
90 |
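Once the wrapper is running, it can be sanity-checked before starting any benchmark. A minimal probe using the official MCP Python SDK might look like the following; the `/sse` path and port `8005` are assumptions based on the sample config above:

```python
import asyncio

from mcp import ClientSession
from mcp.client.sse import sse_client


async def probe(url: str = "http://localhost:8005/sse") -> None:
    # Open an SSE transport to the wrapped server and list the tools it exposes.
    async with sse_client(url) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            tools = await session.list_tools()
            print([tool.name for tool in tools.tools])


asyncio.run(probe())
```
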
91 | ## Launch Evaluation
92 | To evaluate the MCP Server's performance, you need to provide the necessary MCP Server information. The code automatically detects the tools and their parameters on the server, so you don't need to configure them manually. For example:
93 | ```json
94 | {
95 | "mcp_pool": [
96 | {
97 | "name": "Remote MCP example",
98 | "url": "url from https://modelscope.cn/mcp or https://smithery.ai"
99 | },
100 | {
101 | "name": "firecrawl (Local run example)",
102 | "run_config": [
103 | {
104 | "command": "npx -y firecrawl-mcp",
105 | "args": "FIRECRAWL_API_KEY=xxx",
106 | "port": 8005
107 | }
108 | ]
109 | }
110 | ]
111 | }
112 | ```
113 |
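The two entry styles are distinguished purely by their keys: an entry with a `url` is treated as a remote SSE server, while an entry with a `run_config` describes a locally launched one. A minimal sketch of that dispatch (the helper name is hypothetical; the actual detection lives in the framework's config handling):

```python
import json


def summarize_mcp_pool(path: str) -> None:
    # Report how each configured server would be reached, based on which keys it carries.
    with open(path) as f:
        pool = json.load(f)["mcp_pool"]
    for server in pool:
        if "url" in server:
            print(f"{server['name']}: remote, SSE endpoint {server['url']}")
        else:
            for run in server.get("run_config", []):
                print(f"{server['name']}: local, `{run['command']}` exposed on port {run['port']}")


summarize_mcp_pool("configs/firecrawl.json")
```
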
114 | To evaluate the MCP Server's performance on WebSearch tasks:
115 | ```bash
116 | sh evaluation_websearch.sh YOUR_CONFIG_FILE
117 | ```
118 |
119 | To evaluate the MCP Server's performance on Database Query tasks:
120 | ```bash
121 | sh evaluation_db.sh YOUR_CONFIG_FILE
122 | ```
123 |
124 | To evaluate the MCP Server's performance on GAIA tasks:
125 | ```bash
126 | sh evaluation_gaia.sh YOUR_CONFIG_FILE
127 | ```
128 |
129 | For example, if the configuration above is saved as `configs/firecrawl.json`, run the evaluation with:
130 |
131 | ```bash
132 | sh evaluation_websearch.sh firecrawl.json
133 | ```
134 |
135 | # 🧂 Datasets and Experiments
136 | Our framework provides two datasets for evaluation. For the WebSearch task, the dataset is located at `MCPBench/langProBe/WebSearch/data/websearch_600.jsonl`, containing 200 QA pairs each from [Frames](https://arxiv.org/abs/2409.12941), news, and technology domains. Our framework for automatically constructing evaluation datasets will be open-sourced later.
137 |
138 | For the Database Query task, the dataset is located at `MCPBench/langProBe/DB/data/car_bi.jsonl`. You can add your own dataset in the following format:
139 |
140 | ```json
141 | {
142 | "unique_id": "",
143 | "Prompt": "",
144 | "Answer": ""
145 | }
146 | ```
147 |
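Each line of the `.jsonl` file holds one such object. A small helper for appending cases in this format might look like this (the path and values are illustrative):

```python
import json


def add_case(path: str, unique_id: int, prompt: str, answer: str) -> None:
    # Append one evaluation case as a single JSON object per line (JSONL).
    record = {"unique_id": unique_id, "Prompt": prompt, "Answer": answer}
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")


add_case("langProBe/DB/data/my_dataset.jsonl", 1, "What is the monthly sales target for series A?", "123")
```
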
148 | We have evaluated mainstream MCP Servers on both tasks. For detailed experimental results, please refer to the [documentation](https://arxiv.org/abs/2504.11094).
149 |
150 | # 🚰 Cite
151 | If you find this work useful, please consider citing our project or giving us a 🌟:
152 |
153 | ```bibtex
154 | @misc{mcpbench,
155 |   title={MCPBench: A Benchmark for Evaluating MCP Servers},
156 |   author={Zhiling Luo and Xiaorong Shi and Xuanrui Lin and Jinyang Gao},
157 |   howpublished = {\url{https://github.com/modelscope/MCPBench}},
158 |   year={2025}
159 | }
160 | ```
161 |
162 | Alternatively, you may cite our report:
163 | ```bibtex
164 | @article{mcpbench_report,
165 |   title={Evaluation Report on MCP Servers},
166 |   author={Zhiling Luo and Xiaorong Shi and Xuanrui Lin and Jinyang Gao},
167 |   year={2025},
168 |   journal={arXiv preprint arXiv:2504.11094},
169 |   url={https://arxiv.org/abs/2504.11094},
170 |   primaryClass={cs.AI}
171 | }
172 | ```
173 |
174 | [docs-image]: https://img.shields.io/badge/Documentation-EB3ECC
175 | [docs-url]: https://arxiv.org/abs/2504.11094
176 | [package-license-image]: https://img.shields.io/badge/License-Apache_2.0-blue.svg
177 | [package-license-url]: https://github.com/modelscope/MCPBench/blob/main/LICENSE
178 |
179 |
--------------------------------------------------------------------------------
/README_zh.md:
--------------------------------------------------------------------------------
1 |
2 | # 🦊 MCPBench: A Benchmark for Evaluating MCP Servers
3 |
4 |
5 |
6 |
7 | [![Documentation][docs-image]][docs-url]
8 | [![Package License][package-license-image]][package-license-url]
9 |
10 |
11 |
12 |
13 |
14 |
15 | [中文](https://github.com/modelscope/MCPBench/blob/main/README_zh.md) |
16 | [English](https://github.com/modelscope/MCPBench/blob/main/README.md)
17 |
18 |
19 |
20 |
21 | MCPBench is a benchmark framework for evaluating MCP Servers. It supports three task types: Web Search, Database Query, and GAIA, and is compatible with both local and remote MCP Servers. Under the same LLM and Agent configurations, the framework evaluates different MCP Servers (such as Brave Search, DuckDuckGo, etc.) in terms of task-completion accuracy, latency, and token consumption. See the [evaluation report](https://arxiv.org/abs/2504.11094) for details.
22 |
23 |
24 |
25 | > The implementation refers to [LangProBe: a Language Programs Benchmark](https://arxiv.org/abs/2502.20315).\
26 | > Big thanks to Qingxu Fu for the initial implementation!
27 |
28 |
29 |
30 | # 📋 Table of Contents
31 |
32 | - [🔥 News](#news)
33 | - [🛠️ Installation](#installation)
34 | - [🚀 Quick Start](#quick-start)
35 |   - [Launch MCP Server](#launch-mcp-server)
36 |   - [Launch Evaluation](#launch-evaluation)
37 | - [🧂 Datasets and Experiments](#datasets-and-experiments)
38 | - [🚰 Cite](#cite)
39 |
40 | # 🔥 News
41 | + `Apr. 29, 2025` 🌟 Updated the code for evaluating the MCP Server Package within GAIA.
42 | + `Apr. 14, 2025` 🌟 MCPBench is officially open-sourced.
43 |
44 | # 🛠️ Installation
45 | The framework requires Python >= 3.11, Node.js, and jq.
46 |
47 | ```bash
48 | conda create -n mcpbench python=3.11 -y
49 | conda activate mcpbench
50 | pip install -r requirements.txt
51 | ```
52 | # 🚀 Quick Start
53 | Please first determine the type of MCP server you want to use:
54 | - If it is remotely hosted (accessed via **SSE**, such as [ModelScope](https://modelscope.cn/mcp), [Smithery](https://smithery.ai), or localhost), you can proceed directly to the [evaluation](#launch-evaluation).
55 | - If it is started locally (accessed via npx using **STDIO**), you need to launch the MCP server first, as described below.
56 | ## Launch MCP Server
57 | This step is only needed for locally launched (STDIO) servers. First, write a configuration like the following:
58 | ```json
59 | {
60 | "mcp_pool": [
61 | {
62 | "name": "firecrawl",
63 | "run_config": [
64 | {
65 | "command": "npx -y firecrawl-mcp",
66 | "args": "FIRECRAWL_API_KEY=xxx",
67 | "port": 8005
68 | }
69 | ]
70 | }
71 | ]
72 | }
73 | ```
74 | Save this config file in the `configs` folder and launch it with:
75 |
76 | ```bash
77 | sh launch_mcps_as_sse.sh YOUR_CONFIG_FILE
78 | ```
79 |
80 | For example, if the configuration above is saved as `configs/firecrawl.json`, launch it with:
81 |
82 | ```bash
83 | sh launch_mcps_as_sse.sh firecrawl.json
84 | ```
85 |
86 | ## Launch Evaluation
87 | To evaluate an MCP server's performance, provide the relevant server information. The code automatically detects the tools and their parameters on the server, so they do not need to be configured manually. For example:
88 |
89 | ```json
90 | {
91 | "mcp_pool": [
92 | {
93 | "name": "Remote MCP example",
94 | "url": "url from https://modelscope.cn/mcp or https://smithery.ai"
95 | },
96 | {
97 | "name": "firecrawl (Local run example)",
98 | "run_config": [
99 | {
100 | "command": "npx -y firecrawl-mcp",
101 | "args": "FIRECRAWL_API_KEY=xxx",
102 | "port": 8005
103 | }
104 | ]
105 | }
106 | ]
107 | }
108 | ```
109 |
110 | To evaluate the MCP server's performance on WebSearch tasks:
111 | ```bash
112 | sh evaluation_websearch.sh YOUR_CONFIG_FILE
113 | ```
114 |
115 | To evaluate the MCP server's performance on Database Query tasks:
116 | ```bash
117 | sh evaluation_db.sh YOUR_CONFIG_FILE
118 | ```
119 |
120 | To evaluate the MCP server's performance on GAIA tasks:
121 | ```bash
122 | sh evaluation_gaia.sh YOUR_CONFIG_FILE
123 | ```
124 |
125 | For example, if the configuration above is saved as `configs/firecrawl.json`, run:
126 |
127 | ```bash
128 | sh evaluation_websearch.sh firecrawl.json
129 | ```
130 |
131 | # 🧂 Datasets and Experiments
132 | The framework provides two evaluation datasets:
133 | - The WebSearch dataset is located at `MCPBench/langProBe/WebSearch/data/websearch_600.jsonl` and contains 200 QA pairs each from the [Frames](https://arxiv.org/abs/2409.12941), news, and technology domains. Our tool for automatically constructing evaluation datasets will also be open-sourced later.
134 | - The Database Query dataset is located at `MCPBench/langProBe/DB/data/car_bi.jsonl`. You can also add your own dataset in the following format:
135 |
136 | ```json
137 | {
138 | "unique_id": "",
139 | "Prompt": "",
140 | "Answer": ""
141 | }
142 | ```
143 |
144 | We have evaluated mainstream MCP servers on both tasks. For detailed experimental results, please refer to the [documentation](https://arxiv.org/abs/2504.11094).
145 |
146 | # 🚰 Cite
147 | If you find this work useful, please cite our project or give us a 🌟:
148 |
149 | ```bibtex
150 | @misc{mcpbench,
151 |   title={MCPBench: A Benchmark for Evaluating MCP Servers},
152 |   author={Zhiling Luo and Xiaorong Shi and Xuanrui Lin and Jinyang Gao},
153 |   howpublished = {\url{https://github.com/modelscope/MCPBench}},
154 |   year={2025}
155 | }
156 | ```
157 |
158 | Alternatively, you may cite our report:
159 | ```bibtex
160 | @article{mcpbench_report,
161 |   title={Evaluation Report on MCP Servers},
162 |   author={Zhiling Luo and Xiaorong Shi and Xuanrui Lin and Jinyang Gao},
163 |   year={2025},
164 |   journal={arXiv preprint arXiv:2504.11094},
165 |   url={https://arxiv.org/abs/2504.11094},
166 |   primaryClass={cs.AI}
167 | }
168 | ```
169 |
170 | [docs-image]: https://img.shields.io/badge/Documentation-EB3ECC
171 | [docs-url]: https://arxiv.org/abs/2504.11094
172 | [package-license-image]: https://img.shields.io/badge/License-Apache_2.0-blue.svg
173 | [package-license-url]: https://github.com/modelscope/MCPBench/blob/main/LICENSE
174 |
175 |
--------------------------------------------------------------------------------
/assets/figure1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/assets/figure1.png
--------------------------------------------------------------------------------
/configs/mcp_config_db.json:
--------------------------------------------------------------------------------
1 | {
2 | "mcp_pool": [
3 | {
4 | "name": "mysql",
5 | "run_config": [
6 | {
7 | "command": "uvx --from mysql-mcp-server mysql_mcp_server",
8 | "args": "MYSQL_HOST=localhost MYSQL_PORT=3306 MYSQL_USER=root MYSQL_PASSWORD=xxx MYSQL_DATABASE=car_bi",
9 | "port": 8005
10 | }
11 | ]
12 | }
13 | ],
14 | "query_type": "SQL"
15 | }
16 |
--------------------------------------------------------------------------------
/configs/mcp_config_template.json:
--------------------------------------------------------------------------------
1 | {
2 | "mcp_pool": [
3 | {
4 | "name": "Remote MCP example",
5 | "url": "url from https://modelscope.cn/mcp or https://smithery.ai"
6 | },
7 | {
8 | "name": "Local run example",
9 | "run_config": [
10 | {
11 | "command": "npx -y firecrawl-mcp",
12 | "args": "FIRECRAWL_API_KEY=xxx",
13 | "port": 8005
14 | }
15 | ]
16 | }
17 | ]
18 | }
19 |
--------------------------------------------------------------------------------
/evaluation_db.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Check that a config file path argument was provided
3 | if [ -z "$1" ]; then
4 |     echo "Usage: $0 <config_file>"
5 |     exit 1
6 | fi
7 |
8 | # Build the full path (bare filenames are resolved under configs/)
9 | CONFIG_FILE="$1"
10 | if [[ ! "$CONFIG_FILE" == /* ]]; then
11 |     CONFIG_FILE="configs/$CONFIG_FILE"
12 | fi
13 |
14 |
15 |
16 | # Launch the evaluator through an inline Python entry point so the multiprocessing 'spawn' start method is set before evaluation starts
17 | DSPY_CACHEDIR=evaluation_mcp/.dspy_cache \
18 | python -c "
19 | import multiprocessing as mp
20 | mp.set_start_method('spawn', True)
21 | from langProBe.evaluation import main
22 | main()
23 | " \
24 | --benchmark=DB \
25 | --dataset_mode=test \
26 | --dataset_path=langProBe/DB/data/car_bi.jsonl \
27 | --file_path=evaluation_db \
28 | --lm=openai/qwen-max-2025-01-25 \
29 | --lm_api_base=https://dashscope.aliyuncs.com/compatible-mode/v1 \
30 | --lm_api_key=xxx \
31 | --missing_mode_file=path/to/logs/task_messages.jsonl \
32 | --num_threads=1 \
33 | --config=$CONFIG_FILE
34 |
--------------------------------------------------------------------------------
/evaluation_gaia.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Check that a config file path argument was provided
3 | if [ -z "$1" ]; then
4 |     echo "Usage: $0 <config_file>"
5 |     exit 1
6 | fi
7 |
8 | # Build the full path (bare filenames are resolved under configs/)
9 | CONFIG_FILE="$1"
10 | if [[ ! "$CONFIG_FILE" == /* ]]; then
11 |     CONFIG_FILE="configs/$CONFIG_FILE"
12 | fi
13 |
14 |
15 |
16 | # Launch the evaluator through an inline Python entry point so the multiprocessing 'spawn' start method is set before evaluation starts
17 | DSPY_CACHEDIR=evaluation_mcp/.dspy_cache \
18 | python -c "
19 | import multiprocessing as mp
20 | mp.set_start_method('spawn', True)
21 | from langProBe.evaluation import main
22 | main()
23 | " \
24 | --benchmark=GAIA \
25 | --dataset_mode=full \
26 | --dataset_path=langProBe/GAIA/data/gaia_rest.jsonl \
27 | --file_path=evaluation_gaia \
28 | --lm=openai/qwen-max-2025-01-25 \
29 | --lm_api_base=https://dashscope.aliyuncs.com/compatible-mode/v1 \
30 | --missing_mode_file=path/to/logs/task_messages.jsonl \
31 | --num_threads=1 \
32 | --config=$CONFIG_FILE
33 |
--------------------------------------------------------------------------------
/evaluation_websearch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Check that a config file path argument was provided
3 | if [ -z "$1" ]; then
4 |     echo "Usage: $0 <config_file>"
5 |     exit 1
6 | fi
7 |
8 | # Build the full path (bare filenames are resolved under configs/)
9 | CONFIG_FILE="$1"
10 | if [[ ! "$CONFIG_FILE" == /* ]]; then
11 |     CONFIG_FILE="configs/$CONFIG_FILE"
12 | fi
13 |
14 |
15 |
16 | # Launch the evaluator through an inline Python entry point so the multiprocessing 'spawn' start method is set before evaluation starts
17 | DSPY_CACHEDIR=evaluation_mcp/.dspy_cache \
18 | python -c "
19 | import multiprocessing as mp
20 | mp.set_start_method('spawn', True)
21 | from langProBe.evaluation import main
22 | main()
23 | " \
24 | --benchmark=WebSearch \
25 | --dataset_mode=full \
26 | --dataset_path=langProBe/WebSearch/data/websearch_test.jsonl \
27 | --file_path=evaluation_websearch_test \
28 | --lm=openai/deepseek-v3 \
29 | --lm_api_base=https://dashscope.aliyuncs.com/compatible-mode/v1 \
30 | --lm_api_key=xxx \
31 | --missing_mode_file=path/to/logs/task_messages.jsonl \
32 | --num_threads=1 \
33 | --config=$CONFIG_FILE
--------------------------------------------------------------------------------
/langProBe/DB/DB_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/DB/DB_utils/__init__.py
--------------------------------------------------------------------------------
/langProBe/DB/DB_utils/schema.py:
--------------------------------------------------------------------------------
1 | SCHEMA = """
2 | create table competitors
3 | (
4 | id int unsigned auto_increment comment '唯一标识符'
5 | primary key,
6 | competitor_name varchar(50) not null comment '竞品名称',
7 | car_series varchar(50) not null comment '车系名称',
8 | sales int not null comment '竞品销量',
9 | market_share_percentage decimal(5, 2) not null comment '竞品市场占有率百分比',
10 | record_date date not null comment '记录日期'
11 | )
12 | comment '存储竞品销量和市场占有率' collate = utf8mb4_unicode_520_ci;
13 |
14 | create table customer_flow
15 | (
16 | id int unsigned auto_increment comment '唯一标识符'
17 | primary key,
18 | region varchar(50) not null comment '大区',
19 | store varchar(50) not null comment '门店名称',
20 | first_visit_flow int not null comment '首次到店客流量',
21 | total_visit_flow int not null comment '总客流量',
22 | visit_datetime datetime not null comment '访问时间',
23 | conversion_rate decimal(5, 2) not null comment '成交率'
24 | )
25 | comment '存储大区、门店、客流量和成交率信息' collate = utf8mb4_unicode_520_ci;
26 |
27 | create index idx_region_store
28 | on customer_flow (region, store);
29 |
30 | create table inventory
31 | (
32 | id int unsigned auto_increment comment '唯一标识符'
33 | primary key,
34 | car_series varchar(50) not null comment '车系名称',
35 | region varchar(50) not null comment '大区',
36 | warehouse varchar(100) not null comment '仓库名称',
37 | quantity int not null comment '库存数量',
38 | last_checked datetime not null comment '最后盘点时间',
39 | series_type varchar(50) not null comment '车系类型'
40 | )
41 | comment '存储库存信息' collate = utf8mb4_unicode_520_ci;
42 |
43 | create table market_sales
44 | (
45 | id int unsigned auto_increment comment '唯一标识符'
46 | primary key,
47 | total_market_sales int not null comment '总体市场销量',
48 | car_series_market_sales int not null comment '车系市场销量',
49 | record_date date not null comment '记录日期'
50 | )
51 | comment '存储市场销量信息' collate = utf8mb4_unicode_520_ci;
52 |
53 | create table market_share
54 | (
55 | id int unsigned auto_increment comment '唯一标识符'
56 | primary key,
57 | car_series varchar(50) not null comment '车系名称',
58 | market_share_percentage decimal(5, 2) not null comment '市场占有率百分比',
59 | record_date date not null comment '记录日期'
60 | )
61 | comment '存储车系市场占有率变化' collate = utf8mb4_unicode_520_ci;
62 |
63 | create table order_stats
64 | (
65 | id int unsigned auto_increment comment '唯一标识符'
66 | primary key,
67 | car_series varchar(50) not null comment '车系名称',
68 | region varchar(50) not null comment '大区',
69 | order_quantity int not null comment '订单数量',
70 | large_order_quantity int not null comment '大定数量',
71 | locked_order_quantity int not null comment '锁单数量',
72 | retained_large_order_quantity int not null comment '留存大定数量'
73 | )
74 | comment '存储订单统计数据' collate = utf8mb4_unicode_520_ci;
75 |
76 | create table policies
77 | (
78 | id int unsigned auto_increment comment '唯一标识符'
79 | primary key,
80 | policy_name varchar(100) not null comment '政策名称',
81 | description text null comment '政策描述',
82 | type varchar(50) not null comment '车系类型',
83 | effective_date date not null comment '生效日期',
84 | expiry_date date null comment '失效日期'
85 | )
86 | comment '存储国家及地方汽车产业政策' collate = utf8mb4_unicode_520_ci;
87 |
88 | create table sales
89 | (
90 | id int unsigned auto_increment comment '唯一标识符'
91 | primary key,
92 | car_series varchar(50) not null comment '车系名称',
93 | region varchar(50) not null comment '大区',
94 | quantity int not null comment '销量数量',
95 | sale_date date not null comment '销售日期',
96 | series_type varchar(50) not null comment '车系类型'
97 | )
98 | comment '存储实际销量数据' collate = utf8mb4_unicode_520_ci;
99 |
100 | create table sales_targets
101 | (
102 | id int unsigned auto_increment comment '唯一标识符'
103 | primary key,
104 | car_series varchar(50) not null comment '车系名称',
105 | region varchar(50) not null comment '大区',
106 | monthly_target int not null comment '月度销量目标',
107 | yearly_target int not null comment '年度销量目标'
108 | )
109 | comment '存储各车系在各大区的销量目标' collate = utf8mb4_unicode_520_ci;
110 | """
--------------------------------------------------------------------------------
/langProBe/DB/__init__.py:
--------------------------------------------------------------------------------
1 | from langProBe.benchmark import BenchmarkMeta, MCPBench
2 | from .db_program import DBPredict
3 | from langProBe.evaluation_utils import mcp_metric
4 |
5 | MCP_SAMPLE_SYSTEM_PROMPT = """
6 | You are a helpful assistant. You are able to answer questions using different tools.
7 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools.
8 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server.
9 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool.
10 | The tool description includes:
11 | A brief text description outlining the functionality of the tool.
12 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter.
13 | """
14 |
15 | def get_mcp_sample_benchmark():
16 |     mcp_sample_baseline = DBPredict(
17 |         max_steps=5,
18 |         system_prompt=MCP_SAMPLE_SYSTEM_PROMPT,
19 |         task_name="database_search")
20 |
21 |     return [
22 |         BenchmarkMeta(
23 |             MCPBench,
24 |             [mcp_sample_baseline],
25 |             mcp_metric,
26 |             optimizers=[],
27 |             name="MCP_DB"
28 |         )
29 |     ]
30 |
31 | benchmark = get_mcp_sample_benchmark()
--------------------------------------------------------------------------------
/langProBe/DB/data/car_bi.jsonl:
--------------------------------------------------------------------------------
1 | {"unique_id": 2, "Prompt": "2025年2月19日记录的竞品名称是什么?", "Answer": "飞海科技科技有限公司"}
2 | {"unique_id": 4, "Prompt": "华泰通安网络有限公司的销量是多少?", "Answer": "30"}
3 | {"unique_id": 6, "Prompt": "诺依曼软件科技有限公司的记录日期是什么时候?", "Answer": "2025-01-05"}
4 | {"unique_id": 9, "Prompt": "东方峻景网络有限公司的市场占有率是多少?", "Answer": "9.06"}
5 | {"unique_id": 11, "Prompt": "西南区域中,系列D的订单数量是多少?", "Answer": "60"}
6 | {"unique_id": 12, "Prompt": "华北区域中,所有车系的大定数量总和是多少?", "Answer": "98"}
7 | {"unique_id": 13, "Prompt": "华南区域中,首次到店客流量最高的门店是哪个?", "Answer": "帅县店"}
8 | {"unique_id": 14, "Prompt": "华东区域中,成交率最低的门店是哪个?", "Answer": "强市店"}
9 | {"unique_id": 15, "Prompt": "西北区域中,总体市场销量最高的日期是哪一天?", "Answer": "2024-01-16"}
10 | {"unique_id": 16, "Prompt": "2024年12月,华南区域的总客流量是多少?", "Answer": "1168"}
11 | {"unique_id": 17, "Prompt": "锁单数量大于10的车系有哪些?", "Answer": "['系列C']"}
12 | {"unique_id": 18, "Prompt": "在2025年2月,华南区域的总订单数量是多少?", "Answer": "0"}
13 | {"unique_id": 19, "Prompt": "留存大定数量最多的车系是哪个?", "Answer": "系列C"}
14 | {"unique_id": 20, "Prompt": "系列A在华东区域的市场占有率是多少?", "Answer": "21.41%"}
15 | {"unique_id": 22, "Prompt": "系列B在华东区域的月度销量目标是多少?", "Answer": "58"}
16 | {"unique_id": 23, "Prompt": "系列D在2025年2月19日的市场占有率是多少?", "Answer": "19.99%"}
17 | {"unique_id": 25, "Prompt": "系列D在华北区域的年度销量目标是多少?", "Answer": "1320"}
18 | {"unique_id": 28, "Prompt": "飞海科技科技有限公司在2025年2月19日的竞品销量是多少?", "Answer": "23"}
19 | {"unique_id": 31, "Prompt": "万迅电脑传媒有限公司的竞品市场占有率百分比是多少?", "Answer": "6.92"}
20 | {"unique_id": 33, "Prompt": "2024年12月30日,系列C在华南区域的销量是多少?", "Answer": "19"}
21 | {"unique_id": 36, "Prompt": "华东区域中燃油车的库存总数是多少?", "Answer": "700"}
22 | {"unique_id": 38, "Prompt": "华南区域中系列B的库存总数是多少?", "Answer": "533"}
23 | {"unique_id": 39, "Prompt": "仓库名称为'梧州市仓库'的库存总数是多少?", "Answer": "330"}
24 | {"unique_id": 40, "Prompt": "系列C在西南区域的库存总数是多少?", "Answer": "177"}
25 | {"unique_id": 44, "Prompt": "所有政策中,哪些政策的类型是‘燃油车’?", "Answer": "['燃油车新购补贴', '燃油车置换补贴']"}
26 | {"unique_id": 45, "Prompt": "最早生效的政策名称是什么?", "Answer": "新能源置换补贴"}
27 | {"unique_id": 46, "Prompt": "失效日期在2024年12月30日之后的政策有哪些?", "Answer": "['燃油车新购补贴', '燃油车置换补贴']"}
28 | {"unique_id": 47, "Prompt": "描述为‘新能源新购补贴’的政策的生效日期是什么时候?", "Answer": "2024-08-16"}
29 | {"unique_id": 48, "Prompt": "名称包含‘置换’的政策有哪些?", "Answer": "['燃油车置换补贴', '新能源置换补贴']"}
30 | {"unique_id": 49, "Prompt": "政策类型为‘新能源’且在2024年内生效的政策有哪些?", "Answer": "['新能源新购补贴', '新能源置换补贴']"}
31 | {"unique_id": 50, "Prompt": "哪条政策的有效期最长?", "Answer": "燃油车新购补贴"}
32 | {"unique_id": 51, "Prompt": "政策‘新能源新购补贴’是否已经失效?", "Answer": "True"}
33 | {"unique_id": 54, "Prompt": "车系市场销量最高的记录日期是哪一天?", "Answer": "2025-01-27"}
34 | {"unique_id": 55, "Prompt": "西南区域系列D的年度销量目标是多少?", "Answer": "1032"}
35 | {"unique_id": 56, "Prompt": "所有政策中,生效日期最早的是哪个政策?", "Answer": "新能源置换补贴"}
36 | {"unique_id": 58, "Prompt": "华东区域系列C的月度销量目标是多少?", "Answer": "97"}
37 | {"unique_id": 59, "Prompt": "哪些政策在2025年仍然有效?", "Answer": "燃油车新购补贴, 燃油车置换补贴"}
38 | {"unique_id": 60, "Prompt": "华北区域系列B的年度销量目标是多少?", "Answer": "2244"}
39 | {"unique_id": 61, "Prompt": "总体市场销量最低的记录日期是哪一天?", "Answer": "2025-02-11"}
40 | {"unique_id": 62, "Prompt": "华南区域系列A的月度销量目标是多少?", "Answer": "184"}
41 | {"unique_id": 63, "Prompt": "系列D在西南区域的库存总数是多少?", "Answer": "253"}
42 | {"unique_id": 64, "Prompt": "系列B在华北区域的总库存量是多少?", "Answer": "396"}
43 | {"unique_id": 65, "Prompt": "华东区域系列A的库存总量是多少?", "Answer": "374"}
44 | {"unique_id": 66, "Prompt": "华南区域系列C的库存总量是多少?", "Answer": "278"}
45 | {"unique_id": 68, "Prompt": "系列B的竞品市场占有率总和是多少?", "Answer": "23.17"}
46 | {"unique_id": 69, "Prompt": "系列A在西南区域的月度销量目标是多少?", "Answer": "57"}
47 | {"unique_id": 70, "Prompt": "系列C在华东区域的年度销量目标是多少?", "Answer": "1164"}
48 | {"unique_id": 71, "Prompt": "系列B在华南区域的库存总量是多少?", "Answer": "533"}
49 | {"unique_id": 72, "Prompt": "记录日期为2025-02-12的竞品销量总和是多少?", "Answer": "61"}
50 | {"unique_id": 74, "Prompt": "车系市场销量最高的记录日期是哪一天?", "Answer": "2025-01-27"}
51 | {"unique_id": 76, "Prompt": "政策‘燃油车新购补贴’的生效日期是什么时候?", "Answer": "2024-02-02"}
52 | {"unique_id": 77, "Prompt": "哪些政策在2025年仍然有效?", "Answer": "['燃油车新购补贴', '燃油车置换补贴']"}
53 | {"unique_id": 78, "Prompt": "总体市场销量最低的记录日期是哪一天?", "Answer": "2025-02-11"}
54 | {"unique_id": 79, "Prompt": "新能源相关的政策有哪些?", "Answer": "['新能源新购补贴', '新能源置换补贴']"}
55 | {"unique_id": 80, "Prompt": "2025年1月11日的车系市场销量是多少?", "Answer": "91"}
56 | {"unique_id": 81, "Prompt": "政策‘新能源新购补贴’的失效日期是什么时候?", "Answer": "2024-12-23"}
57 | {"unique_id": 84, "Prompt": "系列A在2025年2月7日的竞品销量是多少?", "Answer": "87"}
58 | {"unique_id": 85, "Prompt": "华东区域系列C的库存总数是多少?", "Answer": "355"}
59 | {"unique_id": 86, "Prompt": "系列B的竞品市场占有率最高的公司名称是什么?", "Answer": "华泰通安网络有限公司"}
60 | {"unique_id": 87, "Prompt": "2025年2月27日的车系市场销量是多少?", "Answer": "88"}
61 | {"unique_id": 88, "Prompt": "系列D在华北区域的库存总数是多少?", "Answer": "344"}
62 | {"unique_id": 90, "Prompt": "2025年1月20日的总体市场销量是多少?", "Answer": "742"}
63 | {"unique_id": 91, "Prompt": "系列B在华南区域的库存总数是多少?", "Answer": "533"}
64 | {"unique_id": 94, "Prompt": "系列A在华南区域的库存总数是多少?", "Answer": "562"}
65 | {"unique_id": 95, "Prompt": "新能源车的总库存数量是多少?", "Answer": "2385"}
66 | {"unique_id": 96, "Prompt": "哪个仓库的库存数量最多,数量是多少?", "Answer": "梧州县仓库, 297"}
67 | {"unique_id": 97, "Prompt": "华北区域中燃油车的库存总数是多少?", "Answer": "616"}
68 | {"unique_id": 98, "Prompt": "最后盘点时间在2025年1月的库存总数是多少?", "Answer": "1518"}
69 | {"unique_id": 99, "Prompt": "系列B在西南区域的库存总数是多少?", "Answer": "489"}
70 | {"unique_id": 100, "Prompt": "华东区域中新能源车的库存总数是多少?", "Answer": "959"}
71 | {"unique_id": 101, "Prompt": "系列C在华南区域的库存总数是多少?", "Answer": "278"}
72 | {"unique_id": 102, "Prompt": "2025年2月盘点的库存总数是多少?", "Answer": "2133"}
--------------------------------------------------------------------------------
/langProBe/DB/db_program.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 | import re
5 | import time
6 | import traceback
7 | from datetime import datetime
8 | from typing import List, Tuple, Optional
9 |
10 | import dspy
11 | from openai import OpenAI
12 |
13 | from langProBe.dspy_program import LangProBeDSPyMetaProgram
14 | import langProBe.constants as constants
15 |
16 | from langProBe.mcp_program import MCPPredict
17 | from langProBe.program_utils import (
18 |     call_lm,
19 |     build_init_messages,
20 |     build_messages,
21 |     response_parsing,
22 |     mcp_calling,
23 |     ProcessManager
24 | )
25 |
26 | MCP_SAMPLE_SYSTEM_PROMPT = """
27 | You are a helpful assistant. You are able to answer questions using different tools.
28 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools.
29 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server.
30 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool.
31 | The tool description includes:
32 | A brief text description outlining the functionality of the tool.
33 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter.
34 | """
35 |
36 | USER_PROMPT_SQL = """
37 | Here is the database schema
38 | {schema}
39 |
40 | Question:
41 | {question}
42 | """
43 |
44 | USER_PROMPT_NL = """
45 | Question:
46 | {question}
47 | """
48 |
49 | class DBPredict(MCPPredict):
50 |     def __init__(self, max_steps=5, system_prompt=MCP_SAMPLE_SYSTEM_PROMPT, task_name="database_search"):
51 |         super().__init__(max_steps, system_prompt, task_name)
52 |
53 |     def forward(self, **kwargs) -> dspy.Prediction:
54 |         unique_id = kwargs.get('id')
55 |         question = kwargs.get('question')
56 |         gt = kwargs.get('answer')
57 |
58 |         manager = ProcessManager()
59 |         manager.lm_api_key = self.lm.api_key
60 |         manager.lm_api_base = self.lm.api_base
61 |         manager.model = self.lm.model
62 |         manager.id = unique_id
63 |
64 |         self.run_logger.info(f"ID: {manager.id}, Starting forward pass for question: {question}")
65 |
66 |         from langProBe.evaluation import global_config
67 |         mcps = global_config['mcp_pool']
68 |
69 |         # With query_type "SQL" the full schema is inlined into the prompt;
70 |         # otherwise the question is asked in natural language only.
71 |         if global_config.get('query_type', 'NL') == 'SQL':
72 |             from .DB_utils.schema import SCHEMA
73 |             user_prompt = USER_PROMPT_SQL.format(schema=SCHEMA, question=question)
74 |         else:
75 |             user_prompt = USER_PROMPT_NL.format(question=question)
76 |
77 |         messages = build_init_messages(self.system_prompt, mcps, user_prompt)
78 |         steps = 0
79 |         all_completion_tokens = 0
80 |         all_prompt_tokens = 0
81 |         start_time = time.time()
82 |
83 |         # Call the LM, execute any MCP tool calls it requests, and append the
84 |         # results, until the assistant gives a final answer or the step budget runs out.
85 |         while not messages[-1][constants.ROLE] == constants.ASSISTANT and steps < self.max_steps:
86 |             response, completion_tokens, prompt_tokens = call_lm(messages, manager, self.run_logger)
87 |             all_completion_tokens += completion_tokens
88 |             all_prompt_tokens += prompt_tokens
89 |             mcp_calls = response_parsing(response)
90 |
91 |             new_messages = mcp_calling(mcp_calls, manager, self.run_logger)
92 |             messages = build_messages(messages, new_messages)
93 |             steps += 1
94 |
95 |         end_time = time.time()
96 |
97 |         if messages[-1][constants.ROLE] != constants.ASSISTANT:
98 |             self.run_logger.warning("Maximum steps reached without getting an answer")
99 |             messages.append({
100 |                 constants.ROLE: constants.ASSISTANT,
101 |                 constants.CONTENT: "Maximum number of steps exceeded; the question could not be resolved",
102 |             })
103 |
104 |         self.run_logger.info(f"ID: {manager.id}, Forward pass completed successfully")
105 |         success = self.evaluate_prediction(question, gt, messages[-1][constants.CONTENT])
106 |         self.log_messages(messages, question, success, (end_time - start_time), all_prompt_tokens,
107 |                           all_completion_tokens)
108 |         self.run_logger.info(f"ID: {manager.id}, Evaluation completed successfully")
109 |
110 |         return dspy.Prediction(
111 |             success=success,
112 |             question=question,
113 |             ground_truth=gt,
114 |             answer=messages[-1][constants.CONTENT],
115 |             trace=messages,
116 |             process_report=manager
117 |         )
118 |
--------------------------------------------------------------------------------
/langProBe/GAIA/__init__.py:
--------------------------------------------------------------------------------
1 | from langProBe.benchmark import BenchmarkMeta, MCPBench
2 | from langProBe.mcp_program import MCPPredict
3 | from langProBe.evaluation_utils import mcp_metric
4 | from .gaia_program import GAIAPredict
5 |
6 | MCP_SAMPLE_SYSTEM_PROMPT = """
7 | You are a helpful assistant. You are able to answer questions using different tools.
8 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools.
9 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server.
10 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool.
11 | The tool description includes:
12 | A brief text description outlining the functionality of the tool.
13 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter.
14 | If you have obtained the final result, please provide your final answer enclosed within tags. Ensure that only the final answer is included, without any additional explanations or commentary.
15 | """
16 | def get_mcp_sample_benchmark():
17 |     mcp_sample_baseline = GAIAPredict(
18 |         max_steps=50,
19 |         system_prompt=MCP_SAMPLE_SYSTEM_PROMPT,
20 |         task_name="gaia")
21 |
22 |     return [
23 |         BenchmarkMeta(
24 |             MCPBench,
25 |             [mcp_sample_baseline],
26 |             mcp_metric,
27 |             optimizers=[],
28 |             name="MCP_GAIA"
29 |         )
30 |     ]
31 |
32 | benchmark = get_mcp_sample_benchmark()
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/__init__.py
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/076c8171-9b3b-49b9-a477-244d2a532826.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/076c8171-9b3b-49b9-a477-244d2a532826.xlsx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/1f975693-876d-457b-a649-393859e79bf3.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/1f975693-876d-457b-a649-393859e79bf3.mp3
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/2b3ef98c-cc05-450b-a719-711aee40ac65.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/2b3ef98c-cc05-450b-a719-711aee40ac65.mp3
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/366e2f2b-8632-4ef2-81eb-bc3877489217.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/366e2f2b-8632-4ef2-81eb-bc3877489217.pdf
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt:
--------------------------------------------------------------------------------
1 | H H H
2 | --------------------------------
3 | H H H H
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/3da89939-209c-4086-8520-7eb734e6b4ef.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/3da89939-209c-4086-8520-7eb734e6b4ef.xlsx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/4d0aa727-86b1-406b-9b33-f870dd14a4a5.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/4d0aa727-86b1-406b-9b33-f870dd14a4a5.xlsx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.xlsx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/54612da3-fd56-4941-80f4-5eb82330de25.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/54612da3-fd56-4941-80f4-5eb82330de25.xlsx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/5b2a14e8-6e59-479c-80e3-4696e8980152.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/5b2a14e8-6e59-479c-80e3-4696e8980152.jpg
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/6359a0b1-8f7b-499b-9336-840f9ab90688.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/6359a0b1-8f7b-499b-9336-840f9ab90688.png
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/67e8878b-5cef-4375-804e-e6291fdbe78a.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/67e8878b-5cef-4375-804e-e6291fdbe78a.pdf
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.xlsx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/8d46b8d6-b38a-47ff-ac74-cda14cf2d19b.csv:
--------------------------------------------------------------------------------
1 | species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
2 | Adelie,Torgersen,39.1,18.7,181,3750,MALE
3 | Adelie,Torgersen,39.5,17.4,186,3800,FEMALE
4 | Adelie,Torgersen,40.3,18,195,3250,FEMALE
5 | Adelie,Torgersen,,,,,
6 | Adelie,Torgersen,36.7,19.3,193,3450,FEMALE
7 | Adelie,Torgersen,39.3,20.6,190,3650,MALE
8 | Adelie,Torgersen,38.9,17.8,181,3625,FEMALE
9 | Adelie,Torgersen,39.2,19.6,195,4675,MALE
10 | Adelie,Torgersen,34.1,18.1,193,3475,
11 | Adelie,Torgersen,42,20.2,190,4250,
12 | Adelie,Torgersen,37.8,17.1,186,3300,
13 | Adelie,Torgersen,37.8,17.3,180,3700,
14 | Adelie,Torgersen,41.1,17.6,182,3200,FEMALE
15 | Adelie,Torgersen,38.6,21.2,191,3800,MALE
16 | Adelie,Torgersen,34.6,21.1,198,4400,MALE
17 | Adelie,Torgersen,36.6,17.8,185,3700,FEMALE
18 | Adelie,Torgersen,38.7,19,195,3450,FEMALE
19 | Adelie,Torgersen,42.5,20.7,197,4500,MALE
20 | Adelie,Torgersen,34.4,18.4,184,3325,FEMALE
21 | Adelie,Torgersen,46,21.5,194,4200,MALE
22 | Adelie,Biscoe,37.8,18.3,174,3400,FEMALE
23 | Adelie,Biscoe,37.7,18.7,180,3600,MALE
24 | Adelie,Biscoe,35.9,19.2,189,3800,FEMALE
25 | Adelie,Biscoe,38.2,18.1,185,3950,MALE
26 | Adelie,Biscoe,38.8,17.2,180,3800,MALE
27 | Adelie,Biscoe,35.3,18.9,187,3800,FEMALE
28 | Adelie,Biscoe,40.6,18.6,183,3550,MALE
29 | Adelie,Biscoe,40.5,17.9,187,3200,FEMALE
30 | Adelie,Biscoe,37.9,18.6,172,3150,FEMALE
31 | Adelie,Biscoe,40.5,18.9,180,3950,MALE
32 | Adelie,Dream,39.5,16.7,178,3250,FEMALE
33 | Adelie,Dream,37.2,18.1,178,3900,MALE
34 | Adelie,Dream,39.5,17.8,188,3300,FEMALE
35 | Adelie,Dream,40.9,18.9,184,3900,MALE
36 | Adelie,Dream,36.4,17,195,3325,FEMALE
37 | Adelie,Dream,39.2,21.1,196,4150,MALE
38 | Adelie,Dream,38.8,20,190,3950,MALE
39 | Adelie,Dream,42.2,18.5,180,3550,FEMALE
40 | Adelie,Dream,37.6,19.3,181,3300,FEMALE
41 | Adelie,Dream,39.8,19.1,184,4650,MALE
42 | Adelie,Dream,36.5,18,182,3150,FEMALE
43 | Adelie,Dream,40.8,18.4,195,3900,MALE
44 | Adelie,Dream,36,18.5,186,3100,FEMALE
45 | Adelie,Dream,44.1,19.7,196,4400,MALE
46 | Adelie,Dream,37,16.9,185,3000,FEMALE
47 | Adelie,Dream,39.6,18.8,190,4600,MALE
48 | Adelie,Dream,41.1,19,182,3425,MALE
49 | Adelie,Dream,37.5,18.9,179,2975,
50 | Adelie,Dream,36,17.9,190,3450,FEMALE
51 | Adelie,Dream,42.3,21.2,191,4150,MALE
52 | Adelie,Biscoe,39.6,17.7,186,3500,FEMALE
53 | Adelie,Biscoe,40.1,18.9,188,4300,MALE
54 | Adelie,Biscoe,35,17.9,190,3450,FEMALE
55 | Adelie,Biscoe,42,19.5,200,4050,MALE
56 | Adelie,Biscoe,34.5,18.1,187,2900,FEMALE
57 | Adelie,Biscoe,41.4,18.6,191,3700,MALE
58 | Adelie,Biscoe,39,17.5,186,3550,FEMALE
59 | Adelie,Biscoe,40.6,18.8,193,3800,MALE
60 | Adelie,Biscoe,36.5,16.6,181,2850,FEMALE
61 | Adelie,Biscoe,37.6,19.1,194,3750,MALE
62 | Adelie,Biscoe,35.7,16.9,185,3150,FEMALE
63 | Adelie,Biscoe,41.3,21.1,195,4400,MALE
64 | Adelie,Biscoe,37.6,17,185,3600,FEMALE
65 | Adelie,Biscoe,41.1,18.2,192,4050,MALE
66 | Adelie,Biscoe,36.4,17.1,184,2850,FEMALE
67 | Adelie,Biscoe,41.6,18,192,3950,MALE
68 | Adelie,Biscoe,35.5,16.2,195,3350,FEMALE
69 | Adelie,Biscoe,41.1,19.1,188,4100,MALE
70 | Adelie,Torgersen,35.9,16.6,190,3050,FEMALE
71 | Adelie,Torgersen,41.8,19.4,198,4450,MALE
72 | Adelie,Torgersen,33.5,19,190,3600,FEMALE
73 | Adelie,Torgersen,39.7,18.4,190,3900,MALE
74 | Adelie,Torgersen,39.6,17.2,196,3550,FEMALE
75 | Adelie,Torgersen,45.8,18.9,197,4150,MALE
76 | Adelie,Torgersen,35.5,17.5,190,3700,FEMALE
77 | Adelie,Torgersen,42.8,18.5,195,4250,MALE
78 | Adelie,Torgersen,40.9,16.8,191,3700,FEMALE
79 | Adelie,Torgersen,37.2,19.4,184,3900,MALE
80 | Adelie,Torgersen,36.2,16.1,187,3550,FEMALE
81 | Adelie,Torgersen,42.1,19.1,195,4000,MALE
82 | Adelie,Torgersen,34.6,17.2,189,3200,FEMALE
83 | Adelie,Torgersen,42.9,17.6,196,4700,MALE
84 | Adelie,Torgersen,36.7,18.8,187,3800,FEMALE
85 | Adelie,Torgersen,35.1,19.4,193,4200,MALE
86 | Adelie,Dream,37.3,17.8,191,3350,FEMALE
87 | Adelie,Dream,41.3,20.3,194,3550,MALE
88 | Adelie,Dream,36.3,19.5,190,3800,MALE
89 | Adelie,Dream,36.9,18.6,189,3500,FEMALE
90 | Adelie,Dream,38.3,19.2,189,3950,MALE
91 | Adelie,Dream,38.9,18.8,190,3600,FEMALE
92 | Adelie,Dream,35.7,18,202,3550,FEMALE
93 | Adelie,Dream,41.1,18.1,205,4300,MALE
94 | Adelie,Dream,34,17.1,185,3400,FEMALE
95 | Adelie,Dream,39.6,18.1,186,4450,MALE
96 | Adelie,Dream,36.2,17.3,187,3300,FEMALE
97 | Adelie,Dream,40.8,18.9,208,4300,MALE
98 | Adelie,Dream,38.1,18.6,190,3700,FEMALE
99 | Adelie,Dream,40.3,18.5,196,4350,MALE
100 | Adelie,Dream,33.1,16.1,178,2900,FEMALE
101 | Adelie,Dream,43.2,18.5,192,4100,MALE
102 | Adelie,Biscoe,35,17.9,192,3725,FEMALE
103 | Adelie,Biscoe,41,20,203,4725,MALE
104 | Adelie,Biscoe,37.7,16,183,3075,FEMALE
105 | Adelie,Biscoe,37.8,20,190,4250,MALE
106 | Adelie,Biscoe,37.9,18.6,193,2925,FEMALE
107 | Adelie,Biscoe,39.7,18.9,184,3550,MALE
108 | Adelie,Biscoe,38.6,17.2,199,3750,FEMALE
109 | Adelie,Biscoe,38.2,20,190,3900,MALE
110 | Adelie,Biscoe,38.1,17,181,3175,FEMALE
111 | Adelie,Biscoe,43.2,19,197,4775,MALE
112 | Adelie,Biscoe,38.1,16.5,198,3825,FEMALE
113 | Adelie,Biscoe,45.6,20.3,191,4600,MALE
114 | Adelie,Biscoe,39.7,17.7,193,3200,FEMALE
115 | Adelie,Biscoe,42.2,19.5,197,4275,MALE
116 | Adelie,Biscoe,39.6,20.7,191,3900,FEMALE
117 | Adelie,Biscoe,42.7,18.3,196,4075,MALE
118 | Adelie,Torgersen,38.6,17,188,2900,FEMALE
119 | Adelie,Torgersen,37.3,20.5,199,3775,MALE
120 | Adelie,Torgersen,35.7,17,189,3350,FEMALE
121 | Adelie,Torgersen,41.1,18.6,189,3325,MALE
122 | Adelie,Torgersen,36.2,17.2,187,3150,FEMALE
123 | Adelie,Torgersen,37.7,19.8,198,3500,MALE
124 | Adelie,Torgersen,40.2,17,176,3450,FEMALE
125 | Adelie,Torgersen,41.4,18.5,202,3875,MALE
126 | Adelie,Torgersen,35.2,15.9,186,3050,FEMALE
127 | Adelie,Torgersen,40.6,19,199,4000,MALE
128 | Adelie,Torgersen,38.8,17.6,191,3275,FEMALE
129 | Adelie,Torgersen,41.5,18.3,195,4300,MALE
130 | Adelie,Torgersen,39,17.1,191,3050,FEMALE
131 | Adelie,Torgersen,44.1,18,210,4000,MALE
132 | Adelie,Torgersen,38.5,17.9,190,3325,FEMALE
133 | Adelie,Torgersen,43.1,19.2,197,3500,MALE
134 | Adelie,Dream,36.8,18.5,193,3500,FEMALE
135 | Adelie,Dream,37.5,18.5,199,4475,MALE
136 | Adelie,Dream,38.1,17.6,187,3425,FEMALE
137 | Adelie,Dream,41.1,17.5,190,3900,MALE
138 | Adelie,Dream,35.6,17.5,191,3175,FEMALE
139 | Adelie,Dream,40.2,20.1,200,3975,MALE
140 | Adelie,Dream,37,16.5,185,3400,FEMALE
141 | Adelie,Dream,39.7,17.9,193,4250,MALE
142 | Adelie,Dream,40.2,17.1,193,3400,FEMALE
143 | Adelie,Dream,40.6,17.2,187,3475,MALE
144 | Adelie,Dream,32.1,15.5,188,3050,FEMALE
145 | Adelie,Dream,40.7,17,190,3725,MALE
146 | Adelie,Dream,37.3,16.8,192,3000,FEMALE
147 | Adelie,Dream,39,18.7,185,3650,MALE
148 | Adelie,Dream,39.2,18.6,190,4250,MALE
149 | Adelie,Dream,36.6,18.4,184,3475,FEMALE
150 | Adelie,Dream,36,17.8,195,3450,FEMALE
151 | Adelie,Dream,37.8,18.1,193,3750,MALE
152 | Adelie,Dream,36,17.1,187,3700,FEMALE
153 | Adelie,Dream,41.5,18.5,201,4000,MALE
154 | Chinstrap,Dream,46.5,17.9,192,3500,FEMALE
155 | Chinstrap,Dream,50,19.5,196,3900,MALE
156 | Chinstrap,Dream,51.3,19.2,193,3650,MALE
157 | Chinstrap,Dream,45.4,18.7,188,3525,FEMALE
158 | Chinstrap,Dream,52.7,19.8,197,3725,MALE
159 | Chinstrap,Dream,45.2,17.8,198,3950,FEMALE
160 | Chinstrap,Dream,46.1,18.2,178,3250,FEMALE
161 | Chinstrap,Dream,51.3,18.2,197,3750,MALE
162 | Chinstrap,Dream,46,18.9,195,4150,FEMALE
163 | Chinstrap,Dream,51.3,19.9,198,3700,MALE
164 | Chinstrap,Dream,46.6,17.8,193,3800,FEMALE
165 | Chinstrap,Dream,51.7,20.3,194,3775,MALE
166 | Chinstrap,Dream,47,17.3,185,3700,FEMALE
167 | Chinstrap,Dream,52,18.1,201,4050,MALE
168 | Chinstrap,Dream,45.9,17.1,190,3575,FEMALE
169 | Chinstrap,Dream,50.5,19.6,201,4050,MALE
170 | Chinstrap,Dream,50.3,20,197,3300,MALE
171 | Chinstrap,Dream,58,17.8,181,3700,FEMALE
172 | Chinstrap,Dream,46.4,18.6,190,3450,FEMALE
173 | Chinstrap,Dream,49.2,18.2,195,4400,MALE
174 | Chinstrap,Dream,42.4,17.3,181,3600,FEMALE
175 | Chinstrap,Dream,48.5,17.5,191,3400,MALE
176 | Chinstrap,Dream,43.2,16.6,187,2900,FEMALE
177 | Chinstrap,Dream,50.6,19.4,193,3800,MALE
178 | Chinstrap,Dream,46.7,17.9,195,3300,FEMALE
179 | Chinstrap,Dream,52,19,197,4150,MALE
180 | Chinstrap,Dream,50.5,18.4,200,3400,FEMALE
181 | Chinstrap,Dream,49.5,19,200,3800,MALE
182 | Chinstrap,Dream,46.4,17.8,191,3700,FEMALE
183 | Chinstrap,Dream,52.8,20,205,4550,MALE
184 | Chinstrap,Dream,40.9,16.6,187,3200,FEMALE
185 | Chinstrap,Dream,54.2,20.8,201,4300,MALE
186 | Chinstrap,Dream,42.5,16.7,187,3350,FEMALE
187 | Chinstrap,Dream,51,18.8,203,4100,MALE
188 | Chinstrap,Dream,49.7,18.6,195,3600,MALE
189 | Chinstrap,Dream,47.5,16.8,199,3900,FEMALE
190 | Chinstrap,Dream,47.6,18.3,195,3850,FEMALE
191 | Chinstrap,Dream,52,20.7,210,4800,MALE
192 | Chinstrap,Dream,46.9,16.6,192,2700,FEMALE
193 | Chinstrap,Dream,53.5,19.9,205,4500,MALE
194 | Chinstrap,Dream,49,19.5,210,3950,MALE
195 | Chinstrap,Dream,46.2,17.5,187,3650,FEMALE
196 | Chinstrap,Dream,50.9,19.1,196,3550,MALE
197 | Chinstrap,Dream,45.5,17,196,3500,FEMALE
198 | Chinstrap,Dream,50.9,17.9,196,3675,FEMALE
199 | Chinstrap,Dream,50.8,18.5,201,4450,MALE
200 | Chinstrap,Dream,50.1,17.9,190,3400,FEMALE
201 | Chinstrap,Dream,49,19.6,212,4300,MALE
202 | Chinstrap,Dream,51.5,18.7,187,3250,MALE
203 | Chinstrap,Dream,49.8,17.3,198,3675,FEMALE
204 | Chinstrap,Dream,48.1,16.4,199,3325,FEMALE
205 | Chinstrap,Dream,51.4,19,201,3950,MALE
206 | Chinstrap,Dream,45.7,17.3,193,3600,FEMALE
207 | Chinstrap,Dream,50.7,19.7,203,4050,MALE
208 | Chinstrap,Dream,42.5,17.3,187,3350,FEMALE
209 | Chinstrap,Dream,52.2,18.8,197,3450,MALE
210 | Chinstrap,Dream,45.2,16.6,191,3250,FEMALE
211 | Chinstrap,Dream,49.3,19.9,203,4050,MALE
212 | Chinstrap,Dream,50.2,18.8,202,3800,MALE
213 | Chinstrap,Dream,45.6,19.4,194,3525,FEMALE
214 | Chinstrap,Dream,51.9,19.5,206,3950,MALE
215 | Chinstrap,Dream,46.8,16.5,189,3650,FEMALE
216 | Chinstrap,Dream,45.7,17,195,3650,FEMALE
217 | Chinstrap,Dream,55.8,19.8,207,4000,MALE
218 | Chinstrap,Dream,43.5,18.1,202,3400,FEMALE
219 | Chinstrap,Dream,49.6,18.2,193,3775,MALE
220 | Chinstrap,Dream,50.8,19,210,4100,MALE
221 | Chinstrap,Dream,50.2,18.7,198,3775,FEMALE
222 | Gentoo,Biscoe,46.1,13.2,211,4500,FEMALE
223 | Gentoo,Biscoe,50,16.3,230,5700,MALE
224 | Gentoo,Biscoe,48.7,14.1,210,4450,FEMALE
225 | Gentoo,Biscoe,50,15.2,218,5700,MALE
226 | Gentoo,Biscoe,47.6,14.5,215,5400,MALE
227 | Gentoo,Biscoe,46.5,13.5,210,4550,FEMALE
228 | Gentoo,Biscoe,45.4,14.6,211,4800,FEMALE
229 | Gentoo,Biscoe,46.7,15.3,219,5200,MALE
230 | Gentoo,Biscoe,43.3,13.4,209,4400,FEMALE
231 | Gentoo,Biscoe,46.8,15.4,215,5150,MALE
232 | Gentoo,Biscoe,40.9,13.7,214,4650,FEMALE
233 | Gentoo,Biscoe,49,16.1,216,5550,MALE
234 | Gentoo,Biscoe,45.5,13.7,214,4650,FEMALE
235 | Gentoo,Biscoe,48.4,14.6,213,5850,MALE
236 | Gentoo,Biscoe,45.8,14.6,210,4200,FEMALE
237 | Gentoo,Biscoe,49.3,15.7,217,5850,MALE
238 | Gentoo,Biscoe,42,13.5,210,4150,FEMALE
239 | Gentoo,Biscoe,49.2,15.2,221,6300,MALE
240 | Gentoo,Biscoe,46.2,14.5,209,4800,FEMALE
241 | Gentoo,Biscoe,48.7,15.1,222,5350,MALE
242 | Gentoo,Biscoe,50.2,14.3,218,5700,MALE
243 | Gentoo,Biscoe,45.1,14.5,215,5000,FEMALE
244 | Gentoo,Biscoe,46.5,14.5,213,4400,FEMALE
245 | Gentoo,Biscoe,46.3,15.8,215,5050,MALE
246 | Gentoo,Biscoe,42.9,13.1,215,5000,FEMALE
247 | Gentoo,Biscoe,46.1,15.1,215,5100,MALE
248 | Gentoo,Biscoe,44.5,14.3,216,4100,
249 | Gentoo,Biscoe,47.8,15,215,5650,MALE
250 | Gentoo,Biscoe,48.2,14.3,210,4600,FEMALE
251 | Gentoo,Biscoe,50,15.3,220,5550,MALE
252 | Gentoo,Biscoe,47.3,15.3,222,5250,MALE
253 | Gentoo,Biscoe,42.8,14.2,209,4700,FEMALE
254 | Gentoo,Biscoe,45.1,14.5,207,5050,FEMALE
255 | Gentoo,Biscoe,59.6,17,230,6050,MALE
256 | Gentoo,Biscoe,49.1,14.8,220,5150,FEMALE
257 | Gentoo,Biscoe,48.4,16.3,220,5400,MALE
258 | Gentoo,Biscoe,42.6,13.7,213,4950,FEMALE
259 | Gentoo,Biscoe,44.4,17.3,219,5250,MALE
260 | Gentoo,Biscoe,44,13.6,208,4350,FEMALE
261 | Gentoo,Biscoe,48.7,15.7,208,5350,MALE
262 | Gentoo,Biscoe,42.7,13.7,208,3950,FEMALE
263 | Gentoo,Biscoe,49.6,16,225,5700,MALE
264 | Gentoo,Biscoe,45.3,13.7,210,4300,FEMALE
265 | Gentoo,Biscoe,49.6,15,216,4750,MALE
266 | Gentoo,Biscoe,50.5,15.9,222,5550,MALE
267 | Gentoo,Biscoe,43.6,13.9,217,4900,FEMALE
268 | Gentoo,Biscoe,45.5,13.9,210,4200,FEMALE
269 | Gentoo,Biscoe,50.5,15.9,225,5400,MALE
270 | Gentoo,Biscoe,44.9,13.3,213,5100,FEMALE
271 | Gentoo,Biscoe,45.2,15.8,215,5300,MALE
272 | Gentoo,Biscoe,46.6,14.2,210,4850,FEMALE
273 | Gentoo,Biscoe,48.5,14.1,220,5300,MALE
274 | Gentoo,Biscoe,45.1,14.4,210,4400,FEMALE
275 | Gentoo,Biscoe,50.1,15,225,5000,MALE
276 | Gentoo,Biscoe,46.5,14.4,217,4900,FEMALE
277 | Gentoo,Biscoe,45,15.4,220,5050,MALE
278 | Gentoo,Biscoe,43.8,13.9,208,4300,FEMALE
279 | Gentoo,Biscoe,45.5,15,220,5000,MALE
280 | Gentoo,Biscoe,43.2,14.5,208,4450,FEMALE
281 | Gentoo,Biscoe,50.4,15.3,224,5550,MALE
282 | Gentoo,Biscoe,45.3,13.8,208,4200,FEMALE
283 | Gentoo,Biscoe,46.2,14.9,221,5300,MALE
284 | Gentoo,Biscoe,45.7,13.9,214,4400,FEMALE
285 | Gentoo,Biscoe,54.3,15.7,231,5650,MALE
286 | Gentoo,Biscoe,45.8,14.2,219,4700,FEMALE
287 | Gentoo,Biscoe,49.8,16.8,230,5700,MALE
288 | Gentoo,Biscoe,46.2,14.4,214,4650,
289 | Gentoo,Biscoe,49.5,16.2,229,5800,MALE
290 | Gentoo,Biscoe,43.5,14.2,220,4700,FEMALE
291 | Gentoo,Biscoe,50.7,15,223,5550,MALE
292 | Gentoo,Biscoe,47.7,15,216,4750,FEMALE
293 | Gentoo,Biscoe,46.4,15.6,221,5000,MALE
294 | Gentoo,Biscoe,48.2,15.6,221,5100,MALE
295 | Gentoo,Biscoe,46.5,14.8,217,5200,FEMALE
296 | Gentoo,Biscoe,46.4,15,216,4700,FEMALE
297 | Gentoo,Biscoe,48.6,16,230,5800,MALE
298 | Gentoo,Biscoe,47.5,14.2,209,4600,FEMALE
299 | Gentoo,Biscoe,51.1,16.3,220,6000,MALE
300 | Gentoo,Biscoe,45.2,13.8,215,4750,FEMALE
301 | Gentoo,Biscoe,45.2,16.4,223,5950,MALE
302 | Gentoo,Biscoe,49.1,14.5,212,4625,FEMALE
303 | Gentoo,Biscoe,52.5,15.6,221,5450,MALE
304 | Gentoo,Biscoe,47.4,14.6,212,4725,FEMALE
305 | Gentoo,Biscoe,50,15.9,224,5350,MALE
306 | Gentoo,Biscoe,44.9,13.8,212,4750,FEMALE
307 | Gentoo,Biscoe,50.8,17.3,228,5600,MALE
308 | Gentoo,Biscoe,43.4,14.4,218,4600,FEMALE
309 | Gentoo,Biscoe,51.3,14.2,218,5300,MALE
310 | Gentoo,Biscoe,47.5,14,212,4875,FEMALE
311 | Gentoo,Biscoe,52.1,17,230,5550,MALE
312 | Gentoo,Biscoe,47.5,15,218,4950,FEMALE
313 | Gentoo,Biscoe,52.2,17.1,228,5400,MALE
314 | Gentoo,Biscoe,45.5,14.5,212,4750,FEMALE
315 | Gentoo,Biscoe,49.5,16.1,224,5650,MALE
316 | Gentoo,Biscoe,44.5,14.7,214,4850,FEMALE
317 | Gentoo,Biscoe,50.8,15.7,226,5200,MALE
318 | Gentoo,Biscoe,49.4,15.8,216,4925,MALE
319 | Gentoo,Biscoe,46.9,14.6,222,4875,FEMALE
320 | Gentoo,Biscoe,48.4,14.4,203,4625,FEMALE
321 | Gentoo,Biscoe,51.1,16.5,225,5250,MALE
322 | Gentoo,Biscoe,48.5,15,219,4850,FEMALE
323 | Gentoo,Biscoe,55.9,17,228,5600,MALE
324 | Gentoo,Biscoe,47.2,15.5,215,4975,FEMALE
325 | Gentoo,Biscoe,49.1,15,228,5500,MALE
326 | Gentoo,Biscoe,47.3,13.8,216,4725,
327 | Gentoo,Biscoe,46.8,16.1,215,5500,MALE
328 | Gentoo,Biscoe,41.7,14.7,210,4700,FEMALE
329 | Gentoo,Biscoe,53.4,15.8,219,5500,MALE
330 | Gentoo,Biscoe,43.3,14,208,4575,FEMALE
331 | Gentoo,Biscoe,48.1,15.1,209,5500,MALE
332 | Gentoo,Biscoe,50.5,15.2,216,5000,FEMALE
333 | Gentoo,Biscoe,49.8,15.9,229,5950,MALE
334 | Gentoo,Biscoe,43.5,15.2,213,4650,FEMALE
335 | Gentoo,Biscoe,51.5,16.3,230,5500,MALE
336 | Gentoo,Biscoe,46.2,14.1,217,4375,FEMALE
337 | Gentoo,Biscoe,55.1,16,230,5850,MALE
338 | Gentoo,Biscoe,44.5,15.7,217,4875,
339 | Gentoo,Biscoe,48.8,16.2,222,6000,MALE
340 | Gentoo,Biscoe,47.2,13.7,214,4925,FEMALE
341 | Gentoo,Biscoe,,,,,
342 | Gentoo,Biscoe,46.8,14.3,215,4850,FEMALE
343 | Gentoo,Biscoe,50.4,15.7,222,5750,MALE
344 | Gentoo,Biscoe,45.2,14.8,212,5200,FEMALE
345 | Gentoo,Biscoe,49.9,16.1,213,5400,MALE
346 |
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/8f80e01c-1296-4371-9486-bb3d68651a60.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/8f80e01c-1296-4371-9486-bb3d68651a60.png
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/9318445f-fe6a-4e1b-acbf-c68228c9906a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/9318445f-fe6a-4e1b-acbf-c68228c9906a.png
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/__init__.py
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/b2c257e0-3ad7-4f05-b8e3-d9da973be36e.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/b2c257e0-3ad7-4f05-b8e3-d9da973be36e.jpg
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/b7f857e4-d8aa-4387-af2a-0e844df5b9d8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/b7f857e4-d8aa-4387-af2a-0e844df5b9d8.png
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/bec74516-02fc-48dc-b202-55e78d0e17cf.jsonld:
--------------------------------------------------------------------------------
1 | {
2 | "@context": "http://schema.org",
3 | "@type": "Collection",
4 | "@id": "https://doi.org/10.5447/ipk/2022/29",
5 | "url": "https://doi.ipk-gatersleben.de:443/DOI/64fb788c-7495-4800-8568-fd562b07017e/fbda7260-8307-485e-a9b7-d84292e3eb04/2",
6 | "additionalType": "directory",
7 | "name": "GLOBAL STRATEGY FOR THE CONSERVATION OF POTATO",
8 | "author": {
9 | "name": "Manuela Nagel",
10 | "givenName": "Manuela",
11 | "familyName": "Nagel",
12 | "affiliation": {
13 | "@type": "Organization",
14 | "name": "Leibniz Institute of Plant Genetics and Crop Plant Research (IPK), Seeland OT Gatersleben, Corrensstraße 3, 06466, Germany"
15 | },
16 | "@id": "https://orcid.org/0000-0003-0396-0333"
17 | },
18 | "editor": [
19 | {
20 | "name": "Ehsan Dulloo",
21 | "givenName": "Ehsan",
22 | "familyName": "Dulloo",
23 | "affiliation": {
24 | "@type": "Organization",
25 | "name": "International Consultant, ,"
26 | },
27 | "contributorType": "Researcher"
28 | },
29 | {
30 | "name": "Prishnee Bissessur",
31 | "givenName": "Prishnee",
32 | "familyName": "Bissessur",
33 | "affiliation": {
34 | "@type": "Organization",
35 | "name": "International Consultant, ,"
36 | },
37 | "contributorType": "Researcher"
38 | },
39 | {
40 | "name": "Tatjana Gavrilenko",
41 | "givenName": "Tatjana",
42 | "familyName": "Gavrilenko",
43 | "affiliation": {
44 | "@type": "Organization",
45 | "name": "N.I. Vavilov All-Russian Institute of Plant Genetic Resources, , Russia"
46 | },
47 | "contributorType": "Researcher",
48 | "@id": "https://orcid.org/0000-0002-2605-6569"
49 | },
50 | {
51 | "name": "John Bamberg",
52 | "givenName": "John",
53 | "familyName": "Bamberg",
54 | "affiliation": {
55 | "@type": "Organization",
56 | "name": "U. S. Potato Genebank, , USA"
57 | },
58 | "contributorType": "Researcher",
59 | "@id": "https://orcid.org/0000-0001-6102-7846"
60 | },
61 | {
62 | "name": "David Ellis",
63 | "givenName": "David",
64 | "familyName": "Ellis",
65 | "affiliation": {
66 | "@type": "Organization",
67 | "name": "International Potato Center (CIP), , Peru"
68 | },
69 | "contributorType": "Researcher",
70 | "@id": "https://orcid.org/0000-0002-0209-2784"
71 | },
72 | {
73 | "name": "Peter Giovannini",
74 | "givenName": "Peter",
75 | "familyName": "Giovannini",
76 | "affiliation": {
77 | "@type": "Organization",
78 | "name": "Global Crop Diversity Trust, ,"
79 | },
80 | "contributorType": "Researcher",
81 | "@id": "https://orcid.org/0000-0002-1053-2030"
82 | }
83 | ],
84 | "description": "Cultivated potato, Solanum tuberosum ssp. tuberosum, is the third most consumed crop globally and important not only for food but also for for the animal feed, pharmaceutical, textile and paper industries. To gain an overview on the current state of the conservation and use of potato genetic resources, the Global Crop Diversity Trust (Crop Trust), commissioned an update of the ‘Global conservation strategy for potato genetic resources’. This updated strategy aims to support the efficiency and effectiveness of potato diversity conservation at national, regional and international levels, and to identify priorities for strengthening the conservation and use of potato genetic resources.",
85 | "keywords": "ex situ conservation, plant genetic resources, potato, Solanum tuberosum, global strategy, conservation strategy, wild potato, Andigenum group, Chilotanum group, native potato variety, genebank, accession, true potato seed, potato tuber, late blight",
86 | "inLanguage": "en",
87 | "contentSize": "0 B",
88 | "datePublished": "2022",
89 | "schemaVersion": "http://datacite.org/schema/kernel-4",
90 | "publisher": {
91 | "@type": "Organization",
92 | "name": "e!DAL - Plant Genomics and Phenomics Research Data Repository (PGP), IPK Gatersleben, Seeland OT Gatersleben, Corrensstraße 3, 06466, Germany"
93 | },
94 | "provider": {
95 | "@type": "Organization",
96 | "name": "datacite"
97 | }
98 | }
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/bfcd99e1-0690-4b53-a85c-0174a8629083.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/bfcd99e1-0690-4b53-a85c-0174a8629083.zip
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/c526d8d6-5987-4da9-b24c-83466fa172f3.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/c526d8d6-5987-4da9-b24c-83466fa172f3.xlsx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/cca70ce6-1952-45d2-acd4-80c903b0bc49.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/cca70ce6-1952-45d2-acd4-80c903b0bc49.png
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de.png
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/df6561b2-7ee5-4540-baab-5095f742716a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/df6561b2-7ee5-4540-baab-5095f742716a.png
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/edd4d4f2-1a58-45c4-b038-67337af4e029.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/edd4d4f2-1a58-45c4-b038-67337af4e029.xlsx
--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/f918266a-b3e0-4914-865d-4faa564f1aef.py:
--------------------------------------------------------------------------------
1 | from random import randint
2 | import time
3 |
4 | class UhOh(Exception):
5 | pass
6 |
7 | class Hmm:
8 | def __init__(self):
9 | self.value = randint(-100, 100)
10 |
11 | def Yeah(self):
12 | if self.value == 0:
13 | return True
14 | else:
15 | raise UhOh()
16 |
17 | def Okay():
18 | while True:
19 | yield Hmm()
20 |
21 | def keep_trying(go, first_try=True):
22 | maybe = next(go)
23 | try:
24 | if maybe.Yeah():
25 | return maybe.value
26 | except UhOh:
27 | if first_try:
28 | print("Working...")
29 | print("Please wait patiently...")
30 | time.sleep(0.1)
31 | return keep_trying(go, first_try=False)
32 |
33 | if __name__ == "__main__":
34 | go = Okay()
35 | print(f"{keep_trying(go)}")
36 |
--------------------------------------------------------------------------------
/langProBe/GAIA/data/GAIA.py:
--------------------------------------------------------------------------------
1 | """GAIA 2023 dataset."""
2 |
3 |
4 | import json
5 | import os
6 |
7 | import datasets
8 |
9 |
10 | _CITATION = """ """
11 |
12 | _DESCRIPTION = """ """
13 |
14 | _HOMEPAGE = ""
15 |
16 | _LICENSE = ""
17 |
18 | _NAMES = [
19 | "2023_all",
20 | "2023_level1",
21 | "2023_level2",
22 | "2023_level3",
23 | ]
24 |
25 | YEAR_TO_LEVELS = {"2023": [1, 2, 3]}
26 |
27 | separator = "_"
28 |
29 |
30 | class GAIA_dataset(datasets.GeneratorBasedBuilder):
31 | VERSION = datasets.Version("0.0.1")
32 |
33 | BUILDER_CONFIGS = [
34 | datasets.BuilderConfig(name=name, version=version, description=name)
35 | for name, version in zip(_NAMES, [VERSION] * len(_NAMES))
36 | ]
37 |
38 | def _info(self):
39 | features = datasets.Features(
40 | {
41 | "task_id": datasets.Value("string"),
42 | "Question": datasets.Value("string"),
43 | "Level": datasets.Value("string"),
44 | "Final answer": datasets.Value("string"), # ? for test values
45 | "file_name": datasets.Value("string"),
46 | "file_path": datasets.Value("string"), # generated here
47 | "Annotator Metadata": {k: datasets.Value("string") for k in ["Steps", "Number of steps", "How long did this take?", "Tools", "Number of tools"]} # "",
48 | }
49 | )
50 | return datasets.DatasetInfo(
51 | description=_DESCRIPTION,
52 | features=features,
53 | homepage=_HOMEPAGE,
54 | license=_LICENSE,
55 | citation=_CITATION,
56 | )
57 |
58 | def _split_generators(self, dl_manager):
59 | year, level_name = self.config.name.split(separator)
60 | if level_name == "all":
61 | levels = YEAR_TO_LEVELS[year]
62 | else:
63 | level_name = int(level_name.split("level")[1])
64 | levels = [level_name]
65 | print(year, level_name)
66 |
67 | output = []
68 | for split in ["test", "validation"]:
69 | root_file = dl_manager.download(os.path.join(year, split, "metadata.jsonl"))
70 | test_attached_files = {"": ""}
71 | with open(root_file, "r", encoding="utf-8") as f:
72 | for line in f:
73 | cur_line = json.loads(line)
74 | if cur_line["Level"] in levels and cur_line["file_name"] != "":
75 | attached_file_name = cur_line["file_name"]
76 | attached_file = dl_manager.download(os.path.join(year, split, attached_file_name))
77 | test_attached_files[attached_file_name] = attached_file
78 |
79 | output.append(
80 | datasets.SplitGenerator(
81 | name=getattr(datasets.Split, split.upper()),
82 | gen_kwargs={"root_file": root_file, "attached_files": test_attached_files, "levels": levels},
83 | )
84 | )
85 | return output
86 |
87 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
88 | def _generate_examples(self, root_file: str, attached_files: dict, levels: list[int]):
89 | with open(root_file, "r", encoding="utf-8") as f:
90 | for key, line in enumerate(f):
91 | cur_line = json.loads(line)
92 | if cur_line["Level"] in levels:
93 | cur_line["file_path"] = attached_files[cur_line["file_name"]]
94 | yield key, cur_line
95 |
96 |
97 |
--------------------------------------------------------------------------------
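The builder above follows the standard Hugging Face `datasets` script protocol: `_split_generators` downloads `metadata.jsonl` plus any attached files for the requested levels, and `_generate_examples` yields one record per metadata line. A minimal loading sketch (the local script path is an assumption based on this repository's layout; the config name comes from `_NAMES`):

```python
# Sketch only: load the GAIA builder defined in GAIA.py above.
import datasets

ds = datasets.load_dataset(
    "langProBe/GAIA/data/GAIA.py",  # assumed checkout-relative path to the script
    "2023_level1",                  # one of the configs declared in _NAMES
    trust_remote_code=True,         # newer `datasets` versions require this for script builders
)

for example in ds["validation"]:
    print(example["task_id"], example["Level"], example["file_name"])
    break
```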
/langProBe/GAIA/data/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | language:
3 | - en
4 | pretty_name: General AI Assistants Benchmark
5 | extra_gated_prompt: "To avoid contamination and data leakage, you agree to not reshare this dataset outside of a gated or private repository on the HF hub."
6 | extra_gated_fields:
7 | I agree to not reshare the GAIA submissions set according to the above conditions: checkbox
8 | ---
9 | # GAIA dataset
10 |
11 | GAIA is a benchmark which aims at evaluating next-generation LLMs (LLMs with augmented capabilities due to added tooling, efficient prompting, access to search, etc).
12 |
13 | We added gating to prevent bots from scraping the dataset. Please do not reshare the validation or test set in a crawlable format.
14 |
15 | ## Data and leaderboard
16 | GAIA is made of more than 450 non-trivial questions with unambiguous answers, requiring different levels of tooling and autonomy to solve. It is therefore divided into 3 levels, where level 1 should be breakable by very good LLMs, and level 3 indicates a strong jump in model capabilities. Each level is divided into a fully public dev set for validation, and a test set with private answers and metadata.
17 |
18 | GAIA leaderboard can be found in this space (https://huggingface.co/spaces/gaia-benchmark/leaderboard).
19 |
20 | Questions are contained in metadata.jsonl. Some questions come with an additional file, which can be found in the same folder and whose id is given in the field file_name.
21 |
22 | More details are available in [the paper](https://arxiv.org/abs/2311.12983) for now, and will soon be available here as well.
--------------------------------------------------------------------------------
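Given the layout the README describes, a minimal sketch of iterating the validation questions straight from `metadata.jsonl` and resolving the optional attachment named in `file_name` (the directory path is an assumption based on this repository's layout; the field names match the features declared in `GAIA.py` above):

```python
import json
from pathlib import Path

split_dir = Path("langProBe/GAIA/data/2023/validation")  # assumed path

with open(split_dir / "metadata.jsonl", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        # `file_name` is empty for questions without an attachment.
        attachment = split_dir / record["file_name"] if record["file_name"] else None
        print(record["task_id"], record["Level"], attachment)
```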
/langProBe/GAIA/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/__init__.py
--------------------------------------------------------------------------------
/langProBe/GAIA/data/statics.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | from collections import defaultdict
4 | 
5 | def parse_tools(tools_str):
6 |     """
7 |     Parse the Tools string and split it into a list of individual tools.
8 |     Assumes the Tools field has one tool per line, each starting with a number and a period, e.g.:
9 |     "1. Web browser
10 |     2. Image recognition tools (to identify and parse a figure with three axes)"
11 |     """
12 |     tools = []
13 |     # Match each tool entry with a regular expression
14 |     pattern = re.compile(r'\d+\.\s*(.*)')
15 |     for line in tools_str.split('\n'):
16 |         match = pattern.match(line.strip())
17 |         if match:
18 |             tool = match.group(1).strip()
19 |             # Strip any parenthetical notes
20 |             tool = re.sub(r'\s*\(.*\)', '', tool)
21 |             tools.append(tool)
22 |     return tools
23 | 
24 | def process_jsonl(file_path):
25 |     tool_counts = defaultdict(int)
26 |     total_tools = 0
27 |     tool_numbers = []
28 |     processed_tasks = 0
29 | 
30 |     with open(file_path, 'r', encoding='utf-8') as f:
31 |         for line_number, line in enumerate(f, 1):
32 |             line = line.strip()
33 |             if not line:
34 |                 continue  # Skip empty lines
35 |             # Debug info: confirm which line is being processed
36 |             print(f"Processing line {line_number}")
37 | 
38 |             try:
39 |                 data = json.loads(line)
40 |             except json.JSONDecodeError as e:
41 |                 print(f"Line {line_number}: JSON decode error: {e}")
42 |                 continue
43 | 
44 |             # Extract Annotator Metadata
45 |             annotator_metadata = data.get("Annotator Metadata", {})
46 |             if not annotator_metadata:
47 |                 print(f"Line {line_number}: 'Annotator Metadata' field not found.")
48 |                 continue
49 | 
50 |             number_of_tools = annotator_metadata.get("Number of tools")
51 |             tools_str = annotator_metadata.get("Tools", "")
52 | 
53 |             if number_of_tools is None:
54 |                 print(f"Line {line_number}: 'Number of tools' field not found.")
55 |             else:
56 |                 try:
57 |                     num_tools = int(number_of_tools)
58 |                     tool_numbers.append(num_tools)
59 |                 except ValueError:
60 |                     print(f"Line {line_number}: 'Number of tools' is not a valid integer.")
61 | 
62 |             if not tools_str:
63 |                 print(f"Line {line_number}: 'Tools' field is empty.")
64 |                 continue
65 | 
66 |             tools = parse_tools(tools_str)
67 |             print(f"Tools parsed from line {line_number}: {tools}")
68 |             print(f"Tool count on line {line_number}: {len(tools)}")
69 | 
70 |             # Verify that Number of tools matches the number of parsed tools
71 |             if number_of_tools:
72 |                 try:
73 |                     num_tools = int(number_of_tools)
74 |                     if num_tools != len(tools):
75 |                         print(f"Line {line_number}: Number of tools ({num_tools}) does not match the parsed tool count ({len(tools)}).")
76 |                 except ValueError:
77 |                     pass  # Already handled above
78 | 
79 |             # Count occurrences of each tool
80 |             for tool in tools:
81 |                 tool_counts[tool] += 1
82 |                 total_tools += 1
83 | 
84 |             processed_tasks += 1
85 | 
86 |     return tool_counts, tool_numbers, total_tools, processed_tasks
87 | 
88 | def main():
89 |     jsonl_file = '2023/validation/metadata.jsonl'  # Replace with the path to your JSONL file
90 |     tool_counts, tool_numbers, total_tools, processed_tasks = process_jsonl(jsonl_file)
91 | 
92 |     print("\nTotal occurrences of each tool:")
93 |     if not tool_counts:
94 |         print("No tools were counted. Check the file contents and parsing logic.")
95 |     else:
96 |         for tool, count in sorted(tool_counts.items(), key=lambda x: x[1], reverse=True):
97 |             print(f"{tool}: {count}")
98 | 
99 |     # Compute and report the average number of tools
100 |     if tool_numbers:
101 |         average_tools = sum(tool_numbers) / len(tool_numbers)
102 |         print(f"\nAverage number of tools per question: {average_tools:.2f}")
103 |     else:
104 |         print("\nNo 'Number of tools' data was collected.")
105 | 
106 |     print(f"\nTotal questions processed: {processed_tasks}")
107 |     print(f"Total tool count: {total_tools}")
108 | 
109 | if __name__ == "__main__":
110 |     main()
111 | 
--------------------------------------------------------------------------------
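A quick usage sketch for `parse_tools` above, fed the exact format its docstring describes (numbered entries, one per line, parenthetical notes stripped); the import path is an assumption based on this repository's package layout:

```python
from langProBe.GAIA.data.statics import parse_tools  # assumed importable path

tools_str = (
    "1. Web browser\n"
    "2. Image recognition tools (to identify and parse a figure with three axes)"
)
print(parse_tools(tools_str))
# ['Web browser', 'Image recognition tools']
```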
/langProBe/GAIA/gaia_program.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 | import re
5 | import time
6 | import traceback
7 | from datetime import datetime
8 | from typing import List, Tuple, Optional
9 | from langProBe.evaluation_utils import question_scorer
10 |
11 | from langProBe.mcp_program import MCPPredict
12 |
13 | import dspy
14 | from openai import OpenAI
15 |
16 | from langProBe.dspy_program import LangProBeDSPyMetaProgram
17 | import langProBe.constants as constants
18 |
19 | 
20 | from langProBe.program_utils import (
21 | call_lm,
22 | build_init_messages,
23 | build_messages,
24 | response_parsing,
25 | mcp_calling,
26 | ProcessManager
27 | )
28 |
29 | MCP_SAMPLE_SYSTEM_PROMPT = """
30 | You are a helpful assistant. You are able to answer questions using different tools.
31 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools.
32 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server.
33 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool.
34 | The tool description includes:
35 | A brief text description outlining the functionality of the tool.
36 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter.
37 | If you have obtained the final result, please provide your final answer enclosed within <answer></answer> tags. Ensure that only the final answer is included, without any additional explanations or commentary.
38 | """
39 |
40 | class GAIAPredict(MCPPredict):
41 | def __init__(self, max_steps=5, system_prompt=MCP_SAMPLE_SYSTEM_PROMPT, task_name="gaia"):
42 | super().__init__(max_steps, system_prompt, task_name)
43 |
44 | def evaluate_prediction(self, question: str, ground_truth: str, prediction: str) -> Tuple[bool, Optional[str]]:
45 | return question_scorer(prediction, ground_truth, self.run_logger)
46 |
47 | def extract_last_answer(self, text):
48 |         pattern = re.compile(r'<answer>(.*?)</answer>', re.DOTALL)
49 | matches = pattern.findall(text)
50 |
51 | if matches:
52 | return matches[-1]
53 | else:
54 | return None
55 |
56 | def forward(self, **kwargs) -> dspy.Prediction:
57 | unique_id = kwargs.get('id')
58 | question = kwargs.get('question')
59 | gt = kwargs.get('answer')
60 |
61 | manager = ProcessManager()
62 | manager.lm_api_key = self.lm.api_key
63 | manager.lm_api_base = self.lm.api_base
64 | manager.model = self.lm.model
65 | manager.id = unique_id
66 |
67 | self.run_logger.info(f"ID: {manager.id}, Starting forward pass for question: {question}")
68 |
69 | from langProBe.evaluation import global_config
70 | mcps = global_config['mcp_pool']
71 |
72 | messages = build_init_messages(self.system_prompt, mcps, question)
73 | steps = 0
74 | all_completion_tokens = 0
75 | all_prompt_tokens = 0
76 | start_time = time.time()
77 |
78 | while not messages[-1][constants.ROLE] == constants.ASSISTANT and steps < self.max_steps:
79 | response, completion_tokens, prompt_tokens = call_lm(messages, manager, self.run_logger)
80 | all_completion_tokens += completion_tokens
81 | all_prompt_tokens += prompt_tokens
82 | mcp_calls = response_parsing(response)
83 |
84 | new_messages = mcp_calling(mcp_calls, manager, self.run_logger)
85 | messages = build_messages(messages, new_messages)
86 | steps += 1
87 |
88 | end_time = time.time()
89 |
90 | if messages[-1][constants.ROLE] != constants.ASSISTANT:
91 | self.run_logger.warning("Maximum steps reached without getting an answer")
92 | messages.append({
93 | constants.ROLE: constants.ASSISTANT,
94 |                 constants.CONTENT: "Maximum number of steps exceeded; this question could not be solved.",
95 | })
96 |
97 | self.run_logger.info(f"ID: {manager.id}, Forward pass completed successfully")
98 | success = self.evaluate_prediction(question, gt, self.extract_last_answer(messages[-1][constants.CONTENT]))
99 | self.log_messages(messages, question, success, (end_time - start_time), all_prompt_tokens,
100 | all_completion_tokens)
101 | self.run_logger.info(f"ID: {manager.id}, Evaluation completed successfully")
102 |
103 | return dspy.Prediction(
104 | success=success,
105 | question=question,
106 | ground_truth=gt,
107 | answer=messages[-1][constants.CONTENT],
108 | trace=messages,
109 | process_report=manager
110 | )
--------------------------------------------------------------------------------
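`extract_last_answer` above keeps only the last tagged span of the model's output. A standalone sketch of the same pattern; note that the `<answer></answer>` tag format is an inference from the system prompt's "enclosed within ... tags" instruction, not independently documented:

```python
import re

# Same pattern as GAIAPredict.extract_last_answer above.
pattern = re.compile(r'<answer>(.*?)</answer>', re.DOTALL)

text = (
    "Intermediate reasoning...\n"
    "<answer>42</answer>\n"
    "Revised after another tool call:\n"
    "<answer>Paris</answer>"
)

matches = pattern.findall(text)
print(matches[-1] if matches else None)  # prints: Paris
```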
/langProBe/WebSearch/__init__.py:
--------------------------------------------------------------------------------
1 | from langProBe.benchmark import BenchmarkMeta, MCPBench
2 | from langProBe.mcp_program import MCPPredict
3 | from langProBe.evaluation_utils import mcp_metric
4 |
5 | MCP_SAMPLE_SYSTEM_PROMPT = """
6 | You are a helpful assistant. You are able to answer questions using different tools.
7 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools.
8 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server.
9 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool.
10 | The tool description includes:
11 | A brief text description outlining the functionality of the tool.
12 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter.
13 | """
14 |
15 | def get_mcp_sample_benchmark():
16 | mcp_sample_baseline = MCPPredict(
17 | max_steps=5,
18 | system_prompt=MCP_SAMPLE_SYSTEM_PROMPT,
19 | task_name="websearch")
20 |
21 | return [
22 | BenchmarkMeta(
23 | MCPBench,
24 | [mcp_sample_baseline],
25 | mcp_metric,
26 | optimizers=[],
27 |             name="MCP_WEBSEARCH"  # set an explicit benchmark name
28 | )
29 | ]
30 |
31 | benchmark = get_mcp_sample_benchmark()
--------------------------------------------------------------------------------
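A minimal sketch (not part of the repository) of how a `BenchmarkMeta` exported here pairs with `MCPBench` and `EvaluateBench` from `langProBe/benchmark.py` below; the dataset path, model identifier, and credentials are placeholders:

```python
from langProBe.WebSearch import benchmark as websearch_benchmarks
from langProBe.benchmark import EvaluateBench, MCPBench

meta = websearch_benchmarks[0]

bench = MCPBench(
    dataset_mode="test",  # tiny debug split; see dataset_size in benchmark.py
    dataset_path="langProBe/WebSearch/data/websearch_300.jsonl",
)

evaluator = EvaluateBench(
    benchmark=bench,
    program=meta.program[0],        # the MCPPredict baseline configured above
    metric=meta.metric,
    lm="openai/qwen-max",           # placeholder model identifier
    benchmark_name=meta.name,
    api_key="sk-...",               # placeholder credentials
    api_base="https://example.com/v1",
)
```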
/langProBe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/__init__.py
--------------------------------------------------------------------------------
/langProBe/analysis.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 |
3 | import pandas as pd
4 |
5 |
6 | def read_evaluation_results(dir: str):
7 | # Define the path to the directory
8 | file_path = pathlib.Path(dir)
9 |
10 | # List all .txt files in the directory
11 | all_result_files = list(file_path.rglob("*.txt"))
12 |
13 | # Initialize a list to store the extracted data
14 | extracted_data = []
15 |
16 | # Process each file
17 | for file in all_result_files:
18 |         # Split the filename stem to recover the benchmark and program names
19 | file_name_parts = file.stem.split("_")
20 | if len(file_name_parts) >= 3:
21 | benchmark = ''.join(file_name_parts[:-1])
22 | program = file_name_parts[-1]
23 | else:
24 | raise ValueError(f"Invalid file name: {file.name}")
25 |
26 | with open(file, "r") as f:
27 | lines = f.readlines()
28 |
29 | # Extract information from the lines
30 | if len(lines) == 2: # Checking if we have 2 lines
31 | header = lines[0].strip()
32 | values = lines[1].strip().split(",")
33 |
34 |             # Check whether an optimizer is recorded in the header; the
35 |             # extracted fields are currently identical in both cases, so
36 |             # the two branches differ only in intent.
37 |             if "optimizer" in header:
38 |                 data = {
39 |                     "file_name": file.name,
40 |                     "benchmark": benchmark,
41 |                     "program": program,
42 |                     "score": float(values[0]),
43 |                     "cost": float(values[1]),
44 |                     "input_tokens": int(values[2]),
45 |                     "output_tokens": int(values[3]),
46 |                 }
47 |             else:
48 |                 data = {
49 |                     "file_name": file.name,
50 |                     "benchmark": benchmark,
51 |                     "program": program,
52 |                     "score": float(values[0]),
53 |                     "cost": float(values[1]),
54 |                     "input_tokens": int(values[2]),
55 |                     "output_tokens": int(values[3]),
56 |                 }
57 |
58 | # Append the extracted data to the list
59 | extracted_data.append(data)
60 |
61 | # Convert the list of dictionaries to a pandas DataFrame
62 | # import pdb; pdb.set_trace()
63 | df = pd.DataFrame(extracted_data)
64 | df = canonicalize_program(df)
65 | return df
66 |
67 |
68 | program_mapping = {
69 | "AppWorldReact": "ReActBaseline",
70 | "AppWorldReactAugumented": "ReActAugumented",
71 | "Predict": "Predict",
72 | "ChainOfThought": "CoT",
73 | "GeneratorCriticRanker": "GeneratorCriticRanker",
74 | "GeneratorCriticFuser": "GeneratorCriticFuser",
75 | "RAG": "RAG",
76 | "EvaluationValidityPredict": "Predict",
77 | "EvaluationValidityModule": "CoT",
78 | "CoT": "CoT",
79 | "Classify": "CoTBasedVote",
80 | "HeartDiseaseClassify": "CoTBasedVote",
81 | "RetrieveMultiHop": "RetrieveMultiHop",
82 | "SimplifiedBaleen": "SimplifiedBaleen",
83 | "SimplifiedBaleenWithHandwrittenInstructions": "SimplifiedBaleenWithInst",
84 | "UnderspecifiedAnnotationCoT": "CoT",
85 | "UnderspecifiedAnnotationGeneratorCriticFuser": "GeneratorCriticFuser",
86 | "UnderspecifiedAnnotationGeneratorCriticRanker": "GeneratorCriticRanker",
87 | "EvaluationValidityGeneratorCriticRanker": "GeneratorCriticRanker",
88 | "EvaluationValidityGeneratorCriticFuser": "GeneratorCriticFuser",
89 | "UnderspecifiedAnnotationPredict": "Predict",
90 | "EvaluationValidityCoT": "CoT",
91 | 
92 | # Relook at the following programs
93 | "IReRaCOT": "CoT",
94 | "IReRaPredict": "Predict",
95 | "Infer": "CoT",
96 | "InferRetrieve": "RAG",
97 | "IReRaRetrieve": "RAG",
98 | "IReRaRetrieveRank": "RAGBasedRank",
99 | "InferRetrieveRank": "RAGBasedRank",
100 | "HoverMultiHopPredict": "Predict",
101 | "HoverMultiHop": "MultiHopSummarize",
102 | }
103 |
104 |
105 | def canonicalize_program(data_df):
106 | # Update the benchmark names based on the program
107 | data_df.loc[
108 | data_df["program"].isin(
109 | [
110 | "UnderspecifiedAnnotationCoT",
111 | "UnderspecifiedAnnotationPredict",
112 | "UnderspecifiedAnnotationGeneratorCriticFuser",
113 | "UnderspecifiedAnnotationGeneratorCriticRanker",
114 | ]
115 | ),
116 | "benchmark",
117 | ] = "SWEBenchUnderspecified"
118 |
119 | data_df.loc[
120 | data_df["program"].isin(
121 | [
122 | "EvaluationValidityCoT",
123 | "EvaluationValidityPredict",
124 | "EvaluationValidityGeneratorCriticFuser",
125 | "EvaluationValidityGeneratorCriticRanker",
126 | ]
127 | ),
128 | "benchmark",
129 | ] = "SWEBenchValidity"
130 | data_df["program"] = data_df["program"].replace(program_mapping)
131 | data_df["benchmark"] = data_df["benchmark"].apply(lambda x: x.replace("Bench", ""))
132 | return data_df
133 |
--------------------------------------------------------------------------------
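`read_evaluation_results` above expects each result file to contain exactly two lines (a header and one CSV row of values) and a filename stem with at least three `_`-separated parts. A minimal sketch of writing a file it can parse (the directory and numbers are illustrative):

```python
from pathlib import Path

out = Path("evaluation_results")  # illustrative directory
out.mkdir(exist_ok=True)

# The stem "MCP_WEBSEARCH_MCPPredict" splits into three parts, so the parser
# reads benchmark="MCPWEBSEARCH" (leading parts joined) and program="MCPPredict".
(out / "MCP_WEBSEARCH_MCPPredict.txt").write_text(
    "score,cost,input_tokens,output_tokens\n"
    "0.62,0.0,123456,7890\n"
)

# df = read_evaluation_results("evaluation_results")
```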
/langProBe/async_mcp_client.py:
--------------------------------------------------------------------------------
1 | from contextlib import AsyncExitStack
2 | from typing import Optional
3 |
4 | from anthropic import Anthropic
5 | from mcp import ClientSession
6 | from mcp.client.sse import sse_client
7 |
8 |
9 | class AsyncMCPClient:
10 |
11 | def __init__(self):
12 | # Initialize session and client objects
13 | self.session: Optional[ClientSession] = None
14 | self.exit_stack = AsyncExitStack()
15 | self.anthropic = Anthropic()
16 |
17 | async def connect_to_sse_server(self, server_url: str):
18 | """Connect to an MCP server running with SSE transport"""
19 | # Store the context managers so they stay alive
20 | self._streams_context = sse_client(url=server_url)
21 | streams = await self._streams_context.__aenter__()
22 |
23 | self._session_context = ClientSession(*streams)
24 | self.session: ClientSession = await self._session_context.__aenter__()
25 |
26 | # Initialize
27 | await self.session.initialize()
28 |
29 | # List available tools to verify connection
30 | # print("Initialized SSE client...")
31 | # print("Listing tools...")
32 | response = await self.session.list_tools()
33 | tools = response.tools
34 | # print("\nConnected to server with tools:", [tool.name for tool in tools])
35 |
36 | async def cleanup(self):
37 | """Properly clean up the session and streams"""
38 |         if getattr(self, "_session_context", None):
39 |             await self._session_context.__aexit__(None, None, None)
40 |         if getattr(self, "_streams_context", None):
41 |             await self._streams_context.__aexit__(None, None, None)
42 |
43 | async def call_tool(self, tool_name: str, tool_args: dict) -> dict:
44 | """Call a tool with the given arguments"""
45 | result = await self.session.call_tool(tool_name, tool_args)
46 | return result
47 |
48 | async def list_tools(self):
49 | """List available tools"""
50 | response = await self.session.list_tools()
51 | return response
52 |
53 | async def get_prompt(self, *args, **kwargs):
54 | response = await self.session.get_prompt(*args, **kwargs)
55 | return response
56 |
57 | async def list_prompts(self):
58 | response = await self.session.list_prompts()
59 | return response
60 |
61 | async def list_resources(self):
62 | response = await self.session.list_resources()
63 | return response
64 |
65 | async def read_resource(self, *args, **kwargs):
66 | response = await self.session.read_resource(*args, **kwargs)
67 | return response
68 |
69 | async def process_query(self, query: str) -> str:
70 | """Process a query using Claude and available tools"""
71 | messages = [
72 | {
73 | "role": "user",
74 | "content": query
75 | }
76 | ]
77 |
78 | response = await self.session.list_tools()
79 | available_tools = [{
80 | "name": tool.name,
81 | "description": tool.description,
82 | "input_schema": tool.inputSchema
83 | } for tool in response.tools]
84 |
85 | # Initial Claude API call
86 | response = self.anthropic.messages.create(
87 | model="claude-3-5-sonnet-20241022",
88 | max_tokens=1000,
89 | messages=messages,
90 | tools=available_tools
91 | )
92 |
93 | # Process response and handle tool calls
94 | tool_results = []
95 | final_text = []
96 |
97 | for content in response.content:
98 | if content.type == 'text':
99 | final_text.append(content.text)
100 | elif content.type == 'tool_use':
101 | tool_name = content.name
102 | tool_args = content.input
103 |
104 | # Execute tool call
105 | result = await self.session.call_tool(tool_name, tool_args)
106 | tool_results.append({"call": tool_name, "result": result})
107 | final_text.append(f"[Calling tool {tool_name} with args {tool_args}]")
108 |
109 | # Continue conversation with tool results
110 | if hasattr(content, 'text') and content.text:
111 | messages.append({
112 | "role": "assistant",
113 | "content": content.text
114 | })
115 | messages.append({
116 | "role": "user",
117 | "content": result.content
118 | })
119 |
120 | # Get next response from Claude
121 | response = self.anthropic.messages.create(
122 | model="claude-3-5-sonnet-20241022",
123 | max_tokens=1000,
124 | messages=messages,
125 | )
126 |
127 | final_text.append(response.content[0].text)
128 |
129 | return "\n".join(final_text)
130 |
131 | async def chat_loop(self):
132 | """Run an interactive chat loop"""
133 | # print("\nMCP Client Started!")
134 | # print("Type your queries or 'quit' to exit.")
135 |
136 | while True:
137 | try:
138 | query = input("\nQuery: ").strip()
139 |
140 | if query.lower() == 'quit':
141 | break
142 |
143 | response = await self.process_query(query)
144 | print("\n" + response)
145 |
146 | except Exception as e:
147 | print(f"\nError: {str(e)}")
148 |
149 | # async def main():
150 | # client = AsyncMCPClient()
151 | # try:
152 | # await client.connect_to_sse_server(server_url="http://localhost:8080/sse")
153 | # result = await client.call_tool("get_alerts", {"state": "CA"})
154 | # print(result)
155 | # finally:
156 | # await client.cleanup()
157 |
158 |
159 | # result = asyncio.run(main())
--------------------------------------------------------------------------------
/langProBe/benchmark.py:
--------------------------------------------------------------------------------
1 | import random, os
2 | from abc import ABC, abstractmethod
3 | from dataclasses import dataclass, field
4 | from enum import Enum
5 | from typing import Callable, List, Type
6 |
7 | import dspy
8 | from dspy.evaluate import Evaluate
9 | from dspy.teleprompt import Teleprompter
10 |
11 | import langProBe.optimizers as langprobe_optimizers
12 | from langProBe.dspy_program import LangProBeDSPyMetaProgram
13 | from langProBe.config_utils import read_json, read_jsonl
14 | from langProBe.program_utils import ProcessManager
15 |
16 |
17 |
18 |
19 | dataset_size = {"full": None, "lite": 500, "tiny": 200, "test": 2}
20 |
21 |
22 | class Benchmark(ABC):
23 | def __init__(self, dataset_mode="lite"):
24 | # dataset for training and validation
25 | self.dataset = None
26 | # dataset for the actual benchmarking
27 | self.test_set = None
28 | self.train_set = None
29 | self.dev_set = None
30 | self.val_set = None
31 |
32 | self.init_dataset()
33 | assert self.dataset is not None, "Dataset not initialized"
34 | assert self.test_set is not None, "Test set not initialized"
35 | self.max_testset_size = dataset_size[dataset_mode]
36 |
37 | self.test_set = self.trim_dataset(self.test_set, self.max_testset_size)
38 |
39 | # TODO: FIXME: "test" option is for debugging purposes only, should be removed for final release
40 | if dataset_mode == "test":
41 | self.dataset = self.trim_dataset(self.dataset, 60)
42 | self.create_splits()
43 | self.test_set = self.trim_dataset(self.test_set, 50)
44 |
45 | if not self.train_set or not self.dev_set or not self.val_set:
46 | self.create_splits()
47 |
48 | self.train_set = self.trim_dataset(self.train_set, 150)
49 | self.dev_set = self.trim_dataset(self.dev_set, 300)
50 | self.val_set = self.trim_dataset(self.val_set, 300)
51 |
52 | assert self.train_set is not None, "Train set not initialized"
53 | assert self.dev_set is not None, "Dev set not initialized"
54 | assert self.val_set is not None, "Val set not initialized"
55 |
56 | @abstractmethod
57 | def init_dataset(self) -> None:
58 | """
59 | Initializes the dataset for the benchmark, and sets it to self.dataset.
60 | Each element in the dataset should be an instance of dspy.Example.
61 | """
62 | return
63 |
64 |     def trim_dataset(self, dataset, size: int) -> list:
65 | if size is None or size >= len(dataset):
66 | return dataset
67 | rng = random.Random()
68 | rng.seed(1)
69 | return rng.sample(dataset, size)
70 |
71 | def create_splits(self) -> None:
72 | """
73 | Creates the splits for the dataset (not including test).
74 | Upon completion, self.train_set, self.dev_set, and self.val_set should be set.
75 | """
76 |
77 | total_len = len(self.dataset)
78 | self.dev_set = self.dataset[: int(0.4 * total_len)]
79 | self.val_set = self.dataset[int(0.4 * total_len) : int(0.8 * total_len)]
80 | self.train_set = self.dataset[int(0.8 * total_len) :]
81 |
82 | def get_dataset(self):
83 | return self.dataset
84 |
85 | def get_train_set(self):
86 | return self.train_set
87 |
88 | def get_dev_set(self):
89 | return self.dev_set
90 |
91 | def get_test_set(self):
92 | return self.test_set
93 |
94 |
95 | class MCPBench(Benchmark):
96 |     def __init__(self, dataset_mode="lite", dataset_path=None, missing_data=None):
97 |         self.dataset_path = dataset_path
98 |         self.missing_data = missing_data or []
99 | super().__init__(dataset_mode=dataset_mode)
100 |
101 | def init_dataset(self):
102 | self.dataset = []
103 | self.test_set = []
104 | if self.missing_data:
105 | test_raw_data = self.missing_data
106 | else:
107 | test_raw_data = read_jsonl(self.dataset_path)
108 |
109 | for test_data in test_raw_data:
110 | self.test_set.append(
111 | dspy.Example(
112 | id=test_data["unique_id"],
113 | question=test_data["Prompt"],
114 | answer=test_data["Answer"],
115 | ).with_inputs("id", "question", "answer", "config")
116 | )
117 |
118 |
119 |
120 |
121 | @dataclass
122 | class EvaluationResult:
123 | benchmark: str
124 | program: str
125 |
126 | score: float
127 | cost: float
128 | input_tokens: int
129 | output_tokens: int
130 |
131 |     outputs_raw_data: List | None = None
132 |
133 | # optimizer: str = None
134 | # optimized_program: dspy.Module = None
135 | # optimizer_input_tokens: int = None
136 | # optimizer_output_tokens: int = None
137 | # optimizer_cost: float = None
138 |
139 | # optimizer_program_scores: list[float] = None
140 |
141 |
142 | @dataclass
143 | class BenchmarkMeta:
144 | benchmark: Type[Benchmark]
145 | program: List[dspy.Module]
146 | metric: Callable
147 | dataset_mode: str = "lite"
148 |
149 | optimizers: List[langprobe_optimizers.OptimizerConfig] = field(
150 | default_factory=lambda: langprobe_optimizers.DEFAULT_OPTIMIZERS
151 | )
152 |
153 |     # BenchmarkMeta.num_threads takes priority over the runtime num_threads argument;
154 |     # use it as an upper bound on the number of threads.
155 |     num_threads: int | None = None
156 |     name: str | None = None
157 |
158 |
159 | def setup_lm(dspy_config=None):
160 | lm: dspy.LM = dspy_config.get("lm", dspy.settings.lm)
161 | assert lm is not None, "dspy language model not set"
162 |
163 | lm = lm.copy()
164 | assert len(lm.history) == 0, "language model history not empty"
165 | return lm
166 |
167 |
168 | # def calculate_stats(lm: dspy.LM) -> tuple[float, int, int]:
169 | # cost = 0
170 | # input_tokens = 0
171 | # output_tokens = 0
172 | # for i, trace in enumerate(lm.history):
173 | # cost += trace.get("cost", None) or 0
174 | # input_tokens += trace.get("usage", 0).get("prompt_tokens", 0)
175 | # output_tokens += trace.get("usage", 0).get("completion_tokens", 0)
176 |
177 | # return cost, input_tokens, output_tokens
178 |
179 | def calculate_stats(managers: List[ProcessManager]) -> tuple[float, int, int]:
180 |     input_tokens = sum(usage["prompt_tokens"] for manager in managers for usage in manager.lm_usages)
181 |     output_tokens = sum(usage["completion_tokens"] for manager in managers for usage in manager.lm_usages)
182 |
183 |     # Report per-run averages; cost is not tracked here, so it is returned as 0.
184 |     avg_input = input_tokens // len(managers) if managers else 0
185 |     avg_output = output_tokens // len(managers) if managers else 0
186 |     return 0, avg_input, avg_output
187 |
188 |
189 |
190 | class EvaluateBench(ABC):
191 | def __init__(
192 | self,
193 | benchmark: Benchmark,
194 | program: dspy.Module,
195 | metric: Callable,
196 | lm: str,
197 | benchmark_name: str = None,
198 | num_threads: int = 1,
199 | api_key: str = None,
200 | api_base: str = None,
201 | ):
202 | self.benchmark = benchmark
203 | self.program = program
204 |
205 | self.program.setup_lm(lm, api_key=api_key, api_base=api_base)
206 | self.metric = metric
207 | self.num_threads = num_threads
208 | devset = benchmark.get_test_set()
209 | self.evaluate_prog = Evaluate(
210 | devset=devset,
211 | metric=self.metric,
212 | num_threads=self.num_threads,
213 | display_progress=True,
214 | max_errors=5000,
215 | return_outputs=True,
216 | provide_traceback=True,
217 | )
218 |
219 | self.program_name = getattr(
220 | self.program, "_name", self.program.__class__.__name__
221 | )
222 | self.benchmark_name = benchmark_name or self.benchmark.__class__.__name__
223 |         self.results: EvaluationResult | None = None
224 |
225 | def get_empty_results(self):
226 | return EvaluationResult(
227 | benchmark=self.benchmark_name,
228 | program=self.program_name,
229 | score=0,
230 | cost=0,
231 | input_tokens=0,
232 | output_tokens=0,
233 | )
234 |
235 |
236 | def evaluate_baseline(self, dspy_config=None) -> EvaluationResult:
237 | with dspy.context(**dspy_config):
238 | score, info = self.evaluate_prog(self.program)
239 | result = self.get_empty_results()
240 | datasets, outputs, _ = zip(*info)
241 | managers = [one.process_report for one in outputs]
242 |
243 | result.score = score
244 | result.outputs_raw_data = outputs
245 | result.cost, result.input_tokens, result.output_tokens = calculate_stats(managers)
246 |
247 | return result
248 |
249 | def evaluate(self, dspy_config=None) -> EvaluationResult:
250 | """
251 | Args:
252 | dspy_config: A dictionary of configurations for dspy.context
253 | Returns:
254 |             A single EvaluationResult object.
255 | """
256 | if dspy_config is None:
257 | dspy_config = {}
258 |
259 | result = self.evaluate_baseline(dspy_config)
260 | self.results = result
261 | return result
262 |
--------------------------------------------------------------------------------
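Taken together, the pieces above can be driven by hand. A minimal sketch follows: the dataset path is one of the JSONL files shipped in this repo, the model name is a placeholder, and global_config must be populated first because MCPPredict.forward() reads the MCP pool from it.

import langProBe.evaluation as evaluation
from langProBe.benchmark import MCPBench, EvaluateBench
from langProBe.evaluation_utils import mcp_metric
from langProBe.mcp_program import MCPPredict

# Normally filled from the --config JSON; an empty pool means no MCP servers.
evaluation.global_config = {"mcp_pool": []}

benchmark = MCPBench(
    dataset_mode="lite",
    dataset_path="langProBe/WebSearch/data/websearch_300.jsonl",
)
evaluator = EvaluateBench(
    benchmark=benchmark,
    program=MCPPredict(max_steps=5, task_name="websearch"),
    metric=mcp_metric,
    lm="openai/gpt-4o-mini",  # placeholder model name
    num_threads=4,
)
result = evaluator.evaluate()  # a single EvaluationResult
print(result.score, result.input_tokens, result.output_tokens)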
/langProBe/config_utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | def read_json(file_path):
3 | """
4 | Read a JSON file and return the content as a dictionary.
5 | """
6 | with open(file_path, 'r') as file:
7 | data = json.load(file)
8 | return data
9 |
10 | def read_jsonl(file_path):
11 | """
12 | Read a JSONL file and return the content as a list of dictionaries.
13 | """
14 | data = []
15 | with open(file_path, 'r', encoding='utf-8') as f:
16 | for line in f:
17 | test_data = json.loads(line)
18 | data.append(test_data)
19 | return data
--------------------------------------------------------------------------------
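For reference, the JSONL files these helpers read in this repo hold one object per line with the unique_id / Prompt / Answer keys that MCPBench.init_dataset expects. A round-trip sketch (the file name and values are made up):

import json

from langProBe.config_utils import read_jsonl

record = {"unique_id": "q-001", "Prompt": "What is the capital of France?", "Answer": "Paris"}
with open("example.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")

assert read_jsonl("example.jsonl")[0]["Answer"] == "Paris"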
/langProBe/constants.py:
--------------------------------------------------------------------------------
1 | ROLE = 'role'
2 | CONTENT = 'content'
3 | SYSTEM = 'system'
4 | USER = 'user'
5 | ASSISTANT = 'assistant'
6 | TOOL = 'tool'
7 | TOOL_CALLS = 'tool_calls'
--------------------------------------------------------------------------------
/langProBe/dspy_program.py:
--------------------------------------------------------------------------------
1 | import dspy
2 |
3 |
4 | #################################### Common Programs ####################################
5 |
6 |
7 | def deduplicate(seq: list[str]) -> list[str]:
8 | """
9 | Source: https://stackoverflow.com/a/480227/1493011
10 | """
11 |
12 | seen = set()
13 | return [x for x in seq if not (x in seen or seen.add(x))]
14 |
15 |
16 | class LangProBeDSPyMetaProgram(dspy.Module):
17 | def setup_lm(self, lm, api_key=None, api_base=None):
18 | dspy.settings.experimental = True
19 | self.lm = dspy.LM(lm, api_key=api_key, api_base=api_base)
20 | self.set_lm(self.lm)
21 |
22 | def program_type(self):
23 | return "dspy"
24 |
25 |
26 | class Predict(dspy.Predict, LangProBeDSPyMetaProgram):
27 | pass
28 |
29 |
30 | class CoT(dspy.ChainOfThought, LangProBeDSPyMetaProgram):
31 | pass
32 |
33 |
34 | def default_input_to_query(**kwargs):
35 | if len(kwargs) == 1:
36 | return list(kwargs.values())[0]
37 | else:
38 | raise ValueError(
39 | "Cannot convert multiple inputs to a query, please specify input_to_query."
40 | )
41 |
42 |
43 | class RAG(LangProBeDSPyMetaProgram, dspy.Module):
44 | def __init__(
45 | self,
46 | signature,
47 | retriever=dspy.Retrieve(k=3),
48 | input_to_query=default_input_to_query,
49 | ):
50 |         self.retriever = retriever
51 | verified_signature = dspy.ensure_signature(signature)
52 | verified_signature = verified_signature.prepend(
53 | "context", dspy.InputField(desc="may contain relevant facts")
54 | )
55 | self.prog = dspy.ChainOfThought(verified_signature)
56 | self.input_to_query = input_to_query
57 |
58 | def forward(self, **kwargs):
59 |         context = self.retriever(self.input_to_query(**kwargs)).passages
60 | pred = self.prog(context=context, **kwargs)
61 | return pred
62 |
63 |
64 | class SimplifiedBaleen(LangProBeDSPyMetaProgram, dspy.Module):
65 | def __init__(
66 | self, signature, query_gen_input=None, retriever=dspy.Retrieve(k=2), max_hops=2
67 | ):
68 | """
69 | args:
70 | signature: The signature to the final generate module
71 | query_gen_input: a list of keywords to be used as input to the query generation module
72 | retriever: a retriever module to be used to retrieve relevant facts
73 |             max_hops: the number of hops to use in the simplified Baleen pipeline
74 | FIXME (shangyin) correctly handle query_gen_input
75 | """
76 |
77 | self.max_hops = max_hops
78 | self.retriever = retriever
79 | verified_signature = dspy.ensure_signature(signature)
80 | verified_signature = verified_signature.prepend(
81 | "context", dspy.InputField(desc="may contain relevant facts")
82 | )
83 |
84 | # remove the output field from the generate query signature
85 | # generate_query should use a default instruction rather than instruction from the original signature
86 | # FIXME (shangyin) fix the default signature.instructions
87 | input_fields = verified_signature.input_fields
88 | generate_query_signature = dspy.Signature(input_fields)
89 | generate_query_signature = generate_query_signature.append(
90 | "search_query", dspy.OutputField()
91 | )
92 |
93 | self.generate_query = [
94 | dspy.ChainOfThought(generate_query_signature) for _ in range(self.max_hops)
95 | ]
96 | self.generate_answer = dspy.ChainOfThought(verified_signature)
97 |
98 | def forward(self, **kwargs):
99 | context = []
100 |
101 | for hop in range(self.max_hops):
102 | query = self.generate_query[hop](context=context, **kwargs).search_query
103 | passages = self.retriever(query).passages
104 | context = deduplicate(context + passages)
105 |
106 | pred = self.generate_answer(context=context, **kwargs)
107 | return pred
108 |
109 |
110 | #################################### Archon Programs ####################################
111 |
112 | # Note Ranker and Fuser are equipped with self.get_prediction() method to return a Prediction object
113 | # in the original signature
114 |
115 |
116 | class ArchonGenerator(LangProBeDSPyMetaProgram, dspy.Module):
117 | # https://github.com/ScalingIntelligence/Archon/blob/main/src/archon/completions/components/Generator.py
118 |
119 | def __init__(self, signature, n=5):
120 | # For dspy, n responses are generated with a single model now.
121 | # If desired, we can create a new module in dspy that uses multiple models to generate n responses.
122 | verified_signature = dspy.ensure_signature(signature)
123 | assert (
124 | len(verified_signature.output_fields) == 1
125 | ), "ArchonGenerator only supports a single output field"
126 |
127 | self.prog = dspy.ChainOfThought(verified_signature, n=n)
128 | self.output_field = list(verified_signature.output_fields.keys())[0]
129 |
130 | def forward(self, **kwargs) -> dspy.Prediction:
131 | return self.prog(**kwargs)
132 |
133 | def get_responses(self, **kwargs) -> list[str]:
134 | responses = self.prog(**kwargs).completions.__getattr__(self.output_field)
135 | return responses
136 |
137 | def get_formatted_responses(self, **kwargs) -> str:
138 | responses = self.get_responses(**kwargs)
139 | return responses_formatter(responses)
140 |
141 |
142 | def responses_formatter(responses):
143 | if not isinstance(responses, list):
144 | dspy.logger.warning(
145 | "Responses of CriticGenerator should be a list of responses. "
146 | )
147 | responses = [responses]
148 | formatted_responses = []
149 | for i, response in enumerate(responses):
150 | formatted_responses.append(f"[{i+1}] {response}")
151 | return "\n".join(formatted_responses)
152 |
153 |
154 | class FeedbackGeneratorSignature(dspy.Signature):
155 | """
156 | Evaluate all responses based on their relevance to the instructions.
157 | All the responses should be included and evaluated using identifiers.
158 | You must include both strengths and weaknesses, even if there are more of one than the other.
159 | Start with the analysis for the first response and end with the analysis for the last response.
160 | """
161 |
162 | task_instructions = dspy.InputField(
163 | desc="The instructions on how the responses are generated."
164 | )
165 | responses = dspy.InputField(
166 |         desc="The generated responses to criticize. Each response will start with a numerical identifier in [], like [1].",
167 | )
168 | feedback: list[str] = dspy.OutputField(
169 | desc="The feedback for each response. Discuss the strengths and weaknesses of each response."
170 | )
171 |
172 |
173 | class ArchonCritic(LangProBeDSPyMetaProgram, dspy.Module):
174 | # https://github.com/ScalingIntelligence/Archon/blob/main/src/archon/completions/components/Critic.py
175 |
176 | def __init__(self, signature, n=5):
177 | # signature should be the signature to the original generator module
178 | verified_signature = dspy.ensure_signature(signature)
179 | assert (
180 | len(verified_signature.output_fields) == 1
181 | ), "ArchonCritic only supports a single output field"
182 | self.signature = verified_signature
183 |
184 | self.instructions = verified_signature.instructions
185 | feedback_gen_signature = FeedbackGeneratorSignature
186 | # add all inputfields from the original signature to the feedback_gen_signature
187 | for name, field in reversed(verified_signature.input_fields.items()):
188 | feedback_gen_signature = feedback_gen_signature.prepend(name, field)
189 |
190 | self.feedback_gen = dspy.ChainOfThought(feedback_gen_signature)
191 |
192 | def forward(self, formatted_responses, **kwargs) -> dspy.Prediction:
193 | return self.feedback_gen(
194 | task_instructions=self.instructions, responses=formatted_responses, **kwargs
195 | )
196 |
197 | def get_feedback(self, formatted_responses: str, **kwargs) -> list[str]:
198 | return self.forward(formatted_responses, **kwargs).feedback
199 |
200 |
201 | class RankerGeneratorSignature(dspy.Signature):
202 | """
203 | Rank the responses based on their relevance to the instruction, in descending order (from most relevant to least relevant).
204 | """
205 |
206 | task_instructions = dspy.InputField(
207 | desc="The instructions on how the responses are generated."
208 | )
209 |
210 | responses = dspy.InputField(
211 | desc="The responses to rank. Each response will start with a numerical identifier in [], like [1].",
212 | )
213 |
214 | ranking: list[int] = dspy.OutputField(
215 | desc="The ranking of the responses. List the responses in descending order of relevance to the instructions."
216 | )
217 |
218 |
219 | class ArchonRanker(LangProBeDSPyMetaProgram, dspy.Module):
220 | # https://github.com/ScalingIntelligence/Archon/blob/main/src/archon/completions/components/prompts.py#L68
221 | def __init__(self, signature, n=5, use_critic=False):
222 | verified_signature = dspy.ensure_signature(signature)
223 | assert (
224 | len(verified_signature.output_fields) == 1
225 | ), "ArchonRanker only supports a single output field"
226 | self.signature = verified_signature
227 | self.instructions = verified_signature.instructions
228 |
229 | ranker_signature = RankerGeneratorSignature
230 | if use_critic:
231 | ranker_signature = ranker_signature.append(
232 | "feedback",
233 | dspy.InputField(
234 | desc="The feedback (strength/weakness) for each response."
235 | ),
236 | )
237 |             ranker_signature.instructions += (
238 |                 " Also consider the provided critiques of strengths and weaknesses for each response."
239 |             )
240 |
241 |         # add all input fields from the original signature to the ranker signature
242 | for name, field in reversed(verified_signature.input_fields.items()):
243 | ranker_signature = ranker_signature.prepend(name, field)
244 |
245 | self.ranker = dspy.ChainOfThought(ranker_signature)
246 |
247 | def forward(self, formatted_responses: str, **kwargs):
248 | return self.ranker(
249 | task_instructions=self.instructions, responses=formatted_responses, **kwargs
250 | )
251 |
252 | def get_ranking(self, formatted_responses: str, **kwargs) -> list[int]:
253 | return self.forward(formatted_responses, **kwargs).ranking
254 |
255 | def get_prediction(self, responses: list[str], **kwargs) -> dspy.Prediction:
256 | formatted_responses = responses_formatter(responses)
257 | ranking = self.get_ranking(formatted_responses, **kwargs)
258 |         top_response = responses[ranking[0] - 1]  # ranking identifiers are 1-based
259 | pred = dspy.Prediction()
260 | pred.__setattr__(list(self.signature.output_fields.keys())[0], top_response)
261 | return pred
262 |
263 |
264 | class FuserGeneratorSignature(dspy.Signature):
265 | """
266 | Your task is to synthesize a list of responses to a task into a single, high-quality response of the same format. Do not include explanations.
267 | """
268 |
269 | task_instructions = dspy.InputField(
270 | desc="The instructions on how the responses are generated. Your final response should FOLLOW these instructions."
271 | )
272 |
273 | responses = dspy.InputField(
274 | desc="The responses to synthesize.",
275 | )
276 |
277 | final_response = dspy.OutputField(
278 | desc="""The final response, compiled from the input responses.
279 | Please provide a single response with the same format as all previous responses, excluding the number identifier.
280 | Ensure your response is well-structured, coherent, and adheres to the highest standards of accuracy and reliability. """
281 | )
282 |
283 |
284 | class ArchonFuser(LangProBeDSPyMetaProgram, dspy.Module):
285 | def __init__(self, signature, use_critic=False):
286 | verified_signature = dspy.ensure_signature(signature)
287 | assert (
288 | len(verified_signature.output_fields) == 1
289 | ), "ArchonFuser only supports a single output field"
290 | self.signature = verified_signature
291 | self.instructions = verified_signature.instructions
292 |
293 | fuser_signature = FuserGeneratorSignature
294 | if use_critic:
295 | fuser_signature = fuser_signature.append(
296 | "feedback",
297 | dspy.InputField(
298 | desc="The feedback (strength/weakness) for each response."
299 | ),
300 | )
301 | fuser_signature.instructions += "For each response, we also provide critiques of strengths and weaknesses."
302 | output_field_desc = list(verified_signature.output_fields.values())[
303 | 0
304 | ].json_schema_extra["desc"]
305 | fuser_signature.output_fields["final_response"].json_schema_extra[
306 | "desc"
307 | ] += f"{output_field_desc}"
308 |
309 |         # add all input fields from the original signature to the fuser signature
310 | for name, field in reversed(verified_signature.input_fields.items()):
311 | fuser_signature = fuser_signature.prepend(name, field)
312 |
313 | self.fuser = dspy.ChainOfThought(fuser_signature)
314 |
315 | def forward(self, formatted_responses: str, **kwargs):
316 | return self.fuser(
317 | task_instructions=self.instructions, responses=formatted_responses, **kwargs
318 | )
319 |
320 | def get_response(self, formatted_responses: str, **kwargs) -> str:
321 | return self.forward(formatted_responses, **kwargs).final_response
322 |
323 | def get_prediction(self, formatted_responses: str, **kwargs) -> dspy.Prediction:
324 | final_response = self.get_response(formatted_responses, **kwargs)
325 | pred = dspy.Prediction()
326 | pred.__setattr__(list(self.signature.output_fields.keys())[0], final_response)
327 | return pred
328 |
329 |
330 | # TODO(shangyin) new adapters from Archon to be added: Verifier
331 |
332 | #################################### Archon Example Programs ####################################
333 |
334 |
335 | class GeneratorCriticRanker(LangProBeDSPyMetaProgram, dspy.Module):
336 | def __init__(self, signature, n=5):
337 | verified_signature = dspy.ensure_signature(signature)
338 | assert (
339 | len(verified_signature.output_fields) == 1
340 | ), "ArchonExample only supports a single output field"
341 | self.signature = verified_signature
342 |
343 | self.generator = ArchonGenerator(self.signature, n)
344 | self.critic = ArchonCritic(self.signature, n)
345 | self.ranker = ArchonRanker(self.signature, n, use_critic=True)
346 |
347 | if n != 5: # override default name
348 | self._name = f"GeneratorCriticRanker{n}"
349 |
350 | def forward(self, **kwargs):
351 | responses = self.generator.get_responses(**kwargs)
352 | formatted_responses = responses_formatter(responses)
353 | feedback = self.critic.get_feedback(formatted_responses, **kwargs)
354 | return self.ranker.get_prediction(responses, feedback=feedback, **kwargs)
355 |
356 |
357 | class GeneratorCriticFuser(LangProBeDSPyMetaProgram, dspy.Module):
358 | def __init__(self, signature, n=5):
359 | verified_signature = dspy.ensure_signature(signature)
360 | assert (
361 | len(verified_signature.output_fields) == 1
362 | ), "GeneratorCriticFuser only supports a single output field"
363 | self.signature = verified_signature
364 |
365 | self.generator = ArchonGenerator(self.signature, n)
366 | self.critic = ArchonCritic(self.signature, n)
367 | self.fuser = ArchonFuser(self.signature, use_critic=True)
368 |
369 | if n != 5: # override default name
370 | self._name = f"GeneratorCriticFuser{n}"
371 |
372 | def forward(self, **kwargs):
373 | formatted_responses = self.generator.get_formatted_responses(**kwargs)
374 | feedback = self.critic.get_feedback(formatted_responses, **kwargs)
375 | return self.fuser.get_prediction(
376 | formatted_responses, feedback=feedback, **kwargs
377 | )
378 |
379 |
380 | class GeneratorRanker(LangProBeDSPyMetaProgram, dspy.Module):
381 | def __init__(self, signature, n=5):
382 | verified_signature = dspy.ensure_signature(signature)
383 | assert (
384 | len(verified_signature.output_fields) == 1
385 | ), "GeneratorRanker only supports a single output field"
386 | self.signature = verified_signature
387 |
388 | self.generator = ArchonGenerator(self.signature, n)
389 | self.ranker = ArchonRanker(self.signature, use_critic=False)
390 |
391 | def forward(self, **kwargs):
392 | responses = self.generator.get_responses(**kwargs)
393 |         return self.ranker.get_prediction(responses, **kwargs)
394 |
395 |
396 | class GeneratorFuser(LangProBeDSPyMetaProgram, dspy.Module):
397 | def __init__(self, signature, n=5):
398 | verified_signature = dspy.ensure_signature(signature)
399 | assert (
400 | len(verified_signature.output_fields) == 1
401 | ), "GeneratorFuser only supports a single output field"
402 | self.signature = verified_signature
403 |
404 | self.generator = ArchonGenerator(self.signature, n)
405 | self.fuser = ArchonFuser(self.signature, use_critic=False)
406 |
407 | def forward(self, **kwargs):
408 | formatted_responses = self.generator.get_formatted_responses(**kwargs)
409 |         return self.fuser.get_prediction(formatted_responses, **kwargs)
410 |
411 |
412 | if __name__ == "__main__":
413 | # Example usage
414 | dspy.configure(
415 | lm=dspy.LM("openai/gpt-4o-mini"),
416 | # example rm for RAG w. passages from wikipedia dump
417 | rm=dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts"),
418 | )
419 |
420 | question = "What is the capital of France?"
421 | context = "France is a country in Europe."
422 |
423 | # CoT
424 | print("======== CoT =========")
425 | cot = CoT("question, context -> answer")
426 | cot(question=question, context=context)
427 | dspy.settings.lm.inspect_history()
428 |
429 | # RAG
430 | print("======== RAG =========")
431 | rag = RAG("question -> answer")
432 | rag(question=question)
433 | dspy.settings.lm.inspect_history()
434 |
435 | # SimplifiedBaleen
436 | print("======== SimplifiedBaleen =========")
437 | simplified_baleen = SimplifiedBaleen("question -> answer")
438 | simplified_baleen(question=question)
439 | dspy.settings.lm.inspect_history(n=3)
440 |
441 | # GeneratorCriticRanker
442 | print("======== GeneratorCriticRanker =========")
443 | archon_example = GeneratorCriticRanker("question -> answer")
444 | archon_example(question=question)
445 | dspy.settings.lm.inspect_history(n=3)
446 |
447 | # GeneratorRanker
448 | print("======== GeneratorRanker =========")
449 | generator_ranker = GeneratorRanker("question -> answer")
450 | generator_ranker(question=question)
451 | dspy.settings.lm.inspect_history(n=3)
452 |
453 | # GeneratorCriticFuser
454 | print("======== GeneratorCriticFuser =========")
455 | generator_critic_fuser = GeneratorCriticFuser("question -> answer")
456 | generator_critic_fuser(question=question)
457 | dspy.settings.lm.inspect_history(n=3)
458 |
459 | # GeneratorFuser
460 | print("======== GeneratorFuser =========")
461 | generator_fuser = GeneratorFuser("question -> answer")
462 | generator_fuser(question=question)
463 | dspy.settings.lm.inspect_history(n=3)
464 |
--------------------------------------------------------------------------------
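A quick worked example of the responses_formatter contract that the Archon modules depend on: identifiers are 1-based, which is why ArchonRanker.get_prediction subtracts 1 when indexing back into the response list.

from langProBe.dspy_program import responses_formatter

print(responses_formatter(["Paris", "Lyon"]))
# [1] Paris
# [2] Lyon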
/langProBe/evaluation.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import copy
3 | import os
4 | import pathlib
5 | import sys
6 | import time
7 | from contextlib import contextmanager
8 | from pathlib import Path
9 |
10 | import dspy
11 |
12 | from langProBe.analysis import read_evaluation_results
13 | from langProBe.benchmark import BenchmarkMeta, EvaluateBench, EvaluationResult
14 | from langProBe.config_utils import read_json, read_jsonl
15 | from langProBe.dspy_program import (
16 | GeneratorCriticFuser,
17 | GeneratorCriticRanker,
18 | LangProBeDSPyMetaProgram,
19 | )
20 | from langProBe.optimizers import create_optimizer, DEFAULT_OPTIMIZERS
21 | from langProBe.register_benchmark import register_all_benchmarks, registered_benchmarks
22 | from langProBe.evaluation_utils import find_missing_entries, replace_logger_filehandler
23 |
24 |
25 | class CompareAnswerSignature(dspy.Signature):
26 | """
27 | Compare the answer to the ground truth answer.
28 | """
29 |
30 | answer = dspy.InputField(desc="The answer to a problem")
31 | ground_truth = dspy.InputField(desc="The ground truth answer to the same problem")
32 | is_correct = dspy.OutputField(
33 | desc="Whether the answer is correct, either True or False."
34 | )
35 |
36 |
37 | class CompareAnswer(dspy.Module):
38 | def __init__(self):
39 | self.compare_answer = dspy.ChainOfThought(CompareAnswerSignature)
40 |
41 | def forward(self, ground_truth, answer):
42 | pred = self.compare_answer(answer=answer, ground_truth=ground_truth)
43 | return pred
44 |
45 |
46 | def llm_as_judge_evaluate(gold, pred, extract_answer_fun=lambda x: x.answer):
47 | compare_answer = CompareAnswer()
48 | answer_raw = compare_answer(
49 | ground_truth=extract_answer_fun(gold), answer=extract_answer_fun(pred)
50 | ).is_correct
51 | if answer_raw.lower().startswith("true"):
52 | return True
53 | else:
54 | return False
55 |
56 |
57 | @contextmanager
58 | def suppress_output(suppress=True):
59 | if suppress:
60 | # Save the original streams
61 | original_stderr = sys.stderr
62 | original_stdout = sys.stdout
63 |
64 | # Redirect stderr and stdout to devnull
65 | sys.stderr = open(os.devnull, "w")
66 | sys.stdout = open(os.devnull, "w")
67 |
68 | try:
69 | yield
70 | finally:
71 | if suppress:
72 | # Restore the original streams
73 | sys.stderr.close()
74 | sys.stdout.close()
75 | sys.stderr = original_stderr
76 | sys.stdout = original_stdout
77 |
78 |
79 | def generate_evaluation_records(file_path):
80 | file_path = pathlib.Path(file_path)
81 |
82 | # if the records file already exists, do not overwrite it
83 | if (file_path / "evaluation_records.csv").exists():
84 | return
85 |
86 | # List all .txt files in the directory
87 | all_result_files = list(file_path.rglob("*.txt"))
88 |
89 | records = []
90 |
91 | # Process each file
92 | for file in all_result_files:
93 | # Split the filename to get benchmark, program, and optimizer
94 | file_name_parts = file.stem.split("_")
95 |         if len(file_name_parts) >= 2:
96 |             benchmark = file_name_parts[0]
97 |             program = file_name_parts[1]
98 |             optimizer = file_name_parts[2] if len(file_name_parts) > 2 else "None"
99 | records.append((benchmark, program, optimizer))
100 | else:
101 | raise ValueError(f"Invalid file name: {file.name}")
102 |
103 | with open(f"{file_path}/evaluation_records.csv", "w") as f:
104 | f.write("benchmark,program,optimizer\n")
105 | for record in records:
106 | f.write(",".join(record) + "\n")
107 |
108 |
109 | def add_to_evaluation_records(file_path, evaluation_results: list[EvaluationResult]):
110 | file_path = pathlib.Path(file_path)
111 |
112 | with open(f"{file_path}/evaluation_records.csv", "a") as f:
113 | for evaluation_result in evaluation_results:
114 | f.write(
115 | f"{evaluation_result.benchmark},{evaluation_result.program},{evaluation_result.optimizer}\n"
116 | )
117 |
118 |
119 | def read_evaluation_records(file_path):
120 | file_path = pathlib.Path(file_path)
121 | records = []
122 |
123 | # create the records file if it does not exist
124 | if not (file_path / "evaluation_records.csv").exists():
125 | # create empty records file without header
126 | with open(f"{file_path}/evaluation_records.csv", "w") as f:
127 | f.write("")
128 | with open(f"{file_path}/evaluation_records.csv", "r") as f:
129 | lines = f.readlines()
130 | for line in lines[1:]:
131 | records.append(tuple(line.strip().split(",")))
132 |
133 | return records
134 |
135 |
136 | def evaluate(
137 | benchmark_meta: BenchmarkMeta,
138 | lm,
139 | file_path,
140 | num_threads=8,
141 | suppress_dspy_output=True,
142 | dataset_mode=None,
143 | dataset_path=None,
144 | missing_mode_file="",
145 | api_key=None,
146 | api_base=None,
147 | ):
148 | """
149 | benchmark_meta: BenchmarkMeta object to evaluate
150 |     lm: Language model name to use (a string, not a dspy.LM instance)
151 |     missing_mode_file: if set, only evaluate dataset entries missing from this results file
152 | """
153 | dataset_mode = dataset_mode or benchmark_meta.dataset_mode
154 |
155 | if missing_mode_file:
156 | origin_data = read_jsonl(dataset_path)
157 |         completed_data = read_jsonl(missing_mode_file)
158 |         missing_data = find_missing_entries(origin_data, completed_data)
159 | benchmark = benchmark_meta.benchmark(dataset_mode=dataset_mode, missing_data=missing_data)
160 | replace_logger_filehandler(os.path.splitext(missing_mode_file)[0])
161 | else:
162 | benchmark = benchmark_meta.benchmark(dataset_mode=dataset_mode, dataset_path=dataset_path)
163 |     # Resolve the benchmark's display name
164 | benchmark_name = benchmark_meta.name or benchmark.__class__.__name__
165 |
166 | num_threads = benchmark_meta.num_threads or num_threads
167 | print(f"Evaluating {benchmark_name}")
168 | print(f"num_threads: {num_threads}")
169 | print(f"Test set size: {len(benchmark.test_set)}")
170 |
171 |
172 | Path(file_path).mkdir(parents=True, exist_ok=True)
173 |
174 | evaluation_records = read_evaluation_records(file_path)
175 |
176 | # create a stats file for each experiment
177 | stats_file = os.path.join(file_path, f"{benchmark_name}.stat")
178 | with open(stats_file, "w") as f:
179 | f.write(
180 | f"benchmark: {benchmark_name}\n"
181 | f"lm: {lm}\n"
182 | f"test_set_size: {len(benchmark.test_set)}\n"
183 | )
184 |
185 | for program in benchmark_meta.program:
186 | program_name = getattr(program, "_name", program.__class__.__name__)
187 |
188 | print(f"Program: {program_name}")
189 |
190 | with suppress_output(suppress=suppress_dspy_output):
191 | evaluate_bench = EvaluateBench(
192 | benchmark=benchmark,
193 | program=program,
194 | metric=benchmark_meta.metric,
195 | lm=lm,
196 | benchmark_name=benchmark_meta.name,
197 | num_threads=num_threads,
198 | api_key=api_key if api_key else os.getenv("OPENAI_API_KEY", ""),
199 | api_base=api_base if api_base else os.getenv("OPENAI_API_BASE", ""),
200 | )
201 | evaluate_bench.evaluate()
202 | # print(f"Results: {evaluate_bench.results}")
203 |
204 | # if missing_mode:
205 | # add_to_evaluation_records(file_path, evaluate_bench.results)
206 | evaluation_result = evaluate_bench.results
207 |
208 | file_name = f"{evaluation_result.benchmark}_{evaluation_result.program}"
209 | with open(os.path.join(file_path, f"{file_name}.txt"), "w") as f:
210 | f.write(f"score,cost,input_tokens,output_tokens\n")
211 | f.write(
212 | f"{evaluation_result.score},{evaluation_result.cost},{evaluation_result.input_tokens},"
213 | f"{evaluation_result.output_tokens}\n"
214 | )
215 |
216 |
217 | def evaluate_all(
218 | benchmarks,
219 | lm,
220 | file_path,
221 | num_threads=8,
222 | suppress_dspy_output=False,
223 | dataset_mode=None,
224 | dataset_path=None,
225 | missing_mode_file="",
226 | api_key=None,
227 | api_base=None,
228 | ):
229 |     # Only register when benchmarks is a list of module-path strings
230 | if benchmarks and isinstance(benchmarks[0], str):
231 | benchmarks = register_all_benchmarks(benchmarks)
232 |
233 | for benchmark_meta in benchmarks:
234 | evaluate(
235 | benchmark_meta,
236 | lm,
237 | file_path,
238 | num_threads,
239 | suppress_dspy_output,
240 | dataset_mode,
241 | dataset_path,
242 | missing_mode_file,
243 | api_key=api_key,
244 | api_base=api_base,
245 | )
246 |
247 |     df = read_evaluation_results(file_path)
248 |     df["model"] = lm
249 |     df.to_csv(f"{file_path}/evaluation_results.csv", index=False)
250 |
251 | # generate evaluation records
252 | generate_evaluation_records(file_path)
253 |
254 | global_config = None  # populated in main() from the --config JSON
255 | def main():
256 | import multiprocessing
257 | multiprocessing.freeze_support()
258 |
259 | parser = argparse.ArgumentParser(description="LangProbe benchmark evaluation")
260 | parser.add_argument("--benchmark", type=str, required=True, help="Benchmark to evaluate")
261 | parser.add_argument("--lm", type=str, required=True, help="Language model to use")
262 | parser.add_argument("--lm_api_key", type=str, help="API key for language model")
263 | parser.add_argument(
264 | "--lm_api_base", type=str, help="API base for language model"
265 | )
266 | parser.add_argument(
267 | "--dataset_mode", type=str, help="Dataset mode (train, val, test)"
268 | )
269 | parser.add_argument(
270 | "--dataset_path", type=str, help="Dataset path"
271 | )
272 | parser.add_argument(
273 | "--num_threads", type=int, default=8, help="Number of threads to use"
274 | )
275 | parser.add_argument(
276 | "--file_path", type=str, default="evaluation", help="File path for evaluation results"
277 | )
278 | parser.add_argument(
279 | "--suppress_dspy_output",
280 | action="store_true",
281 | help="Suppress dspy output",
282 | )
283 | parser.add_argument(
284 | "--missing_mode_file",
285 | type=str,
286 | default="",
287 | help="Only run missing experiments (skip experiments that already have results), value = path to log/jsonl",
288 | )
289 | parser.add_argument(
290 | "--config",
291 | type=str,
292 | default='ddgo.json',
293 | help="Configuration file for the benchmark",
294 | )
295 |
296 | args = parser.parse_args()
297 |
298 | global global_config
299 | global_config= read_json(args.config)
300 |     # Normalize the benchmark module path
301 | benchmark_path = args.benchmark
302 | if not benchmark_path.startswith("langProBe."):
303 | benchmark_path = f"langProBe.{benchmark_path}"
304 |
305 |     # Register all requested benchmarks
306 | register_all_benchmarks([benchmark_path])
307 |
308 |     benchmarks = list(registered_benchmarks)
309 | if not benchmarks:
310 | print(f"No benchmark registered with name {args.benchmark}")
311 | sys.exit(1)
312 |
313 | evaluate_all(
314 | benchmarks,
315 | args.lm,
316 | args.file_path,
317 | num_threads=args.num_threads,
318 | suppress_dspy_output=args.suppress_dspy_output,
319 | dataset_mode=args.dataset_mode,
320 | dataset_path=args.dataset_path,
321 | missing_mode_file=args.missing_mode_file,
322 | api_key=args.lm_api_key,
323 | api_base=args.lm_api_base,
324 | )
325 |
326 | if __name__ == "__main__":
327 | main()
328 |
--------------------------------------------------------------------------------
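The same flow can also be driven programmatically instead of through the CLI; a minimal sketch under the same assumptions as before (placeholder model name, empty MCP pool, benchmark module path taken from this repo):

import langProBe.evaluation as evaluation
from langProBe.evaluation import evaluate_all

# evaluate_all registers string entries via register_all_benchmarks;
# global_config is what MCPPredict.forward() reads at run time.
evaluation.global_config = {"mcp_pool": []}

evaluate_all(
    ["langProBe.WebSearch"],
    lm="openai/gpt-4o-mini",  # placeholder model name
    file_path="evaluation",
    num_threads=4,
    dataset_path="langProBe/WebSearch/data/websearch_300.jsonl",
)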
/langProBe/evaluation_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import re
4 | import string
5 | from typing import Any, List, Optional, Tuple
6 |
7 | import dspy
8 |
9 | import langProBe.constants as constants
10 | from langProBe.program_utils import call_lm, ProcessManager
11 |
12 |
13 | # Prompt for the LLM judge used by evaluate_final_answer.
14 | EVALUATE_PROMPT = """For the following question: {question}
15 |
16 | Judge whether the predicted answer is correct; it counts as correct if the key information is right:
17 |
18 | Predicted answer: {prediction}
19 | Ground truth: {ground_truth}
20 |
21 | Return only True or False."""
22 |
23 |
24 | def evaluate_final_answer(
25 | question: str,
26 | ground_truth: str,
27 | prediction: str,
28 | manager: ProcessManager,
29 | logger: logging.Logger,
30 | ) -> bool:
31 | prompt = EVALUATE_PROMPT.format(question=question, prediction=prediction, ground_truth=ground_truth)
32 | messages = [
33 | {
34 | constants.ROLE: constants.USER,
35 | constants.CONTENT: prompt
36 | }
37 | ]
38 |     logger.info("Starting final answer evaluation")
39 | logger.info(f"question: {question}")
40 | logger.info(f"ground_truth: {ground_truth}")
41 | logger.info(f"prediction: {prediction}")
42 | response_content, _, _ = call_lm(messages, manager, logger, temperature=0.01)
43 | return "true" in response_content.lower()
44 |
45 |
46 | def normalize_number_str(number_str: str) -> float:
47 | # we replace these common units and commas to allow
48 | # conversion to float
49 | for char in ["$", "%", ","]:
50 | number_str = number_str.replace(char, "")
51 | try:
52 | return float(number_str)
53 | except ValueError:
54 | print(f"String {number_str} cannot be normalized to number str.")
55 | return float("inf")
56 |
57 |
58 | def split_string(
59 | s: str,
60 | char_list: list[str] = [",", ";"],
61 | ) -> list[str]:
62 | pattern = f"[{''.join(char_list)}]"
63 | return re.split(pattern, s)
64 |
65 | def normalize_str(input_str, remove_punct=True) -> str:
66 | """
67 | Normalize a string by:
68 | - Removing all white spaces
69 | - Optionally removing punctuation (if remove_punct is True)
70 | - Converting to lowercase
71 | Parameters:
72 | - input_str: str, the string to normalize
73 | - remove_punct: bool, whether to remove punctuation (default: True)
74 | Returns:
75 | - str, the normalized string
76 | """
77 | # Remove all white spaces. Required e.g for seagull vs. sea gull
78 | no_spaces = re.sub(r"\s", "", input_str)
79 |
80 | # Remove punctuation, if specified.
81 | if remove_punct:
82 | translator = str.maketrans("", "", string.punctuation)
83 | return no_spaces.lower().translate(translator)
84 | else:
85 | return no_spaces.lower()
86 |
87 |
88 | def question_scorer(
89 | model_answer: str,
90 | ground_truth: str,
91 | logger: logging.Logger
92 | ) -> bool:
93 |     def is_float(element: Any) -> bool:
94 | try:
95 | float(element)
96 | return True
97 | except ValueError:
98 | return False
99 |
100 | if model_answer is None:
101 | model_answer = "None"
102 | logger.debug("Model answer is None. Converted to string 'None'.")
103 |
104 | # If ground truth is a number
105 | if is_float(ground_truth):
106 | info = f"Evaluating '{model_answer}' as a number."
107 | logger.info(info)
108 | normalized_answer = normalize_number_str(model_answer)
109 | try:
110 | result = normalized_answer == float(ground_truth)
111 | logger.debug(f"Normalized model answer: {normalized_answer}, Ground truth: {ground_truth}, Result: {result}")
112 | return result
113 | except ValueError as e:
114 | error_msg = f"Normalization error: {e}"
115 | logger.error(error_msg)
116 | return False
117 |
118 | # If ground truth is a list
119 | elif any(char in ground_truth for char in [",", ";"]):
120 | info = f"Evaluating '{model_answer}' as a comma/semi-colon separated list."
121 | logger.info(info)
122 |
123 | gt_elems = split_string(ground_truth)
124 | ma_elems = split_string(model_answer)
125 | logger.debug(f"Ground truth elements: {gt_elems}")
126 | logger.debug(f"Model answer elements: {ma_elems}")
127 |
128 | # Check if lengths are the same
129 | if len(gt_elems) != len(ma_elems):
130 | warning_msg = "Answer lists have different lengths."
131 | logger.warning(warning_msg)
132 | return False
133 |
134 | # Compare each element as float or string
135 | comparisons = []
136 | for idx, (ma_elem, gt_elem) in enumerate(zip(ma_elems, gt_elems), start=1):
137 | if is_float(gt_elem):
138 | try:
139 | normalized_ma_elem = normalize_number_str(ma_elem)
140 | comparison = normalized_ma_elem == float(gt_elem)
141 | logger.debug(f"Element {idx}: Normalized model answer element '{normalized_ma_elem}' == Ground truth element '{float(gt_elem)}': {comparison}")
142 | except ValueError as e:
143 | error_msg = f"Normalization error at element {idx}: {e}"
144 | logger.error(error_msg)
145 | return False
146 | else:
147 | normalized_ma = normalize_str(ma_elem, remove_punct=False)
148 | normalized_gt = normalize_str(gt_elem, remove_punct=False)
149 | comparison = normalized_ma == normalized_gt
150 | logger.debug(f"Element {idx}: Normalized model answer element '{normalized_ma}' == Ground truth element '{normalized_gt}': {comparison}")
151 | comparisons.append(comparison)
152 |
153 | all_correct = all(comparisons)
154 | if not all_correct:
155 | detail_msg = "Mismatch found in list elements."
156 | logger.info(detail_msg)
157 | return all_correct
158 | logger.debug("All list elements match the ground truth.")
159 | return all_correct
160 |
161 | # If ground truth is a string
162 | else:
163 | info = f"Evaluating '{model_answer}' as a string."
164 | logger.info(info)
165 | normalized_ma = normalize_str(model_answer)
166 | normalized_gt = normalize_str(ground_truth)
167 | result = normalized_ma == normalized_gt
168 | logger.debug(f"Normalized model answer: '{normalized_ma}' == Normalized ground truth: '{normalized_gt}': {result}")
169 | return result
170 |
171 | def mcp_metric(example: dspy.Example, pred: dspy.Prediction):
172 | return pred.success
173 |
174 |
175 |
176 | def extract_questions(data, key):
177 | """从数据中提取指定字段(如 Prompt 或 question)用于比较"""
178 | questions = set()
179 | for item in data:
180 | questions.add(item[key])
181 | return questions
182 |
183 | def find_missing_entries(data_a, data_b):
184 |     # data_a is the original dataset; data_b holds the entries that have already been run
185 |
186 | questions_in_b = extract_questions(data_b, 'question')
187 |
188 |     # Collect the entries of data_a that are absent from data_b
189 | missing_entries = [item for item in data_a if item['Prompt'] not in questions_in_b]
190 |
191 | return missing_entries
192 |
197 |
198 | def replace_logger_filehandler(new_log_name):
199 | """
200 |     Replace each logger's existing FileHandlers while keeping its original
201 |     formatter, and delete the old log files.
202 |
203 |     :param new_log_name: new log file name (without extension)
204 | """
205 |
206 | def update_handler(logger, file_suffix):
207 | old_log_paths = []
208 | formatter = None
209 | for handler in logger.handlers:
210 | if isinstance(handler, logging.FileHandler):
211 | if formatter is None:
212 | formatter = handler.formatter
213 | old_log_paths.append(handler.baseFilename)
214 |
215 | for handler in list(logger.handlers):
216 | if isinstance(handler, logging.FileHandler):
217 | handler.close()
218 | logger.removeHandler(handler)
219 |
220 | for log_path in old_log_paths:
221 | if os.path.exists(log_path):
222 | try:
223 | os.remove(log_path)
224 |                 except OSError:
225 |                     pass  # best-effort cleanup of the old log file
226 |
227 | if logger.name == 'MCPPredictRunLogger':
228 | new_name = new_log_name.replace("message", "run")
229 | else:
230 | new_name = new_log_name
231 |
232 | new_handler = logging.FileHandler(f"{new_name}.{file_suffix}", mode='a', encoding='utf-8')
233 | if formatter:
234 | new_handler.setFormatter(formatter)
235 | logger.addHandler(new_handler)
236 |
237 | run_logger = logging.getLogger('MCPPredictRunLogger')
238 | update_handler(run_logger, 'log')
239 |
240 | message_logger = logging.getLogger('MCPPredictMessageLogger')
241 | update_handler(message_logger, 'jsonl')
242 |
243 |
244 |
245 | if __name__ == "__main__":
246 |     print(question_scorer("123", "123", logging.getLogger(__name__)))
247 |
--------------------------------------------------------------------------------
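The three branches of question_scorer in action; each line follows directly from the code above (numeric normalization, list splitting on , or ;, and whitespace/punctuation-insensitive string comparison):

import logging

from langProBe.evaluation_utils import question_scorer

log = logging.getLogger("scorer-demo")

print(question_scorer("$1,000", "1000", log))        # True: numeric branch strips $ % ,
print(question_scorer("100, 5", "100;5", log))       # True: list branch compares element-wise
print(question_scorer("Sea Gull!", "seagull", log))  # True: string branch drops spaces/punctuation
print(question_scorer("12", "13", log))              # False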
/langProBe/langchain_program.py:
--------------------------------------------------------------------------------
1 | from langchain.chains import LLMChain
2 | from langchain.prompts import PromptTemplate
3 | from langchain_community.chat_models import ChatLiteLLM
4 |
5 | from langProBe.program_utils import DotDict
6 |
7 |
8 | class LangProBeLangChainMetaProgram:
9 | def __init__(self, input_kwargs, output_kwargs):
10 | self.lm = None
11 | self.input_kwargs = input_kwargs
12 | self.out_kwargs = output_kwargs
13 |
14 | def setup_lm(self, lm: str, api_key: str = None, api_base: str = None):
15 | self.lm = ChatLiteLLM(model=lm, api_key=api_key, api_base=api_base)
16 |
17 |
18 | class NaiveLangChainProgram(LangProBeLangChainMetaProgram):
19 | def __call__(self, **kwargs):
20 | if not self.lm:
21 | raise ValueError("Language model not initialized. Call setup_lm() first.")
22 |
23 | # Validate input keys
24 | missing_keys = [key for key in self.input_kwargs if key not in kwargs]
25 | if missing_keys:
26 | raise ValueError(f"Missing required inputs: {missing_keys}")
27 |
28 | # Dynamically generate prompt template
29 | prompt_text = "Given the following inputs:\n"
30 | for key in self.input_kwargs:
31 | prompt_text += f"- {key}: {{{key}}}\n"
32 | prompt_text += f"Output the following field: {self.out_kwargs[0]}. Your response should be this output field only, with no explanation and formatting.\n Your response:"
33 |
34 | prompt_template = PromptTemplate(
35 | input_variables=self.input_kwargs, template=prompt_text
36 | )
37 |
38 | # Create LLM chain
39 | chain = LLMChain(llm=self.lm, prompt=prompt_template)
40 |
41 | # Run the chain
42 | response = chain.run(kwargs)
43 |
44 | # Format output
45 | return DotDict({self.out_kwargs[0]: response})
46 |
--------------------------------------------------------------------------------
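A usage sketch for the LangChain wrapper; the model name is a placeholder, and the result lookup assumes DotDict is dict-like, as its name suggests:

from langProBe.langchain_program import NaiveLangChainProgram

program = NaiveLangChainProgram(
    input_kwargs=["question", "context"],
    output_kwargs=["answer"],
)
program.setup_lm("openai/gpt-4o-mini")  # placeholder; api_key/api_base are optional

pred = program(question="What is the capital of France?", context="France is in Europe.")
print(pred["answer"])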
/langProBe/mcp_program.py:
--------------------------------------------------------------------------------
1 | import dspy
2 | from pydantic import BaseModel, Field
3 | from langProBe.program_utils import (
4 | call_lm,
5 | build_init_messages,
6 | build_messages,
7 | response_parsing,
8 | mcp_calling,
9 | ProcessManager
10 | )
11 | import time
12 | from langProBe.evaluation_utils import evaluate_final_answer
13 | import langProBe.constants as constants
14 | import logging
15 | import os
16 | from datetime import datetime
17 | import json
18 | from typing import List, Dict, Optional, Tuple
19 |
20 |
21 | MCP_SAMPLE_SYSTEM_PROMPT = """
22 | You are a helpful assistant. You are able to answer questions using different tools.
23 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools.
24 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server.
25 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool.
26 | The tool description includes:
27 | A brief text description outlining the functionality of the tool.
28 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter.
29 | """
30 |
31 | class MCP_LM(BaseModel):
32 | model: str = Field(
33 | default=None,
34 | description="The model to use for the MCP program.",
35 | )
36 | api_key: str = Field(
37 | default=None,
38 | description="The API key for the model.",
39 | )
40 | api_base: str = Field(
41 | default=None,
42 | description="The API base URL for the model.",
43 | )
44 |
45 | class LangProBeMCPMetaProgram(dspy.Module):
46 | def __init__(self):
47 | super().__init__()
48 | self.lm = MCP_LM()
49 | def setup_lm(self, lm, api_key=None, api_base=None):
50 | self.lm.model = lm
51 | self.lm.api_key = api_key
52 | self.lm.api_base = api_base
53 |
54 | def program_type(self):
55 | return "mcp"
56 |
57 |
58 | class MCPPredict(LangProBeMCPMetaProgram, dspy.Module):
59 | def __init__(self, max_steps=5, system_prompt=MCP_SAMPLE_SYSTEM_PROMPT, task_name="mcp_sample"):
60 | super().__init__()
61 | self.system_prompt = system_prompt
62 | self.task_name = task_name
63 | self.max_steps = max_steps
64 | self.max_length = 30000
65 |
66 |         # Configure the run logger
67 | self.run_logger = logging.getLogger('MCPPredictRunLogger')
68 | self.run_logger.setLevel(logging.INFO)
69 |
70 |         # Configure the message logger
71 | self.message_logger = logging.getLogger('MCPPredictMessageLogger')
72 | self.message_logger.setLevel(logging.INFO)
73 |
74 |         # Create the log directory
75 | os.makedirs('logs', exist_ok=True)
76 | self.setup_loggers()
77 |
78 | def setup_loggers(self):
79 | timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
80 |
81 |         # Set up the run log
82 | run_log_file = f'logs/{self.task_name}_run_{timestamp}.log'
83 | run_handler = logging.FileHandler(run_log_file, encoding='utf-8')
84 | run_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
85 | run_handler.setFormatter(run_formatter)
86 | self.run_logger.addHandler(run_handler)
87 |
88 |         # Set up the message log
89 | message_log_file = f'logs/{self.task_name}_messages_{timestamp}.jsonl'
90 | message_handler = logging.FileHandler(message_log_file, encoding='utf-8')
91 | self.message_logger.addHandler(message_handler)
92 |
93 |
94 | def update_log_paths(self, new_log_dir):
95 |         # Make sure the new log directory exists
96 | os.makedirs(new_log_dir, exist_ok=True)
97 |
98 |         # Swap out the run logger's handlers
99 | for handler in self.run_logger.handlers[:]:
100 | self.run_logger.removeHandler(handler)
101 |
102 | run_log_file = f'{new_log_dir}/{self.task_name}_run_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
103 | run_handler = logging.FileHandler(run_log_file, encoding='utf-8')
104 | run_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
105 | run_handler.setFormatter(run_formatter)
106 | self.run_logger.addHandler(run_handler)
107 |
108 |         # Swap out the message logger's handlers
109 | for handler in self.message_logger.handlers[:]:
110 | self.message_logger.removeHandler(handler)
111 |
112 | message_log_file = f'{new_log_dir}/{self.task_name}_messages_{datetime.now().strftime("%Y%m%d_%H%M%S")}.jsonl'
113 | message_handler = logging.FileHandler(message_log_file, encoding='utf-8')
114 | self.message_logger.addHandler(message_handler)
115 |
116 |     def evaluate_prediction(self, question: str, ground_truth: str, prediction: str) -> bool:
117 | answer_eval_manager = ProcessManager()
118 | answer_eval_manager.lm_api_key = self.lm.api_key
119 | answer_eval_manager.lm_api_base = self.lm.api_base
120 |         answer_eval_manager.model = "openai/deepseek-v3"  # the judge model is hard-coded here
121 | return evaluate_final_answer(question, ground_truth, prediction, answer_eval_manager, self.run_logger)
122 |
123 | def log_messages(self, messages, question, success, time_cost, prompt_tokens_cost, completion_tokens_cost):
124 | log_entry = {
125 | "question": question,
126 | "messages": messages,
127 | "success": success,
128 | "time_cost": time_cost,
129 | "prompt_tokens_cost": prompt_tokens_cost,
130 | "completion_tokens_cost": completion_tokens_cost
131 | }
132 | self.message_logger.info(json.dumps(log_entry, ensure_ascii=False))
133 |
134 |
135 | def forward(self, **kwargs) -> dspy.Prediction:
136 | unique_id = kwargs.get('id')
137 | question = kwargs.get('question')
138 | gt = kwargs.get('answer')
139 |
140 | manager = ProcessManager()
141 | manager.lm_api_key = self.lm.api_key
142 | manager.lm_api_base = self.lm.api_base
143 | manager.model = self.lm.model
144 | manager.id = unique_id
145 |
146 | self.run_logger.info(f"ID: {manager.id}, Starting forward pass for question: {question}")
147 |
148 |
149 |         from langProBe.evaluation import global_config  # imported lazily to avoid a circular import
150 | mcps = global_config['mcp_pool']
151 |
152 | messages = build_init_messages(self.system_prompt, mcps, question)
153 | steps = 0
154 | all_completion_tokens = 0
155 | all_prompt_tokens = 0
156 | start_time = time.time()
157 |
158 | while not messages[-1][constants.ROLE] == constants.ASSISTANT and steps < self.max_steps:
159 |             response, completion_tokens, prompt_tokens = call_lm(messages, manager, self.run_logger)
160 | all_completion_tokens += completion_tokens
161 | all_prompt_tokens += prompt_tokens
162 | mcp_calls = response_parsing(response)
163 |
164 | new_messages = mcp_calling(mcp_calls, manager, self.run_logger)
165 |
166 | messages = build_messages(messages, new_messages)
167 | steps += 1
168 |
169 | end_time = time.time()
170 |
171 |         # If the step limit was reached without an assistant answer
172 | if messages[-1][constants.ROLE] != constants.ASSISTANT:
173 | self.run_logger.warning("Maximum steps reached without getting an answer")
174 | messages.append({
175 | constants.ROLE: constants.ASSISTANT,
176 |                 constants.CONTENT: "Maximum number of steps exceeded; this question could not be solved.",
177 | })
178 |
179 |
180 | self.run_logger.info(f"ID: {manager.id}, Forward pass completed successfully")
181 | success = self.evaluate_prediction(question, gt, messages[-1][constants.CONTENT])
182 | self.log_messages(messages, question, success, (end_time-start_time), all_prompt_tokens, all_completion_tokens)
183 | self.run_logger.info(f"ID: {manager.id}, Evaluation completed successfully")
184 | # self.run_logger.info("==" * 50)
185 |
186 | return dspy.Prediction(
187 | success=success,
188 | question=question,
189 | ground_truth=gt,
190 | answer=messages[-1][constants.CONTENT],
191 | trace=messages,
192 | process_report=manager
193 | )
--------------------------------------------------------------------------------
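Finally, a direct-invocation sketch for MCPPredict outside the evaluation harness. global_config has to be set first because forward() reads mcp_pool from it; the model name and API key are placeholders, and the judge model inside evaluate_prediction is hard-coded as noted above.

import langProBe.evaluation as evaluation
from langProBe.mcp_program import MCPPredict

# Normally loaded from the JSON file passed to evaluation.py via --config.
evaluation.global_config = {"mcp_pool": []}

predictor = MCPPredict(max_steps=5, task_name="demo")
predictor.setup_lm("openai/gpt-4o-mini", api_key="sk-placeholder", api_base=None)

pred = predictor(id="q-001", question="What is the capital of France?", answer="Paris")
print(pred.success, pred.answer)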
/langProBe/optimizers.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import random
3 | from dataclasses import dataclass
4 | from functools import partial
5 | from typing import Callable, Type
6 |
7 | import dspy
8 | import dspy.teleprompt
9 | import numpy as np
10 | from dspy.evaluate.evaluate import Evaluate
11 | from dspy.teleprompt import BootstrapFewShot
12 |
13 |
14 | class BootstrapFewShotInfer(BootstrapFewShot):
15 | def __init__(
16 | self,
17 | num_candidates=5,
18 | num_rules=5,
19 | num_threads=8,
20 | teacher_settings=None,
21 | **kwargs,
22 | ):
23 | super().__init__(teacher_settings=teacher_settings, **kwargs)
24 | self.num_candidates = num_candidates
25 | self.num_rules = num_rules
26 | self.num_threads = num_threads
27 | self.rules_induction_program = RulesInductionProgramINFER(
28 | num_rules, teacher_settings=teacher_settings
29 | )
30 | self.metric = kwargs.get("metric")
31 | self.max_errors = kwargs.get("max_errors", 5)
32 |
33 | def compile(self, student, *, teacher=None, trainset, valset=None):
34 | super().compile(student, teacher=teacher, trainset=trainset)
35 | if valset is None:
36 | train_size = int(0.8 * len(trainset))
37 | trainset, valset = trainset[:train_size], trainset[train_size:]
38 | original_program = copy.deepcopy(self.student)
39 | all_predictors = [
40 | p for p in original_program.predictors() if hasattr(p, "signature")
41 | ]
42 | instructions_list = [p.signature.instructions for p in all_predictors]
43 |
44 | best_score = -np.inf
45 | best_program = None
46 |
47 | for candidate_idx in range(self.num_candidates):
48 | candidate_program = copy.deepcopy(original_program)
49 | candidate_predictors = [
50 | p for p in candidate_program.predictors() if hasattr(p, "signature")
51 | ]
52 | for i, predictor in enumerate(candidate_predictors):
53 | predictor.signature.instructions = instructions_list[i]
54 | for i, predictor in enumerate(candidate_predictors):
55 | rules = self.induce_natural_language_rules(predictor, trainset)
56 | predictor.signature.instructions = instructions_list[i]
57 | self.update_program_instructions(predictor, rules)
58 | score = self.evaluate_program(candidate_program, valset)
59 | if score > best_score:
60 | best_score = score
61 | best_program = candidate_program
62 | print(
63 | f"New best candidate (Candidate {candidate_idx+1}) with score {best_score}"
64 | )
65 | print("Final best score:", best_score)
66 | self.student = best_program
67 | return best_program
68 |
69 | def induce_natural_language_rules(self, predictor, trainset):
70 | demos = self.get_predictor_demos(trainset, predictor)
71 | signature = predictor.signature
72 | while True:
73 | examples_text = self.format_examples(demos, signature)
74 | try:
75 | natural_language_rules = self.rules_induction_program(examples_text)
76 | break
77 | except Exception as e:
78 |                 print(f"Rules induction failed with {e.__class__.__name__}; retrying with fewer demos.")
79 |                 print(f"Current number of demos: {len(demos)}")
80 |
81 |                 if (
82 |                     isinstance(e, ValueError)
83 |                     or e.__class__.__name__ == "BadRequestError"
84 |                     or "ContextWindowExceededError" in str(e)
85 |                 ):
86 |                     if len(demos) > 1:
87 |                         demos = demos[:-1]  # drop one demo and retry
88 |                     else:
89 |                         msg = "Failed to generate natural language rules: a single example could not fit in context."
90 |                         raise RuntimeError(msg) from e
91 |                 else:
92 |                     raise  # unexpected error: do not retry with fewer demos
93 |         return natural_language_rules
94 |
95 | def update_program_instructions(self, predictor, natural_language_rules):
96 | predictor.signature.instructions = (
97 | f"{predictor.signature.instructions}\n\n"
98 | f"Please apply the following rules when making your prediction:\n{natural_language_rules}"
99 | )
100 |
101 | def format_examples(self, demos, signature):
102 | examples_text = ""
103 | for demo in demos:
104 | input_fields = {
105 | k: v for k, v in demo.items() if k in signature.input_fields
106 | }
107 | output_fields = {
108 | k: v for k, v in demo.items() if k in signature.output_fields
109 | }
110 | input_text = "\n".join(f"{k}: {v}" for k, v in input_fields.items())
111 | output_text = "\n".join(f"{k}: {v}" for k, v in output_fields.items())
112 | examples_text += f"Example:\n{input_text}\n{output_text}\n\n"
113 | return examples_text
114 |
115 | def get_predictor_demos(self, trainset, predictor):
116 | signature = predictor.signature
117 | return [
118 | {
119 | key: value
120 | for key, value in example.items()
121 | if key in signature.input_fields or key in signature.output_fields
122 | }
123 | for example in trainset
124 | ]
125 |
126 | def evaluate_program(self, program, dataset):
127 | evaluate = Evaluate(
128 | devset=dataset,
129 | metric=self.metric,
130 | num_threads=self.num_threads,
131 | max_errors=self.max_errors,
132 | display_table=False,
133 | display_progress=True,
134 | return_all_scores=True,
135 | )
136 | score, _ = evaluate(program, metric=self.metric)
137 | return score
138 |
139 |
140 | class RulesInductionProgramINFER(dspy.Module):
141 | def __init__(self, num_rules, teacher_settings=None, verbose=False):
142 | super().__init__()
143 | docstring = f"""Given a set of examples, extract a set of {num_rules} concise and non-redundant natural language rules that explain the patterns in the data. These rules should be specific and actionable, providing clear guidance for performing the task."""
144 |
145 | class CustomRulesInduction(dspy.Signature):
146 | __doc__ = docstring
147 | examples_text = dspy.InputField(desc="Text containing examples")
148 | natural_language_rules = dspy.OutputField(
149 | desc="Induced natural language rules"
150 | )
151 |
152 | self.rules_induction = dspy.ChainOfThought(CustomRulesInduction)
153 | self.verbose = verbose
154 | self.teacher_settings = teacher_settings or {}
155 |
156 | def forward(self, examples_text):
157 | original_temp = dspy.settings.lm.kwargs.get("temperature", 0.7)
158 | if self.teacher_settings:
159 | with dspy.settings.context(**self.teacher_settings):
160 | print("Using teacher settings")
161 | print(dspy.settings.lm.model)
162 | dspy.settings.lm.kwargs["temperature"] = random.uniform(0.9, 1.0)
163 | print(dspy.settings.lm.kwargs["temperature"])
164 | prediction = self.rules_induction(examples_text=examples_text)
165 | else:
166 |             # No teacher settings: sample a high temperature on the default LM
167 |             # to diversify the induced rules across candidates.
168 | dspy.settings.lm.kwargs["temperature"] = random.uniform(0.9, 1.0)
169 | prediction = self.rules_induction(examples_text=examples_text)
170 | dspy.settings.lm.kwargs["temperature"] = original_temp
171 | natural_language_rules = prediction.natural_language_rules.strip()
172 | if self.verbose:
173 | print(natural_language_rules)
174 | return natural_language_rules
175 |
176 |
177 | @dataclass
178 | class OptimizerConfig:
179 | optimizer: Type[dspy.teleprompt.Teleprompter]
180 | init_args: dict
181 | compile_args: dict
182 | langProBe_configs: dict
183 | name: str
184 |
185 | def __str__(self):
186 | return f"""
187 | [[
188 | Optimizer: {self.name} ({self.optimizer})
189 | init_args: {self.init_args}
190 | compile_args: {self.compile_args}
191 | langProBe_configs: {self.langProBe_configs}
192 | ]]
193 | """
194 |
195 | def __repr__(self):
196 | return self.__str__()
197 |
198 |
199 | # Optimizer configuration formats:
200 | DEFAULT_OPTIMIZERS = [
201 | OptimizerConfig(
202 | optimizer=dspy.teleprompt.BootstrapFewShot,
203 | init_args=dict(max_errors=5000, max_labeled_demos=2),
204 | compile_args=dict(),
205 | langProBe_configs=dict(use_valset=False, save_candidate_score=False),
206 | name="BootstrapFewShot",
207 | ),
208 | OptimizerConfig(
209 | optimizer=dspy.teleprompt.BootstrapFewShotWithRandomSearch,
210 | init_args=dict(max_errors=5000, max_labeled_demos=2, num_threads=16),
211 | compile_args=dict(),
212 | langProBe_configs=dict(use_valset=True, save_candidate_score=True),
213 | name="BootstrapFewShotWithRandomSearch",
214 | ),
215 | OptimizerConfig(
216 | optimizer=dspy.teleprompt.MIPROv2,
217 | init_args=dict(max_errors=5000, auto="medium", num_threads=16),
218 | compile_args=dict(
219 | requires_permission_to_run=False,
220 | num_trials=20,
221 | max_bootstrapped_demos=4,
222 | max_labeled_demos=2,
223 | ),
224 | langProBe_configs=dict(
225 | use_valset=True,
226 | save_candidate_score=True,
227 | ),
228 | name="MIPROv2-lite",
229 | ),
230 | OptimizerConfig(
231 | optimizer=dspy.teleprompt.MIPROv2,
232 | init_args=dict(max_errors=5000, num_threads=16, num_candidates=12),
233 | compile_args=dict(
234 | requires_permission_to_run=False,
235 | num_trials=50,
236 | max_bootstrapped_demos=4,
237 | max_labeled_demos=2,
238 | minibatch_size=35,
239 | minibatch_full_eval_steps=5,
240 | ),
241 | langProBe_configs=dict(
242 | use_valset=True,
243 | save_candidate_score=True,
244 | ),
245 | name="MIPROv2",
246 | ),
247 | OptimizerConfig(
248 | optimizer=BootstrapFewShotInfer,
249 | init_args=dict(max_errors=5000, num_candidates=10, num_rules=10, num_threads=8),
250 | compile_args=dict(),
251 | langProBe_configs=dict(use_valset=True),
252 | name="RuleInfer-lite",
253 | ),
254 | OptimizerConfig(
255 | optimizer=BootstrapFewShotInfer,
256 | init_args=dict(max_errors=5000, num_candidates=10, num_rules=20, num_threads=8),
257 | compile_args=dict(),
258 | langProBe_configs=dict(use_valset=True),
259 | name="RuleInfer",
260 | ),
261 | ]
262 |
263 |
264 | def update_optimizer_from_list(
265 | optimizer_list: list[OptimizerConfig], optimizer: OptimizerConfig
266 | ) -> list[OptimizerConfig]:
267 | new_optimizer_list = []
268 | for optimizer_config in optimizer_list:
269 | if optimizer.optimizer == optimizer_config.optimizer:
270 | new_optimizer_list.append(optimizer)
271 | else:
272 | new_optimizer_list.append(optimizer_config)
273 | return new_optimizer_list
274 |
275 |
276 | def create_optimizer(
277 | optimizer_config: OptimizerConfig, metric, num_threads=None
278 | ) -> tuple[Callable, dict]:
279 | name = optimizer_config.name
280 | optimizer = optimizer_config.optimizer
281 | init_args = optimizer_config.init_args
282 | if num_threads and "num_threads" in init_args:
283 | init_args["num_threads"] = num_threads
284 | compile_args = optimizer_config.compile_args
285 | langProBe_configs = optimizer_config.langProBe_configs | {"name": name}
286 | optimizer = optimizer(metric=metric, **init_args)
287 | return partial(optimizer.compile, **compile_args), langProBe_configs
288 |
--------------------------------------------------------------------------------
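A minimal usage sketch for the optimizer configs above. This is hypothetical and not shipped with the repo: the `exact_match` metric, the one-example trainset, and the `question -> answer` program are placeholders, and it assumes an LM has already been configured via `dspy.settings`.

# Hypothetical sketch: applying an OptimizerConfig from DEFAULT_OPTIMIZERS.
import dspy
from langProBe.optimizers import DEFAULT_OPTIMIZERS, create_optimizer

def exact_match(example, prediction, trace=None):
    # Placeholder metric: exact string match on the `answer` field.
    return example.answer == prediction.answer

student = dspy.Predict("question -> answer")  # placeholder program
trainset = [dspy.Example(question="2+2?", answer="4").with_inputs("question")]

config = DEFAULT_OPTIMIZERS[0]  # BootstrapFewShot
compile_fn, lp_configs = create_optimizer(config, metric=exact_match, num_threads=8)
# compile_fn is optimizer.compile with compile_args pre-bound; lp_configs carries
# langProBe bookkeeping such as use_valset and the optimizer's display name.
optimized_program = compile_fn(student, trainset=trainset)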
/langProBe/program_utils.py:
--------------------------------------------------------------------------------
1 | from tenacity import retry, stop_after_attempt, wait_exponential, before_sleep_log
2 | from typing import List, Tuple, Optional, Dict, Union
3 | from openai import OpenAI
4 | import json
5 | import copy
6 | from pydantic import BaseModel, Field
7 | import re
8 | import os
9 | import langProBe.constants as constants
10 | import logging
11 | from .synced_mcp_client import SyncedMcpClient
12 |
13 | TOOL_PROMPT = """
14 | ## Tool Calling Rules
15 | When external tools are required, the call request must be strictly generated according to the following rules:
16 | <tool>
17 | {
18 |     "server_name": "<server_name>",
19 |     "tool_name": "<tool_name>",
20 |     "inputs": {
21 |         "<param_name>": "<param_value>",
22 |         "<param_name>": "<param_value>"
23 |     }
24 | }
25 | </tool>
26 | 
27 | If no tool is called, provide the final answer directly.
28 |
29 | """
30 |
31 | class ProcessManager(BaseModel):
32 | id: str = Field(
33 | default=None,
34 | description="The ID of the process.",
35 | )
36 | lm_api_key: str = Field(
37 | default=os.getenv("OPENAI_API_KEY"),
38 | description="OpenAI API Key"
39 | )
40 | lm_api_base: str = Field(
41 | default=os.getenv("OPENAI_API_BASE"),
42 | description="OpenAI API Base URL"
43 | )
44 | model: str = Field(
45 | default=None,
46 | description="OpenAI Model Name, with prefix 'openai/'"
47 | )
48 | lm_usages: List[Dict] = Field(
49 | default=[],
50 | description="Usage statistics for the model"
51 | )
52 | mcp_rts: List[Dict] = Field(
53 | default=[],
54 | description="Usage statistics for the MCPs"
55 | )
56 | mcp_retry_times: List[Dict] = Field(
57 | default=[],
58 | description="Statistics for the MCP retries"
59 | )
60 |
61 |
62 | class MCPCall(BaseModel):
63 | mcp_server_name: Optional[str] = None
64 | mcp_tool_name: Optional[str] = None
65 | mcp_args: Optional[Dict] = None
66 |
67 |
68 | class MCPCallList(BaseModel):
69 | shutdown: bool = False
70 | mcps: Optional[List[MCPCall]] = None
71 | raw_content: Optional[str] = None
72 |
73 | @retry(
74 | stop=stop_after_attempt(5),
75 | wait=wait_exponential(multiplier=1, min=2, max=10),
76 | reraise=True,
77 | )
78 | def call_lm(
79 | messages: List,
80 | manager: ProcessManager,
81 | logger: logging.Logger,
82 | temperature: float|None=None,
83 | ) -> tuple[str | None, int, int]:
84 |     response = None  # defined up front so the error handler can reference it safely
85 |     try:
86 | oai = OpenAI(
87 | api_key=manager.lm_api_key,
88 | base_url=manager.lm_api_base,
89 | )
90 | prefix, model_name = manager.model.split('/')
91 | assert prefix == 'openai'
92 |
93 |         if model_name in ['deepseek-r1', 'qwq-plus', 'qwq-32b']:  # Qwen reasoning models only support streaming output
94 |             reasoning_content = ""  # accumulated reasoning trace
95 |             answer_content = ""  # accumulated final reply
96 |             is_answering = False  # whether reasoning has ended and the reply has begun
97 |
98 | completion = oai.chat.completions.create(
99 | model=model_name,
100 | messages=messages,
101 | stream=True,
102 | stream_options={
103 | "include_usage": True
104 | }
105 | )
106 | for chunk in completion:
107 |                 # When chunk.choices is empty, the chunk carries the usage statistics
108 |                 if not chunk.choices:
109 |                     usage = chunk.usage
110 |                 else:
111 |                     delta = chunk.choices[0].delta
112 |                     if hasattr(delta, 'reasoning_content') and delta.reasoning_content is not None:
113 |                         reasoning_content += delta.reasoning_content
114 |                     else:
115 |                         # Reasoning finished; the reply starts here
116 |                         if delta.content != "" and is_answering is False:
117 |                             is_answering = True
118 | answer_content += delta.content
119 | completion_tokens = usage.completion_tokens
120 | prompt_tokens = usage.prompt_tokens
121 | manager.lm_usages.append({
122 | "completion_tokens": completion_tokens,
123 | "prompt_tokens": prompt_tokens,
124 | })
125 |             return '<think>' + reasoning_content + '</think>' + answer_content, completion_tokens, prompt_tokens
126 |
127 |
128 | if temperature is not None:
129 | response = oai.beta.chat.completions.parse(
130 | messages=messages,
131 | model=model_name,
132 |                 temperature=temperature,
133 | )
134 | else:
135 | response = oai.beta.chat.completions.parse(
136 | messages=messages,
137 | model=model_name,
138 | )
139 | # print("Response is " + str(response))
140 | response_text = response.choices[0].message.content
141 | completion_tokens = response.usage.completion_tokens
142 | prompt_tokens = response.usage.prompt_tokens
143 | manager.lm_usages.append({
144 | "completion_tokens": completion_tokens,
145 | "prompt_tokens": prompt_tokens,
146 | })
147 | return response_text, completion_tokens, prompt_tokens
148 |
149 | except Exception as e:
150 | logger.error(f"ID: {manager.id}, Error in call_lm: {str(e)}")
151 | if response:
152 | logger.error(f"ID: {manager.id}, Response: {response}")
153 | raise
154 |
155 | def build_system_content(base_system: str,
156 | mcps: List,
157 | ) -> str:
158 | tools_section = "## Available Tools\n"
159 | for mcp in mcps:
160 | tools_section += f"### Server '{mcp['name']}' include following tools\n"
161 |         if mcp['name'] in ['wuying-agentbay-mcp-server', 'Playwright']:
162 |             tools_section += "When using this server for search tasks, please start from https://www.baidu.com as the initial website."
163 | url = mcp.get("url")
164 | if not url:
165 | try:
166 | port = mcp.get('run_config')[0]["port"]
167 | url = f"http://localhost:{port}/sse"
168 |             except (TypeError, IndexError, KeyError):
169 |                 raise Exception(f"No url or run_config port found for server '{mcp['name']}'")
170 | client = SyncedMcpClient(server_url=url)
171 | try:
172 | result = client.list_tools()
173 | tools = result.tools
174 | except Exception as e:
175 | raise Exception(f"Fail access to server: {mcp['name']}, error: {e}")
176 |
177 | for t in tools:
178 | tools_section += f"- {t.name}: {t.description}\n"
179 | input_schema = t.inputSchema
180 | required_params = input_schema.get("required", [])
181 | params_desc = []
182 |
183 | if "properties" in input_schema:
184 | for param_name, param_info in input_schema["properties"].items():
185 | is_required = param_name in required_params
186 | param_type = param_info.get("type", "")
187 | param_desc = param_info.get("description", "")
188 |
189 |                     req_tag = "required" if is_required else "optional"
190 | params_desc.append(
191 | f"- {param_name} ({param_type}, {req_tag}): {param_desc}"
192 | )
193 |
194 |             # Build a richer parameter description for the prompt
195 |             params_text = "\n".join(params_desc) if params_desc else "No parameters"
196 |             tools_section += f"  Parameters:\n{params_text}\n\n"
197 |
198 |     prompt = base_system + tools_section + TOOL_PROMPT
199 |
200 | return prompt
201 |
202 |
203 | def build_init_messages(
204 | base_system: str,
205 | mcps: List,
206 | user_question: str,
207 | ) -> List[Dict]:
208 | system_content = build_system_content(base_system, mcps)
209 | messages = [
210 | {
211 | constants.ROLE: constants.SYSTEM,
212 | constants.CONTENT: system_content
213 | },
214 | {
215 | constants.ROLE: constants.USER,
216 | constants.CONTENT: user_question
217 | }
218 | ]
219 | return messages
220 |
221 |
222 |
223 | def build_messages(
224 | messages: List[Dict],
225 | message_to_append: List[Dict],
226 | ) -> List[Dict]:
227 | assert messages[0][constants.ROLE] == constants.SYSTEM
228 |
229 | final_message = copy.deepcopy(messages)
230 |
231 | if message_to_append:
232 | if message_to_append[-1][constants.ROLE] == constants.USER:
233 | assert len(message_to_append) == 1
234 | assert final_message[-1][constants.ROLE] in {constants.ASSISTANT, constants.TOOL, constants.SYSTEM}
235 | final_message.extend(message_to_append)
236 | elif message_to_append[-1][constants.ROLE] == constants.ASSISTANT:
237 | assert len(message_to_append) == 1
238 | assert final_message[-1][constants.ROLE] in {constants.USER, constants.TOOL}
239 | final_message.extend(message_to_append)
240 | elif message_to_append[-1][constants.ROLE] == constants.TOOL:
241 | assert len(message_to_append) == 2
242 | assert final_message[-1][constants.ROLE] in {constants.USER, constants.TOOL}
243 | final_message.extend(message_to_append)
244 |
245 |     # TODO: handle conversations that exceed the maximum context length
246 |
247 | return final_message
248 |
249 |
250 |
251 | def response_parsing(content: str) -> MCPCallList:
252 |     pattern = r'<tool>(.*?)</tool>'
253 | matches = re.findall(pattern, content, re.DOTALL)
254 | mcps = []
255 | for match in matches:
256 |         # TODO: error handling for malformed JSON
257 | data = json.loads(match)
258 | mcps.append(MCPCall(
259 | mcp_server_name=data['server_name'].strip(),
260 | mcp_tool_name=data['tool_name'].strip(),
261 | mcp_args=data['inputs']
262 | ))
263 |
264 | if mcps:
265 | return MCPCallList(shutdown=False, mcps=mcps, raw_content=content)
266 | else:
267 | return MCPCallList(shutdown=True, mcps=None, raw_content=content)
268 |
269 |
270 | def mcp_calling(
271 | mcp_call_list: MCPCallList,
272 | manager: ProcessManager,
273 | logger: logging.Logger,
274 | ) -> List[Dict]:
275 | logger.debug(f"ID:{manager.id}, Entering mcp_calling with mcp_call_list: {mcp_call_list}")
276 |
277 | if mcp_call_list.shutdown:
278 | logger.info(f"ID:{manager.id}, Shutdown flag is set. No more MCP calling.")
279 | messages = [
280 | {
281 | constants.ROLE: constants.ASSISTANT,
282 | constants.CONTENT: mcp_call_list.raw_content if mcp_call_list.raw_content else '',
283 | }
284 | ]
285 | logger.debug(f"ID:{manager.id}, Shutdown messages prepared: {messages}")
286 | return messages
287 | else:
288 | logger.info(f"ID:{manager.id}, Processing MCP call list with {len(mcp_call_list.mcps)} MCPs.")
289 | mcp_list = mcp_call_list.mcps
290 | messages = [
291 | {
292 | constants.ROLE: constants.ASSISTANT,
293 | constants.CONTENT: mcp_call_list.raw_content if mcp_call_list.raw_content else '',
294 | constants.TOOL_CALLS: []
295 | }
296 | ]
297 | result_str = ""
298 | for idx, mcp in enumerate(mcp_list, start=1):
299 | logger.debug(f"ID:{manager.id}, Processing MCP #{idx}: {mcp}")
300 | mcp_server_name = mcp.mcp_server_name
301 | mcp_tool_name = mcp.mcp_tool_name
302 | mcp_args = mcp.mcp_args
303 |
304 | tool_call = {
305 | "type": "function",
306 | "function": {
307 | "name": mcp_tool_name,
308 | "arguments": json.dumps(mcp_args, ensure_ascii=False)
309 | }
310 | }
311 | messages[0][constants.TOOL_CALLS].append(tool_call)
312 | logger.info(f"ID:{manager.id}, Calling MCP Server: {mcp_server_name}, Tool: {mcp_tool_name}, Arguments: {mcp_args}")
313 |
314 | # Manage manager.mcp_rts and manager.mcp_retry_times
315 | from langProBe.evaluation import global_config
316 | try:
317 | parsed_data = global_config
318 |
319 | target_name = mcp_server_name
320 | port = None
321 | url = None
322 | for item in parsed_data.get("mcp_pool", []):
323 | if item.get("name") != target_name:
324 | continue
325 |
326 | url = item.get("url", "")
327 | if url:
328 | logger.debug(f"ID:{manager.id}, Found URL for MCP Server '{target_name}': {url}")
329 | break
330 | run_configs = item.get("run_config", [])
331 | for config in run_configs:
332 | port = config.get("port")
333 | if port:
334 | url = f"http://localhost:{port}/sse"
335 | logger.debug(f"ID:{manager.id}, Constructed URL for MCP Server '{target_name}': {url}")
336 | break
337 | if url:
338 | break
339 |
340 | if not url:
341 | logger.error(f"ID:{manager.id}, No valid URL found for MCP Server '{target_name}'.")
342 | raise ValueError(f"ID:{manager.id}, No valid URL found for MCP Server '{target_name}'.")
343 |
344 | client = SyncedMcpClient(server_url=url)
345 | logger.debug(f"ID:{manager.id}, Initialized SyncedMcpClient with URL: {url}")
346 | client.list_tools()
347 | logger.debug(f"ID:{manager.id}, Retrieved tool list from MCP Server '{target_name}'.")
348 | except Exception as e:
349 | logger.error(f"ID:{manager.id}, Failed to initialize SyncedMcpClient for server '{mcp_server_name}': {str(e)}")
350 | client = None
351 |
352 | if client:
353 | try:
354 | logger.debug(f"ID:{manager.id}, Calling tool '{mcp_tool_name}' with arguments: {mcp_args}")
355 | result = client.call_tool(mcp_tool_name, mcp_args)
356 | texts = [item.text for item in result.content]
357 | result_str_segment = ''.join(texts)
358 | logger.debug(f"ID:{manager.id}, Received result from tool '{mcp_tool_name}': {result_str_segment}")
359 |
360 | logger.info(f"ID:{manager.id}, MCP Server '{mcp_server_name}' returned: {result_str_segment[:5000]}")
361 |
362 | result_str += result_str_segment
363 | except Exception as e:
364 | logger.error(f"ID:{manager.id}, Error calling tool '{mcp_tool_name}' on MCP Server '{mcp_server_name}': {str(e)}")
365 | else:
366 | logger.warning(f"ID:{manager.id}, Skipping tool call for '{mcp_tool_name}' due to client initialization failure.")
367 |
368 | messages.append({
369 | constants.ROLE: constants.TOOL,
370 | constants.CONTENT: result_str[:5000],
371 | })
372 | logger.debug(f"ID:{manager.id}, Final messages prepared: {messages}")
373 | logger.info(f"ID:{manager.id}, mcp_calling completed successfully.")
374 | return messages
375 |
376 | class DotDict(dict):
377 | def __getattr__(self, key):
378 | try:
379 | return self[key]
380 | except KeyError:
381 | raise AttributeError(
382 | f"'{type(self).__name__}' object has no attribute '{key}'"
383 | )
384 |
385 | def __setattr__(self, key, value):
386 | self[key] = value
387 |
388 | def __delattr__(self, key):
389 | try:
390 | del self[key]
391 | except KeyError:
392 | raise AttributeError(
393 | f"'{type(self).__name__}' object has no attribute '{key}'"
394 | )
395 |
--------------------------------------------------------------------------------
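A sketch of the agent loop these helpers are designed for: the model replies either with a `<tool>`-wrapped JSON call (parsed by `response_parsing` and executed by `mcp_calling`) or with a final answer. This is hypothetical: the server entry, model name, and question are placeholders, and an MCP SSE server must already be reachable at the given URL.

# Hypothetical driver loop over the helpers above; not shipped with the repo.
import logging
from langProBe.program_utils import (
    ProcessManager, build_init_messages, build_messages,
    call_lm, response_parsing, mcp_calling,
)

logger = logging.getLogger(__name__)
manager = ProcessManager(id="demo-1", model="openai/qwen-max")  # placeholder model
mcps = [{"name": "example-server", "url": "http://localhost:8080/sse"}]  # placeholder

messages = build_init_messages("You are a helpful assistant.", mcps, "What is MCP?")
for _ in range(5):  # cap the number of tool-calling rounds
    content, _, _ = call_lm(messages, manager, logger)
    call_list = response_parsing(content)
    messages = build_messages(messages, mcp_calling(call_list, manager, logger))
    if call_list.shutdown:  # no <tool> block in the reply: final answer reached
        break
print(messages[-1])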
/langProBe/register_benchmark.py:
--------------------------------------------------------------------------------
1 | ########################## Benchmarks ##########################
2 | import importlib
3 |
4 |
5 | # To use registered benchmarks, do
6 | # `benchmark.benchmark, benchmark.programs, benchmark.metric`
7 | registered_benchmarks = []
8 |
9 |
10 | def check_benchmark(benchmark):
11 |     # A benchmark module must expose a `benchmark` attribute
12 |     # (the list of benchmark metas to register).
13 |     return hasattr(benchmark, "benchmark")
16 |
17 |
18 | def register_benchmark(benchmark: str):
19 | try:
20 |         # Try importing the module directly
21 | benchmark_metas = importlib.import_module(benchmark, package="langProBe")
22 | except ModuleNotFoundError:
23 |         # If that fails, fall back to the fully qualified package path
24 | benchmark_metas = importlib.import_module(f"langProBe.{benchmark}", package=None)
25 |
26 | if check_benchmark(benchmark_metas):
27 | registered_benchmarks.extend(benchmark_metas.benchmark)
28 | else:
29 | raise AssertionError(f"{benchmark} does not have the required attributes")
30 | return benchmark_metas.benchmark
31 |
32 |
33 | def register_all_benchmarks(benchmarks):
34 | for benchmark in benchmarks:
35 | register_benchmark(benchmark)
36 | return registered_benchmarks
37 |
--------------------------------------------------------------------------------
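A usage sketch, assuming the langProBe.DB, langProBe.GAIA, and langProBe.WebSearch packages each expose a `benchmark` list from their `__init__`:

# Hypothetical sketch: registering the bundled benchmark packages.
from langProBe.register_benchmark import register_all_benchmarks

all_benchmarks = register_all_benchmarks(
    ["langProBe.DB", "langProBe.GAIA", "langProBe.WebSearch"]
)
# Each registered module contributed its `benchmark` list; downstream code
# consumes `benchmark.benchmark, benchmark.programs, benchmark.metric`.
for meta in all_benchmarks:
    print(meta)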
/langProBe/synced_mcp_client.py:
--------------------------------------------------------------------------------
1 | # teamwork_mcp/synced_mcp_client.py
2 | import asyncio
3 | import atexit
4 | import logging
5 | import pickle
6 | from multiprocessing import Process, Queue, Lock
7 | from typing import Any, Tuple, Dict
8 |
9 | # Global client instance and lock, to keep the client instance unique per process
10 | _CLIENT_INSTANCE = None
11 | _CLIENT_LOCK = Lock()
12 |
13 |
14 | class SyncedMcpClient(Process):
15 | """
16 | A synchronous MCP client that runs the AsyncMCPClient in a separate process
17 | and communicates with it using multiprocessing Queues and pickle.
18 | """
19 |
20 | def __init__(self, server_url: str = None):
21 | super().__init__()
22 | # turn off logging from the logger of 'httpx'
23 | httpx_logger = logging.getLogger("httpx")
24 | httpx_logger.setLevel(logging.WARNING)
25 |
26 | self.server_url = server_url
27 | self.request_queue = Queue()
28 | self.response_queue = Queue()
29 | self.is_running = False
30 | self.daemon = True
31 | atexit.register(self.cleanup)
32 |
33 | # begin new process
34 | self.start()
35 |
36 | def run(self):
37 | """
38 | The main process function that runs the AsyncMCPClient in a separate process.
39 | """
40 | self.is_running = True
41 | asyncio.run(self._run_async_client())
42 |
43 | async def _run_async_client(self):
44 | """
45 | Runs the AsyncMCPClient and handles communication with the main process.
46 | """
47 | from .async_mcp_client import AsyncMCPClient
48 |
49 | client = AsyncMCPClient()
50 | await client.connect_to_sse_server(server_url=self.server_url)
51 |
52 | try:
53 | while self.is_running:
54 | if not self.request_queue.empty():
55 | request = self.request_queue.get()
56 | if request == 'terminate':
57 | break
58 | try:
59 | func_name, args, kwargs = pickle.loads(request)
60 | func = getattr(client, func_name)
61 | result = await func(*args, **kwargs)
62 | self.response_queue.put(pickle.dumps(('success', result)))
63 | except Exception as e:
64 | self.response_queue.put(pickle.dumps(('error', str(e))))
65 | await asyncio.sleep(0.01)
66 |
67 | except Exception as e:
68 |             logging.getLogger(__name__).exception(e)
69 | self.response_queue.put(pickle.dumps(('error', f"Client initialization error: {str(e)}")))
70 |
71 | finally:
72 | await client.cleanup()
73 |
74 | def _send_request(self, func_name: str, args: Tuple = (), kwargs: Dict = None) -> Any:
75 | """
76 | Sends a request to the async process and waits for the response.
77 | """
78 | if kwargs is None:
79 | kwargs = {}
80 | self.request_queue.put(pickle.dumps((func_name, args, kwargs)))
81 | response = self.response_queue.get(timeout=900)
82 | status, result = pickle.loads(response)
83 | if status == 'error':
84 | raise Exception(result)
85 | return result
86 |
87 | def call_tool(self, tool_name: str, tool_args: Dict = None) -> Any:
88 | """
89 | Calls a tool synchronously by sending a request to the async process.
90 | """
91 | return self._send_request('call_tool', args=(tool_name,), kwargs={'tool_args': tool_args})
92 |
93 | def get_prompt(self, name: str, arguments: dict[str, str] | None = None) -> Any:
94 | """
95 |         Retrieves a prompt synchronously by sending a request to the async process.
96 | """
97 | return self._send_request('get_prompt', args=(), kwargs={'name': name, 'arguments': arguments})
98 |
99 | def read_resource(self, uri) -> Any:
100 | """
101 |         Reads a resource synchronously by sending a request to the async process.
102 | """
103 | return self._send_request('read_resource', args=(), kwargs={'uri': uri})
104 |
105 | def list_resources(self) -> Any:
106 | return self._send_request('list_resources', args=(), kwargs={})
107 |
108 | def list_prompts(self) -> Any:
109 | return self._send_request('list_prompts', args=(), kwargs={})
110 | 
113 | def list_tools(self) -> Any:
114 | """
115 | Lists all available tools synchronously.
116 | """
117 | return self._send_request('list_tools', args=(), kwargs={})
118 |
119 | def process_query(self, query: str) -> Any:
120 | """
121 | Processes a query synchronously.
122 | """
123 | return self._send_request('process_query', args=(query,))
124 |
125 |
126 | def cleanup(self):
127 | """
128 | Cleans up resources and terminates the process.
129 | """
130 | if self.is_running:
131 | self.is_running = False
132 | self.request_queue.put('terminate')
133 | self.join(timeout=5)
134 | if self.is_alive():
135 | self.terminate()
136 | # def synced_main():
137 | # import time
138 | # client = SyncedMcpClient(server_url="http://0.0.0.0:8080/sse")
139 | #     # the constructor already starts the worker process, so no start() call is needed
140 | # result = client.call_tool("get_alerts", {"state": "CA"})
141 | # print(result)
142 | # time.sleep(5)
143 | #
144 | #
145 | # if __name__ == "__main__":
146 | # synced_main()
--------------------------------------------------------------------------------
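A short usage sketch of the request/response protocol this wrapper uses; the URL and tool listing are placeholders, and the client spawns its worker process in the constructor:

# Hypothetical sketch: synchronous tool discovery through the process wrapper.
# Each public method pickles (func_name, args, kwargs) onto request_queue; the
# child process runs the coroutine and pickles ('success', result) back.
from langProBe.synced_mcp_client import SyncedMcpClient

client = SyncedMcpClient(server_url="http://localhost:8080/sse")  # placeholder URL
result = client.list_tools()  # pickled request; blocks on the response queue
for tool in result.tools:
    print(tool.name, "-", tool.description)
client.cleanup()  # sends 'terminate' and joins the worker process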
/launch_mcps_as_sse.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Check that a config file path argument was provided
4 | if [ -z "$1" ]; then
5 |     echo "Usage: $0 <config_file>"
6 | exit 1
7 | fi
8 |
9 | # Build the full config file path
10 | CONFIG_FILE="$1"
11 | if [[ ! "$CONFIG_FILE" == /* ]]; then
12 | CONFIG_FILE="configs/$CONFIG_FILE"
13 | fi
14 |
15 | # Check that the config file exists
16 | if [[ ! -f "$CONFIG_FILE" ]]; then
17 |     echo "Config file '$CONFIG_FILE' does not exist."
18 | exit 1
19 | fi
20 |
21 | # Read the length of the mcp_pool array
22 | SERVER_COUNT=$(jq '.mcp_pool | length' "$CONFIG_FILE")
23 |
24 | if [[ "$SERVER_COUNT" -eq 0 ]]; then
25 |     echo "No servers are defined in mcp_pool."
26 | exit 1
27 | fi
28 |
29 | # Iterate over the mcp_pool array and launch each server
30 | for (( i=0; i<SERVER_COUNT; i++ )); do
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | dspy>=2.6
2 | mcp
3 | uv
4 | dashscope
5 | shortuuid
6 | anthropic
--------------------------------------------------------------------------------