├── .idea
    ├── MCPBench.iml
    ├── inspectionProfiles
    │   └── profiles_settings.xml
    ├── misc.xml
    ├── vcs.xml
    └── workspace.xml
├── LICENSE
├── README.md
├── README_zh.md
├── assets
    └── figure1.png
├── configs
    ├── mcp_config_db.json
    └── mcp_config_template.json
├── evaluation_db.sh
├── evaluation_gaia.sh
├── evaluation_websearch.sh
├── langProBe
    ├── DB
    │   ├── DB_utils
    │   │   ├── __init__.py
    │   │   └── schema.py
    │   ├── __init__.py
    │   ├── data
    │   │   └── car_bi.jsonl
    │   └── db_program.py
    ├── GAIA
    │   ├── __init__.py
    │   ├── data
    │   │   ├── 2023
    │   │   │   ├── __init__.py
    │   │   │   └── validation
    │   │   │   │   ├── 076c8171-9b3b-49b9-a477-244d2a532826.xlsx
    │   │   │   │   ├── 1f975693-876d-457b-a649-393859e79bf3.mp3
    │   │   │   │   ├── 2b3ef98c-cc05-450b-a719-711aee40ac65.mp3
    │   │   │   │   ├── 32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx
    │   │   │   │   ├── 366e2f2b-8632-4ef2-81eb-bc3877489217.pdf
    │   │   │   │   ├── 389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt
    │   │   │   │   ├── 3da89939-209c-4086-8520-7eb734e6b4ef.xlsx
    │   │   │   │   ├── 4d0aa727-86b1-406b-9b33-f870dd14a4a5.xlsx
    │   │   │   │   ├── 4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.xlsx
    │   │   │   │   ├── 54612da3-fd56-4941-80f4-5eb82330de25.xlsx
    │   │   │   │   ├── 5b2a14e8-6e59-479c-80e3-4696e8980152.jpg
    │   │   │   │   ├── 5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx
    │   │   │   │   ├── 6359a0b1-8f7b-499b-9336-840f9ab90688.png
    │   │   │   │   ├── 65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx
    │   │   │   │   ├── 67e8878b-5cef-4375-804e-e6291fdbe78a.pdf
    │   │   │   │   ├── 7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx
    │   │   │   │   ├── 7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.xlsx
    │   │   │   │   ├── 7dd30055-0198-452e-8c25-f73dbe27dcb8.pdb
    │   │   │   │   ├── 8d46b8d6-b38a-47ff-ac74-cda14cf2d19b.csv
    │   │   │   │   ├── 8f80e01c-1296-4371-9486-bb3d68651a60.png
    │   │   │   │   ├── 9318445f-fe6a-4e1b-acbf-c68228c9906a.png
    │   │   │   │   ├── 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3
    │   │   │   │   ├── 9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx
    │   │   │   │   ├── b2c257e0-3ad7-4f05-b8e3-d9da973be36e.jpg
    │   │   │   │   ├── b7f857e4-d8aa-4387-af2a-0e844df5b9d8.png
    │   │   │   │   ├── bec74516-02fc-48dc-b202-55e78d0e17cf.jsonld
    │   │   │   │   ├── bfcd99e1-0690-4b53-a85c-0174a8629083.zip
    │   │   │   │   ├── c526d8d6-5987-4da9-b24c-83466fa172f3.xlsx
    │   │   │   │   ├── cca530fc-4052-43b2-b130-b30968d8aa44.png
    │   │   │   │   ├── cca70ce6-1952-45d2-acd4-80c903b0bc49.png
    │   │   │   │   ├── cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx
    │   │   │   │   ├── d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de.png
    │   │   │   │   ├── da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx
    │   │   │   │   ├── df6561b2-7ee5-4540-baab-5095f742716a.png
    │   │   │   │   ├── e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf
    │   │   │   │   ├── edd4d4f2-1a58-45c4-b038-67337af4e029.xlsx
    │   │   │   │   ├── f918266a-b3e0-4914-865d-4faa564f1aef.py
    │   │   │   │   └── metadata.jsonl
    │   │   ├── GAIA.py
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── gaia_dev_part.jsonl
    │   │   └── statics.py
    │   └── gaia_program.py
    ├── WebSearch
    │   ├── __init__.py
    │   └── data
    │   │   ├── websearch_300.jsonl
    │   │   └── websearch_600.jsonl
    ├── __init__.py
    ├── analysis.py
    ├── async_mcp_client.py
    ├── benchmark.py
    ├── config_utils.py
    ├── constants.py
    ├── dspy_program.py
    ├── evaluation.py
    ├── evaluation_utils.py
    ├── langchain_program.py
    ├── mcp_program.py
    ├── optimizers.py
    ├── program_utils.py
    ├── register_benchmark.py
    └── synced_mcp_client.py
├── launch_mcps_as_sse.sh
├── mcpbench.pdf
└── requirements.txt


/.idea/MCPBench.iml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <module version="4">
3 |   <component name="PyDocumentationSettings">
4 |     <option name="format" value="PLAIN" />
5 |     <option name="myDocStringFormat" value="Plain" />
6 |   </component>
7 | </module>


--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="InspectionProjectProfileManager">
2 |   <settings>
3 |     <option name="USE_PROJECT_PROFILE" value="false" />
4 |     <version value="1.0" />
5 |   </settings>
6 | </component>


--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (mcpbench)" project-jdk-type="Python SDK" />
4 | </project>


--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="" vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project version="4">
  3 |   <component name="AutoImportSettings">
  4 |     <option name="autoReloadType" value="SELECTIVE" />
  5 |   </component>
  6 |   <component name="ChangeListManager">
  7 |     <list default="true" id="6cffd389-9e3d-48fd-9fe0-518297025312" name="Changes" comment="update gaia">
  8 |       <change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
  9 |       <change beforePath="$PROJECT_DIR$/README_zh.md" beforeDir="false" afterPath="$PROJECT_DIR$/README_zh.md" afterDir="false" />
 10 |       <change beforePath="$PROJECT_DIR$/configs/brave.json" beforeDir="false" />
 11 |       <change beforePath="$PROJECT_DIR$/configs/ddgo.json" beforeDir="false" />
 12 |       <change beforePath="$PROJECT_DIR$/configs/exa.json" beforeDir="false" />
 13 |       <change beforePath="$PROJECT_DIR$/configs/mysql.json" beforeDir="false" />
 14 |       <change beforePath="$PROJECT_DIR$/configs/tavily.json" beforeDir="false" />
 15 |       <change beforePath="$PROJECT_DIR$/configs/xiyan.json" beforeDir="false" />
 16 |       <change beforePath="$PROJECT_DIR$/evaluation_db.sh" beforeDir="false" afterPath="$PROJECT_DIR$/evaluation_db.sh" afterDir="false" />
 17 |       <change beforePath="$PROJECT_DIR$/evaluation_websearch.sh" beforeDir="false" afterPath="$PROJECT_DIR$/evaluation_websearch.sh" afterDir="false" />
 18 |       <change beforePath="$PROJECT_DIR$/langProBe/DB/DB_utils/async_mcp_client.py" beforeDir="false" />
 19 |       <change beforePath="$PROJECT_DIR$/langProBe/DB/DB_utils/db_metric.py" beforeDir="false" />
 20 |       <change beforePath="$PROJECT_DIR$/langProBe/DB/DB_utils/manager.py" beforeDir="false" />
 21 |       <change beforePath="$PROJECT_DIR$/langProBe/DB/DB_utils/synced_mcp_client.py" beforeDir="false" />
 22 |       <change beforePath="$PROJECT_DIR$/langProBe/DB/DB_utils/text_sse.py" beforeDir="false" />
 23 |       <change beforePath="$PROJECT_DIR$/langProBe/DB/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/langProBe/DB/__init__.py" afterDir="false" />
 24 |       <change beforePath="$PROJECT_DIR$/langProBe/DB/data/schema.txt" beforeDir="false" />
 25 |       <change beforePath="$PROJECT_DIR$/langProBe/DB/db_data.py" beforeDir="false" />
 26 |       <change beforePath="$PROJECT_DIR$/langProBe/DB/db_program.py" beforeDir="false" afterPath="$PROJECT_DIR$/langProBe/DB/db_program.py" afterDir="false" />
 27 |       <change beforePath="$PROJECT_DIR$/langProBe/WebSearch/WebSearch_utils/__init__.py" beforeDir="false" />
 28 |       <change beforePath="$PROJECT_DIR$/langProBe/WebSearch/WebSearch_utils/async_mcp_client.py" beforeDir="false" />
 29 |       <change beforePath="$PROJECT_DIR$/langProBe/WebSearch/WebSearch_utils/manager.py" beforeDir="false" />
 30 |       <change beforePath="$PROJECT_DIR$/langProBe/WebSearch/WebSearch_utils/synced_mcp_client.py" beforeDir="false" />
 31 |       <change beforePath="$PROJECT_DIR$/langProBe/WebSearch/WebSearch_utils/text_sse.py" beforeDir="false" />
 32 |       <change beforePath="$PROJECT_DIR$/langProBe/WebSearch/WebSearch_utils/websearch_metric.py" beforeDir="false" />
 33 |       <change beforePath="$PROJECT_DIR$/langProBe/WebSearch/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/langProBe/WebSearch/__init__.py" afterDir="false" />
 34 |       <change beforePath="$PROJECT_DIR$/langProBe/WebSearch/data/frames_test.jsonl" beforeDir="false" />
 35 |       <change beforePath="$PROJECT_DIR$/langProBe/WebSearch/run.py" beforeDir="false" />
 36 |       <change beforePath="$PROJECT_DIR$/langProBe/WebSearch/websearch_data.py" beforeDir="false" />
 37 |       <change beforePath="$PROJECT_DIR$/langProBe/WebSearch/websearch_program.py" beforeDir="false" />
 38 |       <change beforePath="$PROJECT_DIR$/langProBe/analysis.py" beforeDir="false" afterPath="$PROJECT_DIR$/langProBe/analysis.py" afterDir="false" />
 39 |       <change beforePath="$PROJECT_DIR$/langProBe/analysis_deprecated.py" beforeDir="false" />
 40 |       <change beforePath="$PROJECT_DIR$/langProBe/benchmark.py" beforeDir="false" afterPath="$PROJECT_DIR$/langProBe/benchmark.py" afterDir="false" />
 41 |       <change beforePath="$PROJECT_DIR$/langProBe/config_utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/langProBe/config_utils.py" afterDir="false" />
 42 |       <change beforePath="$PROJECT_DIR$/langProBe/evaluation.py" beforeDir="false" afterPath="$PROJECT_DIR$/langProBe/evaluation.py" afterDir="false" />
 43 |       <change beforePath="$PROJECT_DIR$/langProBe/program_utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/langProBe/program_utils.py" afterDir="false" />
 44 |       <change beforePath="$PROJECT_DIR$/langProBe/register_benchmark.py" beforeDir="false" afterPath="$PROJECT_DIR$/langProBe/register_benchmark.py" afterDir="false" />
 45 |       <change beforePath="$PROJECT_DIR$/launch_mcp_as_sse.sh" beforeDir="false" />
 46 |       <change beforePath="$PROJECT_DIR$/logs/websearch_Brave.log" beforeDir="false" />
 47 |       <change beforePath="$PROJECT_DIR$/logs/websearch_DuckDuckGo.log" beforeDir="false" />
 48 |       <change beforePath="$PROJECT_DIR$/logs/websearch_Exa_Search.log" beforeDir="false" />
 49 |       <change beforePath="$PROJECT_DIR$/logs/websearch_Tavily.log" beforeDir="false" />
 50 |       <change beforePath="$PROJECT_DIR$/logs/websearch_Xiyan.log" beforeDir="false" />
 51 |       <change beforePath="$PROJECT_DIR$/logs/websearch_dummy.log" beforeDir="false" />
 52 |       <change beforePath="$PROJECT_DIR$/logs/websearch_predict_bocha.log" beforeDir="false" />
 53 |       <change beforePath="$PROJECT_DIR$/logs/websearch_predict_brave.log" beforeDir="false" />
 54 |     </list>
 55 |     <option name="SHOW_DIALOG" value="false" />
 56 |     <option name="HIGHLIGHT_CONFLICTS" value="true" />
 57 |     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
 58 |     <option name="LAST_RESOLUTION" value="IGNORE" />
 59 |   </component>
 60 |   <component name="FileTemplateManagerImpl">
 61 |     <option name="RECENT_TEMPLATES">
 62 |       <list>
 63 |         <option value="Python Script" />
 64 |       </list>
 65 |     </option>
 66 |   </component>
 67 |   <component name="Git.Merge.Settings">
 68 |     <option name="BRANCH" value="origin/main" />
 69 |   </component>
 70 |   <component name="Git.Settings">
 71 |     <option name="RECENT_BRANCH_BY_REPOSITORY">
 72 |       <map>
 73 |         <entry key="$PROJECT_DIR$" value="main" />
 74 |       </map>
 75 |     </option>
 76 |     <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
 77 |   </component>
 78 |   <component name="GitHubPullRequestSearchHistory"><![CDATA[{
 79 |   "lastFilter": {
 80 |     "state": "OPEN",
 81 |     "assignee": "XGenerationLab"
 82 |   }
 83 | }]]></component>
 84 |   <component name="GithubPullRequestsUISettings"><![CDATA[{
 85 |   "selectedUrlAndAccountId": {
 86 |     "url": "https://github.com/modelscope/MCPBench.git",
 87 |     "accountId": "22aa76d6-05c9-451c-b488-3b086f2d95f1"
 88 |   }
 89 | }]]></component>
 90 |   <component name="ProjectColorInfo">{
 91 |   &quot;associatedIndex&quot;: 6
 92 | }</component>
 93 |   <component name="ProjectId" id="2w1SejINoRU4iDr0RRgQBo9NfdR" />
 94 |   <component name="ProjectLevelVcsManager">
 95 |     <ConfirmationsSetting value="2" id="Add" />
 96 |   </component>
 97 |   <component name="ProjectViewState">
 98 |     <option name="hideEmptyMiddlePackages" value="true" />
 99 |     <option name="showLibraryContents" value="true" />
100 |   </component>
101 |   <component name="PropertiesComponent"><![CDATA[{
102 |   "keyToString": {
103 |     "Python.launch_client_test.executor": "Run",
104 |     "Python.statics.executor": "Run",
105 |     "RunOnceActivity.ShowReadmeOnStart": "true",
106 |     "git-widget-placeholder": "main",
107 |     "last_opened_file_path": "/Users/shixiaorong/Desktop/MCPBench",
108 |     "node.js.detected.package.eslint": "true",
109 |     "node.js.detected.package.tslint": "true",
110 |     "node.js.selected.package.eslint": "(autodetect)",
111 |     "node.js.selected.package.tslint": "(autodetect)",
112 |     "nodejs_package_manager_path": "npm",
113 |     "vue.rearranger.settings.migration": "true"
114 |   }
115 | }]]></component>
116 |   <component name="RecentsManager">
117 |     <key name="CopyFile.RECENT_KEYS">
118 |       <recent name="$PROJECT_DIR$/langProBe/GAIA/data" />
119 |       <recent name="$PROJECT_DIR$/configs" />
120 |       <recent name="$PROJECT_DIR$/langProBe/GAIA" />
121 |       <recent name="$PROJECT_DIR$/langProBe" />
122 |       <recent name="$PROJECT_DIR$/langProBe/WebSearch/data" />
123 |     </key>
124 |     <key name="MoveFile.RECENT_KEYS">
125 |       <recent name="$PROJECT_DIR$" />
126 |       <recent name="$PROJECT_DIR$/langProBe/GAIA/data" />
127 |       <recent name="$PROJECT_DIR$/langProBe/GAIA/data/GAIA" />
128 |       <recent name="$PROJECT_DIR$/langProBe/GAIA/server" />
129 |     </key>
130 |   </component>
131 |   <component name="SharedIndexes">
132 |     <attachedChunks>
133 |       <set>
134 |         <option value="bundled-js-predefined-1d06a55b98c1-0b3e54e931b4-JavaScript-PY-241.18034.82" />
135 |         <option value="bundled-python-sdk-975db3bf15a3-2767605e8bc2-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-241.18034.82" />
136 |       </set>
137 |     </attachedChunks>
138 |   </component>
139 |   <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
140 |   <component name="TaskManager">
141 |     <task active="true" id="Default" summary="Default task">
142 |       <changelist id="6cffd389-9e3d-48fd-9fe0-518297025312" name="Changes" comment="" />
143 |       <created>1745207684567</created>
144 |       <option name="number" value="Default" />
145 |       <option name="presentableId" value="Default" />
146 |       <updated>1745207684567</updated>
147 |       <workItem from="1745207685674" duration="123461000" />
148 |       <workItem from="1745984013764" duration="884000" />
149 |     </task>
150 |     <task id="LOCAL-00001" summary="update GAIA">
151 |       <option name="closed" value="true" />
152 |       <created>1745570279500</created>
153 |       <option name="number" value="00001" />
154 |       <option name="presentableId" value="LOCAL-00001" />
155 |       <option name="project" value="LOCAL" />
156 |       <updated>1745570279500</updated>
157 |     </task>
158 |     <task id="LOCAL-00002" summary="update GAIA">
159 |       <option name="closed" value="true" />
160 |       <created>1745912634851</created>
161 |       <option name="number" value="00002" />
162 |       <option name="presentableId" value="LOCAL-00002" />
163 |       <option name="project" value="LOCAL" />
164 |       <updated>1745912634851</updated>
165 |     </task>
166 |     <task id="LOCAL-00003" summary="update README">
167 |       <option name="closed" value="true" />
168 |       <created>1745917420634</created>
169 |       <option name="number" value="00003" />
170 |       <option name="presentableId" value="LOCAL-00003" />
171 |       <option name="project" value="LOCAL" />
172 |       <updated>1745917420634</updated>
173 |     </task>
174 |     <task id="LOCAL-00004" summary="update gaia">
175 |       <option name="closed" value="true" />
176 |       <created>1745984178661</created>
177 |       <option name="number" value="00004" />
178 |       <option name="presentableId" value="LOCAL-00004" />
179 |       <option name="project" value="LOCAL" />
180 |       <updated>1745984178661</updated>
181 |     </task>
182 |     <option name="localTasksCounter" value="5" />
183 |     <servers />
184 |   </component>
185 |   <component name="TypeScriptGeneratedFilesManager">
186 |     <option name="version" value="3" />
187 |   </component>
188 |   <component name="Vcs.Log.Tabs.Properties">
189 |     <option name="TAB_STATES">
190 |       <map>
191 |         <entry key="MAIN">
192 |           <value>
193 |             <State />
194 |           </value>
195 |         </entry>
196 |       </map>
197 |     </option>
198 |   </component>
199 |   <component name="VcsManagerConfiguration">
200 |     <MESSAGE value="update GAIA" />
201 |     <MESSAGE value="update README" />
202 |     <MESSAGE value="update gaia" />
203 |     <option name="LAST_COMMIT_MESSAGE" value="update gaia" />
204 |   </component>
205 |   <component name="com.intellij.coverage.CoverageDataManagerImpl">
206 |     <SUITE FILE_PATH="coverage/MCPBench$statics.coverage" NAME="statics Coverage Results" MODIFIED="1745837773196" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/langProBe" />
207 |     <SUITE FILE_PATH="coverage/MCPBench$launch_client_test.coverage" NAME="launch_client_test Coverage Results" MODIFIED="1745320720038" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/langProBe" />
208 |   </component>
209 | </project>


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <h1 align="center">
  2 | 	🦊 MCPBench: A Benchmark for Evaluating MCP Servers
  3 | </h1>
  4 | 
  5 | 
  6 | 
  7 | <div align="center">
  8 | 
  9 | [![Documentation][docs-image]][docs-url]
 10 | [![Package License][package-license-image]][package-license-url]
 11 | 
 12 | </div>
 13 | 
 14 | <div align="center">
 15 | <h4 align="center">
 16 | 
 17 | [中文](https://github.com/modelscope/MCPBench/blob/main/README_zh.md) |
 18 | [English](https://github.com/modelscope/MCPBench/blob/main/README.md)
 19 | 
 20 | </h4>
 21 | </div>
 22 | 
 23 | MCPBench is an evaluation framework for MCP Servers. It supports the evaluation of three types of servers: Web Search, Database Query and GAIA, and is compatible with both local and remote MCP Servers. The framework primarily evaluates different MCP Servers (such as Brave Search, DuckDuckGo, etc.) in terms of task completion accuracy, latency, and token consumption under the same LLM and Agent configurations. Here is the [evaluation report](https://arxiv.org/abs/2504.11094).
 24 | 
 25 | <img src="assets/figure1.png" alt="MCPBench Overview" width="600"/>
 26 | 
 27 | > The implementation refers to [LangProBe: a Language Programs Benchmark](https://arxiv.org/abs/2502.20315).\
 28 | > Big thanks to Qingxu Fu for the initial implementation!
 29 | 
 30 | <hr>
 31 | 
 32 | 
 33 | 
 34 | # 📋 Table of Contents
 35 | 
 36 | - [🔥 News](#news)
 37 | - [🛠️ Installation](#installation)
 38 | - [🚀 Quick Start](#quick-start)
 39 |   - [Launch MCP Server](#launch-mcp-server)
 40 |   - [Launch Evaluation](#launch-evaluation)
 41 | - [🧂 Datasets and Experiments](#datasets-and-experiments)
 42 | - [🚰 Cite](#cite)
 43 | 
 44 | # 🔥 News
 45 | + `Apr. 29, 2025` 🌟 Update the code for evaluating the MCP Server Package within GAIA.
 46 | + `Apr. 14, 2025` 🌟 We are proud to announce that MCPBench is now open-sourced.
 47 | 
 48 | # 🛠️ Installation
 49 | The framework requires Python version >= 3.11, nodejs and jq.
 50 | 
 51 | ```bash
 52 | conda create -n mcpbench python=3.11 -y
 53 | conda activate mcpbench
 54 | pip install -r requirements.txt
 55 | ```
 56 | # 🚀 Quick Start
 57 | Please first determine the type of MCP server you want to use:
 58 | - If it is a remote host (accessed via **SSE**, such as [ModelScope](https://modelscope.cn/mcp), [Smithery](https://smithery.ai), or localhost), you can directly conduct the [evaluation](#launch-evaluation).
 59 | - If it is started locally (accessed via npx using **STDIO**), you need to launch it.
 60 | 
 61 | ## Launch MCP Server (optional for stdio)
 62 | First, you need to write the following configuration:
 63 | ```json
 64 | {
 65 |     "mcp_pool": [
 66 |         {
 67 |             "name": "firecrawl",
 68 |             "run_config": [
 69 |                 {
 70 |                     "command": "npx -y firecrawl-mcp",
 71 |                     "args": "FIRECRAWL_API_KEY=xxx",
 72 |                     "port": 8005
 73 |                 }
 74 |             ]
 75 |         }  
 76 |     ]
 77 | }
 78 | ```
 79 | Save this config file in the `configs` folder and launch it using:
 80 | 
 81 | ```bash
 82 | sh launch_mcps_as_sse.sh YOUR_CONFIG_FILE
 83 | ```
 84 | 
 85 | For example, save the above configuration in the `configs/firecrawl.json` file and launch it using:
 86 | 
 87 | ```bash
 88 | sh launch_mcps_as_sse.sh firecrawl.json
 89 | ```
 90 | 
 91 | ## Launch Evaluation
 92 | To evaluate the MCP Server's performance, you need to set up the necessary MCP Server information. the code will automatically detect the tools and parameters in the Server, so you don't need to configure them manually, like:
 93 | ```json
 94 | {
 95 |     "mcp_pool": [
 96 |         {
 97 |             "name": "Remote MCP example",
 98 |             "url": "url from https://modelscope.cn/mcp or https://smithery.ai"
 99 |         },
100 |         {
101 |             "name": "firecrawl (Local run example)",
102 |             "run_config": [
103 |                 {
104 |                     "command": "npx -y firecrawl-mcp",
105 |                     "args": "FIRECRAWL_API_KEY=xxx",
106 |                     "port": 8005
107 |                 }
108 |             ]
109 |         }  
110 |     ]
111 | }
112 | ```
113 | 
114 | To evaluate the MCP Server's performance on WebSearch tasks:
115 | ```bash
116 | sh evaluation_websearch.sh YOUR_CONFIG_FILE
117 | ```
118 | 
119 | To evaluate the MCP Server's performance on Database Query tasks:
120 | ```bash
121 | sh evaluation_db.sh YOUR_CONFIG_FILE
122 | ```
123 | 
124 | To evaluate the MCP Server's performance on GAIA tasks:
125 | ```bash
126 | sh evaluation_gaia.sh YOUR_CONFIG_FILE
127 | ```
128 | 
129 | For example, save the above configuration in the `configs/firecrawl.json` file and launch it using:
130 | 
131 | ```bash
132 | sh evaluation_websearch.sh firecrawl.json
133 | ```
134 | 
135 | # Datasets and Experimental Results
136 | Our framework provides two datasets for evaluation. For the WebSearch task, the dataset is located at `MCPBench/langProBe/WebSearch/data/websearch_600.jsonl`, containing 200 QA pairs each from [Frames](https://arxiv.org/abs/2409.12941), news, and technology domains. Our framework for automatically constructing evaluation datasets will be open-sourced later.
137 | 
138 | For the Database Query task, the dataset is located at `MCPBench/langProBe/DB/data/car_bi.jsonl`. You can add your own dataset in the following format:
139 | 
140 | ```json
141 | {
142 |   "unique_id": "",
143 |   "Prompt": "",
144 |   "Answer": ""
145 | }
146 | ```
147 | 
148 | We have evaluated mainstream MCP Servers on both tasks. For detailed experimental results, please refer to [Documentation](https://arxiv.org/abs/2504.11094)
149 | 
150 | # 🚰 Cite
151 | If you find this work useful, please consider citing our project or giving us a 🌟:
152 | 
153 | ```bibtex
154 | @misc{mcpbench,
155 |   title={MCPBench: A Benchmark for Evaluating MCP Servers},
156 |   author={Zhiling Luo, Xiaorong Shi, Xuanrui Lin, Jinyang Gao},
157 |   howpublished = {\url{https://github.com/modelscope/MCPBench}},
158 |   year={2025}
159 | }
160 | ```
161 | 
162 | Alternatively, you may reference our report.
163 | ```bibtex
164 | @article{mcpbench_report,
165 |       title={Evaluation Report on MCP Servers}, 
166 |       author={Zhiling Luo, Xiaorong Shi, Xuanrui Lin, Jinyang Gao},
167 |       year={2025},
168 |       journal={arXiv preprint arXiv:2504.11094},
169 |       url={https://arxiv.org/abs/2504.11094},
170 |       primaryClass={cs.AI}
171 | }
172 | ```
173 | 
174 | [docs-image]: https://img.shields.io/badge/Documentation-EB3ECC
175 | [docs-url]: https://arxiv.org/abs/2504.11094
176 | [package-license-image]: https://img.shields.io/badge/License-Apache_2.0-blue.svg
177 | [package-license-url]: https://github.com/modelscope/MCPBench/blob/main/LICENSE
178 | 
179 | 


--------------------------------------------------------------------------------
/README_zh.md:
--------------------------------------------------------------------------------
  1 | <h1 align="center">
  2 | 	🦊 MCPBench: A Benchmark for Evaluating MCP Servers
  3 | </h1>
  4 | 
  5 | <div align="center">
  6 | 
  7 | [![文档][docs-image]][docs-url]
  8 | [![软件包许可证][package-license-image]][package-license-url]
  9 | 
 10 | </div>
 11 | 
 12 | <div align="center">
 13 | <h4 align="center">
 14 | 
 15 | [中文](https://github.com/modelscope/MCPBench/blob/main/README_zh.md) |
 16 | [English](https://github.com/modelscope/MCPBench/blob/main/README.md)
 17 | 
 18 | </h4>
 19 | </div>
 20 | 
 21 | MCPBench 是一个用于评估 MCP Server的基准测试框架。它支持评估三种类型的服务器：网络搜索、数据库查询和GAIA任务，并且兼容本地和远程 MCP 服务器。该框架主要在相同的 LLM 和 Agent 配置下，从任务完成准确性、延迟和 Token 消耗等方面评估不同的 MCP 服务器（如 Brave Search、DuckDuckGo 等）。详见[评估报告](https://arxiv.org/abs/2504.11094)。
 22 | 
 23 | <img src="assets/figure1.png" alt="MCPBench 概览" width="600"/>
 24 | 
 25 | > 实现参考了 [LangProBe: a Language Programs Benchmark](https://arxiv.org/abs/2502.20315)。\
 26 | > 特别感谢 Qingxu Fu 的初始实现！
 27 | 
 28 | <hr>
 29 | 
 30 | # 📋 目录
 31 | 
 32 | - [🔥 最新动态](#news)
 33 | - [🛠️ 安装](#installation)
 34 | - [🚀 快速开始](#quick-start)
 35 |   - [启动 MCP 服务器](#launch-mcp-server)
 36 |   - [启动评测](#launch-evaluation)
 37 | - [🧂 数据集与实验](#datasets-and-experiments)
 38 | - [🚰 引用](#cite)
 39 | 
 40 | # 🔥 最新动态
 41 | + `2025年4月29日` 🌟 更新了GAIA内MCP Server Package的评测代码。
 42 | + `2025年4月14日` 🌟 MCPBench 正式开源。
 43 | 
 44 | # 🛠️ 安装
 45 | 本框架需要 Python >= 3.11、nodejs 和 jq。
 46 | 
 47 | ```bash
 48 | conda create -n mcpbench python=3.11 -y
 49 | conda activate mcpbench
 50 | pip install -r requirements.txt
 51 | ```
 52 | # 🚀 快速开始
 53 | 请先确定你要使用的 MCP 服务器类型：
 54 | - 若为远程主机（通过 **SSE** 访问，如 [ModelScope](https://modelscope.cn/mcp)、[Smithery](https://smithery.ai) 或 localhost），可直接进行[评测](#launch-evaluation)。
 55 | - 若为本地启动（通过 npx 以 **STDIO** 访问），你需要启动MCP服务器。
 56 | ## 启动 MCP 服务器
 57 | 首先，需要编写如下配置：
 58 | ```json
 59 | {
 60 |     "mcp_pool": [
 61 |         {
 62 |             "name": "firecrawl",
 63 |             "run_config": [
 64 |                 {
 65 |                     "command": "npx -y firecrawl-mcp",
 66 |                     "args": "FIRECRAWL_API_KEY=xxx",
 67 |                     "port": 8005
 68 |                 }
 69 |             ]
 70 |         }  
 71 |     ]
 72 | }
 73 | ```
 74 | 将该配置文件保存至 `configs` 文件夹，并通过如下命令启动：
 75 | 
 76 | ```bash
 77 | sh launch_mcps_as_sse.sh YOUR_CONFIG_FILE
 78 | ```
 79 | 
 80 | 例如，将上述配置保存为 `configs/firecrawl.json`，并通过如下命令启动：
 81 | 
 82 | ```bash
 83 | sh launch_mcps_as_sse.sh firecrawl.json
 84 | ```
 85 | 
 86 | ## 启动评测
 87 | 要评测 MCP 服务器性能，需设置相关信息。代码会自动检测服务器中的工具和参数，无需手动配置。例如：
 88 | 
 89 | ```json
 90 | {
 91 |     "mcp_pool": [
 92 |         {
 93 |             "name": "Remote MCP example",
 94 |             "url": "url from https://modelscope.cn/mcp or https://smithery.ai"
 95 |         },
 96 |         {
 97 |             "name": "firecrawl (Local run example)",
 98 |             "run_config": [
 99 |                 {
100 |                     "command": "npx -y firecrawl-mcp",
101 |                     "args": "FIRECRAWL_API_KEY=xxx",
102 |                     "port": 8005
103 |                 }
104 |             ]
105 |         }  
106 |     ]
107 | }
108 | ```
109 | 
110 | 评测 MCP 服务器在网页搜索任务上的表现：
111 | ```bash
112 | sh evaluation_websearch.sh YOUR_CONFIG_FILE
113 | ```
114 | 
115 | 评测 MCP 服务器在数据库查询任务上的表现：
116 | ```bash
117 | sh evaluation_db.sh YOUR_CONFIG_FILE
118 | ```
119 | 
120 | 评测 MCP 服务器在 GAIA 任务上的表现：
121 | ```bash
122 | sh evaluation_gaia.sh YOUR_CONFIG_FILE
123 | ```
124 | 
125 | 例如，将上述配置保存为 `configs/firecrawl.json`，并通过如下命令启动：
126 | 
127 | ```bash
128 | sh evaluation_websearch.sh firecrawl.json
129 | ```
130 | 
131 | # 数据集与实验结果
132 | 本框架提供了两类评测数据集：
133 | - 网页搜索任务数据集位于 `MCPBench/langProBe/WebSearch/data/websearch_600.jsonl`，包含来自 [Frames](https://arxiv.org/abs/2409.12941)、新闻、科技领域的各200组问答对。自动化构建评测数据集的工具后续也将开源。
134 | - 数据库查询任务数据集位于 `MCPBench/langProBe/DB/data/car_bi.jsonl`。你也可以按如下格式自定义数据集：
135 | 
136 | ```json
137 | {
138 |   "unique_id": "",
139 |   "Prompt": "",
140 |   "Answer": ""
141 | }
142 | ```
143 | 
144 | 我们已在主流 MCP 服务器上完成了上述任务的评测。详细实验结果请参考[文档](https://arxiv.org/abs/2504.11094)。
145 | 
146 | # 🚰 引用
147 | 如果本项目对你有帮助，请引用我们的工作或是给我们一个🌟：
148 | 
149 | ```bibtex
150 | @misc{mcpbench,
151 |   title={MCPBench: A Benchmark for Evaluating MCP Servers},
152 |   author={Zhiling Luo, Xiaorong Shi, Xuanrui Lin, Jinyang Gao},
153 |   howpublished = {\url{https://github.com/modelscope/MCPBench}},
154 |   year={2025}
155 | }
156 | ```
157 | 
158 | 或引用我们的报告：
159 | ```bibtex
160 | @article{mcpbench_report,
161 |       title={Evaluation Report on MCP Servers}, 
162 |       author={Zhiling Luo, Xiaorong Shi, Xuanrui Lin, Jinyang Gao},
163 |       year={2025},
164 |       journal={arXiv preprint arXiv:2504.11094},
165 |       url={https://arxiv.org/abs/2504.11094},
166 |       primaryClass={cs.AI}
167 | }
168 | ```
169 | 
170 | [docs-image]: https://img.shields.io/badge/Documentation-EB3ECC
171 | [docs-url]: https://arxiv.org/abs/2504.11094
172 | [package-license-image]: https://img.shields.io/badge/License-Apache_2.0-blue.svg
173 | [package-license-url]: https://github.com/modelscope/MCPBench/blob/main/LICENSE
174 | 
175 | 


--------------------------------------------------------------------------------
/assets/figure1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/assets/figure1.png


--------------------------------------------------------------------------------
/configs/mcp_config_db.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "mcp_pool": [
 3 |         {
 4 |             "name": "mysql",
 5 |             "run_config": [
 6 |                 {
 7 |                     "command": "uvx --from mysql-mcp-server mysql_mcp_server",
 8 |                     "args": "MYSQL_HOST=localhost MYSQL_PORT=3306 MYSQL_USER=root MYSQL_PASSWORD=xxx MYSQL_DATABASE=car_bi",
 9 |                     "port": 8005
10 |                 }
11 |             ]
12 |         }
13 |     ],
14 |     "query_type": "SQL"
15 | }
16 | 


--------------------------------------------------------------------------------
/configs/mcp_config_template.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "mcp_pool": [
 3 |         {
 4 |             "name": "Remote MCP example",
 5 |             "url": "url from https://modelscope.cn/mcp or https://smithery.ai"
 6 |         },
 7 |         {
 8 |             "name": "Local run example",
 9 |             "run_config": [
10 |                 {
11 |                     "command": "npx -y firecrawl-mcp",
12 |                     "args": "FIRECRAWL_API_KEY=xxx",
13 |                     "port": 8005
14 |                 }
15 |             ]
16 |         }  
17 |     ]
18 | }
19 | 


--------------------------------------------------------------------------------
/evaluation_db.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 |  # 检查是否提供了配置文件路径参数
 3 |  if [ -z "$1" ]; then
 4 |    echo "Usage: $0 <config_file_path>"
 5 |    exit 1
 6 |  fi
 7 | 
 8 |  # 构造完整路径
 9 |  CONFIG_FILE="$1"
10 |  if [[ ! "$CONFIG_FILE" == /* ]]; then
11 |    CONFIG_FILE="configs/$CONFIG_FILE"
12 |  fi
13 | 
14 | 
15 | 
16 | # 使用更直接的方法启动评估程序，确保多进程正确初始化
17 | DSPY_CACHEDIR=evaluation_mcp/.dspy_cache \
18 | python -c "
19 | import multiprocessing as mp
20 | mp.set_start_method('spawn', True)
21 | from langProBe.evaluation import main
22 | main()
23 | " \
24 | --benchmark=DB \
25 | --dataset_mode=test \
26 | --dataset_path=langProBe/DB/data/car_bi.jsonl \
27 | --file_path=evaluation_db \
28 | --lm=openai/qwen-max-2025-01-25 \
29 | --lm_api_base=https://dashscope.aliyuncs.com/compatible-mode/v1 \
30 | --lm_api_key=xxx \
31 | --missing_mode_file=path/to/logs/task_messages.jsonl \
32 | --num_threads=1 \
33 | --config=$CONFIG_FILE
34 | 


--------------------------------------------------------------------------------
/evaluation_gaia.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 |  # 检查是否提供了配置文件路径参数
 3 |  if [ -z "$1" ]; then
 4 |    echo "Usage: $0 <config_file_path>"
 5 |    exit 1
 6 |  fi
 7 | 
 8 |  # 构造完整路径
 9 |  CONFIG_FILE="$1"
10 |  if [[ ! "$CONFIG_FILE" == /* ]]; then
11 |    CONFIG_FILE="configs/$CONFIG_FILE"
12 |  fi
13 | 
14 | 
15 | 
16 | # 使用更直接的方法启动评估程序，确保多进程正确初始化
17 | DSPY_CACHEDIR=evaluation_mcp/.dspy_cache \
18 | python -c "
19 | import multiprocessing as mp
20 | mp.set_start_method('spawn', True)
21 | from langProBe.evaluation import main
22 | main()
23 | " \
24 | --benchmark=GAIA \
25 | --dataset_mode=full \
26 | --dataset_path=langProBe/GAIA/data/gaia_rest.jsonl \
27 | --file_path=evaluation_gaia \
28 | --lm=openai/qwen-max-2025-01-25 \
29 | --lm_api_base=https://dashscope.aliyuncs.com/compatible-mode/v1 \
30 | --missing_mode_file=path/to/logs/task_messages.jsonl \
31 | --num_threads=1 \
32 | --config=$CONFIG_FILE
33 | 


--------------------------------------------------------------------------------
/evaluation_websearch.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 |  # 检查是否提供了配置文件路径参数
 3 |  if [ -z "$1" ]; then
 4 |    echo "Usage: $0 <config_file_path>"
 5 |    exit 1
 6 |  fi
 7 | 
 8 |  # 构造完整路径
 9 |  CONFIG_FILE="$1"
10 |  if [[ ! "$CONFIG_FILE" == /* ]]; then
11 |    CONFIG_FILE="configs/$CONFIG_FILE"
12 |  fi
13 | 
14 | 
15 | 
16 | # 使用更直接的方法启动评估程序，确保多进程正确初始化
17 | DSPY_CACHEDIR=evaluation_mcp/.dspy_cache \
18 | python -c "
19 | import multiprocessing as mp
20 | mp.set_start_method('spawn', True)
21 | from langProBe.evaluation import main
22 | main()
23 | " \
24 | --benchmark=WebSearch \
25 | --dataset_mode=full \
26 | --dataset_path=langProBe/WebSearch/data/websearch_test.jsonl \
27 | --file_path=evaluation_websearch_test \
28 | --lm=openai/deepseek-v3 \
29 | --lm_api_base=https://dashscope.aliyuncs.com/compatible-mode/v1 \
30 | --lm_api_key=xxx \
31 | --missing_mode_file=path/to/logs/task_messages.jsonl \
32 | --num_threads=1 \
33 | --config=$CONFIG_FILE


--------------------------------------------------------------------------------
/langProBe/DB/DB_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/DB/DB_utils/__init__.py


--------------------------------------------------------------------------------
/langProBe/DB/DB_utils/schema.py:
--------------------------------------------------------------------------------
  1 | SCHEMA = """
  2 | create table competitors
  3 | (
  4 |     id                      int unsigned auto_increment comment '唯一标识符'
  5 |         primary key,
  6 |     competitor_name         varchar(50)   not null comment '竞品名称',
  7 |     car_series              varchar(50)   not null comment '车系名称',
  8 |     sales                   int           not null comment '竞品销量',
  9 |     market_share_percentage decimal(5, 2) not null comment '竞品市场占有率百分比',
 10 |     record_date             date          not null comment '记录日期'
 11 | )
 12 |     comment '存储竞品销量和市场占有率' collate = utf8mb4_unicode_520_ci;
 13 | 
 14 | create table customer_flow
 15 | (
 16 |     id               int unsigned auto_increment comment '唯一标识符'
 17 |         primary key,
 18 |     region           varchar(50)   not null comment '大区',
 19 |     store            varchar(50)   not null comment '门店名称',
 20 |     first_visit_flow int           not null comment '首次到店客流量',
 21 |     total_visit_flow int           not null comment '总客流量',
 22 |     visit_datetime   datetime      not null comment '访问时间',
 23 |     conversion_rate  decimal(5, 2) not null comment '成交率'
 24 | )
 25 |     comment '存储大区、门店、客流量和成交率信息' collate = utf8mb4_unicode_520_ci;
 26 | 
 27 | create index idx_region_store
 28 |     on customer_flow (region, store);
 29 | 
 30 | create table inventory
 31 | (
 32 |     id           int unsigned auto_increment comment '唯一标识符'
 33 |         primary key,
 34 |     car_series   varchar(50)  not null comment '车系名称',
 35 |     region       varchar(50)  not null comment '大区',
 36 |     warehouse    varchar(100) not null comment '仓库名称',
 37 |     quantity     int          not null comment '库存数量',
 38 |     last_checked datetime     not null comment '最后盘点时间',
 39 |     series_type  varchar(50)  not null comment '车系类型'
 40 | )
 41 |     comment '存储库存信息' collate = utf8mb4_unicode_520_ci;
 42 | 
 43 | create table market_sales
 44 | (
 45 |     id                      int unsigned auto_increment comment '唯一标识符'
 46 |         primary key,
 47 |     total_market_sales      int  not null comment '总体市场销量',
 48 |     car_series_market_sales int  not null comment '车系市场销量',
 49 |     record_date             date not null comment '记录日期'
 50 | )
 51 |     comment '存储市场销量信息' collate = utf8mb4_unicode_520_ci;
 52 | 
 53 | create table market_share
 54 | (
 55 |     id                      int unsigned auto_increment comment '唯一标识符'
 56 |         primary key,
 57 |     car_series              varchar(50)   not null comment '车系名称',
 58 |     market_share_percentage decimal(5, 2) not null comment '市场占有率百分比',
 59 |     record_date             date          not null comment '记录日期'
 60 | )
 61 |     comment '存储车系市场占有率变化' collate = utf8mb4_unicode_520_ci;
 62 | 
 63 | create table order_stats
 64 | (
 65 |     id                            int unsigned auto_increment comment '唯一标识符'
 66 |         primary key,
 67 |     car_series                    varchar(50) not null comment '车系名称',
 68 |     region                        varchar(50) not null comment '大区',
 69 |     order_quantity                int         not null comment '订单数量',
 70 |     large_order_quantity          int         not null comment '大定数量',
 71 |     locked_order_quantity         int         not null comment '锁单数量',
 72 |     retained_large_order_quantity int         not null comment '留存大定数量'
 73 | )
 74 |     comment '存储订单统计数据' collate = utf8mb4_unicode_520_ci;
 75 | 
 76 | create table policies
 77 | (
 78 |     id             int unsigned auto_increment comment '唯一标识符'
 79 |         primary key,
 80 |     policy_name    varchar(100) not null comment '政策名称',
 81 |     description    text         null comment '政策描述',
 82 |     type           varchar(50)  not null comment '车系类型',
 83 |     effective_date date         not null comment '生效日期',
 84 |     expiry_date    date         null comment '失效日期'
 85 | )
 86 |     comment '存储国家及地方汽车产业政策' collate = utf8mb4_unicode_520_ci;
 87 | 
 88 | create table sales
 89 | (
 90 |     id          int unsigned auto_increment comment '唯一标识符'
 91 |         primary key,
 92 |     car_series  varchar(50) not null comment '车系名称',
 93 |     region      varchar(50) not null comment '大区',
 94 |     quantity    int         not null comment '销量数量',
 95 |     sale_date   date        not null comment '销售日期',
 96 |     series_type varchar(50) not null comment '车系类型'
 97 | )
 98 |     comment '存储实际销量数据' collate = utf8mb4_unicode_520_ci;
 99 | 
100 | create table sales_targets
101 | (
102 |     id             int unsigned auto_increment comment '唯一标识符'
103 |         primary key,
104 |     car_series     varchar(50) not null comment '车系名称',
105 |     region         varchar(50) not null comment '大区',
106 |     monthly_target int         not null comment '月度销量目标',
107 |     yearly_target  int         not null comment '年度销量目标'
108 | )
109 |     comment '存储各车系在各大区的销量目标' collate = utf8mb4_unicode_520_ci;
110 | """


--------------------------------------------------------------------------------
/langProBe/DB/__init__.py:
--------------------------------------------------------------------------------
 1 | from langProBe.benchmark import BenchmarkMeta, MCPBench
 2 | from .db_program import DBPredict
 3 | from langProBe.evaluation_utils import mcp_metric
 4 | 
 5 | MCP_SAMPLE_SYSTEM_PROMPT = """
 6 | You are a helpful assistant. You are able to answer questions using different tools.  
 7 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools.  
 8 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server.  
 9 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool.  
10 | The tool description includes:  
11 | A brief text description outlining the functionality of the tool.  
12 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter.
13 | """
14 | 
15 | def get_mcp_sample_benchmark():
16 |     mcp_sample_baseline = DBPredict(
17 |         max_steps=5,
18 |         system_prompt=MCP_SAMPLE_SYSTEM_PROMPT,
19 |         task_name="database_search")
20 | 
21 |     return [
22 |         BenchmarkMeta(
23 |             MCPBench,
24 |             [mcp_sample_baseline],
25 |             mcp_metric,
26 |             optimizers=[],
27 |             name="MCP_DB"
28 |         )
29 |     ]
30 | 
31 | benchmark = get_mcp_sample_benchmark()


--------------------------------------------------------------------------------
/langProBe/DB/data/car_bi.jsonl:
--------------------------------------------------------------------------------
 1 | {"unique_id": 2, "Prompt": "2025年2月19日记录的竞品名称是什么？", "Answer": "飞海科技科技有限公司"}
 2 | {"unique_id": 4, "Prompt": "华泰通安网络有限公司的销量是多少？", "Answer": "30"}
 3 | {"unique_id": 6, "Prompt": "诺依曼软件科技有限公司的记录日期是什么时候？", "Answer": "2025-01-05"}
 4 | {"unique_id": 9, "Prompt": "东方峻景网络有限公司的市场占有率是多少？", "Answer": "9.06"}
 5 | {"unique_id": 11, "Prompt": "西南区域中，系列D的订单数量是多少？", "Answer": "60"}
 6 | {"unique_id": 12, "Prompt": "华北区域中，所有车系的大定数量总和是多少？", "Answer": "98"}
 7 | {"unique_id": 13, "Prompt": "华南区域中，首次到店客流量最高的门店是哪个？", "Answer": "帅县店"}
 8 | {"unique_id": 14, "Prompt": "华东区域中，成交率最低的门店是哪个？", "Answer": "强市店"}
 9 | {"unique_id": 15, "Prompt": "西北区域中，总体市场销量最高的日期是哪一天？", "Answer": "2024-01-16"}
10 | {"unique_id": 16, "Prompt": "2024年12月，华南区域的总客流量是多少？", "Answer": "1168"}
11 | {"unique_id": 17, "Prompt": "锁单数量大于10的车系有哪些？", "Answer": "['系列C']"}
12 | {"unique_id": 18, "Prompt": "在2025年2月，华南区域的总订单数量是多少？", "Answer": "0"}
13 | {"unique_id": 19, "Prompt": "留存大定数量最多的车系是哪个？", "Answer": "系列C"}
14 | {"unique_id": 20, "Prompt": "系列A在华东区域的市场占有率是多少？", "Answer": "21.41%"}
15 | {"unique_id": 22, "Prompt": "系列B在华东区域的月度销量目标是多少？", "Answer": "58"}
16 | {"unique_id": 23, "Prompt": "系列D在2025年2月19日的市场占有率是多少？", "Answer": "19.99%"}
17 | {"unique_id": 25, "Prompt": "系列D在华北区域的年度销量目标是多少？", "Answer": "1320"}
18 | {"unique_id": 28, "Prompt": "飞海科技科技有限公司在2025年2月19日的竞品销量是多少？", "Answer": "23"}
19 | {"unique_id": 31, "Prompt": "万迅电脑传媒有限公司的竞品市场占有率百分比是多少？", "Answer": "6.92"}
20 | {"unique_id": 33, "Prompt": "2024年12月30日，系列C在华南区域的销量是多少？", "Answer": "19"}
21 | {"unique_id": 36, "Prompt": "华东区域中燃油车的库存总数是多少？", "Answer": "700"}
22 | {"unique_id": 38, "Prompt": "华南区域中系列B的库存总数是多少？", "Answer": "533"}
23 | {"unique_id": 39, "Prompt": "仓库名称为'梧州市仓库'的库存总数是多少？", "Answer": "330"}
24 | {"unique_id": 40, "Prompt": "系列C在西南区域的库存总数是多少？", "Answer": "177"}
25 | {"unique_id": 44, "Prompt": "所有政策中，哪些政策的类型是‘燃油车’？", "Answer": "['燃油车新购补贴', '燃油车置换补贴']"}
26 | {"unique_id": 45, "Prompt": "最早生效的政策名称是什么？", "Answer": "新能源置换补贴"}
27 | {"unique_id": 46, "Prompt": "失效日期在2024年12月30日之后的政策有哪些？", "Answer": "['燃油车新购补贴', '燃油车置换补贴']"}
28 | {"unique_id": 47, "Prompt": "描述为‘新能源新购补贴’的政策的生效日期是什么时候？", "Answer": "2024-08-16"}
29 | {"unique_id": 48, "Prompt": "名称包含‘置换’的政策有哪些？", "Answer": "['燃油车置换补贴', '新能源置换补贴']"}
30 | {"unique_id": 49, "Prompt": "政策类型为‘新能源’且在2024年内生效的政策有哪些？", "Answer": "['新能源新购补贴', '新能源置换补贴']"}
31 | {"unique_id": 50, "Prompt": "哪条政策的有效期最长？", "Answer": "燃油车新购补贴"}
32 | {"unique_id": 51, "Prompt": "政策‘新能源新购补贴’是否已经失效？", "Answer": "True"}
33 | {"unique_id": 54, "Prompt": "车系市场销量最高的记录日期是哪一天？", "Answer": "2025-01-27"}
34 | {"unique_id": 55, "Prompt": "西南区域系列D的年度销量目标是多少？", "Answer": "1032"}
35 | {"unique_id": 56, "Prompt": "所有政策中，生效日期最早的是哪个政策？", "Answer": "新能源置换补贴"}
36 | {"unique_id": 58, "Prompt": "华东区域系列C的月度销量目标是多少？", "Answer": "97"}
37 | {"unique_id": 59, "Prompt": "哪些政策在2025年仍然有效？", "Answer": "燃油车新购补贴, 燃油车置换补贴"}
38 | {"unique_id": 60, "Prompt": "华北区域系列B的年度销量目标是多少？", "Answer": "2244"}
39 | {"unique_id": 61, "Prompt": "总体市场销量最低的记录日期是哪一天？", "Answer": "2025-02-11"}
40 | {"unique_id": 62, "Prompt": "华南区域系列A的月度销量目标是多少？", "Answer": "184"}
41 | {"unique_id": 63, "Prompt": "系列D在西南区域的库存总数是多少？", "Answer": "253"}
42 | {"unique_id": 64, "Prompt": "系列B在华北区域的总库存量是多少？", "Answer": "396"}
43 | {"unique_id": 65, "Prompt": "华东区域系列A的库存总量是多少？", "Answer": "374"}
44 | {"unique_id": 66, "Prompt": "华南区域系列C的库存总量是多少？", "Answer": "278"}
45 | {"unique_id": 68, "Prompt": "系列B的竞品市场占有率总和是多少？", "Answer": "23.17"}
46 | {"unique_id": 69, "Prompt": "系列A在西南区域的月度销量目标是多少？", "Answer": "57"}
47 | {"unique_id": 70, "Prompt": "系列C在华东区域的年度销量目标是多少？", "Answer": "1164"}
48 | {"unique_id": 71, "Prompt": "系列B在华南区域的库存总量是多少？", "Answer": "533"}
49 | {"unique_id": 72, "Prompt": "记录日期为2025-02-12的竞品销量总和是多少？", "Answer": "61"}
50 | {"unique_id": 74, "Prompt": "车系市场销量最高的记录日期是哪一天？", "Answer": "2025-01-27"}
51 | {"unique_id": 76, "Prompt": "政策‘燃油车新购补贴’的生效日期是什么时候？", "Answer": "2024-02-02"}
52 | {"unique_id": 77, "Prompt": "哪些政策在2025年仍然有效？", "Answer": "['燃油车新购补贴', '燃油车置换补贴']"}
53 | {"unique_id": 78, "Prompt": "总体市场销量最低的记录日期是哪一天？", "Answer": "2025-02-11"}
54 | {"unique_id": 79, "Prompt": "新能源相关的政策有哪些？", "Answer": "['新能源新购补贴', '新能源置换补贴']"}
55 | {"unique_id": 80, "Prompt": "2025年1月11日的车系市场销量是多少？", "Answer": "91"}
56 | {"unique_id": 81, "Prompt": "政策‘新能源新购补贴’的失效日期是什么时候？", "Answer": "2024-12-23"}
57 | {"unique_id": 84, "Prompt": "系列A在2025年2月7日的竞品销量是多少？", "Answer": "87"}
58 | {"unique_id": 85, "Prompt": "华东区域系列C的库存总数是多少？", "Answer": "355"}
59 | {"unique_id": 86, "Prompt": "系列B的竞品市场占有率最高的公司名称是什么？", "Answer": "华泰通安网络有限公司"}
60 | {"unique_id": 87, "Prompt": "2025年2月27日的车系市场销量是多少？", "Answer": "88"}
61 | {"unique_id": 88, "Prompt": "系列D在华北区域的库存总数是多少？", "Answer": "344"}
62 | {"unique_id": 90, "Prompt": "2025年1月20日的总体市场销量是多少？", "Answer": "742"}
63 | {"unique_id": 91, "Prompt": "系列B在华南区域的库存总数是多少？", "Answer": "533"}
64 | {"unique_id": 94, "Prompt": "系列A在华南区域的库存总数是多少？", "Answer": "562"}
65 | {"unique_id": 95, "Prompt": "新能源车的总库存数量是多少？", "Answer": "2385"}
66 | {"unique_id": 96, "Prompt": "哪个仓库的库存数量最多，数量是多少？", "Answer": "梧州县仓库, 297"}
67 | {"unique_id": 97, "Prompt": "华北区域中燃油车的库存总数是多少？", "Answer": "616"}
68 | {"unique_id": 98, "Prompt": "最后盘点时间在2025年1月的库存总数是多少？", "Answer": "1518"}
69 | {"unique_id": 99, "Prompt": "系列B在西南区域的库存总数是多少？", "Answer": "489"}
70 | {"unique_id": 100, "Prompt": "华东区域中新能源车的库存总数是多少？", "Answer": "959"}
71 | {"unique_id": 101, "Prompt": "系列C在华南区域的库存总数是多少？", "Answer": "278"}
72 | {"unique_id": 102, "Prompt": "2025年2月盘点的库存总数是多少？", "Answer": "2133"}


--------------------------------------------------------------------------------
/langProBe/DB/db_program.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import logging
  3 | import os
  4 | import re
  5 | import time
  6 | import traceback
  7 | from datetime import datetime
  8 | from typing import List, Tuple, Optional
  9 | 
 10 | import dspy
 11 | from openai import OpenAI
 12 | 
 13 | from langProBe.dspy_program import LangProBeDSPyMetaProgram
 14 | import langProBe.constants as constants
 15 | 
 16 | from langProBe.mcp_program import MCPPredict
 17 | from langProBe.program_utils import (
 18 |     call_lm,
 19 |     build_init_messages,
 20 |     build_messages,
 21 |     response_parsing,
 22 |     mcp_calling,
 23 |     ProcessManager
 24 | )
 25 | 
 26 | MCP_SAMPLE_SYSTEM_PROMPT = """
 27 | You are a helpful assistant. You are able to answer questions using different tools.  
 28 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools.  
 29 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server.  
 30 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool.  
 31 | The tool description includes:  
 32 | A brief text description outlining the functionality of the tool.  
 33 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter.
 34 | """
 35 | 
 36 | USER_PROMPT_SQL = """
 37 | Here is the database schema
 38 | {schema}
 39 | 
 40 | Question:
 41 | {question}
 42 | """
 43 | 
 44 | USER_PROMPT_NL = """
 45 | Question:
 46 | {question}
 47 | """
 48 | 
 49 | class DBPredict(MCPPredict):
 50 |     def __init__(self, max_steps=5, system_prompt=MCP_SAMPLE_SYSTEM_PROMPT, task_name="database_search"):
 51 |         super().__init__(max_steps, system_prompt, task_name)
 52 | 
 53 |     def forward(self, **kwargs) -> dspy.Prediction:
 54 |         unique_id = kwargs.get('id')
 55 |         question = kwargs.get('question')
 56 |         gt = kwargs.get('answer')
 57 | 
 58 |         manager = ProcessManager()
 59 |         manager.lm_api_key = self.lm.api_key
 60 |         manager.lm_api_base = self.lm.api_base
 61 |         manager.model = self.lm.model
 62 |         manager.id = unique_id
 63 | 
 64 |         self.run_logger.info(f"ID: {manager.id}, Starting forward pass for question: {question}")
 65 | 
 66 |         from langProBe.evaluation import global_config
 67 |         mcps = global_config['mcp_pool']
 68 | 
 69 |         from langProBe.evaluation import global_config
 70 |         if global_config.get('query_type', 'NL') == 'SQL':
 71 |             from .DB_utils.schema import SCHEMA
 72 |             user_prompt = USER_PROMPT_SQL.format(schema=SCHEMA, question=question)
 73 |         else:
 74 |             user_prompt = USER_PROMPT_NL.format(question=question)
 75 | 
 76 |         messages = build_init_messages(self.system_prompt, mcps, user_prompt)
 77 |         steps = 0
 78 |         all_completion_tokens = 0
 79 |         all_prompt_tokens = 0
 80 |         start_time = time.time()
 81 | 
 82 |         while not messages[-1][constants.ROLE] == constants.ASSISTANT and steps < self.max_steps:
 83 |             response, completion_tokens, prompt_tokens = call_lm(messages, manager, self.run_logger)
 84 |             all_completion_tokens += completion_tokens
 85 |             all_prompt_tokens += prompt_tokens
 86 |             mcp_calls = response_parsing(response)
 87 | 
 88 |             new_messages = mcp_calling(mcp_calls, manager, self.run_logger)
 89 |             messages = build_messages(messages, new_messages)
 90 |             steps += 1
 91 | 
 92 |         end_time = time.time()
 93 | 
 94 |         if messages[-1][constants.ROLE] != constants.ASSISTANT:
 95 |             self.run_logger.warning("Maximum steps reached without getting an answer")
 96 |             messages.append({
 97 |                 constants.ROLE: constants.ASSISTANT,
 98 |                 constants.CONTENT: "超过最长次数限制，该问题无法解决",
 99 |             })
100 | 
101 |         self.run_logger.info(f"ID: {manager.id}, Forward pass completed successfully")
102 |         success = self.evaluate_prediction(question, gt, messages[-1][constants.CONTENT])
103 |         self.log_messages(messages, question, success, (end_time - start_time), all_prompt_tokens,
104 |                           all_completion_tokens)
105 |         self.run_logger.info(f"ID: {manager.id}, Evaluation completed successfully")
106 | 
107 |         return dspy.Prediction(
108 |             success=success,
109 |             question=question,
110 |             ground_truth=gt,
111 |             answer=messages[-1][constants.CONTENT],
112 |             trace=messages,
113 |             process_report=manager
114 |         )
115 | 


--------------------------------------------------------------------------------
/langProBe/GAIA/__init__.py:
--------------------------------------------------------------------------------
 1 | from langProBe.benchmark import BenchmarkMeta, MCPBench
 2 | from langProBe.mcp_program import MCPPredict
 3 | from langProBe.evaluation_utils import mcp_metric
 4 | from .gaia_program import GAIAPredict
 5 | 
 6 | MCP_SAMPLE_SYSTEM_PROMPT = """
 7 | You are a helpful assistant. You are able to answer questions using different tools.  
 8 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools.  
 9 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server.  
10 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool.  
11 | The tool description includes:  
12 | A brief text description outlining the functionality of the tool.  
13 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter.
14 | If you have obtained the final result. Please provide your final answer enclosed within <answer></answer> tags. Ensure that only the final answer is included, without any additional explanations or commentary.
15 | """
16 | def get_mcp_sample_benchmark():
17 |     mcp_sample_baseline = GAIAPredict(
18 |                                 max_steps=50, 
19 |                                 system_prompt=MCP_SAMPLE_SYSTEM_PROMPT, 
20 |                                 task_name="gaia")
21 |     
22 |     return [
23 |         BenchmarkMeta(
24 |             MCPBench,
25 |             [mcp_sample_baseline],
26 |             mcp_metric,
27 |             optimizers=[],
28 |             name="MCP_GAIA"
29 |         )
30 |     ]
31 | 
32 | benchmark = get_mcp_sample_benchmark()


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/__init__.py


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/076c8171-9b3b-49b9-a477-244d2a532826.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/076c8171-9b3b-49b9-a477-244d2a532826.xlsx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/1f975693-876d-457b-a649-393859e79bf3.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/1f975693-876d-457b-a649-393859e79bf3.mp3


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/2b3ef98c-cc05-450b-a719-711aee40ac65.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/2b3ef98c-cc05-450b-a719-711aee40ac65.mp3


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/366e2f2b-8632-4ef2-81eb-bc3877489217.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/366e2f2b-8632-4ef2-81eb-bc3877489217.pdf


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt:
--------------------------------------------------------------------------------
1 |        H       H           H
2 | --------------------------------
3 | H          H            H    H     


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/3da89939-209c-4086-8520-7eb734e6b4ef.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/3da89939-209c-4086-8520-7eb734e6b4ef.xlsx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/4d0aa727-86b1-406b-9b33-f870dd14a4a5.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/4d0aa727-86b1-406b-9b33-f870dd14a4a5.xlsx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.xlsx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/54612da3-fd56-4941-80f4-5eb82330de25.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/54612da3-fd56-4941-80f4-5eb82330de25.xlsx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/5b2a14e8-6e59-479c-80e3-4696e8980152.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/5b2a14e8-6e59-479c-80e3-4696e8980152.jpg


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/6359a0b1-8f7b-499b-9336-840f9ab90688.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/6359a0b1-8f7b-499b-9336-840f9ab90688.png


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/67e8878b-5cef-4375-804e-e6291fdbe78a.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/67e8878b-5cef-4375-804e-e6291fdbe78a.pdf


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.xlsx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/8d46b8d6-b38a-47ff-ac74-cda14cf2d19b.csv:
--------------------------------------------------------------------------------
  1 | species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
  2 | Adelie,Torgersen,39.1,18.7,181,3750,MALE
  3 | Adelie,Torgersen,39.5,17.4,186,3800,FEMALE
  4 | Adelie,Torgersen,40.3,18,195,3250,FEMALE
  5 | Adelie,Torgersen,,,,,
  6 | Adelie,Torgersen,36.7,19.3,193,3450,FEMALE
  7 | Adelie,Torgersen,39.3,20.6,190,3650,MALE
  8 | Adelie,Torgersen,38.9,17.8,181,3625,FEMALE
  9 | Adelie,Torgersen,39.2,19.6,195,4675,MALE
 10 | Adelie,Torgersen,34.1,18.1,193,3475,
 11 | Adelie,Torgersen,42,20.2,190,4250,
 12 | Adelie,Torgersen,37.8,17.1,186,3300,
 13 | Adelie,Torgersen,37.8,17.3,180,3700,
 14 | Adelie,Torgersen,41.1,17.6,182,3200,FEMALE
 15 | Adelie,Torgersen,38.6,21.2,191,3800,MALE
 16 | Adelie,Torgersen,34.6,21.1,198,4400,MALE
 17 | Adelie,Torgersen,36.6,17.8,185,3700,FEMALE
 18 | Adelie,Torgersen,38.7,19,195,3450,FEMALE
 19 | Adelie,Torgersen,42.5,20.7,197,4500,MALE
 20 | Adelie,Torgersen,34.4,18.4,184,3325,FEMALE
 21 | Adelie,Torgersen,46,21.5,194,4200,MALE
 22 | Adelie,Biscoe,37.8,18.3,174,3400,FEMALE
 23 | Adelie,Biscoe,37.7,18.7,180,3600,MALE
 24 | Adelie,Biscoe,35.9,19.2,189,3800,FEMALE
 25 | Adelie,Biscoe,38.2,18.1,185,3950,MALE
 26 | Adelie,Biscoe,38.8,17.2,180,3800,MALE
 27 | Adelie,Biscoe,35.3,18.9,187,3800,FEMALE
 28 | Adelie,Biscoe,40.6,18.6,183,3550,MALE
 29 | Adelie,Biscoe,40.5,17.9,187,3200,FEMALE
 30 | Adelie,Biscoe,37.9,18.6,172,3150,FEMALE
 31 | Adelie,Biscoe,40.5,18.9,180,3950,MALE
 32 | Adelie,Dream,39.5,16.7,178,3250,FEMALE
 33 | Adelie,Dream,37.2,18.1,178,3900,MALE
 34 | Adelie,Dream,39.5,17.8,188,3300,FEMALE
 35 | Adelie,Dream,40.9,18.9,184,3900,MALE
 36 | Adelie,Dream,36.4,17,195,3325,FEMALE
 37 | Adelie,Dream,39.2,21.1,196,4150,MALE
 38 | Adelie,Dream,38.8,20,190,3950,MALE
 39 | Adelie,Dream,42.2,18.5,180,3550,FEMALE
 40 | Adelie,Dream,37.6,19.3,181,3300,FEMALE
 41 | Adelie,Dream,39.8,19.1,184,4650,MALE
 42 | Adelie,Dream,36.5,18,182,3150,FEMALE
 43 | Adelie,Dream,40.8,18.4,195,3900,MALE
 44 | Adelie,Dream,36,18.5,186,3100,FEMALE
 45 | Adelie,Dream,44.1,19.7,196,4400,MALE
 46 | Adelie,Dream,37,16.9,185,3000,FEMALE
 47 | Adelie,Dream,39.6,18.8,190,4600,MALE
 48 | Adelie,Dream,41.1,19,182,3425,MALE
 49 | Adelie,Dream,37.5,18.9,179,2975,
 50 | Adelie,Dream,36,17.9,190,3450,FEMALE
 51 | Adelie,Dream,42.3,21.2,191,4150,MALE
 52 | Adelie,Biscoe,39.6,17.7,186,3500,FEMALE
 53 | Adelie,Biscoe,40.1,18.9,188,4300,MALE
 54 | Adelie,Biscoe,35,17.9,190,3450,FEMALE
 55 | Adelie,Biscoe,42,19.5,200,4050,MALE
 56 | Adelie,Biscoe,34.5,18.1,187,2900,FEMALE
 57 | Adelie,Biscoe,41.4,18.6,191,3700,MALE
 58 | Adelie,Biscoe,39,17.5,186,3550,FEMALE
 59 | Adelie,Biscoe,40.6,18.8,193,3800,MALE
 60 | Adelie,Biscoe,36.5,16.6,181,2850,FEMALE
 61 | Adelie,Biscoe,37.6,19.1,194,3750,MALE
 62 | Adelie,Biscoe,35.7,16.9,185,3150,FEMALE
 63 | Adelie,Biscoe,41.3,21.1,195,4400,MALE
 64 | Adelie,Biscoe,37.6,17,185,3600,FEMALE
 65 | Adelie,Biscoe,41.1,18.2,192,4050,MALE
 66 | Adelie,Biscoe,36.4,17.1,184,2850,FEMALE
 67 | Adelie,Biscoe,41.6,18,192,3950,MALE
 68 | Adelie,Biscoe,35.5,16.2,195,3350,FEMALE
 69 | Adelie,Biscoe,41.1,19.1,188,4100,MALE
 70 | Adelie,Torgersen,35.9,16.6,190,3050,FEMALE
 71 | Adelie,Torgersen,41.8,19.4,198,4450,MALE
 72 | Adelie,Torgersen,33.5,19,190,3600,FEMALE
 73 | Adelie,Torgersen,39.7,18.4,190,3900,MALE
 74 | Adelie,Torgersen,39.6,17.2,196,3550,FEMALE
 75 | Adelie,Torgersen,45.8,18.9,197,4150,MALE
 76 | Adelie,Torgersen,35.5,17.5,190,3700,FEMALE
 77 | Adelie,Torgersen,42.8,18.5,195,4250,MALE
 78 | Adelie,Torgersen,40.9,16.8,191,3700,FEMALE
 79 | Adelie,Torgersen,37.2,19.4,184,3900,MALE
 80 | Adelie,Torgersen,36.2,16.1,187,3550,FEMALE
 81 | Adelie,Torgersen,42.1,19.1,195,4000,MALE
 82 | Adelie,Torgersen,34.6,17.2,189,3200,FEMALE
 83 | Adelie,Torgersen,42.9,17.6,196,4700,MALE
 84 | Adelie,Torgersen,36.7,18.8,187,3800,FEMALE
 85 | Adelie,Torgersen,35.1,19.4,193,4200,MALE
 86 | Adelie,Dream,37.3,17.8,191,3350,FEMALE
 87 | Adelie,Dream,41.3,20.3,194,3550,MALE
 88 | Adelie,Dream,36.3,19.5,190,3800,MALE
 89 | Adelie,Dream,36.9,18.6,189,3500,FEMALE
 90 | Adelie,Dream,38.3,19.2,189,3950,MALE
 91 | Adelie,Dream,38.9,18.8,190,3600,FEMALE
 92 | Adelie,Dream,35.7,18,202,3550,FEMALE
 93 | Adelie,Dream,41.1,18.1,205,4300,MALE
 94 | Adelie,Dream,34,17.1,185,3400,FEMALE
 95 | Adelie,Dream,39.6,18.1,186,4450,MALE
 96 | Adelie,Dream,36.2,17.3,187,3300,FEMALE
 97 | Adelie,Dream,40.8,18.9,208,4300,MALE
 98 | Adelie,Dream,38.1,18.6,190,3700,FEMALE
 99 | Adelie,Dream,40.3,18.5,196,4350,MALE
100 | Adelie,Dream,33.1,16.1,178,2900,FEMALE
101 | Adelie,Dream,43.2,18.5,192,4100,MALE
102 | Adelie,Biscoe,35,17.9,192,3725,FEMALE
103 | Adelie,Biscoe,41,20,203,4725,MALE
104 | Adelie,Biscoe,37.7,16,183,3075,FEMALE
105 | Adelie,Biscoe,37.8,20,190,4250,MALE
106 | Adelie,Biscoe,37.9,18.6,193,2925,FEMALE
107 | Adelie,Biscoe,39.7,18.9,184,3550,MALE
108 | Adelie,Biscoe,38.6,17.2,199,3750,FEMALE
109 | Adelie,Biscoe,38.2,20,190,3900,MALE
110 | Adelie,Biscoe,38.1,17,181,3175,FEMALE
111 | Adelie,Biscoe,43.2,19,197,4775,MALE
112 | Adelie,Biscoe,38.1,16.5,198,3825,FEMALE
113 | Adelie,Biscoe,45.6,20.3,191,4600,MALE
114 | Adelie,Biscoe,39.7,17.7,193,3200,FEMALE
115 | Adelie,Biscoe,42.2,19.5,197,4275,MALE
116 | Adelie,Biscoe,39.6,20.7,191,3900,FEMALE
117 | Adelie,Biscoe,42.7,18.3,196,4075,MALE
118 | Adelie,Torgersen,38.6,17,188,2900,FEMALE
119 | Adelie,Torgersen,37.3,20.5,199,3775,MALE
120 | Adelie,Torgersen,35.7,17,189,3350,FEMALE
121 | Adelie,Torgersen,41.1,18.6,189,3325,MALE
122 | Adelie,Torgersen,36.2,17.2,187,3150,FEMALE
123 | Adelie,Torgersen,37.7,19.8,198,3500,MALE
124 | Adelie,Torgersen,40.2,17,176,3450,FEMALE
125 | Adelie,Torgersen,41.4,18.5,202,3875,MALE
126 | Adelie,Torgersen,35.2,15.9,186,3050,FEMALE
127 | Adelie,Torgersen,40.6,19,199,4000,MALE
128 | Adelie,Torgersen,38.8,17.6,191,3275,FEMALE
129 | Adelie,Torgersen,41.5,18.3,195,4300,MALE
130 | Adelie,Torgersen,39,17.1,191,3050,FEMALE
131 | Adelie,Torgersen,44.1,18,210,4000,MALE
132 | Adelie,Torgersen,38.5,17.9,190,3325,FEMALE
133 | Adelie,Torgersen,43.1,19.2,197,3500,MALE
134 | Adelie,Dream,36.8,18.5,193,3500,FEMALE
135 | Adelie,Dream,37.5,18.5,199,4475,MALE
136 | Adelie,Dream,38.1,17.6,187,3425,FEMALE
137 | Adelie,Dream,41.1,17.5,190,3900,MALE
138 | Adelie,Dream,35.6,17.5,191,3175,FEMALE
139 | Adelie,Dream,40.2,20.1,200,3975,MALE
140 | Adelie,Dream,37,16.5,185,3400,FEMALE
141 | Adelie,Dream,39.7,17.9,193,4250,MALE
142 | Adelie,Dream,40.2,17.1,193,3400,FEMALE
143 | Adelie,Dream,40.6,17.2,187,3475,MALE
144 | Adelie,Dream,32.1,15.5,188,3050,FEMALE
145 | Adelie,Dream,40.7,17,190,3725,MALE
146 | Adelie,Dream,37.3,16.8,192,3000,FEMALE
147 | Adelie,Dream,39,18.7,185,3650,MALE
148 | Adelie,Dream,39.2,18.6,190,4250,MALE
149 | Adelie,Dream,36.6,18.4,184,3475,FEMALE
150 | Adelie,Dream,36,17.8,195,3450,FEMALE
151 | Adelie,Dream,37.8,18.1,193,3750,MALE
152 | Adelie,Dream,36,17.1,187,3700,FEMALE
153 | Adelie,Dream,41.5,18.5,201,4000,MALE
154 | Chinstrap,Dream,46.5,17.9,192,3500,FEMALE
155 | Chinstrap,Dream,50,19.5,196,3900,MALE
156 | Chinstrap,Dream,51.3,19.2,193,3650,MALE
157 | Chinstrap,Dream,45.4,18.7,188,3525,FEMALE
158 | Chinstrap,Dream,52.7,19.8,197,3725,MALE
159 | Chinstrap,Dream,45.2,17.8,198,3950,FEMALE
160 | Chinstrap,Dream,46.1,18.2,178,3250,FEMALE
161 | Chinstrap,Dream,51.3,18.2,197,3750,MALE
162 | Chinstrap,Dream,46,18.9,195,4150,FEMALE
163 | Chinstrap,Dream,51.3,19.9,198,3700,MALE
164 | Chinstrap,Dream,46.6,17.8,193,3800,FEMALE
165 | Chinstrap,Dream,51.7,20.3,194,3775,MALE
166 | Chinstrap,Dream,47,17.3,185,3700,FEMALE
167 | Chinstrap,Dream,52,18.1,201,4050,MALE
168 | Chinstrap,Dream,45.9,17.1,190,3575,FEMALE
169 | Chinstrap,Dream,50.5,19.6,201,4050,MALE
170 | Chinstrap,Dream,50.3,20,197,3300,MALE
171 | Chinstrap,Dream,58,17.8,181,3700,FEMALE
172 | Chinstrap,Dream,46.4,18.6,190,3450,FEMALE
173 | Chinstrap,Dream,49.2,18.2,195,4400,MALE
174 | Chinstrap,Dream,42.4,17.3,181,3600,FEMALE
175 | Chinstrap,Dream,48.5,17.5,191,3400,MALE
176 | Chinstrap,Dream,43.2,16.6,187,2900,FEMALE
177 | Chinstrap,Dream,50.6,19.4,193,3800,MALE
178 | Chinstrap,Dream,46.7,17.9,195,3300,FEMALE
179 | Chinstrap,Dream,52,19,197,4150,MALE
180 | Chinstrap,Dream,50.5,18.4,200,3400,FEMALE
181 | Chinstrap,Dream,49.5,19,200,3800,MALE
182 | Chinstrap,Dream,46.4,17.8,191,3700,FEMALE
183 | Chinstrap,Dream,52.8,20,205,4550,MALE
184 | Chinstrap,Dream,40.9,16.6,187,3200,FEMALE
185 | Chinstrap,Dream,54.2,20.8,201,4300,MALE
186 | Chinstrap,Dream,42.5,16.7,187,3350,FEMALE
187 | Chinstrap,Dream,51,18.8,203,4100,MALE
188 | Chinstrap,Dream,49.7,18.6,195,3600,MALE
189 | Chinstrap,Dream,47.5,16.8,199,3900,FEMALE
190 | Chinstrap,Dream,47.6,18.3,195,3850,FEMALE
191 | Chinstrap,Dream,52,20.7,210,4800,MALE
192 | Chinstrap,Dream,46.9,16.6,192,2700,FEMALE
193 | Chinstrap,Dream,53.5,19.9,205,4500,MALE
194 | Chinstrap,Dream,49,19.5,210,3950,MALE
195 | Chinstrap,Dream,46.2,17.5,187,3650,FEMALE
196 | Chinstrap,Dream,50.9,19.1,196,3550,MALE
197 | Chinstrap,Dream,45.5,17,196,3500,FEMALE
198 | Chinstrap,Dream,50.9,17.9,196,3675,FEMALE
199 | Chinstrap,Dream,50.8,18.5,201,4450,MALE
200 | Chinstrap,Dream,50.1,17.9,190,3400,FEMALE
201 | Chinstrap,Dream,49,19.6,212,4300,MALE
202 | Chinstrap,Dream,51.5,18.7,187,3250,MALE
203 | Chinstrap,Dream,49.8,17.3,198,3675,FEMALE
204 | Chinstrap,Dream,48.1,16.4,199,3325,FEMALE
205 | Chinstrap,Dream,51.4,19,201,3950,MALE
206 | Chinstrap,Dream,45.7,17.3,193,3600,FEMALE
207 | Chinstrap,Dream,50.7,19.7,203,4050,MALE
208 | Chinstrap,Dream,42.5,17.3,187,3350,FEMALE
209 | Chinstrap,Dream,52.2,18.8,197,3450,MALE
210 | Chinstrap,Dream,45.2,16.6,191,3250,FEMALE
211 | Chinstrap,Dream,49.3,19.9,203,4050,MALE
212 | Chinstrap,Dream,50.2,18.8,202,3800,MALE
213 | Chinstrap,Dream,45.6,19.4,194,3525,FEMALE
214 | Chinstrap,Dream,51.9,19.5,206,3950,MALE
215 | Chinstrap,Dream,46.8,16.5,189,3650,FEMALE
216 | Chinstrap,Dream,45.7,17,195,3650,FEMALE
217 | Chinstrap,Dream,55.8,19.8,207,4000,MALE
218 | Chinstrap,Dream,43.5,18.1,202,3400,FEMALE
219 | Chinstrap,Dream,49.6,18.2,193,3775,MALE
220 | Chinstrap,Dream,50.8,19,210,4100,MALE
221 | Chinstrap,Dream,50.2,18.7,198,3775,FEMALE
222 | Gentoo,Biscoe,46.1,13.2,211,4500,FEMALE
223 | Gentoo,Biscoe,50,16.3,230,5700,MALE
224 | Gentoo,Biscoe,48.7,14.1,210,4450,FEMALE
225 | Gentoo,Biscoe,50,15.2,218,5700,MALE
226 | Gentoo,Biscoe,47.6,14.5,215,5400,MALE
227 | Gentoo,Biscoe,46.5,13.5,210,4550,FEMALE
228 | Gentoo,Biscoe,45.4,14.6,211,4800,FEMALE
229 | Gentoo,Biscoe,46.7,15.3,219,5200,MALE
230 | Gentoo,Biscoe,43.3,13.4,209,4400,FEMALE
231 | Gentoo,Biscoe,46.8,15.4,215,5150,MALE
232 | Gentoo,Biscoe,40.9,13.7,214,4650,FEMALE
233 | Gentoo,Biscoe,49,16.1,216,5550,MALE
234 | Gentoo,Biscoe,45.5,13.7,214,4650,FEMALE
235 | Gentoo,Biscoe,48.4,14.6,213,5850,MALE
236 | Gentoo,Biscoe,45.8,14.6,210,4200,FEMALE
237 | Gentoo,Biscoe,49.3,15.7,217,5850,MALE
238 | Gentoo,Biscoe,42,13.5,210,4150,FEMALE
239 | Gentoo,Biscoe,49.2,15.2,221,6300,MALE
240 | Gentoo,Biscoe,46.2,14.5,209,4800,FEMALE
241 | Gentoo,Biscoe,48.7,15.1,222,5350,MALE
242 | Gentoo,Biscoe,50.2,14.3,218,5700,MALE
243 | Gentoo,Biscoe,45.1,14.5,215,5000,FEMALE
244 | Gentoo,Biscoe,46.5,14.5,213,4400,FEMALE
245 | Gentoo,Biscoe,46.3,15.8,215,5050,MALE
246 | Gentoo,Biscoe,42.9,13.1,215,5000,FEMALE
247 | Gentoo,Biscoe,46.1,15.1,215,5100,MALE
248 | Gentoo,Biscoe,44.5,14.3,216,4100,
249 | Gentoo,Biscoe,47.8,15,215,5650,MALE
250 | Gentoo,Biscoe,48.2,14.3,210,4600,FEMALE
251 | Gentoo,Biscoe,50,15.3,220,5550,MALE
252 | Gentoo,Biscoe,47.3,15.3,222,5250,MALE
253 | Gentoo,Biscoe,42.8,14.2,209,4700,FEMALE
254 | Gentoo,Biscoe,45.1,14.5,207,5050,FEMALE
255 | Gentoo,Biscoe,59.6,17,230,6050,MALE
256 | Gentoo,Biscoe,49.1,14.8,220,5150,FEMALE
257 | Gentoo,Biscoe,48.4,16.3,220,5400,MALE
258 | Gentoo,Biscoe,42.6,13.7,213,4950,FEMALE
259 | Gentoo,Biscoe,44.4,17.3,219,5250,MALE
260 | Gentoo,Biscoe,44,13.6,208,4350,FEMALE
261 | Gentoo,Biscoe,48.7,15.7,208,5350,MALE
262 | Gentoo,Biscoe,42.7,13.7,208,3950,FEMALE
263 | Gentoo,Biscoe,49.6,16,225,5700,MALE
264 | Gentoo,Biscoe,45.3,13.7,210,4300,FEMALE
265 | Gentoo,Biscoe,49.6,15,216,4750,MALE
266 | Gentoo,Biscoe,50.5,15.9,222,5550,MALE
267 | Gentoo,Biscoe,43.6,13.9,217,4900,FEMALE
268 | Gentoo,Biscoe,45.5,13.9,210,4200,FEMALE
269 | Gentoo,Biscoe,50.5,15.9,225,5400,MALE
270 | Gentoo,Biscoe,44.9,13.3,213,5100,FEMALE
271 | Gentoo,Biscoe,45.2,15.8,215,5300,MALE
272 | Gentoo,Biscoe,46.6,14.2,210,4850,FEMALE
273 | Gentoo,Biscoe,48.5,14.1,220,5300,MALE
274 | Gentoo,Biscoe,45.1,14.4,210,4400,FEMALE
275 | Gentoo,Biscoe,50.1,15,225,5000,MALE
276 | Gentoo,Biscoe,46.5,14.4,217,4900,FEMALE
277 | Gentoo,Biscoe,45,15.4,220,5050,MALE
278 | Gentoo,Biscoe,43.8,13.9,208,4300,FEMALE
279 | Gentoo,Biscoe,45.5,15,220,5000,MALE
280 | Gentoo,Biscoe,43.2,14.5,208,4450,FEMALE
281 | Gentoo,Biscoe,50.4,15.3,224,5550,MALE
282 | Gentoo,Biscoe,45.3,13.8,208,4200,FEMALE
283 | Gentoo,Biscoe,46.2,14.9,221,5300,MALE
284 | Gentoo,Biscoe,45.7,13.9,214,4400,FEMALE
285 | Gentoo,Biscoe,54.3,15.7,231,5650,MALE
286 | Gentoo,Biscoe,45.8,14.2,219,4700,FEMALE
287 | Gentoo,Biscoe,49.8,16.8,230,5700,MALE
288 | Gentoo,Biscoe,46.2,14.4,214,4650,
289 | Gentoo,Biscoe,49.5,16.2,229,5800,MALE
290 | Gentoo,Biscoe,43.5,14.2,220,4700,FEMALE
291 | Gentoo,Biscoe,50.7,15,223,5550,MALE
292 | Gentoo,Biscoe,47.7,15,216,4750,FEMALE
293 | Gentoo,Biscoe,46.4,15.6,221,5000,MALE
294 | Gentoo,Biscoe,48.2,15.6,221,5100,MALE
295 | Gentoo,Biscoe,46.5,14.8,217,5200,FEMALE
296 | Gentoo,Biscoe,46.4,15,216,4700,FEMALE
297 | Gentoo,Biscoe,48.6,16,230,5800,MALE
298 | Gentoo,Biscoe,47.5,14.2,209,4600,FEMALE
299 | Gentoo,Biscoe,51.1,16.3,220,6000,MALE
300 | Gentoo,Biscoe,45.2,13.8,215,4750,FEMALE
301 | Gentoo,Biscoe,45.2,16.4,223,5950,MALE
302 | Gentoo,Biscoe,49.1,14.5,212,4625,FEMALE
303 | Gentoo,Biscoe,52.5,15.6,221,5450,MALE
304 | Gentoo,Biscoe,47.4,14.6,212,4725,FEMALE
305 | Gentoo,Biscoe,50,15.9,224,5350,MALE
306 | Gentoo,Biscoe,44.9,13.8,212,4750,FEMALE
307 | Gentoo,Biscoe,50.8,17.3,228,5600,MALE
308 | Gentoo,Biscoe,43.4,14.4,218,4600,FEMALE
309 | Gentoo,Biscoe,51.3,14.2,218,5300,MALE
310 | Gentoo,Biscoe,47.5,14,212,4875,FEMALE
311 | Gentoo,Biscoe,52.1,17,230,5550,MALE
312 | Gentoo,Biscoe,47.5,15,218,4950,FEMALE
313 | Gentoo,Biscoe,52.2,17.1,228,5400,MALE
314 | Gentoo,Biscoe,45.5,14.5,212,4750,FEMALE
315 | Gentoo,Biscoe,49.5,16.1,224,5650,MALE
316 | Gentoo,Biscoe,44.5,14.7,214,4850,FEMALE
317 | Gentoo,Biscoe,50.8,15.7,226,5200,MALE
318 | Gentoo,Biscoe,49.4,15.8,216,4925,MALE
319 | Gentoo,Biscoe,46.9,14.6,222,4875,FEMALE
320 | Gentoo,Biscoe,48.4,14.4,203,4625,FEMALE
321 | Gentoo,Biscoe,51.1,16.5,225,5250,MALE
322 | Gentoo,Biscoe,48.5,15,219,4850,FEMALE
323 | Gentoo,Biscoe,55.9,17,228,5600,MALE
324 | Gentoo,Biscoe,47.2,15.5,215,4975,FEMALE
325 | Gentoo,Biscoe,49.1,15,228,5500,MALE
326 | Gentoo,Biscoe,47.3,13.8,216,4725,
327 | Gentoo,Biscoe,46.8,16.1,215,5500,MALE
328 | Gentoo,Biscoe,41.7,14.7,210,4700,FEMALE
329 | Gentoo,Biscoe,53.4,15.8,219,5500,MALE
330 | Gentoo,Biscoe,43.3,14,208,4575,FEMALE
331 | Gentoo,Biscoe,48.1,15.1,209,5500,MALE
332 | Gentoo,Biscoe,50.5,15.2,216,5000,FEMALE
333 | Gentoo,Biscoe,49.8,15.9,229,5950,MALE
334 | Gentoo,Biscoe,43.5,15.2,213,4650,FEMALE
335 | Gentoo,Biscoe,51.5,16.3,230,5500,MALE
336 | Gentoo,Biscoe,46.2,14.1,217,4375,FEMALE
337 | Gentoo,Biscoe,55.1,16,230,5850,MALE
338 | Gentoo,Biscoe,44.5,15.7,217,4875,
339 | Gentoo,Biscoe,48.8,16.2,222,6000,MALE
340 | Gentoo,Biscoe,47.2,13.7,214,4925,FEMALE
341 | Gentoo,Biscoe,,,,,
342 | Gentoo,Biscoe,46.8,14.3,215,4850,FEMALE
343 | Gentoo,Biscoe,50.4,15.7,222,5750,MALE
344 | Gentoo,Biscoe,45.2,14.8,212,5200,FEMALE
345 | Gentoo,Biscoe,49.9,16.1,213,5400,MALE
346 | 


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/8f80e01c-1296-4371-9486-bb3d68651a60.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/8f80e01c-1296-4371-9486-bb3d68651a60.png


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/9318445f-fe6a-4e1b-acbf-c68228c9906a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/9318445f-fe6a-4e1b-acbf-c68228c9906a.png


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/__init__.py


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/b2c257e0-3ad7-4f05-b8e3-d9da973be36e.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/b2c257e0-3ad7-4f05-b8e3-d9da973be36e.jpg


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/b7f857e4-d8aa-4387-af2a-0e844df5b9d8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/b7f857e4-d8aa-4387-af2a-0e844df5b9d8.png


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/bec74516-02fc-48dc-b202-55e78d0e17cf.jsonld:
--------------------------------------------------------------------------------
 1 | {
 2 |   "@context": "http://schema.org",
 3 |   "@type": "Collection",
 4 |   "@id": "https://doi.org/10.5447/ipk/2022/29",
 5 |   "url": "https://doi.ipk-gatersleben.de:443/DOI/64fb788c-7495-4800-8568-fd562b07017e/fbda7260-8307-485e-a9b7-d84292e3eb04/2",
 6 |   "additionalType": "directory",
 7 |   "name": "GLOBAL STRATEGY FOR THE CONSERVATION OF POTATO",
 8 |   "author": {
 9 |     "name": "Manuela Nagel",
10 |     "givenName": "Manuela",
11 |     "familyName": "Nagel",
12 |     "affiliation": {
13 |       "@type": "Organization",
14 |       "name": "Leibniz Institute of Plant Genetics and Crop Plant Research (IPK), Seeland OT Gatersleben, Corrensstraße 3, 06466, Germany"
15 |     },
16 |     "@id": "https://orcid.org/0000-0003-0396-0333"
17 |   },
18 |   "editor": [
19 |     {
20 |       "name": "Ehsan Dulloo",
21 |       "givenName": "Ehsan",
22 |       "familyName": "Dulloo",
23 |       "affiliation": {
24 |         "@type": "Organization",
25 |         "name": "International Consultant, ,"
26 |       },
27 |       "contributorType": "Researcher"
28 |     },
29 |     {
30 |       "name": "Prishnee Bissessur",
31 |       "givenName": "Prishnee",
32 |       "familyName": "Bissessur",
33 |       "affiliation": {
34 |         "@type": "Organization",
35 |         "name": "International Consultant, ,"
36 |       },
37 |       "contributorType": "Researcher"
38 |     },
39 |     {
40 |       "name": "Tatjana Gavrilenko",
41 |       "givenName": "Tatjana",
42 |       "familyName": "Gavrilenko",
43 |       "affiliation": {
44 |         "@type": "Organization",
45 |         "name": "N.I. Vavilov All-Russian Institute of Plant Genetic Resources, , Russia"
46 |       },
47 |       "contributorType": "Researcher",
48 |       "@id": "https://orcid.org/0000-0002-2605-6569"
49 |     },
50 |     {
51 |       "name": "John Bamberg",
52 |       "givenName": "John",
53 |       "familyName": "Bamberg",
54 |       "affiliation": {
55 |         "@type": "Organization",
56 |         "name": "U. S. Potato Genebank, , USA"
57 |       },
58 |       "contributorType": "Researcher",
59 |       "@id": "https://orcid.org/0000-0001-6102-7846"
60 |     },
61 |     {
62 |       "name": "David Ellis",
63 |       "givenName": "David",
64 |       "familyName": "Ellis",
65 |       "affiliation": {
66 |         "@type": "Organization",
67 |         "name": "International Potato Center (CIP), , Peru"
68 |       },
69 |       "contributorType": "Researcher",
70 |       "@id": "https://orcid.org/0000-0002-0209-2784"
71 |     },
72 |     {
73 |       "name": "Peter Giovannini",
74 |       "givenName": "Peter",
75 |       "familyName": "Giovannini",
76 |       "affiliation": {
77 |         "@type": "Organization",
78 |         "name": "Global Crop Diversity Trust, ,"
79 |       },
80 |       "contributorType": "Researcher",
81 |       "@id": "https://orcid.org/0000-0002-1053-2030"
82 |     }
83 |   ],
84 |   "description": "Cultivated potato, Solanum tuberosum ssp. tuberosum, is the third most consumed crop globally and important not only for food but also for for the animal feed, pharmaceutical, textile and paper industries. To gain an overview on the current state of the conservation and use of potato genetic resources, the Global Crop Diversity Trust (Crop Trust), commissioned an update of the ‘Global conservation strategy for potato genetic resources’. This updated strategy aims to support the efficiency and effectiveness of potato diversity conservation at national, regional and international levels, and to identify priorities for strengthening the conservation and use of potato genetic resources.",
85 |   "keywords": "ex situ conservation, plant genetic resources, potato, Solanum tuberosum, global strategy, conservation strategy, wild potato, Andigenum group, Chilotanum group, native potato variety, genebank, accession, true potato seed, potato tuber, late blight",
86 |   "inLanguage": "en",
87 |   "contentSize": "0 B",
88 |   "datePublished": "2022",
89 |   "schemaVersion": "http://datacite.org/schema/kernel-4",
90 |   "publisher": {
91 |     "@type": "Organization",
92 |     "name": "e!DAL - Plant Genomics and Phenomics Research Data Repository (PGP), IPK Gatersleben, Seeland OT Gatersleben, Corrensstraße 3, 06466, Germany"
93 |   },
94 |   "provider": {
95 |     "@type": "Organization",
96 |     "name": "datacite"
97 |   }
98 | }


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/bfcd99e1-0690-4b53-a85c-0174a8629083.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/bfcd99e1-0690-4b53-a85c-0174a8629083.zip


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/c526d8d6-5987-4da9-b24c-83466fa172f3.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/c526d8d6-5987-4da9-b24c-83466fa172f3.xlsx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/cca70ce6-1952-45d2-acd4-80c903b0bc49.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/cca70ce6-1952-45d2-acd4-80c903b0bc49.png


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de.png


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/df6561b2-7ee5-4540-baab-5095f742716a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/df6561b2-7ee5-4540-baab-5095f742716a.png


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/edd4d4f2-1a58-45c4-b038-67337af4e029.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/2023/validation/edd4d4f2-1a58-45c4-b038-67337af4e029.xlsx


--------------------------------------------------------------------------------
/langProBe/GAIA/data/2023/validation/f918266a-b3e0-4914-865d-4faa564f1aef.py:
--------------------------------------------------------------------------------
 1 | from random import randint
 2 | import time
 3 | 
 4 | class UhOh(Exception):
 5 |     pass
 6 | 
 7 | class Hmm:
 8 |     def __init__(self):
 9 |         self.value = randint(-100, 100)
10 | 
11 |     def Yeah(self):
12 |         if self.value == 0:
13 |             return True
14 |         else:
15 |             raise UhOh()
16 | 
17 | def Okay():
18 |     while True:
19 |         yield Hmm()
20 | 
21 | def keep_trying(go, first_try=True):
22 |     maybe = next(go)
23 |     try:
24 |         if maybe.Yeah():
25 |             return maybe.value
26 |     except UhOh:
27 |         if first_try:
28 |             print("Working...")
29 |             print("Please wait patiently...")
30 |         time.sleep(0.1)
31 |         return keep_trying(go, first_try=False)
32 | 
33 | if __name__ == "__main__":
34 |     go = Okay()
35 |     print(f"{keep_trying(go)}")
36 | 


--------------------------------------------------------------------------------
/langProBe/GAIA/data/GAIA.py:
--------------------------------------------------------------------------------
 1 | """GAIA 2023 dataset."""
 2 | 
 3 | 
 4 | import json
 5 | import os
 6 | 
 7 | import datasets
 8 | 
 9 | 
10 | _CITATION = """ """
11 | 
12 | _DESCRIPTION = """ """
13 | 
14 | _HOMEPAGE = ""
15 | 
16 | _LICENSE = ""
17 | 
18 | _NAMES = [
19 |     "2023_all",
20 |     "2023_level1",
21 |     "2023_level2",
22 |     "2023_level3",
23 | ]
24 | 
25 | YEAR_TO_LEVELS = {"2023": [1, 2, 3]}
26 | 
27 | separator = "_"
28 | 
29 | 
30 | class GAIA_dataset(datasets.GeneratorBasedBuilder):
31 |     VERSION = datasets.Version("0.0.1")
32 | 
33 |     BUILDER_CONFIGS = [
34 |         datasets.BuilderConfig(name=name, version=version, description=name)
35 |         for name, version in zip(_NAMES, [VERSION] * len(_NAMES))
36 |     ]
37 | 
38 |     def _info(self):
39 |         features = datasets.Features(
40 |             {
41 |                 "task_id": datasets.Value("string"),
42 |                 "Question": datasets.Value("string"),
43 |                 "Level": datasets.Value("string"),
44 |                 "Final answer": datasets.Value("string"), # ? for test values
45 |                 "file_name": datasets.Value("string"),
46 |                 "file_path": datasets.Value("string"),  # generated here
47 |                 "Annotator Metadata": {k: datasets.Value("string") for k in ["Steps", "Number of steps", "How long did this take?", "Tools", "Number of tools"]} # "", 
48 |             }
49 |         )
50 |         return datasets.DatasetInfo(
51 |             description=_DESCRIPTION,
52 |             features=features,
53 |             homepage=_HOMEPAGE,
54 |             license=_LICENSE,
55 |             citation=_CITATION,
56 |         )
57 | 
58 |     def _split_generators(self, dl_manager):
59 |         year, level_name = self.config.name.split(separator)
60 |         if level_name == "all":
61 |             levels = YEAR_TO_LEVELS[year]
62 |         else:
63 |             level_name = int(level_name.split("level")[1])
64 |             levels = [level_name]
65 |         print(year, level_name)
66 | 
67 |         output = []
68 |         for split in ["test", "validation"]:
69 |             root_file = dl_manager.download(os.path.join(year, split, "metadata.jsonl"))
70 |             test_attached_files = {"": ""}
71 |             with open(root_file, "r", encoding="utf-8") as f:
72 |                 for line in f:
73 |                     cur_line = json.loads(line)
74 |                     if cur_line["Level"] in levels and cur_line["file_name"] != "":
75 |                         attached_file_name = cur_line["file_name"]
76 |                         attached_file = dl_manager.download(os.path.join(year, split, attached_file_name))
77 |                         test_attached_files[attached_file_name] = attached_file
78 | 
79 |             output.append(
80 |                 datasets.SplitGenerator(
81 |                     name=getattr(datasets.Split, split.upper()),
82 |                     gen_kwargs={"root_file": root_file, "attached_files": test_attached_files, "levels": levels},
83 |                 )
84 |             )
85 |         return output
86 | 
87 |     # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
88 |     def _generate_examples(self, root_file: str, attached_files: dict, levels: list[int]):
89 |         with open(root_file, "r", encoding="utf-8") as f:
90 |             for key, line in enumerate(f):
91 |                 cur_line = json.loads(line)
92 |                 if cur_line["Level"] in levels:
93 |                     cur_line["file_path"] = attached_files[cur_line["file_name"]]
94 |                     yield key, cur_line
95 | 
96 | 
97 | 


--------------------------------------------------------------------------------
/langProBe/GAIA/data/README.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | language:
 3 | - en
 4 | pretty_name: General AI Assistants Benchmark
 5 | extra_gated_prompt: "To avoid contamination and data leakage, you agree to not reshare this dataset outside of a gated or private repository on the HF hub."
 6 | extra_gated_fields:
 7 |   I agree to not reshare the GAIA submissions set according to the above conditions: checkbox
 8 | ---
 9 | # GAIA dataset
10 | 
11 | GAIA is a benchmark which aims at evaluating next-generation LLMs (LLMs with augmented capabilities due to added tooling, efficient prompting, access to search, etc).
12 | 
13 | We added gating to prevent bots from scraping the dataset. Please do not reshare the validation or test set in a crawlable format.
14 | 
15 | ## Data and leaderboard
16 | GAIA is made of more than 450 non-trivial question with an unambiguous answer, requiring different levels of tooling and autonomy to solve. It is therefore divided in 3 levels, where level 1 should be breakable by very good LLMs, and level 3 indicate a strong jump in model capabilities. Each level is divided into a fully public dev set for validation, and a test set with private answers and metadata.
17 | 
18 | GAIA leaderboard can be found in this space (https://huggingface.co/spaces/gaia-benchmark/leaderboard).
19 | 
20 | Questions are contained in metadata.jsonl. Some questions come with an additional file, that can be found in the same folder and whose id is given in the field file_name.
21 | 
22 | More details in [the paper](https://arxiv.org/abs/2311.12983) for now and soon here as well.


--------------------------------------------------------------------------------
/langProBe/GAIA/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/GAIA/data/__init__.py


--------------------------------------------------------------------------------
/langProBe/GAIA/data/statics.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import re
  3 | from collections import defaultdict
  4 | 
  5 | def parse_tools(tools_str):
  6 |     """
  7 |     解析 Tools 字符串，将其分割为单独的工具列表。
  8 |     假设 Tools 字段是以数字和点开头的每行工具，例如：
  9 |     "1. Web browser
 10 |     2. Image recognition tools (to identify and parse a figure with three axes)"
 11 |     """
 12 |     tools = []
 13 |     # 使用正则表达式匹配每个工具条目
 14 |     pattern = re.compile(r'\d+\.\s*(.*)')
 15 |     for line in tools_str.split('\n'):
 16 |         match = pattern.match(line.strip())
 17 |         if match:
 18 |             tool = match.group(1).strip()
 19 |             # 去除可能的括号内说明
 20 |             tool = re.sub(r'\s*\(.*\)', '', tool)
 21 |             tools.append(tool)
 22 |     return tools
 23 | 
 24 | def process_jsonl(file_path):
 25 |     tool_counts = defaultdict(int)
 26 |     total_tools = 0
 27 |     tool_numbers = []
 28 |     processed_tasks = 0
 29 | 
 30 |     with open(file_path, 'r', encoding='utf-8') as f:
 31 |         for line_number, line in enumerate(f, 1):
 32 |             line = line.strip()
 33 |             if not line:
 34 |                 continue  # 跳过空行
 35 |             # 调试信息：确认正在处理哪一行
 36 |             print(f"处理第 {line_number} 行")
 37 | 
 38 |             try:
 39 |                 data = json.loads(line)
 40 |             except json.JSONDecodeError as e:
 41 |                 print(f"第 {line_number} 行: JSON 解码错误: {e}")
 42 |                 continue
 43 | 
 44 |             # 提取 Annotator Metadata
 45 |             annotator_metadata = data.get("Annotator Metadata", {})
 46 |             if not annotator_metadata:
 47 |                 print(f"第 {line_number} 行: 未找到 'Annotator Metadata' 字段。")
 48 |                 continue
 49 | 
 50 |             number_of_tools = annotator_metadata.get("Number of tools")
 51 |             tools_str = annotator_metadata.get("Tools", "")
 52 | 
 53 |             if number_of_tools is None:
 54 |                 print(f"第 {line_number} 行: 未找到 'Number of tools' 字段。")
 55 |             else:
 56 |                 try:
 57 |                     num_tools = int(number_of_tools)
 58 |                     tool_numbers.append(num_tools)
 59 |                 except ValueError:
 60 |                     print(f"第 {line_number} 行: 'Number of tools' 不是有效的整数。")
 61 | 
 62 |             if not tools_str:
 63 |                 print(f"第 {line_number} 行: 'Tools' 字段为空。")
 64 |                 continue
 65 | 
 66 |             tools = parse_tools(tools_str)
 67 |             print(f"第 {line_number} 行解析到的工具: {tools}")
 68 |             print(f"第 {line_number} 行的工具数量: {len(tools)}")
 69 | 
 70 |             # 验证 Number of tools 是否与解析的工具数量一致
 71 |             if number_of_tools:
 72 |                 try:
 73 |                     num_tools = int(number_of_tools)
 74 |                     if num_tools != len(tools):
 75 |                         print(f"第 {line_number} 行: Number of tools ({num_tools}) 与解析的工具数量 ({len(tools)}) 不一致。")
 76 |                 except ValueError:
 77 |                     pass  # 已在上一步处理
 78 | 
 79 |             # 统计每个工具的出现次数
 80 |             for tool in tools:
 81 |                 tool_counts[tool] += 1
 82 |                 total_tools += 1
 83 | 
 84 |             processed_tasks += 1
 85 | 
 86 |     return tool_counts, tool_numbers, total_tools, processed_tasks
 87 | 
 88 | def main():
 89 |     jsonl_file = '2023/validation/metadata.jsonl'  # 替换为你的 JSONL 文件路径
 90 |     tool_counts, tool_numbers, total_tools, processed_tasks = process_jsonl(jsonl_file)
 91 | 
 92 |     print("\n每个工具的总出现次数：")
 93 |     if not tool_counts:
 94 |         print("没有统计到任何工具。请检查文件内容和解析逻辑。")
 95 |     else:
 96 |         for tool, count in sorted(tool_counts.items(), key=lambda x: x[1], reverse=True):
 97 |             print(f"{tool}: {count}")
 98 | 
 99 |     # 计算并输出平均工具数量
100 |     if tool_numbers:
101 |         average_tools = sum(tool_numbers) / len(tool_numbers)
102 |         print(f"\n平均每个题目的工具数量: {average_tools:.2f}")
103 |     else:
104 |         print("\n没有统计到任何 'Number of tools' 数据。")
105 | 
106 |     print(f"\n总处理题目数: {processed_tasks}")
107 |     print(f"总工具数量: {total_tools}")
108 | 
109 | if __name__ == "__main__":
110 |         main()
111 | 


--------------------------------------------------------------------------------
/langProBe/GAIA/gaia_program.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import logging
  3 | import os
  4 | import re
  5 | import time
  6 | import traceback
  7 | from datetime import datetime
  8 | from typing import List, Tuple, Optional
  9 | from langProBe.evaluation_utils import question_scorer
 10 | 
 11 | from langProBe.mcp_program import MCPPredict
 12 | 
 13 | import dspy
 14 | from openai import OpenAI
 15 | 
 16 | from langProBe.dspy_program import LangProBeDSPyMetaProgram
 17 | import langProBe.constants as constants
 18 | 
 19 | from langProBe.mcp_program import MCPPredict
 20 | from langProBe.program_utils import (
 21 |     call_lm,
 22 |     build_init_messages,
 23 |     build_messages,
 24 |     response_parsing,
 25 |     mcp_calling,
 26 |     ProcessManager
 27 | )
 28 | 
 29 | MCP_SAMPLE_SYSTEM_PROMPT = """
 30 | You are a helpful assistant. You are able to answer questions using different tools.  
 31 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools.  
 32 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server.  
 33 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool.  
 34 | The tool description includes:  
 35 | A brief text description outlining the functionality of the tool.  
 36 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter.
 37 | If you have obtained the final result. Please provide your final answer enclosed within <answer></answer> tags. Ensure that only the final answer is included, without any additional explanations or commentary.
 38 | """
 39 | 
 40 | class GAIAPredict(MCPPredict):
 41 |     def __init__(self, max_steps=5, system_prompt=MCP_SAMPLE_SYSTEM_PROMPT, task_name="gaia"):
 42 |         super().__init__(max_steps, system_prompt, task_name)
 43 | 
 44 |     def evaluate_prediction(self, question: str, ground_truth: str, prediction: str) -> Tuple[bool, Optional[str]]:
 45 |         return question_scorer(prediction, ground_truth, self.run_logger)
 46 | 
 47 |     def extract_last_answer(self, text):
 48 |         pattern = re.compile(r'<answer>(.*?)</answer>', re.DOTALL)
 49 |         matches = pattern.findall(text)
 50 | 
 51 |         if matches:
 52 |             return matches[-1]
 53 |         else:
 54 |             return None
 55 | 
 56 |     def forward(self, **kwargs) -> dspy.Prediction:
 57 |         unique_id = kwargs.get('id')
 58 |         question = kwargs.get('question')
 59 |         gt = kwargs.get('answer')
 60 | 
 61 |         manager = ProcessManager()
 62 |         manager.lm_api_key = self.lm.api_key
 63 |         manager.lm_api_base = self.lm.api_base
 64 |         manager.model = self.lm.model
 65 |         manager.id = unique_id
 66 | 
 67 |         self.run_logger.info(f"ID: {manager.id}, Starting forward pass for question: {question}")
 68 | 
 69 |         from langProBe.evaluation import global_config
 70 |         mcps = global_config['mcp_pool']
 71 | 
 72 |         messages = build_init_messages(self.system_prompt, mcps, question)
 73 |         steps = 0
 74 |         all_completion_tokens = 0
 75 |         all_prompt_tokens = 0
 76 |         start_time = time.time()
 77 | 
 78 |         while not messages[-1][constants.ROLE] == constants.ASSISTANT and steps < self.max_steps:
 79 |             response, completion_tokens, prompt_tokens = call_lm(messages, manager, self.run_logger)
 80 |             all_completion_tokens += completion_tokens
 81 |             all_prompt_tokens += prompt_tokens
 82 |             mcp_calls = response_parsing(response)
 83 | 
 84 |             new_messages = mcp_calling(mcp_calls, manager, self.run_logger)
 85 |             messages = build_messages(messages, new_messages)
 86 |             steps += 1
 87 | 
 88 |         end_time = time.time()
 89 | 
 90 |         if messages[-1][constants.ROLE] != constants.ASSISTANT:
 91 |             self.run_logger.warning("Maximum steps reached without getting an answer")
 92 |             messages.append({
 93 |                 constants.ROLE: constants.ASSISTANT,
 94 |                 constants.CONTENT: "超过最长次数限制，该问题无法解决",
 95 |             })
 96 | 
 97 |         self.run_logger.info(f"ID: {manager.id}, Forward pass completed successfully")
 98 |         success = self.evaluate_prediction(question, gt, self.extract_last_answer(messages[-1][constants.CONTENT]))
 99 |         self.log_messages(messages, question, success, (end_time - start_time), all_prompt_tokens,
100 |                           all_completion_tokens)
101 |         self.run_logger.info(f"ID: {manager.id}, Evaluation completed successfully")
102 | 
103 |         return dspy.Prediction(
104 |             success=success,
105 |             question=question,
106 |             ground_truth=gt,
107 |             answer=messages[-1][constants.CONTENT],
108 |             trace=messages,
109 |             process_report=manager
110 |         )


--------------------------------------------------------------------------------
/langProBe/WebSearch/__init__.py:
--------------------------------------------------------------------------------
 1 | from langProBe.benchmark import BenchmarkMeta, MCPBench
 2 | from langProBe.mcp_program import MCPPredict
 3 | from langProBe.evaluation_utils import mcp_metric
 4 | 
 5 | MCP_SAMPLE_SYSTEM_PROMPT = """
 6 | You are a helpful assistant. You are able to answer questions using different tools.  
 7 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools.  
 8 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server.  
 9 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool.  
10 | The tool description includes:  
11 | A brief text description outlining the functionality of the tool.  
12 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter.
13 | """
14 | 
15 | def get_mcp_sample_benchmark():
16 |     mcp_sample_baseline = MCPPredict(
17 |         max_steps=5,
18 |         system_prompt=MCP_SAMPLE_SYSTEM_PROMPT,
19 |         task_name="websearch")
20 | 
21 |     return [
22 |         BenchmarkMeta(
23 |             MCPBench,
24 |             [mcp_sample_baseline],
25 |             mcp_metric,
26 |             optimizers=[],
27 |             name="MCP_WEBSEARCH"  # 添加显式名称
28 |         )
29 |     ]
30 | 
31 | benchmark = get_mcp_sample_benchmark()


--------------------------------------------------------------------------------
/langProBe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/langProBe/__init__.py


--------------------------------------------------------------------------------
/langProBe/analysis.py:
--------------------------------------------------------------------------------
  1 | import pathlib
  2 | 
  3 | import pandas as pd
  4 | 
  5 | 
  6 | def read_evaluation_results(dir: str):
  7 |     # Define the path to the directory
  8 |     file_path = pathlib.Path(dir)
  9 | 
 10 |     # List all .txt files in the directory
 11 |     all_result_files = list(file_path.rglob("*.txt"))
 12 | 
 13 |     # Initialize a list to store the extracted data
 14 |     extracted_data = []
 15 | 
 16 |     # Process each file
 17 |     for file in all_result_files:
 18 |         # Split the filename to get benchmark, program, and optimizer
 19 |         file_name_parts = file.stem.split("_")
 20 |         if len(file_name_parts) >= 3:
 21 |             benchmark = ''.join(file_name_parts[:-1])
 22 |             program = file_name_parts[-1]
 23 |         else:
 24 |             raise ValueError(f"Invalid file name: {file.name}")
 25 | 
 26 |         with open(file, "r") as f:
 27 |             lines = f.readlines()
 28 | 
 29 |             # Extract information from the lines
 30 |             if len(lines) == 2:  # Checking if we have 2 lines
 31 |                 header = lines[0].strip()
 32 |                 values = lines[1].strip().split(",")
 33 | 
 34 |                 # Check if optimizer is present in the file content
 35 |                 if "optimizer" in header:
 36 |                     # Extract values for file with optimizer
 37 |                     data = {
 38 |                         "file_name": file.name,
 39 |                         "benchmark": benchmark,
 40 |                         "program": program,
 41 |                         "score": float(values[0]),
 42 |                         "cost": float(values[1]),
 43 |                         "input_tokens": int(values[2]),
 44 |                         "output_tokens": int(values[3]),
 45 |                     }
 46 |                 else:
 47 |                     # Extract values for file without optimizer
 48 |                     data = {
 49 |                         "file_name": file.name,
 50 |                         "benchmark": benchmark,
 51 |                         "program": program,
 52 |                         "score": float(values[0]),
 53 |                         "cost": float(values[1]),
 54 |                         "input_tokens": int(values[2]),
 55 |                         "output_tokens": int(values[3]),
 56 |                     }
 57 | 
 58 |                 # Append the extracted data to the list
 59 |                 extracted_data.append(data)
 60 | 
 61 |     # Convert the list of dictionaries to a pandas DataFrame
 62 |     # import pdb; pdb.set_trace()
 63 |     df = pd.DataFrame(extracted_data)
 64 |     df = canonicalize_program(df)
 65 |     return df
 66 | 
 67 | 
 68 | program_mapping = {
 69 |     "AppWorldReact": "ReActBaseline",
 70 |     "AppWorldReactAugumented": "ReActAugumented",
 71 |     "Predict": "Predict",
 72 |     "ChainOfThought": "CoT",
 73 |     "GeneratorCriticRanker": "GeneratorCriticRanker",
 74 |     "GeneratorCriticFuser": "GeneratorCriticFuser",
 75 |     "RAG": "RAG",
 76 |     "EvaluationValidityPredict": "Predict",
 77 |     "EvaluationValidityModule": "CoT",
 78 |     "CoT": "CoT",
 79 |     "Classify": "CoTBasedVote",
 80 |     "HeartDiseaseClassify": "CoTBasedVote",
 81 |     "RetrieveMultiHop": "RetrieveMultiHop",
 82 |     "SimplifiedBaleen": "SimplifiedBaleen",
 83 |     "SimplifiedBaleenWithHandwrittenInstructions": "SimplifiedBaleenWithInst",
 84 |     "UnderspecifiedAnnotationCoT": "CoT",
 85 |     "UnderspecifiedAnnotationGeneratorCriticFuser": "GeneratorCriticFuser",
 86 |     "UnderspecifiedAnnotationGeneratorCriticRanker": "GeneratorCriticRanker",
 87 |     "EvaluationValidityGeneratorCriticRanker": "GeneratorCriticRanker",
 88 |     "EvaluationValidityGeneratorCriticFuser": "GeneratorCriticFuser",
 89 |     "UnderspecifiedAnnotationPredict": "Predict",
 90 |     "EvaluationValidityCoT": "CoT",
 91 |     "EvaluationValidityPredict": "Predict",
 92 |     # Relook at the following programs
 93 |     "IReRaCOT": "CoT",
 94 |     "IReRaPredict": "Predict",
 95 |     "Infer": "CoT",
 96 |     "InferRetrieve": "RAG",
 97 |     "IReRaRetrieve": "RAG",
 98 |     "IReRaRetrieveRank": "RAGBasedRank",
 99 |     "InferRetrieveRank": "RAGBasedRank",
100 |     "HoverMultiHopPredict": "Predict",
101 |     "HoverMultiHop": "MultiHopSummarize",
102 | }
103 | 
104 | 
105 | def canonicalize_program(data_df):
106 |     # Update the benchmark names based on the program
107 |     data_df.loc[
108 |         data_df["program"].isin(
109 |             [
110 |                 "UnderspecifiedAnnotationCoT",
111 |                 "UnderspecifiedAnnotationPredict",
112 |                 "UnderspecifiedAnnotationGeneratorCriticFuser",
113 |                 "UnderspecifiedAnnotationGeneratorCriticRanker",
114 |             ]
115 |         ),
116 |         "benchmark",
117 |     ] = "SWEBenchUnderspecified"
118 | 
119 |     data_df.loc[
120 |         data_df["program"].isin(
121 |             [
122 |                 "EvaluationValidityCoT",
123 |                 "EvaluationValidityPredict",
124 |                 "EvaluationValidityGeneratorCriticFuser",
125 |                 "EvaluationValidityGeneratorCriticRanker",
126 |             ]
127 |         ),
128 |         "benchmark",
129 |     ] = "SWEBenchValidity"
130 |     data_df["program"] = data_df["program"].replace(program_mapping)
131 |     data_df["benchmark"] = data_df["benchmark"].apply(lambda x: x.replace("Bench", ""))
132 |     return data_df
133 | 


--------------------------------------------------------------------------------
/langProBe/async_mcp_client.py:
--------------------------------------------------------------------------------
  1 | from contextlib import AsyncExitStack
  2 | from typing import Optional
  3 | 
  4 | from anthropic import Anthropic
  5 | from mcp import ClientSession
  6 | from mcp.client.sse import sse_client
  7 | 
  8 | 
  9 | class AsyncMCPClient:
 10 | 
 11 |     def __init__(self):
 12 |         # Initialize session and client objects
 13 |         self.session: Optional[ClientSession] = None
 14 |         self.exit_stack = AsyncExitStack()
 15 |         self.anthropic = Anthropic()
 16 | 
 17 |     async def connect_to_sse_server(self, server_url: str):
 18 |         """Connect to an MCP server running with SSE transport"""
 19 |         # Store the context managers so they stay alive
 20 |         self._streams_context = sse_client(url=server_url)
 21 |         streams = await self._streams_context.__aenter__()
 22 | 
 23 |         self._session_context = ClientSession(*streams)
 24 |         self.session: ClientSession = await self._session_context.__aenter__()
 25 | 
 26 |         # Initialize
 27 |         await self.session.initialize()
 28 | 
 29 |         # List available tools to verify connection
 30 |         # print("Initialized SSE client...")
 31 |         # print("Listing tools...")
 32 |         response = await self.session.list_tools()
 33 |         tools = response.tools
 34 |         # print("\nConnected to server with tools:", [tool.name for tool in tools])
 35 | 
 36 |     async def cleanup(self):
 37 |         """Properly clean up the session and streams"""
 38 |         if self._session_context:
 39 |             await self._session_context.__aexit__(None, None, None)
 40 |         if self._streams_context:
 41 |             await self._streams_context.__aexit__(None, None, None)
 42 | 
 43 |     async def call_tool(self, tool_name: str, tool_args: dict) -> dict:
 44 |         """Call a tool with the given arguments"""
 45 |         result = await self.session.call_tool(tool_name, tool_args)
 46 |         return result
 47 | 
 48 |     async def list_tools(self):
 49 |         """List available tools"""
 50 |         response = await self.session.list_tools()
 51 |         return response
 52 | 
 53 |     async def get_prompt(self, *args, **kwargs):
 54 |         response = await self.session.get_prompt(*args, **kwargs)
 55 |         return response
 56 | 
 57 |     async def list_prompts(self):
 58 |         response = await self.session.list_prompts()
 59 |         return response
 60 | 
 61 |     async def list_resources(self):
 62 |         response = await self.session.list_resources()
 63 |         return response
 64 | 
 65 |     async def read_resource(self, *args, **kwargs):
 66 |         response = await self.session.read_resource(*args, **kwargs)
 67 |         return response
 68 | 
 69 |     async def process_query(self, query: str) -> str:
 70 |         """Process a query using Claude and available tools"""
 71 |         messages = [
 72 |             {
 73 |                 "role": "user",
 74 |                 "content": query
 75 |             }
 76 |         ]
 77 | 
 78 |         response = await self.session.list_tools()
 79 |         available_tools = [{
 80 |             "name": tool.name,
 81 |             "description": tool.description,
 82 |             "input_schema": tool.inputSchema
 83 |         } for tool in response.tools]
 84 | 
 85 |         # Initial Claude API call
 86 |         response = self.anthropic.messages.create(
 87 |             model="claude-3-5-sonnet-20241022",
 88 |             max_tokens=1000,
 89 |             messages=messages,
 90 |             tools=available_tools
 91 |         )
 92 | 
 93 |         # Process response and handle tool calls
 94 |         tool_results = []
 95 |         final_text = []
 96 | 
 97 |         for content in response.content:
 98 |             if content.type == 'text':
 99 |                 final_text.append(content.text)
100 |             elif content.type == 'tool_use':
101 |                 tool_name = content.name
102 |                 tool_args = content.input
103 | 
104 |                 # Execute tool call
105 |                 result = await self.session.call_tool(tool_name, tool_args)
106 |                 tool_results.append({"call": tool_name, "result": result})
107 |                 final_text.append(f"[Calling tool {tool_name} with args {tool_args}]")
108 | 
109 |                 # Continue conversation with tool results
110 |                 if hasattr(content, 'text') and content.text:
111 |                     messages.append({
112 |                         "role": "assistant",
113 |                         "content": content.text
114 |                     })
115 |                 messages.append({
116 |                     "role": "user",
117 |                     "content": result.content
118 |                 })
119 | 
120 |                 # Get next response from Claude
121 |                 response = self.anthropic.messages.create(
122 |                     model="claude-3-5-sonnet-20241022",
123 |                     max_tokens=1000,
124 |                     messages=messages,
125 |                 )
126 | 
127 |                 final_text.append(response.content[0].text)
128 | 
129 |         return "\n".join(final_text)
130 | 
131 |     async def chat_loop(self):
132 |         """Run an interactive chat loop"""
133 |         # print("\nMCP Client Started!")
134 |         # print("Type your queries or 'quit' to exit.")
135 | 
136 |         while True:
137 |             try:
138 |                 query = input("\nQuery: ").strip()
139 | 
140 |                 if query.lower() == 'quit':
141 |                     break
142 | 
143 |                 response = await self.process_query(query)
144 |                 print("\n" + response)
145 | 
146 |             except Exception as e:
147 |                 print(f"\nError: {str(e)}")
148 | 
149 | # async def main():
150 | #     client = AsyncMCPClient()
151 | #     try:
152 | #         await client.connect_to_sse_server(server_url="http://localhost:8080/sse")
153 | #         result = await client.call_tool("get_alerts", {"state": "CA"})
154 | #         print(result)
155 | #     finally:
156 | #         await client.cleanup()
157 | 
158 | 
159 | # result = asyncio.run(main())


--------------------------------------------------------------------------------
/langProBe/benchmark.py:
--------------------------------------------------------------------------------
  1 | import random, os
  2 | from abc import ABC, abstractmethod
  3 | from dataclasses import dataclass, field
  4 | from enum import Enum
  5 | from typing import Callable, List, Type
  6 | 
  7 | import dspy
  8 | from dspy.evaluate import Evaluate
  9 | from dspy.teleprompt import Teleprompter
 10 | 
 11 | import langProBe.optimizers as langprobe_optimizers
 12 | from langProBe.dspy_program import LangProBeDSPyMetaProgram
 13 | from langProBe.config_utils import read_json, read_jsonl
 14 | from langProBe.program_utils import ProcessManager
 15 | 
 16 | 
 17 | 
 18 | 
 19 | dataset_size = {"full": None, "lite": 500, "tiny": 200, "test": 2}
 20 | 
 21 | 
 22 | class Benchmark(ABC):
 23 |     def __init__(self, dataset_mode="lite"):
 24 |         # dataset for training and validation
 25 |         self.dataset = None
 26 |         # dataset for the actual benchmarking
 27 |         self.test_set = None
 28 |         self.train_set = None
 29 |         self.dev_set = None
 30 |         self.val_set = None
 31 | 
 32 |         self.init_dataset()
 33 |         assert self.dataset is not None, "Dataset not initialized"
 34 |         assert self.test_set is not None, "Test set not initialized"
 35 |         self.max_testset_size = dataset_size[dataset_mode]
 36 | 
 37 |         self.test_set = self.trim_dataset(self.test_set, self.max_testset_size)
 38 | 
 39 |         # TODO: FIXME: "test" option is for debugging purposes only, should be removed for final release
 40 |         if dataset_mode == "test":
 41 |             self.dataset = self.trim_dataset(self.dataset, 60)
 42 |             self.create_splits()
 43 |             self.test_set = self.trim_dataset(self.test_set, 50)
 44 | 
 45 |         if not self.train_set or not self.dev_set or not self.val_set:
 46 |             self.create_splits()
 47 | 
 48 |         self.train_set = self.trim_dataset(self.train_set, 150)
 49 |         self.dev_set = self.trim_dataset(self.dev_set, 300)
 50 |         self.val_set = self.trim_dataset(self.val_set, 300)
 51 | 
 52 |         assert self.train_set is not None, "Train set not initialized"
 53 |         assert self.dev_set is not None, "Dev set not initialized"
 54 |         assert self.val_set is not None, "Val set not initialized"
 55 | 
 56 |     @abstractmethod
 57 |     def init_dataset(self) -> None:
 58 |         """
 59 |         Initializes the dataset for the benchmark, and sets it to self.dataset.
 60 |         Each element in the dataset should be an instance of dspy.Example.
 61 |         """
 62 |         return
 63 | 
 64 |     def trim_dataset(self, dataset, size: int) -> None:
 65 |         if size is None or size >= len(dataset):
 66 |             return dataset
 67 |         rng = random.Random()
 68 |         rng.seed(1)
 69 |         return rng.sample(dataset, size)
 70 | 
 71 |     def create_splits(self) -> None:
 72 |         """
 73 |         Creates the splits for the dataset (not including test).
 74 |         Upon completion, self.train_set, self.dev_set, and self.val_set should be set.
 75 |         """
 76 | 
 77 |         total_len = len(self.dataset)
 78 |         self.dev_set = self.dataset[: int(0.4 * total_len)]
 79 |         self.val_set = self.dataset[int(0.4 * total_len) : int(0.8 * total_len)]
 80 |         self.train_set = self.dataset[int(0.8 * total_len) :]
 81 | 
 82 |     def get_dataset(self):
 83 |         return self.dataset
 84 | 
 85 |     def get_train_set(self):
 86 |         return self.train_set
 87 | 
 88 |     def get_dev_set(self):
 89 |         return self.dev_set
 90 | 
 91 |     def get_test_set(self):
 92 |         return self.test_set
 93 | 
 94 | 
 95 | class MCPBench(Benchmark):
 96 |     def __init__(self, dataset_mode="lite", dataset_path=None, missing_data=[]):
 97 |         self.dataset_path = dataset_path
 98 |         self.missing_data = missing_data
 99 |         super().__init__(dataset_mode=dataset_mode)
100 | 
101 |     def init_dataset(self):
102 |         self.dataset = []
103 |         self.test_set = []
104 |         if self.missing_data:
105 |             test_raw_data = self.missing_data
106 |         else:
107 |             test_raw_data = read_jsonl(self.dataset_path)
108 |         
109 |         for test_data in test_raw_data:
110 |             self.test_set.append(
111 |                 dspy.Example(
112 |                     id=test_data["unique_id"],
113 |                     question=test_data["Prompt"],
114 |                     answer=test_data["Answer"],
115 |                 ).with_inputs("id", "question", "answer", "config")
116 |             )
117 | 
118 | 
119 | 
120 | 
121 | @dataclass
122 | class EvaluationResult:
123 |     benchmark: str
124 |     program: str
125 | 
126 |     score: float
127 |     cost: float
128 |     input_tokens: int
129 |     output_tokens: int
130 | 
131 |     outputs_raw_data: List|None = None
132 | 
133 |     # optimizer: str = None
134 |     # optimized_program: dspy.Module = None
135 |     # optimizer_input_tokens: int = None
136 |     # optimizer_output_tokens: int = None
137 |     # optimizer_cost: float = None
138 | 
139 |     # optimizer_program_scores: list[float] = None
140 | 
141 | 
142 | @dataclass
143 | class BenchmarkMeta:
144 |     benchmark: Type[Benchmark]
145 |     program: List[dspy.Module]
146 |     metric: Callable
147 |     dataset_mode: str = "lite"
148 | 
149 |     optimizers: List[langprobe_optimizers.OptimizerConfig] = field(
150 |         default_factory=lambda: langprobe_optimizers.DEFAULT_OPTIMIZERS
151 |     )
152 | 
153 |     # BenchmarkMeta.num_threads has higher priority than run time argument of num_threads
154 |     # use this as an upper bound for the number of threads to use
155 |     num_threads: int = None
156 |     name: str = None
157 | 
158 | 
159 | def setup_lm(dspy_config=None):
160 |     lm: dspy.LM = dspy_config.get("lm", dspy.settings.lm)
161 |     assert lm is not None, "dspy language model not set"
162 | 
163 |     lm = lm.copy()
164 |     assert len(lm.history) == 0, "language model history not empty"
165 |     return lm
166 | 
167 | 
168 | # def calculate_stats(lm: dspy.LM) -> tuple[float, int, int]:
169 | #     cost = 0
170 | #     input_tokens = 0
171 | #     output_tokens = 0
172 | #     for i, trace in enumerate(lm.history):
173 | #         cost += trace.get("cost", None) or 0
174 | #         input_tokens += trace.get("usage", 0).get("prompt_tokens", 0)
175 | #         output_tokens += trace.get("usage", 0).get("completion_tokens", 0)
176 | 
177 | #     return cost, input_tokens, output_tokens
178 | 
179 | def calculate_stats(manager: List[ProcessManager]) -> tuple[float, float, float]:
180 |     input_tokens = sum(usage["prompt_tokens"] for trace in manager for usage in trace.lm_usages)
181 |     output_tokens = sum(usage["completion_tokens"] for trace in manager for usage in trace.lm_usages)
182 |     
183 |     avg_input = input_tokens // len(manager)
184 |     avg_output = output_tokens // len(manager)
185 |     
186 |     return 0, avg_input, avg_output
187 | 
188 | 
189 | 
190 | class EvaluateBench(ABC):
191 |     def __init__(
192 |         self,
193 |         benchmark: Benchmark,
194 |         program: dspy.Module,
195 |         metric: Callable,
196 |         lm: str,
197 |         benchmark_name: str = None,
198 |         num_threads: int = 1,
199 |         api_key: str = None,
200 |         api_base: str = None,
201 |     ):
202 |         self.benchmark = benchmark
203 |         self.program = program
204 | 
205 |         self.program.setup_lm(lm, api_key=api_key, api_base=api_base)
206 |         self.metric = metric
207 |         self.num_threads = num_threads
208 |         devset = benchmark.get_test_set()
209 |         self.evaluate_prog = Evaluate(
210 |             devset=devset,
211 |             metric=self.metric,
212 |             num_threads=self.num_threads,
213 |             display_progress=True,
214 |             max_errors=5000,
215 |             return_outputs=True,
216 |             provide_traceback=True,
217 |         )
218 | 
219 |         self.program_name = getattr(
220 |             self.program, "_name", self.program.__class__.__name__
221 |         )
222 |         self.benchmark_name = benchmark_name or self.benchmark.__class__.__name__
223 |         self.results: list[EvaluationResult] = []
224 | 
225 |     def get_empty_results(self):
226 |         return EvaluationResult(
227 |             benchmark=self.benchmark_name,
228 |             program=self.program_name,
229 |             score=0,
230 |             cost=0,
231 |             input_tokens=0,
232 |             output_tokens=0,
233 |         )
234 | 
235 | 
236 |     def evaluate_baseline(self, dspy_config=None) -> EvaluationResult:
237 |         with dspy.context(**dspy_config):
238 |             score, info = self.evaluate_prog(self.program)
239 |         result = self.get_empty_results()
240 |         datasets, outputs, _ = zip(*info)
241 |         managers = [one.process_report for one in outputs]
242 | 
243 |         result.score = score   
244 |         result.outputs_raw_data = outputs
245 |         result.cost, result.input_tokens, result.output_tokens = calculate_stats(managers)
246 | 
247 |         return result
248 | 
249 |     def evaluate(self, dspy_config=None) -> EvaluationResult:
250 |         """
251 |         Args:
252 |             dspy_config: A dictionary of configurations for dspy.context
253 |         Returns:
254 |             A list of EvaluationResult objects.
255 |         """
256 |         if dspy_config is None:
257 |             dspy_config = {}
258 | 
259 |         result = self.evaluate_baseline(dspy_config)
260 |         self.results = result
261 |         return result
262 | 


--------------------------------------------------------------------------------
/langProBe/config_utils.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | def read_json(file_path):
 3 |     """
 4 |     Read a JSON file and return the content as a dictionary.
 5 |     """
 6 |     with open(file_path, 'r') as file:
 7 |         data = json.load(file)
 8 |     return data
 9 | 
10 | def read_jsonl(file_path):
11 |     """
12 |     Read a JSONL file and return the content as a list of dictionaries.
13 |     """
14 |     data = []
15 |     with open(file_path, 'r', encoding='utf-8') as f:
16 |         for line in f:
17 |             test_data = json.loads(line)
18 |             data.append(test_data)
19 |     return data


--------------------------------------------------------------------------------
/langProBe/constants.py:
--------------------------------------------------------------------------------
1 | ROLE = 'role'
2 | CONTENT = 'content'
3 | SYSTEM = 'system'
4 | USER = 'user'
5 | ASSISTANT = 'assistant'
6 | TOOL = 'tool'
7 | TOOL_CALLS = 'tool_calls'


--------------------------------------------------------------------------------
/langProBe/dspy_program.py:
--------------------------------------------------------------------------------
  1 | import dspy
  2 | 
  3 | 
  4 | #################################### Common Programs ####################################
  5 | 
  6 | 
  7 | def deduplicate(seq: list[str]) -> list[str]:
  8 |     """
  9 |     Source: https://stackoverflow.com/a/480227/1493011
 10 |     """
 11 | 
 12 |     seen = set()
 13 |     return [x for x in seq if not (x in seen or seen.add(x))]
 14 | 
 15 | 
 16 | class LangProBeDSPyMetaProgram(dspy.Module):
 17 |     def setup_lm(self, lm, api_key=None, api_base=None):
 18 |         dspy.settings.experimental = True
 19 |         self.lm = dspy.LM(lm, api_key=api_key, api_base=api_base)
 20 |         self.set_lm(self.lm)
 21 | 
 22 |     def program_type(self):
 23 |         return "dspy"
 24 | 
 25 | 
 26 | class Predict(dspy.Predict, LangProBeDSPyMetaProgram):
 27 |     pass
 28 | 
 29 | 
 30 | class CoT(dspy.ChainOfThought, LangProBeDSPyMetaProgram):
 31 |     pass
 32 | 
 33 | 
 34 | def default_input_to_query(**kwargs):
 35 |     if len(kwargs) == 1:
 36 |         return list(kwargs.values())[0]
 37 |     else:
 38 |         raise ValueError(
 39 |             "Cannot convert multiple inputs to a query, please specify input_to_query."
 40 |         )
 41 | 
 42 | 
 43 | class RAG(LangProBeDSPyMetaProgram, dspy.Module):
 44 |     def __init__(
 45 |         self,
 46 |         signature,
 47 |         retriever=dspy.Retrieve(k=3),
 48 |         input_to_query=default_input_to_query,
 49 |     ):
 50 |         self.retriver = retriever
 51 |         verified_signature = dspy.ensure_signature(signature)
 52 |         verified_signature = verified_signature.prepend(
 53 |             "context", dspy.InputField(desc="may contain relevant facts")
 54 |         )
 55 |         self.prog = dspy.ChainOfThought(verified_signature)
 56 |         self.input_to_query = input_to_query
 57 | 
 58 |     def forward(self, **kwargs):
 59 |         context = self.retriver(self.input_to_query(**kwargs)).passages
 60 |         pred = self.prog(context=context, **kwargs)
 61 |         return pred
 62 | 
 63 | 
 64 | class SimplifiedBaleen(LangProBeDSPyMetaProgram, dspy.Module):
 65 |     def __init__(
 66 |         self, signature, query_gen_input=None, retriever=dspy.Retrieve(k=2), max_hops=2
 67 |     ):
 68 |         """
 69 |         args:
 70 |             signature: The signature to the final generate module
 71 |             query_gen_input: a list of keywords to be used as input to the query generation module
 72 |             retriever: a retriever module to be used to retrieve relevant facts
 73 |             max_hops: the number of hops to be used in the simplified
 74 |             FIXME (shangyin) correctly handle query_gen_input
 75 |         """
 76 | 
 77 |         self.max_hops = max_hops
 78 |         self.retriever = retriever
 79 |         verified_signature = dspy.ensure_signature(signature)
 80 |         verified_signature = verified_signature.prepend(
 81 |             "context", dspy.InputField(desc="may contain relevant facts")
 82 |         )
 83 | 
 84 |         # remove the output field from the generate query signature
 85 |         # generate_query should use a default instruction rather than instruction from the original signature
 86 |         # FIXME (shangyin) fix the default signature.instructions
 87 |         input_fields = verified_signature.input_fields
 88 |         generate_query_signature = dspy.Signature(input_fields)
 89 |         generate_query_signature = generate_query_signature.append(
 90 |             "search_query", dspy.OutputField()
 91 |         )
 92 | 
 93 |         self.generate_query = [
 94 |             dspy.ChainOfThought(generate_query_signature) for _ in range(self.max_hops)
 95 |         ]
 96 |         self.generate_answer = dspy.ChainOfThought(verified_signature)
 97 | 
 98 |     def forward(self, **kwargs):
 99 |         context = []
100 | 
101 |         for hop in range(self.max_hops):
102 |             query = self.generate_query[hop](context=context, **kwargs).search_query
103 |             passages = self.retriever(query).passages
104 |             context = deduplicate(context + passages)
105 | 
106 |         pred = self.generate_answer(context=context, **kwargs)
107 |         return pred
108 | 
109 | 
110 | #################################### Archon Programs ####################################
111 | 
112 | # Note Ranker and Fuser are equipped with self.get_prediction() method to return a Prediction object
113 | # in the original signature
114 | 
115 | 
116 | class ArchonGenerator(LangProBeDSPyMetaProgram, dspy.Module):
117 |     # https://github.com/ScalingIntelligence/Archon/blob/main/src/archon/completions/components/Generator.py
118 | 
119 |     def __init__(self, signature, n=5):
120 |         # For dspy, n responses are generated with a single model now.
121 |         # If desired, we can create a new module in dspy that uses multiple models to generate n responses.
122 |         verified_signature = dspy.ensure_signature(signature)
123 |         assert (
124 |             len(verified_signature.output_fields) == 1
125 |         ), "ArchonGenerator only supports a single output field"
126 | 
127 |         self.prog = dspy.ChainOfThought(verified_signature, n=n)
128 |         self.output_field = list(verified_signature.output_fields.keys())[0]
129 | 
130 |     def forward(self, **kwargs) -> dspy.Prediction:
131 |         return self.prog(**kwargs)
132 | 
133 |     def get_responses(self, **kwargs) -> list[str]:
134 |         responses = self.prog(**kwargs).completions.__getattr__(self.output_field)
135 |         return responses
136 | 
137 |     def get_formatted_responses(self, **kwargs) -> str:
138 |         responses = self.get_responses(**kwargs)
139 |         return responses_formatter(responses)
140 | 
141 | 
142 | def responses_formatter(responses):
143 |     if not isinstance(responses, list):
144 |         dspy.logger.warning(
145 |             "Responses of CriticGenerator should be a list of responses. "
146 |         )
147 |         responses = [responses]
148 |     formatted_responses = []
149 |     for i, response in enumerate(responses):
150 |         formatted_responses.append(f"[{i+1}] {response}")
151 |     return "\n".join(formatted_responses)
152 | 
153 | 
154 | class FeedbackGeneratorSignature(dspy.Signature):
155 |     """
156 |     Evaluate all responses based on their relevance to the instructions.
157 |     All the responses should be included and evaluated using identifiers.
158 |     You must include both strengths and weaknesses, even if there are more of one than the other.
159 |     Start with the analysis for the first response and end with the analysis for the last response.
160 |     """
161 | 
162 |     task_instructions = dspy.InputField(
163 |         desc="The instructions on how the responses are generated."
164 |     )
165 |     responses = dspy.InputField(
166 |         desc="The generated responses to critize. Each response will start with a numerical identifier in [], like [1].",
167 |     )
168 |     feedback: list[str] = dspy.OutputField(
169 |         desc="The feedback for each response. Discuss the strengths and weaknesses of each response."
170 |     )
171 | 
172 | 
173 | class ArchonCritic(LangProBeDSPyMetaProgram, dspy.Module):
174 |     # https://github.com/ScalingIntelligence/Archon/blob/main/src/archon/completions/components/Critic.py
175 | 
176 |     def __init__(self, signature, n=5):
177 |         # signature should be the signature to the original generator module
178 |         verified_signature = dspy.ensure_signature(signature)
179 |         assert (
180 |             len(verified_signature.output_fields) == 1
181 |         ), "ArchonCritic only supports a single output field"
182 |         self.signature = verified_signature
183 | 
184 |         self.instructions = verified_signature.instructions
185 |         feedback_gen_signature = FeedbackGeneratorSignature
186 |         # add all inputfields from the original signature to the feedback_gen_signature
187 |         for name, field in reversed(verified_signature.input_fields.items()):
188 |             feedback_gen_signature = feedback_gen_signature.prepend(name, field)
189 | 
190 |         self.feedback_gen = dspy.ChainOfThought(feedback_gen_signature)
191 | 
192 |     def forward(self, formatted_responses, **kwargs) -> dspy.Prediction:
193 |         return self.feedback_gen(
194 |             task_instructions=self.instructions, responses=formatted_responses, **kwargs
195 |         )
196 | 
197 |     def get_feedback(self, formatted_responses: str, **kwargs) -> list[str]:
198 |         return self.forward(formatted_responses, **kwargs).feedback
199 | 
200 | 
201 | class RankerGeneratorSignature(dspy.Signature):
202 |     """
203 |     Rank the responses based on their relevance to the instruction, in descending order (from most relevant to least relevant).
204 |     """
205 | 
206 |     task_instructions = dspy.InputField(
207 |         desc="The instructions on how the responses are generated."
208 |     )
209 | 
210 |     responses = dspy.InputField(
211 |         desc="The responses to rank. Each response will start with a numerical identifier in [], like [1].",
212 |     )
213 | 
214 |     ranking: list[int] = dspy.OutputField(
215 |         desc="The ranking of the responses. List the responses in descending order of relevance to the instructions."
216 |     )
217 | 
218 | 
219 | class ArchonRanker(LangProBeDSPyMetaProgram, dspy.Module):
220 |     # https://github.com/ScalingIntelligence/Archon/blob/main/src/archon/completions/components/prompts.py#L68
221 |     def __init__(self, signature, n=5, use_critic=False):
222 |         verified_signature = dspy.ensure_signature(signature)
223 |         assert (
224 |             len(verified_signature.output_fields) == 1
225 |         ), "ArchonRanker only supports a single output field"
226 |         self.signature = verified_signature
227 |         self.instructions = verified_signature.instructions
228 | 
229 |         ranker_signature = RankerGeneratorSignature
230 |         if use_critic:
231 |             ranker_signature = ranker_signature.append(
232 |                 "feedback",
233 |                 dspy.InputField(
234 |                     desc="The feedback (strength/weakness) for each response."
235 |                 ),
236 |             )
237 |             ranker_signature.instructions += (
238 |                 "and their provided critiques of strengths and weaknesses."
239 |             )
240 | 
241 |         # add all inputfields from the original signature to the feedback_gen_signature
242 |         for name, field in reversed(verified_signature.input_fields.items()):
243 |             ranker_signature = ranker_signature.prepend(name, field)
244 | 
245 |         self.ranker = dspy.ChainOfThought(ranker_signature)
246 | 
247 |     def forward(self, formatted_responses: str, **kwargs):
248 |         return self.ranker(
249 |             task_instructions=self.instructions, responses=formatted_responses, **kwargs
250 |         )
251 | 
252 |     def get_ranking(self, formatted_responses: str, **kwargs) -> list[int]:
253 |         return self.forward(formatted_responses, **kwargs).ranking
254 | 
255 |     def get_prediction(self, responses: list[str], **kwargs) -> dspy.Prediction:
256 |         formatted_responses = responses_formatter(responses)
257 |         ranking = self.get_ranking(formatted_responses, **kwargs)
258 |         top_response = responses[ranking[0]]
259 |         pred = dspy.Prediction()
260 |         pred.__setattr__(list(self.signature.output_fields.keys())[0], top_response)
261 |         return pred
262 | 
263 | 
264 | class FuserGeneratorSignature(dspy.Signature):
265 |     """
266 |     Your task is to synthesize a list of responses to a task into a single, high-quality response of the same format. Do not include explanations.
267 |     """
268 | 
269 |     task_instructions = dspy.InputField(
270 |         desc="The instructions on how the responses are generated. Your final response should FOLLOW these instructions."
271 |     )
272 | 
273 |     responses = dspy.InputField(
274 |         desc="The responses to synthesize.",
275 |     )
276 | 
277 |     final_response = dspy.OutputField(
278 |         desc="""The final response, compiled from the input responses. 
279 |         Please provide a single response with the same format as all previous responses, excluding the number identifier. 
280 |         Ensure your response is well-structured, coherent, and adheres to the highest standards of accuracy and reliability. """
281 |     )
282 | 
283 | 
284 | class ArchonFuser(LangProBeDSPyMetaProgram, dspy.Module):
285 |     def __init__(self, signature, use_critic=False):
286 |         verified_signature = dspy.ensure_signature(signature)
287 |         assert (
288 |             len(verified_signature.output_fields) == 1
289 |         ), "ArchonFuser only supports a single output field"
290 |         self.signature = verified_signature
291 |         self.instructions = verified_signature.instructions
292 | 
293 |         fuser_signature = FuserGeneratorSignature
294 |         if use_critic:
295 |             fuser_signature = fuser_signature.append(
296 |                 "feedback",
297 |                 dspy.InputField(
298 |                     desc="The feedback (strength/weakness) for each response."
299 |                 ),
300 |             )
301 |             fuser_signature.instructions += "For each response, we also provide critiques of strengths and weaknesses."
302 |         output_field_desc = list(verified_signature.output_fields.values())[
303 |             0
304 |         ].json_schema_extra["desc"]
305 |         fuser_signature.output_fields["final_response"].json_schema_extra[
306 |             "desc"
307 |         ] += f"{output_field_desc}"
308 | 
309 |         # add all inputfields from the original signature to the feedback_gen_signature
310 |         for name, field in reversed(verified_signature.input_fields.items()):
311 |             fuser_signature = fuser_signature.prepend(name, field)
312 | 
313 |         self.fuser = dspy.ChainOfThought(fuser_signature)
314 | 
315 |     def forward(self, formatted_responses: str, **kwargs):
316 |         return self.fuser(
317 |             task_instructions=self.instructions, responses=formatted_responses, **kwargs
318 |         )
319 | 
320 |     def get_response(self, formatted_responses: str, **kwargs) -> str:
321 |         return self.forward(formatted_responses, **kwargs).final_response
322 | 
323 |     def get_prediction(self, formatted_responses: str, **kwargs) -> dspy.Prediction:
324 |         final_response = self.get_response(formatted_responses, **kwargs)
325 |         pred = dspy.Prediction()
326 |         pred.__setattr__(list(self.signature.output_fields.keys())[0], final_response)
327 |         return pred
328 | 
329 | 
330 | # TODO(shangyin) new adapters from Archon to be added: Verifier
331 | 
332 | #################################### Archon Example Programs ####################################
333 | 
334 | 
335 | class GeneratorCriticRanker(LangProBeDSPyMetaProgram, dspy.Module):
336 |     def __init__(self, signature, n=5):
337 |         verified_signature = dspy.ensure_signature(signature)
338 |         assert (
339 |             len(verified_signature.output_fields) == 1
340 |         ), "ArchonExample only supports a single output field"
341 |         self.signature = verified_signature
342 | 
343 |         self.generator = ArchonGenerator(self.signature, n)
344 |         self.critic = ArchonCritic(self.signature, n)
345 |         self.ranker = ArchonRanker(self.signature, n, use_critic=True)
346 | 
347 |         if n != 5:  # override default name
348 |             self._name = f"GeneratorCriticRanker{n}"
349 | 
350 |     def forward(self, **kwargs):
351 |         responses = self.generator.get_responses(**kwargs)
352 |         formatted_responses = responses_formatter(responses)
353 |         feedback = self.critic.get_feedback(formatted_responses, **kwargs)
354 |         return self.ranker.get_prediction(responses, feedback=feedback, **kwargs)
355 | 
356 | 
357 | class GeneratorCriticFuser(LangProBeDSPyMetaProgram, dspy.Module):
358 |     def __init__(self, signature, n=5):
359 |         verified_signature = dspy.ensure_signature(signature)
360 |         assert (
361 |             len(verified_signature.output_fields) == 1
362 |         ), "GeneratorCriticFuser only supports a single output field"
363 |         self.signature = verified_signature
364 | 
365 |         self.generator = ArchonGenerator(self.signature, n)
366 |         self.critic = ArchonCritic(self.signature, n)
367 |         self.fuser = ArchonFuser(self.signature, use_critic=True)
368 | 
369 |         if n != 5:  # override default name
370 |             self._name = f"GeneratorCriticFuser{n}"
371 | 
372 |     def forward(self, **kwargs):
373 |         formatted_responses = self.generator.get_formatted_responses(**kwargs)
374 |         feedback = self.critic.get_feedback(formatted_responses, **kwargs)
375 |         return self.fuser.get_prediction(
376 |             formatted_responses, feedback=feedback, **kwargs
377 |         )
378 | 
379 | 
380 | class GeneratorRanker(LangProBeDSPyMetaProgram, dspy.Module):
381 |     def __init__(self, signature, n=5):
382 |         verified_signature = dspy.ensure_signature(signature)
383 |         assert (
384 |             len(verified_signature.output_fields) == 1
385 |         ), "GeneratorRanker only supports a single output field"
386 |         self.signature = verified_signature
387 | 
388 |         self.generator = ArchonGenerator(self.signature, n)
389 |         self.ranker = ArchonRanker(self.signature, use_critic=False)
390 | 
391 |     def forward(self, **kwargs):
392 |         responses = self.generator.get_responses(**kwargs)
393 |         return self.ranker.get_prediction(responses)
394 | 
395 | 
396 | class GeneratorFuser(LangProBeDSPyMetaProgram, dspy.Module):
397 |     def __init__(self, signature, n=5):
398 |         verified_signature = dspy.ensure_signature(signature)
399 |         assert (
400 |             len(verified_signature.output_fields) == 1
401 |         ), "GeneratorFuser only supports a single output field"
402 |         self.signature = verified_signature
403 | 
404 |         self.generator = ArchonGenerator(self.signature, n)
405 |         self.fuser = ArchonFuser(self.signature, use_critic=False)
406 | 
407 |     def forward(self, **kwargs):
408 |         formatted_responses = self.generator.get_formatted_responses(**kwargs)
409 |         return self.fuser.get_prediction(formatted_responses)
410 | 
411 | 
412 | if __name__ == "__main__":
413 |     # Example usage
414 |     dspy.configure(
415 |         lm=dspy.LM("openai/gpt-4o-mini"),
416 |         # example rm for RAG w. passages from wikipedia dump
417 |         rm=dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts"),
418 |     )
419 | 
420 |     question = "What is the capital of France?"
421 |     context = "France is a country in Europe."
422 | 
423 |     # CoT
424 |     print("======== CoT =========")
425 |     cot = CoT("question, context -> answer")
426 |     cot(question=question, context=context)
427 |     dspy.settings.lm.inspect_history()
428 | 
429 |     # RAG
430 |     print("======== RAG =========")
431 |     rag = RAG("question -> answer")
432 |     rag(question=question)
433 |     dspy.settings.lm.inspect_history()
434 | 
435 |     # SimplifiedBaleen
436 |     print("======== SimplifiedBaleen =========")
437 |     simplified_baleen = SimplifiedBaleen("question -> answer")
438 |     simplified_baleen(question=question)
439 |     dspy.settings.lm.inspect_history(n=3)
440 | 
441 |     # GeneratorCriticRanker
442 |     print("======== GeneratorCriticRanker =========")
443 |     archon_example = GeneratorCriticRanker("question -> answer")
444 |     archon_example(question=question)
445 |     dspy.settings.lm.inspect_history(n=3)
446 | 
447 |     # GeneratorRanker
448 |     print("======== GeneratorRanker =========")
449 |     generator_ranker = GeneratorRanker("question -> answer")
450 |     generator_ranker(question=question)
451 |     dspy.settings.lm.inspect_history(n=3)
452 | 
453 |     # GeneratorCriticFuser
454 |     print("======== GeneratorCriticFuser =========")
455 |     generator_critic_fuser = GeneratorCriticFuser("question -> answer")
456 |     generator_critic_fuser(question=question)
457 |     dspy.settings.lm.inspect_history(n=3)
458 | 
459 |     # GeneratorFuser
460 |     print("======== GeneratorFuser =========")
461 |     generator_fuser = GeneratorFuser("question -> answer")
462 |     generator_fuser(question=question)
463 |     dspy.settings.lm.inspect_history(n=3)
464 | 


--------------------------------------------------------------------------------
/langProBe/evaluation.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import copy
  3 | import os
  4 | import pathlib
  5 | import sys
  6 | import time
  7 | from contextlib import contextmanager
  8 | from pathlib import Path
  9 | 
 10 | import dspy
 11 | 
 12 | from langProBe.analysis import read_evaluation_results
 13 | from langProBe.benchmark import BenchmarkMeta, EvaluateBench, EvaluationResult
 14 | from langProBe.config_utils import read_json, read_jsonl
 15 | from langProBe.dspy_program import (
 16 |     GeneratorCriticFuser,
 17 |     GeneratorCriticRanker,
 18 |     LangProBeDSPyMetaProgram,
 19 | )
 20 | from langProBe.optimizers import create_optimizer, DEFAULT_OPTIMIZERS
 21 | from langProBe.register_benchmark import register_all_benchmarks, registered_benchmarks
 22 | from langProBe.evaluation_utils import find_missing_entries, replace_logger_filehandler
 23 | 
 24 | 
 25 | class CompareAnswerSignature(dspy.Signature):
 26 |     """
 27 |     Compare the answer to the ground truth answer.
 28 |     """
 29 | 
 30 |     answer = dspy.InputField(desc="The answer to a problem")
 31 |     ground_truth = dspy.InputField(desc="The ground truth answer to the same problem")
 32 |     is_correct = dspy.OutputField(
 33 |         desc="Whether the answer is correct, either True or False."
 34 |     )
 35 | 
 36 | 
 37 | class CompareAnswer(dspy.Module):
 38 |     def __init__(self):
 39 |         self.compare_answer = dspy.ChainOfThought(CompareAnswerSignature)
 40 | 
 41 |     def forward(self, ground_truth, answer):
 42 |         pred = self.compare_answer(answer=answer, ground_truth=ground_truth)
 43 |         return pred
 44 | 
 45 | 
 46 | def llm_as_judge_evaluate(gold, pred, extract_answer_fun=lambda x: x.answer):
 47 |     compare_answer = CompareAnswer()
 48 |     answer_raw = compare_answer(
 49 |         ground_truth=extract_answer_fun(gold), answer=extract_answer_fun(pred)
 50 |     ).is_correct
 51 |     if answer_raw.lower().startswith("true"):
 52 |         return True
 53 |     else:
 54 |         return False
 55 | 
 56 | 
 57 | @contextmanager
 58 | def suppress_output(suppress=True):
 59 |     if suppress:
 60 |         # Save the original streams
 61 |         original_stderr = sys.stderr
 62 |         original_stdout = sys.stdout
 63 | 
 64 |         # Redirect stderr and stdout to devnull
 65 |         sys.stderr = open(os.devnull, "w")
 66 |         sys.stdout = open(os.devnull, "w")
 67 | 
 68 |     try:
 69 |         yield
 70 |     finally:
 71 |         if suppress:
 72 |             # Restore the original streams
 73 |             sys.stderr.close()
 74 |             sys.stdout.close()
 75 |             sys.stderr = original_stderr
 76 |             sys.stdout = original_stdout
 77 | 
 78 | 
 79 | def generate_evaluation_records(file_path):
 80 |     file_path = pathlib.Path(file_path)
 81 | 
 82 |     # if the records file already exists, do not overwrite it
 83 |     if (file_path / "evaluation_records.csv").exists():
 84 |         return
 85 | 
 86 |     # List all .txt files in the directory
 87 |     all_result_files = list(file_path.rglob("*.txt"))
 88 | 
 89 |     records = []
 90 | 
 91 |     # Process each file
 92 |     for file in all_result_files:
 93 |         # Split the filename to get benchmark, program, and optimizer
 94 |         file_name_parts = file.stem.split("_")
 95 |         if len(file_name_parts) >= 3:
 96 |             benchmark = file_name_parts[0]
 97 |             program = file_name_parts[1]
 98 |             optimizer = file_name_parts[2]
 99 |             records.append((benchmark, program, optimizer))
100 |         else:
101 |             raise ValueError(f"Invalid file name: {file.name}")
102 | 
103 |     with open(f"{file_path}/evaluation_records.csv", "w") as f:
104 |         f.write("benchmark,program,optimizer\n")
105 |         for record in records:
106 |             f.write(",".join(record) + "\n")
107 | 
108 | 
109 | def add_to_evaluation_records(file_path, evaluation_results: list[EvaluationResult]):
110 |     file_path = pathlib.Path(file_path)
111 | 
112 |     with open(f"{file_path}/evaluation_records.csv", "a") as f:
113 |         for evaluation_result in evaluation_results:
114 |             f.write(
115 |                 f"{evaluation_result.benchmark},{evaluation_result.program},{evaluation_result.optimizer}\n"
116 |             )
117 | 
118 | 
119 | def read_evaluation_records(file_path):
120 |     file_path = pathlib.Path(file_path)
121 |     records = []
122 | 
123 |     # create the records file if it does not exist
124 |     if not (file_path / "evaluation_records.csv").exists():
125 |         # create empty records file without header
126 |         with open(f"{file_path}/evaluation_records.csv", "w") as f:
127 |             f.write("")
128 |     with open(f"{file_path}/evaluation_records.csv", "r") as f:
129 |         lines = f.readlines()
130 |         for line in lines[1:]:
131 |             records.append(tuple(line.strip().split(",")))
132 | 
133 |     return records
134 | 
135 | 
136 | def evaluate(
137 |     benchmark_meta: BenchmarkMeta,
138 |     lm,
139 |     file_path,
140 |     num_threads=8,
141 |     suppress_dspy_output=True,
142 |     dataset_mode=None,
143 |     dataset_path=None,
144 |     missing_mode_file="",
145 |     api_key=None,
146 |     api_base=None,
147 | ):
148 |     """
149 |     benchmark_meta: BenchmarkMeta object to evaluate
150 |     lm: Language model to use, should be an instance of dspy.LM
151 |     missing_mode: only evaluate experiments without a result file
152 |     """
153 |     dataset_mode = dataset_mode or benchmark_meta.dataset_mode
154 | 
155 |     if missing_mode_file:
156 |         origin_data = read_jsonl(dataset_path)
157 |         runed_data = read_jsonl(missing_mode_file)
158 |         missing_data = find_missing_entries(origin_data, runed_data)
159 |         benchmark = benchmark_meta.benchmark(dataset_mode=dataset_mode, missing_data=missing_data)
160 |         replace_logger_filehandler(os.path.splitext(missing_mode_file)[0])
161 |     else:
162 |         benchmark = benchmark_meta.benchmark(dataset_mode=dataset_mode, dataset_path=dataset_path)
163 |     # Canonicalize optimizers to (optimizer, compile_kwargs) tuples
164 |     benchmark_name = benchmark_meta.name or benchmark.__class__.__name__
165 | 
166 |     num_threads = benchmark_meta.num_threads or num_threads
167 |     print(f"Evaluating {benchmark_name}")
168 |     print(f"num_threads: {num_threads}")
169 |     print(f"Test set size: {len(benchmark.test_set)}")
170 | 
171 | 
172 |     Path(file_path).mkdir(parents=True, exist_ok=True)
173 | 
174 |     evaluation_records = read_evaluation_records(file_path)
175 | 
176 |     # create a stats file for each experiment
177 |     stats_file = os.path.join(file_path, f"{benchmark_name}.stat")
178 |     with open(stats_file, "w") as f:
179 |         f.write(
180 |             f"benchmark: {benchmark_name}\n"
181 |             f"lm: {lm}\n"
182 |             f"test_set_size: {len(benchmark.test_set)}\n"
183 |         )
184 | 
185 |     for program in benchmark_meta.program:
186 |         program_name = getattr(program, "_name", program.__class__.__name__)
187 | 
188 |         print(f"Program: {program_name}")
189 | 
190 |         with suppress_output(suppress=suppress_dspy_output):
191 |             evaluate_bench = EvaluateBench(
192 |                 benchmark=benchmark,
193 |                 program=program,
194 |                 metric=benchmark_meta.metric,
195 |                 lm=lm,
196 |                 benchmark_name=benchmark_meta.name,
197 |                 num_threads=num_threads,
198 |                 api_key=api_key if api_key else os.getenv("OPENAI_API_KEY", ""),
199 |                 api_base=api_base if api_base else os.getenv("OPENAI_API_BASE", ""),
200 |             )
201 |             evaluate_bench.evaluate()
202 |         # print(f"Results: {evaluate_bench.results}")
203 | 
204 |         # if missing_mode:
205 |         #     add_to_evaluation_records(file_path, evaluate_bench.results)
206 |         evaluation_result = evaluate_bench.results
207 | 
208 |         file_name = f"{evaluation_result.benchmark}_{evaluation_result.program}"
209 |         with open(os.path.join(file_path, f"{file_name}.txt"), "w") as f:
210 |             f.write(f"score,cost,input_tokens,output_tokens\n")
211 |             f.write(
212 |                 f"{evaluation_result.score},{evaluation_result.cost},{evaluation_result.input_tokens},"
213 |                 f"{evaluation_result.output_tokens}\n"
214 |             )
215 | 
216 | 
217 | def evaluate_all(
218 |     benchmarks,
219 |     lm,
220 |     file_path,
221 |     num_threads=8,
222 |     suppress_dspy_output=False,
223 |     dataset_mode=None,
224 |     dataset_path=None,
225 |     missing_mode_file="",
226 |     api_key=None,
227 |     api_base=None,
228 | ):
229 |     # 只有当benchmarks是字符串列表时才进行注册
230 |     if benchmarks and isinstance(benchmarks[0], str):
231 |         benchmarks = register_all_benchmarks(benchmarks)
232 | 
233 |     for benchmark_meta in benchmarks:
234 |         evaluate(
235 |             benchmark_meta,
236 |             lm,
237 |             file_path,
238 |             num_threads,
239 |             suppress_dspy_output,
240 |             dataset_mode,
241 |             dataset_path,
242 |             missing_mode_file,
243 |             api_key=api_key,
244 |             api_base=api_base,
245 |         )
246 | 
247 |     df = read_evaluation_results(file_path)
248 |     df.to_csv(f"{file_path}/evaluation_results.csv", index=False)
249 |     df["model"] = lm
250 | 
251 |     # generate evaluation records
252 |     generate_evaluation_records(file_path)
253 | 
254 | global_config=None
255 | def main():
256 |     import multiprocessing
257 |     multiprocessing.freeze_support()
258 |     
259 |     parser = argparse.ArgumentParser(description="LangProbe benchmark evaluation")
260 |     parser.add_argument("--benchmark", type=str, required=True, help="Benchmark to evaluate")
261 |     parser.add_argument("--lm", type=str, required=True, help="Language model to use")
262 |     parser.add_argument("--lm_api_key", type=str, help="API key for language model")
263 |     parser.add_argument(
264 |         "--lm_api_base", type=str, help="API base for language model"
265 |     )
266 |     parser.add_argument(
267 |         "--dataset_mode", type=str, help="Dataset mode (train, val, test)"
268 |     )
269 |     parser.add_argument(
270 |         "--dataset_path", type=str, help="Dataset path"
271 |     )
272 |     parser.add_argument(
273 |         "--num_threads", type=int, default=8, help="Number of threads to use"
274 |     )
275 |     parser.add_argument(
276 |         "--file_path", type=str, default="evaluation", help="File path for evaluation results"
277 |     )
278 |     parser.add_argument(
279 |         "--suppress_dspy_output",
280 |         action="store_true",
281 |         help="Suppress dspy output",
282 |     )
283 |     parser.add_argument(
284 |         "--missing_mode_file",
285 |         type=str,
286 |         default="",
287 |         help="Only run missing experiments (skip experiments that already have results), value = path to log/jsonl",
288 |     )
289 |     parser.add_argument(
290 |         "--config",
291 |         type=str,
292 |         default='ddgo.json',
293 |         help="Configuration file for the benchmark",
294 |     )
295 | 
296 |     args = parser.parse_args()
297 | 
298 |     global global_config
299 |     global_config= read_json(args.config)
300 |     # 处理benchmark参数
301 |     benchmark_path = args.benchmark
302 |     if not benchmark_path.startswith("langProBe."):
303 |         benchmark_path = f"langProBe.{benchmark_path}"
304 |     
305 |     # 注册所有基准测试
306 |     register_all_benchmarks([benchmark_path])
307 | 
308 |     benchmarks = [benchmark for benchmark in registered_benchmarks]
309 |     if not benchmarks:
310 |         print(f"No benchmark registered with name {args.benchmark}")
311 |         sys.exit(1)
312 | 
313 |     evaluate_all(
314 |         benchmarks,
315 |         args.lm,
316 |         args.file_path,
317 |         num_threads=args.num_threads,
318 |         suppress_dspy_output=args.suppress_dspy_output,
319 |         dataset_mode=args.dataset_mode,
320 |         dataset_path=args.dataset_path,
321 |         missing_mode_file=args.missing_mode_file,
322 |         api_key=args.lm_api_key,
323 |         api_base=args.lm_api_base,
324 |     )
325 | 
326 | if __name__ == "__main__":
327 |     main()
328 | 


--------------------------------------------------------------------------------
/langProBe/evaluation_utils.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import dspy
  3 | from typing import List, Tuple, Optional
  4 | from langProBe.program_utils import call_lm, ProcessManager
  5 | import langProBe.constants as constants
  6 | import logging
  7 | import re
  8 | import string
  9 | import warnings
 10 | import os
 11 | import logging
 12 | import numpy as np
 13 | 
 14 | 
 15 | EVALUATE_PROMPT = """对于以下问题：{question}
 16 | 
 17 | 请判断预测答案是否回答正确，回答对关键信息就算正确:
 18 | 
 19 | 预测答案: {prediction}
 20 | 正确答案: {ground_truth}
 21 | 
 22 | 只需要返回True或False。"""
 23 | 
 24 | def evaluate_final_answer(
 25 |             question: str, 
 26 |             ground_truth: str, 
 27 |             prediction: str, 
 28 |             manager: ProcessManager,
 29 |             logger: logging.Logger,
 30 |             ) -> Tuple[bool, Optional[str]]:
 31 |     prompt = EVALUATE_PROMPT.format(question=question, prediction=prediction, ground_truth=ground_truth)
 32 |     messages = [
 33 |         {
 34 |             constants.ROLE: constants.USER,
 35 |             constants.CONTENT: prompt
 36 |         }
 37 |     ]
 38 |     logger.info(f"开始评测final answer")
 39 |     logger.info(f"question: {question}")
 40 |     logger.info(f"ground_truth: {ground_truth}")
 41 |     logger.info(f"prediction: {prediction}")
 42 |     response_content, _, _ = call_lm(messages, manager, logger, temperature=0.01)
 43 |     return "true" in response_content.lower()
 44 | 
 45 | 
 46 | def normalize_number_str(number_str: str) -> float:
 47 |     # we replace these common units and commas to allow
 48 |     # conversion to float
 49 |     for char in ["$", "%", ","]:
 50 |         number_str = number_str.replace(char, "")
 51 |     try:
 52 |         return float(number_str)
 53 |     except ValueError:
 54 |         print(f"String {number_str} cannot be normalized to number str.")
 55 |         return float("inf")
 56 | 
 57 | 
 58 | def split_string(
 59 |         s: str,
 60 |         char_list: list[str] = [",", ";"],
 61 | ) -> list[str]:
 62 |     pattern = f"[{''.join(char_list)}]"
 63 |     return re.split(pattern, s)
 64 | 
 65 | def normalize_str(input_str, remove_punct=True) -> str:
 66 |     """
 67 |     Normalize a string by:
 68 |     - Removing all white spaces
 69 |     - Optionally removing punctuation (if remove_punct is True)
 70 |     - Converting to lowercase
 71 |     Parameters:
 72 |     - input_str: str, the string to normalize
 73 |     - remove_punct: bool, whether to remove punctuation (default: True)
 74 |     Returns:
 75 |     - str, the normalized string
 76 |     """
 77 |     # Remove all white spaces. Required e.g for seagull vs. sea gull
 78 |     no_spaces = re.sub(r"\s", "", input_str)
 79 | 
 80 |     # Remove punctuation, if specified.
 81 |     if remove_punct:
 82 |         translator = str.maketrans("", "", string.punctuation)
 83 |         return no_spaces.lower().translate(translator)
 84 |     else:
 85 |         return no_spaces.lower()
 86 | 
 87 | 
 88 | def question_scorer(
 89 |         model_answer: str,
 90 |         ground_truth: str,
 91 |         logger: logging.Logger
 92 | ) -> Tuple[bool, Optional[str]]:
 93 |     def is_float(element: any) -> bool:
 94 |         try:
 95 |             float(element)
 96 |             return True
 97 |         except ValueError:
 98 |             return False
 99 | 
100 |     if model_answer is None:
101 |         model_answer = "None"
102 |         logger.debug("Model answer is None. Converted to string 'None'.")
103 | 
104 |     # If ground truth is a number
105 |     if is_float(ground_truth):
106 |         info = f"Evaluating '{model_answer}' as a number."
107 |         logger.info(info)
108 |         normalized_answer = normalize_number_str(model_answer)
109 |         try:
110 |             result = normalized_answer == float(ground_truth)
111 |             logger.debug(f"Normalized model answer: {normalized_answer}, Ground truth: {ground_truth}, Result: {result}")
112 |             return result
113 |         except ValueError as e:
114 |             error_msg = f"Normalization error: {e}"
115 |             logger.error(error_msg)
116 |             return False
117 | 
118 |     # If ground truth is a list
119 |     elif any(char in ground_truth for char in [",", ";"]):
120 |         info = f"Evaluating '{model_answer}' as a comma/semi-colon separated list."
121 |         logger.info(info)
122 | 
123 |         gt_elems = split_string(ground_truth)
124 |         ma_elems = split_string(model_answer)
125 |         logger.debug(f"Ground truth elements: {gt_elems}")
126 |         logger.debug(f"Model answer elements: {ma_elems}")
127 | 
128 |         # Check if lengths are the same
129 |         if len(gt_elems) != len(ma_elems):
130 |             warning_msg = "Answer lists have different lengths."
131 |             logger.warning(warning_msg)
132 |             return False
133 | 
134 |         # Compare each element as float or string
135 |         comparisons = []
136 |         for idx, (ma_elem, gt_elem) in enumerate(zip(ma_elems, gt_elems), start=1):
137 |             if is_float(gt_elem):
138 |                 try:
139 |                     normalized_ma_elem = normalize_number_str(ma_elem)
140 |                     comparison = normalized_ma_elem == float(gt_elem)
141 |                     logger.debug(f"Element {idx}: Normalized model answer element '{normalized_ma_elem}' == Ground truth element '{float(gt_elem)}': {comparison}")
142 |                 except ValueError as e:
143 |                     error_msg = f"Normalization error at element {idx}: {e}"
144 |                     logger.error(error_msg)
145 |                     return False
146 |             else:
147 |                 normalized_ma = normalize_str(ma_elem, remove_punct=False)
148 |                 normalized_gt = normalize_str(gt_elem, remove_punct=False)
149 |                 comparison = normalized_ma == normalized_gt
150 |                 logger.debug(f"Element {idx}: Normalized model answer element '{normalized_ma}' == Ground truth element '{normalized_gt}': {comparison}")
151 |             comparisons.append(comparison)
152 | 
153 |         all_correct = all(comparisons)
154 |         if not all_correct:
155 |             detail_msg = "Mismatch found in list elements."
156 |             logger.info(detail_msg)
157 |             return all_correct
158 |         logger.debug("All list elements match the ground truth.")
159 |         return all_correct
160 | 
161 |     # If ground truth is a string
162 |     else:
163 |         info = f"Evaluating '{model_answer}' as a string."
164 |         logger.info(info)
165 |         normalized_ma = normalize_str(model_answer)
166 |         normalized_gt = normalize_str(ground_truth)
167 |         result = normalized_ma == normalized_gt
168 |         logger.debug(f"Normalized model answer: '{normalized_ma}' == Normalized ground truth: '{normalized_gt}': {result}")
169 |         return result
170 | 
171 | def mcp_metric(example: dspy.Example, pred: dspy.Prediction):
172 |     return pred.success
173 | 
174 | 
175 | 
176 | def extract_questions(data, key):
177 |     """从数据中提取指定字段（如 Prompt 或 question）用于比较"""
178 |     questions = set()
179 |     for item in data:
180 |         questions.add(item[key])
181 |     return questions
182 | 
183 | def find_missing_entries(data_a, data_b):
184 |     # data_a是原数据，data_b是已经跑了的数据
185 |  
186 |     questions_in_b = extract_questions(data_b, 'question')
187 | 
188 |     # 找出在B中不存在的A条目
189 |     missing_entries = [item for item in data_a if item['Prompt'] not in questions_in_b]
190 | 
191 |     return missing_entries
192 | 
193 | import logging
194 | 
195 | import os
196 | import logging
197 | 
198 | def replace_logger_filehandler(new_log_name):
199 |     """
200 |     替换 logger 中已有的 FileHandler，并为每个 logger 保留其原有的 formatter。
201 |     同时删除原有日志文件。
202 | 
203 |     :param new_log_name: 新的日志文件名（不带后缀）
204 |     """
205 | 
206 |     def update_handler(logger, file_suffix):
207 |         old_log_paths = []
208 |         formatter = None
209 |         for handler in logger.handlers:
210 |             if isinstance(handler, logging.FileHandler):
211 |                 if formatter is None:
212 |                     formatter = handler.formatter
213 |                 old_log_paths.append(handler.baseFilename)
214 | 
215 |         for handler in list(logger.handlers):
216 |             if isinstance(handler, logging.FileHandler):
217 |                 handler.close()
218 |                 logger.removeHandler(handler)
219 | 
220 |         for log_path in old_log_paths:
221 |             if os.path.exists(log_path):
222 |                 try:
223 |                     os.remove(log_path)
224 |                 except Exception as e:
225 |                     pass
226 | 
227 |         if logger.name == 'MCPPredictRunLogger':
228 |             new_name = new_log_name.replace("message", "run")
229 |         else:
230 |             new_name = new_log_name 
231 | 
232 |         new_handler = logging.FileHandler(f"{new_name}.{file_suffix}", mode='a', encoding='utf-8')
233 |         if formatter:
234 |             new_handler.setFormatter(formatter)
235 |         logger.addHandler(new_handler)
236 | 
237 |     run_logger = logging.getLogger('MCPPredictRunLogger')
238 |     update_handler(run_logger, 'log')
239 | 
240 |     message_logger = logging.getLogger('MCPPredictMessageLogger')
241 |     update_handler(message_logger, 'jsonl')
242 | 
243 | 
244 | 
245 | if __name__ == "__main__":
246 |     print(question_scorer("123", "123"))
247 |     


--------------------------------------------------------------------------------
/langProBe/langchain_program.py:
--------------------------------------------------------------------------------
 1 | from langchain.chains import LLMChain
 2 | from langchain.prompts import PromptTemplate
 3 | from langchain_community.chat_models import ChatLiteLLM
 4 | 
 5 | from langProBe.program_utils import DotDict
 6 | 
 7 | 
 8 | class LangProBeLangChainMetaProgram:
 9 |     def __init__(self, input_kwargs, output_kwargs):
10 |         self.lm = None
11 |         self.input_kwargs = input_kwargs
12 |         self.out_kwargs = output_kwargs
13 | 
14 |     def setup_lm(self, lm: str, api_key: str = None, api_base: str = None):
15 |         self.lm = ChatLiteLLM(model=lm, api_key=api_key, api_base=api_base)
16 | 
17 | 
18 | class NaiveLangChainProgram(LangProBeLangChainMetaProgram):
19 |     def __call__(self, **kwargs):
20 |         if not self.lm:
21 |             raise ValueError("Language model not initialized. Call setup_lm() first.")
22 | 
23 |         # Validate input keys
24 |         missing_keys = [key for key in self.input_kwargs if key not in kwargs]
25 |         if missing_keys:
26 |             raise ValueError(f"Missing required inputs: {missing_keys}")
27 | 
28 |         # Dynamically generate prompt template
29 |         prompt_text = "Given the following inputs:\n"
30 |         for key in self.input_kwargs:
31 |             prompt_text += f"- {key}: {{{key}}}\n"
32 |         prompt_text += f"Output the following field: {self.out_kwargs[0]}. Your response should be this output field only, with no explanation and formatting.\n Your response:"
33 | 
34 |         prompt_template = PromptTemplate(
35 |             input_variables=self.input_kwargs, template=prompt_text
36 |         )
37 | 
38 |         # Create LLM chain
39 |         chain = LLMChain(llm=self.lm, prompt=prompt_template)
40 | 
41 |         # Run the chain
42 |         response = chain.run(kwargs)
43 | 
44 |         # Format output
45 |         return DotDict({self.out_kwargs[0]: response})
46 | 


--------------------------------------------------------------------------------
/langProBe/mcp_program.py:
--------------------------------------------------------------------------------
  1 | import dspy
  2 | from pydantic import BaseModel, Field
  3 | from langProBe.program_utils import (
  4 |     call_lm, 
  5 |     build_init_messages, 
  6 |     build_messages,
  7 |     response_parsing,
  8 |     mcp_calling,
  9 |     ProcessManager
 10 | )
 11 | import time
 12 | from langProBe.evaluation_utils import evaluate_final_answer
 13 | import langProBe.constants as constants
 14 | import logging
 15 | import os
 16 | from datetime import datetime
 17 | import json
 18 | from typing import List, Dict, Optional, Tuple
 19 | 
 20 | 
 21 | MCP_SAMPLE_SYSTEM_PROMPT = """
 22 | You are a helpful assistant. You are able to answer questions using different tools.  
 23 | The content of your available tools begins with ## Available Tools, indicating the collection of usable tools.  
 24 | Within the tool collection, each server is identified by ### server_name, where server_name represents the name of the server.  
 25 | Under each server, there are multiple tools (tool), and each tool starts with - tool_name, where tool_name is the name of the tool.  
 26 | The tool description includes:  
 27 | A brief text description outlining the functionality of the tool.  
 28 | Detailed information about input parameters, where each parameter includes: parameter name, parameter type, whether it is mandatory, and the purpose or description of the parameter.
 29 | """
 30 | 
 31 | class MCP_LM(BaseModel):
 32 |     model: str = Field(
 33 |         default=None,
 34 |         description="The model to use for the MCP program.",
 35 |     )
 36 |     api_key: str = Field(
 37 |         default=None,
 38 |         description="The API key for the model.",
 39 |     )
 40 |     api_base: str = Field(
 41 |         default=None,
 42 |         description="The API base URL for the model.",
 43 |     )
 44 | 
 45 | class LangProBeMCPMetaProgram(dspy.Module):
 46 |     def __init__(self):
 47 |         super().__init__()
 48 |         self.lm = MCP_LM()
 49 |     def setup_lm(self, lm, api_key=None, api_base=None):
 50 |         self.lm.model = lm
 51 |         self.lm.api_key = api_key
 52 |         self.lm.api_base = api_base
 53 | 
 54 |     def program_type(self):
 55 |         return "mcp"
 56 |     
 57 | 
 58 | class MCPPredict(LangProBeMCPMetaProgram, dspy.Module):
 59 |     def __init__(self, max_steps=5, system_prompt=MCP_SAMPLE_SYSTEM_PROMPT, task_name="mcp_sample"):
 60 |         super().__init__()
 61 |         self.system_prompt = system_prompt
 62 |         self.task_name = task_name
 63 |         self.max_steps = max_steps
 64 |         self.max_length = 30000
 65 | 
 66 |         # 配置运行日志记录器
 67 |         self.run_logger = logging.getLogger('MCPPredictRunLogger')
 68 |         self.run_logger.setLevel(logging.INFO)
 69 | 
 70 |         # 配置消息日志记录器
 71 |         self.message_logger = logging.getLogger('MCPPredictMessageLogger')
 72 |         self.message_logger.setLevel(logging.INFO)
 73 | 
 74 |         # 创建日志目录
 75 |         os.makedirs('logs', exist_ok=True)
 76 |         self.setup_loggers()
 77 | 
 78 |     def setup_loggers(self):
 79 |         timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
 80 |         
 81 |         # 设置运行日志
 82 |         run_log_file = f'logs/{self.task_name}_run_{timestamp}.log'
 83 |         run_handler = logging.FileHandler(run_log_file, encoding='utf-8')
 84 |         run_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 85 |         run_handler.setFormatter(run_formatter)
 86 |         self.run_logger.addHandler(run_handler)
 87 | 
 88 |         # 设置消息日志
 89 |         message_log_file = f'logs/{self.task_name}_messages_{timestamp}.jsonl'
 90 |         message_handler = logging.FileHandler(message_log_file, encoding='utf-8')
 91 |         self.message_logger.addHandler(message_handler)
 92 | 
 93 | 
 94 |     def update_log_paths(self, new_log_dir):
 95 |         # 确保新的日志目录存在
 96 |         os.makedirs(new_log_dir, exist_ok=True)
 97 |         
 98 |         # 更新运行日志记录器
 99 |         for handler in self.run_logger.handlers[:]:
100 |             self.run_logger.removeHandler(handler)
101 |         
102 |         run_log_file = f'{new_log_dir}/{self.task_name}_run_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
103 |         run_handler = logging.FileHandler(run_log_file, encoding='utf-8')
104 |         run_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
105 |         run_handler.setFormatter(run_formatter)
106 |         self.run_logger.addHandler(run_handler)
107 | 
108 |         # 更新消息日志记录器
109 |         for handler in self.message_logger.handlers[:]:
110 |             self.message_logger.removeHandler(handler)
111 |         
112 |         message_log_file = f'{new_log_dir}/{self.task_name}_messages_{datetime.now().strftime("%Y%m%d_%H%M%S")}.jsonl'
113 |         message_handler = logging.FileHandler(message_log_file, encoding='utf-8')
114 |         self.message_logger.addHandler(message_handler)
115 | 
116 |     def evaluate_prediction(self, question: str, ground_truth: str, prediction: str) -> Tuple[bool, Optional[str]]:
117 |         answer_eval_manager = ProcessManager()
118 |         answer_eval_manager.lm_api_key = self.lm.api_key
119 |         answer_eval_manager.lm_api_base = self.lm.api_base
120 |         answer_eval_manager.model = "openai/deepseek-v3"
121 |         return evaluate_final_answer(question, ground_truth, prediction, answer_eval_manager, self.run_logger)
122 | 
123 |     def log_messages(self, messages, question, success, time_cost, prompt_tokens_cost, completion_tokens_cost):
124 |         log_entry = {
125 |             "question": question,
126 |             "messages": messages,
127 |             "success": success,
128 |             "time_cost": time_cost,
129 |             "prompt_tokens_cost": prompt_tokens_cost,
130 |             "completion_tokens_cost": completion_tokens_cost
131 |         }
132 |         self.message_logger.info(json.dumps(log_entry, ensure_ascii=False))
133 | 
134 | 
135 |     def forward(self, **kwargs) -> dspy.Prediction:
136 |         unique_id = kwargs.get('id')
137 |         question = kwargs.get('question')
138 |         gt = kwargs.get('answer')
139 | 
140 |         manager = ProcessManager()
141 |         manager.lm_api_key = self.lm.api_key
142 |         manager.lm_api_base = self.lm.api_base
143 |         manager.model = self.lm.model
144 |         manager.id = unique_id
145 | 
146 |         self.run_logger.info(f"ID: {manager.id}, Starting forward pass for question: {question}")
147 | 
148 | 
149 |         from langProBe.evaluation import global_config
150 |         mcps = global_config['mcp_pool']
151 | 
152 |         messages = build_init_messages(self.system_prompt, mcps, question)
153 |         steps = 0
154 |         all_completion_tokens = 0
155 |         all_prompt_tokens = 0
156 |         start_time = time.time()
157 | 
158 |         while not messages[-1][constants.ROLE] == constants.ASSISTANT and steps < self.max_steps:
159 |             response, completion_tokens, prompt_tokens= call_lm(messages, manager, self.run_logger)
160 |             all_completion_tokens += completion_tokens
161 |             all_prompt_tokens += prompt_tokens
162 |             mcp_calls = response_parsing(response)
163 | 
164 |             new_messages = mcp_calling(mcp_calls, manager, self.run_logger)
165 | 
166 |             messages = build_messages(messages, new_messages)
167 |             steps += 1
168 | 
169 |         end_time = time.time()
170 | 
171 |         # 如果达到最大步数仍未得到答案
172 |         if messages[-1][constants.ROLE] != constants.ASSISTANT:
173 |             self.run_logger.warning("Maximum steps reached without getting an answer")
174 |             messages.append({
175 |                 constants.ROLE: constants.ASSISTANT,
176 |                 constants.CONTENT: "超过最长次数限制，该问题无法解决",
177 |             })
178 | 
179 | 
180 |         self.run_logger.info(f"ID: {manager.id}, Forward pass completed successfully")
181 |         success = self.evaluate_prediction(question, gt, messages[-1][constants.CONTENT])
182 |         self.log_messages(messages, question, success, (end_time-start_time), all_prompt_tokens, all_completion_tokens)
183 |         self.run_logger.info(f"ID: {manager.id}, Evaluation completed successfully")
184 |         # self.run_logger.info("==" * 50)
185 | 
186 |         return dspy.Prediction(
187 |             success=success,
188 |             question=question,
189 |             ground_truth=gt,
190 |             answer=messages[-1][constants.CONTENT],
191 |             trace=messages,
192 |             process_report=manager
193 |         )


--------------------------------------------------------------------------------
/langProBe/optimizers.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import random
  3 | from dataclasses import dataclass
  4 | from functools import partial
  5 | from typing import Callable, Type
  6 | 
  7 | import dspy
  8 | import dspy.teleprompt
  9 | import numpy as np
 10 | from dspy.evaluate.evaluate import Evaluate
 11 | from dspy.teleprompt import BootstrapFewShot
 12 | 
 13 | 
 14 | class BootstrapFewShotInfer(BootstrapFewShot):
 15 |     def __init__(
 16 |         self,
 17 |         num_candidates=5,
 18 |         num_rules=5,
 19 |         num_threads=8,
 20 |         teacher_settings=None,
 21 |         **kwargs,
 22 |     ):
 23 |         super().__init__(teacher_settings=teacher_settings, **kwargs)
 24 |         self.num_candidates = num_candidates
 25 |         self.num_rules = num_rules
 26 |         self.num_threads = num_threads
 27 |         self.rules_induction_program = RulesInductionProgramINFER(
 28 |             num_rules, teacher_settings=teacher_settings
 29 |         )
 30 |         self.metric = kwargs.get("metric")
 31 |         self.max_errors = kwargs.get("max_errors", 5)
 32 | 
 33 |     def compile(self, student, *, teacher=None, trainset, valset=None):
 34 |         super().compile(student, teacher=teacher, trainset=trainset)
 35 |         if valset is None:
 36 |             train_size = int(0.8 * len(trainset))
 37 |             trainset, valset = trainset[:train_size], trainset[train_size:]
 38 |         original_program = copy.deepcopy(self.student)
 39 |         all_predictors = [
 40 |             p for p in original_program.predictors() if hasattr(p, "signature")
 41 |         ]
 42 |         instructions_list = [p.signature.instructions for p in all_predictors]
 43 | 
 44 |         best_score = -np.inf
 45 |         best_program = None
 46 | 
 47 |         for candidate_idx in range(self.num_candidates):
 48 |             candidate_program = copy.deepcopy(original_program)
 49 |             candidate_predictors = [
 50 |                 p for p in candidate_program.predictors() if hasattr(p, "signature")
 51 |             ]
 52 |             for i, predictor in enumerate(candidate_predictors):
 53 |                 predictor.signature.instructions = instructions_list[i]
 54 |             for i, predictor in enumerate(candidate_predictors):
 55 |                 rules = self.induce_natural_language_rules(predictor, trainset)
 56 |                 predictor.signature.instructions = instructions_list[i]
 57 |                 self.update_program_instructions(predictor, rules)
 58 |             score = self.evaluate_program(candidate_program, valset)
 59 |             if score > best_score:
 60 |                 best_score = score
 61 |                 best_program = candidate_program
 62 |                 print(
 63 |                     f"New best candidate (Candidate {candidate_idx+1}) with score {best_score}"
 64 |                 )
 65 |         print("Final best score:", best_score)
 66 |         self.student = best_program
 67 |         return best_program
 68 | 
 69 |     def induce_natural_language_rules(self, predictor, trainset):
 70 |         demos = self.get_predictor_demos(trainset, predictor)
 71 |         signature = predictor.signature
 72 |         while True:
 73 |             examples_text = self.format_examples(demos, signature)
 74 |             try:
 75 |                 natural_language_rules = self.rules_induction_program(examples_text)
 76 |                 break
 77 |             except Exception as e:
 78 |                 print("entereing here")
 79 |                 print(len(demos))
 80 | 
 81 |                 if (
 82 |                     isinstance(e, ValueError)
 83 |                     or e.__class__.__name__ == "BadRequestError"
 84 |                     or "ContextWindowExceededError" in str(e)
 85 |                 ):
 86 |                     if len(demos) > 1:
 87 |                         demos = demos[:-1]
 88 |                     else:
 89 |                         natural_language_rules = ""
 90 |                         raise RuntimeError(
 91 |                             "Failed to generate natural language rules: A single example could not fit in context."
 92 |                         ) from e
 93 |         return natural_language_rules
 94 | 
 95 |     def update_program_instructions(self, predictor, natural_language_rules):
 96 |         predictor.signature.instructions = (
 97 |             f"{predictor.signature.instructions}\n\n"
 98 |             f"Please apply the following rules when making your prediction:\n{natural_language_rules}"
 99 |         )
100 | 
101 |     def format_examples(self, demos, signature):
102 |         examples_text = ""
103 |         for demo in demos:
104 |             input_fields = {
105 |                 k: v for k, v in demo.items() if k in signature.input_fields
106 |             }
107 |             output_fields = {
108 |                 k: v for k, v in demo.items() if k in signature.output_fields
109 |             }
110 |             input_text = "\n".join(f"{k}: {v}" for k, v in input_fields.items())
111 |             output_text = "\n".join(f"{k}: {v}" for k, v in output_fields.items())
112 |             examples_text += f"Example:\n{input_text}\n{output_text}\n\n"
113 |         return examples_text
114 | 
115 |     def get_predictor_demos(self, trainset, predictor):
116 |         signature = predictor.signature
117 |         return [
118 |             {
119 |                 key: value
120 |                 for key, value in example.items()
121 |                 if key in signature.input_fields or key in signature.output_fields
122 |             }
123 |             for example in trainset
124 |         ]
125 | 
126 |     def evaluate_program(self, program, dataset):
127 |         evaluate = Evaluate(
128 |             devset=dataset,
129 |             metric=self.metric,
130 |             num_threads=self.num_threads,
131 |             max_errors=self.max_errors,
132 |             display_table=False,
133 |             display_progress=True,
134 |             return_all_scores=True,
135 |         )
136 |         score, _ = evaluate(program, metric=self.metric)
137 |         return score
138 | 
139 | 
140 | class RulesInductionProgramINFER(dspy.Module):
141 |     def __init__(self, num_rules, teacher_settings=None, verbose=False):
142 |         super().__init__()
143 |         docstring = f"""Given a set of examples, extract a set of {num_rules} concise and non-redundant natural language rules that explain the patterns in the data. These rules should be specific and actionable, providing clear guidance for performing the task."""
144 | 
145 |         class CustomRulesInduction(dspy.Signature):
146 |             __doc__ = docstring
147 |             examples_text = dspy.InputField(desc="Text containing examples")
148 |             natural_language_rules = dspy.OutputField(
149 |                 desc="Induced natural language rules"
150 |             )
151 | 
152 |         self.rules_induction = dspy.ChainOfThought(CustomRulesInduction)
153 |         self.verbose = verbose
154 |         self.teacher_settings = teacher_settings or {}
155 | 
156 |     def forward(self, examples_text):
157 |         original_temp = dspy.settings.lm.kwargs.get("temperature", 0.7)
158 |         if self.teacher_settings:
159 |             with dspy.settings.context(**self.teacher_settings):
160 |                 print("Using teacher settings")
161 |                 print(dspy.settings.lm.model)
162 |                 dspy.settings.lm.kwargs["temperature"] = random.uniform(0.9, 1.0)
163 |                 print(dspy.settings.lm.kwargs["temperature"])
164 |                 prediction = self.rules_induction(examples_text=examples_text)
165 |         else:
166 |             # print('Using default DSPy settings')
167 |             # print(dspy.settings.lm.model)
168 |             dspy.settings.lm.kwargs["temperature"] = random.uniform(0.9, 1.0)
169 |             prediction = self.rules_induction(examples_text=examples_text)
170 |             dspy.settings.lm.kwargs["temperature"] = original_temp
171 |         natural_language_rules = prediction.natural_language_rules.strip()
172 |         if self.verbose:
173 |             print(natural_language_rules)
174 |         return natural_language_rules
175 | 
176 | 
177 | @dataclass
178 | class OptimizerConfig:
179 |     optimizer: Type[dspy.teleprompt.Teleprompter]
180 |     init_args: dict
181 |     compile_args: dict
182 |     langProBe_configs: dict
183 |     name: str
184 | 
185 |     def __str__(self):
186 |         return f"""
187 | [[
188 |     Optimizer: {self.name} ({self.optimizer})
189 |     init_args: {self.init_args}
190 |     compile_args: {self.compile_args}
191 |     langProBe_configs: {self.langProBe_configs}
192 | ]]
193 |         """
194 | 
195 |     def __repr__(self):
196 |         return self.__str__()
197 | 
198 | 
199 | # Optimizer configuration formats:
200 | DEFAULT_OPTIMIZERS = [
201 |     OptimizerConfig(
202 |         optimizer=dspy.teleprompt.BootstrapFewShot,
203 |         init_args=dict(max_errors=5000, max_labeled_demos=2),
204 |         compile_args=dict(),
205 |         langProBe_configs=dict(use_valset=False, save_candidate_score=False),
206 |         name="BootstrapFewShot",
207 |     ),
208 |     OptimizerConfig(
209 |         optimizer=dspy.teleprompt.BootstrapFewShotWithRandomSearch,
210 |         init_args=dict(max_errors=5000, max_labeled_demos=2, num_threads=16),
211 |         compile_args=dict(),
212 |         langProBe_configs=dict(use_valset=True, save_candidate_score=True),
213 |         name="BootstrapFewShotWithRandomSearch",
214 |     ),
215 |     OptimizerConfig(
216 |         optimizer=dspy.teleprompt.MIPROv2,
217 |         init_args=dict(max_errors=5000, auto="medium", num_threads=16),
218 |         compile_args=dict(
219 |             requires_permission_to_run=False,
220 |             num_trials=20,
221 |             max_bootstrapped_demos=4,
222 |             max_labeled_demos=2,
223 |         ),
224 |         langProBe_configs=dict(
225 |             use_valset=True,
226 |             save_candidate_score=True,
227 |         ),
228 |         name="MIPROv2-lite",
229 |     ),
230 |     OptimizerConfig(
231 |         optimizer=dspy.teleprompt.MIPROv2,
232 |         init_args=dict(max_errors=5000, num_threads=16, num_candidates=12),
233 |         compile_args=dict(
234 |             requires_permission_to_run=False,
235 |             num_trials=50,
236 |             max_bootstrapped_demos=4,
237 |             max_labeled_demos=2,
238 |             minibatch_size=35,
239 |             minibatch_full_eval_steps=5,
240 |         ),
241 |         langProBe_configs=dict(
242 |             use_valset=True,
243 |             save_candidate_score=True,
244 |         ),
245 |         name="MIPROv2",
246 |     ),
247 |     OptimizerConfig(
248 |         optimizer=BootstrapFewShotInfer,
249 |         init_args=dict(max_errors=5000, num_candidates=10, num_rules=10, num_threads=8),
250 |         compile_args=dict(),
251 |         langProBe_configs=dict(use_valset=True),
252 |         name="RuleInfer-lite",
253 |     ),
254 |     OptimizerConfig(
255 |         optimizer=BootstrapFewShotInfer,
256 |         init_args=dict(max_errors=5000, num_candidates=10, num_rules=20, num_threads=8),
257 |         compile_args=dict(),
258 |         langProBe_configs=dict(use_valset=True),
259 |         name="RuleInfer",
260 |     ),
261 | ]
262 | 
263 | 
264 | def update_optimizer_from_list(
265 |     optimizer_list: list[OptimizerConfig], optimizer: OptimizerConfig
266 | ) -> list[OptimizerConfig]:
267 |     new_optimizer_list = []
268 |     for optimizer_config in optimizer_list:
269 |         if optimizer.optimizer == optimizer_config.optimizer:
270 |             new_optimizer_list.append(optimizer)
271 |         else:
272 |             new_optimizer_list.append(optimizer_config)
273 |     return new_optimizer_list
274 | 
275 | 
276 | def create_optimizer(
277 |     optimizer_config: OptimizerConfig, metric, num_threads=None
278 | ) -> tuple[Callable, dict]:
279 |     name = optimizer_config.name
280 |     optimizer = optimizer_config.optimizer
281 |     init_args = optimizer_config.init_args
282 |     if num_threads and "num_threads" in init_args:
283 |         init_args["num_threads"] = num_threads
284 |     compile_args = optimizer_config.compile_args
285 |     langProBe_configs = optimizer_config.langProBe_configs | {"name": name}
286 |     optimizer = optimizer(metric=metric, **init_args)
287 |     return partial(optimizer.compile, **compile_args), langProBe_configs
288 | 


--------------------------------------------------------------------------------
/langProBe/program_utils.py:
--------------------------------------------------------------------------------
  1 | from tenacity import retry, stop_after_attempt, wait_exponential, before_sleep_log
  2 | from typing import List, Tuple, Optional, Dict, Union
  3 | from openai import OpenAI
  4 | import json
  5 | import copy
  6 | from pydantic import BaseModel, Field
  7 | import re
  8 | import os
  9 | import langProBe.constants as constants
 10 | import logging
 11 | from .synced_mcp_client import SyncedMcpClient
 12 | 
 13 | TOOL_PROMPT = """
 14 | ## Tool Calling Rules
 15 | When external tools are required, the call request must be strictly generated according to the following rules:
 16 | <tool>  
 17 | {  
 18 |   "server_name": "",  
 19 |   "tool_name": "",  
 20 |   "inputs": {  
 21 |     "<parameter1>": "<value1>",  
 22 |     "<parameter2>": "<value2>",  
 23 |   }  
 24 | }  
 25 | </tool>  
 26 | 
 27 | If no tool is called, provide the final answer directly.
 28 | 
 29 | """
 30 |             
 31 | class ProcessManager(BaseModel):
 32 |     id: str = Field(
 33 |         default=None,
 34 |         description="The ID of the process.",
 35 |     )
 36 |     lm_api_key: str = Field(
 37 |         default=os.getenv("OPENAI_API_KEY"),
 38 |         description="OpenAI API Key"
 39 |     )
 40 |     lm_api_base: str = Field(
 41 |         default=os.getenv("OPENAI_API_BASE"),
 42 |         description="OpenAI API Base URL"
 43 |     )
 44 |     model: str = Field(
 45 |         default=None,
 46 |         description="OpenAI Model Name, with prefix 'openai/'"
 47 |     )
 48 |     lm_usages: List[Dict] = Field(
 49 |         default=[],
 50 |         description="Usage statistics for the model"
 51 |     )
 52 |     mcp_rts: List[Dict] = Field(
 53 |         default=[],
 54 |         description="Usage statistics for the MCPs"
 55 |     )
 56 |     mcp_retry_times: List[Dict] = Field(
 57 |         default=[],
 58 |         description="Statistics for the MCP retries"
 59 |     )
 60 | 
 61 | 
 62 | class MCPCall(BaseModel):
 63 |     mcp_server_name: Optional[str] = None
 64 |     mcp_tool_name: Optional[str] = None
 65 |     mcp_args: Optional[Dict] = None
 66 | 
 67 | 
 68 | class MCPCallList(BaseModel):
 69 |     shutdown: bool = False
 70 |     mcps: Optional[List[MCPCall]] = None
 71 |     raw_content: Optional[str] = None
 72 | 
 73 | @retry(
 74 |     stop=stop_after_attempt(5),  
 75 |     wait=wait_exponential(multiplier=1, min=2, max=10),  
 76 |     reraise=True,
 77 | )
 78 | def call_lm(
 79 |             messages: List, 
 80 |             manager: ProcessManager, 
 81 |             logger: logging.Logger, 
 82 |             temperature: float|None=None,
 83 |             ) -> tuple[str | None, int, int]:    
 84 |     
 85 |     try:
 86 |         oai = OpenAI(
 87 |             api_key=manager.lm_api_key,
 88 |             base_url=manager.lm_api_base,
 89 |         )
 90 |         prefix, model_name = manager.model.split('/')
 91 |         assert prefix == 'openai'
 92 | 
 93 |         if model_name in ['deepseek-r1', 'qwq-plus', 'qwq-32b']: # qwen reasoning模型仅支持流式输出
 94 |             reasoning_content = ""  # 定义完整思考过程
 95 |             answer_content = ""     # 定义完整回复
 96 |             is_answering = False   # 判断是否结束思考过程并开始回复
 97 | 
 98 |             completion = oai.chat.completions.create(
 99 |                 model=model_name, 
100 |                 messages=messages,
101 |                 stream=True,
102 |                 stream_options={
103 |                     "include_usage": True
104 |                 }
105 |             )
106 |             for chunk in completion:
107 |                 # 如果chunk.choices为空，则打印usage
108 |                 if not chunk.choices:
109 |                     usage = chunk.usage
110 |                 else:
111 |                     delta = chunk.choices[0].delta
112 |                     if hasattr(delta, 'reasoning_content') and delta.reasoning_content != None:
113 |                         reasoning_content += delta.reasoning_content
114 |                     else:
115 |                         # 开始回复
116 |                         if delta.content != "" and is_answering is False:
117 |                             is_answering = True
118 |                         answer_content += delta.content
119 |             completion_tokens = usage.completion_tokens
120 |             prompt_tokens = usage.prompt_tokens
121 |             manager.lm_usages.append({
122 |                 "completion_tokens": completion_tokens,
123 |                 "prompt_tokens": prompt_tokens,
124 |             })
125 |             return '<think>' + reasoning_content + '</think>' + answer_content, completion_tokens, prompt_tokens
126 | 
127 | 
128 |         if temperature is not None:
129 |             response = oai.beta.chat.completions.parse(
130 |                 messages=messages,
131 |                 model=model_name,
132 |                 temperature = temperature
133 |             )
134 |         else:
135 |             response = oai.beta.chat.completions.parse(
136 |                 messages=messages,
137 |                 model=model_name,
138 |             )
139 |             # print("Response is " + str(response))
140 |         response_text = response.choices[0].message.content
141 |         completion_tokens = response.usage.completion_tokens
142 |         prompt_tokens = response.usage.prompt_tokens
143 |         manager.lm_usages.append({
144 |                 "completion_tokens": completion_tokens,
145 |                 "prompt_tokens": prompt_tokens,
146 |             })
147 |         return response_text, completion_tokens, prompt_tokens
148 |     
149 |     except Exception as e:
150 |         logger.error(f"ID: {manager.id}, Error in call_lm: {str(e)}")
151 |         if response:
152 |             logger.error(f"ID: {manager.id}, Response: {response}")
153 |         raise
154 | 
155 | def build_system_content(base_system: str,
156 |                         mcps: List,
157 |                         ) -> str:
158 |     tools_section = "## Available Tools\n"
159 |     for mcp in mcps:
160 |         tools_section += f"### Server '{mcp['name']}' include following tools\n"
161 |         if mcp['name'] in ['wuying-agentbay-mcp-server', 'Playwright']:
162 |             tools_section += f"当使用本server来执行搜索任务时，请以https://www.baidu.com为初始网站进行搜索。"
163 |         url = mcp.get("url")
164 |         if not url:
165 |             try:
166 |                 port = mcp.get('run_config')[0]["port"]
167 |                 url = f"http://localhost:{port}/sse"
168 |             except:
169 |                 raise Exception("No url found")
170 |         client = SyncedMcpClient(server_url=url)
171 |         try:
172 |             result = client.list_tools()
173 |             tools = result.tools
174 |         except Exception as e:
175 |             raise Exception(f"Fail access to server: {mcp['name']}, error: {e}")
176 | 
177 |         for t in tools:
178 |             tools_section += f"- {t.name}: {t.description}\n"
179 |             input_schema = t.inputSchema
180 |             required_params = input_schema.get("required", [])
181 |             params_desc = []
182 | 
183 |             if "properties" in input_schema:
184 |                 for param_name, param_info in input_schema["properties"].items():
185 |                     is_required = param_name in required_params
186 |                     param_type = param_info.get("type", "")
187 |                     param_desc = param_info.get("description", "")
188 | 
189 |                     req_tag = "必填" if is_required else "可选"
190 |                     params_desc.append(
191 |                         f"- {param_name} ({param_type}, {req_tag}): {param_desc}"
192 |                     )
193 | 
194 |             # 使用更丰富的描述
195 |             params_text = "\n".join(params_desc) if params_desc else "无参数"
196 |             tools_section += f"  参数:\n{params_text}\n\n"
197 | 
198 |     prompt = base_system + f"""{tools_section}""" + TOOL_PROMPT
199 | 
200 |     return prompt
201 | 
202 | 
203 | def build_init_messages(
204 |         base_system: str,
205 |         mcps: List,
206 |         user_question: str,
207 |        ) -> List[Dict]:
208 |     system_content = build_system_content(base_system, mcps)
209 |     messages = [
210 |         {
211 |             constants.ROLE: constants.SYSTEM,
212 |             constants.CONTENT: system_content
213 |         },
214 |         {
215 |             constants.ROLE: constants.USER,
216 |             constants.CONTENT: user_question
217 |         }
218 |     ]
219 |     return messages
220 | 
221 | 
222 | 
223 | def build_messages(
224 |         messages: List[Dict],
225 |         message_to_append: List[Dict],
226 |         ) -> List[Dict]:
227 |     assert messages[0][constants.ROLE] == constants.SYSTEM
228 |     
229 |     final_message = copy.deepcopy(messages)
230 | 
231 |     if message_to_append:
232 |         if message_to_append[-1][constants.ROLE] == constants.USER:
233 |             assert len(message_to_append) == 1
234 |             assert final_message[-1][constants.ROLE] in {constants.ASSISTANT, constants.TOOL, constants.SYSTEM}
235 |             final_message.extend(message_to_append)
236 |         elif message_to_append[-1][constants.ROLE] == constants.ASSISTANT:
237 |             assert len(message_to_append) == 1
238 |             assert final_message[-1][constants.ROLE] in {constants.USER, constants.TOOL}
239 |             final_message.extend(message_to_append)
240 |         elif message_to_append[-1][constants.ROLE] == constants.TOOL:
241 |             assert len(message_to_append) == 2
242 |             assert final_message[-1][constants.ROLE] in {constants.USER, constants.TOOL}
243 |             final_message.extend(message_to_append)
244 |     
245 |     # TODO: 超过最长上下文长度处理
246 | 
247 |     return final_message
248 | 
249 | 
250 | 
251 | def response_parsing(content: str) -> MCPCallList:
252 |     pattern = r'<tool>(.*?)<\/tool>'
253 |     matches = re.findall(pattern, content, re.DOTALL)
254 |     mcps = []
255 |     for match in matches:
256 |         # TODO: 错误处理
257 |         data = json.loads(match)
258 |         mcps.append(MCPCall(
259 |             mcp_server_name=data['server_name'].strip(),
260 |             mcp_tool_name=data['tool_name'].strip(),
261 |             mcp_args=data['inputs']
262 |         ))
263 | 
264 |     if mcps:
265 |         return MCPCallList(shutdown=False, mcps=mcps, raw_content=content)
266 |     else:
267 |         return MCPCallList(shutdown=True, mcps=None, raw_content=content)
268 | 
269 | 
270 | def mcp_calling(
271 |         mcp_call_list: MCPCallList,
272 |         manager: ProcessManager,
273 |         logger: logging.Logger,
274 | ) -> List[Dict]:
275 |     logger.debug(f"ID:{manager.id}, Entering mcp_calling with mcp_call_list: {mcp_call_list}")
276 | 
277 |     if mcp_call_list.shutdown:
278 |         logger.info(f"ID:{manager.id}, Shutdown flag is set. No more MCP calling.")
279 |         messages = [
280 |             {
281 |                 constants.ROLE: constants.ASSISTANT,
282 |                 constants.CONTENT: mcp_call_list.raw_content if mcp_call_list.raw_content else '',
283 |             }
284 |         ]
285 |         logger.debug(f"ID:{manager.id}, Shutdown messages prepared: {messages}")
286 |         return messages
287 |     else:
288 |         logger.info(f"ID:{manager.id}, Processing MCP call list with {len(mcp_call_list.mcps)} MCPs.")
289 |         mcp_list = mcp_call_list.mcps
290 |         messages = [
291 |             {
292 |                 constants.ROLE: constants.ASSISTANT,
293 |                 constants.CONTENT: mcp_call_list.raw_content if mcp_call_list.raw_content else '',
294 |                 constants.TOOL_CALLS: []
295 |             }
296 |         ]
297 |         result_str = ""
298 |         for idx, mcp in enumerate(mcp_list, start=1):
299 |             logger.debug(f"ID:{manager.id}, Processing MCP #{idx}: {mcp}")
300 |             mcp_server_name = mcp.mcp_server_name
301 |             mcp_tool_name = mcp.mcp_tool_name
302 |             mcp_args = mcp.mcp_args
303 | 
304 |             tool_call = {
305 |                 "type": "function",
306 |                 "function": {
307 |                     "name": mcp_tool_name,
308 |                     "arguments": json.dumps(mcp_args, ensure_ascii=False)
309 |                 }
310 |             }
311 |             messages[0][constants.TOOL_CALLS].append(tool_call)
312 |             logger.info(f"ID:{manager.id}, Calling MCP Server: {mcp_server_name}, Tool: {mcp_tool_name}, Arguments: {mcp_args}")
313 | 
314 |             # Manage manager.mcp_rts and manager.mcp_retry_times
315 |             from langProBe.evaluation import global_config
316 |             try:
317 |                 parsed_data = global_config
318 | 
319 |                 target_name = mcp_server_name
320 |                 port = None
321 |                 url = None
322 |                 for item in parsed_data.get("mcp_pool", []):
323 |                     if item.get("name") != target_name:
324 |                         continue
325 | 
326 |                     url = item.get("url", "")
327 |                     if url:
328 |                         logger.debug(f"ID:{manager.id}, Found URL for MCP Server '{target_name}': {url}")
329 |                         break
330 |                     run_configs = item.get("run_config", [])
331 |                     for config in run_configs:
332 |                         port = config.get("port")
333 |                         if port:
334 |                             url = f"http://localhost:{port}/sse"
335 |                             logger.debug(f"ID:{manager.id}, Constructed URL for MCP Server '{target_name}': {url}")
336 |                             break
337 |                     if url:
338 |                         break
339 | 
340 |                 if not url:
341 |                     logger.error(f"ID:{manager.id}, No valid URL found for MCP Server '{target_name}'.")
342 |                     raise ValueError(f"ID:{manager.id}, No valid URL found for MCP Server '{target_name}'.")
343 | 
344 |                 client = SyncedMcpClient(server_url=url)
345 |                 logger.debug(f"ID:{manager.id}, Initialized SyncedMcpClient with URL: {url}")
346 |                 client.list_tools()
347 |                 logger.debug(f"ID:{manager.id}, Retrieved tool list from MCP Server '{target_name}'.")
348 |             except Exception as e:
349 |                 logger.error(f"ID:{manager.id}, Failed to initialize SyncedMcpClient for server '{mcp_server_name}': {str(e)}")
350 |                 client = None
351 | 
352 |             if client:
353 |                 try:
354 |                     logger.debug(f"ID:{manager.id}, Calling tool '{mcp_tool_name}' with arguments: {mcp_args}")
355 |                     result = client.call_tool(mcp_tool_name, mcp_args)
356 |                     texts = [item.text for item in result.content]
357 |                     result_str_segment = ''.join(texts)
358 |                     logger.debug(f"ID:{manager.id}, Received result from tool '{mcp_tool_name}': {result_str_segment}")
359 | 
360 |                     logger.info(f"ID:{manager.id}, MCP Server '{mcp_server_name}' returned: {result_str_segment[:5000]}")
361 | 
362 |                     result_str += result_str_segment
363 |                 except Exception as e:
364 |                     logger.error(f"ID:{manager.id}, Error calling tool '{mcp_tool_name}' on MCP Server '{mcp_server_name}': {str(e)}")
365 |             else:
366 |                 logger.warning(f"ID:{manager.id}, Skipping tool call for '{mcp_tool_name}' due to client initialization failure.")
367 | 
368 |         messages.append({
369 |             constants.ROLE: constants.TOOL,
370 |             constants.CONTENT: result_str[:5000],
371 |         })
372 |         logger.debug(f"ID:{manager.id}, Final messages prepared: {messages}")
373 |         logger.info(f"ID:{manager.id}, mcp_calling completed successfully.")
374 |         return messages
375 | 
376 | class DotDict(dict):
377 |     def __getattr__(self, key):
378 |         try:
379 |             return self[key]
380 |         except KeyError:
381 |             raise AttributeError(
382 |                 f"'{type(self).__name__}' object has no attribute '{key}'"
383 |             )
384 | 
385 |     def __setattr__(self, key, value):
386 |         self[key] = value
387 | 
388 |     def __delattr__(self, key):
389 |         try:
390 |             del self[key]
391 |         except KeyError:
392 |             raise AttributeError(
393 |                 f"'{type(self).__name__}' object has no attribute '{key}'"
394 |             )
395 | 


--------------------------------------------------------------------------------
/langProBe/register_benchmark.py:
--------------------------------------------------------------------------------
 1 | ########################## Benchmarks ##########################
 2 | import importlib
 3 | 
 4 | 
 5 | # To use registered benchmarks, do
 6 | # `benchmark.benchmark, benchmark.programs, benchmark.metric`
 7 | registered_benchmarks = []
 8 | 
 9 | 
10 | def check_benchmark(benchmark):
11 |     try:
12 |         assert hasattr(benchmark, "benchmark")
13 |     except AssertionError:
14 |         return False
15 |     return True
16 | 
17 | 
18 | def register_benchmark(benchmark: str):
19 |     try:
20 |         # 尝试直接导入模块
21 |         benchmark_metas = importlib.import_module(benchmark, package="langProBe")
22 |     except ModuleNotFoundError:
23 |         # 如果直接导入失败，尝试使用完整路径导入
24 |         benchmark_metas = importlib.import_module(f"langProBe.{benchmark}", package=None)
25 |     
26 |     if check_benchmark(benchmark_metas):
27 |         registered_benchmarks.extend(benchmark_metas.benchmark)
28 |     else:
29 |         raise AssertionError(f"{benchmark} does not have the required attributes")
30 |     return benchmark_metas.benchmark
31 | 
32 | 
33 | def register_all_benchmarks(benchmarks):
34 |     for benchmark in benchmarks:
35 |         register_benchmark(benchmark)
36 |     return registered_benchmarks
37 | 


--------------------------------------------------------------------------------
/langProBe/synced_mcp_client.py:
--------------------------------------------------------------------------------
  1 | # teamwork_mcp/synced_mcp_client.py
  2 | import asyncio
  3 | import atexit
  4 | import logging
  5 | import pickle
  6 | from multiprocessing import Process, Queue, Lock
  7 | from typing import Any, Tuple, Dict
  8 | 
  9 | # 全局客户端实例和锁，确保全局唯一的客户端实例
 10 | _CLIENT_INSTANCE = None
 11 | _CLIENT_LOCK = Lock()
 12 | 
 13 | 
 14 | class SyncedMcpClient(Process):
 15 |     """
 16 |     A synchronous MCP client that runs the AsyncMCPClient in a separate process
 17 |     and communicates with it using multiprocessing Queues and pickle.
 18 |     """
 19 | 
 20 |     def __init__(self, server_url: str = None):
 21 |         super().__init__()
 22 |         # turn off logging from the logger of 'httpx'
 23 |         httpx_logger = logging.getLogger("httpx")
 24 |         httpx_logger.setLevel(logging.WARNING)
 25 | 
 26 |         self.server_url = server_url
 27 |         self.request_queue = Queue()
 28 |         self.response_queue = Queue()
 29 |         self.is_running = False
 30 |         self.daemon = True
 31 |         atexit.register(self.cleanup)
 32 | 
 33 |         # begin new process
 34 |         self.start()
 35 | 
 36 |     def run(self):
 37 |         """
 38 |         The main process function that runs the AsyncMCPClient in a separate process.
 39 |         """
 40 |         self.is_running = True
 41 |         asyncio.run(self._run_async_client())
 42 | 
 43 |     async def _run_async_client(self):
 44 |         """
 45 |         Runs the AsyncMCPClient and handles communication with the main process.
 46 |         """
 47 |         from .async_mcp_client import AsyncMCPClient
 48 | 
 49 |         client = AsyncMCPClient()
 50 |         await client.connect_to_sse_server(server_url=self.server_url)
 51 | 
 52 |         try:
 53 |             while self.is_running:
 54 |                 if not self.request_queue.empty():
 55 |                     request = self.request_queue.get()
 56 |                     if request == 'terminate':
 57 |                         break
 58 |                     try:
 59 |                         func_name, args, kwargs = pickle.loads(request)
 60 |                         func = getattr(client, func_name)
 61 |                         result = await func(*args, **kwargs)
 62 |                         self.response_queue.put(pickle.dumps(('success', result)))
 63 |                     except Exception as e:
 64 |                         self.response_queue.put(pickle.dumps(('error', str(e))))
 65 |                 await asyncio.sleep(0.01)
 66 | 
 67 |         except Exception as e:
 68 |             self.httpx_logger.exception(e)
 69 |             self.response_queue.put(pickle.dumps(('error', f"Client initialization error: {str(e)}")))
 70 | 
 71 |         finally:
 72 |             await client.cleanup()
 73 | 
 74 |     def _send_request(self, func_name: str, args: Tuple = (), kwargs: Dict = None) -> Any:
 75 |         """
 76 |         Sends a request to the async process and waits for the response.
 77 |         """
 78 |         if kwargs is None:
 79 |             kwargs = {}
 80 |         self.request_queue.put(pickle.dumps((func_name, args, kwargs)))
 81 |         response = self.response_queue.get(timeout=900)
 82 |         status, result = pickle.loads(response)
 83 |         if status == 'error':
 84 |             raise Exception(result)
 85 |         return result
 86 | 
 87 |     def call_tool(self, tool_name: str, tool_args: Dict = None) -> Any:
 88 |         """
 89 |         Calls a tool synchronously by sending a request to the async process.
 90 |         """
 91 |         return self._send_request('call_tool', args=(tool_name,), kwargs={'tool_args': tool_args})
 92 | 
 93 |     def get_prompt(self, name: str, arguments: dict[str, str] | None = None) -> Any:
 94 |         """
 95 |         Calls a tool synchronously by sending a request to the async process.
 96 |         """
 97 |         return self._send_request('get_prompt', args=(), kwargs={'name': name, 'arguments': arguments})
 98 | 
 99 |     def read_resource(self, uri) -> Any:
100 |         """
101 |         Calls a tool synchronously by sending a request to the async process.
102 |         """
103 |         return self._send_request('read_resource', args=(), kwargs={'uri': uri})
104 | 
105 |     def list_resources(self) -> Any:
106 |         return self._send_request('list_resources', args=(), kwargs={})
107 | 
108 |     def list_prompts(self) -> Any:
109 |         return self._send_request('list_prompts', args=(), kwargs={})
110 | 
111 | 
112 | 
113 |     def list_tools(self) -> Any:
114 |         """
115 |         Lists all available tools synchronously.
116 |         """
117 |         return self._send_request('list_tools', args=(), kwargs={})
118 | 
119 |     def process_query(self, query: str) -> Any:
120 |         """
121 |         Processes a query synchronously.
122 |         """
123 |         return self._send_request('process_query', args=(query,))
124 | 
125 | 
126 |     def cleanup(self):
127 |         """
128 |         Cleans up resources and terminates the process.
129 |         """
130 |         if self.is_running:
131 |             self.is_running = False
132 |             self.request_queue.put('terminate')
133 |             self.join(timeout=5)
134 |             if self.is_alive():
135 |                 self.terminate()
136 | # def synced_main():
137 | #     import time
138 | #     client = SyncedMcpClient(server_url="http://0.0.0.0:8080/sse")
139 | #     client.start()
140 | #     result = client.call_tool("get_alerts", {"state": "CA"})
141 | #     print(result)
142 | #     time.sleep(5)
143 | #
144 | #
145 | # if __name__ == "__main__":
146 | #     synced_main()


--------------------------------------------------------------------------------
/launch_mcps_as_sse.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # 检查是否提供了配置文件路径参数
 4 | if [ -z "$1" ]; then
 5 |   echo "Usage: $0 <config_file_path>"
 6 |   exit 1
 7 | fi
 8 | 
 9 | # 构造完整路径
10 | CONFIG_FILE="$1"
11 | if [[ ! "$CONFIG_FILE" == /* ]]; then
12 |   CONFIG_FILE="configs/$CONFIG_FILE"
13 | fi
14 | 
15 | # 检查配置文件是否存在
16 | if [[ ! -f "$CONFIG_FILE" ]]; then
17 |   echo "配置文件 '$CONFIG_FILE' 不存在。"
18 |   exit 1
19 | fi
20 | 
21 | # 读取 mcp_pool 数组的长度
22 | SERVER_COUNT=$(jq '.mcp_pool | length' "$CONFIG_FILE")
23 | 
24 | if [[ "$SERVER_COUNT" -eq 0 ]]; then
25 |   echo "mcp_pool 中未定义服务器。"
26 |   exit 1
27 | fi
28 | 
29 | # 遍历 mcp_pool 数组，启动每个服务器
30 | for (( i=0; i<SERVER_COUNT; i++ ))
31 | do
32 |   # 使用 jq 提取每个服务器的配置信息
33 |   SERVER=$(jq ".mcp_pool[$i]" "$CONFIG_FILE")
34 | 
35 |   NAME=$(echo "$SERVER" | jq -r '.name')
36 | 
37 |   # 检查是否存在 url 字段
38 |   URL=$(echo "$SERVER" | jq -r '.url // empty')
39 | 
40 |   if [[ -n "$URL" ]]; then
41 |     # 如果存在 url，则不运行 run_config，直接输出相关信息
42 |     echo "服务器 '$NAME' 已配置 URL: $URL，跳过运行命令。"
43 |   else
44 |     # 从 run_config 数组中提取 command、args 和 port
45 |     # 修改后的 args 提取逻辑：支持字符串或列表
46 |     COMMAND=$(echo "$SERVER" | jq -r '.run_config[] | select(.command) | .command')
47 |     ARGS=$(echo "$SERVER" | jq -r '.run_config[] | select(.args) | .args | if type == "array" then join(" ") else . end')
48 |     PORT=$(echo "$SERVER" | jq -r '.run_config[] | select(.port) | .port')
49 | 
50 |     # 从 tools 数组中提取 tool_name（假设第一个工具）
51 |     TOOL_NAME=$(echo "$SERVER" | jq -r '.tools[0].tool_name')
52 | 
53 |     # 由于配置中没有 tool_keyword，可以设置为空字符串或根据需要定义
54 |     TOOL_KEYWORD=""
55 | 
56 |     echo "启动服务器: $NAME on port $PORT"
57 | 
58 |     # 启动服务器并将其置于后台
59 |     npx -y supergateway \
60 |       --stdio "$ARGS $COMMAND" \
61 |       --port "$PORT" \
62 |       --baseUrl "http://localhost:$PORT" \
63 |       --ssePath /sse \
64 |       --messagePath /message \
65 |       --name "$TOOL_NAME" \
66 |       --keyword "$TOOL_KEYWORD" &
67 | 
68 |     PID=$!
69 |     echo "服务器 '$NAME' 已启动，PID: $PID"
70 |   fi
71 | done
72 | 
73 | wait
74 | 


--------------------------------------------------------------------------------
/mcpbench.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/MCPBench/85994f4e75c9f32f0f98ae88afd248ecc8beb490/mcpbench.pdf


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | dspy>=2.6
2 | mcp
3 | uv
4 | dashscope
5 | shortuuid
6 | anthropic


--------------------------------------------------------------------------------