├── __init__.py ├── doc ├── __init__.py ├── image_filesystem.jpg ├── tools_file_system.jpg ├── image-finance-agent.jpg ├── image-google-search.jpg ├── image_paypal_server.jpg ├── image_browser_puppeteer.jpg ├── image_google_map_server.jpg ├── model_performance_ast_subplots.png └── model_performance_pass_subplots.png ├── src ├── __init__.py └── mcp_tool_bench │ ├── __init__.py │ ├── agents │ ├── __init__.py │ ├── base_tool_call_agent │ │ ├── __init__.py │ │ ├── prompt.py │ │ └── check_functions.py │ └── data_generator_agent │ │ ├── __init__.py │ │ ├── utils │ │ ├── __init__.py │ │ ├── generate_query.py │ │ ├── pre_process.py │ │ └── prompt_reference.py │ │ └── run_data_generator.py │ ├── model_utils │ ├── __init__.py │ ├── model_provider.py │ ├── openai_api.py │ ├── kimi_api.py │ ├── custom_openai_api.py │ ├── claude_api.py │ ├── qwen_api.py │ └── base_api.py │ ├── common_utils.py │ ├── global_variables.py │ ├── evaluation │ └── evaluation_utils.py │ └── utils │ ├── count_tools.py │ └── calculate_metrics.py ├── mcp ├── config │ └── __init__.py └── tools │ └── browser │ └── puppeteer_puppeteer.json ├── data ├── file_system │ ├── test_project_root │ │ ├── data │ │ │ ├── README.md │ │ │ ├── test_file_txt_1.txt │ │ │ ├── test_file_csv_1.csv │ │ │ ├── test_file_json_1.json │ │ │ └── test_file_json_2.json │ │ ├── tests │ │ │ └── unit │ │ │ │ └── test_calculations.py │ │ ├── requirements.txt │ │ ├── docs │ │ │ └── README.md │ │ └── src │ │ │ ├── main.py │ │ │ ├── config │ │ │ └── settings.yaml │ │ │ └── utils │ │ │ └── file_utils.py │ └── filesystem_single_demo.json ├── finance │ └── finance_single_demo.json ├── search │ └── search_single_demo.json ├── browser │ └── browser_single_demo.json └── pay │ └── pay_single_demo.json ├── run.sh └── run.py /__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /doc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mcp/config/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data/file_system/test_project_root/data/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/base_tool_call_agent/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 
-------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/data_generator_agent/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/data_generator_agent/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /doc/image_filesystem.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/image_filesystem.jpg -------------------------------------------------------------------------------- /doc/tools_file_system.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/tools_file_system.jpg -------------------------------------------------------------------------------- /doc/image-finance-agent.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/image-finance-agent.jpg -------------------------------------------------------------------------------- /doc/image-google-search.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/image-google-search.jpg -------------------------------------------------------------------------------- /doc/image_paypal_server.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/image_paypal_server.jpg -------------------------------------------------------------------------------- /data/file_system/test_project_root/data/test_file_txt_1.txt: -------------------------------------------------------------------------------- 1 | Test file 1: This is a test file dedicated to the file system server. 
-------------------------------------------------------------------------------- /doc/image_browser_puppeteer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/image_browser_puppeteer.jpg -------------------------------------------------------------------------------- /doc/image_google_map_server.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/image_google_map_server.jpg -------------------------------------------------------------------------------- /doc/model_performance_ast_subplots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/model_performance_ast_subplots.png -------------------------------------------------------------------------------- /doc/model_performance_pass_subplots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/model_performance_pass_subplots.png -------------------------------------------------------------------------------- /data/file_system/test_project_root/data/test_file_csv_1.csv: -------------------------------------------------------------------------------- 1 | query,answer 2 | hello,hi 3 | How do you do?,"Doing well, thanks. How about yourself?" -------------------------------------------------------------------------------- /data/file_system/test_project_root/tests/unit/test_calculations.py: -------------------------------------------------------------------------------- 1 | import math 2 | def main(): 3 | print(math.cos(1)) 4 | if __name__ == "__main__": 5 | main() -------------------------------------------------------------------------------- /data/file_system/test_project_root/data/test_file_json_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "project_name": "mcp-tool-bench", 3 | "file_name": "test_file_json_1", 4 | "context": {"param1": "1", "param2": "2"}, 5 | "is_locked": false, 6 | } 7 | -------------------------------------------------------------------------------- /data/file_system/test_project_root/data/test_file_json_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "project_name": "mcp-tool-bench", 3 | "file_name": "test_file_json_2", 4 | "context": {"param3": "3", "param4": "4"}, 5 | "is_locked": false, 6 | } 7 | -------------------------------------------------------------------------------- /data/file_system/test_project_root/requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | python>=3.8.0 3 | pathlib2>=2.3.0 4 | typing-extensions>=4.0.0 5 | tqdm>=4.64.0 6 | 7 | # File system operations 8 | shutil>=1.7.0 9 | watchdog>=2.1.0 10 | pyfilesystem2>=2.4.0 11 | 12 | # Data processing 13 | numpy>=1.21.0 14 | pandas>=1.3.0 15 | json5>=0.9.0 16 | 17 | # Security 18 | pycryptodome>=3.15.0 19 | bcrypt>=3.2.0 20 | 21 | # Testing 22 | pytest>=6.2.0 23 | pytest-cov>=2.12.0 24 | 25 | # Development tools 26 | black>=21.7b0 27 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/common_utils.py: -------------------------------------------------------------------------------- 1 | def 
add_conflict_toolname(tool_name, server_name): 2 | tool_name_register = tool_name + "__" + server_name 3 | return tool_name_register 4 | 5 | def get_conflict_toolname_original(tool_name, server_name): 6 | """ 7 | {server_name}__{tool_name} 8 | """ 9 | tool_name_norm = tool_name 10 | if tool_name.startswith(server_name): 11 | tool_name_norm = tool_name.split("__")[-1] if len(tool_name.split("__")) > 0 else tool_name 12 | return tool_name_norm 13 | -------------------------------------------------------------------------------- /data/file_system/test_project_root/docs/README.md: -------------------------------------------------------------------------------- 1 | # Function Calling Evaluation Project 2 | 3 | This is a test project for file system mcp server. 4 | 5 | ## Project Structure 6 | 7 | ``` 8 | # Project Structure 9 | project_root/ 10 | ├── src/ 11 | │ ├── main.py 12 | │ ├── utils/ 13 | │ │ └── file_utils.py 14 | │ └── config/ 15 | │ └── settings.yaml 16 | ├── tests/ 17 | │ ├── unit/ 18 | │ │ └── test_calculations.py 19 | ├── docs/ 20 | │ └── README.md 21 | ├── data/ 22 | │ ├── test_file_csv_1.csv 23 | │ ├── test_file_txt_1.txt 24 | │ └── test_file_json_1.json 25 | │ └── test_file_json_2.json 26 | └── requirements.txt 27 | ``` 28 | -------------------------------------------------------------------------------- /data/file_system/test_project_root/src/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | import logging 5 | from typing import Optional 6 | 7 | def setup_logging(log_file: Optional[str] = None) -> None: 8 | """Initialize logging configuration""" 9 | logging.basicConfig( 10 | level=logging.INFO, 11 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', 12 | handlers=[ 13 | logging.FileHandler(log_file) if log_file else logging.StreamHandler() 14 | ] 15 | ) 16 | 17 | def main() -> None: 18 | """Main entry point for the application""" 19 | try: 20 | # Initialize logging 21 | setup_logging() 22 | 23 | logger = logging.getLogger(__name__) 24 | logger.info("Starting application...") 25 | 26 | # Your application logic here 27 | logger.info("Application initialized successfully") 28 | 29 | except Exception as e: 30 | logging.error(f"Application failed: {str(e)}") 31 | sys.exit(1) 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /data/file_system/test_project_root/src/config/settings.yaml: -------------------------------------------------------------------------------- 1 | # Application basic configuration 2 | app: 3 | name: "MyApplication" 4 | version: "1.0.0" 5 | debug: false 6 | secret_key: "django-insecure-5^s!w8z$1q#d%6^h&y2*" 7 | allowed_hosts: 8 | - "localhost" 9 | - "127.0.0.1" 10 | - "example.com" 11 | 12 | # Database configuration 13 | database: 14 | default: 15 | engine: "django.db.backends.postgresql" 16 | name: "app_db" 17 | user: "db_user" 18 | password: "db_password_123" 19 | host: "localhost" 20 | port: 5432 21 | options: 22 | sslmode: "prefer" 23 | 24 | # Cache configuration 25 | cache: 26 | default: "redis://localhost:6379/0" 27 | session: "redis://localhost:6379/1" 28 | timeout: 300 29 | 30 | # Logging configuration 31 | logging: 32 | level: "INFO" 33 | handlers: 34 | file: 35 | path: "/var/log/app.log" 36 | max_size: "10MB" 37 | backup_count: 5 38 | console: 39 | enabled: true 40 | 41 | # Email configuration 42 | email: 43 | backend: 
"django.core.mail.backends.smtp.EmailBackend" 44 | host: "smtp.example.com" 45 | port: 587 46 | use_tls: true 47 | username: "user@example.com" 48 | password: "email_password_123" 49 | default_from: "noreply@example.com" 50 | 51 | # Third-party services configuration 52 | services: 53 | payment: 54 | api_key: "pay_sk_test_1234567890" 55 | webhook_secret: "whsec_0987654321" 56 | analytics: 57 | enabled: true 58 | api_key: "analytics_key_abcdef" 59 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/global_variables.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, List, Any, Optional 3 | from pydantic_settings import BaseSettings, SettingsConfigDict 4 | 5 | class Settings(BaseSettings): 6 | 7 | QWEN_API_KEY: Optional[str] = None 8 | OPENAI_API_KEY: Optional[str] = None 9 | ANTHROPIC_API_KEY: Optional[str] = None 10 | GOOGLE_API_KEY: Optional[str] = None 11 | MISTRAL_API_KEY: Optional[str] = None 12 | KIMI_API_KEY: Optional[str] = None 13 | 14 | # Custom OpenAI-compatible API settings 15 | CUSTOM_OPENAI_API_KEY: Optional[str] = None 16 | CUSTOM_OPENAI_BASE_URL: Optional[str] = None 17 | 18 | model_config = SettingsConfigDict( 19 | env_file=".env", 20 | env_file_encoding="utf-8", 21 | extra="ignore" 22 | ) 23 | 24 | settings = Settings() 25 | 26 | ## Model Name Enum 27 | # Claude 28 | MODEL_SELECTION_CLAUDE_OPUS_4 = "claude-opus-4" 29 | MODEL_SELECTION_CLAUDE_37 = "claude-3-7-sonnet-20250219" 30 | # OpenAI 31 | MODEL_SELECTION_GPT4O = "gpt-4o" 32 | # Gemini 33 | MODEL_SELECTION_GEMINI_25_FLASH = "gemini-2.5-flash" 34 | # Qwen 35 | MODEL_SELECTION_QWEN25_MAX = "qwen-max" # latest update to Qwen2.5 36 | MODEL_SELECTION_QWEN3_PLUS = "qwen-plus" 37 | MODEL_SELECTION_QWEN3_TURBO = "qwen-turbo" 38 | MODEL_SELECTION_QWEN3_235B = "qwen3-235b-a22b-instruct-2507" 39 | MODEL_SELECTION_QWEN3_CODER = "qwen3-coder-plus" 40 | # Deepseek 41 | MODEL_SELECTION_DEEPSEEK_R1 = "deepseek-r1" 42 | # Kimi 43 | MODEL_SELECTION_KIMI_K2 = "kimi-k2-0711-preview" 44 | 45 | ## Constant KEY 46 | KEY_MCP_TOOLS_DICT = "mcp_tools_dict" 47 | KEY_BASE_COMPARE_FUNC = "base_compare_func" 48 | KEY_COMPLETION = "completion" 49 | KEY_REASON_CONTENT = "reason" 50 | KEY_FUNCTION_CALL = "function_call" 51 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #### Run 2 | 3 | ### start server Enable MCP Tool Client 4 | cd ./mcp/mcp-marketplace/app/mcp_tool_use 5 | uvicorn src.app:app --port 5000 6 | 7 | ### Test Run Demo 8 | python3 run.py --stage tool_call --input_file ./data/browser/browser_single_demo.json --category browser --model qwen3-max --pass_k 1,3 --evaluation_trial_per_task 5 9 | 10 | ## OpenAI 11 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model gpt-4o --pass_k 1,3 12 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model gpt-4.1 --pass_k 1,3 13 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model o3 --pass_k 1,3 14 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model o3-pro --pass_k 1,3 15 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model o4-mini 
--pass_k 1,3 16 | 17 | ### Claude API 18 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model claude-opus-4-20250514 --pass_k 1,3 19 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model claude-sonnet-4-20250514 --pass_k 1,3 20 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model claude-3-7-sonnet-20250219 --pass_k 1,3 21 | 22 | ### Qwen API 23 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model qwen3-max --pass_k 1,3 24 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model qwen3-plus --pass_k 1,3 25 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/model_provider.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any 2 | 3 | from ..global_variables import * 4 | from .qwen_api import QwenModelAPIProvider 5 | from .kimi_api import KimiModelAPIProvider 6 | from .claude_api import ClaudeModelAPIProvider 7 | from .openai_api import OpenAIModelAPIProvider 8 | from .custom_openai_api import CustomOpenAIAPIProvider 9 | 10 | _global_model_provider: Dict[str, Any] = {} 11 | 12 | ## CLAUDE 13 | if settings.ANTHROPIC_API_KEY: 14 | _global_model_provider[MODEL_SELECTION_CLAUDE_37] = ClaudeModelAPIProvider(MODEL_SELECTION_CLAUDE_37) 15 | _global_model_provider[MODEL_SELECTION_CLAUDE_OPUS_4] = ClaudeModelAPIProvider(MODEL_SELECTION_CLAUDE_OPUS_4) 16 | 17 | ## OPENAI 18 | if settings.OPENAI_API_KEY: 19 | _global_model_provider[MODEL_SELECTION_GPT4O] = OpenAIModelAPIProvider(MODEL_SELECTION_GPT4O) 20 | 21 | ## QWEN 22 | if settings.QWEN_API_KEY: 23 | _global_model_provider[MODEL_SELECTION_QWEN25_MAX] = QwenModelAPIProvider(MODEL_SELECTION_QWEN25_MAX) 24 | _global_model_provider[MODEL_SELECTION_QWEN3_PLUS] = QwenModelAPIProvider(MODEL_SELECTION_QWEN3_PLUS) 25 | _global_model_provider[MODEL_SELECTION_QWEN3_TURBO] = QwenModelAPIProvider(MODEL_SELECTION_QWEN3_TURBO) 26 | _global_model_provider[MODEL_SELECTION_QWEN3_235B] = QwenModelAPIProvider(MODEL_SELECTION_QWEN3_235B) 27 | _global_model_provider[MODEL_SELECTION_QWEN3_CODER] = QwenModelAPIProvider(MODEL_SELECTION_QWEN3_CODER) 28 | 29 | ## KIMI 30 | if settings.KIMI_API_KEY: 31 | _global_model_provider[MODEL_SELECTION_KIMI_K2] = KimiModelAPIProvider(MODEL_SELECTION_KIMI_K2) 32 | 33 | 34 | def get_model_provider(model: str): 35 | """ 36 | Get or create a model provider for the given model. 37 | If the model exists in _global_model_provider, return it. 38 | Otherwise, try to create a CustomOpenAI provider if custom settings are available. 
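    Example (hypothetical usage, assuming the relevant API keys are set in .env):
        provider = get_model_provider("gpt-4o")
        result = provider.api_chat(messages) if provider else {}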
39 | 40 | Args: 41 | model: The model name to get/create provider for 42 | 43 | Returns: 44 | Model provider instance or None if not available 45 | """ 46 | # Check if model already exists in global provider 47 | if model in _global_model_provider: 48 | return _global_model_provider[model] 49 | 50 | # If custom OpenAI settings are available, create a dynamic provider 51 | if settings.CUSTOM_OPENAI_BASE_URL and settings.CUSTOM_OPENAI_API_KEY: 52 | # Create a new CustomOpenAI provider with the requested model name 53 | provider = CustomOpenAIAPIProvider( 54 | model_name=model, 55 | base_url=settings.CUSTOM_OPENAI_BASE_URL, 56 | api_key=settings.CUSTOM_OPENAI_API_KEY 57 | ) 58 | # Cache it for future use 59 | _global_model_provider[model] = provider 60 | return provider 61 | 62 | return None 63 | -------------------------------------------------------------------------------- /data/finance/finance_single_demo.json: -------------------------------------------------------------------------------- 1 | [{"uuid":"c0720051-d504-4754-81eb-184f96a922eb","category":"finance","call_type":"single","tools":[{"name":"get_stock_price_global_market","description":" Get Public Available Stock Symbols from Global Marketplace\n\n Args:\n symbol_list (List): List of Symbols, such as Tencent: 700, Kuaishou: 1024, Tesla (TSLA), Microsoft(MSFT), Google (GOOG), London Stock Exchange Market, Shell (quote: SHEL), Unilever (quote: ULVR)\n market (str): \"HK\", \"CN_MAINLAND\", \"US\", \"LSE\", \"NSE_INDIA\", etc.\n \n Return: \n str: json str with below values samples\n\n [{'symbol': 'SH600036',\n 'current': 45.78,\n 'percent': 1.33,\n 'chg': 0.6,\n 'high': '45.81 CNY',\n 'low': '44.95 CNY',\n 'avg_price': '45.485751910823915 CNY',\n 'timestamp': 1750057200000,\n 'open': 45.08,\n 'last_close': 45.18,\n 'market_capital': 1154564531614.0,\n 'change': '0.6(1.33%)',\n 'previous_close': '45.18 CNY',\n 'market_capitalization': '11545.65 亿 CNY',\n 'pe_ratio': '',\n 'update_time': '2025-06-16 15:00:00',\n 'source': 'XUEQIU.COM, https://xueqiu.com/S/SH600036',\n 'data_source': 'xueqiu.com',\n 'source_url': 'https://xueqiu.com/S/SH600036'},\n ","input_schema":{"properties":{"symbol_list":{"items":{"type":"string"},"title":"Symbol List","type":"array"},"market":{"title":"Market","type":"string"}},"required":["symbol_list","market"],"title":"get_stock_price_global_marketArguments","type":"object"}}],"mcp_tools_dict":{"finance-agent-mcp-server":["get_stock_price_global_market"]},"query":"What is the current stock price of Tesla in the US market?","function_call_label":[{"name":"get_stock_price_global_market","step":"1","id":"1","mcp_server":"finance-agent-mcp-server","similar_tools":[],"input":{"symbol_list":["TSLA"],"market":"US"},"output":{"status_code":200,"result":{}}}]},{"uuid":"eef90151-02de-446c-94b1-da330b9b26c6","category":"finance","call_type":"single","tools":[{"name":"get_stock_price_global_market","description":" Get Public Available Stock Symbols from Global Marketplace\n\n Args:\n symbol_list (List): List of Symbols, such as Tencent: 700, Kuaishou: 1024, Tesla (TSLA), Microsoft(MSFT), Google (GOOG), London Stock Exchange Market, Shell (quote: SHEL), Unilever (quote: ULVR)\n market (str): \"HK\", \"CN_MAINLAND\", \"US\", \"LSE\", \"NSE_INDIA\", etc.\n \n Return: \n str: json str with below values samples\n\n [{'symbol': 'SH600036',\n 'current': 45.78,\n 'percent': 1.33,\n 'chg': 0.6,\n 'high': '45.81 CNY',\n 'low': '44.95 CNY',\n 'avg_price': '45.485751910823915 CNY',\n 'timestamp': 1750057200000,\n 'open': 45.08,\n 
'last_close': 45.18,\n 'market_capital': 1154564531614.0,\n 'change': '0.6(1.33%)',\n 'previous_close': '45.18 CNY',\n 'market_capitalization': '11545.65 亿 CNY',\n 'pe_ratio': '',\n 'update_time': '2025-06-16 15:00:00',\n 'source': 'XUEQIU.COM, https://xueqiu.com/S/SH600036',\n 'data_source': 'xueqiu.com',\n 'source_url': 'https://xueqiu.com/S/SH600036'},\n ","input_schema":{"properties":{"symbol_list":{"items":{"type":"string"},"title":"Symbol List","type":"array"},"market":{"title":"Market","type":"string"}},"required":["symbol_list","market"],"title":"get_stock_price_global_marketArguments","type":"object"}}],"mcp_tools_dict":{"finance-agent-mcp-server":["get_stock_price_global_market"]},"query":"What is the current stock price and market capitalization of Shell in the London Stock Exchange market?","function_call_label":[{"name":"get_stock_price_global_market","step":"1","id":"1","mcp_server":"finance-agent-mcp-server","similar_tools":[],"input":{"symbol_list":["SHEL"],"market":"LSE"},"output":{"status_code":200,"result":{}}}]}] 2 | -------------------------------------------------------------------------------- /data/file_system/test_project_root/src/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import hashlib 4 | from pathlib import Path 5 | from typing import Union, List, Optional 6 | 7 | def read_file(file_path: Union[str, Path]) -> str: 8 | """Read content from a text file. 9 | 10 | Args: 11 | file_path: Path to the file to read 12 | 13 | Returns: 14 | Content of the file as string 15 | 16 | Raises: 17 | FileNotFoundError: If file doesn't exist 18 | IOError: If there are permission issues 19 | """ 20 | with open(file_path, 'r', encoding='utf-8') as f: 21 | return f.read() 22 | 23 | def write_file(file_path: Union[str, Path], content: str, overwrite: bool = False) -> None: 24 | """Write content to a file. 25 | 26 | Args: 27 | file_path: Path to write to 28 | content: Content to write 29 | overwrite: Whether to overwrite existing file 30 | 31 | Raises: 32 | FileExistsError: If file exists and overwrite=False 33 | IOError: For permission issues 34 | """ 35 | if os.path.exists(file_path) and not overwrite: 36 | raise FileExistsError(f"File {file_path} already exists") 37 | 38 | with open(file_path, 'w', encoding='utf-8') as f: 39 | f.write(content) 40 | 41 | def get_file_hash(file_path: Union[str, Path], algorithm: str = 'sha256') -> str: 42 | """Calculate file hash. 43 | 44 | Args: 45 | file_path: Path to file 46 | algorithm: Hash algorithm (md5, sha1, sha256) 47 | 48 | Returns: 49 | Hex digest of file content 50 | """ 51 | hash_func = getattr(hashlib, algorithm)() 52 | with open(file_path, 'rb') as f: 53 | for chunk in iter(lambda: f.read(4096), b''): 54 | hash_func.update(chunk) 55 | return hash_func.hexdigest() 56 | 57 | def ensure_dir_exists(dir_path: Union[str, Path]) -> None: 58 | """Ensure directory exists, create if not. 59 | 60 | Args: 61 | dir_path: Path to directory 62 | """ 63 | os.makedirs(dir_path, exist_ok=True) 64 | 65 | def list_files(dir_path: Union[str, Path], recursive: bool = False) -> List[str]: 66 | """List files in directory. 
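    Note: when recursive is False the entries are bare file names from dir_path,
    while recursive=True yields paths joined with their containing directories.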
67 | 68 | Args: 69 | dir_path: Directory to scan 70 | recursive: Whether to scan recursively 71 | 72 | Returns: 73 | List of file paths 74 | """ 75 | if recursive: 76 | return [os.path.join(root, f) 77 | for root, _, files in os.walk(dir_path) 78 | for f in files] 79 | return [f for f in os.listdir(dir_path) 80 | if os.path.isfile(os.path.join(dir_path, f))] 81 | 82 | def safe_delete(file_path: Union[str, Path]) -> bool: 83 | """Safely delete a file if it exists. 84 | 85 | Args: 86 | file_path: Path to file 87 | 88 | Returns: 89 | True if file was deleted, False if it didn't exist 90 | """ 91 | try: 92 | os.remove(file_path) 93 | return True 94 | except FileNotFoundError: 95 | return False 96 | 97 | def copy_file(src: Union[str, Path], dst: Union[str, Path], overwrite: bool = False) -> None: 98 | """Copy file from source to destination. 99 | 100 | Args: 101 | src: Source file path 102 | dst: Destination file path 103 | overwrite: Whether to overwrite existing file 104 | 105 | Raises: 106 | FileExistsError: If destination exists and overwrite=False 107 | """ 108 | if os.path.exists(dst) and not overwrite: 109 | raise FileExistsError(f"File {dst} already exists") 110 | shutil.copy2(src, dst) 111 | 112 | def get_file_size(file_path: Union[str, Path]) -> int: 113 | """Get file size in bytes. 114 | 115 | Args: 116 | file_path: Path to file 117 | 118 | Returns: 119 | File size in bytes 120 | """ 121 | return os.path.getsize(file_path) 122 | 123 | def is_same_file(file1: Union[str, Path], file2: Union[str, Path]) -> bool: 124 | """Check if two files are identical by comparing hashes. 125 | 126 | Args: 127 | file1: First file path 128 | file2: Second file path 129 | 130 | Returns: 131 | True if files have same content 132 | """ 133 | return get_file_hash(file1) == get_file_hash(file2) 134 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/data_generator_agent/utils/generate_query.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Any 2 | from utils.prompt import user_prompt_template_generate_query, system_prompt_template_generate_query, system_prompt_template_generate_query_for_single_tool, system_prompt_template_generate_query_for_filesystem 3 | from utils.prompt_reference import candidate_reference_list, special_needs_description_list 4 | import json 5 | from src.mcp_tool_bench.model_utils.model_provider import _global_model_provider 6 | from src.mcp_tool_bench.global_variables import * 7 | import html 8 | import re 9 | from bs4 import BeautifulSoup 10 | from tqdm import tqdm 11 | 12 | def decode_html_entities(s): 13 | """Decode HTML entities""" 14 | # Try using html.unescape 15 | first_decode = html.unescape(s) 16 | # Check if still contains undecoded entities 17 | if "&" in first_decode and ";" in first_decode: 18 | # Use BeautifulSoup for further decoding 19 | soup = BeautifulSoup(first_decode, "html.parser") 20 | second_decode = soup.get_text() 21 | return second_decode 22 | return first_decode 23 | 24 | def auto_fix_unclosed_quotes(data): 25 | """ 26 | Automatically detect and fix unclosed quotes in strings. 
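    List inputs are returned unchanged; string inputs additionally get
    "key:value" pairs normalized to "key: value" before the quote check.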
27 | """ 28 | if isinstance(data, list): 29 | return data 30 | 31 | """ 32 | Automatically add space after colon in key-value pairs, e.g., convert 'key:value' to 'key: value' 33 | """ 34 | # Use regex to match cases where colon is not followed by space, and add space 35 | data = re.sub(r'(?m)^(\s*[^#\s][^:]*):([^\s])', r'\1: \2', data) 36 | 37 | lines = data.split("\n") 38 | fixed_lines = [] 39 | for line in lines: 40 | # Detect and fix unclosed quotes 41 | if line.count('"') % 2 != 0: 42 | line = line + '"' # Append a quote to close 43 | fixed_lines.append(line) 44 | return "\n".join(fixed_lines) 45 | 46 | def process_response(response_text): 47 | """Process GPT response text""" 48 | if not response_text: 49 | return "" 50 | 51 | raw_val = decode_html_entities(response_text) 52 | raw_val = auto_fix_unclosed_quotes(raw_val) 53 | decoded_json_str = html.unescape(raw_val) 54 | decoded_json_str = decoded_json_str.replace("```json\n", "").replace("```", "").replace("\n", "") 55 | return decoded_json_str 56 | 57 | def generate_query_and_function_calls(extraction_results: List[List[Dict]], category: str) -> List[Dict]: 58 | """ 59 | Generate user questions and tool call examples based on extracted tools list 60 | 61 | Args: 62 | extraction_results: Tools extraction result list 63 | category: Data category 64 | 65 | Returns: 66 | List[Dict]: Generated data list 67 | """ 68 | # gpt_api = GPTAPI() 69 | generated_data = [] 70 | lang = "English" 71 | # Add progress bar for processing extraction results 72 | for tools_list in tqdm(extraction_results, desc="Generating queries and function calls", unit="tool_list"): 73 | # Here should call GPT API to generate query and function_call_label 74 | 75 | user_prompt = user_prompt_template_generate_query.format(tools=tools_list) 76 | # system_prompt = system_prompt_template_generate_query_for_filesystem.format(candidate_count=5, language=lang) 77 | candidate_reference = candidate_reference_list.get(category, {}) 78 | special_needs_description = special_needs_description_list.get(category, "") 79 | system_prompt = system_prompt_template_generate_query.format(candidate_count=10, language=lang, candidate_reference=candidate_reference, special_needs_description=special_needs_description) 80 | 81 | messages = [ 82 | { 83 | "role": "system", 84 | "content": system_prompt 85 | }, 86 | { 87 | "role": "user", 88 | "content": user_prompt 89 | } 90 | ] 91 | # print("messages: ", messages) 92 | model_provider = _global_model_provider[MODEL_SELECTION_GPT4O_ANT] if MODEL_SELECTION_GPT4O_ANT in _global_model_provider else None 93 | output = model_provider.api_chat(messages, wait_time=5) if model_provider is not None else {} 94 | print("output: ", output) 95 | raw_response = output[KEY_COMPLETION] if KEY_COMPLETION in output else "" 96 | 97 | # Normal chat: process string 98 | if isinstance(raw_response, str): 99 | result = process_response(raw_response) 100 | # result = gpt_api.call(user_prompt, system_prompt, wait_time=10) 101 | 102 | print("result: ", result) 103 | print("type(result): ", type(result)) 104 | 105 | try: 106 | result = json.loads(result) 107 | generated_data.append(result) 108 | except Exception as e: 109 | # logging.error(f"Error processing response: {e}") 110 | continue 111 | 112 | return generated_data 113 | -------------------------------------------------------------------------------- /mcp/tools/browser/puppeteer_puppeteer.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "puppeteer/puppeteer", 3 | 
"server_name": "puppeteer", 4 | "content_name": "puppeteer/puppeteer", 5 | "website": null, 6 | "content": null, 7 | "abstract": null, 8 | "field": "MCP SERVER", 9 | "subfield": "MCP SERVER", 10 | "category": "BROWSER", 11 | "publisher_id": null, 12 | "thumbnail_picture": null, 13 | "github": null, 14 | "mcp_server_config": null, 15 | "tools": [ 16 | { 17 | "name": "puppeteer_navigate", 18 | "description": "Navigate to a URL", 19 | "input_schema": { 20 | "type": "object", 21 | "properties": { 22 | "url": { 23 | "type": "string", 24 | "description": "URL to navigate to" 25 | }, 26 | "launchOptions": { 27 | "type": "object", 28 | "description": "PuppeteerJS LaunchOptions. Default null. If changed and not null, browser restarts. Example: { headless: true, args: ['--no-sandbox'] }" 29 | }, 30 | "allowDangerous": { 31 | "type": "boolean", 32 | "description": "Allow dangerous LaunchOptions that reduce security. When false, dangerous args like --no-sandbox will throw errors. Default false." 33 | } 34 | }, 35 | "required": [ 36 | "url" 37 | ] 38 | } 39 | }, 40 | { 41 | "name": "puppeteer_screenshot", 42 | "description": "Take a screenshot of the current page or a specific element", 43 | "input_schema": { 44 | "type": "object", 45 | "properties": { 46 | "name": { 47 | "type": "string", 48 | "description": "Name for the screenshot" 49 | }, 50 | "selector": { 51 | "type": "string", 52 | "description": "CSS selector for element to screenshot" 53 | }, 54 | "width": { 55 | "type": "number", 56 | "description": "Width in pixels (default: 800)" 57 | }, 58 | "height": { 59 | "type": "number", 60 | "description": "Height in pixels (default: 600)" 61 | }, 62 | "encoded": { 63 | "type": "boolean", 64 | "description": "If true, capture the screenshot as a base64-encoded data URI (as text) instead of binary image content. Default false." 
65 | } 66 | }, 67 | "required": [ 68 | "name" 69 | ] 70 | } 71 | }, 72 | { 73 | "name": "puppeteer_click", 74 | "description": "Click an element on the page", 75 | "input_schema": { 76 | "type": "object", 77 | "properties": { 78 | "selector": { 79 | "type": "string", 80 | "description": "CSS selector for element to click" 81 | } 82 | }, 83 | "required": [ 84 | "selector" 85 | ] 86 | } 87 | }, 88 | { 89 | "name": "puppeteer_fill", 90 | "description": "Fill out an input field", 91 | "input_schema": { 92 | "type": "object", 93 | "properties": { 94 | "selector": { 95 | "type": "string", 96 | "description": "CSS selector for input field" 97 | }, 98 | "value": { 99 | "type": "string", 100 | "description": "Value to fill" 101 | } 102 | }, 103 | "required": [ 104 | "selector", 105 | "value" 106 | ] 107 | } 108 | }, 109 | { 110 | "name": "puppeteer_select", 111 | "description": "Select an element on the page with Select tag", 112 | "input_schema": { 113 | "type": "object", 114 | "properties": { 115 | "selector": { 116 | "type": "string", 117 | "description": "CSS selector for element to select" 118 | }, 119 | "value": { 120 | "type": "string", 121 | "description": "Value to select" 122 | } 123 | }, 124 | "required": [ 125 | "selector", 126 | "value" 127 | ] 128 | } 129 | }, 130 | { 131 | "name": "puppeteer_hover", 132 | "description": "Hover an element on the page", 133 | "input_schema": { 134 | "type": "object", 135 | "properties": { 136 | "selector": { 137 | "type": "string", 138 | "description": "CSS selector for element to hover" 139 | } 140 | }, 141 | "required": [ 142 | "selector" 143 | ] 144 | } 145 | }, 146 | { 147 | "name": "puppeteer_evaluate", 148 | "description": "Execute JavaScript in the browser console", 149 | "input_schema": { 150 | "type": "object", 151 | "properties": { 152 | "script": { 153 | "type": "string", 154 | "description": "JavaScript code to execute" 155 | } 156 | }, 157 | "required": [ 158 | "script" 159 | ] 160 | } 161 | } 162 | ], 163 | "description": "This MCP (Model Context Protocol) is a Puppeteer-based browser automation tool that provides web browsing, screenshot capture, element interaction (clicking, filling, selecting, hovering), and JavaScript execution capabilities." 
164 | } 165 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/openai_api.py: -------------------------------------------------------------------------------- 1 | # #!/usr/bin/python 2 | # # -*- coding: UTF-8 -*- 3 | import json 4 | import logging 5 | import requests 6 | from typing import List, Dict, Any, Optional 7 | import os 8 | import sys 9 | import openai 10 | from openai import OpenAI 11 | 12 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 13 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../../..'))) 14 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../..'))) 15 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../'))) 16 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, './'))) 17 | 18 | from src.mcp_tool_bench.model_utils.base_api import * 19 | from src.mcp_tool_bench.global_variables import settings 20 | 21 | def tools_openai_wrapper(tools): 22 | tools_wrapped = [{ 23 | "type": "function", 24 | "function":{ 25 | "name": tool["name"] if "name" in tool else "", 26 | "description": tool["description"] if "description" in tool else "", 27 | "parameters": tool["input_schema"] if "input_schema" in tool else {} 28 | } 29 | } for tool in tools] 30 | return tools_wrapped 31 | 32 | class OpenAIModelAPIProvider(BaseModelAPIProvider): 33 | """ 34 | OpenAI API for chat and function calling. 35 | https://platform.openai.com/docs/api-reference/chat 36 | """ 37 | def __init__(self, model_name: str = ""): 38 | super().__init__(model_name) 39 | self.client = OpenAI( 40 | api_key=settings.OPENAI_API_KEY, 41 | base_url="https://api.openai.com/v1" 42 | ) 43 | 44 | def api_chat(self, messages: List, **kwargs) -> Dict[str, Any]: 45 | """ 46 | OpenAI chat completion. 47 | """ 48 | try: 49 | model = self.model_name 50 | if not model: 51 | model = "gpt-4o" 52 | 53 | response = self.client.chat.completions.create( 54 | model=model, 55 | messages=messages, 56 | temperature=kwargs.get("temperature", 0.3) 57 | ) 58 | completion, reasoningContent = post_process_openai_chat_response(response) 59 | result = { 60 | KEY_FUNCTION_CALL: {}, 61 | KEY_COMPLETION: completion, 62 | KEY_REASON_CONTENT: reasoningContent 63 | } 64 | return result 65 | 66 | except Exception as e: 67 | logging.error(f"Failed to process OpenAI api_chat: {e}") 68 | return {} 69 | 70 | def api_function_call(self, messages: List, tools: List, **kwargs) -> Dict[str, Any]: 71 | """ 72 | OpenAI function calling (tool calling). 
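        Note: only the first tool call in the model response is extracted and mapped.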
73 |         Args:
74 |             messages: List of message [{}, {}]
75 |             tools: List of tool definitions [{type: "function", function: {name: "", description: "", parameters: {}}}]
76 |         """
77 |         try:
78 |             model = self.model_name
79 |             if not model:
80 |                 model = "gpt-4o"
81 |             response = self.client.chat.completions.create(
82 |                 model=model,
83 |                 messages=messages,
84 |                 tools=tools,
85 |                 tool_choice="auto",
86 |                 temperature=kwargs.get("temperature", 0.3),
87 |                 **kwargs
88 |             )
89 |             tool_result = post_process_openai_function_call_response(response)
90 |             tool_call_mapped, completion, reasoningContent = function_call_result_common_mapper(tool_result)
91 | 
92 |             result = {
93 |                 KEY_FUNCTION_CALL: tool_call_mapped,
94 |                 KEY_COMPLETION: "",
95 |                 KEY_REASON_CONTENT: ""
96 |             }
97 |             return result
98 | 
99 |         except Exception as e:
100 |             logging.error(f"Failed to process OpenAI api_function_call: {e}")
101 |             return {}
102 | 
103 | def post_process_openai_chat_response(response):
104 |     """
105 |     Processes the response from OpenAI chat completion.
106 |     """
107 |     if response is None or not response.choices:
108 |         return "", ""
109 |     completion_content = ""
110 |     if response.choices[0].message.content:
111 |         completion_content = response.choices[0].message.content
112 |     return completion_content, ""
113 | 
114 | def post_process_openai_function_call_response(response):
115 |     """
116 |     Processes the response from OpenAI for function calls.
117 |     Extracts the tool call details.
118 |     """
119 |     if response is None or not response.choices or not response.choices[0].message:
120 |         return {}
121 | 
122 |     try:
123 |         message = response.choices[0].message
124 |         if message.tool_calls:
125 |             first_tool_call = message.tool_calls[0]
126 |             if first_tool_call.type == "function" and first_tool_call.function:
127 |                 tool_call = {
128 |                     "id": first_tool_call.id,
129 |                     "function": {
130 |                         "name": first_tool_call.function.name,
131 |                         "arguments": first_tool_call.function.arguments
132 |                     }
133 |                 }
134 |                 return tool_call
135 |         return {}
136 |     except Exception as e:
137 |         print (f"Failed to post_process_openai_function_call_response error {e}")
138 |         return {}
139 | 
140 | if __name__ == '__main__':
141 |     # Test function calling
142 |     user_prompt = "Weather query template"
143 |     system_prompt = ""
144 |     try:
145 |         messages = [{"role": "user", "content": user_prompt}]
146 |         current_dir = os.path.dirname(os.path.abspath(__file__))
147 |         package_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
148 |         input_file = os.path.join(package_dir, "mcp/tools/demo/demo_tools.json")
149 |         tools = json.load(open(input_file, "r", encoding="utf-8"))
150 |         wrappered_tools = tools_openai_wrapper(tools)
151 | 
152 |         gpt_api_provider = OpenAIModelAPIProvider()
153 |         result = gpt_api_provider.api_function_call(messages, wrappered_tools)
154 |         print("Function Call Response:", result)
155 |     except FileNotFoundError:
156 |         print("Demo tools file not found, skipping function call test")
157 | 
-------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/kimi_api.py: --------------------------------------------------------------------------------
1 | # #!/usr/bin/python
2 | # # -*- coding: UTF-8 -*-
3 | import json
4 | import logging
5 | import requests
6 | from typing import List, Dict, Any, Optional
7 | import os
8 | import sys
9 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
10 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../../..')))
11 | sys.path.insert(0, 
os.path.abspath(os.path.join(CURRENT_DIR, '../..'))) 12 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../'))) 13 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, './'))) 14 | 15 | from src.mcp_tool_bench.model_utils.base_api import * 16 | from src.mcp_tool_bench.global_variables import settings 17 | 18 | 19 | def tools_openai_wrapper(tools): 20 | tools_wrapped = [{ 21 | "type": "function", 22 | "function":{ 23 | "name": tool["name"] if "name" in tool else "", 24 | "description": tool["description"] if "description" in tool else "", 25 | "parameters": tool["input_schema"] if "input_schema" in tool else {} 26 | } 27 | } for tool in tools] 28 | return tools_wrapped 29 | 30 | class KimiModelAPIProvider(BaseModelAPIProvider): 31 | """ 32 | https://platform.moonshot.ai/docs/api/chat#public-service-address 33 | """ 34 | def api_chat(self, messages: List, **kwargs) -> Dict[str, Any]: 35 | """ 36 | Kimi model: "K2" 37 | """ 38 | try: 39 | model = self.model_name 40 | if model == "" or model is None: 41 | model = "kimi-k2-0711-preview" 42 | response = call_kimi_k2_chat(messages, model) 43 | tools, completion, reasoningContent = post_process_kimi_response(response) 44 | result = { 45 | KEY_FUNCTION_CALL: tools, 46 | KEY_COMPLETION: completion, 47 | KEY_REASON_CONTENT: reasoningContent 48 | } 49 | return result 50 | 51 | except Exception as e: 52 | logging.error(f"Failed to process api_chat") 53 | return {} 54 | 55 | def api_function_call(self, messages: List, tools: List, **kwargs) -> Dict[str, Any]: 56 | """ 57 | Args: 58 | messages: List of message [{}, {}] 59 | """ 60 | try: 61 | model = self.model_name 62 | if model == "" or model is None: 63 | model = "kimi-k2-0711-preview" 64 | response = call_kimi_k2_tools(messages, tools, model) 65 | tool_result = post_process_function_call_kimi(response) 66 | tool_call_mapped, completion, reasoningContent = function_call_result_common_mapper(tool_result) 67 | 68 | result = { 69 | KEY_FUNCTION_CALL: tool_call_mapped, 70 | KEY_COMPLETION: "", 71 | KEY_REASON_CONTENT: "" 72 | } 73 | # print (f"KimiModelAPIProvider debug api_function_call result return {result}") 74 | return result 75 | 76 | except Exception as e: 77 | logging.error(e) 78 | return {} 79 | 80 | def call_kimi_k2_chat(messages, model_name): 81 | from openai import OpenAI 82 | 83 | client = OpenAI( 84 | api_key = settings.KIMI_API_KEY, 85 | base_url = "https://api.moonshot.ai/v1", 86 | ) 87 | 88 | completion = client.chat.completions.create( 89 | model = model_name, 90 | messages = messages, 91 | temperature = 0.3, 92 | ) 93 | return completion 94 | 95 | def call_kimi_k2_tools(messages, tools, model_name): 96 | import logging 97 | import urllib3 98 | from openai import OpenAI 99 | 100 | # Completely disable all logging 101 | logging.disable(logging.CRITICAL) 102 | urllib3.disable_warnings() 103 | logging.getLogger("urllib3").setLevel(logging.CRITICAL) 104 | logging.getLogger("openai").setLevel(logging.CRITICAL) 105 | 106 | client = OpenAI( 107 | api_key = settings.KIMI_API_KEY, 108 | base_url = "https://api.moonshot.ai/v1", 109 | ) 110 | 111 | return client.chat.completions.create( 112 | model = model_name, 113 | messages = messages, 114 | tools = tools, 115 | temperature = 0.3, 116 | ) 117 | return completion 118 | 119 | def post_process_kimi_response(response): 120 | if response is None: 121 | return {} 122 | tools = {} 123 | completion = "" 124 | reasoningContent = "" 125 | try: 126 | completion = response.choices[0].message.content 127 | except Exception as e: 128 
| logging.error(e) 129 | return tools, completion, reasoningContent 130 | 131 | def post_process_function_call_kimi(response): 132 | if response is None: 133 | return {} 134 | try: 135 | if "error" in response: 136 | logging.error(f"post_process_function_call_kimi error {response}") 137 | return {} 138 | first_tool_call = response.choices[0].message.tool_calls[0] 139 | tool_call = { 140 | "id": first_tool_call.id, 141 | "function": { 142 | "name": first_tool_call.function.name, 143 | "arguments": first_tool_call.function.arguments 144 | } 145 | } 146 | return tool_call 147 | except Exception as e: 148 | logging.error(f"post_process_function_call_kimi {e}") 149 | return {} 150 | 151 | if __name__ == '__main__': 152 | gpt_api_provider = KimiModelAPIProvider(MODEL_SELECTION_KIMI_K2) 153 | 154 | # Test normal conversation 155 | # chat 156 | user_prompt = "Hello, how are you?" 157 | system_prompt = "You are a helpful assistant." 158 | messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}] 159 | result = gpt_api_provider.api_chat(messages) 160 | print("KIMI API Chat Response:", result) 161 | 162 | # Test function calling 163 | user_prompt = "Weather query template" 164 | system_prompt = "" 165 | try: 166 | messages = [{"role": "user", "content": user_prompt}] 167 | current_dir = os.path.dirname(os.path.abspath(__file__)) 168 | package_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir))) 169 | input_file = os.path.join(package_dir, "mcp/tools/demo/demo_tools.json") 170 | tools = json.load(open(input_file, "r", encoding="utf-8")) 171 | wrappered_tools = tools_openai_wrapper(tools) 172 | result = gpt_api_provider.api_function_call(messages, wrappered_tools) 173 | print("KIMI Function Call Response:", result) 174 | except FileNotFoundError: 175 | print("Demo tools file not found, skipping function call test") 176 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/data_generator_agent/run_data_generator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This file performs the following operations: 3 | 1. Extract tools field from all mcp json files in the specified category directory under mcp_marketplace, merge them into a single file, place it in the original directory as category_tools.json (check the directory, if this file already exists, no need to re-merge) 4 | 2. Randomly extract tools from category_tools.json, perform multiple extractions and generate extraction result files, placed in logs directory 5 | 3. For each extraction result, i.e., for each tools list, call LLM to generate user questions (query field) and tool call examples list (function_call_label field). Output files are placed in logs directory 6 | 4. Post-process user questions and tool call examples to remove unreasonable data. 7 | 5. 
Add uuid field, category field, tools field (all tools from category_tools.json, i.e., candidate set) to the processed data, save to data/category directory, file name is category_version.json 8 | ''' 9 | 10 | import json 11 | import uuid 12 | from pathlib import Path 13 | from typing import List, Dict, Any 14 | 15 | from .utils.pre_process import merge_mcp_tools 16 | from .utils.pre_process import random_extract_tools 17 | from .utils.post_process import post_process_data 18 | from .utils.generate_query import generate_query_and_function_calls 19 | 20 | def run_data_generation(category: str, data_version: str, mcp_config_path: str): 21 | """ 22 | Run data generation pipeline 23 | 24 | Args: 25 | category: Data category, such as browser, search 26 | data_version: Data version, such as v0, v1 27 | mcp_config_path: MCP configuration file path 28 | """ 29 | print(f"Starting data generation: category={category}, version={data_version}") 30 | 31 | # Step 1.1: Merge MCP tools 32 | print(" 1.1 Merge MCP tools") 33 | category_tools_path, mcp_tools_dict = merge_mcp_tools(category, mcp_config_path) 34 | 35 | # Step 1.2: Randomly extract tools 36 | print(" 1.2 Randomly extract tools") 37 | extraction_results = random_extract_tools(category_tools_path, min_tools=2, max_tools=5, num_extractions=10) 38 | 39 | # Step 1.3: Generate user questions and tool call examples 40 | print(" 1.3 Generate user questions and tool call examples") 41 | generated_data = generate_query_and_function_calls(extraction_results, category) 42 | 43 | # Step 1.4: Post-process data 44 | print(" 1.4 Post-process data") 45 | processed_data = post_process_data(generated_data, fill_iterations=3, category=category, category_tools_path=category_tools_path) 46 | 47 | # Step 1.5: Save final data 48 | print(" 1.5 Save final data") 49 | save_final_data(processed_data, category, data_version, category_tools_path, mcp_tools_dict) 50 | 51 | 52 | def save_final_data(processed_data: List[Dict], category: str, data_version: str, category_tools_path: str, mcp_tools_dict: Dict): 53 | """ 54 | Save final processed data 55 | 56 | Args: 57 | processed_data: Processed data 58 | category: Data category 59 | data_version: Data version 60 | category_tools_path: Category tools file path 61 | """ 62 | # Read all tools as candidate set 63 | with open(category_tools_path, 'r', encoding='utf-8-sig') as f: 64 | all_tools = json.load(f) 65 | 66 | # Remove mcp_server field from all_tools for the final data 67 | cleaned_tools = [] 68 | for tool in all_tools: 69 | tool_copy = tool.copy() 70 | tool_copy.pop('mcp_server', None) # Remove mcp_server field 71 | cleaned_tools.append(tool_copy) 72 | 73 | # Add necessary fields to each data item 74 | final_data = [] 75 | function_call_label_output = { 76 | "status_code": 200, 77 | "result": {} 78 | } 79 | for item in processed_data: 80 | # Determine type based on function_call_label length 81 | function_call_label = item.get('function_call_label', []) 82 | if isinstance(function_call_label, list): 83 | if len(function_call_label) == 1: 84 | item_call_type = "single" 85 | elif len(function_call_label) > 1: 86 | item_call_type = "multiple" 87 | else: 88 | item_call_type = "single" # Default for empty list 89 | else: 90 | item_call_type = "single" # Default for non-list or missing field 91 | 92 | # Process function_call_label: rename 'arguments' to 'input' and add 'output' field 93 | if isinstance(function_call_label, list): 94 | processed_function_call_label = [] 95 | for call_item in function_call_label: 96 | if 
isinstance(call_item, dict): 97 | processed_call_item = call_item.copy() 98 | # Rename 'arguments' to 'input' 99 | if 'arguments' in processed_call_item: 100 | processed_call_item['input'] = processed_call_item.pop('arguments') 101 | # Add 'output' field after 'input' 102 | if 'input' in processed_call_item: 103 | # Insert 'output' after 'input' 104 | input_value = processed_call_item['input'] 105 | del processed_call_item['input'] 106 | processed_call_item['input'] = input_value 107 | processed_call_item['output'] = function_call_label_output 108 | else: 109 | processed_call_item['output'] = function_call_label_output 110 | processed_function_call_label.append(processed_call_item) 111 | else: 112 | processed_function_call_label.append(call_item) 113 | else: 114 | processed_function_call_label = function_call_label 115 | 116 | final_item = { 117 | "uuid": str(uuid.uuid4()), 118 | "category": category, 119 | "call_type": item_call_type, # Type based on function_call_label length 120 | "tools": cleaned_tools, # Candidate tools set (without mcp_server field) 121 | "mcp_tools_dict": mcp_tools_dict, # MCP server to tools mapping 122 | **item 123 | } 124 | # Update the function_call_label field with processed version 125 | final_item['function_call_label'] = processed_function_call_label 126 | final_data.append(final_item) 127 | 128 | # Create output directory 129 | output_dir = Path(f"data/{category}") 130 | output_dir.mkdir(parents=True, exist_ok=True) 131 | 132 | # Save as JSON array 133 | output_file = output_dir / f"{category}_{data_version}.json" 134 | with open(output_file, 'w', encoding='utf-8') as f: 135 | json.dump(final_data, f, ensure_ascii=False, indent=2) 136 | 137 | print(f"Data saved to: {output_file}") 138 | print(f"Generated {len(final_data)} data items") 139 | print(f"MCP tools dict: {mcp_tools_dict}") 140 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/custom_openai_api.py: -------------------------------------------------------------------------------- 1 | # #!/usr/bin/python 2 | # # -*- coding: UTF-8 -*- 3 | import logging 4 | from typing import List, Dict, Any 5 | import os 6 | import sys 7 | from openai import OpenAI 8 | 9 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 10 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../../..'))) 11 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../..'))) 12 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../'))) 13 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, './'))) 14 | 15 | from src.mcp_tool_bench.model_utils.base_api import BaseModelAPIProvider, function_call_result_common_mapper, KEY_FUNCTION_CALL, KEY_COMPLETION, KEY_REASON_CONTENT 16 | 17 | class CustomOpenAIAPIProvider(BaseModelAPIProvider): 18 | """ 19 | Custom OpenAI-compatible API provider that allows setting custom model name, base URL, and API key. 20 | This can be used with various OpenAI-compatible services like Ollama, LocalAI, vLLM, etc. 21 | """ 22 | def __init__(self, model_name: str, base_url: str, api_key: str = "not-needed"): 23 | """ 24 | Initialize the custom OpenAI-compatible API provider. 
25 | 26 | Args: 27 | model_name: The name of the model to use 28 | base_url: The base URL of the OpenAI-compatible API (e.g., "http://localhost:11434/v1") 29 | api_key: The API key (some local services don't require a real key) 30 | """ 31 | super().__init__(model_name) 32 | self.base_url = base_url 33 | self.api_key = api_key 34 | self.client = OpenAI( 35 | api_key=api_key, 36 | base_url=base_url 37 | ) 38 | 39 | def api_chat(self, messages: List, **kwargs) -> Dict[str, Any]: 40 | """ 41 | Custom OpenAI-compatible chat completion. 42 | """ 43 | try: 44 | model = self.model_name 45 | if not model: 46 | raise ValueError("Model name is required for custom API provider") 47 | 48 | response = self.client.chat.completions.create( 49 | model=model, 50 | messages=messages, 51 | temperature=kwargs.get("temperature", 0.3), 52 | **{k: v for k, v in kwargs.items() if k not in ['temperature', 'wait_time']} 53 | ) 54 | completion, reasoning_content = self._post_process_chat_response(response) 55 | result = { 56 | KEY_FUNCTION_CALL: {}, 57 | KEY_COMPLETION: completion, 58 | KEY_REASON_CONTENT: reasoning_content 59 | } 60 | return result 61 | 62 | except Exception as e: 63 | logging.error(f"Failed to process Custom OpenAI API api_chat: {e}") 64 | return {} 65 | 66 | def api_function_call(self, messages: List, tools: List, **kwargs) -> Dict[str, Any]: 67 | """ 68 | Custom OpenAI-compatible function calling (tool calling). 69 | Args: 70 | messages: List of message [{}, {}] 71 | tools: List of tool definitions [{type: "function", function: {name: "", description: "", parameters: {}}}] 72 | """ 73 | try: 74 | model = self.model_name 75 | if not model: 76 | raise ValueError("Model name is required for custom API provider") 77 | 78 | response = self.client.chat.completions.create( 79 | model=model, 80 | messages=messages, 81 | tools=tools, 82 | tool_choice="auto", 83 | temperature=kwargs.get("temperature", 0.3), 84 | **{k: v for k, v in kwargs.items() if k not in ['temperature', 'wait_time']} 85 | ) 86 | tool_call = self._post_process_function_call_response(response) 87 | tool_call_mapped, completion, reasoning_content = function_call_result_common_mapper(tool_call) 88 | 89 | result = { 90 | KEY_FUNCTION_CALL: tool_call_mapped, 91 | KEY_COMPLETION: completion, 92 | KEY_REASON_CONTENT: reasoning_content 93 | } 94 | return result 95 | 96 | except Exception as e: 97 | logging.error(f"Failed to process Custom OpenAI API api_function_call: {e}") 98 | return {} 99 | 100 | def _post_process_chat_response(self, response): 101 | """ 102 | Processes the response from custom OpenAI-compatible chat completion. 103 | """ 104 | if response is None or not response.choices: 105 | return "", "" 106 | completion_content = "" 107 | if response.choices[0].message.content: 108 | completion_content = response.choices[0].message.content 109 | return completion_content, "" 110 | 111 | def _post_process_function_call_response(self, response): 112 | """ 113 | Processes the response from custom OpenAI-compatible API for function calls. 114 | Extracts the tool call details. 
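        As with the standard OpenAI provider, only the first entry in tool_calls is used.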
115 |         """ 116 |         if response is None or not response.choices or not response.choices[0].message: 117 |             return {} 118 | 119 |         try: 120 |             message = response.choices[0].message 121 |             if message.tool_calls: 122 |                 first_tool_call = message.tool_calls[0] 123 |                 if first_tool_call.type == "function" and first_tool_call.function: 124 |                     tool_call = { 125 |                         "id": first_tool_call.id, 126 |                         "function": { 127 |                             "name": first_tool_call.function.name, 128 |                             "arguments": first_tool_call.function.arguments 129 |                         } 130 |                     } 131 |                     return tool_call 132 |             return {} 133 |         except Exception as e: 134 |             logging.error(f"Failed to _post_process_function_call_response error {e}") 135 |             return {} 136 | 137 |     def get_model_info(self): 138 |         """ 139 |         Returns information about the custom API provider configuration. 140 |         """ 141 |         return { 142 |             "model_name": self.model_name, 143 |             "base_url": self.base_url, 144 |             "api_key": "***" if self.api_key else None 145 |         } 146 | 147 | if __name__ == '__main__': 148 |     # Example usage 149 |     try: 150 |         # Example with Ollama (local deployment) 151 |         custom_provider = CustomOpenAIAPIProvider( 152 |             model_name="llama3.2", 153 |             base_url="http://localhost:11434/v1", 154 |             api_key="not-needed" 155 |         ) 156 | 157 |         messages = [{"role": "user", "content": "Hello, how are you?"}] 158 |         result = custom_provider.api_chat(messages) 159 |         print("Custom API Chat Response:", result) 160 |         print("Provider Info:", custom_provider.get_model_info()) 161 | 162 |     except Exception as e: 163 |         print(f"Example failed (this is expected if no local service is running): {e}") 164 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/evaluation/evaluation_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from typing import Dict, List, Any, Optional 4 | from pydantic_settings import BaseSettings, SettingsConfigDict 5 | import json 6 | from src.mcp_tool_bench.global_variables import * 7 | 8 | def base_error_analysis(function_call_result: Any) -> Dict[str, Any]: 9 |     """ 10 |     Classify each tool-call result as an HTTP failure, an empty result, or a success, 11 |     and collect a short reason string for every call. 12 | 13 |     Returns a dict with "result_success_label_list" (a 1/0 label per call) and "result_list" (the matching reason strings). 14 |     """ 15 |     # function_call_result = trials[0]["function_call_result"] # list 16 |     result_list = [] 17 |     result_success_label_list = [] 18 |     request_failed = "HTTP Request Failed..." 19 |     request_result_success = "SUCCESS|Request Result success true" 20 |     request_result_empty = "EMPTY RESULT|Response Empty Result" 21 |     request_empty_error_msg = "Empty Error Message.."
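    # Illustrative element shape (hypothetical values) for `function_call_result`,
    # matching what the loop below reads:
    #     {"status_code": 200,
    #      "result": {"success": True, "data": ["..."], "error": None}}
    # A non-200 status_code is counted as an HTTP failure, a 200 response with
    # success=True but empty `data` is counted as an empty result, and everything
    # else falls back to the returned error message (or the empty-error sentinel above).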
22 | for call_node in function_call_result: 23 | # tool_call_output = call_node["output"] if "output" in call_node else {} 24 | tool_call_output = call_node 25 | status_code = tool_call_output["status_code"] if "status_code" in tool_call_output else "" 26 | result = tool_call_output["result"] if "result" in tool_call_output else "" 27 | # print("call_node: ", call_node) 28 | # print("tool_call_output: ", tool_call_output) 29 | # print("status_code: ", status_code) 30 | # print("result: ", result) 31 | if status_code != 200: 32 | result_list.append(request_failed) 33 | result_success_label_list.append(0) 34 | else: 35 | ## http sucess 36 | result_json = {} 37 | if isinstance(result, dict): 38 | result_json = result 39 | else: 40 | try: 41 | result_json = json.loads(result) 42 | except Exception as e: 43 | print (e) 44 | 45 | if_success = result_json["success"] if "success" in result_json else False 46 | data = result_json["data"] if "data" in result_json else {} 47 | error = result_json["error"] if "error" in result_json else "" 48 | 49 | empty_data = False 50 | if isinstance(data, list): 51 | empty_data = True if len(data) == 0 or (len(data) > 0 and data[0] == "") or (len(data) > 0 and len(data[0]) == 0) else False # [{}] 52 | elif isinstance(data, dict): 53 | values = "".join([v for k,v in data.items()]) 54 | empty_data = True if len(data) == 0 or (len(data) > 0 and values == "") else False 55 | elif isinstance(data, str): 56 | empty_data = True if len(data) == 0 or (len(data) > 0 and data in ["[]", "\"\"", "\"[]\""]) else False 57 | else: 58 | empty_data = False 59 | 60 | if if_success: 61 | if empty_data: 62 | result_list.append(request_result_empty) 63 | result_success_label_list.append(0) 64 | else: 65 | result_list.append(request_result_success) 66 | result_success_label_list.append(1) 67 | else: 68 | ## sucess false append error logs 69 | if error != "": 70 | # check data 71 | result_list.append(str(error)) 72 | result_success_label_list.append(0) 73 | else: 74 | result_list.append(request_empty_error_msg) 75 | result_success_label_list.append(0) 76 | 77 | # print("result_success_label_list: ", result_success_label_list) 78 | # print("result_list: ", result_list) 79 | return { 80 | "result_success_label_list": result_success_label_list, 81 | "result_list": result_list 82 | } 83 | # return result_success_label_list, result_list 84 | 85 | 86 | def base_compare_result(predict_result: Any, label_result: Any) -> bool: 87 | """ 88 | Compare Exact Value match, e.g. 3 == 3, "New York" == "New York" 89 | """ 90 | return label_result == predict_result 91 | 92 | def base_compare_result_status_dict(predict_result: dict, label_result: dict) -> bool: 93 | """ 94 | label_result: 95 | { 96 | 'success': True, 'data': ['Navigated to https://www.stackoverflow.com'], 'error': None}, 97 | 'status_code': 200 98 | } 99 | predict_result: 100 | { 101 | 'status_code': 200, 102 | "result": {'status_code': 200, 'result': {}}, 'step': '1', 'id': '1'} 103 | } 104 | """ 105 | status_code = predict_result["status_code"] if "status_code" in predict_result else 500 106 | if status_code == 200: 107 | return True 108 | return False 109 | 110 | def base_compare_result_search(predict_result: dict, label_result: dict) -> bool: 111 | """ 112 | Search Result is NOT Empty 113 | """ 114 | if len(label_result) > 0: 115 | return True 116 | return False 117 | 118 | 119 | def estimate_pass_at_k(num_samples, num_correct, k): 120 | """Estimates pass@k of each problem and returns them in an array. 
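    For a problem with n samples of which c are correct, the estimator below computes

        pass@k = 1 - C(n - c, k) / C(n, k)

    e.g. n=10, c=3, k=1 gives 1 - C(7, 1) / C(10, 1) = 1 - 7/10 = 0.3, which matches
    the expected values noted in run_test_pass_k further down in this module.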
121 | Reference: Implementation from LiveCodeBench: https://github.com/LiveCodeBench/LiveCodeBench 122 | """ 123 | 124 | def estimator(n: int, c: int, k: int) -> float: 125 | """Calculates 1 - comb(n - c, k) / comb(n, k).""" 126 | if n - c < k: 127 | return 1.0 128 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 129 | 130 | import itertools 131 | 132 | if isinstance(num_samples, int): 133 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 134 | else: 135 | assert len(num_samples) == len(num_correct) 136 | num_samples_it = iter(num_samples) 137 | 138 | return np.array( 139 | [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)] 140 | ) 141 | 142 | 143 | _global_tool_result_check_func_provider: Dict[str, Any] = {} 144 | _global_tool_result_check_func_provider[KEY_BASE_COMPARE_FUNC] = base_compare_result 145 | 146 | 147 | ## example: add special tool result compare 148 | _global_tool_result_check_func_provider["playwright_navigate"] = base_compare_result_status_dict 149 | _global_tool_result_check_func_provider["bing_web_search"] = base_compare_result_search 150 | _global_tool_result_check_func_provider["bing_news_search"] = base_compare_result_search 151 | 152 | 153 | def run_test_pass_k(): 154 | 155 | num_samples = [10, 10, 10] 156 | num_correct = [3, 3, 5] 157 | k = 1 158 | 159 | # array([0.3, 0.3, 0.5]) 160 | pass_at_k = estimate_pass_at_k(num_samples, num_correct, k) 161 | final_pass_at_k = sum(pass_at_k)/len(pass_at_k) 162 | print (f"Final Pass @k equals {final_pass_at_k}") 163 | 164 | 165 | pass_at_5 = estimate_pass_at_k([10], [5], 5) # e.g. 0.99603175 166 | 167 | # pass@ n = 10, k = 1 168 | 169 | def main(): 170 | 171 | run_test_pass_k() 172 | 173 | 174 | if __name__ == "__main__": 175 | main() 176 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/claude_api.py: -------------------------------------------------------------------------------- 1 | # #!/usr/bin/python 2 | # # -*- coding: UTF-8 -*- 3 | import json 4 | import logging 5 | import requests 6 | from typing import List, Dict, Any, Optional 7 | import os 8 | import sys 9 | import anthropic 10 | 11 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 12 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../../..'))) 13 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../..'))) 14 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../'))) 15 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, './'))) 16 | 17 | from src.mcp_tool_bench.model_utils.base_api import * 18 | from src.mcp_tool_bench.global_variables import settings 19 | 20 | class ClaudeModelAPIProvider(BaseModelAPIProvider): 21 | """ 22 | Anthropic Claude API for chat and tool use. 23 | https://docs.anthropic.com/en/docs/tool-use 24 | """ 25 | def __init__(self, model_name: str = ""): 26 | super().__init__(model_name) 27 | self.client = anthropic.Anthropic(api_key=settings.ANTHROPIC_API_KEY) 28 | 29 | def api_chat(self, messages: List, **kwargs) -> Dict[str, Any]: 30 | """ 31 | Claude chat completion. 
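        System messages are folded into the Anthropic `system` parameter rather than
        being sent in the message list. A minimal hedged sketch (the exact model id is
        whatever MODEL_SELECTION_CLAUDE_37 resolves to in this repo's settings):

            provider = ClaudeModelAPIProvider()
            out = provider.api_chat(
                [{"role": "system", "content": "You are terse."},
                 {"role": "user", "content": "Say hi."}],
                max_tokens=256)
            # out[KEY_COMPLETION] holds the text completion; KEY_FUNCTION_CALL stays empty here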
32 | """ 33 | try: 34 | model = self.model_name 35 | if not model: 36 | model = MODEL_SELECTION_CLAUDE_37 37 | 38 | system_message_content = "" 39 | chat_messages = [] 40 | for msg in messages: 41 | if msg["role"] == "system": 42 | system_message_content += msg["content"] + "\n" 43 | else: 44 | chat_messages.append(msg) 45 | 46 | response = self.client.messages.create( 47 | model=model, 48 | max_tokens=kwargs.get("max_tokens", 1024), # Claude requires max_tokens 49 | messages=chat_messages, 50 | system=system_message_content.strip() if system_message_content else None, 51 | temperature=kwargs.get("temperature", 0.3), 52 | ) 53 | completion, reasoningContent = post_process_claude_chat_response(response) 54 | result = { 55 | KEY_FUNCTION_CALL: {}, 56 | KEY_COMPLETION: completion, 57 | KEY_REASON_CONTENT: reasoningContent 58 | } 59 | return result 60 | 61 | except Exception as e: 62 | logging.error(f"Failed to process Claude api_chat: {e}") 63 | return {} 64 | 65 | def api_function_call(self, messages: List, tools: List, **kwargs) -> Dict[str, Any]: 66 | """ 67 | Claude tool use (function calling). 68 | Args: 69 | messages: List of message [{}, {}] 70 | tools: List of tool definitions in Claude's format (e.g., from tools_claude_wrapper) 71 | """ 72 | try: 73 | model = self.model_name 74 | if not model: 75 | model = MODEL_SELECTION_CLAUDE_37 76 | 77 | system_message_content = "" 78 | chat_messages = [] 79 | for msg in messages: 80 | if msg["role"] == "system": 81 | system_message_content += msg["content"] + "\n" 82 | else: 83 | chat_messages.append(msg) 84 | 85 | # Claude's `tools` parameter directly takes the list of tool definitions 86 | response = self.client.messages.create( 87 | model=model, 88 | max_tokens=kwargs.get("max_tokens", 1024), # Claude requires max_tokens 89 | messages=chat_messages, 90 | tools=tools, # Assuming tools are already in Claude's format from wrapper 91 | tool_choice=kwargs.get("tool_choice", {"type": "auto"}), # Default to auto 92 | system=system_message_content.strip() if system_message_content else None, 93 | temperature=kwargs.get("temperature", 0.3), 94 | ) 95 | tool_result, completion, reasoningContent = post_process_claude_function_call_response(response) 96 | result = { 97 | KEY_FUNCTION_CALL: tool_result, 98 | KEY_COMPLETION: completion, 99 | KEY_REASON_CONTENT: reasoningContent 100 | } 101 | return result 102 | 103 | except Exception as e: 104 | logging.error(f"Failed to process Claude api_function_call: {e}") 105 | return {} 106 | 107 | def post_process_claude_chat_response(response: Any) -> (str, str): 108 | """ 109 | Processes the response from Claude chat completion. 110 | Claude's response content is a list of content blocks. 111 | """ 112 | if response is None or not response.content: 113 | return "", "" 114 | 115 | completion_content = "" 116 | reasoning_content = "" # Claude might have thinking blocks 117 | 118 | for block in response.content: 119 | if block.type == "text": 120 | completion_content += block.text 121 | return completion_content, reasoning_content 122 | 123 | def post_process_claude_function_call_response(response: Any) -> (Dict[str, Any], str, str): 124 | """ 125 | Processes the response from Claude for tool use. 126 | Extracts the tool call details and any text response. 
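    Illustrative return value (hypothetical tool and arguments), as a
    (tool_call_result, completion, reasoning) tuple:

        (
            {"function_name": "tavily-search",
             "function_arguments": {"query": "latest AI news"},
             "is_function_call": True,
             "id": "toolu_01ABC..."},
            "",   # concatenated text blocks, if any
            ""
        )

    Only the first tool_use block is kept; ({}, "", "") is returned when the
    response contains no content.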
127 | """ 128 | if response is None or not response.content: 129 | return {}, "", "" 130 | tool_call_result = {} 131 | completion_content = "" 132 | reasoning_content = "" 133 | 134 | try: 135 | 136 | for block in response.content: 137 | if block.type == "tool_use": 138 | if not tool_call_result: 139 | tool_call_result = { 140 | "function_name": block.name, 141 | "function_arguments": block.input, # Claude's tool_use.input is already a dict 142 | "is_function_call": True, 143 | "id": block.id # Store tool_use ID for sending tool results back 144 | } 145 | elif block.type == "text": 146 | completion_content += block.text 147 | return tool_call_result, completion_content, reasoning_content 148 | 149 | except Exception as e: 150 | print (f"DEBUG: Failed to post_process_claude_function_call_response with error {e}") 151 | return tool_call_result, completion_content, reasoning_content 152 | 153 | if __name__ == '__main__': 154 | # Test function calling 155 | user_prompt = "Weather query template" 156 | system_prompt = "" 157 | try: 158 | messages = [{"role": "user", "content": user_prompt}] 159 | current_dir = os.path.dirname(os.path.abspath(__file__)) 160 | package_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir))) 161 | input_file = os.path.join(package_dir, "mcp/tools/demo/demo_tools.json") 162 | tools = json.load(open(input_file, "r", encoding="utf-8")) 163 | 164 | api_provider = ClaudeModelAPIProvider(MODEL_SELECTION_CLAUDE_37) 165 | result = api_provider.api_function_call(messages, tools) 166 | print("Function Call Response:", result) 167 | except FileNotFoundError: 168 | print("Demo tools file not found, skipping function call test") 169 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/data_generator_agent/utils/pre_process.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Randomly extract tools from category_tools.json, perform multiple extractions and generate extraction result files, placed in logs directory 3 | ''' 4 | 5 | import json 6 | import random 7 | from pathlib import Path 8 | from typing import List, Dict, Any 9 | 10 | 11 | def merge_mcp_tools(category: str, mcp_config_path: str) -> str: 12 | """ 13 | Merge all MCP tools in the specified category directory 14 | 15 | Args: 16 | category: Data category, such as browser, search 17 | mcp_config_path: MCP configuration file path 18 | 19 | Returns: 20 | str: Path to the merged tools file 21 | """ 22 | # Build category directory path 23 | category_dir = Path(f"mcp/tools/{category}") 24 | 25 | if not category_dir.exists(): 26 | raise FileNotFoundError(f"Category directory does not exist: {category_dir}") 27 | 28 | output_file = category_dir / f"{category}_tools.json" 29 | 30 | # # Check if merged file already exists 31 | # if output_file.exists(): 32 | # print(f"Merged file already exists, skipping merge step: {output_file}") 33 | # return str(output_file) 34 | 35 | # Collect all MCP tools 36 | all_tools = [] 37 | 38 | # Iterate through all JSON files in the directory 39 | for json_file in category_dir.glob("*.json"): 40 | if json_file.name == f"{category}_tools.json": 41 | continue # Skip existing merged file 42 | 43 | try: 44 | with open(json_file, 'r', encoding='utf-8') as f: 45 | mcp_data = json.load(f) 46 | 47 | # Extract MCP ID from file content 48 | mcp_id = mcp_data.get('server_name', json_file.stem) # Fallback to filename if no id field 49 | 50 | # Extract tools field 51 | if 'tools' in mcp_data: 52 
| tools = mcp_data['tools'] 53 | if isinstance(tools, list): 54 | # Add mcp_server field to each tool 55 | for tool in tools: 56 | tool['mcp_server'] = mcp_id 57 | all_tools.extend(tools) 58 | else: 59 | # Add mcp_server field to single tool 60 | tools['mcp_server'] = mcp_id 61 | all_tools.append(tools) 62 | 63 | except Exception as e: 64 | print(f"Warning: Cannot read file {json_file}: {e}") 65 | continue 66 | 67 | # Remove duplicate tools (based on tool name AND mcp_server) 68 | # Keep tools with same name but from different MCP servers 69 | unique_tools = [] 70 | tool_key_set = set() # Track (tool_name, mcp_server) combinations 71 | # Generate mcp_tools_dict: {mcp_server: [tool_name1, tool_name2, ...]} 72 | # Handle tool name conflicts by renaming to "mcp_server_tool_name" format 73 | # First occurrence keeps original name, subsequent conflicts get renamed 74 | mcp_tools_dict = {} 75 | all_tool_names = set() # Track all tool names across all MCP servers 76 | 77 | for tool in all_tools: 78 | tool_name = tool.get('name', '') 79 | mcp_server = tool.get('mcp_server', 'unknown') 80 | 81 | if tool_name: 82 | tool_key = (tool_name, mcp_server) 83 | if tool_key not in tool_key_set: 84 | tool_key_set.add(tool_key) 85 | 86 | # Check if tool name already exists in any MCP server 87 | if tool_name in all_tool_names: 88 | # Rename to avoid conflict: "mcp_server_tool_name" 89 | new_tool_name = f"{mcp_server}_{tool_name}" 90 | tool['name'] = new_tool_name # Update the tool name in the original list 91 | else: 92 | new_tool_name = tool_name 93 | 94 | # Add to tracking set (use the new name for future conflict detection) 95 | all_tool_names.add(new_tool_name) 96 | 97 | # Add to mcp_tools_dict 98 | if mcp_server not in mcp_tools_dict: 99 | mcp_tools_dict[mcp_server] = [] 100 | mcp_tools_dict[mcp_server].append(new_tool_name) 101 | 102 | unique_tools.append(tool) 103 | 104 | # Save merged tools file 105 | with open(output_file, 'w', encoding='utf-8') as f: 106 | json.dump(unique_tools, f, ensure_ascii=False, indent=2) 107 | 108 | print(f"Successfully merged {len(unique_tools)} tools to: {output_file}") 109 | return str(output_file), mcp_tools_dict 110 | 111 | def random_extract_tools(category_tools_path: str, num_extractions: int = 10, 112 | min_tools: int = 1, max_tools: int = 3) -> List[List[Dict]]: 113 | """ 114 | Randomly extract tools from category tools file 115 | 116 | Args: 117 | category_tools_path: Category tools file path 118 | num_extractions: Number of extractions 119 | min_tools: Minimum number of tools per extraction 120 | max_tools: Maximum number of tools per extraction 121 | 122 | Returns: 123 | List[List[Dict]]: Extraction result list, each element is a group of tools 124 | """ 125 | # Read all tools 126 | with open(category_tools_path, 'r', encoding='utf-8-sig') as f: 127 | data = json.load(f) 128 | 129 | # Handle different data structures 130 | if isinstance(data, dict) and 'tools' in data: 131 | all_tools = data['tools'] 132 | elif isinstance(data, list): 133 | all_tools = data 134 | else: 135 | raise ValueError(f"Unsupported data format: {type(data)}") 136 | 137 | if not all_tools: 138 | raise ValueError("Tool list is empty") 139 | 140 | extraction_results = [] 141 | 142 | # Perform multiple random extractions 143 | for i in range(num_extractions): 144 | # Randomly decide the number of tools for this extraction 145 | num_tools = random.randint(min_tools, min(max_tools, len(all_tools))) 146 | 147 | # Randomly extract tools 148 | selected_tools = random.sample(all_tools, num_tools) 
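        # random.sample draws without replacement, so a single extraction never
        # repeats a tool; the same tool may still appear across different
        # extractions. For reproducible extraction files one could seed the RNG
        # (e.g. random.seed(42)) before this loop (not done in this script).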
149 | extraction_results.append(selected_tools) 150 | 151 | # Save extraction results to logs directory 152 | save_extraction_results(extraction_results, category_tools_path) 153 | 154 | print(f"Completed {num_extractions} tool extractions, {min_tools}-{max_tools} tools per extraction") 155 | return extraction_results 156 | 157 | 158 | def save_extraction_results(extraction_results: List[List[Dict]], category_tools_path: str): 159 | """ 160 | Save extraction results to logs directory 161 | 162 | Args: 163 | extraction_results: Extraction result list 164 | category_tools_path: Original tools file path 165 | """ 166 | # Create logs directory 167 | logs_dir = Path("src/mcp_tool_bench/agents/data_generator_agent/logs") 168 | logs_dir.mkdir(parents=True, exist_ok=True) 169 | 170 | # Extract category name from original path 171 | category = Path(category_tools_path).parent.name 172 | 173 | # Save extraction results 174 | output_file = logs_dir / f"{category}_extraction_results.json" 175 | with open(output_file, 'w', encoding='utf-8-sig') as f: 176 | json.dump(extraction_results, f, ensure_ascii=False, indent=2) 177 | 178 | print(f"Extraction results saved to: {output_file}") 179 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/utils/count_tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import re 4 | import tiktoken 5 | from pathlib import Path 6 | 7 | def count_mcp_files(directory): 8 | """Count MCP files excluding jsonl and type_tools.json files""" 9 | count = 0 10 | for file in os.listdir(directory): 11 | if file.endswith('.json') and not file.endswith('_tools.json'): 12 | count += 1 13 | return count 14 | 15 | def count_words(text): 16 | """Count words in text by splitting on whitespace and special characters""" 17 | # Remove special characters and split on whitespace 18 | words = re.findall(r'\b\w+\b', text.lower()) 19 | return len(words) 20 | 21 | def count_tokens(text): 22 | """Count tokens using tiktoken's cl100k_base encoder""" 23 | try: 24 | encoding = tiktoken.get_encoding("cl100k_base") 25 | return len(encoding.encode(text)) 26 | except Exception as e: 27 | print(f"Warning: Error counting tokens: {e}") 28 | return len(text) // 4 # Fallback to simple approximation 29 | 30 | def count_tools(mcp_type_dir): 31 | base_dir = mcp_type_dir 32 | results = [] 33 | 34 | # Process each subdirectory 35 | for subdir in os.listdir(base_dir): 36 | subdir_path = os.path.join(base_dir, subdir) 37 | if not os.path.isdir(subdir_path): 38 | continue 39 | 40 | # Find the type_tools.json file 41 | tools_file = os.path.join(subdir_path, f'{subdir}_tools.json') 42 | if not os.path.exists(tools_file): 43 | print(f"Warning: {tools_file} not found") 44 | continue 45 | 46 | # Count total tools and calculate statistics 47 | with open(tools_file, 'r', encoding='utf-8-sig') as f: 48 | data = json.load(f) 49 | # tools = data.get('tools', []) 50 | tools = data 51 | total_tools = len(tools) 52 | 53 | # Calculate statistics for all tools 54 | char_lengths = [] 55 | word_counts = [] 56 | token_counts = [] 57 | 58 | for tool in tools: 59 | tool_str = json.dumps(tool) 60 | char_lengths.append(len(tool_str.encode('utf-8'))) 61 | word_counts.append(count_words(tool_str)) 62 | token_counts.append(count_tokens(tool_str)) 63 | 64 | total_chars = sum(char_lengths) 65 | total_words = sum(word_counts) 66 | total_tokens = sum(token_counts) 67 | 68 | avg_chars = total_chars / total_tools if total_tools > 
0 else 0 69 | avg_words = total_words / total_tools if total_tools > 0 else 0 70 | avg_tokens = total_tokens / total_tools if total_tools > 0 else 0 71 | 72 | max_chars = max(char_lengths) if char_lengths else 0 73 | min_chars = min(char_lengths) if char_lengths else 0 74 | max_words = max(word_counts) if word_counts else 0 75 | min_words = min(word_counts) if word_counts else 0 76 | max_tokens = max(token_counts) if token_counts else 0 77 | min_tokens = min(token_counts) if token_counts else 0 78 | 79 | # Count MCP files 80 | mcp_count = count_mcp_files(subdir_path) 81 | 82 | # Calculate average 83 | avg_tools = total_tools / mcp_count if mcp_count > 0 else 0 84 | 85 | results.append({ 86 | 'type': subdir, 87 | 'total_tools': total_tools, 88 | 'mcp_count': mcp_count, 89 | 'avg_tools_per_mcp': round(avg_tools, 2), 90 | 'avg_chars_per_tool': round(avg_chars, 2), 91 | 'avg_words_per_tool': round(avg_words, 2), 92 | 'avg_tokens_per_tool': round(avg_tokens, 2), 93 | 'max_chars': max_chars, 94 | 'min_chars': min_chars, 95 | 'max_words': max_words, 96 | 'min_words': min_words, 97 | 'max_tokens': max_tokens, 98 | 'min_tokens': min_tokens, 99 | 'total_chars': total_chars, 100 | 'total_words': total_words, 101 | 'total_tokens': total_tokens 102 | }) 103 | 104 | # Sort results by total tools 105 | results.sort(key=lambda x: x['total_tools'], reverse=True) 106 | 107 | # Prepare output string 108 | output = [] 109 | output.append("\n{:<30} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15}".format( 110 | "Type", "Total Tools", "MCP Count", "Avg Tools/MCP", "Avg Chars/Tool", "Avg Words/Tool", 111 | "Avg Tokens/Tool", "Tool Max Chars", "Tool Min Chars", "Tool Max Words", "Tool Min Words", 112 | "Tool Max Tokens", "Tool Min Tokens")) 113 | output.append("-" * 195) 114 | 115 | for r in results: 116 | output.append("{:<30} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15}".format( 117 | r['type'], 118 | r['total_tools'], 119 | r['mcp_count'], 120 | r['avg_tools_per_mcp'], 121 | r['avg_chars_per_tool'], 122 | r['avg_words_per_tool'], 123 | r['avg_tokens_per_tool'], 124 | r['max_chars'], 125 | r['min_chars'], 126 | r['max_words'], 127 | r['min_words'], 128 | r['max_tokens'], 129 | r['min_tokens'] 130 | )) 131 | 132 | # Print to console 133 | print("\n".join(output)) 134 | 135 | # Write to file 136 | mcp_type_dir = Path(mcp_type_dir) 137 | with open(mcp_type_dir.parent / "logs" / "tools_statistics.txt", 'w', encoding='utf-8-sig') as f: 138 | f.write("\n".join(output)) 139 | f.write("\n\nSummary Statistics:\n") 140 | f.write("-" * 50 + "\n") 141 | f.write(f"Total Types: {len(results)}\n") 142 | f.write(f"Total Tools Across All Types: {sum(r['total_tools'] for r in results)}\n") 143 | f.write(f"Total MCPs Across All Types: {sum(r['mcp_count'] for r in results)}\n") 144 | f.write(f"Overall Average Tools per MCP: {sum(r['total_tools'] for r in results) / sum(r['mcp_count'] for r in results):.2f}\n") 145 | f.write(f"Overall Average Chars per Tool: {sum(r['total_tools'] * r['avg_chars_per_tool'] for r in results) / sum(r['total_tools'] for r in results):.2f}\n") 146 | f.write(f"Overall Average Words per Tool: {sum(r['total_tools'] * r['avg_words_per_tool'] for r in results) / sum(r['total_tools'] for r in results):.2f}\n") 147 | f.write(f"Overall Average Tokens per Tool: {sum(r['total_tools'] * r['avg_tokens_per_tool'] for r in results) / sum(r['total_tools'] for r in results):.2f}\n") 148 | f.write(f"Global Max Chars: {max(r['max_chars'] for r in 
results)}\n") 149 | f.write(f"Global Min Chars: {min(r['min_chars'] for r in results)}\n") 150 | f.write(f"Global Max Words: {max(r['max_words'] for r in results)}\n") 151 | f.write(f"Global Min Words: {min(r['min_words'] for r in results)}\n") 152 | f.write(f"Global Max Tokens: {max(r['max_tokens'] for r in results)}\n") 153 | f.write(f"Global Min Tokens: {min(r['min_tokens'] for r in results)}\n") 154 | # f.write(f"Total Characters Across All Types: {sum(r['total_chars'] for r in results)}\n") 155 | # f.write(f"Total Words Across All Types: {sum(r['total_words'] for r in results)}\n") 156 | # f.write(f"Total Tokens Across All Types: {sum(r['total_tokens'] for r in results)}\n") 157 | 158 | if __name__ == '__main__': 159 | mcp_type_dir = "mcp/tools" 160 | count_tools(mcp_type_dir) 161 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/qwen_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import List, Dict, Any, Optional 4 | from .base_api import * 5 | from ..global_variables import settings 6 | import requests 7 | 8 | class QwenModelAPIProvider(BaseModelAPIProvider): 9 | 10 | def api_chat(self, messages: List, **kwargs) -> Dict[str, Any]: 11 | """ 12 | Qwen model: "qwen-max", "qwen-plus" 13 | """ 14 | try: 15 | model = self.model_name 16 | if model == "" or model is None: 17 | model = "qwen-plus" 18 | 19 | response = call_qwen_messages_model_selection(messages, self.model_name) 20 | tools, completion, reasoningContent = post_process_qwen_response(response) 21 | result = { 22 | KEY_FUNCTION_CALL: tools, 23 | KEY_COMPLETION: completion, 24 | KEY_REASON_CONTENT: reasoningContent 25 | } 26 | return result 27 | 28 | except Exception as e: 29 | logging.error(f"Failed to process api_chat") 30 | return {} 31 | 32 | def api_function_call(self, messages: List, tools: List, **kwargs) -> Dict[str, Any]: 33 | """ 34 | Args: 35 | messages: List of message [{}, {}] 36 | tools: List 37 | Returns: 38 | result: Dict: , 39 | { 40 | 'tools': 41 | { 42 | 'function_name': 'playwright_navigate', 'function_arguments': '{"url": "https://www.stackoverflow.com", "browserType": "chromium"}', 43 | 'is_function_call': True, 44 | 'id': 'call_6cb5d88bb3cf4884aadc03' 45 | }, 46 | 'completion': '', 47 | 'reason': '' 48 | } 49 | """ 50 | try: 51 | model = self.model_name 52 | if model == "" or model is None: 53 | model = "qwen-plus" 54 | messages, tools, model 55 | response = call_qwen_tool_calls_model_selection(messages, tools, model) 56 | tool_call = post_process_function_call_qwen_common(response) 57 | tool_call_mapped, completion, reasoningContent = function_call_result_common_mapper(tool_call) 58 | 59 | result = { 60 | KEY_FUNCTION_CALL: tool_call_mapped, 61 | KEY_COMPLETION: "", 62 | KEY_REASON_CONTENT: "" 63 | } 64 | print (f"AntQwenModelAPIProvider debug api_function_call result return {result}") 65 | 66 | return result 67 | 68 | except Exception as e: 69 | logging.error(f"QwenModelAPIProvider {e}") 70 | return {} 71 | 72 | def call_qwen_messages_model_selection(messages: List, model: str): 73 | """ 74 | Reference doc: https://help.aliyun.com/zh/model-studio/use-qwen-by-calling-api#b30677f6e9437 75 | Input: 76 | messages: List[Dict] 77 | """ 78 | try: 79 | url = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions" 80 | api_key = settings.QWEN_API_KEY 81 | if api_key is None: 82 | raise ValueError("qwen_general_api.py call_qwen_max_user_prompt 
api_key not found, please check .env file key QWEN_API_KEY") 83 | headers = { 84 | "Authorization": f"Bearer {api_key}", 85 | "Content-Type": "application/json" 86 | } 87 | data = { 88 | "model": model, 89 | "messages": messages, 90 | } 91 | data = json.dumps(data).encode("utf-8") 92 | response = requests.post(url, headers=headers, data=data, timeout=10) 93 | if response.status_code == 200: 94 | result = response.json() 95 | print("Qwen Response:", result["choices"][0]["message"]["content"]) 96 | else: 97 | print(f"API Return Failed with Status (Status Code: {response.status_code}): {response.text}") 98 | return response 99 | except Exception as e: 100 | logging.error(e) 101 | return None 102 | 103 | 104 | def call_qwen_user_prompt_model_selection(user_prompt: str, model: str): 105 | """ 106 | Reference doc: https://help.aliyun.com/zh/model-studio/use-qwen-by-calling-api#b30677f6e9437 107 | """ 108 | try: 109 | url = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions" 110 | api_key = settings.QWEN_API_KEY 111 | if api_key is None: 112 | raise ValueError("qwen_general_api.py call_qwen_max_user_prompt api_key not found, please check .env file key QWEN_API_KEY") 113 | headers = { 114 | "Authorization": f"Bearer {api_key}", 115 | "Content-Type": "application/json" 116 | } 117 | data = { 118 | "model": model, 119 | "messages": [{"role": "user", "content": user_prompt}], 120 | } 121 | data = json.dumps(data).encode("utf-8") 122 | response = requests.post(url, headers=headers, data=data, timeout=10) 123 | if response.status_code == 200: 124 | result = response.json() 125 | print("Qwen Response:", result["choices"][0]["message"]["content"]) 126 | else: 127 | print(f"API Return Failed with Status (Status Code: {response.status_code}): {response.text}") 128 | return response 129 | except Exception as e: 130 | logging.error(e) 131 | return None 132 | 133 | 134 | def post_process_qwen_response(response): 135 | if response is None: 136 | return {} 137 | tools, completion,reasoningContent = {}, "", "" 138 | 139 | res_json = {} 140 | try: 141 | print (f"post_process_function_call_qwen_base input response {response} and type {type(response)}") 142 | res_json = json.loads(response.content) 143 | except json.decoder.JSONDecodeError: 144 | print("Not Valid Json Format") 145 | return '' 146 | try: 147 | # x = res_json["data"]["values"]["data"] 148 | completion = res_json["choices"][0]["message"]["content"] 149 | usage = res_json["usage"] if "usage" in res_json else {} 150 | except Exception as e: 151 | logging.error(e) 152 | return tools, completion, reasoningContent 153 | 154 | 155 | 156 | def call_qwen_tool_calls_model_selection(messages, tools, model): 157 | """ 158 | Args: 159 | messages: list of dict 160 | tools: list of dict 161 | return: 162 | {"choices":[{"message":{"content":"","role":"assistant","tool_calls":[{"index":0,"id":"call_f8d9f219ee034156985f6a","type":"function","function":{"name":"get_current_weather","arguments":"{\"location\": \"上海\"}"}}]},"finish_reason":"tool_calls","index":0,"logprobs":null}],"object":"chat.completion","usage":{"prompt_tokens":266,"completion_tokens":20,"total_tokens":286,"prompt_tokens_details":{"cached_tokens":0}},"created":1750987730,"system_fingerprint":null,"model":"qwen-plus","id":"chatcmpl-3bd1954c-8594-98e1-957b-9fda39ac73fc"} 163 | doc: https://help.aliyun.com/zh/model-studio/qwen-function-calling 164 | """ 165 | try: 166 | api_key = settings.QWEN_API_KEY 167 | url = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions" 168 | 
headers = { 169 | 'Content-Type': 'application/json', 170 | 'Authorization': f"Bearer {api_key}", 171 | } 172 | data = { 173 | "stream": False, 174 | "model": model, 175 | "messages": messages, 176 | "tools": tools 177 | } 178 | data = json.dumps(data).encode("utf-8") 179 | response = requests.post(url, headers=headers, data=data) 180 | if response.status_code == 200: 181 | result = response.json() 182 | print("Qwen Response:", result["choices"][0]["message"]["content"]) 183 | else: 184 | print(f"API Return Failed with Status (Status Code: {response.status_code}): {response.text}") 185 | return response 186 | except Exception as e: 187 | print (e) 188 | return None 189 | 190 | def post_process_function_call_qwen_common(response): 191 | """ 192 | tool_call: 193 | { 194 | "id": "call_6fcd208b442c4c12b1b419", 195 | "function": { 196 | "arguments": "{\"location\": \"\u4e0a\u6d77\u5e02\"}", 197 | "name": "get_current_weather" 198 | }, 199 | "type": "function", 200 | "index": 0 201 | } 202 | """ 203 | if response is None: 204 | return {} 205 | 206 | tools = {} 207 | completion = "" 208 | reasoningContent = "" 209 | 210 | res_json = {} 211 | try: 212 | content = response.content 213 | logging.info(f"post_process_function_call_qwen_base content {content}") 214 | res_json = json.loads(content) 215 | 216 | except json.decoder.JSONDecodeError: 217 | print("Not Valid Json Format" + content) 218 | return {} 219 | try: 220 | choice = res_json["choices"][0] if len(res_json["choices"]) > 0 else {} 221 | finish_reason = choice["finish_reason"] if "finish_reason" in choice else "" # tool_calls 222 | message = choice["message"] if "message" in choice else {} 223 | tool_calls = message["tool_calls"] if "tool_calls" in message else [] 224 | tool_call = tool_calls[0] if len(tool_calls) > 0 else {} 225 | return tool_call 226 | except Exception as e: 227 | logging.error(e) 228 | return {} 229 | -------------------------------------------------------------------------------- /data/search/search_single_demo.json: -------------------------------------------------------------------------------- 1 | [{"uuid":"6e88dafd-c1d7-4aa6-8fe2-2db88a874664","category":"search","call_type":"single","tools":[{"name":"tavily-search","description":"A powerful web search tool that provides comprehensive, real-time results using Tavily's AI search engine. Returns relevant web content with customizable parameters for result count, content type, and domain filtering. Ideal for gathering current information, news, and detailed web content analysis.","input_schema":{"type":"object","properties":{"query":{"type":"string","description":"Search query"},"search_depth":{"type":"string","enum":["basic","advanced"],"description":"The depth of the search. It can be 'basic' or 'advanced'","default":"basic"},"topic":{"type":"string","enum":["general","news"],"description":"The category of the search. This will determine which of our agents will be used for the search","default":"general"},"days":{"type":"number","description":"The number of days back from the current date to include in the search results. This specifies the time frame of data to be retrieved. Please note that this feature is only available when using the 'news' search topic","default":3},"time_range":{"type":"string","description":"The time range back from the current date to include in the search results. 
This feature is available for both 'general' and 'news' search topics","enum":["day","week","month","year","d","w","m","y"]},"max_results":{"type":"number","description":"The maximum number of search results to return","default":10,"minimum":5,"maximum":20},"include_images":{"type":"boolean","description":"Include a list of query-related images in the response","default":false},"include_image_descriptions":{"type":"boolean","description":"Include a list of query-related images and their descriptions in the response","default":false},"include_raw_content":{"type":"boolean","description":"Include the cleaned and parsed HTML content of each search result","default":false},"include_domains":{"type":"array","items":{"type":"string"},"description":"A list of domains to specifically include in the search results, if the user asks to search on specific sites set this to the domain of the site","default":[]},"exclude_domains":{"type":"array","items":{"type":"string"},"description":"List of domains to specifically exclude, if the user asks to exclude a domain set this to the domain of the site","default":[]},"country":{"type":"string","enum":["afghanistan","albania","algeria","andorra","angola","argentina","armenia","australia","austria","azerbaijan","bahamas","bahrain","bangladesh","barbados","belarus","belgium","belize","benin","bhutan","bolivia","bosnia and herzegovina","botswana","brazil","brunei","bulgaria","burkina faso","burundi","cambodia","cameroon","canada","cape verde","central african republic","chad","chile","china","colombia","comoros","congo","costa rica","croatia","cuba","cyprus","czech republic","denmark","djibouti","dominican republic","ecuador","egypt","el salvador","equatorial guinea","eritrea","estonia","ethiopia","fiji","finland","france","gabon","gambia","georgia","germany","ghana","greece","guatemala","guinea","haiti","honduras","hungary","iceland","india","indonesia","iran","iraq","ireland","israel","italy","jamaica","japan","jordan","kazakhstan","kenya","kuwait","kyrgyzstan","latvia","lebanon","lesotho","liberia","libya","liechtenstein","lithuania","luxembourg","madagascar","malawi","malaysia","maldives","mali","malta","mauritania","mauritius","mexico","moldova","monaco","mongolia","montenegro","morocco","mozambique","myanmar","namibia","nepal","netherlands","new zealand","nicaragua","niger","nigeria","north korea","north macedonia","norway","oman","pakistan","panama","papua new guinea","paraguay","peru","philippines","poland","portugal","qatar","romania","russia","rwanda","saudi arabia","senegal","serbia","singapore","slovakia","slovenia","somalia","south africa","south korea","south sudan","spain","sri lanka","sudan","sweden","switzerland","syria","taiwan","tajikistan","tanzania","thailand","togo","trinidad and tobago","tunisia","turkey","turkmenistan","uganda","ukraine","united arab emirates","united kingdom","united states","uruguay","uzbekistan","venezuela","vietnam","yemen","zambia","zimbabwe"],"description":"Boost search results from a specific country. This will prioritize content from the selected country in the search results. 
Available only if topic is general.","default":""}},"required":["query"]}},{"name":"tavily-extract","description":"A powerful web content extraction tool that retrieves and processes raw content from specified URLs, ideal for data collection, content analysis, and research tasks.","input_schema":{"type":"object","properties":{"urls":{"type":"array","items":{"type":"string"},"description":"List of URLs to extract content from"},"extract_depth":{"type":"string","enum":["basic","advanced"],"description":"Depth of extraction - 'basic' or 'advanced', if usrls are linkedin use 'advanced' or if explicitly told to use advanced","default":"basic"},"include_images":{"type":"boolean","description":"Include a list of images extracted from the urls in the response","default":false},"format":{"type":"string","enum":["markdown","text"],"description":"The format of the extracted web page content. markdown returns content in markdown format. text returns plain text and may increase latency.","default":"markdown"}},"required":["urls"]}},{"name":"tavily-crawl","description":"A powerful web crawler that initiates a structured web crawl starting from a specified base URL. The crawler expands from that point like a tree, following internal links across pages. You can control how deep and wide it goes, and guide it to focus on specific sections of the site.","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"The root URL to begin the crawl"},"max_depth":{"type":"integer","description":"Max depth of the crawl. Defines how far from the base URL the crawler can explore.","default":1,"minimum":1},"max_breadth":{"type":"integer","description":"Max number of links to follow per level of the tree (i.e., per page)","default":20,"minimum":1},"limit":{"type":"integer","description":"Total number of links the crawler will process before stopping","default":50,"minimum":1},"instructions":{"type":"string","description":"Natural language instructions for the crawler"},"select_paths":{"type":"array","items":{"type":"string"},"description":"Regex patterns to select only URLs with specific path patterns (e.g., /docs/.*, /api/v1.*)","default":[]},"select_domains":{"type":"array","items":{"type":"string"},"description":"Regex patterns to select crawling to specific domains or subdomains (e.g., ^docs\\.example\\.com$)","default":[]},"allow_external":{"type":"boolean","description":"Whether to allow following links that go to external domains","default":false},"categories":{"type":"array","items":{"type":"string","enum":["Careers","Blog","Documentation","About","Pricing","Community","Developers","Contact","Media"]},"description":"Filter URLs using predefined categories like documentation, blog, api, etc","default":[]},"extract_depth":{"type":"string","enum":["basic","advanced"],"description":"Advanced extraction retrieves more data, including tables and embedded content, with higher success but may increase latency","default":"basic"},"format":{"type":"string","enum":["markdown","text"],"description":"The format of the extracted web page content. markdown returns content in markdown format. text returns plain text and may increase latency.","default":"markdown"}},"required":["url"]}},{"name":"tavily-map","description":"A powerful web mapping tool that creates a structured map of website URLs, allowing you to discover and analyze site structure, content organization, and navigation paths. 
Perfect for site audits, content discovery, and understanding website architecture.","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"The root URL to begin the mapping"},"max_depth":{"type":"integer","description":"Max depth of the mapping. Defines how far from the base URL the crawler can explore","default":1,"minimum":1},"max_breadth":{"type":"integer","description":"Max number of links to follow per level of the tree (i.e., per page)","default":20,"minimum":1},"limit":{"type":"integer","description":"Total number of links the crawler will process before stopping","default":50,"minimum":1},"instructions":{"type":"string","description":"Natural language instructions for the crawler"},"select_paths":{"type":"array","items":{"type":"string"},"description":"Regex patterns to select only URLs with specific path patterns (e.g., /docs/.*, /api/v1.*)","default":[]},"select_domains":{"type":"array","items":{"type":"string"},"description":"Regex patterns to select crawling to specific domains or subdomains (e.g., ^docs\\.example\\.com$)","default":[]},"allow_external":{"type":"boolean","description":"Whether to allow following links that go to external domains","default":false},"categories":{"type":"array","items":{"type":"string","enum":["Careers","Blog","Documentation","About","Pricing","Community","Developers","Contact","Media"]},"description":"Filter URLs using predefined categories like documentation, blog, api, etc","default":[]}},"required":["url"]}},{"name":"google-search","description":"Perform a web search query","input_schema":{"type":"object","properties":{"query":{"type":"string","description":"Search query"},"num":{"type":"number","description":"Number of results (1-10)","minimum":1,"maximum":10}},"required":["query","num"]}}],"mcp_tools_dict":{"tavily-mcp":["tavily-search","tavily-extract","tavily-crawl","tavily-map"],"google-search":["google-search"]},"query":"Find the latest news about global economic trends.","function_call_label":[{"name":"google-search","step":"1","id":"1","mcp_server":"google-search","similar_tools":[{"name":"tavily-search","mcp_server":"tavily-mcp"}],"input":{"query":"global economic trends","num":5},"output":{"status_code":200,"result":{}}}]}] 2 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/base_tool_call_agent/prompt.py: -------------------------------------------------------------------------------- 1 | user_prompt_template_ast = ''' 2 | ## Model Prediction 3 | {pred_tool_result_list} 4 | ## Answer Label 5 | {label_result_list} 6 | ## user query 7 | {query} 8 | ''' 9 | 10 | system_prompt_template_ast_single = ''' 11 | # Role 12 | You are an expert in evaluating the accuracy of intelligent tool calls. You need to evaluate the accuracy of model tool call link predictions based on the "model prediction" and "answer label" given by the user. 13 | # Steps 14 | 1. "Model Prediction" and "Answer Label" are both JSON lists, with each JSON representing the tool to be called, including the tool name and input field as calling parameters. Note that we only care about the last json in the list. Carefully compare all the contents. The 'similar_tools' field provides tool names with similar functionality. 15 | 2. 
Tool accuracy: If the tool in "model prediction" is the tool in "answer label" (the tool name needs to be exactly the same, and if the tool name predicted by the model is a similar tool to the answer label tool, it can also be considered consistent), the tool accuracy is considered to be 1, otherwise it is 0. 16 | 3. Parameter accuracy: Evaluate each parameter of the tool in "model prediction" in turn. If it is consistent with the parameter of the corresponding tool in "answer label" or "similar answer label", the parameter accuracy is considered to be 1, otherwise it is 0. If the tool accuracy is already 0, the parameter accuracy is also directly 0. 17 | 18 | # Notes 19 | 1. Note that we only care about the last json in the "Model Prediction" and "Answer Label" list. 20 | 2. Fuzzy matching: If the parameters of the tool in "Model Prediction" are not completely consistent with the parameters of the corresponding tool in "Answer Label", but the parameter content is similar, the parameter accuracy is considered to be 1. 21 | - For example, the parameter "query" are all rewritten search statements, and the content may be similar but the expression is different. If the content is similar, the parameters can be considered consistent. 22 | - For example, the location parameters are all place names, and there may be different expressions such as "北京", "Beijing", "Beijing Region", etc., which can be considered consistent. 23 | - For example, the longitude and latitude parameters are consistent before one decimal place, which is considered to be consistent. 24 | - No language verification, parameters can be in any language as long as the content is the same. 25 | 3. Skip if standard parameter values ​​are not provided 26 | - If some parameters of the tool in "Model Prediction" do not provide standard parameter values ​​in "Answer Label", they are skipped and considered to be consistent. 27 | 4. It is not required that all parameters of "Answer Label" be included in "Model Prediction". Parameter accuracy only needs to evaluate the parameters in "Model Prediction". 28 | 5. Some parameters are irrelevant to the user query and can be ignored without verification. For example, if the user query is "Find the latest news about cryptocurrency in the United States.", then the parameter "max_results" in the tool "tavily-search" is not important. 29 | 30 | # Output format requirements 31 | 1. Output strictly in json format, do not add any other explanations, formats, prefixes and suffixes, such as ```json, etc. The format and separators are all in English characters. 32 | 2. Includes 2 fields, representing tool accuracy, parameter accuracy. 33 | 3. Output format example: 34 | {{ 35 | "tool_correctness": 1, 36 | "parameter_correctness": 0 37 | }} 38 | ''' 39 | 40 | system_prompt_template_ast_single_reason = ''' 41 | # Role 42 | You are an expert in evaluating the accuracy of intelligent tool calls. You need to evaluate the accuracy of model tool call link predictions based on the "model prediction" and "answer label" given by the user, and give a brief reason. 43 | 44 | # Steps 45 | 1. "Model Prediction" and "Answer Label" are both JSON lists, with each JSON representing the tool to be called, including the tool name and input field as calling parameters. Note that we only care about the last json in the list. Carefully compare all the contents. The 'similar_tools' field provides tool names with similar functionality. 46 | 2. 
Tool accuracy: If the tool in "model prediction" is the tool in "answer label" (the tool name needs to be exactly the same, and if the tool name predicted by the model is a similar tool to the answer label tool, it can also be considered consistent), the tool accuracy is considered to be 1, otherwise it is 0. 47 | 3. Parameter accuracy: Evaluate each parameter of the tool in "model prediction" in turn. If it is consistent with the parameter of the corresponding tool in "answer label" or "similar answer label", the parameter accuracy is considered to be 1, otherwise it is 0. If the tool accuracy is already 0, the parameter accuracy is also directly 0. 48 | 4. Reasons: Give the reasons for the judgment, explain which tool does not match and which parameters are inconsistent, and keep the language as brief as possible. 49 | 50 | # Notes 51 | 1. Note that we only care about the last json in the "Model Prediction" and "Answer Label" list. 52 | 2. Fuzzy matching: If the parameters of the tool in "Model Prediction" are not completely consistent with the parameters of the corresponding tool in "Answer Label", but the parameter content is similar, the parameter accuracy is considered to be 1. 53 | - For example, the query parameters are all rewritten search statements, and the content may be similar but the expression is different. If the content is similar, the parameters can be considered consistent. 54 | - For example, the location parameters are all place names, and there may be different expressions such as "北京", "Beijing", "Beijing Region", etc., which can be considered consistent. 55 | - For example, the longitude and latitude parameters are consistent before one decimal place, which is considered to be consistent. 56 | - No language verification, parameters can be in any language as long as the content is the same. 57 | 3. Skip if standard parameter values ​​are not provided 58 | - If some parameters of the tool in "Model Prediction" do not provide standard parameter values ​​in "Answer Label", they are skipped and considered to be consistent. 59 | 4. It is not required that all parameters of "Answer Label" be included in "Model Prediction". Parameter accuracy only needs to evaluate the parameters in "Model Prediction". 60 | 61 | # Output format requirements 62 | 1. Output strictly in json format, do not add any other explanations, formats, prefixes and suffixes, such as ```json, etc. The format and separators are all in English characters. 63 | 2. Includes 3 fields, representing tool accuracy, parameter accuracy, and judgment reasons 64 | 3. Output format example: 65 | {{ 66 | "tool_correctness": 1, 67 | "parameter_correctness": 0, 68 | "reason": "..." 69 | }} 70 | ''' 71 | 72 | system_prompt_template_ast_multiple = ''' 73 | # Role 74 | You are an expert in evaluating the accuracy of intelligent tool calls. You need to evaluate the accuracy of model tool call link predictions based on the "model prediction" and "answer label" given by the user. 75 | # Steps 76 | 1. "Model Prediction" and "Answer Label" are both JSON lists, with each JSON representing the tool to be called, including the tool name and input field as calling parameters. Carefully compare all the contents. The 'similar_tools' field provides tool names with similar functionality. 77 | 2. 
Tool accuracy: If the tool in "model prediction" is the tool in "answer label" (the tool name needs to be exactly the same, and if the tool name predicted by the model is a similar tool to the answer label tool, it can also be considered consistent), the tool accuracy is considered to be 1, otherwise it is 0. 78 | 3. Parameter accuracy: Evaluate each parameter of the tool in "model prediction" in turn. If it is consistent with the parameter of the corresponding tool in "answer label" or "similar answer label", the parameter accuracy is considered to be 1, otherwise it is 0. If the tool accuracy is already 0, the parameter accuracy is also directly 0. 79 | 80 | # Notes 81 | 1. Fuzzy matching: If the parameters of the tool in "Model Prediction" are not completely consistent with the parameters of the corresponding tool in "Answer Label", but the parameter content is similar, the parameter accuracy is considered to be 1. 82 | - For example, the query parameters are all rewritten search statements, and the content may be similar but the expression is different. If the content is similar, the parameters can be considered consistent. 83 | - For example, the location parameters are all place names, and there may be different expressions such as "北京", "Beijing", "Beijing Region", etc., which can be considered consistent. 84 | - For example, the longitude and latitude parameters are consistent before one decimal place, which is considered to be consistent. 85 | - No language verification, parameters can be in any language as long as the content is the same. 86 | 2. Skip if standard parameter values ​​are not provided 87 | - If some parameters of the tool in "Model Prediction" do not provide standard parameter values ​​in "Answer Label", they are skipped and considered to be consistent. 88 | 3. It is not required that all parameters of "Answer Label" be included in "Model Prediction". Parameter accuracy only needs to evaluate the parameters in "Model Prediction". 89 | 90 | # Output format requirements 91 | 1. Output strictly in json format, do not add any other explanations, formats, prefixes and suffixes, such as ```json, etc. The format and separators are all in English characters. 92 | 2. Includes 2 fields, representing tool accuracy, parameter accuracy. 93 | 3. Output format example: 94 | {{ 95 | "tool_correctness": 1, 96 | "parameter_correctness": 0 97 | }} 98 | ''' 99 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/base_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List, Dict, Any, Optional 3 | from ..global_variables import * 4 | 5 | class BaseModelAPIProvider: 6 | """ 7 | 8 | Usage: 9 | model_provider = _global_model_provider[MODEL_SELECTION_GPT4O] if MODEL_SELECTION_GPT4O in _global_model_provider else None 10 | result = model_provider.api_chat(messages) if model_provider is not None else {} 11 | completion = result[KEY_COMPLETION] 12 | 13 | """ 14 | 15 | def __init__(self, model_name: str): 16 | """ 17 | Args: 18 | model_name: e.g. 
claude-3.7 19 | """ 20 | self.model_name = model_name 21 | 22 | def api_chat(self, messages: List[Any], **kwargs) -> Dict[str, Any]: 23 | """ 24 | Args: 25 | messages: List[Any] 26 | **kwargs: other parameters 27 | 28 | Returns: 29 | str: response 30 | """ 31 | result = { 32 | KEY_FUNCTION_CALL: {}, 33 | KEY_COMPLETION: "", 34 | KEY_REASON_CONTENT: "" 35 | } 36 | return result 37 | 38 | def api_function_call(self, messages: List[Any], tools: list, **kwargs) -> Dict[str, Any]: 39 | """ 40 | 41 | Args: 42 | messages: list of json message 43 | tools: available tool json 44 | **kwargs: other parameters 45 | 46 | Returns: 47 | Dict: result 48 | """ 49 | result = { 50 | KEY_FUNCTION_CALL: {}, 51 | KEY_COMPLETION: "", 52 | KEY_REASON_CONTENT: "" 53 | } 54 | return result 55 | 56 | def function_call_result_common_mapper(tool_call): 57 | """ 58 | This wrapper is a common mapper to wrap the result of OpenAI/Claude Stype function call results, thinking/no thinking models 59 | Args: 60 | tool_call: 61 | { 62 | "id": "call_d6f4ed29ce614390b99a05", 63 | "function": { 64 | "arguments": "{\"url\": \"https://www.stackoverflow.com\", \"browserType\": \"chromium\"}", 65 | "name": "playwright_navigate" 66 | }, 67 | "type": "function", 68 | "index": 0 69 | } 70 | 71 | Return: 72 | tools_choice_response 73 | 74 | { 75 | "function_name": "playwright_navigate", 76 | "function_arguments": "{\"url\": \"https://www.stackoverflow.com\", \"browserType\": \"chromium\"}", 77 | "is_function_call": true, 78 | "id": "call_d6f4ed29ce614390b99a05" 79 | } 80 | completion: str 81 | reasoningContent: str 82 | """ 83 | if tool_call is None or len(tool_call) == 0: 84 | return {}, "", "" 85 | 86 | tools_choice_response = { 87 | 'function_name': '', 88 | 'function_arguments': '', 89 | 'is_function_call': False, 90 | 'id': '' 91 | } 92 | completion = "" 93 | reasoningContent = "" 94 | try: 95 | tool_id = tool_call["id"] if "id" in tool_call else "" 96 | function = tool_call["function"] if "function" in tool_call else {} 97 | function_arguments = function["arguments"] if "arguments" in function else {} 98 | function_name = function["name"] if "name" in function else "" 99 | 100 | tools_choice_response["is_function_call"] = True 101 | tools_choice_response["function_name"] = function_name 102 | tools_choice_response["function_arguments"] = function_arguments 103 | tools_choice_response["id"] = tool_id 104 | except Exception as e: 105 | logging.error(f"Failed to run tool_result_to_claude_mapper {e}") 106 | return tools_choice_response, completion, reasoningContent 107 | 108 | def tool_call_parameter_wrapper(model: str, tool_id: str, tool_name: str, tool_arguments: dict): 109 | 110 | message_tool_parameter = {} 111 | if "gpt" in model: 112 | # OpenAI Claude Format 113 | message_tool_parameter = tool_call_param_openai_wrapper(tool_id, tool_name, tool_arguments) 114 | elif "claude" in model: 115 | # Claude Format 116 | message_tool_parameter = tool_call_param_claude_wrapper(tool_id, tool_name, tool_arguments) 117 | elif "qwen" in model: 118 | # Qwen Wrapper 119 | message_tool_parameter = tool_call_param_qwen_wrapper(tool_id, tool_name, tool_arguments) 120 | else: 121 | message_tool_parameter = tool_call_param_openai_wrapper(tool_id, tool_name, tool_arguments) 122 | return message_tool_parameter 123 | 124 | def tool_call_result_wrapper(model: str, tool_id: str, tool_name: str, tool_result: dict): 125 | 126 | message_tool_result = {} 127 | if "gpt" in model: 128 | # OpenAI Claude Format 129 | message_tool_result = 
tool_call_result_openai_wrapper(tool_id, tool_name, tool_result) 130 | elif "claude" in model: 131 | # Claude Format 132 | message_tool_result = tool_call_result_claude_wrapper(tool_id, tool_result) 133 | elif "qwen" in model: 134 | # Qwen Wrapper 135 | message_tool_result = tool_call_result_qwen_wrapper(tool_id, tool_result) 136 | else: 137 | message_tool_result = tool_call_result_openai_wrapper(tool_id, tool_name, tool_result) 138 | return message_tool_result 139 | 140 | def tools_openai_wrapper(tools): 141 | tools_wrapped = [{ 142 | "type": "function", 143 | "function":{ 144 | "name": tool["name"] if "name" in tool else "", 145 | "description": tool["description"] if "description" in tool else "", 146 | "parameters": tool["input_schema"] if "input_schema" in tool else {} 147 | } 148 | } for tool in tools] 149 | return tools_wrapped 150 | 151 | def tool_call_param_openai_wrapper(tool_id: str, tool_name: str, arguments: Dict, **kwargs): 152 | 153 | context_id = kwargs["context_id"] if "context_id" in kwargs else "" 154 | session_id = kwargs["session_id"] if "session_id" in kwargs else "" 155 | 156 | oai_tool_call = { 157 | "role": "assistant", 158 | "content": None, 159 | "tool_calls": [ 160 | { 161 | "id": tool_id, 162 | "type": "function", 163 | "function": { 164 | "name": tool_name, 165 | "arguments": json.dumps(arguments), 166 | }, 167 | } 168 | ], 169 | } 170 | if context_id != "": 171 | oai_tool_call["contextId"] = context_id 172 | if session_id != "": 173 | oai_tool_call["sessionId"] = session_id 174 | return oai_tool_call 175 | 176 | def tool_call_param_claude_wrapper(tool_id: str, tool_name: str, arguments: Dict): 177 | claude_tool_assistant = { 178 | "role": "assistant", 179 | "content": [ 180 | { 181 | "type": "tool_use", 182 | "id": tool_id, 183 | "name": tool_name, 184 | "input": arguments 185 | } 186 | ] 187 | } 188 | return claude_tool_assistant 189 | 190 | def tool_call_param_claude_bedrock_wrapper(tool_id: str, tool_name: str, arguments: Dict): 191 | claude_tool_assistant = { 192 | "role": "assistant", 193 | "content": [ 194 | { 195 | "toolUse": { 196 | "id": tool_id, 197 | "name": tool_name, 198 | "input": arguments 199 | } 200 | } 201 | ] 202 | } 203 | return claude_tool_assistant 204 | 205 | def tool_call_param_qwen_wrapper(tool_id: str, tool_name: str, arguments: Dict): 206 | qwen_tool_assistant = { 207 | "role": "assistant", 208 | "content": None, 209 | "tool_calls": [ 210 | { 211 | "id": tool_id, 212 | "type": "function", 213 | "function": { 214 | "name": tool_name, 215 | "arguments": json.dumps(arguments) 216 | } 217 | } 218 | ] 219 | } 220 | return qwen_tool_assistant 221 | 222 | def tool_call_result_openai_wrapper(tool_id: str, tool_name: str, result: Any): 223 | """ 224 | """ 225 | oai_tool_result_msg = { 226 | "tool_call_id": tool_id, 227 | "role": "tool", 228 | "name": tool_name, 229 | "content": json.dumps(result), # Must be a string 230 | } 231 | return oai_tool_result_msg 232 | 233 | def tool_call_result_claude_wrapper(tool_id: str, result: Any): 234 | """ 235 | """ 236 | claude_tool_result_msg = { 237 | "role": "user", 238 | "content": [ 239 | { 240 | "type": "tool_result", 241 | "tool_use_id": tool_id, # from the API response 242 | "content": json.dumps(result) # from running your tool 243 | } 244 | ] 245 | } 246 | return claude_tool_result_msg 247 | 248 | def tool_call_result_claude_bedrock_wrapper(tool_id: str, result: Any): 249 | """ 250 | """ 251 | tool_result_msg = { 252 | "role": "user", 253 | "content": [ 254 | { 255 | "toolResult": { 256 | 
"toolUseId": tool_id, 257 | "content": [ 258 | {"text": json.dumps(result)} 259 | ], 260 | } 261 | } 262 | ] 263 | } 264 | return tool_result_msg 265 | def tool_call_result_qwen_wrapper(tool_id: str, result: Any): 266 | """ 267 | """ 268 | qwen_tool_result_msg = { 269 | "role": "tool", 270 | "content": [{"type": "text", "text": json.dumps(result)}], 271 | "tool_call_id": tool_id 272 | } 273 | return qwen_tool_result_msg 274 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This is the entry function for starting the entire project, receiving all command line parameters 3 | Parameters include: 4 | --input_file: Input file path, default is data/demo/demo_v0.json 5 | --category: Data category, such as browser, search, default is demo 6 | --model: Model, such as GPT4o, default is GPT4o 7 | --stage: Stage, such as demo, generation, tool_call, default is demo 8 | --metric: Metric, such as acc, pass_k, default is pass_k 9 | --pass_k: Parameter k, such as 1,5,10, default is 1 10 | --agent: Execution agent, base, base_tool_rag, base_multi-agent, default is base 11 | --mcp_config: MCP configuration file path, default is mcp_marketplace/mcp_config.json 12 | --data_version: Data version, such as v0, v1, default is v0 13 | --log_file: Log file name for resume functionality, optional. If not provided, auto-generates based on input file and timestamp. 14 | 15 | stage: 16 | 1. If stage is generation, call run_data_generator.py, generate data according to specified category and data_version. 17 | 3. If stage is tool_call, call run_tool_call.py according to specified model, directly perform calling and evaluation. 18 | 4. If stage is all, first run run_data_generator.py to generate data, then call run_tool_call.py for calling and evaluation. 19 | 5. If stage is demo, use all default parameters, first run run_data_generator.py to generate data, then call run_tool_call.py for calling and evaluation. 20 | 21 | Notes: 22 | 1. When stage is demo, all default parameters must be used, no other parameters can be specified, data will be generated first, then tool calling and evaluation will be performed. 23 | 2. When stage is generation, category and data_version must be filled, remind customers to provide all mcp tools files in the category directory under mcp_marketplace, format reference existing files. 24 | 3. When stage is tool_call, input_file, category, model must be filled 25 | 4. When stage is all, category, data_version, model must be filled, input_file is the data generated in the generation stage. 26 | 5. Print all parameters to remind users when running. 27 | 6. The tool_call stage now supports incremental logging and resume functionality. Logs are saved after each task completion, and the system can resume from where it left off if interrupted. 
28 | ''' 29 | 30 | import argparse 31 | import sys 32 | from pathlib import Path 33 | 34 | 35 | from src.mcp_tool_bench import * 36 | from src.mcp_tool_bench.agents.data_generator_agent.run_data_generator import run_data_generation 37 | from src.mcp_tool_bench.agents.base_tool_call_agent.run_tool_call import run_benchmark 38 | 39 | # Default parameter values 40 | DEFAULT_ARGS = { 41 | 'input_file': 'data/demo/demo_v0.json', 42 | 'category': 'demo', 43 | 'model': 'gpt-4o', 44 | 'stage': 'demo', 45 | 'metric': 'pass_k', 46 | 'pass_k': '1', 47 | 'agent': 'base', 48 | 'mcp_config': 'mcp_marketplace/mcp_config.json', 49 | 'data_version': 'v0', 50 | 'log_file': None, 51 | 'evaluation_trial_per_task': 5, 52 | 'llm_as_judge_model': "gpt-4o" 53 | } 54 | 55 | def parse_arguments(): 56 | """Parse command line arguments""" 57 | parser = argparse.ArgumentParser(description='Project entry function') 58 | parser.add_argument('--input_file', default=DEFAULT_ARGS['input_file'], help='Input file path for tool_call stage, default is {}'.format(DEFAULT_ARGS['input_file'])) 59 | parser.add_argument('--category', default=DEFAULT_ARGS['category'], help='Data category, such as browser, search, default is {}'.format(DEFAULT_ARGS['category'])) 60 | parser.add_argument('--model', default=DEFAULT_ARGS['model'], help='Model, such as GPT4o, default is {}'.format(DEFAULT_ARGS['model'])) 61 | parser.add_argument('--stage', default=DEFAULT_ARGS['stage'], choices=['demo', 'generation', 'tool_call', 'all'], help='Stage, such as demo, generation, tool_call, all, default is {}'.format(DEFAULT_ARGS['stage'])) 62 | parser.add_argument('--metric', default=DEFAULT_ARGS['metric'], help='Metric, such as acc, pass_k, default is {}'.format(DEFAULT_ARGS['metric'])) 63 | parser.add_argument('--pass_k', type=str, default=DEFAULT_ARGS['pass_k'], help='Parameter k, such as 1,5,10, default is {}'.format(DEFAULT_ARGS['pass_k'])) 64 | parser.add_argument('--agent', default=DEFAULT_ARGS['agent'], help='Execution agent, such as base, base_tool_rag, base_multi-agent, default is {}'.format(DEFAULT_ARGS['agent'])) 65 | parser.add_argument('--mcp_config', default=DEFAULT_ARGS['mcp_config'], help='MCP configuration file path, default is {}'.format(DEFAULT_ARGS['mcp_config'])) 66 | parser.add_argument('--data_version', default=DEFAULT_ARGS['data_version'], help='Data version, such as v0, v1, default is {}'.format(DEFAULT_ARGS['data_version'])) 67 | parser.add_argument('--log_file', default=DEFAULT_ARGS['log_file'], help='Specify log file name for resume functionality. 
If not provided, will auto-generate based on input file and timestamp.') 68 | parser.add_argument('--evaluation_trial_per_task', type=int, default=DEFAULT_ARGS['evaluation_trial_per_task'], help='Calculation Pass@K Number of Trials...') 69 | parser.add_argument('--llm_as_judge_model', type=str, default=DEFAULT_ARGS['llm_as_judge_model'], help='LLM Model Used to determine the parameters are correctly aligned with ground-truth, especial in search tool that query is rewritten') 70 | 71 | return parser.parse_args() 72 | 73 | def validate_arguments(args): 74 | """Validate the validity of arguments""" 75 | if args.stage == 'demo': 76 | # Check if non-default parameters are used in demo stage 77 | # Check if there are non-default parameters (excluding log_file which is optional) 78 | non_default_args = [] 79 | for arg_name in vars(args): 80 | if arg_name == 'log_file': # log_file is optional and allowed in demo 81 | continue 82 | current_value = getattr(args, arg_name) 83 | default_value = DEFAULT_ARGS.get(arg_name) 84 | if current_value != default_value: 85 | non_default_args.append(f"--{arg_name}") 86 | 87 | if non_default_args: 88 | print("Error: demo stage does not support specifying other parameters") 89 | print(f"Detected non-default parameters: {', '.join(non_default_args)}") 90 | print("demo stage will use all default parameters, please run directly: python run.py") 91 | return False 92 | 93 | return True 94 | 95 | if args.stage == 'generation': 96 | print("Note: The data generation module requires to provide all mcp tools files in the category directory under mcp_marketplace, format reference existing files.") 97 | if not args.category or not args.data_version: 98 | print("Error: generation stage must fill category, data_version") 99 | return False 100 | 101 | if args.stage == 'tool_call': 102 | if not args.input_file or not args.category or not args.model: 103 | print("Error: tool_call stage must fill input_file, category, model") 104 | return False 105 | 106 | if args.stage == 'all': 107 | print("Note: The data generation module requires to provide all mcp tools files in the category directory under mcp_marketplace, format reference existing files.") 108 | if not args.category or not args.data_version or not args.model: 109 | print("Error: all stage must fill category, data_version, model") 110 | return False 111 | 112 | return True 113 | 114 | 115 | def print_arguments(args): 116 | """Print all arguments""" 117 | print("=== Running Parameters ===") 118 | for arg, value in vars(args).items(): 119 | print(f"{arg}: {value}") 120 | print("===============") 121 | 122 | 123 | def main(): 124 | """Main function""" 125 | args = parse_arguments() 126 | print_arguments(args) 127 | 128 | if not validate_arguments(args): 129 | sys.exit(1) 130 | 131 | if args.stage == 'demo': 132 | # demo stage: use default parameters, generate data first, then perform tool calling and evaluation 133 | print("=" * 50) 134 | print("Executing demo stage: using default parameters") 135 | print("=" * 50) 136 | 137 | print("\n【Step 1】Data Generation") 138 | print("-" * 30) 139 | run_data_generation(args.category, args.data_version, args.mcp_config) 140 | 141 | elif args.stage == 'generation': 142 | # generation stage: generate data 143 | print("=" * 50) 144 | print("Executing generation stage: generate data") 145 | print("=" * 50) 146 | 147 | print("\n【Step 1】Data Generation") 148 | print("-" * 30) 149 | run_data_generation(args.category, args.data_version, args.mcp_config) 150 | 151 | print("\n" + "=" * 50) 152 | 
print("generation stage execution completed") 153 | print("=" * 50) 154 | 155 | elif args.stage == 'tool_call': 156 | # tool_call stage: tool calling and evaluation 157 | print("=" * 50) 158 | print("Executing tool_call stage: tool calling and evaluation") 159 | print("=" * 50) 160 | 161 | print("\n【Step 1】Tool Calling and Evaluation") 162 | print("-" * 30) 163 | run_benchmark(args) 164 | 165 | print("\n" + "=" * 50) 166 | print("tool_call stage execution completed") 167 | print("=" * 50) 168 | 169 | elif args.stage == 'all': 170 | # all stage: generate data first, then perform tool calling and evaluation 171 | print("=" * 50) 172 | print("Executing all stage: generate data first, then perform tool calling and evaluation") 173 | print("=" * 50) 174 | 175 | print("\n【Step 1】Data Generation") 176 | print("-" * 30) 177 | run_data_generation(args.category, args.data_version, args.mcp_config) 178 | 179 | print("\n【Step 2】Tool Calling and Evaluation") 180 | print("-" * 30) 181 | # Set input_file to the generated data file 182 | args.input_file = f"data/{args.category}/{args.category}_{args.data_version}.json" 183 | run_benchmark(args) 184 | 185 | print("\n" + "=" * 50) 186 | print("Full pipeline execution completed") 187 | print("=" * 50) 188 | 189 | 190 | if __name__ == "__main__": 191 | main() 192 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/data_generator_agent/utils/prompt_reference.py: -------------------------------------------------------------------------------- 1 | browser_candidate_references = { 2 | "url": [ 3 | "https://www.baidu.com", 4 | "https://www.taobao.com", 5 | "https://www.jd.com", 6 | "https://www.qq.com", 7 | "https://www.sina.com.cn", 8 | "https://www.163.com", 9 | "https://www.sohu.com", 10 | "https://www.zhihu.com", 11 | "https://www.douban.com", 12 | "https://www.bilibili.com", 13 | "https://www.weibo.com", 14 | "https://www.tmall.com", 15 | "https://www.alipay.com", 16 | "https://www.ctrip.com", 17 | "https://www.meituan.com", 18 | "https://www.douyin.com", 19 | "https://www.xiaohongshu.com", 20 | "https://www.youku.com", 21 | "https://www.iqiyi.com", 22 | "https://www.tencent.com", 23 | "https://www.xiaomi.com", 24 | "https://www.huawei.com", 25 | "https://www.oppo.com", 26 | "https://www.vivo.com", 27 | "https://www.oneplus.com", 28 | "https://www.realme.com", 29 | "https://www.meizu.com", 30 | "https://www.zte.com", 31 | "https://www.alibaba.com", 32 | "https://www.aliexpress.com", 33 | "https://www.amazon.cn", 34 | "https://www.kuaishou.com", 35 | "https://www.yy.com", 36 | "https://www.huya.com", 37 | "https://www.douyu.com", 38 | "https://www.v.qq.com", 39 | "https://www.mgtv.com", 40 | "https://www.le.com", 41 | "https://www.pptv.com", 42 | "https://www.fun.tv", 43 | "https://www.cntv.cn", 44 | "https://www.cctv.com", 45 | "https://www.people.com.cn", 46 | "https://www.xinhuanet.com", 47 | "https://www.chinanews.com.cn", 48 | "https://www.gmw.cn", 49 | "https://www.cyol.com", 50 | "https://www.jschina.com.cn", 51 | "https://www.ynet.com", 52 | "https://www.thepaper.cn", 53 | "https://www.jiemian.com", 54 | "https://www.caixin.com" 55 | ], 56 | "iframeSelector": [ 57 | "#iframeResult", 58 | "#iframeContent", 59 | ".iframe-container iframe", 60 | "#externalFrame", 61 | "#embedFrame", 62 | "#playerFrame", 63 | "#videoFrame", 64 | "#loginFrame", 65 | "#paymentFrame", 66 | "#chatFrame", 67 | "#adFrame", 68 | "#previewFrame", 69 | "#widgetFrame", 70 | "#contentFrame", 71 | "#mainFrame", 72 | 
"#sideFrame", 73 | "#popupFrame", 74 | "#modalFrame", 75 | "#dynamicFrame", 76 | "#thirdPartyFrame" 77 | ], 78 | "selector": [ 79 | "#username", 80 | "#password", 81 | ".login-btn", 82 | ".search-input", 83 | ".submit-btn", 84 | "#main-nav", 85 | ".product-item", 86 | ".add-to-cart", 87 | "#checkout-btn", 88 | ".user-avatar", 89 | ".dropdown-menu", 90 | "#home-link", 91 | "#contact-form", 92 | ".news-item", 93 | ".video-player", 94 | "#comment-box", 95 | ".rating-stars", 96 | "#footer-links", 97 | ".social-share", 98 | ".cookie-banner" 99 | ], 100 | "key": [ 101 | "Enter", 102 | "Tab", 103 | "ArrowDown", 104 | "ArrowUp", 105 | "ArrowLeft", 106 | "ArrowRight", 107 | "Escape", 108 | "Backspace", 109 | "Delete", 110 | "Home", 111 | "End", 112 | "PageUp", 113 | "PageDown", 114 | "F5", 115 | "Control", 116 | "Alt", 117 | "Shift", 118 | "Space", 119 | "a", 120 | "1" 121 | ], 122 | "script": [ 123 | "document.title", 124 | "window.scrollTo(0, document.body.scrollHeight)", 125 | "alert('test')", 126 | "console.log('hello world')", 127 | "document.querySelector('button').click()", 128 | "window.location.href", 129 | "document.cookie", 130 | "localStorage.getItem('token')", 131 | "sessionStorage.setItem('key', 'value')", 132 | "document.querySelectorAll('.item').length", 133 | "window.innerWidth", 134 | "window.innerHeight", 135 | "document.readyState", 136 | "performance.now()", 137 | "navigator.userAgent", 138 | "document.querySelector('input').value = 'test'", 139 | "document.activeElement.blur()", 140 | "window.history.back()", 141 | "window.print()", 142 | "document.execCommand('copy')" 143 | ], 144 | "value": [ 145 | "testuser", 146 | "password123", 147 | "example@example.com", 148 | "13800138000", 149 | "2023-01-01", 150 | "100.00", 151 | "https://example.com", 152 | "123456", 153 | "18" 154 | ] 155 | } 156 | 157 | finance_candidate_references = { 158 | "symbol_list": [ 159 | # Global Market Stock 160 | "AAPL", # Apple Inc. 161 | "MSFT", # Microsoft Corporation 162 | "GOOGL", # Alphabet Inc. (Google) 163 | "AMZN", # Amazon.com Inc. 164 | "NVDA", # NVIDIA Corporation 165 | "META", # Meta Platforms Inc. - Meta/Facebook 166 | "TSLA", # Tesla Inc. 167 | "BRK.A", # Berkshire Hathaway 168 | "BRK.B", # Berkshire Hathaway 169 | "LLY", # Eli Lilly and Company 170 | "TSM", # Taiwan Semiconductor 171 | "WMT", # Walmart Inc. 172 | "JPM", # JPMorgan Chase & Co. 173 | "V", # Visa Inc. 174 | "PG", # Procter & Gamble 175 | "UNH", # UnitedHealth Group 176 | "HD", # Home Depot 177 | "MA", # Mastercard 178 | "BAC", # Bank of America 179 | "ABBV", # AbbVie Inc. 180 | "PFE", # Pfizer Inc. 181 | "KO", # Coca-Cola Company 182 | "PEP", # PepsiCo Inc. 183 | "MRK", # Merck & Co. 184 | "CSCO", # Cisco Systems 185 | "ADBE", # Adobe Inc. 186 | "NFLX", # Netflix Inc. 187 | "CRM", # Salesforce Inc. 
188 | "ACN", # Accenture plc 189 | "TMO", # Thermo Fisher Scientific 190 | # China Mainland Stock List 191 | "SH600519", # 贵州茅台 192 | "SH600036", # 招商银行 193 | "SH600900", # 长江电力 194 | "SH600276", # 恒瑞医药 195 | "SH600887", # 伊利股份 196 | "SH600031", # 三一重工 197 | "SH600000", # 浦发银行 198 | "SH600028", # 中国石化 199 | "SH600030", # 中信证券 200 | "SH600104", # 上汽集团 201 | "SZ000858", # 五粮液 202 | "SZ000002", # 万科A 203 | "SZ000001", # 平安银行 204 | "SZ000333", # 美的集团 205 | "SZ000651", # 格力电器 206 | "SZ000725", # 京东方A 207 | "SZ000063", # 中兴通讯 208 | "SZ002594", # 比亚迪 209 | "SZ300750", # 宁德时代 210 | "SZ300059", # 东方财富 211 | # HKEX 212 | "00700", # 腾讯控股 - Tencent Holdings 213 | "03690", # 美团点评 - Meituan Dianping 214 | "09988", # 阿里巴巴 - Alibaba Group 215 | "01810", # 小米集团 - Xiaomi Corporation 216 | "09618", # 京东集团 - JD.com 217 | "09868", # 小鹏汽车 - XPeng Motors 218 | "02015", # 理想汽车 - Li Auto 219 | "09866", # 蔚来汽车 - NIO Inc. 220 | "02382", # 舜宇光学 - Sunny Optical 221 | "00780", # 同程旅行 - Tongcheng Travel 222 | "02331", # 李宁 - Li Ning 223 | "02020", # 安踏体育 - ANTA Sports 224 | "03692", # 翰森制药 - Hansoh Pharmaceutical 225 | "01177", # 中国生物制药 - Sino Biopharmaceutical 226 | "01093", # 石药集团 - CSPC Pharmaceutical 227 | "02269", # 药明生物 - WuXi Biologics 228 | "03613", # 同仁堂国药 - Tong Ren Tang Technologies 229 | "00883", # 中国海洋石油 - CNOOC 230 | "00939", # 建设银行 - China Construction Bank 231 | "01398" # 工商银行 - Industrial and Commercial Bank of China 232 | ], 233 | "market": [ 234 | "US", # United States 235 | "HK", # HKEX 236 | "CN_MAINLAND", # China Mainland Stock A 237 | "LSE", # London Stock Exchange 238 | "NSE_INDIA", # National Stock Exchange India 239 | "JPX", # Japan 240 | "ASX", # Australia 241 | "TSX", # Toronto 242 | "FWB", # Frankfurt 243 | "EURONEXT", # EURONEXT 244 | "SSE", # Shanghia 245 | "SZSE", # Shenzhen 246 | "KRX", # Korean 247 | "SGX", # Singapore 248 | "TSE", # Tokyo 249 | "BSE", # Bombay 250 | "MOEX", # Moscow 251 | ] 252 | } 253 | 254 | search_candidate_references = { 255 | } 256 | 257 | map_candidate_references = { 258 | } 259 | 260 | filesystem_candidate_references = { 261 | } 262 | 263 | pay_candidate_references = { 264 | } 265 | 266 | browser_special_needs_description = ''' 267 | 1. 对于iframe操作,需要先定位iframe再操作内部元素 268 | 2. 文件上传操作需要特殊处理,不能直接设置input值 269 | 3. 动态加载的内容需要等待元素出现 270 | 4. 跨域iframe有安全限制需要注意 271 | ''' 272 | 273 | finance_special_needs_description = ''' 274 | 1. Note that the stock codes and market codes in the candidate set must match exactly, for example, SH600519 must match CN_MAINLAND, AAPL must match US, and there cannot be a pairing of stock codes that do not exist in the market. 275 | 2. The values of the candidate set are all from the list provided by the Parameter candidate value reference. 276 | 3. If it is easy to determine the market it belongs to from the stock code, the query does not reflect the market, which is closer to the user's daily inquiries, such as directly asking "What is the current share price of ?" instead of "What is the current share price of in the market?". Of course, the label parameter is still given in full as usual. 
277 | ''' 278 | 279 | search_special_needs_description = ''' 280 | ''' 281 | 282 | map_special_needs_description = ''' 283 | ''' 284 | 285 | filesystem_special_needs_description = ''' 286 | ''' 287 | 288 | pay_special_needs_description = ''' 289 | ''' 290 | 291 | candidate_reference_list = { 292 | "browser": browser_candidate_references, 293 | "finance": finance_candidate_references, 294 | "search": search_candidate_references, 295 | "map": map_candidate_references, 296 | "filesystem": filesystem_candidate_references, 297 | "pay": pay_candidate_references 298 | } 299 | 300 | special_needs_description_list = { 301 | "browser": browser_special_needs_description, 302 | "finance": finance_special_needs_description, 303 | "search": search_special_needs_description, 304 | "map": map_special_needs_description, 305 | "filesystem": filesystem_special_needs_description, 306 | "pay": pay_special_needs_description 307 | } 308 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/base_tool_call_agent/check_functions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../../..'))) 6 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../..'))) 7 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../'))) 8 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, './'))) 9 | 10 | from src.mcp_tool_bench.agents.base_tool_call_agent.prompt import * 11 | from src.mcp_tool_bench.evaluation.evaluation_utils import estimate_pass_at_k, base_error_analysis 12 | import html 13 | import re 14 | from bs4 import BeautifulSoup 15 | from src.mcp_tool_bench.global_variables import * 16 | from src.mcp_tool_bench.model_utils.model_provider import get_model_provider 17 | from src.mcp_tool_bench.model_utils.base_api import * 18 | import json 19 | import logging 20 | from typing import List, Dict, Tuple 21 | from tqdm import tqdm 22 | 23 | def decode_html_entities(s): 24 | """Decode HTML entities""" 25 | # Try using html.unescape 26 | first_decode = html.unescape(s) 27 | # Check if still contains undecoded entities 28 | if "&" in first_decode and ";" in first_decode: 29 | # Use BeautifulSoup for further decoding 30 | soup = BeautifulSoup(first_decode, "html.parser") 31 | second_decode = soup.get_text() 32 | return second_decode 33 | return first_decode 34 | 35 | def auto_fix_unclosed_quotes(data): 36 | """ 37 | Automatically add space after colon in key-value pairs, e.g., convert 'key:value' to 'key: value' 38 | """ 39 | if isinstance(data, list): 40 | return data 41 | # Use regex to match cases where colon is not followed by space, and add space 42 | data = re.sub(r'(?m)^(\s*[^#\s][^:]*):([^\s])', r'\1: \2', data) 43 | 44 | lines = data.split("\n") 45 | fixed_lines = [] 46 | for line in lines: 47 | # Detect and fix unclosed quotes 48 | if line.count('"') % 2 != 0: 49 | line = line + '"' # Append a quote to close 50 | fixed_lines.append(line) 51 | return "\n".join(fixed_lines) 52 | 53 | def process_response(response_text): 54 | """Process GPT response text""" 55 | if not response_text: 56 | return "" 57 | 58 | raw_val = decode_html_entities(response_text) 59 | raw_val = auto_fix_unclosed_quotes(raw_val) 60 | decoded_json_str = html.unescape(raw_val) 61 | decoded_json_str = decoded_json_str.replace("```json\n", "").replace("```", "").replace("\n", "") 
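    # Illustrative note (hypothetical judge reply, not taken from a real run): a raw
    # completion such as '```json\n{"tool_correctness": 1, "parameter_correctness": 0}\n```'
    # should reduce to '{"tool_correctness": 1, "parameter_correctness": 0}' after the
    # entity decoding, quote fixing, and fence/newline stripping above, so the caller
    # (e.g. check_ast below) can pass the returned string directly to json.loads.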
62 | return decoded_json_str 63 | 64 | def check_ast(pred_tool_result_list: List[Dict], label_result_list: List[Dict], query: str, model_name: str = MODEL_SELECTION_GPT4O) -> Tuple[bool, bool]: 65 | """ 66 | Check the AST of tool calls 67 | model_name: required, the LLM as a judge can verify the parameters are aligned. For example, the "query" used in search tools may be 68 | rewrited by Function Call models. And LLM as a judge need to determine if the query is correctedly rewritten that match the original query. 69 | Default: Using GPT4o 70 | """ 71 | try: 72 | if pred_tool_result_list == label_result_list: 73 | return True, True 74 | label_step = 1 75 | predict_step = 1 76 | if (label_step == 1 and predict_step == 1): 77 | user_prompt = user_prompt_template_ast.format(pred_tool_result_list=pred_tool_result_list, label_result_list=label_result_list, query=query) 78 | system_prompt = system_prompt_template_ast_single.format() 79 | messages = [ 80 | { 81 | "role": "system", 82 | "content": system_prompt 83 | }, 84 | { 85 | "role": "user", 86 | "content": user_prompt 87 | } 88 | ] 89 | # print("messages: ", messages) 90 | model_provider = get_model_provider(model_name) 91 | output = model_provider.api_chat(messages, wait_time=5) if model_provider is not None else {} 92 | raw_response = output[KEY_COMPLETION] if KEY_COMPLETION in output else "" 93 | # Normal chat: process string 94 | if isinstance(raw_response, str): 95 | result = process_response(raw_response) 96 | try: 97 | result = json.loads(result) 98 | except Exception as e: 99 | logging.error(f" Failed to parse json {e}") 100 | return False, False 101 | # print("[debug] check_ast result: ", result) 102 | tool_correctness = result["tool_correctness"] if "tool_correctness" in result else 0 103 | parameter_correctness = result["parameter_correctness"] if "parameter_correctness" in result else 0 104 | 105 | else: 106 | ## multiple 107 | return False, False 108 | return tool_correctness, parameter_correctness 109 | 110 | except Exception as e: 111 | print (f"check_ast failed with error {e}") 112 | return 0, 0 113 | 114 | def check_single_tool_call_dag(pred_tool_result: Dict, label_result: Dict) -> Tuple[bool, bool]: 115 | # implementation 116 | # print("[debug] pred_tool_result:", pred_tool_result) 117 | # print("[debug] label_result:", label_result) 118 | label_tool_name = label_result["name"] if "name" in label_result else "" 119 | similar_tools = label_result.get("similar_tools", []) 120 | # label_result = label_result["output"] if "output" in label_result else {} 121 | 122 | # prediction 123 | predict_tool_name = pred_tool_result["name"] if "name" in pred_tool_result else "" 124 | predict_result = pred_tool_result["output"] if "output" in pred_tool_result else {} 125 | predict_status_code = predict_result["status_code"] if "status_code" in predict_result else 500 126 | tool_consistency = False 127 | output_consistency = False 128 | 129 | # Direct match 130 | if label_tool_name == predict_tool_name: 131 | tool_consistency = True 132 | 133 | # Check similar tools 134 | # print("similar_tools: ", similar_tools) 135 | for similar_tool in similar_tools: 136 | if predict_tool_name == similar_tool.get("name", ""): 137 | tool_consistency = True 138 | 139 | result_success_label_list = base_error_analysis([predict_result])["result_success_label_list"] 140 | if sum(result_success_label_list)==len(result_success_label_list): 141 | output_consistency = True 142 | else: 143 | output_consistency = False 144 | return tool_consistency, 
output_consistency 145 | 146 | def check_multi_tool_call_dag(pred_tool_result_list: List[Dict], label_result_list: List[Dict]) -> Tuple[bool, bool]: 147 | """ 148 | Check the correctness of tool calls for DAG structure 149 | 150 | Args: 151 | pred_tool_result_list: List of predicted tool call results 152 | label_result_list: List of ground truth tool call results 153 | 154 | Returns: 155 | Tuple[bool, bool]: (tool_consistency, output_consistency) 156 | """ 157 | 158 | def get_leaf_nodes(tool_list: List[Dict]) -> List[Dict]: 159 | """ 160 | Get leaf nodes (last tool calls) from tool list 161 | If the last tool name is repeated, get all consecutive calls with the same name 162 | """ 163 | if not tool_list: 164 | return [] 165 | 166 | leaf_nodes = [] 167 | last_tool_name = tool_list[-1]["name"] 168 | 169 | # Iterate from the end to find all consecutive calls with the same tool name 170 | for i in range(len(tool_list) - 1, -1, -1): 171 | if tool_list[i]["name"] == last_tool_name: 172 | leaf_nodes.insert(0, tool_list[i]) 173 | else: 174 | break 175 | 176 | return leaf_nodes 177 | 178 | if len(label_result_list)<1 or len(pred_tool_result_list)<1: 179 | return False, False 180 | # Get leaf nodes from both lists 181 | # pred_leaf_nodes = get_leaf_nodes(pred_tool_result_list) 182 | # label_leaf_nodes = get_leaf_nodes(label_result_list) 183 | pred_leaf_nodes = pred_tool_result_list[-1] 184 | label_leaf_nodes = label_result_list[-1] 185 | # print("[debug] pred_leaf_nodes:", pred_leaf_nodes) 186 | # print("[debug] label_leaf_nodes:", label_leaf_nodes) 187 | 188 | return check_single_tool_call_dag(pred_leaf_nodes, label_leaf_nodes) 189 | 190 | if __name__ == "__main__": 191 | # Read the input JSON file 192 | input_file_path = "logs/browser/browser_0711_single_500_20250713_080044.json" 193 | output_file_path = "logs/browser/browser_0711_single_500_20250713_080044_ast.json" 194 | # input_file_path = "logs/browser/test_log.json" 195 | # output_file_path = "logs/browser/test_log_ast.json" 196 | 197 | try: 198 | with open(input_file_path, 'r', encoding='utf-8') as f: 199 | data = json.load(f) 200 | 201 | # Calculate total number of function calls to process 202 | total_function_calls = 0 203 | for run_detail in data.get("run_details", []): 204 | for trial in run_detail.get("trials", []): 205 | total_function_calls += len(trial.get("function_call_result", [])) 206 | 207 | print(f"Total function calls to process: {total_function_calls}") 208 | 209 | # Process each run_detail with progress bar 210 | processed_calls = 0 211 | for run_detail in tqdm(data.get("run_details", []), desc="Processing run_details"): 212 | function_call_label = run_detail.get("function_call_label", []) 213 | query = run_detail.get("query", []) 214 | 215 | # Process each trial 216 | for trial in run_detail.get("trials", []): 217 | function_call_results = trial.get("function_call_result", []) 218 | 219 | # Call check_ast with function_call_label and function_call_result 220 | tool_correctness, parameter_correctness = check_ast( 221 | function_call_results, 222 | function_call_label, 223 | query 224 | ) 225 | 226 | # Add the new fields to function_call_result 227 | trial["tool_correctness"] = True if tool_correctness == 1 else False 228 | trial["parameter_correctness"] = True if parameter_correctness == 1 else False 229 | 230 | processed_calls += 1 231 | 232 | # Write the output file 233 | with open(output_file_path, 'w', encoding='utf-8') as f: 234 | json.dump(data, f, indent=2, ensure_ascii=False) 235 | 236 | print(f"Processing 
completed. Output written to {output_file_path}") 237 | 238 | except FileNotFoundError: 239 | print(f"Error: Input file {input_file_path} not found") 240 | except json.JSONDecodeError as e: 241 | print(f"Error: Invalid JSON in input file: {e}") 242 | except Exception as e: 243 | print(f"Error: {e}") 244 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/utils/calculate_metrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to calculate tool_pass@{k} and parameter_pass@{k} metrics from existing log files. 4 | This script can process log files that contain tool_correctness and parameter_correctness data 5 | but don't have the calculated pass@k metrics for these dimensions. 6 | """ 7 | 8 | import json 9 | import argparse 10 | import os 11 | import numpy as np 12 | from typing import List, Dict, Any, Tuple 13 | from src.mcp_tool_bench.evaluation.evaluation_utils import estimate_pass_at_k, base_error_analysis 14 | 15 | def check_single_tool_call_dag(pred_tool_result: Dict, label_result: Dict) -> Tuple[bool, bool]: 16 | # implementation 17 | label_tool_name = label_result["name"] if "name" in label_result else "" 18 | label_result = label_result["output"] if "output" in label_result else {} 19 | 20 | # prediction 21 | predict_tool_name = pred_tool_result["name"] if "name" in pred_tool_result else "" 22 | predict_result = pred_tool_result["output"] if "output" in pred_tool_result else {} 23 | predict_status_code = predict_result["status_code"] if "status_code" in predict_result else 500 24 | 25 | if label_tool_name == predict_tool_name: 26 | tool_consistency = True 27 | else: 28 | tool_consistency = False 29 | 30 | result_success_label_list = base_error_analysis([predict_result])["result_success_label_list"] 31 | if sum(result_success_label_list)==len(result_success_label_list): 32 | output_consistency = True 33 | else: 34 | output_consistency = False 35 | return tool_consistency, output_consistency 36 | 37 | def check_correctness(pred_tool_result_list: List[Dict], label_result_list: List[Dict]) -> Tuple[bool, bool]: 38 | """ 39 | Check the correctness of tool calls 40 | 41 | Args: 42 | pred_tool_result_list: Tool call prediction result list 43 | label_result_list: Tool call ground truth result list 44 | 45 | Returns: 46 | Tuple[bool, bool]: (tool_consistency, output_consistency) 47 | """ 48 | label_step = len(label_result_list) if label_result_list is not None else 0 49 | predict_step = len(pred_tool_result_list) if pred_tool_result_list is not None else 0 50 | 51 | tool_consistency = False 52 | output_consistency = False 53 | 54 | label_result = label_result_list[-1] 55 | pred_tool_result = pred_tool_result_list[-1] 56 | tool_consistency, output_consistency = check_single_tool_call_dag(pred_tool_result, label_result) 57 | 58 | return tool_consistency, output_consistency 59 | 60 | def calculate_metrics_from_log(log_file_path: str, pass_k_list: List[int] = None) -> Dict[str, Any]: 61 | """ 62 | Calculate tool_pass@{k} and parameter_pass@{k} metrics from a log file. 63 | 64 | Args: 65 | log_file_path: Path to the log file 66 | pass_k_list: List of k values for pass@k calculation. If None, will extract from log file. 
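
    Illustrative usage (the log path below is a hypothetical example):
        result = calculate_metrics_from_log("logs/browser/browser_run.json", pass_k_list=[1, 3])
        # result["metrics"] holds one entry per k with pass@{k}, tool_pass@{k} and parameter_pass@{k}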
67 | 68 | Returns: 69 | Dict containing the calculated metrics 70 | """ 71 | 72 | # Load log file 73 | with open(log_file_path, 'r', encoding='utf-8') as f: 74 | log_data = json.load(f) 75 | 76 | # Extract pass_k_list from log if not provided 77 | if pass_k_list is None: 78 | pass_k_str = log_data.get("run_info", {}).get("pass_k", "1") 79 | pass_k_list = [int(k) for k in pass_k_str.split(",")] 80 | 81 | print(f"Processing log file: {log_file_path}") 82 | print(f"Pass@k values: {pass_k_list}") 83 | 84 | # Extract run details 85 | run_details = log_data.get("run_details", []) 86 | if not run_details: 87 | print("No run_details found in log file") 88 | return {} 89 | 90 | # Arrays to store results for each task 91 | num_trails_array = [] 92 | num_pass_array = [] 93 | num_tool_correct_array = [] 94 | num_parameter_correct_array = [] 95 | 96 | run_details = run_details[:50] 97 | # Process each task 98 | for task in run_details: 99 | trials = task.get("trials", []) 100 | if not trials: 101 | continue 102 | 103 | # Count trials and correct results 104 | num_trials = len(trials) 105 | num_passed = 0 106 | num_tool_correct = 0 107 | num_parameter_correct = 0 108 | 109 | # Calculate directly from log 110 | num_passed = sum(1 for trial in trials if (trial.get("if_pass", False) and trial.get("tool_correctness", False) and trial.get("parameter_correctness", False))) 111 | num_tool_correct = sum(1 for trial in trials if trial.get("tool_correctness", False)) 112 | num_parameter_correct = sum(1 for trial in trials if (trial.get("parameter_correctness", False) and trial.get("tool_correctness", False))) 113 | 114 | num_trails_array.append(num_trials) 115 | num_pass_array.append(num_passed) 116 | num_tool_correct_array.append(num_tool_correct) 117 | num_parameter_correct_array.append(num_parameter_correct) 118 | 119 | print(f"Processed {len(num_trails_array)} tasks") 120 | print(f"Total trials: {sum(num_trails_array)}") 121 | print(f"Total passed: {sum(num_pass_array)}") 122 | print(f"Total tool correct: {sum(num_tool_correct_array)}") 123 | print(f"Total parameter correct: {sum(num_parameter_correct_array)}") 124 | 125 | # Calculate metrics for each k value 126 | metrics_list = [] 127 | run_info = log_data.get("run_info", {}) 128 | 129 | for k in pass_k_list: 130 | # Calculate pass@{k} for overall correctness 131 | pass_at_k_arr = estimate_pass_at_k(num_trails_array, num_pass_array, k) 132 | pass_at_k = float(np.mean(pass_at_k_arr)) if len(pass_at_k_arr) > 0 else 0 133 | 134 | # Calculate tool_pass@{k} 135 | tool_pass_at_k_arr = estimate_pass_at_k(num_trails_array, num_tool_correct_array, k) 136 | tool_pass_at_k = float(np.mean(tool_pass_at_k_arr)) if len(tool_pass_at_k_arr) > 0 else 0 137 | 138 | # Calculate parameter_pass@{k} 139 | parameter_pass_at_k_arr = estimate_pass_at_k(num_trails_array, num_parameter_correct_array, k) 140 | parameter_pass_at_k = float(np.mean(parameter_pass_at_k_arr)) if len(parameter_pass_at_k_arr) > 0 else 0 141 | 142 | metric = { 143 | "category": run_info.get("category", "unknown"), 144 | "model": run_info.get("model", "unknown"), 145 | f"pass@{k}": pass_at_k, 146 | f"tool_pass@{k}": tool_pass_at_k, 147 | f"parameter_pass@{k}": parameter_pass_at_k, 148 | "num_tasks": len(num_trails_array), 149 | "num_trials_total": sum(num_trails_array), 150 | "num_passed_total": sum(num_pass_array), 151 | "num_tool_correct_total": sum(num_tool_correct_array), 152 | "num_parameter_correct_total": sum(num_parameter_correct_array) 153 | } 154 | metrics_list.append(metric) 155 | 156 | 
print(f"Pass@{k} - Tool_selected: {tool_pass_at_k:.4f}, Parameter: {parameter_pass_at_k:.4f}, Tool_call: {pass_at_k:.4f}") 157 | 158 | return { 159 | "run_info": run_info, 160 | "metrics": metrics_list, 161 | "calculation_info": { 162 | "log_file": log_file_path, 163 | "pass_k_list": pass_k_list, 164 | "num_tasks": len(num_trails_array), 165 | "total_trials": sum(num_trails_array) 166 | } 167 | } 168 | 169 | 170 | def update_log_file_with_metrics(log_file_path: str, output_file_path: str = None) -> str: 171 | """ 172 | Update the original log file with the calculated metrics. 173 | 174 | Args: 175 | log_file_path: Path to the original log file 176 | output_file_path: Path for the updated log file. If None, will overwrite original. 177 | 178 | Returns: 179 | Path to the updated log file 180 | """ 181 | 182 | # Calculate metrics 183 | result = calculate_metrics_from_log(log_file_path) 184 | 185 | if not result: 186 | print("Failed to calculate metrics") 187 | return "" 188 | 189 | # Load original log file 190 | with open(log_file_path, 'r', encoding='utf-8') as f: 191 | original_log = json.load(f) 192 | 193 | # Update metrics in the original log 194 | original_log["metrics"] = result["metrics"] 195 | 196 | # Determine output file path 197 | if output_file_path is None: 198 | output_file_path = log_file_path 199 | 200 | # Save updated log file 201 | with open(output_file_path, 'w', encoding='utf-8') as f: 202 | json.dump(original_log, f, ensure_ascii=False, indent=2) 203 | 204 | print(f"Updated log file saved to: {output_file_path}") 205 | return output_file_path 206 | 207 | 208 | def process_multiple_logs(log_dir: str, pattern: str = None) -> None: 209 | """ 210 | Process multiple log files in a directory. 211 | 212 | Args: 213 | log_dir: Directory containing log files 214 | pattern: Optional pattern to filter log files (e.g., "browser_0711_single_500") 215 | """ 216 | 217 | if not os.path.exists(log_dir): 218 | print(f"Directory not found: {log_dir}") 219 | return 220 | 221 | log_files = [] 222 | for file in os.listdir(log_dir): 223 | if file.endswith('.json'): 224 | if pattern is None or pattern in file: 225 | log_files.append(os.path.join(log_dir, file)) 226 | 227 | print(f"Found {len(log_files)} log files to process") 228 | 229 | for log_file in log_files: 230 | print(f"\nProcessing: {log_file}") 231 | try: 232 | update_log_file_with_metrics(log_file) 233 | except Exception as e: 234 | print(f"Error processing {log_file}: {e}") 235 | 236 | 237 | def main(): 238 | parser = argparse.ArgumentParser(description="Calculate tool_pass@{k} and parameter_pass@{k} metrics from log files") 239 | parser.add_argument("--log_file", type=str, help="Path to a single log file") 240 | parser.add_argument("--log_dir", type=str, help="Directory containing log files") 241 | parser.add_argument("--pattern", type=str, help="Pattern to filter log files (when using --log_dir)") 242 | parser.add_argument("--pass_k", type=str, default="1,3", help="Comma-separated list of k values for pass@k") 243 | parser.add_argument("--output", type=str, help="Output file path (for single file processing)") 244 | parser.add_argument("--calculate_only", action="store_true", help="Only calculate and display metrics, don't update log file") 245 | 246 | args = parser.parse_args() 247 | 248 | pass_k_list = [int(k) for k in args.pass_k.split(",")] 249 | 250 | if args.log_file: 251 | if args.calculate_only: 252 | # Only calculate and display metrics 253 | result = calculate_metrics_from_log(args.log_file, pass_k_list) 254 | if result: 
255 | print("\nCalculated Metrics:") 256 | for metric in result["metrics"]: 257 | print(f" {metric}") 258 | else: 259 | # Update log file with metrics 260 | update_log_file_with_metrics(args.log_file, args.output) 261 | 262 | elif args.log_dir: 263 | # Process multiple log files 264 | process_multiple_logs(args.log_dir, args.pattern) 265 | 266 | else: 267 | print("Please provide either --log_file or --log_dir") 268 | parser.print_help() 269 | 270 | if __name__ == "__main__": 271 | main() 272 | -------------------------------------------------------------------------------- /data/file_system/filesystem_single_demo.json: -------------------------------------------------------------------------------- 1 | [{"uuid":"8cac45c6-a9ef-4881-b0e0-4b444900bdfe","category":"filesystem","call_type":"single","tools":[{"name":"read_file","description":"Read the complete contents of a file from the file system. Handles various text encodings and provides detailed error messages if the file cannot be read. Use this tool when you need to examine the contents of a single file. Use the 'head' parameter to read only the first N lines of a file, or the 'tail' parameter to read only the last N lines of a file. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"tail":{"type":"number","description":"If provided, returns only the last N lines of the file"},"head":{"type":"number","description":"If provided, returns only the first N lines of the file"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"read_multiple_files","description":"Read the contents of multiple files simultaneously. This is more efficient than reading files one by one when you need to analyze or compare multiple files. Each file's content is returned with its path as a reference. Failed reads for individual files won't stop the entire operation. Only works within allowed directories.","input_schema":{"type":"object","properties":{"paths":{"type":"array","items":{"type":"string"}}},"required":["paths"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"write_file","description":"Create a new file or completely overwrite an existing file with new content. Use with caution as it will overwrite existing files without warning. Handles text content with proper encoding. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"content":{"type":"string"}},"required":["path","content"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"edit_file","description":"Make line-based edits to a text file. Each edit replaces exact line sequences with new content. Returns a git-style diff showing the changes made. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"edits":{"type":"array","items":{"type":"object","properties":{"oldText":{"type":"string","description":"Text to search for - must match exactly"},"newText":{"type":"string","description":"Text to replace with"}},"required":["oldText","newText"],"additionalProperties":false}},"dryRun":{"type":"boolean","default":false,"description":"Preview changes using git-style diff format"}},"required":["path","edits"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"create_directory","description":"Create a new directory or ensure a directory exists. 
Can create multiple nested directories in one operation. If the directory already exists, this operation will succeed silently. Perfect for setting up directory structures for projects or ensuring required paths exist. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"list_directory","description":"Get a detailed listing of all files and directories in a specified path. Results clearly distinguish between files and directories with [FILE] and [DIR] prefixes. This tool is essential for understanding directory structure and finding specific files within a directory. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"list_directory_with_sizes","description":"Get a detailed listing of all files and directories in a specified path, including sizes. Results clearly distinguish between files and directories with [FILE] and [DIR] prefixes. This tool is useful for understanding directory structure and finding specific files within a directory. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"sortBy":{"type":"string","enum":["name","size"],"default":"name","description":"Sort entries by name or size"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"directory_tree","description":"Get a recursive tree view of files and directories as a JSON structure. Each entry includes 'name', 'type' (file/directory), and 'children' for directories. Files have no children array, while directories always have a children array (which may be empty). The output is formatted with 2-space indentation for readability. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"search_files","description":"Recursively search for files and directories matching a pattern. Searches through all subdirectories from the starting path. The search is case-insensitive and matches partial names. Returns full paths to all matching items. Great for finding files when you don't know their exact location. Only searches within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"pattern":{"type":"string"},"excludePatterns":{"type":"array","items":{"type":"string"},"default":[]}},"required":["path","pattern"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"get_file_info","description":"Retrieve detailed metadata about a file or directory. Returns comprehensive information including size, creation time, last modified time, permissions, and type. This tool is perfect for understanding file characteristics without reading the actual content. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"list_allowed_directories","description":"Returns the list of directories that this server is allowed to access. 
Use this to understand which directories are available before trying to access files.","input_schema":{"type":"object","properties":{},"required":[]}}],"mcp_tools_dict":{"filesystem":["read_file","read_multiple_files","write_file","edit_file","create_directory","list_directory","list_directory_with_sizes","directory_tree","search_files","get_file_info","list_allowed_directories"]},"query":"Provide a recursive tree view of the files and directories located at ./test_project_root/src.","function_call_label":[{"name":"directory_tree","step":"1","id":"1","mcp_server":"filesystem","similar_tools":[],"input":{"path":"./test_project_root/src"},"output":{"status_code":200,"result":{}}}]},{"uuid":"cd0a8b63-439a-4259-af0d-74dd8270d995","category":"filesystem","call_type":"single","tools":[{"name":"read_file","description":"Read the complete contents of a file from the file system. Handles various text encodings and provides detailed error messages if the file cannot be read. Use this tool when you need to examine the contents of a single file. Use the 'head' parameter to read only the first N lines of a file, or the 'tail' parameter to read only the last N lines of a file. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"tail":{"type":"number","description":"If provided, returns only the last N lines of the file"},"head":{"type":"number","description":"If provided, returns only the first N lines of the file"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"read_multiple_files","description":"Read the contents of multiple files simultaneously. This is more efficient than reading files one by one when you need to analyze or compare multiple files. Each file's content is returned with its path as a reference. Failed reads for individual files won't stop the entire operation. Only works within allowed directories.","input_schema":{"type":"object","properties":{"paths":{"type":"array","items":{"type":"string"}}},"required":["paths"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"write_file","description":"Create a new file or completely overwrite an existing file with new content. Use with caution as it will overwrite existing files without warning. Handles text content with proper encoding. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"content":{"type":"string"}},"required":["path","content"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"edit_file","description":"Make line-based edits to a text file. Each edit replaces exact line sequences with new content. Returns a git-style diff showing the changes made. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"edits":{"type":"array","items":{"type":"object","properties":{"oldText":{"type":"string","description":"Text to search for - must match exactly"},"newText":{"type":"string","description":"Text to replace with"}},"required":["oldText","newText"],"additionalProperties":false}},"dryRun":{"type":"boolean","default":false,"description":"Preview changes using git-style diff format"}},"required":["path","edits"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"create_directory","description":"Create a new directory or ensure a directory exists. 
Can create multiple nested directories in one operation. If the directory already exists, this operation will succeed silently. Perfect for setting up directory structures for projects or ensuring required paths exist. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"list_directory","description":"Get a detailed listing of all files and directories in a specified path. Results clearly distinguish between files and directories with [FILE] and [DIR] prefixes. This tool is essential for understanding directory structure and finding specific files within a directory. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"list_directory_with_sizes","description":"Get a detailed listing of all files and directories in a specified path, including sizes. Results clearly distinguish between files and directories with [FILE] and [DIR] prefixes. This tool is useful for understanding directory structure and finding specific files within a directory. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"sortBy":{"type":"string","enum":["name","size"],"default":"name","description":"Sort entries by name or size"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"directory_tree","description":"Get a recursive tree view of files and directories as a JSON structure. Each entry includes 'name', 'type' (file/directory), and 'children' for directories. Files have no children array, while directories always have a children array (which may be empty). The output is formatted with 2-space indentation for readability. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"search_files","description":"Recursively search for files and directories matching a pattern. Searches through all subdirectories from the starting path. The search is case-insensitive and matches partial names. Returns full paths to all matching items. Great for finding files when you don't know their exact location. Only searches within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"pattern":{"type":"string"},"excludePatterns":{"type":"array","items":{"type":"string"},"default":[]}},"required":["path","pattern"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"get_file_info","description":"Retrieve detailed metadata about a file or directory. Returns comprehensive information including size, creation time, last modified time, permissions, and type. This tool is perfect for understanding file characteristics without reading the actual content. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"list_allowed_directories","description":"Returns the list of directories that this server is allowed to access. 
Use this to understand which directories are available before trying to access files.","input_schema":{"type":"object","properties":{},"required":[]}}],"mcp_tools_dict":{"filesystem":["read_file","read_multiple_files","write_file","edit_file","create_directory","list_directory","list_directory_with_sizes","directory_tree","search_files","get_file_info","list_allowed_directories"]},"query":"Read the contents of the files located at ./test_project_root/src/main.py and ./test_project_root/docs/README.md at the same time.","function_call_label":[{"name":"read_multiple_files","step":"1","id":"1","mcp_server":"filesystem","similar_tools":[],"input":{"paths":["./test_project_root/src/main.py","./test_project_root/docs/README.md"]},"output":{"status_code":200,"result":{}}}]}] 2 | -------------------------------------------------------------------------------- /data/browser/browser_single_demo.json: -------------------------------------------------------------------------------- 1 | [{"uuid":"0b1be01a-a542-4f54-8cfc-017760c03d72","category":"browser","call_type":"single","tools":[{"name":"start_codegen_session","description":"Start a new code generation session to record Playwright actions","input_schema":{"type":"object","properties":{"options":{"type":"object","description":"Code generation options","properties":{"outputPath":{"type":"string","description":"Directory path where generated tests will be saved (use absolute path)"},"testNamePrefix":{"type":"string","description":"Prefix to use for generated test names (default: 'GeneratedTest')"},"includeComments":{"type":"boolean","description":"Whether to include descriptive comments in generated tests"}},"required":["outputPath"]}},"required":["options"]}},{"name":"end_codegen_session","description":"End a code generation session and generate the test file","input_schema":{"type":"object","properties":{"sessionId":{"type":"string","description":"ID of the session to end"}},"required":["sessionId"]}},{"name":"get_codegen_session","description":"Get information about a code generation session","input_schema":{"type":"object","properties":{"sessionId":{"type":"string","description":"ID of the session to retrieve"}},"required":["sessionId"]}},{"name":"clear_codegen_session","description":"Clear a code generation session without generating a test","input_schema":{"type":"object","properties":{"sessionId":{"type":"string","description":"ID of the session to clear"}},"required":["sessionId"]}},{"name":"playwright_navigate","description":"Navigate to a URL","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"URL to navigate to the website specified"},"browserType":{"type":"string","description":"Browser type to use (chromium, firefox, webkit). 
Defaults to chromium","enum":["chromium","firefox","webkit"]},"width":{"type":"number","description":"Viewport width in pixels (default: 1280)"},"height":{"type":"number","description":"Viewport height in pixels (default: 720)"},"timeout":{"type":"number","description":"Navigation timeout in milliseconds"},"waitUntil":{"type":"string","description":"Navigation wait condition"},"headless":{"type":"boolean","description":"Run browser in headless mode (default: false)"}},"required":["url"]}},{"name":"playwright_screenshot","description":"Take a screenshot of the current page or a specific element","input_schema":{"type":"object","properties":{"name":{"type":"string","description":"Name for the screenshot"},"selector":{"type":"string","description":"CSS selector for element to screenshot"},"width":{"type":"number","description":"Width in pixels (default: 800)"},"height":{"type":"number","description":"Height in pixels (default: 600)"},"storeBase64":{"type":"boolean","description":"Store screenshot in base64 format (default: true)"},"fullPage":{"type":"boolean","description":"Store screenshot of the entire page (default: false)"},"savePng":{"type":"boolean","description":"Save screenshot as PNG file (default: false)"},"downloadsDir":{"type":"string","description":"Custom downloads directory path (default: user's Downloads folder)"}},"required":["name"]}},{"name":"playwright_click","description":"Click an element on the page","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for the element to click"}},"required":["selector"]}},{"name":"playwright_iframe_click","description":"Click an element in an iframe on the page","input_schema":{"type":"object","properties":{"iframeSelector":{"type":"string","description":"CSS selector for the iframe containing the element to click"},"selector":{"type":"string","description":"CSS selector for the element to click"}},"required":["iframeSelector","selector"]}},{"name":"playwright_iframe_fill","description":"Fill an element in an iframe on the page","input_schema":{"type":"object","properties":{"iframeSelector":{"type":"string","description":"CSS selector for the iframe containing the element to fill"},"selector":{"type":"string","description":"CSS selector for the element to fill"},"value":{"type":"string","description":"Value to fill"}},"required":["iframeSelector","selector","value"]}},{"name":"playwright_fill","description":"fill out an input field","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for input field"},"value":{"type":"string","description":"Value to fill"}},"required":["selector","value"]}},{"name":"playwright_select","description":"Select an element on the page with Select tag","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for element to select"},"value":{"type":"string","description":"Value to select"}},"required":["selector","value"]}},{"name":"playwright_hover","description":"Hover an element on the page","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for element to hover"}},"required":["selector"]}},{"name":"playwright_evaluate","description":"Execute JavaScript in the browser console","input_schema":{"type":"object","properties":{"script":{"type":"string","description":"JavaScript code to execute"}},"required":["script"]}},{"name":"playwright_console_logs","description":"Retrieve console logs from the browser with filtering 
options","input_schema":{"type":"object","properties":{"type":{"type":"string","description":"Type of logs to retrieve (all, error, warning, log, info, debug, exception)","enum":["all","error","warning","log","info","debug","exception"]},"search":{"type":"string","description":"Text to search for in logs (handles text with square brackets)"},"limit":{"type":"number","description":"Maximum number of logs to return"},"clear":{"type":"boolean","description":"Whether to clear logs after retrieval (default: false)"}},"required":[]}},{"name":"playwright_close","description":"Close the browser and release all resources","input_schema":{"type":"object","properties":{},"required":[]}},{"name":"playwright_get","description":"Perform an HTTP GET request","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"URL to perform GET operation"}},"required":["url"]}},{"name":"playwright_post","description":"Perform an HTTP POST request","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"URL to perform POST operation"},"value":{"type":"string","description":"Data to post in the body"},"token":{"type":"string","description":"Bearer token for authorization"},"headers":{"type":"object","description":"Additional headers to include in the request","additionalProperties":{"type":"string"}}},"required":["url","value"]}},{"name":"playwright_put","description":"Perform an HTTP PUT request","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"URL to perform PUT operation"},"value":{"type":"string","description":"Data to PUT in the body"}},"required":["url","value"]}},{"name":"playwright_patch","description":"Perform an HTTP PATCH request","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"URL to perform PUT operation"},"value":{"type":"string","description":"Data to PATCH in the body"}},"required":["url","value"]}},{"name":"playwright_delete","description":"Perform an HTTP DELETE request","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"URL to perform DELETE operation"}},"required":["url"]}},{"name":"playwright_expect_response","description":"Ask Playwright to start waiting for a HTTP response. This tool initiates the wait operation but does not wait for its completion.","input_schema":{"type":"object","properties":{"id":{"type":"string","description":"Unique & arbitrary identifier to be used for retrieving this response later with `Playwright_assert_response`."},"url":{"type":"string","description":"URL pattern to match in the response."}},"required":["id","url"]}},{"name":"playwright_assert_response","description":"Wait for and validate a previously initiated HTTP response wait operation.","input_schema":{"type":"object","properties":{"id":{"type":"string","description":"Identifier of the HTTP response initially expected using `Playwright_expect_response`."},"value":{"type":"string","description":"Data to expect in the body of the HTTP response. 
If provided, the assertion will fail if this value is not found in the response body."}},"required":["id"]}},{"name":"playwright_custom_user_agent","description":"Set a custom User Agent for the browser","input_schema":{"type":"object","properties":{"userAgent":{"type":"string","description":"Custom User Agent for the Playwright browser instance"}},"required":["userAgent"]}},{"name":"playwright_get_visible_text","description":"Get the visible text content of the current page","input_schema":{"type":"object","properties":{},"required":[]}},{"name":"playwright_get_visible_html","description":"Get the HTML content of the current page","input_schema":{"type":"object","properties":{},"required":[]}},{"name":"playwright_go_back","description":"Navigate back in browser history","input_schema":{"type":"object","properties":{},"required":[]}},{"name":"playwright_go_forward","description":"Navigate forward in browser history","input_schema":{"type":"object","properties":{},"required":[]}},{"name":"playwright_drag","description":"Drag an element to a target location","input_schema":{"type":"object","properties":{"sourceSelector":{"type":"string","description":"CSS selector for the element to drag"},"targetSelector":{"type":"string","description":"CSS selector for the target location"}},"required":["sourceSelector","targetSelector"]}},{"name":"playwright_press_key","description":"Press a keyboard key","input_schema":{"type":"object","properties":{"key":{"type":"string","description":"Key to press (e.g. 'Enter', 'ArrowDown', 'a')"},"selector":{"type":"string","description":"Optional CSS selector to focus before pressing key"}},"required":["key"]}},{"name":"playwright_save_as_pdf","description":"Save the current page as a PDF file","input_schema":{"type":"object","properties":{"outputPath":{"type":"string","description":"Directory path where PDF will be saved"},"filename":{"type":"string","description":"Name of the PDF file (default: page.pdf)"},"format":{"type":"string","description":"Page format (e.g. 'A4', 'Letter')"},"printBackground":{"type":"boolean","description":"Whether to print background graphics"},"margin":{"type":"object","description":"Page margins","properties":{"top":{"type":"string"},"right":{"type":"string"},"bottom":{"type":"string"},"left":{"type":"string"}}}},"required":["outputPath"]}},{"name":"playwright_click_and_switch_tab","description":"Click a link and switch to the newly opened tab","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for the link to click"}},"required":["selector"]}},{"name":"puppeteer_navigate","description":"Navigate to a URL","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"URL to navigate to"},"launchOptions":{"type":"object","description":"PuppeteerJS LaunchOptions. Default null. If changed and not null, browser restarts. Example: { headless: true, args: ['--no-sandbox'] }"},"allowDangerous":{"type":"boolean","description":"Allow dangerous LaunchOptions that reduce security. When false, dangerous args like --no-sandbox will throw errors. 
Default false."}},"required":["url"]}},{"name":"puppeteer_screenshot","description":"Take a screenshot of the current page or a specific element","input_schema":{"type":"object","properties":{"name":{"type":"string","description":"Name for the screenshot"},"selector":{"type":"string","description":"CSS selector for element to screenshot"},"width":{"type":"number","description":"Width in pixels (default: 800)"},"height":{"type":"number","description":"Height in pixels (default: 600)"},"encoded":{"type":"boolean","description":"If true, capture the screenshot as a base64-encoded data URI (as text) instead of binary image content. Default false."}},"required":["name"]}},{"name":"puppeteer_click","description":"Click an element on the page","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for element to click"}},"required":["selector"]}},{"name":"puppeteer_fill","description":"Fill out an input field","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for input field"},"value":{"type":"string","description":"Value to fill"}},"required":["selector","value"]}},{"name":"puppeteer_select","description":"Select an element on the page with Select tag","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for element to select"},"value":{"type":"string","description":"Value to select"}},"required":["selector","value"]}},{"name":"puppeteer_hover","description":"Hover an element on the page","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for element to hover"}},"required":["selector"]}},{"name":"puppeteer_evaluate","description":"Execute JavaScript in the browser console","input_schema":{"type":"object","properties":{"script":{"type":"string","description":"JavaScript code to execute"}},"required":["script"]}}],"mcp_tools_dict":{"playwright":["start_codegen_session","end_codegen_session","get_codegen_session","clear_codegen_session","playwright_navigate","playwright_screenshot","playwright_click","playwright_iframe_click","playwright_iframe_fill","playwright_fill","playwright_select","playwright_hover","playwright_evaluate","playwright_console_logs","playwright_close","playwright_get","playwright_post","playwright_put","playwright_patch","playwright_delete","playwright_expect_response","playwright_assert_response","playwright_custom_user_agent","playwright_get_visible_text","playwright_get_visible_html","playwright_go_back","playwright_go_forward","playwright_drag","playwright_press_key","playwright_save_as_pdf","playwright_click_and_switch_tab"],"puppeteer":["puppeteer_navigate","puppeteer_screenshot","puppeteer_click","puppeteer_fill","puppeteer_select","puppeteer_hover","puppeteer_evaluate"]},"query":"Navigate to the Wikipedia website using the Chromium browser and check its accessibility.","function_call_label":[{"name":"playwright_navigate","step":"1","id":"1","mcp_server":"playwright","similar_tools":[{"name":"puppeteer_navigate","mcp_server":"puppeteer"}],"input":{"url":"https://www.wikipedia.org","browserType":"chromium"},"output":{"status_code":200,"result":{}}}]}] 2 | -------------------------------------------------------------------------------- /data/pay/pay_single_demo.json: -------------------------------------------------------------------------------- 1 | 
[{"uuid":"3b98fe31-cdf9-483f-88f7-4f79f226bd8b","category":"pay","call_type":"single","tools":[{"name":"create_invoice","description":"\nCreate Invoices on PayPal.\n\nThis function is used to create an invoice in the PayPal system. It allows you to generate a new invoice, specifying details such as customer information, items, quantities, pricing, and tax information. Once created, an invoice can be sent to the customer for payment.\n","input_schema":{"type":"object","properties":{"detail":{"type":"object","properties":{"invoice_date":{"type":"string","description":"The invoice date in YYYY-MM-DD format"},"currency_code":{"type":"string","description":"currency code of the invoice"}},"required":["currency_code"],"additionalProperties":false,"description":"The invoice detail, like{ \"invoice_number\": \"#123\", \"reference\": \"deal-ref\", \"invoice_date\": \"2018-11-12\", \"currency_code\": \"USD\", \"note\": \"Thank you for your business.\", \"term\": \"No refunds after 30 days.\", \"memo\": \"This is a long contract\", \"payment_term\": { \"term_type\": \"NET_10\", \"due_date\": \"2018-11-22\" } }"},"invoicer":{"type":"object","properties":{"business_name":{"type":"string","maxLength":300,"description":"business name of the invoicer"},"name":{"type":"object","properties":{"given_name":{"type":"string","description":"given name of the invoicer"},"surname":{"type":"string","description":"surname of the invoicer"}},"additionalProperties":false,"description":"name of the invoicer"},"email_address":{"type":"string","description":"email address of the invoicer"}},"required":["business_name"],"additionalProperties":false,"description":"The invoicer business information that appears on the invoice."},"primary_recipients":{"type":"array","items":{"type":"object","properties":{"billing_info":{"type":"object","properties":{"name":{"type":"object","properties":{"given_name":{"type":"string","description":"given name of the recipient"},"surname":{"type":"string","description":"surname of the recipient"}},"additionalProperties":false,"description":"name of the recipient"},"email_address":{"type":"string","description":"email address of the recipient"}},"additionalProperties":false,"description":"The billing information of the invoice recipient"}},"additionalProperties":false},"description":"array of recipients"},"items":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string","description":"The name of the item"},"quantity":{"type":"string","description":"The quantity of the item that the invoicer provides to the payer. Value is from -1000000 to 1000000. Supports up to five decimal places. Cast to string"},"unit_amount":{"type":"object","properties":{"currency_code":{"type":"string","description":"Currency code of the unit amount"},"value":{"type":"string","description":"The unit price. 
Up to 2 decimal points"}},"required":["currency_code","value"],"additionalProperties":false,"description":"unit amount object"},"tax":{"type":"object","properties":{"name":{"type":"string","description":"Tax name"},"percent":{"type":"string","description":"Tax Percent"}},"additionalProperties":false,"description":"tax object"},"unit_of_measure":{"type":"string","enum":["QUANTITY","HOURS","AMOUNT"],"description":"The unit of measure for the invoiced item"}},"required":["name","quantity","unit_amount"],"additionalProperties":false,"description":"invoice line item object"},"description":"Array of invoice line items"}},"required":["detail"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"create_product","description":"\nCreate a product in PayPal using product catalog - create products API.\nThis function creates a new product that will be used in subscription plans, subscriptions.\nRequired parameters are: name (product name), type (product type).\nHigh level: \n - id: (auto-generated or specify SKU of the product) The ID of the product\n - name: {product_name} (required) \n - description: {product_description} (optional)\n - type {DIGITAL | PHYSICAL | SERVICE} (required)\n - category: {product_category} (optional) \n - image_url: {image_url} (optional)\n - home_url: {home_url} (optional)\n\nBelow is the payload request structure:\n{\n \"id\": \"#PROD-XYAB12ABSB7868434\",\n \"name\": \"Video Streaming Service\",\n \"description\": \"Service for streaming latest series, movies etc.\",\n \"type\": \"SERVICE\",\n \"category\": \"SOFTWARE\",\n \"image_url\": \"https://example.com/streaming.jpg\",\n \"home_url\": \"https://example.com/home\"\n}\n\n","input_schema":{"type":"object","properties":{"name":{"type":"string","description":"The product name."},"type":{"type":"string","enum":["PHYSICAL","DIGITAL","SERVICE"],"description":"The product type. 
Value is PHYSICAL, DIGITAL, or SERVICE."},"description":{"type":"string","description":"The product description."},"category":{"type":"string","description":"The product category."},"image_url":{"type":"string","description":"The image URL for the product."},"home_url":{"type":"string","description":"The home page URL for the product."}},"required":["name","type"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"create_subscription_plan","description":"\nCreate a subsctiption plan in PayPal using subscription - create plan API.\nThis function creates a new subscription plan that defines pricing and billing cycle details for subscriptions.\nRequired parameters are: product_id (the ID of the product for which to create the plan), name (subscription plan name), billing_cycles (billing cycle details).\nHigh level: product_id, name, description, taxes, status: {CREATED|INACTIVE|ACTIVE}, billing_cycles, payment_preferences are required in json object.\nWhile creating billing_cycles object, trial(second) billing cycle should precede regular billing cycle.\n","input_schema":{"type":"object","properties":{"product_id":{"type":"string","description":"The ID of the product for which to create the plan."},"name":{"type":"string","description":"The subscription plan name."},"description":{"type":"string","description":"The subscription plan description."},"billing_cycles":{"type":"array","items":{"type":"object","properties":{"frequency":{"type":"object","properties":{"interval_unit":{"type":"string","enum":["DAY","WEEK","MONTH","YEAR"],"description":"The unit of time for the billing cycle."},"interval_count":{"type":"number","description":"The number of units for the billing cycle."}},"required":["interval_unit","interval_count"],"additionalProperties":true,"description":"The frequency of the billing cycle."},"tenure_type":{"type":"string","enum":["REGULAR","TRIAL"],"description":"The type of billing cycle tenure."},"sequence":{"type":"number","description":"The sequence of the billing cycle."},"total_cycles":{"type":"number","description":"The total number of cycles in the billing plan."},"pricing_scheme":{"type":"object","properties":{"fixed_price":{"type":"object","properties":{"currency_code":{"type":"string","enum":["USD"],"description":"The currency code for the fixed price."},"value":{"type":"string","description":"The value of the fixed price."}},"required":["currency_code","value"],"additionalProperties":true,"description":"The fixed price for the subscription plan."},"version":{"type":"string","description":"The version of the pricing scheme."}},"additionalProperties":true,"description":"The pricing scheme for the billing cycle."}},"required":["frequency","tenure_type","sequence","pricing_scheme"],"additionalProperties":true},"description":"The billing cycles of the plan."},"payment_preferences":{"type":"object","properties":{"auto_bill_outstanding":{"type":"boolean","description":"Indicates whether to automatically bill outstanding amounts."},"setup_fee":{"type":"object","properties":{"currency_code":{"type":"string","enum":["USD"],"description":"The currency code for the setup fee."},"value":{"type":"string","description":"The value of the setup fee."}},"additionalProperties":true,"description":"The setup fee for the subscription plan."},"setup_fee_failure_action":{"type":"string","enum":["CONTINUE","CANCEL"],"description":"The action to take if the setup fee payment fails."},"payment_failure_threshold":{"type":"number","description":"The number 
of failed payments before the subscription is canceled."}},"additionalProperties":true,"description":"The payment preferences for the subscription plan."},"taxes":{"type":"object","properties":{"percentage":{"type":"string","description":"The tax percentage."},"inclusive":{"type":"boolean","description":"Indicates whether the tax is inclusive."}},"additionalProperties":true,"description":"The tax details."}},"required":["product_id","name","billing_cycles"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"create_shipment_tracking","description":"\nCreate a shipment for a transaction in PayPal.\nThis function creates a shipment record for a specific transaction, allowing you to track the shipment status and details.\nThe transaction_id can fetch from the captured payment details in the order information.\nRequired parameters are: tracking_number (the tracking number for the shipment), transaction_id (the transaction ID associated with the shipment). \nHigh level: tracking_number, transaction_id, status (optional), carrier (optional) are required json objects.\nBelow is the payload request structure:\n{\n \"tracking_number\": \"1234567890\",\n \"transaction_id\": \"9XJ12345ABC67890\",\n \"status\": \"SHIPPED\", // Required: ON_HOLD, SHIPPED, DELIVERED, CANCELLED\n \"carrier\": \"UPS\" // Required: The carrier handling the shipment. Link to supported carriers: http://developer.paypal.com/docs/tracking/reference/carriers/\n}\n","input_schema":{"type":"object","properties":{"order_id":{"type":"string","description":"The ID of the order for which to create a shipment"},"tracking_number":{"type":"string","description":"The tracking number for the shipment. Id is provided by the shipper. This is required to create a shipment."},"transaction_id":{"type":"string","description":"The transaction ID associated with the shipment. Transaction id available after the order is paid or captured. This is required to create a shipment."},"status":{"type":"string","description":"The status of the shipment. It can be \"ON_HOLD\", \"SHIPPED\", \"DELIVERED\", or \"CANCELLED\".","default":"SHIPPED"},"carrier":{"type":"string","description":"The carrier handling the shipment."}},"required":["tracking_number","transaction_id"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"create_order","description":"\nCreate an order in PayPal.\n\nThis tool is used to create a new order in PayPal. This is typically the first step in initiating a payment flow. It sets up an order with specified details such as item(s) to be purchased, quantity, amount, currency, and other details.\n","input_schema":{"type":"object","properties":{"currencyCode":{"type":"string","enum":["USD"],"description":"Currency code of the amount."},"items":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string","description":"The name of the item."},"quantity":{"type":"number","description":"The item quantity. 
Must be a whole number.","default":1},"description":{"type":"string","description":"The detailed item description."},"itemCost":{"type":"number","description":"The cost of each item - upto 2 decimal points."},"taxPercent":{"type":"number","description":"The tax percent for the specific item.","default":0},"itemTotal":{"type":"number","description":"The total cost of this line item."}},"required":["name","itemCost","itemTotal"],"additionalProperties":false},"maxItems":50},"discount":{"type":"number","description":"The discount amount for the order.","default":0},"shippingCost":{"type":"number","description":"The cost of shipping for the order.","default":0},"shippingAddress":{"anyOf":[{"type":"object","properties":{"address_line_1":{"type":"string","description":"The first line of the address, such as number and street, for example, `173 Drury Lane`.This field needs to pass the full address."},"address_line_2":{"type":"string","description":"The second line of the address, for example, a suite or apartment number."},"admin_area_2":{"type":"string","description":"A city, town, or village. Smaller than `admin_area_level_1`."},"admin_area_1":{"type":"string","description":"The highest-level sub-division in a country, which is usually a province, state, or ISO-3166-2 subdivision. "},"postal_code":{"type":"string","description":"The postal code, which is the ZIP code or equivalent. Typically required for countries with a postal code or an equivalent."},"country_code":{"type":"string","minLength":2,"maxLength":2,"description":"The 2-character ISO 3166-1 code that identifies the country or region. Note: The country code for Great Britain is `GB` and not `UK` as used in the top-level domain names for that country."}},"additionalProperties":false,"description":"The shipping address for the order."},{"type":"null"}],"description":"The shipping address for the order.","default":null},"notes":{"anyOf":[{"anyOf":[{"not":{}},{"type":"string"}]},{"type":"null"}],"default":null},"returnUrl":{"type":"string","default":"https://example.com/returnUrl"},"cancelUrl":{"type":"string","default":"https://example.com/cancelUrl"}},"required":["currencyCode","items"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"create_refund","description":"\nInitiate a refund for a PayPal payment capture.\nIf you have an order ID instead of a capture ID, first use the get order tool to obtain the capture ID from the order details.\nThis function allows you to return funds to a customer by refunding a previously captured payment. You can issue a full refund or specify a partial amount. If multiple captures exist for an order, request clarification on which specific capture to refund.\nRequired parameters:\n- Capture ID: The ID of the capture to refund\n- Amount (optional): For partial refunds, specify the amount to refund (must be less than or equal to the captured amount)\n- Note to Payer (optional): Additional explanation visible to the customer\nResponse details include:\n- Refund ID and status\n- Refunded amount and currency\n","input_schema":{"type":"object","properties":{"capture_id":{"type":"string","description":"The ID of the capture to refund."},"amount":{"type":"object","properties":{"currency_code":{"type":"string"},"value":{"type":"string"}},"required":["currency_code","value"],"additionalProperties":false,"description":"The amount to refund. 
If not specified, the full captured amount is refunded."},"invoice_id":{"type":"string","description":"The invoice ID that is used to track this payment."},"note_to_payer":{"type":"string","description":"A note to the payer."}},"required":["capture_id"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}}],"mcp_tools_dict":{"paypal":["create_invoice","create_product","create_subscription_plan","create_shipment_tracking","create_order","create_refund"]},"query":"Create an invoice for Tech Solutions Inc. for a Consultation Service costing 150.00 USD.","function_call_label":[{"name":"create_invoice","step":"1","id":"1","mcp_server":"paypal","similar_tools":[],"input":{"detail":{"currency_code":"USD"},"invoicer":{"business_name":"Tech Solutions Inc."},"items":[{"name":"Consultation Service","quantity":"1","unit_amount":{"currency_code":"USD","value":"150.00"}}]},"output":{"status_code":200,"result":{}}}]}] 2 | --------------------------------------------------------------------------------
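
The three demo files above (filesystem, browser, pay) share one record layout: each entry carries a uuid, a category, a call_type, the full tools list with draft-07 input_schema definitions, an mcp_tools_dict mapping each MCP server to its tool names, a natural-language query, and a function_call_label holding the ground-truth call (tool name, mcp_server, any similar_tools, the input arguments, and the expected output status). The Python sketch below is only an illustration of how such a record could be consumed: it validates the labelled input against the tool's input_schema via the jsonschema package and compares a predicted call against the label. It is not the repository's own evaluation code (that lives in src/mcp_tool_bench/evaluation/evaluation_utils.py and src/mcp_tool_bench/utils/calculate_metrics.py); helper names such as check_call are invented here, and the exact-match comparison is an assumption rather than the benchmark's official metric.

import json
from jsonschema import validate, ValidationError  # pip install jsonschema

def load_records(path):
    """Load one of the *_single_demo.json files (a JSON array of records)."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def schema_for(record, tool_name):
    """Return the input_schema of the named tool inside a record, if present."""
    for tool in record["tools"]:
        if tool["name"] == tool_name:
            return tool["input_schema"]
    return None

def check_call(record, predicted_name, predicted_input):
    """Illustrative check: does a predicted call match the ground-truth label?

    Assumption: single-step records ("call_type": "single") carry exactly one
    entry in function_call_label; exact matching on name and input is used
    here purely for demonstration, not as the benchmark's official metric.
    """
    label = record["function_call_label"][0]
    schema = schema_for(record, predicted_name)
    if schema is not None:
        try:
            # The input_schema fields in the demo files declare draft-07.
            validate(instance=predicted_input, schema=schema)
        except ValidationError as e:
            return False, f"input_schema violation: {e.message}"
    # Treat the labelled tool and any listed similar_tools as acceptable
    # (e.g. puppeteer_navigate alongside playwright_navigate in the browser demo).
    acceptable = {label["name"]} | {t["name"] for t in label.get("similar_tools", [])}
    if predicted_name not in acceptable:
        return False, f"expected one of {sorted(acceptable)}, got {predicted_name}"
    if predicted_name == label["name"] and predicted_input != label["input"]:
        return False, "arguments differ from the labelled input"
    return True, "ok"

if __name__ == "__main__":
    records = load_records("data/pay/pay_single_demo.json")
    rec = records[0]  # the create_invoice example shown above
    ok, msg = check_call(rec, "create_invoice", rec["function_call_label"][0]["input"])
    print(rec["query"], "->", ok, msg)

Accepting similar_tools mirrors how the label in browser_single_demo.json lists puppeteer_navigate as an alternative to playwright_navigate; how the actual evaluation weighs such alternatives, and how it scores argument matches, is defined by the repository's own metric code rather than by this sketch.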