├── __init__.py ├── doc ├── __init__.py ├── image_filesystem.jpg ├── tools_file_system.jpg ├── image-finance-agent.jpg ├── image-google-search.jpg ├── image_paypal_server.jpg ├── image_browser_puppeteer.jpg ├── image_google_map_server.jpg ├── model_performance_ast_subplots.png └── model_performance_pass_subplots.png ├── src ├── __init__.py └── mcp_tool_bench │ ├── __init__.py │ ├── agents │ ├── __init__.py │ ├── base_tool_call_agent │ │ ├── __init__.py │ │ ├── prompt.py │ │ └── check_functions.py │ └── data_generator_agent │ │ ├── __init__.py │ │ ├── utils │ │ ├── __init__.py │ │ ├── generate_query.py │ │ ├── pre_process.py │ │ └── prompt_reference.py │ │ └── run_data_generator.py │ ├── model_utils │ ├── __init__.py │ ├── model_provider.py │ ├── openai_api.py │ ├── kimi_api.py │ ├── custom_openai_api.py │ ├── claude_api.py │ ├── qwen_api.py │ └── base_api.py │ ├── common_utils.py │ ├── global_variables.py │ ├── evaluation │ └── evaluation_utils.py │ └── utils │ ├── count_tools.py │ └── calculate_metrics.py ├── mcp ├── config │ └── __init__.py └── tools │ └── browser │ └── puppeteer_puppeteer.json ├── data ├── file_system │ ├── test_project_root │ │ ├── data │ │ │ ├── README.md │ │ │ ├── test_file_txt_1.txt │ │ │ ├── test_file_csv_1.csv │ │ │ ├── test_file_json_1.json │ │ │ └── test_file_json_2.json │ │ ├── tests │ │ │ └── unit │ │ │ │ └── test_calculations.py │ │ ├── requirements.txt │ │ ├── docs │ │ │ └── README.md │ │ └── src │ │ │ ├── main.py │ │ │ ├── config │ │ │ └── settings.yaml │ │ │ └── utils │ │ │ └── file_utils.py │ └── filesystem_single_demo.json ├── finance │ └── finance_single_demo.json ├── search │ └── search_single_demo.json ├── browser │ └── browser_single_demo.json └── pay │ └── pay_single_demo.json ├── run.sh └── run.py /__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /doc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mcp/config/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data/file_system/test_project_root/data/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/base_tool_call_agent/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 
-------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/data_generator_agent/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/data_generator_agent/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /doc/image_filesystem.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/image_filesystem.jpg -------------------------------------------------------------------------------- /doc/tools_file_system.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/tools_file_system.jpg -------------------------------------------------------------------------------- /doc/image-finance-agent.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/image-finance-agent.jpg -------------------------------------------------------------------------------- /doc/image-google-search.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/image-google-search.jpg -------------------------------------------------------------------------------- /doc/image_paypal_server.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/image_paypal_server.jpg -------------------------------------------------------------------------------- /data/file_system/test_project_root/data/test_file_txt_1.txt: -------------------------------------------------------------------------------- 1 | Test file 1: This is a test file dedicated to the file system server. 
-------------------------------------------------------------------------------- /doc/image_browser_puppeteer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/image_browser_puppeteer.jpg -------------------------------------------------------------------------------- /doc/image_google_map_server.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/image_google_map_server.jpg -------------------------------------------------------------------------------- /doc/model_performance_ast_subplots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/model_performance_ast_subplots.png -------------------------------------------------------------------------------- /doc/model_performance_pass_subplots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcp-tool-bench/MCPToolBenchPP/HEAD/doc/model_performance_pass_subplots.png -------------------------------------------------------------------------------- /data/file_system/test_project_root/data/test_file_csv_1.csv: -------------------------------------------------------------------------------- 1 | query,answer 2 | hello,hi 3 | How do you do?,"Doing well, thanks. How about yourself?" -------------------------------------------------------------------------------- /data/file_system/test_project_root/tests/unit/test_calculations.py: -------------------------------------------------------------------------------- 1 | import math 2 | def main(): 3 | print(math.cos(1)) 4 | if __name__ == "__main__": 5 | main() -------------------------------------------------------------------------------- /data/file_system/test_project_root/data/test_file_json_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "project_name": "mcp-tool-bench", 3 | "file_name": "test_file_json_1", 4 | "context": {"param1": "1", "param2": "2"}, 5 | "is_locked": false, 6 | } 7 | -------------------------------------------------------------------------------- /data/file_system/test_project_root/data/test_file_json_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "project_name": "mcp-tool-bench", 3 | "file_name": "test_file_json_2", 4 | "context": {"param3": "3", "param4": "4"}, 5 | "is_locked": false, 6 | } 7 | -------------------------------------------------------------------------------- /data/file_system/test_project_root/requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | python>=3.8.0 3 | pathlib2>=2.3.0 4 | typing-extensions>=4.0.0 5 | tqdm>=4.64.0 6 | 7 | # File system operations 8 | shutil>=1.7.0 9 | watchdog>=2.1.0 10 | pyfilesystem2>=2.4.0 11 | 12 | # Data processing 13 | numpy>=1.21.0 14 | pandas>=1.3.0 15 | json5>=0.9.0 16 | 17 | # Security 18 | pycryptodome>=3.15.0 19 | bcrypt>=3.2.0 20 | 21 | # Testing 22 | pytest>=6.2.0 23 | pytest-cov>=2.12.0 24 | 25 | # Development tools 26 | black>=21.7b0 27 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/common_utils.py: -------------------------------------------------------------------------------- 1 | def 
add_conflict_toolname(tool_name, server_name): 2 | tool_name_register = tool_name + "__" + server_name 3 | return tool_name_register 4 | 5 | def get_conflict_toolname_original(tool_name, server_name): 6 | """ 7 | {server_name}__{tool_name} 8 | """ 9 | tool_name_norm = tool_name 10 | if tool_name.startswith(server_name): 11 | tool_name_norm = tool_name.split("__")[-1] if len(tool_name.split("__")) > 0 else tool_name 12 | return tool_name_norm 13 | -------------------------------------------------------------------------------- /data/file_system/test_project_root/docs/README.md: -------------------------------------------------------------------------------- 1 | # Function Calling Evaluation Project 2 | 3 | This is a test project for file system mcp server. 4 | 5 | ## Project Structure 6 | 7 | ``` 8 | # Project Structure 9 | project_root/ 10 | ├── src/ 11 | │ ├── main.py 12 | │ ├── utils/ 13 | │ │ └── file_utils.py 14 | │ └── config/ 15 | │ └── settings.yaml 16 | ├── tests/ 17 | │ ├── unit/ 18 | │ │ └── test_calculations.py 19 | ├── docs/ 20 | │ └── README.md 21 | ├── data/ 22 | │ ├── test_file_csv_1.csv 23 | │ ├── test_file_txt_1.txt 24 | │ └── test_file_json_1.json 25 | │ └── test_file_json_2.json 26 | └── requirements.txt 27 | ``` 28 | -------------------------------------------------------------------------------- /data/file_system/test_project_root/src/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | import logging 5 | from typing import Optional 6 | 7 | def setup_logging(log_file: Optional[str] = None) -> None: 8 | """Initialize logging configuration""" 9 | logging.basicConfig( 10 | level=logging.INFO, 11 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', 12 | handlers=[ 13 | logging.FileHandler(log_file) if log_file else logging.StreamHandler() 14 | ] 15 | ) 16 | 17 | def main() -> None: 18 | """Main entry point for the application""" 19 | try: 20 | # Initialize logging 21 | setup_logging() 22 | 23 | logger = logging.getLogger(__name__) 24 | logger.info("Starting application...") 25 | 26 | # Your application logic here 27 | logger.info("Application initialized successfully") 28 | 29 | except Exception as e: 30 | logging.error(f"Application failed: {str(e)}") 31 | sys.exit(1) 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /data/file_system/test_project_root/src/config/settings.yaml: -------------------------------------------------------------------------------- 1 | # Application basic configuration 2 | app: 3 | name: "MyApplication" 4 | version: "1.0.0" 5 | debug: false 6 | secret_key: "django-insecure-5^s!w8z$1q#d%6^h&y2*" 7 | allowed_hosts: 8 | - "localhost" 9 | - "127.0.0.1" 10 | - "example.com" 11 | 12 | # Database configuration 13 | database: 14 | default: 15 | engine: "django.db.backends.postgresql" 16 | name: "app_db" 17 | user: "db_user" 18 | password: "db_password_123" 19 | host: "localhost" 20 | port: 5432 21 | options: 22 | sslmode: "prefer" 23 | 24 | # Cache configuration 25 | cache: 26 | default: "redis://localhost:6379/0" 27 | session: "redis://localhost:6379/1" 28 | timeout: 300 29 | 30 | # Logging configuration 31 | logging: 32 | level: "INFO" 33 | handlers: 34 | file: 35 | path: "/var/log/app.log" 36 | max_size: "10MB" 37 | backup_count: 5 38 | console: 39 | enabled: true 40 | 41 | # Email configuration 42 | email: 43 | backend: 
"django.core.mail.backends.smtp.EmailBackend" 44 | host: "smtp.example.com" 45 | port: 587 46 | use_tls: true 47 | username: "user@example.com" 48 | password: "email_password_123" 49 | default_from: "noreply@example.com" 50 | 51 | # Third-party services configuration 52 | services: 53 | payment: 54 | api_key: "pay_sk_test_1234567890" 55 | webhook_secret: "whsec_0987654321" 56 | analytics: 57 | enabled: true 58 | api_key: "analytics_key_abcdef" 59 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/global_variables.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, List, Any, Optional 3 | from pydantic_settings import BaseSettings, SettingsConfigDict 4 | 5 | class Settings(BaseSettings): 6 | 7 | QWEN_API_KEY: Optional[str] = None 8 | OPENAI_API_KEY: Optional[str] = None 9 | ANTHROPIC_API_KEY: Optional[str] = None 10 | GOOGLE_API_KEY: Optional[str] = None 11 | MISTRAL_API_KEY: Optional[str] = None 12 | KIMI_API_KEY: Optional[str] = None 13 | 14 | # Custom OpenAI-compatible API settings 15 | CUSTOM_OPENAI_API_KEY: Optional[str] = None 16 | CUSTOM_OPENAI_BASE_URL: Optional[str] = None 17 | 18 | model_config = SettingsConfigDict( 19 | env_file=".env", 20 | env_file_encoding="utf-8", 21 | extra="ignore" 22 | ) 23 | 24 | settings = Settings() 25 | 26 | ## Model Name Enum 27 | # Claude 28 | MODEL_SELECTION_CLAUDE_OPUS_4 = "claude-opus-4" 29 | MODEL_SELECTION_CLAUDE_37 = "claude-3-7-sonnet-20250219" 30 | # OpenAI 31 | MODEL_SELECTION_GPT4O = "gpt-4o" 32 | # Gemini 33 | MODEL_SELECTION_GEMINI_25_FLASH = "gemini-2.5-flash" 34 | # Qwen 35 | MODEL_SELECTION_QWEN25_MAX = "qwen-max" # latest update to Qwen2.5 36 | MODEL_SELECTION_QWEN3_PLUS = "qwen-plus" 37 | MODEL_SELECTION_QWEN3_TURBO = "qwen-turbo" 38 | MODEL_SELECTION_QWEN3_235B = "qwen3-235b-a22b-instruct-2507" 39 | MODEL_SELECTION_QWEN3_CODER = "qwen3-coder-plus" 40 | # Deepseek 41 | MODEL_SELECTION_DEEPSEEK_R1 = "deepseek-r1" 42 | # Kimi 43 | MODEL_SELECTION_KIMI_K2 = "kimi-k2-0711-preview" 44 | 45 | ## Constant KEY 46 | KEY_MCP_TOOLS_DICT = "mcp_tools_dict" 47 | KEY_BASE_COMPARE_FUNC = "base_compare_func" 48 | KEY_COMPLETION = "completion" 49 | KEY_REASON_CONTENT = "reason" 50 | KEY_FUNCTION_CALL = "function_call" 51 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #### Run 2 | 3 | ### start server Enable MCP Tool Client 4 | cd ./mcp/mcp-marketplace/app/mcp_tool_use 5 | uvicorn src.app:app --port 5000 6 | 7 | ### Test Run Demo 8 | python3 run.py --stage tool_call --input_file ./data/browser/browser_single_demo.json --category browser --model qwen3-max --pass_k 1,3 --evaluation_trial_per_task 5 9 | 10 | ## OpenAI 11 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model gpt-4o --pass_k 1,3 12 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model gpt-4.1 --pass_k 1,3 13 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model o3 --pass_k 1,3 14 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model o3-pro --pass_k 1,3 15 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model o4-mini 
--pass_k 1,3 16 | 17 | ### Claude API 18 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model claude-opus-4-20250514 --pass_k 1,3 19 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model claude-sonnet-4-20250514 --pass_k 1,3 20 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model claude-3-7-sonnet-20250219 --pass_k 1,3 21 | 22 | ### Qwen API 23 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model qwen3-max --pass_k 1,3 24 | python3 run.py --stage tool_call --input_file ./data/browser/browser_0713_single_500.json --category browser --model qwen3-plus --pass_k 1,3 25 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/model_provider.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any 2 | 3 | from ..global_variables import * 4 | from .qwen_api import QwenModelAPIProvider 5 | from .kimi_api import KimiModelAPIProvider 6 | from .claude_api import ClaudeModelAPIProvider 7 | from .openai_api import OpenAIModelAPIProvider 8 | from .custom_openai_api import CustomOpenAIAPIProvider 9 | 10 | _global_model_provider: Dict[str, Any] = {} 11 | 12 | ## CLAUDE 13 | if settings.ANTHROPIC_API_KEY: 14 | _global_model_provider[MODEL_SELECTION_CLAUDE_37] = ClaudeModelAPIProvider(MODEL_SELECTION_CLAUDE_37) 15 | _global_model_provider[MODEL_SELECTION_CLAUDE_OPUS_4] = ClaudeModelAPIProvider(MODEL_SELECTION_CLAUDE_OPUS_4) 16 | 17 | ## OPENAI 18 | if settings.OPENAI_API_KEY: 19 | _global_model_provider[MODEL_SELECTION_GPT4O] = OpenAIModelAPIProvider(MODEL_SELECTION_GPT4O) 20 | 21 | ## QWEN 22 | if settings.QWEN_API_KEY: 23 | _global_model_provider[MODEL_SELECTION_QWEN25_MAX] = QwenModelAPIProvider(MODEL_SELECTION_QWEN25_MAX) 24 | _global_model_provider[MODEL_SELECTION_QWEN3_PLUS] = QwenModelAPIProvider(MODEL_SELECTION_QWEN3_PLUS) 25 | _global_model_provider[MODEL_SELECTION_QWEN3_TURBO] = QwenModelAPIProvider(MODEL_SELECTION_QWEN3_TURBO) 26 | _global_model_provider[MODEL_SELECTION_QWEN3_235B] = QwenModelAPIProvider(MODEL_SELECTION_QWEN3_235B) 27 | _global_model_provider[MODEL_SELECTION_QWEN3_CODER] = QwenModelAPIProvider(MODEL_SELECTION_QWEN3_CODER) 28 | 29 | ## KIMI 30 | if settings.KIMI_API_KEY: 31 | _global_model_provider[MODEL_SELECTION_KIMI_K2] = KimiModelAPIProvider(MODEL_SELECTION_KIMI_K2) 32 | 33 | 34 | def get_model_provider(model: str): 35 | """ 36 | Get or create a model provider for the given model. 37 | If the model exists in _global_model_provider, return it. 38 | Otherwise, try to create a CustomOpenAI provider if custom settings are available. 
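    Example (hypothetical usage, assuming the relevant API keys are set in .env):
        provider = get_model_provider("gpt-4o")
        result = provider.api_chat(messages) if provider else {}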
39 | 40 | Args: 41 | model: The model name to get/create provider for 42 | 43 | Returns: 44 | Model provider instance or None if not available 45 | """ 46 | # Check if model already exists in global provider 47 | if model in _global_model_provider: 48 | return _global_model_provider[model] 49 | 50 | # If custom OpenAI settings are available, create a dynamic provider 51 | if settings.CUSTOM_OPENAI_BASE_URL and settings.CUSTOM_OPENAI_API_KEY: 52 | # Create a new CustomOpenAI provider with the requested model name 53 | provider = CustomOpenAIAPIProvider( 54 | model_name=model, 55 | base_url=settings.CUSTOM_OPENAI_BASE_URL, 56 | api_key=settings.CUSTOM_OPENAI_API_KEY 57 | ) 58 | # Cache it for future use 59 | _global_model_provider[model] = provider 60 | return provider 61 | 62 | return None 63 | -------------------------------------------------------------------------------- /data/finance/finance_single_demo.json: -------------------------------------------------------------------------------- 1 | [{"uuid":"c0720051-d504-4754-81eb-184f96a922eb","category":"finance","call_type":"single","tools":[{"name":"get_stock_price_global_market","description":" Get Public Available Stock Symbols from Global Marketplace\n\n Args:\n symbol_list (List): List of Symbols, such as Tencent: 700, Kuaishou: 1024, Tesla (TSLA), Microsoft(MSFT), Google (GOOG), London Stock Exchange Market, Shell (quote: SHEL), Unilever (quote: ULVR)\n market (str): \"HK\", \"CN_MAINLAND\", \"US\", \"LSE\", \"NSE_INDIA\", etc.\n \n Return: \n str: json str with below values samples\n\n [{'symbol': 'SH600036',\n 'current': 45.78,\n 'percent': 1.33,\n 'chg': 0.6,\n 'high': '45.81 CNY',\n 'low': '44.95 CNY',\n 'avg_price': '45.485751910823915 CNY',\n 'timestamp': 1750057200000,\n 'open': 45.08,\n 'last_close': 45.18,\n 'market_capital': 1154564531614.0,\n 'change': '0.6(1.33%)',\n 'previous_close': '45.18 CNY',\n 'market_capitalization': '11545.65 亿 CNY',\n 'pe_ratio': '',\n 'update_time': '2025-06-16 15:00:00',\n 'source': 'XUEQIU.COM, https://xueqiu.com/S/SH600036',\n 'data_source': 'xueqiu.com',\n 'source_url': 'https://xueqiu.com/S/SH600036'},\n ","input_schema":{"properties":{"symbol_list":{"items":{"type":"string"},"title":"Symbol List","type":"array"},"market":{"title":"Market","type":"string"}},"required":["symbol_list","market"],"title":"get_stock_price_global_marketArguments","type":"object"}}],"mcp_tools_dict":{"finance-agent-mcp-server":["get_stock_price_global_market"]},"query":"What is the current stock price of Tesla in the US market?","function_call_label":[{"name":"get_stock_price_global_market","step":"1","id":"1","mcp_server":"finance-agent-mcp-server","similar_tools":[],"input":{"symbol_list":["TSLA"],"market":"US"},"output":{"status_code":200,"result":{}}}]},{"uuid":"eef90151-02de-446c-94b1-da330b9b26c6","category":"finance","call_type":"single","tools":[{"name":"get_stock_price_global_market","description":" Get Public Available Stock Symbols from Global Marketplace\n\n Args:\n symbol_list (List): List of Symbols, such as Tencent: 700, Kuaishou: 1024, Tesla (TSLA), Microsoft(MSFT), Google (GOOG), London Stock Exchange Market, Shell (quote: SHEL), Unilever (quote: ULVR)\n market (str): \"HK\", \"CN_MAINLAND\", \"US\", \"LSE\", \"NSE_INDIA\", etc.\n \n Return: \n str: json str with below values samples\n\n [{'symbol': 'SH600036',\n 'current': 45.78,\n 'percent': 1.33,\n 'chg': 0.6,\n 'high': '45.81 CNY',\n 'low': '44.95 CNY',\n 'avg_price': '45.485751910823915 CNY',\n 'timestamp': 1750057200000,\n 'open': 45.08,\n 
'last_close': 45.18,\n 'market_capital': 1154564531614.0,\n 'change': '0.6(1.33%)',\n 'previous_close': '45.18 CNY',\n 'market_capitalization': '11545.65 亿 CNY',\n 'pe_ratio': '',\n 'update_time': '2025-06-16 15:00:00',\n 'source': 'XUEQIU.COM, https://xueqiu.com/S/SH600036',\n 'data_source': 'xueqiu.com',\n 'source_url': 'https://xueqiu.com/S/SH600036'},\n ","input_schema":{"properties":{"symbol_list":{"items":{"type":"string"},"title":"Symbol List","type":"array"},"market":{"title":"Market","type":"string"}},"required":["symbol_list","market"],"title":"get_stock_price_global_marketArguments","type":"object"}}],"mcp_tools_dict":{"finance-agent-mcp-server":["get_stock_price_global_market"]},"query":"What is the current stock price and market capitalization of Shell in the London Stock Exchange market?","function_call_label":[{"name":"get_stock_price_global_market","step":"1","id":"1","mcp_server":"finance-agent-mcp-server","similar_tools":[],"input":{"symbol_list":["SHEL"],"market":"LSE"},"output":{"status_code":200,"result":{}}}]}] 2 | -------------------------------------------------------------------------------- /data/file_system/test_project_root/src/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import hashlib 4 | from pathlib import Path 5 | from typing import Union, List, Optional 6 | 7 | def read_file(file_path: Union[str, Path]) -> str: 8 | """Read content from a text file. 9 | 10 | Args: 11 | file_path: Path to the file to read 12 | 13 | Returns: 14 | Content of the file as string 15 | 16 | Raises: 17 | FileNotFoundError: If file doesn't exist 18 | IOError: If there are permission issues 19 | """ 20 | with open(file_path, 'r', encoding='utf-8') as f: 21 | return f.read() 22 | 23 | def write_file(file_path: Union[str, Path], content: str, overwrite: bool = False) -> None: 24 | """Write content to a file. 25 | 26 | Args: 27 | file_path: Path to write to 28 | content: Content to write 29 | overwrite: Whether to overwrite existing file 30 | 31 | Raises: 32 | FileExistsError: If file exists and overwrite=False 33 | IOError: For permission issues 34 | """ 35 | if os.path.exists(file_path) and not overwrite: 36 | raise FileExistsError(f"File {file_path} already exists") 37 | 38 | with open(file_path, 'w', encoding='utf-8') as f: 39 | f.write(content) 40 | 41 | def get_file_hash(file_path: Union[str, Path], algorithm: str = 'sha256') -> str: 42 | """Calculate file hash. 43 | 44 | Args: 45 | file_path: Path to file 46 | algorithm: Hash algorithm (md5, sha1, sha256) 47 | 48 | Returns: 49 | Hex digest of file content 50 | """ 51 | hash_func = getattr(hashlib, algorithm)() 52 | with open(file_path, 'rb') as f: 53 | for chunk in iter(lambda: f.read(4096), b''): 54 | hash_func.update(chunk) 55 | return hash_func.hexdigest() 56 | 57 | def ensure_dir_exists(dir_path: Union[str, Path]) -> None: 58 | """Ensure directory exists, create if not. 59 | 60 | Args: 61 | dir_path: Path to directory 62 | """ 63 | os.makedirs(dir_path, exist_ok=True) 64 | 65 | def list_files(dir_path: Union[str, Path], recursive: bool = False) -> List[str]: 66 | """List files in directory. 
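    Note: when recursive is False the entries are bare file names from dir_path,
    while recursive=True yields paths joined with their containing directories.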
67 | 68 | Args: 69 | dir_path: Directory to scan 70 | recursive: Whether to scan recursively 71 | 72 | Returns: 73 | List of file paths 74 | """ 75 | if recursive: 76 | return [os.path.join(root, f) 77 | for root, _, files in os.walk(dir_path) 78 | for f in files] 79 | return [f for f in os.listdir(dir_path) 80 | if os.path.isfile(os.path.join(dir_path, f))] 81 | 82 | def safe_delete(file_path: Union[str, Path]) -> bool: 83 | """Safely delete a file if it exists. 84 | 85 | Args: 86 | file_path: Path to file 87 | 88 | Returns: 89 | True if file was deleted, False if it didn't exist 90 | """ 91 | try: 92 | os.remove(file_path) 93 | return True 94 | except FileNotFoundError: 95 | return False 96 | 97 | def copy_file(src: Union[str, Path], dst: Union[str, Path], overwrite: bool = False) -> None: 98 | """Copy file from source to destination. 99 | 100 | Args: 101 | src: Source file path 102 | dst: Destination file path 103 | overwrite: Whether to overwrite existing file 104 | 105 | Raises: 106 | FileExistsError: If destination exists and overwrite=False 107 | """ 108 | if os.path.exists(dst) and not overwrite: 109 | raise FileExistsError(f"File {dst} already exists") 110 | shutil.copy2(src, dst) 111 | 112 | def get_file_size(file_path: Union[str, Path]) -> int: 113 | """Get file size in bytes. 114 | 115 | Args: 116 | file_path: Path to file 117 | 118 | Returns: 119 | File size in bytes 120 | """ 121 | return os.path.getsize(file_path) 122 | 123 | def is_same_file(file1: Union[str, Path], file2: Union[str, Path]) -> bool: 124 | """Check if two files are identical by comparing hashes. 125 | 126 | Args: 127 | file1: First file path 128 | file2: Second file path 129 | 130 | Returns: 131 | True if files have same content 132 | """ 133 | return get_file_hash(file1) == get_file_hash(file2) 134 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/data_generator_agent/utils/generate_query.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Any 2 | from utils.prompt import user_prompt_template_generate_query, system_prompt_template_generate_query, system_prompt_template_generate_query_for_single_tool, system_prompt_template_generate_query_for_filesystem 3 | from utils.prompt_reference import candidate_reference_list, special_needs_description_list 4 | import json 5 | from src.mcp_tool_bench.model_utils.model_provider import _global_model_provider 6 | from src.mcp_tool_bench.global_variables import * 7 | import html 8 | import re 9 | from bs4 import BeautifulSoup 10 | from tqdm import tqdm 11 | 12 | def decode_html_entities(s): 13 | """Decode HTML entities""" 14 | # Try using html.unescape 15 | first_decode = html.unescape(s) 16 | # Check if still contains undecoded entities 17 | if "&" in first_decode and ";" in first_decode: 18 | # Use BeautifulSoup for further decoding 19 | soup = BeautifulSoup(first_decode, "html.parser") 20 | second_decode = soup.get_text() 21 | return second_decode 22 | return first_decode 23 | 24 | def auto_fix_unclosed_quotes(data): 25 | """ 26 | Automatically detect and fix unclosed quotes in strings. 
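    List inputs are returned unchanged; string inputs additionally get
    "key:value" pairs normalized to "key: value" before the quote check.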
27 | """ 28 | if isinstance(data, list): 29 | return data 30 | 31 | """ 32 | Automatically add space after colon in key-value pairs, e.g., convert 'key:value' to 'key: value' 33 | """ 34 | # Use regex to match cases where colon is not followed by space, and add space 35 | data = re.sub(r'(?m)^(\s*[^#\s][^:]*):([^\s])', r'\1: \2', data) 36 | 37 | lines = data.split("\n") 38 | fixed_lines = [] 39 | for line in lines: 40 | # Detect and fix unclosed quotes 41 | if line.count('"') % 2 != 0: 42 | line = line + '"' # Append a quote to close 43 | fixed_lines.append(line) 44 | return "\n".join(fixed_lines) 45 | 46 | def process_response(response_text): 47 | """Process GPT response text""" 48 | if not response_text: 49 | return "" 50 | 51 | raw_val = decode_html_entities(response_text) 52 | raw_val = auto_fix_unclosed_quotes(raw_val) 53 | decoded_json_str = html.unescape(raw_val) 54 | decoded_json_str = decoded_json_str.replace("```json\n", "").replace("```", "").replace("\n", "") 55 | return decoded_json_str 56 | 57 | def generate_query_and_function_calls(extraction_results: List[List[Dict]], category: str) -> List[Dict]: 58 | """ 59 | Generate user questions and tool call examples based on extracted tools list 60 | 61 | Args: 62 | extraction_results: Tools extraction result list 63 | category: Data category 64 | 65 | Returns: 66 | List[Dict]: Generated data list 67 | """ 68 | # gpt_api = GPTAPI() 69 | generated_data = [] 70 | lang = "English" 71 | # Add progress bar for processing extraction results 72 | for tools_list in tqdm(extraction_results, desc="Generating queries and function calls", unit="tool_list"): 73 | # Here should call GPT API to generate query and function_call_label 74 | 75 | user_prompt = user_prompt_template_generate_query.format(tools=tools_list) 76 | # system_prompt = system_prompt_template_generate_query_for_filesystem.format(candidate_count=5, language=lang) 77 | candidate_reference = candidate_reference_list.get(category, {}) 78 | special_needs_description = special_needs_description_list.get(category, "") 79 | system_prompt = system_prompt_template_generate_query.format(candidate_count=10, language=lang, candidate_reference=candidate_reference, special_needs_description=special_needs_description) 80 | 81 | messages = [ 82 | { 83 | "role": "system", 84 | "content": system_prompt 85 | }, 86 | { 87 | "role": "user", 88 | "content": user_prompt 89 | } 90 | ] 91 | # print("messages: ", messages) 92 | model_provider = _global_model_provider[MODEL_SELECTION_GPT4O_ANT] if MODEL_SELECTION_GPT4O_ANT in _global_model_provider else None 93 | output = model_provider.api_chat(messages, wait_time=5) if model_provider is not None else {} 94 | print("output: ", output) 95 | raw_response = output[KEY_COMPLETION] if KEY_COMPLETION in output else "" 96 | 97 | # Normal chat: process string 98 | if isinstance(raw_response, str): 99 | result = process_response(raw_response) 100 | # result = gpt_api.call(user_prompt, system_prompt, wait_time=10) 101 | 102 | print("result: ", result) 103 | print("type(result): ", type(result)) 104 | 105 | try: 106 | result = json.loads(result) 107 | generated_data.append(result) 108 | except Exception as e: 109 | # logging.error(f"Error processing response: {e}") 110 | continue 111 | 112 | return generated_data 113 | -------------------------------------------------------------------------------- /mcp/tools/browser/puppeteer_puppeteer.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "puppeteer/puppeteer", 3 | 
"server_name": "puppeteer", 4 | "content_name": "puppeteer/puppeteer", 5 | "website": null, 6 | "content": null, 7 | "abstract": null, 8 | "field": "MCP SERVER", 9 | "subfield": "MCP SERVER", 10 | "category": "BROWSER", 11 | "publisher_id": null, 12 | "thumbnail_picture": null, 13 | "github": null, 14 | "mcp_server_config": null, 15 | "tools": [ 16 | { 17 | "name": "puppeteer_navigate", 18 | "description": "Navigate to a URL", 19 | "input_schema": { 20 | "type": "object", 21 | "properties": { 22 | "url": { 23 | "type": "string", 24 | "description": "URL to navigate to" 25 | }, 26 | "launchOptions": { 27 | "type": "object", 28 | "description": "PuppeteerJS LaunchOptions. Default null. If changed and not null, browser restarts. Example: { headless: true, args: ['--no-sandbox'] }" 29 | }, 30 | "allowDangerous": { 31 | "type": "boolean", 32 | "description": "Allow dangerous LaunchOptions that reduce security. When false, dangerous args like --no-sandbox will throw errors. Default false." 33 | } 34 | }, 35 | "required": [ 36 | "url" 37 | ] 38 | } 39 | }, 40 | { 41 | "name": "puppeteer_screenshot", 42 | "description": "Take a screenshot of the current page or a specific element", 43 | "input_schema": { 44 | "type": "object", 45 | "properties": { 46 | "name": { 47 | "type": "string", 48 | "description": "Name for the screenshot" 49 | }, 50 | "selector": { 51 | "type": "string", 52 | "description": "CSS selector for element to screenshot" 53 | }, 54 | "width": { 55 | "type": "number", 56 | "description": "Width in pixels (default: 800)" 57 | }, 58 | "height": { 59 | "type": "number", 60 | "description": "Height in pixels (default: 600)" 61 | }, 62 | "encoded": { 63 | "type": "boolean", 64 | "description": "If true, capture the screenshot as a base64-encoded data URI (as text) instead of binary image content. Default false." 
65 | } 66 | }, 67 | "required": [ 68 | "name" 69 | ] 70 | } 71 | }, 72 | { 73 | "name": "puppeteer_click", 74 | "description": "Click an element on the page", 75 | "input_schema": { 76 | "type": "object", 77 | "properties": { 78 | "selector": { 79 | "type": "string", 80 | "description": "CSS selector for element to click" 81 | } 82 | }, 83 | "required": [ 84 | "selector" 85 | ] 86 | } 87 | }, 88 | { 89 | "name": "puppeteer_fill", 90 | "description": "Fill out an input field", 91 | "input_schema": { 92 | "type": "object", 93 | "properties": { 94 | "selector": { 95 | "type": "string", 96 | "description": "CSS selector for input field" 97 | }, 98 | "value": { 99 | "type": "string", 100 | "description": "Value to fill" 101 | } 102 | }, 103 | "required": [ 104 | "selector", 105 | "value" 106 | ] 107 | } 108 | }, 109 | { 110 | "name": "puppeteer_select", 111 | "description": "Select an element on the page with Select tag", 112 | "input_schema": { 113 | "type": "object", 114 | "properties": { 115 | "selector": { 116 | "type": "string", 117 | "description": "CSS selector for element to select" 118 | }, 119 | "value": { 120 | "type": "string", 121 | "description": "Value to select" 122 | } 123 | }, 124 | "required": [ 125 | "selector", 126 | "value" 127 | ] 128 | } 129 | }, 130 | { 131 | "name": "puppeteer_hover", 132 | "description": "Hover an element on the page", 133 | "input_schema": { 134 | "type": "object", 135 | "properties": { 136 | "selector": { 137 | "type": "string", 138 | "description": "CSS selector for element to hover" 139 | } 140 | }, 141 | "required": [ 142 | "selector" 143 | ] 144 | } 145 | }, 146 | { 147 | "name": "puppeteer_evaluate", 148 | "description": "Execute JavaScript in the browser console", 149 | "input_schema": { 150 | "type": "object", 151 | "properties": { 152 | "script": { 153 | "type": "string", 154 | "description": "JavaScript code to execute" 155 | } 156 | }, 157 | "required": [ 158 | "script" 159 | ] 160 | } 161 | } 162 | ], 163 | "description": "This MCP (Model Context Protocol) is a Puppeteer-based browser automation tool that provides web browsing, screenshot capture, element interaction (clicking, filling, selecting, hovering), and JavaScript execution capabilities." 
164 | } 165 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/openai_api.py: -------------------------------------------------------------------------------- 1 | # #!/usr/bin/python 2 | # # -*- coding: UTF-8 -*- 3 | import json 4 | import logging 5 | import requests 6 | from typing import List, Dict, Any, Optional 7 | import os 8 | import sys 9 | import openai 10 | from openai import OpenAI 11 | 12 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 13 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../../..'))) 14 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../..'))) 15 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../'))) 16 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, './'))) 17 | 18 | from src.mcp_tool_bench.model_utils.base_api import * 19 | from src.mcp_tool_bench.global_variables import settings 20 | 21 | def tools_openai_wrapper(tools): 22 | tools_wrapped = [{ 23 | "type": "function", 24 | "function":{ 25 | "name": tool["name"] if "name" in tool else "", 26 | "description": tool["description"] if "description" in tool else "", 27 | "parameters": tool["input_schema"] if "input_schema" in tool else {} 28 | } 29 | } for tool in tools] 30 | return tools_wrapped 31 | 32 | class OpenAIModelAPIProvider(BaseModelAPIProvider): 33 | """ 34 | OpenAI API for chat and function calling. 35 | https://platform.openai.com/docs/api-reference/chat 36 | """ 37 | def __init__(self, model_name: str = ""): 38 | super().__init__(model_name) 39 | self.client = OpenAI( 40 | api_key=settings.OPENAI_API_KEY, 41 | base_url="https://api.openai.com/v1" 42 | ) 43 | 44 | def api_chat(self, messages: List, **kwargs) -> Dict[str, Any]: 45 | """ 46 | OpenAI chat completion. 47 | """ 48 | try: 49 | model = self.model_name 50 | if not model: 51 | model = "gpt-4o" 52 | 53 | response = self.client.chat.completions.create( 54 | model=model, 55 | messages=messages, 56 | temperature=kwargs.get("temperature", 0.3) 57 | ) 58 | completion, reasoningContent = post_process_openai_chat_response(response) 59 | result = { 60 | KEY_FUNCTION_CALL: {}, 61 | KEY_COMPLETION: completion, 62 | KEY_REASON_CONTENT: reasoningContent 63 | } 64 | return result 65 | 66 | except Exception as e: 67 | logging.error(f"Failed to process OpenAI api_chat: {e}") 68 | return {} 69 | 70 | def api_function_call(self, messages: List, tools: List, **kwargs) -> Dict[str, Any]: 71 | """ 72 | OpenAI function calling (tool calling). 
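        Note: only the first tool call in the model response is extracted and mapped.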
73 |         Args:
74 |             messages: List of message [{}, {}]
75 |             tools: List of tool definitions [{type: "function", function: {name: "", description: "", parameters: {}}}]
76 |         """
77 |         try:
78 |             model = self.model_name
79 |             if not model:
80 |                 model = "gpt-4o"
81 |             response = self.client.chat.completions.create(
82 |                 model=model,
83 |                 messages=messages,
84 |                 tools=tools,
85 |                 tool_choice="auto",
86 |                 temperature=kwargs.get("temperature", 0.3),
87 |                 **kwargs
88 |             )
89 |             tool_result = post_process_openai_function_call_response(response)
90 |             tool_call_mapped, completion, reasoningContent = function_call_result_common_mapper(tool_result)
91 | 
92 |             result = {
93 |                 KEY_FUNCTION_CALL: tool_call_mapped,
94 |                 KEY_COMPLETION: "",
95 |                 KEY_REASON_CONTENT: ""
96 |             }
97 |             return result
98 | 
99 |         except Exception as e:
100 |             logging.error(f"Failed to process OpenAI api_function_call: {e}")
101 |             return {}
102 | 
103 | def post_process_openai_chat_response(response):
104 |     """
105 |     Processes the response from OpenAI chat completion.
106 |     """
107 |     if response is None or not response.choices:
108 |         return "", ""
109 |     completion_content = ""
110 |     if response.choices[0].message.content:
111 |         completion_content = response.choices[0].message.content
112 |     return completion_content, ""
113 | 
114 | def post_process_openai_function_call_response(response):
115 |     """
116 |     Processes the response from OpenAI for function calls.
117 |     Extracts the tool call details.
118 |     """
119 |     if response is None or not response.choices or not response.choices[0].message:
120 |         return {}
121 | 
122 |     try:
123 |         message = response.choices[0].message
124 |         if message.tool_calls:
125 |             first_tool_call = message.tool_calls[0]
126 |             if first_tool_call.type == "function" and first_tool_call.function:
127 |                 tool_call = {
128 |                     "id": first_tool_call.id,
129 |                     "function": {
130 |                         "name": first_tool_call.function.name,
131 |                         "arguments": first_tool_call.function.arguments
132 |                     }
133 |                 }
134 |                 return tool_call
135 |         return {}
136 |     except Exception as e:
137 |         print (f"Failed to post_process_openai_function_call_response error {e}")
138 |         return {}
139 | 
140 | if __name__ == '__main__':
141 |     # Test function calling
142 |     user_prompt = "Weather query template"
143 |     system_prompt = ""
144 |     try:
145 |         messages = [{"role": "user", "content": user_prompt}]
146 |         current_dir = os.path.dirname(os.path.abspath(__file__))
147 |         package_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
148 |         input_file = os.path.join(package_dir, "mcp/tools/demo/demo_tools.json")
149 |         tools = json.load(open(input_file, "r", encoding="utf-8"))
150 |         wrappered_tools = tools_openai_wrapper(tools)
151 | 
152 |         gpt_api_provider = OpenAIModelAPIProvider()
153 |         result = gpt_api_provider.api_function_call(messages, wrappered_tools)
154 |         print("Function Call Response:", result)
155 |     except FileNotFoundError:
156 |         print("Demo tools file not found, skipping function call test")
157 | 
-------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/kimi_api.py: --------------------------------------------------------------------------------
1 | # #!/usr/bin/python
2 | # # -*- coding: UTF-8 -*-
3 | import json
4 | import logging
5 | import requests
6 | from typing import List, Dict, Any, Optional
7 | import os
8 | import sys
9 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
10 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../../..')))
11 | sys.path.insert(0, 
os.path.abspath(os.path.join(CURRENT_DIR, '../..'))) 12 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../'))) 13 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, './'))) 14 | 15 | from src.mcp_tool_bench.model_utils.base_api import * 16 | from src.mcp_tool_bench.global_variables import settings 17 | 18 | 19 | def tools_openai_wrapper(tools): 20 | tools_wrapped = [{ 21 | "type": "function", 22 | "function":{ 23 | "name": tool["name"] if "name" in tool else "", 24 | "description": tool["description"] if "description" in tool else "", 25 | "parameters": tool["input_schema"] if "input_schema" in tool else {} 26 | } 27 | } for tool in tools] 28 | return tools_wrapped 29 | 30 | class KimiModelAPIProvider(BaseModelAPIProvider): 31 | """ 32 | https://platform.moonshot.ai/docs/api/chat#public-service-address 33 | """ 34 | def api_chat(self, messages: List, **kwargs) -> Dict[str, Any]: 35 | """ 36 | Kimi model: "K2" 37 | """ 38 | try: 39 | model = self.model_name 40 | if model == "" or model is None: 41 | model = "kimi-k2-0711-preview" 42 | response = call_kimi_k2_chat(messages, model) 43 | tools, completion, reasoningContent = post_process_kimi_response(response) 44 | result = { 45 | KEY_FUNCTION_CALL: tools, 46 | KEY_COMPLETION: completion, 47 | KEY_REASON_CONTENT: reasoningContent 48 | } 49 | return result 50 | 51 | except Exception as e: 52 | logging.error(f"Failed to process api_chat") 53 | return {} 54 | 55 | def api_function_call(self, messages: List, tools: List, **kwargs) -> Dict[str, Any]: 56 | """ 57 | Args: 58 | messages: List of message [{}, {}] 59 | """ 60 | try: 61 | model = self.model_name 62 | if model == "" or model is None: 63 | model = "kimi-k2-0711-preview" 64 | response = call_kimi_k2_tools(messages, tools, model) 65 | tool_result = post_process_function_call_kimi(response) 66 | tool_call_mapped, completion, reasoningContent = function_call_result_common_mapper(tool_result) 67 | 68 | result = { 69 | KEY_FUNCTION_CALL: tool_call_mapped, 70 | KEY_COMPLETION: "", 71 | KEY_REASON_CONTENT: "" 72 | } 73 | # print (f"KimiModelAPIProvider debug api_function_call result return {result}") 74 | return result 75 | 76 | except Exception as e: 77 | logging.error(e) 78 | return {} 79 | 80 | def call_kimi_k2_chat(messages, model_name): 81 | from openai import OpenAI 82 | 83 | client = OpenAI( 84 | api_key = settings.KIMI_API_KEY, 85 | base_url = "https://api.moonshot.ai/v1", 86 | ) 87 | 88 | completion = client.chat.completions.create( 89 | model = model_name, 90 | messages = messages, 91 | temperature = 0.3, 92 | ) 93 | return completion 94 | 95 | def call_kimi_k2_tools(messages, tools, model_name): 96 | import logging 97 | import urllib3 98 | from openai import OpenAI 99 | 100 | # Completely disable all logging 101 | logging.disable(logging.CRITICAL) 102 | urllib3.disable_warnings() 103 | logging.getLogger("urllib3").setLevel(logging.CRITICAL) 104 | logging.getLogger("openai").setLevel(logging.CRITICAL) 105 | 106 | client = OpenAI( 107 | api_key = settings.KIMI_API_KEY, 108 | base_url = "https://api.moonshot.ai/v1", 109 | ) 110 | 111 | return client.chat.completions.create( 112 | model = model_name, 113 | messages = messages, 114 | tools = tools, 115 | temperature = 0.3, 116 | ) 117 | return completion 118 | 119 | def post_process_kimi_response(response): 120 | if response is None: 121 | return {} 122 | tools = {} 123 | completion = "" 124 | reasoningContent = "" 125 | try: 126 | completion = response.choices[0].message.content 127 | except Exception as e: 128 
| logging.error(e) 129 | return tools, completion, reasoningContent 130 | 131 | def post_process_function_call_kimi(response): 132 | if response is None: 133 | return {} 134 | try: 135 | if "error" in response: 136 | logging.error(f"post_process_function_call_kimi error {response}") 137 | return {} 138 | first_tool_call = response.choices[0].message.tool_calls[0] 139 | tool_call = { 140 | "id": first_tool_call.id, 141 | "function": { 142 | "name": first_tool_call.function.name, 143 | "arguments": first_tool_call.function.arguments 144 | } 145 | } 146 | return tool_call 147 | except Exception as e: 148 | logging.error(f"post_process_function_call_kimi {e}") 149 | return {} 150 | 151 | if __name__ == '__main__': 152 | gpt_api_provider = KimiModelAPIProvider(MODEL_SELECTION_KIMI_K2) 153 | 154 | # Test normal conversation 155 | # chat 156 | user_prompt = "Hello, how are you?" 157 | system_prompt = "You are a helpful assistant." 158 | messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}] 159 | result = gpt_api_provider.api_chat(messages) 160 | print("KIMI API Chat Response:", result) 161 | 162 | # Test function calling 163 | user_prompt = "Weather query template" 164 | system_prompt = "" 165 | try: 166 | messages = [{"role": "user", "content": user_prompt}] 167 | current_dir = os.path.dirname(os.path.abspath(__file__)) 168 | package_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir))) 169 | input_file = os.path.join(package_dir, "mcp/tools/demo/demo_tools.json") 170 | tools = json.load(open(input_file, "r", encoding="utf-8")) 171 | wrappered_tools = tools_openai_wrapper(tools) 172 | result = gpt_api_provider.api_function_call(messages, wrappered_tools) 173 | print("KIMI Function Call Response:", result) 174 | except FileNotFoundError: 175 | print("Demo tools file not found, skipping function call test") 176 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/data_generator_agent/run_data_generator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This file performs the following operations: 3 | 1. Extract tools field from all mcp json files in the specified category directory under mcp_marketplace, merge them into a single file, place it in the original directory as category_tools.json (check the directory, if this file already exists, no need to re-merge) 4 | 2. Randomly extract tools from category_tools.json, perform multiple extractions and generate extraction result files, placed in logs directory 5 | 3. For each extraction result, i.e., for each tools list, call LLM to generate user questions (query field) and tool call examples list (function_call_label field). Output files are placed in logs directory 6 | 4. Post-process user questions and tool call examples to remove unreasonable data. 7 | 5. 
Add uuid field, category field, tools field (all tools from category_tools.json, i.e., candidate set) to the processed data, save to data/category directory, file name is category_version.json 8 | ''' 9 | 10 | import json 11 | import uuid 12 | from pathlib import Path 13 | from typing import List, Dict, Any 14 | 15 | from .utils.pre_process import merge_mcp_tools 16 | from .utils.pre_process import random_extract_tools 17 | from .utils.post_process import post_process_data 18 | from .utils.generate_query import generate_query_and_function_calls 19 | 20 | def run_data_generation(category: str, data_version: str, mcp_config_path: str): 21 | """ 22 | Run data generation pipeline 23 | 24 | Args: 25 | category: Data category, such as browser, search 26 | data_version: Data version, such as v0, v1 27 | mcp_config_path: MCP configuration file path 28 | """ 29 | print(f"Starting data generation: category={category}, version={data_version}") 30 | 31 | # Step 1.1: Merge MCP tools 32 | print(" 1.1 Merge MCP tools") 33 | category_tools_path, mcp_tools_dict = merge_mcp_tools(category, mcp_config_path) 34 | 35 | # Step 1.2: Randomly extract tools 36 | print(" 1.2 Randomly extract tools") 37 | extraction_results = random_extract_tools(category_tools_path, min_tools=2, max_tools=5, num_extractions=10) 38 | 39 | # Step 1.3: Generate user questions and tool call examples 40 | print(" 1.3 Generate user questions and tool call examples") 41 | generated_data = generate_query_and_function_calls(extraction_results, category) 42 | 43 | # Step 1.4: Post-process data 44 | print(" 1.4 Post-process data") 45 | processed_data = post_process_data(generated_data, fill_iterations=3, category=category, category_tools_path=category_tools_path) 46 | 47 | # Step 1.5: Save final data 48 | print(" 1.5 Save final data") 49 | save_final_data(processed_data, category, data_version, category_tools_path, mcp_tools_dict) 50 | 51 | 52 | def save_final_data(processed_data: List[Dict], category: str, data_version: str, category_tools_path: str, mcp_tools_dict: Dict): 53 | """ 54 | Save final processed data 55 | 56 | Args: 57 | processed_data: Processed data 58 | category: Data category 59 | data_version: Data version 60 | category_tools_path: Category tools file path 61 | """ 62 | # Read all tools as candidate set 63 | with open(category_tools_path, 'r', encoding='utf-8-sig') as f: 64 | all_tools = json.load(f) 65 | 66 | # Remove mcp_server field from all_tools for the final data 67 | cleaned_tools = [] 68 | for tool in all_tools: 69 | tool_copy = tool.copy() 70 | tool_copy.pop('mcp_server', None) # Remove mcp_server field 71 | cleaned_tools.append(tool_copy) 72 | 73 | # Add necessary fields to each data item 74 | final_data = [] 75 | function_call_label_output = { 76 | "status_code": 200, 77 | "result": {} 78 | } 79 | for item in processed_data: 80 | # Determine type based on function_call_label length 81 | function_call_label = item.get('function_call_label', []) 82 | if isinstance(function_call_label, list): 83 | if len(function_call_label) == 1: 84 | item_call_type = "single" 85 | elif len(function_call_label) > 1: 86 | item_call_type = "multiple" 87 | else: 88 | item_call_type = "single" # Default for empty list 89 | else: 90 | item_call_type = "single" # Default for non-list or missing field 91 | 92 | # Process function_call_label: rename 'arguments' to 'input' and add 'output' field 93 | if isinstance(function_call_label, list): 94 | processed_function_call_label = [] 95 | for call_item in function_call_label: 96 | if 
isinstance(call_item, dict): 97 | processed_call_item = call_item.copy() 98 | # Rename 'arguments' to 'input' 99 | if 'arguments' in processed_call_item: 100 | processed_call_item['input'] = processed_call_item.pop('arguments') 101 | # Add 'output' field after 'input' 102 | if 'input' in processed_call_item: 103 | # Insert 'output' after 'input' 104 | input_value = processed_call_item['input'] 105 | del processed_call_item['input'] 106 | processed_call_item['input'] = input_value 107 | processed_call_item['output'] = function_call_label_output 108 | else: 109 | processed_call_item['output'] = function_call_label_output 110 | processed_function_call_label.append(processed_call_item) 111 | else: 112 | processed_function_call_label.append(call_item) 113 | else: 114 | processed_function_call_label = function_call_label 115 | 116 | final_item = { 117 | "uuid": str(uuid.uuid4()), 118 | "category": category, 119 | "call_type": item_call_type, # Type based on function_call_label length 120 | "tools": cleaned_tools, # Candidate tools set (without mcp_server field) 121 | "mcp_tools_dict": mcp_tools_dict, # MCP server to tools mapping 122 | **item 123 | } 124 | # Update the function_call_label field with processed version 125 | final_item['function_call_label'] = processed_function_call_label 126 | final_data.append(final_item) 127 | 128 | # Create output directory 129 | output_dir = Path(f"data/{category}") 130 | output_dir.mkdir(parents=True, exist_ok=True) 131 | 132 | # Save as JSON array 133 | output_file = output_dir / f"{category}_{data_version}.json" 134 | with open(output_file, 'w', encoding='utf-8') as f: 135 | json.dump(final_data, f, ensure_ascii=False, indent=2) 136 | 137 | print(f"Data saved to: {output_file}") 138 | print(f"Generated {len(final_data)} data items") 139 | print(f"MCP tools dict: {mcp_tools_dict}") 140 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/custom_openai_api.py: -------------------------------------------------------------------------------- 1 | # #!/usr/bin/python 2 | # # -*- coding: UTF-8 -*- 3 | import logging 4 | from typing import List, Dict, Any 5 | import os 6 | import sys 7 | from openai import OpenAI 8 | 9 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 10 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../../..'))) 11 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../..'))) 12 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../'))) 13 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, './'))) 14 | 15 | from src.mcp_tool_bench.model_utils.base_api import BaseModelAPIProvider, function_call_result_common_mapper, KEY_FUNCTION_CALL, KEY_COMPLETION, KEY_REASON_CONTENT 16 | 17 | class CustomOpenAIAPIProvider(BaseModelAPIProvider): 18 | """ 19 | Custom OpenAI-compatible API provider that allows setting custom model name, base URL, and API key. 20 | This can be used with various OpenAI-compatible services like Ollama, LocalAI, vLLM, etc. 21 | """ 22 | def __init__(self, model_name: str, base_url: str, api_key: str = "not-needed"): 23 | """ 24 | Initialize the custom OpenAI-compatible API provider. 
25 | 26 | Args: 27 | model_name: The name of the model to use 28 | base_url: The base URL of the OpenAI-compatible API (e.g., "http://localhost:11434/v1") 29 | api_key: The API key (some local services don't require a real key) 30 | """ 31 | super().__init__(model_name) 32 | self.base_url = base_url 33 | self.api_key = api_key 34 | self.client = OpenAI( 35 | api_key=api_key, 36 | base_url=base_url 37 | ) 38 | 39 | def api_chat(self, messages: List, **kwargs) -> Dict[str, Any]: 40 | """ 41 | Custom OpenAI-compatible chat completion. 42 | """ 43 | try: 44 | model = self.model_name 45 | if not model: 46 | raise ValueError("Model name is required for custom API provider") 47 | 48 | response = self.client.chat.completions.create( 49 | model=model, 50 | messages=messages, 51 | temperature=kwargs.get("temperature", 0.3), 52 | **{k: v for k, v in kwargs.items() if k not in ['temperature', 'wait_time']} 53 | ) 54 | completion, reasoning_content = self._post_process_chat_response(response) 55 | result = { 56 | KEY_FUNCTION_CALL: {}, 57 | KEY_COMPLETION: completion, 58 | KEY_REASON_CONTENT: reasoning_content 59 | } 60 | return result 61 | 62 | except Exception as e: 63 | logging.error(f"Failed to process Custom OpenAI API api_chat: {e}") 64 | return {} 65 | 66 | def api_function_call(self, messages: List, tools: List, **kwargs) -> Dict[str, Any]: 67 | """ 68 | Custom OpenAI-compatible function calling (tool calling). 69 | Args: 70 | messages: List of message [{}, {}] 71 | tools: List of tool definitions [{type: "function", function: {name: "", description: "", parameters: {}}}] 72 | """ 73 | try: 74 | model = self.model_name 75 | if not model: 76 | raise ValueError("Model name is required for custom API provider") 77 | 78 | response = self.client.chat.completions.create( 79 | model=model, 80 | messages=messages, 81 | tools=tools, 82 | tool_choice="auto", 83 | temperature=kwargs.get("temperature", 0.3), 84 | **{k: v for k, v in kwargs.items() if k not in ['temperature', 'wait_time']} 85 | ) 86 | tool_call = self._post_process_function_call_response(response) 87 | tool_call_mapped, completion, reasoning_content = function_call_result_common_mapper(tool_call) 88 | 89 | result = { 90 | KEY_FUNCTION_CALL: tool_call_mapped, 91 | KEY_COMPLETION: completion, 92 | KEY_REASON_CONTENT: reasoning_content 93 | } 94 | return result 95 | 96 | except Exception as e: 97 | logging.error(f"Failed to process Custom OpenAI API api_function_call: {e}") 98 | return {} 99 | 100 | def _post_process_chat_response(self, response): 101 | """ 102 | Processes the response from custom OpenAI-compatible chat completion. 103 | """ 104 | if response is None or not response.choices: 105 | return "", "" 106 | completion_content = "" 107 | if response.choices[0].message.content: 108 | completion_content = response.choices[0].message.content 109 | return completion_content, "" 110 | 111 | def _post_process_function_call_response(self, response): 112 | """ 113 | Processes the response from custom OpenAI-compatible API for function calls. 114 | Extracts the tool call details. 
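        As with the standard OpenAI provider, only the first entry in tool_calls is used.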
115 |         """ 116 |         if response is None or not response.choices or not response.choices[0].message: 117 |             return {} 118 | 119 |         try: 120 |             message = response.choices[0].message 121 |             if message.tool_calls: 122 |                 first_tool_call = message.tool_calls[0] 123 |                 if first_tool_call.type == "function" and first_tool_call.function: 124 |                     tool_call = { 125 |                         "id": first_tool_call.id, 126 |                         "function": { 127 |                             "name": first_tool_call.function.name, 128 |                             "arguments": first_tool_call.function.arguments 129 |                         } 130 |                     } 131 |                     return tool_call 132 |             return {} 133 |         except Exception as e: 134 |             logging.error(f"Failed to _post_process_function_call_response error {e}") 135 |             return {} 136 | 137 |     def get_model_info(self): 138 |         """ 139 |         Returns information about the custom API provider configuration. 140 |         """ 141 |         return { 142 |             "model_name": self.model_name, 143 |             "base_url": self.base_url, 144 |             "api_key": "***" if self.api_key else None 145 |         } 146 | 147 | if __name__ == '__main__': 148 |     # Example usage 149 |     try: 150 |         # Example with Ollama (local deployment) 151 |         custom_provider = CustomOpenAIAPIProvider( 152 |             model_name="llama3.2", 153 |             base_url="http://localhost:11434/v1", 154 |             api_key="not-needed" 155 |         ) 156 | 157 |         messages = [{"role": "user", "content": "Hello, how are you?"}] 158 |         result = custom_provider.api_chat(messages) 159 |         print("Custom API Chat Response:", result) 160 |         print("Provider Info:", custom_provider.get_model_info()) 161 | 162 |     except Exception as e: 163 |         print(f"Example failed (this is expected if no local service is running): {e}") 164 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/evaluation/evaluation_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from typing import Dict, List, Any, Optional 4 | from pydantic_settings import BaseSettings, SettingsConfigDict 5 | import json 6 | from src.mcp_tool_bench.global_variables import * 7 | 8 | def base_error_analysis(function_call_result: Any) -> Dict[str, Any]: 9 |     """ 10 |     Classify each tool-call result as an HTTP failure, an empty result, or a success, 11 |     and collect a short reason string for every call. 12 | 13 |     Returns a dict with "result_success_label_list" (a 1/0 label per call) and "result_list" (the matching reason strings). 14 |     """ 15 |     # function_call_result = trials[0]["function_call_result"] # list 16 |     result_list = [] 17 |     result_success_label_list = [] 18 |     request_failed = "HTTP Request Failed..." 19 |     request_result_success = "SUCCESS|Request Result success true" 20 |     request_result_empty = "EMPTY RESULT|Response Empty Result" 21 |     request_empty_error_msg = "Empty Error Message.."
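    # Illustrative element shape (hypothetical values) for `function_call_result`,
    # matching what the loop below reads:
    #     {"status_code": 200,
    #      "result": {"success": True, "data": ["..."], "error": None}}
    # A non-200 status_code is counted as an HTTP failure, a 200 response with
    # success=True but empty `data` is counted as an empty result, and everything
    # else falls back to the returned error message (or the empty-error sentinel above).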
22 | for call_node in function_call_result: 23 | # tool_call_output = call_node["output"] if "output" in call_node else {} 24 | tool_call_output = call_node 25 | status_code = tool_call_output["status_code"] if "status_code" in tool_call_output else "" 26 | result = tool_call_output["result"] if "result" in tool_call_output else "" 27 | # print("call_node: ", call_node) 28 | # print("tool_call_output: ", tool_call_output) 29 | # print("status_code: ", status_code) 30 | # print("result: ", result) 31 | if status_code != 200: 32 | result_list.append(request_failed) 33 | result_success_label_list.append(0) 34 | else: 35 | ## http sucess 36 | result_json = {} 37 | if isinstance(result, dict): 38 | result_json = result 39 | else: 40 | try: 41 | result_json = json.loads(result) 42 | except Exception as e: 43 | print (e) 44 | 45 | if_success = result_json["success"] if "success" in result_json else False 46 | data = result_json["data"] if "data" in result_json else {} 47 | error = result_json["error"] if "error" in result_json else "" 48 | 49 | empty_data = False 50 | if isinstance(data, list): 51 | empty_data = True if len(data) == 0 or (len(data) > 0 and data[0] == "") or (len(data) > 0 and len(data[0]) == 0) else False # [{}] 52 | elif isinstance(data, dict): 53 | values = "".join([v for k,v in data.items()]) 54 | empty_data = True if len(data) == 0 or (len(data) > 0 and values == "") else False 55 | elif isinstance(data, str): 56 | empty_data = True if len(data) == 0 or (len(data) > 0 and data in ["[]", "\"\"", "\"[]\""]) else False 57 | else: 58 | empty_data = False 59 | 60 | if if_success: 61 | if empty_data: 62 | result_list.append(request_result_empty) 63 | result_success_label_list.append(0) 64 | else: 65 | result_list.append(request_result_success) 66 | result_success_label_list.append(1) 67 | else: 68 | ## sucess false append error logs 69 | if error != "": 70 | # check data 71 | result_list.append(str(error)) 72 | result_success_label_list.append(0) 73 | else: 74 | result_list.append(request_empty_error_msg) 75 | result_success_label_list.append(0) 76 | 77 | # print("result_success_label_list: ", result_success_label_list) 78 | # print("result_list: ", result_list) 79 | return { 80 | "result_success_label_list": result_success_label_list, 81 | "result_list": result_list 82 | } 83 | # return result_success_label_list, result_list 84 | 85 | 86 | def base_compare_result(predict_result: Any, label_result: Any) -> bool: 87 | """ 88 | Compare Exact Value match, e.g. 3 == 3, "New York" == "New York" 89 | """ 90 | return label_result == predict_result 91 | 92 | def base_compare_result_status_dict(predict_result: dict, label_result: dict) -> bool: 93 | """ 94 | label_result: 95 | { 96 | 'success': True, 'data': ['Navigated to https://www.stackoverflow.com'], 'error': None}, 97 | 'status_code': 200 98 | } 99 | predict_result: 100 | { 101 | 'status_code': 200, 102 | "result": {'status_code': 200, 'result': {}}, 'step': '1', 'id': '1'} 103 | } 104 | """ 105 | status_code = predict_result["status_code"] if "status_code" in predict_result else 500 106 | if status_code == 200: 107 | return True 108 | return False 109 | 110 | def base_compare_result_search(predict_result: dict, label_result: dict) -> bool: 111 | """ 112 | Search Result is NOT Empty 113 | """ 114 | if len(label_result) > 0: 115 | return True 116 | return False 117 | 118 | 119 | def estimate_pass_at_k(num_samples, num_correct, k): 120 | """Estimates pass@k of each problem and returns them in an array. 
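    For a problem with n samples of which c are correct, the estimator below computes

        pass@k = 1 - C(n - c, k) / C(n, k)

    e.g. n=10, c=3, k=1 gives 1 - C(7, 1) / C(10, 1) = 1 - 7/10 = 0.3, which matches
    the expected values noted in run_test_pass_k further down in this module.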
121 | Reference: Implementation from LiveCodeBench: https://github.com/LiveCodeBench/LiveCodeBench 122 | """ 123 | 124 | def estimator(n: int, c: int, k: int) -> float: 125 | """Calculates 1 - comb(n - c, k) / comb(n, k).""" 126 | if n - c < k: 127 | return 1.0 128 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 129 | 130 | import itertools 131 | 132 | if isinstance(num_samples, int): 133 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 134 | else: 135 | assert len(num_samples) == len(num_correct) 136 | num_samples_it = iter(num_samples) 137 | 138 | return np.array( 139 | [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)] 140 | ) 141 | 142 | 143 | _global_tool_result_check_func_provider: Dict[str, Any] = {} 144 | _global_tool_result_check_func_provider[KEY_BASE_COMPARE_FUNC] = base_compare_result 145 | 146 | 147 | ## example: add special tool result compare 148 | _global_tool_result_check_func_provider["playwright_navigate"] = base_compare_result_status_dict 149 | _global_tool_result_check_func_provider["bing_web_search"] = base_compare_result_search 150 | _global_tool_result_check_func_provider["bing_news_search"] = base_compare_result_search 151 | 152 | 153 | def run_test_pass_k(): 154 | 155 | num_samples = [10, 10, 10] 156 | num_correct = [3, 3, 5] 157 | k = 1 158 | 159 | # array([0.3, 0.3, 0.5]) 160 | pass_at_k = estimate_pass_at_k(num_samples, num_correct, k) 161 | final_pass_at_k = sum(pass_at_k)/len(pass_at_k) 162 | print (f"Final Pass @k equals {final_pass_at_k}") 163 | 164 | 165 | pass_at_5 = estimate_pass_at_k([10], [5], 5) # e.g. 0.99603175 166 | 167 | # pass@ n = 10, k = 1 168 | 169 | def main(): 170 | 171 | run_test_pass_k() 172 | 173 | 174 | if __name__ == "__main__": 175 | main() 176 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/claude_api.py: -------------------------------------------------------------------------------- 1 | # #!/usr/bin/python 2 | # # -*- coding: UTF-8 -*- 3 | import json 4 | import logging 5 | import requests 6 | from typing import List, Dict, Any, Optional 7 | import os 8 | import sys 9 | import anthropic 10 | 11 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 12 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../../..'))) 13 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../..'))) 14 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../'))) 15 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, './'))) 16 | 17 | from src.mcp_tool_bench.model_utils.base_api import * 18 | from src.mcp_tool_bench.global_variables import settings 19 | 20 | class ClaudeModelAPIProvider(BaseModelAPIProvider): 21 | """ 22 | Anthropic Claude API for chat and tool use. 23 | https://docs.anthropic.com/en/docs/tool-use 24 | """ 25 | def __init__(self, model_name: str = ""): 26 | super().__init__(model_name) 27 | self.client = anthropic.Anthropic(api_key=settings.ANTHROPIC_API_KEY) 28 | 29 | def api_chat(self, messages: List, **kwargs) -> Dict[str, Any]: 30 | """ 31 | Claude chat completion. 
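        System messages are folded into the Anthropic `system` parameter rather than
        being sent in the message list. A minimal hedged sketch (the exact model id is
        whatever MODEL_SELECTION_CLAUDE_37 resolves to in this repo's settings):

            provider = ClaudeModelAPIProvider()
            out = provider.api_chat(
                [{"role": "system", "content": "You are terse."},
                 {"role": "user", "content": "Say hi."}],
                max_tokens=256)
            # out[KEY_COMPLETION] holds the text completion; KEY_FUNCTION_CALL stays empty here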
32 | """ 33 | try: 34 | model = self.model_name 35 | if not model: 36 | model = MODEL_SELECTION_CLAUDE_37 37 | 38 | system_message_content = "" 39 | chat_messages = [] 40 | for msg in messages: 41 | if msg["role"] == "system": 42 | system_message_content += msg["content"] + "\n" 43 | else: 44 | chat_messages.append(msg) 45 | 46 | response = self.client.messages.create( 47 | model=model, 48 | max_tokens=kwargs.get("max_tokens", 1024), # Claude requires max_tokens 49 | messages=chat_messages, 50 | system=system_message_content.strip() if system_message_content else None, 51 | temperature=kwargs.get("temperature", 0.3), 52 | ) 53 | completion, reasoningContent = post_process_claude_chat_response(response) 54 | result = { 55 | KEY_FUNCTION_CALL: {}, 56 | KEY_COMPLETION: completion, 57 | KEY_REASON_CONTENT: reasoningContent 58 | } 59 | return result 60 | 61 | except Exception as e: 62 | logging.error(f"Failed to process Claude api_chat: {e}") 63 | return {} 64 | 65 | def api_function_call(self, messages: List, tools: List, **kwargs) -> Dict[str, Any]: 66 | """ 67 | Claude tool use (function calling). 68 | Args: 69 | messages: List of message [{}, {}] 70 | tools: List of tool definitions in Claude's format (e.g., from tools_claude_wrapper) 71 | """ 72 | try: 73 | model = self.model_name 74 | if not model: 75 | model = MODEL_SELECTION_CLAUDE_37 76 | 77 | system_message_content = "" 78 | chat_messages = [] 79 | for msg in messages: 80 | if msg["role"] == "system": 81 | system_message_content += msg["content"] + "\n" 82 | else: 83 | chat_messages.append(msg) 84 | 85 | # Claude's `tools` parameter directly takes the list of tool definitions 86 | response = self.client.messages.create( 87 | model=model, 88 | max_tokens=kwargs.get("max_tokens", 1024), # Claude requires max_tokens 89 | messages=chat_messages, 90 | tools=tools, # Assuming tools are already in Claude's format from wrapper 91 | tool_choice=kwargs.get("tool_choice", {"type": "auto"}), # Default to auto 92 | system=system_message_content.strip() if system_message_content else None, 93 | temperature=kwargs.get("temperature", 0.3), 94 | ) 95 | tool_result, completion, reasoningContent = post_process_claude_function_call_response(response) 96 | result = { 97 | KEY_FUNCTION_CALL: tool_result, 98 | KEY_COMPLETION: completion, 99 | KEY_REASON_CONTENT: reasoningContent 100 | } 101 | return result 102 | 103 | except Exception as e: 104 | logging.error(f"Failed to process Claude api_function_call: {e}") 105 | return {} 106 | 107 | def post_process_claude_chat_response(response: Any) -> (str, str): 108 | """ 109 | Processes the response from Claude chat completion. 110 | Claude's response content is a list of content blocks. 111 | """ 112 | if response is None or not response.content: 113 | return "", "" 114 | 115 | completion_content = "" 116 | reasoning_content = "" # Claude might have thinking blocks 117 | 118 | for block in response.content: 119 | if block.type == "text": 120 | completion_content += block.text 121 | return completion_content, reasoning_content 122 | 123 | def post_process_claude_function_call_response(response: Any) -> (Dict[str, Any], str, str): 124 | """ 125 | Processes the response from Claude for tool use. 126 | Extracts the tool call details and any text response. 
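    Illustrative return value (hypothetical tool and arguments), as a
    (tool_call_result, completion, reasoning) tuple:

        (
            {"function_name": "tavily-search",
             "function_arguments": {"query": "latest AI news"},
             "is_function_call": True,
             "id": "toolu_01ABC..."},
            "",   # concatenated text blocks, if any
            ""
        )

    Only the first tool_use block is kept; ({}, "", "") is returned when the
    response contains no content.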
127 | """ 128 | if response is None or not response.content: 129 | return {}, "", "" 130 | tool_call_result = {} 131 | completion_content = "" 132 | reasoning_content = "" 133 | 134 | try: 135 | 136 | for block in response.content: 137 | if block.type == "tool_use": 138 | if not tool_call_result: 139 | tool_call_result = { 140 | "function_name": block.name, 141 | "function_arguments": block.input, # Claude's tool_use.input is already a dict 142 | "is_function_call": True, 143 | "id": block.id # Store tool_use ID for sending tool results back 144 | } 145 | elif block.type == "text": 146 | completion_content += block.text 147 | return tool_call_result, completion_content, reasoning_content 148 | 149 | except Exception as e: 150 | print (f"DEBUG: Failed to post_process_claude_function_call_response with error {e}") 151 | return tool_call_result, completion_content, reasoning_content 152 | 153 | if __name__ == '__main__': 154 | # Test function calling 155 | user_prompt = "Weather query template" 156 | system_prompt = "" 157 | try: 158 | messages = [{"role": "user", "content": user_prompt}] 159 | current_dir = os.path.dirname(os.path.abspath(__file__)) 160 | package_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir))) 161 | input_file = os.path.join(package_dir, "mcp/tools/demo/demo_tools.json") 162 | tools = json.load(open(input_file, "r", encoding="utf-8")) 163 | 164 | api_provider = ClaudeModelAPIProvider(MODEL_SELECTION_CLAUDE_37) 165 | result = api_provider.api_function_call(messages, tools) 166 | print("Function Call Response:", result) 167 | except FileNotFoundError: 168 | print("Demo tools file not found, skipping function call test") 169 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/data_generator_agent/utils/pre_process.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Randomly extract tools from category_tools.json, perform multiple extractions and generate extraction result files, placed in logs directory 3 | ''' 4 | 5 | import json 6 | import random 7 | from pathlib import Path 8 | from typing import List, Dict, Any 9 | 10 | 11 | def merge_mcp_tools(category: str, mcp_config_path: str) -> str: 12 | """ 13 | Merge all MCP tools in the specified category directory 14 | 15 | Args: 16 | category: Data category, such as browser, search 17 | mcp_config_path: MCP configuration file path 18 | 19 | Returns: 20 | str: Path to the merged tools file 21 | """ 22 | # Build category directory path 23 | category_dir = Path(f"mcp/tools/{category}") 24 | 25 | if not category_dir.exists(): 26 | raise FileNotFoundError(f"Category directory does not exist: {category_dir}") 27 | 28 | output_file = category_dir / f"{category}_tools.json" 29 | 30 | # # Check if merged file already exists 31 | # if output_file.exists(): 32 | # print(f"Merged file already exists, skipping merge step: {output_file}") 33 | # return str(output_file) 34 | 35 | # Collect all MCP tools 36 | all_tools = [] 37 | 38 | # Iterate through all JSON files in the directory 39 | for json_file in category_dir.glob("*.json"): 40 | if json_file.name == f"{category}_tools.json": 41 | continue # Skip existing merged file 42 | 43 | try: 44 | with open(json_file, 'r', encoding='utf-8') as f: 45 | mcp_data = json.load(f) 46 | 47 | # Extract MCP ID from file content 48 | mcp_id = mcp_data.get('server_name', json_file.stem) # Fallback to filename if no id field 49 | 50 | # Extract tools field 51 | if 'tools' in mcp_data: 52 
| tools = mcp_data['tools'] 53 | if isinstance(tools, list): 54 | # Add mcp_server field to each tool 55 | for tool in tools: 56 | tool['mcp_server'] = mcp_id 57 | all_tools.extend(tools) 58 | else: 59 | # Add mcp_server field to single tool 60 | tools['mcp_server'] = mcp_id 61 | all_tools.append(tools) 62 | 63 | except Exception as e: 64 | print(f"Warning: Cannot read file {json_file}: {e}") 65 | continue 66 | 67 | # Remove duplicate tools (based on tool name AND mcp_server) 68 | # Keep tools with same name but from different MCP servers 69 | unique_tools = [] 70 | tool_key_set = set() # Track (tool_name, mcp_server) combinations 71 | # Generate mcp_tools_dict: {mcp_server: [tool_name1, tool_name2, ...]} 72 | # Handle tool name conflicts by renaming to "mcp_server_tool_name" format 73 | # First occurrence keeps original name, subsequent conflicts get renamed 74 | mcp_tools_dict = {} 75 | all_tool_names = set() # Track all tool names across all MCP servers 76 | 77 | for tool in all_tools: 78 | tool_name = tool.get('name', '') 79 | mcp_server = tool.get('mcp_server', 'unknown') 80 | 81 | if tool_name: 82 | tool_key = (tool_name, mcp_server) 83 | if tool_key not in tool_key_set: 84 | tool_key_set.add(tool_key) 85 | 86 | # Check if tool name already exists in any MCP server 87 | if tool_name in all_tool_names: 88 | # Rename to avoid conflict: "mcp_server_tool_name" 89 | new_tool_name = f"{mcp_server}_{tool_name}" 90 | tool['name'] = new_tool_name # Update the tool name in the original list 91 | else: 92 | new_tool_name = tool_name 93 | 94 | # Add to tracking set (use the new name for future conflict detection) 95 | all_tool_names.add(new_tool_name) 96 | 97 | # Add to mcp_tools_dict 98 | if mcp_server not in mcp_tools_dict: 99 | mcp_tools_dict[mcp_server] = [] 100 | mcp_tools_dict[mcp_server].append(new_tool_name) 101 | 102 | unique_tools.append(tool) 103 | 104 | # Save merged tools file 105 | with open(output_file, 'w', encoding='utf-8') as f: 106 | json.dump(unique_tools, f, ensure_ascii=False, indent=2) 107 | 108 | print(f"Successfully merged {len(unique_tools)} tools to: {output_file}") 109 | return str(output_file), mcp_tools_dict 110 | 111 | def random_extract_tools(category_tools_path: str, num_extractions: int = 10, 112 | min_tools: int = 1, max_tools: int = 3) -> List[List[Dict]]: 113 | """ 114 | Randomly extract tools from category tools file 115 | 116 | Args: 117 | category_tools_path: Category tools file path 118 | num_extractions: Number of extractions 119 | min_tools: Minimum number of tools per extraction 120 | max_tools: Maximum number of tools per extraction 121 | 122 | Returns: 123 | List[List[Dict]]: Extraction result list, each element is a group of tools 124 | """ 125 | # Read all tools 126 | with open(category_tools_path, 'r', encoding='utf-8-sig') as f: 127 | data = json.load(f) 128 | 129 | # Handle different data structures 130 | if isinstance(data, dict) and 'tools' in data: 131 | all_tools = data['tools'] 132 | elif isinstance(data, list): 133 | all_tools = data 134 | else: 135 | raise ValueError(f"Unsupported data format: {type(data)}") 136 | 137 | if not all_tools: 138 | raise ValueError("Tool list is empty") 139 | 140 | extraction_results = [] 141 | 142 | # Perform multiple random extractions 143 | for i in range(num_extractions): 144 | # Randomly decide the number of tools for this extraction 145 | num_tools = random.randint(min_tools, min(max_tools, len(all_tools))) 146 | 147 | # Randomly extract tools 148 | selected_tools = random.sample(all_tools, num_tools) 
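        # random.sample draws without replacement, so a single extraction never
        # repeats a tool; the same tool may still appear across different
        # extractions. For reproducible extraction files one could seed the RNG
        # (e.g. random.seed(42)) before this loop (not done in this script).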
149 | extraction_results.append(selected_tools) 150 | 151 | # Save extraction results to logs directory 152 | save_extraction_results(extraction_results, category_tools_path) 153 | 154 | print(f"Completed {num_extractions} tool extractions, {min_tools}-{max_tools} tools per extraction") 155 | return extraction_results 156 | 157 | 158 | def save_extraction_results(extraction_results: List[List[Dict]], category_tools_path: str): 159 | """ 160 | Save extraction results to logs directory 161 | 162 | Args: 163 | extraction_results: Extraction result list 164 | category_tools_path: Original tools file path 165 | """ 166 | # Create logs directory 167 | logs_dir = Path("src/mcp_tool_bench/agents/data_generator_agent/logs") 168 | logs_dir.mkdir(parents=True, exist_ok=True) 169 | 170 | # Extract category name from original path 171 | category = Path(category_tools_path).parent.name 172 | 173 | # Save extraction results 174 | output_file = logs_dir / f"{category}_extraction_results.json" 175 | with open(output_file, 'w', encoding='utf-8-sig') as f: 176 | json.dump(extraction_results, f, ensure_ascii=False, indent=2) 177 | 178 | print(f"Extraction results saved to: {output_file}") 179 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/utils/count_tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import re 4 | import tiktoken 5 | from pathlib import Path 6 | 7 | def count_mcp_files(directory): 8 | """Count MCP files excluding jsonl and type_tools.json files""" 9 | count = 0 10 | for file in os.listdir(directory): 11 | if file.endswith('.json') and not file.endswith('_tools.json'): 12 | count += 1 13 | return count 14 | 15 | def count_words(text): 16 | """Count words in text by splitting on whitespace and special characters""" 17 | # Remove special characters and split on whitespace 18 | words = re.findall(r'\b\w+\b', text.lower()) 19 | return len(words) 20 | 21 | def count_tokens(text): 22 | """Count tokens using tiktoken's cl100k_base encoder""" 23 | try: 24 | encoding = tiktoken.get_encoding("cl100k_base") 25 | return len(encoding.encode(text)) 26 | except Exception as e: 27 | print(f"Warning: Error counting tokens: {e}") 28 | return len(text) // 4 # Fallback to simple approximation 29 | 30 | def count_tools(mcp_type_dir): 31 | base_dir = mcp_type_dir 32 | results = [] 33 | 34 | # Process each subdirectory 35 | for subdir in os.listdir(base_dir): 36 | subdir_path = os.path.join(base_dir, subdir) 37 | if not os.path.isdir(subdir_path): 38 | continue 39 | 40 | # Find the type_tools.json file 41 | tools_file = os.path.join(subdir_path, f'{subdir}_tools.json') 42 | if not os.path.exists(tools_file): 43 | print(f"Warning: {tools_file} not found") 44 | continue 45 | 46 | # Count total tools and calculate statistics 47 | with open(tools_file, 'r', encoding='utf-8-sig') as f: 48 | data = json.load(f) 49 | # tools = data.get('tools', []) 50 | tools = data 51 | total_tools = len(tools) 52 | 53 | # Calculate statistics for all tools 54 | char_lengths = [] 55 | word_counts = [] 56 | token_counts = [] 57 | 58 | for tool in tools: 59 | tool_str = json.dumps(tool) 60 | char_lengths.append(len(tool_str.encode('utf-8'))) 61 | word_counts.append(count_words(tool_str)) 62 | token_counts.append(count_tokens(tool_str)) 63 | 64 | total_chars = sum(char_lengths) 65 | total_words = sum(word_counts) 66 | total_tokens = sum(token_counts) 67 | 68 | avg_chars = total_chars / total_tools if total_tools > 
0 else 0 69 | avg_words = total_words / total_tools if total_tools > 0 else 0 70 | avg_tokens = total_tokens / total_tools if total_tools > 0 else 0 71 | 72 | max_chars = max(char_lengths) if char_lengths else 0 73 | min_chars = min(char_lengths) if char_lengths else 0 74 | max_words = max(word_counts) if word_counts else 0 75 | min_words = min(word_counts) if word_counts else 0 76 | max_tokens = max(token_counts) if token_counts else 0 77 | min_tokens = min(token_counts) if token_counts else 0 78 | 79 | # Count MCP files 80 | mcp_count = count_mcp_files(subdir_path) 81 | 82 | # Calculate average 83 | avg_tools = total_tools / mcp_count if mcp_count > 0 else 0 84 | 85 | results.append({ 86 | 'type': subdir, 87 | 'total_tools': total_tools, 88 | 'mcp_count': mcp_count, 89 | 'avg_tools_per_mcp': round(avg_tools, 2), 90 | 'avg_chars_per_tool': round(avg_chars, 2), 91 | 'avg_words_per_tool': round(avg_words, 2), 92 | 'avg_tokens_per_tool': round(avg_tokens, 2), 93 | 'max_chars': max_chars, 94 | 'min_chars': min_chars, 95 | 'max_words': max_words, 96 | 'min_words': min_words, 97 | 'max_tokens': max_tokens, 98 | 'min_tokens': min_tokens, 99 | 'total_chars': total_chars, 100 | 'total_words': total_words, 101 | 'total_tokens': total_tokens 102 | }) 103 | 104 | # Sort results by total tools 105 | results.sort(key=lambda x: x['total_tools'], reverse=True) 106 | 107 | # Prepare output string 108 | output = [] 109 | output.append("\n{:<30} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15}".format( 110 | "Type", "Total Tools", "MCP Count", "Avg Tools/MCP", "Avg Chars/Tool", "Avg Words/Tool", 111 | "Avg Tokens/Tool", "Tool Max Chars", "Tool Min Chars", "Tool Max Words", "Tool Min Words", 112 | "Tool Max Tokens", "Tool Min Tokens")) 113 | output.append("-" * 195) 114 | 115 | for r in results: 116 | output.append("{:<30} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15} {:<15}".format( 117 | r['type'], 118 | r['total_tools'], 119 | r['mcp_count'], 120 | r['avg_tools_per_mcp'], 121 | r['avg_chars_per_tool'], 122 | r['avg_words_per_tool'], 123 | r['avg_tokens_per_tool'], 124 | r['max_chars'], 125 | r['min_chars'], 126 | r['max_words'], 127 | r['min_words'], 128 | r['max_tokens'], 129 | r['min_tokens'] 130 | )) 131 | 132 | # Print to console 133 | print("\n".join(output)) 134 | 135 | # Write to file 136 | mcp_type_dir = Path(mcp_type_dir) 137 | with open(mcp_type_dir.parent / "logs" / "tools_statistics.txt", 'w', encoding='utf-8-sig') as f: 138 | f.write("\n".join(output)) 139 | f.write("\n\nSummary Statistics:\n") 140 | f.write("-" * 50 + "\n") 141 | f.write(f"Total Types: {len(results)}\n") 142 | f.write(f"Total Tools Across All Types: {sum(r['total_tools'] for r in results)}\n") 143 | f.write(f"Total MCPs Across All Types: {sum(r['mcp_count'] for r in results)}\n") 144 | f.write(f"Overall Average Tools per MCP: {sum(r['total_tools'] for r in results) / sum(r['mcp_count'] for r in results):.2f}\n") 145 | f.write(f"Overall Average Chars per Tool: {sum(r['total_tools'] * r['avg_chars_per_tool'] for r in results) / sum(r['total_tools'] for r in results):.2f}\n") 146 | f.write(f"Overall Average Words per Tool: {sum(r['total_tools'] * r['avg_words_per_tool'] for r in results) / sum(r['total_tools'] for r in results):.2f}\n") 147 | f.write(f"Overall Average Tokens per Tool: {sum(r['total_tools'] * r['avg_tokens_per_tool'] for r in results) / sum(r['total_tools'] for r in results):.2f}\n") 148 | f.write(f"Global Max Chars: {max(r['max_chars'] for r in 
results)}\n") 149 | f.write(f"Global Min Chars: {min(r['min_chars'] for r in results)}\n") 150 | f.write(f"Global Max Words: {max(r['max_words'] for r in results)}\n") 151 | f.write(f"Global Min Words: {min(r['min_words'] for r in results)}\n") 152 | f.write(f"Global Max Tokens: {max(r['max_tokens'] for r in results)}\n") 153 | f.write(f"Global Min Tokens: {min(r['min_tokens'] for r in results)}\n") 154 | # f.write(f"Total Characters Across All Types: {sum(r['total_chars'] for r in results)}\n") 155 | # f.write(f"Total Words Across All Types: {sum(r['total_words'] for r in results)}\n") 156 | # f.write(f"Total Tokens Across All Types: {sum(r['total_tokens'] for r in results)}\n") 157 | 158 | if __name__ == '__main__': 159 | mcp_type_dir = "mcp/tools" 160 | count_tools(mcp_type_dir) 161 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/qwen_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import List, Dict, Any, Optional 4 | from .base_api import * 5 | from ..global_variables import settings 6 | import requests 7 | 8 | class QwenModelAPIProvider(BaseModelAPIProvider): 9 | 10 | def api_chat(self, messages: List, **kwargs) -> Dict[str, Any]: 11 | """ 12 | Qwen model: "qwen-max", "qwen-plus" 13 | """ 14 | try: 15 | model = self.model_name 16 | if model == "" or model is None: 17 | model = "qwen-plus" 18 | 19 | response = call_qwen_messages_model_selection(messages, self.model_name) 20 | tools, completion, reasoningContent = post_process_qwen_response(response) 21 | result = { 22 | KEY_FUNCTION_CALL: tools, 23 | KEY_COMPLETION: completion, 24 | KEY_REASON_CONTENT: reasoningContent 25 | } 26 | return result 27 | 28 | except Exception as e: 29 | logging.error(f"Failed to process api_chat") 30 | return {} 31 | 32 | def api_function_call(self, messages: List, tools: List, **kwargs) -> Dict[str, Any]: 33 | """ 34 | Args: 35 | messages: List of message [{}, {}] 36 | tools: List 37 | Returns: 38 | result: Dict: , 39 | { 40 | 'tools': 41 | { 42 | 'function_name': 'playwright_navigate', 'function_arguments': '{"url": "https://www.stackoverflow.com", "browserType": "chromium"}', 43 | 'is_function_call': True, 44 | 'id': 'call_6cb5d88bb3cf4884aadc03' 45 | }, 46 | 'completion': '', 47 | 'reason': '' 48 | } 49 | """ 50 | try: 51 | model = self.model_name 52 | if model == "" or model is None: 53 | model = "qwen-plus" 54 | messages, tools, model 55 | response = call_qwen_tool_calls_model_selection(messages, tools, model) 56 | tool_call = post_process_function_call_qwen_common(response) 57 | tool_call_mapped, completion, reasoningContent = function_call_result_common_mapper(tool_call) 58 | 59 | result = { 60 | KEY_FUNCTION_CALL: tool_call_mapped, 61 | KEY_COMPLETION: "", 62 | KEY_REASON_CONTENT: "" 63 | } 64 | print (f"AntQwenModelAPIProvider debug api_function_call result return {result}") 65 | 66 | return result 67 | 68 | except Exception as e: 69 | logging.error(f"QwenModelAPIProvider {e}") 70 | return {} 71 | 72 | def call_qwen_messages_model_selection(messages: List, model: str): 73 | """ 74 | Reference doc: https://help.aliyun.com/zh/model-studio/use-qwen-by-calling-api#b30677f6e9437 75 | Input: 76 | messages: List[Dict] 77 | """ 78 | try: 79 | url = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions" 80 | api_key = settings.QWEN_API_KEY 81 | if api_key is None: 82 | raise ValueError("qwen_general_api.py call_qwen_max_user_prompt 
api_key not found, please check .env file key QWEN_API_KEY") 83 | headers = { 84 | "Authorization": f"Bearer {api_key}", 85 | "Content-Type": "application/json" 86 | } 87 | data = { 88 | "model": model, 89 | "messages": messages, 90 | } 91 | data = json.dumps(data).encode("utf-8") 92 | response = requests.post(url, headers=headers, data=data, timeout=10) 93 | if response.status_code == 200: 94 | result = response.json() 95 | print("Qwen Response:", result["choices"][0]["message"]["content"]) 96 | else: 97 | print(f"API Return Failed with Status (Status Code: {response.status_code}): {response.text}") 98 | return response 99 | except Exception as e: 100 | logging.error(e) 101 | return None 102 | 103 | 104 | def call_qwen_user_prompt_model_selection(user_prompt: str, model: str): 105 | """ 106 | Reference doc: https://help.aliyun.com/zh/model-studio/use-qwen-by-calling-api#b30677f6e9437 107 | """ 108 | try: 109 | url = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions" 110 | api_key = settings.QWEN_API_KEY 111 | if api_key is None: 112 | raise ValueError("qwen_general_api.py call_qwen_max_user_prompt api_key not found, please check .env file key QWEN_API_KEY") 113 | headers = { 114 | "Authorization": f"Bearer {api_key}", 115 | "Content-Type": "application/json" 116 | } 117 | data = { 118 | "model": model, 119 | "messages": [{"role": "user", "content": user_prompt}], 120 | } 121 | data = json.dumps(data).encode("utf-8") 122 | response = requests.post(url, headers=headers, data=data, timeout=10) 123 | if response.status_code == 200: 124 | result = response.json() 125 | print("Qwen Response:", result["choices"][0]["message"]["content"]) 126 | else: 127 | print(f"API Return Failed with Status (Status Code: {response.status_code}): {response.text}") 128 | return response 129 | except Exception as e: 130 | logging.error(e) 131 | return None 132 | 133 | 134 | def post_process_qwen_response(response): 135 | if response is None: 136 | return {} 137 | tools, completion,reasoningContent = {}, "", "" 138 | 139 | res_json = {} 140 | try: 141 | print (f"post_process_function_call_qwen_base input response {response} and type {type(response)}") 142 | res_json = json.loads(response.content) 143 | except json.decoder.JSONDecodeError: 144 | print("Not Valid Json Format") 145 | return '' 146 | try: 147 | # x = res_json["data"]["values"]["data"] 148 | completion = res_json["choices"][0]["message"]["content"] 149 | usage = res_json["usage"] if "usage" in res_json else {} 150 | except Exception as e: 151 | logging.error(e) 152 | return tools, completion, reasoningContent 153 | 154 | 155 | 156 | def call_qwen_tool_calls_model_selection(messages, tools, model): 157 | """ 158 | Args: 159 | messages: list of dict 160 | tools: list of dict 161 | return: 162 | {"choices":[{"message":{"content":"","role":"assistant","tool_calls":[{"index":0,"id":"call_f8d9f219ee034156985f6a","type":"function","function":{"name":"get_current_weather","arguments":"{\"location\": \"上海\"}"}}]},"finish_reason":"tool_calls","index":0,"logprobs":null}],"object":"chat.completion","usage":{"prompt_tokens":266,"completion_tokens":20,"total_tokens":286,"prompt_tokens_details":{"cached_tokens":0}},"created":1750987730,"system_fingerprint":null,"model":"qwen-plus","id":"chatcmpl-3bd1954c-8594-98e1-957b-9fda39ac73fc"} 163 | doc: https://help.aliyun.com/zh/model-studio/qwen-function-calling 164 | """ 165 | try: 166 | api_key = settings.QWEN_API_KEY 167 | url = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions" 168 | 
headers = { 169 | 'Content-Type': 'application/json', 170 | 'Authorization': f"Bearer {api_key}", 171 | } 172 | data = { 173 | "stream": False, 174 | "model": model, 175 | "messages": messages, 176 | "tools": tools 177 | } 178 | data = json.dumps(data).encode("utf-8") 179 | response = requests.post(url, headers=headers, data=data) 180 | if response.status_code == 200: 181 | result = response.json() 182 | print("Qwen Response:", result["choices"][0]["message"]["content"]) 183 | else: 184 | print(f"API Return Failed with Status (Status Code: {response.status_code}): {response.text}") 185 | return response 186 | except Exception as e: 187 | print (e) 188 | return None 189 | 190 | def post_process_function_call_qwen_common(response): 191 | """ 192 | tool_call: 193 | { 194 | "id": "call_6fcd208b442c4c12b1b419", 195 | "function": { 196 | "arguments": "{\"location\": \"\u4e0a\u6d77\u5e02\"}", 197 | "name": "get_current_weather" 198 | }, 199 | "type": "function", 200 | "index": 0 201 | } 202 | """ 203 | if response is None: 204 | return {} 205 | 206 | tools = {} 207 | completion = "" 208 | reasoningContent = "" 209 | 210 | res_json = {} 211 | try: 212 | content = response.content 213 | logging.info(f"post_process_function_call_qwen_base content {content}") 214 | res_json = json.loads(content) 215 | 216 | except json.decoder.JSONDecodeError: 217 | print("Not Valid Json Format" + content) 218 | return {} 219 | try: 220 | choice = res_json["choices"][0] if len(res_json["choices"]) > 0 else {} 221 | finish_reason = choice["finish_reason"] if "finish_reason" in choice else "" # tool_calls 222 | message = choice["message"] if "message" in choice else {} 223 | tool_calls = message["tool_calls"] if "tool_calls" in message else [] 224 | tool_call = tool_calls[0] if len(tool_calls) > 0 else {} 225 | return tool_call 226 | except Exception as e: 227 | logging.error(e) 228 | return {} 229 | -------------------------------------------------------------------------------- /data/search/search_single_demo.json: -------------------------------------------------------------------------------- 1 | [{"uuid":"6e88dafd-c1d7-4aa6-8fe2-2db88a874664","category":"search","call_type":"single","tools":[{"name":"tavily-search","description":"A powerful web search tool that provides comprehensive, real-time results using Tavily's AI search engine. Returns relevant web content with customizable parameters for result count, content type, and domain filtering. Ideal for gathering current information, news, and detailed web content analysis.","input_schema":{"type":"object","properties":{"query":{"type":"string","description":"Search query"},"search_depth":{"type":"string","enum":["basic","advanced"],"description":"The depth of the search. It can be 'basic' or 'advanced'","default":"basic"},"topic":{"type":"string","enum":["general","news"],"description":"The category of the search. This will determine which of our agents will be used for the search","default":"general"},"days":{"type":"number","description":"The number of days back from the current date to include in the search results. This specifies the time frame of data to be retrieved. Please note that this feature is only available when using the 'news' search topic","default":3},"time_range":{"type":"string","description":"The time range back from the current date to include in the search results. 
This feature is available for both 'general' and 'news' search topics","enum":["day","week","month","year","d","w","m","y"]},"max_results":{"type":"number","description":"The maximum number of search results to return","default":10,"minimum":5,"maximum":20},"include_images":{"type":"boolean","description":"Include a list of query-related images in the response","default":false},"include_image_descriptions":{"type":"boolean","description":"Include a list of query-related images and their descriptions in the response","default":false},"include_raw_content":{"type":"boolean","description":"Include the cleaned and parsed HTML content of each search result","default":false},"include_domains":{"type":"array","items":{"type":"string"},"description":"A list of domains to specifically include in the search results, if the user asks to search on specific sites set this to the domain of the site","default":[]},"exclude_domains":{"type":"array","items":{"type":"string"},"description":"List of domains to specifically exclude, if the user asks to exclude a domain set this to the domain of the site","default":[]},"country":{"type":"string","enum":["afghanistan","albania","algeria","andorra","angola","argentina","armenia","australia","austria","azerbaijan","bahamas","bahrain","bangladesh","barbados","belarus","belgium","belize","benin","bhutan","bolivia","bosnia and herzegovina","botswana","brazil","brunei","bulgaria","burkina faso","burundi","cambodia","cameroon","canada","cape verde","central african republic","chad","chile","china","colombia","comoros","congo","costa rica","croatia","cuba","cyprus","czech republic","denmark","djibouti","dominican republic","ecuador","egypt","el salvador","equatorial guinea","eritrea","estonia","ethiopia","fiji","finland","france","gabon","gambia","georgia","germany","ghana","greece","guatemala","guinea","haiti","honduras","hungary","iceland","india","indonesia","iran","iraq","ireland","israel","italy","jamaica","japan","jordan","kazakhstan","kenya","kuwait","kyrgyzstan","latvia","lebanon","lesotho","liberia","libya","liechtenstein","lithuania","luxembourg","madagascar","malawi","malaysia","maldives","mali","malta","mauritania","mauritius","mexico","moldova","monaco","mongolia","montenegro","morocco","mozambique","myanmar","namibia","nepal","netherlands","new zealand","nicaragua","niger","nigeria","north korea","north macedonia","norway","oman","pakistan","panama","papua new guinea","paraguay","peru","philippines","poland","portugal","qatar","romania","russia","rwanda","saudi arabia","senegal","serbia","singapore","slovakia","slovenia","somalia","south africa","south korea","south sudan","spain","sri lanka","sudan","sweden","switzerland","syria","taiwan","tajikistan","tanzania","thailand","togo","trinidad and tobago","tunisia","turkey","turkmenistan","uganda","ukraine","united arab emirates","united kingdom","united states","uruguay","uzbekistan","venezuela","vietnam","yemen","zambia","zimbabwe"],"description":"Boost search results from a specific country. This will prioritize content from the selected country in the search results. 
Available only if topic is general.","default":""}},"required":["query"]}},{"name":"tavily-extract","description":"A powerful web content extraction tool that retrieves and processes raw content from specified URLs, ideal for data collection, content analysis, and research tasks.","input_schema":{"type":"object","properties":{"urls":{"type":"array","items":{"type":"string"},"description":"List of URLs to extract content from"},"extract_depth":{"type":"string","enum":["basic","advanced"],"description":"Depth of extraction - 'basic' or 'advanced', if usrls are linkedin use 'advanced' or if explicitly told to use advanced","default":"basic"},"include_images":{"type":"boolean","description":"Include a list of images extracted from the urls in the response","default":false},"format":{"type":"string","enum":["markdown","text"],"description":"The format of the extracted web page content. markdown returns content in markdown format. text returns plain text and may increase latency.","default":"markdown"}},"required":["urls"]}},{"name":"tavily-crawl","description":"A powerful web crawler that initiates a structured web crawl starting from a specified base URL. The crawler expands from that point like a tree, following internal links across pages. You can control how deep and wide it goes, and guide it to focus on specific sections of the site.","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"The root URL to begin the crawl"},"max_depth":{"type":"integer","description":"Max depth of the crawl. Defines how far from the base URL the crawler can explore.","default":1,"minimum":1},"max_breadth":{"type":"integer","description":"Max number of links to follow per level of the tree (i.e., per page)","default":20,"minimum":1},"limit":{"type":"integer","description":"Total number of links the crawler will process before stopping","default":50,"minimum":1},"instructions":{"type":"string","description":"Natural language instructions for the crawler"},"select_paths":{"type":"array","items":{"type":"string"},"description":"Regex patterns to select only URLs with specific path patterns (e.g., /docs/.*, /api/v1.*)","default":[]},"select_domains":{"type":"array","items":{"type":"string"},"description":"Regex patterns to select crawling to specific domains or subdomains (e.g., ^docs\\.example\\.com$)","default":[]},"allow_external":{"type":"boolean","description":"Whether to allow following links that go to external domains","default":false},"categories":{"type":"array","items":{"type":"string","enum":["Careers","Blog","Documentation","About","Pricing","Community","Developers","Contact","Media"]},"description":"Filter URLs using predefined categories like documentation, blog, api, etc","default":[]},"extract_depth":{"type":"string","enum":["basic","advanced"],"description":"Advanced extraction retrieves more data, including tables and embedded content, with higher success but may increase latency","default":"basic"},"format":{"type":"string","enum":["markdown","text"],"description":"The format of the extracted web page content. markdown returns content in markdown format. text returns plain text and may increase latency.","default":"markdown"}},"required":["url"]}},{"name":"tavily-map","description":"A powerful web mapping tool that creates a structured map of website URLs, allowing you to discover and analyze site structure, content organization, and navigation paths. 
Perfect for site audits, content discovery, and understanding website architecture.","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"The root URL to begin the mapping"},"max_depth":{"type":"integer","description":"Max depth of the mapping. Defines how far from the base URL the crawler can explore","default":1,"minimum":1},"max_breadth":{"type":"integer","description":"Max number of links to follow per level of the tree (i.e., per page)","default":20,"minimum":1},"limit":{"type":"integer","description":"Total number of links the crawler will process before stopping","default":50,"minimum":1},"instructions":{"type":"string","description":"Natural language instructions for the crawler"},"select_paths":{"type":"array","items":{"type":"string"},"description":"Regex patterns to select only URLs with specific path patterns (e.g., /docs/.*, /api/v1.*)","default":[]},"select_domains":{"type":"array","items":{"type":"string"},"description":"Regex patterns to select crawling to specific domains or subdomains (e.g., ^docs\\.example\\.com$)","default":[]},"allow_external":{"type":"boolean","description":"Whether to allow following links that go to external domains","default":false},"categories":{"type":"array","items":{"type":"string","enum":["Careers","Blog","Documentation","About","Pricing","Community","Developers","Contact","Media"]},"description":"Filter URLs using predefined categories like documentation, blog, api, etc","default":[]}},"required":["url"]}},{"name":"google-search","description":"Perform a web search query","input_schema":{"type":"object","properties":{"query":{"type":"string","description":"Search query"},"num":{"type":"number","description":"Number of results (1-10)","minimum":1,"maximum":10}},"required":["query","num"]}}],"mcp_tools_dict":{"tavily-mcp":["tavily-search","tavily-extract","tavily-crawl","tavily-map"],"google-search":["google-search"]},"query":"Find the latest news about global economic trends.","function_call_label":[{"name":"google-search","step":"1","id":"1","mcp_server":"google-search","similar_tools":[{"name":"tavily-search","mcp_server":"tavily-mcp"}],"input":{"query":"global economic trends","num":5},"output":{"status_code":200,"result":{}}}]}] 2 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/base_tool_call_agent/prompt.py: -------------------------------------------------------------------------------- 1 | user_prompt_template_ast = ''' 2 | ## Model Prediction 3 | {pred_tool_result_list} 4 | ## Answer Label 5 | {label_result_list} 6 | ## user query 7 | {query} 8 | ''' 9 | 10 | system_prompt_template_ast_single = ''' 11 | # Role 12 | You are an expert in evaluating the accuracy of intelligent tool calls. You need to evaluate the accuracy of model tool call link predictions based on the "model prediction" and "answer label" given by the user. 13 | # Steps 14 | 1. "Model Prediction" and "Answer Label" are both JSON lists, with each JSON representing the tool to be called, including the tool name and input field as calling parameters. Note that we only care about the last json in the list. Carefully compare all the contents. The 'similar_tools' field provides tool names with similar functionality. 15 | 2. 
Tool accuracy: If the tool in "model prediction" is the tool in "answer label" (the tool name needs to be exactly the same, and if the tool name predicted by the model is a similar tool to the answer label tool, it can also be considered consistent), the tool accuracy is considered to be 1, otherwise it is 0. 16 | 3. Parameter accuracy: Evaluate each parameter of the tool in "model prediction" in turn. If it is consistent with the parameter of the corresponding tool in "answer label" or "similar answer label", the parameter accuracy is considered to be 1, otherwise it is 0. If the tool accuracy is already 0, the parameter accuracy is also directly 0. 17 | 18 | # Notes 19 | 1. Note that we only care about the last json in the "Model Prediction" and "Answer Label" list. 20 | 2. Fuzzy matching: If the parameters of the tool in "Model Prediction" are not completely consistent with the parameters of the corresponding tool in "Answer Label", but the parameter content is similar, the parameter accuracy is considered to be 1. 21 | - For example, the parameter "query" are all rewritten search statements, and the content may be similar but the expression is different. If the content is similar, the parameters can be considered consistent. 22 | - For example, the location parameters are all place names, and there may be different expressions such as "北京", "Beijing", "Beijing Region", etc., which can be considered consistent. 23 | - For example, the longitude and latitude parameters are consistent before one decimal place, which is considered to be consistent. 24 | - No language verification, parameters can be in any language as long as the content is the same. 25 | 3. Skip if standard parameter values ​​are not provided 26 | - If some parameters of the tool in "Model Prediction" do not provide standard parameter values ​​in "Answer Label", they are skipped and considered to be consistent. 27 | 4. It is not required that all parameters of "Answer Label" be included in "Model Prediction". Parameter accuracy only needs to evaluate the parameters in "Model Prediction". 28 | 5. Some parameters are irrelevant to the user query and can be ignored without verification. For example, if the user query is "Find the latest news about cryptocurrency in the United States.", then the parameter "max_results" in the tool "tavily-search" is not important. 29 | 30 | # Output format requirements 31 | 1. Output strictly in json format, do not add any other explanations, formats, prefixes and suffixes, such as ```json, etc. The format and separators are all in English characters. 32 | 2. Includes 2 fields, representing tool accuracy, parameter accuracy. 33 | 3. Output format example: 34 | {{ 35 | "tool_correctness": 1, 36 | "parameter_correctness": 0 37 | }} 38 | ''' 39 | 40 | system_prompt_template_ast_single_reason = ''' 41 | # Role 42 | You are an expert in evaluating the accuracy of intelligent tool calls. You need to evaluate the accuracy of model tool call link predictions based on the "model prediction" and "answer label" given by the user, and give a brief reason. 43 | 44 | # Steps 45 | 1. "Model Prediction" and "Answer Label" are both JSON lists, with each JSON representing the tool to be called, including the tool name and input field as calling parameters. Note that we only care about the last json in the list. Carefully compare all the contents. The 'similar_tools' field provides tool names with similar functionality. 46 | 2. 
Tool accuracy: If the tool in "model prediction" is the tool in "answer label" (the tool name needs to be exactly the same, and if the tool name predicted by the model is a similar tool to the answer label tool, it can also be considered consistent), the tool accuracy is considered to be 1, otherwise it is 0. 47 | 3. Parameter accuracy: Evaluate each parameter of the tool in "model prediction" in turn. If it is consistent with the parameter of the corresponding tool in "answer label" or "similar answer label", the parameter accuracy is considered to be 1, otherwise it is 0. If the tool accuracy is already 0, the parameter accuracy is also directly 0. 48 | 4. Reasons: Give the reasons for the judgment, explain which tool does not match and which parameters are inconsistent, and keep the language as brief as possible. 49 | 50 | # Notes 51 | 1. Note that we only care about the last json in the "Model Prediction" and "Answer Label" list. 52 | 2. Fuzzy matching: If the parameters of the tool in "Model Prediction" are not completely consistent with the parameters of the corresponding tool in "Answer Label", but the parameter content is similar, the parameter accuracy is considered to be 1. 53 | - For example, the query parameters are all rewritten search statements, and the content may be similar but the expression is different. If the content is similar, the parameters can be considered consistent. 54 | - For example, the location parameters are all place names, and there may be different expressions such as "北京", "Beijing", "Beijing Region", etc., which can be considered consistent. 55 | - For example, the longitude and latitude parameters are consistent before one decimal place, which is considered to be consistent. 56 | - No language verification, parameters can be in any language as long as the content is the same. 57 | 3. Skip if standard parameter values ​​are not provided 58 | - If some parameters of the tool in "Model Prediction" do not provide standard parameter values ​​in "Answer Label", they are skipped and considered to be consistent. 59 | 4. It is not required that all parameters of "Answer Label" be included in "Model Prediction". Parameter accuracy only needs to evaluate the parameters in "Model Prediction". 60 | 61 | # Output format requirements 62 | 1. Output strictly in json format, do not add any other explanations, formats, prefixes and suffixes, such as ```json, etc. The format and separators are all in English characters. 63 | 2. Includes 3 fields, representing tool accuracy, parameter accuracy, and judgment reasons 64 | 3. Output format example: 65 | {{ 66 | "tool_correctness": 1, 67 | "parameter_correctness": 0, 68 | "reason": "..." 69 | }} 70 | ''' 71 | 72 | system_prompt_template_ast_multiple = ''' 73 | # Role 74 | You are an expert in evaluating the accuracy of intelligent tool calls. You need to evaluate the accuracy of model tool call link predictions based on the "model prediction" and "answer label" given by the user. 75 | # Steps 76 | 1. "Model Prediction" and "Answer Label" are both JSON lists, with each JSON representing the tool to be called, including the tool name and input field as calling parameters. Carefully compare all the contents. The 'similar_tools' field provides tool names with similar functionality. 77 | 2. 
Tool accuracy: If the tool in "model prediction" is the tool in "answer label" (the tool name needs to be exactly the same, and if the tool name predicted by the model is a similar tool to the answer label tool, it can also be considered consistent), the tool accuracy is considered to be 1, otherwise it is 0. 78 | 3. Parameter accuracy: Evaluate each parameter of the tool in "model prediction" in turn. If it is consistent with the parameter of the corresponding tool in "answer label" or "similar answer label", the parameter accuracy is considered to be 1, otherwise it is 0. If the tool accuracy is already 0, the parameter accuracy is also directly 0. 79 | 80 | # Notes 81 | 1. Fuzzy matching: If the parameters of the tool in "Model Prediction" are not completely consistent with the parameters of the corresponding tool in "Answer Label", but the parameter content is similar, the parameter accuracy is considered to be 1. 82 | - For example, the query parameters are all rewritten search statements, and the content may be similar but the expression is different. If the content is similar, the parameters can be considered consistent. 83 | - For example, the location parameters are all place names, and there may be different expressions such as "北京", "Beijing", "Beijing Region", etc., which can be considered consistent. 84 | - For example, the longitude and latitude parameters are consistent before one decimal place, which is considered to be consistent. 85 | - No language verification, parameters can be in any language as long as the content is the same. 86 | 2. Skip if standard parameter values ​​are not provided 87 | - If some parameters of the tool in "Model Prediction" do not provide standard parameter values ​​in "Answer Label", they are skipped and considered to be consistent. 88 | 3. It is not required that all parameters of "Answer Label" be included in "Model Prediction". Parameter accuracy only needs to evaluate the parameters in "Model Prediction". 89 | 90 | # Output format requirements 91 | 1. Output strictly in json format, do not add any other explanations, formats, prefixes and suffixes, such as ```json, etc. The format and separators are all in English characters. 92 | 2. Includes 2 fields, representing tool accuracy, parameter accuracy. 93 | 3. Output format example: 94 | {{ 95 | "tool_correctness": 1, 96 | "parameter_correctness": 0 97 | }} 98 | ''' 99 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/model_utils/base_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List, Dict, Any, Optional 3 | from ..global_variables import * 4 | 5 | class BaseModelAPIProvider: 6 | """ 7 | 8 | Usage: 9 | model_provider = _global_model_provider[MODEL_SELECTION_GPT4O] if MODEL_SELECTION_GPT4O in _global_model_provider else None 10 | result = model_provider.api_chat(messages) if model_provider is not None else {} 11 | completion = result[KEY_COMPLETION] 12 | 13 | """ 14 | 15 | def __init__(self, model_name: str): 16 | """ 17 | Args: 18 | model_name: e.g. 
claude-3.7 19 | """ 20 | self.model_name = model_name 21 | 22 | def api_chat(self, messages: List[Any], **kwargs) -> Dict[str, Any]: 23 | """ 24 | Args: 25 | messages: List[Any] 26 | **kwargs: other parameters 27 | 28 | Returns: 29 | str: response 30 | """ 31 | result = { 32 | KEY_FUNCTION_CALL: {}, 33 | KEY_COMPLETION: "", 34 | KEY_REASON_CONTENT: "" 35 | } 36 | return result 37 | 38 | def api_function_call(self, messages: List[Any], tools: list, **kwargs) -> Dict[str, Any]: 39 | """ 40 | 41 | Args: 42 | messages: list of json message 43 | tools: available tool json 44 | **kwargs: other parameters 45 | 46 | Returns: 47 | Dict: result 48 | """ 49 | result = { 50 | KEY_FUNCTION_CALL: {}, 51 | KEY_COMPLETION: "", 52 | KEY_REASON_CONTENT: "" 53 | } 54 | return result 55 | 56 | def function_call_result_common_mapper(tool_call): 57 | """ 58 | This wrapper is a common mapper to wrap the result of OpenAI/Claude Stype function call results, thinking/no thinking models 59 | Args: 60 | tool_call: 61 | { 62 | "id": "call_d6f4ed29ce614390b99a05", 63 | "function": { 64 | "arguments": "{\"url\": \"https://www.stackoverflow.com\", \"browserType\": \"chromium\"}", 65 | "name": "playwright_navigate" 66 | }, 67 | "type": "function", 68 | "index": 0 69 | } 70 | 71 | Return: 72 | tools_choice_response 73 | 74 | { 75 | "function_name": "playwright_navigate", 76 | "function_arguments": "{\"url\": \"https://www.stackoverflow.com\", \"browserType\": \"chromium\"}", 77 | "is_function_call": true, 78 | "id": "call_d6f4ed29ce614390b99a05" 79 | } 80 | completion: str 81 | reasoningContent: str 82 | """ 83 | if tool_call is None or len(tool_call) == 0: 84 | return {}, "", "" 85 | 86 | tools_choice_response = { 87 | 'function_name': '', 88 | 'function_arguments': '', 89 | 'is_function_call': False, 90 | 'id': '' 91 | } 92 | completion = "" 93 | reasoningContent = "" 94 | try: 95 | tool_id = tool_call["id"] if "id" in tool_call else "" 96 | function = tool_call["function"] if "function" in tool_call else {} 97 | function_arguments = function["arguments"] if "arguments" in function else {} 98 | function_name = function["name"] if "name" in function else "" 99 | 100 | tools_choice_response["is_function_call"] = True 101 | tools_choice_response["function_name"] = function_name 102 | tools_choice_response["function_arguments"] = function_arguments 103 | tools_choice_response["id"] = tool_id 104 | except Exception as e: 105 | logging.error(f"Failed to run tool_result_to_claude_mapper {e}") 106 | return tools_choice_response, completion, reasoningContent 107 | 108 | def tool_call_parameter_wrapper(model: str, tool_id: str, tool_name: str, tool_arguments: dict): 109 | 110 | message_tool_parameter = {} 111 | if "gpt" in model: 112 | # OpenAI Claude Format 113 | message_tool_parameter = tool_call_param_openai_wrapper(tool_id, tool_name, tool_arguments) 114 | elif "claude" in model: 115 | # Claude Format 116 | message_tool_parameter = tool_call_param_claude_wrapper(tool_id, tool_name, tool_arguments) 117 | elif "qwen" in model: 118 | # Qwen Wrapper 119 | message_tool_parameter = tool_call_param_qwen_wrapper(tool_id, tool_name, tool_arguments) 120 | else: 121 | message_tool_parameter = tool_call_param_openai_wrapper(tool_id, tool_name, tool_arguments) 122 | return message_tool_parameter 123 | 124 | def tool_call_result_wrapper(model: str, tool_id: str, tool_name: str, tool_result: dict): 125 | 126 | message_tool_result = {} 127 | if "gpt" in model: 128 | # OpenAI Claude Format 129 | message_tool_result = 
tool_call_result_openai_wrapper(tool_id, tool_name, tool_result) 130 | elif "claude" in model: 131 | # Claude Format 132 | message_tool_result = tool_call_result_claude_wrapper(tool_id, tool_result) 133 | elif "qwen" in model: 134 | # Qwen Wrapper 135 | message_tool_result = tool_call_result_qwen_wrapper(tool_id, tool_result) 136 | else: 137 | message_tool_result = tool_call_result_openai_wrapper(tool_id, tool_name, tool_result) 138 | return message_tool_result 139 | 140 | def tools_openai_wrapper(tools): 141 | tools_wrapped = [{ 142 | "type": "function", 143 | "function":{ 144 | "name": tool["name"] if "name" in tool else "", 145 | "description": tool["description"] if "description" in tool else "", 146 | "parameters": tool["input_schema"] if "input_schema" in tool else {} 147 | } 148 | } for tool in tools] 149 | return tools_wrapped 150 | 151 | def tool_call_param_openai_wrapper(tool_id: str, tool_name: str, arguments: Dict, **kwargs): 152 | 153 | context_id = kwargs["context_id"] if "context_id" in kwargs else "" 154 | session_id = kwargs["session_id"] if "session_id" in kwargs else "" 155 | 156 | oai_tool_call = { 157 | "role": "assistant", 158 | "content": None, 159 | "tool_calls": [ 160 | { 161 | "id": tool_id, 162 | "type": "function", 163 | "function": { 164 | "name": tool_name, 165 | "arguments": json.dumps(arguments), 166 | }, 167 | } 168 | ], 169 | } 170 | if context_id != "": 171 | oai_tool_call["contextId"] = context_id 172 | if session_id != "": 173 | oai_tool_call["sessionId"] = session_id 174 | return oai_tool_call 175 | 176 | def tool_call_param_claude_wrapper(tool_id: str, tool_name: str, arguments: Dict): 177 | claude_tool_assistant = { 178 | "role": "assistant", 179 | "content": [ 180 | { 181 | "type": "tool_use", 182 | "id": tool_id, 183 | "name": tool_name, 184 | "input": arguments 185 | } 186 | ] 187 | } 188 | return claude_tool_assistant 189 | 190 | def tool_call_param_claude_bedrock_wrapper(tool_id: str, tool_name: str, arguments: Dict): 191 | claude_tool_assistant = { 192 | "role": "assistant", 193 | "content": [ 194 | { 195 | "toolUse": { 196 | "id": tool_id, 197 | "name": tool_name, 198 | "input": arguments 199 | } 200 | } 201 | ] 202 | } 203 | return claude_tool_assistant 204 | 205 | def tool_call_param_qwen_wrapper(tool_id: str, tool_name: str, arguments: Dict): 206 | qwen_tool_assistant = { 207 | "role": "assistant", 208 | "content": None, 209 | "tool_calls": [ 210 | { 211 | "id": tool_id, 212 | "type": "function", 213 | "function": { 214 | "name": tool_name, 215 | "arguments": json.dumps(arguments) 216 | } 217 | } 218 | ] 219 | } 220 | return qwen_tool_assistant 221 | 222 | def tool_call_result_openai_wrapper(tool_id: str, tool_name: str, result: Any): 223 | """ 224 | """ 225 | oai_tool_result_msg = { 226 | "tool_call_id": tool_id, 227 | "role": "tool", 228 | "name": tool_name, 229 | "content": json.dumps(result), # Must be a string 230 | } 231 | return oai_tool_result_msg 232 | 233 | def tool_call_result_claude_wrapper(tool_id: str, result: Any): 234 | """ 235 | """ 236 | claude_tool_result_msg = { 237 | "role": "user", 238 | "content": [ 239 | { 240 | "type": "tool_result", 241 | "tool_use_id": tool_id, # from the API response 242 | "content": json.dumps(result) # from running your tool 243 | } 244 | ] 245 | } 246 | return claude_tool_result_msg 247 | 248 | def tool_call_result_claude_bedrock_wrapper(tool_id: str, result: Any): 249 | """ 250 | """ 251 | tool_result_msg = { 252 | "role": "user", 253 | "content": [ 254 | { 255 | "toolResult": { 256 | 
"toolUseId": tool_id, 257 | "content": [ 258 | {"text": json.dumps(result)} 259 | ], 260 | } 261 | } 262 | ] 263 | } 264 | return tool_result_msg 265 | def tool_call_result_qwen_wrapper(tool_id: str, result: Any): 266 | """ 267 | """ 268 | qwen_tool_result_msg = { 269 | "role": "tool", 270 | "content": [{"type": "text", "text": json.dumps(result)}], 271 | "tool_call_id": tool_id 272 | } 273 | return qwen_tool_result_msg 274 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This is the entry function for starting the entire project, receiving all command line parameters 3 | Parameters include: 4 | --input_file: Input file path, default is data/demo/demo_v0.json 5 | --category: Data category, such as browser, search, default is demo 6 | --model: Model, such as GPT4o, default is GPT4o 7 | --stage: Stage, such as demo, generation, tool_call, default is demo 8 | --metric: Metric, such as acc, pass_k, default is pass_k 9 | --pass_k: Parameter k, such as 1,5,10, default is 1 10 | --agent: Execution agent, base, base_tool_rag, base_multi-agent, default is base 11 | --mcp_config: MCP configuration file path, default is mcp_marketplace/mcp_config.json 12 | --data_version: Data version, such as v0, v1, default is v0 13 | --log_file: Log file name for resume functionality, optional. If not provided, auto-generates based on input file and timestamp. 14 | 15 | stage: 16 | 1. If stage is generation, call run_data_generator.py, generate data according to specified category and data_version. 17 | 3. If stage is tool_call, call run_tool_call.py according to specified model, directly perform calling and evaluation. 18 | 4. If stage is all, first run run_data_generator.py to generate data, then call run_tool_call.py for calling and evaluation. 19 | 5. If stage is demo, use all default parameters, first run run_data_generator.py to generate data, then call run_tool_call.py for calling and evaluation. 20 | 21 | Notes: 22 | 1. When stage is demo, all default parameters must be used, no other parameters can be specified, data will be generated first, then tool calling and evaluation will be performed. 23 | 2. When stage is generation, category and data_version must be filled, remind customers to provide all mcp tools files in the category directory under mcp_marketplace, format reference existing files. 24 | 3. When stage is tool_call, input_file, category, model must be filled 25 | 4. When stage is all, category, data_version, model must be filled, input_file is the data generated in the generation stage. 26 | 5. Print all parameters to remind users when running. 27 | 6. The tool_call stage now supports incremental logging and resume functionality. Logs are saved after each task completion, and the system can resume from where it left off if interrupted. 
28 | ''' 29 | 30 | import argparse 31 | import sys 32 | from pathlib import Path 33 | 34 | 35 | from src.mcp_tool_bench import * 36 | from src.mcp_tool_bench.agents.data_generator_agent.run_data_generator import run_data_generation 37 | from src.mcp_tool_bench.agents.base_tool_call_agent.run_tool_call import run_benchmark 38 | 39 | # Default parameter values 40 | DEFAULT_ARGS = { 41 | 'input_file': 'data/demo/demo_v0.json', 42 | 'category': 'demo', 43 | 'model': 'gpt-4o', 44 | 'stage': 'demo', 45 | 'metric': 'pass_k', 46 | 'pass_k': '1', 47 | 'agent': 'base', 48 | 'mcp_config': 'mcp_marketplace/mcp_config.json', 49 | 'data_version': 'v0', 50 | 'log_file': None, 51 | 'evaluation_trial_per_task': 5, 52 | 'llm_as_judge_model': "gpt-4o" 53 | } 54 | 55 | def parse_arguments(): 56 | """Parse command line arguments""" 57 | parser = argparse.ArgumentParser(description='Project entry function') 58 | parser.add_argument('--input_file', default=DEFAULT_ARGS['input_file'], help='Input file path for tool_call stage, default is {}'.format(DEFAULT_ARGS['input_file'])) 59 | parser.add_argument('--category', default=DEFAULT_ARGS['category'], help='Data category, such as browser, search, default is {}'.format(DEFAULT_ARGS['category'])) 60 | parser.add_argument('--model', default=DEFAULT_ARGS['model'], help='Model, such as GPT4o, default is {}'.format(DEFAULT_ARGS['model'])) 61 | parser.add_argument('--stage', default=DEFAULT_ARGS['stage'], choices=['demo', 'generation', 'tool_call', 'all'], help='Stage, such as demo, generation, tool_call, all, default is {}'.format(DEFAULT_ARGS['stage'])) 62 | parser.add_argument('--metric', default=DEFAULT_ARGS['metric'], help='Metric, such as acc, pass_k, default is {}'.format(DEFAULT_ARGS['metric'])) 63 | parser.add_argument('--pass_k', type=str, default=DEFAULT_ARGS['pass_k'], help='Parameter k, such as 1,5,10, default is {}'.format(DEFAULT_ARGS['pass_k'])) 64 | parser.add_argument('--agent', default=DEFAULT_ARGS['agent'], help='Execution agent, such as base, base_tool_rag, base_multi-agent, default is {}'.format(DEFAULT_ARGS['agent'])) 65 | parser.add_argument('--mcp_config', default=DEFAULT_ARGS['mcp_config'], help='MCP configuration file path, default is {}'.format(DEFAULT_ARGS['mcp_config'])) 66 | parser.add_argument('--data_version', default=DEFAULT_ARGS['data_version'], help='Data version, such as v0, v1, default is {}'.format(DEFAULT_ARGS['data_version'])) 67 | parser.add_argument('--log_file', default=DEFAULT_ARGS['log_file'], help='Specify log file name for resume functionality. 
If not provided, will auto-generate based on input file and timestamp.') 68 | parser.add_argument('--evaluation_trial_per_task', type=int, default=DEFAULT_ARGS['evaluation_trial_per_task'], help='Calculation Pass@K Number of Trials...') 69 | parser.add_argument('--llm_as_judge_model', type=str, default=DEFAULT_ARGS['llm_as_judge_model'], help='LLM Model Used to determine the parameters are correctly aligned with ground-truth, especial in search tool that query is rewritten') 70 | 71 | return parser.parse_args() 72 | 73 | def validate_arguments(args): 74 | """Validate the validity of arguments""" 75 | if args.stage == 'demo': 76 | # Check if non-default parameters are used in demo stage 77 | # Check if there are non-default parameters (excluding log_file which is optional) 78 | non_default_args = [] 79 | for arg_name in vars(args): 80 | if arg_name == 'log_file': # log_file is optional and allowed in demo 81 | continue 82 | current_value = getattr(args, arg_name) 83 | default_value = DEFAULT_ARGS.get(arg_name) 84 | if current_value != default_value: 85 | non_default_args.append(f"--{arg_name}") 86 | 87 | if non_default_args: 88 | print("Error: demo stage does not support specifying other parameters") 89 | print(f"Detected non-default parameters: {', '.join(non_default_args)}") 90 | print("demo stage will use all default parameters, please run directly: python run.py") 91 | return False 92 | 93 | return True 94 | 95 | if args.stage == 'generation': 96 | print("Note: The data generation module requires to provide all mcp tools files in the category directory under mcp_marketplace, format reference existing files.") 97 | if not args.category or not args.data_version: 98 | print("Error: generation stage must fill category, data_version") 99 | return False 100 | 101 | if args.stage == 'tool_call': 102 | if not args.input_file or not args.category or not args.model: 103 | print("Error: tool_call stage must fill input_file, category, model") 104 | return False 105 | 106 | if args.stage == 'all': 107 | print("Note: The data generation module requires to provide all mcp tools files in the category directory under mcp_marketplace, format reference existing files.") 108 | if not args.category or not args.data_version or not args.model: 109 | print("Error: all stage must fill category, data_version, model") 110 | return False 111 | 112 | return True 113 | 114 | 115 | def print_arguments(args): 116 | """Print all arguments""" 117 | print("=== Running Parameters ===") 118 | for arg, value in vars(args).items(): 119 | print(f"{arg}: {value}") 120 | print("===============") 121 | 122 | 123 | def main(): 124 | """Main function""" 125 | args = parse_arguments() 126 | print_arguments(args) 127 | 128 | if not validate_arguments(args): 129 | sys.exit(1) 130 | 131 | if args.stage == 'demo': 132 | # demo stage: use default parameters, generate data first, then perform tool calling and evaluation 133 | print("=" * 50) 134 | print("Executing demo stage: using default parameters") 135 | print("=" * 50) 136 | 137 | print("\n【Step 1】Data Generation") 138 | print("-" * 30) 139 | run_data_generation(args.category, args.data_version, args.mcp_config) 140 | 141 | elif args.stage == 'generation': 142 | # generation stage: generate data 143 | print("=" * 50) 144 | print("Executing generation stage: generate data") 145 | print("=" * 50) 146 | 147 | print("\n【Step 1】Data Generation") 148 | print("-" * 30) 149 | run_data_generation(args.category, args.data_version, args.mcp_config) 150 | 151 | print("\n" + "=" * 50) 152 | 
print("generation stage execution completed") 153 | print("=" * 50) 154 | 155 | elif args.stage == 'tool_call': 156 | # tool_call stage: tool calling and evaluation 157 | print("=" * 50) 158 | print("Executing tool_call stage: tool calling and evaluation") 159 | print("=" * 50) 160 | 161 | print("\n【Step 1】Tool Calling and Evaluation") 162 | print("-" * 30) 163 | run_benchmark(args) 164 | 165 | print("\n" + "=" * 50) 166 | print("tool_call stage execution completed") 167 | print("=" * 50) 168 | 169 | elif args.stage == 'all': 170 | # all stage: generate data first, then perform tool calling and evaluation 171 | print("=" * 50) 172 | print("Executing all stage: generate data first, then perform tool calling and evaluation") 173 | print("=" * 50) 174 | 175 | print("\n【Step 1】Data Generation") 176 | print("-" * 30) 177 | run_data_generation(args.category, args.data_version, args.mcp_config) 178 | 179 | print("\n【Step 2】Tool Calling and Evaluation") 180 | print("-" * 30) 181 | # Set input_file to the generated data file 182 | args.input_file = f"data/{args.category}/{args.category}_{args.data_version}.json" 183 | run_benchmark(args) 184 | 185 | print("\n" + "=" * 50) 186 | print("Full pipeline execution completed") 187 | print("=" * 50) 188 | 189 | 190 | if __name__ == "__main__": 191 | main() 192 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/data_generator_agent/utils/prompt_reference.py: -------------------------------------------------------------------------------- 1 | browser_candidate_references = { 2 | "url": [ 3 | "https://www.baidu.com", 4 | "https://www.taobao.com", 5 | "https://www.jd.com", 6 | "https://www.qq.com", 7 | "https://www.sina.com.cn", 8 | "https://www.163.com", 9 | "https://www.sohu.com", 10 | "https://www.zhihu.com", 11 | "https://www.douban.com", 12 | "https://www.bilibili.com", 13 | "https://www.weibo.com", 14 | "https://www.tmall.com", 15 | "https://www.alipay.com", 16 | "https://www.ctrip.com", 17 | "https://www.meituan.com", 18 | "https://www.douyin.com", 19 | "https://www.xiaohongshu.com", 20 | "https://www.youku.com", 21 | "https://www.iqiyi.com", 22 | "https://www.tencent.com", 23 | "https://www.xiaomi.com", 24 | "https://www.huawei.com", 25 | "https://www.oppo.com", 26 | "https://www.vivo.com", 27 | "https://www.oneplus.com", 28 | "https://www.realme.com", 29 | "https://www.meizu.com", 30 | "https://www.zte.com", 31 | "https://www.alibaba.com", 32 | "https://www.aliexpress.com", 33 | "https://www.amazon.cn", 34 | "https://www.kuaishou.com", 35 | "https://www.yy.com", 36 | "https://www.huya.com", 37 | "https://www.douyu.com", 38 | "https://www.v.qq.com", 39 | "https://www.mgtv.com", 40 | "https://www.le.com", 41 | "https://www.pptv.com", 42 | "https://www.fun.tv", 43 | "https://www.cntv.cn", 44 | "https://www.cctv.com", 45 | "https://www.people.com.cn", 46 | "https://www.xinhuanet.com", 47 | "https://www.chinanews.com.cn", 48 | "https://www.gmw.cn", 49 | "https://www.cyol.com", 50 | "https://www.jschina.com.cn", 51 | "https://www.ynet.com", 52 | "https://www.thepaper.cn", 53 | "https://www.jiemian.com", 54 | "https://www.caixin.com" 55 | ], 56 | "iframeSelector": [ 57 | "#iframeResult", 58 | "#iframeContent", 59 | ".iframe-container iframe", 60 | "#externalFrame", 61 | "#embedFrame", 62 | "#playerFrame", 63 | "#videoFrame", 64 | "#loginFrame", 65 | "#paymentFrame", 66 | "#chatFrame", 67 | "#adFrame", 68 | "#previewFrame", 69 | "#widgetFrame", 70 | "#contentFrame", 71 | "#mainFrame", 72 | 
"#sideFrame", 73 | "#popupFrame", 74 | "#modalFrame", 75 | "#dynamicFrame", 76 | "#thirdPartyFrame" 77 | ], 78 | "selector": [ 79 | "#username", 80 | "#password", 81 | ".login-btn", 82 | ".search-input", 83 | ".submit-btn", 84 | "#main-nav", 85 | ".product-item", 86 | ".add-to-cart", 87 | "#checkout-btn", 88 | ".user-avatar", 89 | ".dropdown-menu", 90 | "#home-link", 91 | "#contact-form", 92 | ".news-item", 93 | ".video-player", 94 | "#comment-box", 95 | ".rating-stars", 96 | "#footer-links", 97 | ".social-share", 98 | ".cookie-banner" 99 | ], 100 | "key": [ 101 | "Enter", 102 | "Tab", 103 | "ArrowDown", 104 | "ArrowUp", 105 | "ArrowLeft", 106 | "ArrowRight", 107 | "Escape", 108 | "Backspace", 109 | "Delete", 110 | "Home", 111 | "End", 112 | "PageUp", 113 | "PageDown", 114 | "F5", 115 | "Control", 116 | "Alt", 117 | "Shift", 118 | "Space", 119 | "a", 120 | "1" 121 | ], 122 | "script": [ 123 | "document.title", 124 | "window.scrollTo(0, document.body.scrollHeight)", 125 | "alert('test')", 126 | "console.log('hello world')", 127 | "document.querySelector('button').click()", 128 | "window.location.href", 129 | "document.cookie", 130 | "localStorage.getItem('token')", 131 | "sessionStorage.setItem('key', 'value')", 132 | "document.querySelectorAll('.item').length", 133 | "window.innerWidth", 134 | "window.innerHeight", 135 | "document.readyState", 136 | "performance.now()", 137 | "navigator.userAgent", 138 | "document.querySelector('input').value = 'test'", 139 | "document.activeElement.blur()", 140 | "window.history.back()", 141 | "window.print()", 142 | "document.execCommand('copy')" 143 | ], 144 | "value": [ 145 | "testuser", 146 | "password123", 147 | "example@example.com", 148 | "13800138000", 149 | "2023-01-01", 150 | "100.00", 151 | "https://example.com", 152 | "123456", 153 | "18" 154 | ] 155 | } 156 | 157 | finance_candidate_references = { 158 | "symbol_list": [ 159 | # Global Market Stock 160 | "AAPL", # Apple Inc. 161 | "MSFT", # Microsoft Corporation 162 | "GOOGL", # Alphabet Inc. (Google) 163 | "AMZN", # Amazon.com Inc. 164 | "NVDA", # NVIDIA Corporation 165 | "META", # Meta Platforms Inc. - Meta/Facebook 166 | "TSLA", # Tesla Inc. 167 | "BRK.A", # Berkshire Hathaway 168 | "BRK.B", # Berkshire Hathaway 169 | "LLY", # Eli Lilly and Company 170 | "TSM", # Taiwan Semiconductor 171 | "WMT", # Walmart Inc. 172 | "JPM", # JPMorgan Chase & Co. 173 | "V", # Visa Inc. 174 | "PG", # Procter & Gamble 175 | "UNH", # UnitedHealth Group 176 | "HD", # Home Depot 177 | "MA", # Mastercard 178 | "BAC", # Bank of America 179 | "ABBV", # AbbVie Inc. 180 | "PFE", # Pfizer Inc. 181 | "KO", # Coca-Cola Company 182 | "PEP", # PepsiCo Inc. 183 | "MRK", # Merck & Co. 184 | "CSCO", # Cisco Systems 185 | "ADBE", # Adobe Inc. 186 | "NFLX", # Netflix Inc. 187 | "CRM", # Salesforce Inc. 
188 | "ACN", # Accenture plc 189 | "TMO", # Thermo Fisher Scientific 190 | # China Mainland Stock List 191 | "SH600519", # 贵州茅台 192 | "SH600036", # 招商银行 193 | "SH600900", # 长江电力 194 | "SH600276", # 恒瑞医药 195 | "SH600887", # 伊利股份 196 | "SH600031", # 三一重工 197 | "SH600000", # 浦发银行 198 | "SH600028", # 中国石化 199 | "SH600030", # 中信证券 200 | "SH600104", # 上汽集团 201 | "SZ000858", # 五粮液 202 | "SZ000002", # 万科A 203 | "SZ000001", # 平安银行 204 | "SZ000333", # 美的集团 205 | "SZ000651", # 格力电器 206 | "SZ000725", # 京东方A 207 | "SZ000063", # 中兴通讯 208 | "SZ002594", # 比亚迪 209 | "SZ300750", # 宁德时代 210 | "SZ300059", # 东方财富 211 | # HKEX 212 | "00700", # 腾讯控股 - Tencent Holdings 213 | "03690", # 美团点评 - Meituan Dianping 214 | "09988", # 阿里巴巴 - Alibaba Group 215 | "01810", # 小米集团 - Xiaomi Corporation 216 | "09618", # 京东集团 - JD.com 217 | "09868", # 小鹏汽车 - XPeng Motors 218 | "02015", # 理想汽车 - Li Auto 219 | "09866", # 蔚来汽车 - NIO Inc. 220 | "02382", # 舜宇光学 - Sunny Optical 221 | "00780", # 同程旅行 - Tongcheng Travel 222 | "02331", # 李宁 - Li Ning 223 | "02020", # 安踏体育 - ANTA Sports 224 | "03692", # 翰森制药 - Hansoh Pharmaceutical 225 | "01177", # 中国生物制药 - Sino Biopharmaceutical 226 | "01093", # 石药集团 - CSPC Pharmaceutical 227 | "02269", # 药明生物 - WuXi Biologics 228 | "03613", # 同仁堂国药 - Tong Ren Tang Technologies 229 | "00883", # 中国海洋石油 - CNOOC 230 | "00939", # 建设银行 - China Construction Bank 231 | "01398" # 工商银行 - Industrial and Commercial Bank of China 232 | ], 233 | "market": [ 234 | "US", # United States 235 | "HK", # HKEX 236 | "CN_MAINLAND", # China Mainland Stock A 237 | "LSE", # London Stock Exchange 238 | "NSE_INDIA", # National Stock Exchange India 239 | "JPX", # Japan 240 | "ASX", # Australia 241 | "TSX", # Toronto 242 | "FWB", # Frankfurt 243 | "EURONEXT", # EURONEXT 244 | "SSE", # Shanghia 245 | "SZSE", # Shenzhen 246 | "KRX", # Korean 247 | "SGX", # Singapore 248 | "TSE", # Tokyo 249 | "BSE", # Bombay 250 | "MOEX", # Moscow 251 | ] 252 | } 253 | 254 | search_candidate_references = { 255 | } 256 | 257 | map_candidate_references = { 258 | } 259 | 260 | filesystem_candidate_references = { 261 | } 262 | 263 | pay_candidate_references = { 264 | } 265 | 266 | browser_special_needs_description = ''' 267 | 1. 对于iframe操作,需要先定位iframe再操作内部元素 268 | 2. 文件上传操作需要特殊处理,不能直接设置input值 269 | 3. 动态加载的内容需要等待元素出现 270 | 4. 跨域iframe有安全限制需要注意 271 | ''' 272 | 273 | finance_special_needs_description = ''' 274 | 1. Note that the stock codes and market codes in the candidate set must match exactly, for example, SH600519 must match CN_MAINLAND, AAPL must match US, and there cannot be a pairing of stock codes that do not exist in the market. 275 | 2. The values of the candidate set are all from the list provided by the Parameter candidate value reference. 276 | 3. If it is easy to determine the market it belongs to from the stock code, the query does not reflect the market, which is closer to the user's daily inquiries, such as directly asking "What is the current share price of ?" instead of "What is the current share price of in the market?". Of course, the label parameter is still given in full as usual. 
277 | ''' 278 | 279 | search_special_needs_description = ''' 280 | ''' 281 | 282 | map_special_needs_description = ''' 283 | ''' 284 | 285 | filesystem_special_needs_description = ''' 286 | ''' 287 | 288 | pay_special_needs_description = ''' 289 | ''' 290 | 291 | candidate_reference_list = { 292 | "browser": browser_candidate_references, 293 | "finance": finance_candidate_references, 294 | "search": search_candidate_references, 295 | "map": map_candidate_references, 296 | "filesystem": filesystem_candidate_references, 297 | "pay": pay_candidate_references 298 | } 299 | 300 | special_needs_description_list = { 301 | "browser": browser_special_needs_description, 302 | "finance": finance_special_needs_description, 303 | "search": search_special_needs_description, 304 | "map": map_special_needs_description, 305 | "filesystem": filesystem_special_needs_description, 306 | "pay": pay_special_needs_description 307 | } 308 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/agents/base_tool_call_agent/check_functions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../../..'))) 6 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../..'))) 7 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, '../'))) 8 | sys.path.insert(0, os.path.abspath(os.path.join(CURRENT_DIR, './'))) 9 | 10 | from src.mcp_tool_bench.agents.base_tool_call_agent.prompt import * 11 | from src.mcp_tool_bench.evaluation.evaluation_utils import estimate_pass_at_k, base_error_analysis 12 | import html 13 | import re 14 | from bs4 import BeautifulSoup 15 | from src.mcp_tool_bench.global_variables import * 16 | from src.mcp_tool_bench.model_utils.model_provider import get_model_provider 17 | from src.mcp_tool_bench.model_utils.base_api import * 18 | import json 19 | import logging 20 | from typing import List, Dict, Tuple 21 | from tqdm import tqdm 22 | 23 | def decode_html_entities(s): 24 | """Decode HTML entities""" 25 | # Try using html.unescape 26 | first_decode = html.unescape(s) 27 | # Check if still contains undecoded entities 28 | if "&" in first_decode and ";" in first_decode: 29 | # Use BeautifulSoup for further decoding 30 | soup = BeautifulSoup(first_decode, "html.parser") 31 | second_decode = soup.get_text() 32 | return second_decode 33 | return first_decode 34 | 35 | def auto_fix_unclosed_quotes(data): 36 | """ 37 | Automatically add space after colon in key-value pairs, e.g., convert 'key:value' to 'key: value' 38 | """ 39 | if isinstance(data, list): 40 | return data 41 | # Use regex to match cases where colon is not followed by space, and add space 42 | data = re.sub(r'(?m)^(\s*[^#\s][^:]*):([^\s])', r'\1: \2', data) 43 | 44 | lines = data.split("\n") 45 | fixed_lines = [] 46 | for line in lines: 47 | # Detect and fix unclosed quotes 48 | if line.count('"') % 2 != 0: 49 | line = line + '"' # Append a quote to close 50 | fixed_lines.append(line) 51 | return "\n".join(fixed_lines) 52 | 53 | def process_response(response_text): 54 | """Process GPT response text""" 55 | if not response_text: 56 | return "" 57 | 58 | raw_val = decode_html_entities(response_text) 59 | raw_val = auto_fix_unclosed_quotes(raw_val) 60 | decoded_json_str = html.unescape(raw_val) 61 | decoded_json_str = decoded_json_str.replace("```json\n", "").replace("```", "").replace("\n", "") 
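    # Illustrative note (hypothetical judge reply, not taken from a real run): a raw
    # completion such as '```json\n{"tool_correctness": 1, "parameter_correctness": 0}\n```'
    # should reduce to '{"tool_correctness": 1, "parameter_correctness": 0}' after the
    # entity decoding, quote fixing, and fence/newline stripping above, so the caller
    # (e.g. check_ast below) can pass the returned string directly to json.loads.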
62 | return decoded_json_str 63 | 64 | def check_ast(pred_tool_result_list: List[Dict], label_result_list: List[Dict], query: str, model_name: str = MODEL_SELECTION_GPT4O) -> Tuple[bool, bool]: 65 | """ 66 | Check the AST of tool calls 67 | model_name: required, the LLM as a judge can verify the parameters are aligned. For example, the "query" used in search tools may be 68 | rewrited by Function Call models. And LLM as a judge need to determine if the query is correctedly rewritten that match the original query. 69 | Default: Using GPT4o 70 | """ 71 | try: 72 | if pred_tool_result_list == label_result_list: 73 | return True, True 74 | label_step = 1 75 | predict_step = 1 76 | if (label_step == 1 and predict_step == 1): 77 | user_prompt = user_prompt_template_ast.format(pred_tool_result_list=pred_tool_result_list, label_result_list=label_result_list, query=query) 78 | system_prompt = system_prompt_template_ast_single.format() 79 | messages = [ 80 | { 81 | "role": "system", 82 | "content": system_prompt 83 | }, 84 | { 85 | "role": "user", 86 | "content": user_prompt 87 | } 88 | ] 89 | # print("messages: ", messages) 90 | model_provider = get_model_provider(model_name) 91 | output = model_provider.api_chat(messages, wait_time=5) if model_provider is not None else {} 92 | raw_response = output[KEY_COMPLETION] if KEY_COMPLETION in output else "" 93 | # Normal chat: process string 94 | if isinstance(raw_response, str): 95 | result = process_response(raw_response) 96 | try: 97 | result = json.loads(result) 98 | except Exception as e: 99 | logging.error(f" Failed to parse json {e}") 100 | return False, False 101 | # print("[debug] check_ast result: ", result) 102 | tool_correctness = result["tool_correctness"] if "tool_correctness" in result else 0 103 | parameter_correctness = result["parameter_correctness"] if "parameter_correctness" in result else 0 104 | 105 | else: 106 | ## multiple 107 | return False, False 108 | return tool_correctness, parameter_correctness 109 | 110 | except Exception as e: 111 | print (f"check_ast failed with error {e}") 112 | return 0, 0 113 | 114 | def check_single_tool_call_dag(pred_tool_result: Dict, label_result: Dict) -> Tuple[bool, bool]: 115 | # implementation 116 | # print("[debug] pred_tool_result:", pred_tool_result) 117 | # print("[debug] label_result:", label_result) 118 | label_tool_name = label_result["name"] if "name" in label_result else "" 119 | similar_tools = label_result.get("similar_tools", []) 120 | # label_result = label_result["output"] if "output" in label_result else {} 121 | 122 | # prediction 123 | predict_tool_name = pred_tool_result["name"] if "name" in pred_tool_result else "" 124 | predict_result = pred_tool_result["output"] if "output" in pred_tool_result else {} 125 | predict_status_code = predict_result["status_code"] if "status_code" in predict_result else 500 126 | tool_consistency = False 127 | output_consistency = False 128 | 129 | # Direct match 130 | if label_tool_name == predict_tool_name: 131 | tool_consistency = True 132 | 133 | # Check similar tools 134 | # print("similar_tools: ", similar_tools) 135 | for similar_tool in similar_tools: 136 | if predict_tool_name == similar_tool.get("name", ""): 137 | tool_consistency = True 138 | 139 | result_success_label_list = base_error_analysis([predict_result])["result_success_label_list"] 140 | if sum(result_success_label_list)==len(result_success_label_list): 141 | output_consistency = True 142 | else: 143 | output_consistency = False 144 | return tool_consistency, 
output_consistency 145 | 146 | def check_multi_tool_call_dag(pred_tool_result_list: List[Dict], label_result_list: List[Dict]) -> Tuple[bool, bool]: 147 | """ 148 | Check the correctness of tool calls for DAG structure 149 | 150 | Args: 151 | pred_tool_result_list: List of predicted tool call results 152 | label_result_list: List of ground truth tool call results 153 | 154 | Returns: 155 | Tuple[bool, bool]: (tool_consistency, output_consistency) 156 | """ 157 | 158 | def get_leaf_nodes(tool_list: List[Dict]) -> List[Dict]: 159 | """ 160 | Get leaf nodes (last tool calls) from tool list 161 | If the last tool name is repeated, get all consecutive calls with the same name 162 | """ 163 | if not tool_list: 164 | return [] 165 | 166 | leaf_nodes = [] 167 | last_tool_name = tool_list[-1]["name"] 168 | 169 | # Iterate from the end to find all consecutive calls with the same tool name 170 | for i in range(len(tool_list) - 1, -1, -1): 171 | if tool_list[i]["name"] == last_tool_name: 172 | leaf_nodes.insert(0, tool_list[i]) 173 | else: 174 | break 175 | 176 | return leaf_nodes 177 | 178 | if len(label_result_list)<1 or len(pred_tool_result_list)<1: 179 | return False, False 180 | # Get leaf nodes from both lists 181 | # pred_leaf_nodes = get_leaf_nodes(pred_tool_result_list) 182 | # label_leaf_nodes = get_leaf_nodes(label_result_list) 183 | pred_leaf_nodes = pred_tool_result_list[-1] 184 | label_leaf_nodes = label_result_list[-1] 185 | # print("[debug] pred_leaf_nodes:", pred_leaf_nodes) 186 | # print("[debug] label_leaf_nodes:", label_leaf_nodes) 187 | 188 | return check_single_tool_call_dag(pred_leaf_nodes, label_leaf_nodes) 189 | 190 | if __name__ == "__main__": 191 | # Read the input JSON file 192 | input_file_path = "logs/browser/browser_0711_single_500_20250713_080044.json" 193 | output_file_path = "logs/browser/browser_0711_single_500_20250713_080044_ast.json" 194 | # input_file_path = "logs/browser/test_log.json" 195 | # output_file_path = "logs/browser/test_log_ast.json" 196 | 197 | try: 198 | with open(input_file_path, 'r', encoding='utf-8') as f: 199 | data = json.load(f) 200 | 201 | # Calculate total number of function calls to process 202 | total_function_calls = 0 203 | for run_detail in data.get("run_details", []): 204 | for trial in run_detail.get("trials", []): 205 | total_function_calls += len(trial.get("function_call_result", [])) 206 | 207 | print(f"Total function calls to process: {total_function_calls}") 208 | 209 | # Process each run_detail with progress bar 210 | processed_calls = 0 211 | for run_detail in tqdm(data.get("run_details", []), desc="Processing run_details"): 212 | function_call_label = run_detail.get("function_call_label", []) 213 | query = run_detail.get("query", []) 214 | 215 | # Process each trial 216 | for trial in run_detail.get("trials", []): 217 | function_call_results = trial.get("function_call_result", []) 218 | 219 | # Call check_ast with function_call_label and function_call_result 220 | tool_correctness, parameter_correctness = check_ast( 221 | function_call_results, 222 | function_call_label, 223 | query 224 | ) 225 | 226 | # Add the new fields to function_call_result 227 | trial["tool_correctness"] = True if tool_correctness == 1 else False 228 | trial["parameter_correctness"] = True if parameter_correctness == 1 else False 229 | 230 | processed_calls += 1 231 | 232 | # Write the output file 233 | with open(output_file_path, 'w', encoding='utf-8') as f: 234 | json.dump(data, f, indent=2, ensure_ascii=False) 235 | 236 | print(f"Processing 
completed. Output written to {output_file_path}") 237 | 238 | except FileNotFoundError: 239 | print(f"Error: Input file {input_file_path} not found") 240 | except json.JSONDecodeError as e: 241 | print(f"Error: Invalid JSON in input file: {e}") 242 | except Exception as e: 243 | print(f"Error: {e}") 244 | -------------------------------------------------------------------------------- /src/mcp_tool_bench/utils/calculate_metrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to calculate tool_pass@{k} and parameter_pass@{k} metrics from existing log files. 4 | This script can process log files that contain tool_correctness and parameter_correctness data 5 | but don't have the calculated pass@k metrics for these dimensions. 6 | """ 7 | 8 | import json 9 | import argparse 10 | import os 11 | import numpy as np 12 | from typing import List, Dict, Any, Tuple 13 | from src.mcp_tool_bench.evaluation.evaluation_utils import estimate_pass_at_k, base_error_analysis 14 | 15 | def check_single_tool_call_dag(pred_tool_result: Dict, label_result: Dict) -> Tuple[bool, bool]: 16 | # implementation 17 | label_tool_name = label_result["name"] if "name" in label_result else "" 18 | label_result = label_result["output"] if "output" in label_result else {} 19 | 20 | # prediction 21 | predict_tool_name = pred_tool_result["name"] if "name" in pred_tool_result else "" 22 | predict_result = pred_tool_result["output"] if "output" in pred_tool_result else {} 23 | predict_status_code = predict_result["status_code"] if "status_code" in predict_result else 500 24 | 25 | if label_tool_name == predict_tool_name: 26 | tool_consistency = True 27 | else: 28 | tool_consistency = False 29 | 30 | result_success_label_list = base_error_analysis([predict_result])["result_success_label_list"] 31 | if sum(result_success_label_list)==len(result_success_label_list): 32 | output_consistency = True 33 | else: 34 | output_consistency = False 35 | return tool_consistency, output_consistency 36 | 37 | def check_correctness(pred_tool_result_list: List[Dict], label_result_list: List[Dict]) -> Tuple[bool, bool]: 38 | """ 39 | Check the correctness of tool calls 40 | 41 | Args: 42 | pred_tool_result_list: Tool call prediction result list 43 | label_result_list: Tool call ground truth result list 44 | 45 | Returns: 46 | Tuple[bool, bool]: (tool_consistency, output_consistency) 47 | """ 48 | label_step = len(label_result_list) if label_result_list is not None else 0 49 | predict_step = len(pred_tool_result_list) if pred_tool_result_list is not None else 0 50 | 51 | tool_consistency = False 52 | output_consistency = False 53 | 54 | label_result = label_result_list[-1] 55 | pred_tool_result = pred_tool_result_list[-1] 56 | tool_consistency, output_consistency = check_single_tool_call_dag(pred_tool_result, label_result) 57 | 58 | return tool_consistency, output_consistency 59 | 60 | def calculate_metrics_from_log(log_file_path: str, pass_k_list: List[int] = None) -> Dict[str, Any]: 61 | """ 62 | Calculate tool_pass@{k} and parameter_pass@{k} metrics from a log file. 63 | 64 | Args: 65 | log_file_path: Path to the log file 66 | pass_k_list: List of k values for pass@k calculation. If None, will extract from log file. 
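
    Illustrative usage (the log path below is a hypothetical example):
        result = calculate_metrics_from_log("logs/browser/browser_run.json", pass_k_list=[1, 3])
        # result["metrics"] holds one entry per k with pass@{k}, tool_pass@{k} and parameter_pass@{k}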
67 | 68 | Returns: 69 | Dict containing the calculated metrics 70 | """ 71 | 72 | # Load log file 73 | with open(log_file_path, 'r', encoding='utf-8') as f: 74 | log_data = json.load(f) 75 | 76 | # Extract pass_k_list from log if not provided 77 | if pass_k_list is None: 78 | pass_k_str = log_data.get("run_info", {}).get("pass_k", "1") 79 | pass_k_list = [int(k) for k in pass_k_str.split(",")] 80 | 81 | print(f"Processing log file: {log_file_path}") 82 | print(f"Pass@k values: {pass_k_list}") 83 | 84 | # Extract run details 85 | run_details = log_data.get("run_details", []) 86 | if not run_details: 87 | print("No run_details found in log file") 88 | return {} 89 | 90 | # Arrays to store results for each task 91 | num_trails_array = [] 92 | num_pass_array = [] 93 | num_tool_correct_array = [] 94 | num_parameter_correct_array = [] 95 | 96 | run_details = run_details[:50] 97 | # Process each task 98 | for task in run_details: 99 | trials = task.get("trials", []) 100 | if not trials: 101 | continue 102 | 103 | # Count trials and correct results 104 | num_trials = len(trials) 105 | num_passed = 0 106 | num_tool_correct = 0 107 | num_parameter_correct = 0 108 | 109 | # Calculate directly from log 110 | num_passed = sum(1 for trial in trials if (trial.get("if_pass", False) and trial.get("tool_correctness", False) and trial.get("parameter_correctness", False))) 111 | num_tool_correct = sum(1 for trial in trials if trial.get("tool_correctness", False)) 112 | num_parameter_correct = sum(1 for trial in trials if (trial.get("parameter_correctness", False) and trial.get("tool_correctness", False))) 113 | 114 | num_trails_array.append(num_trials) 115 | num_pass_array.append(num_passed) 116 | num_tool_correct_array.append(num_tool_correct) 117 | num_parameter_correct_array.append(num_parameter_correct) 118 | 119 | print(f"Processed {len(num_trails_array)} tasks") 120 | print(f"Total trials: {sum(num_trails_array)}") 121 | print(f"Total passed: {sum(num_pass_array)}") 122 | print(f"Total tool correct: {sum(num_tool_correct_array)}") 123 | print(f"Total parameter correct: {sum(num_parameter_correct_array)}") 124 | 125 | # Calculate metrics for each k value 126 | metrics_list = [] 127 | run_info = log_data.get("run_info", {}) 128 | 129 | for k in pass_k_list: 130 | # Calculate pass@{k} for overall correctness 131 | pass_at_k_arr = estimate_pass_at_k(num_trails_array, num_pass_array, k) 132 | pass_at_k = float(np.mean(pass_at_k_arr)) if len(pass_at_k_arr) > 0 else 0 133 | 134 | # Calculate tool_pass@{k} 135 | tool_pass_at_k_arr = estimate_pass_at_k(num_trails_array, num_tool_correct_array, k) 136 | tool_pass_at_k = float(np.mean(tool_pass_at_k_arr)) if len(tool_pass_at_k_arr) > 0 else 0 137 | 138 | # Calculate parameter_pass@{k} 139 | parameter_pass_at_k_arr = estimate_pass_at_k(num_trails_array, num_parameter_correct_array, k) 140 | parameter_pass_at_k = float(np.mean(parameter_pass_at_k_arr)) if len(parameter_pass_at_k_arr) > 0 else 0 141 | 142 | metric = { 143 | "category": run_info.get("category", "unknown"), 144 | "model": run_info.get("model", "unknown"), 145 | f"pass@{k}": pass_at_k, 146 | f"tool_pass@{k}": tool_pass_at_k, 147 | f"parameter_pass@{k}": parameter_pass_at_k, 148 | "num_tasks": len(num_trails_array), 149 | "num_trials_total": sum(num_trails_array), 150 | "num_passed_total": sum(num_pass_array), 151 | "num_tool_correct_total": sum(num_tool_correct_array), 152 | "num_parameter_correct_total": sum(num_parameter_correct_array) 153 | } 154 | metrics_list.append(metric) 155 | 156 | 
print(f"Pass@{k} - Tool_selected: {tool_pass_at_k:.4f}, Parameter: {parameter_pass_at_k:.4f}, Tool_call: {pass_at_k:.4f}") 157 | 158 | return { 159 | "run_info": run_info, 160 | "metrics": metrics_list, 161 | "calculation_info": { 162 | "log_file": log_file_path, 163 | "pass_k_list": pass_k_list, 164 | "num_tasks": len(num_trails_array), 165 | "total_trials": sum(num_trails_array) 166 | } 167 | } 168 | 169 | 170 | def update_log_file_with_metrics(log_file_path: str, output_file_path: str = None) -> str: 171 | """ 172 | Update the original log file with the calculated metrics. 173 | 174 | Args: 175 | log_file_path: Path to the original log file 176 | output_file_path: Path for the updated log file. If None, will overwrite original. 177 | 178 | Returns: 179 | Path to the updated log file 180 | """ 181 | 182 | # Calculate metrics 183 | result = calculate_metrics_from_log(log_file_path) 184 | 185 | if not result: 186 | print("Failed to calculate metrics") 187 | return "" 188 | 189 | # Load original log file 190 | with open(log_file_path, 'r', encoding='utf-8') as f: 191 | original_log = json.load(f) 192 | 193 | # Update metrics in the original log 194 | original_log["metrics"] = result["metrics"] 195 | 196 | # Determine output file path 197 | if output_file_path is None: 198 | output_file_path = log_file_path 199 | 200 | # Save updated log file 201 | with open(output_file_path, 'w', encoding='utf-8') as f: 202 | json.dump(original_log, f, ensure_ascii=False, indent=2) 203 | 204 | print(f"Updated log file saved to: {output_file_path}") 205 | return output_file_path 206 | 207 | 208 | def process_multiple_logs(log_dir: str, pattern: str = None) -> None: 209 | """ 210 | Process multiple log files in a directory. 211 | 212 | Args: 213 | log_dir: Directory containing log files 214 | pattern: Optional pattern to filter log files (e.g., "browser_0711_single_500") 215 | """ 216 | 217 | if not os.path.exists(log_dir): 218 | print(f"Directory not found: {log_dir}") 219 | return 220 | 221 | log_files = [] 222 | for file in os.listdir(log_dir): 223 | if file.endswith('.json'): 224 | if pattern is None or pattern in file: 225 | log_files.append(os.path.join(log_dir, file)) 226 | 227 | print(f"Found {len(log_files)} log files to process") 228 | 229 | for log_file in log_files: 230 | print(f"\nProcessing: {log_file}") 231 | try: 232 | update_log_file_with_metrics(log_file) 233 | except Exception as e: 234 | print(f"Error processing {log_file}: {e}") 235 | 236 | 237 | def main(): 238 | parser = argparse.ArgumentParser(description="Calculate tool_pass@{k} and parameter_pass@{k} metrics from log files") 239 | parser.add_argument("--log_file", type=str, help="Path to a single log file") 240 | parser.add_argument("--log_dir", type=str, help="Directory containing log files") 241 | parser.add_argument("--pattern", type=str, help="Pattern to filter log files (when using --log_dir)") 242 | parser.add_argument("--pass_k", type=str, default="1,3", help="Comma-separated list of k values for pass@k") 243 | parser.add_argument("--output", type=str, help="Output file path (for single file processing)") 244 | parser.add_argument("--calculate_only", action="store_true", help="Only calculate and display metrics, don't update log file") 245 | 246 | args = parser.parse_args() 247 | 248 | pass_k_list = [int(k) for k in args.pass_k.split(",")] 249 | 250 | if args.log_file: 251 | if args.calculate_only: 252 | # Only calculate and display metrics 253 | result = calculate_metrics_from_log(args.log_file, pass_k_list) 254 | if result: 
255 | print("\nCalculated Metrics:") 256 | for metric in result["metrics"]: 257 | print(f" {metric}") 258 | else: 259 | # Update log file with metrics 260 | update_log_file_with_metrics(args.log_file, args.output) 261 | 262 | elif args.log_dir: 263 | # Process multiple log files 264 | process_multiple_logs(args.log_dir, args.pattern) 265 | 266 | else: 267 | print("Please provide either --log_file or --log_dir") 268 | parser.print_help() 269 | 270 | if __name__ == "__main__": 271 | main() 272 | -------------------------------------------------------------------------------- /data/file_system/filesystem_single_demo.json: -------------------------------------------------------------------------------- 1 | [{"uuid":"8cac45c6-a9ef-4881-b0e0-4b444900bdfe","category":"filesystem","call_type":"single","tools":[{"name":"read_file","description":"Read the complete contents of a file from the file system. Handles various text encodings and provides detailed error messages if the file cannot be read. Use this tool when you need to examine the contents of a single file. Use the 'head' parameter to read only the first N lines of a file, or the 'tail' parameter to read only the last N lines of a file. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"tail":{"type":"number","description":"If provided, returns only the last N lines of the file"},"head":{"type":"number","description":"If provided, returns only the first N lines of the file"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"read_multiple_files","description":"Read the contents of multiple files simultaneously. This is more efficient than reading files one by one when you need to analyze or compare multiple files. Each file's content is returned with its path as a reference. Failed reads for individual files won't stop the entire operation. Only works within allowed directories.","input_schema":{"type":"object","properties":{"paths":{"type":"array","items":{"type":"string"}}},"required":["paths"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"write_file","description":"Create a new file or completely overwrite an existing file with new content. Use with caution as it will overwrite existing files without warning. Handles text content with proper encoding. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"content":{"type":"string"}},"required":["path","content"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"edit_file","description":"Make line-based edits to a text file. Each edit replaces exact line sequences with new content. Returns a git-style diff showing the changes made. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"edits":{"type":"array","items":{"type":"object","properties":{"oldText":{"type":"string","description":"Text to search for - must match exactly"},"newText":{"type":"string","description":"Text to replace with"}},"required":["oldText","newText"],"additionalProperties":false}},"dryRun":{"type":"boolean","default":false,"description":"Preview changes using git-style diff format"}},"required":["path","edits"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"create_directory","description":"Create a new directory or ensure a directory exists. 
Can create multiple nested directories in one operation. If the directory already exists, this operation will succeed silently. Perfect for setting up directory structures for projects or ensuring required paths exist. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"list_directory","description":"Get a detailed listing of all files and directories in a specified path. Results clearly distinguish between files and directories with [FILE] and [DIR] prefixes. This tool is essential for understanding directory structure and finding specific files within a directory. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"list_directory_with_sizes","description":"Get a detailed listing of all files and directories in a specified path, including sizes. Results clearly distinguish between files and directories with [FILE] and [DIR] prefixes. This tool is useful for understanding directory structure and finding specific files within a directory. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"sortBy":{"type":"string","enum":["name","size"],"default":"name","description":"Sort entries by name or size"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"directory_tree","description":"Get a recursive tree view of files and directories as a JSON structure. Each entry includes 'name', 'type' (file/directory), and 'children' for directories. Files have no children array, while directories always have a children array (which may be empty). The output is formatted with 2-space indentation for readability. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"search_files","description":"Recursively search for files and directories matching a pattern. Searches through all subdirectories from the starting path. The search is case-insensitive and matches partial names. Returns full paths to all matching items. Great for finding files when you don't know their exact location. Only searches within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"pattern":{"type":"string"},"excludePatterns":{"type":"array","items":{"type":"string"},"default":[]}},"required":["path","pattern"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"get_file_info","description":"Retrieve detailed metadata about a file or directory. Returns comprehensive information including size, creation time, last modified time, permissions, and type. This tool is perfect for understanding file characteristics without reading the actual content. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"list_allowed_directories","description":"Returns the list of directories that this server is allowed to access. 
Use this to understand which directories are available before trying to access files.","input_schema":{"type":"object","properties":{},"required":[]}}],"mcp_tools_dict":{"filesystem":["read_file","read_multiple_files","write_file","edit_file","create_directory","list_directory","list_directory_with_sizes","directory_tree","search_files","get_file_info","list_allowed_directories"]},"query":"Provide a recursive tree view of the files and directories located at ./test_project_root/src.","function_call_label":[{"name":"directory_tree","step":"1","id":"1","mcp_server":"filesystem","similar_tools":[],"input":{"path":"./test_project_root/src"},"output":{"status_code":200,"result":{}}}]},{"uuid":"cd0a8b63-439a-4259-af0d-74dd8270d995","category":"filesystem","call_type":"single","tools":[{"name":"read_file","description":"Read the complete contents of a file from the file system. Handles various text encodings and provides detailed error messages if the file cannot be read. Use this tool when you need to examine the contents of a single file. Use the 'head' parameter to read only the first N lines of a file, or the 'tail' parameter to read only the last N lines of a file. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"tail":{"type":"number","description":"If provided, returns only the last N lines of the file"},"head":{"type":"number","description":"If provided, returns only the first N lines of the file"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"read_multiple_files","description":"Read the contents of multiple files simultaneously. This is more efficient than reading files one by one when you need to analyze or compare multiple files. Each file's content is returned with its path as a reference. Failed reads for individual files won't stop the entire operation. Only works within allowed directories.","input_schema":{"type":"object","properties":{"paths":{"type":"array","items":{"type":"string"}}},"required":["paths"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"write_file","description":"Create a new file or completely overwrite an existing file with new content. Use with caution as it will overwrite existing files without warning. Handles text content with proper encoding. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"content":{"type":"string"}},"required":["path","content"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"edit_file","description":"Make line-based edits to a text file. Each edit replaces exact line sequences with new content. Returns a git-style diff showing the changes made. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"edits":{"type":"array","items":{"type":"object","properties":{"oldText":{"type":"string","description":"Text to search for - must match exactly"},"newText":{"type":"string","description":"Text to replace with"}},"required":["oldText","newText"],"additionalProperties":false}},"dryRun":{"type":"boolean","default":false,"description":"Preview changes using git-style diff format"}},"required":["path","edits"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"create_directory","description":"Create a new directory or ensure a directory exists. 
Can create multiple nested directories in one operation. If the directory already exists, this operation will succeed silently. Perfect for setting up directory structures for projects or ensuring required paths exist. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"list_directory","description":"Get a detailed listing of all files and directories in a specified path. Results clearly distinguish between files and directories with [FILE] and [DIR] prefixes. This tool is essential for understanding directory structure and finding specific files within a directory. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"list_directory_with_sizes","description":"Get a detailed listing of all files and directories in a specified path, including sizes. Results clearly distinguish between files and directories with [FILE] and [DIR] prefixes. This tool is useful for understanding directory structure and finding specific files within a directory. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"sortBy":{"type":"string","enum":["name","size"],"default":"name","description":"Sort entries by name or size"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"directory_tree","description":"Get a recursive tree view of files and directories as a JSON structure. Each entry includes 'name', 'type' (file/directory), and 'children' for directories. Files have no children array, while directories always have a children array (which may be empty). The output is formatted with 2-space indentation for readability. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"search_files","description":"Recursively search for files and directories matching a pattern. Searches through all subdirectories from the starting path. The search is case-insensitive and matches partial names. Returns full paths to all matching items. Great for finding files when you don't know their exact location. Only searches within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"},"pattern":{"type":"string"},"excludePatterns":{"type":"array","items":{"type":"string"},"default":[]}},"required":["path","pattern"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"get_file_info","description":"Retrieve detailed metadata about a file or directory. Returns comprehensive information including size, creation time, last modified time, permissions, and type. This tool is perfect for understanding file characteristics without reading the actual content. Only works within allowed directories.","input_schema":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"list_allowed_directories","description":"Returns the list of directories that this server is allowed to access. 
Use this to understand which directories are available before trying to access files.","input_schema":{"type":"object","properties":{},"required":[]}}],"mcp_tools_dict":{"filesystem":["read_file","read_multiple_files","write_file","edit_file","create_directory","list_directory","list_directory_with_sizes","directory_tree","search_files","get_file_info","list_allowed_directories"]},"query":"Read the contents of the files located at ./test_project_root/src/main.py and ./test_project_root/docs/README.md at the same time.","function_call_label":[{"name":"read_multiple_files","step":"1","id":"1","mcp_server":"filesystem","similar_tools":[],"input":{"paths":["./test_project_root/src/main.py","./test_project_root/docs/README.md"]},"output":{"status_code":200,"result":{}}}]}] 2 | -------------------------------------------------------------------------------- /data/browser/browser_single_demo.json: -------------------------------------------------------------------------------- 1 | [{"uuid":"0b1be01a-a542-4f54-8cfc-017760c03d72","category":"browser","call_type":"single","tools":[{"name":"start_codegen_session","description":"Start a new code generation session to record Playwright actions","input_schema":{"type":"object","properties":{"options":{"type":"object","description":"Code generation options","properties":{"outputPath":{"type":"string","description":"Directory path where generated tests will be saved (use absolute path)"},"testNamePrefix":{"type":"string","description":"Prefix to use for generated test names (default: 'GeneratedTest')"},"includeComments":{"type":"boolean","description":"Whether to include descriptive comments in generated tests"}},"required":["outputPath"]}},"required":["options"]}},{"name":"end_codegen_session","description":"End a code generation session and generate the test file","input_schema":{"type":"object","properties":{"sessionId":{"type":"string","description":"ID of the session to end"}},"required":["sessionId"]}},{"name":"get_codegen_session","description":"Get information about a code generation session","input_schema":{"type":"object","properties":{"sessionId":{"type":"string","description":"ID of the session to retrieve"}},"required":["sessionId"]}},{"name":"clear_codegen_session","description":"Clear a code generation session without generating a test","input_schema":{"type":"object","properties":{"sessionId":{"type":"string","description":"ID of the session to clear"}},"required":["sessionId"]}},{"name":"playwright_navigate","description":"Navigate to a URL","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"URL to navigate to the website specified"},"browserType":{"type":"string","description":"Browser type to use (chromium, firefox, webkit). 
Defaults to chromium","enum":["chromium","firefox","webkit"]},"width":{"type":"number","description":"Viewport width in pixels (default: 1280)"},"height":{"type":"number","description":"Viewport height in pixels (default: 720)"},"timeout":{"type":"number","description":"Navigation timeout in milliseconds"},"waitUntil":{"type":"string","description":"Navigation wait condition"},"headless":{"type":"boolean","description":"Run browser in headless mode (default: false)"}},"required":["url"]}},{"name":"playwright_screenshot","description":"Take a screenshot of the current page or a specific element","input_schema":{"type":"object","properties":{"name":{"type":"string","description":"Name for the screenshot"},"selector":{"type":"string","description":"CSS selector for element to screenshot"},"width":{"type":"number","description":"Width in pixels (default: 800)"},"height":{"type":"number","description":"Height in pixels (default: 600)"},"storeBase64":{"type":"boolean","description":"Store screenshot in base64 format (default: true)"},"fullPage":{"type":"boolean","description":"Store screenshot of the entire page (default: false)"},"savePng":{"type":"boolean","description":"Save screenshot as PNG file (default: false)"},"downloadsDir":{"type":"string","description":"Custom downloads directory path (default: user's Downloads folder)"}},"required":["name"]}},{"name":"playwright_click","description":"Click an element on the page","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for the element to click"}},"required":["selector"]}},{"name":"playwright_iframe_click","description":"Click an element in an iframe on the page","input_schema":{"type":"object","properties":{"iframeSelector":{"type":"string","description":"CSS selector for the iframe containing the element to click"},"selector":{"type":"string","description":"CSS selector for the element to click"}},"required":["iframeSelector","selector"]}},{"name":"playwright_iframe_fill","description":"Fill an element in an iframe on the page","input_schema":{"type":"object","properties":{"iframeSelector":{"type":"string","description":"CSS selector for the iframe containing the element to fill"},"selector":{"type":"string","description":"CSS selector for the element to fill"},"value":{"type":"string","description":"Value to fill"}},"required":["iframeSelector","selector","value"]}},{"name":"playwright_fill","description":"fill out an input field","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for input field"},"value":{"type":"string","description":"Value to fill"}},"required":["selector","value"]}},{"name":"playwright_select","description":"Select an element on the page with Select tag","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for element to select"},"value":{"type":"string","description":"Value to select"}},"required":["selector","value"]}},{"name":"playwright_hover","description":"Hover an element on the page","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for element to hover"}},"required":["selector"]}},{"name":"playwright_evaluate","description":"Execute JavaScript in the browser console","input_schema":{"type":"object","properties":{"script":{"type":"string","description":"JavaScript code to execute"}},"required":["script"]}},{"name":"playwright_console_logs","description":"Retrieve console logs from the browser with filtering 
options","input_schema":{"type":"object","properties":{"type":{"type":"string","description":"Type of logs to retrieve (all, error, warning, log, info, debug, exception)","enum":["all","error","warning","log","info","debug","exception"]},"search":{"type":"string","description":"Text to search for in logs (handles text with square brackets)"},"limit":{"type":"number","description":"Maximum number of logs to return"},"clear":{"type":"boolean","description":"Whether to clear logs after retrieval (default: false)"}},"required":[]}},{"name":"playwright_close","description":"Close the browser and release all resources","input_schema":{"type":"object","properties":{},"required":[]}},{"name":"playwright_get","description":"Perform an HTTP GET request","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"URL to perform GET operation"}},"required":["url"]}},{"name":"playwright_post","description":"Perform an HTTP POST request","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"URL to perform POST operation"},"value":{"type":"string","description":"Data to post in the body"},"token":{"type":"string","description":"Bearer token for authorization"},"headers":{"type":"object","description":"Additional headers to include in the request","additionalProperties":{"type":"string"}}},"required":["url","value"]}},{"name":"playwright_put","description":"Perform an HTTP PUT request","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"URL to perform PUT operation"},"value":{"type":"string","description":"Data to PUT in the body"}},"required":["url","value"]}},{"name":"playwright_patch","description":"Perform an HTTP PATCH request","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"URL to perform PUT operation"},"value":{"type":"string","description":"Data to PATCH in the body"}},"required":["url","value"]}},{"name":"playwright_delete","description":"Perform an HTTP DELETE request","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"URL to perform DELETE operation"}},"required":["url"]}},{"name":"playwright_expect_response","description":"Ask Playwright to start waiting for a HTTP response. This tool initiates the wait operation but does not wait for its completion.","input_schema":{"type":"object","properties":{"id":{"type":"string","description":"Unique & arbitrary identifier to be used for retrieving this response later with `Playwright_assert_response`."},"url":{"type":"string","description":"URL pattern to match in the response."}},"required":["id","url"]}},{"name":"playwright_assert_response","description":"Wait for and validate a previously initiated HTTP response wait operation.","input_schema":{"type":"object","properties":{"id":{"type":"string","description":"Identifier of the HTTP response initially expected using `Playwright_expect_response`."},"value":{"type":"string","description":"Data to expect in the body of the HTTP response. 
If provided, the assertion will fail if this value is not found in the response body."}},"required":["id"]}},{"name":"playwright_custom_user_agent","description":"Set a custom User Agent for the browser","input_schema":{"type":"object","properties":{"userAgent":{"type":"string","description":"Custom User Agent for the Playwright browser instance"}},"required":["userAgent"]}},{"name":"playwright_get_visible_text","description":"Get the visible text content of the current page","input_schema":{"type":"object","properties":{},"required":[]}},{"name":"playwright_get_visible_html","description":"Get the HTML content of the current page","input_schema":{"type":"object","properties":{},"required":[]}},{"name":"playwright_go_back","description":"Navigate back in browser history","input_schema":{"type":"object","properties":{},"required":[]}},{"name":"playwright_go_forward","description":"Navigate forward in browser history","input_schema":{"type":"object","properties":{},"required":[]}},{"name":"playwright_drag","description":"Drag an element to a target location","input_schema":{"type":"object","properties":{"sourceSelector":{"type":"string","description":"CSS selector for the element to drag"},"targetSelector":{"type":"string","description":"CSS selector for the target location"}},"required":["sourceSelector","targetSelector"]}},{"name":"playwright_press_key","description":"Press a keyboard key","input_schema":{"type":"object","properties":{"key":{"type":"string","description":"Key to press (e.g. 'Enter', 'ArrowDown', 'a')"},"selector":{"type":"string","description":"Optional CSS selector to focus before pressing key"}},"required":["key"]}},{"name":"playwright_save_as_pdf","description":"Save the current page as a PDF file","input_schema":{"type":"object","properties":{"outputPath":{"type":"string","description":"Directory path where PDF will be saved"},"filename":{"type":"string","description":"Name of the PDF file (default: page.pdf)"},"format":{"type":"string","description":"Page format (e.g. 'A4', 'Letter')"},"printBackground":{"type":"boolean","description":"Whether to print background graphics"},"margin":{"type":"object","description":"Page margins","properties":{"top":{"type":"string"},"right":{"type":"string"},"bottom":{"type":"string"},"left":{"type":"string"}}}},"required":["outputPath"]}},{"name":"playwright_click_and_switch_tab","description":"Click a link and switch to the newly opened tab","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for the link to click"}},"required":["selector"]}},{"name":"puppeteer_navigate","description":"Navigate to a URL","input_schema":{"type":"object","properties":{"url":{"type":"string","description":"URL to navigate to"},"launchOptions":{"type":"object","description":"PuppeteerJS LaunchOptions. Default null. If changed and not null, browser restarts. Example: { headless: true, args: ['--no-sandbox'] }"},"allowDangerous":{"type":"boolean","description":"Allow dangerous LaunchOptions that reduce security. When false, dangerous args like --no-sandbox will throw errors. 
Default false."}},"required":["url"]}},{"name":"puppeteer_screenshot","description":"Take a screenshot of the current page or a specific element","input_schema":{"type":"object","properties":{"name":{"type":"string","description":"Name for the screenshot"},"selector":{"type":"string","description":"CSS selector for element to screenshot"},"width":{"type":"number","description":"Width in pixels (default: 800)"},"height":{"type":"number","description":"Height in pixels (default: 600)"},"encoded":{"type":"boolean","description":"If true, capture the screenshot as a base64-encoded data URI (as text) instead of binary image content. Default false."}},"required":["name"]}},{"name":"puppeteer_click","description":"Click an element on the page","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for element to click"}},"required":["selector"]}},{"name":"puppeteer_fill","description":"Fill out an input field","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for input field"},"value":{"type":"string","description":"Value to fill"}},"required":["selector","value"]}},{"name":"puppeteer_select","description":"Select an element on the page with Select tag","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for element to select"},"value":{"type":"string","description":"Value to select"}},"required":["selector","value"]}},{"name":"puppeteer_hover","description":"Hover an element on the page","input_schema":{"type":"object","properties":{"selector":{"type":"string","description":"CSS selector for element to hover"}},"required":["selector"]}},{"name":"puppeteer_evaluate","description":"Execute JavaScript in the browser console","input_schema":{"type":"object","properties":{"script":{"type":"string","description":"JavaScript code to execute"}},"required":["script"]}}],"mcp_tools_dict":{"playwright":["start_codegen_session","end_codegen_session","get_codegen_session","clear_codegen_session","playwright_navigate","playwright_screenshot","playwright_click","playwright_iframe_click","playwright_iframe_fill","playwright_fill","playwright_select","playwright_hover","playwright_evaluate","playwright_console_logs","playwright_close","playwright_get","playwright_post","playwright_put","playwright_patch","playwright_delete","playwright_expect_response","playwright_assert_response","playwright_custom_user_agent","playwright_get_visible_text","playwright_get_visible_html","playwright_go_back","playwright_go_forward","playwright_drag","playwright_press_key","playwright_save_as_pdf","playwright_click_and_switch_tab"],"puppeteer":["puppeteer_navigate","puppeteer_screenshot","puppeteer_click","puppeteer_fill","puppeteer_select","puppeteer_hover","puppeteer_evaluate"]},"query":"Navigate to the Wikipedia website using the Chromium browser and check its accessibility.","function_call_label":[{"name":"playwright_navigate","step":"1","id":"1","mcp_server":"playwright","similar_tools":[{"name":"puppeteer_navigate","mcp_server":"puppeteer"}],"input":{"url":"https://www.wikipedia.org","browserType":"chromium"},"output":{"status_code":200,"result":{}}}]}] 2 | -------------------------------------------------------------------------------- /data/pay/pay_single_demo.json: -------------------------------------------------------------------------------- 1 | 
[{"uuid":"3b98fe31-cdf9-483f-88f7-4f79f226bd8b","category":"pay","call_type":"single","tools":[{"name":"create_invoice","description":"\nCreate Invoices on PayPal.\n\nThis function is used to create an invoice in the PayPal system. It allows you to generate a new invoice, specifying details such as customer information, items, quantities, pricing, and tax information. Once created, an invoice can be sent to the customer for payment.\n","input_schema":{"type":"object","properties":{"detail":{"type":"object","properties":{"invoice_date":{"type":"string","description":"The invoice date in YYYY-MM-DD format"},"currency_code":{"type":"string","description":"currency code of the invoice"}},"required":["currency_code"],"additionalProperties":false,"description":"The invoice detail, like{ \"invoice_number\": \"#123\", \"reference\": \"deal-ref\", \"invoice_date\": \"2018-11-12\", \"currency_code\": \"USD\", \"note\": \"Thank you for your business.\", \"term\": \"No refunds after 30 days.\", \"memo\": \"This is a long contract\", \"payment_term\": { \"term_type\": \"NET_10\", \"due_date\": \"2018-11-22\" } }"},"invoicer":{"type":"object","properties":{"business_name":{"type":"string","maxLength":300,"description":"business name of the invoicer"},"name":{"type":"object","properties":{"given_name":{"type":"string","description":"given name of the invoicer"},"surname":{"type":"string","description":"surname of the invoicer"}},"additionalProperties":false,"description":"name of the invoicer"},"email_address":{"type":"string","description":"email address of the invoicer"}},"required":["business_name"],"additionalProperties":false,"description":"The invoicer business information that appears on the invoice."},"primary_recipients":{"type":"array","items":{"type":"object","properties":{"billing_info":{"type":"object","properties":{"name":{"type":"object","properties":{"given_name":{"type":"string","description":"given name of the recipient"},"surname":{"type":"string","description":"surname of the recipient"}},"additionalProperties":false,"description":"name of the recipient"},"email_address":{"type":"string","description":"email address of the recipient"}},"additionalProperties":false,"description":"The billing information of the invoice recipient"}},"additionalProperties":false},"description":"array of recipients"},"items":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string","description":"The name of the item"},"quantity":{"type":"string","description":"The quantity of the item that the invoicer provides to the payer. Value is from -1000000 to 1000000. Supports up to five decimal places. Cast to string"},"unit_amount":{"type":"object","properties":{"currency_code":{"type":"string","description":"Currency code of the unit amount"},"value":{"type":"string","description":"The unit price. 
Up to 2 decimal points"}},"required":["currency_code","value"],"additionalProperties":false,"description":"unit amount object"},"tax":{"type":"object","properties":{"name":{"type":"string","description":"Tax name"},"percent":{"type":"string","description":"Tax Percent"}},"additionalProperties":false,"description":"tax object"},"unit_of_measure":{"type":"string","enum":["QUANTITY","HOURS","AMOUNT"],"description":"The unit of measure for the invoiced item"}},"required":["name","quantity","unit_amount"],"additionalProperties":false,"description":"invoice line item object"},"description":"Array of invoice line items"}},"required":["detail"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"create_product","description":"\nCreate a product in PayPal using product catalog - create products API.\nThis function creates a new product that will be used in subscription plans, subscriptions.\nRequired parameters are: name (product name), type (product type).\nHigh level: \n - id: (auto-generated or specify SKU of the product) The ID of the product\n - name: {product_name} (required) \n - description: {product_description} (optional)\n - type {DIGITAL | PHYSICAL | SERVICE} (required)\n - category: {product_category} (optional) \n - image_url: {image_url} (optional)\n - home_url: {home_url} (optional)\n\nBelow is the payload request structure:\n{\n \"id\": \"#PROD-XYAB12ABSB7868434\",\n \"name\": \"Video Streaming Service\",\n \"description\": \"Service for streaming latest series, movies etc.\",\n \"type\": \"SERVICE\",\n \"category\": \"SOFTWARE\",\n \"image_url\": \"https://example.com/streaming.jpg\",\n \"home_url\": \"https://example.com/home\"\n}\n\n","input_schema":{"type":"object","properties":{"name":{"type":"string","description":"The product name."},"type":{"type":"string","enum":["PHYSICAL","DIGITAL","SERVICE"],"description":"The product type. 
Value is PHYSICAL, DIGITAL, or SERVICE."},"description":{"type":"string","description":"The product description."},"category":{"type":"string","description":"The product category."},"image_url":{"type":"string","description":"The image URL for the product."},"home_url":{"type":"string","description":"The home page URL for the product."}},"required":["name","type"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"create_subscription_plan","description":"\nCreate a subsctiption plan in PayPal using subscription - create plan API.\nThis function creates a new subscription plan that defines pricing and billing cycle details for subscriptions.\nRequired parameters are: product_id (the ID of the product for which to create the plan), name (subscription plan name), billing_cycles (billing cycle details).\nHigh level: product_id, name, description, taxes, status: {CREATED|INACTIVE|ACTIVE}, billing_cycles, payment_preferences are required in json object.\nWhile creating billing_cycles object, trial(second) billing cycle should precede regular billing cycle.\n","input_schema":{"type":"object","properties":{"product_id":{"type":"string","description":"The ID of the product for which to create the plan."},"name":{"type":"string","description":"The subscription plan name."},"description":{"type":"string","description":"The subscription plan description."},"billing_cycles":{"type":"array","items":{"type":"object","properties":{"frequency":{"type":"object","properties":{"interval_unit":{"type":"string","enum":["DAY","WEEK","MONTH","YEAR"],"description":"The unit of time for the billing cycle."},"interval_count":{"type":"number","description":"The number of units for the billing cycle."}},"required":["interval_unit","interval_count"],"additionalProperties":true,"description":"The frequency of the billing cycle."},"tenure_type":{"type":"string","enum":["REGULAR","TRIAL"],"description":"The type of billing cycle tenure."},"sequence":{"type":"number","description":"The sequence of the billing cycle."},"total_cycles":{"type":"number","description":"The total number of cycles in the billing plan."},"pricing_scheme":{"type":"object","properties":{"fixed_price":{"type":"object","properties":{"currency_code":{"type":"string","enum":["USD"],"description":"The currency code for the fixed price."},"value":{"type":"string","description":"The value of the fixed price."}},"required":["currency_code","value"],"additionalProperties":true,"description":"The fixed price for the subscription plan."},"version":{"type":"string","description":"The version of the pricing scheme."}},"additionalProperties":true,"description":"The pricing scheme for the billing cycle."}},"required":["frequency","tenure_type","sequence","pricing_scheme"],"additionalProperties":true},"description":"The billing cycles of the plan."},"payment_preferences":{"type":"object","properties":{"auto_bill_outstanding":{"type":"boolean","description":"Indicates whether to automatically bill outstanding amounts."},"setup_fee":{"type":"object","properties":{"currency_code":{"type":"string","enum":["USD"],"description":"The currency code for the setup fee."},"value":{"type":"string","description":"The value of the setup fee."}},"additionalProperties":true,"description":"The setup fee for the subscription plan."},"setup_fee_failure_action":{"type":"string","enum":["CONTINUE","CANCEL"],"description":"The action to take if the setup fee payment fails."},"payment_failure_threshold":{"type":"number","description":"The number 
of failed payments before the subscription is canceled."}},"additionalProperties":true,"description":"The payment preferences for the subscription plan."},"taxes":{"type":"object","properties":{"percentage":{"type":"string","description":"The tax percentage."},"inclusive":{"type":"boolean","description":"Indicates whether the tax is inclusive."}},"additionalProperties":true,"description":"The tax details."}},"required":["product_id","name","billing_cycles"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"create_shipment_tracking","description":"\nCreate a shipment for a transaction in PayPal.\nThis function creates a shipment record for a specific transaction, allowing you to track the shipment status and details.\nThe transaction_id can fetch from the captured payment details in the order information.\nRequired parameters are: tracking_number (the tracking number for the shipment), transaction_id (the transaction ID associated with the shipment). \nHigh level: tracking_number, transaction_id, status (optional), carrier (optional) are required json objects.\nBelow is the payload request structure:\n{\n \"tracking_number\": \"1234567890\",\n \"transaction_id\": \"9XJ12345ABC67890\",\n \"status\": \"SHIPPED\", // Required: ON_HOLD, SHIPPED, DELIVERED, CANCELLED\n \"carrier\": \"UPS\" // Required: The carrier handling the shipment. Link to supported carriers: http://developer.paypal.com/docs/tracking/reference/carriers/\n}\n","input_schema":{"type":"object","properties":{"order_id":{"type":"string","description":"The ID of the order for which to create a shipment"},"tracking_number":{"type":"string","description":"The tracking number for the shipment. Id is provided by the shipper. This is required to create a shipment."},"transaction_id":{"type":"string","description":"The transaction ID associated with the shipment. Transaction id available after the order is paid or captured. This is required to create a shipment."},"status":{"type":"string","description":"The status of the shipment. It can be \"ON_HOLD\", \"SHIPPED\", \"DELIVERED\", or \"CANCELLED\".","default":"SHIPPED"},"carrier":{"type":"string","description":"The carrier handling the shipment."}},"required":["tracking_number","transaction_id"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"create_order","description":"\nCreate an order in PayPal.\n\nThis tool is used to create a new order in PayPal. This is typically the first step in initiating a payment flow. It sets up an order with specified details such as item(s) to be purchased, quantity, amount, currency, and other details.\n","input_schema":{"type":"object","properties":{"currencyCode":{"type":"string","enum":["USD"],"description":"Currency code of the amount."},"items":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string","description":"The name of the item."},"quantity":{"type":"number","description":"The item quantity. 
Must be a whole number.","default":1},"description":{"type":"string","description":"The detailed item description."},"itemCost":{"type":"number","description":"The cost of each item - upto 2 decimal points."},"taxPercent":{"type":"number","description":"The tax percent for the specific item.","default":0},"itemTotal":{"type":"number","description":"The total cost of this line item."}},"required":["name","itemCost","itemTotal"],"additionalProperties":false},"maxItems":50},"discount":{"type":"number","description":"The discount amount for the order.","default":0},"shippingCost":{"type":"number","description":"The cost of shipping for the order.","default":0},"shippingAddress":{"anyOf":[{"type":"object","properties":{"address_line_1":{"type":"string","description":"The first line of the address, such as number and street, for example, `173 Drury Lane`.This field needs to pass the full address."},"address_line_2":{"type":"string","description":"The second line of the address, for example, a suite or apartment number."},"admin_area_2":{"type":"string","description":"A city, town, or village. Smaller than `admin_area_level_1`."},"admin_area_1":{"type":"string","description":"The highest-level sub-division in a country, which is usually a province, state, or ISO-3166-2 subdivision. "},"postal_code":{"type":"string","description":"The postal code, which is the ZIP code or equivalent. Typically required for countries with a postal code or an equivalent."},"country_code":{"type":"string","minLength":2,"maxLength":2,"description":"The 2-character ISO 3166-1 code that identifies the country or region. Note: The country code for Great Britain is `GB` and not `UK` as used in the top-level domain names for that country."}},"additionalProperties":false,"description":"The shipping address for the order."},{"type":"null"}],"description":"The shipping address for the order.","default":null},"notes":{"anyOf":[{"anyOf":[{"not":{}},{"type":"string"}]},{"type":"null"}],"default":null},"returnUrl":{"type":"string","default":"https://example.com/returnUrl"},"cancelUrl":{"type":"string","default":"https://example.com/cancelUrl"}},"required":["currencyCode","items"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}},{"name":"create_refund","description":"\nInitiate a refund for a PayPal payment capture.\nIf you have an order ID instead of a capture ID, first use the get order tool to obtain the capture ID from the order details.\nThis function allows you to return funds to a customer by refunding a previously captured payment. You can issue a full refund or specify a partial amount. If multiple captures exist for an order, request clarification on which specific capture to refund.\nRequired parameters:\n- Capture ID: The ID of the capture to refund\n- Amount (optional): For partial refunds, specify the amount to refund (must be less than or equal to the captured amount)\n- Note to Payer (optional): Additional explanation visible to the customer\nResponse details include:\n- Refund ID and status\n- Refunded amount and currency\n","input_schema":{"type":"object","properties":{"capture_id":{"type":"string","description":"The ID of the capture to refund."},"amount":{"type":"object","properties":{"currency_code":{"type":"string"},"value":{"type":"string"}},"required":["currency_code","value"],"additionalProperties":false,"description":"The amount to refund. 
If not specified, the full captured amount is refunded."},"invoice_id":{"type":"string","description":"The invoice ID that is used to track this payment."},"note_to_payer":{"type":"string","description":"A note to the payer."}},"required":["capture_id"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}}],"mcp_tools_dict":{"paypal":["create_invoice","create_product","create_subscription_plan","create_shipment_tracking","create_order","create_refund"]},"query":"Create an invoice for Tech Solutions Inc. for a Consultation Service costing 150.00 USD.","function_call_label":[{"name":"create_invoice","step":"1","id":"1","mcp_server":"paypal","similar_tools":[],"input":{"detail":{"currency_code":"USD"},"invoicer":{"business_name":"Tech Solutions Inc."},"items":[{"name":"Consultation Service","quantity":"1","unit_amount":{"currency_code":"USD","value":"150.00"}}]},"output":{"status_code":200,"result":{}}}]}] 2 | --------------------------------------------------------------------------------
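
The three demo files above (filesystem, browser, pay) share one record layout: each entry carries a uuid, a category, a call_type, the full tools list with draft-07 input_schema definitions, an mcp_tools_dict mapping each MCP server to its tool names, a natural-language query, and a function_call_label holding the ground-truth call (tool name, mcp_server, any similar_tools, the input arguments, and the expected output status). The Python sketch below is only an illustration of how such a record could be consumed: it validates the labelled input against the tool's input_schema via the jsonschema package and compares a predicted call against the label. It is not the repository's own evaluation code (that lives in src/mcp_tool_bench/evaluation/evaluation_utils.py and src/mcp_tool_bench/utils/calculate_metrics.py); helper names such as check_call are invented here, and the exact-match comparison is an assumption rather than the benchmark's official metric.

import json
from jsonschema import validate, ValidationError  # pip install jsonschema

def load_records(path):
    """Load one of the *_single_demo.json files (a JSON array of records)."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def schema_for(record, tool_name):
    """Return the input_schema of the named tool inside a record, if present."""
    for tool in record["tools"]:
        if tool["name"] == tool_name:
            return tool["input_schema"]
    return None

def check_call(record, predicted_name, predicted_input):
    """Illustrative check: does a predicted call match the ground-truth label?

    Assumption: single-step records ("call_type": "single") carry exactly one
    entry in function_call_label; exact matching on name and input is used
    here purely for demonstration, not as the benchmark's official metric.
    """
    label = record["function_call_label"][0]
    schema = schema_for(record, predicted_name)
    if schema is not None:
        try:
            # The input_schema fields in the demo files declare draft-07.
            validate(instance=predicted_input, schema=schema)
        except ValidationError as e:
            return False, f"input_schema violation: {e.message}"
    # Treat the labelled tool and any listed similar_tools as acceptable
    # (e.g. puppeteer_navigate alongside playwright_navigate in the browser demo).
    acceptable = {label["name"]} | {t["name"] for t in label.get("similar_tools", [])}
    if predicted_name not in acceptable:
        return False, f"expected one of {sorted(acceptable)}, got {predicted_name}"
    if predicted_name == label["name"] and predicted_input != label["input"]:
        return False, "arguments differ from the labelled input"
    return True, "ok"

if __name__ == "__main__":
    records = load_records("data/pay/pay_single_demo.json")
    rec = records[0]  # the create_invoice example shown above
    ok, msg = check_call(rec, "create_invoice", rec["function_call_label"][0]["input"])
    print(rec["query"], "->", ok, msg)

Accepting similar_tools mirrors how the label in browser_single_demo.json lists puppeteer_navigate as an alternative to playwright_navigate; how the actual evaluation weighs such alternatives, and how it scores argument matches, is defined by the repository's own metric code rather than by this sketch.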