├── requirements.txt
├── .gitignore
├── test_debug.py
├── test_kimi_k2_providers.sh
├── test_qwen_coder_providers.sh
├── test_deepseek_versions.sh
├── README.md
├── test_deepseek_fireworks.sh
├── test_generic_model.sh
├── tool_definitions.py
├── test_scenarios.json
└── tool_tester_v2.py

/requirements.txt:
--------------------------------------------------------------------------------
1 | openai>=1.0.0
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.so
6 | .Python
7 | env/
8 | venv/
9 | .venv
10 | pip-log.txt
11 | pip-delete-this-directory.txt
12 | 
13 | # Test results directories
14 | tool_test*/
15 | tool_test_results*/
16 | deepseek_comparison*/
17 | deepseek_fireworks_comparison*/
18 | test_results*/
19 | 
20 | # Log files
21 | *.log
22 | 
23 | # CSV and result files
24 | results.csv
25 | summary.txt
26 | failed_providers.txt
27 | 
28 | # IDE
29 | .vscode/
30 | .idea/
31 | *.swp
32 | *.swo
33 | *~
34 | 
35 | # OS
36 | .DS_Store
37 | Thumbs.db
38 | desktop.ini
39 | 
40 | # Temporary files
41 | *.tmp
42 | *.bak
43 | *.backup
44 | bash.exe.stackdump
45 | 
46 | # API keys (never commit these!)
47 | .env
48 | *.key
49 | api_keys.txt
50 | config.ini
51 | 
52 | # Jupyter
53 | .ipynb_checkpoints/
54 | *.ipynb
55 | 
56 | .claude
57 | __pycache__
--------------------------------------------------------------------------------
/test_debug.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Debug tool to see raw API responses
4 | """
5 | 
6 | import json
7 | import time
8 | from openai import OpenAI
9 | 
10 | def test_tool_response(api_base, api_key, model):
11 |     client = OpenAI(api_key=api_key, base_url=api_base)
12 | 
13 |     tools = [
14 |         {
15 |             "type": "function",
16 |             "function": {
17 |                 "name": "calculate",
18 |                 "description": "Perform a calculation",
19 |                 "parameters": {
20 |                     "type": "object",
21 |                     "properties": {
22 |                         "expression": {"type": "string"}
23 |                     },
24 |                     "required": ["expression"]
25 |                 }
26 |             }
27 |         }
28 |     ]
29 | 
30 |     messages = [
31 |         {"role": "user", "content": "What is 847 divided by 6?"}
32 |     ]
33 | 
34 |     print("Sending request...")
35 |     print(f"Model: {model}")
36 |     print(f"Messages: {messages}")
37 |     print("\n" + "="*50)
38 | 
39 |     try:
40 |         response = client.chat.completions.create(
41 |             model=model,
42 |             messages=messages,
43 |             tools=tools,
44 |             tool_choice="auto",
45 |             temperature=0.1,
46 |             max_tokens=150
47 |         )
48 | 
49 |         print("RAW RESPONSE:")
50 |         print(response)
51 |         print("\n" + "="*50)
52 | 
53 |         if response.choices:
54 |             msg = response.choices[0].message
55 |             print("\nMESSAGE DETAILS:")
56 |             print(f"Type: {type(msg)}")
57 |             print(f"Content: {msg.content}")
58 |             print(f"Has tool_calls: {hasattr(msg, 'tool_calls')}")
59 |             if hasattr(msg, 'tool_calls'):
60 |                 print(f"Tool calls: {msg.tool_calls}")
61 |                 if msg.tool_calls:
62 |                     for tc in msg.tool_calls:
63 |                         print(f"\nTool Call Details:")
64 |                         print(f"  ID: {tc.id}")
65 |                         print(f"  Type: {tc.type}")
66 |                         print(f"  Function name: {tc.function.name}")
67 |                         print(f"  Arguments: {tc.function.arguments}")
68 | 
69 |             # Check for any special tokens
70 |             if msg.content:
71 |                 if "<|" in msg.content or "|>" in msg.content:
72 |                     print("\nWARNING: Special tokens detected in content!")
73 |                     print(f"Content repr: {repr(msg.content)}")
74 | 
75 |     except Exception as e:
76 |         print(f"Error: {e}")
77 |         import traceback
78 |         traceback.print_exc()
79 | 
80 | if __name__ == "__main__":
81 |     import sys
82 |     if len(sys.argv) != 4:
83 |         print("Usage: python test_debug.py <api_base> <api_key> <model>")
84 |         sys.exit(1)
85 | 
86 |     test_tool_response(sys.argv[1], sys.argv[2], sys.argv[3])
--------------------------------------------------------------------------------
/test_kimi_k2_providers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Kimi K2 tool testing provider test script for OpenRouter
4 | # Usage: ./test_kimi_k2_providers.sh YOUR_API_KEY [quick|full]
5 | 
6 | if [ $# -lt 1 ]; then
7 |     echo "Usage: ./test_kimi_k2_providers.sh YOUR_OPENROUTER_API_KEY [quick|full]"
8 |     echo ""
9 |     echo "Examples:"
10 |     echo "  ./test_kimi_k2_providers.sh sk-or-xxx quick"
11 |     echo "  ./test_kimi_k2_providers.sh sk-or-xxx full"
12 |     exit 1
13 | fi
14 | 
15 | API_KEY=$1
16 | MODEL="moonshotai/kimi-k2"
17 | TEST_MODE=${2:-quick}  # Default to quick mode if not specified
18 | 
19 | # Create results directory with timestamp
20 | TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
21 | RESULTS_DIR="tool_test_results_${TIMESTAMP}"
22 | mkdir -p "$RESULTS_DIR"
23 | 
24 | echo "Tool Testing Configuration:"
25 | echo "  Model: $MODEL"
26 | echo "  Test Mode: $TEST_MODE"
27 | echo "  Results Directory: $RESULTS_DIR"
28 | echo ""
29 | 
30 | # Providers to test for Kimi K2
31 | providers=(
32 |     "targon/fp8"
33 |     "chutes/fp8"
34 |     "deepinfra/fp4"
35 |     "novita/fp8"
36 |     "fireworks/fp8"
37 |     "moonshotai/fp8"
38 |     "baseten/fp4"
39 |     "atlas-cloud/fp8"
40 |     "parasail/fp8"
41 |     "together/fp8"
42 |     "groq"
43 |     "gmicloud/fp8"
44 | )
45 | 
46 | # Function to run test and capture results
47 | run_test() {
48 |     local provider=$1
49 |     local output_file="$RESULTS_DIR/test_${provider//\//_}.txt"
50 | 
51 |     echo "================================================"
52 |     echo "Testing provider: $provider"
53 |     echo "Output file: $output_file"
54 |     echo "================================================"
55 | 
56 |     # Build the command
57 |     cmd="python3 tool_tester_v2.py \
58 |         --api-base https://openrouter.ai/api/v1 \
59 |         --api-key $API_KEY \
60 |         --model $MODEL \
61 |         --provider \"$provider\" \
62 |         --temperature 0.6"
63 | 
64 |     # Add test mode flag
65 |     if [ "$TEST_MODE" == "quick" ]; then
66 |         cmd="$cmd --quick"
67 |     else
68 |         cmd="$cmd --max-tools 40"
69 |     fi
70 | 
71 |     # Add output file
72 |     cmd="$cmd --output \"$output_file\""
73 | 
74 |     # Run the test and capture both stdout and the result
75 |     echo "Running: $cmd"
76 |     eval $cmd 2>&1 | tee "${output_file}.log"
77 | 
78 |     # Check if the test completed successfully
79 |     local test_status=${PIPESTATUS[0]}; if [ "$test_status" -eq 0 ]; then
80 |         echo "PASS: Test completed successfully for $provider"
81 | 
82 |         # Extract the overall score from the output file if it exists
83 |         if [ -f "$output_file" ]; then
84 |             score=$(grep "OVERALL SCORE:" "$output_file" | tail -1)
85 |             echo "  $score"
86 |         fi
87 |     else
88 |         echo "FAIL: Test failed for $provider"
89 |         echo "FAILED: $provider" >> "$RESULTS_DIR/failed_providers.txt"
90 |     fi
91 | 
92 |     echo ""
93 |     sleep 2; return "$test_status"  # Small delay between providers, then report the test status to the caller
94 | }
95 | 
96 | # Test each provider
97 | successful_tests=0
98 | failed_tests=0
99 | 
100 | for provider in "${providers[@]}"; do
101 |     # Skip provider if it doesn't make sense for the model
102 |     # (you can add logic here to filter providers based on model)
103 | 
104 |     run_test "$provider"
105 | 
106 |     # Check if test was successful
107 |     if [ $? -eq 0 ]; then
108 |         ((successful_tests++))
109 |     else
110 |         ((failed_tests++))
111 |     fi
112 | done
113 | 
114 | # Generate summary report
115 | summary_file="$RESULTS_DIR/summary.txt"
116 | echo "================================================" | tee "$summary_file"
117 | echo "TOOL TESTING SUMMARY REPORT" | tee -a "$summary_file"
118 | echo "================================================" | tee -a "$summary_file"
119 | echo "Model: $MODEL" | tee -a "$summary_file"
120 | echo "Test Mode: $TEST_MODE" | tee -a "$summary_file"
121 | echo "Timestamp: $TIMESTAMP" | tee -a "$summary_file"
122 | echo "Total Providers Tested: ${#providers[@]}" | tee -a "$summary_file"
123 | echo "Successful Tests: $successful_tests" | tee -a "$summary_file"
124 | echo "Failed Tests: $failed_tests" | tee -a "$summary_file"
125 | echo "" | tee -a "$summary_file"
126 | 
127 | # Extract scores from all successful tests
128 | echo "Provider Scores:" | tee -a "$summary_file"
129 | echo "-----------------------------------------" | tee -a "$summary_file"
130 | 
131 | for file in "$RESULTS_DIR"/test_*.txt; do
132 |     if [ -f "$file" ]; then
133 |         provider_name=$(basename "$file" .txt | sed 's/test_//' | sed 's/_/\//g')
134 |         score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3, $4}')
135 |         if [ -n "$score" ]; then
136 |             printf "%-30s %s\n" "$provider_name:" "$score" | tee -a "$summary_file"
137 |         fi
138 |     fi
139 | done
140 | 
141 | echo "" | tee -a "$summary_file"
142 | echo "Full results saved in: $RESULTS_DIR" | tee -a "$summary_file"
143 | 
144 | # Create a CSV summary for easy analysis
145 | csv_file="$RESULTS_DIR/results.csv"
146 | echo "Provider,Model,OverallScore,Grade,ScenarioSuccessRate,ToolPrecision,ToolRecall,ToolF1,ParamAccuracy,ExecutionSuccess" > "$csv_file"
147 | 
148 | for file in "$RESULTS_DIR"/test_*.txt; do
149 |     if [ -f "$file" ]; then
150 |         provider_name=$(basename "$file" .txt | sed 's/test_//' | sed 's/_/\//g')
151 | 
152 |         # Extract metrics using grep and awk
153 |         overall_score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3}' | sed 's/%//')
154 |         grade=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $4}' | sed 's/[()]//g')
155 |         scenario_rate=$(grep "Scenario Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
156 |         tool_precision=$(grep "Tool Precision (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
157 |         tool_recall=$(grep "Tool Recall (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
158 |         tool_f1=$(grep "Tool F1 (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
159 |         param_accuracy=$(grep "Parameter Accuracy (structural):" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
160 |         exec_success=$(grep "Execution Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
161 | 
162 |         if [ -n "$overall_score" ]; then
163 |             echo "$provider_name,$MODEL,$overall_score,$grade,$scenario_rate,$tool_precision,$tool_recall,$tool_f1,$param_accuracy,$exec_success" >> "$csv_file"
164 |         fi
165 |     fi
166 | done
167 | 
168 | echo "CSV results saved in: $csv_file"
169 | echo ""
170 | echo "All providers tested!"
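The `results.csv` written above is straightforward to post-process outside of bash. A minimal sketch (the `leaderboard.py` filename is illustrative; only the columns this script itself writes are assumed) that prints a ranked view of the providers:

```python
#!/usr/bin/env python3
import csv
import sys

def leaderboard(csv_path: str) -> None:
    """Rank providers by OverallScore, descending."""
    with open(csv_path, newline="") as f:
        # Skip rows where the score could not be extracted.
        rows = [r for r in csv.DictReader(f) if r.get("OverallScore")]
    rows.sort(key=lambda r: float(r["OverallScore"]), reverse=True)
    for rank, row in enumerate(rows, start=1):
        print(f"{rank:>2}. {row['Provider']:<25} "
              f"{row['OverallScore']}% ({row['Grade']})  "
              f"recall={row['ToolRecall']}%  f1={row['ToolF1']}%")

if __name__ == "__main__":
    leaderboard(sys.argv[1] if len(sys.argv) > 1 else "results.csv")
```

Usage would be, e.g., `python3 leaderboard.py tool_test_results_<timestamp>/results.csv`.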
--------------------------------------------------------------------------------
/test_qwen_coder_providers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Qwen3 Coder tool testing provider test script for OpenRouter
4 | # Usage: ./test_qwen_coder_providers.sh YOUR_API_KEY [quick|full]
5 | 
6 | if [ $# -lt 1 ]; then
7 |     echo "Usage: ./test_qwen_coder_providers.sh YOUR_OPENROUTER_API_KEY [quick|full]"
8 |     echo ""
9 |     echo "Examples:"
10 |     echo "  ./test_qwen_coder_providers.sh sk-or-xxx quick"
11 |     echo "  ./test_qwen_coder_providers.sh sk-or-xxx full"
12 |     exit 1
13 | fi
14 | 
15 | API_KEY=$1
16 | MODEL="qwen/qwen3-coder"
17 | TEST_MODE=${2:-quick}  # Default to quick mode if not specified
18 | 
19 | # Create results directory with timestamp
20 | TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
21 | RESULTS_DIR="tool_test_results_${TIMESTAMP}"
22 | mkdir -p "$RESULTS_DIR"
23 | 
24 | echo "Tool Testing Configuration:"
25 | echo "  Model: $MODEL"
26 | echo "  Test Mode: $TEST_MODE"
27 | echo "  Results Directory: $RESULTS_DIR"
28 | echo ""
29 | 
30 | # Providers to test for Qwen3 Coder
31 | providers=(
32 |     "chutes/fp8"
33 |     "deepinfra/fp4"
34 |     "baseten/fp8"
35 |     "parasail/fp8"
36 |     "fireworks"
37 |     "novita/fp8"
38 |     "atlas-cloud/fp8"
39 |     "phala"
40 |     "gmicloud/fp8"
41 |     "targon/fp8"
42 |     "alibaba/opensource"
43 |     "together/fp8"
44 |     "hyperbolic/fp8"
45 |     "cerebras/fp8"
46 | )
47 | 
48 | # Function to run test and capture results
49 | run_test() {
50 |     local provider=$1
51 |     local output_file="$RESULTS_DIR/test_${provider//\//_}.txt"
52 | 
53 |     echo "================================================"
54 |     echo "Testing provider: $provider"
55 |     echo "Output file: $output_file"
56 |     echo "================================================"
57 | 
58 |     # Build the command
59 |     cmd="python3 tool_tester_v2.py \
60 |         --api-base https://openrouter.ai/api/v1 \
61 |         --api-key $API_KEY \
62 |         --model $MODEL \
63 |         --provider \"$provider\" \
64 |         --temperature 0.6"
65 | 
66 |     # Add test mode flag
67 |     if [ "$TEST_MODE" == "quick" ]; then
68 |         cmd="$cmd --quick"
69 |     else
70 |         cmd="$cmd --max-tools 40"
71 |     fi
72 | 
73 |     # Add output file
74 |     cmd="$cmd --output \"$output_file\""
75 | 
76 |     # Run the test and capture both stdout and the result
77 |     echo "Running: $cmd"
78 |     eval $cmd 2>&1 | tee "${output_file}.log"
79 | 
80 |     # Check if the test completed successfully
81 |     local test_status=${PIPESTATUS[0]}; if [ "$test_status" -eq 0 ]; then
82 |         echo "PASS: Test completed successfully for $provider"
83 | 
84 |         # Extract the overall score from the output file if it exists
85 |         if [ -f "$output_file" ]; then
86 |             score=$(grep "OVERALL SCORE:" "$output_file" | tail -1)
87 |             echo "  $score"
88 |         fi
89 |     else
90 |         echo "FAIL: Test failed for $provider"
91 |         echo "FAILED: $provider" >> "$RESULTS_DIR/failed_providers.txt"
92 |     fi
93 | 
94 |     echo ""
95 |     sleep 2; return "$test_status"  # Small delay between providers, then report the test status to the caller
96 | }
97 | 
98 | # Test each provider
99 | successful_tests=0
100 | failed_tests=0
101 | 
102 | for provider in "${providers[@]}"; do
103 |     # Skip provider if it doesn't make sense for the model
104 |     # (you can add logic here to filter providers based on model)
105 | 
106 |     run_test "$provider"
107 | 
108 |     # Check if test was successful
109 |     if [ $? -eq 0 ]; then
110 |         ((successful_tests++))
111 |     else
112 |         ((failed_tests++))
113 |     fi
114 | done
115 | 
116 | # Generate summary report
117 | summary_file="$RESULTS_DIR/summary.txt"
118 | echo "================================================" | tee "$summary_file"
119 | echo "TOOL TESTING SUMMARY REPORT" | tee -a "$summary_file"
120 | echo "================================================" | tee -a "$summary_file"
121 | echo "Model: $MODEL" | tee -a "$summary_file"
122 | echo "Test Mode: $TEST_MODE" | tee -a "$summary_file"
123 | echo "Timestamp: $TIMESTAMP" | tee -a "$summary_file"
124 | echo "Total Providers Tested: ${#providers[@]}" | tee -a "$summary_file"
125 | echo "Successful Tests: $successful_tests" | tee -a "$summary_file"
126 | echo "Failed Tests: $failed_tests" | tee -a "$summary_file"
127 | echo "" | tee -a "$summary_file"
128 | 
129 | # Extract scores from all successful tests
130 | echo "Provider Scores:" | tee -a "$summary_file"
131 | echo "-----------------------------------------" | tee -a "$summary_file"
132 | 
133 | for file in "$RESULTS_DIR"/test_*.txt; do
134 |     if [ -f "$file" ]; then
135 |         provider_name=$(basename "$file" .txt | sed 's/test_//' | sed 's/_/\//g')
136 |         score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3, $4}')
137 |         if [ -n "$score" ]; then
138 |             printf "%-30s %s\n" "$provider_name:" "$score" | tee -a "$summary_file"
139 |         fi
140 |     fi
141 | done
142 | 
143 | echo "" | tee -a "$summary_file"
144 | echo "Full results saved in: $RESULTS_DIR" | tee -a "$summary_file"
145 | 
146 | # Create a CSV summary for easy analysis
147 | csv_file="$RESULTS_DIR/results.csv"
148 | echo "Provider,Model,OverallScore,Grade,ScenarioSuccessRate,ToolPrecision,ToolRecall,ToolF1,ParamAccuracy,ExecutionSuccess" > "$csv_file"
149 | 
150 | for file in "$RESULTS_DIR"/test_*.txt; do
151 |     if [ -f "$file" ]; then
152 |         provider_name=$(basename "$file" .txt | sed 's/test_//' | sed 's/_/\//g')
153 | 
154 |         # Extract metrics using grep and awk
155 |         overall_score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3}' | sed 's/%//')
156 |         grade=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $4}' | sed 's/[()]//g')
157 |         scenario_rate=$(grep "Scenario Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
158 |         tool_precision=$(grep "Tool Precision (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
159 |         tool_recall=$(grep "Tool Recall (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
160 |         tool_f1=$(grep "Tool F1 (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
161 |         param_accuracy=$(grep "Parameter Accuracy (structural):" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
162 |         exec_success=$(grep "Execution Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
163 | 
164 |         if [ -n "$overall_score" ]; then
165 |             echo "$provider_name,$MODEL,$overall_score,$grade,$scenario_rate,$tool_precision,$tool_recall,$tool_f1,$param_accuracy,$exec_success" >> "$csv_file"
166 |         fi
167 |     fi
168 | done
169 | 
170 | echo "CSV results saved in: $csv_file"
171 | echo ""
172 | echo "All providers tested!"
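Provider availability changes over time, so a pre-flight check can save a failed batch run. A small sketch against OpenRouter's public model listing endpoint (`GET /api/v1/models`); the `data[].id` response shape is an assumption based on OpenRouter's documented schema, so verify it before relying on this:

```python
#!/usr/bin/env python3
import json
import sys
from urllib.request import urlopen

MODELS_URL = "https://openrouter.ai/api/v1/models"  # public model listing

def model_available(model_id: str) -> bool:
    """Return True if the model slug appears in OpenRouter's public model list."""
    with urlopen(MODELS_URL, timeout=30) as resp:
        payload = json.load(resp)
    return any(m.get("id") == model_id for m in payload.get("data", []))

if __name__ == "__main__":
    model = sys.argv[1] if len(sys.argv) > 1 else "qwen/qwen3-coder"
    print(f"{model}: {'available' if model_available(model) else 'not found'}")
```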
--------------------------------------------------------------------------------
/test_deepseek_versions.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # DeepSeek model version comparison tool testing script for OpenRouter
4 | # Usage: ./test_deepseek_versions.sh YOUR_API_KEY [quick|full]
5 | 
6 | if [ $# -lt 1 ]; then
7 |     echo "Usage: ./test_deepseek_versions.sh YOUR_OPENROUTER_API_KEY [quick|full]"
8 |     echo ""
9 |     echo "This script compares three DeepSeek model versions:"
10 |     echo "  - deepseek/deepseek-chat-v3.1"
11 |     echo "  - deepseek/deepseek-chat-v3-0324"
12 |     echo "  - deepseek/deepseek-chat-v3.1:thinking"
13 |     echo ""
14 |     echo "Examples:"
15 |     echo "  ./test_deepseek_versions.sh sk-or-xxx quick"
16 |     echo "  ./test_deepseek_versions.sh sk-or-xxx full"
17 |     exit 1
18 | fi
19 | 
20 | API_KEY=$1
21 | TEST_MODE=${2:-quick}  # Default to quick mode if not specified
22 | 
23 | # Create results directory with timestamp
24 | TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
25 | RESULTS_DIR="deepseek_comparison_${TIMESTAMP}"
26 | mkdir -p "$RESULTS_DIR"
27 | 
28 | echo "DeepSeek Model Comparison Configuration:"
29 | echo "  Test Mode: $TEST_MODE"
30 | echo "  Results Directory: $RESULTS_DIR"
31 | echo ""
32 | 
33 | # DeepSeek models to compare
34 | models=(
35 |     "deepseek/deepseek-chat-v3.1"
36 |     "deepseek/deepseek-chat-v3-0324"
37 |     "deepseek/deepseek-chat-v3.1:thinking"
38 | )
39 | 
40 | # Function to run test and capture results
41 | run_test() {
42 |     local model=$1
43 |     local safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
44 |     local output_file="$RESULTS_DIR/test_${safe_model_name}.txt"
45 | 
46 |     echo "================================================"
47 |     echo "Testing model: $model"
48 |     echo "Output file: $output_file"
49 |     echo "================================================"
50 | 
51 |     # Build the command
52 |     cmd="python3 tool_tester_v2.py \
53 |         --api-base https://openrouter.ai/api/v1 \
54 |         --api-key $API_KEY \
55 |         --model \"$model\" \
56 |         --temperature 0.0"
57 | 
58 |     # Add test mode flag
59 |     if [ "$TEST_MODE" == "quick" ]; then
60 |         cmd="$cmd --quick"
61 |     else
62 |         cmd="$cmd --max-tools 40"
63 |     fi
64 | 
65 |     # Add output file
66 |     cmd="$cmd --output \"$output_file\""
67 | 
68 |     # Run the test and capture both stdout and the result
69 |     echo "Running: $cmd"
70 |     eval $cmd 2>&1 | tee "${output_file}.log"
71 | 
72 |     # Check if the test completed successfully
73 |     local test_status=${PIPESTATUS[0]}; if [ "$test_status" -eq 0 ]; then
74 |         echo "PASS: Test completed successfully for $model"
75 | 
76 |         # Extract the overall score from the output file if it exists
77 |         if [ -f "$output_file" ]; then
78 |             score=$(grep "OVERALL SCORE:" "$output_file" | tail -1)
79 |             echo "  $score"
80 |         fi
81 |     else
82 |         echo "FAIL: Test failed for $model"
83 |         echo "FAILED: $model" >> "$RESULTS_DIR/failed_models.txt"
84 |     fi
85 | 
86 |     echo ""
87 |     sleep 2; return "$test_status"  # Small delay between models, then report the test status to the caller
88 | }
89 | 
90 | # Test each model
91 | successful_tests=0
92 | failed_tests=0
93 | 
94 | for model in "${models[@]}"; do
95 |     run_test "$model"
96 | 
97 |     # Check if test was successful
98 |     if [ $? -eq 0 ]; then
99 |         ((successful_tests++))
100 |     else
101 |         ((failed_tests++))
102 |     fi
103 | done
104 | 
105 | # Generate summary report
106 | summary_file="$RESULTS_DIR/summary.txt"
107 | echo "================================================" | tee "$summary_file"
108 | echo "DEEPSEEK MODEL COMPARISON SUMMARY REPORT" | tee -a "$summary_file"
109 | echo "================================================" | tee -a "$summary_file"
110 | echo "Test Mode: $TEST_MODE" | tee -a "$summary_file"
111 | echo "Timestamp: $TIMESTAMP" | tee -a "$summary_file"
112 | echo "Total Models Tested: ${#models[@]}" | tee -a "$summary_file"
113 | echo "Successful Tests: $successful_tests" | tee -a "$summary_file"
114 | echo "Failed Tests: $failed_tests" | tee -a "$summary_file"
115 | echo "" | tee -a "$summary_file"
116 | 
117 | # Extract scores from all successful tests
118 | echo "Model Performance Comparison:" | tee -a "$summary_file"
119 | echo "-----------------------------------------" | tee -a "$summary_file"
120 | 
121 | for model in "${models[@]}"; do
122 |     safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
123 |     file="$RESULTS_DIR/test_${safe_model_name}.txt"
124 | 
125 |     if [ -f "$file" ]; then
126 |         score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3, $4}')
127 |         if [ -n "$score" ]; then
128 |             printf "%-40s %s\n" "$model:" "$score" | tee -a "$summary_file"
129 |         else
130 |             printf "%-40s %s\n" "$model:" "No score available" | tee -a "$summary_file"
131 |         fi
132 |     else
133 |         printf "%-40s %s\n" "$model:" "Test failed" | tee -a "$summary_file"
134 |     fi
135 | done
136 | 
137 | echo "" | tee -a "$summary_file"
138 | echo "Full results saved in: $RESULTS_DIR" | tee -a "$summary_file"
139 | 
140 | # Create a CSV summary for easy analysis
141 | csv_file="$RESULTS_DIR/model_comparison.csv"
142 | echo "Model,OverallScore,Grade,ScenarioSuccessRate,ToolPrecision,ToolRecall,ToolF1,ParamAccuracy,ExecutionSuccess" > "$csv_file"
143 | 
144 | for model in "${models[@]}"; do
145 |     safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
146 |     file="$RESULTS_DIR/test_${safe_model_name}.txt"
147 | 
148 |     if [ -f "$file" ]; then
149 |         # Extract metrics using grep and awk
150 |         overall_score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3}' | sed 's/%//')
151 |         grade=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $4}' | sed 's/[()]//g')
152 |         scenario_rate=$(grep "Scenario Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
153 |         tool_precision=$(grep "Tool Precision (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
154 |         tool_recall=$(grep "Tool Recall (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
155 |         tool_f1=$(grep "Tool F1 (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
156 |         param_accuracy=$(grep "Parameter Accuracy (structural):" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
157 |         exec_success=$(grep "Execution Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
158 | 
159 |         if [ -n "$overall_score" ]; then
160 |             echo "$model,$overall_score,$grade,$scenario_rate,$tool_precision,$tool_recall,$tool_f1,$param_accuracy,$exec_success" >> "$csv_file"
161 |         fi
162 |     fi
163 | done
164 | 
165 | echo "CSV results saved in: $csv_file"
166 | echo ""
167 | echo "================================================"
168 | echo "DeepSeek Model Comparison Complete!"
169 | echo "================================================"
170 | echo ""
171 | echo "Quick Analysis:"
172 | echo "View the summary with: cat $RESULTS_DIR/summary.txt"
173 | echo "View CSV data with: cat $RESULTS_DIR/model_comparison.csv"
174 | echo ""
175 | echo "For detailed analysis of individual models, check:"
176 | for model in "${models[@]}"; do
177 |     safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
178 |     echo "  $model: $RESULTS_DIR/test_${safe_model_name}.txt"
179 | done
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LLM Tool Calling Test Suite V2
2 | 
3 | A comprehensive CLI tool for testing function calling capabilities of LLMs via the OpenAI API standard, with special support for OpenRouter provider routing.
4 | 
5 | ## IMPORTANT
6 | - Semantic accuracy is still too immature to be used as an indicator
7 | - Structural accuracy and Tool Recall are currently the primary metrics to look at
8 | 
9 | ### Example Output
10 | 
11 | ```
12 | Scenario Success Rate: 44.4%
13 | ├─ Light indicator, higher is better
14 | 
15 | Tool Precision (LLM only): 46.1%
16 | ├─ Measure of how often the correct tool is called
17 | ├─ Higher is better, decent indicator with current implementations
18 | 
19 | Tool Recall (LLM only): 54.2%
20 | ├─ Expected Tools vs Actual Tools Called
21 | ├─ Higher is better, good indicator
22 | 
23 | Tool F1 (LLM only): 49.8%
24 | ├─ Combination of precision and recall
25 | 
26 | Parameter Accuracy (structural): 100.0%
27 | ├─ Shape of tool calls is correct
28 | 
29 | Parameter Accuracy (LLM only): 100.0%
30 | ├─ Future feature - split emulated tool calls (can ignore)
31 | 
32 | Parameter Accuracy (semantic): 46.1%
33 | ├─ Too early to use as an indicator, but can be a data point
34 | ├─ Checks actual values passed into the tool
35 | 
36 | Parameter Accuracy (semantic, LLM only): 46.1%
37 | ├─ Future feature - split emulated tool calls (can ignore)
38 | 
39 | Execution Success Rate: 100.0%
40 | Execution Success Rate (LLM only): 100.0%
41 | 
42 | Weighted Complexity Score: 46.6%
43 | ├─ Depends on complexity of the scenario
44 | ├─ Higher is better
45 | 
46 | ═══════════════════════════════════════════════════════════════
47 | OVERALL SCORE: 65.2% (B-)
48 | ├─ Use as overall indicator
49 | └─ Look at nuance as each value matters differently
50 | ```
51 | 
52 | 
53 | ## Features
54 | 
55 | - Natural conversation-based tool calling scenarios
56 | - Tests from simple (1-5 tools) to complex (40+ tools) scenarios
57 | - Supports any OpenAI-compatible API endpoint including OpenRouter
58 | - Provider-specific routing for optimal performance
59 | - Detailed metrics: precision, recall, F1 scores, and semantic accuracy
60 | - Automatic capability detection for different server implementations
61 | - Batch testing scripts for comparing models and providers
62 | 
63 | ## Installation
64 | 
65 | ```bash
66 | pip install -r requirements.txt
67 | ```
68 | 
69 | ## Usage
70 | 
71 | ### Basic Usage
72 | 
73 | ```bash
74 | python tool_tester_v2.py --api-base <api_base> --api-key <api_key> --model <model>
75 | ```
76 | 
77 | ### Examples
78 | 
79 | Test OpenAI GPT-4o:
80 | ```bash
81 | python tool_tester_v2.py --api-base https://api.openai.com/v1 --api-key sk-xxx --model gpt-4o
82 | ```
83 | 
84 | Test via OpenRouter with provider routing:
85 | ```bash
86 | python tool_tester_v2.py --api-base https://openrouter.ai/api/v1 --api-key sk-or-xxx --model anthropic/claude-3.5-sonnet --provider Anthropic
87 | ```
88 | 
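The same provider-pinned request can also be issued programmatically through the `openai` SDK's `extra_body` passthrough. The shape of the `provider` routing object below is an assumption based on OpenRouter's provider-routing feature; check the OpenRouter docs for the current schema:

```python
from openai import OpenAI

client = OpenAI(api_key="sk-or-xxx", base_url="https://openrouter.ai/api/v1")

response = client.chat.completions.create(
    model="anthropic/claude-3.5-sonnet",
    messages=[{"role": "user", "content": "What is 847 divided by 6?"}],
    # OpenRouter-specific routing payload (assumed schema): pin one provider.
    extra_body={"provider": {"order": ["Anthropic"], "allow_fallbacks": False}},
)
print(response.choices[0].message.content)
```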
89 | Test local model:
90 | ```bash
91 | python tool_tester_v2.py --api-base http://localhost:8000/v1 --api-key local --model llama-70b
92 | ```
93 | 
94 | Quick test mode (faster, fewer scenarios):
95 | ```bash
96 | python tool_tester_v2.py --api-base <api_base> --api-key <api_key> --model <model> --quick
97 | ```
98 | 
99 | Save report to file:
100 | ```bash
101 | python tool_tester_v2.py --api-base <api_base> --api-key <api_key> --model <model> --output report.txt
102 | ```
103 | 
104 | Limit maximum tool calls:
105 | ```bash
106 | python tool_tester_v2.py --api-base <api_base> --api-key <api_key> --model <model> --max-tools 20
107 | ```
108 | 
109 | ### Using the Generic Batch Testing Script
110 | 
111 | The `test_generic_model.sh` script allows you to easily test any model across multiple providers:
112 | 
113 | 1. **Edit the script configuration** at the top of the file:
114 |    ```bash
115 |    # Edit these variables in test_generic_model.sh:
116 |    MODEL="openai/gpt-4o"     # Your model
117 |    TEMPERATURE=0.1           # Temperature setting
118 |    providers=(               # List of providers to test
119 |        "openai"
120 |        "anthropic"
121 |        "fireworks"
122 |    )
123 |    ```
124 | 
125 | 2. **Run the script**:
126 |    ```bash
127 |    # Quick tests (fewer scenarios)
128 |    ./test_generic_model.sh YOUR_API_KEY quick
129 | 
130 |    # Full test suite
131 |    ./test_generic_model.sh YOUR_API_KEY full
132 |    ```
133 | 
134 | 3. **Results** will be saved in a timestamped directory with:
135 |    - Individual test results for each provider
136 |    - Summary report with scores
137 |    - CSV file for easy analysis
138 |    - Sorted leaderboard of providers by performance
139 | 
140 | ### Provider-Specific Testing
141 | 
142 | Some example provider configurations:
143 | ```bash
144 | # Test with specific precision modes
145 | --provider "fireworks/fp8"
146 | --provider "deepinfra/fp4"
147 | --provider "together/fp8"
148 | 
149 | # Test with reasoning effort (for supported models)
150 | --reasoning-effort high
151 | 
152 | # Adjust temperature for testing
153 | --temperature 0.0   # Most deterministic
154 | --temperature 0.7   # More creative
155 | ```
156 | 
157 | ## Test Categories
158 | 
159 | 1. **Simple Tests (1-5 tools)**: Basic single and multi-tool scenarios
160 | 2. **Medium Tests (6-10 tools)**: Moderate complexity with follow-ups
161 | 3. **Large Tests (11-20 tools)**: Complex multi-step operations
162 | 4. **Extra Large Tests (21+ tools)**: Comprehensive workflow simulations
163 | 
164 | ## Available Tools
165 | 
166 | The test suite includes 10 different tool types:
167 | - `get_weather`: Weather information for locations
168 | - `calculate`: Mathematical calculations
169 | - `search_flights`: Flight search between cities
170 | - `search_hotels`: Hotel availability search
171 | - `search_restaurants`: Restaurant recommendations
172 | - `translate_text`: Language translation
173 | - `get_stock_price`: Stock market prices
174 | - `convert_currency`: Currency conversion
175 | - `get_news`: News articles on topics
176 | - `set_reminder`: Create reminders
177 | 
178 | ## Metrics Explained
179 | 
180 | - **Scenario Success Rate**: Percentage of test scenarios completed successfully
181 | - **Tool Precision**: When the AI calls a tool, how often it's the correct one
182 | - **Tool Recall**: Percentage of required tools that were actually called
183 | - **Tool F1 Score**: Harmonic mean of precision and recall
184 | - **Parameter Accuracy (structural)**: Correct format and required fields
185 | - **Parameter Accuracy (semantic)**: Correct values for the context
186 | - **Execution Success Rate**: Tools that executed without errors
187 | - **Weighted Complexity Score**: Performance adjusted for scenario difficulty
188 | 
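For a concrete reading of the three tool-selection metrics, the sketch below computes them for a single scenario from the sets of expected vs. actually called tools (illustrative only; the scorer in `tool_tester_v2.py` may weight repeated calls differently):

```python
def tool_metrics(expected: set[str], called: set[str]) -> tuple[float, float, float]:
    """Precision, recall, and F1 over expected vs. actually called tool names."""
    if not expected or not called:
        return 0.0, 0.0, 0.0
    hits = len(expected & called)               # correctly selected tools
    precision = hits / len(called)              # of the calls made, how many were right
    recall = hits / len(expected)               # of the required tools, how many were called
    f1 = 2 * precision * recall / (precision + recall) if hits else 0.0
    return precision, recall, f1

# Example: the model called one right tool and one wrong one.
print(tool_metrics({"search_flights", "get_weather"}, {"get_weather", "get_news"}))
# -> (0.5, 0.5, 0.5)
```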
189 | ## Success Criteria
190 | 
191 | - Individual tests pass if ≥70% of expected tools are called correctly
192 | - Structural parameters must be ≥60% correct
193 | - Overall grade based on combined metrics (A+ = 90%+, A = 85%+, etc.)
194 | 
195 | ## Output
196 | 
197 | The tool generates a comprehensive report showing:
198 | - Overall success rate and letter grade
199 | - Detailed metrics (precision, recall, F1)
200 | - Per-scenario breakdowns
201 | - Tool call details and emulation statistics
202 | - Execution times
203 | - Error details and conversation logs
204 | 
205 | ## Test Scenarios
206 | 
207 | Test scenarios are defined in `test_scenarios.json` and include:
208 | - Travel planning
209 | - Investment research
210 | - Event coordination
211 | - Academic conferences
212 | - Shopping expeditions
213 | - And many more real-world use cases
--------------------------------------------------------------------------------
/test_deepseek_fireworks.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # DeepSeek model version comparison tool testing script for OpenRouter with Fireworks provider
4 | # Usage: ./test_deepseek_fireworks.sh YOUR_API_KEY [quick|full]
5 | 
6 | if [ $# -lt 1 ]; then
7 |     echo "Usage: ./test_deepseek_fireworks.sh YOUR_OPENROUTER_API_KEY [quick|full]"
8 |     echo ""
9 |     echo "This script compares two DeepSeek model versions on Fireworks provider:"
10 |     echo "  - deepseek/deepseek-chat-v3.1"
11 |     echo "  - deepseek/deepseek-chat-v3-0324"
12 |     echo ""
13 |     echo "Examples:"
14 |     echo "  ./test_deepseek_fireworks.sh sk-or-xxx quick"
15 |     echo "  ./test_deepseek_fireworks.sh sk-or-xxx full"
16 |     exit 1
17 | fi
18 | 
19 | API_KEY=$1
20 | TEST_MODE=${2:-quick}  # Default to quick mode if not specified
21 | 
22 | # Create results directory with timestamp
23 | TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
24 | RESULTS_DIR="deepseek_fireworks_comparison_${TIMESTAMP}"
25 | mkdir -p "$RESULTS_DIR"
26 | 
27 | echo "DeepSeek Model Comparison Configuration (Fireworks Provider):"
28 | echo "  Test Mode: $TEST_MODE"
29 | echo "  Provider: Fireworks"
30 | echo "  Results Directory: $RESULTS_DIR"
31 | echo ""
32 | 
33 | # DeepSeek models to compare (excluding thinking version)
34 | models=(
35 |     "deepseek/deepseek-chat-v3.1"
36 |     "deepseek/deepseek-chat-v3-0324"
37 | )
38 | 
39 | # Function to run test and capture results
40 | run_test() {
41 |     local model=$1
42 |     local safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
43 |     local output_file="$RESULTS_DIR/test_${safe_model_name}.txt"
44 | 
45 |     echo "================================================"
46 |     echo "Testing model: $model"
47 |     echo "Provider: fireworks"
48 |     echo "Output file: $output_file"
49 |     echo "================================================"
50 | 
51 |     # Build the command with the Fireworks provider
52 |     cmd="python3 tool_tester_v2.py \
53 |         --api-base https://openrouter.ai/api/v1 \
54 |         --api-key $API_KEY \
55 |         --model \"$model\" \
56 |         --provider fireworks \
57 |         --temperature 0.0"
58 | 
59 |     # Add test mode flag
60 |     if [ "$TEST_MODE" == "quick" ]; then
61 |         cmd="$cmd --quick"
62 |     else
63 |         cmd="$cmd --max-tools 40"
64 |     fi
65 | 
66 |     # Add output file
67 |     cmd="$cmd --output \"$output_file\""
68 | 
69 |     # Run the test and capture both stdout and the result
70 |     echo "Running: $cmd"
71 |     eval $cmd 2>&1 | tee "${output_file}.log"
72 | 
73 |     # Check if the test completed successfully
74 |     local test_status=${PIPESTATUS[0]}; if [ "$test_status" -eq 0 ]; then
75 |         echo "PASS: Test completed successfully for $model"
76 | 
77 |         # Extract the overall score from the output file if it exists
78 |         if [ -f "$output_file" ]; then
79 |             score=$(grep "OVERALL SCORE:" "$output_file" | tail -1)
80 |             echo "  $score"
81 |         fi
82 |     else
83 |         echo "FAIL: Test failed for $model"
84 |         echo "FAILED: $model" >> "$RESULTS_DIR/failed_models.txt"
85 |     fi
86 | 
87 |     echo ""
88 |     sleep 5; return "$test_status"  # Longer delay for Fireworks rate limits, then report the test status to the caller
89 | }
90 | 
91 | # Test each model
92 | successful_tests=0
93 | failed_tests=0
94 | 
95 | for model in "${models[@]}"; do
96 |     run_test "$model"
97 | 
98 |     # Check if test was successful
99 |     if [ $? -eq 0 ]; then
100 |         ((successful_tests++))
101 |     else
102 |         ((failed_tests++))
103 |     fi
104 | done
105 | 
106 | # Generate summary report
107 | summary_file="$RESULTS_DIR/summary.txt"
108 | echo "================================================" | tee "$summary_file"
109 | echo "DEEPSEEK MODEL COMPARISON SUMMARY REPORT" | tee -a "$summary_file"
110 | echo "Provider: FIREWORKS" | tee -a "$summary_file"
111 | echo "================================================" | tee -a "$summary_file"
112 | echo "Test Mode: $TEST_MODE" | tee -a "$summary_file"
113 | echo "Timestamp: $TIMESTAMP" | tee -a "$summary_file"
114 | echo "Total Models Tested: ${#models[@]}" | tee -a "$summary_file"
115 | echo "Successful Tests: $successful_tests" | tee -a "$summary_file"
116 | echo "Failed Tests: $failed_tests" | tee -a "$summary_file"
117 | echo "" | tee -a "$summary_file"
118 | 
119 | # Extract scores from all successful tests
120 | echo "Model Performance Comparison (Fireworks Provider):" | tee -a "$summary_file"
121 | echo "-----------------------------------------" | tee -a "$summary_file"
122 | 
123 | for model in "${models[@]}"; do
124 |     safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
125 |     file="$RESULTS_DIR/test_${safe_model_name}.txt"
126 | 
127 |     if [ -f "$file" ]; then
128 |         score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3, $4}')
129 |         if [ -n "$score" ]; then
130 |             printf "%-40s %s\n" "$model:" "$score" | tee -a "$summary_file"
131 |         else
132 |             printf "%-40s %s\n" "$model:" "No score available" | tee -a "$summary_file"
133 |         fi
134 |     else
135 |         printf "%-40s %s\n" "$model:" "Test failed" | tee -a "$summary_file"
136 |     fi
137 | done
138 | 
139 | echo "" | tee -a "$summary_file"
140 | echo "Full results saved in: $RESULTS_DIR" | tee -a "$summary_file"
141 | 
142 | # Create a CSV summary for easy analysis
143 | csv_file="$RESULTS_DIR/model_comparison.csv"
144 | echo "Model,Provider,OverallScore,Grade,ScenarioSuccessRate,ToolPrecision,ToolRecall,ToolF1,ParamAccuracy,ExecutionSuccess" > "$csv_file"
145 | 
146 | for model in "${models[@]}"; do
147 |     safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
148 |     file="$RESULTS_DIR/test_${safe_model_name}.txt"
149 | 
150 |     if [ -f "$file" ]; then
151 |         # Extract metrics using grep and awk
152 |         overall_score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3}' | sed 's/%//')
153 |         grade=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $4}' | sed 's/[()]//g')
154 |         scenario_rate=$(grep "Scenario Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
155 |         tool_precision=$(grep "Tool Precision (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
156 |         tool_recall=$(grep "Tool Recall (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
157 |         tool_f1=$(grep "Tool F1 (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
158 |         param_accuracy=$(grep "Parameter Accuracy (structural):" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
159 |         exec_success=$(grep "Execution Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
160 | 
161 |         if [ -n "$overall_score" ]; then
162 |             echo "$model,Fireworks,$overall_score,$grade,$scenario_rate,$tool_precision,$tool_recall,$tool_f1,$param_accuracy,$exec_success" >> "$csv_file"
163 |         fi
164 |     fi
165 | done
166 | 
167 | echo "CSV results saved in: $csv_file"
168 | echo ""
169 | echo "================================================"
170 | echo "DeepSeek Model Comparison Complete!"
171 | echo "Provider: Fireworks"
172 | echo "================================================"
173 | echo ""
174 | echo "Quick Analysis:"
175 | echo "View the summary with: cat $RESULTS_DIR/summary.txt"
176 | echo "View CSV data with: cat $RESULTS_DIR/model_comparison.csv"
177 | echo ""
178 | echo "For detailed analysis of individual models, check:"
179 | for model in "${models[@]}"; do
180 |     safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
181 |     echo "  $model: $RESULTS_DIR/test_${safe_model_name}.txt"
182 | done
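Since this comparison produces exactly one `model_comparison.csv` row per model, a per-metric delta is often more readable than two separate reports. A minimal sketch (it assumes both rows were written, in the order the `models` array lists them):

```python
#!/usr/bin/env python3
import csv
import sys

METRICS = ["OverallScore", "ScenarioSuccessRate", "ToolPrecision",
           "ToolRecall", "ToolF1", "ParamAccuracy", "ExecutionSuccess"]

def compare(csv_path: str) -> None:
    """Print per-metric deltas between the first two rows (v3.1 minus v3-0324)."""
    with open(csv_path, newline="") as f:
        rows = list(csv.DictReader(f))
    if len(rows) < 2:
        sys.exit("need two model rows to compare")
    a, b = rows[0], rows[1]
    print(f"{a['Model']} vs {b['Model']}")
    for m in METRICS:
        try:
            delta = float(a[m]) - float(b[m])
        except ValueError:
            continue  # metric missing for one of the rows
        print(f"  {m:<20} {float(a[m]):6.1f} vs {float(b[m]):6.1f}  (delta {delta:+.1f})")

if __name__ == "__main__":
    compare(sys.argv[1] if len(sys.argv) > 1 else "model_comparison.csv")
```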
--------------------------------------------------------------------------------
/test_generic_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Generic tool testing script for OpenRouter
4 | # Usage: ./test_generic_model.sh YOUR_API_KEY [quick|full]
5 | #
6 | # CONFIGURATION: Edit the variables below to customize your test
7 | # ================================================================
8 | 
9 | # MODEL CONFIGURATION
10 | # Examples: "openai/gpt-4o", "anthropic/claude-3.5-sonnet", "qwen/qwen3-coder", "deepseek/deepseek-chat-v3.1"
11 | MODEL="qwen/qwen3-coder"
12 | 
13 | # TEMPERATURE CONFIGURATION (0.0 to 2.0)
14 | # Lower = more deterministic, Higher = more creative
15 | # Recommended: 0.0-0.3 for tool calling tests
16 | TEMPERATURE=0.6
17 | 
18 | # PROVIDERS TO TEST
19 | # Leave empty to test without specific provider routing
20 | # For a single provider test, use: providers=("openai")
21 | # For multiple providers, list them all
22 | # Common providers: openai, anthropic, google, deepseek, fireworks, together, deepinfra, etc.
23 | # Some providers support precision specifiers: "fireworks/fp8", "deepinfra/fp4", etc.
24 | providers=(
25 |     # Uncomment and modify the providers you want to test:
26 |     # "openai"
27 |     # "anthropic"
28 |     # "google"
29 |     # "deepseek"
30 |     # "fireworks"
31 |     # "fireworks/fp8"
32 |     # "together"
33 |     # "together/fp8"
34 |     # "deepinfra/fp4"
35 |     # "baseten/fp8"
36 |     # "chutes/fp8"
37 |     # "parasail/fp8"
38 |     # "novita/fp8"
39 |     # "atlas-cloud/fp8"
40 |     # "phala"
41 |     # "gmicloud/fp8"
42 |     # "targon/fp8"
43 |     "alibaba/opensource"
44 |     # "hyperbolic/fp8"
45 |     # "cerebras/fp8"
46 | )
47 | 
48 | # If providers array is empty, run a single test without provider routing
49 | if [ ${#providers[@]} -eq 0 ]; then
50 |     providers=("none")
51 | fi
52 | 
53 | # DELAY BETWEEN TESTS (in seconds)
54 | # Increase if you encounter rate limiting
55 | DELAY_SECONDS=3
56 | 
57 | # REASONING EFFORT (optional, for models that support it)
58 | # Options: low, medium, high, or leave empty
59 | REASONING_EFFORT=""
60 | 
61 | # ================================================================
62 | # END OF CONFIGURATION - DO NOT EDIT BELOW THIS LINE
63 | # ================================================================
64 | 
65 | if [ $# -lt 1 ]; then
66 |     echo "Usage: $0 YOUR_OPENROUTER_API_KEY [quick|full]"
67 |     echo ""
68 |     echo "Current Configuration:"
69 |     echo "  Model: $MODEL"
70 |     echo "  Temperature: $TEMPERATURE"
71 |     echo "  Providers: ${providers[*]}"
72 |     echo ""
73 |     echo "Examples:"
74 |     echo "  $0 sk-or-xxx quick    # Run quick tests"
75 |     echo "  $0 sk-or-xxx full     # Run full test suite"
76 |     echo ""
77 |     echo "Edit this script to change the model, temperature, and providers to test."
78 |     exit 1
79 | fi
80 | 
81 | API_KEY=$1
82 | TEST_MODE=${2:-quick}  # Default to quick mode if not specified
83 | 
84 | # Create results directory with timestamp
85 | TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
86 | SAFE_MODEL_NAME=$(echo "$MODEL" | sed 's/[\/:]/_/g')
87 | RESULTS_DIR="test_results_${SAFE_MODEL_NAME}_${TIMESTAMP}"
88 | mkdir -p "$RESULTS_DIR"
89 | 
90 | echo "================================================================"
91 | echo "TOOL TESTING CONFIGURATION"
92 | echo "================================================================"
93 | echo "  Model: $MODEL"
94 | echo "  Temperature: $TEMPERATURE"
95 | echo "  Test Mode: $TEST_MODE"
96 | echo "  Providers to test: ${#providers[@]}"
97 | echo "  Results Directory: $RESULTS_DIR"
98 | if [ -n "$REASONING_EFFORT" ]; then
99 |     echo "  Reasoning Effort: $REASONING_EFFORT"
100 | fi
101 | echo ""
102 | 
103 | # Function to run test and capture results
104 | run_test() {
105 |     local provider=$1
106 |     local test_name=""
107 |     local provider_param=""
108 | 
109 |     if [ "$provider" == "none" ]; then
110 |         test_name="no_provider"
111 |         echo "================================================"
112 |         echo "Testing model: $MODEL (no provider routing)"
113 |     else
114 |         test_name="${provider//\//_}"
115 |         provider_param="--provider \"$provider\""
116 |         echo "================================================"
117 |         echo "Testing model: $MODEL"
118 |         echo "Provider: $provider"
119 |     fi
120 | 
121 |     local output_file="$RESULTS_DIR/test_${test_name}.txt"
122 |     echo "Output file: $output_file"
123 |     echo "================================================"
124 | 
125 |     # Build the command
126 |     cmd="python3 tool_tester_v2.py \
127 |         --api-base https://openrouter.ai/api/v1 \
128 |         --api-key $API_KEY \
129 |         --model \"$MODEL\" \
130 |         --temperature $TEMPERATURE"
131 | 
132 |     # Add provider if specified
133 |     if [ -n "$provider_param" ]; then
134 |         cmd="$cmd $provider_param"
135 |     fi
136 | 
137 |     # Add reasoning effort if specified
138 |     if [ -n "$REASONING_EFFORT" ]; then
139 |         cmd="$cmd --reasoning-effort $REASONING_EFFORT"
140 |     fi
141 | 
142 |     # Add test mode flag
143 |     if [ "$TEST_MODE" == "quick" ]; then
144 |         cmd="$cmd --quick"
145 |     else
146 |         cmd="$cmd --max-tools 40"
147 |     fi
148 | 
149 |     # Add output file
150 |     cmd="$cmd --output \"$output_file\""
151 | 
152 |     # Run the test and capture both stdout and the result
153 |     echo "Running: $cmd"
154 |     eval $cmd 2>&1 | tee "${output_file}.log"
155 | 
156 |     # Check if the test completed successfully
157 |     if [ ${PIPESTATUS[0]} -eq 0 ]; then
158 |         echo "PASS: Test completed successfully"
159 | 
160 |         # Extract the overall score from the output file if it exists
161 |         if [ -f "$output_file" ]; then
162 |             score=$(grep "OVERALL SCORE:" "$output_file" | tail -1)
163 |             echo "  $score"
164 |         fi
165 |         return 0
166 |     else
167 |         echo "FAIL: Test failed"
168 |         if [ "$provider" == "none" ]; then
169 |             echo "FAILED: No provider routing" >> "$RESULTS_DIR/failed_tests.txt"
170 |         else
171 |             echo "FAILED: $provider" >> "$RESULTS_DIR/failed_tests.txt"
172 |         fi
173 |         return 1
174 |     fi
175 | 
176 |     echo ""
177 | }
178 | 
179 | # Test each provider/configuration
180 | successful_tests=0
181 | failed_tests=0
182 | total_tests=${#providers[@]}
183 | current_test=0
184 | 
185 | for provider in "${providers[@]}"; do
186 |     ((current_test++))
187 |     echo ""
188 |     echo "Test $current_test of $total_tests"
189 | 
190 |     run_test "$provider"
191 | 
192 |     # Check if test completed (not if it got a perfect score)
193 |     if [ $? -eq 0 ]; then
194 |         ((successful_tests++))
195 |     else
196 |         ((failed_tests++))
197 |     fi
198 | 
199 |     # Add delay between tests (except for the last one)
200 |     if [ $current_test -lt $total_tests ]; then
201 |         echo "Waiting ${DELAY_SECONDS}s before next test..."
202 |         sleep $DELAY_SECONDS
203 |     fi
204 | done
205 | 
206 | # Generate summary report
207 | summary_file="$RESULTS_DIR/summary.txt"
208 | echo "" | tee "$summary_file"
209 | echo "================================================================" | tee -a "$summary_file"
210 | echo "TOOL TESTING SUMMARY REPORT" | tee -a "$summary_file"
211 | echo "================================================================" | tee -a "$summary_file"
212 | echo "Model: $MODEL" | tee -a "$summary_file"
213 | echo "Temperature: $TEMPERATURE" | tee -a "$summary_file"
214 | echo "Test Mode: $TEST_MODE" | tee -a "$summary_file"
215 | echo "Timestamp: $TIMESTAMP" | tee -a "$summary_file"
216 | echo "Total Tests Run: $total_tests" | tee -a "$summary_file"
217 | echo "Tests Completed: $successful_tests" | tee -a "$summary_file"
218 | echo "Tests Failed to Run: $failed_tests" | tee -a "$summary_file"
219 | echo "" | tee -a "$summary_file"
220 | 
221 | # Extract scores from all successful tests
222 | echo "Test Results:" | tee -a "$summary_file"
223 | echo "-----------------------------------------" | tee -a "$summary_file"
224 | 
225 | for file in "$RESULTS_DIR"/test_*.txt; do
226 |     if [ -f "$file" ]; then
227 |         test_name=$(basename "$file" .txt | sed 's/test_//')
228 | 
229 |         # Convert filename back to provider name
230 |         if [ "$test_name" == "no_provider" ]; then
231 |             display_name="No provider routing"
232 |         else
233 |             display_name=$(echo "$test_name" | sed 's/_/\//g')
234 |         fi
235 | 
236 |         score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3, $4}')
237 |         if [ -n "$score" ]; then
238 |             printf "%-30s %s\n" "$display_name:" "$score" | tee -a "$summary_file"
239 |         else
240 |             printf "%-30s %s\n" "$display_name:" "Test failed" | tee -a "$summary_file"
241 |         fi
242 |     fi
243 | done
244 | 
245 | echo "" | tee -a "$summary_file"
246 | 
247 | # Create a CSV summary for easy analysis
248 | csv_file="$RESULTS_DIR/results.csv"
249 | echo "Provider,Model,Temperature,OverallScore,Grade,ScenarioSuccessRate,ToolPrecision,ToolRecall,ToolF1,ParamAccuracy,ExecutionSuccess" > "$csv_file"
250 | 
251 | for file in "$RESULTS_DIR"/test_*.txt; do
252 |     if [ -f "$file" ]; then
253 |         test_name=$(basename "$file" .txt | sed 's/test_//')
254 | 
255 |         # Convert filename back to provider name
256 |         if [ "$test_name" == "no_provider" ]; then
257 |             provider_name="none"
258 |         else
259 |             provider_name=$(echo "$test_name" | sed 's/_/\//g')
260 |         fi
261 | 
262 |         # Extract metrics using grep and awk
263 |         overall_score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3}' | sed 's/%//')
264 |         grade=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $4}' | sed 's/[()]//g')
265 |         scenario_rate=$(grep "Scenario Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
266 |         tool_precision=$(grep "Tool Precision (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
267 |         tool_recall=$(grep "Tool Recall (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
268 |         tool_f1=$(grep "Tool F1 (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
269 |         param_accuracy=$(grep "Parameter Accuracy (structural):" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
270 |         exec_success=$(grep "Execution Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
271 | 
272 |         if [ -n "$overall_score" ]; then
273 |             echo "$provider_name,$MODEL,$TEMPERATURE,$overall_score,$grade,$scenario_rate,$tool_precision,$tool_recall,$tool_f1,$param_accuracy,$exec_success" >> "$csv_file"
274 |         fi
275 |     fi
276 | done
277 | 
278 | echo "Full results saved in: $RESULTS_DIR" | tee -a "$summary_file"
279 | echo "CSV results saved in: $csv_file" | tee -a "$summary_file"
280 | echo "" | tee -a "$summary_file"
281 | 
282 | # Sort results by score if there are multiple tests
283 | if [ $total_tests -gt 1 ]; then
284 |     echo "Top Performers (sorted by score):" | tee -a "$summary_file"
285 |     echo "-----------------------------------------" | tee -a "$summary_file"
286 | 
287 |     # Create temp file for sorting
288 |     temp_scores="/tmp/scores_$$.txt"
289 | 
290 |     for file in "$RESULTS_DIR"/test_*.txt; do
291 |         if [ -f "$file" ]; then
292 |             test_name=$(basename "$file" .txt | sed 's/test_//')
293 |             if [ "$test_name" == "no_provider" ]; then
294 |                 display_name="No provider routing"
295 |             else
296 |                 display_name=$(echo "$test_name" | sed 's/_/\//g')
297 |             fi
298 | 
299 |             score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3}' | sed 's/%//')
300 |             if [ -n "$score" ]; then
301 |                 echo "$score|$display_name" >> "$temp_scores"
302 |             fi
303 |         fi
304 |     done
305 | 
306 |     if [ -f "$temp_scores" ]; then
307 |         sort -t'|' -k1 -rn "$temp_scores" | while IFS='|' read -r score provider; do
308 |             printf "%-30s %s%%\n" "$provider:" "$score" | tee -a "$summary_file"
309 |         done
310 |         rm -f "$temp_scores"
311 |     fi
312 | fi
313 | 
314 | echo ""
315 | echo "================================================================"
316 | echo "Testing Complete!"
317 | echo "================================================================"
318 | echo ""
319 | echo "View detailed results:"
320 | echo "  Summary: cat $RESULTS_DIR/summary.txt"
321 | echo "  CSV data: cat $RESULTS_DIR/results.csv"
322 | echo ""
323 | 
324 | # Keep terminal open if running in Windows Git Bash
325 | if [[ "$OSTYPE" == "msys" ]] || [[ "$OSTYPE" == "cygwin" ]]; then
326 |     echo ""
327 |     read -p "Press Enter to exit..."
328 | fi
329 | 
330 | exit 0
--------------------------------------------------------------------------------
/tool_definitions.py:
--------------------------------------------------------------------------------
1 | """
2 | Tool Definitions for LLM Tool Calling Test Suite
3 | 
4 | This module contains all tool definitions and related functionality
5 | for testing LLM tool calling capabilities.
6 | """
7 | 
8 | from typing import Dict, List, Any
9 | 
10 | 
11 | class ToolDefinitions:
12 |     """Define available tools for testing"""
13 | 
14 |     @staticmethod
15 |     def get_all_tools() -> List[Dict[str, Any]]:
16 |         return [
17 |             {
18 |                 "type": "function",
19 |                 "function": {
20 |                     "name": "get_weather",
21 |                     "description": "Get the current weather for a specific location",
22 |                     "parameters": {
23 |                         "type": "object",
24 |                         "properties": {
25 |                             "location": {"type": "string", "description": "City and state, e.g. San Francisco, CA"},
26 |                             "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}
27 |                         },
28 |                         "required": ["location"]
29 |                     }
30 |                 }
31 |             },
32 |             {
33 |                 "type": "function",
34 |                 "function": {
35 |                     "name": "calculate",
36 |                     "description": "Perform mathematical calculations",
37 |                     "parameters": {
38 |                         "type": "object",
39 |                         "properties": {
40 |                             "expression": {"type": "string", "description": "Mathematical expression to evaluate"}
41 |                         },
42 |                         "required": ["expression"]
43 |                     }
44 |                 }
45 |             },
46 |             {
47 |                 "type": "function",
48 |                 "function": {
49 |                     "name": "search_flights",
50 |                     "description": "Search for available flights between two cities",
51 |                     "parameters": {
52 |                         "type": "object",
53 |                         "properties": {
54 |                             "from_city": {"type": "string", "description": "Departure city"},
55 |                             "to_city": {"type": "string", "description": "Destination city"},
56 |                             "date": {"type": "string", "description": "Travel date (YYYY-MM-DD)"}
57 |                         },
58 |                         "required": ["from_city", "to_city", "date"]
59 |                     }
60 |                 }
61 |             },
62 |             {
63 |                 "type": "function",
64 |                 "function": {
65 |                     "name": "get_stock_price",
66 |                     "description": "Get current stock price for a company",
67 |                     "parameters": {
68 |                         "type": "object",
69 |                         "properties": {
70 |                             "symbol": {"type": "string", "description": "Stock ticker symbol"}
71 |                         },
72 |                         "required": ["symbol"]
73 |                     }
74 |                 }
75 |             },
76 |             {
77 |                 "type": "function",
78 |                 "function": {
79 |                     "name": "search_restaurants",
80 |                     "description": "Search for restaurants in a specific area",
81 |                     "parameters": {
82 |                         "type": "object",
83 |                         "properties": {
84 |                             "location": {"type": "string", "description": "City or area to search"},
85 |                             "cuisine": {"type": "string", "description": "Type of cuisine"},
86 |                             "price_range": {"type": "string", "enum": ["$", "$$", "$$$", "$$$$"]}
87 |                         },
88 |                         "required": ["location"]
89 |                     }
90 |                 }
91 |             },
92 |             {
93 |                 "type": "function",
94 |                 "function": {
95 |                     "name": "convert_currency",
96 |                     "description": "Convert amount between two currencies",
97 |                     "parameters": {
98 |                         "type": "object",
99 |                         "properties": {
100 |                             "amount": {"type": "number", "description": "Amount to convert"},
101 |                             "from_currency": {"type": "string", "description": "Source currency code (e.g., USD)"},
102 |                             "to_currency": {"type": "string", "description": "Target currency code (e.g., EUR)"}
103 |                         },
104 |                         "required": ["amount", "from_currency", "to_currency"]
105 |                     }
106 |                 }
107 |             },
108 |             {
109 |                 "type": "function",
110 |                 "function": {
111 |                     "name": "get_news",
112 |                     "description": "Get latest news articles on a topic",
113 |                     "parameters": {
114 |                         "type": "object",
115 |                         "properties": {
116 |                             "topic": {"type": "string", "description": "News topic or search query"},
117 |                             "limit": {"type": "integer", "description": "Number of articles to return", "default": 5}
118 |                         },
119 |                         "required": ["topic"]
120 |                     }
121 |                 }
122 |             },
123 |             {
124 |                 "type": "function",
125 |                 "function": {
126 |                     "name": "set_reminder",
127 |                     "description": "Set a reminder for a specific time",
128 |                     "parameters": {
129 |                         "type": "object",
130 |                         "properties": {
131 |                             "message": {"type": "string", "description": "Reminder message"},
132 |                             "time": {"type": "string", "description": "Time for the reminder (HH:MM)"},
133 |                             "date": {"type": "string", "description": "Date for the reminder (YYYY-MM-DD)"}
134 |                         },
135 |                         "required": ["message", "time"]
136 |                     }
137 |                 }
138 |             },
139 |             {
140 |                 "type": "function",
141 |                 "function": {
142 |                     "name": "translate_text",
143 |                     "description": "Translate text from one language to another",
144 |                     "parameters": {
145 |                         "type": "object",
146 |                         "properties": {
147 |                             "text": {"type": "string", "description": "Text to translate"},
148 |                             "source_language": {"type": "string", "description": "Source language code"},
149 |                             "target_language": {"type": "string", "description": "Target language code"}
150 |                         },
151 |                         "required": ["text", "target_language"]
152 |                     }
153 |                 }
154 |             },
155 |             {
156 |                 "type": "function",
157 |                 "function": {
158 |                     "name": "search_hotels",
159 |                     "description": "Search for hotels in a specific location",
160 |                     "parameters": {
161 |                         "type": "object",
162 |                         "properties": {
163 |                             "location": {"type": "string", "description": "City or area"},
164 |                             "check_in": {"type": "string", "description": "Check-in date (YYYY-MM-DD)"},
165 |                             "check_out": {"type": "string", "description": "Check-out date (YYYY-MM-DD)"},
166 |                             "guests": {"type": "integer", "description": "Number of guests"}
167 |                         },
168 |                         "required": ["location", "check_in", "check_out"]
169 |                     }
170 |                 }
171 |             }
172 |         ]
173 | 
174 |     @staticmethod
175 |     def _allowed_props() -> Dict[str, set]:
176 |         return {
177 |             "get_weather": {"location", "unit"},
178 |             "calculate": {"expression"},
179 |             "search_flights": {"from_city", "to_city", "date"},
180 |             "get_stock_price": {"symbol"},
181 |             "search_restaurants": {"location", "cuisine", "price_range"},
182 |             "convert_currency": {"amount", "from_currency", "to_currency"},
183 |             "get_news": {"topic", "limit"},
184 |             "set_reminder": {"message", "time", "date"},
185 |             "translate_text": {"text", "source_language", "target_language"},
186 |             "search_hotels": {"location", "check_in", "check_out", "guests"},
187 |         }
188 | 
189 |     @staticmethod
190 |     def validate_parameters(tool_name: str, arguments: Dict[str, Any]) -> bool:
191 |         """Validate required parameters and reject unknown keys"""
192 |         allowed = ToolDefinitions._allowed_props().get(tool_name, set())
193 |         if any(k not in allowed for k in arguments.keys()):
194 |             return False
195 | 
196 |         validations = {
197 |             "get_weather": lambda args: "location" in args and len(str(args.get("location", "")).strip()) > 0,
198 |             "calculate": lambda args: "expression" in args and len(str(args.get("expression", "")).strip()) > 0,
199 |             "search_flights": lambda args: all(k in args for k in ["from_city", "to_city", "date"]) and
200 |                 all(len(str(args.get(k, "")).strip()) > 0 for k in ["from_city", "to_city", "date"]),
201 |             "get_stock_price": lambda args: "symbol" in args and len(str(args.get("symbol", "")).strip()) > 0,
202 |             "search_restaurants": lambda args: "location" in args and len(str(args.get("location", "")).strip()) > 0,
203 |             "convert_currency": lambda args: all(k in args for k in ["amount", "from_currency", "to_currency"]) and
204 |                 isinstance(args.get("amount", 0), (int, float)) and args.get("amount", 0) > 0,
205 |             "get_news": lambda args: "topic" in args and len(str(args.get("topic", "")).strip()) > 0,
206 |             "set_reminder": lambda args: all(k in args for k in ["message", "time"]) and
207 |                 all(len(str(args.get(k, "")).strip()) > 0 for k in ["message", "time"]),
208 |             "translate_text": lambda args: "text" in args and "target_language" in args and
209 |                 all(len(str(args.get(k, "")).strip()) > 0 for k in ["text", "target_language"]),
210 |             "search_hotels": lambda args: all(k in args for k in ["location", "check_in", "check_out"]) and
211 |                 all(len(str(args.get(k, "")).strip()) > 0 for k in ["location", "check_in", "check_out"]),
212 |         }
213 |         validator = validations.get(tool_name)
214 |         if validator:
215 |             try:
return validator(arguments) 217 | except Exception: 218 | return False 219 | return True 220 | 221 | @staticmethod 222 | def execute_tool(name: str, arguments: Dict[str, Any]) -> str: 223 | """Mock tool execution - returns realistic dummy data""" 224 | 225 | tool_responses = { 226 | "get_weather": lambda args: f"The weather in {args.get('location', 'Unknown')} is currently 72°F (22°C) with partly cloudy skies. Humidity is 65% with winds at 10 mph.", 227 | 228 | "calculate": lambda args: "The result of {} is {}".format(args.get('expression', ''), eval(args.get('expression', '0'), {'__builtins__': {}}, {})),  # eval runs with builtins stripped so the mock can only do bare arithmetic, not arbitrary code 229 | 230 | "search_flights": lambda args: ( 231 | f"Found 5 flights from {args.get('from_city')} to {args.get('to_city')} on {args.get('date')}:\n" 232 | "1. UA 245 - Departs 8:00 AM, arrives 11:30 AM - $350\n" 233 | "2. DL 892 - Departs 10:15 AM, arrives 1:45 PM - $425\n" 234 | "3. AA 156 - Departs 2:30 PM, arrives 6:00 PM - $380" 235 | ), 236 | 237 | "get_stock_price": lambda args: f"{args.get('symbol', 'UNKNOWN')} is currently trading at $152.35, up 2.3% today. Day range: $149.20 - $153.80", 238 | 239 | "search_restaurants": lambda args: ( 240 | f"Found 3 top restaurants in {args.get('location')}:\n" 241 | f"1. The Golden Fork - {args.get('cuisine', 'International')} cuisine - Rating: 4.5/5\n" 242 | f"2. Sunset Bistro - {args.get('cuisine', 'Local')} cuisine - Rating: 4.3/5\n" 243 | "3. Ocean View Grill - Seafood - Rating: 4.6/5" 244 | ), 245 | 246 | "convert_currency": lambda args: ( 247 | f"{args.get('amount', 0)} {args.get('from_currency', 'USD')} equals " 248 | f"{args.get('amount', 0) * 0.92:.2f} {args.get('to_currency', 'EUR')} at current exchange rate " 249 | f"(1 {args.get('from_currency', 'USD')} = 0.92 {args.get('to_currency', 'EUR')})" 250 | ), 251 | 252 | "get_news": lambda args: ( 253 | f"Latest news on '{args.get('topic')}':\n" 254 | f"1. Breaking: Major developments in {args.get('topic')} sector (2 hours ago)\n" 255 | f"2. Expert analysis: What {args.get('topic')} means for the future (5 hours ago)\n" 256 | f"3. {args.get('topic')} trends show significant growth (1 day ago)" 257 | ), 258 | 259 | "set_reminder": lambda args: f"Reminder set: '{args.get('message')}' for {args.get('time')} on {args.get('date', 'today')}", 260 | 261 | "translate_text": lambda args: f"Translation to {args.get('target_language')}: [Translated version of '{args.get('text')}']", 262 | 263 | "search_hotels": lambda args: ( 264 | f"Found hotels in {args.get('location')} for {args.get('check_in')} to {args.get('check_out')}:\n" 265 | "1. Grand Plaza Hotel - $180/night - 4.4 stars\n" 266 | "2. City Center Inn - $120/night - 4.1 stars\n" 267 | "3. 
Luxury Suites - $250/night - 4.7 stars" 268 | ) 269 | } 270 | 271 | handler = tool_responses.get(name, lambda args: f"Executed {name} with parameters {args}") 272 | try: 273 | return handler(arguments) 274 | except Exception: 275 | return f"Tool execution completed for {name}" -------------------------------------------------------------------------------- /test_scenarios.json: -------------------------------------------------------------------------------- 1 | { 2 | "scenarios": [ 3 | { 4 | "name": "simple_weather", 5 | "description": "Check weather in a city", 6 | "initial_prompt": "Use the get_weather tool to check the current weather in Tokyo.", 7 | "expected_tools": ["get_weather"], 8 | "follow_ups": [] 9 | }, 10 | { 11 | "name": "simple_calculation", 12 | "description": "Perform a calculation", 13 | "initial_prompt": "Use the calculate tool to determine how much each person pays if a $847 restaurant bill is split among 6 people.", 14 | "expected_tools": ["calculate"], 15 | "follow_ups": [] 16 | }, 17 | { 18 | "name": "simple_stock", 19 | "description": "Check stock price", 20 | "initial_prompt": "Use the get_stock_price tool to check the current price of Apple stock (AAPL).", 21 | "expected_tools": ["get_stock_price"], 22 | "follow_ups": [] 23 | }, 24 | { 25 | "name": "travel_planning", 26 | "description": "Plan a trip with multiple queries", 27 | "initial_prompt": "Please use the search_flights tool to find flights from New York to Paris on March 15th, then use get_weather to check the weather in Paris.", 28 | "expected_tools": ["search_flights", "get_weather"], 29 | "follow_ups": [ 30 | "Use the search_restaurants tool to find good restaurants in Paris.", 31 | "Use the search_hotels tool to find hotels for March 15-20 in Paris." 32 | ] 33 | }, 34 | { 35 | "name": "currency_travel", 36 | "description": "Travel with currency conversion", 37 | "initial_prompt": "Please use convert_currency to convert $5000 USD to British pounds, then use get_weather to check the weather forecast for London.", 38 | "expected_tools": ["convert_currency", "get_weather"], 39 | "follow_ups": [ 40 | "Use search_hotels to find hotels in London for next week.", 41 | "Use search_flights to find flights from Boston to London for tomorrow." 42 | ] 43 | }, 44 | { 45 | "name": "business_trip", 46 | "description": "Complex business trip planning", 47 | "initial_prompt": "Use search_flights to find flights from Chicago to San Francisco for March 20th.", 48 | "expected_tools": ["search_flights"], 49 | "follow_ups": [ 50 | "Use get_weather to check the weather forecast for San Francisco.", 51 | "Use search_restaurants to find Italian restaurants in San Francisco for a business dinner.", 52 | "Use set_reminder to set a reminder to pack presentation materials at 8 PM tonight.", 53 | "Use search_hotels to find hotels near the financial district for March 20-22." 54 | ] 55 | }, 56 | { 57 | "name": "investment_research", 58 | "description": "Research stocks and news", 59 | "initial_prompt": "Use get_stock_price to check the current price of Microsoft stock (MSFT).", 60 | "expected_tools": ["get_stock_price"], 61 | "follow_ups": [ 62 | "Use get_stock_price to check Google (GOOGL) and Amazon (AMZN) stock prices.", 63 | "Use get_news to find the latest news about artificial intelligence.", 64 | "Use calculate to determine how much to invest in each stock if splitting $10,000 equally across three stocks." 
65 | ] 66 | }, 67 | { 68 | "name": "international_planning", 69 | "description": "International travel with translations", 70 | "initial_prompt": "Use translate_text to translate 'Thank you for your help' to Japanese.", 71 | "expected_tools": ["translate_text"], 72 | "follow_ups": [ 73 | "Use get_weather to check the weather in Tokyo for April.", 74 | "Use convert_currency to convert $2000 USD to Japanese Yen.", 75 | "Use search_flights to find flights from Los Angeles to Tokyo on April 10th.", 76 | "Use translate_text to translate 'Where is the train station?' to Japanese.", 77 | "Use search_hotels to find hotels in Shibuya district for April 10-20." 78 | ] 79 | }, 80 | { 81 | "name": "event_planning", 82 | "description": "Planning an event with multiple tasks", 83 | "initial_prompt": "I'm organizing a company event in Miami. What's the weather forecast for Miami?", 84 | "expected_tools": ["get_weather"], 85 | "follow_ups": [ 86 | "Search for restaurants that can accommodate 50 people", 87 | "Set a reminder to send invitations tomorrow at 10 AM", 88 | "What's the latest news about event planning trends?", 89 | "Calculate the cost if catering is $45 per person for 50 people", 90 | "Find hotels near the beach for our out-of-town guests checking in May 15th" 91 | ] 92 | }, 93 | { 94 | "name": "complete_vacation", 95 | "description": "Full vacation planning with many steps", 96 | "initial_prompt": "I want to plan a complete vacation to Europe. Let's start with checking flights from New York to London on June 1st.", 97 | "expected_tools": ["search_flights"], 98 | "follow_ups": [ 99 | "What's the weather like in London in June?", 100 | "Convert $5000 to British pounds", 101 | "Search for hotels in London for June 1-5", 102 | "Find good restaurants in London, preferably British cuisine", 103 | "Now check flights from London to Paris on June 5th", 104 | "What's the weather in Paris in June?", 105 | "Convert $2000 to Euros", 106 | "Search for hotels in Paris for June 5-10", 107 | "Translate 'I would like a table for two' to French", 108 | "Find French restaurants in Paris", 109 | "Check flights from Paris to Rome on June 10th", 110 | "Weather in Rome in June?", 111 | "Search for hotels in Rome for June 10-15", 112 | "Translate 'How much does this cost?' to Italian", 113 | "Set a reminder to book everything by next Friday at 5 PM" 114 | ] 115 | }, 116 | { 117 | "name": "financial_portfolio", 118 | "description": "Comprehensive financial analysis", 119 | "initial_prompt": "I want to review my tech portfolio. 
Start by checking Apple's current price.", 120 | "expected_tools": ["get_stock_price"], 121 | "follow_ups": [ 122 | "Check Microsoft stock price", 123 | "Check Google stock price", 124 | "Check Amazon stock price", 125 | "Check Tesla stock price", 126 | "Calculate the total if I have 100 shares of Apple at current price", 127 | "Calculate 50 shares of Microsoft", 128 | "Calculate 75 shares of Google", 129 | "Calculate 30 shares of Amazon", 130 | "Calculate 40 shares of Tesla", 131 | "What's the latest news about tech stocks?", 132 | "Get news about cryptocurrency", 133 | "Convert $10,000 to Euros for my European investments", 134 | "Convert $5,000 to Japanese Yen", 135 | "Set a reminder to review portfolio again next month at 3 PM", 136 | "What's the total value if I sum all my stock positions?", 137 | "Get news about the Federal Reserve", 138 | "Calculate what percentage each stock represents of my total portfolio" 139 | ] 140 | }, 141 | { 142 | "name": "conference_coordination", 143 | "description": "Coordinate a multi-city conference tour", 144 | "initial_prompt": "I'm organizing a conference tour across multiple cities. First, check flights from San Francisco to Seattle on July 1st.", 145 | "expected_tools": ["search_flights"], 146 | "follow_ups": [ 147 | "Weather in Seattle in July?", 148 | "Find hotels in Seattle for July 1-3", 149 | "Search for conference venues (restaurants) that can host 100 people in Seattle", 150 | "Check flights from Seattle to Portland on July 3rd", 151 | "Weather in Portland?", 152 | "Hotels in Portland for July 3-5", 153 | "Calculate the budget: 100 people × $75 per person for catering", 154 | "Flights from Portland to Los Angeles on July 5th", 155 | "Weather in Los Angeles in July?", 156 | "Hotels in LA for July 5-7", 157 | "Get news about conference industry trends", 158 | "Convert our $50,000 budget to see how much that is in Euros", 159 | "Set reminder to confirm all venues by June 15th at noon", 160 | "Translate our welcome message 'Welcome to our annual conference' to Spanish", 161 | "Also translate it to Mandarin Chinese", 162 | "Calculate total hotel costs if average is $150/night for 20 rooms across all cities", 163 | "Search for restaurants in Los Angeles for our closing dinner", 164 | "What's the latest news about business travel?", 165 | "Set another reminder to send final attendee list on June 25th at 9 AM" 166 | ] 167 | }, 168 | { 169 | "name": "global_expansion", 170 | "description": "Plan international business expansion", 171 | "initial_prompt": "We're expanding our business globally. 
Start by checking flights from New York to Tokyo for August 1st.", 172 | "expected_tools": ["search_flights"], 173 | "follow_ups": [ 174 | "What's the weather in Tokyo in August?", 175 | "Convert $100,000 USD to Japanese Yen for initial investment", 176 | "Search for hotels in Tokyo for August 1-7", 177 | "Translate 'We look forward to doing business with you' to Japanese", 178 | "Get news about Japanese market trends", 179 | "Search for restaurants in Tokyo for business meetings", 180 | "Check flights from Tokyo to Beijing on August 7th", 181 | "Weather in Beijing in August?", 182 | "Convert $50,000 to Chinese Yuan", 183 | "Search hotels in Beijing for August 7-10", 184 | "Translate 'Thank you for your partnership' to Mandarin", 185 | "Get news about Chinese tech industry", 186 | "Search restaurants in Beijing", 187 | "Check flights from Beijing to Singapore on August 10th", 188 | "Weather in Singapore?", 189 | "Convert $75,000 to Singapore dollars", 190 | "Search hotels in Singapore for August 10-14", 191 | "Get news about Southeast Asian markets", 192 | "Search restaurants in Singapore", 193 | "Calculate total travel budget: 3 cities × 5 days × $300/day", 194 | "Set reminder to prepare presentation materials by July 25th", 195 | "Check flights from Singapore back to New York on August 14th" 196 | ] 197 | }, 198 | { 199 | "name": "mega_world_tour", 200 | "description": "Plan a comprehensive world tour", 201 | "initial_prompt": "I'm planning a world tour. Start with flights from Los Angeles to Sydney on September 1st.", 202 | "expected_tools": ["search_flights"], 203 | "follow_ups": [ 204 | "Weather in Sydney in September?", 205 | "Convert $3000 to Australian dollars", 206 | "Search hotels in Sydney for September 1-4", 207 | "Translate 'Good day mate' to Australian English", 208 | "Search restaurants in Sydney", 209 | "Check flights from Sydney to Tokyo on September 4th", 210 | "Weather in Tokyo?", 211 | "Convert $2500 to Japanese Yen", 212 | "Search hotels in Tokyo for September 4-7", 213 | "Translate 'Where is the subway?' to Japanese", 214 | "Get news about Tokyo Olympics legacy", 215 | "Search restaurants in Tokyo", 216 | "Flights from Tokyo to Dubai on September 7th", 217 | "Weather in Dubai in September?", 218 | "Convert $4000 to UAE Dirhams", 219 | "Search hotels in Dubai for September 7-10", 220 | "Get news about Dubai expo", 221 | "Search restaurants in Dubai", 222 | "Flights from Dubai to Paris on September 10th", 223 | "Weather in Paris?", 224 | "Convert $3500 to Euros", 225 | "Search hotels in Paris for September 10-13", 226 | "Translate 'Where is the Eiffel Tower?' to French", 227 | "Search French restaurants in Paris", 228 | "Get news about Paris fashion week", 229 | "Flights from Paris to London on September 13th", 230 | "Weather in London?", 231 | "Convert $3000 to British pounds", 232 | "Search hotels in London for September 13-16", 233 | "Search restaurants in London", 234 | "Calculate total budget for entire trip", 235 | "Set reminder to get travel insurance by August 15th", 236 | "Flights from London back to Los Angeles on September 16th" 237 | ] 238 | }, 239 | { 240 | "name": "startup_investor_roadshow", 241 | "description": "Organize investor meetings across multiple countries", 242 | "initial_prompt": "Planning an investor roadshow. 
Check flights from San Francisco to London on October 1st.", 243 | "expected_tools": ["search_flights"], 244 | "follow_ups": [ 245 | "Weather in London in October?", 246 | "Convert $500,000 investment fund to British pounds", 247 | "Search hotels in London financial district for October 1-3", 248 | "Search restaurants for investor dinners in London", 249 | "Get news about UK startup ecosystem", 250 | "Set reminder for pitch deck review on September 28th at 2 PM", 251 | "Flights from London to Berlin on October 3rd", 252 | "Weather in Berlin?", 253 | "Convert $250,000 to Euros for German investments", 254 | "Search hotels in Berlin for October 3-5", 255 | "Translate 'We see great potential in your startup' to German", 256 | "Search restaurants in Berlin", 257 | "Get news about European tech funding", 258 | "Flights from Berlin to Stockholm on October 5th", 259 | "Weather in Stockholm?", 260 | "Convert $150,000 to Swedish Krona", 261 | "Search hotels in Stockholm for October 5-7", 262 | "Translate 'Innovation is key to success' to Swedish", 263 | "Search restaurants in Stockholm", 264 | "Get news about Nordic startup scene", 265 | "Calculate ROI if we invest $50,000 in 10 startups with 20% expected return", 266 | "Flights from Stockholm to Amsterdam on October 7th", 267 | "Weather in Amsterdam?", 268 | "Search hotels in Amsterdam for October 7-9", 269 | "Search restaurants in Amsterdam", 270 | "Set reminder to send investment term sheets by October 15th", 271 | "Get news about Dutch fintech sector", 272 | "Calculate total travel expenses for tax deduction", 273 | "Flights from Amsterdam back to San Francisco on October 9th" 274 | ] 275 | }, 276 | { 277 | "name": "academic_conference_circuit", 278 | "description": "Attend multiple academic conferences worldwide", 279 | "initial_prompt": "I'm attending academic conferences globally. Check flights from Boston to Oxford on November 1st.", 280 | "expected_tools": ["search_flights"], 281 | "follow_ups": [ 282 | "Weather in Oxford in November?", 283 | "Search hotels near Oxford University for November 1-3", 284 | "Convert $2000 conference budget to British pounds", 285 | "Search restaurants in Oxford", 286 | "Get news about latest research in artificial intelligence", 287 | "Set reminder to submit paper by October 20th at midnight", 288 | "Flights from London to Geneva on November 3rd", 289 | "Weather in Geneva?", 290 | "Convert $1500 to Swiss Francs", 291 | "Search hotels in Geneva for November 3-5", 292 | "Translate 'Where is the conference center?' 
to French", 293 | "Search restaurants near CERN in Geneva", 294 | "Get news about particle physics breakthroughs", 295 | "Flights from Geneva to Vienna on November 5th", 296 | "Weather in Vienna?", 297 | "Convert $1200 to Euros", 298 | "Search hotels in Vienna for November 5-7", 299 | "Translate 'Thank you for the invitation' to German", 300 | "Search restaurants in Vienna", 301 | "Calculate conference registration fees: 5 conferences × $300 each", 302 | "Get news about quantum computing research", 303 | "Flights from Vienna to Prague on November 7th", 304 | "Weather in Prague?", 305 | "Search hotels in Prague for November 7-9", 306 | "Search restaurants in Prague", 307 | "Set reminder to prepare presentation slides by October 25th", 308 | "Flights from Prague to Barcelona on November 9th", 309 | "Weather in Barcelona?", 310 | "Search hotels in Barcelona for November 9-11", 311 | "Translate 'See you at the conference' to Spanish", 312 | "Search restaurants in Barcelona", 313 | "Get news about European research grants", 314 | "Calculate total publication costs if submitting to 3 journals at $500 each", 315 | "Flights from Barcelona back to Boston on November 11th" 316 | ] 317 | }, 318 | { 319 | "name": "luxury_shopping_expedition", 320 | "description": "Plan a luxury shopping tour across fashion capitals", 321 | "initial_prompt": "Planning a luxury shopping tour. Check flights from New York to Milan for December 1st.", 322 | "expected_tools": ["search_flights"], 323 | "follow_ups": [ 324 | "Weather in Milan in December?", 325 | "Convert $50,000 shopping budget to Euros", 326 | "Search luxury hotels in Milan for December 1-3", 327 | "Search Michelin-starred restaurants in Milan", 328 | "Get news about Milan Fashion Week", 329 | "Calculate VAT refund on $10,000 purchase in Italy", 330 | "Set reminder to check credit card limits by November 25th", 331 | "Flights from Milan to Paris on December 3rd", 332 | "Weather in Paris?", 333 | "Search hotels near Champs-Élysées for December 3-5", 334 | "Translate 'Do you have this in another size?' to French", 335 | "Search restaurants in Paris 8th arrondissement", 336 | "Convert additional $30,000 to Euros", 337 | "Get news about French luxury brands", 338 | "Calculate savings if items are 20% cheaper in Europe", 339 | "Flights from Paris to London on December 5th", 340 | "Weather in London?", 341 | "Convert $25,000 to British pounds", 342 | "Search hotels in Mayfair for December 5-7", 343 | "Search restaurants near Bond Street", 344 | "Get news about British fashion designers", 345 | "Set reminder to declare customs on return", 346 | "Flights from London to Dubai on December 7th", 347 | "Weather in Dubai?", 348 | "Convert $40,000 to UAE Dirhams", 349 | "Search hotels in Dubai Mall area for December 7-9", 350 | "Search restaurants in Dubai", 351 | "Get news about Dubai Shopping Festival", 352 | "Calculate duty-free savings on jewelry purchases", 353 | "Flights from Dubai to Tokyo on December 9th", 354 | "Weather in Tokyo?", 355 | "Convert $35,000 to Japanese Yen", 356 | "Search hotels in Ginza for December 9-11", 357 | "Translate 'Is this authentic?' to Japanese", 358 | "Search restaurants in Ginza", 359 | "Get news about Japanese fashion trends", 360 | "Calculate total spent across all cities", 361 | "Flights from Tokyo back to New York on December 11th" 362 | ] 363 | }, 364 | { 365 | "name": "film_festival_circuit", 366 | "description": "Attend major film festivals worldwide", 367 | "initial_prompt": "I'm attending film festivals globally. 
Start with flights from Los Angeles to Cannes for May 15th.", 368 | "expected_tools": ["search_flights"], 369 | "follow_ups": [ 370 | "Weather in Cannes in May?", 371 | "Convert $15,000 budget to Euros", 372 | "Search hotels on the Croisette for May 15-20", 373 | "Search restaurants in Cannes", 374 | "Get news about Cannes Film Festival lineup", 375 | "Set reminder for screening schedule on May 14th", 376 | "Flights from Nice to Venice on May 20th", 377 | "Weather in Venice?", 378 | "Search hotels near the Lido for May 20-25", 379 | "Translate 'Where is the festival venue?' to Italian", 380 | "Search restaurants in Venice", 381 | "Get news about Venice Biennale", 382 | "Calculate accommodation costs: 10 nights × $400/night", 383 | "Flights from Venice to Berlin on May 25th", 384 | "Weather in Berlin?", 385 | "Convert $8,000 to Euros for Berlin expenses", 386 | "Search hotels near Potsdamer Platz for May 25-30", 387 | "Translate 'Congratulations on your film' to German", 388 | "Search restaurants in Berlin", 389 | "Get news about Berlinale special screenings", 390 | "Flights from Berlin to Toronto on May 30th", 391 | "Weather in Toronto?", 392 | "Convert $10,000 to Canadian dollars", 393 | "Search hotels downtown Toronto for May 30-June 4", 394 | "Search restaurants in Toronto Entertainment District", 395 | "Get news about TIFF year-round programming", 396 | "Set reminder to submit film for next year's festivals", 397 | "Calculate total travel insurance needed", 398 | "Flights from Toronto to Sundance (via Salt Lake City) on June 4th", 399 | "Weather in Park City?", 400 | "Convert $12,000 to USD for Sundance", 401 | "Search hotels in Park City for June 4-9", 402 | "Search restaurants on Main Street Park City", 403 | "Get news about independent film funding", 404 | "Calculate potential distribution deal value", 405 | "Flights from Salt Lake City to Tokyo on June 9th", 406 | "Weather in Tokyo?", 407 | "Convert $7,000 to Japanese Yen", 408 | "Search hotels in Roppongi for June 9-14", 409 | "Translate 'Thank you for watching our film' to Japanese", 410 | "Search restaurants in Roppongi", 411 | "Get news about Asian film market", 412 | "Set reminder to follow up with distributors", 413 | "Flights from Tokyo back to Los Angeles on June 14th" 414 | ] 415 | } 416 | ] 417 | } -------------------------------------------------------------------------------- /tool_tester_v2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | LLM Tool Calling Test Suite V3.2 4 | 5 | Key additions vs V3.1: 6 | - Capability probe to detect what the server actually supports: 7 | * tools + object tool_choice (best) 8 | * tools + "required" (common on local servers) 9 | * legacy functions + function_call 10 | - Hard-enforcement: if the user says "Use <tool>" and the model still doesn't 11 | emit a tool call, the harness can emulate the call (marked llm_initiated=False) so 12 | parameters are still verified and counted. Emulation is currently disabled in 13 | run_scenario, but the report still separates LLM-initiated from emulated calls. 14 | 15 | This makes the harness resilient to tool/function support differences across 16 | OpenAI-compatible servers (e.g., llama.cpp style, local gateways, etc.).
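A minimal invocation sketch (the endpoint, key, and model below are placeholders; the flags shown are the same ones the bundled provider test scripts pass):

    python3 tool_tester_v2.py --api-base http://localhost:8000/v1 --api-key YOUR_KEY --model my-model --quick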
17 | """ 18 | 19 | import argparse 20 | import json 21 | import sys 22 | import time 23 | import os 24 | import io 25 | from typing import Dict, List, Any, Optional, Tuple 26 | from dataclasses import dataclass, field 27 | from datetime import datetime 28 | import re 29 | import requests 30 | 31 | from openai import OpenAI 32 | from tool_definitions import ToolDefinitions 33 | 34 | # Fix Windows console encoding issues 35 | if sys.platform == 'win32': 36 | # Set console to UTF-8 37 | sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') 38 | sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') 39 | 40 | 41 | # ------------------------ Data classes ------------------------ 42 | 43 | @dataclass 44 | class ToolCallResult: 45 | """Store results for a single tool call""" 46 | tool_name: str 47 | expected: bool 48 | parameters_correct: bool # structural (schema-level) correctness 49 | execution_successful: bool 50 | semantic_match: bool = True # values match what the user asked for 51 | actual_args: Dict[str, Any] = field(default_factory=dict) 52 | expected_args: Dict[str, Any] = field(default_factory=dict) 53 | llm_initiated: bool = True # False if harness emulated the call 54 | error: Optional[str] = None 55 | 56 | 57 | @dataclass 58 | class ExpectedCall: 59 | """Represents an expected tool call extracted from the user prompt""" 60 | tool_name: str 61 | expected_args: Dict[str, Any] = field(default_factory=dict) 62 | source_text: str = "" 63 | 64 | 65 | @dataclass 66 | class TestResult: 67 | """Store results for a single test scenario""" 68 | scenario_name: str 69 | description: str 70 | conversation_turns: int 71 | tool_calls_made: List[str] 72 | expected_tool_types: List[str] 73 | success: bool 74 | tool_call_details: List[ToolCallResult] = field(default_factory=list) 75 | expected_tool_call_count: int = 0 76 | error: Optional[str] = None 77 | execution_time: float = 0.0 78 | conversation_log: List[Dict] = field(default_factory=list) 79 | 80 | 81 | @dataclass 82 | class TestSuite: 83 | """Collection of test results""" 84 | name: str 85 | results: List[TestResult] = field(default_factory=list) 86 | 87 | @property 88 | def success_rate(self) -> float: 89 | if not self.results: 90 | return 0.0 91 | successful = sum(1 for r in self.results if r.success) 92 | return (successful / len(self.results)) * 100 93 | 94 | @property 95 | def total_tool_calls(self) -> int: 96 | return sum(len(r.tool_calls_made) for r in self.results) 97 | 98 | 99 | @dataclass 100 | class APICapabilities: 101 | """What the server supports""" 102 | supports_tools: bool = False 103 | supports_tool_choice_object: bool = False 104 | supports_tool_choice_required: bool = False 105 | supports_functions: bool = False 106 | 107 | 108 | # ------------------------ Scenarios ------------------------ 109 | 110 | class TestScenarios: 111 | """Natural conversation scenarios that require tool use""" 112 | 113 | @staticmethod 114 | def get_scenarios() -> List[Dict[str, Any]]: 115 | """Load scenarios from JSON file or return empty list if file doesn't exist""" 116 | scenarios_file = "test_scenarios.json" 117 | 118 | # Try to find the scenarios file 119 | if os.path.exists(scenarios_file): 120 | file_path = scenarios_file 121 | elif os.path.exists(os.path.join(os.path.dirname(__file__), scenarios_file)): 122 | file_path = os.path.join(os.path.dirname(__file__), scenarios_file) 123 | else: 124 | print(f"Warning: {scenarios_file} not found. 
Using empty scenario list.") 125 | return [] 126 | 127 | try: 128 | with open(file_path, 'r') as f: 129 | data = json.load(f) 130 | return data.get('scenarios', []) 131 | except Exception as e: 132 | print(f"Error loading scenarios from {file_path}: {e}") 133 | return [] 134 | 135 | @staticmethod 136 | def get_scenario_by_complexity(min_tools: int, max_tools: int) -> List[Dict[str, Any]]: 137 | """Get scenarios that require a specific number of tool calls""" 138 | all_scenarios = TestScenarios.get_scenarios() 139 | filtered = [] 140 | for scenario in all_scenarios: 141 | expected_count = len(scenario["expected_tools"]) + len(scenario.get("follow_ups", [])) 142 | if min_tools <= expected_count <= max_tools: 143 | filtered.append(scenario) 144 | return filtered 145 | 146 | 147 | # ------------------------ Core tester ------------------------ 148 | 149 | class LLMToolTester: 150 | """Main test runner for natural tool calling""" 151 | 152 | def __init__(self, api_base: str, api_key: str, model: str, debug: bool = False, provider: str = None, temperature: float = None, reasoning_effort: str = None): 153 | self.api_base = api_base.rstrip('/') 154 | self.api_key = api_key 155 | self.model = model 156 | self.provider = provider 157 | self.temperature = temperature if temperature is not None else 0.1 158 | self.reasoning_effort = reasoning_effort 159 | self.is_openrouter = 'openrouter' in api_base.lower() 160 | 161 | # Set up headers for OpenRouter 162 | headers = {} 163 | if self.is_openrouter and provider: 164 | headers['HTTP-Referer'] = 'https://tool-tester' # Optional 165 | headers['X-Title'] = 'Tool Tester' # Optional 166 | 167 | self.client = OpenAI(api_key=api_key, base_url=api_base, default_headers=headers) 168 | self.tools = ToolDefinitions.get_all_tools() 169 | self.debug = debug 170 | self.capabilities: Optional[APICapabilities] = None 171 | 172 | # ---------- Capability probe ---------- 173 | 174 | def _tools_to_functions(self) -> List[Dict[str, Any]]: 175 | """Convert modern tools list to legacy functions schema for fallback.""" 176 | funcs = [] 177 | for t in self.tools: 178 | if t.get("type") == "function": 179 | f = t["function"] 180 | funcs.append({ 181 | "name": f["name"], 182 | "description": f.get("description", ""), 183 | "parameters": f.get("parameters", {"type": "object", "properties": {}}) 184 | }) 185 | return funcs 186 | 187 | def _probe_capabilities(self) -> APICapabilities: 188 | caps = APICapabilities() 189 | probe_msgs = [ 190 | {"role": "system", "content": "You are a tool-calling probe."}, 191 | {"role": "user", "content": "Use the calculate tool to compute 2+2."} 192 | ] 193 | 194 | # Try tools + object tool_choice 195 | try: 196 | self.client.chat.completions.create( 197 | model=self.model, 198 | messages=probe_msgs, 199 | tools=self.tools, 200 | tool_choice={"type": "function", "function": {"name": "calculate"}}, 201 | temperature=self.temperature, 202 | max_tokens=1, 203 | timeout=10.0 204 | ) 205 | caps.supports_tools = True 206 | caps.supports_tool_choice_object = True 207 | if self.debug: 208 | print(" [Probe] tools + object tool_choice: OK") 209 | return caps 210 | except Exception as e: 211 | if self.debug: 212 | print(f" [Probe] tools + object tool_choice: FAIL ({e})") 213 | 214 | # Try tools + "required" 215 | try: 216 | self.client.chat.completions.create( 217 | model=self.model, 218 | messages=probe_msgs, 219 | tools=self.tools, 220 | tool_choice="required", 221 | temperature=self.temperature, 222 | max_tokens=1, 223 | timeout=10.0 224 | ) 225 | 
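# Reaching this line means the server accepted tool_choice="required" without raising, so record it as a usable (if weaker) fallback mode.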
caps.supports_tools = True 226 | caps.supports_tool_choice_required = True 227 | if self.debug: 228 | print(" [Probe] tools + 'required': OK") 229 | except Exception as e: 230 | if self.debug: 231 | print(f" [Probe] tools + 'required': FAIL ({e})") 232 | 233 | # Try legacy functions + function_call 234 | try: 235 | self.client.chat.completions.create( 236 | model=self.model, 237 | messages=probe_msgs, 238 | functions=self._tools_to_functions(), 239 | function_call={"name": "calculate"}, 240 | temperature=self.temperature, 241 | max_tokens=1, 242 | timeout=10.0 243 | ) 244 | caps.supports_functions = True 245 | if self.debug: 246 | print(" [Probe] legacy functions + function_call: OK") 247 | except Exception as e: 248 | if self.debug: 249 | print(f" [Probe] legacy functions + function_call: FAIL ({e})") 250 | 251 | return caps 252 | 253 | # ---------- Expected-call extraction helpers ---------- 254 | 255 | def _extract_location(self, text: str) -> Optional[str]: 256 | m = re.search(r"\b(?:in|for)\s+([A-Z][A-Za-z]+(?:[ -][A-Z][A-Za-z]+)*)", text) 257 | if not m: 258 | return None 259 | loc = m.group(1) 260 | # Filter out month/time words accidentally captured (e.g., "in July") 261 | blacklist = { 262 | "January","February","March","April","May","June","July","August","September","October","November","December", 263 | "Today","Tomorrow","Tonight","Morning","Afternoon","Evening","Weekend","Week","Noon","Midnight" 264 | } 265 | if loc.capitalize() in blacklist: 266 | return None 267 | return loc 268 | 269 | def _extract_flight_triplet(self, text: str) -> Tuple[Optional[str], Optional[str], Optional[str]]: 270 | from_city = to_city = date = None 271 | m = re.search( 272 | r"from\s+([A-Z][\w\s-]+?)\s+to\s+([A-Z][\w\s-]+?)(?:\s+on\s+([A-Za-z]+\s+\d{1,2}\w{0,2}|\d{4}-\d{2}-\d{2}))?", 273 | text, re.IGNORECASE 274 | ) 275 | if m: 276 | from_city, to_city, date = m.group(1), m.group(2), m.group(3) 277 | return from_city, to_city, date 278 | 279 | def _extract_stock_symbols(self, text: str) -> List[str]: 280 | return re.findall(r"\(([A-Z]{1,6})\)", text) 281 | 282 | def _extract_currency_triplet(self, text: str) -> Tuple[Optional[float], Optional[str], Optional[str]]: 283 | amt = None 284 | m_amt = re.search(r"\$?\s?(\d[\d,]*(?:\.\d+)?)", text) 285 | if m_amt: 286 | amt = float(m_amt.group(1).replace(",", "")) 287 | 288 | def norm(code: str) -> str: 289 | mapping = { 290 | # Core 291 | "usd": "USD", "eur": "EUR", "gbp": "GBP", "jpy": "JPY", "yen": "JPY", 292 | "pounds": "GBP", "pound": "GBP", "euros": "EUR", "euro": "EUR", "dollars": "USD", "dollar": "USD", 293 | # Extended/common in scenarios 294 | "aud": "AUD", "australian dollars": "AUD", "australian dollar": "AUD", 295 | "cad": "CAD", "canadian dollars": "CAD", "canadian dollar": "CAD", 296 | "sgd": "SGD", "singapore dollars": "SGD", "singapore dollar": "SGD", 297 | "aed": "AED", "uae dirhams": "AED", "uae dirham": "AED", "dirhams": "AED", "dirham": "AED", 298 | "cny": "CNY", "rmb": "CNY", "renminbi": "CNY", "chinese yuan": "CNY", "yuan": "CNY", 299 | "chf": "CHF", "swiss francs": "CHF", "swiss franc": "CHF", 300 | "sek": "SEK", "swedish krona": "SEK", "krona": "SEK" 301 | } 302 | return mapping.get((key := code.lower().strip()), mapping.get(key.split()[-1], code.upper()))  # fall back to the last word: "british pounds" -> "pounds" -> GBP 303 | 304 | m_from = re.search(r"\bfrom\s+([A-Za-z]{3,})\b", text) 305 | m_to = re.search(r"\bto\s+([A-Za-z]+\s+(?:pounds|dollars|euros|yen|yuan|francs|krona|dirhams)|[A-Za-z]{3,})\b", text, re.IGNORECASE)  # multi-word name first, so "Australian dollars" is not truncated to "Australian" 306 | from_ccy = norm(m_from.group(1)) if m_from else None 307 | to_ccy_raw = m_to.group(1) if m_to else None 308 | to_ccy = norm(to_ccy_raw) if to_ccy_raw else
None 309 | 310 | if amt and not from_ccy: 311 | if "$" in text: 312 | from_ccy = "USD" 313 | return amt, from_ccy, to_ccy 314 | 315 | def _extract_news_topic(self, text: str) -> Optional[str]: 316 | m = re.search(r"news\s+(?:about|on|regarding|around)\s+(.+)$", text, re.IGNORECASE) 317 | if not m: 318 | return None 319 | topic = m.group(1).strip() 320 | # Strip trailing punctuation 321 | topic = topic.rstrip(".?! ") 322 | return topic if topic else None 323 | 324 | def _extract_translate(self, text: str) -> Dict[str, Any]: 325 | # Prefer quoted text pattern 326 | m = re.search(r"translate\s+[\"'](.+?)[\"']\s+to\s+([A-Za-z ]+)", text, re.IGNORECASE) 327 | if m: 328 | return {"text": m.group(1), "target_language": m.group(2).strip()} 329 | # Fallback: only target language 330 | m2 = re.search(r"translate(?:\s+it|\s+this|\s+the message)?\s+to\s+([A-Za-z ]+)", text, re.IGNORECASE) 331 | if m2: 332 | return {"target_language": m2.group(1).strip()} 333 | return {} 334 | 335 | def _extract_calculate_expression(self, text: str) -> Optional[str]: 336 | m = re.search(r"(?:^|\b)calculate\b[:\s]*(.+)$", text, re.IGNORECASE) 337 | if m: 338 | return m.group(1).strip() 339 | return None 340 | 341 | def _stock_name_to_ticker(self, text: str) -> List[str]: 342 | mapping = { 343 | "apple": "AAPL", 344 | "microsoft": "MSFT", 345 | "google": "GOOGL", 346 | "alphabet": "GOOGL", 347 | "amazon": "AMZN", 348 | "tesla": "TSLA" 349 | } 350 | found = [] 351 | low = text.lower() 352 | for name, ticker in mapping.items(): 353 | if re.search(rf"\b{name}\b", low): 354 | found.append(ticker) 355 | # Also capture standalone uppercase tickers not in parentheses (2-6 letters) 356 | for tok in re.findall(r"\b[A-Z]{2,6}\b", text): 357 | if tok not in found: 358 | found.append(tok) 359 | return found 360 | 361 | def _extract_args_for_tool(self, name: str, text: str, context: Dict[str, Any]) -> List[ExpectedCall]: 362 | calls: List[ExpectedCall] = [] 363 | expected_args: Dict[str, Any] = {} 364 | 365 | if name == "get_weather": 366 | loc = self._extract_location(text) or context.get("last_location") 367 | if loc: 368 | expected_args["location"] = loc 369 | context["last_location"] = loc 370 | calls.append(ExpectedCall(name, expected_args, text)) 371 | return calls 372 | 373 | if name == "search_flights": 374 | f, t, d = self._extract_flight_triplet(text) 375 | if f: expected_args["from_city"] = f 376 | if t: expected_args["to_city"] = t 377 | if d: expected_args["date"] = d 378 | calls.append(ExpectedCall(name, expected_args, text)) 379 | return calls 380 | 381 | if name == "get_stock_price": 382 | symbols = self._extract_stock_symbols(text) 383 | if not symbols: 384 | symbols = self._stock_name_to_ticker(text) 385 | if symbols: 386 | for s in symbols: 387 | calls.append(ExpectedCall(name, {"symbol": s}, text)) 388 | return calls 389 | calls.append(ExpectedCall(name, {}, text)) 390 | return calls 391 | 392 | if name == "convert_currency": 393 | amt, fr, to = self._extract_currency_triplet(text) 394 | if amt is not None: expected_args["amount"] = amt 395 | if fr: expected_args["from_currency"] = fr 396 | if to: expected_args["to_currency"] = to 397 | calls.append(ExpectedCall(name, expected_args, text)) 398 | return calls 399 | 400 | if name in ("search_restaurants", "search_hotels"): 401 | loc = self._extract_location(text) or context.get("last_location") 402 | if loc: 403 | expected_args["location"] = loc 404 | context["last_location"] = loc 405 | calls.append(ExpectedCall(name, expected_args, text)) 406 | return calls 407 | 
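# translate_text: _extract_translate() prefers a quoted source string and otherwise records only the target language as a partial expectation.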
408 | if name == "translate_text": 409 | args = self._extract_translate(text) 410 | if args: 411 | calls.append(ExpectedCall(name, args, text)) 412 | return calls 413 | calls.append(ExpectedCall(name, {}, text)) 414 | return calls 415 | 416 | if name == "get_news": 417 | topic = self._extract_news_topic(text) 418 | if topic: 419 | calls.append(ExpectedCall(name, {"topic": topic}, text)) 420 | return calls 421 | calls.append(ExpectedCall(name, {}, text)) 422 | return calls 423 | 424 | if name == "set_reminder": 425 | m_time = re.search(r"\b(?:at|@)\s+(\d{1,2}:\d{2}|\d{1,2}\s?(?:am|pm)|noon|midnight)\b", text, re.IGNORECASE) 426 | if m_time: 427 | expected_args["time"] = m_time.group(1) 428 | m_msg = re.search(r"set a reminder(?:\s+to)?\s+(.+?)(?:\s+at|\s+on|$)", text, re.IGNORECASE) 429 | if m_msg: 430 | expected_args["message"] = m_msg.group(1).strip() 431 | calls.append(ExpectedCall(name, expected_args, text)) 432 | return calls 433 | 434 | if name == "calculate": 435 | expr = self._extract_calculate_expression(text) 436 | if expr: 437 | calls.append(ExpectedCall(name, {"expression": expr}, text)) 438 | return calls 439 | # As fallback, use the whole text after 'calculate' if present elsewhere 440 | calls.append(ExpectedCall(name, {}, text)) 441 | return calls 442 | 443 | # Default 444 | calls.append(ExpectedCall(name, {}, text)) 445 | return calls 446 | 447 | def _build_expected_calls_from_text(self, text: str, context: Dict[str, Any]) -> List[ExpectedCall]: 448 | calls: List[ExpectedCall] = [] 449 | low = text.lower() 450 | 451 | # 1) Explicit: capture ALL "use " mentions in order of appearance 452 | occurrences: List[Tuple[int, str]] = [] 453 | for t in self.tools: 454 | name = t["function"]["name"] 455 | for m in re.finditer(rf"use\s+(?:the\s+)?{re.escape(name)}\b", low): 456 | occurrences.append((m.start(), name)) 457 | occurrences.sort(key=lambda x: x[0]) 458 | 459 | if occurrences: 460 | # Build segments between occurrences to improve per-tool arg extraction 461 | for idx, (pos, name) in enumerate(occurrences): 462 | seg_start = pos 463 | seg_end = occurrences[idx + 1][0] if idx + 1 < len(occurrences) else len(text) 464 | seg_text = text[seg_start:seg_end] 465 | calls.extend(self._extract_args_for_tool(name, seg_text, context)) 466 | return calls 467 | 468 | # 2) Implicit fallbacks (broadened) 469 | if "weather" in low: 470 | loc = self._extract_location(text) or context.get("last_location") 471 | if loc: 472 | context["last_location"] = loc 473 | calls.append(ExpectedCall("get_weather", {"location": loc} if loc else {}, text)) 474 | 475 | if ("hotel" in low or "hotels" in low): 476 | loc = self._extract_location(text) or context.get("last_location") 477 | if loc: 478 | context["last_location"] = loc 479 | calls.append(ExpectedCall("search_hotels", {"location": loc} if loc else {}, text)) 480 | 481 | if "restaurant" in low: 482 | loc = self._extract_location(text) or context.get("last_location") 483 | if loc: 484 | context["last_location"] = loc 485 | calls.append(ExpectedCall("search_restaurants", {"location": loc} if loc else {}, text)) 486 | 487 | if "flight" in low: 488 | f, t, d = self._extract_flight_triplet(text) 489 | ec_args = {} 490 | if f: ec_args["from_city"] = f 491 | if t: ec_args["to_city"] = t 492 | if d: ec_args["date"] = d 493 | calls.append(ExpectedCall("search_flights", ec_args, text)) 494 | 495 | if "calculate" in low: 496 | expr = self._extract_calculate_expression(text) 497 | calls.append(ExpectedCall("calculate", {"expression": expr} if expr else {}, text)) 
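# The keyword fallbacks above and below are cumulative: one prompt mentioning several tool domains yields one ExpectedCall per matching tool.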
498 | 499 | if "news" in low: 500 | topic = self._extract_news_topic(text) 501 | calls.append(ExpectedCall("get_news", {"topic": topic} if topic else {}, text)) 502 | 503 | if "convert" in low: 504 | amt, fr, to = self._extract_currency_triplet(text) 505 | args = {} 506 | if amt is not None: args["amount"] = amt 507 | if fr: args["from_currency"] = fr 508 | if to: args["to_currency"] = to 509 | calls.append(ExpectedCall("convert_currency", args, text)) 510 | 511 | if "translate" in low: 512 | args = self._extract_translate(text) 513 | calls.append(ExpectedCall("translate_text", args, text)) 514 | 515 | if "stock" in low: 516 | symbols = self._extract_stock_symbols(text) 517 | if not symbols: 518 | symbols = self._stock_name_to_ticker(text) 519 | if symbols: 520 | for s in symbols: 521 | calls.append(ExpectedCall("get_stock_price", {"symbol": s}, text)) 522 | else: 523 | calls.append(ExpectedCall("get_stock_price", {}, text)) 524 | 525 | return calls 526 | 527 | def _build_expected_queue_for_scenario(self, scenario: Dict[str, Any]) -> List[ExpectedCall]: 528 | ctx: Dict[str, Any] = {} 529 | queue: List[ExpectedCall] = [] 530 | queue.extend(self._build_expected_calls_from_text(scenario["initial_prompt"], ctx)) 531 | for fu in scenario.get("follow_ups", []): 532 | queue.extend(self._build_expected_calls_from_text(fu, ctx)) 533 | return queue 534 | 535 | def _match_expected_call(self, tool_name: str, args: Dict[str, Any], expected_queue: List[ExpectedCall]) -> Tuple[bool, bool, Dict[str, Any]]: 536 | for i, exp in enumerate(expected_queue): 537 | if exp.tool_name == tool_name: 538 | exp_used = expected_queue[i] 539 | sem_ok = True 540 | for k, v in exp_used.expected_args.items(): 541 | if v in (None, ""): 542 | continue 543 | a = args.get(k) 544 | if a is None: 545 | sem_ok = False 546 | break 547 | if isinstance(v, str) and v.strip(): 548 | if v.lower() not in str(a).lower(): 549 | sem_ok = False 550 | break 551 | elif isinstance(v, (int, float)): 552 | try: 553 | sem_ok = abs(float(a) - float(v)) < 1e-6 554 | except Exception: 555 | sem_ok = False 556 | if not sem_ok: 557 | break 558 | # If no keys to check, treat as semantic miss (unless tool truly needs none) 559 | if not exp_used.expected_args: 560 | sem_ok = False 561 | # Special-case: for calculate, accept if expression contains a number 562 | if tool_name == "calculate": 563 | expr = args.get("expression") 564 | if isinstance(expr, str) and re.search(r"\d", expr): 565 | sem_ok = True 566 | if sem_ok: 567 | expected_queue.pop(i) 568 | return True, sem_ok, exp_used.expected_args 569 | return False, False, {} 570 | 571 | # ---------- Utilities ---------- 572 | 573 | def _detect_forced_tool_name(self, messages: List[Dict[str, Any]]) -> Optional[str]: 574 | last_user = None 575 | for m in reversed(messages): 576 | if m.get("role") == "user": 577 | last_user = m.get("content", "") 578 | break 579 | if not last_user: 580 | return None 581 | low = last_user.lower() 582 | for t in self.tools: 583 | name = t["function"]["name"] 584 | if f"use {name.lower()}" in low or f"use the {name.lower()}" in low: 585 | return name 586 | return None 587 | 588 | def _message_to_dict(self, message) -> Dict: 589 | """Normalize assistant message for re-sending (both tool_calls and function_call).""" 590 | try: 591 | return message.model_dump() 592 | except AttributeError: 593 | msg = {"role": "assistant"} 594 | msg["content"] = getattr(message, "content", "") or "" 595 | 596 | # tool_calls normalization 597 | tc_list = getattr(message, "tool_calls", None) 598 | 
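# Rebuild each tool call as a plain dict so the assistant turn can be replayed in the next request even when model_dump() is unavailable.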
if tc_list: 599 | norm = [] 600 | for tc in tc_list: 601 | try: 602 | norm.append({ 603 | "id": getattr(tc, "id", None), 604 | "type": getattr(tc, "type", "function"), 605 | "function": { 606 | "name": getattr(getattr(tc, "function", None), "name", None), 607 | "arguments": getattr(getattr(tc, "function", None), "arguments", "{}") 608 | } 609 | }) 610 | except Exception: 611 | pass 612 | if norm: 613 | msg["tool_calls"] = norm 614 | 615 | # function_call normalization 616 | fc = getattr(message, "function_call", None) 617 | if fc: 618 | try: 619 | msg["function_call"] = { 620 | "name": getattr(fc, "name", None), 621 | "arguments": getattr(fc, "arguments", "{}") 622 | } 623 | except Exception: 624 | pass 625 | 626 | return msg 627 | 628 | def _ensure_capabilities(self): 629 | if self.capabilities is None: 630 | print(" Probing server capabilities...") 631 | self.capabilities = self._probe_capabilities() 632 | if self.debug: 633 | print(f" Capabilities: {self.capabilities}") 634 | 635 | def _chat_request_openrouter(self, messages: List[Dict[str, Any]], forced_name: Optional[str]): 636 | """ 637 | Direct request to OpenRouter API bypassing OpenAI client to properly handle provider routing. 638 | Returns (response_obj, mode_str) matching the format of regular _chat_request. 639 | """ 640 | print(f" [OpenRouter] Sending request to OpenRouter...") 641 | 642 | # Build URL 643 | if '/v1' in self.api_base: 644 | url = f"{self.api_base}/chat/completions" 645 | else: 646 | url = f"{self.api_base}/v1/chat/completions" 647 | 648 | # Build request body with all parameters 649 | body = { 650 | "model": self.model, 651 | "messages": messages, 652 | "temperature": self.temperature, 653 | "max_tokens": 1000, 654 | "stream": False 655 | } 656 | 657 | # Add provider routing - using "require" to force specific provider 658 | if self.provider: 659 | # Use "require" to force the provider (will fail if unavailable) 660 | # Alternative: use "order" for preference with fallback 661 | body["provider"] = {"order": [self.provider], "allow_fallbacks": False} 662 | if self.debug: 663 | print(f" [OpenRouter] Requiring provider: {self.provider} (forced, no fallback)") 664 | 665 | # Add reasoning if specified 666 | if self.reasoning_effort: 667 | body["reasoning"] = { 668 | "effort": self.reasoning_effort, 669 | "max_tokens": 1000, 670 | "exclude": False, 671 | "enabled": True 672 | } 673 | 674 | # Add tools support 675 | if self.tools: 676 | body["tools"] = self.tools 677 | if forced_name: 678 | body["tool_choice"] = {"type": "function", "function": {"name": forced_name}} 679 | else: 680 | body["tool_choice"] = "auto" 681 | 682 | # Prepare headers 683 | headers = { 684 | "Authorization": f"Bearer {self.api_key}", 685 | "Content-Type": "application/json", 686 | "HTTP-Referer": "https://tool-tester", # Optional but recommended for OpenRouter 687 | "X-Title": "Tool Tester" # Optional but recommended for OpenRouter 688 | } 689 | 690 | print(f" [OpenRouter Request] URL: {url}") 691 | print(f" [OpenRouter Request] Provider in body: {body.get('provider')}") 692 | print(f" [OpenRouter Request] Body: {body}") 693 | 694 | # Retry logic for rate limiting and provider downtime 695 | max_retries = 3 696 | max_404_retries = 5 # More retries for provider downtime 697 | retry_count = 0 698 | consecutive_404s = 0 699 | base_wait_time = 5 # Start with 5 seconds 700 | base_404_wait_time = 15 # Longer waits for 404s 701 | 702 | while retry_count <= max_retries: 703 | try: 704 | # Make the request 705 | response = requests.post(url, 
json=body, headers=headers, timeout=30.0) 706 | 707 | # Check for rate limiting (429) 708 | if response.status_code == 429: 709 | retry_count += 1 710 | if retry_count > max_retries: 711 | error_detail = "" 712 | try: 713 | error_json = response.json() 714 | error_detail = f" - {error_json}" 715 | except: 716 | error_detail = f" - {response.text}" 717 | raise ValueError(f"OpenRouter API error {response.status_code} (rate limited after {max_retries} retries){error_detail}") 718 | 719 | # Calculate wait time with exponential backoff 720 | wait_time = base_wait_time * (2 ** (retry_count - 1)) 721 | 722 | # Check for Retry-After header 723 | retry_after = response.headers.get('Retry-After') 724 | if retry_after: 725 | try: 726 | wait_time = int(retry_after) 727 | print(f" [OpenRouter] Rate limited. Waiting {wait_time}s as requested by server...") 728 | except: 729 | print(f" [OpenRouter] Rate limited. Waiting {wait_time}s (exponential backoff)...") 730 | else: 731 | print(f" [OpenRouter] Rate limited. Waiting {wait_time}s (exponential backoff)...") 732 | 733 | time.sleep(wait_time) 734 | continue 735 | 736 | # Check for provider downtime (404 - No endpoints found) 737 | if response.status_code == 404: 738 | consecutive_404s += 1 739 | retry_count += 1 740 | 741 | # Check if we've hit too many 404s - abort the entire test 742 | if consecutive_404s > max_404_retries: 743 | print(f" [OpenRouter] Provider appears to be down after {max_404_retries} attempts. Aborting test.") 744 | # Set a special flag to indicate provider failure 745 | raise ValueError("PROVIDER_DOWN") 746 | 747 | if retry_count > max_retries: 748 | error_detail = "" 749 | try: 750 | error_json = response.json() 751 | error_detail = f" - {error_json}" 752 | except: 753 | error_detail = f" - {response.text}" 754 | raise ValueError(f"OpenRouter API error {response.status_code} (provider down after {max_retries} retries){error_detail}") 755 | 756 | # Longer wait for provider downtime 757 | wait_time = min(60, base_404_wait_time * consecutive_404s) # Cap at 1 minute 758 | print(f" [OpenRouter] Provider down (attempt {consecutive_404s}/{max_404_retries}). Waiting {wait_time}s...") 759 | 760 | time.sleep(wait_time) 761 | continue 762 | else: 763 | # Reset 404 counter on success or other errors 764 | consecutive_404s = 0 765 | 766 | # Check for other errors 767 | if response.status_code != 200: 768 | error_detail = "" 769 | try: 770 | error_json = response.json() 771 | error_detail = f" - {error_json}" 772 | except: 773 | error_detail = f" - {response.text}" 774 | raise ValueError(f"OpenRouter API error {response.status_code}{error_detail}") 775 | 776 | # Success - break out of retry loop 777 | break 778 | 779 | except requests.exceptions.Timeout: 780 | retry_count += 1 781 | if retry_count > max_retries: 782 | raise ValueError(f"OpenRouter request timed out after {max_retries} retries") 783 | wait_time = base_wait_time * (2 ** (retry_count - 1)) 784 | print(f" [OpenRouter] Request timeout. Retrying in {wait_time}s...") 785 | time.sleep(wait_time) 786 | continue 787 | except requests.exceptions.ConnectionError as e: 788 | retry_count += 1 789 | if retry_count > max_retries: 790 | raise ValueError(f"OpenRouter connection error after {max_retries} retries: {str(e)}") 791 | wait_time = base_wait_time * (2 ** (retry_count - 1)) 792 | print(f" [OpenRouter] Connection error. 
Retrying in {wait_time}s...") 793 | time.sleep(wait_time) 794 | continue 795 | 796 | response.raise_for_status() 797 | 798 | # Parse response 799 | response_data = response.json() 800 | 801 | # Convert to OpenAI-like response object structure 802 | # We need to create a mock object that matches what the OpenAI client returns 803 | class MockMessage: 804 | def __init__(self, data): 805 | self.content = data.get("content", "") 806 | self.role = data.get("role", "assistant") 807 | self.tool_calls = None 808 | self.function_call = None 809 | 810 | # Handle tool calls if present 811 | if "tool_calls" in data: 812 | self.tool_calls = [] 813 | for tc in data["tool_calls"]: 814 | tool_call = type('ToolCall', (), { 815 | 'id': tc.get('id'), 816 | 'type': tc.get('type', 'function'), 817 | 'function': type('Function', (), { 818 | 'name': tc.get('function', {}).get('name'), 819 | 'arguments': tc.get('function', {}).get('arguments', '{}') 820 | })() 821 | })() 822 | self.tool_calls.append(tool_call) 823 | 824 | # Handle legacy function_call if present 825 | if "function_call" in data: 826 | fc = data["function_call"] 827 | self.function_call = type('FunctionCall', (), { 828 | 'name': fc.get('name'), 829 | 'arguments': fc.get('arguments', '{}') 830 | })() 831 | 832 | class MockChoice: 833 | def __init__(self, choice_data): 834 | msg_data = choice_data.get("message", {}) 835 | self.message = MockMessage(msg_data) 836 | 837 | class MockResponse: 838 | def __init__(self, response_data): 839 | self.choices = [MockChoice(c) for c in response_data.get("choices", [])] 840 | 841 | mock_response = MockResponse(response_data) 842 | 843 | # Check if provider was actually used (OpenRouter sometimes includes this in response) 844 | if self.debug and "provider" in response_data: 845 | print(f" [OpenRouter Response] Provider used: {response_data['provider']}") 846 | 847 | return mock_response, "openrouter_direct" 848 | 849 | def _chat_request(self, messages: List[Dict[str, Any]], forced_name: Optional[str]): 850 | """ 851 | Performs a chat request using the best-supported mode based on capability probe. 852 | Returns (response, mode_str) where mode_str ∈ {"tools_object","tools_required","functions","none","openrouter_direct"} 853 | """ 854 | # Use direct OpenRouter API if this is an OpenRouter request (to properly handle provider and other params) 855 | if self.is_openrouter: 856 | return self._chat_request_openrouter(messages, forced_name) 857 | 858 | self._ensure_capabilities() 859 | caps = self.capabilities 860 | 861 | # Prefer tools + object 862 | if caps.supports_tools and caps.supports_tool_choice_object: 863 | tool_choice = {"type": "function", "function": {"name": forced_name}} if forced_name else "auto" 864 | if self.debug: 865 | print(f" [Request] mode=tools_object, forced={forced_name}") 866 | 867 | kwargs = { 868 | "model": self.model, 869 | "messages": messages, 870 | "tools": self.tools, 871 | "tool_choice": tool_choice, 872 | "temperature": self.temperature, 873 | "max_tokens": 1000, 874 | "timeout": 30.0 875 | } 876 | 877 | resp = self.client.chat.completions.create(**kwargs) 878 | return resp, "tools_object" 879 | 880 | # Next: tools + "required" 881 | if caps.supports_tools and caps.supports_tool_choice_required: 882 | # We can't force the exact tool name, but "required" nudges the model to call a tool. 
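# The user prompt itself names the tool (that is how forced_name was detected), so "required" plus that textual hint is the closest available approximation.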
883 | if self.debug: 884 | print(f" [Request] mode=tools_required, forced={forced_name} (hinted via text)") 885 | 886 | kwargs = { 887 | "model": self.model, 888 | "messages": messages, 889 | "tools": self.tools, 890 | "tool_choice": "required" if forced_name else "auto", 891 | "temperature": self.temperature, 892 | "max_tokens": 1000, 893 | "timeout": 30.0 894 | } 895 | 896 | resp = self.client.chat.completions.create(**kwargs) 897 | return resp, "tools_required" 898 | 899 | # Legacy functions 900 | if caps.supports_functions: 901 | kwargs = { 902 | "model": self.model, 903 | "messages": messages, 904 | "functions": self._tools_to_functions(), 905 | "temperature": self.temperature, 906 | "max_tokens": 1000, 907 | "timeout": 30.0 908 | } 909 | if forced_name: 910 | kwargs["function_call"] = {"name": forced_name} 911 | 912 | if self.debug: 913 | print(f" [Request] mode=functions, forced={forced_name}") 914 | resp = self.client.chat.completions.create(**kwargs) 915 | return resp, "functions" 916 | 917 | # No tool support detected 918 | if self.debug: 919 | print(" [Request] mode=none (no tool support detected)") 920 | 921 | kwargs = { 922 | "model": self.model, 923 | "messages": messages, 924 | "temperature": self.temperature, 925 | "max_tokens": 1000, 926 | "timeout": 30.0 927 | } 928 | 929 | resp = self.client.chat.completions.create(**kwargs) 930 | return resp, "none" 931 | 932 | # ---------- Scenario runner ---------- 933 | 934 | def run_scenario(self, scenario: Dict[str, Any]) -> TestResult: 935 | start_time = time.time() 936 | conversation_log = [] 937 | tool_calls_made: List[str] = [] 938 | tool_call_details: List[ToolCallResult] = [] 939 | 940 | expected_queue = self._build_expected_queue_for_scenario(scenario) 941 | total_expected_count = len(expected_queue) 942 | 943 | try: 944 | messages = [ 945 | { 946 | "role": "system", 947 | "content": ( 948 | "You are a helpful assistant. When the user explicitly names a tool to use, " 949 | "you MUST call that tool and MUST NOT fabricate results. When a tool is available " 950 | "that directly answers the user's request (e.g., weather, flights, hotels, stocks, currency), " 951 | "prefer calling the tool over answering from prior knowledge. " 952 | "Do not ignore available tools." 
934 |     def run_scenario(self, scenario: Dict[str, Any]) -> TestResult:
935 |         start_time = time.time()
936 |         conversation_log = []
937 |         tool_calls_made: List[str] = []
938 |         tool_call_details: List[ToolCallResult] = []
939 | 
940 |         expected_queue = self._build_expected_queue_for_scenario(scenario)
941 |         total_expected_count = len(expected_queue)
942 | 
943 |         try:
944 |             messages = [
945 |                 {
946 |                     "role": "system",
947 |                     "content": (
948 |                         "You are a helpful assistant. When the user explicitly names a tool to use, "
949 |                         "you MUST call that tool and MUST NOT fabricate results. When a tool is available "
950 |                         "that directly answers the user's request (e.g., weather, flights, hotels, stocks, currency), "
951 |                         "prefer calling the tool over answering from prior knowledge. "
952 |                         "Do not ignore available tools."
953 |                     ),
954 |                 },
955 |                 {"role": "user", "content": scenario["initial_prompt"]}
956 |             ]
957 |             conversation_log.append({"role": "user", "content": scenario["initial_prompt"]})
958 | 
959 |             all_prompts = [scenario["initial_prompt"]] + scenario.get("follow_ups", [])
960 |             user_prompt_index = 0
961 |             max_turns = max(6, (len(all_prompts) + total_expected_count) * 2)
962 | 
963 |             for _ in range(max_turns):
964 |                 forced_name = self._detect_forced_tool_name(messages)
965 |                 response, mode = self._chat_request(messages, forced_name)
966 | 
967 |                 if not response.choices:
968 |                     raise ValueError("No response choices")
969 | 
970 |                 assistant_message = response.choices[0].message
971 |                 messages.append(self._message_to_dict(assistant_message))
972 | 
973 |                 if assistant_message.content:
974 |                     conversation_log.append({"role": "assistant", "content": assistant_message.content})
975 | 
976 |                 did_tool_something = False
977 | 
978 |                 # --- Modern tools path: multiple tool_calls possible ---
979 |                 if hasattr(assistant_message, "tool_calls") and assistant_message.tool_calls:
980 |                     did_tool_something = True
981 |                     for tc in assistant_message.tool_calls:
982 |                         tool_name = tc.function.name
983 |                         tool_calls_made.append(tool_name)
984 | 
985 |                         try:
986 |                             args = json.loads(tc.function.arguments or "{}")
987 |                         except Exception:
988 |                             args = {}
989 | 
990 |                         params_valid = ToolDefinitions.validate_parameters(tool_name, args)
991 | 
992 |                         try:
993 |                             result = ToolDefinitions.execute_tool(tool_name, args)
994 |                             exec_ok = True
995 |                             exec_err = None
996 |                         except Exception as e:
997 |                             result = f"Error executing {tool_name}: {str(e)}"
998 |                             exec_ok = False
999 |                             exec_err = str(e)
1000 | 
1001 |                         is_expected, sem_ok, exp_args = self._match_expected_call(tool_name, args, expected_queue)
1002 | 
1003 |                         tool_call_details.append(ToolCallResult(
1004 |                             tool_name=tool_name,
1005 |                             expected=is_expected,
1006 |                             parameters_correct=params_valid,
1007 |                             execution_successful=exec_ok,
1008 |                             semantic_match=sem_ok if is_expected else False,
1009 |                             actual_args=args,
1010 |                             expected_args=exp_args,
1011 |                             llm_initiated=True,
1012 |                             error=exec_err
1013 |                         ))
1014 | 
1015 |                         # Return tool result (modern format)
1016 |                         messages.append({
1017 |                             "role": "tool",
1018 |                             "tool_call_id": tc.id,
1019 |                             "name": tool_name,
1020 |                             "content": result
1021 |                         })
1022 |                         conversation_log.append({"role": "tool", "name": tool_name, "content": result if len(result) < 200 else result[:197] + "..."})
1023 | 
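                # The legacy branch below mirrors the modern path above; the only real
                # difference is the wire format of the returned result message:
                #   modern: {"role": "tool", "tool_call_id": "call_1", "name": "calculate", "content": "141.17"}
                #   legacy: {"role": "function", "name": "calculate", "content": "141.17"}
                # (values illustrative; legacy messages carry no tool_call_id)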
1024 |                 # --- Legacy functions path: single function_call ---
1025 |                 elif hasattr(assistant_message, "function_call") and assistant_message.function_call:
1026 |                     did_tool_something = True
1027 |                     fc = assistant_message.function_call
1028 |                     tool_name = fc.name
1029 |                     tool_calls_made.append(tool_name)
1030 | 
1031 |                     try:
1032 |                         args = json.loads(fc.arguments or "{}")
1033 |                     except Exception:
1034 |                         args = {}
1035 | 
1036 |                     params_valid = ToolDefinitions.validate_parameters(tool_name, args)
1037 | 
1038 |                     try:
1039 |                         result = ToolDefinitions.execute_tool(tool_name, args)
1040 |                         exec_ok = True
1041 |                         exec_err = None
1042 |                     except Exception as e:
1043 |                         result = f"Error executing {tool_name}: {str(e)}"
1044 |                         exec_ok = False
1045 |                         exec_err = str(e)
1046 | 
1047 |                     is_expected, sem_ok, exp_args = self._match_expected_call(tool_name, args, expected_queue)
1048 | 
1049 |                     tool_call_details.append(ToolCallResult(
1050 |                         tool_name=tool_name,
1051 |                         expected=is_expected,
1052 |                         parameters_correct=params_valid,
1053 |                         execution_successful=exec_ok,
1054 |                         semantic_match=sem_ok if is_expected else False,
1055 |                         actual_args=args,
1056 |                         expected_args=exp_args,
1057 |                         llm_initiated=True,
1058 |                         error=exec_err
1059 |                     ))
1060 | 
1061 |                     # Return tool result (legacy format uses role=function)
1062 |                     messages.append({
1063 |                         "role": "function",
1064 |                         "name": tool_name,
1065 |                         "content": result
1066 |                     })
1067 |                     conversation_log.append({"role": "function", "name": tool_name, "content": result if len(result) < 200 else result[:197] + "..."})
1068 | 
1069 |                 # No emulation: if a forced tool wasn't called, we proceed without executing it.
1070 | 
1071 |                 # If a tool was called this turn, let the loop iterate again to allow follow-ups.
1072 |                 if did_tool_something:
1073 |                     continue
1074 | 
1075 |                 # No tool call this turn: push the next follow-up if available
1076 |                 if user_prompt_index < len(scenario.get("follow_ups", [])):
1077 |                     next_prompt = scenario["follow_ups"][user_prompt_index]
1078 |                     messages.append({"role": "user", "content": next_prompt})
1079 |                     conversation_log.append({"role": "user", "content": next_prompt})
1080 |                     user_prompt_index += 1
1081 |                     continue
1082 | 
1083 |                 # No more prompts, break
1084 |                 break
1085 | 
1086 |             # Evaluate success
1087 |             matched_calls_llm = sum(1 for tc in tool_call_details if tc.llm_initiated and tc.expected and tc.parameters_correct and tc.semantic_match)
1088 |             structural_ok_llm = sum(1 for tc in tool_call_details if tc.llm_initiated and tc.parameters_correct)
1089 | 
1090 |             success = (
1091 |                 (total_expected_count == 0 or matched_calls_llm >= max(1, int(0.7 * total_expected_count)))
1092 |                 and (structural_ok_llm >= int(0.6 * max(1, sum(1 for tc in tool_call_details if tc.llm_initiated))))
1093 |             )
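            # Worked example of the rule above (numbers illustrative): with 4 expected
            # calls, the first clause needs matched_calls_llm >= max(1, int(0.7 * 4)) = 2;
            # with 5 LLM-initiated calls, the second needs structural_ok_llm >= int(0.6 * 5) = 3.
            # Scenarios with no expected calls satisfy the first clause automatically.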
1094 | 
1095 |             execution_time = time.time() - start_time
1096 | 
1097 |             return TestResult(
1098 |                 scenario_name=scenario["name"],
1099 |                 description=scenario["description"],
1100 |                 conversation_turns=len([m for m in conversation_log if m["role"] in ["user", "assistant"]]),
1101 |                 tool_calls_made=tool_calls_made,
1102 |                 tool_call_details=tool_call_details,
1103 |                 expected_tool_types=scenario["expected_tools"],
1104 |                 expected_tool_call_count=total_expected_count,
1105 |                 success=success,
1106 |                 execution_time=execution_time,
1107 |                 conversation_log=conversation_log
1108 |             )
1109 | 
1110 |         except Exception as e:
1111 |             execution_time = time.time() - start_time
1112 |             # Check if this is a provider-down error
1113 |             if str(e) == "PROVIDER_DOWN":
1114 |                 print(" Provider down - skipping remaining tests")
1115 |                 return TestResult(
1116 |                     scenario_name=scenario["name"],
1117 |                     description=scenario["description"],
1118 |                     conversation_turns=0,
1119 |                     tool_calls_made=[],
1120 |                     tool_call_details=[],
1121 |                     expected_tool_types=scenario["expected_tools"],
1122 |                     expected_tool_call_count=total_expected_count,
1123 |                     success=False,
1124 |                     error="Provider unavailable",
1125 |                     execution_time=execution_time,
1126 |                     conversation_log=[]
1127 |                 )
1128 | 
1129 |             return TestResult(
1130 |                 scenario_name=scenario["name"],
1131 |                 description=scenario["description"],
1132 |                 conversation_turns=len([m for m in conversation_log if m["role"] in ["user", "assistant"]]),
1133 |                 tool_calls_made=tool_calls_made,
1134 |                 tool_call_details=tool_call_details,
1135 |                 expected_tool_types=scenario["expected_tools"],
1136 |                 expected_tool_call_count=total_expected_count,
1137 |                 success=False,
1138 |                 error=str(e),
1139 |                 execution_time=execution_time,
1140 |                 conversation_log=conversation_log
1141 |             )
1142 | 
1143 |     # ------------------------ Suite & Report ------------------------
1144 | 
1145 |     def run_test_suite(self, suite_name: str, scenarios: List[Dict[str, Any]]) -> TestSuite:
1146 |         suite = TestSuite(name=suite_name)
1147 |         provider_down = False
1148 | 
1149 |         for scenario in scenarios:
1150 |             print(f"\n Testing: {scenario['name']} - {scenario['description']}")
1151 |             result = self.run_scenario(scenario)
1152 |             suite.results.append(result)
1153 | 
1154 |             status = "PASS" if result.success else "FAIL"
1155 |             print(f" {status} Completed: {len(result.tool_calls_made)} tool calls in {result.execution_time:.2f}s")
1156 |             if result.error:
1157 |                 # Handle Unicode encoding issues on Windows
1158 |                 try:
1159 |                     print(f" Error: {result.error}")
1160 |                 except UnicodeEncodeError:
1161 |                     # Replace problematic characters for console output
1162 |                     safe_error = result.error.encode('ascii', 'replace').decode('ascii')
1163 |                     print(f" Error: {safe_error}")
1164 | 
1165 |             # Check if provider is down - if so, create empty results for remaining scenarios
1166 |             if result.error == "Provider unavailable":
1167 |                 provider_down = True
1168 |                 print(" Provider is down - creating empty results for remaining scenarios")
1169 |                 break
1170 | 
1171 |         # If provider is down, create zero results for remaining scenarios
1172 |         if provider_down:
1173 |             remaining_scenarios = scenarios[len(suite.results):]
1174 |             for scenario in remaining_scenarios:
1175 |                 try:
1176 |                     exp_queue = self._build_expected_queue_for_scenario(scenario)
1177 |                     expected_count = len(exp_queue)
1178 |                 except Exception:
1179 |                     expected_count = len(scenario.get("expected_tools", [])) + len(scenario.get("follow_ups", []))
1180 |                 empty_result = TestResult(
1181 |                     scenario_name=scenario["name"],
1182 |                     description=scenario["description"],
1183 |                     conversation_turns=0,
1184 |                     tool_calls_made=[],
1185 |                     tool_call_details=[],
1186 |                     expected_tool_types=scenario["expected_tools"],
1187 |                     expected_tool_call_count=expected_count,
1188 |                     success=False,
1189 |                     error="Provider unavailable",
1190 |                     execution_time=0.0,
1191 |                     conversation_log=[]
1192 |                 )
1193 |                 suite.results.append(empty_result)
1194 | 
1195 |         return suite
1196 | 
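    # The report below scores tool calls with standard precision/recall. A quick
    # illustrative example: 8 expected calls, 10 LLM-initiated attempts, 6 of them
    # matching an expected call with correct parameters:
    #   precision = 6/10 = 0.60, recall = 6/8 = 0.75
    #   F1 = 2 * 0.60 * 0.75 / (0.60 + 0.75) = 0.90 / 1.35 ~= 0.667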
1197 |     def generate_report(self, suites: List[TestSuite]) -> str:
1198 |         def clamp01(x: float) -> float:
1199 |             return 0.0 if x <= 0 else (1.0 if x >= 1 else x)
1200 | 
1201 |         report = []
1202 |         report.append("\n" + "="*60)
1203 |         report.append("LLM NATURAL TOOL CALLING TEST REPORT")
1204 |         report.append("="*60)
1205 |         report.append(f"Model: {self.model}")
1206 |         report.append(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
1207 |         report.append("")
1208 | 
1209 |         total_tests = sum(len(suite.results) for suite in suites)
1210 |         total_success = sum(sum(1 for r in suite.results if r.success) for suite in suites)
1211 | 
1212 |         # Totals
1213 |         total_expected = sum(r.expected_tool_call_count for suite in suites for r in suite.results)
1214 |         total_attempted_all = sum(len(r.tool_call_details) for suite in suites for r in suite.results)
1215 |         total_attempted_llm = sum(
1216 |             sum(1 for tc in r.tool_call_details if tc.llm_initiated)
1217 |             for suite in suites for r in suite.results
1218 |         )
1219 |         total_matched_llm = sum(
1220 |             sum(
1221 |                 1 for tc in r.tool_call_details
1222 |                 if tc.llm_initiated and tc.expected and tc.parameters_correct and tc.semantic_match
1223 |             )
1224 |             for suite in suites for r in suite.results
1225 |         )
1226 |         total_params_correct = sum(
1227 |             sum(1 for tc in r.tool_call_details if tc.parameters_correct)
1228 |             for suite in suites for r in suite.results
1229 |         )
1230 |         total_execution_success = sum(
1231 |             sum(1 for tc in r.tool_call_details if tc.execution_successful)
1232 |             for suite in suites for r in suite.results
1233 |         )
1234 |         total_semantic_match = sum(
1235 |             sum(1 for tc in r.tool_call_details if tc.semantic_match)
1236 |             for suite in suites for r in suite.results
1237 |         )
1238 | 
1239 |         total_params_correct_llm = sum(
1240 |             sum(1 for tc in r.tool_call_details if tc.llm_initiated and tc.parameters_correct)
1241 |             for suite in suites for r in suite.results
1242 |         )
1243 |         total_execution_success_llm = sum(
1244 |             sum(1 for tc in r.tool_call_details if tc.llm_initiated and tc.execution_successful)
1245 |             for suite in suites for r in suite.results
1246 |         )
1247 |         total_semantic_match_llm = sum(
1248 |             sum(1 for tc in r.tool_call_details if tc.llm_initiated and tc.semantic_match)
1249 |             for suite in suites for r in suite.results
1250 |         )
1251 | 
1252 |         # Rates (all bounded 0..100 where applicable)
1253 |         success_rate = (total_success / total_tests * 100.0) if total_tests > 0 else 0.0
1254 | 
1255 |         precision = (total_matched_llm / total_attempted_llm) if total_attempted_llm > 0 else 0.0
1256 |         recall = (total_matched_llm / total_expected) if total_expected > 0 else 0.0
1257 |         f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
1258 | 
1259 |         param_success_rate = (total_params_correct / total_attempted_all * 100.0) if total_attempted_all > 0 else 0.0
1260 |         execution_success_rate = (total_execution_success / total_attempted_all * 100.0) if total_attempted_all > 0 else 0.0
1261 |         semantic_match_rate = (total_semantic_match / total_attempted_all * 100.0) if total_attempted_all > 0 else 0.0
1262 | 
1263 |         param_success_rate_llm = (total_params_correct_llm / total_attempted_llm * 100.0) if total_attempted_llm > 0 else 0.0
1264 |         execution_success_rate_llm = (total_execution_success_llm / total_attempted_llm * 100.0) if total_attempted_llm > 0 else 0.0
1265 |         semantic_match_rate_llm = (total_semantic_match_llm / total_attempted_llm * 100.0) if total_attempted_llm > 0 else 0.0
1266 | 
1267 |         # Diagnostic factor (can be > 100%, do NOT use in scoring)
1268 |         overcall_factor = (total_attempted_all / total_expected) if total_expected > 0 else 0.0  # e.g. x3.76
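        # Illustrative reading: 15 attempted calls against 4 expected gives x3.75,
        # i.e. heavy over-calling. This factor is for diagnosis only; the soft
        # penalty folded into the final score is computed separately further down.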
1269 |         overcall_disp = f"x{overcall_factor:.2f}" if total_expected > 0 else "N/A"
1270 | 
1271 |         report.append("OVERALL SUMMARY")
1272 |         report.append("-"*40)
1273 |         report.append(f"Total Scenarios: {total_tests}")
1274 |         report.append(f"Successful Scenarios: {total_success}")
1275 |         report.append(f"Failed Scenarios: {total_tests - total_success}")
1276 |         report.append(f"Scenario Success Rate: {success_rate:.1f}%")
1277 |         report.append("")
1278 |         report.append("TOOL CALL STATISTICS")
1279 |         report.append("-"*25)
1280 |         report.append(f"Expected Tool Calls: {total_expected}")
1281 |         report.append(f"Attempted Calls (LLM + Emulated): {total_attempted_all}")
1282 |         report.append(f"Attempted Calls (LLM only): {total_attempted_llm}")
1283 |         report.append(f"Matched Expected (LLM only): {total_matched_llm}")
1284 |         report.append(f"Tool Precision (LLM only): {clamp01(precision)*100:.1f}%")
1285 |         report.append(f"Tool Recall (LLM only): {clamp01(recall)*100:.1f}%")
1286 |         report.append(f"Tool F1 (LLM only): {clamp01(f1)*100:.1f}%")
1287 |         report.append(f"Over/Under-call factor (diagnostic): {overcall_disp}")
1288 | 
1289 |         # Emulation rate (non-zero only when some attempted calls were not LLM-initiated)
1290 |         emulation_rate = ((total_attempted_all - total_attempted_llm) / total_attempted_all * 100.0) if total_attempted_all > 0 else 0.0
1291 |         report.append(f"Emulation rate: {emulation_rate:.1f}%")
1292 |         report.append(f"Parameter Accuracy (structural): {param_success_rate:.1f}%")
1293 |         report.append(f"Parameter Accuracy (LLM only): {param_success_rate_llm:.1f}%")
1294 |         report.append(f"Parameter Accuracy (semantic): {semantic_match_rate:.1f}%")
1295 |         report.append(f"Parameter Accuracy (semantic, LLM only): {semantic_match_rate_llm:.1f}%")
1296 |         report.append(f"Execution Success Rate: {execution_success_rate:.1f}%")
1297 |         report.append(f"Execution Success Rate (LLM only): {execution_success_rate_llm:.1f}%")
1298 |         report.append("")
1299 | 
1300 |         # Per-suite block
1301 |         for suite in suites:
1302 |             report.append(f"\n{suite.name.upper()}")
1303 |             report.append("-"*40)
1304 |             report.append(f"Scenarios: {len(suite.results)}")
1305 |             report.append(f"Success Rate: {suite.success_rate:.1f}%")
1306 |             report.append(f"Total Tool Calls: {suite.total_tool_calls}")
1307 | 
1308 |             report.append("\nScenario Details:")
1309 |             for result in suite.results:
1310 |                 status = "PASS" if result.success else "FAIL"
1311 |                 attempted_all = len(result.tool_call_details)
1312 |                 attempted_llm = sum(1 for tc in result.tool_call_details if tc.llm_initiated)
1313 |                 matched_llm = sum(1 for tc in result.tool_call_details if tc.llm_initiated and tc.expected)
1314 | 
1315 |                 scn_prec = (matched_llm / attempted_llm) if attempted_llm > 0 else 0.0
1316 |                 scn_recall = (matched_llm / result.expected_tool_call_count) if result.expected_tool_call_count > 0 else 0.0
1317 |                 scn_f1 = (2 * scn_prec * scn_recall / (scn_prec + scn_recall)) if (scn_prec + scn_recall) > 0 else 0.0
1318 |                 scn_over = (attempted_all / result.expected_tool_call_count) if result.expected_tool_call_count > 0 else 0.0
1319 |                 scn_over_disp = f"x{scn_over:.2f}" if result.expected_tool_call_count > 0 else "N/A"
1320 | 
1321 |                 report.append(f"\n [{status}] {result.scenario_name}: {result.description}")
1322 |                 report.append(f" Conversation turns: {result.conversation_turns}")
1323 |                 report.append(f" Tool calls made: {attempted_all} (expected {result.expected_tool_call_count})")
1324 |                 report.append(f" LLM-initiated: {attempted_llm} | Emulated: {attempted_all - attempted_llm}")
1325 |                 report.append(f" Tool P/R/F1 (LLM): {clamp01(scn_prec)*100:.1f}% / {clamp01(scn_recall)*100:.1f}% / {clamp01(scn_f1)*100:.1f}%")
1326 |                 report.append(f" Over/Under-call factor: {scn_over_disp}")
1327 |                 report.append(f" Execution time: {result.execution_time:.2f}s")
1328 | 
1329 |                 if result.tool_calls_made:
1330 |                     tools_summary = {}
1331 |                     for tool in result.tool_calls_made:
1332 |                         tools_summary[tool] = tools_summary.get(tool, 0) + 1
1333 |                     report.append(f" Tools used: {', '.join(f'{k}({v})' for k, v in tools_summary.items())}")
1334 | 
1335 |                 if result.error:
1336 |                     # Replace Unicode characters that might cause console issues ('\u2605' is '★')
1337 |                     safe_error = result.error.replace('\u2605', '*')
1338 |                     report.append(f" Error: {safe_error}")
1339 | 
1340 |         # Weighted complexity score (unchanged)
1341 |         report.append("\n" + "="*60)
1342 |         report.append("FINAL SCORE")
1343 |         report.append("="*60)
1344 | 
1345 |         weights = {"Extra Large": 0.25, "Simple": 0.2, "Medium": 0.25, "Large": 0.3}  # "Extra Large" first: "Large" would otherwise substring-match it
1346 |         weighted_score = 0.0
1347 |         weight_total = 0.0
1348 |         for suite in suites:
1349 |             for k, w in weights.items():
1350 |                 if k.lower() in suite.name.lower():
1351 |                     weighted_score += suite.success_rate * w
1352 |                     weight_total += w
1353 |                     break
1354 |         if weight_total == 0:
1355 |             weighted_score = success_rate  # fallback
1356 |         else:
1357 |             weighted_score = weighted_score / weight_total
1358 | 
1359 |         # Use bounded metrics only
1360 |         overall_score = (
1361 |             clamp01(success_rate/100.0) +
1362 |             clamp01(f1) +  # already in 0..1
1363 |             clamp01(param_success_rate_llm/100.0) +
1364 |             clamp01(execution_success_rate_llm/100.0) +
1365 |             clamp01(semantic_match_rate_llm/100.0)
1366 |         ) / 5.0 * 100.0
1367 | 
1368 |         # Optional over-call penalty (soft)
1369 |         penalty = 1.0
1370 |         if overcall_factor > 1.0:
1371 |             # soft rational damping so small over-calls don't tank the score
1372 |             penalty = 1.0 / (1.0 + 0.25 * max(0.0, (overcall_factor - 1.0)))
1373 |         overall_score *= penalty
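        # Illustrative arithmetic (components assumed, not from a real run): with
        # 80% scenario success, F1 0.70, params 90%, execution 95%, semantic 85%,
        # the mean is (0.80 + 0.70 + 0.90 + 0.95 + 0.85) / 5 = 0.84 -> 84.0%.
        # An overcall_factor of 2.0 gives penalty = 1 / (1 + 0.25 * 1.0) = 0.8,
        # so the reported OVERALL SCORE would be 84.0 * 0.8 = 67.2% (grade B-).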
1374 | 
1375 |         def grade(score):
1376 |             if score >= 90: return "A+"
1377 |             if score >= 85: return "A"
1378 |             if score >= 80: return "A-"
1379 |             if score >= 75: return "B+"
1380 |             if score >= 70: return "B"
1381 |             if score >= 65: return "B-"
1382 |             if score >= 60: return "C+"
1383 |             if score >= 55: return "C"
1384 |             if score >= 50: return "C-"
1385 |             if score >= 40: return "D"
1386 |             return "F"
1387 | 
1388 |         report.append(f"Scenario Success Rate: {success_rate:.1f}%")
1389 |         report.append(f"Tool Precision (LLM only): {clamp01(precision)*100:.1f}%")
1390 |         report.append(f"Tool Recall (LLM only): {clamp01(recall)*100:.1f}%")
1391 |         report.append(f"Tool F1 (LLM only): {clamp01(f1)*100:.1f}%")
1392 |         report.append(f"Parameter Accuracy (structural): {param_success_rate:.1f}%")
1393 |         report.append(f"Parameter Accuracy (LLM only): {param_success_rate_llm:.1f}%")
1394 |         report.append(f"Parameter Accuracy (semantic): {semantic_match_rate:.1f}%")
1395 |         report.append(f"Parameter Accuracy (semantic, LLM only): {semantic_match_rate_llm:.1f}%")
1396 |         report.append(f"Execution Success Rate: {execution_success_rate:.1f}%")
1397 |         report.append(f"Execution Success Rate (LLM only): {execution_success_rate_llm:.1f}%")
1398 |         report.append(f"Weighted Complexity Score: {weighted_score:.1f}%")
1399 |         report.append("")
1400 |         report.append(f"OVERALL SCORE: {overall_score:.1f}% ({grade(overall_score)})")
1401 |         report.append("\n" + "="*60)
1402 |         return "\n".join(report)
1403 | 
1404 | 
1405 | 
1406 | # ------------------------ CLI ------------------------
1407 | 
1408 | def main():
1409 |     parser = argparse.ArgumentParser(
1410 |         description="Test LLM natural tool calling capabilities",
1411 |         formatter_class=argparse.RawDescriptionHelpFormatter,
1412 |         epilog="""
1413 | Examples:
1414 |   %(prog)s --api-base https://api.openai.com/v1 --api-key sk-xxx --model gpt-4o
1415 |   %(prog)s --api-base http://localhost:8000/v1 --api-key local --model llama-70b
1416 |   %(prog)s --api-base https://openrouter.ai/api/v1 --api-key sk-or-xxx --model anthropic/claude-3.5-sonnet --provider Anthropic
1417 | """
1418 |     )
1419 |     parser.add_argument("--api-base", required=True, help="API base URL (e.g., https://api.openai.com/v1)")
1420 |     parser.add_argument("--api-key", required=True, help="API key for authentication")
1421 |     parser.add_argument("--model", required=True, help="Model name to test")
1422 |     parser.add_argument("--provider", help="OpenRouter provider (e.g., 'Anthropic', 'OpenAI')")
1423 |     parser.add_argument("--temperature", type=float, help="Temperature for text generation (default: 0.1)")
1424 |     parser.add_argument("--reasoning-effort", choices=['low', 'medium', 'high'], help="OpenRouter reasoning effort level")
1425 |     parser.add_argument("--max-tools", type=int, default=40, help="Maximum number of tool calls to test (default: 40)")
1426 |     parser.add_argument("--output", help="Output file for the report (optional)")
1427 |     parser.add_argument("--debug", action="store_true", help="Enable debug output")
1428 |     parser.add_argument("--quick", action="store_true", help="Run only simple tests (faster)")
1429 |     args = parser.parse_args()
1430 | 
1431 |     print("\nInitializing Natural Tool Calling Tester...")
1432 |     print(f"API Base: {args.api_base}")
1433 |     print(f"Model: {args.model}")
1434 |     if args.provider:
1435 |         print(f"Provider: {args.provider}")
1436 |     if args.temperature is not None:
1437 |         print(f"Temperature: {args.temperature}")
1438 |     if args.reasoning_effort:
1439 |         print(f"Reasoning Effort: {args.reasoning_effort}")
1440 |     print(f"Max Tools: {args.max_tools}")
1441 | 
1442 |     tester = LLMToolTester(
1443 |         args.api_base,
1444 |         args.api_key,
1445 |         args.model,
1446 |         args.debug,
1447 |         provider=args.provider,
1448 |         temperature=args.temperature,
1449 |         reasoning_effort=args.reasoning_effort
1450 |     )
1451 |     suites: List[TestSuite] = []
1452 | 
1453 |     all_scenarios = TestScenarios.get_scenarios()
1454 | 
1455 |     if args.quick:
1456 |         print("\nRunning Quick Test Suite (1-5 tools)...")
1457 |         simple_scenarios = [s for s in all_scenarios if len(s["expected_tools"]) + len(s.get("follow_ups", [])) <= 5]
1458 |         suite = tester.run_test_suite("Quick Tests (1-5 tools)", simple_scenarios[:3])
1459 |         suites.append(suite)
1460 |     else:
1461 |         if args.max_tools >= 5:
1462 |             print("\nRunning Simple Test Suite (1-5 tools)...")
1463 |             simple_scenarios = TestScenarios.get_scenario_by_complexity(1, 5)
1464 |             suite = tester.run_test_suite("Simple Tests (1-5 tools)", simple_scenarios)
1465 |             suites.append(suite)
1466 | 
1467 |         if args.max_tools >= 10:
1468 |             print("\nRunning Medium Test Suite (6-10 tools)...")
1469 |             medium_scenarios = TestScenarios.get_scenario_by_complexity(6, 10)
1470 |             suite = tester.run_test_suite("Medium Tests (6-10 tools)", medium_scenarios)
1471 |             suites.append(suite)
1472 | 
1473 |         if args.max_tools >= 20:
1474 |             print("\nRunning Large Test Suite (11-20 tools)...")
1475 |             large_scenarios = TestScenarios.get_scenario_by_complexity(11, 20)
1476 |             suite = tester.run_test_suite("Large Tests (11-20 tools)", large_scenarios)
1477 |             suites.append(suite)
1478 | 
1479 |         if args.max_tools >= 40:
1480 |             print("\nRunning Extra Large Test Suite (21+ tools)...")
1481 |             xl_scenarios = TestScenarios.get_scenario_by_complexity(21, 50)
1482 |             suite = tester.run_test_suite("Extra Large Tests (21+ tools)", xl_scenarios)
1483 |             suites.append(suite)
1484 | 
1485 |     report = tester.generate_report(suites)
1486 |     print(report)
1487 | 
1488 |     if args.output:
1489 |         with open(args.output, 'w', encoding='utf-8') as f:
1490 |             f.write(report)
1491 |         print(f"\nReport saved to: {args.output}")
1492 | 
1493 |     overall_success = sum(sum(1 for r in suite.results if r.success) for suite in suites)
1494 |     overall_total = sum(len(suite.results) for suite in suites)
1495 |     success_rate = (overall_success / overall_total * 100) if overall_total > 0 else 0
1496 |     return 0 if success_rate >= 70 else 1
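    # The exit code makes the tester scriptable: wrapper scripts and CI jobs can
    # treat a sub-70% scenario success rate as failure, e.g. (values illustrative)
    #   python3 tool_tester_v2.py --api-base URL --api-key KEY --model NAME || echo "below threshold"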
1497 | 
1498 | 
1499 | if __name__ == "__main__":
1500 |     sys.exit(main())
1501 | 
--------------------------------------------------------------------------------