├── requirements.txt
├── .gitignore
├── test_debug.py
├── test_kimi_k2_providers.sh
├── test_qwen_coder_providers.sh
├── test_deepseek_versions.sh
├── README.md
├── test_deepseek_fireworks.sh
├── test_generic_model.sh
├── tool_definitions.py
├── test_scenarios.json
└── tool_tester_v2.py

/requirements.txt:
--------------------------------------------------------------------------------
1 | openai>=1.0.0
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.so
6 | .Python
7 | env/
8 | venv/
9 | .venv
10 | pip-log.txt
11 | pip-delete-this-directory.txt
12 | 
13 | # Test results directories
14 | tool_test*/
15 | tool_test_results*/
16 | deepseek_comparison*/
17 | deepseek_fireworks_comparison*/
18 | test_results*/
19 | 
20 | # Log files
21 | *.log
22 | 
23 | # CSV and result files
24 | results.csv
25 | summary.txt
26 | failed_providers.txt
27 | 
28 | # IDE
29 | .vscode/
30 | .idea/
31 | *.swp
32 | *.swo
33 | *~
34 | 
35 | # OS
36 | .DS_Store
37 | Thumbs.db
38 | desktop.ini
39 | 
40 | # Temporary files
41 | *.tmp
42 | *.bak
43 | *.backup
44 | bash.exe.stackdump
45 | 
46 | # API keys (never commit these!)
47 | .env
48 | *.key
49 | api_keys.txt
50 | config.ini
51 | 
52 | # Jupyter
53 | .ipynb_checkpoints/
54 | *.ipynb
55 | 
56 | .claude
57 | __pycache__
--------------------------------------------------------------------------------
/test_debug.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Debug tool to see raw API responses
4 | """
5 | 
6 | import json
7 | import time
8 | from openai import OpenAI
9 | 
10 | def test_tool_response(api_base, api_key, model):
11 |     client = OpenAI(api_key=api_key, base_url=api_base)
12 | 
13 |     tools = [
14 |         {
15 |             "type": "function",
16 |             "function": {
17 |                 "name": "calculate",
18 |                 "description": "Perform a calculation",
19 |                 "parameters": {
20 |                     "type": "object",
21 |                     "properties": {
22 |                         "expression": {"type": "string"}
23 |                     },
24 |                     "required": ["expression"]
25 |                 }
26 |             }
27 |         }
28 |     ]
29 | 
30 |     messages = [
31 |         {"role": "user", "content": "What is 847 divided by 6?"}
32 |     ]
33 | 
34 |     print("Sending request...")
35 |     print(f"Model: {model}")
36 |     print(f"Messages: {messages}")
37 |     print("\n" + "="*50)
38 | 
39 |     try:
40 |         response = client.chat.completions.create(
41 |             model=model,
42 |             messages=messages,
43 |             tools=tools,
44 |             tool_choice="auto",
45 |             temperature=0.1,
46 |             max_tokens=150
47 |         )
48 | 
49 |         print("RAW RESPONSE:")
50 |         print(response)
51 |         print("\n" + "="*50)
52 | 
53 |         if response.choices:
54 |             msg = response.choices[0].message
55 |             print("\nMESSAGE DETAILS:")
56 |             print(f"Type: {type(msg)}")
57 |             print(f"Content: {msg.content}")
58 |             print(f"Has tool_calls: {hasattr(msg, 'tool_calls')}")
59 |             if hasattr(msg, 'tool_calls'):
60 |                 print(f"Tool calls: {msg.tool_calls}")
61 |                 if msg.tool_calls:
62 |                     for tc in msg.tool_calls:
63 |                         print(f"\nTool Call Details:")
64 |                         print(f"  ID: {tc.id}")
65 |                         print(f"  Type: {tc.type}")
66 |                         print(f"  Function name: {tc.function.name}")
67 |                         print(f"  Arguments: {tc.function.arguments}")
68 | 
69 |             # Check for any special tokens
70 |             if msg.content:
71 |                 if "<|" in msg.content or "|>" in msg.content:
72 |                     print("\nWARNING: Special tokens detected in content!")
73 |                     print(f"Content repr: {repr(msg.content)}")
74 | 
75 |     except Exception as e:
76 |         print(f"Error: {e}")
77 |         import traceback
78 |         traceback.print_exc()
79 | 
80 | if __name__ == "__main__":
81 |     import sys
82 |     if len(sys.argv) != 4:
83 |         print("Usage: python test_debug.py <api_base> <api_key> <model>")
84 |         sys.exit(1)
85 | 
86 |     test_tool_response(sys.argv[1], sys.argv[2], sys.argv[3])
--------------------------------------------------------------------------------
/test_kimi_k2_providers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Kimi K2 tool testing provider test script for OpenRouter
4 | # Usage: ./test_kimi_k2_providers.sh YOUR_API_KEY [quick|full]
5 | 
6 | if [ $# -lt 1 ]; then
7 |     echo "Usage: ./test_kimi_k2_providers.sh YOUR_OPENROUTER_API_KEY [quick|full]"
8 |     echo ""
9 |     echo "Examples:"
10 |     echo "  ./test_kimi_k2_providers.sh sk-or-xxx quick"
11 |     echo "  ./test_kimi_k2_providers.sh sk-or-xxx full"
12 |     exit 1
13 | fi
14 | 
15 | API_KEY=$1
16 | MODEL="moonshotai/kimi-k2"
17 | TEST_MODE=${2:-quick}  # Default to quick mode if not specified
18 | 
19 | # Create results directory with timestamp
20 | TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
21 | RESULTS_DIR="tool_test_results_${TIMESTAMP}"
22 | mkdir -p "$RESULTS_DIR"
23 | 
24 | echo "Tool Testing Configuration:"
25 | echo "  Model: $MODEL"
26 | echo "  Test Mode: $TEST_MODE"
27 | echo "  Results Directory: $RESULTS_DIR"
28 | echo ""
29 | 
30 | # Providers to test for Kimi K2
31 | providers=(
32 |     "targon/fp8"
33 |     "chutes/fp8"
34 |     "deepinfra/fp4"
35 |     "novita/fp8"
36 |     "fireworks/fp8"
37 |     "moonshotai/fp8"
38 |     "baseten/fp4"
39 |     "atlas-cloud/fp8"
40 |     "parasail/fp8"
41 |     "together/fp8"
42 |     "groq"
43 |     "gmicloud/fp8"
44 | )
45 | 
46 | # Function to run test and capture results
47 | run_test() {
48 |     local provider=$1
49 |     local output_file="$RESULTS_DIR/test_${provider//\//_}.txt"
50 | 
51 |     echo "================================================"
52 |     echo "Testing provider: $provider"
53 |     echo "Output file: $output_file"
54 |     echo "================================================"
55 | 
56 |     # Build the command
57 |     cmd="python3 tool_tester_v2.py \
58 |         --api-base https://openrouter.ai/api/v1 \
59 |         --api-key $API_KEY \
60 |         --model $MODEL \
61 |         --provider \"$provider\" \
62 |         --temperature 0.6"
63 | 
64 |     # Add test mode flag
65 |     if [ "$TEST_MODE" == "quick" ]; then
66 |         cmd="$cmd --quick"
67 |     else
68 |         cmd="$cmd --max-tools 40"
69 |     fi
70 | 
71 |     # Add output file
72 |     cmd="$cmd --output \"$output_file\""
73 | 
74 |     # Run the test and capture both stdout and the result
75 |     echo "Running: $cmd"
76 |     eval $cmd 2>&1 | tee "${output_file}.log"
77 | 
78 |     # Check if the test completed successfully
79 |     local test_status=${PIPESTATUS[0]}; if [ "$test_status" -eq 0 ]; then
80 |         echo "PASS: Test completed successfully for $provider"
81 | 
82 |         # Extract the overall score from the output file if it exists
83 |         if [ -f "$output_file" ]; then
84 |             score=$(grep "OVERALL SCORE:" "$output_file" | tail -1)
85 |             echo "  $score"
86 |         fi
87 |     else
88 |         echo "FAIL: Test failed for $provider"
89 |         echo "FAILED: $provider" >> "$RESULTS_DIR/failed_providers.txt"
90 |     fi
91 | 
92 |     echo ""
93 |     sleep 2; return "$test_status"  # Small delay between providers, then report the test status to the caller
94 | }
95 | 
96 | # Test each provider
97 | successful_tests=0
98 | failed_tests=0
99 | 
100 | for provider in "${providers[@]}"; do
101 |     # Skip provider if it doesn't make sense for the model
102 |     # (you can add logic here to filter providers based on model)
103 | 
104 |     run_test "$provider"
105 | 
106 |     # Check if test was successful
107 |     if [ $? -eq 0 ]; then
108 |         ((successful_tests++))
109 |     else
110 |         ((failed_tests++))
111 |     fi
112 | done
113 | 
114 | # Generate summary report
115 | summary_file="$RESULTS_DIR/summary.txt"
116 | echo "================================================" | tee "$summary_file"
117 | echo "TOOL TESTING SUMMARY REPORT" | tee -a "$summary_file"
118 | echo "================================================" | tee -a "$summary_file"
119 | echo "Model: $MODEL" | tee -a "$summary_file"
120 | echo "Test Mode: $TEST_MODE" | tee -a "$summary_file"
121 | echo "Timestamp: $TIMESTAMP" | tee -a "$summary_file"
122 | echo "Total Providers Tested: ${#providers[@]}" | tee -a "$summary_file"
123 | echo "Successful Tests: $successful_tests" | tee -a "$summary_file"
124 | echo "Failed Tests: $failed_tests" | tee -a "$summary_file"
125 | echo "" | tee -a "$summary_file"
126 | 
127 | # Extract scores from all successful tests
128 | echo "Provider Scores:" | tee -a "$summary_file"
129 | echo "-----------------------------------------" | tee -a "$summary_file"
130 | 
131 | for file in "$RESULTS_DIR"/test_*.txt; do
132 |     if [ -f "$file" ]; then
133 |         provider_name=$(basename "$file" .txt | sed 's/test_//' | sed 's/_/\//g')
134 |         score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3, $4}')
135 |         if [ -n "$score" ]; then
136 |             printf "%-30s %s\n" "$provider_name:" "$score" | tee -a "$summary_file"
137 |         fi
138 |     fi
139 | done
140 | 
141 | echo "" | tee -a "$summary_file"
142 | echo "Full results saved in: $RESULTS_DIR" | tee -a "$summary_file"
143 | 
144 | # Create a CSV summary for easy analysis
145 | csv_file="$RESULTS_DIR/results.csv"
146 | echo "Provider,Model,OverallScore,Grade,ScenarioSuccessRate,ToolPrecision,ToolRecall,ToolF1,ParamAccuracy,ExecutionSuccess" > "$csv_file"
147 | 
148 | for file in "$RESULTS_DIR"/test_*.txt; do
149 |     if [ -f "$file" ]; then
150 |         provider_name=$(basename "$file" .txt | sed 's/test_//' | sed 's/_/\//g')
151 | 
152 |         # Extract metrics using grep and awk
153 |         overall_score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3}' | sed 's/%//')
154 |         grade=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $4}' | sed 's/[()]//g')
155 |         scenario_rate=$(grep "Scenario Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
156 |         tool_precision=$(grep "Tool Precision (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
157 |         tool_recall=$(grep "Tool Recall (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
158 |         tool_f1=$(grep "Tool F1 (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
159 |         param_accuracy=$(grep "Parameter Accuracy (structural):" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
160 |         exec_success=$(grep "Execution Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
161 | 
162 |         if [ -n "$overall_score" ]; then
163 |             echo "$provider_name,$MODEL,$overall_score,$grade,$scenario_rate,$tool_precision,$tool_recall,$tool_f1,$param_accuracy,$exec_success" >> "$csv_file"
164 |         fi
165 |     fi
166 | done
167 | 
168 | echo "CSV results saved in: $csv_file"
169 | echo ""
170 | echo "All providers tested!"
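The `results.csv` written above is straightforward to post-process outside of bash. A minimal sketch (the `leaderboard.py` filename is illustrative; only the columns this script itself writes are assumed) that prints a ranked view of the providers:

```python
#!/usr/bin/env python3
import csv
import sys

def leaderboard(csv_path: str) -> None:
    """Rank providers by OverallScore, descending."""
    with open(csv_path, newline="") as f:
        # Skip rows where the score could not be extracted.
        rows = [r for r in csv.DictReader(f) if r.get("OverallScore")]
    rows.sort(key=lambda r: float(r["OverallScore"]), reverse=True)
    for rank, row in enumerate(rows, start=1):
        print(f"{rank:>2}. {row['Provider']:<25} "
              f"{row['OverallScore']}% ({row['Grade']})  "
              f"recall={row['ToolRecall']}%  f1={row['ToolF1']}%")

if __name__ == "__main__":
    leaderboard(sys.argv[1] if len(sys.argv) > 1 else "results.csv")
```

Usage would be, e.g., `python3 leaderboard.py tool_test_results_<timestamp>/results.csv`.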
--------------------------------------------------------------------------------
/test_qwen_coder_providers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Qwen3 Coder tool testing provider test script for OpenRouter
4 | # Usage: ./test_qwen_coder_providers.sh YOUR_API_KEY [quick|full]
5 | 
6 | if [ $# -lt 1 ]; then
7 |     echo "Usage: ./test_qwen_coder_providers.sh YOUR_OPENROUTER_API_KEY [quick|full]"
8 |     echo ""
9 |     echo "Examples:"
10 |     echo "  ./test_qwen_coder_providers.sh sk-or-xxx quick"
11 |     echo "  ./test_qwen_coder_providers.sh sk-or-xxx full"
12 |     exit 1
13 | fi
14 | 
15 | API_KEY=$1
16 | MODEL="qwen/qwen3-coder"
17 | TEST_MODE=${2:-quick}  # Default to quick mode if not specified
18 | 
19 | # Create results directory with timestamp
20 | TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
21 | RESULTS_DIR="tool_test_results_${TIMESTAMP}"
22 | mkdir -p "$RESULTS_DIR"
23 | 
24 | echo "Tool Testing Configuration:"
25 | echo "  Model: $MODEL"
26 | echo "  Test Mode: $TEST_MODE"
27 | echo "  Results Directory: $RESULTS_DIR"
28 | echo ""
29 | 
30 | # Providers to test for Qwen3 Coder
31 | providers=(
32 |     "chutes/fp8"
33 |     "deepinfra/fp4"
34 |     "baseten/fp8"
35 |     "parasail/fp8"
36 |     "fireworks"
37 |     "novita/fp8"
38 |     "atlas-cloud/fp8"
39 |     "phala"
40 |     "gmicloud/fp8"
41 |     "targon/fp8"
42 |     "alibaba/opensource"
43 |     "together/fp8"
44 |     "hyperbolic/fp8"
45 |     "cerebras/fp8"
46 | )
47 | 
48 | # Function to run test and capture results
49 | run_test() {
50 |     local provider=$1
51 |     local output_file="$RESULTS_DIR/test_${provider//\//_}.txt"
52 | 
53 |     echo "================================================"
54 |     echo "Testing provider: $provider"
55 |     echo "Output file: $output_file"
56 |     echo "================================================"
57 | 
58 |     # Build the command
59 |     cmd="python3 tool_tester_v2.py \
60 |         --api-base https://openrouter.ai/api/v1 \
61 |         --api-key $API_KEY \
62 |         --model $MODEL \
63 |         --provider \"$provider\" \
64 |         --temperature 0.6"
65 | 
66 |     # Add test mode flag
67 |     if [ "$TEST_MODE" == "quick" ]; then
68 |         cmd="$cmd --quick"
69 |     else
70 |         cmd="$cmd --max-tools 40"
71 |     fi
72 | 
73 |     # Add output file
74 |     cmd="$cmd --output \"$output_file\""
75 | 
76 |     # Run the test and capture both stdout and the result
77 |     echo "Running: $cmd"
78 |     eval $cmd 2>&1 | tee "${output_file}.log"
79 | 
80 |     # Check if the test completed successfully
81 |     local test_status=${PIPESTATUS[0]}; if [ "$test_status" -eq 0 ]; then
82 |         echo "PASS: Test completed successfully for $provider"
83 | 
84 |         # Extract the overall score from the output file if it exists
85 |         if [ -f "$output_file" ]; then
86 |             score=$(grep "OVERALL SCORE:" "$output_file" | tail -1)
87 |             echo "  $score"
88 |         fi
89 |     else
90 |         echo "FAIL: Test failed for $provider"
91 |         echo "FAILED: $provider" >> "$RESULTS_DIR/failed_providers.txt"
92 |     fi
93 | 
94 |     echo ""
95 |     sleep 2; return "$test_status"  # Small delay between providers, then report the test status to the caller
96 | }
97 | 
98 | # Test each provider
99 | successful_tests=0
100 | failed_tests=0
101 | 
102 | for provider in "${providers[@]}"; do
103 |     # Skip provider if it doesn't make sense for the model
104 |     # (you can add logic here to filter providers based on model)
105 | 
106 |     run_test "$provider"
107 | 
108 |     # Check if test was successful
109 |     if [ $? -eq 0 ]; then
110 |         ((successful_tests++))
111 |     else
112 |         ((failed_tests++))
113 |     fi
114 | done
115 | 
116 | # Generate summary report
117 | summary_file="$RESULTS_DIR/summary.txt"
118 | echo "================================================" | tee "$summary_file"
119 | echo "TOOL TESTING SUMMARY REPORT" | tee -a "$summary_file"
120 | echo "================================================" | tee -a "$summary_file"
121 | echo "Model: $MODEL" | tee -a "$summary_file"
122 | echo "Test Mode: $TEST_MODE" | tee -a "$summary_file"
123 | echo "Timestamp: $TIMESTAMP" | tee -a "$summary_file"
124 | echo "Total Providers Tested: ${#providers[@]}" | tee -a "$summary_file"
125 | echo "Successful Tests: $successful_tests" | tee -a "$summary_file"
126 | echo "Failed Tests: $failed_tests" | tee -a "$summary_file"
127 | echo "" | tee -a "$summary_file"
128 | 
129 | # Extract scores from all successful tests
130 | echo "Provider Scores:" | tee -a "$summary_file"
131 | echo "-----------------------------------------" | tee -a "$summary_file"
132 | 
133 | for file in "$RESULTS_DIR"/test_*.txt; do
134 |     if [ -f "$file" ]; then
135 |         provider_name=$(basename "$file" .txt | sed 's/test_//' | sed 's/_/\//g')
136 |         score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3, $4}')
137 |         if [ -n "$score" ]; then
138 |             printf "%-30s %s\n" "$provider_name:" "$score" | tee -a "$summary_file"
139 |         fi
140 |     fi
141 | done
142 | 
143 | echo "" | tee -a "$summary_file"
144 | echo "Full results saved in: $RESULTS_DIR" | tee -a "$summary_file"
145 | 
146 | # Create a CSV summary for easy analysis
147 | csv_file="$RESULTS_DIR/results.csv"
148 | echo "Provider,Model,OverallScore,Grade,ScenarioSuccessRate,ToolPrecision,ToolRecall,ToolF1,ParamAccuracy,ExecutionSuccess" > "$csv_file"
149 | 
150 | for file in "$RESULTS_DIR"/test_*.txt; do
151 |     if [ -f "$file" ]; then
152 |         provider_name=$(basename "$file" .txt | sed 's/test_//' | sed 's/_/\//g')
153 | 
154 |         # Extract metrics using grep and awk
155 |         overall_score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3}' | sed 's/%//')
156 |         grade=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $4}' | sed 's/[()]//g')
157 |         scenario_rate=$(grep "Scenario Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
158 |         tool_precision=$(grep "Tool Precision (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
159 |         tool_recall=$(grep "Tool Recall (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
160 |         tool_f1=$(grep "Tool F1 (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
161 |         param_accuracy=$(grep "Parameter Accuracy (structural):" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
162 |         exec_success=$(grep "Execution Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
163 | 
164 |         if [ -n "$overall_score" ]; then
165 |             echo "$provider_name,$MODEL,$overall_score,$grade,$scenario_rate,$tool_precision,$tool_recall,$tool_f1,$param_accuracy,$exec_success" >> "$csv_file"
166 |         fi
167 |     fi
168 | done
169 | 
170 | echo "CSV results saved in: $csv_file"
171 | echo ""
172 | echo "All providers tested!"
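Provider availability changes over time, so a pre-flight check can save a failed batch run. A small sketch against OpenRouter's public model listing endpoint (`GET /api/v1/models`); the `data[].id` response shape is an assumption based on OpenRouter's documented schema, so verify it before relying on this:

```python
#!/usr/bin/env python3
import json
import sys
from urllib.request import urlopen

MODELS_URL = "https://openrouter.ai/api/v1/models"  # public model listing

def model_available(model_id: str) -> bool:
    """Return True if the model slug appears in OpenRouter's public model list."""
    with urlopen(MODELS_URL, timeout=30) as resp:
        payload = json.load(resp)
    return any(m.get("id") == model_id for m in payload.get("data", []))

if __name__ == "__main__":
    model = sys.argv[1] if len(sys.argv) > 1 else "qwen/qwen3-coder"
    print(f"{model}: {'available' if model_available(model) else 'not found'}")
```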
--------------------------------------------------------------------------------
/test_deepseek_versions.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # DeepSeek model version comparison tool testing script for OpenRouter
4 | # Usage: ./test_deepseek_versions.sh YOUR_API_KEY [quick|full]
5 | 
6 | if [ $# -lt 1 ]; then
7 |     echo "Usage: ./test_deepseek_versions.sh YOUR_OPENROUTER_API_KEY [quick|full]"
8 |     echo ""
9 |     echo "This script compares three DeepSeek model versions:"
10 |     echo "  - deepseek/deepseek-chat-v3.1"
11 |     echo "  - deepseek/deepseek-chat-v3-0324"
12 |     echo "  - deepseek/deepseek-chat-v3.1:thinking"
13 |     echo ""
14 |     echo "Examples:"
15 |     echo "  ./test_deepseek_versions.sh sk-or-xxx quick"
16 |     echo "  ./test_deepseek_versions.sh sk-or-xxx full"
17 |     exit 1
18 | fi
19 | 
20 | API_KEY=$1
21 | TEST_MODE=${2:-quick}  # Default to quick mode if not specified
22 | 
23 | # Create results directory with timestamp
24 | TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
25 | RESULTS_DIR="deepseek_comparison_${TIMESTAMP}"
26 | mkdir -p "$RESULTS_DIR"
27 | 
28 | echo "DeepSeek Model Comparison Configuration:"
29 | echo "  Test Mode: $TEST_MODE"
30 | echo "  Results Directory: $RESULTS_DIR"
31 | echo ""
32 | 
33 | # DeepSeek models to compare
34 | models=(
35 |     "deepseek/deepseek-chat-v3.1"
36 |     "deepseek/deepseek-chat-v3-0324"
37 |     "deepseek/deepseek-chat-v3.1:thinking"
38 | )
39 | 
40 | # Function to run test and capture results
41 | run_test() {
42 |     local model=$1
43 |     local safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
44 |     local output_file="$RESULTS_DIR/test_${safe_model_name}.txt"
45 | 
46 |     echo "================================================"
47 |     echo "Testing model: $model"
48 |     echo "Output file: $output_file"
49 |     echo "================================================"
50 | 
51 |     # Build the command
52 |     cmd="python3 tool_tester_v2.py \
53 |         --api-base https://openrouter.ai/api/v1 \
54 |         --api-key $API_KEY \
55 |         --model \"$model\" \
56 |         --temperature 0.0"
57 | 
58 |     # Add test mode flag
59 |     if [ "$TEST_MODE" == "quick" ]; then
60 |         cmd="$cmd --quick"
61 |     else
62 |         cmd="$cmd --max-tools 40"
63 |     fi
64 | 
65 |     # Add output file
66 |     cmd="$cmd --output \"$output_file\""
67 | 
68 |     # Run the test and capture both stdout and the result
69 |     echo "Running: $cmd"
70 |     eval $cmd 2>&1 | tee "${output_file}.log"
71 | 
72 |     # Check if the test completed successfully
73 |     local test_status=${PIPESTATUS[0]}; if [ "$test_status" -eq 0 ]; then
74 |         echo "PASS: Test completed successfully for $model"
75 | 
76 |         # Extract the overall score from the output file if it exists
77 |         if [ -f "$output_file" ]; then
78 |             score=$(grep "OVERALL SCORE:" "$output_file" | tail -1)
79 |             echo "  $score"
80 |         fi
81 |     else
82 |         echo "FAIL: Test failed for $model"
83 |         echo "FAILED: $model" >> "$RESULTS_DIR/failed_models.txt"
84 |     fi
85 | 
86 |     echo ""
87 |     sleep 2; return "$test_status"  # Small delay between models, then report the test status to the caller
88 | }
89 | 
90 | # Test each model
91 | successful_tests=0
92 | failed_tests=0
93 | 
94 | for model in "${models[@]}"; do
95 |     run_test "$model"
96 | 
97 |     # Check if test was successful
98 |     if [ $? -eq 0 ]; then
99 |         ((successful_tests++))
100 |     else
101 |         ((failed_tests++))
102 |     fi
103 | done
104 | 
105 | # Generate summary report
106 | summary_file="$RESULTS_DIR/summary.txt"
107 | echo "================================================" | tee "$summary_file"
108 | echo "DEEPSEEK MODEL COMPARISON SUMMARY REPORT" | tee -a "$summary_file"
109 | echo "================================================" | tee -a "$summary_file"
110 | echo "Test Mode: $TEST_MODE" | tee -a "$summary_file"
111 | echo "Timestamp: $TIMESTAMP" | tee -a "$summary_file"
112 | echo "Total Models Tested: ${#models[@]}" | tee -a "$summary_file"
113 | echo "Successful Tests: $successful_tests" | tee -a "$summary_file"
114 | echo "Failed Tests: $failed_tests" | tee -a "$summary_file"
115 | echo "" | tee -a "$summary_file"
116 | 
117 | # Extract scores from all successful tests
118 | echo "Model Performance Comparison:" | tee -a "$summary_file"
119 | echo "-----------------------------------------" | tee -a "$summary_file"
120 | 
121 | for model in "${models[@]}"; do
122 |     safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
123 |     file="$RESULTS_DIR/test_${safe_model_name}.txt"
124 | 
125 |     if [ -f "$file" ]; then
126 |         score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3, $4}')
127 |         if [ -n "$score" ]; then
128 |             printf "%-40s %s\n" "$model:" "$score" | tee -a "$summary_file"
129 |         else
130 |             printf "%-40s %s\n" "$model:" "No score available" | tee -a "$summary_file"
131 |         fi
132 |     else
133 |         printf "%-40s %s\n" "$model:" "Test failed" | tee -a "$summary_file"
134 |     fi
135 | done
136 | 
137 | echo "" | tee -a "$summary_file"
138 | echo "Full results saved in: $RESULTS_DIR" | tee -a "$summary_file"
139 | 
140 | # Create a CSV summary for easy analysis
141 | csv_file="$RESULTS_DIR/model_comparison.csv"
142 | echo "Model,OverallScore,Grade,ScenarioSuccessRate,ToolPrecision,ToolRecall,ToolF1,ParamAccuracy,ExecutionSuccess" > "$csv_file"
143 | 
144 | for model in "${models[@]}"; do
145 |     safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
146 |     file="$RESULTS_DIR/test_${safe_model_name}.txt"
147 | 
148 |     if [ -f "$file" ]; then
149 |         # Extract metrics using grep and awk
150 |         overall_score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3}' | sed 's/%//')
151 |         grade=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $4}' | sed 's/[()]//g')
152 |         scenario_rate=$(grep "Scenario Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
153 |         tool_precision=$(grep "Tool Precision (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
154 |         tool_recall=$(grep "Tool Recall (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
155 |         tool_f1=$(grep "Tool F1 (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
156 |         param_accuracy=$(grep "Parameter Accuracy (structural):" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
157 |         exec_success=$(grep "Execution Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
158 | 
159 |         if [ -n "$overall_score" ]; then
160 |             echo "$model,$overall_score,$grade,$scenario_rate,$tool_precision,$tool_recall,$tool_f1,$param_accuracy,$exec_success" >> "$csv_file"
161 |         fi
162 |     fi
163 | done
164 | 
165 | echo "CSV results saved in: $csv_file"
166 | echo ""
167 | echo "================================================"
168 | echo "DeepSeek Model Comparison Complete!"
169 | echo "================================================"
170 | echo ""
171 | echo "Quick Analysis:"
172 | echo "View the summary with: cat $RESULTS_DIR/summary.txt"
173 | echo "View CSV data with: cat $RESULTS_DIR/model_comparison.csv"
174 | echo ""
175 | echo "For detailed analysis of individual models, check:"
176 | for model in "${models[@]}"; do
177 |     safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
178 |     echo "  $model: $RESULTS_DIR/test_${safe_model_name}.txt"
179 | done
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LLM Tool Calling Test Suite V2
2 | 
3 | A comprehensive CLI tool for testing function calling capabilities of LLMs via the OpenAI API standard, with special support for OpenRouter provider routing.
4 | 
5 | ## IMPORTANT
6 | - Semantic accuracy is still too immature to be used as an indicator
7 | - Structural accuracy and Tool Recall are currently the primary metrics to look at
8 | 
9 | ### Example Output
10 | 
11 | ```
12 | Scenario Success Rate: 44.4%
13 | ├─ Light indicator, higher is better
14 | 
15 | Tool Precision (LLM only): 46.1%
16 | ├─ Measure of how often the correct tool is called
17 | ├─ Higher is better, decent indicator with current implementations
18 | 
19 | Tool Recall (LLM only): 54.2%
20 | ├─ Expected Tools vs Actual Tools Called
21 | ├─ Higher is better, good indicator
22 | 
23 | Tool F1 (LLM only): 49.8%
24 | ├─ Combination of precision and recall
25 | 
26 | Parameter Accuracy (structural): 100.0%
27 | ├─ Shape of tool calls is correct
28 | 
29 | Parameter Accuracy (LLM only): 100.0%
30 | ├─ Future feature - split emulated tool calls (can ignore)
31 | 
32 | Parameter Accuracy (semantic): 46.1%
33 | ├─ Too early to use as an indicator, but can be a data point
34 | ├─ Checks actual values passed into the tool
35 | 
36 | Parameter Accuracy (semantic, LLM only): 46.1%
37 | ├─ Future feature - split emulated tool calls (can ignore)
38 | 
39 | Execution Success Rate: 100.0%
40 | Execution Success Rate (LLM only): 100.0%
41 | 
42 | Weighted Complexity Score: 46.6%
43 | ├─ Depends on complexity of the scenario
44 | ├─ Higher is better
45 | 
46 | ═══════════════════════════════════════════════════════════════
47 | OVERALL SCORE: 65.2% (B-)
48 | ├─ Use as overall indicator
49 | └─ Look at nuance as each value matters differently
50 | ```
51 | 
52 | 
53 | ## Features
54 | 
55 | - Natural conversation-based tool calling scenarios
56 | - Tests from simple (1-5 tools) to complex (40+ tools) scenarios
57 | - Supports any OpenAI-compatible API endpoint including OpenRouter
58 | - Provider-specific routing for optimal performance
59 | - Detailed metrics: precision, recall, F1 scores, and semantic accuracy
60 | - Automatic capability detection for different server implementations
61 | - Batch testing scripts for comparing models and providers
62 | 
63 | ## Installation
64 | 
65 | ```bash
66 | pip install -r requirements.txt
67 | ```
68 | 
69 | ## Usage
70 | 
71 | ### Basic Usage
72 | 
73 | ```bash
74 | python tool_tester_v2.py --api-base <api_base> --api-key <api_key> --model <model>
75 | ```
76 | 
77 | ### Examples
78 | 
79 | Test OpenAI GPT-4o:
80 | ```bash
81 | python tool_tester_v2.py --api-base https://api.openai.com/v1 --api-key sk-xxx --model gpt-4o
82 | ```
83 | 
84 | Test via OpenRouter with provider routing:
85 | ```bash
86 | python tool_tester_v2.py --api-base https://openrouter.ai/api/v1 --api-key sk-or-xxx --model anthropic/claude-3.5-sonnet --provider Anthropic
87 | ```
88 | 
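The same provider-pinned request can also be issued programmatically through the `openai` SDK's `extra_body` passthrough. The shape of the `provider` routing object below is an assumption based on OpenRouter's provider-routing feature; check the OpenRouter docs for the current schema:

```python
from openai import OpenAI

client = OpenAI(api_key="sk-or-xxx", base_url="https://openrouter.ai/api/v1")

response = client.chat.completions.create(
    model="anthropic/claude-3.5-sonnet",
    messages=[{"role": "user", "content": "What is 847 divided by 6?"}],
    # OpenRouter-specific routing payload (assumed schema): pin one provider.
    extra_body={"provider": {"order": ["Anthropic"], "allow_fallbacks": False}},
)
print(response.choices[0].message.content)
```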
89 | Test local model:
90 | ```bash
91 | python tool_tester_v2.py --api-base http://localhost:8000/v1 --api-key local --model llama-70b
92 | ```
93 | 
94 | Quick test mode (faster, fewer scenarios):
95 | ```bash
96 | python tool_tester_v2.py --api-base <api_base> --api-key <api_key> --model <model> --quick
97 | ```
98 | 
99 | Save report to file:
100 | ```bash
101 | python tool_tester_v2.py --api-base <api_base> --api-key <api_key> --model <model> --output report.txt
102 | ```
103 | 
104 | Limit maximum tool calls:
105 | ```bash
106 | python tool_tester_v2.py --api-base <api_base> --api-key <api_key> --model <model> --max-tools 20
107 | ```
108 | 
109 | ### Using the Generic Batch Testing Script
110 | 
111 | The `test_generic_model.sh` script allows you to easily test any model across multiple providers:
112 | 
113 | 1. **Edit the script configuration** at the top of the file:
114 |    ```bash
115 |    # Edit these variables in test_generic_model.sh:
116 |    MODEL="openai/gpt-4o"     # Your model
117 |    TEMPERATURE=0.1           # Temperature setting
118 |    providers=(               # List of providers to test
119 |        "openai"
120 |        "anthropic"
121 |        "fireworks"
122 |    )
123 |    ```
124 | 
125 | 2. **Run the script**:
126 |    ```bash
127 |    # Quick tests (fewer scenarios)
128 |    ./test_generic_model.sh YOUR_API_KEY quick
129 | 
130 |    # Full test suite
131 |    ./test_generic_model.sh YOUR_API_KEY full
132 |    ```
133 | 
134 | 3. **Results** will be saved in a timestamped directory with:
135 |    - Individual test results for each provider
136 |    - Summary report with scores
137 |    - CSV file for easy analysis
138 |    - Sorted leaderboard of providers by performance
139 | 
140 | ### Provider-Specific Testing
141 | 
142 | Some example provider configurations:
143 | ```bash
144 | # Test with specific precision modes
145 | --provider "fireworks/fp8"
146 | --provider "deepinfra/fp4"
147 | --provider "together/fp8"
148 | 
149 | # Test with reasoning effort (for supported models)
150 | --reasoning-effort high
151 | 
152 | # Adjust temperature for testing
153 | --temperature 0.0   # Most deterministic
154 | --temperature 0.7   # More creative
155 | ```
156 | 
157 | ## Test Categories
158 | 
159 | 1. **Simple Tests (1-5 tools)**: Basic single and multi-tool scenarios
160 | 2. **Medium Tests (6-10 tools)**: Moderate complexity with follow-ups
161 | 3. **Large Tests (11-20 tools)**: Complex multi-step operations
162 | 4. **Extra Large Tests (21+ tools)**: Comprehensive workflow simulations
163 | 
164 | ## Available Tools
165 | 
166 | The test suite includes 10 different tool types:
167 | - `get_weather`: Weather information for locations
168 | - `calculate`: Mathematical calculations
169 | - `search_flights`: Flight search between cities
170 | - `search_hotels`: Hotel availability search
171 | - `search_restaurants`: Restaurant recommendations
172 | - `translate_text`: Language translation
173 | - `get_stock_price`: Stock market prices
174 | - `convert_currency`: Currency conversion
175 | - `get_news`: News articles on topics
176 | - `set_reminder`: Create reminders
177 | 
178 | ## Metrics Explained
179 | 
180 | - **Scenario Success Rate**: Percentage of test scenarios completed successfully
181 | - **Tool Precision**: When the AI calls a tool, how often it's the correct one
182 | - **Tool Recall**: Percentage of required tools that were actually called
183 | - **Tool F1 Score**: Harmonic mean of precision and recall
184 | - **Parameter Accuracy (structural)**: Correct format and required fields
185 | - **Parameter Accuracy (semantic)**: Correct values for the context
186 | - **Execution Success Rate**: Tools that executed without errors
187 | - **Weighted Complexity Score**: Performance adjusted for scenario difficulty
188 | 
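For a concrete reading of the three tool-selection metrics, the sketch below computes them for a single scenario from the sets of expected vs. actually called tools (illustrative only; the scorer in `tool_tester_v2.py` may weight repeated calls differently):

```python
def tool_metrics(expected: set[str], called: set[str]) -> tuple[float, float, float]:
    """Precision, recall, and F1 over expected vs. actually called tool names."""
    if not expected or not called:
        return 0.0, 0.0, 0.0
    hits = len(expected & called)               # correctly selected tools
    precision = hits / len(called)              # of the calls made, how many were right
    recall = hits / len(expected)               # of the required tools, how many were called
    f1 = 2 * precision * recall / (precision + recall) if hits else 0.0
    return precision, recall, f1

# Example: the model called one right tool and one wrong one.
print(tool_metrics({"search_flights", "get_weather"}, {"get_weather", "get_news"}))
# -> (0.5, 0.5, 0.5)
```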
189 | ## Success Criteria
190 | 
191 | - Individual tests pass if ≥70% of expected tools are called correctly
192 | - Structural parameters must be ≥60% correct
193 | - Overall grade based on combined metrics (A+ = 90%+, A = 85%+, etc.)
194 | 
195 | ## Output
196 | 
197 | The tool generates a comprehensive report showing:
198 | - Overall success rate and letter grade
199 | - Detailed metrics (precision, recall, F1)
200 | - Per-scenario breakdowns
201 | - Tool call details and emulation statistics
202 | - Execution times
203 | - Error details and conversation logs
204 | 
205 | ## Test Scenarios
206 | 
207 | Test scenarios are defined in `test_scenarios.json` and include:
208 | - Travel planning
209 | - Investment research
210 | - Event coordination
211 | - Academic conferences
212 | - Shopping expeditions
213 | - And many more real-world use cases
--------------------------------------------------------------------------------
/test_deepseek_fireworks.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # DeepSeek model version comparison tool testing script for OpenRouter with Fireworks provider
4 | # Usage: ./test_deepseek_fireworks.sh YOUR_API_KEY [quick|full]
5 | 
6 | if [ $# -lt 1 ]; then
7 |     echo "Usage: ./test_deepseek_fireworks.sh YOUR_OPENROUTER_API_KEY [quick|full]"
8 |     echo ""
9 |     echo "This script compares two DeepSeek model versions on Fireworks provider:"
10 |     echo "  - deepseek/deepseek-chat-v3.1"
11 |     echo "  - deepseek/deepseek-chat-v3-0324"
12 |     echo ""
13 |     echo "Examples:"
14 |     echo "  ./test_deepseek_fireworks.sh sk-or-xxx quick"
15 |     echo "  ./test_deepseek_fireworks.sh sk-or-xxx full"
16 |     exit 1
17 | fi
18 | 
19 | API_KEY=$1
20 | TEST_MODE=${2:-quick}  # Default to quick mode if not specified
21 | 
22 | # Create results directory with timestamp
23 | TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
24 | RESULTS_DIR="deepseek_fireworks_comparison_${TIMESTAMP}"
25 | mkdir -p "$RESULTS_DIR"
26 | 
27 | echo "DeepSeek Model Comparison Configuration (Fireworks Provider):"
28 | echo "  Test Mode: $TEST_MODE"
29 | echo "  Provider: Fireworks"
30 | echo "  Results Directory: $RESULTS_DIR"
31 | echo ""
32 | 
33 | # DeepSeek models to compare (excluding thinking version)
34 | models=(
35 |     "deepseek/deepseek-chat-v3.1"
36 |     "deepseek/deepseek-chat-v3-0324"
37 | )
38 | 
39 | # Function to run test and capture results
40 | run_test() {
41 |     local model=$1
42 |     local safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
43 |     local output_file="$RESULTS_DIR/test_${safe_model_name}.txt"
44 | 
45 |     echo "================================================"
46 |     echo "Testing model: $model"
47 |     echo "Provider: fireworks"
48 |     echo "Output file: $output_file"
49 |     echo "================================================"
50 | 
51 |     # Build the command with the Fireworks provider
52 |     cmd="python3 tool_tester_v2.py \
53 |         --api-base https://openrouter.ai/api/v1 \
54 |         --api-key $API_KEY \
55 |         --model \"$model\" \
56 |         --provider fireworks \
57 |         --temperature 0.0"
58 | 
59 |     # Add test mode flag
60 |     if [ "$TEST_MODE" == "quick" ]; then
61 |         cmd="$cmd --quick"
62 |     else
63 |         cmd="$cmd --max-tools 40"
64 |     fi
65 | 
66 |     # Add output file
67 |     cmd="$cmd --output \"$output_file\""
68 | 
69 |     # Run the test and capture both stdout and the result
70 |     echo "Running: $cmd"
71 |     eval $cmd 2>&1 | tee "${output_file}.log"
72 | 
73 |     # Check if the test completed successfully
74 |     local test_status=${PIPESTATUS[0]}; if [ "$test_status" -eq 0 ]; then
75 |         echo "PASS: Test completed successfully for $model"
76 | 
77 |         # Extract the overall score from the output file if it exists
78 |         if [ -f "$output_file" ]; then
79 |             score=$(grep "OVERALL SCORE:" "$output_file" | tail -1)
80 |             echo "  $score"
81 |         fi
82 |     else
83 |         echo "FAIL: Test failed for $model"
84 |         echo "FAILED: $model" >> "$RESULTS_DIR/failed_models.txt"
85 |     fi
86 | 
87 |     echo ""
88 |     sleep 5; return "$test_status"  # Longer delay for Fireworks rate limits, then report the test status to the caller
89 | }
90 | 
91 | # Test each model
92 | successful_tests=0
93 | failed_tests=0
94 | 
95 | for model in "${models[@]}"; do
96 |     run_test "$model"
97 | 
98 |     # Check if test was successful
99 |     if [ $? -eq 0 ]; then
100 |         ((successful_tests++))
101 |     else
102 |         ((failed_tests++))
103 |     fi
104 | done
105 | 
106 | # Generate summary report
107 | summary_file="$RESULTS_DIR/summary.txt"
108 | echo "================================================" | tee "$summary_file"
109 | echo "DEEPSEEK MODEL COMPARISON SUMMARY REPORT" | tee -a "$summary_file"
110 | echo "Provider: FIREWORKS" | tee -a "$summary_file"
111 | echo "================================================" | tee -a "$summary_file"
112 | echo "Test Mode: $TEST_MODE" | tee -a "$summary_file"
113 | echo "Timestamp: $TIMESTAMP" | tee -a "$summary_file"
114 | echo "Total Models Tested: ${#models[@]}" | tee -a "$summary_file"
115 | echo "Successful Tests: $successful_tests" | tee -a "$summary_file"
116 | echo "Failed Tests: $failed_tests" | tee -a "$summary_file"
117 | echo "" | tee -a "$summary_file"
118 | 
119 | # Extract scores from all successful tests
120 | echo "Model Performance Comparison (Fireworks Provider):" | tee -a "$summary_file"
121 | echo "-----------------------------------------" | tee -a "$summary_file"
122 | 
123 | for model in "${models[@]}"; do
124 |     safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
125 |     file="$RESULTS_DIR/test_${safe_model_name}.txt"
126 | 
127 |     if [ -f "$file" ]; then
128 |         score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3, $4}')
129 |         if [ -n "$score" ]; then
130 |             printf "%-40s %s\n" "$model:" "$score" | tee -a "$summary_file"
131 |         else
132 |             printf "%-40s %s\n" "$model:" "No score available" | tee -a "$summary_file"
133 |         fi
134 |     else
135 |         printf "%-40s %s\n" "$model:" "Test failed" | tee -a "$summary_file"
136 |     fi
137 | done
138 | 
139 | echo "" | tee -a "$summary_file"
140 | echo "Full results saved in: $RESULTS_DIR" | tee -a "$summary_file"
141 | 
142 | # Create a CSV summary for easy analysis
143 | csv_file="$RESULTS_DIR/model_comparison.csv"
144 | echo "Model,Provider,OverallScore,Grade,ScenarioSuccessRate,ToolPrecision,ToolRecall,ToolF1,ParamAccuracy,ExecutionSuccess" > "$csv_file"
145 | 
146 | for model in "${models[@]}"; do
147 |     safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
148 |     file="$RESULTS_DIR/test_${safe_model_name}.txt"
149 | 
150 |     if [ -f "$file" ]; then
151 |         # Extract metrics using grep and awk
152 |         overall_score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3}' | sed 's/%//')
153 |         grade=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $4}' | sed 's/[()]//g')
154 |         scenario_rate=$(grep "Scenario Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
155 |         tool_precision=$(grep "Tool Precision (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
156 |         tool_recall=$(grep "Tool Recall (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
157 |         tool_f1=$(grep "Tool F1 (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
158 |         param_accuracy=$(grep "Parameter Accuracy (structural):" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
159 |         exec_success=$(grep "Execution Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
160 | 
161 |         if [ -n "$overall_score" ]; then
162 |             echo "$model,Fireworks,$overall_score,$grade,$scenario_rate,$tool_precision,$tool_recall,$tool_f1,$param_accuracy,$exec_success" >> "$csv_file"
163 |         fi
164 |     fi
165 | done
166 | 
167 | echo "CSV results saved in: $csv_file"
168 | echo ""
169 | echo "================================================"
170 | echo "DeepSeek Model Comparison Complete!"
171 | echo "Provider: Fireworks"
172 | echo "================================================"
173 | echo ""
174 | echo "Quick Analysis:"
175 | echo "View the summary with: cat $RESULTS_DIR/summary.txt"
176 | echo "View CSV data with: cat $RESULTS_DIR/model_comparison.csv"
177 | echo ""
178 | echo "For detailed analysis of individual models, check:"
179 | for model in "${models[@]}"; do
180 |     safe_model_name=$(echo "$model" | sed 's/[\/:]/_/g')
181 |     echo "  $model: $RESULTS_DIR/test_${safe_model_name}.txt"
182 | done
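Since this comparison produces exactly one `model_comparison.csv` row per model, a per-metric delta is often more readable than two separate reports. A minimal sketch (it assumes both rows were written, in the order the `models` array lists them):

```python
#!/usr/bin/env python3
import csv
import sys

METRICS = ["OverallScore", "ScenarioSuccessRate", "ToolPrecision",
           "ToolRecall", "ToolF1", "ParamAccuracy", "ExecutionSuccess"]

def compare(csv_path: str) -> None:
    """Print per-metric deltas between the first two rows (v3.1 minus v3-0324)."""
    with open(csv_path, newline="") as f:
        rows = list(csv.DictReader(f))
    if len(rows) < 2:
        sys.exit("need two model rows to compare")
    a, b = rows[0], rows[1]
    print(f"{a['Model']} vs {b['Model']}")
    for m in METRICS:
        try:
            delta = float(a[m]) - float(b[m])
        except ValueError:
            continue  # metric missing for one of the rows
        print(f"  {m:<20} {float(a[m]):6.1f} vs {float(b[m]):6.1f}  (delta {delta:+.1f})")

if __name__ == "__main__":
    compare(sys.argv[1] if len(sys.argv) > 1 else "model_comparison.csv")
```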
--------------------------------------------------------------------------------
/test_generic_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Generic tool testing script for OpenRouter
4 | # Usage: ./test_generic_model.sh YOUR_API_KEY [quick|full]
5 | #
6 | # CONFIGURATION: Edit the variables below to customize your test
7 | # ================================================================
8 | 
9 | # MODEL CONFIGURATION
10 | # Examples: "openai/gpt-4o", "anthropic/claude-3.5-sonnet", "qwen/qwen3-coder", "deepseek/deepseek-chat-v3.1"
11 | MODEL="qwen/qwen3-coder"
12 | 
13 | # TEMPERATURE CONFIGURATION (0.0 to 2.0)
14 | # Lower = more deterministic, Higher = more creative
15 | # Recommended: 0.0-0.3 for tool calling tests
16 | TEMPERATURE=0.6
17 | 
18 | # PROVIDERS TO TEST
19 | # Leave empty to test without specific provider routing
20 | # For a single provider test, use: providers=("openai")
21 | # For multiple providers, list them all
22 | # Common providers: openai, anthropic, google, deepseek, fireworks, together, deepinfra, etc.
23 | # Some providers support precision specifiers: "fireworks/fp8", "deepinfra/fp4", etc.
24 | providers=(
25 |     # Uncomment and modify the providers you want to test:
26 |     # "openai"
27 |     # "anthropic"
28 |     # "google"
29 |     # "deepseek"
30 |     # "fireworks"
31 |     # "fireworks/fp8"
32 |     # "together"
33 |     # "together/fp8"
34 |     # "deepinfra/fp4"
35 |     # "baseten/fp8"
36 |     # "chutes/fp8"
37 |     # "parasail/fp8"
38 |     # "novita/fp8"
39 |     # "atlas-cloud/fp8"
40 |     # "phala"
41 |     # "gmicloud/fp8"
42 |     # "targon/fp8"
43 |     "alibaba/opensource"
44 |     # "hyperbolic/fp8"
45 |     # "cerebras/fp8"
46 | )
47 | 
48 | # If providers array is empty, run a single test without provider routing
49 | if [ ${#providers[@]} -eq 0 ]; then
50 |     providers=("none")
51 | fi
52 | 
53 | # DELAY BETWEEN TESTS (in seconds)
54 | # Increase if you encounter rate limiting
55 | DELAY_SECONDS=3
56 | 
57 | # REASONING EFFORT (optional, for models that support it)
58 | # Options: low, medium, high, or leave empty
59 | REASONING_EFFORT=""
60 | 
61 | # ================================================================
62 | # END OF CONFIGURATION - DO NOT EDIT BELOW THIS LINE
63 | # ================================================================
64 | 
65 | if [ $# -lt 1 ]; then
66 |     echo "Usage: $0 YOUR_OPENROUTER_API_KEY [quick|full]"
67 |     echo ""
68 |     echo "Current Configuration:"
69 |     echo "  Model: $MODEL"
70 |     echo "  Temperature: $TEMPERATURE"
71 |     echo "  Providers: ${providers[*]}"
72 |     echo ""
73 |     echo "Examples:"
74 |     echo "  $0 sk-or-xxx quick    # Run quick tests"
75 |     echo "  $0 sk-or-xxx full     # Run full test suite"
76 |     echo ""
77 |     echo "Edit this script to change the model, temperature, and providers to test."
78 |     exit 1
79 | fi
80 | 
81 | API_KEY=$1
82 | TEST_MODE=${2:-quick}  # Default to quick mode if not specified
83 | 
84 | # Create results directory with timestamp
85 | TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
86 | SAFE_MODEL_NAME=$(echo "$MODEL" | sed 's/[\/:]/_/g')
87 | RESULTS_DIR="test_results_${SAFE_MODEL_NAME}_${TIMESTAMP}"
88 | mkdir -p "$RESULTS_DIR"
89 | 
90 | echo "================================================================"
91 | echo "TOOL TESTING CONFIGURATION"
92 | echo "================================================================"
93 | echo "  Model: $MODEL"
94 | echo "  Temperature: $TEMPERATURE"
95 | echo "  Test Mode: $TEST_MODE"
96 | echo "  Providers to test: ${#providers[@]}"
97 | echo "  Results Directory: $RESULTS_DIR"
98 | if [ -n "$REASONING_EFFORT" ]; then
99 |     echo "  Reasoning Effort: $REASONING_EFFORT"
100 | fi
101 | echo ""
102 | 
103 | # Function to run test and capture results
104 | run_test() {
105 |     local provider=$1
106 |     local test_name=""
107 |     local provider_param=""
108 | 
109 |     if [ "$provider" == "none" ]; then
110 |         test_name="no_provider"
111 |         echo "================================================"
112 |         echo "Testing model: $MODEL (no provider routing)"
113 |     else
114 |         test_name="${provider//\//_}"
115 |         provider_param="--provider \"$provider\""
116 |         echo "================================================"
117 |         echo "Testing model: $MODEL"
118 |         echo "Provider: $provider"
119 |     fi
120 | 
121 |     local output_file="$RESULTS_DIR/test_${test_name}.txt"
122 |     echo "Output file: $output_file"
123 |     echo "================================================"
124 | 
125 |     # Build the command
126 |     cmd="python3 tool_tester_v2.py \
127 |         --api-base https://openrouter.ai/api/v1 \
128 |         --api-key $API_KEY \
129 |         --model \"$MODEL\" \
130 |         --temperature $TEMPERATURE"
131 | 
132 |     # Add provider if specified
133 |     if [ -n "$provider_param" ]; then
134 |         cmd="$cmd $provider_param"
135 |     fi
136 | 
137 |     # Add reasoning effort if specified
138 |     if [ -n "$REASONING_EFFORT" ]; then
139 |         cmd="$cmd --reasoning-effort $REASONING_EFFORT"
140 |     fi
141 | 
142 |     # Add test mode flag
143 |     if [ "$TEST_MODE" == "quick" ]; then
144 |         cmd="$cmd --quick"
145 |     else
146 |         cmd="$cmd --max-tools 40"
147 |     fi
148 | 
149 |     # Add output file
150 |     cmd="$cmd --output \"$output_file\""
151 | 
152 |     # Run the test and capture both stdout and the result
153 |     echo "Running: $cmd"
154 |     eval $cmd 2>&1 | tee "${output_file}.log"
155 | 
156 |     # Check if the test completed successfully
157 |     if [ ${PIPESTATUS[0]} -eq 0 ]; then
158 |         echo "PASS: Test completed successfully"
159 | 
160 |         # Extract the overall score from the output file if it exists
161 |         if [ -f "$output_file" ]; then
162 |             score=$(grep "OVERALL SCORE:" "$output_file" | tail -1)
163 |             echo "  $score"
164 |         fi
165 |         return 0
166 |     else
167 |         echo "FAIL: Test failed"
168 |         if [ "$provider" == "none" ]; then
169 |             echo "FAILED: No provider routing" >> "$RESULTS_DIR/failed_tests.txt"
170 |         else
171 |             echo "FAILED: $provider" >> "$RESULTS_DIR/failed_tests.txt"
172 |         fi
173 |         return 1
174 |     fi
175 | 
176 |     echo ""
177 | }
178 | 
179 | # Test each provider/configuration
180 | successful_tests=0
181 | failed_tests=0
182 | total_tests=${#providers[@]}
183 | current_test=0
184 | 
185 | for provider in "${providers[@]}"; do
186 |     ((current_test++))
187 |     echo ""
188 |     echo "Test $current_test of $total_tests"
189 | 
190 |     run_test "$provider"
191 | 
192 |     # Check if test completed (not if it got a perfect score)
193 |     if [ $? -eq 0 ]; then
194 |         ((successful_tests++))
195 |     else
196 |         ((failed_tests++))
197 |     fi
198 | 
199 |     # Add delay between tests (except for the last one)
200 |     if [ $current_test -lt $total_tests ]; then
201 |         echo "Waiting ${DELAY_SECONDS}s before next test..."
202 |         sleep $DELAY_SECONDS
203 |     fi
204 | done
205 | 
206 | # Generate summary report
207 | summary_file="$RESULTS_DIR/summary.txt"
208 | echo "" | tee "$summary_file"
209 | echo "================================================================" | tee -a "$summary_file"
210 | echo "TOOL TESTING SUMMARY REPORT" | tee -a "$summary_file"
211 | echo "================================================================" | tee -a "$summary_file"
212 | echo "Model: $MODEL" | tee -a "$summary_file"
213 | echo "Temperature: $TEMPERATURE" | tee -a "$summary_file"
214 | echo "Test Mode: $TEST_MODE" | tee -a "$summary_file"
215 | echo "Timestamp: $TIMESTAMP" | tee -a "$summary_file"
216 | echo "Total Tests Run: $total_tests" | tee -a "$summary_file"
217 | echo "Tests Completed: $successful_tests" | tee -a "$summary_file"
218 | echo "Tests Failed to Run: $failed_tests" | tee -a "$summary_file"
219 | echo "" | tee -a "$summary_file"
220 | 
221 | # Extract scores from all successful tests
222 | echo "Test Results:" | tee -a "$summary_file"
223 | echo "-----------------------------------------" | tee -a "$summary_file"
224 | 
225 | for file in "$RESULTS_DIR"/test_*.txt; do
226 |     if [ -f "$file" ]; then
227 |         test_name=$(basename "$file" .txt | sed 's/test_//')
228 | 
229 |         # Convert filename back to provider name
230 |         if [ "$test_name" == "no_provider" ]; then
231 |             display_name="No provider routing"
232 |         else
233 |             display_name=$(echo "$test_name" | sed 's/_/\//g')
234 |         fi
235 | 
236 |         score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3, $4}')
237 |         if [ -n "$score" ]; then
238 |             printf "%-30s %s\n" "$display_name:" "$score" | tee -a "$summary_file"
239 |         else
240 |             printf "%-30s %s\n" "$display_name:" "Test failed" | tee -a "$summary_file"
241 |         fi
242 |     fi
243 | done
244 | 
245 | echo "" | tee -a "$summary_file"
246 | 
247 | # Create a CSV summary for easy analysis
248 | csv_file="$RESULTS_DIR/results.csv"
249 | echo "Provider,Model,Temperature,OverallScore,Grade,ScenarioSuccessRate,ToolPrecision,ToolRecall,ToolF1,ParamAccuracy,ExecutionSuccess" > "$csv_file"
250 | 
251 | for file in "$RESULTS_DIR"/test_*.txt; do
252 |     if [ -f "$file" ]; then
253 |         test_name=$(basename "$file" .txt | sed 's/test_//')
254 | 
255 |         # Convert filename back to provider name
256 |         if [ "$test_name" == "no_provider" ]; then
257 |             provider_name="none"
258 |         else
259 |             provider_name=$(echo "$test_name" | sed 's/_/\//g')
260 |         fi
261 | 
262 |         # Extract metrics using grep and awk
263 |         overall_score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3}' | sed 's/%//')
264 |         grade=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $4}' | sed 's/[()]//g')
265 |         scenario_rate=$(grep "Scenario Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
266 |         tool_precision=$(grep "Tool Precision (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
267 |         tool_recall=$(grep "Tool Recall (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
268 |         tool_f1=$(grep "Tool F1 (LLM only):" "$file" | tail -1 | awk '{print $5}' | sed 's/%//')
269 |         param_accuracy=$(grep "Parameter Accuracy (structural):" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
270 |         exec_success=$(grep "Execution Success Rate:" "$file" | tail -1 | awk '{print $4}' | sed 's/%//')
271 | 
272 |         if [ -n "$overall_score" ]; then
273 |             echo "$provider_name,$MODEL,$TEMPERATURE,$overall_score,$grade,$scenario_rate,$tool_precision,$tool_recall,$tool_f1,$param_accuracy,$exec_success" >> "$csv_file"
274 |         fi
275 |     fi
276 | done
277 | 
278 | echo "Full results saved in: $RESULTS_DIR" | tee -a "$summary_file"
279 | echo "CSV results saved in: $csv_file" | tee -a "$summary_file"
280 | echo "" | tee -a "$summary_file"
281 | 
282 | # Sort results by score if there are multiple tests
283 | if [ $total_tests -gt 1 ]; then
284 |     echo "Top Performers (sorted by score):" | tee -a "$summary_file"
285 |     echo "-----------------------------------------" | tee -a "$summary_file"
286 | 
287 |     # Create temp file for sorting
288 |     temp_scores="/tmp/scores_$$.txt"
289 | 
290 |     for file in "$RESULTS_DIR"/test_*.txt; do
291 |         if [ -f "$file" ]; then
292 |             test_name=$(basename "$file" .txt | sed 's/test_//')
293 |             if [ "$test_name" == "no_provider" ]; then
294 |                 display_name="No provider routing"
295 |             else
296 |                 display_name=$(echo "$test_name" | sed 's/_/\//g')
297 |             fi
298 | 
299 |             score=$(grep "OVERALL SCORE:" "$file" | tail -1 | awk '{print $3}' | sed 's/%//')
300 |             if [ -n "$score" ]; then
301 |                 echo "$score|$display_name" >> "$temp_scores"
302 |             fi
303 |         fi
304 |     done
305 | 
306 |     if [ -f "$temp_scores" ]; then
307 |         sort -t'|' -k1 -rn "$temp_scores" | while IFS='|' read -r score provider; do
308 |             printf "%-30s %s%%\n" "$provider:" "$score" | tee -a "$summary_file"
309 |         done
310 |         rm -f "$temp_scores"
311 |     fi
312 | fi
313 | 
314 | echo ""
315 | echo "================================================================"
316 | echo "Testing Complete!"
317 | echo "================================================================"
318 | echo ""
319 | echo "View detailed results:"
320 | echo "  Summary: cat $RESULTS_DIR/summary.txt"
321 | echo "  CSV data: cat $RESULTS_DIR/results.csv"
322 | echo ""
323 | 
324 | # Keep terminal open if running in Windows Git Bash
325 | if [[ "$OSTYPE" == "msys" ]] || [[ "$OSTYPE" == "cygwin" ]]; then
326 |     echo ""
327 |     read -p "Press Enter to exit..."
328 | fi
329 | 
330 | exit 0
--------------------------------------------------------------------------------
/tool_definitions.py:
--------------------------------------------------------------------------------
1 | """
2 | Tool Definitions for LLM Tool Calling Test Suite
3 | 
4 | This module contains all tool definitions and related functionality
5 | for testing LLM tool calling capabilities.
6 | """
7 | 
8 | from typing import Dict, List, Any
9 | 
10 | 
11 | class ToolDefinitions:
12 |     """Define available tools for testing"""
13 | 
14 |     @staticmethod
15 |     def get_all_tools() -> List[Dict[str, Any]]:
16 |         return [
17 |             {
18 |                 "type": "function",
19 |                 "function": {
20 |                     "name": "get_weather",
21 |                     "description": "Get the current weather for a specific location",
22 |                     "parameters": {
23 |                         "type": "object",
24 |                         "properties": {
25 |                             "location": {"type": "string", "description": "City and state, e.g. San Francisco, CA"},
26 |                             "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}
27 |                         },
28 |                         "required": ["location"]
29 |                     }
30 |                 }
31 |             },
32 |             {
33 |                 "type": "function",
34 |                 "function": {
35 |                     "name": "calculate",
36 |                     "description": "Perform mathematical calculations",
37 |                     "parameters": {
38 |                         "type": "object",
39 |                         "properties": {
40 |                             "expression": {"type": "string", "description": "Mathematical expression to evaluate"}
41 |                         },
42 |                         "required": ["expression"]
43 |                     }
44 |                 }
45 |             },
46 |             {
47 |                 "type": "function",
48 |                 "function": {
49 |                     "name": "search_flights",
50 |                     "description": "Search for available flights between two cities",
51 |                     "parameters": {
52 |                         "type": "object",
53 |                         "properties": {
54 |                             "from_city": {"type": "string", "description": "Departure city"},
55 |                             "to_city": {"type": "string", "description": "Destination city"},
56 |                             "date": {"type": "string", "description": "Travel date (YYYY-MM-DD)"}
57 |                         },
58 |                         "required": ["from_city", "to_city", "date"]
59 |                     }
60 |                 }
61 |             },
62 |             {
63 |                 "type": "function",
64 |                 "function": {
65 |                     "name": "get_stock_price",
66 |                     "description": "Get current stock price for a company",
67 |                     "parameters": {
68 |                         "type": "object",
69 |                         "properties": {
70 |                             "symbol": {"type": "string", "description": "Stock ticker symbol"}
71 |                         },
72 |                         "required": ["symbol"]
73 |                     }
74 |                 }
75 |             },
76 |             {
77 |                 "type": "function",
78 |                 "function": {
79 |                     "name": "search_restaurants",
80 |                     "description": "Search for restaurants in a specific area",
81 |                     "parameters": {
82 |                         "type": "object",
83 |                         "properties": {
84 |                             "location": {"type": "string", "description": "City or area to search"},
85 |                             "cuisine": {"type": "string", "description": "Type of cuisine"},
86 |                             "price_range": {"type": "string", "enum": ["$", "$$", "$$$", "$$$$"]}
87 |                         },
88 |                         "required": ["location"]
89 |                     }
90 |                 }
91 |             },
92 |             {
93 |                 "type": "function",
94 |                 "function": {
95 |                     "name": "convert_currency",
96 |                     "description": "Convert amount between two currencies",
97 |                     "parameters": {
98 |                         "type": "object",
99 |                         "properties": {
100 |                             "amount": {"type": "number", "description": "Amount to convert"},
101 |                             "from_currency": {"type": "string", "description": "Source currency code (e.g., USD)"},
102 |                             "to_currency": {"type": "string", "description": "Target currency code (e.g., EUR)"}
103 |                         },
104 |                         "required": ["amount", "from_currency", "to_currency"]
105 |                     }
106 |                 }
107 |             },
108 |             {
109 |                 "type": "function",
110 |                 "function": {
111 |                     "name": "get_news",
112 |                     "description": "Get latest news articles on a topic",
113 |                     "parameters": {
114 |                         "type": "object",
115 |                         "properties": {
116 |                             "topic": {"type": "string", "description": "News topic or search query"},
117 |                             "limit": {"type": "integer", "description": "Number of articles to return", "default": 5}
118 |                         },
119 |                         "required": ["topic"]
120 |                     }
121 |                 }
122 |             },
123 |             {
124 |                 "type": "function",
125 |                 "function": {
126 |                     "name": "set_reminder",
127 |                     "description": "Set a reminder for a specific time",
128 |                     "parameters": {
129 |                         "type": "object",
130 |                         "properties": {
131 |                             "message": {"type": "string", "description": "Reminder message"},
132 |                             "time": {"type": "string", "description": "Time for the reminder (HH:MM)"},
133 |                             "date": {"type": "string", "description": "Date for the reminder (YYYY-MM-DD)"}
134 |                         },
135 |                         "required": ["message", "time"]
136 |                     }
137 |                 }
138 |             },
139 |             {
140 |                 "type": "function",
141 |                 "function": {
142 |                     "name": "translate_text",
143 |                     "description": "Translate text from one language to another",
144 |                     "parameters": {
145 |                         "type": "object",
146 |                         "properties": {
147 |                             "text": {"type": "string", "description": "Text to translate"},
148 |                             "source_language": {"type": "string", "description": "Source language code"},
149 |                             "target_language": {"type": "string", "description": "Target language code"}
150 |                         },
151 |                         "required": ["text", "target_language"]
152 |                     }
153 |                 }
154 |             },
155 |             {
156 |                 "type": "function",
157 |                 "function": {
158 |                     "name": "search_hotels",
159 |                     "description": "Search for hotels in a specific location",
160 |                     "parameters": {
161 |                         "type": "object",
162 |                         "properties": {
163 |                             "location": {"type": "string", "description": "City or area"},
164 |                             "check_in": {"type": "string", "description": "Check-in date (YYYY-MM-DD)"},
165 |                             "check_out": {"type": "string", "description": "Check-out date (YYYY-MM-DD)"},
166 |                             "guests": {"type": "integer", "description": "Number of guests"}
167 |                         },
168 |                         "required": ["location", "check_in", "check_out"]
169 |                     }
170 |                 }
171 |             }
172 |         ]
173 | 
174 |     @staticmethod
175 |     def _allowed_props() -> Dict[str, set]:
176 |         return {
177 |             "get_weather": {"location", "unit"},
178 |             "calculate": {"expression"},
179 |             "search_flights": {"from_city", "to_city", "date"},
180 |             "get_stock_price": {"symbol"},
181 |             "search_restaurants": {"location", "cuisine", "price_range"},
182 |             "convert_currency": {"amount", "from_currency", "to_currency"},
183 |             "get_news": {"topic", "limit"},
184 |             "set_reminder": {"message", "time", "date"},
185 |             "translate_text": {"text", "source_language", "target_language"},
186 |             "search_hotels": {"location", "check_in", "check_out", "guests"},
187 |         }
188 | 
189 |     @staticmethod
190 |     def validate_parameters(tool_name: str, arguments: Dict[str, Any]) -> bool:
191 |         """Validate required parameters and reject unknown keys"""
192 |         allowed = ToolDefinitions._allowed_props().get(tool_name, set())
193 |         if any(k not in allowed for k in arguments.keys()):
194 |             return False
195 | 
196 |         validations = {
197 |             "get_weather": lambda args: "location" in args and len(str(args.get("location", "")).strip()) > 0,
198 |             "calculate": lambda args: "expression" in args and len(str(args.get("expression", "")).strip()) > 0,
199 |             "search_flights": lambda args: all(k in args for k in ["from_city", "to_city", "date"]) and
200 |                 all(len(str(args.get(k, "")).strip()) > 0 for k in ["from_city", "to_city", "date"]),
201 |             "get_stock_price": lambda args: "symbol" in args and len(str(args.get("symbol", "")).strip()) > 0,
202 |             "search_restaurants": lambda args: "location" in args and len(str(args.get("location", "")).strip()) > 0,
203 |             "convert_currency": lambda args: all(k in args for k in ["amount", "from_currency", "to_currency"]) and
204 |                 isinstance(args.get("amount", 0), (int, float)) and args.get("amount", 0) > 0,
205 |             "get_news": lambda args: "topic" in args and len(str(args.get("topic", "")).strip()) > 0,
206 |             "set_reminder": lambda args: all(k in args for k in ["message", "time"]) and
207 |                 all(len(str(args.get(k, "")).strip()) > 0 for k in ["message", "time"]),
208 |             "translate_text": lambda args: "text" in args and "target_language" in args and
209 |                 all(len(str(args.get(k, "")).strip()) > 0 for k in ["text", "target_language"]),
210 |             "search_hotels": lambda args: all(k in args for k in ["location", "check_in", "check_out"]) and
211 |                 all(len(str(args.get(k, "")).strip()) > 0 for k in ["location", "check_in", "check_out"]),
212 |         }
213 |         validator = validations.get(tool_name)
214 |         if validator:
215 |             try:
return validator(arguments) 217 | except Exception: 218 | return False 219 | return True 220 | 221 | @staticmethod 222 | def execute_tool(name: str, arguments: Dict[str, Any]) -> str: 223 | """Mock tool execution - returns realistic dummy data""" 224 | 225 | tool_responses = { 226 | "get_weather": lambda args: f"The weather in {args.get('location', 'Unknown')} is currently 72°F (22°C) with partly cloudy skies. Humidity is 65% with winds at 10 mph.", 227 | 228 | "calculate": lambda args: "The result of {} is {}".format(args.get('expression', ''), eval(args.get('expression', '0'), {'__builtins__': {}}, {})),  # eval runs with builtins stripped so the mock can only do bare arithmetic, not arbitrary code 229 | 230 | "search_flights": lambda args: ( 231 | f"Found 5 flights from {args.get('from_city')} to {args.get('to_city')} on {args.get('date')}:\n" 232 | "1. UA 245 - Departs 8:00 AM, arrives 11:30 AM - $350\n" 233 | "2. DL 892 - Departs 10:15 AM, arrives 1:45 PM - $425\n" 234 | "3. AA 156 - Departs 2:30 PM, arrives 6:00 PM - $380" 235 | ), 236 | 237 | "get_stock_price": lambda args: f"{args.get('symbol', 'UNKNOWN')} is currently trading at $152.35, up 2.3% today. Day range: $149.20 - $153.80", 238 | 239 | "search_restaurants": lambda args: ( 240 | f"Found 3 top restaurants in {args.get('location')}:\n" 241 | f"1. The Golden Fork - {args.get('cuisine', 'International')} cuisine - Rating: 4.5/5\n" 242 | f"2. Sunset Bistro - {args.get('cuisine', 'Local')} cuisine - Rating: 4.3/5\n" 243 | "3. Ocean View Grill - Seafood - Rating: 4.6/5" 244 | ), 245 | 246 | "convert_currency": lambda args: ( 247 | f"{args.get('amount', 0)} {args.get('from_currency', 'USD')} equals " 248 | f"{args.get('amount', 0) * 0.92:.2f} {args.get('to_currency', 'EUR')} at current exchange rate " 249 | f"(1 {args.get('from_currency', 'USD')} = 0.92 {args.get('to_currency', 'EUR')})" 250 | ), 251 | 252 | "get_news": lambda args: ( 253 | f"Latest news on '{args.get('topic')}':\n" 254 | f"1. Breaking: Major developments in {args.get('topic')} sector (2 hours ago)\n" 255 | f"2. Expert analysis: What {args.get('topic')} means for the future (5 hours ago)\n" 256 | f"3. {args.get('topic')} trends show significant growth (1 day ago)" 257 | ), 258 | 259 | "set_reminder": lambda args: f"Reminder set: '{args.get('message')}' for {args.get('time')} on {args.get('date', 'today')}", 260 | 261 | "translate_text": lambda args: f"Translation to {args.get('target_language')}: [Translated version of '{args.get('text')}']", 262 | 263 | "search_hotels": lambda args: ( 264 | f"Found hotels in {args.get('location')} for {args.get('check_in')} to {args.get('check_out')}:\n" 265 | "1. Grand Plaza Hotel - $180/night - 4.4 stars\n" 266 | "2. City Center Inn - $120/night - 4.1 stars\n" 267 | "3. 
Luxury Suites - $250/night - 4.7 stars" 268 | ) 269 | } 270 | 271 | handler = tool_responses.get(name, lambda args: f"Executed {name} with parameters {args}") 272 | try: 273 | return handler(arguments) 274 | except Exception: 275 | return f"Tool execution completed for {name}" -------------------------------------------------------------------------------- /test_scenarios.json: -------------------------------------------------------------------------------- 1 | { 2 | "scenarios": [ 3 | { 4 | "name": "simple_weather", 5 | "description": "Check weather in a city", 6 | "initial_prompt": "Use the get_weather tool to check the current weather in Tokyo.", 7 | "expected_tools": ["get_weather"], 8 | "follow_ups": [] 9 | }, 10 | { 11 | "name": "simple_calculation", 12 | "description": "Perform a calculation", 13 | "initial_prompt": "Use the calculate tool to determine how much each person pays if a $847 restaurant bill is split among 6 people.", 14 | "expected_tools": ["calculate"], 15 | "follow_ups": [] 16 | }, 17 | { 18 | "name": "simple_stock", 19 | "description": "Check stock price", 20 | "initial_prompt": "Use the get_stock_price tool to check the current price of Apple stock (AAPL).", 21 | "expected_tools": ["get_stock_price"], 22 | "follow_ups": [] 23 | }, 24 | { 25 | "name": "travel_planning", 26 | "description": "Plan a trip with multiple queries", 27 | "initial_prompt": "Please use the search_flights tool to find flights from New York to Paris on March 15th, then use get_weather to check the weather in Paris.", 28 | "expected_tools": ["search_flights", "get_weather"], 29 | "follow_ups": [ 30 | "Use the search_restaurants tool to find good restaurants in Paris.", 31 | "Use the search_hotels tool to find hotels for March 15-20 in Paris." 32 | ] 33 | }, 34 | { 35 | "name": "currency_travel", 36 | "description": "Travel with currency conversion", 37 | "initial_prompt": "Please use convert_currency to convert $5000 USD to British pounds, then use get_weather to check the weather forecast for London.", 38 | "expected_tools": ["convert_currency", "get_weather"], 39 | "follow_ups": [ 40 | "Use search_hotels to find hotels in London for next week.", 41 | "Use search_flights to find flights from Boston to London for tomorrow." 42 | ] 43 | }, 44 | { 45 | "name": "business_trip", 46 | "description": "Complex business trip planning", 47 | "initial_prompt": "Use search_flights to find flights from Chicago to San Francisco for March 20th.", 48 | "expected_tools": ["search_flights"], 49 | "follow_ups": [ 50 | "Use get_weather to check the weather forecast for San Francisco.", 51 | "Use search_restaurants to find Italian restaurants in San Francisco for a business dinner.", 52 | "Use set_reminder to set a reminder to pack presentation materials at 8 PM tonight.", 53 | "Use search_hotels to find hotels near the financial district for March 20-22." 54 | ] 55 | }, 56 | { 57 | "name": "investment_research", 58 | "description": "Research stocks and news", 59 | "initial_prompt": "Use get_stock_price to check the current price of Microsoft stock (MSFT).", 60 | "expected_tools": ["get_stock_price"], 61 | "follow_ups": [ 62 | "Use get_stock_price to check Google (GOOGL) and Amazon (AMZN) stock prices.", 63 | "Use get_news to find the latest news about artificial intelligence.", 64 | "Use calculate to determine how much to invest in each stock if splitting $10,000 equally across three stocks." 
65 | ] 66 | }, 67 | { 68 | "name": "international_planning", 69 | "description": "International travel with translations", 70 | "initial_prompt": "Use translate_text to translate 'Thank you for your help' to Japanese.", 71 | "expected_tools": ["translate_text"], 72 | "follow_ups": [ 73 | "Use get_weather to check the weather in Tokyo for April.", 74 | "Use convert_currency to convert $2000 USD to Japanese Yen.", 75 | "Use search_flights to find flights from Los Angeles to Tokyo on April 10th.", 76 | "Use translate_text to translate 'Where is the train station?' to Japanese.", 77 | "Use search_hotels to find hotels in Shibuya district for April 10-20." 78 | ] 79 | }, 80 | { 81 | "name": "event_planning", 82 | "description": "Planning an event with multiple tasks", 83 | "initial_prompt": "I'm organizing a company event in Miami. What's the weather forecast for Miami?", 84 | "expected_tools": ["get_weather"], 85 | "follow_ups": [ 86 | "Search for restaurants that can accommodate 50 people", 87 | "Set a reminder to send invitations tomorrow at 10 AM", 88 | "What's the latest news about event planning trends?", 89 | "Calculate the cost if catering is $45 per person for 50 people", 90 | "Find hotels near the beach for our out-of-town guests checking in May 15th" 91 | ] 92 | }, 93 | { 94 | "name": "complete_vacation", 95 | "description": "Full vacation planning with many steps", 96 | "initial_prompt": "I want to plan a complete vacation to Europe. Let's start with checking flights from New York to London on June 1st.", 97 | "expected_tools": ["search_flights"], 98 | "follow_ups": [ 99 | "What's the weather like in London in June?", 100 | "Convert $5000 to British pounds", 101 | "Search for hotels in London for June 1-5", 102 | "Find good restaurants in London, preferably British cuisine", 103 | "Now check flights from London to Paris on June 5th", 104 | "What's the weather in Paris in June?", 105 | "Convert $2000 to Euros", 106 | "Search for hotels in Paris for June 5-10", 107 | "Translate 'I would like a table for two' to French", 108 | "Find French restaurants in Paris", 109 | "Check flights from Paris to Rome on June 10th", 110 | "Weather in Rome in June?", 111 | "Search for hotels in Rome for June 10-15", 112 | "Translate 'How much does this cost?' to Italian", 113 | "Set a reminder to book everything by next Friday at 5 PM" 114 | ] 115 | }, 116 | { 117 | "name": "financial_portfolio", 118 | "description": "Comprehensive financial analysis", 119 | "initial_prompt": "I want to review my tech portfolio. 
Start by checking Apple's current price.", 120 | "expected_tools": ["get_stock_price"], 121 | "follow_ups": [ 122 | "Check Microsoft stock price", 123 | "Check Google stock price", 124 | "Check Amazon stock price", 125 | "Check Tesla stock price", 126 | "Calculate the total if I have 100 shares of Apple at current price", 127 | "Calculate 50 shares of Microsoft", 128 | "Calculate 75 shares of Google", 129 | "Calculate 30 shares of Amazon", 130 | "Calculate 40 shares of Tesla", 131 | "What's the latest news about tech stocks?", 132 | "Get news about cryptocurrency", 133 | "Convert $10,000 to Euros for my European investments", 134 | "Convert $5,000 to Japanese Yen", 135 | "Set a reminder to review portfolio again next month at 3 PM", 136 | "What's the total value if I sum all my stock positions?", 137 | "Get news about the Federal Reserve", 138 | "Calculate what percentage each stock represents of my total portfolio" 139 | ] 140 | }, 141 | { 142 | "name": "conference_coordination", 143 | "description": "Coordinate a multi-city conference tour", 144 | "initial_prompt": "I'm organizing a conference tour across multiple cities. First, check flights from San Francisco to Seattle on July 1st.", 145 | "expected_tools": ["search_flights"], 146 | "follow_ups": [ 147 | "Weather in Seattle in July?", 148 | "Find hotels in Seattle for July 1-3", 149 | "Search for conference venues (restaurants) that can host 100 people in Seattle", 150 | "Check flights from Seattle to Portland on July 3rd", 151 | "Weather in Portland?", 152 | "Hotels in Portland for July 3-5", 153 | "Calculate the budget: 100 people × $75 per person for catering", 154 | "Flights from Portland to Los Angeles on July 5th", 155 | "Weather in Los Angeles in July?", 156 | "Hotels in LA for July 5-7", 157 | "Get news about conference industry trends", 158 | "Convert our $50,000 budget to see how much that is in Euros", 159 | "Set reminder to confirm all venues by June 15th at noon", 160 | "Translate our welcome message 'Welcome to our annual conference' to Spanish", 161 | "Also translate it to Mandarin Chinese", 162 | "Calculate total hotel costs if average is $150/night for 20 rooms across all cities", 163 | "Search for restaurants in Los Angeles for our closing dinner", 164 | "What's the latest news about business travel?", 165 | "Set another reminder to send final attendee list on June 25th at 9 AM" 166 | ] 167 | }, 168 | { 169 | "name": "global_expansion", 170 | "description": "Plan international business expansion", 171 | "initial_prompt": "We're expanding our business globally. 
Start by checking flights from New York to Tokyo for August 1st.", 172 | "expected_tools": ["search_flights"], 173 | "follow_ups": [ 174 | "What's the weather in Tokyo in August?", 175 | "Convert $100,000 USD to Japanese Yen for initial investment", 176 | "Search for hotels in Tokyo for August 1-7", 177 | "Translate 'We look forward to doing business with you' to Japanese", 178 | "Get news about Japanese market trends", 179 | "Search for restaurants in Tokyo for business meetings", 180 | "Check flights from Tokyo to Beijing on August 7th", 181 | "Weather in Beijing in August?", 182 | "Convert $50,000 to Chinese Yuan", 183 | "Search hotels in Beijing for August 7-10", 184 | "Translate 'Thank you for your partnership' to Mandarin", 185 | "Get news about Chinese tech industry", 186 | "Search restaurants in Beijing", 187 | "Check flights from Beijing to Singapore on August 10th", 188 | "Weather in Singapore?", 189 | "Convert $75,000 to Singapore dollars", 190 | "Search hotels in Singapore for August 10-14", 191 | "Get news about Southeast Asian markets", 192 | "Search restaurants in Singapore", 193 | "Calculate total travel budget: 3 cities × 5 days × $300/day", 194 | "Set reminder to prepare presentation materials by July 25th", 195 | "Check flights from Singapore back to New York on August 14th" 196 | ] 197 | }, 198 | { 199 | "name": "mega_world_tour", 200 | "description": "Plan a comprehensive world tour", 201 | "initial_prompt": "I'm planning a world tour. Start with flights from Los Angeles to Sydney on September 1st.", 202 | "expected_tools": ["search_flights"], 203 | "follow_ups": [ 204 | "Weather in Sydney in September?", 205 | "Convert $3000 to Australian dollars", 206 | "Search hotels in Sydney for September 1-4", 207 | "Translate 'Good day mate' to Australian English", 208 | "Search restaurants in Sydney", 209 | "Check flights from Sydney to Tokyo on September 4th", 210 | "Weather in Tokyo?", 211 | "Convert $2500 to Japanese Yen", 212 | "Search hotels in Tokyo for September 4-7", 213 | "Translate 'Where is the subway?' to Japanese", 214 | "Get news about Tokyo Olympics legacy", 215 | "Search restaurants in Tokyo", 216 | "Flights from Tokyo to Dubai on September 7th", 217 | "Weather in Dubai in September?", 218 | "Convert $4000 to UAE Dirhams", 219 | "Search hotels in Dubai for September 7-10", 220 | "Get news about Dubai expo", 221 | "Search restaurants in Dubai", 222 | "Flights from Dubai to Paris on September 10th", 223 | "Weather in Paris?", 224 | "Convert $3500 to Euros", 225 | "Search hotels in Paris for September 10-13", 226 | "Translate 'Where is the Eiffel Tower?' to French", 227 | "Search French restaurants in Paris", 228 | "Get news about Paris fashion week", 229 | "Flights from Paris to London on September 13th", 230 | "Weather in London?", 231 | "Convert $3000 to British pounds", 232 | "Search hotels in London for September 13-16", 233 | "Search restaurants in London", 234 | "Calculate total budget for entire trip", 235 | "Set reminder to get travel insurance by August 15th", 236 | "Flights from London back to Los Angeles on September 16th" 237 | ] 238 | }, 239 | { 240 | "name": "startup_investor_roadshow", 241 | "description": "Organize investor meetings across multiple countries", 242 | "initial_prompt": "Planning an investor roadshow. 
Check flights from San Francisco to London on October 1st.", 243 | "expected_tools": ["search_flights"], 244 | "follow_ups": [ 245 | "Weather in London in October?", 246 | "Convert $500,000 investment fund to British pounds", 247 | "Search hotels in London financial district for October 1-3", 248 | "Search restaurants for investor dinners in London", 249 | "Get news about UK startup ecosystem", 250 | "Set reminder for pitch deck review on September 28th at 2 PM", 251 | "Flights from London to Berlin on October 3rd", 252 | "Weather in Berlin?", 253 | "Convert $250,000 to Euros for German investments", 254 | "Search hotels in Berlin for October 3-5", 255 | "Translate 'We see great potential in your startup' to German", 256 | "Search restaurants in Berlin", 257 | "Get news about European tech funding", 258 | "Flights from Berlin to Stockholm on October 5th", 259 | "Weather in Stockholm?", 260 | "Convert $150,000 to Swedish Krona", 261 | "Search hotels in Stockholm for October 5-7", 262 | "Translate 'Innovation is key to success' to Swedish", 263 | "Search restaurants in Stockholm", 264 | "Get news about Nordic startup scene", 265 | "Calculate ROI if we invest $50,000 in 10 startups with 20% expected return", 266 | "Flights from Stockholm to Amsterdam on October 7th", 267 | "Weather in Amsterdam?", 268 | "Search hotels in Amsterdam for October 7-9", 269 | "Search restaurants in Amsterdam", 270 | "Set reminder to send investment term sheets by October 15th", 271 | "Get news about Dutch fintech sector", 272 | "Calculate total travel expenses for tax deduction", 273 | "Flights from Amsterdam back to San Francisco on October 9th" 274 | ] 275 | }, 276 | { 277 | "name": "academic_conference_circuit", 278 | "description": "Attend multiple academic conferences worldwide", 279 | "initial_prompt": "I'm attending academic conferences globally. Check flights from Boston to Oxford on November 1st.", 280 | "expected_tools": ["search_flights"], 281 | "follow_ups": [ 282 | "Weather in Oxford in November?", 283 | "Search hotels near Oxford University for November 1-3", 284 | "Convert $2000 conference budget to British pounds", 285 | "Search restaurants in Oxford", 286 | "Get news about latest research in artificial intelligence", 287 | "Set reminder to submit paper by October 20th at midnight", 288 | "Flights from London to Geneva on November 3rd", 289 | "Weather in Geneva?", 290 | "Convert $1500 to Swiss Francs", 291 | "Search hotels in Geneva for November 3-5", 292 | "Translate 'Where is the conference center?' 
to French", 293 | "Search restaurants near CERN in Geneva", 294 | "Get news about particle physics breakthroughs", 295 | "Flights from Geneva to Vienna on November 5th", 296 | "Weather in Vienna?", 297 | "Convert $1200 to Euros", 298 | "Search hotels in Vienna for November 5-7", 299 | "Translate 'Thank you for the invitation' to German", 300 | "Search restaurants in Vienna", 301 | "Calculate conference registration fees: 5 conferences × $300 each", 302 | "Get news about quantum computing research", 303 | "Flights from Vienna to Prague on November 7th", 304 | "Weather in Prague?", 305 | "Search hotels in Prague for November 7-9", 306 | "Search restaurants in Prague", 307 | "Set reminder to prepare presentation slides by October 25th", 308 | "Flights from Prague to Barcelona on November 9th", 309 | "Weather in Barcelona?", 310 | "Search hotels in Barcelona for November 9-11", 311 | "Translate 'See you at the conference' to Spanish", 312 | "Search restaurants in Barcelona", 313 | "Get news about European research grants", 314 | "Calculate total publication costs if submitting to 3 journals at $500 each", 315 | "Flights from Barcelona back to Boston on November 11th" 316 | ] 317 | }, 318 | { 319 | "name": "luxury_shopping_expedition", 320 | "description": "Plan a luxury shopping tour across fashion capitals", 321 | "initial_prompt": "Planning a luxury shopping tour. Check flights from New York to Milan for December 1st.", 322 | "expected_tools": ["search_flights"], 323 | "follow_ups": [ 324 | "Weather in Milan in December?", 325 | "Convert $50,000 shopping budget to Euros", 326 | "Search luxury hotels in Milan for December 1-3", 327 | "Search Michelin-starred restaurants in Milan", 328 | "Get news about Milan Fashion Week", 329 | "Calculate VAT refund on $10,000 purchase in Italy", 330 | "Set reminder to check credit card limits by November 25th", 331 | "Flights from Milan to Paris on December 3rd", 332 | "Weather in Paris?", 333 | "Search hotels near Champs-Élysées for December 3-5", 334 | "Translate 'Do you have this in another size?' to French", 335 | "Search restaurants in Paris 8th arrondissement", 336 | "Convert additional $30,000 to Euros", 337 | "Get news about French luxury brands", 338 | "Calculate savings if items are 20% cheaper in Europe", 339 | "Flights from Paris to London on December 5th", 340 | "Weather in London?", 341 | "Convert $25,000 to British pounds", 342 | "Search hotels in Mayfair for December 5-7", 343 | "Search restaurants near Bond Street", 344 | "Get news about British fashion designers", 345 | "Set reminder to declare customs on return", 346 | "Flights from London to Dubai on December 7th", 347 | "Weather in Dubai?", 348 | "Convert $40,000 to UAE Dirhams", 349 | "Search hotels in Dubai Mall area for December 7-9", 350 | "Search restaurants in Dubai", 351 | "Get news about Dubai Shopping Festival", 352 | "Calculate duty-free savings on jewelry purchases", 353 | "Flights from Dubai to Tokyo on December 9th", 354 | "Weather in Tokyo?", 355 | "Convert $35,000 to Japanese Yen", 356 | "Search hotels in Ginza for December 9-11", 357 | "Translate 'Is this authentic?' to Japanese", 358 | "Search restaurants in Ginza", 359 | "Get news about Japanese fashion trends", 360 | "Calculate total spent across all cities", 361 | "Flights from Tokyo back to New York on December 11th" 362 | ] 363 | }, 364 | { 365 | "name": "film_festival_circuit", 366 | "description": "Attend major film festivals worldwide", 367 | "initial_prompt": "I'm attending film festivals globally. 
Start with flights from Los Angeles to Cannes for May 15th.", 368 | "expected_tools": ["search_flights"], 369 | "follow_ups": [ 370 | "Weather in Cannes in May?", 371 | "Convert $15,000 budget to Euros", 372 | "Search hotels on the Croisette for May 15-20", 373 | "Search restaurants in Cannes", 374 | "Get news about Cannes Film Festival lineup", 375 | "Set reminder for screening schedule on May 14th", 376 | "Flights from Nice to Venice on May 20th", 377 | "Weather in Venice?", 378 | "Search hotels near the Lido for May 20-25", 379 | "Translate 'Where is the festival venue?' to Italian", 380 | "Search restaurants in Venice", 381 | "Get news about Venice Biennale", 382 | "Calculate accommodation costs: 10 nights × $400/night", 383 | "Flights from Venice to Berlin on May 25th", 384 | "Weather in Berlin?", 385 | "Convert $8,000 to Euros for Berlin expenses", 386 | "Search hotels near Potsdamer Platz for May 25-30", 387 | "Translate 'Congratulations on your film' to German", 388 | "Search restaurants in Berlin", 389 | "Get news about Berlinale special screenings", 390 | "Flights from Berlin to Toronto on May 30th", 391 | "Weather in Toronto?", 392 | "Convert $10,000 to Canadian dollars", 393 | "Search hotels downtown Toronto for May 30-June 4", 394 | "Search restaurants in Toronto Entertainment District", 395 | "Get news about TIFF year-round programming", 396 | "Set reminder to submit film for next year's festivals", 397 | "Calculate total travel insurance needed", 398 | "Flights from Toronto to Sundance (via Salt Lake City) on June 4th", 399 | "Weather in Park City?", 400 | "Convert $12,000 to USD for Sundance", 401 | "Search hotels in Park City for June 4-9", 402 | "Search restaurants on Main Street Park City", 403 | "Get news about independent film funding", 404 | "Calculate potential distribution deal value", 405 | "Flights from Salt Lake City to Tokyo on June 9th", 406 | "Weather in Tokyo?", 407 | "Convert $7,000 to Japanese Yen", 408 | "Search hotels in Roppongi for June 9-14", 409 | "Translate 'Thank you for watching our film' to Japanese", 410 | "Search restaurants in Roppongi", 411 | "Get news about Asian film market", 412 | "Set reminder to follow up with distributors", 413 | "Flights from Tokyo back to Los Angeles on June 14th" 414 | ] 415 | } 416 | ] 417 | } -------------------------------------------------------------------------------- /tool_tester_v2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | LLM Tool Calling Test Suite V3.2 4 | 5 | Key additions vs V3.1: 6 | - Capability probe to detect what the server actually supports: 7 | * tools + object tool_choice (best) 8 | * tools + "required" (common on local servers) 9 | * legacy functions + function_call 10 | - Hard-enforcement: if the user says "Use <tool>" and the model still doesn't 11 | emit a tool call, the harness can emulate the call (marked llm_initiated=False) so 12 | parameters are still verified and counted. Emulation is currently disabled in 13 | run_scenario, but the report still separates LLM-initiated from emulated calls. 14 | 15 | This makes the harness resilient to tool/function support differences across 16 | OpenAI-compatible servers (e.g., llama.cpp style, local gateways, etc.).
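A minimal invocation sketch (the endpoint, key, and model below are placeholders; the flags shown are the same ones the bundled provider test scripts pass):

    python3 tool_tester_v2.py --api-base http://localhost:8000/v1 --api-key YOUR_KEY --model my-model --quick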
17 | """ 18 | 19 | import argparse 20 | import json 21 | import sys 22 | import time 23 | import os 24 | import io 25 | from typing import Dict, List, Any, Optional, Tuple 26 | from dataclasses import dataclass, field 27 | from datetime import datetime 28 | import re 29 | import requests 30 | 31 | from openai import OpenAI 32 | from tool_definitions import ToolDefinitions 33 | 34 | # Fix Windows console encoding issues 35 | if sys.platform == 'win32': 36 | # Set console to UTF-8 37 | sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') 38 | sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') 39 | 40 | 41 | # ------------------------ Data classes ------------------------ 42 | 43 | @dataclass 44 | class ToolCallResult: 45 | """Store results for a single tool call""" 46 | tool_name: str 47 | expected: bool 48 | parameters_correct: bool # structural (schema-level) correctness 49 | execution_successful: bool 50 | semantic_match: bool = True # values match what the user asked for 51 | actual_args: Dict[str, Any] = field(default_factory=dict) 52 | expected_args: Dict[str, Any] = field(default_factory=dict) 53 | llm_initiated: bool = True # False if harness emulated the call 54 | error: Optional[str] = None 55 | 56 | 57 | @dataclass 58 | class ExpectedCall: 59 | """Represents an expected tool call extracted from the user prompt""" 60 | tool_name: str 61 | expected_args: Dict[str, Any] = field(default_factory=dict) 62 | source_text: str = "" 63 | 64 | 65 | @dataclass 66 | class TestResult: 67 | """Store results for a single test scenario""" 68 | scenario_name: str 69 | description: str 70 | conversation_turns: int 71 | tool_calls_made: List[str] 72 | expected_tool_types: List[str] 73 | success: bool 74 | tool_call_details: List[ToolCallResult] = field(default_factory=list) 75 | expected_tool_call_count: int = 0 76 | error: Optional[str] = None 77 | execution_time: float = 0.0 78 | conversation_log: List[Dict] = field(default_factory=list) 79 | 80 | 81 | @dataclass 82 | class TestSuite: 83 | """Collection of test results""" 84 | name: str 85 | results: List[TestResult] = field(default_factory=list) 86 | 87 | @property 88 | def success_rate(self) -> float: 89 | if not self.results: 90 | return 0.0 91 | successful = sum(1 for r in self.results if r.success) 92 | return (successful / len(self.results)) * 100 93 | 94 | @property 95 | def total_tool_calls(self) -> int: 96 | return sum(len(r.tool_calls_made) for r in self.results) 97 | 98 | 99 | @dataclass 100 | class APICapabilities: 101 | """What the server supports""" 102 | supports_tools: bool = False 103 | supports_tool_choice_object: bool = False 104 | supports_tool_choice_required: bool = False 105 | supports_functions: bool = False 106 | 107 | 108 | # ------------------------ Scenarios ------------------------ 109 | 110 | class TestScenarios: 111 | """Natural conversation scenarios that require tool use""" 112 | 113 | @staticmethod 114 | def get_scenarios() -> List[Dict[str, Any]]: 115 | """Load scenarios from JSON file or return empty list if file doesn't exist""" 116 | scenarios_file = "test_scenarios.json" 117 | 118 | # Try to find the scenarios file 119 | if os.path.exists(scenarios_file): 120 | file_path = scenarios_file 121 | elif os.path.exists(os.path.join(os.path.dirname(__file__), scenarios_file)): 122 | file_path = os.path.join(os.path.dirname(__file__), scenarios_file) 123 | else: 124 | print(f"Warning: {scenarios_file} not found. 
Using empty scenario list.") 125 | return [] 126 | 127 | try: 128 | with open(file_path, 'r') as f: 129 | data = json.load(f) 130 | return data.get('scenarios', []) 131 | except Exception as e: 132 | print(f"Error loading scenarios from {file_path}: {e}") 133 | return [] 134 | 135 | @staticmethod 136 | def get_scenario_by_complexity(min_tools: int, max_tools: int) -> List[Dict[str, Any]]: 137 | """Get scenarios that require a specific number of tool calls""" 138 | all_scenarios = TestScenarios.get_scenarios() 139 | filtered = [] 140 | for scenario in all_scenarios: 141 | expected_count = len(scenario["expected_tools"]) + len(scenario.get("follow_ups", [])) 142 | if min_tools <= expected_count <= max_tools: 143 | filtered.append(scenario) 144 | return filtered 145 | 146 | 147 | # ------------------------ Core tester ------------------------ 148 | 149 | class LLMToolTester: 150 | """Main test runner for natural tool calling""" 151 | 152 | def __init__(self, api_base: str, api_key: str, model: str, debug: bool = False, provider: str = None, temperature: float = None, reasoning_effort: str = None): 153 | self.api_base = api_base.rstrip('/') 154 | self.api_key = api_key 155 | self.model = model 156 | self.provider = provider 157 | self.temperature = temperature if temperature is not None else 0.1 158 | self.reasoning_effort = reasoning_effort 159 | self.is_openrouter = 'openrouter' in api_base.lower() 160 | 161 | # Set up headers for OpenRouter 162 | headers = {} 163 | if self.is_openrouter and provider: 164 | headers['HTTP-Referer'] = 'https://tool-tester' # Optional 165 | headers['X-Title'] = 'Tool Tester' # Optional 166 | 167 | self.client = OpenAI(api_key=api_key, base_url=api_base, default_headers=headers) 168 | self.tools = ToolDefinitions.get_all_tools() 169 | self.debug = debug 170 | self.capabilities: Optional[APICapabilities] = None 171 | 172 | # ---------- Capability probe ---------- 173 | 174 | def _tools_to_functions(self) -> List[Dict[str, Any]]: 175 | """Convert modern tools list to legacy functions schema for fallback.""" 176 | funcs = [] 177 | for t in self.tools: 178 | if t.get("type") == "function": 179 | f = t["function"] 180 | funcs.append({ 181 | "name": f["name"], 182 | "description": f.get("description", ""), 183 | "parameters": f.get("parameters", {"type": "object", "properties": {}}) 184 | }) 185 | return funcs 186 | 187 | def _probe_capabilities(self) -> APICapabilities: 188 | caps = APICapabilities() 189 | probe_msgs = [ 190 | {"role": "system", "content": "You are a tool-calling probe."}, 191 | {"role": "user", "content": "Use the calculate tool to compute 2+2."} 192 | ] 193 | 194 | # Try tools + object tool_choice 195 | try: 196 | self.client.chat.completions.create( 197 | model=self.model, 198 | messages=probe_msgs, 199 | tools=self.tools, 200 | tool_choice={"type": "function", "function": {"name": "calculate"}}, 201 | temperature=self.temperature, 202 | max_tokens=1, 203 | timeout=10.0 204 | ) 205 | caps.supports_tools = True 206 | caps.supports_tool_choice_object = True 207 | if self.debug: 208 | print(" [Probe] tools + object tool_choice: OK") 209 | return caps 210 | except Exception as e: 211 | if self.debug: 212 | print(f" [Probe] tools + object tool_choice: FAIL ({e})") 213 | 214 | # Try tools + "required" 215 | try: 216 | self.client.chat.completions.create( 217 | model=self.model, 218 | messages=probe_msgs, 219 | tools=self.tools, 220 | tool_choice="required", 221 | temperature=self.temperature, 222 | max_tokens=1, 223 | timeout=10.0 224 | ) 225 | 
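# Reaching this line means the server accepted tool_choice="required" without raising, so record it as a usable (if weaker) fallback mode.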
caps.supports_tools = True 226 | caps.supports_tool_choice_required = True 227 | if self.debug: 228 | print(" [Probe] tools + 'required': OK") 229 | except Exception as e: 230 | if self.debug: 231 | print(f" [Probe] tools + 'required': FAIL ({e})") 232 | 233 | # Try legacy functions + function_call 234 | try: 235 | self.client.chat.completions.create( 236 | model=self.model, 237 | messages=probe_msgs, 238 | functions=self._tools_to_functions(), 239 | function_call={"name": "calculate"}, 240 | temperature=self.temperature, 241 | max_tokens=1, 242 | timeout=10.0 243 | ) 244 | caps.supports_functions = True 245 | if self.debug: 246 | print(" [Probe] legacy functions + function_call: OK") 247 | except Exception as e: 248 | if self.debug: 249 | print(f" [Probe] legacy functions + function_call: FAIL ({e})") 250 | 251 | return caps 252 | 253 | # ---------- Expected-call extraction helpers ---------- 254 | 255 | def _extract_location(self, text: str) -> Optional[str]: 256 | m = re.search(r"\b(?:in|for)\s+([A-Z][A-Za-z]+(?:[ -][A-Z][A-Za-z]+)*)", text) 257 | if not m: 258 | return None 259 | loc = m.group(1) 260 | # Filter out month/time words accidentally captured (e.g., "in July") 261 | blacklist = { 262 | "January","February","March","April","May","June","July","August","September","October","November","December", 263 | "Today","Tomorrow","Tonight","Morning","Afternoon","Evening","Weekend","Week","Noon","Midnight" 264 | } 265 | if loc.capitalize() in blacklist: 266 | return None 267 | return loc 268 | 269 | def _extract_flight_triplet(self, text: str) -> Tuple[Optional[str], Optional[str], Optional[str]]: 270 | from_city = to_city = date = None 271 | m = re.search( 272 | r"from\s+([A-Z][\w\s-]+?)\s+to\s+([A-Z][\w\s-]+?)(?:\s+on\s+([A-Za-z]+\s+\d{1,2}\w{0,2}|\d{4}-\d{2}-\d{2}))?", 273 | text, re.IGNORECASE 274 | ) 275 | if m: 276 | from_city, to_city, date = m.group(1), m.group(2), m.group(3) 277 | return from_city, to_city, date 278 | 279 | def _extract_stock_symbols(self, text: str) -> List[str]: 280 | return re.findall(r"\(([A-Z]{1,6})\)", text) 281 | 282 | def _extract_currency_triplet(self, text: str) -> Tuple[Optional[float], Optional[str], Optional[str]]: 283 | amt = None 284 | m_amt = re.search(r"\$?\s?(\d[\d,]*(?:\.\d+)?)", text) 285 | if m_amt: 286 | amt = float(m_amt.group(1).replace(",", "")) 287 | 288 | def norm(code: str) -> str: 289 | mapping = { 290 | # Core 291 | "usd": "USD", "eur": "EUR", "gbp": "GBP", "jpy": "JPY", "yen": "JPY", 292 | "pounds": "GBP", "pound": "GBP", "euros": "EUR", "euro": "EUR", "dollars": "USD", "dollar": "USD", 293 | # Extended/common in scenarios 294 | "aud": "AUD", "australian dollars": "AUD", "australian dollar": "AUD", 295 | "cad": "CAD", "canadian dollars": "CAD", "canadian dollar": "CAD", 296 | "sgd": "SGD", "singapore dollars": "SGD", "singapore dollar": "SGD", 297 | "aed": "AED", "uae dirhams": "AED", "uae dirham": "AED", "dirhams": "AED", "dirham": "AED", 298 | "cny": "CNY", "rmb": "CNY", "renminbi": "CNY", "chinese yuan": "CNY", "yuan": "CNY", 299 | "chf": "CHF", "swiss francs": "CHF", "swiss franc": "CHF", 300 | "sek": "SEK", "swedish krona": "SEK", "krona": "SEK" 301 | } 302 | return mapping.get((key := code.lower().strip()), mapping.get(key.split()[-1], code.upper()))  # fall back to the last word: "british pounds" -> "pounds" -> GBP 303 | 304 | m_from = re.search(r"\bfrom\s+([A-Za-z]{3,})\b", text) 305 | m_to = re.search(r"\bto\s+([A-Za-z]+\s+(?:pounds|dollars|euros|yen|yuan|francs|krona|dirhams)|[A-Za-z]{3,})\b", text, re.IGNORECASE)  # multi-word name first, so "Australian dollars" is not truncated to "Australian" 306 | from_ccy = norm(m_from.group(1)) if m_from else None 307 | to_ccy_raw = m_to.group(1) if m_to else None 308 | to_ccy = norm(to_ccy_raw) if to_ccy_raw else
None 309 | 310 | if amt and not from_ccy: 311 | if "$" in text: 312 | from_ccy = "USD" 313 | return amt, from_ccy, to_ccy 314 | 315 | def _extract_news_topic(self, text: str) -> Optional[str]: 316 | m = re.search(r"news\s+(?:about|on|regarding|around)\s+(.+)$", text, re.IGNORECASE) 317 | if not m: 318 | return None 319 | topic = m.group(1).strip() 320 | # Strip trailing punctuation 321 | topic = topic.rstrip(".?! ") 322 | return topic if topic else None 323 | 324 | def _extract_translate(self, text: str) -> Dict[str, Any]: 325 | # Prefer quoted text pattern 326 | m = re.search(r"translate\s+[\"'](.+?)[\"']\s+to\s+([A-Za-z ]+)", text, re.IGNORECASE) 327 | if m: 328 | return {"text": m.group(1), "target_language": m.group(2).strip()} 329 | # Fallback: only target language 330 | m2 = re.search(r"translate(?:\s+it|\s+this|\s+the message)?\s+to\s+([A-Za-z ]+)", text, re.IGNORECASE) 331 | if m2: 332 | return {"target_language": m2.group(1).strip()} 333 | return {} 334 | 335 | def _extract_calculate_expression(self, text: str) -> Optional[str]: 336 | m = re.search(r"(?:^|\b)calculate\b[:\s]*(.+)$", text, re.IGNORECASE) 337 | if m: 338 | return m.group(1).strip() 339 | return None 340 | 341 | def _stock_name_to_ticker(self, text: str) -> List[str]: 342 | mapping = { 343 | "apple": "AAPL", 344 | "microsoft": "MSFT", 345 | "google": "GOOGL", 346 | "alphabet": "GOOGL", 347 | "amazon": "AMZN", 348 | "tesla": "TSLA" 349 | } 350 | found = [] 351 | low = text.lower() 352 | for name, ticker in mapping.items(): 353 | if re.search(rf"\b{name}\b", low): 354 | found.append(ticker) 355 | # Also capture standalone uppercase tickers not in parentheses (2-6 letters) 356 | for tok in re.findall(r"\b[A-Z]{2,6}\b", text): 357 | if tok not in found: 358 | found.append(tok) 359 | return found 360 | 361 | def _extract_args_for_tool(self, name: str, text: str, context: Dict[str, Any]) -> List[ExpectedCall]: 362 | calls: List[ExpectedCall] = [] 363 | expected_args: Dict[str, Any] = {} 364 | 365 | if name == "get_weather": 366 | loc = self._extract_location(text) or context.get("last_location") 367 | if loc: 368 | expected_args["location"] = loc 369 | context["last_location"] = loc 370 | calls.append(ExpectedCall(name, expected_args, text)) 371 | return calls 372 | 373 | if name == "search_flights": 374 | f, t, d = self._extract_flight_triplet(text) 375 | if f: expected_args["from_city"] = f 376 | if t: expected_args["to_city"] = t 377 | if d: expected_args["date"] = d 378 | calls.append(ExpectedCall(name, expected_args, text)) 379 | return calls 380 | 381 | if name == "get_stock_price": 382 | symbols = self._extract_stock_symbols(text) 383 | if not symbols: 384 | symbols = self._stock_name_to_ticker(text) 385 | if symbols: 386 | for s in symbols: 387 | calls.append(ExpectedCall(name, {"symbol": s}, text)) 388 | return calls 389 | calls.append(ExpectedCall(name, {}, text)) 390 | return calls 391 | 392 | if name == "convert_currency": 393 | amt, fr, to = self._extract_currency_triplet(text) 394 | if amt is not None: expected_args["amount"] = amt 395 | if fr: expected_args["from_currency"] = fr 396 | if to: expected_args["to_currency"] = to 397 | calls.append(ExpectedCall(name, expected_args, text)) 398 | return calls 399 | 400 | if name in ("search_restaurants", "search_hotels"): 401 | loc = self._extract_location(text) or context.get("last_location") 402 | if loc: 403 | expected_args["location"] = loc 404 | context["last_location"] = loc 405 | calls.append(ExpectedCall(name, expected_args, text)) 406 | return calls 407 | 
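# translate_text: _extract_translate() prefers a quoted source string and otherwise records only the target language as a partial expectation.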
408 | if name == "translate_text": 409 | args = self._extract_translate(text) 410 | if args: 411 | calls.append(ExpectedCall(name, args, text)) 412 | return calls 413 | calls.append(ExpectedCall(name, {}, text)) 414 | return calls 415 | 416 | if name == "get_news": 417 | topic = self._extract_news_topic(text) 418 | if topic: 419 | calls.append(ExpectedCall(name, {"topic": topic}, text)) 420 | return calls 421 | calls.append(ExpectedCall(name, {}, text)) 422 | return calls 423 | 424 | if name == "set_reminder": 425 | m_time = re.search(r"\b(?:at|@)\s+(\d{1,2}:\d{2}|\d{1,2}\s?(?:am|pm)|noon|midnight)\b", text, re.IGNORECASE) 426 | if m_time: 427 | expected_args["time"] = m_time.group(1) 428 | m_msg = re.search(r"set a reminder(?:\s+to)?\s+(.+?)(?:\s+at|\s+on|$)", text, re.IGNORECASE) 429 | if m_msg: 430 | expected_args["message"] = m_msg.group(1).strip() 431 | calls.append(ExpectedCall(name, expected_args, text)) 432 | return calls 433 | 434 | if name == "calculate": 435 | expr = self._extract_calculate_expression(text) 436 | if expr: 437 | calls.append(ExpectedCall(name, {"expression": expr}, text)) 438 | return calls 439 | # As fallback, use the whole text after 'calculate' if present elsewhere 440 | calls.append(ExpectedCall(name, {}, text)) 441 | return calls 442 | 443 | # Default 444 | calls.append(ExpectedCall(name, {}, text)) 445 | return calls 446 | 447 | def _build_expected_calls_from_text(self, text: str, context: Dict[str, Any]) -> List[ExpectedCall]: 448 | calls: List[ExpectedCall] = [] 449 | low = text.lower() 450 | 451 | # 1) Explicit: capture ALL "use " mentions in order of appearance 452 | occurrences: List[Tuple[int, str]] = [] 453 | for t in self.tools: 454 | name = t["function"]["name"] 455 | for m in re.finditer(rf"use\s+(?:the\s+)?{re.escape(name)}\b", low): 456 | occurrences.append((m.start(), name)) 457 | occurrences.sort(key=lambda x: x[0]) 458 | 459 | if occurrences: 460 | # Build segments between occurrences to improve per-tool arg extraction 461 | for idx, (pos, name) in enumerate(occurrences): 462 | seg_start = pos 463 | seg_end = occurrences[idx + 1][0] if idx + 1 < len(occurrences) else len(text) 464 | seg_text = text[seg_start:seg_end] 465 | calls.extend(self._extract_args_for_tool(name, seg_text, context)) 466 | return calls 467 | 468 | # 2) Implicit fallbacks (broadened) 469 | if "weather" in low: 470 | loc = self._extract_location(text) or context.get("last_location") 471 | if loc: 472 | context["last_location"] = loc 473 | calls.append(ExpectedCall("get_weather", {"location": loc} if loc else {}, text)) 474 | 475 | if ("hotel" in low or "hotels" in low): 476 | loc = self._extract_location(text) or context.get("last_location") 477 | if loc: 478 | context["last_location"] = loc 479 | calls.append(ExpectedCall("search_hotels", {"location": loc} if loc else {}, text)) 480 | 481 | if "restaurant" in low: 482 | loc = self._extract_location(text) or context.get("last_location") 483 | if loc: 484 | context["last_location"] = loc 485 | calls.append(ExpectedCall("search_restaurants", {"location": loc} if loc else {}, text)) 486 | 487 | if "flight" in low: 488 | f, t, d = self._extract_flight_triplet(text) 489 | ec_args = {} 490 | if f: ec_args["from_city"] = f 491 | if t: ec_args["to_city"] = t 492 | if d: ec_args["date"] = d 493 | calls.append(ExpectedCall("search_flights", ec_args, text)) 494 | 495 | if "calculate" in low: 496 | expr = self._extract_calculate_expression(text) 497 | calls.append(ExpectedCall("calculate", {"expression": expr} if expr else {}, text)) 
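# The keyword fallbacks above and below are cumulative: one prompt mentioning several tool domains yields one ExpectedCall per matching tool.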
498 | 499 | if "news" in low: 500 | topic = self._extract_news_topic(text) 501 | calls.append(ExpectedCall("get_news", {"topic": topic} if topic else {}, text)) 502 | 503 | if "convert" in low: 504 | amt, fr, to = self._extract_currency_triplet(text) 505 | args = {} 506 | if amt is not None: args["amount"] = amt 507 | if fr: args["from_currency"] = fr 508 | if to: args["to_currency"] = to 509 | calls.append(ExpectedCall("convert_currency", args, text)) 510 | 511 | if "translate" in low: 512 | args = self._extract_translate(text) 513 | calls.append(ExpectedCall("translate_text", args, text)) 514 | 515 | if "stock" in low: 516 | symbols = self._extract_stock_symbols(text) 517 | if not symbols: 518 | symbols = self._stock_name_to_ticker(text) 519 | if symbols: 520 | for s in symbols: 521 | calls.append(ExpectedCall("get_stock_price", {"symbol": s}, text)) 522 | else: 523 | calls.append(ExpectedCall("get_stock_price", {}, text)) 524 | 525 | return calls 526 | 527 | def _build_expected_queue_for_scenario(self, scenario: Dict[str, Any]) -> List[ExpectedCall]: 528 | ctx: Dict[str, Any] = {} 529 | queue: List[ExpectedCall] = [] 530 | queue.extend(self._build_expected_calls_from_text(scenario["initial_prompt"], ctx)) 531 | for fu in scenario.get("follow_ups", []): 532 | queue.extend(self._build_expected_calls_from_text(fu, ctx)) 533 | return queue 534 | 535 | def _match_expected_call(self, tool_name: str, args: Dict[str, Any], expected_queue: List[ExpectedCall]) -> Tuple[bool, bool, Dict[str, Any]]: 536 | for i, exp in enumerate(expected_queue): 537 | if exp.tool_name == tool_name: 538 | exp_used = expected_queue[i] 539 | sem_ok = True 540 | for k, v in exp_used.expected_args.items(): 541 | if v in (None, ""): 542 | continue 543 | a = args.get(k) 544 | if a is None: 545 | sem_ok = False 546 | break 547 | if isinstance(v, str) and v.strip(): 548 | if v.lower() not in str(a).lower(): 549 | sem_ok = False 550 | break 551 | elif isinstance(v, (int, float)): 552 | try: 553 | sem_ok = abs(float(a) - float(v)) < 1e-6 554 | except Exception: 555 | sem_ok = False 556 | if not sem_ok: 557 | break 558 | # If no keys to check, treat as semantic miss (unless tool truly needs none) 559 | if not exp_used.expected_args: 560 | sem_ok = False 561 | # Special-case: for calculate, accept if expression contains a number 562 | if tool_name == "calculate": 563 | expr = args.get("expression") 564 | if isinstance(expr, str) and re.search(r"\d", expr): 565 | sem_ok = True 566 | if sem_ok: 567 | expected_queue.pop(i) 568 | return True, sem_ok, exp_used.expected_args 569 | return False, False, {} 570 | 571 | # ---------- Utilities ---------- 572 | 573 | def _detect_forced_tool_name(self, messages: List[Dict[str, Any]]) -> Optional[str]: 574 | last_user = None 575 | for m in reversed(messages): 576 | if m.get("role") == "user": 577 | last_user = m.get("content", "") 578 | break 579 | if not last_user: 580 | return None 581 | low = last_user.lower() 582 | for t in self.tools: 583 | name = t["function"]["name"] 584 | if f"use {name.lower()}" in low or f"use the {name.lower()}" in low: 585 | return name 586 | return None 587 | 588 | def _message_to_dict(self, message) -> Dict: 589 | """Normalize assistant message for re-sending (both tool_calls and function_call).""" 590 | try: 591 | return message.model_dump() 592 | except AttributeError: 593 | msg = {"role": "assistant"} 594 | msg["content"] = getattr(message, "content", "") or "" 595 | 596 | # tool_calls normalization 597 | tc_list = getattr(message, "tool_calls", None) 598 | 
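# Rebuild each tool call as a plain dict so the assistant turn can be replayed in the next request even when model_dump() is unavailable.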
if tc_list: 599 | norm = [] 600 | for tc in tc_list: 601 | try: 602 | norm.append({ 603 | "id": getattr(tc, "id", None), 604 | "type": getattr(tc, "type", "function"), 605 | "function": { 606 | "name": getattr(getattr(tc, "function", None), "name", None), 607 | "arguments": getattr(getattr(tc, "function", None), "arguments", "{}") 608 | } 609 | }) 610 | except Exception: 611 | pass 612 | if norm: 613 | msg["tool_calls"] = norm 614 | 615 | # function_call normalization 616 | fc = getattr(message, "function_call", None) 617 | if fc: 618 | try: 619 | msg["function_call"] = { 620 | "name": getattr(fc, "name", None), 621 | "arguments": getattr(fc, "arguments", "{}") 622 | } 623 | except Exception: 624 | pass 625 | 626 | return msg 627 | 628 | def _ensure_capabilities(self): 629 | if self.capabilities is None: 630 | print(" Probing server capabilities...") 631 | self.capabilities = self._probe_capabilities() 632 | if self.debug: 633 | print(f" Capabilities: {self.capabilities}") 634 | 635 | def _chat_request_openrouter(self, messages: List[Dict[str, Any]], forced_name: Optional[str]): 636 | """ 637 | Direct request to OpenRouter API bypassing OpenAI client to properly handle provider routing. 638 | Returns (response_obj, mode_str) matching the format of regular _chat_request. 639 | """ 640 | print(f" [OpenRouter] Sending request to OpenRouter...") 641 | 642 | # Build URL 643 | if '/v1' in self.api_base: 644 | url = f"{self.api_base}/chat/completions" 645 | else: 646 | url = f"{self.api_base}/v1/chat/completions" 647 | 648 | # Build request body with all parameters 649 | body = { 650 | "model": self.model, 651 | "messages": messages, 652 | "temperature": self.temperature, 653 | "max_tokens": 1000, 654 | "stream": False 655 | } 656 | 657 | # Add provider routing - using "require" to force specific provider 658 | if self.provider: 659 | # Use "require" to force the provider (will fail if unavailable) 660 | # Alternative: use "order" for preference with fallback 661 | body["provider"] = {"order": [self.provider], "allow_fallbacks": False} 662 | if self.debug: 663 | print(f" [OpenRouter] Requiring provider: {self.provider} (forced, no fallback)") 664 | 665 | # Add reasoning if specified 666 | if self.reasoning_effort: 667 | body["reasoning"] = { 668 | "effort": self.reasoning_effort, 669 | "max_tokens": 1000, 670 | "exclude": False, 671 | "enabled": True 672 | } 673 | 674 | # Add tools support 675 | if self.tools: 676 | body["tools"] = self.tools 677 | if forced_name: 678 | body["tool_choice"] = {"type": "function", "function": {"name": forced_name}} 679 | else: 680 | body["tool_choice"] = "auto" 681 | 682 | # Prepare headers 683 | headers = { 684 | "Authorization": f"Bearer {self.api_key}", 685 | "Content-Type": "application/json", 686 | "HTTP-Referer": "https://tool-tester", # Optional but recommended for OpenRouter 687 | "X-Title": "Tool Tester" # Optional but recommended for OpenRouter 688 | } 689 | 690 | print(f" [OpenRouter Request] URL: {url}") 691 | print(f" [OpenRouter Request] Provider in body: {body.get('provider')}") 692 | print(f" [OpenRouter Request] Body: {body}") 693 | 694 | # Retry logic for rate limiting and provider downtime 695 | max_retries = 3 696 | max_404_retries = 5 # More retries for provider downtime 697 | retry_count = 0 698 | consecutive_404s = 0 699 | base_wait_time = 5 # Start with 5 seconds 700 | base_404_wait_time = 15 # Longer waits for 404s 701 | 702 | while retry_count <= max_retries: 703 | try: 704 | # Make the request 705 | response = requests.post(url, 
json=body, headers=headers, timeout=30.0) 706 | 707 | # Check for rate limiting (429) 708 | if response.status_code == 429: 709 | retry_count += 1 710 | if retry_count > max_retries: 711 | error_detail = "" 712 | try: 713 | error_json = response.json() 714 | error_detail = f" - {error_json}" 715 | except: 716 | error_detail = f" - {response.text}" 717 | raise ValueError(f"OpenRouter API error {response.status_code} (rate limited after {max_retries} retries){error_detail}") 718 | 719 | # Calculate wait time with exponential backoff 720 | wait_time = base_wait_time * (2 ** (retry_count - 1)) 721 | 722 | # Check for Retry-After header 723 | retry_after = response.headers.get('Retry-After') 724 | if retry_after: 725 | try: 726 | wait_time = int(retry_after) 727 | print(f" [OpenRouter] Rate limited. Waiting {wait_time}s as requested by server...") 728 | except: 729 | print(f" [OpenRouter] Rate limited. Waiting {wait_time}s (exponential backoff)...") 730 | else: 731 | print(f" [OpenRouter] Rate limited. Waiting {wait_time}s (exponential backoff)...") 732 | 733 | time.sleep(wait_time) 734 | continue 735 | 736 | # Check for provider downtime (404 - No endpoints found) 737 | if response.status_code == 404: 738 | consecutive_404s += 1 739 | retry_count += 1 740 | 741 | # Check if we've hit too many 404s - abort the entire test 742 | if consecutive_404s > max_404_retries: 743 | print(f" [OpenRouter] Provider appears to be down after {max_404_retries} attempts. Aborting test.") 744 | # Set a special flag to indicate provider failure 745 | raise ValueError("PROVIDER_DOWN") 746 | 747 | if retry_count > max_retries: 748 | error_detail = "" 749 | try: 750 | error_json = response.json() 751 | error_detail = f" - {error_json}" 752 | except: 753 | error_detail = f" - {response.text}" 754 | raise ValueError(f"OpenRouter API error {response.status_code} (provider down after {max_retries} retries){error_detail}") 755 | 756 | # Longer wait for provider downtime 757 | wait_time = min(60, base_404_wait_time * consecutive_404s) # Cap at 1 minute 758 | print(f" [OpenRouter] Provider down (attempt {consecutive_404s}/{max_404_retries}). Waiting {wait_time}s...") 759 | 760 | time.sleep(wait_time) 761 | continue 762 | else: 763 | # Reset 404 counter on success or other errors 764 | consecutive_404s = 0 765 | 766 | # Check for other errors 767 | if response.status_code != 200: 768 | error_detail = "" 769 | try: 770 | error_json = response.json() 771 | error_detail = f" - {error_json}" 772 | except: 773 | error_detail = f" - {response.text}" 774 | raise ValueError(f"OpenRouter API error {response.status_code}{error_detail}") 775 | 776 | # Success - break out of retry loop 777 | break 778 | 779 | except requests.exceptions.Timeout: 780 | retry_count += 1 781 | if retry_count > max_retries: 782 | raise ValueError(f"OpenRouter request timed out after {max_retries} retries") 783 | wait_time = base_wait_time * (2 ** (retry_count - 1)) 784 | print(f" [OpenRouter] Request timeout. Retrying in {wait_time}s...") 785 | time.sleep(wait_time) 786 | continue 787 | except requests.exceptions.ConnectionError as e: 788 | retry_count += 1 789 | if retry_count > max_retries: 790 | raise ValueError(f"OpenRouter connection error after {max_retries} retries: {str(e)}") 791 | wait_time = base_wait_time * (2 ** (retry_count - 1)) 792 | print(f" [OpenRouter] Connection error. 
Retrying in {wait_time}s...") 793 | time.sleep(wait_time) 794 | continue 795 | 796 | response.raise_for_status() 797 | 798 | # Parse response 799 | response_data = response.json() 800 | 801 | # Convert to OpenAI-like response object structure 802 | # We need to create a mock object that matches what the OpenAI client returns 803 | class MockMessage: 804 | def __init__(self, data): 805 | self.content = data.get("content", "") 806 | self.role = data.get("role", "assistant") 807 | self.tool_calls = None 808 | self.function_call = None 809 | 810 | # Handle tool calls if present 811 | if "tool_calls" in data: 812 | self.tool_calls = [] 813 | for tc in data["tool_calls"]: 814 | tool_call = type('ToolCall', (), { 815 | 'id': tc.get('id'), 816 | 'type': tc.get('type', 'function'), 817 | 'function': type('Function', (), { 818 | 'name': tc.get('function', {}).get('name'), 819 | 'arguments': tc.get('function', {}).get('arguments', '{}') 820 | })() 821 | })() 822 | self.tool_calls.append(tool_call) 823 | 824 | # Handle legacy function_call if present 825 | if "function_call" in data: 826 | fc = data["function_call"] 827 | self.function_call = type('FunctionCall', (), { 828 | 'name': fc.get('name'), 829 | 'arguments': fc.get('arguments', '{}') 830 | })() 831 | 832 | class MockChoice: 833 | def __init__(self, choice_data): 834 | msg_data = choice_data.get("message", {}) 835 | self.message = MockMessage(msg_data) 836 | 837 | class MockResponse: 838 | def __init__(self, response_data): 839 | self.choices = [MockChoice(c) for c in response_data.get("choices", [])] 840 | 841 | mock_response = MockResponse(response_data) 842 | 843 | # Check if provider was actually used (OpenRouter sometimes includes this in response) 844 | if self.debug and "provider" in response_data: 845 | print(f" [OpenRouter Response] Provider used: {response_data['provider']}") 846 | 847 | return mock_response, "openrouter_direct" 848 | 849 | def _chat_request(self, messages: List[Dict[str, Any]], forced_name: Optional[str]): 850 | """ 851 | Performs a chat request using the best-supported mode based on capability probe. 852 | Returns (response, mode_str) where mode_str ∈ {"tools_object","tools_required","functions","none","openrouter_direct"} 853 | """ 854 | # Use direct OpenRouter API if this is an OpenRouter request (to properly handle provider and other params) 855 | if self.is_openrouter: 856 | return self._chat_request_openrouter(messages, forced_name) 857 | 858 | self._ensure_capabilities() 859 | caps = self.capabilities 860 | 861 | # Prefer tools + object 862 | if caps.supports_tools and caps.supports_tool_choice_object: 863 | tool_choice = {"type": "function", "function": {"name": forced_name}} if forced_name else "auto" 864 | if self.debug: 865 | print(f" [Request] mode=tools_object, forced={forced_name}") 866 | 867 | kwargs = { 868 | "model": self.model, 869 | "messages": messages, 870 | "tools": self.tools, 871 | "tool_choice": tool_choice, 872 | "temperature": self.temperature, 873 | "max_tokens": 1000, 874 | "timeout": 30.0 875 | } 876 | 877 | resp = self.client.chat.completions.create(**kwargs) 878 | return resp, "tools_object" 879 | 880 | # Next: tools + "required" 881 | if caps.supports_tools and caps.supports_tool_choice_required: 882 | # We can't force the exact tool name, but "required" nudges the model to call a tool. 
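# The user prompt itself names the tool (that is how forced_name was detected), so "required" plus that textual hint is the closest available approximation.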
883 | if self.debug: 884 | print(f" [Request] mode=tools_required, forced={forced_name} (hinted via text)") 885 | 886 | kwargs = { 887 | "model": self.model, 888 | "messages": messages, 889 | "tools": self.tools, 890 | "tool_choice": "required" if forced_name else "auto", 891 | "temperature": self.temperature, 892 | "max_tokens": 1000, 893 | "timeout": 30.0 894 | } 895 | 896 | resp = self.client.chat.completions.create(**kwargs) 897 | return resp, "tools_required" 898 | 899 | # Legacy functions 900 | if caps.supports_functions: 901 | kwargs = { 902 | "model": self.model, 903 | "messages": messages, 904 | "functions": self._tools_to_functions(), 905 | "temperature": self.temperature, 906 | "max_tokens": 1000, 907 | "timeout": 30.0 908 | } 909 | if forced_name: 910 | kwargs["function_call"] = {"name": forced_name} 911 | 912 | if self.debug: 913 | print(f" [Request] mode=functions, forced={forced_name}") 914 | resp = self.client.chat.completions.create(**kwargs) 915 | return resp, "functions" 916 | 917 | # No tool support detected 918 | if self.debug: 919 | print(" [Request] mode=none (no tool support detected)") 920 | 921 | kwargs = { 922 | "model": self.model, 923 | "messages": messages, 924 | "temperature": self.temperature, 925 | "max_tokens": 1000, 926 | "timeout": 30.0 927 | } 928 | 929 | resp = self.client.chat.completions.create(**kwargs) 930 | return resp, "none" 931 | 932 | # ---------- Scenario runner ---------- 933 | 934 | def run_scenario(self, scenario: Dict[str, Any]) -> TestResult: 935 | start_time = time.time() 936 | conversation_log = [] 937 | tool_calls_made: List[str] = [] 938 | tool_call_details: List[ToolCallResult] = [] 939 | 940 | expected_queue = self._build_expected_queue_for_scenario(scenario) 941 | total_expected_count = len(expected_queue) 942 | 943 | try: 944 | messages = [ 945 | { 946 | "role": "system", 947 | "content": ( 948 | "You are a helpful assistant. When the user explicitly names a tool to use, " 949 | "you MUST call that tool and MUST NOT fabricate results. When a tool is available " 950 | "that directly answers the user's request (e.g., weather, flights, hotels, stocks, currency), " 951 | "prefer calling the tool over answering from prior knowledge. " 952 | "Do not ignore available tools." 
934 |     def run_scenario(self, scenario: Dict[str, Any]) -> TestResult:
935 |         start_time = time.time()
936 |         conversation_log = []
937 |         tool_calls_made: List[str] = []
938 |         tool_call_details: List[ToolCallResult] = []
939 | 
940 |         expected_queue = self._build_expected_queue_for_scenario(scenario)
941 |         total_expected_count = len(expected_queue)
942 | 
943 |         try:
944 |             messages = [
945 |                 {
946 |                     "role": "system",
947 |                     "content": (
948 |                         "You are a helpful assistant. When the user explicitly names a tool to use, "
949 |                         "you MUST call that tool and MUST NOT fabricate results. When a tool is available "
950 |                         "that directly answers the user's request (e.g., weather, flights, hotels, stocks, currency), "
951 |                         "prefer calling the tool over answering from prior knowledge. "
952 |                         "Do not ignore available tools."
953 |                     ),
954 |                 },
955 |                 {"role": "user", "content": scenario["initial_prompt"]}
956 |             ]
957 |             conversation_log.append({"role": "user", "content": scenario["initial_prompt"]})
958 | 
959 |             all_prompts = [scenario["initial_prompt"]] + scenario.get("follow_ups", [])
960 |             user_prompt_index = 0
961 |             max_turns = max(6, (len(all_prompts) + total_expected_count) * 2)
962 | 
963 |             for _ in range(max_turns):
964 |                 forced_name = self._detect_forced_tool_name(messages)
965 |                 response, mode = self._chat_request(messages, forced_name)
966 | 
967 |                 if not response.choices:
968 |                     raise ValueError("No response choices")
969 | 
970 |                 assistant_message = response.choices[0].message
971 |                 messages.append(self._message_to_dict(assistant_message))
972 | 
973 |                 if assistant_message.content:
974 |                     conversation_log.append({"role": "assistant", "content": assistant_message.content})
975 | 
976 |                 did_tool_something = False
977 | 
978 |                 # --- Modern tools path: multiple tool_calls possible ---
979 |                 if hasattr(assistant_message, "tool_calls") and assistant_message.tool_calls:
980 |                     did_tool_something = True
981 |                     for tc in assistant_message.tool_calls:
982 |                         tool_name = tc.function.name
983 |                         tool_calls_made.append(tool_name)
984 | 
985 |                         try:
986 |                             args = json.loads(tc.function.arguments or "{}")
987 |                         except Exception:
988 |                             args = {}
989 | 
990 |                         params_valid = ToolDefinitions.validate_parameters(tool_name, args)
991 | 
992 |                         try:
993 |                             result = ToolDefinitions.execute_tool(tool_name, args)
994 |                             exec_ok = True
995 |                             exec_err = None
996 |                         except Exception as e:
997 |                             result = f"Error executing {tool_name}: {str(e)}"
998 |                             exec_ok = False
999 |                             exec_err = str(e)
1000 | 
1001 |                         is_expected, sem_ok, exp_args = self._match_expected_call(tool_name, args, expected_queue)
1002 | 
1003 |                         tool_call_details.append(ToolCallResult(
1004 |                             tool_name=tool_name,
1005 |                             expected=is_expected,
1006 |                             parameters_correct=params_valid,
1007 |                             execution_successful=exec_ok,
1008 |                             semantic_match=sem_ok if is_expected else False,
1009 |                             actual_args=args,
1010 |                             expected_args=exp_args,
1011 |                             llm_initiated=True,
1012 |                             error=exec_err
1013 |                         ))
1014 | 
1015 |                         # Return tool result (modern format)
1016 |                         messages.append({
1017 |                             "role": "tool",
1018 |                             "tool_call_id": tc.id,
1019 |                             "name": tool_name,
1020 |                             "content": result
1021 |                         })
1022 |                         conversation_log.append({"role": "tool", "name": tool_name, "content": result if len(result) < 200 else result[:197] + "..."})
1023 | 
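                # The legacy branch below mirrors the modern path above; the only real
                # difference is the wire format of the returned result message:
                #   modern: {"role": "tool", "tool_call_id": "call_1", "name": "calculate", "content": "141.17"}
                #   legacy: {"role": "function", "name": "calculate", "content": "141.17"}
                # (values illustrative; legacy messages carry no tool_call_id)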
1024 |                 # --- Legacy functions path: single function_call ---
1025 |                 elif hasattr(assistant_message, "function_call") and assistant_message.function_call:
1026 |                     did_tool_something = True
1027 |                     fc = assistant_message.function_call
1028 |                     tool_name = fc.name
1029 |                     tool_calls_made.append(tool_name)
1030 | 
1031 |                     try:
1032 |                         args = json.loads(fc.arguments or "{}")
1033 |                     except Exception:
1034 |                         args = {}
1035 | 
1036 |                     params_valid = ToolDefinitions.validate_parameters(tool_name, args)
1037 | 
1038 |                     try:
1039 |                         result = ToolDefinitions.execute_tool(tool_name, args)
1040 |                         exec_ok = True
1041 |                         exec_err = None
1042 |                     except Exception as e:
1043 |                         result = f"Error executing {tool_name}: {str(e)}"
1044 |                         exec_ok = False
1045 |                         exec_err = str(e)
1046 | 
1047 |                     is_expected, sem_ok, exp_args = self._match_expected_call(tool_name, args, expected_queue)
1048 | 
1049 |                     tool_call_details.append(ToolCallResult(
1050 |                         tool_name=tool_name,
1051 |                         expected=is_expected,
1052 |                         parameters_correct=params_valid,
1053 |                         execution_successful=exec_ok,
1054 |                         semantic_match=sem_ok if is_expected else False,
1055 |                         actual_args=args,
1056 |                         expected_args=exp_args,
1057 |                         llm_initiated=True,
1058 |                         error=exec_err
1059 |                     ))
1060 | 
1061 |                     # Return tool result (legacy format uses role=function)
1062 |                     messages.append({
1063 |                         "role": "function",
1064 |                         "name": tool_name,
1065 |                         "content": result
1066 |                     })
1067 |                     conversation_log.append({"role": "function", "name": tool_name, "content": result if len(result) < 200 else result[:197] + "..."})
1068 | 
1069 |                 # No emulation: if a forced tool wasn't called, we proceed without executing it.
1070 | 
1071 |                 # If a tool was called this turn, let the loop iterate again to allow follow-ups.
1072 |                 if did_tool_something:
1073 |                     continue
1074 | 
1075 |                 # No tool call this turn: push the next follow-up if available
1076 |                 if user_prompt_index < len(scenario.get("follow_ups", [])):
1077 |                     next_prompt = scenario["follow_ups"][user_prompt_index]
1078 |                     messages.append({"role": "user", "content": next_prompt})
1079 |                     conversation_log.append({"role": "user", "content": next_prompt})
1080 |                     user_prompt_index += 1
1081 |                     continue
1082 | 
1083 |                 # No more prompts, break
1084 |                 break
1085 | 
1086 |             # Evaluate success
1087 |             matched_calls_llm = sum(1 for tc in tool_call_details if tc.llm_initiated and tc.expected and tc.parameters_correct and tc.semantic_match)
1088 |             structural_ok_llm = sum(1 for tc in tool_call_details if tc.llm_initiated and tc.parameters_correct)
1089 | 
1090 |             success = (
1091 |                 (total_expected_count == 0 or matched_calls_llm >= max(1, int(0.7 * total_expected_count)))
1092 |                 and (structural_ok_llm >= int(0.6 * max(1, sum(1 for tc in tool_call_details if tc.llm_initiated))))
1093 |             )
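            # Worked example of the rule above (numbers illustrative): with 4 expected
            # calls, the first clause needs matched_calls_llm >= max(1, int(0.7 * 4)) = 2;
            # with 5 LLM-initiated calls, the second needs structural_ok_llm >= int(0.6 * 5) = 3.
            # Scenarios with no expected calls satisfy the first clause automatically.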
1094 | 
1095 |             execution_time = time.time() - start_time
1096 | 
1097 |             return TestResult(
1098 |                 scenario_name=scenario["name"],
1099 |                 description=scenario["description"],
1100 |                 conversation_turns=len([m for m in conversation_log if m["role"] in ["user", "assistant"]]),
1101 |                 tool_calls_made=tool_calls_made,
1102 |                 tool_call_details=tool_call_details,
1103 |                 expected_tool_types=scenario["expected_tools"],
1104 |                 expected_tool_call_count=total_expected_count,
1105 |                 success=success,
1106 |                 execution_time=execution_time,
1107 |                 conversation_log=conversation_log
1108 |             )
1109 | 
1110 |         except Exception as e:
1111 |             execution_time = time.time() - start_time
1112 |             # Check if this is a provider-down error
1113 |             if str(e) == "PROVIDER_DOWN":
1114 |                 print(" Provider down - skipping remaining tests")
1115 |                 return TestResult(
1116 |                     scenario_name=scenario["name"],
1117 |                     description=scenario["description"],
1118 |                     conversation_turns=0,
1119 |                     tool_calls_made=[],
1120 |                     tool_call_details=[],
1121 |                     expected_tool_types=scenario["expected_tools"],
1122 |                     expected_tool_call_count=total_expected_count,
1123 |                     success=False,
1124 |                     error="Provider unavailable",
1125 |                     execution_time=execution_time,
1126 |                     conversation_log=[]
1127 |                 )
1128 | 
1129 |             return TestResult(
1130 |                 scenario_name=scenario["name"],
1131 |                 description=scenario["description"],
1132 |                 conversation_turns=len([m for m in conversation_log if m["role"] in ["user", "assistant"]]),
1133 |                 tool_calls_made=tool_calls_made,
1134 |                 tool_call_details=tool_call_details,
1135 |                 expected_tool_types=scenario["expected_tools"],
1136 |                 expected_tool_call_count=total_expected_count,
1137 |                 success=False,
1138 |                 error=str(e),
1139 |                 execution_time=execution_time,
1140 |                 conversation_log=conversation_log
1141 |             )
1142 | 
1143 |     # ------------------------ Suite & Report ------------------------
1144 | 
1145 |     def run_test_suite(self, suite_name: str, scenarios: List[Dict[str, Any]]) -> TestSuite:
1146 |         suite = TestSuite(name=suite_name)
1147 |         provider_down = False
1148 | 
1149 |         for scenario in scenarios:
1150 |             print(f"\n Testing: {scenario['name']} - {scenario['description']}")
1151 |             result = self.run_scenario(scenario)
1152 |             suite.results.append(result)
1153 | 
1154 |             status = "PASS" if result.success else "FAIL"
1155 |             print(f" {status} Completed: {len(result.tool_calls_made)} tool calls in {result.execution_time:.2f}s")
1156 |             if result.error:
1157 |                 # Handle Unicode encoding issues on Windows
1158 |                 try:
1159 |                     print(f" Error: {result.error}")
1160 |                 except UnicodeEncodeError:
1161 |                     # Replace problematic characters for console output
1162 |                     safe_error = result.error.encode('ascii', 'replace').decode('ascii')
1163 |                     print(f" Error: {safe_error}")
1164 | 
1165 |             # Check if provider is down - if so, create empty results for remaining scenarios
1166 |             if result.error == "Provider unavailable":
1167 |                 provider_down = True
1168 |                 print(" Provider is down - creating empty results for remaining scenarios")
1169 |                 break
1170 | 
1171 |         # If provider is down, create zero results for remaining scenarios
1172 |         if provider_down:
1173 |             remaining_scenarios = scenarios[len(suite.results):]
1174 |             for scenario in remaining_scenarios:
1175 |                 try:
1176 |                     exp_queue = self._build_expected_queue_for_scenario(scenario)
1177 |                     expected_count = len(exp_queue)
1178 |                 except Exception:
1179 |                     expected_count = len(scenario.get("expected_tools", [])) + len(scenario.get("follow_ups", []))
1180 |                 empty_result = TestResult(
1181 |                     scenario_name=scenario["name"],
1182 |                     description=scenario["description"],
1183 |                     conversation_turns=0,
1184 |                     tool_calls_made=[],
1185 |                     tool_call_details=[],
1186 |                     expected_tool_types=scenario["expected_tools"],
1187 |                     expected_tool_call_count=expected_count,
1188 |                     success=False,
1189 |                     error="Provider unavailable",
1190 |                     execution_time=0.0,
1191 |                     conversation_log=[]
1192 |                 )
1193 |                 suite.results.append(empty_result)
1194 | 
1195 |         return suite
1196 | 
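    # The report below scores tool calls with standard precision/recall. A quick
    # illustrative example: 8 expected calls, 10 LLM-initiated attempts, 6 of them
    # matching an expected call with correct parameters:
    #   precision = 6/10 = 0.60, recall = 6/8 = 0.75
    #   F1 = 2 * 0.60 * 0.75 / (0.60 + 0.75) = 0.90 / 1.35 ~= 0.667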
1197 |     def generate_report(self, suites: List[TestSuite]) -> str:
1198 |         def clamp01(x: float) -> float:
1199 |             return 0.0 if x <= 0 else (1.0 if x >= 1 else x)
1200 | 
1201 |         report = []
1202 |         report.append("\n" + "="*60)
1203 |         report.append("LLM NATURAL TOOL CALLING TEST REPORT")
1204 |         report.append("="*60)
1205 |         report.append(f"Model: {self.model}")
1206 |         report.append(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
1207 |         report.append("")
1208 | 
1209 |         total_tests = sum(len(suite.results) for suite in suites)
1210 |         total_success = sum(sum(1 for r in suite.results if r.success) for suite in suites)
1211 | 
1212 |         # Totals
1213 |         total_expected = sum(r.expected_tool_call_count for suite in suites for r in suite.results)
1214 |         total_attempted_all = sum(len(r.tool_call_details) for suite in suites for r in suite.results)
1215 |         total_attempted_llm = sum(
1216 |             sum(1 for tc in r.tool_call_details if tc.llm_initiated)
1217 |             for suite in suites for r in suite.results
1218 |         )
1219 |         total_matched_llm = sum(
1220 |             sum(
1221 |                 1 for tc in r.tool_call_details
1222 |                 if tc.llm_initiated and tc.expected and tc.parameters_correct and tc.semantic_match
1223 |             )
1224 |             for suite in suites for r in suite.results
1225 |         )
1226 |         total_params_correct = sum(
1227 |             sum(1 for tc in r.tool_call_details if tc.parameters_correct)
1228 |             for suite in suites for r in suite.results
1229 |         )
1230 |         total_execution_success = sum(
1231 |             sum(1 for tc in r.tool_call_details if tc.execution_successful)
1232 |             for suite in suites for r in suite.results
1233 |         )
1234 |         total_semantic_match = sum(
1235 |             sum(1 for tc in r.tool_call_details if tc.semantic_match)
1236 |             for suite in suites for r in suite.results
1237 |         )
1238 | 
1239 |         total_params_correct_llm = sum(
1240 |             sum(1 for tc in r.tool_call_details if tc.llm_initiated and tc.parameters_correct)
1241 |             for suite in suites for r in suite.results
1242 |         )
1243 |         total_execution_success_llm = sum(
1244 |             sum(1 for tc in r.tool_call_details if tc.llm_initiated and tc.execution_successful)
1245 |             for suite in suites for r in suite.results
1246 |         )
1247 |         total_semantic_match_llm = sum(
1248 |             sum(1 for tc in r.tool_call_details if tc.llm_initiated and tc.semantic_match)
1249 |             for suite in suites for r in suite.results
1250 |         )
1251 | 
1252 |         # Rates (all bounded 0..100 where applicable)
1253 |         success_rate = (total_success / total_tests * 100.0) if total_tests > 0 else 0.0
1254 | 
1255 |         precision = (total_matched_llm / total_attempted_llm) if total_attempted_llm > 0 else 0.0
1256 |         recall = (total_matched_llm / total_expected) if total_expected > 0 else 0.0
1257 |         f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
1258 | 
1259 |         param_success_rate = (total_params_correct / total_attempted_all * 100.0) if total_attempted_all > 0 else 0.0
1260 |         execution_success_rate = (total_execution_success / total_attempted_all * 100.0) if total_attempted_all > 0 else 0.0
1261 |         semantic_match_rate = (total_semantic_match / total_attempted_all * 100.0) if total_attempted_all > 0 else 0.0
1262 | 
1263 |         param_success_rate_llm = (total_params_correct_llm / total_attempted_llm * 100.0) if total_attempted_llm > 0 else 0.0
1264 |         execution_success_rate_llm = (total_execution_success_llm / total_attempted_llm * 100.0) if total_attempted_llm > 0 else 0.0
1265 |         semantic_match_rate_llm = (total_semantic_match_llm / total_attempted_llm * 100.0) if total_attempted_llm > 0 else 0.0
1266 | 
1267 |         # Diagnostic factor (can be > 100%, do NOT use in scoring)
1268 |         overcall_factor = (total_attempted_all / total_expected) if total_expected > 0 else 0.0  # e.g. x3.76
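        # Illustrative reading: 15 attempted calls against 4 expected gives x3.75,
        # i.e. heavy over-calling. This factor is for diagnosis only; the soft
        # penalty folded into the final score is computed separately further down.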
1269 |         overcall_disp = f"x{overcall_factor:.2f}" if total_expected > 0 else "N/A"
1270 | 
1271 |         report.append("OVERALL SUMMARY")
1272 |         report.append("-"*40)
1273 |         report.append(f"Total Scenarios: {total_tests}")
1274 |         report.append(f"Successful Scenarios: {total_success}")
1275 |         report.append(f"Failed Scenarios: {total_tests - total_success}")
1276 |         report.append(f"Scenario Success Rate: {success_rate:.1f}%")
1277 |         report.append("")
1278 |         report.append("TOOL CALL STATISTICS")
1279 |         report.append("-"*25)
1280 |         report.append(f"Expected Tool Calls: {total_expected}")
1281 |         report.append(f"Attempted Calls (LLM + Emulated): {total_attempted_all}")
1282 |         report.append(f"Attempted Calls (LLM only): {total_attempted_llm}")
1283 |         report.append(f"Matched Expected (LLM only): {total_matched_llm}")
1284 |         report.append(f"Tool Precision (LLM only): {clamp01(precision)*100:.1f}%")
1285 |         report.append(f"Tool Recall (LLM only): {clamp01(recall)*100:.1f}%")
1286 |         report.append(f"Tool F1 (LLM only): {clamp01(f1)*100:.1f}%")
1287 |         report.append(f"Over/Under-call factor (diagnostic): {overcall_disp}")
1288 | 
1289 |         # Emulation rate (non-zero only when some attempted calls were not LLM-initiated)
1290 |         emulation_rate = ((total_attempted_all - total_attempted_llm) / total_attempted_all * 100.0) if total_attempted_all > 0 else 0.0
1291 |         report.append(f"Emulation rate: {emulation_rate:.1f}%")
1292 |         report.append(f"Parameter Accuracy (structural): {param_success_rate:.1f}%")
1293 |         report.append(f"Parameter Accuracy (LLM only): {param_success_rate_llm:.1f}%")
1294 |         report.append(f"Parameter Accuracy (semantic): {semantic_match_rate:.1f}%")
1295 |         report.append(f"Parameter Accuracy (semantic, LLM only): {semantic_match_rate_llm:.1f}%")
1296 |         report.append(f"Execution Success Rate: {execution_success_rate:.1f}%")
1297 |         report.append(f"Execution Success Rate (LLM only): {execution_success_rate_llm:.1f}%")
1298 |         report.append("")
1299 | 
1300 |         # Per-suite block
1301 |         for suite in suites:
1302 |             report.append(f"\n{suite.name.upper()}")
1303 |             report.append("-"*40)
1304 |             report.append(f"Scenarios: {len(suite.results)}")
1305 |             report.append(f"Success Rate: {suite.success_rate:.1f}%")
1306 |             report.append(f"Total Tool Calls: {suite.total_tool_calls}")
1307 | 
1308 |             report.append("\nScenario Details:")
1309 |             for result in suite.results:
1310 |                 status = "PASS" if result.success else "FAIL"
1311 |                 attempted_all = len(result.tool_call_details)
1312 |                 attempted_llm = sum(1 for tc in result.tool_call_details if tc.llm_initiated)
1313 |                 matched_llm = sum(1 for tc in result.tool_call_details if tc.llm_initiated and tc.expected)
1314 | 
1315 |                 scn_prec = (matched_llm / attempted_llm) if attempted_llm > 0 else 0.0
1316 |                 scn_recall = (matched_llm / result.expected_tool_call_count) if result.expected_tool_call_count > 0 else 0.0
1317 |                 scn_f1 = (2 * scn_prec * scn_recall / (scn_prec + scn_recall)) if (scn_prec + scn_recall) > 0 else 0.0
1318 |                 scn_over = (attempted_all / result.expected_tool_call_count) if result.expected_tool_call_count > 0 else 0.0
1319 |                 scn_over_disp = f"x{scn_over:.2f}" if result.expected_tool_call_count > 0 else "N/A"
1320 | 
1321 |                 report.append(f"\n [{status}] {result.scenario_name}: {result.description}")
1322 |                 report.append(f" Conversation turns: {result.conversation_turns}")
1323 |                 report.append(f" Tool calls made: {attempted_all} (expected {result.expected_tool_call_count})")
1324 |                 report.append(f" LLM-initiated: {attempted_llm} | Emulated: {attempted_all - attempted_llm}")
1325 |                 report.append(f" Tool P/R/F1 (LLM): {clamp01(scn_prec)*100:.1f}% / {clamp01(scn_recall)*100:.1f}% / {clamp01(scn_f1)*100:.1f}%")
1326 |                 report.append(f" Over/Under-call factor: {scn_over_disp}")
1327 |                 report.append(f" Execution time: {result.execution_time:.2f}s")
1328 | 
1329 |                 if result.tool_calls_made:
1330 |                     tools_summary = {}
1331 |                     for tool in result.tool_calls_made:
1332 |                         tools_summary[tool] = tools_summary.get(tool, 0) + 1
1333 |                     report.append(f" Tools used: {', '.join(f'{k}({v})' for k, v in tools_summary.items())}")
1334 | 
1335 |                 if result.error:
1336 |                     # Replace Unicode characters that might cause console issues ('\u2605' is '★')
1337 |                     safe_error = result.error.replace('\u2605', '*')
1338 |                     report.append(f" Error: {safe_error}")
1339 | 
1340 |         # Weighted complexity score (unchanged)
1341 |         report.append("\n" + "="*60)
1342 |         report.append("FINAL SCORE")
1343 |         report.append("="*60)
1344 | 
1345 |         weights = {"Extra Large": 0.25, "Simple": 0.2, "Medium": 0.25, "Large": 0.3}  # "Extra Large" first: "Large" would otherwise substring-match it
1346 |         weighted_score = 0.0
1347 |         weight_total = 0.0
1348 |         for suite in suites:
1349 |             for k, w in weights.items():
1350 |                 if k.lower() in suite.name.lower():
1351 |                     weighted_score += suite.success_rate * w
1352 |                     weight_total += w
1353 |                     break
1354 |         if weight_total == 0:
1355 |             weighted_score = success_rate  # fallback
1356 |         else:
1357 |             weighted_score = weighted_score / weight_total
1358 | 
1359 |         # Use bounded metrics only
1360 |         overall_score = (
1361 |             clamp01(success_rate/100.0) +
1362 |             clamp01(f1) +  # already in 0..1
1363 |             clamp01(param_success_rate_llm/100.0) +
1364 |             clamp01(execution_success_rate_llm/100.0) +
1365 |             clamp01(semantic_match_rate_llm/100.0)
1366 |         ) / 5.0 * 100.0
1367 | 
1368 |         # Optional over-call penalty (soft)
1369 |         penalty = 1.0
1370 |         if overcall_factor > 1.0:
1371 |             # soft rational damping so small over-calls don't tank the score
1372 |             penalty = 1.0 / (1.0 + 0.25 * max(0.0, (overcall_factor - 1.0)))
1373 |         overall_score *= penalty
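        # Illustrative arithmetic (components assumed, not from a real run): with
        # 80% scenario success, F1 0.70, params 90%, execution 95%, semantic 85%,
        # the mean is (0.80 + 0.70 + 0.90 + 0.95 + 0.85) / 5 = 0.84 -> 84.0%.
        # An overcall_factor of 2.0 gives penalty = 1 / (1 + 0.25 * 1.0) = 0.8,
        # so the reported OVERALL SCORE would be 84.0 * 0.8 = 67.2% (grade B-).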
1374 | 
1375 |         def grade(score):
1376 |             if score >= 90: return "A+"
1377 |             if score >= 85: return "A"
1378 |             if score >= 80: return "A-"
1379 |             if score >= 75: return "B+"
1380 |             if score >= 70: return "B"
1381 |             if score >= 65: return "B-"
1382 |             if score >= 60: return "C+"
1383 |             if score >= 55: return "C"
1384 |             if score >= 50: return "C-"
1385 |             if score >= 40: return "D"
1386 |             return "F"
1387 | 
1388 |         report.append(f"Scenario Success Rate: {success_rate:.1f}%")
1389 |         report.append(f"Tool Precision (LLM only): {clamp01(precision)*100:.1f}%")
1390 |         report.append(f"Tool Recall (LLM only): {clamp01(recall)*100:.1f}%")
1391 |         report.append(f"Tool F1 (LLM only): {clamp01(f1)*100:.1f}%")
1392 |         report.append(f"Parameter Accuracy (structural): {param_success_rate:.1f}%")
1393 |         report.append(f"Parameter Accuracy (LLM only): {param_success_rate_llm:.1f}%")
1394 |         report.append(f"Parameter Accuracy (semantic): {semantic_match_rate:.1f}%")
1395 |         report.append(f"Parameter Accuracy (semantic, LLM only): {semantic_match_rate_llm:.1f}%")
1396 |         report.append(f"Execution Success Rate: {execution_success_rate:.1f}%")
1397 |         report.append(f"Execution Success Rate (LLM only): {execution_success_rate_llm:.1f}%")
1398 |         report.append(f"Weighted Complexity Score: {weighted_score:.1f}%")
1399 |         report.append("")
1400 |         report.append(f"OVERALL SCORE: {overall_score:.1f}% ({grade(overall_score)})")
1401 |         report.append("\n" + "="*60)
1402 |         return "\n".join(report)
1403 | 
1404 | 
1405 | 
1406 | # ------------------------ CLI ------------------------
1407 | 
1408 | def main():
1409 |     parser = argparse.ArgumentParser(
1410 |         description="Test LLM natural tool calling capabilities",
1411 |         formatter_class=argparse.RawDescriptionHelpFormatter,
1412 |         epilog="""
1413 | Examples:
1414 |   %(prog)s --api-base https://api.openai.com/v1 --api-key sk-xxx --model gpt-4o
1415 |   %(prog)s --api-base http://localhost:8000/v1 --api-key local --model llama-70b
1416 |   %(prog)s --api-base https://openrouter.ai/api/v1 --api-key sk-or-xxx --model anthropic/claude-3.5-sonnet --provider Anthropic
1417 | """
1418 |     )
1419 |     parser.add_argument("--api-base", required=True, help="API base URL (e.g., https://api.openai.com/v1)")
1420 |     parser.add_argument("--api-key", required=True, help="API key for authentication")
1421 |     parser.add_argument("--model", required=True, help="Model name to test")
1422 |     parser.add_argument("--provider", help="OpenRouter provider (e.g., 'Anthropic', 'OpenAI')")
1423 |     parser.add_argument("--temperature", type=float, help="Temperature for text generation (default: 0.1)")
1424 |     parser.add_argument("--reasoning-effort", choices=['low', 'medium', 'high'], help="OpenRouter reasoning effort level")
1425 |     parser.add_argument("--max-tools", type=int, default=40, help="Maximum number of tool calls to test (default: 40)")
1426 |     parser.add_argument("--output", help="Output file for the report (optional)")
1427 |     parser.add_argument("--debug", action="store_true", help="Enable debug output")
1428 |     parser.add_argument("--quick", action="store_true", help="Run only simple tests (faster)")
1429 |     args = parser.parse_args()
1430 | 
1431 |     print("\nInitializing Natural Tool Calling Tester...")
1432 |     print(f"API Base: {args.api_base}")
1433 |     print(f"Model: {args.model}")
1434 |     if args.provider:
1435 |         print(f"Provider: {args.provider}")
1436 |     if args.temperature is not None:
1437 |         print(f"Temperature: {args.temperature}")
1438 |     if args.reasoning_effort:
1439 |         print(f"Reasoning Effort: {args.reasoning_effort}")
1440 |     print(f"Max Tools: {args.max_tools}")
1441 | 
1442 |     tester = LLMToolTester(
1443 |         args.api_base,
1444 |         args.api_key,
1445 |         args.model,
1446 |         args.debug,
1447 |         provider=args.provider,
1448 |         temperature=args.temperature,
1449 |         reasoning_effort=args.reasoning_effort
1450 |     )
1451 |     suites: List[TestSuite] = []
1452 | 
1453 |     all_scenarios = TestScenarios.get_scenarios()
1454 | 
1455 |     if args.quick:
1456 |         print("\nRunning Quick Test Suite (1-5 tools)...")
1457 |         simple_scenarios = [s for s in all_scenarios if len(s["expected_tools"]) + len(s.get("follow_ups", [])) <= 5]
1458 |         suite = tester.run_test_suite("Quick Tests (1-5 tools)", simple_scenarios[:3])
1459 |         suites.append(suite)
1460 |     else:
1461 |         if args.max_tools >= 5:
1462 |             print("\nRunning Simple Test Suite (1-5 tools)...")
1463 |             simple_scenarios = TestScenarios.get_scenario_by_complexity(1, 5)
1464 |             suite = tester.run_test_suite("Simple Tests (1-5 tools)", simple_scenarios)
1465 |             suites.append(suite)
1466 | 
1467 |         if args.max_tools >= 10:
1468 |             print("\nRunning Medium Test Suite (6-10 tools)...")
1469 |             medium_scenarios = TestScenarios.get_scenario_by_complexity(6, 10)
1470 |             suite = tester.run_test_suite("Medium Tests (6-10 tools)", medium_scenarios)
1471 |             suites.append(suite)
1472 | 
1473 |         if args.max_tools >= 20:
1474 |             print("\nRunning Large Test Suite (11-20 tools)...")
1475 |             large_scenarios = TestScenarios.get_scenario_by_complexity(11, 20)
1476 |             suite = tester.run_test_suite("Large Tests (11-20 tools)", large_scenarios)
1477 |             suites.append(suite)
1478 | 
1479 |         if args.max_tools >= 40:
1480 |             print("\nRunning Extra Large Test Suite (21+ tools)...")
1481 |             xl_scenarios = TestScenarios.get_scenario_by_complexity(21, 50)
1482 |             suite = tester.run_test_suite("Extra Large Tests (21+ tools)", xl_scenarios)
1483 |             suites.append(suite)
1484 | 
1485 |     report = tester.generate_report(suites)
1486 |     print(report)
1487 | 
1488 |     if args.output:
1489 |         with open(args.output, 'w', encoding='utf-8') as f:
1490 |             f.write(report)
1491 |         print(f"\nReport saved to: {args.output}")
1492 | 
1493 |     overall_success = sum(sum(1 for r in suite.results if r.success) for suite in suites)
1494 |     overall_total = sum(len(suite.results) for suite in suites)
1495 |     success_rate = (overall_success / overall_total * 100) if overall_total > 0 else 0
1496 |     return 0 if success_rate >= 70 else 1
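    # The exit code makes the tester scriptable: wrapper scripts and CI jobs can
    # treat a sub-70% scenario success rate as failure, e.g. (values illustrative)
    #   python3 tool_tester_v2.py --api-base URL --api-key KEY --model NAME || echo "below threshold"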
1497 | 
1498 | 
1499 | if __name__ == "__main__":
1500 |     sys.exit(main())
1501 | 
--------------------------------------------------------------------------------