├── .python-version ├── tests ├── __init__.py ├── template │ └── eval.toml ├── sort.toml ├── summarize.toml ├── factorial-5.toml ├── github-contributors.toml ├── google-homepage-images.toml ├── summarize-gist.toml ├── valtown.toml └── test_mcpx.py ├── .gitignore ├── mcpx_eval ├── __init__.py ├── constants.py ├── models.py ├── __main__.py ├── database.py ├── judge.py ├── html.py └── htmlgen.py ├── run.sh ├── .github └── workflows │ ├── ci.yml │ └── publish.yml ├── pyproject.toml ├── LICENSE └── README.md /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # tests 2 | -------------------------------------------------------------------------------- /tests/template/eval.toml: -------------------------------------------------------------------------------- 1 | expected-tools = [ 2 | "eval-py", 3 | "eval-js" 4 | ] 5 | -------------------------------------------------------------------------------- /tests/sort.toml: -------------------------------------------------------------------------------- 1 | import = "template/eval.toml" 2 | name = "sort" 3 | prompt = "Sort the list [5, 3, 4, 1, 2] in ascending order." 4 | check = "[1, 2, 3, 4, 5]" 5 | max-tool-calls = 5 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | 12 | # sqlite3 13 | *.db 14 | 15 | # aider 16 | .aider* 17 | -------------------------------------------------------------------------------- /tests/summarize.toml: -------------------------------------------------------------------------------- 1 | name = "summarize-fetch" 2 | prompt = "Summarize {{url}}" 3 | check = "A summary of the requested url based on the results of the fetch tool" 4 | vars = {url="https://example.com"} 5 | expected-tools = ["fetch"] 6 | -------------------------------------------------------------------------------- /tests/factorial-5.toml: -------------------------------------------------------------------------------- 1 | import = "template/eval.toml" 2 | name = "factorial-5" 3 | prompt = "Calculate the factorial of 5 using math functions" 4 | check = "eval-py or eval-js should be used to determine the factorial of 5 is 120" 5 | max-tool-calls = 5 6 | -------------------------------------------------------------------------------- /mcpx_eval/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .models import Score, Results, Model, Test 4 | from .database import Database 5 | from .judge import Judge 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | __all__ = ["Score", "Results", "Model", "Test", "Database", "Judge"] 10 | -------------------------------------------------------------------------------- /tests/github-contributors.toml: -------------------------------------------------------------------------------- 1 | name = "github-contributors" 2 | prompt = "List the contributors to {{repo}} and any information github provides about them" 3 | check = "A list of github users using the github tools" 4 | vars = {repo="extism/extism"} 5 | expected-tools = 
["gh-get-repo-contributors"] 6 | -------------------------------------------------------------------------------- /tests/google-homepage-images.toml: -------------------------------------------------------------------------------- 1 | name = "google-homepage-images" 2 | prompt = "how many images are on the google homepage of google?" 3 | check = "the fetch tool should be used to determine how many images are on the google homepage" 4 | max-tool-calls = 5 5 | expected-tools = [ 6 | "fetch" 7 | ] 8 | -------------------------------------------------------------------------------- /tests/summarize-gist.toml: -------------------------------------------------------------------------------- 1 | name = "summarize-gist" 2 | prompt = "Summarize {{url}} and save it to a private gist" 3 | check = "A summary of the requested url based on the results of the fetch tool should be saved as a github gist" 4 | vars = {url="https://dylibso.com"} 5 | expected-tools = ["fetch", "gh-create-gist"] 6 | -------------------------------------------------------------------------------- /tests/valtown.toml: -------------------------------------------------------------------------------- 1 | name = "valtown" 2 | 3 | prompt = """ 4 | create a publicly callable javascript function name getDateAndtime that returns the current date and time 5 | """ 6 | 7 | check = """ 8 | call the created function using the val.town API to check the result 9 | """ 10 | 11 | expected-tools = [ 12 | "valtown" 13 | ] 14 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | remote_models="\ 4 | --model o1 \ 5 | --model openai:o3-mini \ 6 | --model gpt-4o \ 7 | --model claude-3-5-sonnet-latest \ 8 | --model claude-3-7-sonnet-latest 9 | --model claude-3-5-haiku-latest" 10 | 11 | models=${models-$remote_models} 12 | iter=${iterations-5} 13 | 14 | for test in tests/*.toml; do 15 | uv run mcpx-eval test --config $test $models --iter $iter 16 | done 17 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - main 7 | 8 | name: CI 9 | 10 | jobs: 11 | test: 12 | name: Test mcpx-eval 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | os: [ubuntu-latest, macos-latest] 17 | steps: 18 | - name: Checkout sources 19 | uses: actions/checkout@v3 20 | - name: Install uv 21 | uses: astral-sh/setup-uv@v5 22 | - run: uv run python3 -m unittest 23 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | push: 4 | branches: [ "v*" ] 5 | tags: 6 | - 'v*' 7 | 8 | permissions: 9 | contents: read 10 | id-token: write 11 | 12 | name: Publish 13 | 14 | jobs: 15 | publish: 16 | name: Publish mcpx-eval 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | matrix: 20 | os: [ubuntu-latest] 21 | steps: 22 | - name: Checkout sources 23 | uses: actions/checkout@v3 24 | - name: Install uv 25 | uses: astral-sh/setup-uv@v5 26 | - name: uv publish 27 | run: | 28 | uv build 29 | uv publish dist/* 30 | -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [project] 2 | name = "mcpx-eval" 3 | version = "0.3.0" 4 | description = "Open ended tool use evaluation framework" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "jinja2>=3.1.5", 9 | "matplotlib>=3.10.0", 10 | "mcp-run>=0.4.4", 11 | "mcpx-py>=0.4.2", 12 | "pandas>=2.2.0", 13 | "pystache>=0.6.7", 14 | ] 15 | 16 | [tool.uv] 17 | package = true 18 | 19 | [dependency-groups] 20 | dev = [ 21 | "python-lsp-ruff>=2.2.2", 22 | "python-lsp-server>=1.12.2", 23 | "ruff>=0.9.6", 24 | ] 25 | 26 | [project.scripts] 27 | mcpx-eval = "mcpx_eval.__main__:main" 28 | 29 | # Workaround for uv/setuptools version mismatch 30 | # https://github.com/astral-sh/uv/issues/9513#issuecomment-2519527822 31 | [tool.setuptools] 32 | license-files = [] 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2025 Dylibso, Inc. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mcpx-eval 2 | 3 | A framework for evaluating open-ended tool use across various large language models. 4 | 5 | `mcpx-eval` can be used to compare the output of different LLMs with the same prompt for a given task using [mcp.run](https://www.mcp.run) tools. 6 | This means we're not only interested in the quality of the output, but also curious about the helpfulness of various models 7 | when presented with real world tools. 
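Each eval is a small TOML file that pairs the prompt sent to the model under test with a `check` prompt for the judge and a list of `expected-tools`. The sketch below mirrors `tests/sort.toml` from this repository (the field reference is under "Test file" further down); the inline comments are explanatory additions rather than part of the shipped file:

```toml
# Shared fields can be pulled in from another file via `import`;
# template/eval.toml contributes expected-tools = ["eval-py", "eval-js"].
import = "template/eval.toml"
name = "sort"

# Prompt given to each model under test.
prompt = "Sort the list [5, 3, 4, 1, 2] in ascending order."

# Criteria the judge uses to score each model's output.
check = "[1, 2, 3, 4, 5]"

# Used by the bundled tests; presumably an upper bound on tool calls per run.
max-tool-calls = 5
```

A config like this can be run against one or more models with `mcpx-eval test --config tests/sort.toml --model <model>`, much as `run.sh` does for every file in `tests/`.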
8 | 9 | ## Test configs 10 | 11 | The [tests/](https://github.com/dylibso/mcpx-eval/tree/main/tests) directory contains pre-defined evals 12 | 13 | ## Installation 14 | 15 | 16 | ```bash 17 | uv tool install mcpx-eval 18 | ``` 19 | 20 | Or from git: 21 | 22 | ```bash 23 | uv tool install git+https://github.com/dylibso/mcpx-eval 24 | ``` 25 | 26 | Or using `uvx` without installation: 27 | 28 | ```bash 29 | uvx mcpx-eval 30 | ``` 31 | 32 | ## mcp.run Setup 33 | 34 | You will need to get an mcp.run session ID by running: 35 | 36 | ```bash 37 | npx --yes -p @dylibso/mcpx gen-session --write 38 | ``` 39 | 40 | This will generate a new session and write the session ID to a configuration file that can be used 41 | by `mcpx-eval`. 42 | 43 | If you need to store the session ID in an environment variable you can run `gen-session` 44 | without the `--write` flag: 45 | 46 | ```bash 47 | npx --yes -p @dylibso/mcpx gen-session 48 | ``` 49 | 50 | which should output something like: 51 | 52 | ``` 53 | Login successful! 54 | Session: kabA7w6qH58H7kKOQ5su4v3bX_CeFn4k.Y4l/s/9dQwkjv9r8t/xZFjsn2fkLzf+tkve89P1vKhQ 55 | ``` 56 | 57 | Then set the `MCP_RUN_SESSION_ID` environment variable: 58 | 59 | ``` 60 | $ export MCP_RUN_SESSION_ID=kabA7w6qH58H7kKOQ5su4v3bX_CeFn4k.Y4l/s/9dQwkjv9r8t/xZFjsn2fkLzf+tkve89P1vKhQ 61 | ``` 62 | 63 | ## Usage 64 | 65 | Run an eval comparing all mcp.task runs for `my-task`: 66 | 67 | ```bash 68 | mcpx-eval test --task my-task --task-run all 69 | ``` 70 | 71 | Only evaluate the latest task run: 72 | 73 | ```bash 74 | mcpx-eval test --task my-task --task-run latest 75 | ``` 76 | 77 | Or trigger a new task run: 78 | 79 | ```bash 80 | mcpx-eval test --task my-task --task-run new 81 | ``` 82 | 83 | Run an mcp.run task locally with a different set of models: 84 | 85 | ```bash 86 | mcpx-eval test --model .. --model .. --task my-task --iter 10 87 | ``` 88 | 89 | Generate an HTML scoreboard for all evals: 90 | 91 | ```bash 92 | mcpx-eval gen --html results.html --show 93 | ``` 94 | 95 | ### Test file 96 | 97 | A test file is a TOML file containing the following fields: 98 | 99 | - `name` - name of the test 100 | - `task` - optional, the name of the mcp.run task to use 101 | - `task-run` - optional, one of `latest`, `new`, `all` or the name/index of the task run to analyze 102 | - `prompt` - prompt to test, this is passed to the LLM under test, this can be left blank if `task` is set 103 | - `check` - prompt for the judge, this is used to determine the quality of the test output 104 | - `expected-tools` - list of tool names that might be used 105 | - `ignored-tools` - optional, list of tools to ignore, they will not be available to the LLM 106 | - `import` - optional, includes fields from another test TOML file 107 | - `vars` - optional, a dict of variables that will be used to format the prompt 108 | -------------------------------------------------------------------------------- /mcpx_eval/constants.py: -------------------------------------------------------------------------------- 1 | SYSTEM_PROMPT = """ 2 | You are a large language model evaluator, you are an expert at comparing the output of various models based on 3 | accuracy, tool use appropriateness, helpfullness, and quality of the output. 
4 | 5 | - The LLMs being tested may have different tools available from the judge 6 | - All numeric scores should be scored from 0.0 - 100.0, where 100 is the best score and 0 is the worst 7 | - The original prompt provided to the LLM can be found between the tags 8 | - The output of the LLM for the given prompt can be found between the tags, this is an array of the various 9 | messages sent and tools used. The final_result message should be used to fill the `llm_output` field 10 | - Additional information and context for each evaluation is included in the section 11 | - The section is provided by the user to list which tools may be to be used to execute the specified task 12 | if all of the tools listed aren't used it should not affect the score, however it is not good for non-expected tools to be used 13 | - Do not make assumptions about improvements to the quality of the output beyond what is noted in the tags, 14 | the section is defined by the user as a way to validate the output given for the associated prompt 15 | - The accuracy score should reflect the accuracy of the result generally and taking into account the block and results 16 | of tool calls 17 | - The tool_use score should be based on whether or not the correct tool was used and whether the minimum amount 18 | of tools were used to accomplish a task. Over use of tools or repeated use of tools should deduct points from 19 | this score. This score should also be affected by how well the tools used conform to the tools listed in the 20 | block. 21 | - If a tool call fails but is fixed after retrying after a reasonable amount of times it shouldn't be considered a failure 22 | since some exploration may be needed. 23 | - Multiple failed tool calls that end up accomplishing the goal are preferred to fewer calls that don't. 24 | - The helpfulness score should measure how useful the response is in addressing the user's need. This should also reflect 25 | the completeness of the response. 26 | - The quality score should reflect the overall clearness and conciseness of the output 27 | - Try to utilize the tools that are available instead of searching for new tools 28 | - Not using any tools should deduct some points from the tool use score 29 | - The `description` field should contain a breakdown of why each score was awarded 30 | 31 | Advanced evaluation metrics: 32 | - A guess should not be considered a hallucination, however it should affect the accuracy score 33 | - The hallucination_score should measure the presence of made-up, incorrect, or factually unsupported statements 34 | (lower is better, with 0 being no hallucinations and 100 being completely hallucinated) 35 | - hallucination_score should only apply to made up information, if information is true at the time of the request 36 | it should be considered to be true 37 | - The false_claims field should list any specific false statements or hallucinations identified in the response 38 | 39 | For responses containing hallucinations, analyze: 40 | 1. The severity of each hallucination (minor factual error vs completely fabricated information) 41 | 2. The confidence with which hallucinated content is presented 42 | 3. Whether hallucinations are central to the response or peripheral 43 | 4. Whether the hallucination could lead to harmful actions if believed 44 | 45 | For the hallucination_score metric (0-100 scale, lower is better), carefully check for any false statements, 46 | incorrect information, or made-up facts in the response and list them in the false_claims field. 
47 | 48 | Be thorough in your evaluation, considering how well the model's response meets both technical requirements and user needs. 49 | """ 50 | 51 | TEST_PROMPT = """ 52 | You are a helpful tool calling AI assistant with access to various external tools and APIs. Your goal is to complete tasks thoroughly and autonomously by making full use of these tools. Here are your core operating principles: 53 | 54 | 1. Take initiative - Don't wait for user permission to use tools. If a tool would help complete the task, use it immediately. 55 | 2. Chain multiple tools together when needed - Many tasks require multiple tool calls in sequence. Plan out and execute the full chain of calls needed to achieve the goal. 56 | 3. Handle errors gracefully - If a tool call fails, try alternative approaches or tools rather than asking the user what to do. 57 | 4. Make reasonable assumptions - When tool calls require parameters, use your best judgment to provide appropriate values rather than asking the user. 58 | 5. Show your work - After completing tool calls, explain what you did and show relevant results, but focus on the final outcome the user wanted. 59 | 6. Be thorough - Use tools repeatedly as needed until you're confident you've fully completed the task. Don't stop at partial solutions. However, repeated use of the same tool 60 | with the same paramters is unlikely to be helpful. 61 | 7. Always utilize the tools/functions that are already available rather than searching for new tools if possible. Instead of searching try to use an existing tool 62 | to accomplish a task. 63 | 8. Once an acceptable answer has been reached you should return it to the user, additional tool calls are not needed. 64 | 65 | Your responses should focus on results rather than asking questions. Only ask the user for clarification if the task itself is unclear or impossible with the tools available. 
66 | """ 67 | 68 | # OpenAI model identifiers 69 | OPENAI_MODELS = [ 70 | "gpt-4o", 71 | "o1", 72 | "o1-mini", 73 | "o3-mini", 74 | "o3", 75 | "gpt-3.5", 76 | "gpt-4", 77 | "gpt-4.5", 78 | ] 79 | 80 | # Default profile path 81 | DEFAULT_PROFILE = "~/default" 82 | -------------------------------------------------------------------------------- /mcpx_eval/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import List, Dict, Any, Tuple, Optional 3 | import pandas as pd 4 | from dataclasses import dataclass 5 | from .constants import OPENAI_MODELS, DEFAULT_PROFILE 6 | 7 | import json 8 | 9 | 10 | def normalize_profile(profile: str) -> str: 11 | """Normalize a profile path to ensure it has the proper format.""" 12 | if not profile: 13 | return DEFAULT_PROFILE 14 | if not profile.startswith("~/"): 15 | return "~/" + profile 16 | return profile 17 | 18 | 19 | def parse_model(m: str) -> Tuple[Optional[str], str, str]: 20 | """Parse a model string into provider, name and profile components.""" 21 | provider = None 22 | name = m 23 | profile = DEFAULT_PROFILE 24 | 25 | # Split provider and name 26 | if ":" in m: 27 | provider, name = m.split(":", maxsplit=1) 28 | 29 | # Split name and profile 30 | if "/" in name: 31 | name, profile = name.split("/", maxsplit=1) 32 | profile = normalize_profile(profile) 33 | 34 | # Infer provider if not specified 35 | if provider is None: 36 | if "claude" in name: 37 | provider = "anthropic" 38 | elif any(model in name for model in OPENAI_MODELS): 39 | provider = "openai" 40 | elif "gemini" in name: 41 | provider = "google" 42 | else: 43 | provider = "ollama" 44 | 45 | return (provider, name, profile) 46 | 47 | 48 | @dataclass 49 | class Model: 50 | name: str 51 | profile: str 52 | provider: str 53 | trace: dict | None = None 54 | 55 | def __init__( 56 | self, name: str, profile: Optional[str] = None, trace: dict | None = None 57 | ): 58 | provider, model_name, prof = parse_model(name) 59 | self.provider = provider 60 | self.name = model_name 61 | self.profile = profile if profile is not None else prof 62 | self.trace = trace 63 | 64 | @property 65 | def slug(self) -> str: 66 | """Generate a slug identifier for the model.""" 67 | if self.profile in [DEFAULT_PROFILE, "default"]: 68 | return self.name 69 | if self.profile.startswith("~/"): 70 | return f"{self.name}/{self.profile.split('/', maxsplit=1)[1]}" 71 | return f"{self.name}/{self.profile}" 72 | 73 | @property 74 | def provider_and_name(self) -> str: 75 | """Generate the provider/name identifier.""" 76 | return f"{self.provider}/{self.name}" 77 | 78 | @staticmethod 79 | def load_trace(path): 80 | """Load trace from disk""" 81 | with open(path, "r") as f: 82 | data = json.load(f) 83 | model = data.pop("model") 84 | return Model(model, trace=data) 85 | 86 | 87 | class ScoreModel(BaseModel): 88 | """Used to score the result of an LLM tool call.""" 89 | 90 | llm_output: str = Field( 91 | "", 92 | description="Model output, this is the 'content' field of the final message from the LLM", 93 | ) 94 | description: str = Field("", description="Description of results for this model") 95 | 96 | # Core metrics 97 | tool_use: float = Field( 98 | 0.0, description="A score (0-100) of how appropriate the tool use is" 99 | ) 100 | accuracy: float = Field( 101 | 0.0, 102 | description="A score (0-100) of how accurate the response is based on the output of the tool calls", 103 | ) 104 | completeness: float = Field( 105 | 0.0, 106 | 
description="A score (0-100) of how complete the response is according to the task at hand and criteria", 107 | ) 108 | quality: float = Field( 109 | 0.0, 110 | description="A score (0-100) of the response quality - this includes the usefullness and clarity of the output", 111 | ) 112 | 113 | # Hallucination metrics 114 | hallucination_score: float = Field( 115 | 0.0, 116 | description="A score (0-100) representing the presence of hallucinations (lower is better)", 117 | ) 118 | false_claims: list = Field( 119 | [], 120 | description="List of identified false claims or hallucinations in the response", 121 | ) 122 | 123 | # Tools 124 | failed_tool_calls: int = Field( 125 | 0, 126 | description="The number of failed tool calls, or tool calls that encountered an error", 127 | ) 128 | 129 | 130 | @dataclass 131 | class Score: 132 | """Used to score the result of an LLM tool call.""" 133 | 134 | score: ScoreModel 135 | model: str 136 | duration: float 137 | tool_analysis: dict 138 | redundant_tool_calls: int 139 | tool_calls: int 140 | trace: dict | None = None 141 | 142 | def __getattribute__(self, name: str) -> Any: 143 | if name == "score": 144 | return object.__getattribute__(self, name) 145 | if hasattr(self.score, name): 146 | return getattr(self.score, name) 147 | return object.__getattribute__(self, name) 148 | 149 | def to_dataframe(self) -> pd.DataFrame: 150 | """Convert results to a pandas DataFrame for analysis.""" 151 | record = { 152 | "model": self.model, 153 | "duration": self.duration, 154 | "tool_use": self.score.tool_use, 155 | "tool_calls": self.tool_calls, 156 | "accuracy": self.score.accuracy, 157 | "helpfulness": self.score.completeness, 158 | "quality": self.score.quality, 159 | "hallucination_score": self.score.hallucination_score, 160 | "redundant_tool_calls": self.redundant_tool_calls, 161 | "false_claims_count": len(self.score.false_claims), 162 | "trace": self.trace, 163 | } 164 | return pd.DataFrame(record) 165 | 166 | def save_trace(self, path): 167 | """Save trace to disk""" 168 | trace = self.trace.copy() 169 | trace["model"] = self.model 170 | with open(path, "w") as f: 171 | f.write(json.dumps(trace)) 172 | 173 | 174 | class Results(BaseModel): 175 | """Collection of scores from multiple model evaluations.""" 176 | 177 | scores: List[Score] = Field([], description="A list of scores for each model") 178 | duration: float = Field(0.0, description="Total duration of all tests") 179 | 180 | def to_dataframe(self) -> pd.DataFrame: 181 | """Convert results to a pandas DataFrame for analysis.""" 182 | records = [] 183 | for score in self.scores: 184 | records.append(score.to_dataframe()) 185 | return pd.concat(records) 186 | 187 | 188 | @dataclass 189 | class Test: 190 | """Configuration for a model evaluation test.""" 191 | 192 | name: str 193 | prompt: str 194 | check: str 195 | models: List[str] 196 | expected_tools: List[str] 197 | ignore_tools: List[str] 198 | profile: Optional[str] 199 | vars: Dict[str, Any] 200 | task: Optional[str] 201 | task_run: Optional[str] 202 | 203 | def __init__( 204 | self, 205 | name: str, 206 | prompt: str, 207 | check: str = "", 208 | models: List[str] | None = None, 209 | expected_tools: List[str] | None = None, 210 | ignore_tools: Optional[List[str]] = None, 211 | profile: Optional[str] = None, 212 | vars: Optional[Dict[str, Any]] = None, 213 | task: Optional[str] = None, 214 | task_run: Optional[str] = None, 215 | ): 216 | self.name = name 217 | self.prompt = prompt 218 | self.check = check 219 | self.models = models or [] 220 | 
self.expected_tools = expected_tools or [] 221 | self.profile = profile 222 | self.ignore_tools = ignore_tools or [] 223 | self.vars = vars or {} 224 | self.task = task 225 | self.task_run = task_run 226 | 227 | @staticmethod 228 | def from_dict(data: dict) -> "Test": 229 | """Parse a dict into a test""" 230 | return Test( 231 | data.get("name", ""), 232 | data.get("prompt", ""), 233 | data.get("check", ""), 234 | data.get("models", []), 235 | data.get("expected-tools", []), 236 | ignore_tools=data.get("ignored-tools", data.get("ignore-tools", [])), 237 | vars=data.get("vars", {}), 238 | profile=data.get("profile"), 239 | task=data.get("task"), 240 | task_run=data.get("task-run"), 241 | ) 242 | 243 | @staticmethod 244 | def load(path: str) -> "Test": 245 | """Load a test configuration from a TOML file.""" 246 | import tomllib 247 | import os 248 | 249 | with open(path) as f: 250 | s = f.read() 251 | data = tomllib.loads(s) 252 | 253 | if "import" in data: 254 | imports = data["import"] 255 | if isinstance(imports, str): 256 | imports = [imports] 257 | 258 | t = None 259 | for imp in imports: 260 | if t is None: 261 | t = Test.load(os.path.join(os.path.dirname(path), imp)) 262 | 263 | # Update test attributes with any overrides from current file 264 | t.name = data.get("name", t.name) 265 | t.prompt = data.get("prompt", t.prompt) 266 | t.check = data.get("check", t.check) 267 | t.profile = data.get("profile", t.profile) 268 | t.models = data.get("models", t.models) 269 | t.expected_tools.extend(data.get("expected-tools", [])) 270 | t.ignore_tools.extend( 271 | data.get("ignored-tools", data.get("ignore-tools", [])) 272 | ) 273 | t.vars.update(**data.get("vars", {})) 274 | t.task = t.task or data.get("task") 275 | t.task_run = t.task_run or data.get("task-run") 276 | return t 277 | 278 | if "name" not in data: 279 | data["name"] = path 280 | 281 | return Test.from_dict(data) 282 | -------------------------------------------------------------------------------- /tests/test_mcpx.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import Mock, patch, AsyncMock, MagicMock 3 | import asyncio 4 | from datetime import datetime, timedelta 5 | import json 6 | 7 | from mcpx_eval import Judge, Test, Model, Score, Results, Database 8 | from mcpx_eval.models import ScoreModel 9 | from mcpx_eval.judge import ToolAnalysis 10 | 11 | class TestJudge(unittest.TestCase): 12 | def setUp(self): 13 | self.judge = Judge( 14 | models=["test-model"], 15 | judge_model="test-judge", 16 | ignore_tools=["ignored-tool"] 17 | ) 18 | 19 | def test_add_model(self): 20 | """Test adding models to the judge""" 21 | judge = Judge() 22 | 23 | # Test adding string model 24 | judge.add_model("gpt-4") 25 | self.assertEqual(len(judge.models), 1) 26 | self.assertEqual(judge.models[0].name, "gpt-4") 27 | 28 | # Test adding Model instance 29 | model = Model(name="anthropic:claude-3") 30 | judge.add_model(model) 31 | self.assertEqual(len(judge.models), 2) 32 | self.assertEqual(judge.models[1].name, "claude-3") 33 | self.assertEqual(judge.models[1].provider, "anthropic") 34 | 35 | # Test adding model with profile 36 | judge.add_model("mistral", profile="custom") 37 | self.assertEqual(len(judge.models), 3) 38 | self.assertEqual(judge.models[2].name, "mistral") 39 | self.assertEqual(judge.models[2].profile, "custom") 40 | 41 | class TestToolAnalysis(unittest.TestCase): 42 | def test_analyze_message_unique_tools(self): 43 | """Test analyzing unique tool calls""" 44 
| from mcpx_eval.judge import ToolAnalysis 45 | 46 | tool_analysis = ToolAnalysis() 47 | 48 | # Test first unique tool call 49 | msg1 = { 50 | "tool": { 51 | "name": "test_tool", 52 | "input": {"param": "value1"} 53 | } 54 | } 55 | tool_analysis.analyze_message(msg1, 0) 56 | 57 | self.assertEqual(tool_analysis.total_tool_calls, 1) 58 | self.assertEqual(tool_analysis.redundant_tool_calls, 0) 59 | self.assertEqual( 60 | tool_analysis.tool_analysis["tool_0"]["redundancy"], 61 | "unique" 62 | ) 63 | 64 | # Test second unique tool call 65 | msg2 = { 66 | "tool": { 67 | "name": "test_tool", 68 | "input": {"param": "value2"} 69 | } 70 | } 71 | tool_analysis.analyze_message(msg2, 1) 72 | 73 | self.assertEqual(tool_analysis.total_tool_calls, 2) 74 | self.assertEqual(tool_analysis.redundant_tool_calls, 0) 75 | 76 | def test_analyze_message_redundant_tools(self): 77 | """Test analyzing redundant tool calls""" 78 | from mcpx_eval.judge import ToolAnalysis 79 | 80 | tool_analysis = ToolAnalysis() 81 | 82 | # Add first tool call 83 | msg1 = { 84 | "tool": { 85 | "name": "test_tool", 86 | "input": {"param": "value1"} 87 | } 88 | } 89 | tool_analysis.analyze_message(msg1, 0) 90 | 91 | # Add redundant tool call 92 | msg2 = { 93 | "tool": { 94 | "name": "test_tool", 95 | "input": {"param": "value1"} 96 | } 97 | } 98 | tool_analysis.analyze_message(msg2, 1) 99 | 100 | self.assertEqual(tool_analysis.total_tool_calls, 2) 101 | self.assertEqual(tool_analysis.redundant_tool_calls, 1) 102 | self.assertEqual( 103 | tool_analysis.tool_analysis["tool_1"]["redundancy"], 104 | "redundant" 105 | ) 106 | 107 | class TestModelApiConfig(unittest.TestCase): 108 | @patch.dict('os.environ', { 109 | 'OPENAI_HOST': 'https://custom-openai.com', 110 | 'GPT-4_HOST': 'https://custom-gpt4.com' 111 | }) 112 | def test_get_host_url(self): 113 | """Test getting host URLs for different providers""" 114 | from mcpx_eval.judge import ModelApiConfig 115 | 116 | # Test OpenAI default 117 | url = ModelApiConfig.get_host_url("gpt-3.5-turbo", "openai") 118 | self.assertEqual(url, "https://custom-openai.com/v1") 119 | 120 | # Test model-specific override 121 | url = ModelApiConfig.get_host_url("gpt-4", "openai") 122 | self.assertEqual(url, "https://custom-gpt4.com/v1") 123 | 124 | # Test Ollama default 125 | url = ModelApiConfig.get_host_url("llama2", "ollama") 126 | self.assertEqual(url, "http://127.0.0.1:11434/v1") 127 | 128 | import asyncio 129 | 130 | class AsyncIteratorMock: 131 | def __init__(self, items): 132 | self.items = items 133 | self.index = 0 134 | 135 | async def __aiter__(self): 136 | return self 137 | 138 | async def __anext__(self): 139 | try: 140 | item = self.items[self.index] 141 | except IndexError: 142 | raise StopAsyncIteration 143 | self.index += 1 144 | return item 145 | 146 | class MockPart: 147 | def __init__(self, **kwargs): 148 | self.__dict__.update(kwargs) 149 | 150 | class MockResponse: 151 | def __init__(self, **kwargs): 152 | for key, value in kwargs.items(): 153 | setattr(self, key, value) 154 | 155 | class TestJudgeEvaluation(unittest.IsolatedAsyncioTestCase): 156 | @patch('mcpx_eval.judge.Chat') 157 | @patch('mcpx_eval.judge.mcp_run') 158 | async def test_evaluate_model_success(self, mock_mcp_run, mock_chat): 159 | """Test successful model evaluation""" 160 | # Setup mock mcp_run.Client with proper tools attribute 161 | mock_tools = MagicMock() 162 | mock_tools.keys.return_value = ["test_tool"] 163 | mock_client = MagicMock() 164 | mock_client.tools = mock_tools 165 | mock_mcp_run.Client = 
Mock(return_value=mock_client) 166 | mock_mcp_run.ClientConfig = Mock() 167 | 168 | # Setup mock chat instance 169 | mock_chat_instance = MagicMock() 170 | mock_chat_instance.client = mock_client 171 | 172 | # Setup response parts 173 | model_response_parts = [ 174 | MockPart( 175 | part_kind="text", 176 | content="Test response" 177 | ), 178 | MockPart( 179 | part_kind="tool-call", 180 | tool_name="test_tool", 181 | tool_call_id="123", 182 | args={"param": "value"}, 183 | args_as_dict=lambda: {"param": "value"} 184 | ) 185 | ] 186 | request_parts = [ 187 | MockPart( 188 | part_kind="tool-return", 189 | tool_name="test_tool", 190 | tool_call_id="123", 191 | content="Tool result" 192 | ) 193 | ] 194 | 195 | async def mock_iter(prompt): 196 | yield MockResponse(model_response=MockResponse(parts=model_response_parts)) 197 | yield MockResponse(request=MockResponse(parts=request_parts)) 198 | yield MockResponse(data=MockPart(data="Final result")) 199 | 200 | mock_chat_instance.iter = mock_iter 201 | mock_chat.return_value = mock_chat_instance 202 | 203 | judge = Judge() 204 | model = Model(name="test-model") 205 | tool_analysis = ToolAnalysis() 206 | 207 | result = await judge.evaluate_model(model, "Test prompt", tool_analysis) 208 | 209 | self.assertIsNotNone(result) 210 | self.assertEqual(len(result["messages"]), 4) # text, tool-call, tool-return, final_result 211 | self.assertEqual(result["messages"][0]["kind"], "text") 212 | self.assertEqual(result["messages"][1]["kind"], "tool-call") 213 | self.assertEqual(result["messages"][2]["kind"], "tool-return") 214 | self.assertEqual(result["messages"][3]["kind"], "final_result") 215 | 216 | @patch('mcpx_eval.judge.Chat') 217 | @patch('mcpx_eval.judge.mcp_run') 218 | async def test_evaluate_model_failure(self, mock_mcp_run, mock_chat): 219 | """Test model evaluation with error""" 220 | # Setup mock mcp_run.Client 221 | mock_client = Mock() 222 | mock_mcp_run.Client = Mock(return_value=mock_client) 223 | mock_mcp_run.ClientConfig = Mock() 224 | 225 | mock_chat_instance = Mock() 226 | 227 | async def mock_iter(prompt): 228 | raise Exception("Test error") 229 | yield # Needed to make it a generator 230 | 231 | mock_chat_instance.iter = mock_iter 232 | mock_chat.return_value = mock_chat_instance 233 | 234 | judge = Judge() 235 | model = Model(name="test-model") 236 | tool_analysis = ToolAnalysis() 237 | 238 | result = await judge.evaluate_model(model, "Test prompt", tool_analysis) 239 | 240 | self.assertIsNone(result) 241 | 242 | class TestDatabase(unittest.TestCase): 243 | def setUp(self): 244 | self.db = Database(":memory:") # Use in-memory SQLite for testing 245 | 246 | def test_save_and_retrieve_results(self): 247 | """Test saving and retrieving test results""" 248 | # Create test data 249 | test_name = "test1" 250 | score_data = ScoreModel( 251 | tool_use=80, 252 | accuracy=90, 253 | completeness=85, 254 | quality=88, 255 | hallucination_score=5, 256 | false_claims=["claim1"], 257 | llm_output="test output", 258 | description="test description" 259 | ) 260 | 261 | score = Score( 262 | score=score_data, 263 | model="test-model", 264 | duration=1.5, 265 | tool_analysis={"tool_1": {"name": "test_tool", "redundancy": "unique"}}, 266 | redundant_tool_calls=0, 267 | tool_calls=1 268 | ) 269 | 270 | results = Results(scores=[score], duration=1.5) 271 | 272 | # Save results 273 | self.db.save_results(test_name, results) 274 | 275 | # Retrieve and verify results 276 | retrieved = self.db.average_results(test_name) 277 | 278 | 
self.assertEqual(len(retrieved.scores), 1) 279 | self.assertEqual(retrieved.scores[0].model, "test-model") 280 | self.assertEqual(retrieved.scores[0].duration, 1.5) 281 | self.assertEqual(retrieved.scores[0].tool_calls, 1) 282 | self.assertEqual(retrieved.scores[0].redundant_tool_calls, 0) 283 | self.assertEqual(retrieved.scores[0].accuracy, 90) 284 | 285 | if __name__ == '__main__': 286 | unittest.main() -------------------------------------------------------------------------------- /mcpx_eval/__main__.py: -------------------------------------------------------------------------------- 1 | from . import Judge, Test, Database 2 | from .html import visualize_json 3 | import asyncio 4 | import logging 5 | import pandas as pd 6 | from tempfile import NamedTemporaryFile 7 | import webbrowser 8 | import os 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def print_result(result): 14 | # Print model header 15 | print(f"\n{result.model}") 16 | print("=" * len(result.model)) 17 | 18 | # Create a DataFrame for the metrics 19 | metrics_df = pd.DataFrame( 20 | { 21 | "Metric": [ 22 | "Duration (s)", 23 | "Tool Calls", 24 | "Redundant Calls", 25 | "Failed Calls", 26 | "Tool Use %", 27 | "Accuracy %", 28 | "Completeness %", 29 | "Quality %", 30 | "Hallucination Score", 31 | ], 32 | "Value": [ 33 | f"{result.duration:.2f}", 34 | result.tool_calls, 35 | result.redundant_tool_calls, 36 | result.failed_tool_calls, 37 | f"{result.tool_use:.1f}", 38 | f"{result.accuracy:.1f}", 39 | f"{result.completeness:.1f}", 40 | f"{result.quality:.1f}", 41 | f"{result.hallucination_score:.1f}", 42 | ], 43 | } 44 | ) 45 | 46 | # Print metrics table 47 | print("\nMetrics:") 48 | print(metrics_df.to_string(index=False)) 49 | 50 | # Print output and description 51 | print("\nOutput:") 52 | print(result.llm_output) 53 | print("\nDescription:") 54 | print(result.description) 55 | 56 | # Print false claims if any 57 | if result.false_claims and len(result.false_claims) > 0: 58 | print("\nFalse Claims Detected:") 59 | for claim in result.false_claims: 60 | print(f" - {claim}") 61 | 62 | # Print tool analysis if any 63 | if result.tool_analysis and len(result.tool_analysis) > 0: 64 | print("\nTool Analysis:") 65 | tool_data = [] 66 | for tool_id, analysis in result.tool_analysis.items(): 67 | if isinstance(analysis, list): 68 | for a in analysis: 69 | tool_data.append( 70 | { 71 | "Tool ID": tool_id, 72 | "Name": a["name"], 73 | "Redundancy": a["redundancy"], 74 | } 75 | ) 76 | else: 77 | tool_data.append( 78 | { 79 | "Tool ID": tool_id, 80 | "Name": analysis["name"], 81 | "Redundancy": analysis["redundancy"], 82 | } 83 | ) 84 | 85 | if tool_data: 86 | tool_df = pd.DataFrame(tool_data) 87 | print(tool_df.to_string(index=False)) 88 | 89 | 90 | def summary(args): 91 | db = Database(args.db) 92 | res = db.average_results(args.name) 93 | if not res.scores: 94 | return # Database class now handles empty results messaging 95 | 96 | print(f"\nTest Summary: {args.name}") 97 | print("=" * (14 + len(args.name))) 98 | print(f"Number of results: {len(res.scores)}\n") 99 | 100 | for result in res.scores: 101 | print_result(result) 102 | 103 | 104 | def json_summary(args): 105 | """Generate a JSON summary of test data""" 106 | import json 107 | 108 | db = Database(args.db) 109 | summary = db.generate_json_summary() 110 | 111 | # Filter to specific test if requested 112 | if args.name: 113 | if args.name in summary["tests"]: 114 | filtered_summary = { 115 | "tests": {args.name: summary["tests"][args.name]}, 116 | "total": { 117 | 
"models": {}, 118 | "metrics": summary["tests"][args.name]["metrics"], 119 | "test_count": 1, 120 | "model_count": summary["tests"][args.name]["model_count"], 121 | }, 122 | "generated_at": summary["generated_at"], 123 | } 124 | # Include only models that participated in this test 125 | for model_name, model_data in summary["total"]["models"].items(): 126 | if model_name in summary["tests"][args.name]["models"]: 127 | filtered_summary["total"]["models"][model_name] = { 128 | **model_data, 129 | "test_count": 1, 130 | } 131 | summary = filtered_summary 132 | else: 133 | print(f"Warning: Test '{args.name}' not found in results") 134 | 135 | # Format JSON with indentation for readability 136 | formatted_json = json.dumps(summary, indent=2) 137 | 138 | # Output to file or stdout 139 | if args.json: 140 | with open(args.output, "w") as f: 141 | f.write(formatted_json) 142 | print(f"JSON summary saved to {args.output}") 143 | print( 144 | f"To visualize this file, run: uv run python -m mcpx_eval html {args.output}" 145 | ) 146 | elif not args.html and not args.show: 147 | print(formatted_json) 148 | 149 | # If visualization is requested, create and open it 150 | output_path = args.html 151 | html = visualize_json(summary, output_path) 152 | # Also save a copy to the specified location if provided 153 | if output_path: 154 | with open(output_path, "w") as f: 155 | f.write(html) 156 | print(f"Saved to {output_path}") 157 | temp_path = os.path.abspath(output_path) 158 | if args.show: 159 | if output_path is None: 160 | # Write to temporary file and open in browser 161 | with NamedTemporaryFile(suffix=".html", delete=False, mode="w") as f: 162 | f.write(html) 163 | temp_path = f.name 164 | 165 | print("Opening browser...") 166 | webbrowser.open(f"file://{temp_path}") 167 | 168 | 169 | async def run(): 170 | from argparse import ArgumentParser 171 | 172 | parser = ArgumentParser( 173 | "mcpx-eval", description="Open-ended LLM tool use evaluator for mcp.run tools" 174 | ) 175 | subparsers = parser.add_subparsers(dest="command", help="Command to run") 176 | parser.add_argument("--db", default=None, help="SQLite3 database path") 177 | 178 | # Main test command (default) 179 | test_parser = subparsers.add_parser("test", help="Run evaluation tests") 180 | test_parser.add_argument("--name", default="", help="Test name") 181 | test_parser.add_argument( 182 | "--model", 183 | "-m", 184 | default=[], 185 | help="Model to include in test", 186 | action="append", 187 | ) 188 | test_parser.add_argument( 189 | "--judge-model", 190 | default="claude-3-5-sonnet-latest", 191 | help="Model to use for Judge", 192 | ) 193 | test_parser.add_argument( 194 | "--ignore-tool", 195 | "-x", 196 | default=[], 197 | help="Ignore tool", 198 | action="append", 199 | ) 200 | test_parser.add_argument( 201 | "--tool", 202 | "-t", 203 | default=[], 204 | help="Expected tool", 205 | action="append", 206 | ) 207 | test_parser.add_argument( 208 | "--profile", 209 | "-p", 210 | default=None, 211 | help="Profile to use for judge model", 212 | ) 213 | 214 | test_parser.add_argument("--prompt", help="Test prompt") 215 | test_parser.add_argument("--check", help="Test check") 216 | test_parser.add_argument("--config", help="Test config file") 217 | test_parser.add_argument( 218 | "--iter", 219 | "-i", 220 | default=1, 221 | type=int, 222 | help="Number of times to run the test for each model", 223 | ) 224 | test_parser.add_argument( 225 | "--no-save", 226 | default=False, 227 | action="store_true", 228 | help="Don't save results in db", 229 | 
) 230 | test_parser.add_argument( 231 | "--task", 232 | default=None, 233 | help="Name of task from mcp.run to get prompt from", 234 | ) 235 | test_parser.add_argument( 236 | "--task-run", 237 | default=None, 238 | help="Name of a specific task run", 239 | ) 240 | test_parser.add_argument( 241 | "--var", 242 | default=[], 243 | help="Template variable", 244 | action="append", 245 | ) 246 | 247 | # Summary command 248 | summary_parser = subparsers.add_parser("summary", help="Show test results summary") 249 | summary_parser.add_argument("name", help="Test name to summarize") 250 | 251 | # JSON summary command 252 | gen_parser = subparsers.add_parser( 253 | "gen", help="Generate JSON summary of all test data" 254 | ) 255 | gen_parser.add_argument( 256 | "--name", 257 | "-n", 258 | help="Filter results to a specific test name", 259 | ) 260 | gen_parser.add_argument( 261 | "--json", 262 | help="Output JSON file path (default: print to stdout)", 263 | ) 264 | gen_parser.add_argument( 265 | "--show", 266 | "-s", 267 | action="store_true", 268 | help="Create an interactive HTML visualization of the JSON data", 269 | ) 270 | gen_parser.add_argument( 271 | "--html", 272 | help="Output path for HTML visualization (optional)", 273 | ) 274 | 275 | # Global options 276 | parser.add_argument( 277 | "--log", 278 | default=None, 279 | choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], 280 | help="Set the logging level.", 281 | ) 282 | parser.add_argument( 283 | "--verbose", default=False, action="store_true", help="Enable verbose logging" 284 | ) 285 | 286 | args = parser.parse_args() 287 | 288 | # Setup logging 289 | level = args.log or "INFO" 290 | log_level = getattr(logging, level, None) 291 | if not isinstance(log_level, int): 292 | raise ValueError("Invalid log level: %s" % level) 293 | logging.basicConfig(level=log_level) 294 | 295 | if not args.verbose: 296 | for handler in logging.root.handlers: 297 | handler.addFilter(logging.Filter("mcpx_eval")) 298 | 299 | # Handle command routing 300 | command = getattr(args, "command", "test") # Default to test if not specified 301 | 302 | # Visualization commands removed 303 | 304 | # Summary command 305 | if command == "summary": 306 | summary(args) 307 | return 308 | 309 | # gen command 310 | elif command == "gen": 311 | json_summary(args) 312 | return 313 | 314 | # Test command (default) 315 | elif command == "test": 316 | test = None 317 | name = args.name or args.task 318 | 319 | vars = {} 320 | for line in args.var: 321 | s = line.split("=") 322 | vars[s[0]] = s[1] 323 | 324 | if hasattr(args, "config") and args.config is not None: 325 | test = Test.load(args.config) 326 | for model in args.model: 327 | test.models.append(model) 328 | if args.name is None or args.name == "": 329 | if test.name is not None: 330 | name = test.name 331 | test.vars.update(**vars) 332 | test.expected_tools.extend(args.tool) 333 | test.ignore_tools.extend(args.ignore_tool) 334 | test.task = args.task or test.task 335 | test.prompt = args.prompt or test.prompt 336 | test.check = args.check or test.check 337 | test.name = args.name or test.name 338 | test.task_run = args.task_run or test.task_run 339 | else: 340 | test = Test( 341 | name=name, 342 | prompt=args.prompt or "", 343 | check=args.check or "", 344 | models=args.model, 345 | profile=args.profile, 346 | expected_tools=args.tool, 347 | ignore_tools=args.ignore_tool, 348 | vars=vars, 349 | task=args.task, 350 | task_run=args.task_run, 351 | ) 352 | 353 | iterations = args.iter 354 | logger.info( 355 | f"Running 
{test.name}: task={test.task is not None}, models=[{', '.join(test.models)}] ({iterations} iteration{'s' if iterations > 1 else ''})" 356 | ) 357 | db = None 358 | if args.db is not None: 359 | db = Database(args.db) 360 | 361 | judge = Judge( 362 | models=test.models, 363 | profile=args.profile, 364 | db=db, 365 | judge_model=args.judge_model, 366 | ignore_tools=test.ignore_tools, 367 | ) 368 | judge.db.save_test(test) 369 | 370 | total_duration = 0 371 | 372 | for i in range(iterations): 373 | if iterations > 1: 374 | logger.info(f"Iteration {i + 1}/{iterations}") 375 | 376 | # For multiple iterations, pass save=True to ensure each run is saved to DB 377 | res = await judge.run_test(test, save=not args.no_save) 378 | total_duration += res.duration 379 | logger.debug(f"Result: {res.scores}") 380 | if not args.no_save: 381 | logger.info("Results saved to db") 382 | 383 | if iterations > 1: 384 | logger.info(f"Iteration {i + 1} finished in {res.duration}s") 385 | 386 | logger.info(f"{test.name} finished in {total_duration}s total") 387 | 388 | if iterations > 1: 389 | print(f"\nShowing results from iteration {iterations} of {iterations}.") 390 | print(f"Use 'mcpx-eval summary {test.name}' to see aggregated results.\n") 391 | 392 | for result in res.scores: 393 | if result is None: 394 | continue 395 | print_result(result) 396 | else: 397 | parser.print_help() 398 | 399 | 400 | def main(): 401 | asyncio.run(run()) 402 | 403 | 404 | if __name__ == "__main__": 405 | main() 406 | -------------------------------------------------------------------------------- /mcpx_eval/database.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import json 3 | import pandas as pd 4 | from datetime import datetime 5 | from .models import Score, Results, Test, ScoreModel 6 | 7 | 8 | class Database: 9 | conn: sqlite3.Connection 10 | 11 | def __init__(self, path: str | None = "eval.db"): 12 | if path is None: 13 | path = "eval.db" 14 | self.conn = sqlite3.connect(path) 15 | 16 | self.conn.executescript( 17 | """ 18 | CREATE TABLE IF NOT EXISTS tests ( 19 | id INTEGER PRIMARY KEY, 20 | name TEXT NOT NULL, 21 | prompt TEXT NOT NULL, 22 | prompt_check TEXT NOT NULL, 23 | UNIQUE(name) 24 | ); 25 | CREATE TABLE IF NOT EXISTS eval_results ( 26 | id INTEGER PRIMARY KEY, 27 | t TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 28 | test_name TEXT NOT NULL, 29 | model TEXT NOT NULL, 30 | duration REAL NOT NULL, 31 | output TEXT NOT NULL, 32 | description TEXT NOT NULL, 33 | accuracy REAL NOT NULL, 34 | tool_use REAL NOT NULL, 35 | tool_calls INT NOT NULL, 36 | redundant_tool_calls INT NOT NULL DEFAULT 0, 37 | failed_tool_calls INT NOT NULL DEFAULT 0, 38 | completeness REAL NOT NULL DEFAULT 0.0, 39 | quality REAL NOT NULL, 40 | hallucination_score REAL NOT NULL DEFAULT 0.0, 41 | false_claims TEXT NOT NULL DEFAULT '[]', 42 | tool_analysis TEXT NOT NULL DEFAULT '{}', 43 | FOREIGN KEY(test_name) REFERENCES tests(name) 44 | ); 45 | """ 46 | ) 47 | self.conn.commit() 48 | 49 | def save_score(self, name: str, score: Score, commit=True): 50 | if name == "": 51 | return 52 | 53 | # Convert score to DataFrame for efficient insertion 54 | df = pd.DataFrame( 55 | [ 56 | { 57 | "test_name": name, 58 | "model": score.model, 59 | "duration": score.duration, 60 | "output": score.llm_output, 61 | "description": score.description, 62 | "accuracy": score.accuracy, 63 | "tool_use": score.tool_use, 64 | "tool_calls": score.tool_calls, 65 | "redundant_tool_calls": score.redundant_tool_calls, 66 | 
"failed_tool_calls": score.failed_tool_calls, 67 | "completeness": score.completeness, 68 | "quality": score.quality, 69 | "hallucination_score": score.hallucination_score, 70 | "false_claims": json.dumps(score.false_claims), 71 | "tool_analysis": json.dumps(score.tool_analysis), 72 | } 73 | ] 74 | ) 75 | 76 | df.to_sql("eval_results", self.conn, if_exists="append", index=False) 77 | if commit: 78 | self.conn.commit() 79 | 80 | def save_test(self, test: "Test"): 81 | self.conn.execute( 82 | """ 83 | INSERT OR IGNORE INTO tests (name, prompt, prompt_check) VALUES (?, ?, ?); 84 | """, 85 | (test.name, test.prompt, test.check), 86 | ) 87 | self.conn.commit() 88 | 89 | def save_results(self, name: str, results: Results): 90 | if not results.scores: 91 | return 92 | 93 | # Convert all scores to DataFrame at once 94 | records = [ 95 | { 96 | "test_name": name, 97 | "model": score.model, 98 | "duration": score.duration, 99 | "output": score.llm_output, 100 | "description": score.description, 101 | "accuracy": score.accuracy, 102 | "tool_use": score.tool_use, 103 | "tool_calls": score.tool_calls, 104 | "redundant_tool_calls": score.redundant_tool_calls, 105 | "failed_tool_calls": score.failed_tool_calls, 106 | "completeness": score.completeness, 107 | "quality": score.quality, 108 | "hallucination_score": score.hallucination_score, 109 | "false_claims": json.dumps(score.false_claims), 110 | "tool_analysis": json.dumps(score.tool_analysis), 111 | } 112 | for score in results.scores 113 | ] 114 | 115 | df = pd.DataFrame(records) 116 | df.to_sql("eval_results", self.conn, if_exists="append", index=False) 117 | self.conn.commit() 118 | 119 | def average_results(self, name: str) -> Results: 120 | # Read results into a pandas DataFrame 121 | df = pd.read_sql_query( 122 | """ 123 | SELECT * 124 | FROM eval_results 125 | WHERE test_name = ? 
126 | """, 127 | self.conn, 128 | params=(name,), 129 | ) 130 | 131 | if df.empty: 132 | print(f"No results found in database for test: {name}") 133 | print("Available tests:") 134 | tests = pd.read_sql_query( 135 | "SELECT DISTINCT test_name FROM eval_results", self.conn 136 | ) 137 | if tests.empty: 138 | print(" No tests have been run yet") 139 | else: 140 | for test in tests["test_name"]: 141 | print(f" - {test}") 142 | return Results(scores=[]) 143 | 144 | # Convert false_claims and tool_analysis from JSON strings 145 | df["false_claims"] = df["false_claims"].apply(json.loads) 146 | df["tool_analysis"] = df["tool_analysis"].apply(json.loads) 147 | 148 | # Group by model and aggregate 149 | grouped = ( 150 | df.groupby("model") 151 | .agg( 152 | { 153 | "duration": "mean", 154 | "output": "first", # take first output as example 155 | "description": "first", # take first description as example 156 | "accuracy": "mean", 157 | "tool_use": "mean", 158 | "tool_calls": "mean", 159 | "redundant_tool_calls": "mean", 160 | "completeness": "mean", 161 | "quality": "mean", 162 | "hallucination_score": "mean", 163 | "false_claims": "sum", # combine all false claims 164 | "tool_analysis": "first", # take first tool analysis 165 | } 166 | ) 167 | .reset_index() 168 | ) 169 | 170 | # Convert back to Score objects 171 | scores = [ 172 | Score( 173 | model=row["model"], 174 | duration=row["duration"], 175 | score=ScoreModel( 176 | llm_output=row["output"], 177 | description=row["description"], 178 | accuracy=row["accuracy"], 179 | tool_use=row["tool_use"], 180 | completeness=row["completeness"], 181 | quality=row["quality"], 182 | hallucination_score=row["hallucination_score"], 183 | false_claims=row["false_claims"], 184 | ), 185 | tool_analysis=row["tool_analysis"], 186 | redundant_tool_calls=int(row["redundant_tool_calls"]), 187 | tool_calls=int(row["tool_calls"]), 188 | ) 189 | for _, row in grouped.iterrows() 190 | ] 191 | 192 | return Results(scores=scores) 193 | 194 | def get_test_stats(self, test_name: str | None = None) -> pd.DataFrame: 195 | """Get detailed statistics for tests. 196 | 197 | Args: 198 | test_name: Optional test name to filter results 199 | 200 | Returns: 201 | DataFrame with test statistics including: 202 | - Number of runs per model 203 | - Mean and std dev of scores 204 | - Min/max durations 205 | """ 206 | query = """ 207 | SELECT 208 | test_name, 209 | model, 210 | COUNT(*) as runs, 211 | AVG(duration) as mean_duration, 212 | MIN(duration) as min_duration, 213 | MAX(duration) as max_duration, 214 | AVG(accuracy) as mean_accuracy, 215 | AVG(tool_use) as mean_tool_use, 216 | AVG(tool_calls) as mean_tool_calls, 217 | AVG(redundant_tool_calls) as mean_redundant_calls, 218 | AVG(completeness) as mean_completeness, 219 | AVG(quality) as mean_quality, 220 | AVG(hallucination_score) as mean_hallucination 221 | FROM eval_results 222 | """ 223 | 224 | if test_name: 225 | query += " WHERE test_name = ?" 
226 | params = (test_name,) 227 | else: 228 | params = () 229 | 230 | query += " GROUP BY test_name, model" 231 | 232 | return pd.read_sql_query(query, self.conn, params=params) 233 | 234 | def generate_json_summary(self): 235 | # Read results into a pandas DataFrame 236 | df = pd.read_sql_query( 237 | """ 238 | SELECT 239 | test_name, 240 | model, 241 | AVG(accuracy) as accuracy, 242 | AVG(tool_use) as tool_use, 243 | AVG(tool_calls) as tool_calls, 244 | AVG(redundant_tool_calls) as redundant_tool_calls, 245 | AVG(failed_tool_calls) as failed_tool_calls, 246 | AVG(completeness) as completeness, 247 | AVG(quality) as quality, 248 | AVG(hallucination_score) as hallucination_score, 249 | AVG(duration) as duration, 250 | COUNT(*) as runs 251 | FROM eval_results 252 | GROUP BY test_name, model 253 | """, 254 | self.conn, 255 | ) 256 | 257 | # Use pandas styling to create formatted HTML tables 258 | def style_table(df): 259 | return ( 260 | df.style.format( 261 | { 262 | "accuracy": "{:.3f}%", 263 | "tool_use": "{:.3f}%", 264 | "completeness": "{:.3f}%", 265 | "quality": "{:.3f}%", 266 | "hallucination_score": "{:.3f}%", 267 | "tool_calls": "{:.1f}", 268 | "redundant_tool_calls": "{:.1f}", 269 | "runs": "{:.0f}", 270 | "duration": "{:.3f}", 271 | } 272 | ) 273 | .background_gradient( 274 | subset=[ 275 | "accuracy", 276 | "tool_use", 277 | "completeness", 278 | "quality", 279 | ], 280 | cmap="RdYlGn", 281 | ) 282 | .background_gradient(subset=["hallucination_score"], cmap="RdYlGn_r") 283 | .set_properties(**{"text-align": "center"}) 284 | .to_html() 285 | ) 286 | 287 | # Generate summary structure 288 | summary = { 289 | "tests": {}, 290 | "total": { 291 | "models": {}, 292 | "metrics": {}, 293 | "test_count": len(df["test_name"].unique()), 294 | "model_count": len(df["model"].unique()), 295 | }, 296 | } 297 | 298 | # Calculate total metrics with formatted precision 299 | total_metrics = df.agg( 300 | { 301 | "accuracy": lambda x: round(x.mean(), 3), 302 | "tool_use": lambda x: round(x.mean(), 3), 303 | "tool_calls": lambda x: round(x.sum(), 1), 304 | "redundant_tool_calls": lambda x: round(x.sum(), 1), 305 | "completeness": lambda x: round(x.mean(), 3), 306 | "quality": lambda x: round(x.mean(), 3), 307 | "hallucination_score": lambda x: round(x.mean(), 3), 308 | } 309 | ) 310 | summary["total"]["metrics"] = total_metrics.to_dict() 311 | 312 | # Process each test 313 | for test_name in df["test_name"].unique(): 314 | test_df = df[df["test_name"] == test_name] 315 | test_df = test_df.sort_values("quality", ascending=False) 316 | 317 | # Calculate test metrics with formatted precision 318 | test_metrics = test_df.agg( 319 | { 320 | "accuracy": lambda x: round(x.mean(), 3), 321 | "tool_use": lambda x: round(x.mean(), 3), 322 | "tool_calls": lambda x: round(x.sum(), 1), 323 | "redundant_tool_calls": lambda x: round(x.sum(), 1), 324 | "completeness": lambda x: round(x.mean(), 3), 325 | "quality": lambda x: round(x.mean(), 3), 326 | "hallucination_score": lambda x: round(x.mean(), 3), 327 | } 328 | ) 329 | 330 | # Round tool calls in test metrics to 1 decimal place 331 | if "tool_calls" in test_metrics: 332 | test_metrics["tool_calls"] = round(test_metrics["tool_calls"], 1) 333 | if "redundant_tool_calls" in test_metrics: 334 | test_metrics["redundant_tool_calls"] = round( 335 | test_metrics["redundant_tool_calls"], 1 336 | ) 337 | 338 | summary["tests"][test_name] = { 339 | "models": { 340 | row["model"]: { 341 | "accuracy": row["accuracy"], 342 | "tool_use": row["tool_use"], 343 | "tool_calls": 
row["tool_calls"], 344 | "redundant_tool_calls": row["redundant_tool_calls"], 345 | "failed_tool_calls": row["failed_tool_calls"], 346 | "completeness": row["completeness"], 347 | "quality": row["quality"], 348 | "hallucination_score": row["hallucination_score"], 349 | "runs": row["runs"], 350 | "duration": row["duration"], 351 | } 352 | for _, row in test_df.iterrows() 353 | }, 354 | "metrics": test_metrics.to_dict(), 355 | "model_count": len(test_df["model"].unique()), 356 | } 357 | 358 | # Update total models data 359 | for model in test_df["model"].unique(): 360 | model_data = test_df[test_df["model"] == model].iloc[0] 361 | if model not in summary["total"]["models"]: 362 | summary["total"]["models"][model] = { 363 | "accuracy": 0.0, 364 | "tool_use": 0.0, 365 | "tool_calls": 0, 366 | "redundant_tool_calls": 0, 367 | "completeness": 0.0, 368 | "quality": 0.0, 369 | "hallucination_score": 0.0, 370 | "test_count": 0, 371 | "duration": 0.0, 372 | } 373 | 374 | summary["total"]["models"][model]["test_count"] += 1 375 | for metric in [ 376 | "accuracy", 377 | "tool_use", 378 | "completeness", 379 | "quality", 380 | "hallucination_score", 381 | "duration", 382 | ]: 383 | summary["total"]["models"][model][metric] += model_data[metric] 384 | summary["total"]["models"][model]["tool_calls"] += model_data[ 385 | "tool_calls" 386 | ] 387 | summary["total"]["models"][model]["redundant_tool_calls"] += model_data[ 388 | "redundant_tool_calls" 389 | ] 390 | 391 | # Calculate averages for total model metrics 392 | for model in summary["total"]["models"]: 393 | test_count = summary["total"]["models"][model]["test_count"] 394 | if test_count > 0: 395 | for metric in [ 396 | "accuracy", 397 | "tool_use", 398 | "completeness", 399 | "quality", 400 | "hallucination_score", 401 | "duration", 402 | ]: 403 | summary["total"]["models"][model][metric] /= test_count 404 | 405 | # Add timestamp 406 | summary["generated_at"] = datetime.now().isoformat() 407 | 408 | return summary 409 | -------------------------------------------------------------------------------- /mcpx_eval/judge.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Dict, Any, Optional 3 | from datetime import datetime, timedelta 4 | import json 5 | import traceback 6 | import os 7 | 8 | from mcpx_py import Chat, mcp_run, openai_compatible_model 9 | import pystache 10 | 11 | from .models import ScoreModel, Score, Results, Test, Model 12 | from .database import Database 13 | from .constants import SYSTEM_PROMPT, TEST_PROMPT 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def is_int(x): 19 | if x is None: 20 | return False 21 | try: 22 | int(x) 23 | return True 24 | except ValueError: 25 | return False 26 | 27 | 28 | def task_run_index( 29 | client: mcp_run.Client, task: str, index: int = -1 30 | ) -> mcp_run.TaskRun | None: 31 | a = list(client.list_task_runs(task)) 32 | a.reverse() 33 | try: 34 | return a[index] 35 | except IndexError: 36 | return None 37 | 38 | 39 | class ModelApiConfig: 40 | """Helper class to manage model API configurations.""" 41 | 42 | @staticmethod 43 | def get_host_url(model_name: str, provider: str) -> str: 44 | """Get the appropriate API host URL for a given model and provider.""" 45 | if provider in ["ollama", "llama"]: 46 | host = os.environ.get( 47 | f"{model_name.upper()}_HOST", 48 | os.environ.get( 49 | "LLAMA_HOST", 50 | os.environ.get("OLLAMA_HOST", "http://127.0.0.1:11434"), 51 | ), 52 | ) 53 | return f"{host}/v1" if not 
host.endswith("/v1") else host 54 | elif provider == "openai": 55 | host = os.environ.get( 56 | f"{model_name.upper()}_HOST", 57 | os.environ.get("OPENAI_HOST", "https://api.openai.com"), 58 | ) 59 | return f"{host}/v1" if not host.endswith("/v1") else host 60 | return "" 61 | 62 | @staticmethod 63 | def get_model_config(model: Model) -> str: 64 | """Get the appropriate model configuration for API calls.""" 65 | if model.provider in ["ollama", "llama", "openai"]: 66 | host = ModelApiConfig.get_host_url(model.name, model.provider) 67 | return openai_compatible_model(host, model.name) 68 | return model.name 69 | 70 | 71 | class ToolAnalysis: 72 | """Helper class to analyze tool usage patterns.""" 73 | 74 | def __init__(self): 75 | self.tool_analysis: Dict[str, Any] = {} 76 | self.redundant_tool_calls = 0 77 | self.seen_tool_patterns = set() 78 | self.total_tool_calls = 0 79 | 80 | def analyze_message(self, msg: Dict[str, Any], index: int) -> None: 81 | """Analyze a single message for tool usage patterns.""" 82 | if not msg.get("tool"): 83 | return 84 | 85 | tool_name = msg["tool"]["name"] 86 | tool_input = msg["tool"]["input"] 87 | self.total_tool_calls += 1 88 | 89 | # Create pattern string for redundancy detection 90 | tool_pattern = f"{tool_name}:{str(tool_input)}" 91 | 92 | # Check for redundancy 93 | redundancy_status = ( 94 | "redundant" if tool_pattern in self.seen_tool_patterns else "unique" 95 | ) 96 | if redundancy_status == "redundant": 97 | self.redundant_tool_calls += 1 98 | else: 99 | self.seen_tool_patterns.add(tool_pattern) 100 | 101 | # Store tool analysis 102 | self.tool_analysis[f"tool_{index}"] = { 103 | "name": tool_name, 104 | "input": tool_input, 105 | "redundancy": redundancy_status, 106 | } 107 | 108 | 109 | def format_judge_prompt(prompt, results, check, expected_tools): 110 | if check is None or check == "": 111 | check = "Make sure the output matches the requirments of the prompt" 112 | return f""" 113 | 114 | Current date and time: {datetime.now().isoformat()} 115 | 116 | 117 | {prompt} 118 | 119 | 120 | {json.dumps(results)} 121 | 122 | {check} 123 | {", ".join(expected_tools)} 124 | """ 125 | 126 | 127 | class Judge: 128 | """Evaluates model performance on given tests.""" 129 | 130 | model: Model 131 | models: List[Model] 132 | ignore_tools: List[str] 133 | db: Database 134 | profile: Optional[str] 135 | retries: int 136 | 137 | def __init__( 138 | self, 139 | models: Optional[List[Model | str]] = None, 140 | db: Optional[Database] = None, 141 | profile: Optional[str] = None, 142 | judge_model: str = "claude-3-5-sonnet-latest", 143 | ignore_tools: Optional[List[str]] = None, 144 | retries: Optional[int] = None, 145 | ): 146 | self.retries = retries or 10 147 | self.profile = profile or mcp_run.ProfileSlug("~", "default") 148 | self.ignore_tools = ignore_tools or [] 149 | self.db = db or Database() 150 | self.models = [] 151 | self.model = Model(name=judge_model) 152 | if models is not None: 153 | for model in models: 154 | self.add_model(model) 155 | 156 | def add_model( 157 | self, 158 | model: Model | str, 159 | profile: Optional[str] = None, 160 | ) -> None: 161 | """Add a model to the evaluation list.""" 162 | if isinstance(model, str): 163 | model = Model(name=model) 164 | if profile is not None: 165 | model.profile = profile 166 | self.models.append(model) 167 | 168 | async def run_test(self, test: Test, save: bool = True) -> Results: 169 | """Run a specific test configuration.""" 170 | profile = test.profile 171 | if profile is None: 172 | profile = 
self.profile or mcp_run.ProfileSlug("~", "default") 173 | else: 174 | profile = mcp_run.ProfileSlug.parse(profile) 175 | 176 | if test.task is not None: 177 | client = mcp_run.Client(config=mcp_run.ClientConfig(profile=profile)) 178 | tasks = client.tasks 179 | if test.task not in tasks: 180 | raise Exception(f"Invalid task, {test.task} not found in {profile}") 181 | test.prompt = tasks[test.task].prompt 182 | 183 | results = await self.run( 184 | pystache.render(test.prompt, test.vars), 185 | test.check, 186 | test.expected_tools, 187 | test.task, 188 | test.task_run, 189 | ) 190 | 191 | if save: 192 | self.db.save_results(test.name, results) 193 | return results 194 | 195 | async def evaluate_model( 196 | self, 197 | model: Model, 198 | prompt: str, 199 | tool_analysis: ToolAnalysis, 200 | ) -> Dict[str, Any]: 201 | """Evaluate a single model's performance.""" 202 | result = {"messages": [], "tools-available": []} 203 | 204 | try: 205 | model_config = ModelApiConfig.get_model_config(model) 206 | chat = Chat( 207 | client=mcp_run.Client( 208 | config=mcp_run.ClientConfig(profile=model.profile) 209 | ), 210 | model=model_config, 211 | ignore_tools=self.ignore_tools, 212 | system_prompt=TEST_PROMPT, 213 | retries=5, 214 | ) 215 | 216 | # Get available tools, handling both real and mock objects 217 | try: 218 | result["tools-available"] = list(chat.client.tools.keys()) 219 | except (TypeError, AttributeError): 220 | # If tools is a mock object, get the return value directly 221 | result["tools-available"] = chat.client.tools.keys() 222 | 223 | async for node in chat.iter(prompt): 224 | if hasattr(node, "model_response"): 225 | for part in node.model_response.parts: 226 | if part.part_kind == "text": 227 | logger.info(part.content) 228 | result["messages"].append( 229 | {"kind": part.part_kind, "text": part.content} 230 | ) 231 | elif part.part_kind == "tool-call": 232 | logger.info( 233 | f"Tool {part.tool_name}({part.tool_call_id}): {part.args}" 234 | ) 235 | result["messages"].append( 236 | { 237 | "kind": part.part_kind, 238 | "tool": { 239 | "name": part.tool_name, 240 | "input": part.args_as_dict(), 241 | }, 242 | "tool_call_id": part.tool_call_id, 243 | } 244 | ) 245 | tool_analysis.analyze_message( 246 | result["messages"][-1], len(result["messages"]) - 1 247 | ) 248 | 249 | elif hasattr(node, "request"): 250 | for part in node.request.parts: 251 | if part.part_kind == "text": 252 | result["messages"].append( 253 | {"kind": part.part_kind, "text": part.content} 254 | ) 255 | elif part.part_kind == "tool-return": 256 | logger.info( 257 | f"Tool returned {part.tool_name}({part.tool_call_id})" 258 | ) 259 | logger.debug( 260 | f"Tool result {part.tool_name}({part.tool_call_id}):\n{part.content}" 261 | ) 262 | result["messages"].append( 263 | { 264 | "kind": part.part_kind, 265 | "tool_name": part.tool_name, 266 | "content": part.content, 267 | "tool_call_id": part.tool_call_id, 268 | } 269 | ) 270 | elif hasattr(node, "data"): 271 | logger.debug(f"Final result: {node.data.data}") 272 | result["messages"].append( 273 | {"kind": "final_result", "text": node.data.data} 274 | ) 275 | 276 | except KeyboardInterrupt: 277 | return None 278 | except Exception: 279 | logger.error(f"{model.slug} failed: {traceback.format_exc()}") 280 | return None 281 | 282 | return result 283 | 284 | async def _evaluate_task_run( 285 | self, 286 | client: mcp_run.Client, 287 | run: mcp_run.TaskRun, 288 | check: str, 289 | expected_tools: List[str], 290 | model_config: ModelApiConfig, 291 | ) -> Score: 292 | 
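# Judge an existing mcp.run task run: the prompt is recovered from the run's first
# exchange, the full results_list is passed to the judge model via
# format_judge_prompt, and tool-call statistics are rebuilt from the recorded
# "call tool request" events.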
logger.info(f"Analyzing task run {run.name}") 293 | prompt = run.results_list[0]["exchange"]["content"] 294 | agent = Chat( 295 | client=client, 296 | model=model_config, 297 | ignore_tools=self.ignore_tools, 298 | result_type=ScoreModel, 299 | system_prompt=SYSTEM_PROMPT, 300 | result_retries=self.retries, 301 | ) 302 | 303 | res = await agent.send_message( 304 | format_judge_prompt(prompt, run.results_list, check, expected_tools) 305 | ) 306 | 307 | tool_analysis = ToolAnalysis() 308 | 309 | for i, event in enumerate(run.results_list): 310 | if event["msg"] == "call tool request": 311 | tool_analysis.analyze_message( 312 | { 313 | "tool": { 314 | "name": event["params"]["name"], 315 | "input": event["params"]["arguments"], 316 | } 317 | }, 318 | i, 319 | ) 320 | 321 | duration = (run.modified_at - run.created_at).total_seconds() 322 | return Score( 323 | score=res.data, 324 | model=run._task.provider["settings"]["model"] + "-" + run.name, 325 | duration=duration, 326 | tool_analysis=tool_analysis.tool_analysis, 327 | redundant_tool_calls=tool_analysis.redundant_tool_calls, 328 | tool_calls=tool_analysis.total_tool_calls, 329 | trace=run.results_list, 330 | ) 331 | 332 | async def run( 333 | self, 334 | prompt: str, 335 | check: str, 336 | expected_tools: List[str], 337 | task: str | None = None, 338 | task_run: str | None = None, 339 | vars: dict | None = None, 340 | ) -> Results: 341 | """Run evaluation across all models.""" 342 | scores = [] 343 | total_duration = timedelta(seconds=0) 344 | 345 | model_config = ModelApiConfig.get_model_config(self.model) 346 | if task is not None: 347 | client = mcp_run.Client(config=mcp_run.ClientConfig(profile=self.profile)) 348 | if task_run.lower() == "all": 349 | for run in client.list_task_runs(task): 350 | scores.append( 351 | await self._evaluate_task_run( 352 | client, run, check, expected_tools, model_config 353 | ) 354 | ) 355 | elif is_int(task_run) or task_run == "latest": 356 | if task_run.lower() == "latest": 357 | task_run = -1 358 | task_run = int(task_run or -1) 359 | run = task_run_index(client, task, index=task_run) 360 | if run is not None: 361 | scores.append( 362 | await self._evaluate_task_run( 363 | client, run, check, expected_tools, model_config 364 | ) 365 | ) 366 | else: 367 | logger.error(f"Unable to load {task_run} for task {task}") 368 | elif task_run is not None and task_run.lower() != "new": 369 | found = False 370 | for run in client.list_task_runs(task): 371 | if run.name == task_run: 372 | scores.append( 373 | await self._evaluate_task_run( 374 | client, run, check, expected_tools, model_config 375 | ) 376 | ) 377 | found = True 378 | if not found: 379 | logger.error(f"Unable to load {task_run} for task {task}") 380 | elif len(self.models) == 0: 381 | logger.info("No task run specified, this will execute a new task run") 382 | run = client.tasks[task].run(vars or {}) 383 | run.wait() 384 | run = task_run_index(client, task, index=-1) 385 | if run is not None: 386 | scores.append( 387 | await self._evaluate_task_run( 388 | client, run, check, expected_tools, model_config 389 | ) 390 | ) 391 | else: 392 | logger.error(f"Unable to load {task_run} for task {task}") 393 | 394 | for model in self.models: 395 | start = datetime.now() 396 | tool_analysis = ToolAnalysis() 397 | 398 | logger.info(f"Evaluating model {model.slug}") 399 | result = await self.evaluate_model(model, prompt, tool_analysis) 400 | 401 | if result is None: 402 | continue 403 | 404 | duration = datetime.now() - start 405 | duration_seconds = 
duration.total_seconds() 406 | total_duration += duration 407 | 408 | result["duration_in_seconds"] = f"{duration_seconds}s" 409 | result["number_of_tools_used"] = str(tool_analysis.total_tool_calls) 410 | 411 | logger.info( 412 | f"Analyzing results of {model.slug} with profile={self.profile}" 413 | ) 414 | agent = Chat( 415 | client=mcp_run.Client( 416 | config=mcp_run.ClientConfig(profile=self.profile) 417 | ), 418 | model=model_config, 419 | ignore_tools=self.ignore_tools, 420 | result_type=ScoreModel, 421 | system_prompt=SYSTEM_PROMPT, 422 | result_retries=self.retries, 423 | ) 424 | 425 | res = await agent.send_message( 426 | format_judge_prompt(prompt, result, check, expected_tools) 427 | ) 428 | scores.append( 429 | Score( 430 | score=res.data, 431 | model=model.slug, 432 | duration=duration_seconds, 433 | tool_analysis=tool_analysis.tool_analysis, 434 | redundant_tool_calls=tool_analysis.redundant_tool_calls, 435 | tool_calls=tool_analysis.total_tool_calls, 436 | trace=result, 437 | ) 438 | ) 439 | 440 | return Results(scores=scores, duration=total_duration.total_seconds()) 441 | -------------------------------------------------------------------------------- /mcpx_eval/html.py: -------------------------------------------------------------------------------- 1 | def visualize_json(data, output_path=None): 2 | """Create an interactive HTML visualization of JSON data""" 3 | import json 4 | from datetime import datetime 5 | import matplotlib.pyplot as plt 6 | import io 7 | import base64 8 | 9 | def create_performance_graph(data): 10 | """Create a matplotlib graph of model performance""" 11 | if not data.get("total", {}).get("models"): 12 | return "" 13 | 14 | models = data["total"]["models"] 15 | model_names = list(models.keys()) 16 | metrics = { 17 | "accuracy": [models[m]["accuracy"] for m in model_names], 18 | "tool_use": [models[m]["tool_use"] for m in model_names], 19 | "completeness": [models[m]["completeness"] for m in model_names], 20 | "quality": [models[m]["quality"] for m in model_names], 21 | "hallucination": [models[m]["hallucination_score"] for m in model_names], 22 | } 23 | 24 | # Sort by quality score 25 | sorted_indices = sorted( 26 | range(len(metrics["quality"])), 27 | key=lambda k: metrics["quality"][k], 28 | reverse=True, 29 | ) 30 | model_names = [model_names[i] for i in sorted_indices] 31 | for metric in metrics: 32 | metrics[metric] = [metrics[metric][i] for i in sorted_indices] 33 | 34 | plt.figure(figsize=(15, 8)) 35 | x = range(len(model_names)) 36 | width = 0.15 # Narrower bars to fit all metrics 37 | 38 | # Plot each metric with offset positions 39 | plt.bar( 40 | [i - width * 2 for i in x], 41 | metrics["accuracy"], 42 | width, 43 | label="Accuracy", 44 | color="skyblue", 45 | ) 46 | plt.bar( 47 | [i - width for i in x], 48 | metrics["tool_use"], 49 | width, 50 | label="Tool Use", 51 | color="lightgreen", 52 | ) 53 | plt.bar( 54 | [i for i in x], 55 | metrics["completeness"], 56 | width, 57 | label="Completeness", 58 | color="orange", 59 | ) 60 | plt.bar( 61 | [i + width for i in x], 62 | metrics["quality"], 63 | width, 64 | label="Quality", 65 | color="purple", 66 | ) 67 | plt.bar( 68 | [i + width * 2 for i in x], 69 | metrics["hallucination"], 70 | width, 71 | label="Hallucination", 72 | color="red", 73 | ) 74 | 75 | plt.xlabel("Models", fontsize=12) 76 | plt.ylabel("Score (%)", fontsize=12) 77 | plt.xticks(x, model_names, rotation=45, ha="right", fontsize=14) 78 | plt.legend(loc="upper right", title="Metrics", fontsize=10) 79 | 80 | plt.grid(True, 
alpha=0.3) 81 | plt.tight_layout() 82 | 83 | # Convert plot to base64 string 84 | buf = io.BytesIO() 85 | plt.savefig(buf, format="png", dpi=300, bbox_inches="tight") 86 | plt.close() 87 | buf.seek(0) 88 | return base64.b64encode(buf.getvalue()).decode("utf-8") 89 | 90 | # Create HTML content with comparison tables and JSON viewer 91 | html = ( 92 | """ 93 | 94 | 95 | 96 | 97 | mcpx-eval Scoreboard 98 | 190 | 191 | 192 |

[scoreboard HTML body, file lines ~193-236: page heading "mcpx-eval Open-Ended Tool Calling Scoreboard"; a "Generated on: """ + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + """ line; an "Overview" section embedding the base64-encoded "Model Performance Graph" image. Only these text fragments are recoverable; the surrounding markup, CSS, and table-rendering script of the template are not preserved in this listing.]
237 | 238 | 239 | 613 | 614 | 615 | """ 616 | ) 617 | 618 | return html 619 | -------------------------------------------------------------------------------- /mcpx_eval/htmlgen.py: -------------------------------------------------------------------------------- 1 | def visualize_json(data, output_path=None): 2 | """Create an interactive HTML visualization of JSON data""" 3 | import json 4 | from datetime import datetime 5 | import matplotlib.pyplot as plt 6 | import io 7 | import base64 8 | 9 | def create_performance_graph(data): 10 | """Create a matplotlib graph of model performance""" 11 | if not data.get("total", {}).get("models"): 12 | return "" 13 | 14 | models = data["total"]["models"] 15 | model_names = list(models.keys()) 16 | metrics = { 17 | "accuracy": [models[m]["accuracy"] for m in model_names], 18 | "tool_use": [models[m]["tool_use"] for m in model_names], 19 | "completeness": [models[m]["completeness"] for m in model_names], 20 | "quality": [models[m]["quality"] for m in model_names], 21 | "hallucination": [models[m]["hallucination_score"] for m in model_names], 22 | } 23 | 24 | # Sort by quality score 25 | sorted_indices = sorted( 26 | range(len(metrics["quality"])), 27 | key=lambda k: metrics["quality"][k], 28 | reverse=True, 29 | ) 30 | model_names = [model_names[i] for i in sorted_indices] 31 | for metric in metrics: 32 | metrics[metric] = [metrics[metric][i] for i in sorted_indices] 33 | 34 | plt.figure(figsize=(15, 8)) 35 | x = range(len(model_names)) 36 | width = 0.15 # Narrower bars to fit all metrics 37 | 38 | # Plot each metric with offset positions 39 | plt.bar( 40 | [i - width * 2 for i in x], 41 | metrics["accuracy"], 42 | width, 43 | label="Accuracy", 44 | color="skyblue", 45 | ) 46 | plt.bar( 47 | [i - width for i in x], 48 | metrics["tool_use"], 49 | width, 50 | label="Tool Use", 51 | color="lightgreen", 52 | ) 53 | plt.bar( 54 | [i for i in x], 55 | metrics["completeness"], 56 | width, 57 | label="Completeness", 58 | color="orange", 59 | ) 60 | plt.bar( 61 | [i + width for i in x], 62 | metrics["quality"], 63 | width, 64 | label="Quality", 65 | color="purple", 66 | ) 67 | plt.bar( 68 | [i + width * 2 for i in x], 69 | metrics["hallucination"], 70 | width, 71 | label="Hallucination", 72 | color="red", 73 | ) 74 | 75 | plt.xlabel("Models", fontsize=12) 76 | plt.ylabel("Score (%)", fontsize=12) 77 | plt.xticks(x, model_names, rotation=45, ha="right", fontsize=14) 78 | plt.legend(loc="upper right", title="Metrics", fontsize=10) 79 | 80 | plt.grid(True, alpha=0.3) 81 | plt.tight_layout() 82 | 83 | # Convert plot to base64 string 84 | buf = io.BytesIO() 85 | plt.savefig(buf, format="png", dpi=300, bbox_inches="tight") 86 | plt.close() 87 | buf.seek(0) 88 | return base64.b64encode(buf.getvalue()).decode("utf-8") 89 | 90 | # Create HTML content with comparison tables and JSON viewer 91 | html = ( 92 | """ 93 | 94 | 95 | 96 | 97 | mcpx-eval Scoreboard 98 | 190 | 191 | 192 |

[scoreboard HTML body, file lines ~193-236 (the visible fragments match html.py above): page heading "mcpx-eval Open-Ended Tool Calling Scoreboard"; a "Generated on: """ + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + """ line; an "Overview" section embedding the base64-encoded "Model Performance Graph" image. The surrounding markup, CSS, and table-rendering script are not preserved in this listing.]
237 | 238 | 239 | 613 | 614 | 615 | """ 616 | ) 617 | 618 | return html 619 | --------------------------------------------------------------------------------
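Taken together, Database.generate_json_summary() and visualize_json() are the pieces that turn saved evaluation runs into the HTML scoreboard. The following is a minimal sketch of driving them directly and is not part of the repository: it assumes results have already been recorded (for example via run.sh or mcpx-eval test), that Database() resolves its default sqlite file, and that pandas and matplotlib are installed; the output filename is illustrative.

from mcpx_eval import Database
from mcpx_eval.htmlgen import visualize_json

db = Database()

# Per-(test, model) aggregates as a pandas DataFrame
stats = db.get_test_stats()
print(stats[["test_name", "model", "runs", "mean_quality", "mean_duration"]])

# Nested summary dict: {"tests": {...}, "total": {...}, "generated_at": ...}
summary = db.generate_json_summary()

# visualize_json returns the page as a string; write it to disk ourselves
with open("scoreboard.html", "w") as f:
    f.write(visualize_json(summary))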