├── .python-version ├── tests ├── __init__.py ├── template │ └── eval.toml ├── sort.toml ├── summarize.toml ├── factorial-5.toml ├── github-contributors.toml ├── google-homepage-images.toml ├── summarize-gist.toml ├── valtown.toml └── test_mcpx.py ├── .gitignore ├── mcpx_eval ├── __init__.py ├── constants.py ├── models.py ├── __main__.py ├── database.py ├── judge.py ├── html.py └── htmlgen.py ├── run.sh ├── .github └── workflows │ ├── ci.yml │ └── publish.yml ├── pyproject.toml ├── LICENSE └── README.md /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # tests 2 | -------------------------------------------------------------------------------- /tests/template/eval.toml: -------------------------------------------------------------------------------- 1 | expected-tools = [ 2 | "eval-py", 3 | "eval-js" 4 | ] 5 | -------------------------------------------------------------------------------- /tests/sort.toml: -------------------------------------------------------------------------------- 1 | import = "template/eval.toml" 2 | name = "sort" 3 | prompt = "Sort the list [5, 3, 4, 1, 2] in ascending order." 4 | check = "[1, 2, 3, 4, 5]" 5 | max-tool-calls = 5 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | 12 | # sqlite3 13 | *.db 14 | 15 | # aider 16 | .aider* 17 | -------------------------------------------------------------------------------- /tests/summarize.toml: -------------------------------------------------------------------------------- 1 | name = "summarize-fetch" 2 | prompt = "Summarize {{url}}" 3 | check = "A summary of the requested url based on the results of the fetch tool" 4 | vars = {url="https://example.com"} 5 | expected-tools = ["fetch"] 6 | -------------------------------------------------------------------------------- /tests/factorial-5.toml: -------------------------------------------------------------------------------- 1 | import = "template/eval.toml" 2 | name = "factorial-5" 3 | prompt = "Calculate the factorial of 5 using math functions" 4 | check = "eval-py or eval-js should be used to determine the factorial of 5 is 120" 5 | max-tool-calls = 5 6 | -------------------------------------------------------------------------------- /mcpx_eval/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .models import Score, Results, Model, Test 4 | from .database import Database 5 | from .judge import Judge 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | __all__ = ["Score", "Results", "Model", "Test", "Database", "Judge"] 10 | -------------------------------------------------------------------------------- /tests/github-contributors.toml: -------------------------------------------------------------------------------- 1 | name = "github-contributors" 2 | prompt = "List the contributors to {{repo}} and any information github provides about them" 3 | check = "A list of github users using the github tools" 4 | vars = {repo="extism/extism"} 5 | expected-tools = 
["gh-get-repo-contributors"] 6 | -------------------------------------------------------------------------------- /tests/google-homepage-images.toml: -------------------------------------------------------------------------------- 1 | name = "google-homepage-images" 2 | prompt = "how many images are on the google homepage of google?" 3 | check = "the fetch tool should be used to determine how many images are on the google homepage" 4 | max-tool-calls = 5 5 | expected-tools = [ 6 | "fetch" 7 | ] 8 | -------------------------------------------------------------------------------- /tests/summarize-gist.toml: -------------------------------------------------------------------------------- 1 | name = "summarize-gist" 2 | prompt = "Summarize {{url}} and save it to a private gist" 3 | check = "A summary of the requested url based on the results of the fetch tool should be saved as a github gist" 4 | vars = {url="https://dylibso.com"} 5 | expected-tools = ["fetch", "gh-create-gist"] 6 | -------------------------------------------------------------------------------- /tests/valtown.toml: -------------------------------------------------------------------------------- 1 | name = "valtown" 2 | 3 | prompt = """ 4 | create a publicly callable javascript function name getDateAndtime that returns the current date and time 5 | """ 6 | 7 | check = """ 8 | call the created function using the val.town API to check the result 9 | """ 10 | 11 | expected-tools = [ 12 | "valtown" 13 | ] 14 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | remote_models="\ 4 | --model o1 \ 5 | --model openai:o3-mini \ 6 | --model gpt-4o \ 7 | --model claude-3-5-sonnet-latest \ 8 | --model claude-3-7-sonnet-latest 9 | --model claude-3-5-haiku-latest" 10 | 11 | models=${models-$remote_models} 12 | iter=${iterations-5} 13 | 14 | for test in tests/*.toml; do 15 | uv run mcpx-eval test --config $test $models --iter $iter 16 | done 17 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - main 7 | 8 | name: CI 9 | 10 | jobs: 11 | test: 12 | name: Test mcpx-eval 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | os: [ubuntu-latest, macos-latest] 17 | steps: 18 | - name: Checkout sources 19 | uses: actions/checkout@v3 20 | - name: Install uv 21 | uses: astral-sh/setup-uv@v5 22 | - run: uv run python3 -m unittest 23 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | push: 4 | branches: [ "v*" ] 5 | tags: 6 | - 'v*' 7 | 8 | permissions: 9 | contents: read 10 | id-token: write 11 | 12 | name: Publish 13 | 14 | jobs: 15 | publish: 16 | name: Publish mcpx-eval 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | matrix: 20 | os: [ubuntu-latest] 21 | steps: 22 | - name: Checkout sources 23 | uses: actions/checkout@v3 24 | - name: Install uv 25 | uses: astral-sh/setup-uv@v5 26 | - name: uv publish 27 | run: | 28 | uv build 29 | uv publish dist/* 30 | -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [project] 2 | name = "mcpx-eval" 3 | version = "0.3.0" 4 | description = "Open ended tool use evaluation framework" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "jinja2>=3.1.5", 9 | "matplotlib>=3.10.0", 10 | "mcp-run>=0.4.4", 11 | "mcpx-py>=0.4.2", 12 | "pandas>=2.2.0", 13 | "pystache>=0.6.7", 14 | ] 15 | 16 | [tool.uv] 17 | package = true 18 | 19 | [dependency-groups] 20 | dev = [ 21 | "python-lsp-ruff>=2.2.2", 22 | "python-lsp-server>=1.12.2", 23 | "ruff>=0.9.6", 24 | ] 25 | 26 | [project.scripts] 27 | mcpx-eval = "mcpx_eval.__main__:main" 28 | 29 | # Workaround for uv/setuptools version mismatch 30 | # https://github.com/astral-sh/uv/issues/9513#issuecomment-2519527822 31 | [tool.setuptools] 32 | license-files = [] 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2025 Dylibso, Inc. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mcpx-eval 2 | 3 | A framework for evaluating open-ended tool use across various large language models. 4 | 5 | `mcpx-eval` can be used to compare the output of different LLMs with the same prompt for a given task using [mcp.run](https://www.mcp.run) tools. 6 | This means we're not only interested in the quality of the output, but also curious about the helpfulness of various models 7 | when presented with real world tools. 
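Each eval is a small TOML file that pairs the prompt sent to the model under test with a `check` prompt for the judge and a list of `expected-tools`. The sketch below mirrors `tests/sort.toml` from this repository (the field reference is under "Test file" further down); the inline comments are explanatory additions rather than part of the shipped file:

```toml
# Shared fields can be pulled in from another file via `import`;
# template/eval.toml contributes expected-tools = ["eval-py", "eval-js"].
import = "template/eval.toml"
name = "sort"

# Prompt given to each model under test.
prompt = "Sort the list [5, 3, 4, 1, 2] in ascending order."

# Criteria the judge uses to score each model's output.
check = "[1, 2, 3, 4, 5]"

# Used by the bundled tests; presumably an upper bound on tool calls per run.
max-tool-calls = 5
```

A config like this can be run against one or more models with `mcpx-eval test --config tests/sort.toml --model <model>`, much as `run.sh` does for every file in `tests/`.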
8 | 9 | ## Test configs 10 | 11 | The [tests/](https://github.com/dylibso/mcpx-eval/tree/main/tests) directory contains pre-defined evals 12 | 13 | ## Installation 14 | 15 | 16 | ```bash 17 | uv tool install mcpx-eval 18 | ``` 19 | 20 | Or from git: 21 | 22 | ```bash 23 | uv tool install git+https://github.com/dylibso/mcpx-eval 24 | ``` 25 | 26 | Or using `uvx` without installation: 27 | 28 | ```bash 29 | uvx mcpx-eval 30 | ``` 31 | 32 | ## mcp.run Setup 33 | 34 | You will need to get an mcp.run session ID by running: 35 | 36 | ```bash 37 | npx --yes -p @dylibso/mcpx gen-session --write 38 | ``` 39 | 40 | This will generate a new session and write the session ID to a configuration file that can be used 41 | by `mcpx-eval`. 42 | 43 | If you need to store the session ID in an environment variable you can run `gen-session` 44 | without the `--write` flag: 45 | 46 | ```bash 47 | npx --yes -p @dylibso/mcpx gen-session 48 | ``` 49 | 50 | which should output something like: 51 | 52 | ``` 53 | Login successful! 54 | Session: kabA7w6qH58H7kKOQ5su4v3bX_CeFn4k.Y4l/s/9dQwkjv9r8t/xZFjsn2fkLzf+tkve89P1vKhQ 55 | ``` 56 | 57 | Then set the `MCP_RUN_SESSION_ID` environment variable: 58 | 59 | ``` 60 | $ export MCP_RUN_SESSION_ID=kabA7w6qH58H7kKOQ5su4v3bX_CeFn4k.Y4l/s/9dQwkjv9r8t/xZFjsn2fkLzf+tkve89P1vKhQ 61 | ``` 62 | 63 | ## Usage 64 | 65 | Run an eval comparing all mcp.task runs for `my-task`: 66 | 67 | ```bash 68 | mcpx-eval test --task my-task --task-run all 69 | ``` 70 | 71 | Only evaluate the latest task run: 72 | 73 | ```bash 74 | mcpx-eval test --task my-task --task-run latest 75 | ``` 76 | 77 | Or trigger a new task run: 78 | 79 | ```bash 80 | mcpx-eval test --task my-task --task-run new 81 | ``` 82 | 83 | Run an mcp.run task locally with a different set of models: 84 | 85 | ```bash 86 | mcpx-eval test --model .. --model .. --task my-task --iter 10 87 | ``` 88 | 89 | Generate an HTML scoreboard for all evals: 90 | 91 | ```bash 92 | mcpx-eval gen --html results.html --show 93 | ``` 94 | 95 | ### Test file 96 | 97 | A test file is a TOML file containing the following fields: 98 | 99 | - `name` - name of the test 100 | - `task` - optional, the name of the mcp.run task to use 101 | - `task-run` - optional, one of `latest`, `new`, `all` or the name/index of the task run to analyze 102 | - `prompt` - prompt to test, this is passed to the LLM under test, this can be left blank if `task` is set 103 | - `check` - prompt for the judge, this is used to determine the quality of the test output 104 | - `expected-tools` - list of tool names that might be used 105 | - `ignored-tools` - optional, list of tools to ignore, they will not be available to the LLM 106 | - `import` - optional, includes fields from another test TOML file 107 | - `vars` - optional, a dict of variables that will be used to format the prompt 108 | -------------------------------------------------------------------------------- /mcpx_eval/constants.py: -------------------------------------------------------------------------------- 1 | SYSTEM_PROMPT = """ 2 | You are a large language model evaluator, you are an expert at comparing the output of various models based on 3 | accuracy, tool use appropriateness, helpfullness, and quality of the output. 
4 | 5 | - The LLMs being tested may have different tools available from the judge 6 | - All numeric scores should be scored from 0.0 - 100.0, where 100 is the best score and 0 is the worst 7 | - The original prompt provided to the LLM can be found between the tags 8 | - The output of the LLM for the given prompt can be found between the tags, this is an array of the various 9 | messages sent and tools used. The final_result message should be used to fill the `llm_output` field 10 | - Additional information and context for each evaluation is included in the section 11 | - The section is provided by the user to list which tools may be to be used to execute the specified task 12 | if all of the tools listed aren't used it should not affect the score, however it is not good for non-expected tools to be used 13 | - Do not make assumptions about improvements to the quality of the output beyond what is noted in the tags, 14 | the section is defined by the user as a way to validate the output given for the associated prompt 15 | - The accuracy score should reflect the accuracy of the result generally and taking into account the block and results 16 | of tool calls 17 | - The tool_use score should be based on whether or not the correct tool was used and whether the minimum amount 18 | of tools were used to accomplish a task. Over use of tools or repeated use of tools should deduct points from 19 | this score. This score should also be affected by how well the tools used conform to the tools listed in the 20 | block. 21 | - If a tool call fails but is fixed after retrying after a reasonable amount of times it shouldn't be considered a failure 22 | since some exploration may be needed. 23 | - Multiple failed tool calls that end up accomplishing the goal are preferred to fewer calls that don't. 24 | - The helpfulness score should measure how useful the response is in addressing the user's need. This should also reflect 25 | the completeness of the response. 26 | - The quality score should reflect the overall clearness and conciseness of the output 27 | - Try to utilize the tools that are available instead of searching for new tools 28 | - Not using any tools should deduct some points from the tool use score 29 | - The `description` field should contain a breakdown of why each score was awarded 30 | 31 | Advanced evaluation metrics: 32 | - A guess should not be considered a hallucination, however it should affect the accuracy score 33 | - The hallucination_score should measure the presence of made-up, incorrect, or factually unsupported statements 34 | (lower is better, with 0 being no hallucinations and 100 being completely hallucinated) 35 | - hallucination_score should only apply to made up information, if information is true at the time of the request 36 | it should be considered to be true 37 | - The false_claims field should list any specific false statements or hallucinations identified in the response 38 | 39 | For responses containing hallucinations, analyze: 40 | 1. The severity of each hallucination (minor factual error vs completely fabricated information) 41 | 2. The confidence with which hallucinated content is presented 42 | 3. Whether hallucinations are central to the response or peripheral 43 | 4. Whether the hallucination could lead to harmful actions if believed 44 | 45 | For the hallucination_score metric (0-100 scale, lower is better), carefully check for any false statements, 46 | incorrect information, or made-up facts in the response and list them in the false_claims field. 
47 | 48 | Be thorough in your evaluation, considering how well the model's response meets both technical requirements and user needs. 49 | """ 50 | 51 | TEST_PROMPT = """ 52 | You are a helpful tool calling AI assistant with access to various external tools and APIs. Your goal is to complete tasks thoroughly and autonomously by making full use of these tools. Here are your core operating principles: 53 | 54 | 1. Take initiative - Don't wait for user permission to use tools. If a tool would help complete the task, use it immediately. 55 | 2. Chain multiple tools together when needed - Many tasks require multiple tool calls in sequence. Plan out and execute the full chain of calls needed to achieve the goal. 56 | 3. Handle errors gracefully - If a tool call fails, try alternative approaches or tools rather than asking the user what to do. 57 | 4. Make reasonable assumptions - When tool calls require parameters, use your best judgment to provide appropriate values rather than asking the user. 58 | 5. Show your work - After completing tool calls, explain what you did and show relevant results, but focus on the final outcome the user wanted. 59 | 6. Be thorough - Use tools repeatedly as needed until you're confident you've fully completed the task. Don't stop at partial solutions. However, repeated use of the same tool 60 | with the same paramters is unlikely to be helpful. 61 | 7. Always utilize the tools/functions that are already available rather than searching for new tools if possible. Instead of searching try to use an existing tool 62 | to accomplish a task. 63 | 8. Once an acceptable answer has been reached you should return it to the user, additional tool calls are not needed. 64 | 65 | Your responses should focus on results rather than asking questions. Only ask the user for clarification if the task itself is unclear or impossible with the tools available. 
66 | """ 67 | 68 | # OpenAI model identifiers 69 | OPENAI_MODELS = [ 70 | "gpt-4o", 71 | "o1", 72 | "o1-mini", 73 | "o3-mini", 74 | "o3", 75 | "gpt-3.5", 76 | "gpt-4", 77 | "gpt-4.5", 78 | ] 79 | 80 | # Default profile path 81 | DEFAULT_PROFILE = "~/default" 82 | -------------------------------------------------------------------------------- /mcpx_eval/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import List, Dict, Any, Tuple, Optional 3 | import pandas as pd 4 | from dataclasses import dataclass 5 | from .constants import OPENAI_MODELS, DEFAULT_PROFILE 6 | 7 | import json 8 | 9 | 10 | def normalize_profile(profile: str) -> str: 11 | """Normalize a profile path to ensure it has the proper format.""" 12 | if not profile: 13 | return DEFAULT_PROFILE 14 | if not profile.startswith("~/"): 15 | return "~/" + profile 16 | return profile 17 | 18 | 19 | def parse_model(m: str) -> Tuple[Optional[str], str, str]: 20 | """Parse a model string into provider, name and profile components.""" 21 | provider = None 22 | name = m 23 | profile = DEFAULT_PROFILE 24 | 25 | # Split provider and name 26 | if ":" in m: 27 | provider, name = m.split(":", maxsplit=1) 28 | 29 | # Split name and profile 30 | if "/" in name: 31 | name, profile = name.split("/", maxsplit=1) 32 | profile = normalize_profile(profile) 33 | 34 | # Infer provider if not specified 35 | if provider is None: 36 | if "claude" in name: 37 | provider = "anthropic" 38 | elif any(model in name for model in OPENAI_MODELS): 39 | provider = "openai" 40 | elif "gemini" in name: 41 | provider = "google" 42 | else: 43 | provider = "ollama" 44 | 45 | return (provider, name, profile) 46 | 47 | 48 | @dataclass 49 | class Model: 50 | name: str 51 | profile: str 52 | provider: str 53 | trace: dict | None = None 54 | 55 | def __init__( 56 | self, name: str, profile: Optional[str] = None, trace: dict | None = None 57 | ): 58 | provider, model_name, prof = parse_model(name) 59 | self.provider = provider 60 | self.name = model_name 61 | self.profile = profile if profile is not None else prof 62 | self.trace = trace 63 | 64 | @property 65 | def slug(self) -> str: 66 | """Generate a slug identifier for the model.""" 67 | if self.profile in [DEFAULT_PROFILE, "default"]: 68 | return self.name 69 | if self.profile.startswith("~/"): 70 | return f"{self.name}/{self.profile.split('/', maxsplit=1)[1]}" 71 | return f"{self.name}/{self.profile}" 72 | 73 | @property 74 | def provider_and_name(self) -> str: 75 | """Generate the provider/name identifier.""" 76 | return f"{self.provider}/{self.name}" 77 | 78 | @staticmethod 79 | def load_trace(path): 80 | """Load trace from disk""" 81 | with open(path, "r") as f: 82 | data = json.load(f) 83 | model = data.pop("model") 84 | return Model(model, trace=data) 85 | 86 | 87 | class ScoreModel(BaseModel): 88 | """Used to score the result of an LLM tool call.""" 89 | 90 | llm_output: str = Field( 91 | "", 92 | description="Model output, this is the 'content' field of the final message from the LLM", 93 | ) 94 | description: str = Field("", description="Description of results for this model") 95 | 96 | # Core metrics 97 | tool_use: float = Field( 98 | 0.0, description="A score (0-100) of how appropriate the tool use is" 99 | ) 100 | accuracy: float = Field( 101 | 0.0, 102 | description="A score (0-100) of how accurate the response is based on the output of the tool calls", 103 | ) 104 | completeness: float = Field( 105 | 0.0, 106 | 
description="A score (0-100) of how complete the response is according to the task at hand and criteria", 107 | ) 108 | quality: float = Field( 109 | 0.0, 110 | description="A score (0-100) of the response quality - this includes the usefullness and clarity of the output", 111 | ) 112 | 113 | # Hallucination metrics 114 | hallucination_score: float = Field( 115 | 0.0, 116 | description="A score (0-100) representing the presence of hallucinations (lower is better)", 117 | ) 118 | false_claims: list = Field( 119 | [], 120 | description="List of identified false claims or hallucinations in the response", 121 | ) 122 | 123 | # Tools 124 | failed_tool_calls: int = Field( 125 | 0, 126 | description="The number of failed tool calls, or tool calls that encountered an error", 127 | ) 128 | 129 | 130 | @dataclass 131 | class Score: 132 | """Used to score the result of an LLM tool call.""" 133 | 134 | score: ScoreModel 135 | model: str 136 | duration: float 137 | tool_analysis: dict 138 | redundant_tool_calls: int 139 | tool_calls: int 140 | trace: dict | None = None 141 | 142 | def __getattribute__(self, name: str) -> Any: 143 | if name == "score": 144 | return object.__getattribute__(self, name) 145 | if hasattr(self.score, name): 146 | return getattr(self.score, name) 147 | return object.__getattribute__(self, name) 148 | 149 | def to_dataframe(self) -> pd.DataFrame: 150 | """Convert results to a pandas DataFrame for analysis.""" 151 | record = { 152 | "model": self.model, 153 | "duration": self.duration, 154 | "tool_use": self.score.tool_use, 155 | "tool_calls": self.tool_calls, 156 | "accuracy": self.score.accuracy, 157 | "helpfulness": self.score.completeness, 158 | "quality": self.score.quality, 159 | "hallucination_score": self.score.hallucination_score, 160 | "redundant_tool_calls": self.redundant_tool_calls, 161 | "false_claims_count": len(self.score.false_claims), 162 | "trace": self.trace, 163 | } 164 | return pd.DataFrame(record) 165 | 166 | def save_trace(self, path): 167 | """Save trace to disk""" 168 | trace = self.trace.copy() 169 | trace["model"] = self.model 170 | with open(path, "w") as f: 171 | f.write(json.dumps(trace)) 172 | 173 | 174 | class Results(BaseModel): 175 | """Collection of scores from multiple model evaluations.""" 176 | 177 | scores: List[Score] = Field([], description="A list of scores for each model") 178 | duration: float = Field(0.0, description="Total duration of all tests") 179 | 180 | def to_dataframe(self) -> pd.DataFrame: 181 | """Convert results to a pandas DataFrame for analysis.""" 182 | records = [] 183 | for score in self.scores: 184 | records.append(score.to_dataframe()) 185 | return pd.concat(records) 186 | 187 | 188 | @dataclass 189 | class Test: 190 | """Configuration for a model evaluation test.""" 191 | 192 | name: str 193 | prompt: str 194 | check: str 195 | models: List[str] 196 | expected_tools: List[str] 197 | ignore_tools: List[str] 198 | profile: Optional[str] 199 | vars: Dict[str, Any] 200 | task: Optional[str] 201 | task_run: Optional[str] 202 | 203 | def __init__( 204 | self, 205 | name: str, 206 | prompt: str, 207 | check: str = "", 208 | models: List[str] | None = None, 209 | expected_tools: List[str] | None = None, 210 | ignore_tools: Optional[List[str]] = None, 211 | profile: Optional[str] = None, 212 | vars: Optional[Dict[str, Any]] = None, 213 | task: Optional[str] = None, 214 | task_run: Optional[str] = None, 215 | ): 216 | self.name = name 217 | self.prompt = prompt 218 | self.check = check 219 | self.models = models or [] 220 | 
self.expected_tools = expected_tools or [] 221 | self.profile = profile 222 | self.ignore_tools = ignore_tools or [] 223 | self.vars = vars or {} 224 | self.task = task 225 | self.task_run = task_run 226 | 227 | @staticmethod 228 | def from_dict(data: dict) -> "Test": 229 | """Parse a dict into a test""" 230 | return Test( 231 | data.get("name", ""), 232 | data.get("prompt", ""), 233 | data.get("check", ""), 234 | data.get("models", []), 235 | data.get("expected-tools", []), 236 | ignore_tools=data.get("ignored-tools", data.get("ignore-tools", [])), 237 | vars=data.get("vars", {}), 238 | profile=data.get("profile"), 239 | task=data.get("task"), 240 | task_run=data.get("task-run"), 241 | ) 242 | 243 | @staticmethod 244 | def load(path: str) -> "Test": 245 | """Load a test configuration from a TOML file.""" 246 | import tomllib 247 | import os 248 | 249 | with open(path) as f: 250 | s = f.read() 251 | data = tomllib.loads(s) 252 | 253 | if "import" in data: 254 | imports = data["import"] 255 | if isinstance(imports, str): 256 | imports = [imports] 257 | 258 | t = None 259 | for imp in imports: 260 | if t is None: 261 | t = Test.load(os.path.join(os.path.dirname(path), imp)) 262 | 263 | # Update test attributes with any overrides from current file 264 | t.name = data.get("name", t.name) 265 | t.prompt = data.get("prompt", t.prompt) 266 | t.check = data.get("check", t.check) 267 | t.profile = data.get("profile", t.profile) 268 | t.models = data.get("models", t.models) 269 | t.expected_tools.extend(data.get("expected-tools", [])) 270 | t.ignore_tools.extend( 271 | data.get("ignored-tools", data.get("ignore-tools", [])) 272 | ) 273 | t.vars.update(**data.get("vars", {})) 274 | t.task = t.task or data.get("task") 275 | t.task_run = t.task_run or data.get("task-run") 276 | return t 277 | 278 | if "name" not in data: 279 | data["name"] = path 280 | 281 | return Test.from_dict(data) 282 | -------------------------------------------------------------------------------- /tests/test_mcpx.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import Mock, patch, AsyncMock, MagicMock 3 | import asyncio 4 | from datetime import datetime, timedelta 5 | import json 6 | 7 | from mcpx_eval import Judge, Test, Model, Score, Results, Database 8 | from mcpx_eval.models import ScoreModel 9 | from mcpx_eval.judge import ToolAnalysis 10 | 11 | class TestJudge(unittest.TestCase): 12 | def setUp(self): 13 | self.judge = Judge( 14 | models=["test-model"], 15 | judge_model="test-judge", 16 | ignore_tools=["ignored-tool"] 17 | ) 18 | 19 | def test_add_model(self): 20 | """Test adding models to the judge""" 21 | judge = Judge() 22 | 23 | # Test adding string model 24 | judge.add_model("gpt-4") 25 | self.assertEqual(len(judge.models), 1) 26 | self.assertEqual(judge.models[0].name, "gpt-4") 27 | 28 | # Test adding Model instance 29 | model = Model(name="anthropic:claude-3") 30 | judge.add_model(model) 31 | self.assertEqual(len(judge.models), 2) 32 | self.assertEqual(judge.models[1].name, "claude-3") 33 | self.assertEqual(judge.models[1].provider, "anthropic") 34 | 35 | # Test adding model with profile 36 | judge.add_model("mistral", profile="custom") 37 | self.assertEqual(len(judge.models), 3) 38 | self.assertEqual(judge.models[2].name, "mistral") 39 | self.assertEqual(judge.models[2].profile, "custom") 40 | 41 | class TestToolAnalysis(unittest.TestCase): 42 | def test_analyze_message_unique_tools(self): 43 | """Test analyzing unique tool calls""" 44 
| from mcpx_eval.judge import ToolAnalysis 45 | 46 | tool_analysis = ToolAnalysis() 47 | 48 | # Test first unique tool call 49 | msg1 = { 50 | "tool": { 51 | "name": "test_tool", 52 | "input": {"param": "value1"} 53 | } 54 | } 55 | tool_analysis.analyze_message(msg1, 0) 56 | 57 | self.assertEqual(tool_analysis.total_tool_calls, 1) 58 | self.assertEqual(tool_analysis.redundant_tool_calls, 0) 59 | self.assertEqual( 60 | tool_analysis.tool_analysis["tool_0"]["redundancy"], 61 | "unique" 62 | ) 63 | 64 | # Test second unique tool call 65 | msg2 = { 66 | "tool": { 67 | "name": "test_tool", 68 | "input": {"param": "value2"} 69 | } 70 | } 71 | tool_analysis.analyze_message(msg2, 1) 72 | 73 | self.assertEqual(tool_analysis.total_tool_calls, 2) 74 | self.assertEqual(tool_analysis.redundant_tool_calls, 0) 75 | 76 | def test_analyze_message_redundant_tools(self): 77 | """Test analyzing redundant tool calls""" 78 | from mcpx_eval.judge import ToolAnalysis 79 | 80 | tool_analysis = ToolAnalysis() 81 | 82 | # Add first tool call 83 | msg1 = { 84 | "tool": { 85 | "name": "test_tool", 86 | "input": {"param": "value1"} 87 | } 88 | } 89 | tool_analysis.analyze_message(msg1, 0) 90 | 91 | # Add redundant tool call 92 | msg2 = { 93 | "tool": { 94 | "name": "test_tool", 95 | "input": {"param": "value1"} 96 | } 97 | } 98 | tool_analysis.analyze_message(msg2, 1) 99 | 100 | self.assertEqual(tool_analysis.total_tool_calls, 2) 101 | self.assertEqual(tool_analysis.redundant_tool_calls, 1) 102 | self.assertEqual( 103 | tool_analysis.tool_analysis["tool_1"]["redundancy"], 104 | "redundant" 105 | ) 106 | 107 | class TestModelApiConfig(unittest.TestCase): 108 | @patch.dict('os.environ', { 109 | 'OPENAI_HOST': 'https://custom-openai.com', 110 | 'GPT-4_HOST': 'https://custom-gpt4.com' 111 | }) 112 | def test_get_host_url(self): 113 | """Test getting host URLs for different providers""" 114 | from mcpx_eval.judge import ModelApiConfig 115 | 116 | # Test OpenAI default 117 | url = ModelApiConfig.get_host_url("gpt-3.5-turbo", "openai") 118 | self.assertEqual(url, "https://custom-openai.com/v1") 119 | 120 | # Test model-specific override 121 | url = ModelApiConfig.get_host_url("gpt-4", "openai") 122 | self.assertEqual(url, "https://custom-gpt4.com/v1") 123 | 124 | # Test Ollama default 125 | url = ModelApiConfig.get_host_url("llama2", "ollama") 126 | self.assertEqual(url, "http://127.0.0.1:11434/v1") 127 | 128 | import asyncio 129 | 130 | class AsyncIteratorMock: 131 | def __init__(self, items): 132 | self.items = items 133 | self.index = 0 134 | 135 | async def __aiter__(self): 136 | return self 137 | 138 | async def __anext__(self): 139 | try: 140 | item = self.items[self.index] 141 | except IndexError: 142 | raise StopAsyncIteration 143 | self.index += 1 144 | return item 145 | 146 | class MockPart: 147 | def __init__(self, **kwargs): 148 | self.__dict__.update(kwargs) 149 | 150 | class MockResponse: 151 | def __init__(self, **kwargs): 152 | for key, value in kwargs.items(): 153 | setattr(self, key, value) 154 | 155 | class TestJudgeEvaluation(unittest.IsolatedAsyncioTestCase): 156 | @patch('mcpx_eval.judge.Chat') 157 | @patch('mcpx_eval.judge.mcp_run') 158 | async def test_evaluate_model_success(self, mock_mcp_run, mock_chat): 159 | """Test successful model evaluation""" 160 | # Setup mock mcp_run.Client with proper tools attribute 161 | mock_tools = MagicMock() 162 | mock_tools.keys.return_value = ["test_tool"] 163 | mock_client = MagicMock() 164 | mock_client.tools = mock_tools 165 | mock_mcp_run.Client = 
Mock(return_value=mock_client) 166 | mock_mcp_run.ClientConfig = Mock() 167 | 168 | # Setup mock chat instance 169 | mock_chat_instance = MagicMock() 170 | mock_chat_instance.client = mock_client 171 | 172 | # Setup response parts 173 | model_response_parts = [ 174 | MockPart( 175 | part_kind="text", 176 | content="Test response" 177 | ), 178 | MockPart( 179 | part_kind="tool-call", 180 | tool_name="test_tool", 181 | tool_call_id="123", 182 | args={"param": "value"}, 183 | args_as_dict=lambda: {"param": "value"} 184 | ) 185 | ] 186 | request_parts = [ 187 | MockPart( 188 | part_kind="tool-return", 189 | tool_name="test_tool", 190 | tool_call_id="123", 191 | content="Tool result" 192 | ) 193 | ] 194 | 195 | async def mock_iter(prompt): 196 | yield MockResponse(model_response=MockResponse(parts=model_response_parts)) 197 | yield MockResponse(request=MockResponse(parts=request_parts)) 198 | yield MockResponse(data=MockPart(data="Final result")) 199 | 200 | mock_chat_instance.iter = mock_iter 201 | mock_chat.return_value = mock_chat_instance 202 | 203 | judge = Judge() 204 | model = Model(name="test-model") 205 | tool_analysis = ToolAnalysis() 206 | 207 | result = await judge.evaluate_model(model, "Test prompt", tool_analysis) 208 | 209 | self.assertIsNotNone(result) 210 | self.assertEqual(len(result["messages"]), 4) # text, tool-call, tool-return, final_result 211 | self.assertEqual(result["messages"][0]["kind"], "text") 212 | self.assertEqual(result["messages"][1]["kind"], "tool-call") 213 | self.assertEqual(result["messages"][2]["kind"], "tool-return") 214 | self.assertEqual(result["messages"][3]["kind"], "final_result") 215 | 216 | @patch('mcpx_eval.judge.Chat') 217 | @patch('mcpx_eval.judge.mcp_run') 218 | async def test_evaluate_model_failure(self, mock_mcp_run, mock_chat): 219 | """Test model evaluation with error""" 220 | # Setup mock mcp_run.Client 221 | mock_client = Mock() 222 | mock_mcp_run.Client = Mock(return_value=mock_client) 223 | mock_mcp_run.ClientConfig = Mock() 224 | 225 | mock_chat_instance = Mock() 226 | 227 | async def mock_iter(prompt): 228 | raise Exception("Test error") 229 | yield # Needed to make it a generator 230 | 231 | mock_chat_instance.iter = mock_iter 232 | mock_chat.return_value = mock_chat_instance 233 | 234 | judge = Judge() 235 | model = Model(name="test-model") 236 | tool_analysis = ToolAnalysis() 237 | 238 | result = await judge.evaluate_model(model, "Test prompt", tool_analysis) 239 | 240 | self.assertIsNone(result) 241 | 242 | class TestDatabase(unittest.TestCase): 243 | def setUp(self): 244 | self.db = Database(":memory:") # Use in-memory SQLite for testing 245 | 246 | def test_save_and_retrieve_results(self): 247 | """Test saving and retrieving test results""" 248 | # Create test data 249 | test_name = "test1" 250 | score_data = ScoreModel( 251 | tool_use=80, 252 | accuracy=90, 253 | completeness=85, 254 | quality=88, 255 | hallucination_score=5, 256 | false_claims=["claim1"], 257 | llm_output="test output", 258 | description="test description" 259 | ) 260 | 261 | score = Score( 262 | score=score_data, 263 | model="test-model", 264 | duration=1.5, 265 | tool_analysis={"tool_1": {"name": "test_tool", "redundancy": "unique"}}, 266 | redundant_tool_calls=0, 267 | tool_calls=1 268 | ) 269 | 270 | results = Results(scores=[score], duration=1.5) 271 | 272 | # Save results 273 | self.db.save_results(test_name, results) 274 | 275 | # Retrieve and verify results 276 | retrieved = self.db.average_results(test_name) 277 | 278 | 
self.assertEqual(len(retrieved.scores), 1) 279 | self.assertEqual(retrieved.scores[0].model, "test-model") 280 | self.assertEqual(retrieved.scores[0].duration, 1.5) 281 | self.assertEqual(retrieved.scores[0].tool_calls, 1) 282 | self.assertEqual(retrieved.scores[0].redundant_tool_calls, 0) 283 | self.assertEqual(retrieved.scores[0].accuracy, 90) 284 | 285 | if __name__ == '__main__': 286 | unittest.main() -------------------------------------------------------------------------------- /mcpx_eval/__main__.py: -------------------------------------------------------------------------------- 1 | from . import Judge, Test, Database 2 | from .html import visualize_json 3 | import asyncio 4 | import logging 5 | import pandas as pd 6 | from tempfile import NamedTemporaryFile 7 | import webbrowser 8 | import os 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def print_result(result): 14 | # Print model header 15 | print(f"\n{result.model}") 16 | print("=" * len(result.model)) 17 | 18 | # Create a DataFrame for the metrics 19 | metrics_df = pd.DataFrame( 20 | { 21 | "Metric": [ 22 | "Duration (s)", 23 | "Tool Calls", 24 | "Redundant Calls", 25 | "Failed Calls", 26 | "Tool Use %", 27 | "Accuracy %", 28 | "Completeness %", 29 | "Quality %", 30 | "Hallucination Score", 31 | ], 32 | "Value": [ 33 | f"{result.duration:.2f}", 34 | result.tool_calls, 35 | result.redundant_tool_calls, 36 | result.failed_tool_calls, 37 | f"{result.tool_use:.1f}", 38 | f"{result.accuracy:.1f}", 39 | f"{result.completeness:.1f}", 40 | f"{result.quality:.1f}", 41 | f"{result.hallucination_score:.1f}", 42 | ], 43 | } 44 | ) 45 | 46 | # Print metrics table 47 | print("\nMetrics:") 48 | print(metrics_df.to_string(index=False)) 49 | 50 | # Print output and description 51 | print("\nOutput:") 52 | print(result.llm_output) 53 | print("\nDescription:") 54 | print(result.description) 55 | 56 | # Print false claims if any 57 | if result.false_claims and len(result.false_claims) > 0: 58 | print("\nFalse Claims Detected:") 59 | for claim in result.false_claims: 60 | print(f" - {claim}") 61 | 62 | # Print tool analysis if any 63 | if result.tool_analysis and len(result.tool_analysis) > 0: 64 | print("\nTool Analysis:") 65 | tool_data = [] 66 | for tool_id, analysis in result.tool_analysis.items(): 67 | if isinstance(analysis, list): 68 | for a in analysis: 69 | tool_data.append( 70 | { 71 | "Tool ID": tool_id, 72 | "Name": a["name"], 73 | "Redundancy": a["redundancy"], 74 | } 75 | ) 76 | else: 77 | tool_data.append( 78 | { 79 | "Tool ID": tool_id, 80 | "Name": analysis["name"], 81 | "Redundancy": analysis["redundancy"], 82 | } 83 | ) 84 | 85 | if tool_data: 86 | tool_df = pd.DataFrame(tool_data) 87 | print(tool_df.to_string(index=False)) 88 | 89 | 90 | def summary(args): 91 | db = Database(args.db) 92 | res = db.average_results(args.name) 93 | if not res.scores: 94 | return # Database class now handles empty results messaging 95 | 96 | print(f"\nTest Summary: {args.name}") 97 | print("=" * (14 + len(args.name))) 98 | print(f"Number of results: {len(res.scores)}\n") 99 | 100 | for result in res.scores: 101 | print_result(result) 102 | 103 | 104 | def json_summary(args): 105 | """Generate a JSON summary of test data""" 106 | import json 107 | 108 | db = Database(args.db) 109 | summary = db.generate_json_summary() 110 | 111 | # Filter to specific test if requested 112 | if args.name: 113 | if args.name in summary["tests"]: 114 | filtered_summary = { 115 | "tests": {args.name: summary["tests"][args.name]}, 116 | "total": { 117 | 
"models": {}, 118 | "metrics": summary["tests"][args.name]["metrics"], 119 | "test_count": 1, 120 | "model_count": summary["tests"][args.name]["model_count"], 121 | }, 122 | "generated_at": summary["generated_at"], 123 | } 124 | # Include only models that participated in this test 125 | for model_name, model_data in summary["total"]["models"].items(): 126 | if model_name in summary["tests"][args.name]["models"]: 127 | filtered_summary["total"]["models"][model_name] = { 128 | **model_data, 129 | "test_count": 1, 130 | } 131 | summary = filtered_summary 132 | else: 133 | print(f"Warning: Test '{args.name}' not found in results") 134 | 135 | # Format JSON with indentation for readability 136 | formatted_json = json.dumps(summary, indent=2) 137 | 138 | # Output to file or stdout 139 | if args.json: 140 | with open(args.output, "w") as f: 141 | f.write(formatted_json) 142 | print(f"JSON summary saved to {args.output}") 143 | print( 144 | f"To visualize this file, run: uv run python -m mcpx_eval html {args.output}" 145 | ) 146 | elif not args.html and not args.show: 147 | print(formatted_json) 148 | 149 | # If visualization is requested, create and open it 150 | output_path = args.html 151 | html = visualize_json(summary, output_path) 152 | # Also save a copy to the specified location if provided 153 | if output_path: 154 | with open(output_path, "w") as f: 155 | f.write(html) 156 | print(f"Saved to {output_path}") 157 | temp_path = os.path.abspath(output_path) 158 | if args.show: 159 | if output_path is None: 160 | # Write to temporary file and open in browser 161 | with NamedTemporaryFile(suffix=".html", delete=False, mode="w") as f: 162 | f.write(html) 163 | temp_path = f.name 164 | 165 | print("Opening browser...") 166 | webbrowser.open(f"file://{temp_path}") 167 | 168 | 169 | async def run(): 170 | from argparse import ArgumentParser 171 | 172 | parser = ArgumentParser( 173 | "mcpx-eval", description="Open-ended LLM tool use evaluator for mcp.run tools" 174 | ) 175 | subparsers = parser.add_subparsers(dest="command", help="Command to run") 176 | parser.add_argument("--db", default=None, help="SQLite3 database path") 177 | 178 | # Main test command (default) 179 | test_parser = subparsers.add_parser("test", help="Run evaluation tests") 180 | test_parser.add_argument("--name", default="", help="Test name") 181 | test_parser.add_argument( 182 | "--model", 183 | "-m", 184 | default=[], 185 | help="Model to include in test", 186 | action="append", 187 | ) 188 | test_parser.add_argument( 189 | "--judge-model", 190 | default="claude-3-5-sonnet-latest", 191 | help="Model to use for Judge", 192 | ) 193 | test_parser.add_argument( 194 | "--ignore-tool", 195 | "-x", 196 | default=[], 197 | help="Ignore tool", 198 | action="append", 199 | ) 200 | test_parser.add_argument( 201 | "--tool", 202 | "-t", 203 | default=[], 204 | help="Expected tool", 205 | action="append", 206 | ) 207 | test_parser.add_argument( 208 | "--profile", 209 | "-p", 210 | default=None, 211 | help="Profile to use for judge model", 212 | ) 213 | 214 | test_parser.add_argument("--prompt", help="Test prompt") 215 | test_parser.add_argument("--check", help="Test check") 216 | test_parser.add_argument("--config", help="Test config file") 217 | test_parser.add_argument( 218 | "--iter", 219 | "-i", 220 | default=1, 221 | type=int, 222 | help="Number of times to run the test for each model", 223 | ) 224 | test_parser.add_argument( 225 | "--no-save", 226 | default=False, 227 | action="store_true", 228 | help="Don't save results in db", 229 | 
) 230 | test_parser.add_argument( 231 | "--task", 232 | default=None, 233 | help="Name of task from mcp.run to get prompt from", 234 | ) 235 | test_parser.add_argument( 236 | "--task-run", 237 | default=None, 238 | help="Name of a specific task run", 239 | ) 240 | test_parser.add_argument( 241 | "--var", 242 | default=[], 243 | help="Template variable", 244 | action="append", 245 | ) 246 | 247 | # Summary command 248 | summary_parser = subparsers.add_parser("summary", help="Show test results summary") 249 | summary_parser.add_argument("name", help="Test name to summarize") 250 | 251 | # JSON summary command 252 | gen_parser = subparsers.add_parser( 253 | "gen", help="Generate JSON summary of all test data" 254 | ) 255 | gen_parser.add_argument( 256 | "--name", 257 | "-n", 258 | help="Filter results to a specific test name", 259 | ) 260 | gen_parser.add_argument( 261 | "--json", 262 | help="Output JSON file path (default: print to stdout)", 263 | ) 264 | gen_parser.add_argument( 265 | "--show", 266 | "-s", 267 | action="store_true", 268 | help="Create an interactive HTML visualization of the JSON data", 269 | ) 270 | gen_parser.add_argument( 271 | "--html", 272 | help="Output path for HTML visualization (optional)", 273 | ) 274 | 275 | # Global options 276 | parser.add_argument( 277 | "--log", 278 | default=None, 279 | choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], 280 | help="Set the logging level.", 281 | ) 282 | parser.add_argument( 283 | "--verbose", default=False, action="store_true", help="Enable verbose logging" 284 | ) 285 | 286 | args = parser.parse_args() 287 | 288 | # Setup logging 289 | level = args.log or "INFO" 290 | log_level = getattr(logging, level, None) 291 | if not isinstance(log_level, int): 292 | raise ValueError("Invalid log level: %s" % level) 293 | logging.basicConfig(level=log_level) 294 | 295 | if not args.verbose: 296 | for handler in logging.root.handlers: 297 | handler.addFilter(logging.Filter("mcpx_eval")) 298 | 299 | # Handle command routing 300 | command = getattr(args, "command", "test") # Default to test if not specified 301 | 302 | # Visualization commands removed 303 | 304 | # Summary command 305 | if command == "summary": 306 | summary(args) 307 | return 308 | 309 | # gen command 310 | elif command == "gen": 311 | json_summary(args) 312 | return 313 | 314 | # Test command (default) 315 | elif command == "test": 316 | test = None 317 | name = args.name or args.task 318 | 319 | vars = {} 320 | for line in args.var: 321 | s = line.split("=") 322 | vars[s[0]] = s[1] 323 | 324 | if hasattr(args, "config") and args.config is not None: 325 | test = Test.load(args.config) 326 | for model in args.model: 327 | test.models.append(model) 328 | if args.name is None or args.name == "": 329 | if test.name is not None: 330 | name = test.name 331 | test.vars.update(**vars) 332 | test.expected_tools.extend(args.tool) 333 | test.ignore_tools.extend(args.ignore_tool) 334 | test.task = args.task or test.task 335 | test.prompt = args.prompt or test.prompt 336 | test.check = args.check or test.check 337 | test.name = args.name or test.name 338 | test.task_run = args.task_run or test.task_run 339 | else: 340 | test = Test( 341 | name=name, 342 | prompt=args.prompt or "", 343 | check=args.check or "", 344 | models=args.model, 345 | profile=args.profile, 346 | expected_tools=args.tool, 347 | ignore_tools=args.ignore_tool, 348 | vars=vars, 349 | task=args.task, 350 | task_run=args.task_run, 351 | ) 352 | 353 | iterations = args.iter 354 | logger.info( 355 | f"Running 
{test.name}: task={test.task is not None}, models=[{', '.join(test.models)}] ({iterations} iteration{'s' if iterations > 1 else ''})" 356 | ) 357 | db = None 358 | if args.db is not None: 359 | db = Database(args.db) 360 | 361 | judge = Judge( 362 | models=test.models, 363 | profile=args.profile, 364 | db=db, 365 | judge_model=args.judge_model, 366 | ignore_tools=test.ignore_tools, 367 | ) 368 | judge.db.save_test(test) 369 | 370 | total_duration = 0 371 | 372 | for i in range(iterations): 373 | if iterations > 1: 374 | logger.info(f"Iteration {i + 1}/{iterations}") 375 | 376 | # For multiple iterations, pass save=True to ensure each run is saved to DB 377 | res = await judge.run_test(test, save=not args.no_save) 378 | total_duration += res.duration 379 | logger.debug(f"Result: {res.scores}") 380 | if not args.no_save: 381 | logger.info("Results saved to db") 382 | 383 | if iterations > 1: 384 | logger.info(f"Iteration {i + 1} finished in {res.duration}s") 385 | 386 | logger.info(f"{test.name} finished in {total_duration}s total") 387 | 388 | if iterations > 1: 389 | print(f"\nShowing results from iteration {iterations} of {iterations}.") 390 | print(f"Use 'mcpx-eval summary {test.name}' to see aggregated results.\n") 391 | 392 | for result in res.scores: 393 | if result is None: 394 | continue 395 | print_result(result) 396 | else: 397 | parser.print_help() 398 | 399 | 400 | def main(): 401 | asyncio.run(run()) 402 | 403 | 404 | if __name__ == "__main__": 405 | main() 406 | -------------------------------------------------------------------------------- /mcpx_eval/database.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import json 3 | import pandas as pd 4 | from datetime import datetime 5 | from .models import Score, Results, Test, ScoreModel 6 | 7 | 8 | class Database: 9 | conn: sqlite3.Connection 10 | 11 | def __init__(self, path: str | None = "eval.db"): 12 | if path is None: 13 | path = "eval.db" 14 | self.conn = sqlite3.connect(path) 15 | 16 | self.conn.executescript( 17 | """ 18 | CREATE TABLE IF NOT EXISTS tests ( 19 | id INTEGER PRIMARY KEY, 20 | name TEXT NOT NULL, 21 | prompt TEXT NOT NULL, 22 | prompt_check TEXT NOT NULL, 23 | UNIQUE(name) 24 | ); 25 | CREATE TABLE IF NOT EXISTS eval_results ( 26 | id INTEGER PRIMARY KEY, 27 | t TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 28 | test_name TEXT NOT NULL, 29 | model TEXT NOT NULL, 30 | duration REAL NOT NULL, 31 | output TEXT NOT NULL, 32 | description TEXT NOT NULL, 33 | accuracy REAL NOT NULL, 34 | tool_use REAL NOT NULL, 35 | tool_calls INT NOT NULL, 36 | redundant_tool_calls INT NOT NULL DEFAULT 0, 37 | failed_tool_calls INT NOT NULL DEFAULT 0, 38 | completeness REAL NOT NULL DEFAULT 0.0, 39 | quality REAL NOT NULL, 40 | hallucination_score REAL NOT NULL DEFAULT 0.0, 41 | false_claims TEXT NOT NULL DEFAULT '[]', 42 | tool_analysis TEXT NOT NULL DEFAULT '{}', 43 | FOREIGN KEY(test_name) REFERENCES tests(name) 44 | ); 45 | """ 46 | ) 47 | self.conn.commit() 48 | 49 | def save_score(self, name: str, score: Score, commit=True): 50 | if name == "": 51 | return 52 | 53 | # Convert score to DataFrame for efficient insertion 54 | df = pd.DataFrame( 55 | [ 56 | { 57 | "test_name": name, 58 | "model": score.model, 59 | "duration": score.duration, 60 | "output": score.llm_output, 61 | "description": score.description, 62 | "accuracy": score.accuracy, 63 | "tool_use": score.tool_use, 64 | "tool_calls": score.tool_calls, 65 | "redundant_tool_calls": score.redundant_tool_calls, 66 | 
"failed_tool_calls": score.failed_tool_calls, 67 | "completeness": score.completeness, 68 | "quality": score.quality, 69 | "hallucination_score": score.hallucination_score, 70 | "false_claims": json.dumps(score.false_claims), 71 | "tool_analysis": json.dumps(score.tool_analysis), 72 | } 73 | ] 74 | ) 75 | 76 | df.to_sql("eval_results", self.conn, if_exists="append", index=False) 77 | if commit: 78 | self.conn.commit() 79 | 80 | def save_test(self, test: "Test"): 81 | self.conn.execute( 82 | """ 83 | INSERT OR IGNORE INTO tests (name, prompt, prompt_check) VALUES (?, ?, ?); 84 | """, 85 | (test.name, test.prompt, test.check), 86 | ) 87 | self.conn.commit() 88 | 89 | def save_results(self, name: str, results: Results): 90 | if not results.scores: 91 | return 92 | 93 | # Convert all scores to DataFrame at once 94 | records = [ 95 | { 96 | "test_name": name, 97 | "model": score.model, 98 | "duration": score.duration, 99 | "output": score.llm_output, 100 | "description": score.description, 101 | "accuracy": score.accuracy, 102 | "tool_use": score.tool_use, 103 | "tool_calls": score.tool_calls, 104 | "redundant_tool_calls": score.redundant_tool_calls, 105 | "failed_tool_calls": score.failed_tool_calls, 106 | "completeness": score.completeness, 107 | "quality": score.quality, 108 | "hallucination_score": score.hallucination_score, 109 | "false_claims": json.dumps(score.false_claims), 110 | "tool_analysis": json.dumps(score.tool_analysis), 111 | } 112 | for score in results.scores 113 | ] 114 | 115 | df = pd.DataFrame(records) 116 | df.to_sql("eval_results", self.conn, if_exists="append", index=False) 117 | self.conn.commit() 118 | 119 | def average_results(self, name: str) -> Results: 120 | # Read results into a pandas DataFrame 121 | df = pd.read_sql_query( 122 | """ 123 | SELECT * 124 | FROM eval_results 125 | WHERE test_name = ? 
126 | """, 127 | self.conn, 128 | params=(name,), 129 | ) 130 | 131 | if df.empty: 132 | print(f"No results found in database for test: {name}") 133 | print("Available tests:") 134 | tests = pd.read_sql_query( 135 | "SELECT DISTINCT test_name FROM eval_results", self.conn 136 | ) 137 | if tests.empty: 138 | print(" No tests have been run yet") 139 | else: 140 | for test in tests["test_name"]: 141 | print(f" - {test}") 142 | return Results(scores=[]) 143 | 144 | # Convert false_claims and tool_analysis from JSON strings 145 | df["false_claims"] = df["false_claims"].apply(json.loads) 146 | df["tool_analysis"] = df["tool_analysis"].apply(json.loads) 147 | 148 | # Group by model and aggregate 149 | grouped = ( 150 | df.groupby("model") 151 | .agg( 152 | { 153 | "duration": "mean", 154 | "output": "first", # take first output as example 155 | "description": "first", # take first description as example 156 | "accuracy": "mean", 157 | "tool_use": "mean", 158 | "tool_calls": "mean", 159 | "redundant_tool_calls": "mean", 160 | "completeness": "mean", 161 | "quality": "mean", 162 | "hallucination_score": "mean", 163 | "false_claims": "sum", # combine all false claims 164 | "tool_analysis": "first", # take first tool analysis 165 | } 166 | ) 167 | .reset_index() 168 | ) 169 | 170 | # Convert back to Score objects 171 | scores = [ 172 | Score( 173 | model=row["model"], 174 | duration=row["duration"], 175 | score=ScoreModel( 176 | llm_output=row["output"], 177 | description=row["description"], 178 | accuracy=row["accuracy"], 179 | tool_use=row["tool_use"], 180 | completeness=row["completeness"], 181 | quality=row["quality"], 182 | hallucination_score=row["hallucination_score"], 183 | false_claims=row["false_claims"], 184 | ), 185 | tool_analysis=row["tool_analysis"], 186 | redundant_tool_calls=int(row["redundant_tool_calls"]), 187 | tool_calls=int(row["tool_calls"]), 188 | ) 189 | for _, row in grouped.iterrows() 190 | ] 191 | 192 | return Results(scores=scores) 193 | 194 | def get_test_stats(self, test_name: str | None = None) -> pd.DataFrame: 195 | """Get detailed statistics for tests. 196 | 197 | Args: 198 | test_name: Optional test name to filter results 199 | 200 | Returns: 201 | DataFrame with test statistics including: 202 | - Number of runs per model 203 | - Mean and std dev of scores 204 | - Min/max durations 205 | """ 206 | query = """ 207 | SELECT 208 | test_name, 209 | model, 210 | COUNT(*) as runs, 211 | AVG(duration) as mean_duration, 212 | MIN(duration) as min_duration, 213 | MAX(duration) as max_duration, 214 | AVG(accuracy) as mean_accuracy, 215 | AVG(tool_use) as mean_tool_use, 216 | AVG(tool_calls) as mean_tool_calls, 217 | AVG(redundant_tool_calls) as mean_redundant_calls, 218 | AVG(completeness) as mean_completeness, 219 | AVG(quality) as mean_quality, 220 | AVG(hallucination_score) as mean_hallucination 221 | FROM eval_results 222 | """ 223 | 224 | if test_name: 225 | query += " WHERE test_name = ?" 
226 | params = (test_name,) 227 | else: 228 | params = () 229 | 230 | query += " GROUP BY test_name, model" 231 | 232 | return pd.read_sql_query(query, self.conn, params=params) 233 | 234 | def generate_json_summary(self): 235 | # Read results into a pandas DataFrame 236 | df = pd.read_sql_query( 237 | """ 238 | SELECT 239 | test_name, 240 | model, 241 | AVG(accuracy) as accuracy, 242 | AVG(tool_use) as tool_use, 243 | AVG(tool_calls) as tool_calls, 244 | AVG(redundant_tool_calls) as redundant_tool_calls, 245 | AVG(failed_tool_calls) as failed_tool_calls, 246 | AVG(completeness) as completeness, 247 | AVG(quality) as quality, 248 | AVG(hallucination_score) as hallucination_score, 249 | AVG(duration) as duration, 250 | COUNT(*) as runs 251 | FROM eval_results 252 | GROUP BY test_name, model 253 | """, 254 | self.conn, 255 | ) 256 | 257 | # Use pandas styling to create formatted HTML tables 258 | def style_table(df): 259 | return ( 260 | df.style.format( 261 | { 262 | "accuracy": "{:.3f}%", 263 | "tool_use": "{:.3f}%", 264 | "completeness": "{:.3f}%", 265 | "quality": "{:.3f}%", 266 | "hallucination_score": "{:.3f}%", 267 | "tool_calls": "{:.1f}", 268 | "redundant_tool_calls": "{:.1f}", 269 | "runs": "{:.0f}", 270 | "duration": "{:.3f}", 271 | } 272 | ) 273 | .background_gradient( 274 | subset=[ 275 | "accuracy", 276 | "tool_use", 277 | "completeness", 278 | "quality", 279 | ], 280 | cmap="RdYlGn", 281 | ) 282 | .background_gradient(subset=["hallucination_score"], cmap="RdYlGn_r") 283 | .set_properties(**{"text-align": "center"}) 284 | .to_html() 285 | ) 286 | 287 | # Generate summary structure 288 | summary = { 289 | "tests": {}, 290 | "total": { 291 | "models": {}, 292 | "metrics": {}, 293 | "test_count": len(df["test_name"].unique()), 294 | "model_count": len(df["model"].unique()), 295 | }, 296 | } 297 | 298 | # Calculate total metrics with formatted precision 299 | total_metrics = df.agg( 300 | { 301 | "accuracy": lambda x: round(x.mean(), 3), 302 | "tool_use": lambda x: round(x.mean(), 3), 303 | "tool_calls": lambda x: round(x.sum(), 1), 304 | "redundant_tool_calls": lambda x: round(x.sum(), 1), 305 | "completeness": lambda x: round(x.mean(), 3), 306 | "quality": lambda x: round(x.mean(), 3), 307 | "hallucination_score": lambda x: round(x.mean(), 3), 308 | } 309 | ) 310 | summary["total"]["metrics"] = total_metrics.to_dict() 311 | 312 | # Process each test 313 | for test_name in df["test_name"].unique(): 314 | test_df = df[df["test_name"] == test_name] 315 | test_df = test_df.sort_values("quality", ascending=False) 316 | 317 | # Calculate test metrics with formatted precision 318 | test_metrics = test_df.agg( 319 | { 320 | "accuracy": lambda x: round(x.mean(), 3), 321 | "tool_use": lambda x: round(x.mean(), 3), 322 | "tool_calls": lambda x: round(x.sum(), 1), 323 | "redundant_tool_calls": lambda x: round(x.sum(), 1), 324 | "completeness": lambda x: round(x.mean(), 3), 325 | "quality": lambda x: round(x.mean(), 3), 326 | "hallucination_score": lambda x: round(x.mean(), 3), 327 | } 328 | ) 329 | 330 | # Round tool calls in test metrics to 1 decimal place 331 | if "tool_calls" in test_metrics: 332 | test_metrics["tool_calls"] = round(test_metrics["tool_calls"], 1) 333 | if "redundant_tool_calls" in test_metrics: 334 | test_metrics["redundant_tool_calls"] = round( 335 | test_metrics["redundant_tool_calls"], 1 336 | ) 337 | 338 | summary["tests"][test_name] = { 339 | "models": { 340 | row["model"]: { 341 | "accuracy": row["accuracy"], 342 | "tool_use": row["tool_use"], 343 | "tool_calls": 
row["tool_calls"], 344 | "redundant_tool_calls": row["redundant_tool_calls"], 345 | "failed_tool_calls": row["failed_tool_calls"], 346 | "completeness": row["completeness"], 347 | "quality": row["quality"], 348 | "hallucination_score": row["hallucination_score"], 349 | "runs": row["runs"], 350 | "duration": row["duration"], 351 | } 352 | for _, row in test_df.iterrows() 353 | }, 354 | "metrics": test_metrics.to_dict(), 355 | "model_count": len(test_df["model"].unique()), 356 | } 357 | 358 | # Update total models data 359 | for model in test_df["model"].unique(): 360 | model_data = test_df[test_df["model"] == model].iloc[0] 361 | if model not in summary["total"]["models"]: 362 | summary["total"]["models"][model] = { 363 | "accuracy": 0.0, 364 | "tool_use": 0.0, 365 | "tool_calls": 0, 366 | "redundant_tool_calls": 0, 367 | "completeness": 0.0, 368 | "quality": 0.0, 369 | "hallucination_score": 0.0, 370 | "test_count": 0, 371 | "duration": 0.0, 372 | } 373 | 374 | summary["total"]["models"][model]["test_count"] += 1 375 | for metric in [ 376 | "accuracy", 377 | "tool_use", 378 | "completeness", 379 | "quality", 380 | "hallucination_score", 381 | "duration", 382 | ]: 383 | summary["total"]["models"][model][metric] += model_data[metric] 384 | summary["total"]["models"][model]["tool_calls"] += model_data[ 385 | "tool_calls" 386 | ] 387 | summary["total"]["models"][model]["redundant_tool_calls"] += model_data[ 388 | "redundant_tool_calls" 389 | ] 390 | 391 | # Calculate averages for total model metrics 392 | for model in summary["total"]["models"]: 393 | test_count = summary["total"]["models"][model]["test_count"] 394 | if test_count > 0: 395 | for metric in [ 396 | "accuracy", 397 | "tool_use", 398 | "completeness", 399 | "quality", 400 | "hallucination_score", 401 | "duration", 402 | ]: 403 | summary["total"]["models"][model][metric] /= test_count 404 | 405 | # Add timestamp 406 | summary["generated_at"] = datetime.now().isoformat() 407 | 408 | return summary 409 | -------------------------------------------------------------------------------- /mcpx_eval/judge.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Dict, Any, Optional 3 | from datetime import datetime, timedelta 4 | import json 5 | import traceback 6 | import os 7 | 8 | from mcpx_py import Chat, mcp_run, openai_compatible_model 9 | import pystache 10 | 11 | from .models import ScoreModel, Score, Results, Test, Model 12 | from .database import Database 13 | from .constants import SYSTEM_PROMPT, TEST_PROMPT 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def is_int(x): 19 | if x is None: 20 | return False 21 | try: 22 | int(x) 23 | return True 24 | except ValueError: 25 | return False 26 | 27 | 28 | def task_run_index( 29 | client: mcp_run.Client, task: str, index: int = -1 30 | ) -> mcp_run.TaskRun | None: 31 | a = list(client.list_task_runs(task)) 32 | a.reverse() 33 | try: 34 | return a[index] 35 | except IndexError: 36 | return None 37 | 38 | 39 | class ModelApiConfig: 40 | """Helper class to manage model API configurations.""" 41 | 42 | @staticmethod 43 | def get_host_url(model_name: str, provider: str) -> str: 44 | """Get the appropriate API host URL for a given model and provider.""" 45 | if provider in ["ollama", "llama"]: 46 | host = os.environ.get( 47 | f"{model_name.upper()}_HOST", 48 | os.environ.get( 49 | "LLAMA_HOST", 50 | os.environ.get("OLLAMA_HOST", "http://127.0.0.1:11434"), 51 | ), 52 | ) 53 | return f"{host}/v1" if not 
host.endswith("/v1") else host 54 | elif provider == "openai": 55 | host = os.environ.get( 56 | f"{model_name.upper()}_HOST", 57 | os.environ.get("OPENAI_HOST", "https://api.openai.com"), 58 | ) 59 | return f"{host}/v1" if not host.endswith("/v1") else host 60 | return "" 61 | 62 | @staticmethod 63 | def get_model_config(model: Model) -> str: 64 | """Get the appropriate model configuration for API calls.""" 65 | if model.provider in ["ollama", "llama", "openai"]: 66 | host = ModelApiConfig.get_host_url(model.name, model.provider) 67 | return openai_compatible_model(host, model.name) 68 | return model.name 69 | 70 | 71 | class ToolAnalysis: 72 | """Helper class to analyze tool usage patterns.""" 73 | 74 | def __init__(self): 75 | self.tool_analysis: Dict[str, Any] = {} 76 | self.redundant_tool_calls = 0 77 | self.seen_tool_patterns = set() 78 | self.total_tool_calls = 0 79 | 80 | def analyze_message(self, msg: Dict[str, Any], index: int) -> None: 81 | """Analyze a single message for tool usage patterns.""" 82 | if not msg.get("tool"): 83 | return 84 | 85 | tool_name = msg["tool"]["name"] 86 | tool_input = msg["tool"]["input"] 87 | self.total_tool_calls += 1 88 | 89 | # Create pattern string for redundancy detection 90 | tool_pattern = f"{tool_name}:{str(tool_input)}" 91 | 92 | # Check for redundancy 93 | redundancy_status = ( 94 | "redundant" if tool_pattern in self.seen_tool_patterns else "unique" 95 | ) 96 | if redundancy_status == "redundant": 97 | self.redundant_tool_calls += 1 98 | else: 99 | self.seen_tool_patterns.add(tool_pattern) 100 | 101 | # Store tool analysis 102 | self.tool_analysis[f"tool_{index}"] = { 103 | "name": tool_name, 104 | "input": tool_input, 105 | "redundancy": redundancy_status, 106 | } 107 | 108 | 109 | def format_judge_prompt(prompt, results, check, expected_tools): 110 | if check is None or check == "": 111 | check = "Make sure the output matches the requirments of the prompt" 112 | return f""" 113 | 114 | Current date and time: {datetime.now().isoformat()} 115 | 116 | 117 | {prompt} 118 | 119 | 120 | {json.dumps(results)} 121 | 122 | {check} 123 | {", ".join(expected_tools)} 124 | """ 125 | 126 | 127 | class Judge: 128 | """Evaluates model performance on given tests.""" 129 | 130 | model: Model 131 | models: List[Model] 132 | ignore_tools: List[str] 133 | db: Database 134 | profile: Optional[str] 135 | retries: int 136 | 137 | def __init__( 138 | self, 139 | models: Optional[List[Model | str]] = None, 140 | db: Optional[Database] = None, 141 | profile: Optional[str] = None, 142 | judge_model: str = "claude-3-5-sonnet-latest", 143 | ignore_tools: Optional[List[str]] = None, 144 | retries: Optional[int] = None, 145 | ): 146 | self.retries = retries or 10 147 | self.profile = profile or mcp_run.ProfileSlug("~", "default") 148 | self.ignore_tools = ignore_tools or [] 149 | self.db = db or Database() 150 | self.models = [] 151 | self.model = Model(name=judge_model) 152 | if models is not None: 153 | for model in models: 154 | self.add_model(model) 155 | 156 | def add_model( 157 | self, 158 | model: Model | str, 159 | profile: Optional[str] = None, 160 | ) -> None: 161 | """Add a model to the evaluation list.""" 162 | if isinstance(model, str): 163 | model = Model(name=model) 164 | if profile is not None: 165 | model.profile = profile 166 | self.models.append(model) 167 | 168 | async def run_test(self, test: Test, save: bool = True) -> Results: 169 | """Run a specific test configuration.""" 170 | profile = test.profile 171 | if profile is None: 172 | profile = 
self.profile or mcp_run.ProfileSlug("~", "default") 173 | else: 174 | profile = mcp_run.ProfileSlug.parse(profile) 175 | 176 | if test.task is not None: 177 | client = mcp_run.Client(config=mcp_run.ClientConfig(profile=profile)) 178 | tasks = client.tasks 179 | if test.task not in tasks: 180 | raise Exception(f"Invalid task, {test.task} not found in {profile}") 181 | test.prompt = tasks[test.task].prompt 182 | 183 | results = await self.run( 184 | pystache.render(test.prompt, test.vars), 185 | test.check, 186 | test.expected_tools, 187 | test.task, 188 | test.task_run, 189 | ) 190 | 191 | if save: 192 | self.db.save_results(test.name, results) 193 | return results 194 | 195 | async def evaluate_model( 196 | self, 197 | model: Model, 198 | prompt: str, 199 | tool_analysis: ToolAnalysis, 200 | ) -> Dict[str, Any]: 201 | """Evaluate a single model's performance.""" 202 | result = {"messages": [], "tools-available": []} 203 | 204 | try: 205 | model_config = ModelApiConfig.get_model_config(model) 206 | chat = Chat( 207 | client=mcp_run.Client( 208 | config=mcp_run.ClientConfig(profile=model.profile) 209 | ), 210 | model=model_config, 211 | ignore_tools=self.ignore_tools, 212 | system_prompt=TEST_PROMPT, 213 | retries=5, 214 | ) 215 | 216 | # Get available tools, handling both real and mock objects 217 | try: 218 | result["tools-available"] = list(chat.client.tools.keys()) 219 | except (TypeError, AttributeError): 220 | # If tools is a mock object, get the return value directly 221 | result["tools-available"] = chat.client.tools.keys() 222 | 223 | async for node in chat.iter(prompt): 224 | if hasattr(node, "model_response"): 225 | for part in node.model_response.parts: 226 | if part.part_kind == "text": 227 | logger.info(part.content) 228 | result["messages"].append( 229 | {"kind": part.part_kind, "text": part.content} 230 | ) 231 | elif part.part_kind == "tool-call": 232 | logger.info( 233 | f"Tool {part.tool_name}({part.tool_call_id}): {part.args}" 234 | ) 235 | result["messages"].append( 236 | { 237 | "kind": part.part_kind, 238 | "tool": { 239 | "name": part.tool_name, 240 | "input": part.args_as_dict(), 241 | }, 242 | "tool_call_id": part.tool_call_id, 243 | } 244 | ) 245 | tool_analysis.analyze_message( 246 | result["messages"][-1], len(result["messages"]) - 1 247 | ) 248 | 249 | elif hasattr(node, "request"): 250 | for part in node.request.parts: 251 | if part.part_kind == "text": 252 | result["messages"].append( 253 | {"kind": part.part_kind, "text": part.content} 254 | ) 255 | elif part.part_kind == "tool-return": 256 | logger.info( 257 | f"Tool returned {part.tool_name}({part.tool_call_id})" 258 | ) 259 | logger.debug( 260 | f"Tool result {part.tool_name}({part.tool_call_id}):\n{part.content}" 261 | ) 262 | result["messages"].append( 263 | { 264 | "kind": part.part_kind, 265 | "tool_name": part.tool_name, 266 | "content": part.content, 267 | "tool_call_id": part.tool_call_id, 268 | } 269 | ) 270 | elif hasattr(node, "data"): 271 | logger.debug(f"Final result: {node.data.data}") 272 | result["messages"].append( 273 | {"kind": "final_result", "text": node.data.data} 274 | ) 275 | 276 | except KeyboardInterrupt: 277 | return None 278 | except Exception: 279 | logger.error(f"{model.slug} failed: {traceback.format_exc()}") 280 | return None 281 | 282 | return result 283 | 284 | async def _evaluate_task_run( 285 | self, 286 | client: mcp_run.Client, 287 | run: mcp_run.TaskRun, 288 | check: str, 289 | expected_tools: List[str], 290 | model_config: ModelApiConfig, 291 | ) -> Score: 292 | 
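# Judge an existing mcp.run task run: the prompt is recovered from the run's first
# exchange, the full results_list is passed to the judge model via
# format_judge_prompt, and tool-call statistics are rebuilt from the recorded
# "call tool request" events.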
logger.info(f"Analyzing task run {run.name}") 293 | prompt = run.results_list[0]["exchange"]["content"] 294 | agent = Chat( 295 | client=client, 296 | model=model_config, 297 | ignore_tools=self.ignore_tools, 298 | result_type=ScoreModel, 299 | system_prompt=SYSTEM_PROMPT, 300 | result_retries=self.retries, 301 | ) 302 | 303 | res = await agent.send_message( 304 | format_judge_prompt(prompt, run.results_list, check, expected_tools) 305 | ) 306 | 307 | tool_analysis = ToolAnalysis() 308 | 309 | for i, event in enumerate(run.results_list): 310 | if event["msg"] == "call tool request": 311 | tool_analysis.analyze_message( 312 | { 313 | "tool": { 314 | "name": event["params"]["name"], 315 | "input": event["params"]["arguments"], 316 | } 317 | }, 318 | i, 319 | ) 320 | 321 | duration = (run.modified_at - run.created_at).total_seconds() 322 | return Score( 323 | score=res.data, 324 | model=run._task.provider["settings"]["model"] + "-" + run.name, 325 | duration=duration, 326 | tool_analysis=tool_analysis.tool_analysis, 327 | redundant_tool_calls=tool_analysis.redundant_tool_calls, 328 | tool_calls=tool_analysis.total_tool_calls, 329 | trace=run.results_list, 330 | ) 331 | 332 | async def run( 333 | self, 334 | prompt: str, 335 | check: str, 336 | expected_tools: List[str], 337 | task: str | None = None, 338 | task_run: str | None = None, 339 | vars: dict | None = None, 340 | ) -> Results: 341 | """Run evaluation across all models.""" 342 | scores = [] 343 | total_duration = timedelta(seconds=0) 344 | 345 | model_config = ModelApiConfig.get_model_config(self.model) 346 | if task is not None: 347 | client = mcp_run.Client(config=mcp_run.ClientConfig(profile=self.profile)) 348 | if task_run.lower() == "all": 349 | for run in client.list_task_runs(task): 350 | scores.append( 351 | await self._evaluate_task_run( 352 | client, run, check, expected_tools, model_config 353 | ) 354 | ) 355 | elif is_int(task_run) or task_run == "latest": 356 | if task_run.lower() == "latest": 357 | task_run = -1 358 | task_run = int(task_run or -1) 359 | run = task_run_index(client, task, index=task_run) 360 | if run is not None: 361 | scores.append( 362 | await self._evaluate_task_run( 363 | client, run, check, expected_tools, model_config 364 | ) 365 | ) 366 | else: 367 | logger.error(f"Unable to load {task_run} for task {task}") 368 | elif task_run is not None and task_run.lower() != "new": 369 | found = False 370 | for run in client.list_task_runs(task): 371 | if run.name == task_run: 372 | scores.append( 373 | await self._evaluate_task_run( 374 | client, run, check, expected_tools, model_config 375 | ) 376 | ) 377 | found = True 378 | if not found: 379 | logger.error(f"Unable to load {task_run} for task {task}") 380 | elif len(self.models) == 0: 381 | logger.info("No task run specified, this will execute a new task run") 382 | run = client.tasks[task].run(vars or {}) 383 | run.wait() 384 | run = task_run_index(client, task, index=-1) 385 | if run is not None: 386 | scores.append( 387 | await self._evaluate_task_run( 388 | client, run, check, expected_tools, model_config 389 | ) 390 | ) 391 | else: 392 | logger.error(f"Unable to load {task_run} for task {task}") 393 | 394 | for model in self.models: 395 | start = datetime.now() 396 | tool_analysis = ToolAnalysis() 397 | 398 | logger.info(f"Evaluating model {model.slug}") 399 | result = await self.evaluate_model(model, prompt, tool_analysis) 400 | 401 | if result is None: 402 | continue 403 | 404 | duration = datetime.now() - start 405 | duration_seconds = 
duration.total_seconds() 406 | total_duration += duration 407 | 408 | result["duration_in_seconds"] = f"{duration_seconds}s" 409 | result["number_of_tools_used"] = str(tool_analysis.total_tool_calls) 410 | 411 | logger.info( 412 | f"Analyzing results of {model.slug} with profile={self.profile}" 413 | ) 414 | agent = Chat( 415 | client=mcp_run.Client( 416 | config=mcp_run.ClientConfig(profile=self.profile) 417 | ), 418 | model=model_config, 419 | ignore_tools=self.ignore_tools, 420 | result_type=ScoreModel, 421 | system_prompt=SYSTEM_PROMPT, 422 | result_retries=self.retries, 423 | ) 424 | 425 | res = await agent.send_message( 426 | format_judge_prompt(prompt, result, check, expected_tools) 427 | ) 428 | scores.append( 429 | Score( 430 | score=res.data, 431 | model=model.slug, 432 | duration=duration_seconds, 433 | tool_analysis=tool_analysis.tool_analysis, 434 | redundant_tool_calls=tool_analysis.redundant_tool_calls, 435 | tool_calls=tool_analysis.total_tool_calls, 436 | trace=result, 437 | ) 438 | ) 439 | 440 | return Results(scores=scores, duration=total_duration.total_seconds()) 441 | -------------------------------------------------------------------------------- /mcpx_eval/html.py: -------------------------------------------------------------------------------- 1 | def visualize_json(data, output_path=None): 2 | """Create an interactive HTML visualization of JSON data""" 3 | import json 4 | from datetime import datetime 5 | import matplotlib.pyplot as plt 6 | import io 7 | import base64 8 | 9 | def create_performance_graph(data): 10 | """Create a matplotlib graph of model performance""" 11 | if not data.get("total", {}).get("models"): 12 | return "" 13 | 14 | models = data["total"]["models"] 15 | model_names = list(models.keys()) 16 | metrics = { 17 | "accuracy": [models[m]["accuracy"] for m in model_names], 18 | "tool_use": [models[m]["tool_use"] for m in model_names], 19 | "completeness": [models[m]["completeness"] for m in model_names], 20 | "quality": [models[m]["quality"] for m in model_names], 21 | "hallucination": [models[m]["hallucination_score"] for m in model_names], 22 | } 23 | 24 | # Sort by quality score 25 | sorted_indices = sorted( 26 | range(len(metrics["quality"])), 27 | key=lambda k: metrics["quality"][k], 28 | reverse=True, 29 | ) 30 | model_names = [model_names[i] for i in sorted_indices] 31 | for metric in metrics: 32 | metrics[metric] = [metrics[metric][i] for i in sorted_indices] 33 | 34 | plt.figure(figsize=(15, 8)) 35 | x = range(len(model_names)) 36 | width = 0.15 # Narrower bars to fit all metrics 37 | 38 | # Plot each metric with offset positions 39 | plt.bar( 40 | [i - width * 2 for i in x], 41 | metrics["accuracy"], 42 | width, 43 | label="Accuracy", 44 | color="skyblue", 45 | ) 46 | plt.bar( 47 | [i - width for i in x], 48 | metrics["tool_use"], 49 | width, 50 | label="Tool Use", 51 | color="lightgreen", 52 | ) 53 | plt.bar( 54 | [i for i in x], 55 | metrics["completeness"], 56 | width, 57 | label="Completeness", 58 | color="orange", 59 | ) 60 | plt.bar( 61 | [i + width for i in x], 62 | metrics["quality"], 63 | width, 64 | label="Quality", 65 | color="purple", 66 | ) 67 | plt.bar( 68 | [i + width * 2 for i in x], 69 | metrics["hallucination"], 70 | width, 71 | label="Hallucination", 72 | color="red", 73 | ) 74 | 75 | plt.xlabel("Models", fontsize=12) 76 | plt.ylabel("Score (%)", fontsize=12) 77 | plt.xticks(x, model_names, rotation=45, ha="right", fontsize=14) 78 | plt.legend(loc="upper right", title="Metrics", fontsize=10) 79 | 80 | plt.grid(True, 
alpha=0.3) 81 | plt.tight_layout() 82 | 83 | # Convert plot to base64 string 84 | buf = io.BytesIO() 85 | plt.savefig(buf, format="png", dpi=300, bbox_inches="tight") 86 | plt.close() 87 | buf.seek(0) 88 | return base64.b64encode(buf.getvalue()).decode("utf-8") 89 | 90 | # Create HTML content with comparison tables and JSON viewer 91 | html = ( 92 | """ 93 | 94 | 95 | 96 | 97 | mcpx-eval Scoreboard 98 | 190 | 191 | 192 |

[scoreboard HTML body, file lines ~193-236: page heading "mcpx-eval Open-Ended Tool Calling Scoreboard"; a "Generated on: """ + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + """ line; an "Overview" section embedding the base64-encoded "Model Performance Graph" image. Only these text fragments are recoverable; the surrounding markup, CSS, and table-rendering script of the template are not preserved in this listing.]
237 | 238 | 239 | 613 | 614 | 615 | """ 616 | ) 617 | 618 | return html 619 | -------------------------------------------------------------------------------- /mcpx_eval/htmlgen.py: -------------------------------------------------------------------------------- 1 | def visualize_json(data, output_path=None): 2 | """Create an interactive HTML visualization of JSON data""" 3 | import json 4 | from datetime import datetime 5 | import matplotlib.pyplot as plt 6 | import io 7 | import base64 8 | 9 | def create_performance_graph(data): 10 | """Create a matplotlib graph of model performance""" 11 | if not data.get("total", {}).get("models"): 12 | return "" 13 | 14 | models = data["total"]["models"] 15 | model_names = list(models.keys()) 16 | metrics = { 17 | "accuracy": [models[m]["accuracy"] for m in model_names], 18 | "tool_use": [models[m]["tool_use"] for m in model_names], 19 | "completeness": [models[m]["completeness"] for m in model_names], 20 | "quality": [models[m]["quality"] for m in model_names], 21 | "hallucination": [models[m]["hallucination_score"] for m in model_names], 22 | } 23 | 24 | # Sort by quality score 25 | sorted_indices = sorted( 26 | range(len(metrics["quality"])), 27 | key=lambda k: metrics["quality"][k], 28 | reverse=True, 29 | ) 30 | model_names = [model_names[i] for i in sorted_indices] 31 | for metric in metrics: 32 | metrics[metric] = [metrics[metric][i] for i in sorted_indices] 33 | 34 | plt.figure(figsize=(15, 8)) 35 | x = range(len(model_names)) 36 | width = 0.15 # Narrower bars to fit all metrics 37 | 38 | # Plot each metric with offset positions 39 | plt.bar( 40 | [i - width * 2 for i in x], 41 | metrics["accuracy"], 42 | width, 43 | label="Accuracy", 44 | color="skyblue", 45 | ) 46 | plt.bar( 47 | [i - width for i in x], 48 | metrics["tool_use"], 49 | width, 50 | label="Tool Use", 51 | color="lightgreen", 52 | ) 53 | plt.bar( 54 | [i for i in x], 55 | metrics["completeness"], 56 | width, 57 | label="Completeness", 58 | color="orange", 59 | ) 60 | plt.bar( 61 | [i + width for i in x], 62 | metrics["quality"], 63 | width, 64 | label="Quality", 65 | color="purple", 66 | ) 67 | plt.bar( 68 | [i + width * 2 for i in x], 69 | metrics["hallucination"], 70 | width, 71 | label="Hallucination", 72 | color="red", 73 | ) 74 | 75 | plt.xlabel("Models", fontsize=12) 76 | plt.ylabel("Score (%)", fontsize=12) 77 | plt.xticks(x, model_names, rotation=45, ha="right", fontsize=14) 78 | plt.legend(loc="upper right", title="Metrics", fontsize=10) 79 | 80 | plt.grid(True, alpha=0.3) 81 | plt.tight_layout() 82 | 83 | # Convert plot to base64 string 84 | buf = io.BytesIO() 85 | plt.savefig(buf, format="png", dpi=300, bbox_inches="tight") 86 | plt.close() 87 | buf.seek(0) 88 | return base64.b64encode(buf.getvalue()).decode("utf-8") 89 | 90 | # Create HTML content with comparison tables and JSON viewer 91 | html = ( 92 | """ 93 | 94 | 95 | 96 | 97 | mcpx-eval Scoreboard 98 | 190 | 191 | 192 |

[scoreboard HTML body, file lines ~193-236 (the visible fragments match html.py above): page heading "mcpx-eval Open-Ended Tool Calling Scoreboard"; a "Generated on: """ + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + """ line; an "Overview" section embedding the base64-encoded "Model Performance Graph" image. The surrounding markup, CSS, and table-rendering script are not preserved in this listing.]
237 | 238 | 239 | 613 | 614 | 615 | """ 616 | ) 617 | 618 | return html 619 | --------------------------------------------------------------------------------
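Taken together, Database.generate_json_summary() and visualize_json() are the pieces that turn saved evaluation runs into the HTML scoreboard. The following is a minimal sketch of driving them directly and is not part of the repository: it assumes results have already been recorded (for example via run.sh or mcpx-eval test), that Database() resolves its default sqlite file, and that pandas and matplotlib are installed; the output filename is illustrative.

from mcpx_eval import Database
from mcpx_eval.htmlgen import visualize_json

db = Database()

# Per-(test, model) aggregates as a pandas DataFrame
stats = db.get_test_stats()
print(stats[["test_name", "model", "runs", "mean_quality", "mean_duration"]])

# Nested summary dict: {"tests": {...}, "total": {...}, "generated_at": ...}
summary = db.generate_json_summary()

# visualize_json returns the page as a string; write it to disk ourselves
with open("scoreboard.html", "w") as f:
    f.write(visualize_json(summary))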